def _attend(self, query, mask=None):
    query = unsqueeze(query, 0)  # ((1, H), B)
    # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    attn_scores = dy.transpose(query * self.context)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    embedded = embed_sentence(in_seq)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(encoded)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))

    out = ''
    count_EOS = 0
    for i in range(len(in_seq) * 2):
        if count_EOS == 2:
            break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_char = probs.index(max(probs))
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # iterate through input states to compute alphas
    # print 'computing scores...'
    # scores = [W_a * pc.concatenate([h_t, h_input]) for h_input in blstm_outputs]
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # print 'computed scores'

    # normalize to alphas using softmax
    # print 'computing alphas...'
    alphas = pc.softmax(pc.concatenate(scores))
    # print 'computed alphas...'

    # compute c using alphas
    # print 'computing c...'
    # import time
    # s = time.time()
    # dim = len(blstm_outputs[0].vec_value())
    # stacked_alphas = pc.concatenate_cols([alphas for j in xrange(dim)])
    # stacked_vecs = pc.concatenate_cols([h_input for h_input in blstm_outputs])
    # c = pc.esum(pc.cwise_multiply(stacked_vecs, stacked_alphas))
    # print "stack time:", time.time() - s
    # s = time.time()
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # print "pick time:", time.time() - s
    # print 'computed c'
    # print 'c len is {}'.format(len(c.vec_value()))

    # compute output state h~ using c and the decoder's h
    # (global attention variant from Luong et al., 2015)
    # print 'computing h~...'
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))
    # print 'len of h_output is {}'.format(len(h_output.vec_value()))
    # print 'computed h~'

    return h_output, alphas, W__a.value()
def generate(input, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    def sample(probs):
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0:
                break
        return i

    embedded = embed_sentence(input)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    out = ''
    count_EOS = 0
    for i in range(len(input) * 2):
        if count_EOS == 2:
            break
        vector = dy.concatenate([attend(encoded, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        probs = probs.vec_value()
        next_char = sample(probs)
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars + self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) + self.param_exprs['cb%d' % wlen]), chars])

    update_logits = self.param_exprs['ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]
    update_gate = dy.transpose(dy.concatenate_cols([
        dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
        for i in xrange(self.options['ndims'])
    ]))

    # The following implementation of the softmax function is not safe, but faster...
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())

    word = dy.sum_cols(dy.cmult(update_gate, dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)

    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
def tag_sent(words):
    vecs = build_tagging_graph(words)
    vecs = [dy.softmax(v) for v in vecs]
    probs = [v.npvalue() for v in vecs]
    tags = []
    for prb in probs:
        tag = np.argmax(prb)
        tags.append(vt.i2w[tag])
    return zip(words, tags)
def _attend(self, query, mask=None):
    # query ((H), B)
    # mask ((T, 1), B)
    projected_state = self.decoder * query  # ((H,), B)
    non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
    attn_scores = dy.transpose(self.v * non_lin)  # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)  # ((T, 1), B)
def __call__(self, x):
    W = dy.parameter(self.mw)
    b = dy.parameter(self.mb)
    W2 = dy.parameter(self.mw2)
    b2 = dy.parameter(self.mb2)
    mlp_output = W2 * (dy.tanh(W * x + b)) + b2
    if fDo_3_Layers:
        W3 = dy.parameter(self.mw3)
        b3 = dy.parameter(self.mb3)
        # third layer is applied to the previous layer's output
        mlp_output = W3 * (dy.tanh(mlp_output)) + b3
    return dy.softmax(mlp_output)
def predict_next_word(self, sentence):
    dy.renew_cg()
    init_state = self.builder.initial_state()
    state = init_state
    for cw in sentence:
        # assume word is already a word-id
        x_t = self.lookup[int(cw)]
        state = state.add_input(x_t)
    y_t = state.output()
    r_t = self.bias + (self.R * y_t)
    prob = dy.softmax(r_t)
    return prob
def generate_sent():
    dy.renew_cg()
    hist = [S] * N
    sent = []
    while True:
        p = dy.softmax(calc_score_of_history(hist)).npvalue()
        next_word = np.random.choice(nwords, p=p / p.sum())
        if next_word == S or len(sent) == MAX_LEN:
            break
        sent.append(next_word)
        hist = hist[1:] + [next_word]
    return sent
def predict_output_sequence(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn,
                            W_c, W__a, U__a, v__a, lemma, feats, alphabet_index, inverse_alphabet_index,
                            feat_index, feature_types):
    pc.renew_cg()

    R = pc.parameter(R)
    bias = pc.parameter(bias)
    W_c = pc.parameter(W_c)
    W__a = pc.parameter(W__a)
    U__a = pc.parameter(U__a)
    v__a = pc.parameter(v__a)

    blstm_outputs = encode_feats_and_chars(alphabet_index, char_lookup, encoder_frnn, encoder_rrnn, feat_index,
                                           feat_lookup, feats, feature_types, lemma)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    i = 0
    predicted_sequence = []

    # run the decoder through the sequence and predict characters
    while i < MAX_PREDICTION_LEN:
        # get current h of the decoder
        s = s.add_input(prev_output_vec)
        decoder_rnn_output = s.output()

        # perform attention step
        attention_output_vector, alphas, W = attend(blstm_outputs, decoder_rnn_output, W_c, v__a, W__a, U__a)

        # compute output probabilities
        # print 'computing readout layer...'
        readout = R * attention_output_vector + bias

        # find best candidate output
        probs = pc.softmax(readout)
        next_char_index = common.argmax(probs.vec_value())
        predicted_sequence.append(inverse_alphabet_index[next_char_index])

        # check if reached end of word
        if predicted_sequence[-1] == END_WORD:
            break

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[next_char_index]
        i += 1

    # remove the end word symbol
    return predicted_sequence[0:-1]
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    # input_mat: (encoder_state x seqlen) => input vecs concatenated as cols
    # w1dt: (attdim x seqlen)
    # w2dt: (attdim,) -- projection of the concatenated decoder state
    w2dt = w2 * dy.concatenate(list(state.s()))

    # att_weights: (seqlen,) row vector
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)

    # context: (encoder_state)
    context = input_mat * att_weights
    return context
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2 * dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))

    output_vectors = dy.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def translate_sentence(self, sent):
    dy.renew_cg()

    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)

    sent_rev = list(reversed(sent))

    # Bidirectional representations
    l2r_state = self.l2r_builder.initial_state()
    r2l_state = self.r2l_builder.initial_state()
    l2r_contexts = []
    r2l_contexts = []
    for (cw_l2r, cw_r2l) in zip(sent, sent_rev):
        l2r_state = l2r_state.add_input(dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
        r2l_state = r2l_state.add_input(dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
        l2r_contexts.append(l2r_state.output())
        r2l_contexts.append(r2l_state.output())
    r2l_contexts.reverse()

    h_fs = []
    for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
        h_fs.append(dy.concatenate([l2r_i, r2l_i]))
    h_fs_matrix = dy.concatenate_cols(h_fs)

    # Decoder
    trans_sentence = ['<S>']
    cw = trans_sentence[-1]
    c_t = dy.vecInput(self.hidden_size * 2)
    start = dy.concatenate([dy.lookup(self.tgt_lookup, self.tgt_token_to_id['<S>']), c_t])
    dec_state = self.dec_builder.initial_state().add_input(start)
    while len(trans_sentence) < self.max_len:
        h_e = dec_state.output()
        c_t = self.__attention_mlp(h_fs_matrix, h_e)
        embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
        x_t = dy.concatenate([embed_t, c_t])
        dec_state = dec_state.add_input(x_t)
        y_star = b_y + W_y * dec_state.output()
        p = dy.softmax(y_star)
        cw = self.tgt_id_to_token[np.argmax(p.npvalue())]
        if cw == '</S>':
            break
        trans_sentence.append(cw)

    return ' '.join(trans_sentence[1:])
def decoding(self, src_encodings):
    src_len = len(src_encodings)

    # NOTE: should transpose before calling `mst` method!
    s_arc, s_label = self.cal_scores(src_encodings)
    s_arc_values = dy.softmax(s_arc).npvalue().transpose()  # src_len, src_len
    s_label_values = np.asarray([x.npvalue() for x in s_label]).transpose((2, 1, 0))  # src_len, src_len, n_labels

    # weights = np.zeros((src_len + 1, src_len + 1))
    # weights[0, 1:(src_len + 1)] = np.inf
    # weights[1:(src_len + 1), 0] = np.inf
    # weights[1:(src_len + 1), 1:(src_len + 1)] = s_arc_values[batch]
    weights = s_arc_values
    pred_heads = mst(weights)
    pred_labels = [np.argmax(labels[head]) for head, labels in zip(pred_heads, s_label_values)]

    return pred_heads, pred_labels
def decode(self, features):
    last_output_embeddings = self.pattern_embeddings[0]
    s = self.decoder_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(self.hidden_dim), last_output_embeddings]))
    out = []
    for i in range(self.max_rule_length):
        h_t = s.output()
        context = self.attend(features, h_t)
        out_vector = self.pt * dy.concatenate([context, h_t]) + self.pt_bias
        probs = dy.softmax(out_vector).vec_value()
        last_output = probs.index(max(probs))
        last_output_embeddings = self.pattern_embeddings[last_output]
        s = s.add_input(dy.concatenate([context, last_output_embeddings]))
        if last_output != 0:
            out.append(last_output)
        else:
            return out
    return out
def attend2(blstm_outputs, s_prev, y_feedback, v_a, W_a, U_a, U_o, V_o, C_o):
    # attention mechanism - Bahdanau style
    # iterate through input states to compute alphas
    # print 'computing scores...'
    # W_a: hidden x hidden, U_a: hidden x 2 hidden, v_a: hidden, each score: scalar
    scores = [v_a * pc.tanh(W_a * s_prev + U_a * h_j) for h_j in blstm_outputs]
    alphas = pc.softmax(pc.concatenate(scores))

    # c_i: 2 hidden
    c_i = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # U_o = 2l x hidden, V_o = 2l x input, C_o = 2l x 2 hidden
    attention_output_vector = U_o * s_prev + V_o * y_feedback + C_o * c_i
    return attention_output_vector, alphas
def decode(self, pre_encoded, pos_encoded, refex, entity):
    refex = list(refex)
    refex = [self.token2int[c] for c in refex]

    h_pre = dy.concatenate_cols(pre_encoded)
    w1dt_pre = None

    h_pos = dy.concatenate_cols(pos_encoded)
    w1dt_pos = None

    last_output_embeddings = self.lookup[self.token2int[self.EOS]]
    entity_embedding = self.lookup[self.token2int[entity]]
    s = self.dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(self.config.state_dim * 4), last_output_embeddings, entity_embedding]))

    loss = []
    for word in refex:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt_pre = w1dt_pre or self.attention_w1_pre * h_pre
        w1dt_pos = w1dt_pos or self.attention_w1_pos * h_pos

        attention_pre, _ = self.attend(h_pre, s, w1dt_pre, self.attention_w2_pre, self.attention_v_pre)
        attention_pos, _ = self.attend(h_pos, s, w1dt_pos, self.attention_w2_pos, self.attention_v_pos)

        vector = dy.concatenate([attention_pre, attention_pos, last_output_embeddings, entity_embedding])
        s = s.add_input(vector)
        out_vector = self.decoder_w * s.output() + self.decoder_b
        probs = dy.softmax(out_vector)

        prob = dy.pick(probs, word)
        last_output_embeddings = self.lookup[word]
        loss.append(-dy.log(prob))
    loss = dy.esum(loss)
    return loss
def calc_loss(self, translator, src, trg):
    batch_size = trg.batch_size()
    uniques = [set() for _ in range(batch_size)]
    deltas = []
    probs = []
    sign = -1 if self.inv_eval else 1

    search_outputs = translator.generate_search_output(src, self.search_strategy)
    for search_output in search_outputs:
        logprob = search_output.logsoftmaxes
        sample = search_output.word_ids
        attentions = search_output.attentions

        logprob = dy.esum(logprob) * self.alpha
        # Calculate the evaluation score
        eval_score = np.zeros(batch_size, dtype=float)
        mask = np.zeros(batch_size, dtype=float)
        for j in range(batch_size):
            ref_j = self.remove_eos(trg[j].words)
            hyp_j = self.remove_eos(sample[j].tolist())
            if self.unique_sample:
                hash_val = hash(tuple(hyp_j))
                if len(hyp_j) == 0 or hash_val in uniques[j]:
                    mask[j] = -1e20  # represents negative infinity
                    continue
                else:
                    uniques[j].add(hash_val)
            # Calc evaluation score
            eval_score[j] = self.evaluation_metric.evaluate_one_sent(ref_j, hyp_j) * sign
        # Appending the delta and logprob of this sample
        prob = logprob + dy.inputTensor(mask, batched=True)
        deltas.append(dy.inputTensor(eval_score, batched=True))
        probs.append(prob)

    sample_prob = dy.softmax(dy.concatenate(probs))
    deltas = dy.concatenate(deltas)
    risk = dy.sum_elems(dy.cmult(sample_prob, deltas))

    ### Debug
    # print(sample_prob.npvalue().transpose()[0])
    # print(deltas.npvalue().transpose()[0])
    # print("----------------------")
    ### End debug
    return FactoredLossExpr({"risk": risk})
def train(self, trainning_set):
    loss_chunk = 0
    loss_all = 0
    total_chunk = 0
    total_all = 0
    losses = []
    for datapoint in trainning_set:
        premise = datapoint["premise"]
        hypothesis = datapoint["hypothesis"]
        gold_label = datapoint["gold_label"]

        ep = self.encode_sentence(premise)
        eh = self.encode_sentence(hypothesis)
        Ps = []
        for i in range(self.projection_size):
            Ps.append(self.Phis[i].expr() * ep)
        P = dy.transpose(dy.concatenate_cols(Ps))
        s = P * eh

        y = dy.softmax(self.W.expr() * s + self.b.expr())
        losses.append(-dy.log(dy.pick(y, gold_label)))

        # process losses in chunks
        if len(losses) > 50:
            loss = dy.esum(losses)
            l = loss.scalar_value()
            loss.backward()
            self.trainer.update()
            dy.renew_cg()
            losses = []
            loss_chunk += l
            loss_all += l
            total_chunk += 1
            total_all += 1

    # consider any remaining losses
    if len(losses) > 0:
        loss = dy.esum(losses)
        loss.scalar_value()
        loss.backward()
        self.trainer.update()
        dy.renew_cg()
    print(f'loss: {loss_all/total_all:.4f}')
def __call__(self, dec_hidden_state, xh_vecs):
    # xh_vecs: one vector for each input token
    # dec_hidden_state: current decoder hidden state, one vector for each layer
    if not xh_vecs:
        # TODO: fix this hack, possibly by padding xh_vecs with bos and eos
        return dy.vecInput(self.z_dim)
    s = dy.concatenate(list(dec_hidden_state))
    W = dy.parameter(self.p_W)
    b = dy.parameter(self.p_b)
    UV = dy.parameter(self.p_UV)
    v = dy.parameter(self.p_v)
    vT = dy.transpose(v)
    Ws = W * s
    attn_weights = [vT * dy.tanh(Ws + UV * xh + b) for xh in xh_vecs]
    attn_dist = dy.softmax(dy.concatenate(attn_weights))
    return dy.concatenate_cols(xh_vecs) * attn_dist, attn_dist
def compute_loss(self, in_sentence, out_sentence):
    from numpy import argmax
    dn.renew_cg()
    lookup, R, C, bias, encoder, decoder = self.get_params()
    in_s, out_s = self.wrap_sentence(in_sentence), self.wrap_sentence(out_sentence)
    loss = []

    enc_s, _ = input_all(encoder.initial_state(), [lookup[c] for c in in_s])
    s = decoder.initial_state().add_input(enc_s.output())
    for char, next_char in zip(out_s, out_s[1:]):
        s = s.add_input(lookup[char])
        probs = dn.softmax(R * s.output() + bias)
        loss.append(-dn.log(dn.pick(probs, next_char)))
        # loss.append( dn.pickneglogsoftmax(probs, next_char) )
    loss = dn.esum(loss)
    return loss
def _attend(self, input_vectors, state_fw, state_bw):
    w1 = self.att_w1.expr()
    w2 = self.att_w2.expr()
    v = self.att_v.expr()
    attention_weights = []

    w2dt = w2 * dy.concatenate([state_fw.h()[-1], state_bw.h()[-1]])
    for input_vector in input_vectors:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)

    attention_weights = dy.softmax(dy.concatenate(attention_weights))

    output_vectors = dy.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def tag_sent(self, words, trans=True):
    self.eval = True
    if trans:
        if self.en_trans:
            self.en_trans.transliterate('\n'.join(set(words)))
            self.etrans = self.en_trans.trans_dict
        if self.hi_trans:
            self.hi_trans.transliterate('\n'.join(set(words)))
            self.htrans = self.hi_trans.trans_dict
    dy.renew_cg()
    vecs = self.build_tagging_graph(words)
    vecs = [dy.softmax(v) for v in vecs]
    probs = [v.npvalue() for v in vecs]
    tags = []
    for prb in probs:
        tag = np.argmax(prb)
        tags.append(self.meta.i2t[tag])
    return zip(words, tags)
def generate_top_n(logProb, state, words, wordID, n):
    if words[-1] == endSymbol:
        yield logProb, words
    h_e = state.output()
    c_t, unkIndex = self.__attention_mlp(h_fs_matrix, h_e)
    embed_t = dy.lookup(self.tgt_lookup, wordID)
    x_t = dy.concatenate([embed_t, c_t])
    next_state = state.add_input(x_t)
    y_star = np.reshape(dy.softmax(W_y * next_state.output() + b_y).npvalue(), -1)
    # expand the n most probable next words (argpartition returns their indices)
    for nextWordID in np.argpartition(-y_star, n)[:n]:
        currentWord = self.tgt_id_to_token[nextWordID]
        if currentWord == unkSymbol:
            currentWord = self.src_id_to_token[unkIndex]
        currentLogProb = logProb + np.log(y_star[nextWordID])
        newWords = words + [currentWord]
        yield currentLogProb, generate_top_n(currentLogProb, next_state, newWords, nextWordID, n), newWords
def decode(embedded, wf):
    wf = list(wf) + [EOS]
    wf = [char2int[c] for c in wf]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = lookup[char2int[EOS]]
    s = generator.initial_state().add_input(dy.concatenate([embedded, last_output_embeddings]))
    loss = []

    for char in wf:
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
        s = s.add_input(dy.concatenate([embedded, last_output_embeddings]))
    loss = dy.esum(loss)
    return loss
def loss(self, observation, instance):
    #trans = instance.transformation
    #if trans not in self.known_transformations:
    #k     newtrans = list(self.param_dict.keys())[0][0]  ### SUPER ARBITRARY
    #k     tqdm.write("WARNING: unknown transformtion picked for instance {}; using transformation {}".format(trans, newtrans))
    #k     trans = newtrans
    trans = 'lul'
    b = dy.parameter(self.param_dict[(trans, 'b')])
    W = dy.parameter(self.param_dict[(trans, 'W')])
    features, label = observation
    prediction = dy.softmax(dy.affine_transform([b, W, dy.inputVector(features)]))
    loss = -dy.log(dy.pick(prediction, label))
    return prediction, loss
def run(self, question, image):
    image_conv = self.convnet(image)
    embeddings = self.embed_question(question, image_conv)
    h0 = dy.concatenate([self.lookup[self.word2id[self.EOS]], image_conv])

    init_state = self.enc_fwd_lstm.initial_state().add_input(h0)
    fwd_vectors = self.run_lstm(init_state, embeddings)

    embeddings_rev = list(reversed(embeddings))
    init_state = self.enc_bwd_lstm.initial_state().add_input(h0)
    bwd_vectors = self.run_lstm(init_state, embeddings_rev)
    bwd_vectors = list(reversed(bwd_vectors))

    vector = dy.average([dy.concatenate(list(p)) for p in zip(fwd_vectors, bwd_vectors)])
    return dy.softmax(self.W * vector + self.b)
def do_one_sequence(rnn, params, sequence):
    # setup the sequence
    dy.renew_cg()
    s0 = rnn.initial_state()

    R = params["R"]
    bias = params["bias"]
    lookup = params["lookup"]

    input_sequence = [input_token2int[t] for (t, _) in sequence]
    output_sequence = [output_token2int[t] for (_, t) in sequence]

    s = s0
    loss = []
    for input_token, output_token in zip(input_sequence, output_sequence):
        s = s.add_input(lookup[input_token])
        probs = dy.softmax(R * s.output() + bias)
        loss.append(-dy.log(dy.pick(probs, output_token)))
    loss = dy.esum(loss)
    return loss
def predict_next(self):
    (R, bias, W_c, W__a, U__a, v__a) = self.cg_params

    # soft attention vector
    att_scores = [v__a * dy.tanh(W__a * self.output_state + U__a * h_input) for h_input in self.biencoder]
    alphas = dy.softmax(dy.concatenate(att_scores))
    c = dy.esum([h_input * dy.pick(alphas, j) for j, h_input in enumerate(self.biencoder)])

    # softmax over vocabulary
    h_output = dy.tanh(W_c * dy.concatenate([self.output_state, c]))
    self.logprobs = (dy.log_softmax(R * h_output + bias)).npvalue()
    return self.logprobs
def predict(lstm, params, line, y):
    dy.renew_cg()
    s0 = lstm.initial_state()

    R = params["R"]
    bias = params["bias"]
    lookup = params["lookup"]

    sentence = ["<EOS>"] + list(line) + ["<EOS>"]
    sentence = [char2int[c] for c in sentence]
    s = s0
    for char in sentence:
        s = s.add_input(lookup[char])

    lstm_out = (R * s.output()) + bias
    yhat = dy.softmax(mlp(lstm_out, params))
    loss = -(dy.log(dy.pick(yhat, y)))
    return loss, yhat
def test_sentence(self, words, word_idxs):
    dy.renew_cg()
    forward_init, backward_init = [b.initial_state() for b in self.builders]

    embed_words = words.tensor
    # entities = words.ents
    forward = forward_init.transduce(embed_words)
    backward = backward_init.transduce(reversed(embed_words))

    predictions = []
    for f, b in zip(forward, backward):
        r_t = self(dy.concatenate([f, b]))
        temp_val = dy.softmax(r_t).value()
        # chosen = np.argmax(temp_val)
        predictions.append(temp_val)
    return predictions
def _step(self, prev_samples, encoder_output, decoder_state, prev_att, prev_att_expr, runtime, compute_attention):
    if prev_att is None:
        prev_att = dy.inputVector([0] * len(encoder_output))
    else:
        prev_att = dy.inputVector(prev_att)  # this truncates backpropagation - don't know if it is ok to do that

    # input from receptive network
    while len(prev_samples) < self.config.receptive_input:
        prev_samples = [0] + prev_samples
    input_vect2 = dy.inputVector(prev_samples[-self.config.receptive_input:])
    input_vect3 = dy.inputVector(prev_samples[-self.config.sample_trail_size:])
    for w, b in zip(self.receptive_w, self.receptive_b):
        input_vect2 = dy.rectify(w.expr() * input_vect2 + b.expr())
        if not runtime:
            input_vect2 = dy.dropout(input_vect2, self.config.receptive_dropout)

    # input from encoder
    if compute_attention or prev_att_expr is None:
        att_vect = dy.inputVector(prev_samples[-self.config.receptive_input:])
        for w, b in zip(self.attention_w, self.attention_b):
            att_vect = dy.rectify(w.expr() * att_vect + b.expr())
    else:
        att_vect = None
    input_vect1, prev_att = self._attend(encoder_output, decoder_state, prev_att, prev_att_expr, att_vect, compute_attention)

    decoder_state = decoder_state.add_input(dy.concatenate([input_vect1, input_vect2]))

    presoftmax = dy.concatenate([decoder_state.output(), input_vect2, input_vect3])
    for w, b in zip(self.presoftmax_w, self.presoftmax_b):
        presoftmax = dy.rectify(w.expr() * presoftmax + b.expr())

    softmax = dy.softmax(self.softmax_w.expr() * presoftmax + self.softmax_b.expr())

    return softmax, decoder_state, prev_att.value(), prev_att
def generate(self, sentence):
    # embedded = embed_sentence(in_seq)
    encoded = self.encode_sentence(sentence)

    w = dy.parameter(self.decoder_w)
    b = dy.parameter(self.decoder_b)
    w1 = dy.parameter(self.attention_w1)
    dw = dy.parameter(self.duration_weight)
    db = dy.parameter(self.duration_bias)
    # duration = dw * state.output() + db
    input_mat = dy.concatenate_cols(encoded)
    w1dt = None

    last_output_embeddings = self.output_lookup[2]
    s = self.dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(self.state_size * 2), last_output_embeddings]))

    out = ''
    res = []
    dur_g = []
    count_EOS = 0
    for i in range(len(sentence)):
        if count_EOS == 2:
            break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([self.attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        # k = s
        # dloss = self.test_duration(k, i, b)
        out_vector = w * s.output() + b
        dur_pred = dw * s.output() + db
        probs = dy.softmax(out_vector).vec_value()
        next_word = probs.index(max(probs))
        last_output_embeddings = self.output_lookup[next_word]
        if next_word == 2:
            count_EOS += 1
            continue
        res.append(next_word)
        dur_g.append(dy.rectify(dur_pred))
        # out += int2char[next_word]
    return res, dur_g
def generate(self, pre_context, pos_context, entity):
    embedded = self.embed_sentence(pre_context)
    pre_encoded = self.encode_sentence(self.encpre_fwd_lstm, self.encpre_bwd_lstm, embedded)

    embedded = self.embed_sentence(pos_context)
    pos_encoded = self.encode_sentence(self.encpos_fwd_lstm, self.encpos_bwd_lstm, embedded)

    w = dy.parameter(self.decoder_w)
    b = dy.parameter(self.decoder_b)

    last_output_embeddings = self.output_lookup[self.output2int[self.EOS]]
    try:
        entity_embedding = self.input_lookup[self.input2int[entity]]
    except:
        entity_embedding = self.input_lookup[self.input2int[self.EOS]]
    s = self.dec_lstm.initial_state().add_input(
        dy.concatenate([pre_encoded, pos_encoded, last_output_embeddings, entity_embedding]))

    out = []
    count_EOS = 0
    for i in range(self.config['GENERATION']):
        if count_EOS == 2:
            break
        vector = dy.concatenate([pre_encoded, pos_encoded, last_output_embeddings, entity_embedding])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_word = probs.index(max(probs))
        last_output_embeddings = self.output_lookup[next_word]
        if self.int2output[next_word] == self.EOS:
            count_EOS += 1
            continue
        out.append(self.int2output[next_word])
    return out
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = pc.parameter(attention_w1)
    w2 = pc.parameter(attention_w2)
    v = pc.parameter(attention_v)
    attention_weights = []

    w2dt = w2 * pc.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * pc.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = pc.softmax(pc.concatenate(attention_weights))

    output_vectors = pc.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def sample(self, x: dy.Expression, n: numbers.Integral, temperature: numbers.Real = 1.0):
    assert temperature != 0.0
    scores_expr = self.calc_log_probs(x)
    if temperature != 1.0:
        scores_expr *= 1.0 / temperature
        scores = dy.softmax(scores_expr).npvalue()
    else:
        scores = dy.exp(scores_expr).npvalue()
    # Numpy is very picky. If the sum is off even by 1e-8 it complains.
    scores /= sum(scores)
    a = range(scores.shape[0])
    samples = np.random.choice(a, (n,), replace=True, p=scores)
    r = []
    for word in samples:
        r.append((word, dy.pick(scores_expr, word)))
    return r
def attend(self, input_vectors, state, batch_size):
    w1 = dynet.parameter(self.attention_w1)
    w2 = dynet.parameter(self.attention_w2)
    v = dynet.parameter(self.attention_v)

    src_len = len(input_vectors)

    # enc_size, sent_len, batch_size
    src_enc_all = dynet.concatenate_cols(input_vectors)

    att_hidden = dynet.tanh(dynet.colwise_add(w1 * src_enc_all, w2 * state))
    att_weights = dynet.reshape(v * att_hidden, (src_len, ), batch_size)
    # sent_len, batch_size
    att_weights = dynet.softmax(att_weights)

    output_vectors = src_enc_all * att_weights
    return output_vectors, att_weights
def attention(self, src_encodings, h_t, batch_size):
    W1_att_f = dy.parameter(self.W1_att_f)
    W1_att_e = dy.parameter(self.W1_att_e)
    W2_att = dy.parameter(self.W2_att)

    src_len = len(src_encodings)

    # enc_size, sent_len, batch_size
    src_enc_all = dy.concatenate_cols(src_encodings)

    att_hidden = dy.tanh(dy.colwise_add(W1_att_f * src_enc_all, W1_att_e * h_t))
    att_weights = dy.reshape(W2_att * att_hidden, (src_len, ), batch_size)
    # sent_len, batch_size
    att_weights = dy.softmax(att_weights)

    ctx = src_enc_all * att_weights
    return ctx, att_weights
def tag_sent(sent, builders):
    dy.renew_cg()
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [E[vw.w2i.get(w, UNK)] for w, t in sent]

    fw = [x.output() for x in f_init.add_inputs(wembs)]
    bw = [x.output() for x in b_init.add_inputs(reversed(wembs))]

    H = dy.parameter(pH)
    O = dy.parameter(pO)
    tags = []
    for f, b, (w, t) in zip(fw, reversed(bw), sent):
        r_t = O * (dy.tanh(H * dy.concatenate([f, b])))
        # r_t = O * dy.concatenate([f, b])
        out = dy.softmax(r_t)
        chosen = np.argmax(out.npvalue())
        tags.append(vt.i2w[chosen])
    return tags
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = pc.parameter(decoder_w)
    b = pc.parameter(decoder_b)

    s = dec_lstm.initial_state().add_input(pc.vecInput(STATE_SIZE * 2))
    loss = []
    for char in output:
        vector = attend(vectors, s)
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = pc.softmax(out_vector)
        loss.append(-pc.log(pc.pick(probs, char)))
    loss = pc.esum(loss)
    return loss
def sample(self, eos, max_len):
    # dy.renew_cg()
    # self.new_graph()
    state = self.rnn.initial_state()
    state = state.set_s(self.initial_state)
    sent = []
    while len(sent) < max_len:
        assert state is not None
        so = state.output()
        assert so is not None
        output_dist = dy.softmax(self.output_mlp(so))
        output_dist = output_dist.vec_value()
        word = sample(output_dist)
        sent.append(word)
        if word == eos:
            break
        word_emb = self.embed_word(word)
        state = state.add_input(word_emb)
    return sent
def _attend(self, input_vectors, state):
    w1 = self.att_w1.expr(update=True)
    w2 = self.att_w2.expr(update=True)
    v = self.att_v.expr(update=True)
    attention_weights = []

    w2dt = w2 * state.h()[-1]
    for input_vector in input_vectors:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)

    attention_weights = dy.softmax(dy.concatenate(attention_weights))

    output_vectors = dy.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
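# A minimal sketch (not from the original sources) of how a decode() function like the one
# above is typically driven during training. It assumes the same globals used by the
# embed/encode/generate examples earlier in this listing (embed_sentence, encode_sentence,
# enc_fwd_lstm, enc_bwd_lstm, dec_lstm) and a dy.ParameterCollection named model; the
# function name and epoch count are illustrative.
def train_example(model, sentence, n_epochs=600):
    trainer = dy.SimpleSGDTrainer(model)
    for _ in range(n_epochs):
        dy.renew_cg()                     # fresh computation graph per example
        embedded = embed_sentence(sentence)
        encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)
        loss = decode(dec_lstm, encoded, sentence)  # negative log-likelihood of the target chars
        loss.value()                      # forward pass
        loss.backward()                   # backward pass
        trainer.update()                  # gradient step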
def create_network_return_best(inputs):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)
    b = dy.parameter(pB)

    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]

    emb_vectors = [lookup[i] for i in inputs]

    while len(emb_vectors) < documentLength:
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)

    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax((W * net_input) + b)
    return np.argmax(net_output.npvalue())
def sample(self, first=1, nchars=0, stop=-1):
    res = [first]
    dy.renew_cg()
    state = self.builder.initial_state()

    cw = first
    while True:
        x_t = self.lookup[cw]
        state = state.add_input(x_t)
        y_t = state.output()
        r_t = self.bias + (self.R * y_t)
        ydist = dy.softmax(r_t)
        dist = ydist.vec_value()
        rnd = random.random()
        for i, p in enumerate(dist):
            rnd -= p
            if rnd <= 0:
                break
        res.append(i)
        cw = i
        if cw == stop:
            break
        if nchars and len(res) > nchars:
            break
    return res
def create_network_return_loss(inputs, expected_output):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)  # from parameters to expressions
    b = dy.parameter(pB)

    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]

    emb_vectors = [lookup[i] for i in inputs]

    while len(emb_vectors) < documentLength:
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)

    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax((W * net_input) + b)
    loss = -dy.log(dy.pick(net_output, expected_output))
    return loss
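# A hypothetical setup and driver for create_network_return_loss()/create_network_return_best()
# above. The dimensions and toy data are illustrative assumptions; only the global names
# those functions already reference (pW, pB, lookup, documentLength, embDimension) are kept.
import numpy as np
import dynet as dy

documentLength, embDimension, vocabSize, nClasses = 20, 50, 1000, 2
m = dy.ParameterCollection()
pW = m.add_parameters((nClasses, documentLength * embDimension))
pB = m.add_parameters((nClasses,))
lookup = m.add_lookup_parameters((vocabSize, embDimension))
trainer = dy.SimpleSGDTrainer(m)

for doc, label in [([4, 7, 9], 1), ([3, 3, 8, 2], 0)]:  # toy documents of word ids
    loss = create_network_return_loss(doc, label)
    loss.value()       # forward pass
    loss.backward()    # backward pass
    trainer.update()   # gradient step
print(create_network_return_best([4, 7, 9]))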
# assumes a parameter collection and lookup table are already defined, e.g.
# m = dy.ParameterCollection(); lp = m.add_lookup_parameters((N, dim))

# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1, 2, 3])
print(abc1.npvalue())

abc2 = lp.batch([1, 2, 3])
print(abc2.npvalue())

print(np.hstack([a, b, c]))

# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter(m.add_parameters((5, 10)))
h = W * lp.batch([1, 2, 3])
print(h.npvalue())
print(dy.pick_batch(h, [1, 2, 3]).npvalue())
print(dy.pick(W * lp[1], 1).value(), dy.pick(W * lp[2], 2).value(), dy.pick(W * lp[3], 3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h, [1, 2, 3]).npvalue())
def create_network_return_best(self, inputs, dropout=False):
    out = self(inputs, dropout)
    out = dy.softmax(out)
    return np.argmax(out.npvalue(), 0)