def evaluate_network_from_embs(self, wembs, renew=True):
    params = self.params
    if renew:
        dy.renew_cg()
    builders = params["builders"]
    W = params["W"]
    v = params["v"]
    lstms = [b.initial_state() for b in builders]
    # wembs = [dy.noise(we, 0.1) for we in wembs]
    # run the first BiLSTM layer
    fw_lstm1 = lstms[0].transduce(wembs)
    bw_lstm1 = list(reversed(lstms[1].transduce(list(reversed(wembs)))))
    inputs_to_2nd_layer = [dy.concatenate([f, b]) for f, b in zip(fw_lstm1, bw_lstm1)]
    # run the second BiLSTM layer
    fw_lstm2 = lstms[2].transduce(inputs_to_2nd_layer)
    bw_lstm2 = list(reversed(lstms[3].transduce(list(reversed(inputs_to_2nd_layer)))))
    y = [dy.concatenate([f, b]) for f, b in zip(fw_lstm2, bw_lstm2)]
    tags_hat = [W * t + v for t in y]
    return tags_hat

def build_representations_bi(self, sentence, training, prefix=[], do_not_renew=False):
    if not do_not_renew:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    coded_sentence = self.vocabulary.code_sentence_cw(sentence, training)
    coded_prefix = self.vocabulary.code_sentence_cw(prefix, training)
    w_init_f = self.wrnn[F].initial_state()
    w_init_b = self.wrnn[B].initial_state()
    f_lstm_input = self.get_static_representations(coded_prefix + coded_sentence)
    b_lstm_input = self.get_static_representations(coded_prefix + list(reversed(coded_sentence)))
    contextual_embeddings = [
        w_init_f.transduce(f_lstm_input),
        list(reversed(w_init_b.transduce(b_lstm_input))),
    ]
    return (
        dy.concatenate([contextual_embeddings[F][-1], contextual_embeddings[B][0]]),
        [dy.concatenate(list(fb)) for fb in zip(*contextual_embeddings)],
    )

def forward(self, observations):
    # calculate the CRF forward pass
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dynet.pick(scores, argmax_score)
        max_score_expr_broadcast = dynet.concatenate([max_score_expr] * self.num_tags)
        return max_score_expr + dynet.logsumexp_dim(scores - max_score_expr_broadcast, 0)

    init_alphas = [-1e10] * self.num_tags
    init_alphas[START_TAG] = 0
    for_expr = dynet.inputVector(init_alphas)
    for obs in observations:
        alphas_t = []
        for next_tag in range(self.num_tags):
            obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] * self.num_tags)
            next_tag_expr = for_expr + self.trans_mat[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dynet.concatenate(alphas_t)
    terminal_expr = for_expr + self.trans_mat[END_TAG]
    alpha = log_sum_exp(terminal_expr)
    return alpha

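# A minimal NumPy sketch (not from the original code) checking the max-shifted
# log-sum-exp identity that the forward() helper above relies on:
#   log(sum_i exp(s_i)) == max(s) + log(sum_i exp(s_i - max(s)))
import numpy as np

def np_log_sum_exp(scores):
    m = scores.max()
    return m + np.log(np.exp(scores - m).sum())

scores = np.array([2.0, 0.5, -3.0])
assert np.isclose(np_log_sum_exp(scores), np.log(np.exp(scores).sum()))
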
def __call__(self, h, s):
    # hT -> ((L, h_dim), B), s -> ((s_dim, L), B)
    if len(h.dim()[0]) == 2:
        L = h.dim()[0][1]
        if self.h_bias:
            s = dy.concatenate([s, dy.inputTensor(np.ones((1, L), dtype=np.float32))])
        if self.s_bias:
            h = dy.concatenate([h, dy.inputTensor(np.ones((1, L), dtype=np.float32))])
    else:
        if self.h_bias:
            s = dy.concatenate([s, dy.inputTensor(np.ones((1,), dtype=np.float32))])
        if self.s_bias:
            h = dy.concatenate([h, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    hT = dy.transpose(h)
    lin = self.U * s  # ((h_dim*n_label, L), B)
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = hT * lin
    if self.n_label == 1:
        return blin
    return dy.transpose(blin)

def get_features(self, words, train=False, update=True):
    """Get feature representations."""
    # word embeddings
    wfeatures = np.array([self.get_w_repr(word, train=train, update=update) for word in words])
    lex_features = []
    if self.dictionary and not self.type_constraint:
        # add lexicon features
        lex_features = np.array([self.get_lex_repr(word) for word in words])
    # char embeddings
    if self.c_in_dim > 0:
        cfeatures = [self.get_c_repr(word, train=train) for word in words]
        if len(lex_features) > 0:
            lex_features = dynet.inputTensor(lex_features)
            features = [dynet.concatenate([w, c, l]) for w, c, l in zip(wfeatures, cfeatures, lex_features)]
        else:
            features = [dynet.concatenate([w, c]) for w, c in zip(wfeatures, cfeatures)]
    else:
        features = wfeatures
    if train:  # only add noise at training time
        features = [dynet.noise(fe, self.noise_sigma) for fe in features]
    return features

def get_pointergen_probs(self, c_t, state, x_t, a_t, probs, src1):
    if not self.pointer_gen:
        return probs, 1.0
    unk_idx = self.tgt_vocab.str2int(UNK)
    p_gen = dy.logistic(
        self.ptr_w_c * c_t
        + self.ptr_w_s * dy.concatenate(list(state.s()))
        + self.ptr_w_x * x_t
    )
    gen_probs = probs * p_gen
    copy_probs = a_t * (1 - p_gen)
    copy_probs_update = [[p] for p in gen_probs]
    for char, prob in zip(src1, copy_probs):
        cur_idx = self.tgt_vocab.str2int(self.src1_vocab.int2str(char))
        if cur_idx == unk_idx:
            continue
        if isinstance(cur_idx, int):
            copy_probs_update[cur_idx].append(prob)
        else:
            for idx in cur_idx:
                copy_probs_update[idx].append(prob / len(cur_idx))
    sum_probs = dy.concatenate([dy.esum(exps) for exps in copy_probs_update])
    return sum_probs, p_gen.scalar_value()

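# A NumPy sketch (toy numbers, hypothetical vocab mapping) of the
# pointer-generator mixture computed above:
#   P(w) = p_gen * P_vocab(w) + (1 - p_gen) * sum of attention mass on the
#   source positions that copy w.
import numpy as np

p_gen = 0.7
p_vocab = np.array([0.5, 0.3, 0.2])  # generation probs over a 3-word vocab
attn = np.array([0.6, 0.4])          # attention over 2 source positions
src_to_tgt = [1, 1]                  # both source tokens map to target id 1
final = p_gen * p_vocab
for pos, tgt_id in enumerate(src_to_tgt):
    final[tgt_id] += (1 - p_gen) * attn[pos]
assert np.isclose(final.sum(), 1.0)  # still a valid distribution
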
def __call__(self, h, s):
    if self.h_bias:
        if len(h.dim()[0]) == 2:
            h = dy.concatenate([h, dy.inputTensor(np.ones((1, h.dim()[0][1]), dtype=np.float32))])
        else:
            h = dy.concatenate([h, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    if self.s_bias:
        if len(s.dim()[0]) == 2:
            s = dy.concatenate([s, dy.inputTensor(np.ones((1, s.dim()[0][1]), dtype=np.float32))])
        else:
            s = dy.concatenate([s, dy.inputTensor(np.ones((1,), dtype=np.float32))])
    lin = self.U * s
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = dy.transpose(h) * lin
    return blin

def __call__(self, inputs, init_vecs=None, dropout_x=0., dropout_h=0., train=False):
    batch_size = inputs[0].dim()[1]
    if not self.fb_fusion:
        if self.param_init:
            f, b = self.f.initial_state(self.f_init), self.b.initial_state(self.b_init)
        elif init_vecs:
            f, b = self.f.initial_state(init_vecs["fwd"]), self.b.initial_state(init_vecs["bwd"])
        else:
            f, b = self.f.initial_state(), self.b.initial_state()
        if train:
            self.f.set_dropouts(dropout_x, dropout_h)
            self.b.set_dropouts(dropout_x, dropout_h)
        else:
            self.f.set_dropouts(0., 0.)
            self.b.set_dropouts(0., 0.)
        self.f.set_dropout_masks(batch_size)
        self.b.set_dropout_masks(batch_size)
        f_in, b_in = inputs, list(reversed(inputs))
        f_out, b_out = f.add_inputs(f_in), b.add_inputs(b_in)
        f_last, b_last = f_out[-1].s(), b_out[-1].s()
        f_out = [state.h()[-1] for state in f_out]
        b_out = [state.h()[-1] for state in b_out]
        out = [dy.concatenate([fo, bo]) for fo, bo in zip(f_out, reversed(b_out))]
        last = [dy.concatenate([fl, bl]) for fl, bl in zip(f_last, b_last)]
        return (last, out)
    for f_lstm, b_lstm in self.DeepBiLSTM:
        f, b = f_lstm.initial_state(update=True), b_lstm.initial_state(update=True)
        if train:
            f_lstm.set_dropouts(dropout_x, dropout_h)
            b_lstm.set_dropouts(dropout_x, dropout_h)
        else:
            f_lstm.set_dropouts(0., 0.)
            b_lstm.set_dropouts(0., 0.)
        f_lstm.set_dropout_masks(batch_size)
        b_lstm.set_dropout_masks(batch_size)
        fs, bs = f.transduce(inputs), b.transduce(list(reversed(inputs)))
        inputs = [dy.concatenate([fo, bo]) for fo, bo in zip(fs, reversed(bs))]
    return inputs

def process_one_instance(instance, update=True, x_y_vectors=None, features=None, mode='train'):
    lemma_lookup = self.model_parameters['lemma_lookup']
    if self.opt['use_path']:
        pos_lookup = self.model_parameters['pos_lookup']
        dep_lookup = self.model_parameters['dep_lookup']
        dir_lookup = self.model_parameters['dir_lookup']
        # add the empty path
        paths = instance
        if len(paths) == 0:
            paths[EMPTY_PATH] = 1
        # compute the averaged path embedding
        num_paths = sum(instance.itervalues())
        path_embeddings = [
            self.get_path_embedding_from_cache(
                lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path, update, mode) * count
            for path, count in instance.iteritems()
        ]
        input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)
    # concatenate x and y embeddings
    if self.opt['use_xy_embeddings']:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        if self.opt['use_path']:
            input_vec = dy.concatenate([x_vector, input_vec, y_vector])
        else:
            input_vec = dy.concatenate([x_vector, y_vector])
    if self.opt['use_features']:
        for k in feat_dims:
            if 'diff' in k and not self.opt['use_freq_features']:
                continue
            feat = dy.lookup(self.model_parameters[k], features[k])
            input_vec = dy.concatenate([input_vec, feat])
    if self.opt['use_height_ebd']:
        if j in tree.term_height:
            h = tree.get_height(j) - 1
        else:
            h = 0
        height_vector = dy.lookup(self.model_parameters['height_lookup'], h)
        input_vec = dy.concatenate([input_vec, height_vector])
    return input_vec

def set_initial_states(self, x):
    self.xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
    if self.encoder_type == 'bow':
        self.W_enc = self.W * dy.average(self.xt_embs)
    elif self.encoder_type == 'attention':
        self.xb = dy.concatenate(
            [
                dy.esum(self.xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)]) / self.q
                for i in range(len(x))
            ],
            d=1,
        )
        self.xt = dy.transpose(dy.concatenate(self.xt_embs, d=1))

def attend(self, input_mat, state, w1dt, w2, v, coverage):
    w2dt = w2 * dy.concatenate(list(state.s()))
    if coverage:
        w1dt = w1dt + self.w_cov * dy.transpose(coverage)
    a_t = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    a_t = dy.softmax(a_t)
    return a_t, (input_mat * a_t)

def encode(self, embeds, fwd_lstm, bwd_lstm):
    embeds_rev = list(reversed(embeds))
    fwd_vectors = self.run_lstm(fwd_lstm.initial_state(), embeds)
    bwd_vectors = self.run_lstm(bwd_lstm.initial_state(), embeds_rev)
    bwd_vectors = list(reversed(bwd_vectors))
    vectors = [dy.concatenate(list(p)) for p in zip(fwd_vectors, bwd_vectors)]
    return vectors

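# A self-contained sketch of the same forward/backward concatenation pattern,
# using DyNet's LSTMBuilder and transduce() in place of the repo's run_lstm
# helper; all dimensions here are made up for illustration.
import dynet as dy

pc = dy.ParameterCollection()
fwd = dy.LSTMBuilder(1, 10, 16, pc)  # layers, input dim, hidden dim
bwd = dy.LSTMBuilder(1, 10, 16, pc)

dy.renew_cg()
embeds = [dy.random_normal((10,)) for _ in range(5)]  # dummy word embeddings
f_states = fwd.initial_state().transduce(embeds)
b_states = list(reversed(bwd.initial_state().transduce(list(reversed(embeds)))))
bi = [dy.concatenate([f, b]) for f, b in zip(f_states, b_states)]
assert bi[0].dim()[0] == (32,)  # 2 * hidden dim
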
def calc_compare(self, a_vecs, b_vecs, alphas, betas, dropout):
    # not batched at the moment
    l_a = a_vecs.dim()[1]
    l_b = b_vecs.dim()[1]
    v1_i = [
        self.compare.evaluate_network(
            dy.concatenate([dy.pick_batch_elem(a_vecs, i), betas[i]]), True, dropout)
        for i in range(l_a)
    ]
    v2_j = [
        self.compare.evaluate_network(
            dy.concatenate([dy.pick_batch_elem(b_vecs, j), alphas[j]]), True, dropout)
        for j in range(l_b)
    ]
    return v1_i, v2_j

def __call__(self, sentence, c2i, maxn_char, act, train=False):
    words_batch = []
    for token in sentence:
        chars_emb = [self.clookup[int(c2i.get(c, 0))] for c in token.chars]
        c2w = dy.concatenate_cols(chars_emb)
        c2w = dy.reshape(c2w, tuple(list(c2w.dim()[0]) + [1]))
        words_batch.append(c2w)
    words_batch = dy.concatenate_to_batch(words_batch)
    convds = [dy.conv2d(words_batch, W, stride=(1, 1), is_valid=True) for W in self.Ws]
    actds = [act(convd) for convd in convds]
    poolds = [
        dy.maxpooling2d(actd, ksize=(1, maxn_char - win_size + 1), stride=(1, 1))
        for win_size, actd in zip(self.win_sizes, actds)
    ]
    words_batch = [dy.reshape(poold, (poold.dim()[0][2],)) for poold in poolds]
    words_batch = dy.concatenate(words_batch)
    return [dy.pick_batch_elem(words_batch, idx) for idx, _ in enumerate(sentence)]

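# A toy shape check (assumed sizes, ReLU as the activation) of the conv +
# max-pool pattern above: characters run along the second spatial axis, so a
# width-3 filter over maxn_char=7 positions leaves 7-3+1=5 columns to pool.
import dynet as dy
import numpy as np

dy.renew_cg()
emb_dim, maxn_char, win, n_filters = 4, 7, 3, 2
x = dy.inputTensor(np.random.randn(emb_dim, maxn_char, 1))       # (d, chars, channels)
W = dy.inputTensor(np.random.randn(emb_dim, win, 1, n_filters))  # conv filters
conv = dy.conv2d(x, W, stride=(1, 1), is_valid=True)             # (1, 5, n_filters)
pooled = dy.maxpooling2d(dy.rectify(conv), ksize=(1, maxn_char - win + 1), stride=(1, 1))
word_vec = dy.reshape(pooled, (n_filters,))
assert word_vec.dim()[0] == (n_filters,)
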
def __call__(self, embeds, masks):
    # embeds: list(step) of {(n_emb,), batch_size}, padded for batching
    b_size = bs(embeds[0])
    outputs = [embeds]
    # todo(warn): disable masks for speed (although probably not critical)
    # masks = [None for _ in masks]
    for i, nn in zip(range(self.n_layers), self.nodes):
        init_hidden = dy.zeroes((self.n_hidden,), batch_size=b_size)
        # forward pass
        tmp_f = []
        tmp_f_prev = {"H": init_hidden, "C": init_hidden}
        for e, m in zip(outputs[-1], masks):
            one_output = nn[0](e, tmp_f_prev, m)
            tmp_f.append(one_output["H"])
            tmp_f_prev = one_output
        # backward pass
        tmp_b = []
        tmp_b_prev = {"H": init_hidden, "C": init_hidden}
        for e, m in zip(reversed(outputs[-1]), reversed(masks)):
            one_output = nn[1](e, tmp_b_prev, m)
            tmp_b.append(one_output["H"])
            tmp_b_prev = one_output
        # concatenate the two directions
        ctx = [dy.concatenate([f, b]) for f, b in zip(tmp_f, reversed(tmp_b))]
        outputs.append(ctx)
    return outputs[-1]

def get_path_embedding(builder, lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path, update=True, drop=0.0):
    """
    Get a vector representing a path.
    :param builder: the LSTM builder
    :param lemma_lookup: the lemma embeddings lookup table
    :param pos_lookup: the part-of-speech embeddings lookup table
    :param dep_lookup: the dependency label embeddings lookup table
    :param dir_lookup: the direction embeddings lookup table
    :param path: sequence of edges
    :param update: whether to update the lemma embeddings
    :param drop: word dropout rate
    :return: a vector representing a path
    """
    # concatenate the edge components into one vector per edge
    inputs = [
        dy.concatenate([
            word_dropout(lemma_lookup, edge[0], drop, update),
            word_dropout(pos_lookup, edge[1], drop),
            word_dropout(dep_lookup, edge[2], drop),
            word_dropout(dir_lookup, edge[3], drop),
        ])
        for edge in path
    ]
    return builder.initial_state().transduce(inputs)[-1]

def get_label_scores(self, lstm_outputs, left, right):
    '''
    Get label scores, fixing the score of the empty label at zero.
    '''
    non_empty_label_scores = self.f_label(self.get_span_encoding(lstm_outputs, left, right))
    return dy.concatenate([dy.zeros(1), non_empty_label_scores])

def get_span_encoding(self, lstm_outputs, left, right):
    '''
    Get the span representation from the differences of the LSTM outputs
    at the span boundaries, taking the forward and backward halves separately.
    '''
    forward = lstm_outputs[right + 1][:self.lstm_dim] - lstm_outputs[left][:self.lstm_dim]
    backward = lstm_outputs[left + 1][self.lstm_dim:] - lstm_outputs[right + 2][self.lstm_dim:]
    return dy.concatenate([forward, backward])

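# A NumPy sketch of the intuition behind these boundary differences (toy
# numbers; real LSTM states are not literal running sums, but behave
# analogously): with cumulative forward states f, f[right] - f[left]
# covers exactly the positions in [left, right).
import numpy as np

deltas = np.random.randn(6, 4)  # per-position increments
f = np.vstack([np.zeros((1, 4)), np.cumsum(deltas, axis=0)])
left, right = 2, 5
assert np.allclose(f[right] - f[left], deltas[left:right].sum(axis=0))
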
def get_top_k_paths(self, all_paths, relation_index, threshold):
    """Get the top k scoring paths."""
    builder = self.builder
    model = self.model
    model_parameters = self.model_parameters
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']
    path_scores = []
    for i, path in enumerate(all_paths):
        if i % 1000 == 0:
            # renew the computation graph periodically and reload the parameters
            cg = dy.renew_cg()
            W1 = dy.parameter(model_parameters['W1'])
            b1 = dy.parameter(model_parameters['b1'])
            W2 = None
            b2 = None
            if self.num_hidden_layers == 1:
                W2 = dy.parameter(model_parameters['W2'])
                b2 = dy.parameter(model_parameters['b2'])
        path_embedding = get_path_embedding(builder, lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path)
        if self.use_xy_embeddings:
            zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
            path_embedding = dy.concatenate([zero_word, path_embedding, zero_word])
        h = W1 * path_embedding + b1
        if self.num_hidden_layers == 1:
            h = W2 * dy.tanh(h) + b2
        path_score = dy.softmax(h).npvalue().T
        path_scores.append(path_score)
    path_scores = np.vstack(path_scores)
    top_paths = []
    for i in range(len(relation_index)):
        indices = np.argsort(-path_scores[:, i])
        top_paths.append([
            (all_paths[index], path_scores[index, i])
            for index in indices
            if threshold is None or path_scores[index, i] >= threshold
        ])
    return top_paths

def attend(self, encoded_inputs, h_t, input_masks=None):
    # encoded_inputs: seq_len x 2*h x batch_size; h_t: h x batch_size (for a bilstm encoder)
    if len(encoded_inputs) == 1:
        # no need to attend with a single input state; compute the output directly
        h_output = dn.tanh(self.w_c * dn.concatenate([h_t, encoded_inputs[0]]))
        # return trivial alphas (all 1's, since the one input gets all the attention)
        if input_masks:  # if batching
            alphas = dn.inputTensor([1] * len(input_masks[0]), batched=True)
        else:
            alphas = dn.inputTensor([1], batched=True)
        return h_output, alphas
    # iterate over the input states to compute attention scores
    # scores = [v_a * dn.tanh(w_a * h_t + u_a * h_input) for h_input in blstm_outputs]
    w_a_h_t = self.w_a * h_t
    scores = [
        self.v_a * dn.tanh(dn.affine_transform([w_a_h_t, self.u_a, h_input]))
        for h_input in encoded_inputs
    ]
    concatenated = dn.concatenate(scores)
    if input_masks:
        # if batching, multiply attention scores by the input masks to zero out scores for padded inputs
        dn_masks = dn.inputTensor(input_masks, batched=True)
        concatenated = dn.cmult(concatenated, dn_masks)
    # normalize the scores
    alphas = dn.softmax(concatenated)
    # compute the context vector as a weighted sum for each sequence in the batch
    bo = dn.concatenate_cols(encoded_inputs)
    c = bo * alphas
    # c = dn.esum([h_input * dn.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # compute the output vector from the current decoder state and the context vector
    h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c]))
    return h_output, alphas

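# A toy sketch (assumed dims, random stand-in parameters) of the additive
# attention scoring used above: score_i = v . tanh(W h_t + U h_i), followed by
# a softmax-weighted sum of the encoder states.
import dynet as dy
import numpy as np

dy.renew_cg()
H, He = 8, 6
h_t = dy.random_normal((H,))
enc = [dy.random_normal((He,)) for _ in range(4)]
W = dy.inputTensor(np.random.randn(5, H))
U = dy.inputTensor(np.random.randn(5, He))
v = dy.inputTensor(np.random.randn(1, 5))
wht = W * h_t  # precomputed once per decoder step
scores = dy.concatenate([v * dy.tanh(dy.affine_transform([wht, U, h])) for h in enc])
alphas = dy.softmax(scores)
context = dy.concatenate_cols(enc) * alphas  # weighted sum of encoder states
assert context.dim()[0] == (He,)
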
def build_network(params, x_data):
    _, E, b, U, W, bp = params
    if type(x_data) == dict:
        prefix_ordinals = x_data['prefix']
        suffix_ordinals = x_data['suffix']
        x_ordinals = x_data['fullwords']
    else:
        prefix_ordinals = None
        suffix_ordinals = None
        x_ordinals = x_data
    x = dy.concatenate([E[ordinal] for ordinal in x_ordinals])
    if prefix_ordinals:
        x_pre = dy.concatenate([E[ordinal] for ordinal in prefix_ordinals])
        x = x + x_pre
    if suffix_ordinals:
        x_suf = dy.concatenate([E[ordinal] for ordinal in suffix_ordinals])
        x = x + x_suf
    output = dy.softmax(U * dy.tanh(W * x + b) + bp)
    return output

def __call__(self, x_embs):
    x_len = len(x_embs)
    # BiGRU
    hf = dy.concatenate_cols(self.fGRUBuilder.initial_state().transduce(x_embs))
    hb = dy.concatenate_cols(self.bGRUBuilder.initial_state().transduce(x_embs[::-1])[::-1])
    h = dy.concatenate([hf, hb])
    # selective gate
    hb_1 = dy.pick(hb, index=0, dim=1)
    hf_n = dy.pick(hf, index=x_len - 1, dim=1)
    s = dy.concatenate([hb_1, hf_n])
    # selection
    sGate = dy.logistic(dy.colwise_add(self.Ws * h, self.Us * s + self.bs))
    hp = dy.cmult(h, sGate)
    return hp, hb_1

def __call__(self, h, s):
    # hT -> ((L, h_dim), B), s -> ((s_dim, L), B)
    hT = dy.transpose(h)
    lin = self.U * s  # ((h_dim*n_label, L), B)
    if self.n_label > 1:
        lin = dy.reshape(lin, (self.h_dim, self.n_label))
    blin = hT * lin
    if self.n_label == 1:
        return blin + (hT * self.B if self.bias else 0)
    return dy.transpose(blin) + (self.V * dy.concatenate([h, s]) + self.B if self.bias else 0)

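# A toy sketch (assumed dims, random stand-in for U) of the bilinear core of
# this scorer, with a single label and no bias terms: score = h^T U s, giving
# all pairwise head/dependent scores at once.
import dynet as dy
import numpy as np

dy.renew_cg()
h_dim, s_dim, L = 4, 3, 5
U = dy.inputTensor(np.random.randn(h_dim, s_dim))
h = dy.inputTensor(np.random.randn(h_dim, L))  # head representations
s = dy.inputTensor(np.random.randn(s_dim, L))  # dependent representations
scores = dy.transpose(h) * (U * s)             # (L, L) pairwise scores
assert scores.dim()[0] == (L, L)
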
def get_coverage(self, a_t, prev_coverage, training=True):
    if not self.coverage:
        if not training:
            return None
        return dy.scalarInput(0), None
    coverage = a_t + prev_coverage
    if training:
        return (
            dy.sum_elems(dy.min_dim(dy.concatenate([a_t, coverage], d=1), d=1)),
            coverage,
        )
    return coverage

def evaluate_network_from_sentence(self, sentence):
    char_coded_sentence = self.p3b.encode_sentence(sentence)
    char_lstm_vectors = self.p3b.construct_vector(char_coded_sentence)
    word_coded_sentence = self.encoder.encode_sentence_words(sentence)
    E = self.params["E"]
    word_embed_vectors = [E[w] for w, _ in word_coded_sentence]
    concat_vec = [
        dy.concatenate([e, c])
        for e, c in zip(word_embed_vectors, char_lstm_vectors)
    ]
    return self.common.evaluate_network_from_embs(concat_vec, False)

def _feed_one(self, s, inputs, caches, prev_embeds):
    # first layer, with attention
    next_caches = self.anode(s, caches["hid"][0]["H"], caches)
    g_input = dy.concatenate([inputs, next_caches["ctx"]])
    hidd = self.gnodes[0](g_input, caches["hid"][0])
    this_hiddens = [hidd]
    # later layers
    for i in range(1, self.n_layers):
        ihidd = self.gnodes[i](this_hiddens[i - 1]["H"], caches["hid"][i])
        this_hiddens.append(ihidd)
    # append and return
    next_caches["hid"] = this_hiddens
    return next_caches

def get_embeddings(self, word_inds, tag_inds, is_train=False, train_bert_embedding=None):
    if is_train:
        self.char_lstm.set_dropout(self.dropout)
    else:
        self.char_lstm.disable_dropout()
    embeddings = []
    for idx, (w, t) in enumerate(zip(word_inds, tag_inds)):
        if w > 2:
            count = self.vocab.word_freq_list[w]
            if not count or (is_train and np.random.rand() < self.unk_param / (self.unk_param + count)):
                w = 0
        tag_embedding = self.tag_embeddings[t]
        chars = list(self.vocab.i2w[w]) if w > 2 else [self.vocab.i2w[w]]
        char_lstm_outputs = self.char_lstm.transduce([
            self.char_embeddings[self.vocab.c2i[char]]
            for char in [Vocabulary.START] + chars + [Vocabulary.STOP]
        ])
        char_embedding = dy.concatenate([
            char_lstm_outputs[-1][:self.char_lstm_dim],
            char_lstm_outputs[0][self.char_lstm_dim:],
        ])
        word_embedding = self.word_embeddings[w]
        embs = [tag_embedding, char_embedding, word_embedding]
        if train_bert_embedding is not None:
            if w != 0:
                embs.append(dy.inputTensor(train_bert_embedding[idx]))
            else:
                embs.append(dy.zeros(768))
        embeddings.append(dy.concatenate(embs))
    return embeddings

def decode_loss(self, src1, src2, tgt):
    src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(src1, src2)
    _, prev_coverage = self.get_coverage(
        a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1))
    )
    loss = []
    cov_loss = []
    diag_loss = []
    embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
    last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]
    for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
        a_t, c1_t = self.attend(
            src1_mat, decoder_state, src1_w1dt, self.att1_w2, self.att1_v, prev_coverage
        )
        if not self.single_source:
            _, c2_t = self.attend(
                src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None
            )
        else:
            c2_t = dy.vecInput(2 * HIDDEN_DIM)
        x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
        decoder_state = decoder_state.add_input(x_t)
        out_vector = self.dec_w * decoder_state.output() + self.dec_b
        probs = dy.softmax(out_vector)
        probs, _ = self.get_pointergen_probs(c1_t, decoder_state, x_t, a_t, probs, src1)
        loss.append(-dy.log(dy.pick(probs, char)))
        cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
        cov_loss.append(cov_loss_cur)
        diag_loss.append(self.get_diag_loss(a_t, t))
        last_output_embeddings = embedded_char
    loss = dy.esum(loss)
    cov_loss = dy.esum(cov_loss)
    diag_loss = dy.esum(diag_loss)
    return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss

def encoder_forward(self, src1, src2):
    embedded_src1 = self.embed_idx(src1, self.src1_lookup)
    if self.single_source:
        embedded_src2 = [dy.vecInput(EMBEDDING_DIM) for _ in src2]
    else:
        embedded_src2 = self.embed_idx(src2, self.src2_lookup)
    encoded_src1 = self.encode(embedded_src1, self.enc1_fwd_lstm, self.enc1_bwd_lstm)
    encoded_src2 = self.encode(embedded_src2, self.enc2_fwd_lstm, self.enc2_bwd_lstm)
    src1_mat = dy.concatenate_cols(encoded_src1)
    src1_w1dt = self.att1_w1 * src1_mat
    src2_mat = dy.concatenate_cols(encoded_src2)
    src2_w1dt = self.att2_w1 * src2_mat
    if not self.single_source:
        start = self.W_s * dy.concatenate([encoded_src1[-1], encoded_src2[-1]]) + self.b_s
    else:
        start = self.W_s * dy.concatenate([encoded_src1[-1], dy.vecInput(2 * HIDDEN_DIM)]) + self.b_s
    last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]
    c1_t = dy.vecInput(2 * HIDDEN_DIM)
    c2_t = dy.vecInput(2 * HIDDEN_DIM)
    decoder_state = self.dec_lstm.initial_state([start, dy.tanh(start)]).add_input(
        dy.concatenate([c1_t, c2_t, last_output_embeddings])
    )
    return src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state

def get_c_repr(self, word, train=False):
    """Get the representation of a word via the character sub-LSTMs."""
    # get the character ids for the word (cached if seen before)
    if word in self.w2c_cache:
        chars_of_token = self.w2c_cache[word]
        if train:
            chars_of_token = [drop(c, self.ccount, self.c_dropout_rate) for c in chars_of_token]
    else:
        chars_of_token = (
            array.array('I', [self.c2i[WORD_START]])
            + array.array('I', [self.get_c_idx(c, train=train) for c in word])
            + array.array('I', [self.c2i[WORD_END]])
        )
    char_feats = [self.cembeds[c_id] for c_id in chars_of_token]
    # use the last state of each direction as the word representation
    f_char, b_char = self.char_rnn.predict_sequence(char_feats, char_feats)
    return dynet.concatenate([f_char[-1], b_char[-1]])

def predict(self, seq, train=False, output_confidences=False, unk_tag=None, update_embeds=True):
    """
    Predict tags for a sentence represented as char+word embeddings and
    compute the losses for this instance.
    """
    if not train:
        dynet.renew_cg()
    features = self.get_features(seq.words, train=train, update=update_embeds)
    output_expected_at_layer = self.predictors["task_expected_at"][seq.task_id] - 1
    # go through the layers; the input is the combination of word + char embeddings
    prev = features
    prev_rev = features
    num_layers = self.h_layers
    for i in range(num_layers):
        predictor = self.predictors["inner"][i]
        forward_sequence, backward_sequence = predictor.predict_sequence(prev, prev_rev)
        if i > 0 and self.activation:
            # activation between LSTM layers
            forward_sequence = [self.activation(s) for s in forward_sequence]
            backward_sequence = [self.activation(s) for s in backward_sequence]
        if i == output_expected_at_layer:
            output_predictor = self.predictors["output_layers_dict"][seq.task_id]
            concat_layer = [
                dynet.concatenate([f, b])
                for f, b in zip(forward_sequence, reversed(backward_sequence))
            ]
            if train and self.noise_sigma > 0.0:
                concat_layer = [dynet.noise(fe, self.noise_sigma) for fe in concat_layer]
            # fill in the predictions and get the loss per tag
            losses = output_predictor.predict_sequence(
                seq, concat_layer,
                train=train, output_confidences=output_confidences, unk_tag=unk_tag,
                dictionary=self.dictionary, type_constraint=self.type_constraint,
            )
        prev = forward_sequence
        prev_rev = backward_sequence
    if train:
        return losses
    return seq.pred_tags, seq.tag_confidences

def viterbi(self, observations, unk_tag=None, dictionary=None):
    # if dictionary:
    #     raise NotImplementedError("type constraints not yet implemented for CRF")
    backpointers = []
    init_vvars = [-1e10] * self.num_tags
    init_vvars[START_TAG] = 0  # <Start> has all the probability
    for_expr = dynet.inputVector(init_vvars)
    trans_exprs = [self.trans_mat[idx] for idx in range(self.num_tags)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.num_tags):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            if unk_tag:
                best_tag = self.index2tag[best_tag_id]
                if best_tag == unk_tag:
                    next_tag_arr[np.argmax(next_tag_arr)] = 0  # zero out the best
                    best_tag_id = np.argmax(next_tag_arr)      # take the second best
            bptrs_t.append(best_tag_id)
            vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
        for_expr = dynet.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    # perform the final transition to the terminal tag
    terminal_expr = for_expr + trans_exprs[END_TAG]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dynet.pick(terminal_expr, best_tag_id)
    # follow the backpointers to recover the best path
    best_path = [best_tag_id]  # start with the tag that was best for terminal
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()  # remove the start symbol
    best_path.reverse()
    assert start == START_TAG
    # return the best path and its score
    return best_path, path_score

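# A compact NumPy sketch (toy scores, assumed shapes) of the same Viterbi
# recursion and backpointer walk, without the DyNet expressions.
import numpy as np

def viterbi_np(emissions, trans, start, end):
    # emissions: (T, K) scores; trans[j, i]: score of moving from tag i to tag j
    K = trans.shape[0]
    v = np.full(K, -1e10)
    v[start] = 0
    backptrs = []
    for obs in emissions:
        scores = v[None, :] + trans          # (K next, K prev)
        backptrs.append(scores.argmax(axis=1))
        v = scores.max(axis=1) + obs
    terminal = v + trans[end]
    best = int(terminal.argmax())
    path = [best]
    for bp in reversed(backptrs):
        best = int(bp[best])
        path.append(best)
    assert path.pop() == start               # drop the start symbol
    return list(reversed(path)), terminal.max()

T, K = 3, 4
path, score = viterbi_np(np.random.randn(T, K), np.random.randn(K, K), start=0, end=1)
assert len(path) == T
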