def score_sentence(self, score_vecs, tags):
    assert len(score_vecs) == len(tags)
    tags.insert(0, START_TAG)  # add start (note: mutates the caller's list)
    total = dynet.scalarInput(.0)
    for i, obs in enumerate(score_vecs):
        # transition to next from i and emission
        next_tag = tags[i + 1]
        total += dynet.pick(self.trans_mat[next_tag], tags[i]) + dynet.pick(obs, next_tag)
    total += dynet.pick(self.trans_mat[END_TAG], tags[-1])
    return total
def forward(self, observations):
    # calculate forward pass
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dynet.pick(scores, argmax_score)
        max_score_expr_broadcast = dynet.concatenate([max_score_expr] * self.num_tags)
        return max_score_expr + dynet.logsumexp_dim(scores - max_score_expr_broadcast, 0)

    init_alphas = [-1e10] * self.num_tags
    init_alphas[START_TAG] = 0
    for_expr = dynet.inputVector(init_alphas)
    for obs in observations:
        alphas_t = []
        for next_tag in range(self.num_tags):
            obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] * self.num_tags)
            next_tag_expr = for_expr + self.trans_mat[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dynet.concatenate(alphas_t)
    terminal_expr = for_expr + self.trans_mat[END_TAG]
    alpha = log_sum_exp(terminal_expr)
    return alpha
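# A minimal sketch (hypothetical wrapper name) of how score_sentence() and
# forward() above combine into the CRF training objective: the negative
# log-likelihood of the gold path is the log partition function minus the
# gold-path score. list(tags) guards against score_sentence() mutating the
# caller's tag list via insert().
def neg_log_loss(self, score_vecs, tags):
    return self.forward(score_vecs) - self.score_sentence(score_vecs, list(tags))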
def _get_loss(self, input, targets, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    log_out = dy.log(layers[-1] + epsilon)
    loss = dy.zeros(1)
    for t in targets:
        loss += dy.pick(log_out, t)
    # negative sampling: draw one index that is not a gold target
    r = np.random.randint(self.dim_out)
    while r in targets:
        r = np.random.randint(self.dim_out)
    loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
    #loss -= dy.pick(log_out, r)
    return -loss
def pick_neg_log(self, pred, gold):
    # TODO make this a static function in both classes
    if not isinstance(gold, (int, np.int64)):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def log_sum_exp(scores):
    # numerically stable log-sum-exp over the tag dimension;
    # note: written as a nested helper, so `self.num_tags` must be in scope
    npval = scores.npvalue()
    argmax_score = np.argmax(npval)
    max_score_expr = dynet.pick(scores, argmax_score)
    max_score_expr_broadcast = dynet.concatenate([max_score_expr] * self.num_tags)
    return max_score_expr + dynet.logsumexp_dim(scores - max_score_expr_broadcast, 0)
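# A standalone NumPy check (hypothetical, illustration only) of the
# max-shifted identity the helper above relies on:
# log(sum(exp(s))) == max(s) + log(sum(exp(s - max(s)))).
import numpy as np

def log_sum_exp_np(scores):
    m = scores.max()
    return m + np.log(np.exp(scores - m).sum())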
def _get_loss_and_prediction(self, input, targets, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    output = layers[-1].value()
    # predicted label indices (not probability values) above the 0.5 threshold
    res = {i for i, p in enumerate(output) if p > 0.5}
    log_out = dy.log(layers[-1] + epsilon)
    loss = dy.zeros(1)
    for t in targets:
        loss += dy.pick(log_out, t)
    # negative sampling: draw one index that is not a gold target
    r = np.random.randint(self.dim_out)
    while r in targets:
        r = np.random.randint(self.dim_out)
    loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
    #loss -= dy.pick(log_out, r)
    return -loss, res
def compute_loss_multilabel(self, task, seq, multi_y):
    """Computes the loss for multi-label instances by averaging the
    negative log probabilities of all correct labels."""
    out_probs = self(task, seq)
    losses = []
    for y in multi_y:
        assigned_prob = dn.pick(out_probs, y)
        losses.append(-dn.log(assigned_prob) / len(multi_y))
    return dn.esum(losses)
def viterbi(self, observations, unk_tag=None, dictionary=None):
    #if dictionary:
    #    raise NotImplementedError("type constraints not yet implemented for CRF")
    backpointers = []
    init_vvars = [-1e10] * self.num_tags
    init_vvars[START_TAG] = 0  # <Start> has all the probability
    for_expr = dynet.inputVector(init_vvars)
    trans_exprs = [self.trans_mat[idx] for idx in range(self.num_tags)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.num_tags):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            if unk_tag:
                best_tag = self.index2tag[best_tag_id]
                if best_tag == unk_tag:
                    next_tag_arr[np.argmax(next_tag_arr)] = 0  # set to 0
                    best_tag_id = np.argmax(next_tag_arr)  # get second best
            bptrs_t.append(best_tag_id)
            vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
        for_expr = dynet.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    # Perform final transition to terminal
    terminal_expr = for_expr + trans_exprs[END_TAG]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dynet.pick(terminal_expr, best_tag_id)
    # Reverse over the backpointers to get the best path
    best_path = [best_tag_id]  # Start with the tag that was best for terminal
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()  # Remove the start symbol
    best_path.reverse()
    assert start == START_TAG
    # Return best path and best path's score
    return best_path, path_score
def __call__(self, x_embs):
    x_len = len(x_embs)
    # BiGRU
    hf = dy.concatenate_cols(self.fGRUBuilder.initial_state().transduce(x_embs))
    hb = dy.concatenate_cols(self.bGRUBuilder.initial_state().transduce(x_embs[::-1])[::-1])
    h = dy.concatenate([hf, hb])
    # Selective gate, built from the first backward state and the last forward state
    hb_1 = dy.pick(hb, index=0, dim=1)
    hf_n = dy.pick(hf, index=x_len - 1, dim=1)
    s = dy.concatenate([hb_1, hf_n])
    # Selection
    sGate = dy.logistic(dy.colwise_add(self.Ws * h, self.Us * s + self.bs))
    hp = dy.cmult(h, sGate)
    return hp, hb_1
def decode_loss(self, src1, src2, tgt):
    src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(src1, src2)
    _, prev_coverage = self.get_coverage(
        a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1)))
    loss = []
    cov_loss = []
    diag_loss = []
    embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
    last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]
    for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
        a_t, c1_t = self.attend(
            src1_mat, decoder_state, src1_w1dt, self.att1_w2, self.att1_v, prev_coverage)
        if not self.single_source:
            _, c2_t = self.attend(
                src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None)
        else:
            c2_t = dy.vecInput(2 * HIDDEN_DIM)
        x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
        decoder_state = decoder_state.add_input(x_t)
        out_vector = self.dec_w * decoder_state.output() + self.dec_b
        probs = dy.softmax(out_vector)
        probs, _ = self.get_pointergen_probs(c1_t, decoder_state, x_t, a_t, probs, src1)
        loss.append(-dy.log(dy.pick(probs, char)))
        cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
        cov_loss.append(cov_loss_cur)
        diag_loss.append(self.get_diag_loss(a_t, t))
        last_output_embeddings = embedded_char
    loss = dy.esum(loss)
    cov_loss = dy.esum(cov_loss)
    diag_loss = dy.esum(diag_loss)
    return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
def pick_neg_log(self, pred, gold):
    if hasattr(gold, "__len__"):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def pick_neg_log(self, pred, gold):
    if not isinstance(gold, int):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def do_one_sentence(encoder, decoder, params_encoder, params_decoder,
                    sentence, output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    # encode each world state with the character encoder
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [vocab.index(c) if c in vocab else vocab.index('<unknown>')
                for c in sentence.split(' ')]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                            for x in sc_vector]))
        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
def get_loss_and_prediction(self, input, target, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    return (-dy.log(dy.pick(layers[-1], target) + epsilon),
            np.argmax(layers[-1].value()))
def get_loss(self, input, target, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    return -dy.log(dy.pick(layers[-1], target) + epsilon)
def pick_neg_log(pred, gold):
    return -dynet.log(dynet.pick(pred, gold))
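# Hedged usage sketch: pick() indexes one element of an expression, so
# pick_neg_log() is the cross-entropy of the gold class against a
# probability vector. The values below are made up for illustration.
import dynet
dynet.renew_cg()
probs = dynet.softmax(dynet.inputVector([0.1, 2.0, -1.3]))
loss = pick_neg_log(probs, 1)  # -log p(class 1)
print(loss.value())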
def __call__(self, x, tm1s=None, test=False):
    if test:
        # Initial states
        s_tm1 = tm1s[0]
        c_tm1 = tm1s[1]
        w_tm1 = x
        # GRU
        s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
            dy.concatenate([w_tm1, c_tm1])).output()
        # Attention
        e_t = dy.pick(
            self.va * dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
        a_t = dy.softmax(e_t)
        c_t = dy.esum([dy.cmult(a_t_i, h_i)
                       for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))])
        #c_t = self.hp*a_t  # memory error?
        # Output
        r_t = dy.concatenate_cols([
            Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
            for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)])
        # Maxout
        m_t = dy.max_dim(r_t, d=1)
        y_t = dy.softmax(self.Wo * m_t)
        return s_t, c_t, y_t
    else:
        w_embs = x
        # Initial states
        s_tm1 = self.s_0
        c_tm1 = self.c_0
        GRU = self.GRUBuilder.initial_state().set_s([s_tm1])
        y = []
        for w_tm1 in w_embs:
            # GRU
            GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
            s_t = GRU.output()
            # Attention
            e_t = dy.pick(
                self.va * dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([dy.cmult(a_t_i, h_i)
                           for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))])
            #c_t = self.hp*a_t  # memory error?
            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)])
            # Maxout
            m_t = dy.max_dim(r_t, d=1)
            y_t = self.Wo * m_t
            y.append(y_t)
            # t -> tm1
            s_tm1 = s_t
            c_tm1 = c_t
        return y
def train(builder, model, model_parameters, X_train, y_train, nepochs,
          alpha=0.01, update=True, dropout=0.0, x_y_vectors=None,
          num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000
    for epoch in range(nepochs):
        total_loss = 0.0
        epoch_indices = np.random.permutation(len(y_train))
        for minibatch in range(nminibatches):
            path_cache = {}
            batch_indices = epoch_indices[minibatch * minibatch_size:
                                          (minibatch + 1) * minibatch_size]
            dy.renew_cg()
            loss = dy.esum([
                -dy.log(dy.pick(
                    process_one_instance(
                        builder, model, model_parameters,
                        X_train[batch_indices[i]], path_cache, update, dropout,
                        x_y_vectors=x_y_vectors[batch_indices[i]]
                        if x_y_vectors is not None else None,
                        num_hidden_layers=num_hidden_layers),
                    y_train[batch_indices[i]]))
                for i in range(minibatch_size)])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()
        # trainer.update_epoch() is deprecated and requires an argument
        # (would be epoch, i guess...):
        # http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers
        total_loss /= len(y_train)
        print('Epoch', epoch + 1, '/', nepochs, 'Loss =', total_loss)
        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break
        previous_loss = total_loss