def train_item(args, model, sentence):
    seq = [model.wlookup[int(model.w2i.get(entry, 0))]
           for entry in sentence.preprocessed_sentence]
    if len(seq) > 0:
        encoded_sequence = encode_sequence(model, seq, model.sentence_rnn)
        last_output = encoded_sequence[-1]
        global_max = max_pooling(encoded_sequence)
        global_avg = average_pooling(encoded_sequence)
        context = dy.concatenate([last_output, global_max, global_avg])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
        if sentence.permissions[args.permission_type]:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
        else:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
        loss_val = loss.scalar_value()
        loss.backward()
        model.trainer.update()
        dy.renew_cg()
        return loss_val
    return 0

def scorer(self, q_d_hists, q_idf, bm25_score, overlap_features, p):
    """
    Makes all the calculations and returns a relevance score.
    """
    idf_vec = dy.inputVector(q_idf)
    bm25_score = dy.scalarInput(bm25_score)
    overlap_features = dy.inputVector(overlap_features)

    # Pass each query term representation through the MLP
    term_scores = []
    for hist in q_d_hists:
        q_d_hist = dy.reshape(dy.inputVector(hist), (1, len(hist)))
        hidd_out = dy.rectify(q_d_hist * self.W_1 + self.b_1)
        for i in range(0, self.mlp_layers):
            hidd_out = dy.rectify(hidd_out * self.W_n[i] + self.b_n[i])
        term_scores.append(hidd_out * self.W_last + self.b_last)

    # Term gating
    gating_weights = idf_vec * self.w_g

    # Inverted dropout on the BM25 feature; p is the probability of keeping a unit active
    bm25_feature = bm25_score * self.W_bm25 + self.b_bm25
    drop_out = dy.scalarInput(1)
    drop_num = float(np.random.rand() < p) / p
    drop_out.set(drop_num)
    bm25_feature *= drop_out

    drmm_score = dy.transpose(dy.concatenate(term_scores)) * \
        dy.reshape(gating_weights, (len(q_idf), 1))  # basic MLP output
    doc_score = dy.transpose(dy.concatenate([drmm_score, overlap_features])) * \
        self.W_scores + self.b_scores  # extra features layer
    return doc_score

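# A small, self-contained sketch of the inverted-dropout scaling applied to the
# BM25 feature above (p is the keep probability; the values are illustrative,
# not from the original model):
import numpy as np
import dynet as dy

dy.renew_cg()
p = 0.8
feature = dy.scalarInput(2.5)
keep = float(np.random.rand() < p) / p  # 1/p with probability p, else 0
dropped = feature * dy.scalarInput(keep)
print(dropped.value())  # either 0.0 or 2.5 / 0.8 = 3.125
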
def __train(model, data):
    tagged_loss = 0
    untagged_loss = 0
    for index, sentence_report in enumerate(data):
        for phrase in sentence_report.all_phrases:
            encoded_phrase = __encode_sequence(model, phrase)
            if model.options.external_info != "no_info":
                encoded_phrase = dy.concatenate(
                    [encoded_phrase, model.doclookup[sentence_report.app_id]])
            y_pred = dy.logistic((model.mlp_w * encoded_phrase) + model.mlp_b)
            if sentence_report.mark:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
                tagged_loss += loss.scalar_value() / (index + 1)
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
                untagged_loss += loss.scalar_value() / (index + 1)
            loss.backward()
            model.trainer.update()
            dy.renew_cg()

def perceptron_loss(scores, reference):
    if use_cost_augmented:
        predictions = hamming_augmented_decode(scores, reference)
    else:
        predictions = [np.argmax(score.npvalue()) for score in scores]

    margin = dy.scalarInput(-2)
    if predictions != reference:
        reference_score = calc_sequence_score(scores, reference)
        prediction_score = calc_sequence_score(scores, predictions)
        if use_cost_augmented:
            # One could actually get the hamming-augmented value during decoding,
            # but we didn't do that here, for demonstration purposes.
            hamming = dy.scalarInput(hamming_cost(predictions, reference))
            loss = prediction_score + hamming - reference_score
        else:
            loss = prediction_score - reference_score
        if use_hinge:
            loss = dy.emax([dy.scalarInput(0), loss - margin])
        return loss
    else:
        return dy.scalarInput(0)

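# A minimal sketch of the hinge pattern used above, max(0, s_wrong - s_gold +
# margin), built from scalar expressions with dy.emax (scores and margin are
# invented for illustration):
import dynet as dy

dy.renew_cg()
gold_score = dy.scalarInput(1.2)
pred_score = dy.scalarInput(1.7)
margin = dy.scalarInput(1.0)
hinge = dy.emax([dy.scalarInput(0.0), pred_score - gold_score + margin])
print(hinge.scalar_value())  # 1.5
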
def __train(self, data):
    def encode_sequence(seq):
        rnn_forward = self.phrase_rnn[0].initial_state()
        for entry in seq:
            vec = self.wlookup[int(self.w2i.get(entry, 0))]
            rnn_forward = rnn_forward.add_input(vec)
        return rnn_forward.output()

    tagged_loss = 0
    untagged_loss = 0
    for index, sentence_report in enumerate(data):
        for phrase in sentence_report.all_phrases:
            encoded_phrase = encode_sequence(phrase)
            y_pred = dy.logistic((self.mlp_w * encoded_phrase) + self.mlp_b)
            if sentence_report.mark:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
            else:
                loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
            if index % 1000 == 0:
                print("Description : {}".format(index + 1))
                print("Marked {} Prediction Result {} : ".format(
                    sentence_report.mark, y_pred.scalar_value()))
                print("Tagged loss {} Untagged Loss {} Total loss {}".format(
                    tagged_loss, untagged_loss, tagged_loss + untagged_loss))
            if sentence_report.mark:
                tagged_loss += loss.scalar_value() / (index + 1)
            else:
                untagged_loss += loss.scalar_value() / (index + 1)
            loss.backward()
            self.trainer.update()
            dy.renew_cg()

def learn(self, characters, target_mgc, guided_att=True):
    num_mgc = target_mgc.shape[0]
    dy.renew_cg()
    output_mgc, output_stop, output_attention = self._predict(characters, target_mgc)
    losses = []
    index = 0
    for mgc, real_mgc in zip(output_mgc, target_mgc):
        t_mgc = dy.inputVector(real_mgc)
        # losses.append(self._compute_binary_divergence(mgc, t_mgc))
        losses.append(dy.l1_distance(mgc, t_mgc))

        if index % 3 == 0:
            # attention loss (integer division keeps the indices valid)
            if guided_att:
                att = output_attention[index // 3]
                losses.append(self._compute_guided_attention(
                    att, index // 3, len(characters) + 2, num_mgc // 3))
            # EOS loss
            stop = output_stop[index // 3]
            if index >= num_mgc - 6:
                losses.append(dy.l1_distance(stop, dy.scalarInput(-0.8)))
            else:
                losses.append(dy.l1_distance(stop, dy.scalarInput(0.8)))
        index += 1

    loss = dy.esum(losses)
    loss_val = loss.value() / num_mgc
    loss.backward()
    self.trainer.update()
    return loss_val

def beam_search(self, char_seq, truth=None, mu=0.):
    start_agenda = Agenda(self.options['beam_size'])
    init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
    init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
    init_score = dy.scalarInput(0.)
    start_agenda.push(Sentence(score=init_score.scalar_value(), score_expr=init_score,
                               LSTMState=init_state, y=init_y, prevState=None, wlen=None))
    agenda = [start_agenda]

    for idx, _ in enumerate(char_seq, 1):  # from left to right, character by character
        now = Agenda(self.options['beam_size'])
        for wlen in xrange(1, min(idx, self.options['max_word_len']) + 1):
            # generate candidate word vectors
            word = self.word_repr(char_seq[idx - wlen:idx])
            word_score = dy.dot_product(word, self.param_exprs['U'])
            for sent in agenda[idx - wlen]:  # join segmentation
                if truth is not None:
                    margin = dy.scalarInput(mu * wlen if truth[idx - 1] != wlen else 0.)
                    score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score
                else:
                    score = sent.score_expr + dy.dot_product(sent.y, word) + word_score
                if now.happy_with(score.scalar_value()):
                    new_state = sent.LSTMState.add_input(word)
                    new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() +
                                    self.param_exprs['pb'])
                    now.push(Sentence(score=score.scalar_value(), score_expr=score,
                                      LSTMState=new_state, y=new_y, prevState=sent, wlen=wlen))
        agenda.append(now)

    if truth is not None:
        return agenda[-1].max().score_expr
    return agenda

def beam_train_max_margin_with_answer_guidence(self, init_state, gold_ans):
    # Perform two beam searches: one for the prediction and one for the
    # state-action sequence, using the gold answer as guidance.
    # max reward: y = argmax r(y), found with the help of gold_ans
    # max violation: y' = argmax f(x, y') - R(y')
    # loss = max(f(x, y') - f(x, y) + R(y) - R(y'), 0)

    # end_state_list = self.beam_predict(init_state)
    end_state_list = self.beam_predict_max_violation(
        init_state, gold_ans)  # have to use this to make it work
    reward_list = [x.reward(gold_ans) for x in end_state_list]
    violation_list = [s.path_score_expression.value() - reward
                      for s, reward in zip(end_state_list, reward_list)]
    # find the best scoring sequence with minimal reward
    best_score_state_idx = violation_list.index(max(violation_list))
    best_score_state = end_state_list[best_score_state_idx]
    best_score_state_reward = reward_list[best_score_state_idx]

    loss_value = 0
    if self.only_one_best:
        best_states = self.beam_find_actions_with_answer_guidence(init_state, gold_ans)
        if best_states == []:
            return 0, []
        best_reward_state = best_states[0]
        best_reward_state_reward = best_reward_state.reward(gold_ans)
        loss = dt.rectify(best_score_state.path_score_expression -
                          best_reward_state.path_score_expression +
                          dt.scalarInput(best_reward_state_reward - best_score_state_reward))
    else:
        best_states = self.beam_find_actions_with_answer_guidence(init_state, gold_ans)
        best_states_rewards = [s.reward(gold_ans) for s in best_states]
        max_reward = max(best_states_rewards)
        best_states = [s for s, r in zip(best_states, best_states_rewards) if r == max_reward]
        loss = dt.average([
            dt.rectify(best_score_state.path_score_expression -
                       best_reward_state.path_score_expression +
                       dt.scalarInput(max_reward - best_score_state_reward))
            for best_reward_state in best_states])

    loss_value = loss.value()
    loss.backward()
    self.neural_model.learner.update()
    return loss_value, best_states

def training_session(self, sentence, print_logger, pool):
    lstm_output = self.network.get_lstm_output(sentence)
    length = len(sentence)
    raw_exprs = self.network.edge_eval.get_complete_raw_exprs(lstm_output)
    yield raw_exprs

    scores = self.network.edge_eval.raw_exprs_to_scores(raw_exprs, length)
    exprs = self.network.edge_eval.raw_exprs_to_exprs(raw_exprs, length)
    gold = [entry.parent_id for entry in sentence]
    heads_future = pool.apply_async(
        self.decoder, (scores, gold if self.options.cost_augment else None))
    yield None

    heads = heads_future.get()
    if self.labelsFlag:
        edges = [(head, sentence[modifier].relation, modifier)
                 for modifier, head in enumerate(gold[1:], 1)]
        label_exprs = list(self.network.get_label_scores(lstm_output, edges))
        yield label_exprs
        label_loss = self.label_decoder(edges, label_exprs, self.statistics.labels, True)
    else:
        label_loss = dn.scalarInput(0.0)
        yield []

    head_exprs = [(exprs[h][i] - exprs[g][i] + 1)
                  for i, (h, g) in enumerate(zip(heads, gold)) if h != g]
    print_logger.correct_edge += len(sentence) - len(head_exprs)
    print_logger.total_edge += len(sentence)
    head_loss = dn.esum(head_exprs) if head_exprs else dn.scalarInput(0.0)
    yield label_loss + head_loss

def train(self, indices, gold_arcs, gold_labels, pos_indices=None):
    total_arc_loss = 0
    total_label_loss = 0
    start = time.time()
    for i in range(len(indices)):
        states = self.states(indices[i],
                             pos_indices[i] if pos_indices is not None else None)
        arc_scores = self.score_arcs(states, value=False)
        label_scores = self.score_labels(states, gold_arcs[i], value=False)
        arc_loss = self.arc_loss(gold_arcs[i], arc_scores)
        label_loss = self.label_loss(gold_labels[i], label_scores)

        arc_loss = dynet.esum(arc_loss) if len(arc_loss) > 0 else dynet.scalarInput(0)
        label_loss = dynet.esum(label_loss) if len(label_loss) > 0 else dynet.scalarInput(0)
        loss = dynet.esum([arc_loss, label_loss])

        total_arc_loss += arc_loss.value()
        total_label_loss += label_loss.value()
        loss.backward()
        self.trainer.update()
        dynet.renew_cg()
    print(time.time() - start)
    return total_arc_loss, total_label_loss

def copy_src_probs_pick(token_type, token_literal):
    if token_type not in copy_atts:
        return dy.scalarInput(0.0)
    selected_indexes = copy_history[token_type][token_literal]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0)
    probs = copy_src_probs(token_type)
    return dy.sum_elems(dy.select_rows(probs, selected_indexes))

def decomp_attend(self, vecsA, vecsB):
    # Fq^T Fc -> should be sped up with native matrix/tensor multiplication
    Fq = vecsA  # the original word vectors; Eq. 1 would first pass them through a function F
    Fc = vecsB  # likewise needs a function F

    expE = []
    for fq in Fq:
        row = []
        for fc in Fc:
            row.append(dt.exp(dt.dot_product(fq, fc)))
        expE.append(row)

    invSumExpEi = []
    for i in xrange(len(Fq)):
        invSumExpEi.append(dt.pow(dt.esum(expE[i]), dt.scalarInput(-1)))

    invSumExpEj = []
    for j in xrange(len(Fc)):
        invSumExpEj.append(dt.pow(dt.esum([expE[i][j] for i in xrange(len(Fq))]),
                                  dt.scalarInput(-1)))

    # soft-align each sentence against the other
    beta = []
    for i in xrange(len(Fq)):
        s = dt.esum([Fc[j] * expE[i][j] for j in xrange(len(Fc))])
        beta.append(s * invSumExpEi[i])

    alpha = []
    for j in xrange(len(Fc)):
        # the attended vectors here must come from Fq; summing Fc[j] * expE[i][j]
        # over i would make alpha[j] collapse to Fc[j]
        s = dt.esum([Fq[i] * expE[i][j] for i in xrange(len(Fq))])
        alpha.append(s * invSumExpEj[j])

    # Compare
    v1i = [dt.logistic(dt.concatenate([Fq[i], beta[i]]))
           for i in xrange(len(Fq))]  # needs a function G
    v2j = [dt.logistic(dt.concatenate([Fc[j], alpha[j]]))
           for j in xrange(len(Fc))]  # needs a function G

    # Aggregate
    v1 = dt.esum(v1i)
    v2 = dt.esum(v2j)
    return dt.dot_product(v1, v2)

def other_loss_function(self, pred, gold, word_id, lexicon):
    # Uniform target over the labels the lexicon allows for this word; the
    # 1.0 / n_labels weight uses float division so it does not truncate to zero
    # under Python 2 integer semantics.
    if len(lexicon[word_id]) > 0:
        labels = lexicon[word_id]
    else:
        labels = lexicon["LEX_POS"]
    n_labels = len(labels)
    return dynet.scalarInput(1.0 / n_labels) * dynet.esum(
        [-dynet.log(pred[k]) for k in sorted(labels)])

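# A minimal sketch of the uniform-over-allowed-labels loss above; the toy
# distribution and label set are invented for illustration:
import dynet as dy

dy.renew_cg()
pred = dy.softmax(dy.inputVector([0.5, 1.0, -0.3, 0.1]))  # toy label distribution
allowed = [0, 2]  # labels the lexicon permits for this word
loss = dy.scalarInput(1.0 / len(allowed)) * dy.esum([-dy.log(pred[k]) for k in allowed])
print(loss.scalar_value())
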
def _attend(self, query, mask=None):
    # query ((H), B)
    # mask ((T, 1), B)
    query = unsqueeze(query, 0)
    # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    attn_scores = dy.cdiv(dy.transpose(query * self.context), dy.scalarInput(self.scale))
    if mask is not None:
        # keep scores where the mask is on; push padded positions to -1e9 so
        # they get ~0 weight under the softmax
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)  # ((T, 1), B)

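# A self-contained sketch of the masking trick used above: mask[0] is the
# keep-mask and mask[1] flags padded positions (toy values, single batch):
import dynet as dy

dy.renew_cg()
scores = dy.inputVector([2.0, 1.0, 3.0])
keep = dy.inputVector([1.0, 1.0, 0.0])  # last position is padding
pad = dy.inputVector([0.0, 0.0, 1.0])
masked = dy.cmult(scores, keep) + pad * dy.scalarInput(-1e9)
print(dy.softmax(masked).npvalue())  # the padded position gets ~0 probability
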
def _make_input(self, seq, lang_id, runtime):
    x_list = []
    encoder_states_list = [None]
    lang_emb = self.lang_embeddings[lang_id]
    # add the root
    x_list.append(self.padd_embeddings[0])

    for entry in seq:
        word = entry.word
        # prepare lexical embeddings
        char_emb, encoder_states = self.character_network.compute_embeddings(
            word, runtime=runtime, language_embeddings=lang_emb)
        encoder_states_list.append(encoder_states)
        word = word.lower()
        if word in self.encodings.word2int:
            holistic_emb = self.holistic_embeddings[self.encodings.word2int[word]]
        else:
            holistic_emb = self.holistic_embeddings[self.encodings.word2int['<UNK>']]

        # dropout on the lexical embeddings
        if runtime:
            w_emb = char_emb + holistic_emb
        else:
            # independently drop the character-level and holistic channels,
            # rescaling so the expected sum is preserved
            m1 = 0 if random.random() < self.config.input_dropout_prob else 1
            m2 = 0 if random.random() < self.config.input_dropout_prob else 1
            scale = float(2) / (m1 + m2) if m1 + m2 > 0 else 1.0
            w_emb = (char_emb * dy.scalarInput(m1) +
                     holistic_emb * dy.scalarInput(m2)) * dy.scalarInput(scale)
        x_list.append(dy.concatenate([w_emb, lang_emb]))

    # close the sequence
    x_list.append(self.padd_embeddings[1])
    encoder_states_list.append(None)
    return x_list, encoder_states_list

def greedy_search(self, char_seq, truth=None, mu=0.):
    init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
    init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
    init_score = dy.scalarInput(0.)
    init_sentence = Sentence(score=init_score.scalar_value(), score_expr=init_score,
                             LSTMState=init_state, y=init_y, prevState=None,
                             wlen=None, golden=True)

    if truth is not None:
        cembs = [dy.dropout(dy.lookup(self.params['embed'], char),
                            self.options['dropout_rate']) for char in char_seq]
    else:
        cembs = [dy.lookup(self.params['embed'], char) for char in char_seq]

    agenda = [init_sentence]

    for idx, _ in enumerate(char_seq, 1):  # from left to right, character by character
        now = None
        for wlen in range(1, min(idx, self.options['max_word_len']) + 1):
            # generate a candidate word vector and join segmentation: sent + word
            word = self.word_repr(char_seq[idx - wlen:idx], cembs[idx - wlen:idx])
            sent = agenda[idx - wlen]

            if truth is not None:
                word = dy.dropout(word, self.options['dropout_rate'])

            word_score = dy.dot_product(word, self.param_exprs['U'])

            if truth is not None:
                golden = sent.golden and truth[idx - 1] == wlen
                margin = dy.scalarInput(mu * wlen if truth[idx - 1] != wlen else 0.)
                score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score
            else:
                golden = False
                score = sent.score_expr + dy.dot_product(sent.y, word) + word_score

            good = (now is None or now.score < score.scalar_value())
            if golden or good:
                new_state = sent.LSTMState.add_input(word)
                new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() +
                                self.param_exprs['pb'])
                new_sent = Sentence(score=score.scalar_value(), score_expr=score,
                                    LSTMState=new_state, y=new_y, prevState=sent,
                                    wlen=wlen, golden=golden)
                if good:
                    now = new_sent
                if golden:
                    golden_sent = new_sent

        agenda.append(now)
        if truth is not None and truth[idx - 1] > 0 and not now.golden:
            # early update: the gold segmentation has fallen off the greedy path
            return now.score_expr - golden_sent.score_expr

    if truth is not None:
        return now.score_expr - golden_sent.score_expr
    return agenda

def get_last_layer_context_representations(self, sentence,
                                           context_representations_for_crf_loss,
                                           context_representations_for_md_loss):
    last_layer_context_representations = context_representations_for_crf_loss
    if self.parameters['active_models'] in [1, 2, 3]:
        if self.parameters['active_models'] == 1 and \
                self.parameters['integration_mode'] != 0:
            assert False, "integration_mode should be set to zero when active_models == 1"
        if self.parameters['debug'] == 1:
            print(("str_words", sentence["str_words"]))

        morph_analysis_representations, morph_analysis_scores = \
            self.get_morph_analysis_representations_and_scores(
                sentence, context_representations_for_md_loss)
        selected_morph_analysis_representations = \
            self.disambiguate_morph_analyzes(morph_analysis_scores)

        if 'golden_morph_analysis_indices' in list(sentence.keys()):
            md_loss = dynet.esum([
                dynet.pickneglogsoftmax(morph_analysis_scores_for_word, golden_idx)
                for golden_idx, morph_analysis_scores_for_word in zip(
                    sentence['golden_morph_analysis_indices'], morph_analysis_scores)
            ])
        else:
            md_loss = dynet.scalarInput(0)

        if self.parameters['integration_mode'] == 2:
            # Two layers of context: the first is used for morphological
            # disambiguation, and the selected morphological analysis
            # representation is then concatenated onto the context used for
            # calculating the tag scores.
            last_layer_context_representations = [
                dynet.concatenate([context,
                                   morph_analysis_representations[word_pos]
                                   [selected_morph_analysis_representation_pos]])
                for word_pos, (selected_morph_analysis_representation_pos, context)
                in enumerate(zip(selected_morph_analysis_representations,
                                 context_representations_for_crf_loss))]

        if md_loss.value() > 1000:
            logging.error("BEEP")
    else:
        # only the plain old NER model; whether to implement the morphological
        # embedding scheme here is still to be decided
        md_loss = dynet.scalarInput(0)
        selected_morph_analysis_representations = None
        last_layer_context_representations = context_representations_for_crf_loss

    assert last_layer_context_representations is not None
    return last_layer_context_representations, md_loss, selected_morph_analysis_representations

def get_factor_expressions(fws, bws, tfemb, tfdict, valid_fes, sentence,
                           spaths_x=None, cpaths_x=None):
    factexprs = {}
    sentlen = len(fws)
    sortedtfd = sorted(list(tfdict.keys()))
    targetspan = (sortedtfd[0], sortedtfd[-1])

    for j in range(sentlen):
        istart = 0
        if USE_SPAN_CLIP and j > ALLOWED_SPANLEN:
            istart = max(0, j - ALLOWED_SPANLEN)
        for i in range(istart, j + 1):
            spanlen = dy.scalarInput(j - i + 1)
            logspanlen = dy.scalarInput(math.log(j - i + 1))
            spanwidth = sp_x[SpanWidth.howlongisspan(i, j)]
            spanpos = ap_x[ArgPosition.whereisarg((i, j), targetspan)]

            fbemb_ij_basic = dy.concatenate([fws[i][j], bws[i][j], tfemb,
                                             spanlen, logspanlen, spanwidth, spanpos])
            if USE_DEPS:
                outs = oh_s[OutHeads.getnumouts(i, j, sentence.outheads)]
                shp = spaths_x[sentence.shortest_paths[(i, j, targetspan[0])]]
                fbemb_ij = dy.concatenate([fbemb_ij_basic, outs, shp])
            elif USE_CONSTITS:
                isconstit = dy.scalarInput((i, j) in sentence.constitspans)
                lca = ct_x[sentence.lca[(i, j)][1]]
                phrp = cpaths_x[sentence.cpaths[(i, j, targetspan[0])]]
                fbemb_ij = dy.concatenate([fbemb_ij_basic, isconstit, lca, phrp])
            else:
                fbemb_ij = fbemb_ij_basic

            for y in valid_fes:
                fctr = Factor(i, j, y)
                if USE_HIER and y in feparents:
                    fefixed = dy.esum([fe_x[y]] + [fe_x[par] for par in feparents[y]])
                else:
                    fefixed = fe_x[y]
                fbemb_ijy = dy.concatenate([fefixed, fbemb_ij])
                factexprs[fctr] = w_f * dy.rectify(w_z * fbemb_ijy + b_z) + b_f
    return factexprs

def calc_loss(batch_distances, batch_scores, lamb):
    # margin-rescaled losses: lamb * distance minus the gap to the top candidate
    batch_losses = [lamb * dy.scalarInput(d) - (batch_scores[0] - s)
                    for s, d in zip(batch_scores[1:], batch_distances[1:])]
    # keep only the positive parts
    losses_pos = [l if l.scalar_value() >= 0 else dy.scalarInput(0)
                  for l in batch_losses]
    if len(losses_pos) == 0:
        return 0
    return dy.esum(losses_pos)

def rule_loss_collector(beam_item  # type: BeamItem
                        ):
    if beam_item.sync_rule is None:
        yield dn.scalarInput(0.0)
        return  # a bare return ends the generator cleanly (PEP 479)
    node_info = beam_item.node_info_ref()
    if node_info.early_updated:
        yield dn.scalarInput(0.0)
        return

    if self.options.use_graph_embedding:
        # calculate rule loss
        correspondents = node_info.correspondents
        pred_expr = correspondents[beam_item.sync_rule]
        gold_expr = correspondents[node_info.gold_rule]
        loss = (pred_expr - gold_expr) if pred_expr is not gold_expr else dn.scalarInput(0.0)
    else:
        loss = dn.scalarInput(0.0)

    # add rule correctness statistics
    print_logger.total_count += 1
    if beam_item.sync_rule == node_info.gold_rule:
        print_logger.correct_count += 1

    # calculate edge loss
    gold_item = node_info.gold_item
    if gold_item is not beam_item:
        predict_edge_scores = edge_feature_to_scores(
            beam_item.own_features - gold_item.own_features,
            return_expr=True)  # ignore common features
        gold_edge_scores = edge_feature_to_scores(
            gold_item.own_features - beam_item.own_features, True)
        print_logger.total_gold_score += gold_item.score
        print_logger.total_predict_score += beam_item.score
        struct_loss = predict_edge_scores - gold_edge_scores
        loss += struct_loss
    yield loss

    # losses of the children nodes
    if beam_item.left is not None:
        for i in rule_loss_collector(beam_item.left):
            yield i
    if beam_item.right is not None:
        for i in rule_loss_collector(beam_item.right):
            yield i
    node_info.early_updated = 1

def greedy_train_max_sumlogllh(self, init_state, gold_actions):
    total_obj = dt.scalarInput(0)
    cur_state = init_state
    idx = 0
    while not cur_state.is_end():
        action_list = list(cur_state.get_action_set())
        new_expression_list, meta_info_list = cur_state.get_next_score_expressions(action_list)
        prob_list = dt.softmax(new_expression_list)
        gold_action = gold_actions[idx]
        action_idx = action_list.index(gold_action)
        total_obj += -dt.log(prob_list[action_idx])
        cur_state = cur_state.get_new_state_after_action(gold_action, meta_info_list[action_idx])
        idx += 1

    res = total_obj.scalar_value()
    total_obj.backward()
    self.neural_model.learner.update()
    return res

def learn(self, seq):
    output, proj_x3 = self._predict(seq, runtime=False)

    # arcs
    for iSrc in range(len(seq)):
        for iDst in range(len(seq)):
            if iDst > iSrc:
                o = output[iSrc][iDst]  # the softmax portion
                t = get_link(seq, iSrc, iDst)
                self.losses.append(dy.binary_log_loss(o, dy.scalarInput(t)))

    # labels
    gs_chains, labels = self._get_gs_chains(seq)
    for chain, label in zip(gs_chains, labels):
        label_rnn = self.label_decoder.initial_state()
        for index in chain:
            label_rnn = label_rnn.add_input(proj_x3[index])
        label_softmax = dy.softmax(self.label_w.expr(update=True) * label_rnn.output() +
                                   self.label_b.expr(update=True))
        self.losses.append(-dy.log(dy.pick(label_softmax,
                                           self.encodings.label2int[label])))

def decode(self, states, y, encoded_input, train=False):
    s = self.decoder_rnn.initial_state()
    start_encoded = self.l2e["sep"].encode("<s>", "sep")
    loss = dy.scalarInput(0.)
    s = s.add_input(dy.concatenate([start_encoded, states[-1]]))
    generated_string = []
    for char in y:
        true_char_encoded = self.l2e["l"].encode(char, "l")
        scores = self.predict_letter(s.output(), y)
        generated_string.append(scores)
        weighted_states = self.attend(s.output(), states, encoded_input)
        # teacher forcing: feed the true character together with the attention context
        s = s.add_input(dy.concatenate([true_char_encoded, weighted_states]))
        if char in self.C2I:
            loss += dy.pickneglogsoftmax(scores, self.C2I[char])
    return loss, generated_string

def get_loss_classification(self, inputs, seq):
    """
    Computes classification loss for this sequence based on input vectors.
    """
    W_ent1 = dy.parameter(self.l2ent1)
    W_ent1b = dy.parameter(self.l2ent1b)
    W_ent2 = dy.parameter(self.l2ent2)
    W_ent2b = dy.parameter(self.l2ent2b)

    def ff(h):
        return W_ent2 * dy.tanh(W_ent1 * h + W_ent1b) + W_ent2b

    boundaries = get_boundaries(seq.l_seq) if self.__is_training else \
        get_boundaries(seq.bio_pred)
    losses = []
    if not self.__is_training:
        seq.ent_pred = []
    for (s, t, entity) in boundaries:
        h = bilstm_single(inputs[s:t + 1], self.elstm1, self.elstm2)
        g = ff(h)
        if self.__is_training:
            gold = self.e_enc[entity]
            losses.append(dy.pickneglogsoftmax(g, gold))
        else:
            seq.ent_pred.append(self.e_dec[np.argmax(g.npvalue())])
            if self.entemb_path:
                string = stringfy(seq.w_seq, s, t)
                with open(self.entemb_path, 'a') as inf:
                    inf.write(string + '\t')
                    for val in g.vec_value():
                        inf.write(str(val) + ' ')
                    inf.write('\n')

    classification_loss = dy.esum(losses) if losses else dy.scalarInput(0.)
    return classification_loss

def get_loss_boundary(self, inputs, seq):
    """
    Computes boundary loss for this sequence based on input vectors.
    """
    W_bio1 = dy.parameter(self.l2bio1)
    W_bio1b = dy.parameter(self.l2bio1b)
    W_bio2 = dy.parameter(self.l2bio2)
    W_bio2b = dy.parameter(self.l2bio2b)

    def ff(h):
        return W_bio2 * dy.tanh(W_bio1 * h + W_bio1b) + W_bio2b

    gs = [ff(h) for h in inputs]  # inputs now 3-dimensional ("BIO scores")
    if self.loss == "global":
        boundary_loss = self.get_loss_boundary_global(gs, seq)
    elif self.loss == "local":
        boundary_loss = self.get_loss_boundary_local(gs, seq)
    else:
        sys.exit("Unknown loss \"{0}\"".format(self.loss))
    return boundary_loss

def get_loss_boundary_local(self, gs, seq):
    losses = []
    if not self.__is_training:
        seq.bio_pred = []
    for i, g in enumerate(gs):
        if self.__is_training:
            gold = self.__BIO_ENC[seq.l_seq[i][0]]
            losses.append(dy.pickneglogsoftmax(g, gold))
        else:
            seq.bio_pred.append(self.__BIO_DEC[np.argmax(g.npvalue())])
    return dy.esum(losses) if losses else dy.scalarInput(0.)

def get_loss_boundary_global(self, score_vecs, seq):
    start_b = dy.parameter(self.start_bias)
    T = dy.parameter(self.trans_mat)
    end_b = dy.parameter(self.end_bias)

    if not self.__is_training:
        seq.bio_pred = viterbi(start_b, T, end_b, score_vecs, self.valid)
        return dy.scalarInput(0.)

    # forward algorithm over the 3 BIO states
    pi = [[None for _ in xrange(3)] for _ in xrange(len(score_vecs))]
    for y in xrange(3):
        pi[0][y] = score_vecs[0][y] + start_b[y]
    for i in xrange(1, len(pi)):
        for y in xrange(3):
            pi[i][y] = dy.logsumexp([pi[i - 1][y_prev] + T[y_prev][y] + score_vecs[i][y]
                                     for y_prev in xrange(3)])

    normalizer = dy.logsumexp([pi[-1][y] + end_b[y] for y in xrange(3)])
    gold_score = score_crf(start_b, T, end_b, score_vecs,
                           [self.__BIO_ENC[l[0]] for l in seq.l_seq])
    return normalizer - gold_score

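# A tiny sketch of the dy.logsumexp building block the forward algorithm above
# relies on (toy scores, not from the model):
import math
import dynet as dy

dy.renew_cg()
values = (0.5, 1.5, -0.2)
lse = dy.logsumexp([dy.scalarInput(v) for v in values])
# agrees with the direct computation log(sum(exp(v)))
print(lse.scalar_value(), math.log(sum(math.exp(v) for v in values)))
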
def beam_train_max_margin(self, init_state, gold_ans):
    # Min-risk-style training that does not use the gold sequence directly:
    # max reward: y = argmax r(y)
    # max score:  y' = argmax f(x, y')
    # loss = max(f(x, y') - f(x, y) + R(y) - R(y'), 0)
    end_state_list = self.beam_predict(init_state)
    reward_list = [x.reward(gold_ans) for x in end_state_list]
    violation_list = [s.score - reward for s, reward in zip(end_state_list, reward_list)]

    # the best scoring sequence with minimal reward
    best_score_state_idx = violation_list.index(max(violation_list))
    # the sequence with the maximal reward in the beam
    best_reward_state_idx = reward_list.index(max(reward_list))

    best_score_state = end_state_list[best_score_state_idx]
    best_reward_state = end_state_list[best_reward_state_idx]
    best_score_state_reward = reward_list[best_score_state_idx]
    best_reward_state_reward = reward_list[best_reward_state_idx]

    loss = dt.rectify(best_score_state.path_score_expression -
                      best_reward_state.path_score_expression +
                      dt.scalarInput(best_reward_state_reward - best_score_state_reward))
    loss_value = loss.value()
    loss.backward()
    self.neural_model.learner.update()
    return loss_value

def train(self, mini_batch, num_train, k):
    words, pos_tags, chars, langs, signs, masks = mini_batch

    # Get the last hidden layer from the BiLSTM.
    rnn_out = self.rnn_mlp(mini_batch, True)
    h_out = rnn_out[-1]
    t_out_d = dy.reshape(h_out, (h_out.dim()[0][0], h_out.dim()[1]))
    t_out = dy.transpose(t_out_d)

    # The k*q term for noise-contrastive estimation (NCE).
    kq = dy.scalarInput(float(k) / num_train)
    lkq = dy.log(kq)

    loss_values = []
    for i in range(len(langs)):
        for j in range(i + 1, len(langs)):
            if (langs[i] != langs[j]) and (signs[i] == 1 or signs[j] == 1):
                lu = -dy.squared_distance(t_out[i], t_out[j])
                denom = dy.log(dy.exp(lu) + kq)
                if signs[i] == signs[j]:  # both positive
                    nom = lu
                else:
                    nom = lkq
                loss_values.append(denom - nom)

    err_value = 0
    if len(loss_values) > 0:
        err = dy.esum(loss_values) / len(loss_values)
        err.forward()
        err_value = err.value()
        err.backward()
        self.trainer.update()
    dy.renew_cg()
    return err_value

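# A toy sketch of the NCE terms assembled above; the log-score lu and the k/N
# ratio are invented for illustration:
import dynet as dy

dy.renew_cg()
kq = dy.scalarInput(25.0 / 10000)  # k noise samples with uniform q = 1/num_train
lu = dy.scalarInput(-0.7)          # log-score of a pair, e.g. a negative squared distance
pos_loss = dy.log(dy.exp(lu) + kq) - lu          # pair drawn from the data
neg_loss = dy.log(dy.exp(lu) + kq) - dy.log(kq)  # pair drawn from the noise
print(pos_loss.scalar_value(), neg_loss.scalar_value())
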
def beam_train_max_margin_with_goldactions(self, init_state, gold_actions):
    # max reward: y = gold y
    # max score:  y' = argmax f(x, y')
    # loss = max(f(x, y') - f(x, y) + R(y) - R(y'), 0)

    # end_state_list = self.beam_predict(init_state)  # top-k argmax_y f(x, y)
    # top-k argmax_y f(x, y) + R(y*) - R(y); the current implementation is
    # equivalent to using the Hamming distance
    end_state_list = self.beam_predict_max_violation(init_state, gold_actions)
    best_score_state = end_state_list[0]
    reward_list = [x.reward(gold_actions) for x in end_state_list]

    best_reward_state = self.get_goldstate_with_gold_actions(init_state, gold_actions)
    best_reward = best_reward_state.reward(gold_actions)

    loss = dt.rectify(best_score_state.path_score_expression -
                      best_reward_state.path_score_expression +
                      dt.scalarInput(best_reward - reward_list[0]))
    loss_value = loss.value()
    loss.backward()
    self.neural_model.learner.update()
    return loss_value

def get_constit_loss(fws, bws, goldspans):
    if not USE_PTB_CONSTITS:
        raise Exception("should not be using the constit loss now!", USE_PTB_CONSTITS)

    if len(goldspans) == 0:
        return None, 0

    losses = []
    sentlen = len(fws)
    for j in range(sentlen):
        istart = 0
        if USE_SPAN_CLIP and j > ALLOWED_SPANLEN:
            istart = max(0, j - ALLOWED_SPANLEN)
        for i in range(istart, j + 1):
            constit_ij = w_c * dy.rectify(
                w_fb * dy.concatenate([fws[i][j], bws[i][j]]) + b_fb) + b_c
            logloss = dy.log_softmax(constit_ij)
            isconstit = int((i, j) in goldspans)
            losses.append(dy.pick(logloss, isconstit))

    ptbconstitloss = dy.scalarInput(DELTA) * -dy.esum(losses)
    numspanstagged = len(losses)
    return ptbconstitloss, numspanstagged

def loss_function(recon_x, x, mu, logvar):
    # equivalent to torch.nn.functional.binary_cross_entropy(recon_x, x, size_average=False)
    BCE = dy.binary_log_loss(recon_x, x)
    # see Appendix B of the VAE paper:
    # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    # https://arxiv.org/abs/1312.6114
    # KLD = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * dy.sum_elems(1 + logvar - dy.pow(mu, dy.scalarInput(2)) - dy.exp(logvar))
    return BCE + KLD

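# A minimal, self-contained sketch of calling the VAE loss above on toy values
# (the vectors and their dimensions are made up for illustration):
import dynet as dy

dy.renew_cg()
x = dy.inputVector([0.0, 1.0, 1.0, 0.0])                       # "pixels" in [0, 1]
recon_x = dy.logistic(dy.inputVector([-2.0, 3.0, 1.0, -1.0]))  # decoder output
mu = dy.inputVector([0.1, -0.2])                               # posterior mean
logvar = dy.inputVector([-0.5, 0.3])                           # posterior log-variance
print(loss_function(recon_x, x, mu, logvar).scalar_value())
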
def _attend(self, query, mask=None):
    # query ((H), B)
    # mask ((T, 1), B)
    projected_state = self.decoder * query  # ((H,), B)
    non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
    # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    attn_scores = dy.transpose(self.v * non_lin)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)  # ((T, 1), B)

def truth_score(self, word_seq):
    wembs = [self.param_exprs['<bos>']] + [self.word_repr(word) for word in word_seq]
    init_state = self.params['lstm'].initial_state()
    hidden_states = init_state.transduce(wembs)
    score = dy.scalarInput(0.)
    for h, w in zip(hidden_states[:-1], wembs[1:]):
        y = dy.tanh(self.param_exprs['pW'] * h + self.param_exprs['pb'])
        score = score + dy.dot_product(y, w) + dy.dot_product(w, self.param_exprs['U'])
    return score

def score_sentence(self, emissions, tags):
    """Get the score of a given sentence.

    :param emissions: List[dy.Expression ((H,), B)]
    :param tags: List[int]

    Returns:
        dy.Expression ((1,), B)
    """
    tags = np.concatenate((np.array([self.start_idx], dtype=int), tags))
    score = dy.scalarInput(0)
    transitions = self.transitions
    for i, e in enumerate(emissions):
        # Because DyNet is column-major it is best to lay out the transition
        # matrix so that x -> y is T[y, x].
        score += dy.pick(dy.pick(transitions, tags[i + 1]), tags[i]) + dy.pick(e, tags[i + 1])
    score += dy.pick(dy.pick(transitions, self.end_idx), tags[-1])
    return score

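# A self-contained sketch of the gold-path scoring idea above with a toy
# transition matrix and emissions (the tag set, sizes, and start convention
# are invented; tag 0 doubles as the start state here):
import numpy as np
import dynet as dy

dy.renew_cg()
transitions = dy.inputTensor(np.random.rand(3, 3))  # T[y, x]: score of x -> y
emissions = [dy.inputVector(np.random.rand(3)) for _ in range(4)]
tags = [0, 2, 2, 1]  # a gold tag sequence

score = dy.scalarInput(0)
prev = 0  # start state
for e, t in zip(emissions, tags):
    score += dy.pick(dy.pick(transitions, t), prev) + dy.pick(e, t)
    prev = t
print(score.scalar_value())
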
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.
    """
    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0:
        return 0.0

    if opts.debug:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    else:
        dy.renew_cg()

    # compute association scores as dynet expressions (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'
                           .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)

    # false targets likelihood - negative sampling (uniform)
    if opts.nll:
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []

    # collect negative samples
    neg_samples = [np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))]
    # remove source and true targets if applicable
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))

    for (i, ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'
                           .format(node, side, ns, ns_assoc_score.scalar_value()))
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))

    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum([dy.pickneglogsoftmax(dy.concatenate(scrs), 0)
                               for scrs in sample_scores])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)

    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'.format(node, sc_loss))

    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()
    return sc_loss

pW1 = m.add_parameters((HIDDEN_SIZE, 2), device="GPU:1")
pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])

dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)

x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1 * x) + b1)
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2 * h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    y_pred = dy.logistic((V * h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h2_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

HIDDEN_SIZE = 8
ITERATIONS = 2000

m = dy.Model()
trainer = dy.SimpleSGDTrainer(m)

W = m.add_parameters((HIDDEN_SIZE, 2))
b = m.add_parameters(HIDDEN_SIZE)
V = m.add_parameters((1, HIDDEN_SIZE))
a = m.add_parameters(1)

if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])

x = dy.vecInput(2)
y = dy.scalarInput(0)
h = dy.tanh((W * x) + b)
if xsent:
    y_pred = dy.logistic((V * h) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V * h) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1

for iter in range(ITERATIONS):
    mloss = 0.0