def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
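# decode above assumes an attend(input_mat, state, w1dt) helper in the style
# of the DyNet attention example; a sketch, with attention_w2 / attention_v
# as the usual global attention parameters (an assumption, not the original):
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    # input_mat: (encoder_state x seqlen), w1dt: (attdim x seqlen)
    # w2dt: (attdim x 1), broadcast over the columns of w1dt
    w2dt = w2 * dy.concatenate(list(state.s()))
    # one unnormalized score per source position, normalized with softmax
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    # context: attention-weighted sum of the encoder columns
    context = input_mat * att_weights
    return context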
def calc_scores(words):
    # Create a computation graph, and add parameters
    dy.renew_cg()
    # Take the sum of all the embedding vectors for each word
    score = dy.esum([dy.lookup(W, x) for x in words])
    # Add the bias vector and return
    return score + b
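# A minimal training sketch for calc_scores above; W (per-word tag score
# vectors), b (bias) and train_data are illustrative names, not the original
# setup.
model = dy.ParameterCollection()
W = model.add_lookup_parameters((nwords, ntags))
b = model.add_parameters(ntags)
trainer = dy.SimpleSGDTrainer(model)

for words, tag in train_data:  # words: list of word ids, tag: gold tag id
    my_loss = dy.pickneglogsoftmax(calc_scores(words), tag)
    my_loss.value()
    my_loss.backward()
    trainer.update()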
def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # iterate through input states to compute attention scores
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # normalize the scores to alphas using softmax
    alphas = pc.softmax(pc.concatenate(scores))
    # compute the context vector c as the alpha-weighted sum of the input states
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # compute the output state h~ using c and the decoder's h
    # (global attention variant from Luong et al., 2015)
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))
    return h_output, alphas, W__a.value()
def finalize(self, finished_epoch=False, **kwargs):
    """
    Fit this model on collected samples
    :return self
    """
    super().finalize(finished_epoch=finished_epoch, **kwargs)
    assert self.model, "Cannot finalize a model without initializing it first"
    if self.losses:
        loss = dy.esum(self.losses)
        loss.forward()
        self.config.print(lambda: "Total loss from %d time steps: %g" % (self.steps, loss.value()), level=4)
        loss.backward()
        try:
            self.trainer.update()
        except RuntimeError as e:
            Config().log("Error in update(): %s\n" % e)
        self.init_cg()
        self.losses = []
        self.steps = 0
        self.updates += 1
    if finished_epoch:
        self.trainer.learning_rate /= (1 - self.learning_rate_decay)
    if self.config.args.verbose > 2:
        self.trainer.status()
    return self
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()
    # add padding to the sentence equal to the size of the window;
    # as we need to predict the eos as well, the future window at that point is N past it
    padded_sent = [S] * N + sent + [S] * N
    padded_emb = [W_c_p[x] for x in padded_sent]
    # Step through the sentence
    all_losses = []
    for i in range(N, len(sent)+N):
        c = dy.esum(padded_emb[i-N:i] + padded_emb[i+1:i+N+1])
        s = W_w * c
        all_losses.append(dy.pickneglogsoftmax(s, padded_sent[i]))
    return dy.esum(all_losses)
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        prev_word = next_word
    return dy.esum(all_losses)
def sent_loss(words, tags):
    vecs = build_tagging_graph(words)
    errs = []
    for v, t in zip(vecs, tags):
        tid = vt.w2i[t]
        err = dy.pickneglogsoftmax(v, tid)
        errs.append(err)
    return dy.esum(errs)
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()

    # now decode
    all_losses = []

    # Decoder
    # need to mask padding at the end of each sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the decoder LSTM
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]

    # calculate the probability distribution over tags at each position
    probs = [dy.softmax(score).npvalue() for score in scores]

    # then take samples from the probability distribution
    samples = [np.random.choice(range(len(x)), p=x) for x in probs]

    # calculate accuracy=reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct)) / len(correct)
    r = dy.constant(1, r_i)

    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]

    # nobackprop breaks the computation graph here, as the baseline is
    # trained separately and not backpropagated through when training the
    # overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    # the losses for training the baseline itself
    baseline_scores = [dy.square(r - x) for x in r_b]

    # then calculate the REINFORCE losses by weighting each cross-entropy
    # loss with the baselined reward
    reinforce_scores = [r_s * loss for r_s, loss in zip(rewards_over_baseline, losses)]

    # for MIXER we want the first len(sent)-delta losses from cross-entropy,
    # then the remaining delta losses from REINFORCE
    if len(losses) > delta:
        mixer_scores = losses[:len(losses) - delta] + reinforce_scores[len(losses) - delta:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal vector of the log co-variance matrix from the encoder
    # (regarding it as the log variance makes future implementation easier)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(mu(x), sigma(x)) || N(0, I)]
    # = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous word into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
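# Sketches of the two helpers calc_loss assumes; the originals may differ,
# but the shapes follow the usage above.
def mlp(x, W, V, b):
    # single-hidden-layer MLP: V * tanh(W * x) + b
    return V * dy.tanh(W * x) + b

def reparameterize(mu, log_var):
    # z = mu + sigma * eps with eps ~ N(0, I): the standard VAE
    # reparameterization trick
    d = mu.dim()[0][0]
    eps = dy.random_normal(d)
    std = dy.exp(log_var * 0.5)
    return mu + dy.cmult(std, eps)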
def loss(self, preds, y):
    if self.do_crf is True:
        return self.crf.neg_log_loss(preds, y.squeeze(0))
    else:
        element_loss = dy.pickneglogsoftmax
        errs = []
        for pred, y_i in zip(preds, y.T):
            err = element_loss(pred, y_i)
            errs.append(err)
        return dy.esum(errs)
def transduce(seq, Y):
    seq = [E[i] for i in seq]
    fw = fwR.initial_state().transduce(seq)

    # this UNUSED part affects strategy 2
    XXX = fwR2.initial_state().transduce([E[3], E[5]])

    W = W_.expr()
    outs = [W * z for z in fw]
    losses = [dy.pickneglogsoftmax(o, y) for o, y in zip(outs, Y)]
    s = dy.esum(losses)
    return s
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()
    # The initial history is equal to end of sentence symbols
    hist = [S] * N
    # Step through the sentence, including the end of sentence token
    all_losses = []
    for next_word in sent + [S]:
        s = calc_score_of_history(hist)
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        hist = hist[1:] + [next_word]
    return dy.esum(all_losses)
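# One plausible calc_score_of_history for the model above: a feed-forward
# scorer over the concatenated history embeddings (W_emb, W_h, b_h, W_sm and
# b_sm are assumed parameters, not definitions from the original).
def calc_score_of_history(words):
    # look up the embeddings and concatenate them
    emb = dy.concatenate([W_emb[x] for x in words])
    # hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # unnormalized scores over the vocabulary
    return dy.affine_transform([b_sm, W_sm, h])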
def sent_lm_loss(self, sent):
    rnn_cur = self.rnn.initial_state()
    losses = []
    prev_word = self.start
    for word in sent:
        x_t = self.embeddings[prev_word]
        rnn_cur = rnn_cur.add_input(x_t)
        logits = dy.affine_transform([self.lb, self.h2l, rnn_cur.output()])
        losses.append(dy.pickneglogsoftmax(logits, word))
        prev_word = word
    return dy.esum(losses)
def BuildLMGraph(self, sent):
    dy.renew_cg()
    init_state = self.builder.initial_state()
    state = init_state

    inputs = [self.lookup[int(cw)] for cw in sent[:-1]]
    expected_outputs = [int(nw) for nw in sent[1:]]
    outputs = state.transduce(inputs)
    r_ts = ((self.bias + (self.R * y_t)) for y_t in outputs)
    # errs holds the per-position loss expressions
    errs = [dy.pickneglogsoftmax(r_t, eo) for r_t, eo in zip(r_ts, expected_outputs)]
    nerr = dy.esum(errs)
    return nerr
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2 * dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * dy.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))
    output_vectors = dy.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def attend2(blstm_outputs, s_prev, y_feedback, v_a, W_a, U_a, U_o, V_o, C_o):
    # attention mechanism - Bahdanau style
    # iterate through input states to compute alphas
    # W_a: hidden x hidden, U_a: hidden x 2 hidden, v_a: hidden, each score: scalar
    scores = [v_a * pc.tanh(W_a * s_prev + U_a * h_j) for h_j in blstm_outputs]
    alphas = pc.softmax(pc.concatenate(scores))
    # c_i: 2 hidden
    c_i = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # U_o: 2l x hidden, V_o: 2l x input, C_o: 2l x 2 hidden
    attention_output_vector = U_o * s_prev + V_o * y_feedback + C_o * c_i
    return attention_output_vector, alphas
def build_lm_graph(self, sent):
    dy.renew_cg()
    init_state = self.builder.initial_state()

    errs = []  # will hold expressions
    state = init_state
    for (cw, nw) in zip(sent, sent[1:]):
        # assume word is already a word-id
        x_t = dy.lookup(self.lookup, int(cw))
        state = state.add_input(x_t)
        y_t = state.output()
        r_t = self.bias + (self.R * y_t)
        err = dy.pickneglogsoftmax(r_t, int(nw))
        errs.append(err)
    nerr = dy.esum(errs)
    return nerr
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()

    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]

    # Step through the sentence and calculate binary prediction losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        scores = dy.logistic(W_c * my_emb)
        pos_words = ([sent[x] if x >= 0 else S for x in range(i-N, i)] +
                     [sent[x] if x < len(sent) else S for x in range(i+1, i+N+1)])
        word_repr = [[float(y) for y in np.binary_repr(x).zfill(nbits)] for x in pos_words]
        word_repr = [dy.inputVector(x) for x in word_repr]
        all_losses.extend([dy.binary_log_loss(scores, x) for x in word_repr])
    return dy.esum(all_losses)
def BuildLMGraph(self, sents):
    dy.renew_cg()
    # initialize the RNN
    init_state = self.builder.initial_state()
    # parameters -> expressions
    R = dy.parameter(self.R)
    bias = dy.parameter(self.bias)

    S = vocab.w2i["<s>"]
    # get the cids and masks for each step
    tot_chars = 0
    cids = []
    masks = []

    for i in range(len(sents[0])):
        cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_chars += sum(mask)

    # start the rnn with "<s>"
    init_ids = cids[0]
    s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

    losses = []

    # feed char vectors into the RNN and predict the next char
    for cid, mask in zip(cids[1:], masks[1:]):
        score = dy.affine_transform([bias, R, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, cid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        cemb = dy.lookup_batch(self.lookup, cid)
        s = s.add_input(cemb)

    return dy.sum_batches(dy.esum(losses)), tot_chars
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()

    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]

    # Sample K negative words for each predicted word at each position
    all_neg_words = np.random.choice(nwords, size=2*N*K*len(emb), replace=True, p=word_probabilities)

    # Step through the sentence and calculate the negative and positive losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N]
        pos_words = ([sent[x] if x >= 0 else S for x in range(i-N, i)] +
                     [sent[x] if x < len(sent) else S for x in range(i+1, i+N+1)])
        neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
        pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
        all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))
    return dy.esum(all_losses)
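# word_probabilities above is typically the smoothed unigram distribution used
# for word2vec-style negative sampling; a sketch assuming word_counts holds
# the corpus frequency of each of the nwords vocabulary items.
word_probabilities = np.power(word_counts, 0.75)
word_probabilities /= np.sum(word_probabilities)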
def calc_lm_loss(sents):
    dy.renew_cg()

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
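# The mask shortcut above (checking only mask[-1]) assumes each batch is
# sorted by decreasing sentence length, so the last sentence is always the
# shortest. A sketch of the surrounding batching loop under that assumption
# (train, BATCH_SIZE and trainer are illustrative names):
train.sort(key=lambda s: -len(s))
for i in range(0, len(train), BATCH_SIZE):
    batch_loss, batch_words = calc_lm_loss(train[i:i + BATCH_SIZE])
    batch_loss.value()     # run forward
    batch_loss.backward()  # run backward
    trainer.update()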
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": []}

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_lang_lstm_state = prev_lang_lstm.initial_state().add_input(lang_enc('*BOS*'))

    all_losses = []
    lang_prec = 0.0
    lang_items = 0
    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        # create the mlp input, a concatenation of the bilstm output and of the prev lang output
        mlp_input = dy.concatenate([bilstm_output, prev_lang_lstm_state.output()])

        # run through the language mlp
        lang_mlp_output = lang_mlp(mlp_input)
        predicted_word_lang = lang_vocab.getItem(np.argmax(lang_mlp_output.npvalue()))
        confidence = np.max(lang_mlp_output.npvalue()) / np.sum(lang_mlp_output.npvalue())

        lang_prec += 1 if predicted_word_lang == gold_word_lang else 0
        lang_items += 1

        tagged_daf["words"].append(
            {"word": word, "predicted_lang": predicted_word_lang, "confidence": confidence})

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            all_losses.append(-dy.log(dy.pick(lang_mlp_output, lang_vocab[gold_word_lang])))
            word_lang_ans = gold_word_lang
        # otherwise, set the answer to be the argmax
        elif not fRunning and fValidation:
            lang_conf_matrix(lang_vocab[predicted_word_lang], lang_vocab[gold_word_lang])
            word_lang_ans = predicted_word_lang
        else:
            continue

        # run the chosen answer through the prev-lang lstm
        prev_lang_lstm_state = prev_lang_lstm_state.add_input(lang_enc(word_lang_ans))

    lang_prec = lang_prec / lang_items if lang_items > 0 else None

    if fValidation:
        return lang_prec, tagged_daf
    if fRunning:
        return tagged_daf

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None

    return total_loss, lang_prec
sents = 0
all_time = 0
for ITER in range(100):
    random.shuffle(train)
    closs = 0.0
    cwords = 0
    start = time.time()
    batch = []
    for i, tree in enumerate(train, 1):
        sents += 1
        W = dy.parameter(W_)
        h, c = builder.expr_for_tree(tree, True)
        nodes = tree.nonterms()
        losses = [dy.pickneglogsoftmax(W * nt._e, l2i[nt.label]) for nt in nodes]
        loss = dy.esum(losses)
        batch.append(loss)
        if len(batch) == 50:
            loss = dy.esum(batch)
            closs += loss.value()
            cwords += len(nodes)
            loss.backward()
            trainer.update()
            batch = []
            dy.renew_cg()
        if sents % 1000 == 0:
            trainer.status()
            print(closs / cwords, file=sys.stderr)
            closs = 0.0
            cwords = 0
    all_time += time.time() - start
def em_example(self, example):
    encoder_input = example[0]
    ground_labels = example[1]
    goal_vector = encoder_input[0]
    encoder_input = encoder_input[1]
    num_utterances = len(encoder_input)
    logits = self.MLP(goal_vector)

    sentence_initial_state = self.sentence_encoder.initial_state()
    pzs = []
    for sentence in encoder_input:
        embedded_sentence = [self.embeddings[word] for word in sentence]
        final_state = sentence_initial_state.transduce(embedded_sentence)[-1]
        final_state = dy.concatenate([final_state, logits])
        # Stochastic node:
        pzs.append(self.prob_z(final_state))

    context_initial_state = self.context_encoder.initial_state()
    context_state = context_initial_state
    z_list = []

    # Expectation
    e_time = time.time()
    for idx in range(num_utterances):
        pz = dy.nobackprop(pzs[idx])
        max_prob = dy.scalarInput(-999999999)
        z_star = -999999999
        z_star_onehot = self.onehotzs[0]
        for z in range(self.num_clusters):
            one_hot_z = self.onehotzs[z]
            state = context_state.add_input(one_hot_z).h()[-1]
            log_papx = dy.nobackprop(self.log_prob_papx(example, idx, state))
            log_pz = dy.nobackprop(dy.log(dy.pick(pz, z)))
            log_prob = dy.esum([log_papx, log_pz])
            if log_prob.value() > max_prob.value():
                max_prob = log_prob
                z_star = z
                z_star_onehot = one_hot_z
        context_state = context_state.add_input(z_star_onehot)
        z_list.append(z_star)
    self.e_time += (time.time() - e_time)

    # Maximization
    m_time = time.time()
    context_state = context_initial_state
    probs = []
    for idx in range(num_utterances):
        pz = pzs[idx]
        z = z_list[idx]
        log_pz = dy.log(dy.pick(pz, z))
        one_hot_z = self.onehotzs[z]
        state = context_state.add_input(one_hot_z).h()[-1]
        # NO BACKPROP:
        log_papx = dy.nobackprop(self.log_prob_papx(example, idx, state))
        log_prob = dy.esum([log_papx, log_pz])
        probs.append(log_prob)
    probs = dy.esum(probs)
    self.m_time += (time.time() - m_time)

    # TODO: check this:
    return (-probs, z_list)
def identify_frames(builders, tokens, postags, lexunit, targetpositions, goldframe=None):
    renew_cg()
    trainmode = (goldframe is not None)

    sentlen = len(tokens) - 1
    emb_x = [v_x[tok] for tok in tokens]
    pos_x = [p_x[pos] for pos in postags]

    emb2_xi = []
    for i in range(sentlen + 1):
        if tokens[i] in pretrained_embeddings_map:
            # If update is set to False, the pretrained embeddings will not be updated.
            emb_without_backprop = lookup(e_x, tokens[i], update=True)
            features_at_i = concatenate([emb_x[i], pos_x[i], emb_without_backprop])
        else:
            features_at_i = concatenate([emb_x[i], pos_x[i], u_x])
        emb2_xi.append(w_e * features_at_i + b_e)

    emb2_x = [rectify(emb2_xi[i]) for i in range(sentlen + 1)]

    # initializing the two LSTMs
    if USE_DROPOUT and trainmode:
        builders[0].set_dropout(DROPOUT_RATE)
        builders[1].set_dropout(DROPOUT_RATE)
    f_init, b_init = [i.initial_state() for i in builders]

    fw_x = f_init.transduce(emb2_x)
    bw_x = b_init.transduce(reversed(emb2_x))

    # only using the first target position - summing them hurts :(
    targetembs = [concatenate([fw_x[targetidx], bw_x[sentlen - targetidx - 1]]) for targetidx in targetpositions]
    targinit = tlstm.initial_state()
    target_vec = targinit.transduce(targetembs)[-1]

    valid_frames = list(lufrmmap[lexunit.id])
    chosenframe = valid_frames[0]
    logloss = None
    if len(valid_frames) > 1:
        if USE_HIER and lexunit.id in relatedlus:
            lu_vec = esum([lu_x[luid] for luid in relatedlus[lexunit.id]])
        else:
            lu_vec = lu_x[lexunit.id]
        fbemb_i = concatenate([target_vec, lu_vec, lp_x[lexunit.posid]])
        # TODO(swabha): Add more Baidu-style features here.
        f_i = w_f * rectify(w_z * fbemb_i + b_z) + b_f
        if trainmode and USE_DROPOUT:
            f_i = dropout(f_i, DROPOUT_RATE)

        logloss = log_softmax(f_i, valid_frames)

        if not trainmode:
            chosenframe = np.argmax(logloss.npvalue())

    if trainmode:
        chosenframe = goldframe.id

    losses = []
    if logloss is not None:
        losses.append(pick(logloss, chosenframe))

    prediction = {tidx: (lexunit, Frame(chosenframe)) for tidx in targetpositions}

    objective = -esum(losses) if losses else None

    return objective, prediction
def Train(self, trainData, options):
    mloss = 0.0
    eloss = 0.0
    eerrors = 0
    lerrors = 0
    etotal = 0
    ninf = -float('inf')

    beg = time.time()
    start = time.time()

    # in certain cases the data will already have been shuffled after being
    # read from file or while creating dev data
    random.shuffle(trainData)
    print "Length of training data: ", len(trainData)

    errs = []

    self.feature_extractor.Init(options)

    for iSentence, sentence in enumerate(trainData, 1):
        if iSentence % 100 == 0:
            loss_message = 'Processing sentence number: %d' % iSentence + \
                ' Loss: %.3f' % (eloss / etotal) + \
                ' Errors: %.3f' % ((float(eerrors)) / etotal) + \
                ' Labeled Errors: %.3f' % (float(lerrors) / etotal) + \
                ' Time: %.2gs' % (time.time() - start)
            print loss_message
            start = time.time()
            eerrors = 0
            eloss = 0.0
            etotal = 0
            lerrors = 0

        # ensures we are working with a clean copy of the sentence and allows
        # memory to be recycled each time round the loop
        sentence = deepcopy(sentence)

        conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
        conll_sentence = conll_sentence[1:] + [conll_sentence[0]]
        self.feature_extractor.getWordEmbeddings(conll_sentence, True, options)
        stack = ParseForest([])
        buf = ParseForest(conll_sentence)
        hoffset = 1 if self.headFlag else 0

        for root in conll_sentence:
            root.lstms = [root.vec] if self.headFlag else []
            root.lstms += [self.feature_extractor.paddingVec for _ in range(self.nnvecs - hoffset)]
            root.relation = root.relation if root.relation in self.irels else 'runk'

        while not (len(buf) == 1 and len(stack) == 0):
            scores = self.__evaluate(stack, buf, True)

            # to ensure that we have at least one wrong operation
            scores.append([(None, 4, ninf, None)])

            stack_ids = [sitem.id for sitem in stack.roots]

            s1 = [stack.roots[-2]] if len(stack) > 1 else []
            s0 = [stack.roots[-1]] if len(stack) > 0 else []
            b = [buf.roots[0]] if len(buf) > 0 else []
            beta = buf.roots[1:] if len(buf) > 1 else []

            costs, shift_case = self.calculate_cost(scores, s0, s1, b, beta, stack_ids)

            bestValid = list((s for s in chain(*scores)
                              if costs[s[1]] == 0 and (s[1] == SHIFT or s[1] == SWAP or s[0] == s0[0].relation)))
            bestValid = max(bestValid, key=itemgetter(2))

            bestWrong = max((s for s in chain(*scores)
                             if costs[s[1]] != 0 or (s[1] != SHIFT and s[1] != SWAP and s[0] != s0[0].relation)),
                            key=itemgetter(2))

            # force swap
            if costs[SWAP] == 0:
                best = bestValid
            else:
                # select a transition to follow
                # + aggressive exploration
                # 1: might want to experiment with that parameter
                if bestWrong[1] == SWAP:
                    best = bestValid
                else:
                    best = bestValid if ((not self.oracle) or (bestValid[2] - bestWrong[2] > 1.0) or
                                         (bestValid[2] > bestWrong[2] and random.random() > 0.1)) else bestWrong

            if best[1] == LEFT_ARC or best[1] == RIGHT_ARC:
                child = s0[0]

            # updates for the dynamic oracle
            if self.oracle:
                self.oracle_updates(best, b, s0, stack_ids, shift_case)

            self.apply_transition(best, stack, buf, hoffset)

            if bestValid[2] < bestWrong[2] + 1.0:
                loss = bestWrong[3] - bestValid[3]
                mloss += 1.0 + bestWrong[2] - bestValid[2]
                eloss += 1.0 + bestWrong[2] - bestValid[2]
                errs.append(loss)

            # labeled errors
            if best[1] == LEFT_ARC or best[1] == RIGHT_ARC:
                if (child.pred_parent_id != child.parent_id or child.pred_relation != child.relation):
                    lerrors += 1
                    # attachment error
                    if child.pred_parent_id != child.parent_id:
                        eerrors += 1  # ??? when did this happen and why?
            if best[1] == 0 or best[1] == 2:
                etotal += 1

        # footnote 8 in Eli's original paper
        if len(errs) > 50:  # or True:
            eerrs = dy.esum(errs)
            scalar_loss = eerrs.scalar_value()  # forward
            eerrs.backward()
            self.trainer.update()
            errs = []
            lerrs = []
            dy.renew_cg()
            self.feature_extractor.Init(options)

    if len(errs) > 0:
        eerrs = (dy.esum(errs))
        eerrs.scalar_value()
        eerrs.backward()
        self.trainer.update()
        errs = []
        lerrs = []
        dy.renew_cg()

    self.trainer.update()
    print "Loss: ", mloss / iSentence
    print "Total Training Time: %.2gs" % (time.time() - beg)
def train(self, train_data, dev_data, num_epochs=150, batch_size=10):
    for I in range(num_epochs):
        print("EPOCH NUMBER {}".format(I))
        avg_loss = 0.
        random.shuffle(train_data)
        good, bad = 0., 0.
        avg_edit_distance = 0.
        q = 0.
        losses = []
        preds = []

        for i, (x, y) in enumerate(train_data):
            if i % batch_size == 0 and i > 0:
                loss_sum = dy.esum(losses)
                loss_sum.forward()
                loss_sum.backward()
                self.trainer.update()
                losses = []

                # evaluate trainset accuracy
                for (word_probs, y_true) in preds:
                    generated_string = ""
                    for char_probs in word_probs:
                        generated_string += self.I2C[np.argmax(char_probs.npvalue())]
                    if generated_string == y_true:
                        good += 1
                    else:
                        bad += 1
                preds = []
                dy.renew_cg()

            encoded_state, encoded_x = self.encode(x, y, train=True)
            loss, probs = self.decode(encoded_state, y, encoded_x, train=True)
            preds.append((probs, y))
            losses.append(loss)

            if i % 2000 == 0 and i > 0:
                print(i)
                avg_loss = 0.

        print("training accuracy: {}".format(good / (good + bad)))
        acc, edit_dis = self.evaluate(dev_data)
        self.accs.append(acc)

        patience = 20
        if I > 8 and abs(min(self.accs[-patience:]) - max(self.accs[-patience:])) < 0.01:
            return 0
        if acc > self.best_acc:
            self.best_acc = acc
            self.model.save("preds-orto-no-diac-embs-cyclic.m")

    return 0
def train(self, train_file, epochs, validation_file):
    plot_on = True  # matplotlib config
    loss_values = []
    validation_data = pickle.load(open(validation_file, 'rb'))
    validation_accs, train_accs = [], []
    train_data_original = pickle.load(open(train_file, "rb"))

    for i in range(epochs):
        print('started epoch', (i + 1))
        losses = []
        train_data = pickle.load(open(train_file, "rb"))

        # shuffle the training data.
        random.shuffle(train_data)

        step = 0
        for fl in train_data:
            features, label = fl[:-1], fl[-1]
            gold_label = self.vocab.tag2id(label)
            result = self.build_graph(features)

            # getting loss with respect to negative log softmax function and the gold label
            loss = dynet.pickneglogsoftmax(result, gold_label)

            # appending to the minibatch losses
            losses.append(loss)
            step += 1

            if len(losses) >= self.properties.minibatch_size:
                # now we have enough loss values to get loss for minibatch
                minibatch_loss = dynet.esum(losses) / len(losses)

                # calling dynet to run forward computation for all minibatch items
                minibatch_loss.forward()

                # getting float value of the loss for current minibatch
                minibatch_loss_value = minibatch_loss.value()

                # printing info and plotting
                loss_values.append((len(loss_values), minibatch_loss_value))
                if len(loss_values) % 10 == 0:
                    progress = round(100 * float(step) / len(train_data), 2)
                    print('current minibatch loss', minibatch_loss_value, 'progress:', progress, '%')

                # calling dynet to run backpropagation
                minibatch_loss.backward()

                # calling dynet to change parameter values with respect to current backpropagation
                self.updater.update()

                # empty the loss vector
                losses = []

                # refresh the memory of dynet
                dynet.renew_cg()

                # get validation set accuracy
                if len(loss_values) % 100 == 0:
                    validation_accs.append((len(loss_values), self.calc_acc(validation_data)))
                    train_accs.append((len(loss_values), self.calc_acc(train_data_original)))

        # there are still some minibatch items in the memory but they are smaller than the minibatch size
        # so we ask dynet to forget them
        dynet.renew_cg()

    # return these values just for plotting
    return loss_values, validation_accs, train_accs
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.
    """

    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0:
        return 0.0

    if opts.debug:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    else:
        dy.renew_cg()

    # compute association score as dynet expression (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'
                           .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)

    # false targets likelihood - negative sampling (uniform)
    # collect negative samples
    if opts.nll:
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []
    neg_samples = [np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))]
    # remove source and true targets if applicable
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))
    for (i, ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'
                           .format(node, side, ns, ns_assoc_score.scalar_value()))
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))

    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum([dy.pickneglogsoftmax(dy.concatenate(scrs), 0) for scrs in sample_scores])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)
    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'.format(node, sc_loss))

    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": [], "file": daf["file"]}
    daf = daf["words"]

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate(
                [char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_pos_lstm_state = prev_pos_lstm.initial_state().add_input(pos_enc('*BOS*'))

    all_losses = []
    pos_prec = 0.0
    rough_pos_prec = 0.0
    pos_items = 0
    class_prec = 0.0
    class_items = 0.0
    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1

        # create the mlp input, a concatenation of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate([bilstm_output, prev_pos_lstm_state.output()])

        # run through the class mlp
        class_mlp_output = class_mlp(mlp_input)

        predicted_word_class = np.argmax(class_mlp_output.npvalue())
        confidence = np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue())

        # prec
        if should_backprop:
            class_prec += 1 if predicted_word_class == gold_word_class else 0
            class_items += 1

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            if should_backprop:
                all_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
            word_class_ans = gold_word_class
        # otherwise, set the answer to be the argmax
        else:
            word_class_ans = predicted_word_class

        # if the word_class answer is 1, do the pos!
        # alternatively, if validating and it's aramaic, do the pos!
        if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
            # run the pos mlp output
            pos_mlp_output = pos_mlp(mlp_input)
            try:
                temp_pos_array = pos_mlp_output.npvalue()
                possible_pos_array = np.zeros(temp_pos_array.shape)
                pos_list = pos_hashtable[word]
                # pos_list.add('')  # concat 'unknown' as possible pos
                possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
            except KeyError:
                possible_pos_array = pos_mlp_output.npvalue()
                # if fValidation:
                #     possible_pos_array[pos_vocab['']] = 0.0  # don't allow validation to guess UNK
                #     b/c it never trained against that. TODO this makes sense, right?
            predicted_word_pos = pos_vocab.getItem(np.argmax(possible_pos_array))
            confidence = np.max(possible_pos_array) / np.sum(possible_pos_array)

            # prec
            if should_backprop:
                pos_prec += 1 if predicted_word_pos == gold_word_pos else 0
                # you got at least the rough pos right
                rough_pos_prec += 1 if predicted_word_pos[0] == gold_word_pos[0] else 0
                pos_items += 1

            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop:
                    all_losses.append(-dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                word_pos_ans = gold_word_pos
            # otherwise, set the answer to be the argmax
            elif not fRunning and fValidation:
                if should_backprop:
                    pos_conf_matrix(pos_vocab[predicted_word_pos], pos_vocab[gold_word_pos])
                word_pos_ans = predicted_word_pos
            else:
                word_pos_ans = predicted_word_pos

            # run through the prev-pos lstm
            predicted = predicted_word_pos
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(word_pos_ans))
        # if the answer is 0, put a '' through the prev-pos lstm
        else:
            predicted = 'UNK'
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

        tagged_daf["words"].append({
            "word": word,
            "gold_pos": gold_word_pos,
            "gold_class": gold_word_class,
            "predicted": predicted,
            "confidence": confidence,
            "lang": gold_word_lang
        })

    if fRunning:
        return tagged_daf

    pos_prec = pos_prec / pos_items if pos_items > 0 else None
    rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
    class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return class_prec, pos_prec, tagged_daf, rough_pos_prec

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None

    return total_loss, class_prec, pos_prec, rough_pos_prec
def soft_average(buffer, word_weights):
    """soft attention"""
    return dy.esum([vector * attention_weight
                    for vector, attention_weight in zip(buffer, word_weights)])
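# Example use of soft_average: normalize per-position scores into attention
# weights, then take the weighted sum of the buffer vectors (scores and
# buffer are illustrative names).
word_weights = dy.softmax(dy.concatenate(scores))
context = soft_average(buffer, word_weights)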
else:
    with open(os.curdir + save_dir + 'parsed.txt', 'w') as f:
        pass

for i in ids:
    # Prepare a triple of the source word's character ids,
    # the target word's character ids and morphosyntactic features.
    d = data[step][i]
    triple = ([vocab._char_dict.x2i[c] for c in d[0]],
              [vocab._char_dict.x2i[c] for c in d[1]],
              [vocab._feat_dicts[idx].x2i[c] for idx, c in enumerate(d[2])])
    pred_word_indices, loss = mdl.run(triple, isTrain)
    losses.extend(loss)

    if isTrain:
        if len(losses) >= config.batch_size:
            sum_loss = dy.esum(losses)
            tot_loss += sum_loss.value()
            sum_loss.backward()
            mdl.update_parameters()
            mdl._global_step += 1
            losses = []
            dy.renew_cg()
    else:
        pred_word = ''.join([vocab._char_dict.i2x[c] for c in pred_word_indices[:-1]])
        if pred_word == d[1]:
            tot_cor += 1
        with open(os.curdir + save_dir + 'parsed.txt', 'a') as f:
            f.write(d[0] + '\t' + pred_word + '\n')
def calculate_scores_vector_for_list_of_words(word_idxs):
    dy.renew_cg()
    b = dy.parameter(mb)
    score_vector = dy.esum([dy.lookup(mW, x) for x in word_idxs])
    return b + score_vector
def pool(input_, _):
    # mean pooling over a list of expressions
    return dy.esum(input_) / len(input_)
def train(self, train_file, epochs):
    # matplotlib config
    loss_values = []
    plt.ion()
    ax = plt.gca()
    ax.set_xlim([0, 10])
    ax.set_ylim([0, 3])
    plt.title("Loss over time")
    plt.xlabel("Minibatch")
    plt.ylabel("Loss")

    for i in range(epochs):
        print('started epoch', (i + 1))
        losses = []
        train_data = open(train_file, 'r').read().strip().split('\n')

        # shuffle the training data.
        random.shuffle(train_data)

        step = 0
        for line in train_data:
            fields = line.strip().split()
            features, label = fields[:-1], fields[-1]
            gold_label = self.vocab.action2id(label)
            result = self.build_graph(features)

            # getting loss with respect to negative log softmax function and the gold label.
            loss = dynet.pickneglogsoftmax(result, gold_label)

            # appending to the minibatch losses
            losses.append(loss)
            step += 1

            if len(losses) >= self.properties.minibatch_size:
                # now we have enough loss values to get loss for minibatch
                minibatch_loss = dynet.esum(losses) / len(losses)

                # calling dynet to run forward computation for all minibatch items
                minibatch_loss.forward()

                # getting float value of the loss for current minibatch
                minibatch_loss_value = minibatch_loss.value()

                # printing info and plotting
                loss_values.append(minibatch_loss_value)
                if len(loss_values) % 10 == 0:
                    ax.set_xlim([0, len(loss_values) + 10])
                    ax.plot(loss_values)
                    plt.draw()
                    plt.pause(0.0001)
                    progress = round(100 * float(step) / len(train_data), 2)
                    print('current minibatch loss', minibatch_loss_value, 'progress:', progress, '%')

                # calling dynet to run backpropagation
                minibatch_loss.backward()

                # calling dynet to change parameter values with respect to current backpropagation
                self.updater.update()

                # empty the loss vector
                losses = []

                # refresh the memory of dynet
                dynet.renew_cg()

        # there are still some minibatch items in the memory but they are smaller than the minibatch size
        # so we ask dynet to forget them
        dynet.renew_cg()
def _trans_loss(self, superv_acts, superv_terms, buffer, stack_tail, act_tail):
    stack = []
    loss_lst = []
    reduction_flag = False
    reducable_flag = False
    buf_emb = None

    while not (len(stack) == 1 and reduction_flag):
        reduction_flag = False
        act_choices = self._legal_acts(stack, reducable_flag)
        w_weights = None
        act = self._act_dict[superv_acts.pop(0)]

        # Accumulate loss in action prediction
        if len(stack) > 0 and act_choices[0] != self._ACT_RED:
            stack_emb = stack[-1][0].output()
            act_emb = act_tail.output()
            w_weights = self._atten(stack_emb, buffer)
            buf_emb, _ = nnunits.attention_output(buffer, w_weights, 'soft_average')

            for i in xrange(len(stack)):
                re_idx = len(stack) - 1 - i
                if stack[re_idx][1] == 'nl':
                    # Find the raw embedding of the root of the subtree for the leaves.
                    nl_emb = stack[re_idx][2]
                    break

            trans_state = dy.concatenate([buf_emb, stack_emb, nl_emb, act_emb])
            out = self._mlp_layer(trans_state)
            if self._dropout > 0:
                out = dy.dropout(out, self._dropout)

            if len(act_choices):
                log_probs_act = dy.log_softmax(self._act_pred_layer(out), act_choices)
                assert act in act_choices, 'illegal action'
                loss_lst.append(-dy.pick(log_probs_act, act))

        act_emb = self._act_in_layer(self._lookup_act[act])
        act_tail = act_tail.add_input(act_emb)

        # Accumulate loss in term prediction
        if act == self._ACT_NT:
            idx_nt = self._nt_dict[superv_terms.pop(0)]
            if w_weights is not None:
                buf_emb, _ = nnunits.attention_output(buffer, w_weights, 'soft_average')
                log_probs_nt = dy.log_softmax(self._nt_pred_layer(buf_emb))
                loss_lst.append(-dy.pick(log_probs_nt, idx_nt))

            stack_state, label, _ = stack[-1] if stack else (stack_tail, 'ROOT', stack_tail)
            # Here it is called 'raw embedding'
            nt_emb = self._nt_in_layer(self._lookup_nt[idx_nt])
            stack_state = stack_state.add_input(nt_emb)
            # 'nl' label represents the non-leaf nodes
            stack.append((stack_state, 'nl', nt_emb))

        elif act in self._ACT_NT_dg:
            # There are no terms (operands) for this action
            idx_nt = self._nt_dict[superv_terms.pop(0)]
            stack_state, label, _ = stack[-1] if stack else (stack_tail, 'ROOT', stack_tail)
            nt_emb = self._nt_in_layer(self._lookup_nt[idx_nt])
            stack_state = stack_state.add_input(nt_emb)
            stack.append((stack_state, 'nl', nt_emb))

        elif act == self._ACT_TER:
            idx_ter = self._ter_dict[superv_terms.pop(0)]
            if buf_emb is not None:
                log_probs_ter = dy.log_softmax(self._ter_pred_layer(buf_emb))
                loss_lst.append(-dy.pick(log_probs_ter, idx_ter))

            stack_state, label, _ = stack[-1] if stack else (stack_tail, 'ROOT', stack_tail)
            ter_emb = self._nt_in_layer(self._lookup_ter[idx_ter])
            stack_state = stack_state.add_input(ter_emb)
            # 'l' label represents the leaf nodes
            stack.append((stack_state, 'l', ter_emb))

        else:
            leaf_raw_reps = []
            while stack[-1][1] == 'l':
                top = stack.pop()
                rep, _, raw_rep = top
                leaf_raw_reps.append(raw_rep)
            nl_raw_rep = stack.pop()[2]
            subtree_rep = self._red_in_layer(dy.concatenate([dy.average(leaf_raw_reps), nl_raw_rep]))
            # Append the new reduced node
            stack_state, _, _ = stack[-1] if stack else (stack_tail, 'ROOT', stack_tail)
            stack_state = stack_state.add_input(subtree_rep)
            stack.append((stack_state, 'l', subtree_rep))
            reduction_flag = True

        reducable_flag = True if stack[-1][1] != 'nl' else False

    return dy.esum(loss_lst)
def train(self, conll_path):
    # pylint: disable=invalid-name
    # pylint: disable=missing-docstring
    eloss = 0.0
    mloss = 0.0
    eerrors = 0
    etotal = 0
    start = time.time()

    shuffled_data = list(read_conll(conll_path))
    random.shuffle(shuffled_data)
    errs = []
    lerrs = []

    i_sentence = 0
    for sentence in shuffled_data:
        if i_sentence % 100 == 0 and i_sentence != 0:
            print('Processing sentence number:', i_sentence,
                  'Loss:', eloss / etotal,
                  'Errors:', (float(eerrors)) / etotal,
                  'Time', time.time() - start)
            start = time.time()
            eerrors = 0
            eloss = 0.0
            etotal = 0

        conll_sentence = [entry for entry in sentence if isinstance(entry, ConllEntry)]

        for entry in conll_sentence:
            c = float(self.words_count.get(entry.norm, 0))
            drop_flag = (random.random() < (c / (0.25 + c)))
            wordvec = self.wlookup[int(self.vocab.get(entry.norm, 0)) if drop_flag else 0] \
                if self.wdims > 0 else None
            posvec = self.plookup[int(self.pos[entry.pos])] if self.pdims > 0 else None
            entry.vec = concatenate([_f for _f in [wordvec, posvec, None] if _f])

            entry.lstms = [entry.vec, entry.vec]
            entry.headfov = None
            entry.modfov = None
            entry.rheadfov = None
            entry.rmodfov = None

        if self.blstm_flag:
            lstm_forward = self.builders[0].initial_state()
            lstm_backward = self.builders[1].initial_state()

            for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                lstm_forward = lstm_forward.add_input(entry.vec)
                lstm_backward = lstm_backward.add_input(rentry.vec)

                entry.lstms[1] = lstm_forward.output()
                rentry.lstms[0] = lstm_backward.output()

            if self.bibi_flag:
                for entry in conll_sentence:
                    entry.vec = concatenate(entry.lstms)

                blstm_forward = self.bbuilders[0].initial_state()
                blstm_backward = self.bbuilders[1].initial_state()

                for entry, rentry in zip(conll_sentence, reversed(conll_sentence)):
                    blstm_forward = blstm_forward.add_input(entry.vec)
                    blstm_backward = blstm_backward.add_input(rentry.vec)

                    entry.lstms[1] = blstm_forward.output()
                    rentry.lstms[0] = blstm_backward.output()

        scores, exprs = self._evaluate(conll_sentence)
        gold = [entry.parent_id for entry in conll_sentence]
        heads = decoder.parse_proj(scores, gold if self.costaug_flag else None)

        if self.labels_flag:
            for modifier, head in enumerate(gold[1:]):
                rscores, rexprs = self._evaluate_label(conll_sentence, head, modifier + 1)
                gold_label_ind = self.rels[conll_sentence[modifier + 1].relation]
                wrong_label_ind = max(((label, scr) for label, scr in enumerate(rscores)
                                       if label != gold_label_ind), key=itemgetter(1))[0]
                if rscores[gold_label_ind] < rscores[wrong_label_ind] + 1:
                    lerrs.append(rexprs[wrong_label_ind] - rexprs[gold_label_ind])

        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        eerrors += e
        if e > 0:
            loss = [(exprs[h][i] - exprs[g][i])
                    for i, (h, g) in enumerate(zip(heads, gold)) if h != g]  # * (1.0/float(e))
            eloss += e
            mloss += e
            errs.extend(loss)

        etotal += len(conll_sentence)

        if i_sentence % 1 == 0 or errs or lerrs:
            if errs or lerrs:
                eerrs = esum(errs + lerrs)  # * (1.0/(float(len(errs))))
                eerrs.scalar_value()
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []

            renew_cg()

        i_sentence += 1

    if errs:
        eerrs = esum(errs + lerrs)  # * (1.0/(float(len(errs))))
        eerrs.scalar_value()
        eerrs.backward()
        self.trainer.update()
        renew_cg()

    self.trainer.update()
    print("Loss: ", mloss / i_sentence)
def get_score_for_hist(hist):
    scores = [dy.parameter(b_m)]
    for h_, W_ in zip(hist, W_m):
        scores.append(dy.lookup(W_, h_))
    return dy.esum(scores)
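# Assumed parameter setup for get_score_for_hist: one lookup table of
# per-word score vectors for each of the N history positions, plus a bias
# (model and N are illustrative names).
W_m = [model.add_lookup_parameters((nwords, nwords)) for _ in range(N)]
b_m = model.add_parameters(nwords)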
def calc_scores(words):
    dy.renew_cg()
    h = dy.esum([dy.lookup(W_emb, x) for x in words])
    for W_h_i, b_h_i in zip(W_h, b_h):
        h = dy.tanh(W_h_i * h + b_h_i)
    return W_sm * h + b_sm
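# Assumed parameter setup for the deep CBOW scorer above; the layer sizes and
# names (EMB_SIZE, HID_SIZE, NLAYERS) are illustrative.
model = dy.ParameterCollection()
W_emb = model.add_lookup_parameters((nwords, EMB_SIZE))  # word embeddings
W_h = [model.add_parameters((HID_SIZE, EMB_SIZE if lay == 0 else HID_SIZE)) for lay in range(NLAYERS)]
b_h = [model.add_parameters(HID_SIZE) for lay in range(NLAYERS)]
W_sm = model.add_parameters((ntags, HID_SIZE))           # softmax weights
b_sm = model.add_parameters(ntags)                       # softmax bias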
good = 0.0
bad = 0.0
errors = []

# batching
for i, (s1, s2, label) in enumerate(train.data):
    prob = model(s1, s2)
    softmax = dy.softmax(prob).npvalue()
    pred = np.argmax(softmax)
    error = dy.pickneglogsoftmax(prob, label)
    errors.append(error)
    if pred == label:
        good += 1
    else:
        bad += 1

    if i % batch_size == 0 and i > 0:
        sum_errors = dy.esum(errors)
        loss += sum_errors.value()
        sum_errors.backward()
        model.trainer.update()
        checked += batch_size
        dy.renew_cg()
        errors = []

    if i % (batch_size * 5) == 0 and i > 0:
        avgLoss = loss / checked
        losses.append(avgLoss)
        print "-" * 20
        print "Time: " + str(passed_time(start_time))
        print "Epoch: " + str(epoch + 1) + ", Iteration: " + str(i) + " Average loss: " + str(avgLoss)
        loss = 0
        checked = 0
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": [], "file": daf["file"]}
    daf = daf["words"]

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    s_0 = prev_pos_lstm.initial_state()
    # each hypothesis: seq, prob, lstm_state, losses, class_prec, class_items,
    # pos_prec, rough_pos_prec, pos_items, confidences
    beam = [(['*BOS*'], 1.0, s_0, [], 0.0, 0.0, 0.0, 0.0, 0.0, [])]
    i = 0
    for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1
        new_hypos = []
        for hypo in beam:
            seq, hyp_prob, hyp_state, losses, class_prec, class_items, pos_prec, rough_pos_prec, pos_items, confidences = hypo
            new_seq = seq[:]
            new_losses = losses[:]
            new_confidences = confidences[:]

            last_pos = seq[-1]
            next_hyp_state = hyp_state.add_input(pos_enc(last_pos))
            # create the mlp input, a concatenation of the bilstm output and of the prev pos output
            mlp_input = dy.concatenate([bilstm_output, next_hyp_state.output()])

            # run through the class mlp
            class_mlp_output = class_mlp(mlp_input)

            predicted_word_class = np.argmax(class_mlp_output.npvalue())
            new_confidences.append(np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue()))

            # prec
            if should_backprop:
                class_prec += 1 if predicted_word_class == gold_word_class else 0
                class_items += 1

            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop:
                    new_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
                word_class_ans = gold_word_class
            # otherwise, set the answer to be the argmax
            else:
                word_class_ans = predicted_word_class

            # if the word_class answer is 1, do the pos!
            # alternatively, if validating and it's aramaic, do the pos!
            if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
                # run the pos mlp output
                pos_mlp_output = pos_mlp(mlp_input)
                try:
                    temp_pos_array = pos_mlp_output.npvalue()
                    possible_pos_array = np.zeros(temp_pos_array.shape)
                    pos_list = pos_hashtable[word]
                    # pos_list.add('')  # concat 'unknown' as possible pos
                    possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                    possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
                except KeyError:
                    possible_pos_array = pos_mlp_output.npvalue()
                    # if fValidation:
                    #     possible_pos_array[pos_vocab['']] = 0.0  # don't allow validation to guess UNK
                    #     b/c it never trained against that. TODO this makes sense, right?
poss_pos_sum = np.sum(possible_pos_array) for iprob, prob in enumerate(possible_pos_array): new_seq = seq[:] temp_picked_pos = pos_vocab.getItem(iprob) temp_confidence = possible_pos_array[iprob] / poss_pos_sum new_confidences[-1] = temp_confidence # overwrite class confidence new_pos_prec = pos_prec new_pos_items = pos_items new_rough_pos_prec = rough_pos_prec if should_backprop: new_pos_prec += 1 if temp_picked_pos == gold_word_pos else 0 new_rough_pos_prec += 1 if len(temp_picked_pos) > 0 and temp_picked_pos[0] == gold_word_pos[ 0] else 0 # you got at least the rough pos right new_pos_items += 1 if not fValidation and not fRunning: if should_backprop: new_losses.append( -dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos]))) new_seq += [temp_picked_pos] new_prob = hyp_prob + math.log(prob) if prob != 0 else hyp_prob + math.log(1E-10) # which is log(0.00000001) or something like that new_hypos += [(new_seq, new_prob, next_hyp_state, new_losses, class_prec, class_items, new_pos_prec, new_rough_pos_prec, new_pos_items, new_confidences)] else: # assume prob is 1. It's really good at predicting hebrew / aramaic new_seq = seq[:] new_seq += [''] new_prob = hyp_prob new_hypos += [(new_seq, new_prob, next_hyp_state, new_losses, class_prec, class_items, pos_prec, rough_pos_prec, pos_items, new_confidences)] # pick the best hypos new_probs = [p for (s, p, r, l, cp, ci, pp, rpp, pi, c) in new_hypos] argmax_indices = util.argmax(new_probs, n=beam_width) if type(argmax_indices) == int: argmax_indices = [argmax_indices] beam = [new_hypos[l] for l in argmax_indices] i += 1 correct_answer_in_beam = False for max_ind in argmax_indices: if new_hypos[max_ind][0][-1] == gold_word_pos: correct_answer_in_beam = True break if not correct_answer_in_beam and not fValidation and not fRunning and with_early_stop: # early stop break final_probs = [p for (s, p, r, l, cp, ci, pp, rpp, pi, c) in beam] argmax_index = util.argmax(final_probs) final_seq, prob, lstm_state, all_losses, class_prec, class_items, pos_prec, rough_pos_prec, pos_items, confidences = beam[argmax_index] for (word, gold_word_class, gold_word_pos, gold_word_lang), pred, conf in zip(daf, final_seq[1:], confidences): # VERY IMPORTANT. final_seq is off-by-one b/c we inited it with BOS tagged_daf['words'].append({"word":word,"gold_pos":gold_word_pos,"gold_class":gold_word_class,"predicted":pred,"confidence":conf,"lang":gold_word_lang}) should_backprop = gold_word_class == 1 if should_backprop: pos_conf_matrix(pos_vocab[pred], pos_vocab[gold_word_pos]) if fRunning: return tagged_daf pos_prec = pos_prec / pos_items if pos_items > 0 else None rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None class_prec = class_prec / class_items if class_items > 0 else None if fValidation: return class_prec, pos_prec,tagged_daf, rough_pos_prec total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None return total_loss, class_prec, pos_prec, rough_pos_prec
        prev_word = next_word
    softmax_loss = dy.esum(all_losses)
    return kl_loss, softmax_loss


for ITER in range(100):
    # Perform training
    random.shuffle(train)
    train_words, train_loss, train_kl_loss, train_reconstruct_loss = 0, 0.0, 0.0, 0.0
    start = time.time()
    for sent_id, sent in enumerate(train):
        kl_loss, softmax_loss = calc_loss(sent)
        total_loss = dy.esum([kl_loss, softmax_loss])
        train_loss += total_loss.value()
        # Recording the KL loss and the reconstruction loss separately helps you monitor training.
        train_kl_loss += kl_loss.value()
        train_reconstruct_loss += softmax_loss.value()
        train_words += len(sent)
        total_loss.backward()
        trainer.update()
        if (sent_id + 1) % 1000 == 0:
            print("--finished %r sentences" % (sent_id + 1))
    print("iter %r: train loss/word=%.4f, kl loss/word=%.4f, reconstruction loss/word=%.4f, ppl=%.4f, time=%.2fs" % (
        ITER, train_loss / train_words, train_kl_loss / train_words, train_reconstruct_loss / train_words,
        math.exp(train_loss / train_words), time.time() - start))
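Note that dy.esum over a two-element list, as above, is just expression addition that keeps both terms available for separate monitoring. A trivial hedged illustration with stand-in scalar expressions (the VAE model itself is not shown here):

import dynet as dy

dy.renew_cg()
# stand-ins for the two loss terms; in the real model these come from the network
kl_loss = dy.scalarInput(0.3)
reconstruction_loss = dy.scalarInput(1.7)

total_loss = dy.esum([kl_loss, reconstruction_loss])  # same as kl_loss + reconstruction_loss
print(total_loss.value())  # 2.0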
def train_item(args, model, document):
    loss = None
    word_lookups = []
    for preprocessed_sentence in document.preprocessed_sentences:
        seq = [model.wlookup[int(model.w2i.get(entry, 0))] for entry in preprocessed_sentence]
        if len(seq) > 0:
            word_lookups.append(seq)

    sentences_lookups = []
    for seq in word_lookups:
        sentence_encode = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(sentence_encode)
        global_min = average_pooling(sentence_encode)
        if len(sentence_encode) > 0:
            att_mlp_outputs = []
            for e in sentence_encode:
                mlp_out = (model.word_attention_w * e) + model.word_attention_b
                att_mlp_outputs.append(mlp_out)
            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.word_att_context))))
            sum_all = dy.esum(lst)
            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum([dy.cmult(p, h) for p, h in zip(probs, sentence_encode)])
            context = dy.concatenate([att_context, global_max, global_min])
            sentences_lookups.append(context)

    document_encode = encode_sequence(model, sentences_lookups, model.document_rnn)
    global_max = max_pooling(document_encode)
    global_min = average_pooling(document_encode)
    if len(document_encode) > 0:
        att_mlp_outputs = []
        for e in document_encode:
            mlp_out = (model.sentence_attention_w * e) + model.sentence_attention_b
            att_mlp_outputs.append(mlp_out)
        lst = []
        for o in att_mlp_outputs:
            lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.sentence_att_context))))
        sum_all = dy.esum(lst)
        probs = [dy.cdiv(e, sum_all) for e in lst]
        att_context = dy.esum([dy.cmult(p, h) for p, h in zip(probs, document_encode)])
        context = dy.concatenate([att_context, global_max, global_min])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
        if document.permissions[args.permission_type]:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(1))
        else:
            loss = dy.binary_log_loss(y_pred, dy.scalarInput(0))
        loss.backward()
        model.trainer.update()
        loss_val = loss.scalar_value()
        dy.renew_cg()
        return loss_val
    return 0
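The word- and sentence-level attention above is a softmax computed by hand: score each hidden state against a learned context vector, exponentiate, normalize by the esum of the exponentials, and take the probability-weighted sum. A minimal sketch of that pattern in isolation; the dimensions and parameter names here are illustrative assumptions:

import dynet as dy

pc = dy.ParameterCollection()
HID = 8
att_w = pc.add_parameters((HID, HID))    # attention MLP weight (hypothetical size)
att_b = pc.add_parameters((HID,))
att_context = pc.add_parameters((HID,))  # learned context vector

dy.renew_cg()
w, b, u = dy.parameter(att_w), dy.parameter(att_b), dy.parameter(att_context)
hidden_states = [dy.inputVector([0.1] * HID) for _ in range(5)]  # stand-in encoder outputs

scores = [dy.exp(dy.sum_elems(dy.cmult(w * h + b, u))) for h in hidden_states]
total = dy.esum(scores)                        # normalizer for the hand-rolled softmax
probs = [dy.cdiv(s, total) for s in scores]
pooled = dy.esum([dy.cmult(p, h) for p, h in zip(probs, hidden_states)])
print(pooled.npvalue().shape)                  # (8,): one attention-weighted vector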
def calc_loss(self, policy_reward, results={}):
    """Calculate the policy network's loss."""
    assert len(policy_reward) == len(self.states), "There should be a reward for every action taken"
    batch_size = self.states[0].dim()[1]
    loss = {}

    # For each timestep, predict a baseline and subtract it from the reward:
    #   b = W_b * s + b_b
    #   R = r - b
    # Also calculate the baseline's own loss:
    #   loss_b = squared_distance(r_p, r_r)
    rewards = []
    baseline_loss = []
    units = np.zeros(batch_size)
    for i, state in enumerate(self.states):
        r_p = self.baseline.transform(dy.nobackprop(state))
        rewards.append(policy_reward[i] - r_p)
        if self.valid_pos[i] is not None:
            r_p = dy.pick_batch_elems(r_p, self.valid_pos[i])
            r_r = dy.pick_batch_elems(policy_reward[i], self.valid_pos[i])
            units[self.valid_pos[i]] += 1
        else:
            r_r = policy_reward[i]
            units += 1
        baseline_loss.append(dy.sum_batches(dy.squared_distance(r_p, r_r)))
    loss["rl_baseline"] = losses.LossExpr(dy.esum(baseline_loss), units)

    # Z-normalization:
    #   R = (R - mean(R)) / std(R)
    rewards = dy.concatenate(rewards, d=0)
    r_dim = rewards.dim()
    if self.z_normalization:
        rewards_shape = dy.reshape(rewards, (r_dim[0][0], r_dim[1]))
        rewards_mean = dy.mean_elems(rewards_shape)
        rewards_std = dy.std_elems(rewards_shape) + 1e-20
        rewards = (rewards - rewards_mean.value()) / rewards_std.value()
    rewards = dy.nobackprop(rewards)

    # Calculate the confidence penalty
    if self.confidence_penalty:
        loss["rl_confpen"] = self.confidence_penalty.calc_loss(self.policy_lls)

    # Calculate the REINFORCE loss:
    #   L = -sum([R - b] * pi_ll)
    reinf_loss = []
    units = np.zeros(batch_size)
    for i, (policy, action) in enumerate(zip(self.policy_lls, self.actions)):
        reward = dy.pick(rewards, i)
        ll = dy.pick_batch(policy, action)
        if self.valid_pos[i] is not None:
            ll = dy.pick_batch_elems(ll, self.valid_pos[i])
            reward = dy.pick_batch_elems(reward, self.valid_pos[i])
            units[self.valid_pos[i]] += 1
        else:
            units += 1
        reinf_loss.append(dy.sum_batches(dy.cmult(ll, reward)))
    loss["rl_reinf"] = losses.LossExpr(-dy.esum(reinf_loss), units)

    # Pack up and return
    return losses.FactoredLossExpr(loss)
def train(self, examples):
    # Train action classifier:
    classifier_data = []
    for example in examples:
        encoder_input = example[0]
        ground_labels = example[1]
        cdata = (encoder_input[0], ground_labels[0], ground_labels[1])
        classifier_data.append(cdata)
    self.classifier.train(classifier_data)

    # Train cluster model:
    # num_examples = len(examples)
    num_examples = 100
    trainer = dy.AdamTrainer(self.params)
    for epoch in range(self.num_epochs):
        batch_loss = []
        loss_sum = 0
        for idx in range(num_examples):
            # if (idx % 1000 == 0):
            #     print("(Clusters) Epoch: {} | Example: {} | Loss sum: {}".format(epoch, idx, loss_sum))
            loss = self.train_example(examples[idx])
            batch_loss.append(loss)
            # Minibatching:
            if (idx % self.minibatch == 0) or (idx + 1 == num_examples):
                batch_loss = dy.esum(batch_loss)
                loss_sum += batch_loss.value()
                batch_loss.backward()
                batch_loss = []
                trainer.update()
                dy.renew_cg()
        print("(Clusters) Epoch: {} | Loss: {}".format(epoch + 1, loss_sum))

    # Expectation maximization:
    em_time = time.time()
    self.e_time = 0
    self.m_time = 0
    self.back_time = 0
    self.papx_time = 0
    self.pa_time = 0
    self.px_time = 0
    zs = {}
    # Initialize one-hot z's:
    self.onehotzs = []
    for idx in range(self.num_clusters):
        one_hot_z = np.zeros(self.num_clusters)
        one_hot_z[idx] = 1
        one_hot_z = dy.inputVector(one_hot_z)
        self.onehotzs.append(one_hot_z)
    for epoch in range(10):
        batch_loss = []
        loss_sum = 0
        for idx in range(num_examples):
            # if (idx % 100 == 0):
            #     print("(EM) Epoch: {} | Example: {}".format(epoch, idx))
            if len(examples[idx][1][1]) == 3:  # Agreement occurs
                loss, z_list = self.em_example(examples[idx])
                zs[idx] = z_list
                batch_loss.append(loss)
            else:
                zs[idx] = []
            # Minibatching. Note the parentheses: only flush when the batch is
            # non-empty, otherwise dy.esum([]) would fail.
            if ((idx % self.minibatch == 0) or (idx + 1 == num_examples)) and (batch_loss != []):
                batch_loss = dy.esum(batch_loss)
                loss_sum += batch_loss.value()
                back_time = time.time()
                batch_loss.backward()
                batch_loss = []
                trainer.update()
                dy.renew_cg()
                # re-create the one-hot z expressions on the fresh graph
                self.onehotzs = []
                for z_idx in range(self.num_clusters):
                    one_hot_z = np.zeros(self.num_clusters)
                    one_hot_z[z_idx] = 1
                    one_hot_z = dy.inputVector(one_hot_z)
                    self.onehotzs.append(one_hot_z)
                self.back_time += (time.time() - back_time)
        print("(EM) Epoch: {} | Loss: {}".format(epoch + 1, loss_sum))
    # print("EM time: {}".format(time.time() - em_time))
    # print("E time: {}".format(self.e_time))
    # print("M time: {}".format(self.m_time))
    # print("Backprop time: {}".format(self.back_time))
    # print("PAPX time: {}".format(self.papx_time))
    # print("PA time: {}".format(self.pa_time))
    # print("PX time: {}".format(self.px_time))

    # Print zs to file:
    with open("data/clusters/clusters.txt", 'w') as f:
        for idx in range(num_examples):
            f.write(str(zs[idx]))
            f.write('\n')
def calc_loss(scores, tags):
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    return dy.esum(losses)
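A quick usage sketch for this helper. The scores below are stand-in input vectors rather than the output of a trained tagger:

import dynet as dy

dy.renew_cg()
# three time steps, four candidate tags each (stand-in scores)
scores = [dy.inputVector([0.1, 2.0, -1.0, 0.3]) for _ in range(3)]
tags = [1, 0, 3]  # gold tag index per time step

loss = calc_loss(scores, tags)  # sum of per-step negative log-likelihoods
print(loss.value())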
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    # get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()])
                   for x, y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    # get the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # now decode
    all_losses = []

    # Decoder: need to mask padding at the end of each sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]  # lengths of the *target* sentences
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous words into the decoder LSTM
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
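The masking trick above multiplies each batched per-step loss by a 0/1 row vector so padded positions contribute nothing to the total. A minimal hedged sketch of just that step, assuming a batch of three sequences and random stand-in scores:

import dynet as dy
import numpy as np

dy.renew_cg()
batch_size = 3
vocab = 5
# one decoding step: a score vector per batch element (stand-in values)
s = dy.inputTensor(np.random.rand(vocab, batch_size), batched=True)
next_words = [2, 4, 0]  # gold next word per batch element
mask = [1, 1, 0]        # the third sentence is already past its end

loss = dy.pickneglogsoftmax_batch(s, next_words)          # ((1,), batch_size)
mask_expr = dy.reshape(dy.inputVector(mask), (1,), batch_size)
masked = loss * mask_expr                                 # zero out the padded position
print(dy.sum_batches(masked).value())                     # total loss over real tokens only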
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": [], "file": daf["file"]}
    daf = daf["words"]

    # add a BOS before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)

    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a BOS, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # sanity check: make sure the word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_pos_lstm_state = prev_pos_lstm.initial_state().add_input(pos_enc('*BOS*'))

    all_losses = []
    pos_prec = 0.0
    rough_pos_prec = 0.0
    pos_items = 0
    class_prec = 0.0
    class_items = 0.0
    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1
        # create the mlp input, a concatenation of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate([bilstm_output, prev_pos_lstm_state.output()])
        # run through the class mlp
        class_mlp_output = class_mlp(mlp_input)
        predicted_word_class = np.argmax(class_mlp_output.npvalue())
        confidence = np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue())
        # precision
        if should_backprop:
            class_prec += 1 if predicted_word_class == gold_word_class else 0
            class_items += 1
        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            if should_backprop:
                all_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
            word_class_ans = gold_word_class
        # otherwise, set the answer to be the argmax
        else:
            word_class_ans = predicted_word_class
        # if the word_class answer is 1, do the pos!
        # alternatively, if validating and it's Aramaic, do the pos!
        if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
            # run the pos mlp output
            pos_mlp_output = pos_mlp(mlp_input)
            try:
                temp_pos_array = pos_mlp_output.npvalue()
                possible_pos_array = np.zeros(temp_pos_array.shape)
                pos_list = pos_hashtable[word]
                # pos_list.add('')  # concat 'unknown' as possible pos
                possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
            except KeyError:
                possible_pos_array = pos_mlp_output.npvalue()
            # if fValidation:
            #     possible_pos_array[pos_vocab['']] = 0.0  # don't allow validation to guess UNK b/c it never trained against that

            predicted_word_pos = pos_vocab.getItem(np.argmax(possible_pos_array))
            confidence = np.max(possible_pos_array) / np.sum(possible_pos_array)
            # precision
            if should_backprop:
                pos_prec += 1 if predicted_word_pos == gold_word_pos else 0
                # you got at least the rough pos right
                rough_pos_prec += 1 if predicted_word_pos[0] == gold_word_pos[0] else 0
                pos_items += 1
            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop:
                    all_losses.append(-dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                word_pos_ans = gold_word_pos
            # otherwise, set the answer to be the argmax
            elif not fRunning and fValidation:
                if should_backprop:
                    pos_conf_matrix(pos_vocab[predicted_word_pos], pos_vocab[gold_word_pos])
                word_pos_ans = predicted_word_pos
            else:
                word_pos_ans = predicted_word_pos
            # run through the prev-pos-mlp
            predicted = predicted_word_pos
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(word_pos_ans))
        # if the answer is 0, put a '' through the prev-pos lstm
        else:
            predicted = 'UNK'
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

        tagged_daf["words"].append({"word": word, "gold_pos": gold_word_pos, "gold_class": gold_word_class,
                                    "predicted": predicted, "confidence": confidence, "lang": gold_word_lang})

    if fRunning:
        return tagged_daf

    pos_prec = pos_prec / pos_items if pos_items > 0 else None
    rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
    class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return class_prec, pos_prec, tagged_daf, rough_pos_prec

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, class_prec, pos_prec, rough_pos_prec
def train_beam_graph(e, beam_size, traj_type, loss_fn):
    dy.renew_cg()
    tags = e[tk_tags_key]
    m = model_init(e)
    beam_costs_prev = np.array([0], dtype="int")
    beam_costs = []
    losses = []
    for i in xrange(len(e["tk_words"])):
        scores = model_scores(m)
        # transition
        scores_np = scores.npvalue()
        beam_indices, tag_indices = beam_argtopk(scores_np, beam_size)
        beam_costs_cur = beam_costs_prev[beam_indices] + (tag_indices != tags[i]).astype('int')

        # compute the loss only if the cost accumulates, or always
        next_beam_size = beam_size if i < len(e["tk_words"]) - 1 else 1
        if (not cfg["update_only_on_cost_increase"]) or (
                cfg["update_only_on_cost_increase"] and
                beam_costs_prev.min() < beam_costs_cur[:next_beam_size].min()):
            loss = loss_fn(tags, i, beam_costs_prev, scores, beam_size)
            losses.append(loss)

        if traj_type == "stop":
            if beam_costs_cur.min() > 0:
                break
        elif traj_type == "continue":
            pass
        elif traj_type == "reset":
            if beam_costs_cur.min() > 0:
                b_gold_idx = beam_costs_prev.argmin()
                beam_indices = np.array([b_gold_idx], dtype='int')
                tag_indices = np.array([tags[i]], dtype='int')
                beam_costs_cur = np.array([0], dtype='int')
        elif traj_type == "reset_multiple":
            # NOTE: this is similar to the reset option; replace the last element
            # in the beam with the correct one.
            if beam_costs_cur.min() > 0:
                b_gold_idx = beam_costs_prev.argmin()
                beam_indices[-1] = b_gold_idx
                tag_indices[-1] = tags[i]
                beam_costs_cur[-1] = beam_costs_prev[b_gold_idx]  # this should be zero
                # assert beam_costs_prev[-1] == 0
        # NOTE: there is probably a less repetitive way of doing this.
        elif traj_type == "oracle":
            t_idx = tags[i]
            beam_size_prev = beam_costs_prev.shape[0]
            costs = beam_costs_prev.reshape((beam_size_prev, 1)) * np.ones((1, num_tags))
            costs += 1.0
            costs[:, t_idx] -= 1.0
            beam_indices, tag_indices = beam_argtopk(-costs, beam_size)
            beam_costs_cur = beam_costs_prev[beam_indices] + (tag_indices != tags[i]).astype('int')
        else:
            raise ValueError

        beam_costs.append(beam_costs_cur)
        beam_costs_prev = beam_costs_cur
        model_step(m, beam_indices, tag_indices)

    if len(losses) > 0:
        return dy.esum(losses)
    else:
        return dy.zeros(1)
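The helper beam_argtopk is not shown in this excerpt. A plausible numpy sketch of what it could return, assuming scores arrive as a (beam_size, num_tags) matrix and the caller wants the beam row and tag column of the k best entries; this is a guess at the unshown helper, not the project's actual code:

import numpy as np

def beam_argtopk(scores, k):
    # scores: (beam_size, num_tags); return beam/tag indices of the k largest entries
    flat = np.argsort(scores, axis=None)[::-1][:k]
    beam_indices, tag_indices = np.unravel_index(flat, scores.shape)
    return beam_indices, tag_indices

scores = np.array([[0.1, 2.0], [1.5, -0.2]])
print(beam_argtopk(scores, 2))  # rows and columns of the two best scores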
def on_calc_additional_loss(self, translator_loss):
    if not self.learn_segmentation or self.segment_decisions is None:
        return None
    reward = -translator_loss["mle"]
    if not self.log_reward:
        reward = dy.exp(reward)
    reward = dy.nobackprop(reward)

    # Make sure that the reward is not a scalar, but rather has one value per batch item
    assert reward.dim()[1] == len(self.src_sent)

    # Mask
    enc_mask = self.enc_mask.get_active_one_mask().transpose() if self.enc_mask is not None else None

    # Compose the loss
    ret = LossBuilder()

    ## Length prior
    alpha = self.length_prior_alpha.value() if self.length_prior_alpha is not None else 0
    if alpha > 0:
        reward += self.segment_length_prior * alpha

    # reward z-score normalization
    if self.z_normalization:
        reward = dy.cdiv(reward - dy.mean_batches(reward), dy.std_batches(reward) + EPS)

    ## Baseline Loss
    if self.use_baseline:
        baseline_loss = []
        for i, baseline in enumerate(self.bs):
            loss = dy.squared_distance(reward, baseline)
            if enc_mask is not None:
                loss = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), loss)
            baseline_loss.append(loss)
        ret.add_loss("Baseline", dy.esum(baseline_loss))
        if self.print_sample:
            # i here is the last index left over from the loop above
            print(dy.exp(self.segment_logsoftmaxes[i]).npvalue().transpose()[0])

    ## Reinforce Loss
    lmbd = self.lmbd.value()
    if lmbd > 0.0:
        reinforce_loss = []
        # Calculate the loss of the baseline and of reinforce
        for i in range(len(self.segment_decisions)):
            ll = dy.pick_batch(self.segment_logsoftmaxes[i], self.segment_decisions[i])
            if self.use_baseline:
                r_i = reward - dy.nobackprop(self.bs[i])
            else:
                r_i = reward
            if enc_mask is not None:
                ll = dy.cmult(dy.inputTensor(enc_mask[i], batched=True), ll)
            reinforce_loss.append(r_i * -ll)
        loss = dy.esum(reinforce_loss) * lmbd
        ret.add_loss("Reinforce", loss)

    if self.confidence_penalty:
        ls_loss = self.confidence_penalty(self.segment_logsoftmaxes, enc_mask)
        ret.add_loss("Confidence Penalty", ls_loss)

    # Total Loss
    return ret
def __call__(self, context, F2I, only_train_words, dropout_rate=1.0, activate_sub_word=False, stop_updating_lookup=False):
    num_params = len(self.params)  # get the length of the params vector
    lookup = self.lookup

    # if the user chose to continue learning the pre-trained words
    if not stop_updating_lookup:
        # if the sub-word feature is not activated
        if not activate_sub_word:
            emb_vectors = [lookup[F2I.get(i)] for i in context]  # get the embeddings of the words
        else:
            # sum the embeddings of word, suffix and prefix for words that allow it
            emb_vectors = []
            for word in context:
                # take the word alone if len<=3 or word=start/end/unk
                if len(word) <= 3 or (word in ["<s>", "</s>", "UUUNKKK"]):
                    emb_vectors.append(lookup[F2I.get(word)])
                else:
                    pref = False
                    suff = False
                    # check if the prefix exists in F2I; relevant for test/dev sets
                    if F2I.has_key(word[:3]):
                        prefix_embd = lookup[F2I.get(word[:3])]
                        pref = True
                    # check if the suffix exists in F2I; relevant for test/dev sets
                    if F2I.has_key(word[-3:]):
                        suffix_embd = lookup[F2I.get(word[-3:])]
                        suff = True
                    word_embd = lookup[F2I.get(word)]
                    # sum the word vector with any existing prefix/suffix vectors
                    if pref and suff:
                        sum_embd = dy.esum([prefix_embd, suffix_embd, word_embd])
                    elif pref and not suff:
                        sum_embd = dy.esum([prefix_embd, word_embd])
                    elif suff and not pref:
                        sum_embd = dy.esum([suffix_embd, word_embd])
                    else:
                        sum_embd = dy.esum([word_embd])
                    emb_vectors.append(sum_embd)

    # if the user chose to stop learning the pre-trained words
    if stop_updating_lookup:
        # if the sub-word feature is not activated
        if not activate_sub_word:
            emb_vectors = []
            for word in context:
                # if it's a word from the corpus, continue training it
                if word in only_train_words:
                    emb_vectors.append(lookup[F2I.get(word)])
                # if it's a pre-trained word, stop training it
                else:
                    emb_vectors.append(dy.nobackprop(lookup[F2I.get(word)]))
        else:
            # sum the embeddings of word, suffix and prefix for words that allow it
            emb_vectors = []
            for word in context:
                # take the word alone if len<=3 or word=start/end/unk
                if len(word) <= 3 or (word in ["<s>", "</s>", "UUUNKKK"]):
                    # if it's a word from the corpus, continue training it
                    if word in only_train_words:
                        emb_vectors.append(lookup[F2I.get(word)])
                    # if it's a pre-trained word, stop training it
                    else:
                        emb_vectors.append(dy.nobackprop(lookup[F2I.get(word)]))
                else:
                    pref = False
                    suff = False
                    # check if the prefix exists in F2I; relevant for test/dev sets
                    if F2I.has_key(word[:3]):
                        # if it's a prefix from the corpus, continue training it
                        if word[:3] in only_train_words:
                            prefix_embd = lookup[F2I.get(word[:3])]
                        # if it's a pre-trained prefix, stop training it
                        else:
                            prefix_embd = dy.nobackprop(lookup[F2I.get(word[:3])])
                        pref = True
                    # check if the suffix exists in F2I; relevant for test/dev sets
                    if F2I.has_key(word[-3:]):
                        # if it's a suffix from the corpus, continue training it
                        if word[-3:] in only_train_words:
                            suffix_embd = lookup[F2I.get(word[-3:])]
                        # if it's a pre-trained suffix, stop training it
                        else:
                            suffix_embd = dy.nobackprop(lookup[F2I.get(word[-3:])])
                        suff = True
                    # if it's a word from the corpus, continue training it
                    if word in only_train_words:
                        word_embd = lookup[F2I.get(word)]
                    # if it's a pre-trained word, stop training it
                    else:
                        word_embd = dy.nobackprop(lookup[F2I.get(word)])
                    # sum the word vector with any existing prefix/suffix vectors
                    if pref and suff:
                        sum_embd = dy.esum([prefix_embd, suffix_embd, word_embd])
                    elif pref and not suff:
                        sum_embd = dy.esum([prefix_embd, word_embd])
                    elif suff and not pref:
                        sum_embd = dy.esum([suffix_embd, word_embd])
                    else:
                        sum_embd = dy.esum([word_embd])
                    emb_vectors.append(sum_embd)

    net_input = dy.concatenate(emb_vectors)
    # calculate the activation of each subsequent layer and apply the Bernoulli (dropout) mask
    for i in xrange(0, num_params - 2, 2):
        W = dy.parameter(self.params[i])  # from parameters to expressions
        b = dy.parameter(self.params[i + 1])
        if i == 0:  # first layer
            activation = dy.tanh((W * net_input) + b)
        else:  # other layers
            activation = dy.tanh((W * activation) + b)
        if dropout_rate != 1.0:
            activation = dy.dropout(activation, dropout_rate)
    W = dy.parameter(self.params[num_params - 2])  # from parameters to expressions
    b = dy.parameter(self.params[num_params - 1])
    net_output = dy.softmax((W * activation) + b)  # apply softmax on the last layer
    return net_output
def test_item(model, document):
    word_lookups = []
    for preprocessed_sentence in document.preprocessed_sentences:
        seq = [model.wlookup[int(model.w2i.get(entry, 0))] for entry in preprocessed_sentence]
        if len(seq) > 0:
            word_lookups.append(seq)

    sentences_lookups = []
    for seq in word_lookups:
        sentence_encode = encode_sequence(model, seq, model.sentence_rnn)
        global_max = max_pooling(sentence_encode)
        global_min = average_pooling(sentence_encode)
        if len(sentence_encode) > 0:
            att_mlp_outputs = []
            for e in sentence_encode:
                mlp_out = (model.word_attention_w * e) + model.word_attention_b
                att_mlp_outputs.append(mlp_out)
            lst = []
            for o in att_mlp_outputs:
                lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.word_att_context))))
            sum_all = dy.esum(lst)
            probs = [dy.cdiv(e, sum_all) for e in lst]
            att_context = dy.esum([dy.cmult(p, h) for p, h in zip(probs, sentence_encode)])
            context = dy.concatenate([att_context, global_max, global_min])
            sentences_lookups.append(context)

    document_encode = encode_sequence(model, sentences_lookups, model.document_rnn)
    global_max = max_pooling(document_encode)
    global_min = average_pooling(document_encode)
    if len(document_encode) > 0:
        att_mlp_outputs = []
        for e in document_encode:
            mlp_out = (model.sentence_attention_w * e) + model.sentence_attention_b
            att_mlp_outputs.append(mlp_out)
        lst = []
        for o in att_mlp_outputs:
            lst.append(dy.exp(dy.sum_elems(dy.cmult(o, model.sentence_att_context))))
        sum_all = dy.esum(lst)
        probs = [dy.cdiv(e, sum_all) for e in lst]
        att_context = dy.esum([dy.cmult(p, h) for p, h in zip(probs, document_encode)])
        context = dy.concatenate([att_context, global_max, global_min])
        y_pred = dy.logistic((model.mlp_w * context) + model.mlp_b)
        document.prediction_result = y_pred.scalar_value()
        dy.renew_cg()
        return document.prediction_result
    return 0
# make a forward pass
pred = forward_pass(x)
# calculate the loss for each example
loss = dy.binary_log_loss(pred, y)
losses.append(loss)

# Now let's accumulate the loss and backpropagate it.

# In[24]:

# get the total loss for the dataset
total_loss = dy.esum(losses)
# apply the calculations of the computational graph
total_loss.forward()
# calculate the loss to backpropagate
total_loss.backward()
# update parameters with the backpropagated error
trainer.update()

# Let's make sure that our parameter `W_1` has been updated (i.e., it "learned" something).

# In[25]:

# confirm that the parameters updated
dy.renew_cg()
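The notebook fragment above references names defined in earlier cells (forward_pass, x, y, losses, trainer, W_1). Here is a hedged, self-contained reconstruction of the same accumulate-then-update step with stand-in definitions; the network and data are assumptions, not the notebook's:

import dynet as dy
import numpy as np

pc = dy.ParameterCollection()
W_1 = pc.add_parameters((1, 3))
trainer = dy.SimpleSGDTrainer(pc)

dy.renew_cg()
w = dy.parameter(W_1)

def forward_pass(x):
    # tiny stand-in network: one linear layer squashed to (0, 1)
    return dy.logistic(w * dy.inputVector(x))

data = [([0.5, -1.0, 2.0], 1.0), ([1.5, 0.3, -0.2], 0.0)]
losses = []
for x, label in data:
    pred = forward_pass(x)
    losses.append(dy.binary_log_loss(pred, dy.scalarInput(label)))

before = W_1.as_array().copy()
total_loss = dy.esum(losses)  # one expression for the whole dataset
total_loss.forward()
total_loss.backward()
trainer.update()
print(not np.allclose(before, W_1.as_array()))  # True: W_1 changed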
def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True):
    sent_len = len(inputs)
    batch_size = inputs[0].dim()[1]
    flat_len = sent_len * batch_size
    print('=== entering __call__ ===')
    print('input length: ', len(inputs))    # input length: 46
    print('input dim: ', inputs[1].dim())   # input dim: ((400,), 2)
    print('sent_len', sent_len)             # sent_len 46
    print('batch_size', batch_size)         # batch_size 2
    print('flat_len', flat_len)             # flat_len 92

    # H -> hidden size, L -> sentence length, B -> batch size
    # ((H, L), B)
    X = dy.concatenate_cols(inputs)
    print('X dim: ', X.dim())               # X dim: ((400, 46), 2)
    if is_train:
        X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)

    # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
    # ((A_H, L), B)
    head_arc = self.head_arc_MLP(X, is_train)
    dept_arc = self.dept_arc_MLP(X, is_train)
    # head_arc dim: ((300, 46), 2); dept_arc dim: ((300, 46), 2)

    # ((R_H, L), B)
    head_rel = self.head_rel_MLP(X, is_train)
    dept_rel = self.dept_rel_MLP(X, is_train)
    # head_rel dim: ((100, 46), 2); dept_rel dim: ((100, 46), 2)

    if is_train:
        total_token = sum(masks['flat'].tolist())
        head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
        head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
        dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
        dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)

    # ((L, L), B)
    masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))
    masks_flat = dy.inputTensor(masks['flat'], True)

    gnn_losses = []
    arc_norm = math.sqrt(self.arc_size)
    rel_norm = math.sqrt(self.rel_size)
    for k in range(self.cfg.GRAPH_LAYERS):
        # Graph Weights
        # ((L, L), B)
        arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) / arc_norm - masks_2D
        arc_prob = dy.softmax(arc_mat)
        # arc_mat dim: ((46, 46), 2); arc_prob dim: ((46, 46), 2)

        # Layer-wise Loss
        if is_train:
            arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
            # ((L,), L*B)
            arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
            # ((1,), L*B)
            arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
            # keep the expression (not its float value) so dy.esum can combine
            # the layer-wise losses below
            gnn_losses.append(arc_loss)

        # Aggregation Function
        # Fusion of head and dept representations
        # ((A_H, L), B)
        HX = head_arc * arc_prob
        DX = dept_arc * dy.transpose(arc_prob)
        FX = HX + DX
        # HX, DX, FX dims: ((300, 46), 2)

        # Async Update Function: head-first
        # ((A_H, L), B)
        head_arc = self.head_gnn(FX, head_arc)
        FX_new = head_arc * arc_prob + DX
        dept_arc = self.dept_gnn(FX_new, dept_arc)
        # head_arc, FX_new, dept_arc dims: ((300, 46), 2)

        # Relation Aggregation Function: sync update
        # ((R_H, L), B)
        HR = head_rel * arc_prob
        DR = dept_rel * dy.transpose(arc_prob)
        FX = HR + DR
        head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
        dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel
        # HR, DR, FX dims: ((100, 46), 2); head_rel, dept_rel dims: ((100, 46), 2)

    # ((L, L), B)
    arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) / arc_norm - masks_2D
    # ((L,), L*B)
    arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)

    # Predict Relation
    # (R_H, L*B)
    head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
    # ((R_H,), L*B)
    dept_rel = dy.reshape(dept_rel, (self.rel_size,), flat_len)
    # arc_mat dim: ((46,), 92); head_rel dim: ((100, 92), 1); dept_rel dim: ((100,), 92)

    if is_train:
        # ((1,), L*B)
        arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
        # (1,)
        arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
        # ((R_H,), L*B)
        truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
        # R -> Relation Set Size
        # ((R,), L*B)
        rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
        rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask
        # Calculate Relation Classification Loss
        # ((1,), L*B)
        rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
        # (1,)
        rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token

        # Final Total Loss with layer-wise warm-up
        warm = [int(iters >= x) for x in self.warm_list]
        losses = rel_loss * self.cfg.LAMBDA2 * warm[-1] + arc_loss * self.cfg.LAMBDA2 * warm[-1]
        if gnn_losses:
            for i in range(self.cfg.GRAPH_LAYERS):
                gnn_losses[i] *= warm[i]
            losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
        losses_list = gnn_losses + [arc_loss, rel_loss]
        return losses, losses_list
    else:
        if is_tree:
            # MST inference: recover tree edges.
            arc_probs = dy.softmax(arc_mat).npvalue()
            arc_probs = np.reshape(arc_probs, (sent_len, sent_len, batch_size), 'F')
            arc_probs = np.transpose(arc_probs)
            # Mask PAD
            arc_masks = [np.array(masks['flat'][i:i + sent_len]) for i in range(0, flat_len, sent_len)]
            arc_pred = []
            # Inference one by one.
            for msk, arc_prob in zip(arc_masks, arc_probs):
                msk[0] = 1
                seq_len = int(np.sum(msk))
                tmp_pred = MST_inference(arc_prob, seq_len, msk)
                tmp_pred[0] = 0
                arc_pred.extend(tmp_pred)
        else:
            # Greedy inference (argmax)
            arc_pred = np.argmax(arc_mat.npvalue(), 0)
        # Pick each predicted edge's <Head, Dept> pair.
        flat_pred = [j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)]
        pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
        # Predict Relation (mask ROOT)
        rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
        rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask
        rel_mat = dy.reshape(rel_mat, (self.rel_num,)).npvalue()
        rel_pred = np.argmax(rel_mat, 0)
        pred = {}
        pred['head'], pred['rel'] = arc_pred, rel_pred
        return pred
def calc_score_of_history(words):
    # Create a list of things to sum up, with only the bias vector at first
    score_vecs = [dy.parameter(b_sm)]
    for word_id, lookup_param in zip(words, W_sm):
        score_vecs.append(lookup_param[word_id])
    return dy.esum(score_vecs)
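For context, this helper fits an n-gram language-model setup where W_sm is a list of lookup parameters (one table per history position, each row an output-score vector) and b_sm is the output bias. A hedged sketch of how it might be wired up and called; the sizes and the surrounding setup are assumptions:

import dynet as dy

N = 2          # history length (assumption)
nwords = 1000  # vocabulary size (assumption)

model = dy.ParameterCollection()
# one lookup table of output-score vectors per history position
W_sm = [model.add_lookup_parameters((nwords, nwords)) for _ in range(N)]
b_sm = model.add_parameters((nwords,))

dy.renew_cg()
score = calc_score_of_history([42, 7])  # ids of the two previous words
loss = dy.pickneglogsoftmax(score, 99)  # negative log-prob of the next word
print(loss.value())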