def calculate_LM_loss(self, sequence):
    """Build the summed next-token loss for ``sequence`` on a fresh graph.

    Each item of ``sequence`` is mapped to an id through ``self.vw.w2i``.
    The RNN is primed with the embedding of the *last* id; then, for every
    id in order, the current RNN output is scored against that id before
    the id's embedding is fed in.

    Returns:
        The ``dy.esum`` of the per-token ``dy.pickneglogsoftmax`` losses.
    """
    dy.renew_cg()
    state = self.RNN.initial_state()
    softmax_w = dy.parameter(self.W_sm)
    softmax_b = dy.parameter(self.b_sm)

    token_ids = [self.vw.w2i[token] for token in sequence]

    # Primed with the final id -- presumably the sequence ends with a
    # boundary symbol that doubles as the start marker; confirm with caller.
    state = state.add_input(dy.lookup(self.lookup, token_ids[-1]))

    step_losses = []
    for token_id in token_ids:
        logits = softmax_w * state.output() + softmax_b
        step_losses.append(dy.pickneglogsoftmax(logits, token_id))
        state = state.add_input(self.lookup[token_id])
    return dy.esum(step_losses)
def build_tagging_graph_lvl1(words, tags, builders):
    """Build a bidirectional tagging graph and return loss plus RNN outputs.

    Args:
        words: indices into the module-level lookup ``E``.
        tags: gold tag ids, one per word.
        builders: a pair ``(forward_builder, backward_builder)``.

    Returns:
        A dict with ``'err'`` (summed ``pickneglogsoftmax`` loss), ``'fw'``
        (forward outputs in sentence order) and ``'bw'`` (backward outputs
        in the order produced, i.e. over the reversed sentence).
    """
    dy.renew_cg()
    fwd_start, bwd_start = [builder.initial_state() for builder in builders]

    # All embeddings are looked up first, then each is passed through
    # dy.noise(..., 0.1).
    noisy_embs = [dy.noise(emb, 0.1) for emb in [E[w] for w in words]]

    fw = [st.output() for st in fwd_start.add_inputs(noisy_embs)]
    bw = [st.output() for st in bwd_start.add_inputs(reversed(noisy_embs))]

    # The hidden projection is only needed when the MLP flag is on.
    hidden = dy.parameter(pH) if MLP else None
    out_proj = dy.parameter(pO)

    errs = []
    # bw was produced over the reversed sentence, so flip it to line up
    # position-by-position with fw.
    for fwd_vec, bwd_vec, tag in zip(fw, reversed(bw), tags):
        joint = dy.concatenate([fwd_vec, bwd_vec])
        if MLP:
            logits = out_proj * (dy.tanh(hidden * joint))
        else:
            logits = out_proj * joint
        errs.append(dy.pickneglogsoftmax(logits, tag))
    return {'err': dy.esum(errs), 'fw': fw, 'bw': bw}
def softmax(
        edges,
        labels_exprs,
        label_dict,  # type: Dictionary
        is_train):
    """Accumulate label loss (training) or label the edges (prediction).

    Args:
        edges: edges that unpack as ``(head, label, modifier)`` and expose
            ``.source`` / ``.target``.
        labels_exprs: one label-score expression per edge.
        label_dict: provides ``word_to_int`` and ``int_to_word``.
        is_train: selects which of the two results is produced.

    Returns:
        When ``is_train`` is true, the accumulated loss expression;
        otherwise a list of ``graph_utils.Edge`` with predicted labels.
    """
    predicted = []
    total_loss = dn.scalarInput(0.0)
    for edge, label_scores in zip(edges, labels_exprs):
        head, label, modifier = edge
        if head == 0:
            # Head-0 edges never contribute to the loss; when predicting
            # they are emitted with the fixed label "ROOT".
            if not is_train:
                predicted.append(
                    graph_utils.Edge(edge.source, "ROOT", edge.target))
            continue
        if is_train:
            gold = label_dict.word_to_int[label]
            total_loss += dn.pickneglogsoftmax(label_scores, gold)
            continue
        best = np.argmax(label_scores.value())
        predicted.append(
            graph_utils.Edge(edge.source, label_dict.int_to_word[best],
                             edge.target))
    return total_loss if is_train else predicted
def trainAlgo(train_tokens, train_labels, num_epochs, num_batches_training,
              batch_size, w2i, embedding_parameters, pW, pb, modelPath,
              RNN_unit, trainer, RNN_model):
    """Train sentence-by-sentence for ``num_epochs`` and save the model.

    The computation graph is renewed once per batch; within a batch each
    sentence gets its own forward pass, backward pass and trainer update.
    Per-epoch average loss and timings are printed, and ``RNN_model`` is
    saved to ``modelPath`` at the end.

    Args:
        train_tokens: list of token sequences.
        train_labels: list of label-id sequences parallel to train_tokens.
        num_epochs: number of passes over the data.
        num_batches_training: number of batches per epoch.
        batch_size: number of sentences per batch.
        w2i: vocabulary passed to ``words2indexes``.
        embedding_parameters, pW, pb, RNN_unit: forwarded unchanged to
            ``forward_pass``.
        modelPath: destination handed to ``RNN_model.save``.
        trainer: object whose ``update()`` is called after each sentence.
        RNN_model: object whose ``save(modelPath)`` is called at the end.

    Returns:
        None.
    """
    # time.clock() was removed in Python 3.8. Prefer perf_counter and only
    # fall back to clock on interpreters that predate it.
    timer = getattr(time, "perf_counter", None) or time.clock

    train_start = timer()
    for i in range(num_epochs):
        epoch_loss = []
        print("Starting epoch: " + str(i + 1))
        epoch_start = timer()
        for j in range(num_batches_training):
            # begin a clean computational graph
            dy.renew_cg()
            lo, hi = j * batch_size, (j + 1) * batch_size
            batch_tokens = train_tokens[lo:hi]
            batch_labels = train_labels[lo:hi]
            for k, tokens in enumerate(batch_tokens):
                seq_of_idxs = words2indexes(tokens, w2i)
                preds = forward_pass(seq_of_idxs, embedding_parameters, pW,
                                     pb, RNN_unit)
                # one loss term per predicted token
                token_losses = [
                    dy.pickneglogsoftmax(pred, batch_labels[k][l])
                    for l, pred in enumerate(preds)
                ]
                sent_loss = dy.esum(token_losses)
                sent_loss.backward()
                trainer.update()
                epoch_loss.append(sent_loss.npvalue())
        print("Train loss after epoch: " + str(i + 1) + " loss: " +
              str(np.average(epoch_loss)))
        print("Epoch " + str(i + 1) + " Time Taken: " +
              str(timer() - epoch_start))
    print("Training Completed. Time taken: " + str(timer() - train_start))
    print("Saving model in " + str(modelPath))
    RNN_model.save(modelPath)
    print("Done!")
def Train(instances, itercount):
    """Build per-instance losses for a batch of (subtype, supertype) pairs.

    For every instance the two expressions are lower-cased and split on
    whitespace, embedded through ``ontoparser.get_linear_embd``, reduced to
    one vector each (the single embedding, or ``dy.average`` of several),
    concatenated and scored by the two-layer network held on ``ontoparser``.

    Args:
        instances: iterable of ``(fexpr, sexpr, groundtruth)`` triples.
        itercount: unused; kept for interface compatibility.

    Returns:
        ``(loss, errors)``: the list of ``pickneglogsoftmax`` expressions
        (not summed) and the number of wrong argmax predictions as a float.
    """
    dy.renew_cg()
    ontoparser.initialize_graph_nodes(train=True)
    loss = []
    errors = 0.0
    for fexpr, sexpr, groundtruth in instances:
        subtype = [tok.lower() for tok in fexpr.split()]
        supertype = [tok.lower() for tok in sexpr.split()]
        # The status flag returned alongside the embeddings is not used.
        fembs, _ = ontoparser.get_linear_embd(subtype)
        sembs, _ = ontoparser.get_linear_embd(supertype)
        fembs = fembs[0] if len(fembs) == 1 else dy.average(fembs)
        sembs = sembs[0] if len(sembs) == 1 else dy.average(sembs)
        x = dy.concatenate([fembs, sembs])
        # The previous cosine-similarity value was computed here but never
        # read; it only forced two extra forward evaluations per instance.
        output = ontoparser.W2 * (dy.rectify(ontoparser.W1 * x) +
                                  ontoparser.b1) + ontoparser.b2
        prediction = np.argmax(output.npvalue())
        loss.append(
            dy.pickneglogsoftmax(output, ontoparser.meta.tdmaps[groundtruth]))
        if groundtruth != ontoparser.meta.rmaps[prediction]:
            errors += 1
    return loss, errors
def build_tagging_graph_lvl2(embeds, words, tags, builders):
    """Return the summed tagging loss over already-built input vectors.

    The computation graph is *not* renewed here, so ``embeds`` must belong
    to the graph that is current when this is called.

    Args:
        embeds: sequence of input expressions, one per position.
        words: unused; kept for interface compatibility.
        tags: gold tag ids, one per position.
        builders: a pair ``(forward_builder, backward_builder)``.

    Returns:
        ``dy.esum`` of the per-position ``pickneglogsoftmax`` losses.
    """
    fwd_start, bwd_start = [builder.initial_state() for builder in builders]

    fw = [st.output() for st in fwd_start.add_inputs(embeds)]
    bw = [st.output() for st in bwd_start.add_inputs(reversed(embeds))]

    # The hidden projection is only needed when the MLP flag is on.
    hidden = dy.parameter(pH) if MLP else None
    out_proj = dy.parameter(pO)

    errs = []
    # bw runs over the reversed input; flip it to align with fw.
    for fwd_vec, bwd_vec, tag in zip(fw, reversed(bw), tags):
        joint = dy.concatenate([fwd_vec, bwd_vec])
        if MLP:
            logits = out_proj * (dy.tanh(hidden * joint))
        else:
            logits = out_proj * joint
        errs.append(dy.pickneglogsoftmax(logits, tag))
    return dy.esum(errs)
def calc_lm_loss(sent):
    """Return the summed language-model loss for ``sent`` on a fresh graph.

    Words are mapped to ids through ``vw.w2i``. The RNN is primed with the
    embedding of the last id, then every id is predicted from the current
    RNN output before being fed in.
    """
    dy.renew_cg()
    softmax_w = dy.parameter(W_sm)
    softmax_b = dy.parameter(b_sm)
    state = RNN.initial_state()

    word_ids = [vw.w2i[word] for word in sent]

    # The original comment says this inputs "<s>"; the code uses the last
    # id of the sentence -- presumably sentences end with that symbol.
    state = state.add_input(WORDS_LOOKUP[word_ids[-1]])

    step_losses = []
    for word_id in word_ids:
        logits = softmax_w * state.output() + softmax_b
        step_losses.append(dy.pickneglogsoftmax(logits, word_id))
        state = state.add_input(WORDS_LOOKUP[word_id])
    return dy.esum(step_losses)
def decode_to_loss(self, vectors, output):
    """Return the summed decoding loss of ``output`` given encoder vectors.

    Args:
        vectors: encoder output expressions; stacked as columns.
        output: iterable of target ids (materialised with ``list``).

    Returns:
        ``dy.esum`` of the per-step ``pickneglogsoftmax`` losses.
    """
    softmax_w = dy.parameter(self.w_softmax)
    softmax_b = dy.parameter(self.b_softmax)
    attn_w = dy.parameter(self.attention_source)
    targets = list(output)
    encoded = dy.concatenate_cols(vectors)
    prev_emb = self.target_lookup[self.eos_target]

    # First decoder input: a vecInput of size 2 * hidden_size standing in
    # for the attention context, followed by the embedding of eos_target.
    start = self.decoder.initial_state()
    state = start.add_input(
        dy.concatenate([dy.vecInput(self.hidden_size * 2), prev_emb]))

    # Projected once and reused by every attention call.
    projected_source = attn_w * encoded

    step_losses = []
    for gold in targets:
        context = self.attention(encoded, state, projected_source)
        state = state.add_input(dy.concatenate([context, prev_emb]))
        logits = dy.affine_transform([softmax_b, softmax_w, state.output()])
        step_losses.append(dy.pickneglogsoftmax(logits, gold))
        prev_emb = self.target_lookup[gold]
    return dy.esum(step_losses)
def sent_loss_precalc(words, tags, vecs):
    """Sum the tagging loss over score vectors that were built elsewhere.

    Args:
        words: unused; kept for interface compatibility.
        tags: gold tags, mapped to ids through ``vt.w2i``.
        vecs: one score expression per tag.
    """
    return dy.esum([
        dy.pickneglogsoftmax(vec, vt.w2i[tag])
        for vec, tag in zip(vecs, tags)
    ])
def encode(self, instance, wids):
    """Debug helper: encode ``instance`` and print decoder losses.

    The whitespace-split sentence is run through the encoder RNN, the final
    encoder output is projected with ``W_y``/``b_y``, and the decoder is then
    stepped over consecutive word pairs. At each step the loss of *every*
    entry of ``wids`` under the current decoder scores is printed.

    Args:
        instance: source sentence as one whitespace-separated string.
        wids: mapping from word to id; also iterated for the debug printout.

    Returns:
        None. Output is printed only.
    """
    dy.renew_cg()
    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    src_sent = instance.split()
    if not src_sent:
        # Nothing to encode; previously this path raised NameError.
        return

    # Encoder: thread the state through every word. The earlier code
    # restarted from the initial state at each step, so only the last word
    # ever influenced the encoding.
    enc_state = self.enc_builder.initial_state()
    for current_word in src_sent:
        enc_state = enc_state.add_input(self.src_lookup[wids[current_word]])
    encoded = (W_y * enc_state.output()) + b_y

    # NOTE(review): both calls are kept exactly as before. Whether the second
    # call actually seeds the decoder with `encoded` depends on how the
    # builder caches its initial state -- confirm against the DyNet version
    # in use.
    dec_state = self.dec_builder.initial_state()
    dec_state = self.dec_builder.initial_state(encoded)

    for cw, nw in zip(src_sent, src_sent[1:]):
        # Feed the current word of the pair. The earlier code fed the stale
        # loop variable left over from the encoder (always the last word).
        dec_state = dec_state.add_input(self.tgt_lookup[wids[cw]])
        ystar = (W_y * dec_state.output()) + b_y
        # Single-argument prints produce the same text on Python 2 and 3.
        print("current word is >>>>>>> {}".format(cw))
        print("next word shud be {}".format(nw))
        for wid in wids:
            loss = dy.pickneglogsoftmax(ystar, wids[wid])
            print("Loss for  {}  w.r.t  {}  is  {}".format(
                wid, nw, loss.value()))
def build_nnlm_graph(self, dictionary):
    """Build the feed-forward n-gram LM loss over ``dictionary``.

    Args:
        dictionary: iterable of ``(context, next_word)`` pairs, where
            ``context`` is a whitespace-separated string holding at least
            ``self.N - 1`` words.

    Returns:
        ``dy.esum`` of the per-pair ``pickneglogsoftmax`` losses.
    """
    dy.renew_cg()

    # Create the parameters once and reuse them. Adding a new set to the
    # model on every call meant each graph was scored with freshly
    # initialised weights, so nothing learned earlier was ever reused.
    params = getattr(self, "_nnlm_params", None)
    if params is None:
        vocab_size = len(self.wids)
        params = (
            self.model.add_lookup_parameters((vocab_size, self.EMB_SIZE)),
            self.model.add_parameters(
                (self.HID_SIZE, self.EMB_SIZE * (self.N - 1))),
            self.model.add_parameters((self.HID_SIZE)),
            self.model.add_parameters((vocab_size, self.HID_SIZE)),
            self.model.add_parameters((vocab_size)),
        )
        self._nnlm_params = params
    M, W_mh, b_hh, W_hs, b_s = params

    w_xh = dy.parameter(W_mh)
    b_h = dy.parameter(b_hh)
    W_hy = dy.parameter(W_hs)
    b_y = dy.parameter(b_s)

    errs = []
    for context, next_word in dictionary:
        # Use as many context words as the input layer is sized for (N - 1).
        context_words = context.split()[:self.N - 1]
        # Concatenate inside the graph so gradients reach the embeddings.
        # Going through .value() and dy.inputVector produced the same
        # numbers but cut the lookup table off from backpropagation.
        x = dy.concatenate([M[self.wids[w]] for w in context_words])
        h_val = dy.tanh(w_xh * x + b_h)
        y_val = W_hy * h_val + b_y
        errs.append(dy.pickneglogsoftmax(y_val, self.wids[next_word]))
    return dy.esum(errs)
def sent_loss(self, words, tags, ltags):
    """Append one tagging loss per position to ``self.loss``.

    Sets ``self.eval`` to False, builds the score vectors with
    ``self.build_tagging_graph(words, ltags)`` and pairs them with ``tags``
    (mapped to ids through ``self.meta.t2i``). Returns None.
    """
    self.eval = False
    score_vecs = self.build_tagging_graph(words, ltags)
    for vec, tag in zip(score_vecs, tags):
        self.loss.append(dy.pickneglogsoftmax(vec, self.meta.t2i[tag]))
def calc_loss(sent):
    """Return the summed decoder loss for a (source, target) id pair.

    ``sent[0]`` is encoded with the source LSTM; its last output ``h`` seeds
    the target LSTM through ``set_s([h, tanh(h)])``. The target is then
    decoded with teacher forcing: each word is predicted from the previous.
    """
    dy.renew_cg()
    src, trg = sent[0], sent[1]

    encoder_start = LSTM_SRC_BUILDER.initial_state()
    src_embs = [LOOKUP_SRC[word] for word in src]
    src_output = encoder_start.add_inputs(src_embs)[-1].output()

    step_losses = []
    decoder = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    softmax_w = dy.parameter(W_sm_p)
    softmax_b = dy.parameter(b_sm_p)
    for next_word in trg[1:]:
        decoder = decoder.add_input(LOOKUP_TRG[prev_word])
        logits = dy.affine_transform([softmax_b, softmax_w, decoder.output()])
        step_losses.append(dy.pickneglogsoftmax(logits, next_word))
        prev_word = next_word
    return dy.esum(step_losses)
def train():
    """Run ``num_epochs`` of per-sentence training over module-level data.

    Tokens and labels are shuffled in place each epoch. The graph is
    renewed per batch; every sentence gets its own backward pass and
    trainer update.
    """
    for epoch in range(num_epochs):
        # Re-seeding with the same value before each shuffle applies the
        # same permutation to both lists (they must have equal length),
        # which keeps tokens and labels paired.
        for parallel_list in (train_tokens, train_labels):
            random.seed(epoch + 100)
            random.shuffle(parallel_list)

        for batch_no in range(num_batches_training):
            dy.renew_cg()
            window = slice(batch_no * batch_size, (batch_no + 1) * batch_size)
            batch_tokens = train_tokens[window]
            batch_labels = train_labels[window]

            for sent_no, tokens in enumerate(batch_tokens):
                seq_of_idxs = words2indexes(tokens, w2i)
                preds = forward_pass(seq_of_idxs)
                token_losses = [
                    dy.pickneglogsoftmax(pred, batch_labels[sent_no][pos])
                    for pos, pred in enumerate(preds)
                ]
                sent_loss = dy.esum(token_losses)
                sent_loss.backward()
                trainer.update()
def decode(self, states, y, encoded_input, train=False):
    """Teacher-forced decoding of ``y``; returns loss and per-step scores.

    Args:
        states: encoder states; the last one joins the start symbol as the
            first decoder input.
        y: iterable of gold characters.
        encoded_input: forwarded to ``self.attend``.
        train: unused; kept for interface compatibility.

    Returns:
        ``(loss, generated_string)`` where ``loss`` accumulates
        ``pickneglogsoftmax`` for characters present in ``self.C2I`` and
        ``generated_string`` is the list of score expressions per step.
    """
    rnn_state = self.decoder_rnn.initial_state()
    start_encoded = self.l2e["sep"].encode("<s>")
    loss = dy.scalarInput(0.)
    rnn_state = rnn_state.add_input(
        dy.concatenate([start_encoded, states[-1]]))

    generated_string = []
    for char in y:
        gold_encoded = self.l2e["l"].encode(char)
        scores = self.predict_letter(rnn_state.output())
        generated_string.append(scores)
        context = self.attend(rnn_state.output(), states, encoded_input)
        # The gold character (not the prediction) is fed back in.
        rnn_state = rnn_state.add_input(
            dy.concatenate([gold_encoded, context]))
        # Characters missing from C2I are fed in but add no loss.
        if char in self.C2I:
            loss += dy.pickneglogsoftmax(scores, self.C2I[char])
    return loss, generated_string
def calc_loss(sent):
    """Compute the encoder-decoder loss for one ``(src, trg)`` example.

    The source ids are run through the source LSTM and the final output
    ``h`` initialises the target LSTM via ``set_s([h, tanh(h)])``. Each
    target word after the first is scored given the word before it.

    Returns:
        ``dy.esum`` of the per-step ``pickneglogsoftmax`` losses.
    """
    dy.renew_cg()
    src = sent[0]
    trg = sent[1]

    source_start = LSTM_SRC_BUILDER.initial_state()
    source_states = source_start.add_inputs([LOOKUP_SRC[x] for x in src])
    src_output = source_states[-1].output()

    losses = []
    target_state = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    out_weights = dy.parameter(W_sm_p)
    out_bias = dy.parameter(b_sm_p)
    for next_word in trg[1:]:
        target_state = target_state.add_input(LOOKUP_TRG[prev_word])
        hidden = target_state.output()
        logits = dy.affine_transform([out_bias, out_weights, hidden])
        losses.append(dy.pickneglogsoftmax(logits, next_word))
        prev_word = next_word
    return dy.esum(losses)
def train_sentence(self, words, word_idxs):
    """Run one training step on a sentence and return softmax outputs.

    Args:
        words: object whose ``.tensor`` holds the per-token inputs.
        word_idxs: gold tag ids, one per token.

    Returns:
        A list with the softmax value (as returned by ``.value()``) for
        every position. Accuracy and loss are printed as a side effect; an
        empty sentence returns an empty list without updating the trainer.
    """
    dy.renew_cg()
    forward_init, backward_init = [
        b.initial_state() for b in self.builders
    ]
    embed_words = words.tensor
    forward = forward_init.transduce(embed_words)
    backward = backward_init.transduce(reversed(embed_words))

    errors = []
    encodings = []
    good = bad = 0.0
    # The backward RNN ran over the reversed sentence, so its i-th output
    # belongs to position n-1-i. Flip it so both directions describe the
    # same token before they are concatenated.
    for f, b, tag in zip(forward, reversed(backward), word_idxs):
        r_t = self(dy.concatenate([f, b]))
        probs = dy.softmax(r_t).value()
        encodings.append(probs)
        if np.argmax(probs) == tag:
            good += 1
        else:
            bad += 1
        errors.append(dy.pickneglogsoftmax(r_t, tag))

    if not errors:
        # Nothing to learn from; also avoids dividing by zero below.
        return encodings

    sum_errors = dy.esum(errors)
    loss = sum_errors.scalar_value()
    sum_errors.backward()
    self.trainer.update()
    accuracy = 100 * (good / (good + bad))
    print(str(accuracy), str(loss))
    return encodings
def calc_lm_loss(sent):
    """Return ``(loss, word_count)`` for a sentence of word ids.

    The RNN is started with the embedding of the module-level symbol ``S``;
    each id in ``sent`` is then predicted from the current RNN output and
    fed in afterwards.
    """
    dy.renew_cg()
    softmax_w = dy.parameter(W_sm)
    softmax_b = dy.parameter(b_sm)
    start = RNN.initial_state()
    tot_words = len(sent)

    state = start.add_input(WORDS_LOOKUP[S])

    step_losses = []
    for wid in sent:
        logits = softmax_w * state.output() + softmax_b
        step_losses.append(dy.pickneglogsoftmax(logits, wid))
        state = state.add_input(WORDS_LOOKUP[wid])
    return dy.esum(step_losses), tot_words
def Train(sentence, epoch, dynamic=True):
    """Parse ``sentence`` in training mode, appending to ``parser.loss``.

    With ``parser.meta.palgo`` in ``'standard'``/``'swap'`` a static oracle
    supplies the gold transition at every step. Otherwise a dynamic oracle
    is used: the highest-ranked zero-cost action is the gold action, and the
    model's own top admissible action may be followed instead (when
    ``dynamic`` is set, ``epoch > 2``, and a random draw is below 0.9).

    Args:
        sentence: input sentence; ``sentence[1:-1]`` goes to feature
            extraction.
        epoch: current epoch number; gates following predicted actions.
        dynamic: allow following predicted actions.

    Returns:
        None. Losses accumulate in ``parser.loss``.
    """
    parser.eval = False
    if parser.meta.palgo in ('standard', 'swap'):
        configuration = Configuration(sentence, standard=True)
    else:
        configuration = Configuration(sentence)
    pr_bi_exps, pos_errs = parser.feature_extraction(sentence[1:-1])

    while not parser.transitionSystem.inFinalState(configuration):
        xo = parser.predict(configuration, pr_bi_exps)

        if parser.meta.palgo in ('swap', 'standard'):
            # Static oracle: ask for the gold action and apply it.
            goldTransitionFunc, goldLabel = \
                parser.transitionSystem.LabelledAction(configuration)
            gold_key = (goldTransitionFunc.__name__, goldLabel)
            parser.loss.append(
                dy.pickneglogsoftmax(xo, parser.meta.td2i[gold_key]))
            goldTransitionFunc(configuration, goldLabel)
            continue

        # Dynamic oracle: rank actions by probability, best first.
        output_probs = dy.softmax(xo).npvalue()
        ranked_actions = sorted(
            zip(output_probs, range(len(output_probs))), reverse=True)
        pscore, paction = ranked_actions[0]
        validTransitions, allmoves = \
            parser.transitionSystem.get_valid_transitions(configuration)

        def cost_of(action_id):
            return parser.transitionSystem.action_cost(
                configuration, parser.meta.i2td[action_id],
                parser.meta.transitions, validTransitions)

        # Drop top-ranked actions whose cost exceeds 500 -- presumably the
        # marker for transitions that are not applicable; confirm.
        while cost_of(paction) > 500:
            ranked_actions = ranked_actions[1:]
            pscore, paction = ranked_actions[0]

        # Gold action = best-ranked remaining action with zero cost.
        gaction = None
        for _, candidate in ranked_actions:
            if cost_of(candidate) == 0:
                gaction = candidate
                break

        gtransitionstr, goldLabel = parser.meta.i2td[gaction]
        ptransitionstr, predictedLabel = parser.meta.i2td[paction]

        if dynamic and (epoch > 2) and (np.random.random() < 0.9):
            chosen_name, chosen_label = ptransitionstr, predictedLabel
        else:
            chosen_name, chosen_label = gtransitionstr, goldLabel
        allmoves[parser.meta.transitions[chosen_name]](
            configuration, chosen_label)

        # The loss always targets the gold action, whichever was followed.
        parser.loss.append(
            dy.pickneglogsoftmax(
                xo, parser.meta.td2i[(gtransitionstr, goldLabel)]))

    parser.loss.extend(pos_errs)
def calc_loss(words, tags, holder):
    """Sum the tagging loss for ``words`` against gold ``tags``.

    Score vectors come from ``build_graph(words, holder)``; tags are mapped
    to ids through ``holder.tag2index``.
    """
    score_vecs = build_graph(words, holder)
    return dy.esum([
        dy.pickneglogsoftmax(vec, holder.tag2index[tag])
        for vec, tag in zip(score_vecs, tags)
    ])
def get_loss(self, sentence):
    """Sum the loss of picking index 0 from every score vector.

    The score vectors are whatever ``self.propogate(sentence)`` yields.
    """
    return dy.esum([
        dy.pickneglogsoftmax(score, 0)
        for score in self.propogate(sentence)
    ])
def sent_loss(words, tags):
    """Return the summed tagging loss of ``words`` against gold ``tags``.

    Score vectors come from ``build_tagging_graph(words)``; tags are mapped
    to ids through ``vt.w2i``.
    """
    score_vecs = build_tagging_graph(words)
    return dy.esum([
        dy.pickneglogsoftmax(vec, vt.w2i[tag])
        for vec, tag in zip(score_vecs, tags)
    ])
def create_network_return_loss(self, inputs, expected_output, dropout=False):
    """Run the network on ``inputs`` and return the loss expression.

    The network output (``self(inputs, dropout)``) is treated as
    unnormalised scores: the loss is ``pickneglogsoftmax`` at
    ``expected_output``.
    """
    return dy.pickneglogsoftmax(self(inputs, dropout), expected_output)
def sent_loss(words, tags):
    """Return the total cross-entropy tagging loss for one sentence.

    Score vectors come from ``build_tagging_graph(words)``; tags are mapped
    to ids through ``vt.w2i``.
    """
    per_token = []
    for vec, tag in zip(build_tagging_graph(words), tags):
        # negative log-softmax at the gold tag, i.e. cross entropy
        per_token.append(dy.pickneglogsoftmax(vec, vt.w2i[tag]))
    # esum adds the expressions together (it is a sum, not a pooling op).
    return dy.esum(per_token)
def calc_loss(self, context, ref_action):
    """Return the negative log-softmax loss of ``ref_action``.

    Uses the batched DyNet operation when ``Batcher.is_batched(ref_action)``
    is true and the single-instance operation otherwise.
    """
    scores = self.get_scores(context)
    if Batcher.is_batched(ref_action):
        return dy.pickneglogsoftmax_batch(scores, ref_action)
    return dy.pickneglogsoftmax(scores, ref_action)
def calc_loss(sent):
    """Return ``(kl_loss, softmax_loss)`` for one ``(src, trg)`` example.

    The last source-LSTM output is mapped by two ``mlp`` calls to a mean
    ``mu`` and a log-variance ``log_var``. A latent ``z`` obtained from
    ``reparameterize(mu, log_var)`` seeds the target LSTM, which is decoded
    with teacher forcing.
    """
    dy.renew_cg()
    src, trg = sent[0], sent[1]

    encoder_start = LSTM_SRC_BUILDER.initial_state()
    src_output = encoder_start.add_inputs(
        [LOOKUP_SRC[x] for x in src])[-1].output()

    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)
    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # Treated as the log of the diagonal variance.
    log_var = mlp(src_output, W_var, V_var, b_var)

    # KL[N(mu, sigma^2) || N(0, I)]
    #   = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(
        1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    step_losses = []
    decoder = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    softmax_w = dy.parameter(W_sm_p)
    softmax_b = dy.parameter(b_sm_p)
    for next_word in trg[1:]:
        decoder = decoder.add_input(LOOKUP_TRG[prev_word])
        logits = dy.affine_transform([softmax_b, softmax_w, decoder.output()])
        step_losses.append(dy.pickneglogsoftmax(logits, next_word))
        prev_word = next_word

    return kl_loss, dy.esum(step_losses)
def train(self, epochs=30):
    """Train on batches from ``self.generator`` for ``epochs`` epochs.

    Examples are counted individually. After every ``n`` examples (``n``
    being the training-set size) progress is printed, ``self.evaluate`` is
    run on the dev split, and the losses gathered so far in the current
    batch are dropped. Each batch with remaining losses triggers one
    backward pass and one trainer update.

    Args:
        epochs: number of passes over the training set before returning.

    Returns:
        None.
    """
    n = self.generator.get_train_size()
    # Single-argument print calls give identical output on Python 2 and 3
    # (the double space matches the original two-item print statement).
    print("size of training set:  {}".format(n))
    print("Training...")
    iteration = 0
    losses = []
    loss_avg = 0.
    for batch in self.generator.generate(mode="train"):
        dy.renew_cg()
        for x, y, data_sample in batch:
            preds = self._predict(x, training=True)
            loss = dy.scalarInput(0.)
            for agr, pred in zip(self.agreements, preds):
                loss += dy.pickneglogsoftmax(pred, y[agr])
            losses.append(loss)

            if iteration % n % 2500 == 0:
                print("{}/{}".format(iteration % n, n))
            iteration += 1

            # stopping criterion
            if iteration > epochs * n:
                return

            # end-of-epoch report
            if iteration % n == 0:
                # Floor division keeps the epoch number an integer on
                # Python 3 as well.
                print("EPOCH {} / {}".format(iteration // n, epochs))
                print("Average loss: {}".format(loss_avg / n))
                loss_avg = 0.
                self.evaluate(mode="dev")
                # NOTE(review): pending losses are discarded after
                # evaluation, presumably because evaluate() renews the
                # graph -- confirm.
                losses = []

        if losses:
            loss_sum = dy.esum(losses)
            loss_sum.forward()
            loss_sum.backward()
            self.trainer.update()
            loss_avg += loss_sum.value()
            losses = []
def Train(sentence, epoch, dynamic=True):
    """Parse ``sentence`` with a dynamic oracle and return loss and errors.

    At every step the action scores come from a one-hidden-layer network
    over the concatenated feature vectors. The best-ranked zero-cost action
    is the gold action; the model's own top admissible action may be
    followed instead (when ``dynamic`` is set, ``epoch > 2``, and a random
    draw is below 0.9).

    Args:
        sentence: input sentence; ``sentence[1:-1]`` goes to feature
            extraction.
        epoch: current epoch number; gates following predicted actions.
        dynamic: allow following predicted actions.

    Returns:
        ``(loss_expr, totalError)``: the sum of transition losses plus the
        sum of ``pos_errs``, and the number of steps at which the gold
        action was not ranked first.
    """
    step_losses = []
    mistakes = 0
    parser.eval = False
    configuration = Configuration(sentence)
    pr_bi_exps, pos_errs = parser.feature_extraction(sentence[1:-1])

    while not parser.isFinalState(configuration):
        rfeatures = parser.basefeaturesEager(
            configuration.nodes, configuration.stack, configuration.b0)
        # Feature ids are 1-based; ids <= 0 map to the padding vector.
        feature_vecs = []
        for node_id, rform in rfeatures:
            feature_vecs.append(
                pr_bi_exps[node_id - 1] if node_id > 0 else parser.pad)
        xi = dy.concatenate(feature_vecs)
        xh = dy.rectify(parser.pr_W1 * xi) + parser.pr_b1
        xo = parser.pr_W2 * xh + parser.pr_b2

        output_probs = dy.softmax(xo).npvalue()
        ranked_actions = sorted(
            zip(output_probs, range(len(output_probs))), reverse=True)
        pscore, paction = ranked_actions[0]
        validTransitions, allmoves = parser.get_valid_transitions(
            configuration)

        def cost_of(action_id):
            return parser.action_cost(
                configuration, parser.meta.i2td[action_id],
                parser.meta.transitions, validTransitions)

        # Drop top-ranked actions whose cost exceeds 500 -- presumably the
        # marker for transitions that are not applicable; confirm.
        while cost_of(paction) > 500:
            ranked_actions = ranked_actions[1:]
            pscore, paction = ranked_actions[0]

        gaction = None
        for rank, (_, candidate) in enumerate(ranked_actions):
            if cost_of(candidate) == 0:
                gaction = candidate
                # An update is needed when gold was not the top choice.
                need_update = rank > 0
                break

        gtransitionstr, goldLabel = parser.meta.i2td[gaction]
        ptransitionstr, predictedLabel = parser.meta.i2td[paction]

        if dynamic and (epoch > 2) and (np.random.random() < 0.9):
            chosen_name, chosen_label = ptransitionstr, predictedLabel
        else:
            chosen_name, chosen_label = gtransitionstr, goldLabel
        allmoves[parser.meta.transitions[chosen_name]](
            configuration, chosen_label)

        # The loss always targets the gold action, whichever was followed.
        step_losses.append(
            dy.pickneglogsoftmax(
                xo, parser.meta.td2i[(gtransitionstr, goldLabel)]))
        if need_update:
            mistakes += 1

    return dy.esum(step_losses) + dy.esum(pos_errs), mistakes
def train(self, train_file, epochs):
    """Train with averaged minibatch losses read from ``train_file``.

    Each line holds space-separated features followed by the action label.
    Once ``self.properties.minibatch_size`` losses are collected they are
    averaged, back-propagated, applied through ``self.updater`` and the
    graph is renewed. A trailing partial minibatch is discarded at the end
    of every epoch.

    Args:
        train_file: path of the training file; re-read and re-shuffled
            every epoch.
        epochs: number of passes over the file.

    Returns:
        None.
    """
    loss_values = []
    for i in range(epochs):
        # Single-argument print calls give identical output on Python 2
        # and Python 3.
        print('started epoch {}'.format(i + 1))
        losses = []
        # Close the file deterministically instead of leaking the handle.
        with open(train_file, 'r') as handle:
            train_data = handle.read().strip().split('\n')
        random.shuffle(train_data)
        step = 0
        for line in train_data:
            fields = line.strip().split(' ')
            # the last field is the action label, everything else features
            features, label = fields[:-1], fields[-1]
            gold_label = self.vocab.action2id(label)
            result = self.build_graph(features)
            losses.append(dynet.pickneglogsoftmax(result, gold_label))
            step += 1

            if len(losses) >= self.properties.minibatch_size:
                minibatch_loss = dynet.esum(losses) / len(losses)
                minibatch_loss.forward()
                minibatch_loss_value = minibatch_loss.value()
                loss_values.append(minibatch_loss_value)
                if len(loss_values) % 10 == 0:
                    progress = round(100 * float(step) / len(train_data), 2)
                    print('current minibatch loss {} progress: {} %'.format(
                        minibatch_loss_value, progress))
                minibatch_loss.backward()
                self.updater.update()
                losses = []
                dynet.renew_cg()

        # Leftover items form an incomplete minibatch; renewing the graph
        # drops them.
        dynet.renew_cg()
def _build_lm_graph(self, sent):
    """Return the list of next-word losses for ``sent`` (not summed).

    Each word is embedded via ``dy.lookup(self.embs, ...)`` and fed to the
    RNN; the following word is scored with ``self._get_scores(state)``.
    """
    state = self.builder.initial_state()
    errs = []
    for current_word, next_word in zip(sent, sent[1:]):
        state = state.add_input(dy.lookup(self.embs, current_word))
        errs.append(
            dy.pickneglogsoftmax(self._get_scores(state), next_word))
    return errs
def sent_loss(self, sent):
    """Return the summed tagging loss for a list of ``(word, tag)`` pairs.

    Score vectors come from ``self.build_tagging_graph(words)``; tags are
    mapped to ids through ``self.vt.w2i``.
    """
    # Transpose [(w1, t1), (w2, t2), ...] into two parallel lists.
    words, tags = map(list, zip(*sent))
    score_vecs = self.build_tagging_graph(words)
    return dy.esum([
        dy.pickneglogsoftmax(vec, self.vt.w2i[tag])
        for vec, tag in zip(score_vecs, tags)
    ])
def transduce(seq,Y):
    """Return the summed loss of ``seq`` (indices into ``E``) against ``Y``.

    The embedded sequence is transduced with ``fwR``, every output is
    projected with ``W_`` and scored against the matching entry of ``Y``.
    """
    embedded = [E[i] for i in seq]
    fw = fwR.initial_state().transduce(embedded)

    # Deliberately unused: per the original note, building this extra
    # transduction on the graph affects "strategy 2". Do not remove.
    XXX = fwR2.initial_state().transduce([E[3],E[5]])

    W = W_.expr()
    # Build every projection before any loss node, as the original did.
    projected = []
    for z in fw:
        projected.append(W*z)
    losses = []
    for o, y in zip(projected, Y):
        losses.append(dy.pickneglogsoftmax(o, y))
    return dy.esum(losses)
def calc_sent_loss(sent):
    """Return the summed n-gram LM loss for ``sent`` on a fresh graph.

    The history starts as ``N`` copies of the symbol ``S``. Every word of
    ``sent`` plus one final ``S`` is predicted from
    ``calc_score_of_history(history)``, after which the history slides
    forward by one word.
    """
    dy.renew_cg()
    history = [S] * N
    step_losses = []
    for target in sent + [S]:
        logits = calc_score_of_history(history)
        step_losses.append(dy.pickneglogsoftmax(logits, target))
        # Drop the oldest word and append the one just predicted.
        history = history[1:] + [target]
    return dy.esum(step_losses)
def sent_lm_loss(self, sent):
    """Return the summed language-model loss of ``sent``.

    Starting from ``self.start``, the previous word's embedding is fed to
    the RNN and the current word is scored from the RNN output through the
    affine layer ``(self.lb, self.h2l)``.
    """
    state = self.rnn.initial_state()
    step_losses = []
    previous = self.start
    for word in sent:
        state = state.add_input(self.embeddings[previous])
        logits = dy.affine_transform([self.lb, self.h2l, state.output()])
        step_losses.append(dy.pickneglogsoftmax(logits, word))
        previous = word
    return dy.esum(step_losses)
def BuildLMGraph(self, sent):
    """Return the summed next-word loss for ``sent`` on a fresh graph.

    Items of ``sent`` are converted with ``int``. All but the last are
    embedded and transduced in one call; each output is scored against the
    item that follows it.
    """
    dy.renew_cg()
    start = self.builder.initial_state()
    inputs = [self.lookup[int(cw)] for cw in sent[:-1]]
    expected_outputs = [int(nw) for nw in sent[1:]]
    outputs = start.transduce(inputs)

    errs = []
    for y_t, target in zip(outputs, expected_outputs):
        logits = self.bias + (self.R * y_t)
        errs.append(dy.pickneglogsoftmax(logits, target))
    return dy.esum(errs)
def calc_sent_loss(sent):
    """Return the summed loss of predicting each word from its window.

    The sentence is padded with ``N`` copies of ``S`` on both sides. For
    every original position the embeddings of the ``N`` words before and
    the ``N`` words after are summed and projected with ``W_w`` to score
    the word at that position.
    """
    dy.renew_cg()
    padding = [S] * N
    padded_sent = padding + sent + padding
    padded_emb = [W_c_p[x] for x in padded_sent]

    step_losses = []
    for offset, target in enumerate(sent):
        centre = offset + N
        window = padded_emb[centre - N:centre] + \
            padded_emb[centre + 1:centre + N + 1]
        logits = W_w * dy.esum(window)
        step_losses.append(dy.pickneglogsoftmax(logits, target))
    return dy.esum(step_losses)
def build_lm_graph(self, sent):
    """Return the summed next-word loss for ``sent`` on a fresh graph.

    Items of ``sent`` are converted with ``int``. Each word is fed to the
    RNN and the following word is scored from
    ``self.bias + self.R * output``.
    """
    dy.renew_cg()
    state = self.builder.initial_state()
    errs = []
    for current_word, next_word in zip(sent, sent[1:]):
        state = state.add_input(dy.lookup(self.lookup, int(current_word)))
        logits = self.bias + (self.R * state.output())
        errs.append(dy.pickneglogsoftmax(logits, int(next_word)))
    return dy.esum(errs)
def calc_reinforce_loss(words, tags, delta):
    """Return ``(mixer_loss, baseline_loss)`` for one tagged sentence.

    The mixer loss uses cross entropy on the first ``len - delta`` positions
    and a REINFORCE term on the last ``delta`` positions (all positions when
    the sentence is no longer than ``delta``). The reward is the accuracy of
    tags sampled from the model's own distribution; a learned per-word
    baseline is subtracted from it and trained separately through
    ``baseline_loss``.

    Args:
        words: word ids indexing ``LOOKUP``.
        tags: gold tag ids, one per word.
        delta: number of trailing positions trained with REINFORCE.

    Returns:
        Two expressions: the summed mixer loss and the summed squared error
        of the baseline.
    """
    dy.renew_cg()
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    # Cross-entropy loss of the gold tag at each position.
    losses = [dy.pickneglogsoftmax(score, tag)
              for score, tag in zip(scores, tags)]

    # Sample one tag per position from the softmax over the scores. The
    # earlier code derived "probabilities" from exp(loss) through a
    # non-existent method, which could not run.
    samples = []
    for score in scores:
        probs = np.asarray(dy.softmax(score).npvalue(), dtype=np.float64)
        # Renormalise so float32 rounding cannot trip the sum-to-one check
        # in np.random.choice.
        probs /= probs.sum()
        samples.append(int(np.random.choice(len(probs), p=probs)))

    # Reward = accuracy of the sampled tags.
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct)) / len(correct)
    r = dy.constant((1), r_i)

    # Reward baseline per word. nobackprop keeps the baseline from shaping
    # the word representations.
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)])
           for x in word_reps]
    # The policy term must not train the baseline either.
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    baseline_scores = [dy.square(r - x) for x in r_b]

    # REINFORCE term: (reward - baseline) * -log p(sampled tag).
    reinforce_scores = [
        r_s * dy.pickneglogsoftmax(score, sample)
        for r_s, score, sample in zip(rewards_over_baseline, scores, samples)
    ]

    # First len - delta positions from cross entropy, last delta positions
    # from REINFORCE.
    n_positions = len(losses)
    if n_positions > delta:
        split = n_positions - delta
        mixer_scores = losses[:split] + reinforce_scores[split:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)
def loss(self, input_, y):
    """Return the negative log-softmax loss of ``y`` under ``input_``.

    The batched DyNet operation is used when ``self.batched`` is truthy.
    """
    pick = dy.pickneglogsoftmax_batch if self.batched else dy.pickneglogsoftmax
    return pick(input_, y)
def calc_loss(scores, tags):
    """Sum the negative log-softmax loss of each tag under its scores."""
    per_position = []
    for score, tag in zip(scores, tags):
        per_position.append(dy.pickneglogsoftmax(score, tag))
    return dy.esum(per_position)
def train(self, train_file, epochs):
    """Train with averaged minibatch losses and a live loss plot.

    Each line of ``train_file`` holds whitespace-separated features followed
    by the action label. Once ``self.properties.minibatch_size`` losses are
    collected they are averaged, back-propagated, applied through
    ``self.updater`` and the graph is renewed. Every tenth minibatch the
    plot is redrawn and progress is printed. A trailing partial minibatch
    is discarded at the end of every epoch.

    Args:
        train_file: path of the training file; re-read and re-shuffled
            every epoch.
        epochs: number of passes over the file.

    Returns:
        None.
    """
    # matplotlib config
    loss_values = []
    plt.ion()
    ax = plt.gca()
    ax.set_xlim([0, 10])
    ax.set_ylim([0, 3])
    plt.title("Loss over time")
    plt.xlabel("Minibatch")
    plt.ylabel("Loss")

    for i in range(epochs):
        print('started epoch', (i+1))
        losses = []
        # Close the file deterministically instead of leaking the handle.
        with open(train_file, 'r') as handle:
            train_data = handle.read().strip().split('\n')
        random.shuffle(train_data)
        step = 0
        for line in train_data:
            fields = line.strip().split()
            # the last field is the action label, everything else features
            features, label = fields[:-1], fields[-1]
            gold_label = self.vocab.action2id(label)
            result = self.build_graph(features)
            losses.append(dynet.pickneglogsoftmax(result, gold_label))
            step += 1

            if len(losses) >= self.properties.minibatch_size:
                minibatch_loss = dynet.esum(losses) / len(losses)
                minibatch_loss.forward()
                minibatch_loss_value = minibatch_loss.value()
                loss_values.append(minibatch_loss_value)
                if len(loss_values) % 10 == 0:
                    ax.set_xlim([0, len(loss_values)+10])
                    ax.plot(loss_values)
                    plt.draw()
                    plt.pause(0.0001)
                    progress = round(100 * float(step) / len(train_data), 2)
                    print('current minibatch loss', minibatch_loss_value,
                          'progress:', progress, '%')
                minibatch_loss.backward()
                self.updater.update()
                losses = []
                dynet.renew_cg()

        # Leftover items form an incomplete minibatch; renewing the graph
        # drops them.
        dynet.renew_cg()
def calc_loss(self, scores, axis, true, importance):
    """Return a list of loss terms for ``scores`` on one axis.

    One term per gold label: its importance weight times the negative
    log-softmax at that label. When ``self.loss == "max_margin"`` one more
    term is appended: the largest log-softmax value among the labels of
    this axis that are not in ``true``.
    """
    terms = []
    for label, weight in zip(true, importance):
        terms.append(weight * dy.pickneglogsoftmax(scores, label))
    if self.loss == "max_margin":
        wrong_labels = list(set(range(self.num_labels[axis])) - set(true))
        terms.append(
            dy.max_dim(dy.log_softmax(scores, restrict=wrong_labels)))
    return terms
def compute_loss(model, char_lookup, feat_lookup, R, bias, encoder_frnn, encoder_rrnn, decoder_rnn, W_c, W__a, U__a, v__a,
                 lemma, feats, word, alphabet_index, feat_index, feature_types):
    """Return the summed per-character loss of decoding ``word`` plus ``END_WORD``.

    A new computation graph is started, the six parameter objects (``R``,
    ``bias``, ``W_c``, ``W__a``, ``U__a``, ``v__a``) are turned into
    expressions, and the encoder outputs are obtained from
    ``encode_feats_and_chars``.  The decoder is then stepped once per output
    character: it is fed the previous character's lookup vector (starting with
    ``BEGIN_WORD``), ``attend`` is called on the encoder outputs and the
    decoder output, and the readout ``R * attention_output + bias`` is scored
    against the gold character with ``pc.pickneglogsoftmax``.  Characters
    missing from ``alphabet_index`` are replaced by ``UNK`` both as the target
    and as the next decoder input.

    ``model`` is accepted for interface compatibility but is not read here.

    Returns:
        ``pc.esum`` over the per-character loss expressions.
    """
    pc.renew_cg()

    # Turn the parameter objects into expressions on the fresh graph.
    R, bias, W_c, W__a, U__a, v__a = [
        pc.parameter(param) for param in (R, bias, W_c, W__a, U__a, v__a)
    ]

    blstm_outputs = encode_feats_and_chars(alphabet_index, char_lookup, encoder_frnn, encoder_rrnn, feat_index,
                                           feat_lookup, feats, feature_types, lemma)

    # The decoder starts from its initial state and is first fed BEGIN_WORD.
    state = decoder_rnn.initial_state()
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]

    step_losses = []
    for output_char in word + END_WORD:
        # advance the decoder and attend over the encoder outputs
        state = state.add_input(prev_output_vec)
        attention_output_vector, _alphas, _w = attend(blstm_outputs, state.output(), W_c, v__a, W__a, U__a)

        # readout layer producing the scores over output characters
        readout = R * attention_output_vector + bias

        # unknown characters are scored, and fed back, as UNK
        target_char = output_char if output_char in alphabet_index else UNK
        target_id = alphabet_index[target_char]
        step_losses.append(pc.pickneglogsoftmax(readout, target_id))

        # "feedback": the gold character becomes the next decoder input
        # TODO: add "input feeding" - the attention_output_vector is also concatenated to the next decoder input
        prev_output_vec = char_lookup[target_id]

    return pc.esum(step_losses)
b = model.add_parameters((ntags)) # Softmax bias # A function to calculate scores for one value def calc_scores(words): # Create a computation graph, and add parameters dy.renew_cg() # Take the sum of all the embedding vectors for each word score = dy.esum([dy.lookup(W, x) for x in words]) # Add the bias vector and return return score + b for ITER in range(100): # Perform training random.shuffle(train) train_loss = 0.0 start = time.time() for words, tag in train: my_loss = dy.pickneglogsoftmax(calc_scores(words), tag) train_loss += my_loss.value() my_loss.backward() trainer.update() print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start)) # Perform testing test_correct = 0.0 for words, tag in dev: scores = calc_scores(words).npvalue() predict = np.argmax(scores) if predict == tag: test_correct += 1 print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Run one training step that scores a node's true neighbors against uniformly
    drawn negative samples, and return the loss as a float.

    The true neighbors are ``targets(g, node)`` when ``is_source`` is set and
    ``sources(g, node)`` otherwise; with no true neighbors the function returns
    0.0 without touching the model.  ``opts.neg_samp`` negatives are drawn per
    true neighbor from ``range(N)``.  With ``opts.nll`` the loss is the sum,
    over true neighbors, of the negative log-softmax of the true score among
    its group of negatives; otherwise it is the sum of the hinge terms
    ``rectify(1 - (true_score - negative_score))``.  The loss is
    back-propagated and ``trainer.update()`` is called.  When ``log_file`` is
    not None, every score and the final loss are written to it.
    """
    # true instances likelihood
    if is_source:
        trues = targets(g, node)
        side = '->'
    else:
        trues = sources(g, node)
        side = '<-'
    if len(trues) == 0:
        return 0.0

    cg_options = {'immediate_compute': True, 'check_validity': True} if opts.debug else {}
    dy.renew_cg(**cg_options)

    def pair_score(other):
        # `node` sits on the source side or the target side depending on direction.
        if is_source:
            return assoc_model.word_assoc_score(node, other, rel)
        return assoc_model.word_assoc_score(other, node, rel)

    # association scores must be built after renew_cg (earlier expressions would be stale)
    true_scores = []
    for true_node in trues:
        true_score = pair_score(true_node)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'
                           .format(node, side, true_node, true_score.scalar_value()))
        true_scores.append(true_score)

    # false targets likelihood - negative sampling (uniform)
    use_nll = opts.nll
    negs_per_true = opts.neg_samp
    if use_nll:
        # one group per true neighbor, with the true score at index 0
        sample_scores = [[true_score] for true_score in true_scores]
    else:
        margins = []
    neg_samples = [np.random.choice(range(N)) for _ in range(negs_per_true * len(trues))]

    # Swap out the node itself and its true neighbors where they were drawn.
    # Only the first occurrence of each is replaced, and the replacement draw
    # is not re-checked.
    for excluded in [node] + trues:
        if excluded in neg_samples:
            neg_samples.remove(excluded)
            neg_samples.append(np.random.choice(range(N)))

    for idx, neg_node in enumerate(neg_samples):
        neg_score = pair_score(neg_node)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'
                           .format(node, side, neg_node, neg_score.scalar_value()))
        # consecutive runs of `negs_per_true` negatives belong to the same true neighbor
        true_idx = idx // negs_per_true
        if use_nll:
            sample_scores[true_idx].append(neg_score)
        else:
            # TODO maybe use dy.hinge()
            margin = true_scores[true_idx] - neg_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))

    # compute overall loss
    if use_nll:
        loss_terms = [dy.pickneglogsoftmax(dy.concatenate(group), 0) for group in sample_scores]
    else:
        loss_terms = margins
    dy_loss = dy.esum(loss_terms) if len(loss_terms) != 0 else dy.scalarInput(0.0)

    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'.format(node, sc_loss))

    # backprop and update
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss
# Minibatch training loop (excerpt).  `args`, `training`, `classify`, `sgd`,
# `eloss` and `alpha` are defined outside this excerpt, and the excerpt stops
# inside the dev-evaluation branch.
dev_time = 0
# Reporting intervals, expressed in numbers of training examples.
report = args.minibatch_size * 30
dev_report = args.minibatch_size * 600
for epoch in range(50):
    random.shuffle(training)
    print(("Epoch {} starting".format(epoch+1)))
    i = 0
    # NOTE(review): `i` is not advanced in the visible part of the loop body —
    # presumably it is incremented further down; confirm.
    while i < len(training):
        # one computation graph per minibatch
        dy.renew_cg()
        # the last minibatch of an epoch may be smaller than args.minibatch_size
        mbsize = min(args.minibatch_size, len(training) - i)
        minibatch = training[i:i+mbsize]
        losses = []
        for lbl, img in minibatch:
            x = dy.inputVector(img)
            logits = classify(x, dropout=True)
            loss = dy.pickneglogsoftmax(logits, lbl)
            losses.append(loss)
        # mean loss over the minibatch
        mbloss = dy.esum(losses) / mbsize
        mbloss.backward()
        sgd.update()

        # eloss is an exponentially smoothed loss.
        if eloss is None:
            eloss = mbloss.scalar_value()
        else:
            eloss = mbloss.scalar_value() * alpha + eloss * (1.0 - alpha)

        # Do dev evaluation here:
        if (i > 0) and (i % dev_report == 0):
            # 10x10 table of zeros and a running count, filled in by code past this excerpt
            confusion = [[0 for _ in range(10)] for _ in range(10)]
            correct = 0
return ngrams for ITER in range(10): # Perform training random.shuffle(train) train_loss = 0.0 train_correct = 0.0 start = time.time() for _, wids, tag in train: scores = calc_scores(wids) predict = np.argmax(scores.npvalue()) if predict == tag: train_correct += 1 my_loss = dy.pickneglogsoftmax(scores, tag) train_loss += my_loss.value() my_loss.backward() trainer.update() print("iter %r: train loss/sent=%.4f, acc=%.4f, time=%.2fs" % (ITER, train_loss/len(train), train_correct/len(train), time.time()-start)) # Perform testing test_correct = 0.0 for _, wids, tag in dev: scores = calc_scores(wids).npvalue() predict = np.argmax(scores) if predict == tag: test_correct += 1 print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev))) for words, wids, tag in dev: