def pick_neg_log(self, pred, gold):
    # TODO make this a static function in both classes
    if not isinstance(gold, int) and not isinstance(gold, np.int64):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def _get_loss(self, input, targets, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    log_out = dy.log(layers[-1] + epsilon)
    loss = dy.zeros(1)
    for t in targets:
        loss += dy.pick(log_out, t)
        # sample one negative index that is not a gold target
        r = np.random.randint(self.dim_out)
        while r in targets:
            r = np.random.randint(self.dim_out)
        loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
        #loss -= dy.pick(log_out, r)
    return -loss
def select_action(tree, policy, choose_max=False, return_prob=False, mode='train'):
    prob, pairs = policy.selection_by_tree(tree, mode)
    if pairs is None:
        if return_prob:
            return None, None, None, None
        else:
            return None, None, None
    with np.errstate(all='raise'):
        try:
            prob_v = prob.npvalue()
            if choose_max:
                idx = np.argmax(prob_v)
            else:
                # if np.random.random() < policy.epsilon:
                #     idx = np.random.randint(len(prob_v))
                #     while prob_v[idx] == 0:
                #         idx = np.random.randint(len(prob_v))
                # else:
                idx = np.random.choice(range(len(prob_v)), p=prob_v / np.sum(prob_v))
        except:
            for para in policy.model_parameters:
                check_error(para, dy.parameter(policy.model_parameters[para]))
            check_error('history', policy.history.output())
            check_error('pr', prob)
    action = prob[idx]
    policy.saved_actions[-1].append(action)
    policy.update_history(pairs[idx])
    if return_prob:
        return pairs[idx], prob_v[idx], pairs, prob_v
    return pairs[idx], prob_v[idx], dy.mean_elems(dy.cmult(prob, dy.log(prob)))
def finish_episode(policy, trainer, entropy_l):
    loss = []
    all_cum_rewards = []
    for ct, p_rewards in enumerate(policy.rewards):
        # build discounted returns by scanning the per-step rewards backwards
        R = 0
        rewards = []
        for r in p_rewards[::-1]:
            R = r + policy.gamma * R
            rewards.insert(0, R)
        all_cum_rewards.append(rewards)
        rewards = np.array(rewards) - policy.baseline_reward
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
        for action, reward in zip(policy.saved_actions[ct], rewards):
            loss.append(-dy.log(action) * reward)
    # loss = dy.average(loss) + policy.decaying_beta * dy.average(entropy_l)
    loss = dy.average(loss)
    loss.backward()
    try:
        trainer.update()
        policy.update_baseline(np.mean(all_cum_rewards))
    except RuntimeError:
        print(policy.rewards)
        for actions in policy.saved_actions:
            for action in actions:
                print(action.npvalue())
    policy.update_global_step()
    policy.update_eps()
    return loss.scalar_value()
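# A minimal standalone sketch (plain NumPy, hypothetical reward values) of the
# discounted-return computation used in finish_episode above: scan the per-step
# rewards backwards with R_t = r_t + gamma * R_{t+1}.
import numpy as np

def discounted_returns(step_rewards, gamma):
    R = 0.0
    returns = []
    for r in reversed(step_rewards):
        R = r + gamma * R
        returns.insert(0, R)
    return np.array(returns)

# e.g. rewards [1.0, 0.0, 2.0] with gamma = 0.9 give [2.62, 1.8, 2.0]
print(discounted_returns([1.0, 0.0, 2.0], 0.9))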
def get_score(self, word, context):
    ## Get the loss given word, context pair and perform negative sampling
    objective = dy.logistic(
        dy.transpose(self.context_embeddings[context]) * self.word_embeddings[word])
    negative_sample = np.random.choice(self.context_size, self.num_sampled,
                                       replace=False, p=self.context_fre)
    for context_prime in negative_sample:
        objective *= dy.logistic(
            -(dy.transpose(self.context_embeddings[context_prime]) * self.word_embeddings[word]))
    loss = -dy.log(objective)
    return loss
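# A hedged sketch of the same skip-gram negative-sampling objective written as a sum of
# log-sigmoids instead of -log of a product of sigmoids; the two forms are mathematically
# equivalent, but summing log terms avoids underflow when many factors are small.
# `word_vec`, `context_vec`, and `negative_context_vecs` are hypothetical DyNet
# expressions standing in for the looked-up embeddings used by get_score above.
def negative_sampling_loss(word_vec, context_vec, negative_context_vecs):
    # positive term: log sigmoid(c . w)
    pos = dy.log(dy.logistic(dy.dot_product(context_vec, word_vec)))
    # negative terms: log sigmoid(-c' . w) for each sampled context c'
    neg = [dy.log(dy.logistic(-dy.dot_product(c_prime, word_vec)))
           for c_prime in negative_context_vecs]
    return -(pos + dy.esum(neg))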
def _get_loss_and_prediction(self, input, targets, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    output = layers[-1].value()
    # predicted label set: indices whose probability exceeds 0.5
    res = {i for i, p in enumerate(output) if p > 0.5}
    log_out = dy.log(layers[-1] + epsilon)
    loss = dy.zeros(1)
    for t in targets:
        loss += dy.pick(log_out, t)
        # sample one negative index that is not a gold target
        r = np.random.randint(self.dim_out)
        while r in targets:
            r = np.random.randint(self.dim_out)
        loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
        #loss -= dy.pick(log_out, r)
    return -loss, res
def compute_loss_multilabel(self, task, seq, multi_y):
    """
    Computes the loss for multi-label instances by averaging the negative
    log probabilities of all correct labels.
    """
    out_probs = self(task, seq)
    losses = []
    for y in multi_y:
        assigned_prob = dn.pick(out_probs, y)
        losses.append(-dn.log(assigned_prob) / len(multi_y))
    return dn.esum(losses)
def decode_loss(self, src1, src2, tgt):
    src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
        src1, src2)
    _, prev_coverage = self.get_coverage(
        a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1)))

    loss = []
    cov_loss = []
    diag_loss = []
    embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
    last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

    for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
        a_t, c1_t = self.attend(
            src1_mat, decoder_state, src1_w1dt, self.att1_w2, self.att1_v,
            prev_coverage)
        if not self.single_source:
            _, c2_t = self.attend(
                src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None)
        else:
            c2_t = dy.vecInput(2 * HIDDEN_DIM)

        x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
        decoder_state = decoder_state.add_input(x_t)
        out_vector = self.dec_w * decoder_state.output() + self.dec_b
        probs = dy.softmax(out_vector)
        probs, _ = self.get_pointergen_probs(
            c1_t, decoder_state, x_t, a_t, probs, src1)
        loss.append(-dy.log(dy.pick(probs, char)))

        cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
        cov_loss.append(cov_loss_cur)
        diag_loss.append(self.get_diag_loss(a_t, t))
        last_output_embeddings = embedded_char

    loss = dy.esum(loss)
    cov_loss = dy.esum(cov_loss)
    diag_loss = dy.esum(diag_loss)
    return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
def get_loss_and_prediction(self, input, target, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    return -dy.log(dy.pick(layers[-1], target) + epsilon), np.argmax(layers[-1].value())
def get_loss(self, input, target, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    return -dy.log(dy.pick(layers[-1], target) + epsilon)
def pick_neg_log(self, pred, gold):
    if hasattr(gold, "__len__"):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
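# A minimal usage sketch (hypothetical values), assuming `model` is an instance of the
# class defining pick_neg_log: the helper accepts either a hard gold index or a soft
# gold distribution of the same length as pred.
pred = dynet.softmax(dynet.inputVector([1.0, 2.0, 0.5]))
hard_loss = model.pick_neg_log(pred, 2)                # -log pred[2]
soft_loss = model.pick_neg_log(pred, [0.0, 0.7, 0.3])  # -sum(gold * log pred)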
def dy_log(x):
    return dy.log(x + 1e-6)
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]

    # encode each world state with the character LSTM, concatenated with a position embedding
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)

    # encode the input sentence
    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)

    # decode with attention over the sentence encoding and the world-state encoding
    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'

    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
def compute_decoder_batch_loss(self, encoded_inputs, input_masks, output_word_ids,
                               output_masks, batch_size):
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])

    # initialize the decoder rnn
    s_0 = self.decoder_rnn.initial_state()

    # initial "input feeding" vectors to feed decoder - 3*h
    init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)

    # initial feedback embeddings for the decoder, use begin seq symbol embedding
    init_feedback = dn.lookup_batch(
        self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

    # init decoder rnn
    decoder_init = dn.concatenate([init_feedback, init_input_feeding])
    s = s_0.add_input(decoder_init)

    # loss per timestep
    losses = []

    # run the decoder through the output sequences and aggregate loss
    for i, step_word_ids in enumerate(output_word_ids):

        # returns h x batch size matrix
        decoder_rnn_output = s.output()

        # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
        attention_output_vector, alphas = self.attend(
            encoded_inputs, decoder_rnn_output, input_masks)

        # compute output scores (returns vocab_size x batch size matrix)
        # h = readout * attention_output_vector + bias
        h = dn.affine_transform([self.bias, self.readout, attention_output_vector])

        # encourage diversity by punishing highly confident predictions
        # TODO: support batching - esp. w.r.t. scalar inputs
        if self.diverse:
            soft = dn.softmax(dn.tanh(h))
            batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) \
                - dn.log(dn.scalarInput(4))
        else:
            # get batch loss for this timestep
            batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

        # mask the loss if at least one sentence is shorter
        if output_masks and output_masks[i][-1] != 1:
            mask_expr = dn.inputVector(output_masks[i])
            # noinspection PyArgumentList
            mask_expr = dn.reshape(mask_expr, (1,), batch_size)
            batch_loss = batch_loss * mask_expr

        # input feeding approach - input h (attention_output_vector) to the decoder
        # prepare for the next iteration - "feedback"
        feedback_embeddings = dn.lookup_batch(self.output_lookup, step_word_ids)
        decoder_input = dn.concatenate([feedback_embeddings, attention_output_vector])
        s = s.add_input(decoder_input)
        losses.append(batch_loss)

    # sum the loss over the time steps and batch
    total_batch_loss = dn.sum_batches(dn.esum(losses))
    return total_batch_loss
def pick_neg_log(self, pred, gold):
    if not isinstance(gold, int):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def dy_softplus(x):
    return dy.log(dy.exp(x) + 1)
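# A quick sanity check (hypothetical input, assuming dynet is imported as dy and numpy
# as np, as elsewhere in these snippets) that the two stabilized helpers above match
# their closed forms: dy_log(x) = log(x + 1e-6) and dy_softplus(x) = log(exp(x) + 1).
x = dy.scalarInput(2.0)
assert abs(dy_log(x).value() - np.log(2.0 + 1e-6)) < 1e-6
assert abs(dy_softplus(x).value() - np.log(np.exp(2.0) + 1)) < 1e-6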
def train_network(params, ntags, train_data, dev_set, telemetry_file, randstring,
                  very_common_tag=-1):
    global MIN_ACC
    prev_acc = 0
    m = params[0]
    t0 = time.perf_counter()
    # train the network
    trainer = dy.SimpleSGDTrainer(m)
    total_loss = 0
    seen_instances = 0
    train_good = 0
    very_common_tag_count = 0
    for x_data, train_y in train_data:
        dy.renew_cg()
        output = build_network(params, x_data)
        # l2 regularization did not look promising at all, so it's commented out
        loss = -dy.log(output[train_y])  #+ REG_LAMBDA * sum([dy.l2_norm(p) for p in params[2:]])
        if train_y == np.argmax(output.npvalue()):
            train_good += 1
        seen_instances += 1
        total_loss += loss.value()
        loss.backward()
        trainer.update()

        if seen_instances % 20000 == 0:
            # measure elapsed seconds
            secs = time.perf_counter() - t0
            t0 = time.perf_counter()
            good = case = 0
            max_dev_instances = 70 * 1000
            dev_instances = 0
            for x_tuple, dev_y in dev_set:
                output = build_network(params, x_tuple)
                y_hat = np.argmax(output.npvalue())
                case += 1
                if y_hat == dev_y and y_hat == very_common_tag:
                    case -= 1  # don't count this case
                    very_common_tag_count += 1
                elif y_hat == dev_y:
                    good += 1
                dev_instances += 1
                if dev_instances >= max_dev_instances:
                    break
            acc = float(good) / case
            print(
                "iterations: {}. train_accuracy: {} accuracy: {} avg loss: {} secs per 1000:{}"
                .format(seen_instances, float(train_good) / 20000, acc,
                        total_loss / (seen_instances + 1), secs / 20))
            train_good = 0
            if acc > MIN_ACC and acc > prev_acc:
                print("saving.")
                dy.save("params_" + randstring, list(params)[1:])
                prev_acc = acc
            telemetry_file.write("{}\t{}\t{}\t{}\n".format(
                seen_instances, acc, total_loss / (seen_instances + 1), secs / 20))
    print("very common tag count: {}".format(very_common_tag_count))
def pick_neg_log(pred, gold):
    return -dynet.log(dynet.pick(pred, gold))
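# When `pred` is dynet.softmax(scores), this helper matches DyNet's fused (and more
# numerically stable) pickneglogsoftmax; a short check with hypothetical scores:
scores = dynet.inputVector([0.2, 1.5, -0.3])
a = pick_neg_log(dynet.softmax(scores), 1)
b = dynet.pickneglogsoftmax(scores, 1)
assert abs(a.value() - b.value()) < 1e-5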
def train(builder, model, model_parameters, X_train, y_train, nepochs, alpha=0.01,
          update=True, dropout=0.0, x_y_vectors=None, num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):
        total_loss = 0.0
        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):
            path_cache = {}
            batch_indices = epoch_indices[minibatch * minibatch_size:(minibatch + 1) * minibatch_size]
            dy.renew_cg()
            loss = dy.esum([
                -dy.log(
                    dy.pick(
                        process_one_instance(
                            builder, model, model_parameters,
                            X_train[batch_indices[i]], path_cache, update, dropout,
                            x_y_vectors=x_y_vectors[batch_indices[i]]
                            if x_y_vectors is not None else None,
                            num_hidden_layers=num_hidden_layers),
                        y_train[batch_indices[i]]))
                for i in range(minibatch_size)
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # deprecated http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers GB
        # and requires an argument (would be epoch i guess...)
        # trainer.update_epoch()
        trainer.update()

        total_loss /= len(y_train)
        print('Epoch', (epoch + 1), '/', nepochs, 'Loss =', total_loss)

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break

        previous_loss = total_loss