Example #1
def pick_neg_log(self, pred, gold):
    # TODO make this a static function in both classes
    if not isinstance(gold, int) and not isinstance(gold, np.int64):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
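A minimal usage sketch for the helper above (hypothetical values; assumes the same dynet import the class relies on):

    import dynet

    dynet.renew_cg()
    pred = dynet.softmax(dynet.inputVector([1.0, 2.0, 0.5]))

    # hard gold label (int): pick a single log-probability
    loss_hard = -dynet.log(dynet.pick(pred, 1))

    # soft gold distribution: cross-entropy against the whole vector
    dy_gold = dynet.inputVector([0.1, 0.8, 0.1])
    loss_soft = -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))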
Example #2
    def _get_loss(self, input, targets, epsilon=1e-10):
        layers = self.compute_output_layer(input)

        log_out = dy.log(layers[-1] + epsilon)

        loss = dy.zeros(1)
        for t in targets:
            loss += dy.pick(log_out, t)

        r = np.random.randint(self.dim_out)
        while r in targets:
            r = np.random.randint(self.dim_out)
        loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
        #loss -= dy.pick(log_out, r)

        return -loss
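For a single sampled negative index r ∉ targets, the value returned above is loss = -( Σ_{t ∈ targets} log(y_t + ε) + log(1 - y_r + ε) ), where y is the output layer: correct labels are pushed toward 1 and the sampled negative toward 0.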
Example #3
def select_action(tree, policy, choose_max=False, return_prob=False, mode='train'):
    prob, pairs = policy.selection_by_tree(tree, mode)
    if pairs is None:
        if return_prob:
            return None, None, None, None
        else:
            return None, None, None
    with np.errstate(all='raise'):
        try:
            prob_v = prob.npvalue()
            if choose_max:
                idx = np.argmax(prob_v)
            else:
                # if np.random.random() < policy.epsilon:
                #     idx = np.random.randint(len(prob_v))
                #     while prob_v[idx] == 0:
                #         idx = np.random.randint(len(prob_v))
                # else:
                idx = np.random.choice(range(len(prob_v)), p=prob_v / np.sum(prob_v))
        except Exception:
            # dump diagnostics before re-raising; without the raise,
            # execution would fall through to `prob[idx]` with idx unbound
            for para in policy.model_parameters:
                check_error(para, dy.parameter(policy.model_parameters[para]))
            check_error('history', policy.history.output())
            check_error('pr', prob)
            raise
    action = prob[idx]
    policy.saved_actions[-1].append(action)
    policy.update_history(pairs[idx])
    if return_prob:
        return pairs[idx], prob_v[idx], pairs, prob_v
    return pairs[idx], prob_v[idx], dy.mean_elems(dy.cmult(prob, dy.log(prob)))
Example #4
def finish_episode(policy, trainer, entropy_l):
    loss = []
    all_cum_rewards = []
    for ct, p_rewards in enumerate(policy.rewards):
        R = 0
        rewards = []
        for r in p_rewards[::-1]:
            R = r + policy.gamma * R
            rewards.insert(0, R)
        all_cum_rewards.append(rewards)
        rewards = np.array(rewards) - policy.baseline_reward
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
        for action, reward in zip(policy.saved_actions[ct], rewards):
            loss.append(-dy.log(action) * reward)
    # loss = dy.average(loss) + policy.decaying_beta * dy.average(entropy_l)
    loss = dy.average(loss)
    loss.backward()
    try:
        trainer.update()
        policy.update_baseline(np.mean(all_cum_rewards))
    except RuntimeError:
        print(policy.rewards)
        for actions in policy.saved_actions:
            for action in actions:
                print(action.npvalue())
    policy.update_global_step()
    policy.update_eps()
    return loss.scalar_value()
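A standalone sketch of the discounted-return loop above (gamma and the per-step rewards are hypothetical values, not from the source):

    import numpy as np

    gamma = 0.9
    p_rewards = [0.0, 0.0, 1.0]
    R, rewards = 0.0, []
    for r in p_rewards[::-1]:
        R = r + gamma * R
        rewards.insert(0, R)
    # rewards == [0.81, 0.9, 1.0]: each step's discounted return
    rewards = np.array(rewards)
    # same standardization as in finish_episode
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)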
Example #5
def get_score(self, word, context):
    # Get the loss given a (word, context) pair and perform negative sampling
    objective = dy.logistic(dy.transpose(self.context_embeddings[context]) * self.word_embeddings[word])
    negative_sample = np.random.choice(self.context_size, self.num_sampled, replace=False, p=self.context_fre)
    for context_prime in negative_sample:
        objective *= dy.logistic(-dy.transpose(self.context_embeddings[context_prime]) * self.word_embeddings[word])
    loss = -dy.log(objective)
    return loss
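Because -log turns the product into a sum, this is the standard skip-gram negative-sampling loss: loss = -log σ(c·w) - Σ_{c'} log σ(-c'·w), where c' ranges over the sampled negative contexts.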
Example #6
    def _get_loss_and_prediction(self, input, targets, epsilon=1e-10):
        layers = self.compute_output_layer(input)
        output = layers[-1].value()
        res = {i for i, v in enumerate(output) if v > 0.5}  # indices of labels predicted positive

        log_out = dy.log(layers[-1] + epsilon)

        loss = dy.zeros(1)
        for t in targets:
            loss += dy.pick(log_out, t)

        r = np.random.randint(self.dim_out)
        while r in targets:
            r = np.random.randint(self.dim_out)
        loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
        #loss -= dy.pick(log_out, r)

        return -loss, res
Example #7
def compute_loss_multilabel(self, task, seq, multi_y):
    """
    computes the loss for multi-label instances by summing over the negative log probabilities of all correct labels
    """
    out_probs = self(task, seq)
    losses = []
    for y in multi_y:
        assigned_prob = dn.pick(out_probs, y)
        losses.append(-dn.log(assigned_prob) / len(multi_y))
    return dn.esum(losses)
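Since each term is already divided by len(multi_y), the esum is just the mean of the per-label losses; an equivalent one-liner (a sketch using DyNet's dn.average) would be:

    return dn.average([-dn.log(dn.pick(out_probs, y)) for y in multi_y])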
Example #8
    def decode_loss(self, src1, src2, tgt):
        src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
            src1, src2
        )
        _, prev_coverage = self.get_coverage(
            a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1))
        )

        loss = []
        cov_loss = []
        diag_loss = []

        embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
        last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

        for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
            a_t, c1_t = self.attend(
                src1_mat,
                decoder_state,
                src1_w1dt,
                self.att1_w2,
                self.att1_v,
                prev_coverage,
            )
            if not self.single_source:
                _, c2_t = self.attend(
                    src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None
                )
            else:
                c2_t = dy.vecInput(2 * HIDDEN_DIM)

            x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
            decoder_state = decoder_state.add_input(x_t)

            out_vector = self.dec_w * decoder_state.output() + self.dec_b
            probs = dy.softmax(out_vector)
            probs, _ = self.get_pointergen_probs(
                c1_t, decoder_state, x_t, a_t, probs, src1
            )

            loss.append(-dy.log(dy.pick(probs, char)))
            cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
            cov_loss.append(cov_loss_cur)
            diag_loss.append(self.get_diag_loss(a_t, t))

            last_output_embeddings = embedded_char

        loss = dy.esum(loss)
        cov_loss = dy.esum(cov_loss)
        diag_loss = dy.esum(diag_loss)
        return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
Example #9
def get_loss_and_prediction(self, input, target, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    return -dy.log(dy.pick(layers[-1], target) + epsilon), np.argmax(
        layers[-1].value())
Example #10
def get_loss(self, input, target, epsilon=1e-10):
    layers = self.compute_output_layer(input)
    return -dy.log(dy.pick(layers[-1], target) + epsilon)
Example #11
def pick_neg_log(self, pred, gold):
    if hasattr(gold, "__len__"):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
Example #12
def dy_log(x):
    return dy.log(x + 1e-6)
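The additive constant keeps log(0) from producing -inf when a probability underflows to zero, at the cost of a small bias near zero. A quick sketch of the failure mode it avoids (hypothetical values):

    import dynet as dy

    dy.renew_cg()
    p = dy.inputVector([0.0, 1.0])
    # dy.log(p) would yield -inf in the first component;
    # dy_log(p) stays finite at log(1e-6) ≈ -13.8
    safe = dy_log(p)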
Example #13
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc0 = char_encoder.initial_state()
        sc = sc0
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s0 = encoder.initial_state()
    s = s0
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s0 = decoder.initial_state(s.s())
    _s = _s0
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])

        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))

        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
Example #14
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                                   output_word_ids, output_masks, batch_size):
        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # initial "input feeding" vectors to feed decoder - 3*h
        init_input_feeding = dn.lookup_batch(self.init_lookup,
                                             [0] * batch_size)

        # initial feedback embeddings for the decoder, use begin seq symbol embedding
        init_feedback = dn.lookup_batch(
            self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

        # init decoder rnn
        decoder_init = dn.concatenate([init_feedback, init_input_feeding])
        s = s_0.add_input(decoder_init)

        # loss per timestep
        losses = []

        # run the decoder through the output sequences and aggregate loss
        for i, step_word_ids in enumerate(output_word_ids):

            # returns h x batch size matrix
            decoder_rnn_output = s.output()

            # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
            attention_output_vector, alphas = self.attend(
                encoded_inputs, decoder_rnn_output, input_masks)

            # compute output scores (returns vocab_size x batch size matrix)
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # encourage diversity by punishing highly confident predictions
            # TODO: support batching - esp. w.r.t. scalar inputs
            if self.diverse:
                soft = dn.softmax(dn.tanh(h))
                batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                    - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4))
            else:
                # get batch loss for this timestep
                batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

            # mask the loss if at least one sentence is shorter
            if output_masks and output_masks[i][-1] != 1:
                mask_expr = dn.inputVector(output_masks[i])
                # noinspection PyArgumentList
                mask_expr = dn.reshape(mask_expr, (1, ), batch_size)
                batch_loss = batch_loss * mask_expr

            # input feeding approach - input h (attention_output_vector) to the decoder
            # prepare for the next iteration - "feedback"
            feedback_embeddings = dn.lookup_batch(self.output_lookup,
                                                  step_word_ids)
            decoder_input = dn.concatenate(
                [feedback_embeddings, attention_output_vector])
            s = s.add_input(decoder_input)

            losses.append(batch_loss)

        # sum the loss over the time steps and batch
        total_batch_loss = dn.sum_batches(dn.esum(losses))

        return total_batch_loss
Example #15
def pick_neg_log(self, pred, gold):
    if not isinstance(gold, int):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
Example #16
def dy_softplus(x):
    return dy.log(dy.exp(x) + 1)
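dy.exp(x) overflows for large positive x, so the version above can return inf. A numerically stabler variant (a sketch, not from the source, built only from core DyNet ops) uses the identity softplus(x) = max(x, 0) + log(1 + exp(-|x|)):

    def dy_softplus_stable(x):
        # |x| from two ReLUs, avoiding any dependence on an abs op
        abs_x = dy.rectify(x) + dy.rectify(-x)
        # the exp argument is always <= 0, so it cannot overflow
        return dy.rectify(x) + dy.log(dy.exp(-abs_x) + 1)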
Example #17
def train_network(params,
                  ntags,
                  train_data,
                  dev_set,
                  telemetry_file,
                  randstring,
                  very_common_tag=-1):
    global MIN_ACC
    prev_acc = 0
    m = params[0]
    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
    # train the network
    trainer = dy.SimpleSGDTrainer(m)
    total_loss = 0
    seen_instances = 0
    train_good = 0
    very_common_tag_count = 0
    for x_data, train_y in train_data:
        dy.renew_cg()
        output = build_network(params, x_data)
        # l2 regularization did not look promising at all, so it's commented out
        loss = -dy.log(
            output[train_y]
        )  #+ REG_LAMBDA * sum([dy.l2_norm(p) for p in params[2:]])
        if train_y == np.argmax(output.npvalue()):
            train_good += 1
        seen_instances += 1
        total_loss += loss.value()
        loss.backward()
        trainer.update()
        if seen_instances % 20000 == 0:
            # measure elapsed seconds
            secs = time.perf_counter() - t0
            t0 = time.perf_counter()
            good = case = 0
            max_dev_instances = 70 * 1000
            dev_instances = 0
            for x_tuple, dev_y in dev_set:
                output = build_network(params, x_tuple)
                y_hat = np.argmax(output.npvalue())
                case += 1
                if y_hat == dev_y and y_hat == very_common_tag:
                    case -= 1  # don't count this case
                    very_common_tag_count += 1
                elif y_hat == dev_y:
                    good += 1

                dev_instances += 1
                if dev_instances >= max_dev_instances:
                    break
            acc = float(good) / case
            print(
                "iterations: {}. train_accuracy: {} accuracy: {} avg loss: {} secs per 1000:{}"
                .format(seen_instances,
                        float(train_good) / 20000, acc,
                        total_loss / (seen_instances + 1), secs / 20))
            train_good = 0
            if acc > MIN_ACC and acc > prev_acc:
                print("saving.")
                dy.save("params_" + randstring, list(params)[1:])
                prev_acc = acc

            telemetry_file.write("{}\t{}\t{}\t{}\n".format(
                seen_instances, acc, total_loss / (seen_instances + 1),
                secs / 20))
            print("very common tag count: {}".format(very_common_tag_count))
Example #18
def pick_neg_log(pred, gold):
    return -dynet.log(dynet.pick(pred, gold))
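When pred is the softmax of a score vector, DyNet's fused dynet.pickneglogsoftmax computes the same quantity in one numerically stabler operation (a sketch; scores stands for the hypothetical pre-softmax vector):

    # equivalent to pick_neg_log(dynet.softmax(scores), gold)
    loss = dynet.pickneglogsoftmax(scores, gold)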
Example #19
def train(builder,
          model,
          model_parameters,
          X_train,
          y_train,
          nepochs,
          alpha=0.01,
          update=True,
          dropout=0.0,
          x_y_vectors=None,
          num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers The number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):

        total_loss = 0.0

        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):

            path_cache = {}
            batch_indices = epoch_indices[minibatch *
                                          minibatch_size:(minibatch + 1) *
                                          minibatch_size]

            dy.renew_cg()

            loss = dy.esum([
                -dy.log(
                    dy.pick(
                        process_one_instance(
                            builder,
                            model,
                            model_parameters,
                            X_train[batch_indices[i]],
                            path_cache,
                            update,
                            dropout,
                            x_y_vectors=x_y_vectors[batch_indices[i]]
                            if x_y_vectors is not None else None,
                            num_hidden_layers=num_hidden_layers),
                        y_train[batch_indices[i]]))
                for i in range(minibatch_size)
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # deprecated http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers GB
        # and requires an argument (would be epoch i guess...)
        # trainer.update_epoch()
        trainer.update()
        total_loss /= len(y_train)
        print('Epoch', (epoch + 1), '/', nepochs, 'Loss =', total_loss)

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break

        previous_loss = total_loss
Example #20
def pick_neg_log(pred, gold):
    return -dynet.log(dynet.pick(pred, gold))