Example 1
def xent_objective(decoder, weight=None) -> Objective:
    """Get XENT objective from decoder with cost."""
    return Objective(
        name="{} - cross-entropy".format(decoder.name),
        decoder=decoder,
        loss=decoder.cost,
        gradients=None,
        weight=weight,
    )
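A minimal wiring sketch, assuming a Neural Monkey decoder instance named `decoder` (exposing `.name` and `.cost`, as required above) has already been built; passing the objective to `GenericTrainer` mirrors the test in Example 2 below.

# Hypothetical wiring; `decoder` is assumed to be an existing decoder
# exposing `.name` and `.cost`.
objective = xent_objective(decoder, weight=1.0)
trainer = GenericTrainer([objective], clip_norm=1.0)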
Example 2
    def setUp(self):
        self.mpart = TestMP("dummy_model_part")
        self.mpart_2 = TestMP("dummy_model_part_2")

        objective = Objective(name="dummy_objective",
                              decoder=self.mpart,
                              loss=self.mpart.loss,
                              gradients=None,
                              weight=None)

        objective_2 = Objective(name="dummy_objective_2",
                                decoder=self.mpart_2,
                                loss=self.mpart_2.loss,
                                gradients=None,
                                weight=None)

        self.trainer1 = GenericTrainer([objective])
        self.trainer2 = GenericTrainer([objective_2], clip_norm=1.0)
Example 3
def mrt_objective(decoder, loss, weight=None, name='objective') -> Objective:
    """Get the mrt objective with mrt loss"""
    return Objective(
        name="{} - minimum-risk".format(decoder.name),
        decoder=decoder,
        loss=loss,
        gradients=None,
        weight=weight,
    )
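A sketch of mixing the cross-entropy and MRT objectives through the `weight` field, assuming `decoder` and a precomputed MRT loss tensor `mrt_loss` already exist; `GenericTrainer` accepting a list of objectives follows Example 2.

# Hypothetical multi-objective setup; `decoder` and `mrt_loss` are assumed
# to exist. The weights mix the two losses inside the trainer.
xent = xent_objective(decoder, weight=0.5)
mrt = mrt_objective(decoder, mrt_loss, weight=0.5)
trainer = GenericTrainer([xent, mrt])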
Example 4
def self_critical_objective(decoder: Decoder,
                            reward_function: RewardFunction,
                            weight: float = None) -> Objective:
    """Self-critical objective.

    Args:
        decoder: A recurrent decoder.
        reward_function: A reward function computing score in Python.
        weight: Mixing weight for a trainer.

    Returns:
        Objective object to be used in generic trainer.
    """
    check_argument_types()

    # logits, shape (time, batch, vocab)
    train_logits = tf.stack(decoder.train_logits)
    runtime_logits = tf.stack(decoder.runtime_logits)
    runtime_mask = tf.stack(decoder.runtime_mask)

    # decoded, shape (time, batch)
    train_decoded = tf.argmax(train_logits, axis=2)
    runtime_decoded = tf.argmax(runtime_logits, axis=2)
    reference = decoder.train_inputs

    # rewards, shape (batch)
    train_reward = tf.py_func(reward_function, [reference, train_decoded],
                              tf.float32)
    runtime_reward = tf.py_func(reward_function, [reference, runtime_decoded],
                                tf.float32)

    tf.summary.scalar("train_{}/{}".format(decoder.data_id,
                                           reward_function.__name__),
                      tf.reduce_mean(runtime_reward),
                      collections=["summary_train"])

    # REINFORCE score: shape (time, batch, vocab)
    score_by_word = reinforce_score(runtime_reward, train_reward,
                                    runtime_decoded, runtime_logits)

    float_mask = tf.to_float(runtime_mask)
    masked_score_by_word = score_by_word * float_mask

    # sum the matrix up (dot product of rows, sum over time, and over batch)
    # pylint: disable=invalid-unary-operand-type
    loss = -tf.reduce_sum(masked_score_by_word) / tf.reduce_sum(float_mask)
    # pylint: enable=invalid-unary-operand-type

    tf.summary.scalar("train_{}/self_critical_cost".format(decoder.data_id),
                      loss,
                      collections=["summary_train"])

    return Objective(name="{}_self_critical".format(decoder.name),
                     decoder=decoder,
                     loss=loss,
                     gradients=None,
                     weight=weight)
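A toy reward function matching the contract used by the `tf.py_func` calls above: both arguments arrive as integer index arrays of shape (time, batch), and one float32 reward per batch element is returned. The function below (per-token accuracy) is only an illustrative stand-in, not a reward shipped with the library.

import numpy as np

def token_accuracy_reward(references: np.ndarray,
                          hypotheses: np.ndarray) -> np.ndarray:
    """Illustrative reward: fraction of matching tokens per sentence."""
    # Trim to a common time dimension before comparing.
    length = min(references.shape[0], hypotheses.shape[0])
    if length == 0:
        return np.zeros(references.shape[1], dtype=np.float32)
    matches = (references[:length] == hypotheses[:length]).astype(np.float32)
    # Shape (batch,): one reward per batch element, as tf.py_func expects.
    return matches.mean(axis=0)

# objective = self_critical_objective(decoder, token_accuracy_reward)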
Example 5
def expected_loss_objective(decoder: Decoder,
                            reward_function: RewardFunction,
                            control_variate: str = None) -> Objective:
    """Construct Expected Loss objective for training with bandit feedback.

    'Bandit Structured Prediction for Neural Sequence-to-Sequence Learning'
    Details: http://www.aclweb.org/anthology/P17-1138

    :param decoder: a recurrent decoder to sample from
    :param reward_function: any evaluator object
    :param control_variate: optional 'baseline' average reward
    :return: Objective object to be used in generic trainer
    """
    check_argument_types()

    # decoded, shape (time, batch)
    # pylint: disable=protected-access
    sample_loop_result = decoder.decoding_loop(train_mode=False, sample=True)
    sample_logits = sample_loop_result[0]
    sample_decoded = sample_loop_result[3]

    reference = decoder.train_inputs

    def _score_with_reward_function(references: np.array,
                                    hypotheses: np.array) -> np.array:
        """Score (time, batch) arrays with sentence-based reward function.

        Parts of the sentence after generated <pad> or </s> are ignored.
        BPE-postprocessing is also included.

        :param references: array of indices of references, shape (time, batch)
        :param hypotheses: array of indices of hypotheses, shape (time, batch)
        :return: an array of batch length with float rewards
        """
        rewards = []
        for refs, hyps in zip(references.transpose(), hypotheses.transpose()):
            ref_seq = []
            hyp_seq = []
            for r_token in refs:
                token = decoder.vocabulary.index_to_word[r_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                ref_seq.append(token)
            for h_token in hyps:
                token = decoder.vocabulary.index_to_word[h_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                hyp_seq.append(token)
            # join BPEs, split on " " to prepare list for evaluator
            refs_tokens = " ".join(ref_seq).replace("@@ ", "").split(" ")
            hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ")
            reward = float(reward_function([hyps_tokens], [refs_tokens]))
            rewards.append(reward)
        return np.array(rewards, dtype=np.float32)

    # rewards, shape (batch)
    sample_reward = tf.py_func(_score_with_reward_function,
                               [reference, sample_decoded], tf.float32)

    # if specified, compute the average reward baseline
    baseline = None

    reward_counter = tf.Variable(0.0, trainable=False, name="reward_counter")
    reward_sum = tf.Variable(0.0, trainable=False, name="reward_sum")

    if control_variate == "baseline":
        # increment the cumulative reward in the decoder
        reward_counter = tf.assign_add(reward_counter,
                                       tf.to_float(decoder.batch_size))
        reward_sum = tf.assign_add(reward_sum, tf.reduce_sum(sample_reward))
        baseline = tf.div(reward_sum, tf.maximum(reward_counter, 1.0))

    tf.summary.scalar("sample_{}/reward".format(decoder.data_id),
                      tf.reduce_mean(sample_reward),
                      collections=["summary_train"])

    # REINFORCE score: shape (time, batch, vocab)
    sent_loss = reinforce_score(sample_reward, baseline, sample_decoded,
                                sample_logits)

    batch_loss = tf.reduce_mean(sent_loss)

    tf.summary.scalar("train_{}/self_bandit_cost".format(decoder.data_id),
                      batch_loss,
                      collections=["summary_train"])

    return Objective(name="{}_bandit".format(decoder.name),
                     decoder=decoder,
                     loss=batch_loss,
                     gradients=None,
                     weight=None)
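The BPE postprocessing inside `_score_with_reward_function` above is a plain join-and-merge over the "@@ " separators before the evaluator is called; a small standalone illustration with made-up subword tokens:

# Standalone illustration of the "join BPEs" line used above.
hyp_seq = ["fav@@", "or@@", "ite", "col@@", "or"]
hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ")
assert hyps_tokens == ["favorite", "color"]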
Example 6
def rl_objective(decoder: Decoder,
                 reward_function: RewardFunction,
                 subtract_baseline: bool = False,
                 normalize: bool = False,
                 temperature: float = 1.,
                 ce_smoothing: float = 0.,
                 alpha: float = 1.,
                 sample_size: int = 1) -> Objective:
    """Construct RL objective for training with sentence-level feedback.

    Depending on the options the objective corresponds to:
    1) sample_size = 1, normalize = False, ce_smoothing = 0.0
     Bandit objective (Eq. 2) described in 'Bandit Structured Prediction for
     Neural Sequence-to-Sequence Learning'
     (http://www.aclweb.org/anthology/P17-1138)
     It's recommended to set subtract_baseline = True.
    2) sample_size > 1, normalize = True, ce_smoothing = 0.0
     Minimum Risk Training as described in 'Minimum Risk Training for Neural
     Machine Translation' (http://www.aclweb.org/anthology/P16-1159) (Eq. 12).
    3) sample_size > 1, normalize = False, ce_smoothing = 0.0
     The Google 'Reinforce' objective as proposed in 'Google’s NMT System:
     Bridging the Gap between Human and Machine Translation'
     (https://arxiv.org/pdf/1609.08144.pdf) (Eq. 8).
    4) sample_size > 1, normalize = False, ce_smoothing > 0.0
     Google's 'Mixed' objective in the above paper (Eq. 9),
     where ce_smoothing implements alpha.

    Note that 'alpha' controls the sharpness of the normalized distribution,
    while 'temperature' controls the sharpness during sampling.

    :param decoder: a recurrent decoder to sample from
    :param reward_function: any evaluator object
    :param subtract_baseline: avg reward is subtracted from obtained reward
    :param normalize: the probabilities of the samples are re-normalized
    :param sample_size: number of samples to obtain feedback for
    :param ce_smoothing: add cross-entropy loss with this coefficient to loss
    :param alpha: determines the shape of the normalized distribution
    :param temperature: the softmax temperature for sampling
    :return: Objective object to be used in generic trainer
    """
    check_argument_types()

    reference = decoder.train_inputs

    def _score_with_reward_function(references: np.array,
                                    hypotheses: np.array) -> np.array:
        """Score (time, batch) arrays with sentence-based reward function.

        Parts of the sentence after generated <pad> or </s> are ignored.
        BPE-postprocessing is also included.

        :param references: array of indices of references, shape (time, batch)
        :param hypotheses: array of indices of hypotheses, shape (time, batch)
        :return: an array of batch length with float rewards
        """
        rewards = []
        for refs, hyps in zip(references.transpose(), hypotheses.transpose()):
            ref_seq = []
            hyp_seq = []
            for r_token in refs:
                token = decoder.vocabulary.index_to_word[r_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                ref_seq.append(token)
            for h_token in hyps:
                token = decoder.vocabulary.index_to_word[h_token]
                if token == END_TOKEN or token == PAD_TOKEN:
                    break
                hyp_seq.append(token)
            # join BPEs, split on " " to prepare list for evaluator
            refs_tokens = " ".join(ref_seq).replace("@@ ", "").split(" ")
            hyps_tokens = " ".join(hyp_seq).replace("@@ ", "").split(" ")
            reward = float(reward_function([hyps_tokens], [refs_tokens]))
            rewards.append(reward)
        return np.array(rewards, dtype=np.float32)

    samples_rewards = []
    samples_logprobs = []

    for _ in range(sample_size):
        # sample from logits
        # decoded, shape (time, batch)
        sample_loop_result = decoder.decoding_loop(train_mode=False,
                                                   sample=True,
                                                   temperature=temperature)
        sample_logits = sample_loop_result[0]
        sample_decoded = sample_loop_result[3]

        # rewards, shape (batch)
        # simulate from reference
        sample_reward = tf.py_func(_score_with_reward_function,
                                   [reference, sample_decoded], tf.float32)

        # pylint: disable=invalid-unary-operand-type
        word_logprobs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sample_decoded, logits=sample_logits)

        # sum word log prob to sentence log prob
        # no masking here, since otherwise shorter sentences are preferred
        sent_logprobs = tf.reduce_sum(word_logprobs, axis=0)

        samples_rewards.append(sample_reward)  # sample_size x batch
        samples_logprobs.append(sent_logprobs)  # sample_size x batch

    # stack samples, sample_size x batch
    samples_rewards_stacked = tf.stack(samples_rewards)
    samples_logprobs_stacked = tf.stack(samples_logprobs)

    if subtract_baseline:
        # if specified, compute the average reward baseline
        reward_counter = tf.Variable(0.0,
                                     trainable=False,
                                     name="reward_counter")
        reward_sum = tf.Variable(0.0, trainable=False, name="reward_sum")
        # increment the cumulative reward
        reward_counter = tf.assign_add(
            reward_counter, tf.to_float(decoder.batch_size * sample_size))
        # sum over batch and samples
        reward_sum = tf.assign_add(reward_sum,
                                   tf.reduce_sum(samples_rewards_stacked))
        # compute baseline: avg of previous rewards
        baseline = tf.div(reward_sum, tf.maximum(reward_counter, 1.0))
        samples_rewards_stacked -= baseline

        tf.summary.scalar("train_{}/rl_reward_baseline".format(
            decoder.data_id),
                          tf.reduce_mean(baseline),
                          collections=["summary_train"])

    if normalize:
        # normalize over sample space
        samples_logprobs_stacked = tf.nn.softmax(samples_logprobs_stacked *
                                                 alpha,
                                                 dim=0)

    scored_probs = tf.stop_gradient(
        tf.negative(samples_rewards_stacked)) * samples_logprobs_stacked

    # sum over samples
    total_loss = tf.reduce_sum(scored_probs, axis=0)

    # average over batch
    batch_loss = tf.reduce_mean(total_loss)

    if ce_smoothing > 0.0:
        batch_loss += tf.multiply(ce_smoothing, decoder.cost)

    tf.summary.scalar("train_{}/self_rl_cost".format(decoder.data_id),
                      batch_loss,
                      collections=["summary_train"])

    return Objective(name="{}_rl".format(decoder.name),
                     decoder=decoder,
                     loss=batch_loss,
                     gradients=None,
                     weight=None)
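The four configurations listed in the docstring translate into the parameter combinations below; `decoder` and `reward_fn` are assumed to be an existing decoder and evaluator, so this is a sketch of the settings rather than runnable setup code.

# Hypothetical calls; `decoder` and `reward_fn` are assumed to exist.
bandit = rl_objective(decoder, reward_fn, subtract_baseline=True)          # (1)
mrt = rl_objective(decoder, reward_fn, sample_size=5, normalize=True)      # (2)
reinforce = rl_objective(decoder, reward_fn, sample_size=5)                # (3)
mixed = rl_objective(decoder, reward_fn, sample_size=5, ce_smoothing=0.3)  # (4)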