def model_fn(features, labels, mode, params):
  """Estimator model function for the Transformer.

  The network is built inside the "model" variable scope and the returned
  spec depends on `mode`:

    * PREDICT: the model output is returned directly as the predictions.
    * EVAL: loss, logits (under the "predictions" key) and eval metric ops.
    * anything else (the training path): loss and the op from get_train_op.

  Args:
    features: model inputs, passed to the Transformer unchanged.
    labels: target sequences, passed to the Transformer and to the loss. The
      original note states they are None in PREDICT mode.
    mode: a tf.estimator.ModeKeys value.
    params: hyper-parameter object. `label_smoothing` and `vocab_size` are
      read as attributes; the object itself is forwarded to the Transformer,
      metrics.get_eval_metrics and get_train_op.

  Returns:
    A tf.estimator.EstimatorSpec for the requested mode.
  """
  mode_keys = tf.estimator.ModeKeys
  is_training = mode == mode_keys.TRAIN

  with tf.variable_scope("model"):
    net = transformer.Transformer(params, is_training)
    logits = net(features, labels)

    # Prediction short-circuits before any loss is built.
    if mode == mode_keys.PREDICT:
      return tf.estimator.EstimatorSpec(mode_keys.PREDICT, predictions=logits)

    # Weighted mean of the cross entropy returned by the metrics helper.
    token_xent, token_weights = metrics.padded_cross_entropy_loss(
        logits, labels, params.label_smoothing, params.vocab_size)
    weighted_total = tf.reduce_sum(token_xent * token_weights)
    loss = weighted_total / tf.reduce_sum(token_weights)

    if mode != mode_keys.EVAL:
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, train_op=get_train_op(loss, params))

    eval_ops = metrics.get_eval_metrics(logits, labels, params)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, predictions={"predictions": logits},
        eval_metric_ops=eval_ops)
def model_fn(features, labels, mode, params):
  """Builds the Transformer and wraps it in the EstimatorSpec for `mode`.

  Everything is created under the "model" variable scope.

  Args:
    features: model inputs, handed to the Transformer as-is.
    labels: target sequences, handed to the Transformer and used by the loss.
      According to the original note they are None when predicting.
    mode: a tf.estimator.ModeKeys value selecting the branch below.
    params: hyper-parameter object; `label_smoothing` and `vocab_size` are
      read as attributes, and the object is also passed on to the
      Transformer, metrics.get_eval_metrics and get_train_op.

  Returns:
    tf.estimator.EstimatorSpec carrying predictions only (PREDICT), loss plus
    eval metric ops (EVAL), or loss plus train op (every other mode).
  """
  keys = tf.estimator.ModeKeys
  training = mode == keys.TRAIN

  with tf.variable_scope("model"):
    transformer_net = transformer.Transformer(params, training)
    model_output = transformer_net(features, labels)

    # In PREDICT mode the model output already is the prediction.
    if mode == keys.PREDICT:
      return tf.estimator.EstimatorSpec(keys.PREDICT, predictions=model_output)

    # Cross entropy from the metrics helper, normalised by the weight total.
    xent, wts = metrics.padded_cross_entropy_loss(
        model_output, labels, params.label_smoothing, params.vocab_size)
    numerator = tf.reduce_sum(xent * wts)
    loss = numerator / tf.reduce_sum(wts)

    if mode == keys.EVAL:
      metric_ops = metrics.get_eval_metrics(model_output, labels, params)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": model_output},
          eval_metric_ops=metric_ops)

    # Remaining case: training.
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, train_op=get_train_op(loss, params))
    def build_no_teacher_discriminator(self,
                                       origin_inputs,
                                       gen_target,
                                       real_loss,
                                       margin=1.0):
        """Turns the loss on generated targets into per-example rewards.

        The generated sequence is encoded, decoded with `argmax_predict`
        (the first return value is discarded, the second is used as logits)
        and scored against `origin_inputs` with the padded cross entropy.
        The reward is `1 / max(margin, fake_loss / real_loss - 1)`.

        Side effects: creates the named tensors "fake_loss" and "rewards"
        (first five entries each) and the scalar summaries "mean_fake_loss"
        and "mean_wards".

        Args:
            origin_inputs: sequences used as labels for the cross entropy.
            gen_target: generated sequences fed to the encoder.
            real_loss: reference loss the fake loss is divided by.
            margin: lower bound applied to the denominator of the reward.

        Returns:
            The rewards tensor ([batch] per the original annotation).
        """
        # Original annotations: bias is [batch, 1, 1, src_len] and the
        # encoder output is [batch, src_len, hidden_size].
        pad_bias = model_utils.get_padding_bias(gen_target)
        encoded = self.encode(gen_target, pad_bias)
        _, logits = self.argmax_predict(encoded, pad_bias)

        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits, origin_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)
        row_xent = tf.reduce_sum(token_xent, axis=1)
        row_weight = tf.reduce_sum(token_weights, axis=1)
        fake_loss = row_xent / row_weight
        # Named copy of the first five values; the result is not used here.
        tf.identity(fake_loss[:5], "fake_loss")

        avg_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
        tf.summary.scalar("mean_fake_loss", avg_fake_loss)

        # The 1e-12 keeps the ratio finite when real_loss is zero.
        loss_ratio = fake_loss / (real_loss + 1e-12)
        rewards = 1 / tf.maximum(margin, loss_ratio - 1)
        tf.identity(rewards[:5], "rewards")

        avg_rewards = tf.reduce_mean(rewards, name="mean_wards")
        tf.summary.scalar("mean_wards", avg_rewards)
        return rewards
Example #4
0
def evaluation(model, input_fn):
    """Builds the evaluation graph.

    Args:
        model: object exposing `build_pretrain(source, target)`.
        input_fn: object exposing `source` and `target` attributes.

    Returns:
        A `(loss, logits, target)` tuple, where `loss` is the summed cross
        entropy divided by the summed weights. `label_smoothing` and
        `target_vocab_size` come from the module-level `params`.
    """
    tf.logging.info("!!!Build graph for evaluation!!!")
    logits = model.build_pretrain(input_fn.source, input_fn.target)
    token_xent, token_weights = metrics.padded_cross_entropy_loss(
        logits,
        input_fn.target,
        params.label_smoothing,
        params.target_vocab_size)
    xent_total = tf.reduce_sum(token_xent)
    weight_total = tf.reduce_sum(token_weights)
    mean_loss = xent_total / weight_total
    return mean_loss, logits, input_fn.target
def model_fn(features, labels, mode, params):
    """Estimator model function for the Transformer (dict-style params).

    Built under the "model" variable scope. Depending on `mode`:

      * PREDICT: returns the model output as predictions and as the
        "translate" export output.
      * EVAL: returns loss, logits and the eval metric ops.
      * otherwise: builds the train op, records the training scalars and
        returns loss plus train op.

    Args:
        features: model inputs, passed to the Transformer unchanged.
        labels: target sequences; per the original note they are None when
            predicting.
        mode: a tf.estimator.ModeKeys value.
        params: mapping of hyper-parameters; "label_smoothing" and
            "vocab_size" are read here and the mapping is forwarded to the
            Transformer, metrics.get_eval_metrics and
            get_train_op_and_metrics.

    Returns:
        A tf.estimator.EstimatorSpec.
    """
    mode_keys = tf.estimator.ModeKeys

    with tf.variable_scope("model"):
        net = transformer.Transformer(params, mode == mode_keys.TRAIN)
        logits = net(features, labels)

        # In prediction mode the model output already is the prediction.
        if mode == mode_keys.PREDICT:
            return tf.estimator.EstimatorSpec(
                mode_keys.PREDICT,
                predictions=logits,
                export_outputs={
                    "translate": tf.estimator.export.PredictOutput(logits)
                })

        # Pin the static shape of the logits for XLA (TPU): they travel back
        # to the host CPU for metric evaluation, where [?, ?, vocab_size] is
        # too vague. The leading dimensions are taken from the targets; the
        # loss itself does not need this, as padded_cross_entropy_loss
        # resolves the shape on its own (per the original author's note).
        leading_dims = labels.shape.as_list()
        trailing_dims = logits.shape.as_list()[2:]
        logits.set_shape(leading_dims + trailing_dims)

        # Mean cross entropy: summed xentropy over summed weights.
        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits, labels, params["label_smoothing"], params["vocab_size"])
        loss = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)

        # Named tensor so that a logging hook can pick the loss up.
        tf.identity(loss, "cross_entropy")

        if mode != mode_keys.EVAL:
            train_op, scalars = get_train_op_and_metrics(loss, params)
            # Gives intermediate feedback in TensorBoard during long epochs.
            scalars["minibatch_loss"] = loss
            record_scalars(scalars)
            return tf.estimator.EstimatorSpec(
                mode=mode, loss=loss, train_op=train_op)

        eval_ops = metrics.get_eval_metrics(logits, labels, params)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            predictions={"predictions": logits},
            eval_metric_ops=eval_ops)
Example #6
0
def evaluation(model, input_fn):
    """Assembles the evaluation graph and returns its outputs.

    Args:
        model: object providing `build_pretrain(source, target)`.
        input_fn: object providing the `source` and `target` tensors.

    Returns:
        `(loss, logits, target)`; `loss` is the total cross entropy divided
        by the total weight. Smoothing and vocabulary size are read from the
        module-level `params`.
    """
    tf.logging.info("!!!Build graph for evaluation!!!")
    logits = model.build_pretrain(input_fn.source, input_fn.target)
    per_token_xent, per_token_weight = metrics.padded_cross_entropy_loss(
        logits,
        input_fn.target,
        params.label_smoothing,
        params.target_vocab_size)
    summed_xent = tf.reduce_sum(per_token_xent)
    summed_weight = tf.reduce_sum(per_token_weight)
    loss = summed_xent / summed_weight
    return loss, logits, input_fn.target
 def get_loss(self, origin_inputs, targets):
     """Per-example discriminator loss, returned as a column.

     Inside the "Discriminator" variable scope (reused via AUTO_REUSE),
     `targets` is encoded, `origin_inputs` is decoded against that encoding,
     and the padded cross entropy is summed along axis 1 and divided by the
     summed weights. The un-reshaped result is kept on `self.loss`.

     Args:
         origin_inputs: sequences fed to the decoder and used as labels.
         targets: sequences fed to the encoder.

     Returns:
         `self.loss` reshaped to (-1, 1).
     """
     with tf.variable_scope("Discriminator",
                            initializer=self._initializer,
                            reuse=tf.AUTO_REUSE):
         # Original annotations: bias [batch, 1, 1, src_len], encoder
         # output [batch, src_len, hidden_size].
         pad_bias = model_utils.get_padding_bias(targets)
         encoded = self.encode(targets, pad_bias)
         logits = self.decode(origin_inputs, encoded, pad_bias)

         token_xent, token_weights = metrics.padded_cross_entropy_loss(
             logits, origin_inputs, self.params.label_smoothing,
             self.params.target_vocab_size)
         row_xent = tf.reduce_sum(token_xent, axis=1)
         row_weight = tf.reduce_sum(token_weights, axis=1)
         self.loss = row_xent / row_weight
         return tf.reshape(self.loss, (-1, 1))
Example #8
0
    def get_loss(self, gen_targets, real_inputs):
        """Scores generated targets against the real inputs.

        Under the `self.name_scope` variable scope (AUTO_REUSE), the
        generated targets are encoded, the real inputs are decoded against
        them, and the padded cross entropy is reduced along axis 1 and
        normalised by the summed weights.

        Args:
            gen_targets: generated sequences fed to the encoder.
            real_inputs: sequences fed to the decoder and used as labels.

        Returns:
            The per-example loss reshaped to (-1, 1).
        """
        scope = tf.variable_scope(self.name_scope,
                                  initializer=self.initializer,
                                  reuse=tf.AUTO_REUSE)
        with scope:
            pad_bias = model_utils.get_padding_bias(gen_targets)
            encoded = self.encode(gen_targets, pad_bias)
            logits = self.decode(real_inputs, encoded, pad_bias)

            token_xent, token_weights = metrics.padded_cross_entropy_loss(
                logits,
                real_inputs,
                self.params.label_smoothing,
                self.params.target_vocab_size)
            row_xent = tf.reduce_sum(token_xent, axis=1)
            row_weight = tf.reduce_sum(token_weights, axis=1)
            # One value per row here; the reshape below makes it a column.
            per_example = row_xent / row_weight
            return tf.reshape(per_example, (-1, 1))
 def get_teach_real_loss(self, origin_inputs, origin_target):
     """Per-example loss of reconstructing the inputs from the real target.

     `build_pretrain` is called with the target as its `inputs` and the
     original inputs as its `targets`; the resulting logits are scored
     against `origin_inputs`.

     Side effects: creates the named tensor "real_loss" (first five
     entries) and the scalar summary "mean_real_loss".

     Args:
         origin_inputs: sequences used as decoder targets and as labels.
         origin_target: sequences used as the model inputs.

     Returns:
         The per-example loss ([batch] per the original annotation).
     """
     # Original annotation: logits are [batch, tgt_len, vocab_size].
     logits = self.build_pretrain(inputs=origin_target,
                                  targets=origin_inputs)
     token_xent, token_weights = metrics.padded_cross_entropy_loss(
         logits, origin_inputs, self.params.label_smoothing,
         self.params.target_vocab_size)
     row_xent = tf.reduce_sum(token_xent, axis=1)
     row_weight = tf.reduce_sum(token_weights, axis=1)
     real_loss = row_xent / row_weight
     # Named copy of the first five values; the result is not used here.
     tf.identity(real_loss[:5], "real_loss")
     batch_mean = tf.reduce_mean(real_loss, name="mean_real_loss")
     tf.summary.scalar("mean_real_loss", batch_mean)
     return real_loss
Example #10
0
def train_step(batch_data):
    """Runs one optimisation step on a batch and returns its loss.

    Relies on the module-level `model`, `optimizer`, `config` and
    `tgt_vocab_size`.

    Args:
        batch_data: six-element batch; the third element (target output ids)
            is used as the labels, and the whole batch is passed to the
            model.

    Returns:
        The batch loss (summed cross entropy over summed weights).
    """
    # Unpacking still enforces the six-field layout; only the labels are used.
    _, _, tgt_output_ids, _, _, _ = batch_data

    with tf.GradientTape() as tape:
        logits = model(batch_data, training=True)
        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits,
            tgt_output_ids,
            config.label_smoothing,
            vocab_size=tgt_vocab_size)
        loss = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)

    # Only the encoder's and decoder's variables are updated.
    trainable = (model.Encoder.trainable_variables +
                 model.Decoder.trainable_variables)
    grads = tape.gradient(target=loss, sources=trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return loss
Example #11
0
def gan_tower_loss(scope, model, input_fn):
    """Builds the pre-training and generator losses for a single tower.

    The cross-entropy loss is added to the "losses" collection and the
    generator loss to the "g_losses" collection; each collection is then
    read back filtered by `scope` and summed. A scalar summary is attached
    to every individual loss and to both totals.

    Reads the module-level `params`, `flags_obj` and `TOWER_NAME`.

    :param scope: name scope used to filter the loss collections.
    :param model: object providing `build_pretrain`, `build_generator`,
        `get_one_reward_baseline` and `get_one_g_loss`.
    :param input_fn: object providing the `source` and `target` tensors.
    :return: `(total_loss, total_g_loss, rewards_mb)`.
    """
    # --- supervised (pre-training) loss ---------------------------------
    logits = model.build_pretrain(input_fn.source, input_fn.target)
    token_xent, token_weights = metrics.padded_cross_entropy_loss(
        logits, input_fn.target, params.label_smoothing,
        params.target_vocab_size)
    mean_xent = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)
    tf.add_to_collection("losses", mean_xent)

    tower_losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(tower_losses, name='total_loss')

    # --- generator loss -------------------------------------------------
    samples = model.build_generator(input_fn.source)
    given_num, rewards_mb = model.get_one_reward_baseline(
        origin_inputs=input_fn.source,
        gen_targets=samples,
        roll_num=flags_obj.roll_num)
    generator_loss = model.get_one_g_loss(gen_targets=samples,
                                          given_num=given_num,
                                          rewards=rewards_mb)

    tf.add_to_collection("g_losses", generator_loss)
    tower_g_losses = tf.get_collection("g_losses", scope)
    total_g_loss = tf.add_n(tower_g_losses, name="total_g_loss")

    # --- summaries ------------------------------------------------------
    # Strip the 'tower_[0-9]/' prefix so multi-GPU runs share one set of
    # names on TensorBoard.
    tower_prefix = '%s_[0-9]*/' % TOWER_NAME
    to_summarize = (tower_losses + [total_loss] +
                    tower_g_losses + [total_g_loss])
    for loss_tensor in to_summarize:
        summary_name = re.sub(tower_prefix, '', loss_tensor.op.name)
        tf.summary.scalar(summary_name, loss_tensor)
    return total_loss, total_g_loss, rewards_mb
Example #12
0
 def get_real_loss(self, origin_inputs, origin_target):
     """Batch-level discriminator loss on the real target.

     Inside the "Discriminator" variable scope (AUTO_REUSE), the real target
     is encoded, the original inputs are decoded against it, and the padded
     cross entropy is summed over every axis and divided by the summed
     weights.

     Args:
         origin_inputs: sequences fed to the decoder and used as labels.
         origin_target: sequences fed to the encoder.

     Returns:
         The loss, which is also stored on `self.real_loss`. Both reductions
         run without an axis argument, so this is one value for the whole
         batch.
     """
     scope = tf.variable_scope("Discriminator",
                               initializer=self._initializer,
                               reuse=tf.AUTO_REUSE)
     with scope:
         # Original annotations: bias [batch, 1, 1, src_len], encoder
         # output [batch, src_len, hidden_size].
         pad_bias = model_utils.get_padding_bias(origin_target)
         encoded = self.encode(origin_target, pad_bias)
         logits = self.decode(origin_inputs, encoded, pad_bias)
         token_xent, token_weights = metrics.padded_cross_entropy_loss(
             logits, origin_inputs, self.params.label_smoothing,
             self.params.target_vocab_size)
         xent_total = tf.reduce_sum(token_xent)
         weight_total = tf.reduce_sum(token_weights)
         self.real_loss = xent_total / weight_total
         return self.real_loss
Example #13
0
    def get_fake_loss(self, origin_inputs, gen_targets):
        """Computes the discriminator loss for generated (fake) targets.

        `gen_targets` is first re-padded row by row: each row is cut right
        after the position of its minimum value and zero-padded to the
        longest such length in the batch. The re-padded batch is then
        encoded and decoded against `origin_inputs` inside the
        "Discriminator" variable scope (AUTO_REUSE), and the padded cross
        entropy is summed over every axis and divided by the summed weights.

        Args:
            origin_inputs: sequences fed to the decoder and used as the
                labels of the cross entropy.
            gen_targets: generated sequences; indexed row-wise as a 2-D
                tensor by the loop below.

        Returns:
            The loss (one value for the whole batch), also stored on
            `self.fake_loss`.
        """
        # Per row: position of the minimum value along the last axis, plus
        # one. Presumably this finds the first 0 (padding/EOS) id and keeps
        # it — TODO confirm against the vocabulary.
        inputs_length = tf.argmin(gen_targets, axis=-1) + 1
        # Largest of those per-row lengths.
        max_len = inputs_length[tf.argmax(inputs_length)]
        batch_size = tf.shape(gen_targets)[0]

        # Empty [0, max_len] accumulator; the loop appends one row per step.
        pad_gen_targets = tf.zeros([0, max_len], dtype=tf.int32)

        def inner_loop(i, pad_inputs):
            # Keep the first `ori_length` tokens of row i, right-pad the
            # rest up to `max_len`, and append the row to the accumulator.
            ori_length = inputs_length[i]
            ori_input = tf.reshape(gen_targets[i][:ori_length], [1, -1])
            pad_input = tf.pad(ori_input, [[0, 0], [0, max_len - ori_length]])
            pad_inputs = tf.concat([pad_inputs, pad_input], axis=0)
            return i + 1, pad_inputs

        # The accumulator gains a row on every iteration, so its shape
        # invariant is left unknown in both dimensions.
        _, pad_gen_targets = tf.while_loop(
            cond=lambda i, _: i < batch_size,
            body=inner_loop,
            loop_vars=[tf.constant(0), pad_gen_targets],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None, None])
            ])
        # From here on the re-padded batch replaces the original argument.
        gen_targets = pad_gen_targets

        with tf.variable_scope("Discriminator",
                               initializer=self._initializer,
                               reuse=tf.AUTO_REUSE):
            fake_attention_bias = model_utils.get_padding_bias(
                gen_targets)  # [batch, 1, 1, src_len]
            fake_encoder_outputs = self.encode(
                gen_targets,
                fake_attention_bias)  # [batch, src_len, hidden_size]
            fake_logits = self.decode(origin_inputs, fake_encoder_outputs,
                                      fake_attention_bias)
            fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
                fake_logits, origin_inputs, self.params.label_smoothing,
                self.params.target_vocab_size)  # [batch, origin_length]
            # No axis argument: both sums reduce over the whole batch.
            self.fake_loss = tf.reduce_sum(fake_xentropy) / tf.reduce_sum(
                fake_weights)
            #fake_prediction = self.argmax_predict(fake_encoder_outputs, fake_attention_bias) # [batch, max_len]
            return self.fake_loss
 def build_teach_force_discriminator(self,
                                     origin_inputs,
                                     gen_target,
                                     real_loss,
                                     margin=1):
     """Teacher-forced rewards for generated targets.

     `build_pretrain` is run with the generated target as `inputs` and the
     original inputs as `targets`; the per-example cross entropy of the
     result is compared with `real_loss` and converted into a reward of
     `1 / max(margin, fake_loss / real_loss - 1)`.

     Side effects: creates the named tensors "fake_loss" and "rewards"
     (first five entries each) and the scalar summaries "mean_fake_loss"
     and "mean_wards".

     Args:
         origin_inputs: sequences used as decoder targets and as labels.
         gen_target: generated sequences used as the model inputs.
         real_loss: reference loss the fake loss is divided by.
         margin: lower bound applied to the denominator of the reward.

     Returns:
         The rewards tensor ([batch] per the original annotation).
     """
     # Original annotation: logits are [batch, tgt_length, vocab_size].
     logits = self.build_pretrain(inputs=gen_target, targets=origin_inputs)
     token_xent, token_weights = metrics.padded_cross_entropy_loss(
         logits, origin_inputs, self.params.label_smoothing,
         self.params.target_vocab_size)
     row_xent = tf.reduce_sum(token_xent, axis=1)
     row_weight = tf.reduce_sum(token_weights, axis=1)
     fake_loss = row_xent / row_weight
     # Named copy of the first five values; the result is not used here.
     tf.identity(fake_loss[:5], "fake_loss")
     avg_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
     tf.summary.scalar("mean_fake_loss", avg_fake_loss)

     # The 1e-12 keeps the ratio finite when real_loss is zero.
     loss_ratio = fake_loss / (real_loss + 1e-12)
     rewards = 1 / tf.maximum(margin, loss_ratio - 1)
     tf.identity(rewards[:5], "rewards")
     avg_rewards = tf.reduce_mean(rewards, name="mean_wards")
     tf.summary.scalar("mean_wards", avg_rewards)
     return rewards
Example #15
0
def eval():
    """Runs the internal evaluation and returns `(bleu, loss)`.

    At most `config.debug_num` batches of the evaluation files are scored.
    Per-batch loss and BLEU are weighted by the batch size (first dimension
    of the logits) and averaged over all examples seen.

    Relies on the module-level `dataset`, `config`, `model`,
    `tgt_vocab_table` and `tgt_vocab_size`.

    NOTE(review): if no batch is produced the final divisions are 0.0 / 0.0
    and raise ZeroDivisionError — confirm callers never hit that case.
    """
    dev_dataset = dataset.get_train_dataset(
        src_file=config.eval_src_file,
        tgt_file=config.eval_tgt_file,
        tgt_vocab_table=tgt_vocab_table,
        batch_size=config.batch_size)

    examples_seen = 0.0
    loss_sum = 0.0
    bleu_sum = 0.0
    for batch_data in dev_dataset.take(config.debug_num):
        # Unpacking enforces the six-field layout; only the labels are used.
        _, _, tgt_output_ids, _, _, _ = batch_data
        # NOTE(review): the model is called with training=True during
        # evaluation — confirm this is intended.
        logits = model(batch_data, training=True)
        batch_size = logits.shape[0]
        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits,
            tgt_output_ids,
            config.label_smoothing,
            vocab_size=tgt_vocab_size)
        batch_loss = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)
        batch_bleu = metrics.bleu_score(logits=logits, labels=tgt_output_ids)

        examples_seen += batch_size
        loss_sum += batch_size * batch_loss
        bleu_sum += batch_size * batch_bleu

    eval_loss = loss_sum / examples_seen
    eval_bleu = bleu_sum / examples_seen
    return eval_bleu, eval_loss
Example #16
0
def model_fn(features, labels, mode, params):
    """Estimator model function with an optional frozen-graph predict path.

    * If `params.frozen_graph` is set and `mode` is PREDICT, the serialized
      GraphDef at that path is imported with `features` mapped onto its
      'input_tokens' input, and two of its tensors are returned as the
      'outputs' and 'scores' predictions.
    * Otherwise the Transformer is built under the "model" variable scope
      and the spec for PREDICT / EVAL / training is returned.

    Args:
        features: model inputs.
        labels: target sequences; per the original note they are None when
            predicting.
        mode: a tf.estimator.ModeKeys value.
        params: hyper-parameter object; `frozen_graph`, `label_smoothing`
            and `vocab_size` are read as attributes.

    Returns:
        A tf.estimator.EstimatorSpec. The training spec also carries a
        LoggingTensorHook that reports the loss every `FLAGS.print_iter`
        iterations.
    """
    mode_keys = tf.estimator.ModeKeys

    if params.frozen_graph and mode == mode_keys.PREDICT:
        print("Reading***** From *** pb", flush=True)
        output_names = [
            'model/Transformer/strided_slice_15',
            'model/Transformer/strided_slice_16'
        ]

        with tf.io.gfile.GFile(params.frozen_graph, "rb") as pb_file:
            frozen_def = tf.compat.v1.GraphDef()
            frozen_def.ParseFromString(pb_file.read())
        # name="" imports the nodes without adding a prefix, so the names
        # listed above can be looked up unchanged.
        tf.graph_util.import_graph_def(frozen_def,
                                       {'input_tokens': features},
                                       output_names,
                                       name="")

        outputs_tensor, scores_tensor = [
            tf.compat.v1.get_default_graph().get_tensor_by_name(
                op_name + ":0")
            for op_name in output_names
        ]
        return tf.estimator.EstimatorSpec(
            mode_keys.PREDICT,
            predictions={'outputs': outputs_tensor, 'scores': scores_tensor})

    with tf.compat.v1.variable_scope("model"):
        net = transformer.Transformer(params, mode == mode_keys.TRAIN)
        logits = net(features, labels)

        # In PREDICT mode the model output already is the prediction.
        if mode == mode_keys.PREDICT:
            return tf.estimator.EstimatorSpec(mode_keys.PREDICT,
                                              predictions=logits)

        # Weighted mean of the cross entropy returned by the metrics helper.
        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits, labels, params.label_smoothing, params.vocab_size)
        weighted_total = tf.reduce_sum(input_tensor=token_xent * token_weights)
        loss = weighted_total / tf.reduce_sum(input_tensor=token_weights)

        if mode == mode_keys.EVAL:
            eval_ops = metrics.get_eval_metrics(logits, labels, params)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=eval_ops)

        train_op = get_train_op(loss, params)
        loss_logger = tf.compat.v1.train.LoggingTensorHook(
            {"loss": loss}, every_n_iter=FLAGS.print_iter)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=[loss_logger])
Example #17
0
    def build_discriminator(self,
                            origin_inputs,
                            gen_target,
                            margin,
                            real_loss,
                            given_num=None,
                            discount_factor=0.95):
        """Teacher-forced rewards for generated targets.

        `build_pretrain` is run with the generated target as `inputs` and
        the original inputs as `targets`; the per-example cross entropy of
        the result is compared with `real_loss` and converted into a reward
        of `1 / max(0.2, fake_loss / real_loss - 1)`.

        Side effects: creates the named tensors "fake_loss" and "rewards"
        (first five entries each) and the scalar summaries "mean_fake_loss"
        and "mean_wards".

        Args:
            origin_inputs: sequences used as decoder targets and as labels.
            gen_target: generated sequences used as the model inputs.
            margin: accepted but not read by the current implementation.
            real_loss: reference loss the fake loss is divided by.
            given_num: accepted but not read by the current implementation.
            discount_factor: accepted but not read by the current
                implementation.

        Returns:
            The rewards tensor ([batch] per the original annotation).

        NOTE(review): the lower bound of the denominator is the literal 0.2
        rather than `margin`, and the position-discounted variant that
        would have used `given_num` / `discount_factor` is not implemented
        — confirm both are intentional.
        """
        # Original annotation: logits are [batch, tgt_length, vocab_size].
        logits = self.build_pretrain(inputs=gen_target,
                                     targets=origin_inputs)
        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits, origin_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)

        row_xent = tf.reduce_sum(token_xent, axis=1)
        row_weight = tf.reduce_sum(token_weights, axis=1)
        fake_loss = row_xent / row_weight

        # Named copy of the first five values; the result is not used here.
        tf.identity(fake_loss[:5], "fake_loss")
        avg_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
        tf.summary.scalar("mean_fake_loss", avg_fake_loss)

        # The 1e-12 keeps the ratio finite when real_loss is zero.
        loss_ratio = fake_loss / (real_loss + 1e-12)
        rewards = 1 / tf.maximum(0.2, loss_ratio - 1)
        tf.identity(rewards[:5], "rewards")
        avg_rewards = tf.reduce_mean(rewards, name="mean_wards")
        tf.summary.scalar("mean_wards", avg_rewards)
        return rewards
Example #18
0
def model_fn(features, labels, mode: tf.estimator.ModeKeys, params: dict):
    """Estimator model function for the Transformer.

    Built under the 'model' variable scope.

    :param features: model inputs, passed to the Transformer unchanged.
    :param labels: target sequences; per the original note they are None
        when predicting.
    :param mode: a tf.estimator.ModeKeys value.
    :param params: hyper-parameter mapping; 'label_smoothing' and
        'vocab_size' are looked up with `.get`, and the mapping is forwarded
        to the Transformer, the eval metrics and the train-op builder.
    :return: the tf.estimator.EstimatorSpec for PREDICT, EVAL or TRAIN;
        None (implicitly, as before) for any other mode.
    """
    mode_keys = tf.estimator.ModeKeys

    with tf.variable_scope('model'):
        net = Transformer(params, mode == mode_keys.TRAIN)
        logits = net(features, labels)

        # In prediction mode the labels / decoder inputs are None and the
        # model output is the prediction itself; the original note describes
        # it as a dict {"outputs": top_decoded_ids, "scores": top_scores}.
        if mode == mode_keys.PREDICT:
            export_outputs = {
                'translate': tf.estimator.export.PredictOutput(logits)
            }
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=logits,
                                              export_outputs=export_outputs)

        # Give the logits the static leading dimensions of the labels.
        leading_dims = labels.shape.as_list()
        trailing_dims = logits.shape.as_list()[2:]
        logits.set_shape(leading_dims + trailing_dims)

        token_xent, token_weights = metrics.padded_cross_entropy_loss(
            logits=logits,
            labels=labels,
            smoothing=params.get('label_smoothing'),
            vocab_size=params.get('vocab_size'))
        loss = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)

        # Named copy of the loss; the result is not used here.
        tf.identity(loss, 'cross_entropy')

        if mode == mode_keys.EVAL:
            eval_ops = metrics.get_eval_metrics(logits, labels, params)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={'predictions': logits},
                eval_metric_ops=eval_ops)

        if mode != mode_keys.TRAIN:
            # Unrecognised mode: no spec is produced, as before.
            return None

        train_op, scalars = model_utils.get_train_op_and_metrics(loss, params)
        scalars['mini_batch_loss'] = loss
        model_utils.record_scalars(scalars)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
Example #19
0
def get_loss(logits, labels, scope_name_1, scope_name_2):
    """Adds the mean cross entropy to a collection and sums that collection.

    Smoothing and vocabulary size come from the module-level `params`.

    Args:
        logits: model output scored against `labels`.
        labels: reference sequences.
        scope_name_1: name of the graph collection the loss is added to and
            that is read back afterwards (without a scope filter).
        scope_name_2: name given to the resulting `tf.add_n` op.

    Returns:
        The sum of everything currently in the `scope_name_1` collection.
    """
    token_xent, token_weights = metrics.padded_cross_entropy_loss(
        logits, labels, params.label_smoothing, params.target_vocab_size)
    mean_xent = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)
    tf.add_to_collection(scope_name_1, mean_xent)
    collected = tf.get_collection(scope_name_1)
    return tf.add_n(collected, name=scope_name_2)
def model_fn(features, labels, mode, params):
  """Estimator / TPUEstimator model function for the Transformer.

  Built under the "model" variable scope. Depending on `mode`:

    * PREDICT: returns the model output as predictions and as the
      "translate" export output; raises on TPU.
    * EVAL: returns loss, logits and the evaluation metrics.
    * otherwise: builds the train op and returns loss plus train op, with
      the training scalars either recorded directly or sent through a TPU
      host call.

  Args:
    features: model inputs, passed to the Transformer unchanged.
    labels: target sequences; per the original note they are None when
      predicting.
    mode: a tf.estimator.ModeKeys value.
    params: hyper-parameter mapping; "use_tpu", "label_smoothing",
      "vocab_size" and (on TPU training) "model_dir" are read here.

  Returns:
    A tf.estimator.EstimatorSpec, or a tf.contrib.tpu.TPUEstimatorSpec when
    params["use_tpu"] is set.

  Raises:
    NotImplementedError: in PREDICT mode when params["use_tpu"] is set.
  """
  mode_keys = tf.estimator.ModeKeys

  with tf.variable_scope("model"):
    net = transformer.Transformer(params, mode == mode_keys.TRAIN)

    # Per the original note, when predicting the model returns a dictionary
    #   {output: [batch_size, decoded length], score: [batch_size, float]};
    # otherwise it returns a float32 tensor of shape
    #   [batch_size, target_length, vocab_size].
    logits = net(features, labels)

    # In prediction mode the model output already is the prediction.
    if mode == mode_keys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      export_outputs = {
          "translate": tf.estimator.export.PredictOutput(logits)
      }
      return tf.estimator.EstimatorSpec(
          mode_keys.PREDICT,
          predictions=logits,
          export_outputs=export_outputs)

    # Pin the static shape of the logits for XLA (TPU): they travel back to
    # the host CPU for metric evaluation, where [?, ?, vocab_size] is too
    # vague. The leading dimensions are taken from the targets; the loss
    # itself does not need this, as padded_cross_entropy_loss resolves the
    # shape on its own (per the original author's note).
    leading_dims = labels.shape.as_list()
    trailing_dims = logits.shape.as_list()[2:]
    logits.set_shape(leading_dims + trailing_dims)

    # Mean cross entropy over the non-padding target tokens. Per the
    # original note, positions whose label is 0 (<PAD>) get a loss weight
    # of 0.
    token_xent, token_weights = metrics.padded_cross_entropy_loss(
        logits, labels, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(token_xent) / tf.reduce_sum(token_weights)

    # Named tensor so that a logging hook can pick the loss up.
    tf.identity(loss, "cross_entropy")

    if mode == mode_keys.EVAL:
      if params["use_tpu"]:
        # Host call functions may only take tensors as arguments, so
        # `params` is bound here to keep metric_fn TPUEstimator compliant.
        def metric_fn(logits, labels):
          return metrics.get_eval_metrics(logits, labels, params=params)

        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=(metric_fn, [logits, labels]))

      eval_ops = metrics.get_eval_metrics(logits, labels, params)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=eval_ops)

    # Training path.
    train_op, scalars = get_train_op_and_metrics(loss, params)

    # Gives intermediate feedback in TensorBoard during long epochs.
    scalars["minibatch_loss"] = loss

    if params["use_tpu"]:
      host_call = tpu_util.construct_scalar_host_call(
          metric_dict=scalars, model_dir=params["model_dir"],
          prefix="training/")
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode, loss=loss, train_op=train_op, host_call=host_call)

    record_scalars(scalars)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def _mean_validation_bleu(sess, fetches, feed_dict):
    """Return the example-weighted mean BLEU over one validation pass.

    Runs ``fetches`` (predictions, targets, sources) until the input
    iterator raises ``tf.errors.OutOfRangeError``.  Returns 0.0 when the
    pass produced no examples instead of dividing by zero.
    """
    total_bleu = 0.0
    total_size = 0
    while True:
        try:
            val_pred, val_tgt, _ = sess.run(fetches, feed_dict=feed_dict)
        except tf.errors.OutOfRangeError:
            break
        batch_size = val_pred.shape[0]
        # Weight each batch score by its size so the mean is per example.
        total_bleu += metrics.compute_bleu(val_tgt, val_pred) * batch_size
        total_size += batch_size
    if total_size == 0:
        tf.logging.warning(
            "Validation produced no examples; reporting a BLEU of 0.0")
        return 0.0
    return total_bleu / total_size


def train(params):
    """Build the multi-GPU training graph and run the train/validate loop.

    One tower per GPU (``flags_obj.num_gpus``) computes the padded
    cross-entropy loss and its gradients; the tower gradients are averaged
    and applied with a ``LazyAdamOptimizer``.  During training the model is
    periodically scored with BLEU on the validation iterator.  A checkpoint
    is written to ``flags_obj.model_dir`` whenever the score is better than,
    or within 0.003 of, the best score so far; otherwise a counter is
    incremented and training stops once it exceeds 5.

    Args:
        params: hyper-parameter object.  This function reads
            ``learning_rate``, ``hidden_size``,
            ``learning_rate_warmup_steps``, ``optimizer_adam_beta1``,
            ``optimizer_adam_beta2``, ``optimizer_adam_epsilon``,
            ``label_smoothing`` and ``target_vocab_size``; the object is
            also forwarded to the dataset and the model.

    Raises:
        AssertionError: if the training loss becomes NaN.
    """
    with tf.Graph().as_default():
        # Resolve the latest checkpoint once so the restored step counter
        # and the restored weights always refer to the same file.
        ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
        if ckpt:
            # The saver below only stores trainable variables, so the step
            # counter is recovered from the "-<step>" suffix of the
            # checkpoint path.
            global_step_value = int(ckpt.split("-")[-1])
            global_step = tf.Variable(initial_value=global_step_value,
                                      dtype=tf.int32,
                                      trainable=False)
            print("right here!", global_step_value)
        else:
            global_step_value = 0
            global_step = tf.get_variable(
                'global_step', [],
                initializer=tf.constant_initializer(0),
                trainable=False)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps,
                                          global_step)

        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params.optimizer_adam_beta1,
            beta2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        my_dataset = dataset.Dataset(params)

        train_iterator = my_dataset.train_input_fn(params)
        valid_iterator = my_dataset.eval_input_fn(params)

        tower_grads = []
        g_model = transformer_9.Transformer(params,
                                            is_train=True,
                                            mode=None,
                                            scope="Transformer")
        # AUTO_REUSE lets every tower share one set of model variables.
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            for i in xrange(flags_obj.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                        tf.logging.info("Build graph on gpu:{}".format(i))
                        logits = g_model.inference(train_iterator.source,
                                                   train_iterator.target)
                        xentropy, weights = metrics.padded_cross_entropy_loss(
                            logits, train_iterator.target,
                            params.label_smoothing, params.target_vocab_size)
                        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)
                        grads = optimizer.compute_gradients(loss)
                        tf.logging.info(
                            "total trainable variables number: {}".format(
                                len(grads)))
                        tower_grads.append(grads)
                    # The decoding graph for validation is built once, on
                    # the first GPU only.
                    # NOTE(review): the evaluation loop below assumes a
                    # validation iterator exists; confirm eval_input_fn never
                    # returns a falsy value here.
                    if i == 0 and valid_iterator:
                        valid_pred = g_model.inference(
                            inputs=valid_iterator.source,
                            targets=None)["outputs"]
                        valid_tgt = valid_iterator.target
                        valid_src = valid_iterator.source

        if len(tower_grads) > 1:
            grads = average_gradients(tower_grads)
        else:
            grads = tower_grads[0]
        # NOTE(review): the summary ops collected here are never merged or
        # written by this function; only the graph goes to the FileWriter.
        summaries.append(tf.summary.scalar('learning_rate', learning_rate))
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))
        apply_gradient_op = optimizer.apply_gradients(grads,
                                                      global_step=global_step)
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))
        train_op = apply_gradient_op

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=20)

        init = tf.global_variables_initializer()
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True

        with tf.Session(config=sess_config) as sess:
            sess.run(init)
            sess.run(tf.local_variables_initializer())

            sess.run(train_iterator.initializer)

            tf.logging.info("ckpt {}".format(ckpt))
            if ckpt and tf.train.checkpoint_exists(ckpt):
                tf.logging.info(
                    "Reloading model parameters..from {}".format(ckpt))
                saver.restore(sess, ckpt)
            else:
                tf.logging.info("create a new model...{}".format(
                    flags_obj.model_dir))
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(flags_obj.model_dir,
                                                   sess.graph)
            checkpoint_path = os.path.join(flags_obj.model_dir, 'model.ckpt')

            try:
                count = 0
                best_bleu = 0.0
                for step in xrange(global_step_value, flags_obj.train_steps):
                    # `loss` is the loss tensor of the last tower built.
                    _, loss_value, lr_value = sess.run(
                        [train_op, loss, learning_rate],
                        feed_dict={g_model.dropout_rate: 0.1})
                    if step % 200 == 0:
                        tf.logging.info(
                            "step: {}, loss = {:.4f}, lr = {:5f}".format(
                                step, loss_value, lr_value))

                    assert not np.isnan(
                        loss_value), 'Model diverged with loss = NaN'

                    # Evaluate less often during the first 10000 steps.
                    steps_between_evals = 2000 if step < 10000 else 1000
                    if step % steps_between_evals != 0:
                        continue

                    sess.run(valid_iterator.initializer)
                    tf.logging.info(
                        "------------------ Evaluation bleu -------------------------"
                    )
                    total_bleu = _mean_validation_bleu(
                        sess, [valid_pred, valid_tgt, valid_src],
                        feed_dict={g_model.dropout_rate: 0.0})
                    tf.logging.info("{}, Step: {}, Valid bleu : {:.6f}".format(
                        datetime.now(), step, total_bleu))
                    tf.logging.info(
                        "--------------------- Finish evaluation ------------------------"
                    )
                    # The score measured at step 0 is discarded so it can
                    # never become the best score.
                    if step == 0:
                        total_bleu = 0.0

                    # Save when the score improves or stays within 0.003 of
                    # the best one; only a real improvement moves the best.
                    if total_bleu + 0.003 > best_bleu:
                        if total_bleu > best_bleu:
                            best_bleu = total_bleu
                        saver.save(sess, checkpoint_path, global_step=step)
                        tf.logging.info(
                            "Saving model at {}".format(checkpoint_path + "-" +
                                                        str(step)))
                    else:
                        # Early stop: the counter is cumulative and is never
                        # reset by a later good evaluation.
                        count += 1
                        if count > 5:
                            break
                tf.logging.info("Best bleu is {}".format(best_bleu))
            finally:
                # Flush pending events and release the writer's file handle.
                summary_writer.close()
Example #22
0
def build_graph(params):
    """Build the multi-GPU generator graph with two losses per tower.

    Each tower (one per ``flags_obj.num_gpus``) computes a padded
    cross-entropy loss on teacher-forced logits and a reward-based generator
    loss on sampled outputs scored with the discriminator.  Gradients of
    both losses are restricted to variables whose name contains
    "Transformer", averaged across towers and applied with one shared
    ``LazyAdamOptimizer``.

    Args:
        params: hyper-parameter object.  This function reads
            ``learning_rate``, ``hidden_size``,
            ``learning_rate_warmup_steps``, ``optimizer_adam_beta1``,
            ``optimizer_adam_beta2``, ``optimizer_adam_epsilon``,
            ``label_smoothing`` and ``target_vocab_size``; the object is
            also forwarded to the dataset and both models.

    Returns:
        A tuple ``(g_model, d_model, train_return, valid_return,
        dataset_iter)`` where

        * ``train_return`` is ``(train_op, global_step, g_loss, xen_loss,
          rewards, learning_rate, init_step, roll_mean_loss,
          real_mean_loss)``; the loss/reward tensors are those of the last
          tower built,
        * ``valid_return`` is ``(val_pred, target, source)`` of the
          validation iterator, or ``(None, None, None)`` when there is no
          validation iterator,
        * ``dataset_iter`` is ``(train_iterator, valid_iterator)``.
    """
    my_dataset = dataset.Dataset(params)
    train_iterator = my_dataset.train_input_fn(params)
    valid_iterator = my_dataset.eval_input_fn(params)

    ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt):
        # Resume the step counter from the "-<step>" suffix of the
        # checkpoint path that was just validated.
        init_step = int(ckpt.split("-")[-1])
        global_step = tf.get_variable('global_step',
                                      initializer=init_step,
                                      trainable=False)
    else:
        init_step = 0
        global_step = tf.Variable(init_step,
                                  trainable=False,
                                  name="global_step")

    learning_rate = get_learning_rate(params.learning_rate, params.hidden_size,
                                      params.learning_rate_warmup_steps,
                                      global_step)

    optimizer = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate,
        beta1=params.optimizer_adam_beta1,
        beta2=params.optimizer_adam_beta2,
        epsilon=params.optimizer_adam_epsilon)

    tower_grads = []
    g_tower_grads = []
    # Stays None when no validation iterator is available, so the return
    # statement below never touches an unbound name.
    val_pred = None
    g_model = gen_and_dis.Generator(params,
                                    is_train=True,
                                    name_scope="Transformer")
    d_model = gen_and_dis.Discriminator(params,
                                        is_train=True,
                                        name_scope="Discriminator")
    # AUTO_REUSE lets every tower share one set of model variables.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        for i in xrange(flags_obj.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)):
                    tf.logging.info("Build graph on gpu:{}".format(i))
                    # pretrain loss
                    logits = g_model.inference(train_iterator.source,
                                               train_iterator.target)
                    xentropy, weights = metrics.padded_cross_entropy_loss(
                        logits, train_iterator.target, params.label_smoothing,
                        params.target_vocab_size)
                    xen_loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

                    # g_loss
                    gen_samples = g_model.inference(train_iterator.source,
                                                    None)["outputs"]
                    deal_samples = train_helper._trim_and_pad(gen_samples)
                    (given_num, rewards, roll_mean_loss,
                     real_mean_loss) = g_model.get_reward(
                         real_inputs=train_iterator.source,
                         real_targets=train_iterator.target,
                         gen_targets=deal_samples,
                         roll_num=flags_obj.roll_num,
                         discriminator=d_model)
                    g_loss = g_model.g_loss(gen_targets=deal_samples,
                                            given_num=given_num,
                                            rewards=rewards)

                    xen_grads = optimizer.compute_gradients(xen_loss)
                    gen_grads = optimizer.compute_gradients(g_loss)

                    # Keep only gradients of variables whose name contains
                    # "Transformer" (the name_scope given to the generator).
                    g_grads = [(grad, var) for grad, var in gen_grads
                               if "Transformer" in var.name]
                    x_grads = [(grad, var) for grad, var in xen_grads
                               if "Transformer" in var.name]

                    tf.logging.info(
                        "total trainable variables number: {}, {}".format(
                            len(g_grads), len(x_grads)))
                    tower_grads.append(x_grads)
                    g_tower_grads.append(g_grads)

                # The decoding graph for validation is built once, on the
                # first GPU only.
                if i == 0 and valid_iterator:
                    val_pred = g_model.inference(inputs=valid_iterator.source,
                                                 targets=None)["outputs"]

    if len(tower_grads) > 1:
        print(len(tower_grads[0]), len(tower_grads[1]))
        x_grads = train_helper.average_gradients(tower_grads)
        g_grads = train_helper.average_gradients(g_tower_grads)
    else:
        x_grads = tower_grads[0]
        g_grads = g_tower_grads[0]

    # NOTE(review): both apply ops receive global_step, so running the
    # grouped train_op presumably advances the step counter twice per
    # iteration - confirm this is intended for the learning-rate schedule.
    apply_gradient_op = optimizer.apply_gradients(x_grads,
                                                  global_step=global_step)
    g_apply_gradient_op = optimizer.apply_gradients(g_grads,
                                                    global_step=global_step)

    train_op = tf.group(apply_gradient_op, g_apply_gradient_op)

    train_return = (train_op, global_step, g_loss, xen_loss, rewards,
                    learning_rate, init_step, roll_mean_loss, real_mean_loss)
    if valid_iterator:
        valid_return = (val_pred, valid_iterator.target,
                        valid_iterator.source)
    else:
        valid_return = (None, None, None)
    dataset_iter = (train_iterator, valid_iterator)
    return g_model, d_model, train_return, valid_return, dataset_iter
Example #23
0
def get_mono_loss(logits, labels):
    """Register this call's mean cross-entropy and return the running total.

    The padded cross-entropy of ``logits`` against ``labels`` is normalised
    by the summed weights, stored in the 'mono_losses' graph collection, and
    the sum of everything currently in that collection is returned as the
    tensor named 'mono_total_loss'.

    Label smoothing and vocabulary size come from ``params``, which is
    resolved from the enclosing scope rather than passed in.
    """
    per_token_xent, token_weights = metrics.padded_cross_entropy_loss(
        logits, labels, params.label_smoothing, params.target_vocab_size)
    xent_sum = tf.reduce_sum(per_token_xent)
    weight_sum = tf.reduce_sum(token_weights)
    tf.add_to_collection('mono_losses', xent_sum / weight_sum)
    collected_losses = tf.get_collection('mono_losses')
    return tf.add_n(collected_losses, name='mono_total_loss')
Example #24
0
    for i in xrange(train_eval_iterations):
        gc.collect()
        print('Starting iteration', i + 1)
        print('Train:')
        for step in xrange(single_iteration_train_steps):
            tic = time()
            losses = 0
            mini_batch_train = dataset_train.get_mini_batch(batch_size=params.batch_size)
            input = gluon.utils.split_and_load(mini_batch_train['input'], ctx)
            targets = gluon.utils.split_and_load(mini_batch_train['targets'], ctx)
            global_step = 1 + global_step
            learning_rate = get_learning_rate(params.learning_rate, params.hidden_size,
                                              params.learning_rate_warmup_steps, global_step)
            with autograd.record():
                for j in xrange(num_gpu):
                    loss = metrics.padded_cross_entropy_loss(net(input[j], targets[j]), targets[j],
                                                             params.label_smoothing, params.vocab_size)
                    loss.backward()
                    losses = losses + loss
            trainer.set_learning_rate(learning_rate)
            trainer.step(params.batch_size)
            mx.ndarray.waitall()

            print("\t step %d: Loss: %.3f, Time:%.1f seconds" % (global_step, losses.mean().asscalar() / 4, time() - tic))

        print('Evaluate: ')
        uncased_score = translate_and_compute_bleu(net, subtokenizer, bleu_source, bleu_ref)
        print('\t uncased_score: %.3f' % uncased_score)
        print('\t best_bleu_score: %.3f' % best_bleu_score)

        if uncased_score > best_bleu_score:
            best_bleu_score = uncased_score