def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    with tf.variable_scope("model"):
        inputs, targets = features, labels

        # Create model and get output logits.
        model = transformer.Transformer(params,
                                        mode == tf.estimator.ModeKeys.TRAIN)

        logits = model(inputs, targets)

        # When in prediction mode, labels/targets are None and the model output
        # is the prediction.
        if mode == tf.estimator.ModeKeys.PREDICT:
            if params["use_tpu"]:
                raise NotImplementedError(
                    "Prediction is not yet supported on TPUs.")
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs={
                    "translate": tf.estimator.export.PredictOutput(logits)
                })

        # Explicitly set the shape of the logits for XLA (TPU). This is needed
        # because the logits are passed back to the host VM CPU for metric
        # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
        # it is known from Transformer that the first two dimensions of logits
        # are the dimensions of targets. Note that the ambiguous shape of logits is
        # not a problem when computing xentropy, because padded_cross_entropy_loss
        # resolves the shape on the TPU.
        logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

        # Calculate model loss.
        # xentropy contains the cross entropy loss of every nonpadding token in the
        # targets.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, targets, params["label_smoothing"], params["vocab_size"])
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

        # Save loss as named tensor that will be logged with the logging hook.
        tf.identity(loss, "cross_entropy")

        if mode == tf.estimator.ModeKeys.EVAL:
            if params["use_tpu"]:
                # host call functions should only have tensors as arguments.
                # This lambda pre-populates params so that metric_fn is
                # TPUEstimator compliant.
                metric_fn = lambda logits, labels: (metrics.get_eval_metrics(
                    logits, labels, params=params))
                eval_metrics = (metric_fn, [logits, labels])
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metrics=eval_metrics)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(
                    logits, labels, params))
        else:
            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            metric_dict["minibatch_loss"] = loss
            if params["use_tpu"]:
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    host_call=tpu_util.construct_scalar_host_call(
                        metric_dict=metric_dict,
                        model_dir=params["model_dir"],
                        prefix="training/"))
            # In place of record_scalars(metric_dict), write each training metric
            # as a scalar summary, log its name, and save summaries periodically
            # with a SummarySaverHook.
            for key, value in metric_dict.items():
                tf.summary.scalar(name=key, tensor=value)
                tf.logging.info(key)
            summary_hook = tf.train.SummarySaverHook(
                save_steps=20,
                output_dir=params["model_dir"],
                summary_op=tf.summary.merge_all())
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op,
                                              training_hooks=[summary_hook])
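
The loss is exposed as a named tensor precisely so that a logging hook can pick it up by name. As a minimal usage sketch (not part of the listing above), assuming the TF 1.x Estimator API and that tf.identity(loss, "cross_entropy") inside the "model" scope resolves to the op name "model/cross_entropy", the model_fn could be wired up like this:

def build_estimator_and_hook(params):
    # Hypothetical helper: construct the Estimator and a LoggingTensorHook that
    # reports the named loss tensor every 100 steps.
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=params["model_dir"],
        params=params)
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"cross_entropy": "model/cross_entropy"},
        every_n_iter=100)
    return estimator, logging_hook

# estimator, hook = build_estimator_and_hook(params)
# estimator.train(input_fn=train_input_fn, hooks=[hook])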
Example #2
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)

    logits = model(inputs, targets)

    # When in prediction mode, labels/targets are None and the model output
    # is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits is
    # not a problem when computing xentropy, because padded_cross_entropy_loss
    # resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in the
    # targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/")
        )
      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
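
record_scalars is called by several of these model_fn variants but is not shown in this listing. A plausible minimal implementation, assuming it only mirrors each training metric into a TensorBoard scalar summary (as the inlined loop in the first example suggests), is:

def record_scalars(metric_dict):
    # Assumed helper: one scalar summary per training metric.
    for key, value in metric_dict.items():
        tf.summary.scalar(name=key, tensor=value)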
Example #3
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    with tf.variable_scope("model"):
        inputs, targets = features, labels

        # Create model and get output logits.
        train = (mode == tf.estimator.ModeKeys.TRAIN)
        #model = transformer.Transformer(params, train)
        #model = transformer2.Transformer(params, train)
        model = transformer3.Transformer(params, train)

        logits, latent_sample, prior_mu, prior_logvar, recog_mu, recog_logvar = model(
            inputs, targets)

        # When in prediction mode, labels/targets are None and the model output
        # is the prediction.
        if mode == tf.estimator.ModeKeys.PREDICT:
            if params["use_tpu"]:
                raise NotImplementedError(
                    "Prediction is not yet supported on TPUs.")
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs={
                    "translate": tf.estimator.export.PredictOutput(logits)
                })

        # Explicitly set the shape of the logits for XLA (TPU). This is needed
        # because the logits are passed back to the host VM CPU for metric
        # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
        # it is known from Transformer that the first two dimensions of logits
        # are the dimensions of targets. Note that the ambiguous shape of logits is
        # not a problem when computing xentropy, because padded_cross_entropy_loss
        # resolves the shape on the TPU.
        logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

        # Calculate model loss.
        # xentropy contains the cross entropy loss of every nonpadding token in the
        # targets.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, targets, params["label_smoothing"], params["vocab_size"])
        # size:
        #   xentropy: [batch_size, max(length_logits, length_labels)]
        #   weights:  [batch_size, max(length_logits, length_labels)], 0 or 1

        #loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

        real_batch_size = tf.to_float(tf.shape(logits)[0])  # get batch_size

        if params["word_avg"]:
            predict_loss_avg_in_sentence = tf.reduce_sum(
                xentropy, axis=1) / tf.reduce_sum(weights, axis=1)
            predict_loss = tf.reduce_sum(
                predict_loss_avg_in_sentence) / real_batch_size
            # 1. First average over tokens within each sentence,
            # 2. then average over samples in the batch.
        else:
            predict_loss = tf.reduce_sum(xentropy) / real_batch_size

        if train:  # train mode
            # If gaussian_kld_v2 is used, the 'logvar' arguments are interpreted
            # as standard deviations instead of log-variances.
            if params["use_std"]:
                kl_loss = gaussian_kld_v2(recog_mu, recog_logvar, prior_mu,
                                          prior_logvar)
            else:
                kl_loss = gaussian_kld(recog_mu, recog_logvar, prior_mu,
                                       prior_logvar)
            kl_loss = tf.reduce_sum(kl_loss) / real_batch_size
            tf.identity(kl_loss, "kl_loss")
            # annealing
            if params["kl_weight"] == 'sigmoid':
                scaled_x = (tf.to_float(tf.train.get_or_create_global_step()) /
                            params["full_kl_steps"] -
                            0.5) * 20.0  # sigmoid weight
                kl_loss_weight = 1.0 / (1 + tf.exp(-scaled_x))
            elif params["kl_weight"] == 'linear':
                kl_loss_weight = tf.minimum(
                    (tf.to_float(tf.train.get_or_create_global_step()) /
                     params["full_kl_steps"]), 1.0)  # linear weight
            else:
                kl_loss_weight = 1.0

            weighted_kl_loss = kl_loss * kl_loss_weight
            tf.identity(weighted_kl_loss, "weighted_kl_loss")
            tf.identity(kl_loss_weight, "kl_loss_weight")
            if params["use_bow"]:
                bow_loss = compute_bow_loss(latent_sample, targets, params,
                                            train)
                loss = predict_loss + weighted_kl_loss + bow_loss
                tf.identity(bow_loss, "bow_loss")
                #TENSORS_TO_LOG["bow_loss"] = "model/bow_loss"
            else:
                loss = predict_loss + weighted_kl_loss
        else:  # eval and infer modes
            loss = predict_loss

        # Save loss as named tensor that will be logged with the logging hook.
        tf.identity(predict_loss, "predict_loss")
        tf.identity(loss, "cross_entropy")  # total loss

        if mode == tf.estimator.ModeKeys.EVAL:
            if params["use_tpu"]:
                # host call functions should only have tensors as arguments.
                # functools.partial() pre-populates params so that metric_fn is
                # TPUEstimator compliant.
                metric_fn = functools.partial(metrics.get_eval_metrics,
                                              params=params)
                eval_metrics = (metric_fn, [logits, labels])
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metrics=eval_metrics)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(
                    logits, labels, params))
        else:
            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            #metric_dict["minibatch_loss"] = loss
            metric_dict["predict_loss"] = predict_loss
            metric_dict["kl_loss"] = kl_loss
            if params["use_bow"]:
                metric_dict["bow_loss"] = bow_loss
            if params["kl_weight"]:
                metric_dict["weighted_kl_loss"] = weighted_kl_loss
                metric_dict["kl_loss_weight"] = kl_loss_weight

            if params["use_tpu"]:
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    host_call=tpu_util.construct_scalar_host_call(
                        metric_dict=metric_dict,
                        model_dir=params["model_dir"],
                        prefix="training/"))
            record_scalars(metric_dict)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)
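
gaussian_kld and gaussian_kld_v2 are not defined in this listing. The call sites above suggest the standard KL divergence between two diagonal Gaussians parameterized by mean and log-variance, KL(q || p) = -0.5 * sum(1 + logvar_q - logvar_p - ((mu_q - mu_p)^2 + exp(logvar_q)) / exp(logvar_p)). A hedged sketch of that version follows; the _v2 variant, which per the comment above takes standard deviations instead of log-variances, is not reproduced here:

def gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar):
    # Assumed implementation: KL(recognition || prior) for diagonal Gaussians,
    # summed over the latent dimension; returns one value per batch element.
    return -0.5 * tf.reduce_sum(
        1.0 + (recog_logvar - prior_logvar)
        - tf.square(recog_mu - prior_mu) / tf.exp(prior_logvar)
        - tf.exp(recog_logvar) / tf.exp(prior_logvar),
        axis=-1)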
Example #4
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer_classifier model."""
    with tf.variable_scope("model"):
        inputs = features

        # Create model and get output logits.
        model = transformer.TransformerClassifier(params, mode)

        logits = model(inputs)

        # When in prediction mode, the model output is the prediction
        if mode == tf.estimator.ModeKeys.PREDICT:
            if params["use_tpu"]:
                raise NotImplementedError(
                    "Prediction is not yet supported on TPUs.")
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs={
                    "classify": tf.estimator.export.PredictOutput(logits)
                })

        # Calculate model loss.
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits)
        loss = tf.reduce_mean(xentropy)

        # Save loss as named tensor that will be logged with the logging hook.
        tf.identity(loss, "cross_entropy")

        if mode == tf.estimator.ModeKeys.EVAL:
            if params["use_tpu"]:
                # host call functions should only have tensors as arguments.
                # This lambda pre-populates params so that metric_fn is
                # TPUEstimator compliant.
                metric_fn = lambda logits, labels: (metrics.get_eval_metrics(
                    logits, labels, params=params))
                eval_metrics = (metric_fn, [logits, inputs])
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metrics=eval_metrics)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(
                    logits, inputs, params))
        else:
            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            metric_dict["minibatch_loss"] = loss
            if params["use_tpu"]:
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    host_call=tpu_util.construct_scalar_host_call(
                        metric_dict=metric_dict,
                        model_dir=params["model_dir"],
                        prefix="training/"))
            record_scalars(metric_dict)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)
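
metrics.get_eval_metrics is also not shown here; for the classifier variant it presumably reduces to standard classification metrics. A minimal sketch, assuming integer class labels and logits of shape [batch_size, num_classes]:

def classification_eval_metrics(logits, labels, params=None):
    # Hypothetical stand-in for metrics.get_eval_metrics in the classifier setting.
    predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
    return {"accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions)}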
Example #5
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  #tf.set_random_seed(1367)
  with tf.variable_scope("model"):
    inputs, targets = features, labels
    # Placeholders so the named tensors defined below exist even when concrete
    # gates are disabled (concrete_coef == 0).
    concrete_loss = tf.constant(0)
    total_loss = tf.constant(0)
    concrete_reg = tf.constant(0)
    sparsity_rate = tf.constant(0)
    gate_values = tf.constant(0)
    # =================== For concrete gates ==================================
    print("**** concrete heads has this : {} ****".format(params["concrete_heads"]))
    if not params["concrete_coef"] == 0:
        tf.get_default_graph().clear_collection("CONCRETE")
        tf.get_default_graph().clear_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    # =========================================================================

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)

    logits = model(inputs, targets)

    # When in prediction mode, labels/targets are None and the model output
    # is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      print ("Logits", logits)
      #print (logits["attn_weights"], tf.transpose(tf.stack(logits["attn_weights"]).get_shape(), perm=[1,0,2,3,4]))
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions={"outputs": logits["outputs"], "scores": logits["scores"]})
          #export_outputs={
          #    "translate": tf.estimator.export.PredictOutput(logits["outputs"])
          #})

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits is
    # not a problem when computing xentropy, because padded_cross_entropy_loss
    # resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in the
    # targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    # ============ Loss for concrete gates =================
    if params["concrete_coef"] != 0:
        concrete_coef = params["concrete_coef"]
        sparsity_rate = tf.reduce_mean(tf.get_collection("CONCRETE"))
        concrete_reg = tf.reduce_mean(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        concrete_loss = concrete_coef * tf.reduce_mean(concrete_reg)
        
        total_loss = loss + concrete_loss

        gate_values = tf.get_collection("GATEVALUES")
        
        tf.identity(concrete_loss, "concrete_loss")
        tf.identity(total_loss, "total_loss")
        tf.identity(concrete_reg, "concrete_reg")
        tf.identity(sparsity_rate, "sparsity_rate")
        tf.identity(gate_values, "gate_values")
        loss = total_loss
    else:
        tf.identity(concrete_loss, "concrete_loss")
        tf.identity(total_loss, "total_loss")
        tf.identity(concrete_reg, "concrete_reg")
        tf.identity(sparsity_rate, "sparsity_rate")
        tf.identity(gate_values, "gate_values")
    # =======================================================
    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/")
        )
      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
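
The last example reads the "CONCRETE", "GATEVALUES", and tf.GraphKeys.REGULARIZATION_LOSSES graph collections, which are expected to be populated by the concrete-gate layers inside the modified Transformer. As a rough illustration only (the actual gate implementation is not part of this listing), a gated layer could expose its tensors like this:

def register_concrete_gate(open_rate, l0_penalty, gate_values):
    # Hypothetical helper: publish per-gate tensors through graph collections
    # so the model_fn above can aggregate them with tf.get_collection.
    tf.add_to_collection("CONCRETE", open_rate)
    tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, l0_penalty)
    tf.add_to_collection("GATEVALUES", gate_values)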