    def infer(self,
              features=None,
              decode_length=50,
              beam_size=1,
              top_beams=1,
              alpha=0.0,
              use_tpu=False):
        t2t_model.set_custom_getter_compose(self._custom_getter)
        with self._eager_var_store.as_default():
            self.prepare_features_for_infer(features)
            if not self.has_input and beam_size > 1:
                t2t_model.log_warn(
                    "Beam searching for a model with no inputs.")
            if not self.has_input and self.hparams.sampling_method != "random":
                t2t_model.log_warn(
                    "Non-random sampling for a model with no inputs.")
            self._fill_problem_hparams_features(features)

            if self._problem_hparams:
                target_modality = self._problem_hparams.target_modality
                if target_modality.is_class_modality:
                    beam_size = 1  # No use to run beam-search for a single class.
            t2t_model.log_info("Greedy Decoding")
            """ Modified """
            # Removed every other decoding option, but the greedy method.
            results = self._greedy_infer(features, decode_length, use_tpu)
            return results
    def model_fn(self, features):
        with tf.variable_scope(tf.get_variable_scope(), use_resource=True):
            transformed_features = self.bottom(features)

            if self.hparams.activation_dtype == "bfloat16":
                for k, v in sorted(six.iteritems(transformed_features)):
                    if v.dtype == tf.float32:
                        transformed_features[k] = tf.cast(v, tf.bfloat16)

            with tf.variable_scope("body"):
                t2t_model.log_info("Building model body")
                """ Modified """
                # Passing the encoder state reference in 'enc_out' variable.
                body_out, enc_out = self.body(transformed_features)
            output, losses = self._normalize_body_output(body_out)

            if "training" in losses:
                t2t_model.log_info("Skipping T2TModel top and loss "
                                   "because training loss "
                                   "returned from body")
                logits = output
            else:
                logits = self.top(output, features)
                losses["training"] = 0.0
                if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
                    losses["training"] = self.loss(logits, features)

            return logits, losses, enc_out
  def __init__(self, *args, **kwargs):
    super(EvolvedTransformer, self).__init__(*args, **kwargs)
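    # Swap in the Evolved Transformer encoder, decoder, and cache initializer
    # in place of the base Transformer functions.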
    self._encoder_function = evolved_transformer_encoder
    self._decoder_function = evolved_transformer_decoder
    self._init_cache_fn = init_evolved_transformer_cache

    # -1 means train all weights.
    if self.hparams.get("num_trainable_top_decoder_layers", -1) < 0:
      t2t_model.log_info(
          "num_trainable_top_decoder_layers is negative so training all weights."
      )
    elif self.hparams.shared_embedding_and_softmax_weights:
      t2t_model.log_info(
          "Setting hparams.shared_embedding_and_softmax_weights to False, "
          "because hparam.num_trainable_top_decoder_layers is being used.")

      # When hparams.num_trainable_top_decoder_layers is set to N >= 0 we will
      # freeze (not train) every variable except the N top decoder layers and
      # the (pre-)softmax matrix. For any N >= 0 we will freeze the encoder and
      # input/target embeddings. This also means we will not share the
      # (pre-)softmax matrix with the input/target embeddings; otherwise they
      # would be trained as well.
      self.hparams.shared_embedding_and_softmax_weights = False

      # If hparams.shared_embedding_and_softmax_weights was previously True,
      # then input and target embeddings were being shared.
      # To make sure the embeddings continue to be shared, we need to set
      # hparams.shared_embedding to True.
      self.hparams.shared_embedding = True
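
  # A minimal sketch (an assumption, not part of the original class) of how a
  # trainable-variable filter for hparams.num_trainable_top_decoder_layers
  # could look. The scope names "body/decoder/layer_%d" and "softmax" and the
  # hparams.num_decoder_layers field are illustrative assumptions.
  def _trainable_variables_sketch(self, all_variables):
    n = self.hparams.get("num_trainable_top_decoder_layers", -1)
    if n < 0:
      return all_variables  # -1 means train all weights.
    num_layers = self.hparams.num_decoder_layers
    trainable_scopes = ["softmax"] + [
        "body/decoder/layer_%d" % i for i in range(num_layers - n, num_layers)
    ]
    return [v for v in all_variables
            if any(scope in v.name for scope in trainable_scopes)]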
    def model_fn(self, features):
        with tf.variable_scope(tf.get_variable_scope(),
                               use_resource=True,
                               reuse=tf.AUTO_REUSE):
            transformed_features = self.bottom(features)

            if self.hparams.activation_dtype == "bfloat16":
                for k, v in sorted(six.iteritems(transformed_features)):
                    if v.dtype == tf.float32:
                        transformed_features[k] = tf.cast(v, tf.bfloat16)

            t2t_model.log_info("Building model body")
            output, losses, monitor, targets_mask = self.body(
                transformed_features, features)
            output, losses = self._normalize_body_output((output, losses))

            if "training" in losses:
                t2t_model.log_info(
                    "Skipping T2TModel top and loss because training loss "
                    "returned from body")
                logits = output
            else:
                logits = self.top(output, features)
                losses["training"] = 0.0
                if (self._hparams.mode != tf_estimator.ModeKeys.PREDICT
                        and self._hparams.mode != "attack"):
                    losses["training"] = self.loss(logits, features)

        return logits, losses, monitor, targets_mask
    def optimize(self, loss, num_async_replicas=1, use_tpu=False):
        """Return a training op minimizing loss."""
        hparams = self.hparams

        lr = learning_rate.learning_rate_schedule(hparams)
        if num_async_replicas > 1:
            log_info("Dividing learning rate by num_async_replicas: %d",
                     num_async_replicas)
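        # sqrt(1) == 1, so the unconditional division below is a no-op when
        # there is only a single replica.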
        lr /= tf.sqrt(float(num_async_replicas))

        loss = weight_decay_and_noise(loss, hparams, lr)
        loss = tf.identity(loss, name="total_loss")
        log_variable_sizes(verbose=hparams.summarize_vars)
        opt = ConditionalOptimizer(hparams.optimizer, lr, hparams)

        opt_summaries = ["loss", "learning_rate", "global_gradient_norm"]

        if hparams.clip_grad_norm:
            tf.logging.info("Clipping gradients, norm: %0.5f",
                            hparams.clip_grad_norm)
        if hparams.grad_noise_scale:
            tf.logging.info("Adding noise to gradients, noise scale: %0.5f",
                            hparams.grad_noise_scale)

        tf.summary.scalar("training/learning_rate", lr)
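        # optimize_loss applies the clipping and gradient noise configured
        # above via its clip_gradients and gradient_noise_scale arguments.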
        return tf.contrib.layers.optimize_loss(
            name="training",
            loss=loss,
            global_step=tf.train.get_or_create_global_step(),
            learning_rate=lr,
            clip_gradients=hparams.clip_grad_norm or None,
            gradient_noise_scale=hparams.grad_noise_scale or None,
            optimizer=opt,
            summaries=opt_summaries,
            colocate_gradients_with_ops=True)
    def model_fn(self, features):
        with tf.variable_scope(tf.get_variable_scope(), use_resource=True):
            transformed_features = self.bottom(features)

            if self.hparams.activation_dtype == "bfloat16":
                for k, v in transformed_features.items():
                    if v.dtype == tf.float32:
                        transformed_features[k] = tf.cast(v, tf.bfloat16)

            with tf.variable_scope("body"):
                log_info("Building model body")
                body_out = self.body(transformed_features)
            output, losses = self._normalize_body_output(body_out)

            if "training" in losses:
                log_info(
                    "Skipping T2TModel top and loss because training loss "
                    "returned from body")
                logits = output
            else:
                logits = self.top(output, features)
                losses["training"] = 0.0
                if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
                    training_loss = self.loss(logits, features)
                    if isinstance(training_loss, dict):
                        assert "training" in training_loss
                        losses.update(training_loss)
                    else:
                        losses["training"] = training_loss

            return logits, losses
    def model_fn(self, features):
        """We need this for shallow fusion to change logits."""
        transformed_features = self.bottom(features)

        with tf.variable_scope("body"):
            log_info("Building model body")
            body_out, p_copy = self.body(transformed_features)

            output, losses = self._normalize_body_output(body_out)

            if "training" in losses:
                log_info(
                    "Skipping T2TModel top and loss because training loss "
                    "returned from body")
                logits = output
            else:
                dzq = tf.transpose(p_copy[0].stack(), [1, 0, 2])
                inv_dz = tf.transpose(p_copy[1].stack(), [1, 0])

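                # Disabled branch (kept for reference): builds a distribution
                # over the nearest-target one-hots and rescales the logits by
                # 'inv_dz'; note that p_tilda is computed but never used.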
                if False:
                    y_tilda = tf.concat(
                        [
                            features[key + "_one_hot"]
                            for key in self._problem_hparams.nearest_target_keys
                        ],
                        axis=1)
                    p_tilda = tf.diag_part(
                        tf.tensordot(dzq, y_tilda, axes=[[2], [1]]))
                    logits = inv_dz * self.top(output, features)
                else:
                    logits = self.top(output, features)

                losses["training"] = self.loss(logits, features)
        return logits, losses
    def infer(self,
              features=None,
              decode_length=50,
              beam_size=1,
              top_beams=1,
              alpha=0.0,
              use_tpu=False):
        """A inference method.

        Quadratic time in decode_length.
    
        Args:
          features: a map of string to `Tensor`.
          decode_length: an integer.  How many additional timesteps to decode.
          beam_size: number of beams.
          top_beams: an integer. How many of the beams to return.
          alpha: Float that controls the length penalty. The larger the alpha,
            the stronger the preference for longer translations.
          use_tpu: bool, whether to build the inference graph for TPU.
    
        Returns:
          A dict of decoding results {
              "outputs": integer `Tensor` of decoded ids of shape
                  [batch_size, <= decode_length] if beam_size == 1 or
                  [batch_size, top_beams, <= decode_length]
              "scores": decoding log probs from the beam search,
                  None if using greedy decoding (beam_size=1)
          }
          if slow greedy decoding is used then the dict will also contain {
              "logits": `Tensor` of shape [batch_size, time, 1, 1, vocab_size].
              "losses": a dictionary: {loss-name (string): floating point `Scalar`
          }
        """
        set_custom_getter_compose(self._custom_getter)
        with self._eager_var_store.as_default():
            self.prepare_features_for_infer(features)
            if not self.has_input and beam_size > 1:
                log_warn("Beam searching for a model with no inputs.")
            if not self.has_input and self.hparams.sampling_method != "random":
                log_warn("Non-random sampling for a model with no inputs.")
            self._fill_problem_hparams_features(features)

            if self._problem_hparams:
                target_modality = self._problem_hparams.target_modality
                if not isinstance(target_modality,
                                  dict) and target_modality.is_class_modality:
                    beam_size = 1  # No use to run beam-search for a single class.
            if beam_size == 1:
                log_info("Greedy Decoding")
                results = self._greedy_infer(features, decode_length, use_tpu)
            else:
                log_info("Beam Decoding with beam size %d" % beam_size)
                results = self._beam_decode(features, decode_length, beam_size,
                                            top_beams, alpha)

            return results
  def optimize(self, loss, num_async_replicas=1, use_tpu=False, variables=None):
    """Return a training op minimizing loss."""
    lr = ops.learning_rate_schedule(self.hparams)
    if num_async_replicas > 1:
      t2t_model.log_info("Dividing learning rate by num_async_replicas: %d",
                         num_async_replicas)
    lr /= math.sqrt(float(num_async_replicas))
    train_op = optimize.optimize(
        loss, lr, self.hparams, use_tpu=use_tpu, variables=variables)
    return train_op
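
  # How the `variables` argument above could be combined with a layer-freezing
  # filter like _trainable_variables_sketch (illustrative wiring, not from the
  # original source):
  #   trainable = self._trainable_variables_sketch(tf.trainable_variables())
  #   train_op = self.optimize(loss, variables=trainable)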
  def model_fn(self, features):
    transformed_features = self.bottom(features)

    with tf.variable_scope("body"):
      t2t_model.log_info("Building model body")
      body_out = self.body(transformed_features, features)
    output, losses = self._normalize_body_output(body_out)

    if "training" in losses:
      t2t_model.log_info("Skipping T2TModel top and loss because training loss "
               "returned from body")
      logits = output
    else:
      logits = self.top(output, features)
      losses["training"] = self.loss(logits, features)
    return logits, losses
    def bottom(self, features):
        transformed_features = super().bottom(features)
        log_info("Feature keys: %s", list(features.keys()))
        # This is where the nearest_targets transformation is defined; for now
        # they are transformed with targets_bottom plus a one-hot encoding.
        target_modality = self._problem_hparams.target_modality
        vocab_size = target_modality._vocab_size  # pylint: disable=protected-access
        log_info("vocab_size: %d", vocab_size)
        with tf.variable_scope(target_modality.name, reuse=True):
            for key in self._problem_hparams.nearest_target_keys:
                log_info("Transforming %s with %s.targets_bottom", key,
                         target_modality.name)
                transformed_features[key] = target_modality.targets_bottom(
                    features[key])
                log_info("Transforming %s with one-hot encoding", key)
                # shape is (bs, max_len, vocab_size)
                transformed_features[key + "_one_hot"] = tf.one_hot(
                    features[key], depth=vocab_size, axis=-1)

        return transformed_features
  def _top_single(self, body_output, target_modality, features):
    """Top transformation that ensures correct reuse of target embeddings."""
    t2t_model.log_info(
        "Transforming body output with %s.top", target_modality.name)

    # Get target embeddings. Note that the target_modality argument is
    # overridden with the problem's target modality here.
    target_modality = self._problem_hparams.modality["targets"]
    target_modality_scope = self._variable_scopes[target_modality.name]
    target_embeddings = model_utils.get_embeddings(
        modality=target_modality,
        outer_scope=target_modality_scope,
        inner_scope="shared")
    target_vocab_size = target_modality._vocab_size  # pylint: disable=protected-access

    # Preprocess body output.
    last_only = (
        target_modality.top_is_pointwise and
        self.hparams.mode == tf.estimator.ModeKeys.PREDICT and
        not self.hparams.force_full_predict)
    if last_only:
      # Take body outputs for the last position only.
      if "decode_loop_step" not in features:
        body_output = tf.expand_dims(body_output[:, -1, :, :], axis=[1])
      else:
        body_output_shape = body_output.shape.as_list()
        body_output = tf.slice(
            body_output, [0, features["decode_loop_step"][0], 0, 0], [
                body_output_shape[0], 1, body_output_shape[2],
                body_output_shape[3]
            ])

    # Build logits.
    logits = model_utils.build_logits(
        sequences=body_output,
        embeddings=target_embeddings,
        vocab_size=target_vocab_size)
    return logits
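
# A minimal sketch (an assumption, not the actual model_utils.build_logits) of
# the logits construction used above: multiply the body output by the
# transposed target-embedding matrix, so the softmax weights stay tied to the
# target embeddings.
def build_logits_sketch(sequences, embeddings, vocab_size):
  hidden_size = embeddings.shape.as_list()[-1]
  flat = tf.reshape(sequences, [-1, hidden_size])  # [batch * positions, hidden]
  flat_logits = tf.matmul(flat, embeddings, transpose_b=True)
  out_shape = tf.concat([tf.shape(sequences)[:-1], [vocab_size]], axis=0)
  return tf.reshape(flat_logits, out_shape)
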
    def bottom(self, features):
        """Transforms features to feed into body.

    Ensures that all language tags are transformed using input modality.

    Args:
      features: dict of str to Tensor. The tensors contain token ids.

    Returns:
      transformed_features: dict of same key-value pairs as features. The value
        Tensors are newly transformed (i.e., embeddings).
    """
        if not self._problem_hparams:
            t2t_model.log_warn(
                "Without a Problem, T2TModel.bottom is a passthrough.")
            return features

        transformed_features = collections.OrderedDict()

        # Transform inputs.
        feature_name = "inputs"
        modality_obj = self._problem_hparams.modality[feature_name]
        with tf.variable_scope(modality_obj.name, reuse=False) as vs:
            self._add_variable_scope(modality_obj.name, vs)
            t2t_model.log_info("Transforming feature '%s' with %s.bottom",
                               feature_name, modality_obj.name)
            transformed_features[feature_name] = modality_obj.bottom_simple(
                features[feature_name], "input_emb", reuse=False)

        # Transform tags (using same modality as for the inputs).
        for feature_name in ["all_tags", "input_tags", "target_tags"]:
            if feature_name not in features:
                tf.logging.warning("Missing feature %s - ignoring." %
                                   feature_name)
                continue
            with tf.variable_scope(modality_obj.name, reuse=True):
                t2t_model.log_info(
                    "Transforming feature '%s' with %s.bottom_simple",
                    feature_name, modality_obj.name)
                transformed_features[
                    feature_name] = modality_obj.bottom_simple(
                        features[feature_name], "input_emb", reuse=True)

        # Transform targets.
        feature_name = "targets"
        modality_obj = self._problem_hparams.modality[feature_name]
        with tf.variable_scope(modality_obj.name, reuse=False) as vs:
            self._add_variable_scope(modality_obj.name, vs)
            t2t_model.log_info(
                "Transforming feature '%s' with %s.bottom_simple",
                feature_name, modality_obj.name)
            transformed_features[feature_name] = modality_obj.bottom_simple(
                features[feature_name], "shared", reuse=False)

        for key in features:
            if key not in transformed_features:
                # For features without a modality, we pass them along as is
                transformed_features[key] = features[key]
            else:
                # Other features get passed along with the "raw" suffix
                transformed_features[key + "_raw"] = features[key]

        return transformed_features