def infer(self,
          features=None,
          decode_length=50,
          beam_size=1,
          top_beams=1,
          alpha=0.0,
          use_tpu=False):
  t2t_model.set_custom_getter_compose(self._custom_getter)
  with self._eager_var_store.as_default():
    self.prepare_features_for_infer(features)
    if not self.has_input and beam_size > 1:
      t2t_model.log_warn("Beam searching for a model with no inputs.")
    if not self.has_input and self.hparams.sampling_method != "random":
      t2t_model.log_warn("Non-random sampling for a model with no inputs.")
    self._fill_problem_hparams_features(features)

    if self._problem_hparams:
      target_modality = self._problem_hparams.target_modality
      if target_modality.is_class_modality:
        beam_size = 1  # No use to run beam-search for a single class.

    t2t_model.log_info("Greedy Decoding")
    # Modified: removed every decoding option except the greedy method.
    results = self._greedy_infer(features, decode_length, use_tpu)
    return results
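For illustration, a minimal usage sketch of this greedy-only infer; `model` and the feature layout are assumptions about the surrounding setup, not part of the modification itself.

# Hypothetical usage sketch (names other than infer()'s signature are
# assumed, not taken from the modified class).
import tensorflow as tf

features = {"inputs": tf.constant([[[[3]], [[14]], [[1]]]])}  # toy ids
results = model.infer(features=features, decode_length=20, beam_size=4)
# beam_size/top_beams/alpha are still accepted but now have no effect:
# every call decodes greedily through self._greedy_infer.
outputs = results["outputs"]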
def model_fn(self, features):
  with tf.variable_scope(tf.get_variable_scope(), use_resource=True):
    transformed_features = self.bottom(features)

    if self.hparams.activation_dtype == "bfloat16":
      for k, v in sorted(six.iteritems(transformed_features)):
        if v.dtype == tf.float32:
          transformed_features[k] = tf.cast(v, tf.bfloat16)

    with tf.variable_scope("body"):
      t2t_model.log_info("Building model body")
      # Modified: the body also returns the encoder state in `enc_out`.
      body_out, enc_out = self.body(transformed_features)
    output, losses = self._normalize_body_output(body_out)

    if "training" in losses:
      t2t_model.log_info("Skipping T2TModel top and loss because training "
                         "loss returned from body")
      logits = output
    else:
      logits = self.top(output, features)
      losses["training"] = 0.0
      if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
        losses["training"] = self.loss(logits, features)

    return logits, losses, enc_out
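Because this model_fn returns a third value, any caller written against the stock two-value T2TModel interface needs updating; a minimal sketch of the assumed call site:

# Assumed call site; 'model' and 'features' come from the surrounding code.
logits, losses, enc_out = model.model_fn(features)
# 'enc_out' carries the encoder state reference returned alongside the
# decoder body output, for components that need encoder activations.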
def __init__(self, *args, **kwargs):
  super(EvolvedTransformer, self).__init__(*args, **kwargs)
  self._encoder_function = evolved_transformer_encoder
  self._decoder_function = evolved_transformer_decoder
  self._init_cache_fn = init_evolved_transformer_cache

  # -1 means train all weights.
  if self.hparams.get("num_trainable_top_decoder_layers", -1) < 0:
    t2t_model.log_info(
        "num_trainable_top_decoder_layers is negative so training all "
        "weights.")
  elif self.hparams.shared_embedding_and_softmax_weights:
    t2t_model.log_info(
        "Setting hparams.shared_embedding_and_softmax_weights to False, "
        "because hparams.num_trainable_top_decoder_layers is being used.")
    # When hparams.num_trainable_top_decoder_layers is set to N >= 0, we
    # freeze (do not train) every variable except the N top decoder layers
    # and the (pre-)softmax matrix. For any N >= 0 we freeze the encoder
    # and the input/target embeddings. This also means we cannot share the
    # (pre-)softmax matrix with the input/target embeddings; otherwise it
    # would be trained as well.
    self.hparams.shared_embedding_and_softmax_weights = False
    # If hparams.shared_embedding_and_softmax_weights was previously True,
    # the input and target embeddings were being shared. To make sure the
    # embeddings continue to be shared, we need to set
    # hparams.shared_embedding to True.
    self.hparams.shared_embedding = True
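A sketch of the hparams interaction described above; the base hparams set and the constructor call are assumptions about the surrounding tensor2tensor setup, with example values.

# Hypothetical configuration sketch (values are illustrative only).
import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()  # assumed base hparams set
hparams.add_hparam("num_trainable_top_decoder_layers", 2)
hparams.shared_embedding_and_softmax_weights = True
model = EvolvedTransformer(hparams, tf.estimator.ModeKeys.TRAIN)
# After __init__: only the top 2 decoder layers plus the (pre-)softmax
# matrix remain trainable; shared_embedding_and_softmax_weights has been
# flipped to False, and shared_embedding is True, so the input and target
# embeddings stay shared with each other (but not with the softmax).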
def model_fn(self, features):
  with tf.variable_scope(tf.get_variable_scope(),
                         use_resource=True,
                         reuse=tf.AUTO_REUSE):
    transformed_features = self.bottom(features)

    if self.hparams.activation_dtype == "bfloat16":
      for k, v in sorted(six.iteritems(transformed_features)):
        if v.dtype == tf.float32:
          transformed_features[k] = tf.cast(v, tf.bfloat16)

    t2t_model.log_info("Building model body")
    output, losses, monitor, targets_mask = self.body(
        transformed_features, features)
    output, losses = self._normalize_body_output((output, losses))

    if "training" in losses:
      t2t_model.log_info("Skipping T2TModel top and loss because training "
                         "loss returned from body")
      logits = output
    else:
      logits = self.top(output, features)
      losses["training"] = 0.0
      if (self._hparams.mode != tf_estimator.ModeKeys.PREDICT and
          self._hparams.mode != "attack"):
        losses["training"] = self.loss(logits, features)

    return logits, losses, monitor, targets_mask
def optimize(self, loss, num_async_replicas=1, use_tpu=False):
  """Return a training op minimizing loss."""
  hparams = self.hparams
  lr = learning_rate.learning_rate_schedule(hparams)
  if num_async_replicas > 1:
    log_info("Dividing learning rate by num_async_replicas: %d",
             num_async_replicas)
  lr /= tf.sqrt(float(num_async_replicas))

  loss = weight_decay_and_noise(loss, hparams, lr)
  loss = tf.identity(loss, name="total_loss")
  log_variable_sizes(verbose=hparams.summarize_vars)
  opt = ConditionalOptimizer(hparams.optimizer, lr, hparams)
  opt_summaries = ["loss", "learning_rate", "global_gradient_norm"]

  if hparams.clip_grad_norm:
    tf.logging.info("Clipping gradients, norm: %0.5f", hparams.clip_grad_norm)
  if hparams.grad_noise_scale:
    tf.logging.info("Adding noise to gradients, noise scale: %0.5f",
                    hparams.grad_noise_scale)

  tf.summary.scalar("training/learning_rate", lr)
  return tf.contrib.layers.optimize_loss(
      name="training",
      loss=loss,
      global_step=tf.train.get_or_create_global_step(),
      learning_rate=lr,
      clip_gradients=hparams.clip_grad_norm or None,
      gradient_noise_scale=hparams.grad_noise_scale or None,
      optimizer=opt,
      summaries=opt_summaries,
      colocate_gradients_with_ops=True)
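The replica adjustment divides the scheduled rate by the square root of the replica count; a tiny standalone illustration with assumed numbers:

import math

base_lr = 0.2
num_async_replicas = 4
# Same scaling as above: 0.2 / sqrt(4) = 0.1 effective learning rate.
# With a single replica, dividing by sqrt(1) leaves the rate unchanged.
effective_lr = base_lr / math.sqrt(float(num_async_replicas))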
def model_fn(self, features):
  with tf.variable_scope(tf.get_variable_scope(), use_resource=True):
    transformed_features = self.bottom(features)

    if self.hparams.activation_dtype == "bfloat16":
      for k, v in transformed_features.items():
        if v.dtype == tf.float32:
          transformed_features[k] = tf.cast(v, tf.bfloat16)

    with tf.variable_scope("body"):
      log_info("Building model body")
      body_out = self.body(transformed_features)
    output, losses = self._normalize_body_output(body_out)

    if "training" in losses:
      log_info("Skipping T2TModel top and loss because training loss "
               "returned from body")
      logits = output
    else:
      logits = self.top(output, features)
      losses["training"] = 0.0
      if self._hparams.mode != tf.estimator.ModeKeys.PREDICT:
        training_loss = self.loss(logits, features)
        if isinstance(training_loss, dict):
          assert "training" in training_loss
          losses.update(training_loss)
        else:
          losses["training"] = training_loss

    return logits, losses
def model_fn(self, features):
  """We need this for shallow fusion to change logits."""
  transformed_features = self.bottom(features)

  with tf.variable_scope("body"):
    log_info("Building model body")
    body_out, p_copy = self.body(transformed_features)
  output, losses = self._normalize_body_output(body_out)

  if "training" in losses:
    log_info("Skipping T2TModel top and loss because training loss "
             "returned from body")
    logits = output
  else:
    dzq = tf.transpose(p_copy[0].stack(), [1, 0, 2])
    inv_dz = tf.transpose(p_copy[1].stack(), [1, 0])
    if False:  # Disabled experimental branch: rescale logits by inv_dz.
      y_tilda = tf.concat([
          features[key + "_one_hot"]
          for key in self._problem_hparams.nearest_target_keys
      ], axis=1)
      p_tilda = tf.diag_part(tf.tensordot(dzq, y_tilda, axes=[[2], [1]]))
      logits = inv_dz * self.top(output, features)
    else:
      logits = self.top(output, features)
    losses["training"] = self.loss(logits, features)

  return logits, losses
def infer(self,
          features=None,
          decode_length=50,
          beam_size=1,
          top_beams=1,
          alpha=0.0,
          use_tpu=False):
  """An inference method. Quadratic time in decode_length.

  Args:
    features: a map of string to `Tensor`.
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    alpha: Float that controls the length penalty. The larger the alpha,
      the stronger the preference for longer translations.
    use_tpu: bool, whether to build the inference graph for TPU.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }
    If slow greedy decoding is used then the dict will also contain {
        "logits": `Tensor` of shape [batch_size, time, 1, 1, vocab_size].
        "losses": a dictionary: {loss-name (string): floating point `Scalar`}
    }
  """
  set_custom_getter_compose(self._custom_getter)
  with self._eager_var_store.as_default():
    self.prepare_features_for_infer(features)
    if not self.has_input and beam_size > 1:
      log_warn("Beam searching for a model with no inputs.")
    if not self.has_input and self.hparams.sampling_method != "random":
      log_warn("Non-random sampling for a model with no inputs.")
    self._fill_problem_hparams_features(features)

    if self._problem_hparams:
      target_modality = self._problem_hparams.target_modality
      if (not isinstance(target_modality, dict) and
          target_modality.is_class_modality):
        beam_size = 1  # No use to run beam-search for a single class.
    if beam_size == 1:
      log_info("Greedy Decoding")
      results = self._greedy_infer(features, decode_length, use_tpu)
    else:
      log_info("Beam Decoding with beam size %d" % beam_size)
      results = self._beam_decode(features, decode_length, beam_size,
                                  top_beams, alpha)

    return results
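For contrast with the greedy-only variant earlier, a sketch of a beam-search call against this unmodified infer; `model` and `inputs` are assumptions about the surrounding setup:

# Hypothetical call sketch: with beam_size > 1 the method dispatches to
# self._beam_decode and returns top_beams hypotheses per input.
results = model.infer(
    features={"inputs": inputs},
    decode_length=50,
    beam_size=4,
    top_beams=2,
    alpha=0.6)
outputs = results["outputs"]  # [batch_size, 2, <= decode_length]
scores = results["scores"]    # log probs from the beam search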
def optimize(self, loss, num_async_replicas=1, use_tpu=False, variables=None):
  """Return a training op minimizing loss."""
  lr = ops.learning_rate_schedule(self.hparams)
  if num_async_replicas > 1:
    t2t_model.log_info("Dividing learning rate by num_async_replicas: %d",
                       num_async_replicas)
  lr /= math.sqrt(float(num_async_replicas))
  train_op = optimize.optimize(
      loss, lr, self.hparams, use_tpu=use_tpu, variables=variables)
  return train_op
def model_fn(self, features):
  transformed_features = self.bottom(features)

  with tf.variable_scope("body"):
    t2t_model.log_info("Building model body")
    body_out = self.body(transformed_features, features)
  output, losses = self._normalize_body_output(body_out)

  if "training" in losses:
    t2t_model.log_info("Skipping T2TModel top and loss because training "
                       "loss returned from body")
    logits = output
  else:
    logits = self.top(output, features)
    losses["training"] = self.loss(logits, features)

  return logits, losses
def bottom(self, features):
  transformed_features = super().bottom(features)

  # Define how the nearest_targets features are transformed: for now each
  # is embedded with targets_bottom and also one-hot encoded.
  target_modality = self._problem_hparams.target_modality
  vocab_size = target_modality._vocab_size  # pylint: disable=protected-access
  with tf.variable_scope(target_modality.name, reuse=True):
    for key in self._problem_hparams.nearest_target_keys:
      log_info("Transforming %s with %s.targets_bottom", key,
               target_modality.name)
      transformed_features[key] = target_modality.targets_bottom(
          features[key])
      log_info("Transforming %s with one-hot encoding", key)
      # Shape is (batch_size, max_len, vocab_size).
      transformed_features[key + "_one_hot"] = tf.one_hot(
          features[key], depth=vocab_size, axis=-1)
  return transformed_features
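A standalone sketch of the one-hot transform used above, with toy sizes assumed to make the resulting (batch_size, max_len, vocab_size) shape concrete:

import tensorflow as tf

vocab_size = 8
ids = tf.constant([[3, 0, 5]])                        # (1, 3) token ids
one_hot = tf.one_hot(ids, depth=vocab_size, axis=-1)  # (1, 3, 8)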
def _top_single(self, body_output, target_modality, features):
  """Top transformation that ensures correct reuse of target embeddings."""
  t2t_model.log_info("Transforming body output with %s.top",
                     target_modality.name)

  # Get target embeddings.
  target_modality = self._problem_hparams.modality["targets"]
  target_modality_scope = self._variable_scopes[target_modality.name]
  target_embeddings = model_utils.get_embeddings(
      modality=target_modality,
      outer_scope=target_modality_scope,
      inner_scope="shared")
  target_vocab_size = target_modality._vocab_size  # pylint: disable=protected-access

  # Preprocess body output.
  last_only = (
      target_modality.top_is_pointwise and
      self.hparams.mode == tf.estimator.ModeKeys.PREDICT and
      not self.hparams.force_full_predict)
  if last_only:
    # Take body outputs for the last position only.
    if "decode_loop_step" not in features:
      body_output = tf.expand_dims(body_output[:, -1, :, :], axis=[1])
    else:
      body_output_shape = body_output.shape.as_list()
      body_output = tf.slice(
          body_output, [0, features["decode_loop_step"][0], 0, 0], [
              body_output_shape[0], 1, body_output_shape[2],
              body_output_shape[3]
          ])

  # Build logits.
  logits = model_utils.build_logits(
      sequences=body_output,
      embeddings=target_embeddings,
      vocab_size=target_vocab_size)
  return logits
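A toy illustration of the two `last_only` paths above, with assumed shapes ([batch, length, 1, hidden]); both keep exactly one time step:

import tensorflow as tf

body_output = tf.zeros([2, 7, 1, 16])
# Path 1: no decode_loop_step feature -> take the final position.
last = tf.expand_dims(body_output[:, -1, :, :], axis=1)   # [2, 1, 1, 16]
# Path 2: slice out the current loop step during TPU-style decoding.
step = tf.constant([3])  # stands in for features["decode_loop_step"]
shape = body_output.shape.as_list()
sliced = tf.slice(body_output, [0, step[0], 0, 0],
                  [shape[0], 1, shape[2], shape[3]])      # [2, 1, 1, 16]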
def bottom(self, features):
  """Transforms features to feed into body.

  Ensures that all language tags are transformed using the input modality.

  Args:
    features: dict of str to Tensor. The tensors contain token ids.

  Returns:
    transformed_features: dict with the same keys as features. The value
      Tensors are newly transformed (i.e., embeddings).
  """
  if not self._problem_hparams:
    t2t_model.log_warn("Without a Problem, T2TModel.bottom is a passthrough.")
    return features

  transformed_features = collections.OrderedDict()

  # Transform inputs.
  feature_name = "inputs"
  modality_obj = self._problem_hparams.modality[feature_name]
  with tf.variable_scope(modality_obj.name, reuse=False) as vs:
    self._add_variable_scope(modality_obj.name, vs)
    t2t_model.log_info("Transforming feature '%s' with %s.bottom",
                       feature_name, modality_obj.name)
    transformed_features[feature_name] = modality_obj.bottom_simple(
        features[feature_name], "input_emb", reuse=False)

  # Transform tags (using the same modality as for the inputs).
  for feature_name in ["all_tags", "input_tags", "target_tags"]:
    if feature_name not in features:
      tf.logging.warning("Missing feature %s - ignoring." % feature_name)
      continue
    with tf.variable_scope(modality_obj.name, reuse=True):
      t2t_model.log_info("Transforming feature '%s' with %s.bottom_simple",
                         feature_name, modality_obj.name)
      transformed_features[feature_name] = modality_obj.bottom_simple(
          features[feature_name], "input_emb", reuse=True)

  # Transform targets.
  feature_name = "targets"
  modality_obj = self._problem_hparams.modality[feature_name]
  with tf.variable_scope(modality_obj.name, reuse=False) as vs:
    self._add_variable_scope(modality_obj.name, vs)
    t2t_model.log_info("Transforming feature '%s' with %s.bottom_simple",
                       feature_name, modality_obj.name)
    transformed_features[feature_name] = modality_obj.bottom_simple(
        features[feature_name], "shared", reuse=False)

  for key in features:
    if key not in transformed_features:
      # For features without a modality, pass them along as is.
      transformed_features[key] = features[key]
    else:
      # Already-transformed features keep their raw ids under a "_raw"
      # suffix.
      transformed_features[key + "_raw"] = features[key]
  return transformed_features