def dual_decode(self,
                decoder_input,
                wav_encoder_output,
                txt_encoder_output,
                wav_enc_dec_attention_bias,
                txt_enc_dec_attention_bias,
                decoder_self_attention_bias,
                hparams,
                cache=None,
                nonpadding=None,
                losses=None):
  """Dual transformer decoder, attending to both inputs."""
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  decoder_output = transformer_dual_decoder(
      decoder_input,
      wav_encoder_output,
      txt_encoder_output,
      decoder_self_attention_bias,
      wav_enc_dec_attention_bias,
      txt_enc_dec_attention_bias,
      hparams,
      cache=cache,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights,
      losses=losses)

  if (common_layers.is_on_tpu() and
      hparams.mode == tf.estimator.ModeKeys.TRAIN):
    # TPU does not react kindly to extra dimensions.
    # TODO(noam): remove this once TPU is more forgiving of extra dims.
    return decoder_output
  else:
    # Expand since t2t expects 4d tensors.
    return tf.expand_dims(decoder_output, axis=2)
def estimator_spec_eval(self, features, logits, labels, loss):
  """Construct EstimatorSpec for EVAL mode."""
  hparams = self.hparams

  if not hasattr(hparams, "problem_instances"):
    raise NotImplementedError(_no_problem_err("estimator_spec_eval"))

  problem = hparams.problem_instances[0]
  if common_layers.is_on_tpu():
    eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
    _remove_summaries()
    return tf.contrib.tpu.TPUEstimatorSpec(
        tf.estimator.ModeKeys.EVAL,
        eval_metrics=(eval_metrics_fn, [logits, labels]),
        loss=loss)
  else:
    eval_metrics_fns = metrics.create_evaluation_metrics([problem], hparams)
    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.EVAL,
        predictions={"predictions": logits},
        eval_metric_ops=eval_metrics,
        loss=loss)
def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
  """Construct EstimatorSpec for EVAL mode."""
  hparams = self.hparams

  if not hasattr(hparams, "problem_instances"):
    raise NotImplementedError(_no_problem_err("estimator_spec_eval"))

  problem = hparams.problem_instances[0]
  if common_layers.is_on_tpu():
    eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
    _remove_summaries()
    if isinstance(logits, dict):
      # For TPU, logits dict will be passed as keyword arguments to
      # eval_metrics_fn. Here we add the labels to those arguments.
      logits.update({"labels": labels})
      return tf.contrib.tpu.TPUEstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          eval_metrics=(eval_metrics_fn, logits),
          loss=loss)
    else:
      return tf.contrib.tpu.TPUEstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          eval_metrics=(eval_metrics_fn, [logits, labels]),
          loss=loss)
  else:
    eval_metrics_fns = metrics.create_evaluation_metrics([problem], hparams)
    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.EVAL,
        predictions={"predictions": logits},
        eval_metric_ops=eval_metrics,
        loss=loss)
def body(self, features):
  assert self._hparams.block_size > 0
  assert not common_layers.is_on_tpu()

  hparams = copy.copy(self._hparams)
  targets = features["targets"]
  inputs = features["inputs"]
  if not (tf.get_variable_scope().reuse or
          hparams.mode == tf.contrib.learn.ModeKeys.INFER):
    tf.summary.image("inputs", inputs, max_outputs=1)
    tf.summary.image("targets", targets, max_outputs=1)

  encoder_input = cia.prepare_encoder(inputs, hparams)
  encoder_output = cia.transformer_encoder_layers(
      encoder_input,
      hparams.num_encoder_layers,
      hparams,
      attention_type=hparams.enc_attention_type,
      name="encoder")
  decoder_input, rows, cols = cia.prepare_decoder(targets, hparams)
  decoder_output = cia.transformer_decoder_layers(
      decoder_input,
      encoder_output,
      hparams.num_decoder_layers,
      hparams,
      attention_type=hparams.dec_attention_type,
      name="decoder")

  assert not isinstance(decoder_output, tuple)
  assert len(decoder_output.shape) == 4

  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

  with tf.variable_scope("block_size_%d" % self._hparams.block_size):
    tf.logging.info("Using block_size %d", self._hparams.block_size)
    block_output = common_layers.dense_relu_dense(
        decoder_output,
        self._hparams.block_size * self._hparams.filter_size,
        self._hparams.block_size * self._hparams.hidden_size,
        dropout=self._hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)

  batch_size, rows, cols = common_layers.shape_list(decoder_output)[:3]
  decoder_output = tf.reshape(
      decoder_output, [batch_size, rows, cols, 1, self._hparams.hidden_size])
  block_output = tf.reshape(block_output, [
      batch_size, rows, cols, self._hparams.block_size,
      self._hparams.hidden_size
  ])

  block_output = common_layers.layer_postprocess(decoder_output, block_output,
                                                 self._hparams)

  return block_output
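# Illustrative sketch (not part of the original model): the reshapes at the
# end of body() above, reproduced with numpy and hypothetical small shapes.
# dense_relu_dense emits block_size vectors packed into the last axis; they
# are unfolded into their own axis so layer_postprocess can broadcast the
# [.., 1, hidden] decoder output against the [.., block_size, hidden] blocks.
import numpy as np

batch, rows, cols, hidden, block_size = 2, 4, 4, 8, 3
decoder_output = np.zeros((batch, rows, cols, hidden), dtype=np.float32)
block_output = np.zeros((batch, rows, cols, block_size * hidden),
                        dtype=np.float32)

decoder_output = decoder_output.reshape(batch, rows, cols, 1, hidden)
block_output = block_output.reshape(batch, rows, cols, block_size, hidden)
assert (decoder_output + block_output).shape == (2, 4, 4, 3, 8)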
def transformer_n_encoder(encoder_input,
                          encoder_self_attention_bias,
                          hparams,
                          customize_params,
                          name="encoder",
                          nonpadding=None,
                          save_weights_to=None,
                          make_image_summary=True,
                          losses=None):
  """Transformer with 2 sets of encoders."""
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(customize_params.num_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              customize_params.num_heads or hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=customize_params.get("max_length"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              customized_ffn=customize_params.ffn_layer,
              hparams=hparams,
              pad_remover=pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
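# Illustrative sketch (not part of the original code): how the padding /
# nonpadding bookkeeping in the encoders above fits together. Padded
# positions carry a large negative attention bias, and
# attention_bias_to_padding recovers that mask; shapes are simplified here
# (the real bias tensor has extra broadcast axes).
import numpy as np

NEG_INF = -1e9
# Two sequences of length 4; the second has one padded position at the end.
padding = np.array([[0., 0., 0., 0.],
                    [0., 0., 0., 1.]], dtype=np.float32)
encoder_self_attention_bias = padding * NEG_INF
recovered_padding = (encoder_self_attention_bias < -1.0).astype(np.float32)
nonpadding = 1.0 - recovered_padding  # 1.0 at real tokens, 0.0 at pads
assert np.array_equal(recovered_padding, padding)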
def decode(self,
           decoder_input,
           encoder_output,
           encoder_decoder_attention_bias,
           decoder_self_attention_bias,
           hparams,
           name,
           cache=None,
           decode_loop_step=None,
           nonpadding=None,
           losses=None):
  """Decode Transformer outputs from encoder representation.

  Args:
    decoder_input: inputs to bottom of the model.
        [batch_size, decoder_length, hidden_dim]
    encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
    encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
    decoder_self_attention_bias: Bias and mask weights for decoder
        self-attention. [batch_size, decoder_length]
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop.
        Only used for inference on TPU.
    nonpadding: optional Tensor with shape [batch_size, decoder_length]
    losses: optional list onto which to append extra training losses

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  decoder_output = transformer_decoder(
      decoder_input,
      encoder_output,
      decoder_self_attention_bias,
      encoder_decoder_attention_bias,
      hparams,
      name=name,
      cache=cache,
      decode_loop_step=decode_loop_step,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights,
      losses=losses)

  if (common_layers.is_on_tpu() and
      hparams.mode == tf.estimator.ModeKeys.TRAIN):
    # TPU does not react kindly to extra dimensions.
    # TODO(noam): remove this once TPU is more forgiving of extra dims.
    return decoder_output
  else:
    # Expand since t2t expects 4d tensors.
    return tf.expand_dims(decoder_output, axis=2)
def optimize(self, loss, num_async_replicas=1):
  """Return a training op minimizing loss."""
  log_info("Base learning rate: %f", self.hparams.learning_rate)
  lr = learning_rate.learning_rate_schedule(self.hparams)
  if num_async_replicas > 1:
    log_info("Dividing learning rate by num_async_replicas: %d",
             num_async_replicas)
    lr /= math.sqrt(float(num_async_replicas))
  train_op = optimize.optimize(
      loss, lr, self.hparams, use_tpu=common_layers.is_on_tpu())
  return train_op
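# Illustrative sketch (plain Python, hypothetical name): the async-replica
# scaling applied in optimize() above. With N asynchronous replicas the
# learning rate is divided by sqrt(N), a common heuristic for keeping the
# effective step size stable when updates arrive from many workers.
import math

def scaled_learning_rate(base_lr, num_async_replicas=1):
  lr = base_lr
  if num_async_replicas > 1:
    lr /= math.sqrt(float(num_async_replicas))
  return lr

assert scaled_learning_rate(0.2, num_async_replicas=4) == 0.1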
def estimator_spec_train(self, loss, num_async_replicas=1):
  """Construct EstimatorSpec for TRAIN mode."""
  train_op = self.optimize(loss, num_async_replicas=num_async_replicas)

  if common_layers.is_on_tpu():
    _remove_summaries()  # summaries not currently working on TPU
    return tf.contrib.tpu.TPUEstimatorSpec(
        tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
  else:
    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
def decode(self,
           decoder_input,
           encoder_output,
           encoder_decoder_attention_bias,
           decoder_self_attention_bias,
           hparams,
           cache=None,
           nonpadding=None):
  """Decode Transformer outputs from encoder representation.

  Args:
    decoder_input: inputs to bottom of the model.
        [batch_size, decoder_length, hidden_dim]
    encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
    encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
    decoder_self_attention_bias: Bias and mask weights for decoder
        self-attention. [batch_size, decoder_length]
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    nonpadding: optional Tensor with shape [batch_size, decoder_length]

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  decoder_output = transformer_decoder(
      decoder_input,
      encoder_output,
      decoder_self_attention_bias,
      encoder_decoder_attention_bias,
      hparams,
      cache=cache,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights)

  if (common_layers.is_on_tpu() and
      hparams.mode == tf.estimator.ModeKeys.TRAIN):
    # TPU does not react kindly to extra dimensions.
    # TODO(noam): remove this once TPU is more forgiving of extra dims.
    return decoder_output
  else:
    # Expand since t2t expects 4d tensors.
    # Query the sentence-level cache and mix its output with the decoder
    # output using a learned mixing weight.
    m = tf.py_func(self.sentence_cache.QueryMultipleEntries,
                   [decoder_output], tf.float32)
    m.set_shape(decoder_output.get_shape())
    lambd = self.calculate_mixing_weight(decoder_output, m)
    return tf.expand_dims(lambd * decoder_output + (1.0 - lambd) * m, axis=2)
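# Illustrative sketch (numpy, illustrative values): the cache mixing at the
# end of this decode() variant. The returned representation is a convex
# combination of the fresh decoder output and the retrieved cache entry m,
# gated by the learned weight lambda in [0, 1].
import numpy as np

decoder_output = np.full((2, 5, 8), 1.0, dtype=np.float32)
m = np.full((2, 5, 8), 3.0, dtype=np.float32)  # cache lookup, same shape
lambd = 0.75  # stands in for calculate_mixing_weight(decoder_output, m)
mixed = lambd * decoder_output + (1.0 - lambd) * m
assert np.allclose(mixed, 1.5)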
def decode(self,
           decoder_input,
           encoder_output,
           encoder_decoder_attention_bias,
           decoder_self_attention_bias,
           hparams,
           cache=None,
           nonpadding=None):
  """Decode Transformer outputs from encoder representation.

  Args:
    decoder_input: inputs to bottom of the model.
        [batch_size, decoder_length, hidden_dim]
    encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
    encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
    decoder_self_attention_bias: Bias and mask weights for decoder
        self-attention. [batch_size, decoder_length]
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    nonpadding: optional Tensor with shape [batch_size, decoder_length]

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  decoder_output = transformer_decoder(
      decoder_input,
      encoder_output,
      decoder_self_attention_bias,
      encoder_decoder_attention_bias,
      hparams,
      cache=cache,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights)

  if (common_layers.is_on_tpu() and
      hparams.mode == tf.estimator.ModeKeys.TRAIN):
    # TPU does not react kindly to extra dimensions.
    # TODO(noam): remove this once TPU is more forgiving of extra dims.
    return decoder_output
  else:
    # Expand since t2t expects 4d tensors.
    return tf.expand_dims(decoder_output, axis=2)
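# Illustrative sketch (numpy, hypothetical shapes): the shape handling at the
# end of decode(). Off TPU, the 3-D decoder output is expanded to the 4-D
# [batch, length, 1, hidden] layout that t2t's modality code expects.
import numpy as np

decoder_output = np.zeros((2, 7, 512), dtype=np.float32)  # [batch, length, hidden]
expanded = np.expand_dims(decoder_output, axis=2)
assert expanded.shape == (2, 7, 1, 512)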
def estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
  """Construct EstimatorSpec for EVAL mode."""
  hparams = self.hparams

  if not hasattr(hparams, "problem_instances"):
    raise NotImplementedError(_no_problem_err("estimator_spec_eval"))

  problem = hparams.problem_instances[0]
  if common_layers.is_on_tpu():
    _remove_summaries()
    if isinstance(logits, dict):
      eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
      # For TPU, logits dict will be passed as keyword arguments to
      # eval_metrics_fn. Here we add the labels to those arguments.
      logits.update({"labels": labels})
      return tf.contrib.tpu.TPUEstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          eval_metrics=(eval_metrics_fn, logits),
          loss=loss)
    else:
      eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams)
      return tf.contrib.tpu.TPUEstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          eval_metrics=(eval_metrics_fn, [logits, labels]),
          loss=loss)
  else:
    eval_metrics_fns = metrics.create_evaluation_metrics([problem], hparams)
    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      if isinstance(logits, dict):
        # The key is located in the center of metric_name: "metrics-%s/%s/%s"
        k = metric_name.split("/")[1]
        eval_metrics[metric_name] = metric_fn(logits[k], features)
      else:
        eval_metrics[metric_name] = metric_fn(logits, features)
    if isinstance(logits, dict):
      predictions = logits
    else:
      predictions = {"predictions": logits}

    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.EVAL,
        predictions=predictions,
        eval_metric_ops=eval_metrics,
        loss=loss)
def optimize(self, loss, num_async_replicas=1):
  """Return a training op minimizing loss."""
  tf.logging.info("Base learning rate: %f", self.hparams.learning_rate)
  lr = self.hparams.learning_rate
  decay_rate = optimize.learning_rate_schedule(self.hparams)
  lr *= decay_rate
  if self.hparams.learning_rate_minimum:
    lr_min = float(self.hparams.learning_rate_minimum)
    tf.logging.info("Applying learning rate minimum: %f", lr_min)
    # Clamp the decayed learning rate at the configured minimum.
    lr = tf.maximum(lr, tf.to_float(lr_min))
  if num_async_replicas > 1:
    tf.logging.info("Dividing learning rate by num_async_replicas: %d",
                    num_async_replicas)
    lr /= math.sqrt(float(num_async_replicas))
  train_op = optimize.optimize(
      loss, lr, self.hparams, use_tpu=common_layers.is_on_tpu())
  return train_op
def top(self, body_output, _):
  """Generate logits.

  Args:
    body_output: A Tensor with shape [batch, p0, p1, body_input_depth]

  Returns:
    logits: A Tensor with shape [batch, p0, p1, ?, vocab_size].
  """
  if self._model_hparams.symbol_modality_skip_top:
    return tf.expand_dims(body_output, 3)

  if self._model_hparams.shared_embedding_and_softmax_weights:
    scope_name = "shared"
    reuse = True
  else:
    scope_name = "softmax"
    reuse = False

  with tf.variable_scope(scope_name, reuse=reuse):
    body_output_shape = common_layers.shape_list(body_output)
    var = self._get_weights(body_output_shape[-1])
    if (self._model_hparams.factored_logits and
        self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
      # insert channels dimension
      body_output = tf.expand_dims(body_output, 3)
      return common_layers.FactoredTensor(body_output, var)
    else:
      body_output = tf.reshape(body_output, [-1, body_output_shape[-1]])
      logits = tf.matmul(body_output, var, transpose_b=True)
      if (common_layers.is_on_tpu() and
          self._model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
        # TPU does not react kindly to extra dimensions.
        # TODO(noam): remove this once TPU is more forgiving of extra dims.
        return logits
      else:
        return tf.reshape(logits,
                          body_output_shape[:-1] + [1, self._vocab_size])
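# Illustrative sketch (numpy, hypothetical shapes): the non-factored,
# non-TPU branch of top() above. The body output is flattened, multiplied by
# the transposed softmax/embedding matrix, and reshaped back to the 5-D
# [batch, p0, p1, 1, vocab_size] layout t2t expects.
import numpy as np

batch, p0, p1, depth, vocab = 2, 3, 4, 8, 16
body_output = np.random.randn(batch, p0, p1, depth).astype(np.float32)
softmax_weights = np.random.randn(vocab, depth).astype(np.float32)  # stands in for var

flat = body_output.reshape(-1, depth)             # [batch*p0*p1, depth]
logits = flat @ softmax_weights.T                 # matmul(..., transpose_b=True)
logits = logits.reshape(batch, p0, p1, 1, vocab)
assert logits.shape == (2, 3, 4, 1, 16)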
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in xrange(hparams.num_encoder_layers or
                        hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims)
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def __init__(self, gpu, checkpoints, config=None):
  self._logger = logging.getLogger('TransformerDecoder')
  self._settings = (config.settings if config is not None
                    else TransformerDecoder.Settings())
  self._checkpoints = checkpoints
  self._checkpoint = None
  self._nn_needs_reset = True

  with tf.device('/device:GPU:0' if gpu is not None else '/cpu:0'):
    self._restorer = checkpoints.restorer()

    # Prepare features for feeding into the model.
    self._ph_decode_length = tf.placeholder(dtype=tf.int32)
    self._ph_infer_inputs = tf.placeholder(dtype=tf.int32)
    self._ph_train_inputs = tf.reshape(
        tf.placeholder(dtype=tf.int32), shape=[-1, -1, 1, 1])
    self._ph_train_targets = tf.reshape(
        tf.placeholder(dtype=tf.int32), shape=[-1, -1, 1, 1])
    self._ph_learning_rate = tf.placeholder(
        tf.float32, [], name='learning_rate')

    # Prepare the model for training
    self._model = registry.model('transformer')(
        self._checkpoints.hparams, tf.estimator.ModeKeys.TRAIN)
    _, losses = self._model({
        "inputs": self._ph_train_inputs,
        "targets": self._ph_train_targets
    })
    self._loss = losses['training']
    self._train_op = optimize.optimize(
        self._loss, self._ph_learning_rate, self._model.hparams,
        use_tpu=common_layers.is_on_tpu())

    tf.get_variable_scope().reuse_variables()

    # Prepare the model for infer
    self._attention_mats_op = [
        self._model.attention_weights[
            'transformer/body/decoder/layer_%i/encdec_attention/'
            'multihead_attention/dot_product_attention' % i]
        for i in xrange(self._model.hparams.num_hidden_layers)
    ]
    self._predictions_ops = []
    infer_inputs = tf.reshape(self._ph_infer_inputs,
                              [1, -1, 1, 1])  # Make it 4D.
    infer_out = self._model.infer({"inputs": infer_inputs},
                                  beam_size=4,
                                  top_beams=1,
                                  alpha=0.6,
                                  decode_length=self._ph_decode_length)
    self._predictions_op = {
        "outputs": infer_out["outputs"],
        "inputs": infer_inputs,
    }

  session_config = tf.ConfigProto(allow_soft_placement=True)
  session_config.gpu_options.allow_growth = True
  if gpu is not None:
    session_config.gpu_options.force_gpu_compatible = True
    session_config.gpu_options.visible_device_list = str(gpu)
  self._session = tf.Session(config=session_config)

  # Init model
  self._warmup()
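# Hedged usage sketch: a hypothetical helper method of the class above
# (shown here unattached, not part of the original code). It illustrates how
# the placeholders and ops built in __init__ would be used for inference:
# source ids are fed as a flat list and reshaped to 4-D inside the graph,
# and decode_length bounds the number of generated tokens.
def _decode_ids(self, source_ids, decode_length=50):
  result = self._session.run(
      self._predictions_op,
      feed_dict={
          self._ph_infer_inputs: source_ids,
          self._ph_decode_length: decode_length,
      })
  return result["outputs"]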
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          # sg: imdb comments
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),  # added layer norm
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,  # 128
              hparams.attention_value_channels or hparams.hidden_size,  # 128
              hparams.hidden_size,  # 128
              hparams.num_heads,  # 4
              hparams.attention_dropout,  # 0.1
              attention_type=hparams.self_attention_type,  # 'dot_product'
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,  # 0
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"))  # 256
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
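# Hedged wiring sketch (hypothetical wrapper name, adapted from how upstream
# tensor2tensor's Transformer.encode calls transformer_encoder). It assumes
# transformer_prepare_encoder and features_to_nonpadding are defined in this
# module as they are upstream; treat the exact signatures as assumptions.
def example_encode(features, target_space, hparams):
  inputs = common_layers.flatten4d3d(features["inputs"])
  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer_prepare_encoder(inputs, target_space, hparams,
                                  features=features))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer_encoder(
      encoder_input,
      self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "inputs"))
  return encoder_output, encoder_decoder_attention_bias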