def transformer_n_encoder(encoder_input,
                          encoder_self_attention_bias,
                          hparams,
                          customize_params,
                          name="encoder",
                          nonpadding=None,
                          save_weights_to=None,
                          make_image_summary=True,
                          losses=None):
  """Transformer with 2 sets of encoder parameters (base and customized)."""
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_on_tpu():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(customize_params.num_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              customize_params.num_heads or hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              max_relative_position=hparams.max_relative_position,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=customize_params.get("max_length"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              customized_ffn=customize_params.ffn_layer,
              hparams=hparams,
              pad_remover=pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
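# A minimal usage sketch for transformer_n_encoder, assuming `customize_params`
# behaves like an HParams object whose fields override the base hparams and
# which supports .get(). The field values below are illustrative, not
# canonical.
def _example_transformer_n_encoder(encoder_input, encoder_self_attention_bias,
                                   hparams, nonpadding):
  customize_params = tf.contrib.training.HParams(
      num_layers=2,  # overrides hparams.num_hidden_layers
      num_heads=4,  # overrides hparams.num_heads
      ffn_layer="dense_relu_dense",
      max_length=256)
  return transformer_n_encoder(
      encoder_input,
      encoder_self_attention_bias,
      hparams,
      customize_params,
      nonpadding=nonpadding)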
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", imageP=None): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string Returns: y: a Tensors """ x = encoder_input with tf.variable_scope(name): pad_remover = None if hparams.use_pad_remover: pad_remover = expert_utils.PadRemover( common_attention.attention_bias_to_padding( encoder_self_attention_bias)) for layer in xrange(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it shuold also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. encoder_output = common_layers.layer_preprocess(x, hparams) #JI: adding image information to the encoder output if imageP is not None: with tf.variable_scope(name): W1 = tf.layers.dense(imageP, 1024, use_bias=False, name="image_proj") encoder_output = tf.add(encoder_output, W1) return encoder_output
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder"): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string Returns: y: a Tensors """ x = encoder_input with tf.variable_scope(name): # TODO(noam): We should pass in the padding directly. padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) pad_remover = None if hparams.use_pad_remover: pad_remover = expert_utils.PadRemover(padding) for layer in xrange(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer(common_layers.layer_preprocess( x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=1.0 - padding) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it shuold also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def get_pad_remover(hparams, encoder_self_attention_bias_slices,
                    is_combined=False):
  """Builds a PadRemover for one or several translation options.

  Args:
    hparams: hyperparameters for model.
    encoder_self_attention_bias_slices: list of self-attention bias Tensors,
      one per translation option.
    is_combined: whether the multiple translation options are combined or not.

  Returns:
    A (pad_remover, encoder_self_attention_bias) tuple.
  """
  if not is_combined:
    encoder_self_attention_bias = encoder_self_attention_bias_slices[0]
  else:
    encoder_self_attention_bias = tf.concat(
        encoder_self_attention_bias_slices, 3)
  pad_remover = None
  if hparams.use_pad_remover:
    padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    pad_remover = expert_utils.PadRemover(padding)
  return (pad_remover, encoder_self_attention_bias)
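# A hedged usage sketch for get_pad_remover: two [batch, 1, 1, length] bias
# slices using the convention attention_bias_to_padding expects (0.0 at valid
# positions, -1e9 at padding). `hparams` is assumed to come from the
# surrounding model code.
def _example_get_pad_remover(hparams):
  bias_a = tf.constant([[[[0.0, 0.0, -1e9]]]])
  bias_b = tf.constant([[[[0.0, -1e9, -1e9]]]])
  # Combined along axis 3, yielding a [1, 1, 1, 6] bias tensor.
  return get_pad_remover(hparams, [bias_a, bias_b], is_combined=True)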
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder"): """A stack of transormer layer. Args: encoder_input: a Tensor [batch_size, input_length, hidden_dim] encoder_self_attention_bias: bias Tensor for sel-attention (see common_attention.attention_bias()) hparams: hyperparameters name: a string Returns: y: a Tensor [batch_size, input_length, hidden_dim] """ x = encoder_input with tf.variable_scope(name): pad_remover = None if hparams.use_pad_remover: pad_remover = expert_utils.PadRemover( common_attention.attention_bias_to_padding( encoder_self_attention_bias)) for layer in xrange(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) return common_layers.layer_preprocess(x, hparams)
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder"): """Copied from tensor2tensor.models.transformer.""" x = encoder_input with tf.variable_scope(name): pad_remover = None if hparams.use_pad_remover: pad_remover = expert_utils.PadRemover( common_attention.attention_bias_to_padding( encoder_self_attention_bias)) for layer in xrange(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, max_relative_position=hparams.max_relative_position) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it shuold also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def _fast_decode(self,
                 features,
                 decode_length,
                 beam_size=1,
                 top_beams=1,
                 alpha=1.0):
  """Fast decoding.

  Overrides tensor2tensor.models.transformer.Transformer._fast_decode
  to let symbols_to_logits_fn return multiple things.

  Implements both greedy and beam search decoding, uses beam search iff
  beam_size > 1, otherwise beam search related arguments are ignored.

  Args:
    features: a map of string to model features.
    decode_length: an integer.  How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer.  How many of the beams to return.
    alpha: Float that controls the length penalty.  Larger the alpha,
      stronger the preference for longer translations.

  Returns:
    A dict of decoding results {
        "body_output": tensor of size
            [batch_size, <= decode_length, hidden_size]
            (or [batch_size, top_beams, <= decode_length, hidden_size])
            giving the raw output of the Transformer decoder corresponding
            to the predicted sequences
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }

  Raises:
    NotImplementedError: If there are multiple data shards.
  """
  if self._num_datashards != 1:
    raise NotImplementedError("Fast decoding only supports a single shard.")
  dp = self._data_parallelism
  hparams = self._hparams
  target_modality = self._problem_hparams.target_modality
  if isinstance(target_modality, dict):
    primary_target_feature = self._problem_hparams.primary_target_modality
    primary_target_modality = target_modality[primary_target_feature]
    bottom_variable_scope = "%s/%s" % (primary_target_modality.name,
                                       primary_target_feature)
  else:
    primary_target_feature = "targets"
    primary_target_modality = target_modality
    bottom_variable_scope = target_modality.name

  if self.has_input:
    inputs = features["inputs"]
    if primary_target_modality.is_class_modality:
      decode_length = 1
    else:
      decode_length = (common_layers.shape_list(inputs)[1] +
                       features.get("decode_length", decode_length))

    # TODO(llion): Clean up this reshaping logic.
    inputs = tf.expand_dims(inputs, axis=1)
    if len(inputs.shape) < 5:
      inputs = tf.expand_dims(inputs, axis=4)
    s = common_layers.shape_list(inputs)
    batch_size = s[0]
    inputs = tf.reshape(inputs, [s[0] * s[1], s[2], s[3], s[4]])
    # _shard_features called to ensure that the variable names match
    inputs = self._shard_features({"inputs": inputs})["inputs"]
    input_modality = self._problem_hparams.input_modality["inputs"]
    with tf.variable_scope(input_modality.name):
      inputs = input_modality.bottom_sharded(inputs, dp)
    with tf.variable_scope("body"):
      encoder_output, encoder_decoder_attention_bias = dp(
          self.encode,
          inputs,
          features["target_space_id"],
          hparams,
          features=features)
    encoder_output = encoder_output[0]
    encoder_decoder_attention_bias = encoder_decoder_attention_bias[0]
    partial_targets = None
  else:
    # The problem has no inputs.
    encoder_output = None
    encoder_decoder_attention_bias = None

    # Prepare partial targets.
    # In either features["inputs"] or features["targets"].
    # We force the outputs to begin with these sequences.
    partial_targets = features.get("inputs")
    if partial_targets is None:
      partial_targets = features[primary_target_feature]
    assert partial_targets is not None
    partial_targets = common_layers.expand_squeeze_to_nd(partial_targets, 2)
    partial_targets = tf.to_int64(partial_targets)
    partial_targets_shape = common_layers.shape_list(partial_targets)
    partial_targets_length = partial_targets_shape[1]
    decode_length = (partial_targets_length +
                     features.get("decode_length", decode_length))
    batch_size = partial_targets_shape[0]

  if hparams.pos == "timing":
    positional_encoding = common_attention.get_timing_signal_1d(
        decode_length + 1, hparams.hidden_size)
  elif hparams.pos == "emb":
    positional_encoding = common_attention.add_positional_embedding(
        tf.zeros([1, decode_length + 1, hparams.hidden_size]),
        hparams.max_length, "targets_positional_embedding", None)
  else:
    positional_encoding = None

  def preprocess_targets(targets, i):
    """Performs preprocessing steps on the targets to prepare for the decoder.

    This includes:
      - Embedding the ids.
      - Flattening to 3D tensor.
      - Optionally adding timing signals.

    Args:
      targets: inputs ids to the decoder. [batch_size, 1]
      i: scalar, Step number of the decoding loop.

    Returns:
      Processed targets [batch_size, 1, hidden_dim]
    """
    # _shard_features called to ensure that the variable names match
    targets = self._shard_features(
        {primary_target_feature: targets})[primary_target_feature]
    with tf.variable_scope(bottom_variable_scope):
      targets = primary_target_modality.targets_bottom_sharded(targets, dp)[0]
    targets = common_layers.flatten4d3d(targets)

    # At step 0, targets will have 0 size, and instead we want to
    # create an embedding of all zeros, corresponding to the start symbol.
    # This matches what transformer_prepare_decoder does to the target
    # outputs during training.
    targets = tf.cond(
        tf.equal(i, 0), lambda: tf.zeros_like(targets), lambda: targets)

    if positional_encoding is not None:
      targets += positional_encoding[:, i:i + 1]
    return targets

  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(decode_length))
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Go from ids to logits for next symbol."""
    ids = ids[:, -1:]
    targets = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
    targets = preprocess_targets(targets, i)

    bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

    logits = self._symbols_to_logits_fn(targets, features, bias, cache)

    logits = tf.squeeze(logits, axis=[1, 2, 3])
    if partial_targets is not None:
      # If the position is within the given partial targets, we alter the
      # logits to always return those values.
      # A faster approach would be to process the partial targets in one
      # iteration in order to fill the corresponding parts of the cache.
      # This would require broader changes, though.
      vocab_size = tf.shape(logits)[1]

      def forced_logits():
        return tf.one_hot(
            tf.tile(partial_targets[:, i], [beam_size]), vocab_size, 0.0,
            -1e9)

      logits = tf.cond(
          tf.less(i, partial_targets_length), forced_logits, lambda: logits)
    return logits, cache

  cache = dict()
  infer_out = dict()
  if encoder_output is not None:
    padding_mask = 1. - common_attention.attention_bias_to_padding(
        encoder_decoder_attention_bias)
    masked_encoded_output = encoder_output * tf.expand_dims(
        padding_mask, axis=2)
    infer_out["encoded_inputs"] = tf.reduce_sum(masked_encoded_output, axis=1)

  self._prepare_decoder_cache(batch_size, features, cache)

  ret = fast_decode(
      encoder_output=encoder_output,
      encoder_decoder_attention_bias=encoder_decoder_attention_bias,
      symbols_to_logits_fn=symbols_to_logits_fn,
      hparams=hparams,
      decode_length=decode_length,
      vocab_size=primary_target_modality.top_dimensionality,
      beam_size=beam_size,
      top_beams=top_beams,
      alpha=alpha,
      batch_size=batch_size,
      force_decode_length=self._decode_hparams.force_decode_length,
      cache=cache)
  infer_out.update(ret)
  if "cache" in ret:
    infer_out.update(ret["cache"])

  if partial_targets is not None:
    if beam_size <= 1 or top_beams <= 1:
      infer_out["outputs"] = infer_out["outputs"][:, partial_targets_length:]
    else:
      infer_out["outputs"] = infer_out["outputs"][:, :,
                                                  partial_targets_length:]

  return infer_out
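# The forced-logits trick above can be checked in isolation: a one-hot row
# with 0.0 at the forced id and -1e9 everywhere else makes greedy or beam
# search pick that id while contributing (near) zero log-prob to the score.
# The sizes below are illustrative.
def _example_forced_logits():
  vocab_size = 5
  forced_ids = tf.constant([3, 1])  # partial-target symbol per batch row
  forced = tf.one_hot(forced_ids, vocab_size, 0.0, -1e9)
  return tf.argmax(forced, axis=-1)  # ==> [3, 1]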
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convoltutional layers. save_weights_to: an optional dictionary to capture attention weights for vizualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. Returns: y: a Tensors """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_on_tpu(): pad_remover = expert_utils.PadRemover(padding) for layer in xrange(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, save_weights_to=save_weights_to, max_relative_position=hparams.max_relative_position, make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer( common_layers.layer_preprocess(x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it shuold also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def evolved_transformer_encoder(encoder_input,
                                encoder_self_attention_bias,
                                hparams,
                                name="encoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None,
                                attn_bias_for_padding=None):
  """Evolved Transformer encoder. See arxiv.org/abs/1901.11117 for details.

  Note: Pad remover is not supported.

  Args:
    encoder_input: a Tensor.
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    hparams: hyperparameters for model.
    name: a string.
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be passed
      in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used for
      pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: Not used.
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    Tensor encoder output.
  """
  del losses

  hidden_state = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      attention_bias = encoder_self_attention_bias
      if attn_bias_for_padding is not None:
        attention_bias = attn_bias_for_padding
      # Only bfloat16 and float32 supported.
      float_type = hparams.get("activation_dtype", "float32")
      if float_type == "bfloat16":
        cast_fn = tf.to_bfloat16
      else:
        assert float_type == "float32"
        cast_fn = tf.to_float
      padding = common_attention.attention_bias_to_padding(
          attention_bias, cast_fn)
      nonpadding = 1.0 - padding

    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):

        with tf.variable_scope("gated_linear_unit"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          values = common_layers.layers().Dense(
              hparams.hidden_size)(hidden_state)
          gates = common_layers.layers().Dense(
              hparams.hidden_size, activation=tf.nn.sigmoid)(hidden_state)
          hidden_state = values * gates

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("conv_branches"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          # Mask padding from conv layers.
          mask = tf.tile(
              tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size])
          hidden_state *= mask

          left_output_dim = int(hparams.hidden_size * 4)
          left_state = common_layers.layers().Dense(
              left_output_dim, activation=tf.nn.relu)(hidden_state)
          left_state = tf.nn.dropout(
              left_state, 1 - hparams.layer_prepostprocess_dropout)

          right_output_dim = int(hparams.hidden_size / 2)
          right_state = common_layers.layers().Conv1D(
              right_output_dim,
              3,
              padding="SAME",
              name="standard_conv_3x1",
              activation=tf.nn.relu)(hidden_state)
          right_state = tf.nn.dropout(
              right_state, 1 - hparams.layer_prepostprocess_dropout)

          right_state = tf.pad(
              right_state,
              [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]],
              constant_values=0)
          hidden_state = left_state + right_state

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          # Mask padding from conv layer.
          mask = tf.tile(
              tf.expand_dims(nonpadding, 2), [1, 1, left_output_dim])
          hidden_state *= mask

          separable_conv_9x1 = common_layers.layers().SeparableConv1D(
              right_output_dim, 9, padding="SAME", name="separable_conv_9x1")
          hidden_state = separable_conv_9x1(hidden_state)
          hidden_state = tf.pad(
              hidden_state,
              [[0, 0], [0, 0], [0, hparams.hidden_size - right_output_dim]],
              constant_values=0)

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("self_attention"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = common_attention.multihead_attention(
              hidden_state,
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        with tf.variable_scope("dense_layers"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = common_layers.layers().Dense(
              int(hparams.hidden_size * 4),
              activation=tf.nn.relu)(hidden_state)
          hidden_state = tf.nn.dropout(
              hidden_state, 1 - hparams.layer_prepostprocess_dropout)

          hidden_state = common_layers.layers().Dense(
              hparams.hidden_size)(hidden_state)
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

    # If normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(hidden_state, hparams)
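# The gated_linear_unit branch above, sketched in isolation: two dense
# projections of the same input, one squashed through a sigmoid and
# multiplied elementwise into the other. Shapes here are illustrative.
def _example_gated_linear_unit():
  hidden_size = 8
  x = tf.random_normal([2, 10, hidden_size])  # [batch, length, hidden]
  values = tf.layers.dense(x, hidden_size)
  gates = tf.layers.dense(x, hidden_size, activation=tf.nn.sigmoid)
  return values * gates  # same shape as x; gate decides what passes through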
def encode_lex(self, encoder_input, target_space, hparams):
  """Encoder with an extra self-attention pass over the lexicon dimension.

  Args:
    encoder_input: [batch_size, input_len, lex_cap, hidden_dim]
    target_space: scalar target space id
    hparams: hyperparameters

  Returns:
    encoder_output: [batch_size, input_len, hidden_dim]
    encoder_decoder_attention_bias: [batch_size, 1, 1, input_len]
  """
  encoder_output_slices = []
  for i in range(encoder_input.get_shape()[2].value):
    encoder_input_slice = encoder_input[:, :, i, :]

    # bias
    encoder_padding = common_attention.embedding_to_padding(
        encoder_input_slice)
    print(encoder_padding.shape.as_list())  # ==> [None, None]
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    print(ignore_padding.shape.as_list())  # ==> [None, 1, 1, None]

    # add target space to encoder input
    ishape_static = encoder_input_slice.shape.as_list()
    print(ishape_static)  # ==> [None, None, 300]
    emb_target_space = common_layers.embedding(
        target_space, 32, ishape_static[-1], name="target_space_embedding")
    print(emb_target_space.shape.as_list())  # ==> [300]
    emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
    print(emb_target_space.shape.as_list())  # ==> [1, 1, 300]
    encoder_input_slice += emb_target_space
    print(encoder_input_slice.shape.as_list())  # ==> [None, None, 300]

    # add timing signals to encoder input
    if hparams.pos == "timing":
      encoder_input_slice = common_attention.add_timing_signal_1d(
          encoder_input_slice)

    # dropout
    encoder_input_slice = tf.nn.dropout(
        encoder_input_slice, 1.0 - hparams.layer_prepostprocess_dropout)

    # multihead_attention(query_antecedent [batch, length_q, channels],
    #                     memory_antecedent [batch, length_m, channels] or
    #                     None, bias, total_key_depth, total_value_depth,
    #                     output_depth, num_heads, dropout_rate, ...);
    # cache is only used for fast decoding in decoder self-attention.
    x = encoder_input_slice
    with tf.variable_scope("encoder" + str(i)):
      # remove pad
      pad_remover = None
      if hparams.use_pad_remover:
        pad_remover = expert_utils.PadRemover(
            common_attention.attention_bias_to_padding(
                encoder_self_attention_bias))

      # self-attention along the sentence dimension
      for layer in xrange(hparams.num_encoder_layers or
                          hparams.num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer):
          with tf.variable_scope("self_attention"):
            query_antecedent = common_layers.layer_preprocess(x, hparams)
            y = common_attention.multihead_attention(
                query_antecedent=query_antecedent,
                memory_antecedent=None,
                bias=encoder_self_attention_bias,
                total_key_depth=hparams.attention_key_channels or
                hparams.hidden_size,
                total_value_depth=hparams.attention_value_channels or
                hparams.hidden_size,
                output_depth=hparams.hidden_size,
                num_heads=hparams.num_heads,
                dropout_rate=hparams.attention_dropout,
                attention_type=hparams.self_attention_type,
                max_relative_position=hparams.max_relative_position)
            x = common_layers.layer_postprocess(x, y, hparams)
          with tf.variable_scope("ffn"):
            y = transformer.transformer_ffn_layer(
                common_layers.layer_preprocess(x, hparams), hparams,
                pad_remover)
            x = common_layers.layer_postprocess(x, y, hparams)

      encoder_output_slice = common_layers.layer_preprocess(x, hparams)
      print(encoder_output_slice.shape.as_list())  # ==> [None, None, 300]

    encoder_output_slices.append(encoder_output_slice)

  encoder_output = tf.stack(encoder_output_slices, 2)
  print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

  # --------

  encoder_output_slices = []
  num_heads = int(hparams.lex_cap / 2)
  # Copy of hparams with hidden_size replaced by lex_cap for the
  # lexicon-dimension attention pass.
  hparams2 = tf.contrib.training.HParams(
      layer_preprocess_sequence=hparams.layer_preprocess_sequence,
      layer_postprocess_sequence=hparams.layer_postprocess_sequence,
      layer_prepostprocess_dropout=hparams.layer_prepostprocess_dropout,
      norm_type=hparams.norm_type,
      hidden_size=hparams.lex_cap,
      norm_epsilon=hparams.norm_epsilon,
      ffn_layer=hparams.ffn_layer,
      filter_size=hparams.filter_size,
      relu_dropout=hparams.relu_dropout,
      num_heads=num_heads,
      attention_dropout=hparams.attention_dropout,
      parameter_attention_key_channels=(
          hparams.parameter_attention_key_channels),
      parameter_attention_value_channels=(
          hparams.parameter_attention_value_channels))

  for i in range(encoder_output.get_shape()[3].value):
    encoder_input_slice = encoder_output[:, :, :, i]  # [None, None, lex_cap]

    encoder_padding = common_attention.embedding_to_padding(
        encoder_input_slice)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding  # [None, 1, 1, None]

    x = encoder_input_slice
    with tf.variable_scope("encoder_extra" + str(i)):
      # remove pad
      pad_remover = None
      if hparams.use_pad_remover:
        pad_remover = expert_utils.PadRemover(
            common_attention.attention_bias_to_padding(
                encoder_self_attention_bias))

      # self-attention along the lexicon dimension
      with tf.variable_scope("layer_extra"):
        with tf.variable_scope("self_attention"):
          query_antecedent = common_layers.layer_preprocess(x, hparams2)
          y = common_attention.multihead_attention(
              query_antecedent=query_antecedent,
              memory_antecedent=None,
              bias=encoder_self_attention_bias,
              total_key_depth=hparams.attention_key_channels or
              hparams.lex_cap,
              total_value_depth=hparams.attention_value_channels or
              hparams.lex_cap,
              output_depth=hparams.lex_cap,
              num_heads=num_heads,
              dropout_rate=hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position)
          x = common_layers.layer_postprocess(x, y, hparams2)
        with tf.variable_scope("ffn"):
          y = transformer.transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams2), hparams2,
              pad_remover)
          x = common_layers.layer_postprocess(x, y, hparams2)

      encoder_output_slice = common_layers.layer_preprocess(x, hparams2)

    encoder_output_slices.append(encoder_output_slice)

  encoder_output = tf.stack(encoder_output_slices, 3)
  print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

  # --------

  # Aggregate the lexicon dimension away with a learned tensor contraction:
  # [batch, length, lex_cap, embed_len] x [embed_len, embed_len, lex_cap]
  # -> [batch, length, embed_len].
  lex_cap = encoder_output.get_shape()[2].value
  embed_len = encoder_output.get_shape()[3].value
  assert lex_cap == hparams.lex_cap
  aggregate_layer = tf.get_variable(
      name="Aggregate",
      shape=[embed_len, embed_len, lex_cap],
      initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
  # Note: the contraction must pair the encoder output's (lex_cap, embed_len)
  # axes with the aggregate tensor's (lex_cap, embed_len) axes, i.e.
  # axes=[[2, 3], [2, 1]]; pairing axis 2 with axis 1 would mismatch
  # lex_cap against embed_len.
  encoder_output = tf.tensordot(
      encoder_output, aggregate_layer, axes=[[2, 3], [2, 1]])
  print(encoder_output.shape.as_list())  # ==> [None, None, 300]

  return encoder_output, encoder_decoder_attention_bias
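# The final aggregation above, as a standalone check with illustrative sizes
# (lex_cap=4, embed_len=300): the learned tensor contracts away the lexicon
# and embedding axes of the input, leaving its own first axis as the output
# embedding.
def _example_lexicon_aggregation():
  enc = tf.random_normal([2, 7, 4, 300])  # [batch, length, lex_cap, embed]
  agg = tf.random_normal([300, 300, 4])   # [embed_out, embed_in, lex_cap]
  # Pair enc axis 2 (lex_cap) with agg axis 2, and enc axis 3 (embed_len)
  # with agg axis 1; agg axis 0 survives as the output embedding axis.
  return tf.tensordot(enc, agg, axes=[[2, 3], [2, 1]])  # [2, 7, 300]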
def moe_transformer_encoder(encoder_input,
                            encoder_self_attention_bias,
                            hparams,
                            name="moe-encoder",
                            nonpadding=None,
                            save_weights_to=None,
                            make_image_summary=True,
                            attn_bias_for_padding=None):
  """A stack of transformer MoE layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be passed
      in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used for
      pad_remover(efficiency) and to mask out padding in convolutional
      layers.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })

  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      attention_bias = encoder_self_attention_bias
      if attn_bias_for_padding is not None:
        attention_bias = attn_bias_for_padding
      padding = common_attention.attention_bias_to_padding(attention_bias)
      nonpadding = 1.0 - padding
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        x = moe_transformer_encoder_layer(
            layer, x, encoder_self_attention_bias, hparams,
            attention_dropout_broadcast_dims, save_weights_to,
            make_image_summary)

    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
def nas_encoder(encoder_input,
                encoder_self_attention_bias,
                hparams,
                nonpadding=None,
                final_layer_norm=True):
  """Encoder for configurable NAS model.

  Args:
    encoder_input: Input tensor.
    encoder_self_attention_bias: Attention bias tensor with 0s for all valid
      positions and large negative numbers for the padding positions.
    hparams: transformer.Transformer hparams that must also contain:
      + encoder_<left|right>_inputs: List of ints specifying the hidden layer
        input indexes for the <left|right> branches.
      + encoder_<left|right>_layers: String list of layers. Each string must
        be the name of a TranslationLayer registered in layers.py's
        ENCODER_LAYERS.
      + encoder_<left|right>_activations: String list of activations. Each
        string in this list must have a corresponding activation in
        ACTIVATION_MAP.
      + encoder_<left|right>_output_dims: Int list of output dimensions for
        <left|right> branch layers.
      + encoder_<left|right>_norms: String list of norms to apply to the
        <left|right> layer branches. Each item must be either LAYER_NORM_KEY
        or NO_NORM_KEY.
      + encoder_num_cells: The number of cells in the encoder. This
        determines how many times the given layers will be repeated.
      + encoder_combiner_functions: String list of functions used to combine
        left and right branches. Must be a COMBINER_FUNCTION key.
    nonpadding: Tensor with 1s at all nonpadding positions and 0s everywhere
      else. If None (default), then nonpadding will be determined from
      encoder_self_attention_bias.
    final_layer_norm: Whether or not to apply a final layer_norm to the
      output of the encoder.

  Returns:
    Encoder output and list of each encoder cell's output in order.
  """
  if nonpadding is None:
    padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    nonpadding = 1.0 - padding
  return apply_nas_layers(
      input_tensor=encoder_input,
      left_inputs=hparams.encoder_left_inputs,
      left_layers=hparams.encoder_left_layers,
      left_activations=hparams.encoder_left_activations,
      left_output_dims=hparams.encoder_left_output_dims,
      left_norms=hparams.encoder_left_norms,
      right_inputs=hparams.encoder_right_inputs,
      right_layers=hparams.encoder_right_layers,
      right_activations=hparams.encoder_right_activations,
      right_output_dims=hparams.encoder_right_output_dims,
      right_norms=hparams.encoder_right_norms,
      num_cells=hparams.encoder_num_cells,
      combiner_functions=hparams.encoder_combiner_functions,
      final_combiner_function=hparams.encoder_final_combiner_function,
      nonpadding=nonpadding,
      layer_registry=layers.ENCODER_LAYERS,
      mask_future=False,
      hparams=hparams,
      var_scope="encoder",
      final_layer_norm=final_layer_norm)
def hierarchical_context_encoder(encoder_input,
                                 encoder_self_attention_bias,
                                 contexts,
                                 context_self_attention_biases,
                                 features,
                                 hparams,
                                 name="discourse_aware_encoder",
                                 save_weights_to=None,
                                 make_image_summary=True,
                                 losses=None):
  """Hierarchical discourse-aware encoder over an input and its contexts."""
  input_x = encoder_input
  context_xs = {}
  for context_name in contexts:
    context_xs[context_name] = contexts[context_name]
  context_paddings = {}
  context_nonpaddings = {}
  context_pad_removers = {}

  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    input_padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    input_nonpadding = 1.0 - input_padding
    for context_name in context_self_attention_biases:
      context_paddings[context_name] = (
          common_attention.attention_bias_to_padding(
              context_self_attention_biases[context_name]))
      context_nonpaddings[context_name] = (
          1.0 - context_paddings[context_name])

    input_pad_remover = None
    for context_name in context_paddings:
      context_pad_removers[context_name] = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      input_pad_remover = expert_utils.PadRemover(input_padding)
      for context_name in context_paddings:
        context_pad_removers[context_name] = expert_utils.PadRemover(
            context_paddings[context_name])

    # Copy hparams, except num_hidden_layers -> num_hidden_layers - 1.
    temp_hparam = tf.contrib.training.HParams()
    for key, val in hparams.values().items():
      temp_hparam.add_hparam(key, val)
    temp_hparam.set_hparam("num_hidden_layers", hparams.num_hidden_layers - 1)

    encoder_output = transformer_with_contexts_layers.transformer_encoder(
        input_x,
        encoder_self_attention_bias,
        temp_hparam,
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)

    context_encoded_outputs = {}
    for context_name in context_xs:
      context_encoded_outputs[context_name] = (
          transformer_with_contexts_layers.transformer_encoder(
              context_xs[context_name],
              context_self_attention_biases[context_name],
              temp_hparam,
              nonpadding=features_to_nonpadding(features, context_name),
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary))

    with tf.variable_scope("hierarchical_context_encoder",
                           reuse=tf.AUTO_REUSE):
      for context_name in context_encoded_outputs:
        # self-attention feed-forward
        _y = ffn_self_attention_layer(
            context_encoded_outputs[context_name],
            hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            save_weights_to=save_weights_to,
            name="attentive_sum")
        # mean over sequence length
        context_encoded_outputs[context_name] = tf.reduce_mean(
            _y, axis=1, keep_dims=True)

      encoded_contexts = [
          context_encoded_outputs[context_name]
          for context_name in context_encoded_outputs
      ]
      encoded_contexts = tf.concat(encoded_contexts, axis=1)

      # Copy hparams, except num_hidden_layers -> 1.
      temp_hparam = tf.contrib.training.HParams()
      for key, val in hparams.values().items():
        temp_hparam.add_hparam(key, val)
      temp_hparam.set_hparam("num_hidden_layers", 1)

      context_padding = common_attention.embedding_to_padding(
          encoded_contexts)
      ignore_padding = common_attention.attention_bias_ignore_padding(
          context_padding)
      encoded_contexts = transformer_encoder(encoded_contexts,
                                             ignore_padding, temp_hparam)

    with tf.variable_scope("encoder/layer_%d" % hparams.num_hidden_layers,
                           reuse=tf.AUTO_REUSE):
      with tf.variable_scope("context_input_attention"):
        context_padding = common_attention.embedding_to_padding(
            encoded_contexts)
        ignore_padding = common_attention.attention_bias_ignore_padding(
            context_padding)
        _y = common_attention.multihead_attention(
            common_layers.layer_preprocess(encoder_output, hparams),
            encoded_contexts,
            ignore_padding,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary,
            max_relative_position=hparams.max_relative_position,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=hparams.get("max_length"),
            vars_3d=hparams.get("attention_variables_3d"))
        encoded_contexts = common_layers.layer_postprocess(
            encoder_output, _y, hparams)

      with tf.variable_scope("input_self_attention"):
        _y = common_attention.multihead_attention(
            common_layers.layer_preprocess(encoder_output, hparams),
            None,
            encoder_self_attention_bias,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            save_weights_to=save_weights_to,
            max_relative_position=hparams.max_relative_position,
            make_image_summary=make_image_summary,
            dropout_broadcast_dims=attention_dropout_broadcast_dims,
            max_length=hparams.get("max_length"),
            vars_3d=hparams.get("attention_variables_3d"))
        encoder_output = common_layers.layer_postprocess(
            encoder_output, _y, hparams)

      with tf.variable_scope("gated_sum"):
        _depth = common_layers.shape_list(encoder_output)[-1]
        gate = tf.layers.dense(
            tf.concat([encoded_contexts, encoder_output], axis=-1),
            _depth,
            activation=tf.nn.sigmoid)
        if save_weights_to:
          save_weights_to["gated_sum"] = gate
        encoder_output = gate * encoder_output + (
            1. - gate) * encoded_contexts

      with tf.variable_scope("ffn"):
        _y = transformer_ffn_layer(
            common_layers.layer_preprocess(encoder_output, hparams),
            hparams,
            input_pad_remover,
            conv_padding="SAME",
            nonpadding_mask=input_nonpadding,
            losses=losses)
        encoder_output = common_layers.layer_postprocess(
            encoder_output, _y, hparams)

  return common_layers.layer_preprocess(encoder_output, hparams)
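# The gated_sum block above, in isolation: a sigmoid gate predicted from the
# concatenation of both streams mixes them elementwise,
# out = g * inputs + (1 - g) * contexts. Shapes here are illustrative.
def _example_gated_sum():
  depth = 8
  contexts = tf.random_normal([2, 5, depth])
  inputs = tf.random_normal([2, 5, depth])
  gate = tf.layers.dense(
      tf.concat([contexts, inputs], axis=-1),
      depth,
      activation=tf.nn.sigmoid)
  return gate * inputs + (1.0 - gate) * contexts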
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convoltutional layers. save_weights_to: an optional dictionary to capture attention weights for vizualization; the weights tensor will be appended there under a string key created from the variable scope (including name). Returns: y: a Tensors """ x = encoder_input with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover: pad_remover = expert_utils.PadRemover(padding) for layer in xrange(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = common_attention.multihead_attention( common_layers.layer_preprocess(x, hparams), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, save_weights_to=save_weights_to, max_relative_position=hparams.max_relative_position) x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer(common_layers.layer_preprocess( x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it shuold also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def _fast_decode(self,
                 features,
                 decode_length,
                 beam_size=1,
                 top_beams=1,
                 alpha=1.0):
  """Fast decoding.

  Implements both greedy and beam search decoding, uses beam search iff
  beam_size > 1, otherwise beam search related arguments are ignored.

  Args:
    features: a map of string to model features.
    decode_length: an integer.  How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer.  How many of the beams to return.
    alpha: Float that controls the length penalty.  Larger the alpha,
      stronger the preference for longer translations.

  Returns:
    A dict of decoding results {
        "body_output": tensor of size
            [batch_size, <= decode_length, hidden_size]
            (or [batch_size, top_beams, <= decode_length, hidden_size])
            giving the raw output of the Transformer decoder corresponding
            to the predicted sequences
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, <= decode_length] if beam_size == 1 or
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search,
            None if using greedy decoding (beam_size=1)
    }

  Raises:
    NotImplementedError: If there are multiple data shards.
  """
  if self._num_datashards != 1:
    raise NotImplementedError("Fast decoding only supports a single shard.")
  dp = self._data_parallelism
  hparams = self._hparams
  target_modality = self._problem_hparams.target_modality
  if isinstance(target_modality, dict):
    primary_target_feature = self._problem_hparams.primary_target_modality
    primary_target_modality = target_modality[primary_target_feature]
    bottom_variable_scope = "%s/%s" % (primary_target_modality.name,
                                       primary_target_feature)
  else:
    primary_target_feature = "targets"
    primary_target_modality = target_modality
    bottom_variable_scope = target_modality.name

  inputs = features["inputs"]
  s = common_layers.shape_list(inputs)
  batch_size = s[0]
  # _shard_features called to ensure that the variable names match
  inputs = self._shard_features({"inputs": inputs})["inputs"]
  input_modality = self._problem_hparams.input_modality["inputs"]
  with tf.variable_scope(input_modality.name):
    inputs = input_modality.bottom_sharded(inputs, dp)
  with tf.variable_scope("body"):
    (encoder_output, final_encoder_state, encoder_decoder_attention_bias,
     input_length) = dp(self.encode, inputs, hparams, features=features)
  # undo the data parallelism
  encoder_output = encoder_output[0]
  final_encoder_state = final_encoder_state[0]
  encoder_decoder_attention_bias = encoder_decoder_attention_bias[0]
  input_length = input_length[0]

  def preprocess_targets(targets, i):
    """Performs preprocessing steps on the targets to prepare for the decoder.

    This includes:
      - Embedding the ids.
      - Flattening to 3D tensor.
      - Optionally adding timing signals.

    Args:
      targets: inputs ids to the decoder. [batch_size, 1]
      i: scalar, Step number of the decoding loop.

    Returns:
      Processed targets [batch_size, 1, hidden_dim]
    """
    # _shard_features called to ensure that the variable names match
    targets = self._shard_features(
        {primary_target_feature: targets})[primary_target_feature]
    with tf.variable_scope(bottom_variable_scope):
      targets = primary_target_modality.targets_bottom_sharded(targets, dp)[0]
    targets = common_layers.flatten4d3d(targets)

    # At step 0, targets will have 0 size, and instead we want to
    # create an embedding of all zeros, corresponding to the start symbol.
    # This matches the common_layers.shift_right() that we do at training
    # time.
    targets = tf.cond(
        tf.equal(i, 0), lambda: tf.zeros_like(targets), lambda: targets)
    return targets

  hparams_decoder = copy.copy(self._hparams)
  hparams_decoder.hidden_size = 2 * self._hparams.hidden_size

  def dp_initial_state(encoder_output, input_length, final_encoder_state):
    if beam_size > 1:
      tiled_encoder_output = tf.contrib.seq2seq.tile_batch(
          encoder_output, beam_size)
      tiled_input_length = tf.contrib.seq2seq.tile_batch(
          input_length, beam_size)
      tiled_final_encoder_state = tf.contrib.seq2seq.tile_batch(
          final_encoder_state, beam_size)
    else:
      tiled_encoder_output = encoder_output
      tiled_input_length = input_length
      tiled_final_encoder_state = final_encoder_state

    decoder_cell = construct_decoder_cell(hparams_decoder,
                                          tiled_encoder_output,
                                          tiled_input_length)
    initial_state = decoder_cell.zero_state(
        batch_size * beam_size,
        tf.float32).clone(cell_state=tiled_final_encoder_state)

    # HACK: expand_dims on the time dimension to appease the t2t beam_search
    # code (which does not deal with scalars well).  The time dimension is
    # only used to write TensorArrays, which we don't need.
    initial_state = initial_state._replace(
        time=tf.zeros((batch_size * beam_size,), dtype=tf.int32))

    # HACK: t2t's beam_search expects everything to be untiled initially,
    # will tile and then leave everything tiled; contrib.seq2seq's beam
    # search instead expects everything tiled from the beginning.  We just
    # created the fully tiled initial state, so now we "untile" it by taking
    # the first beam of each batch element.  This is ok because all beams
    # have the same value initially.
    initial_state = tf.contrib.framework.nest.map_structure(
        lambda x: x[::beam_size], initial_state)

    return decoder_cell, initial_state

  with tf.variable_scope("body"):
    decoder_cell, initial_state = self._data_parallelism(
        dp_initial_state, encoder_output, input_length, final_encoder_state)

  def symbols_to_logits_fn(ids, i, cache):
    """Go from ids to logits for next symbol."""
    ids = ids[:, -1:]
    targets = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
    targets = preprocess_targets(targets, i)

    logits = self._symbols_to_logits_fn(targets, features, decoder_cell,
                                        cache)

    logits = tf.squeeze(logits, axis=[1, 2, 3])
    return logits, cache

  cache = dict()
  infer_out = dict()
  if encoder_output is not None:
    padding_mask = 1. - common_attention.attention_bias_to_padding(
        encoder_decoder_attention_bias)
    masked_encoded_output = encoder_output * tf.expand_dims(
        padding_mask, axis=2)
    infer_out["encoded_inputs"] = tf.reduce_sum(masked_encoded_output, axis=1)

  self._prepare_decoder_cache(batch_size, beam_size, features, cache)
  cache["cell_state"] = initial_state
  cache["encoder_output"] = encoder_output
  cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

  ret = common.fast_decode(
      symbols_to_logits_fn=symbols_to_logits_fn,
      hparams=hparams,
      decode_length=decode_length,
      vocab_size=primary_target_modality.top_dimensionality,
      beam_size=beam_size,
      top_beams=top_beams,
      alpha=alpha,
      batch_size=batch_size,
      force_decode_length=self._decode_hparams.force_decode_length,
      cache=cache)
  infer_out.update(ret)

  new_outputs = dict()
  target_modality = self._problem_hparams.target_modality
  for key in target_modality:
    if key == self._problem_hparams.primary_target_modality:
      new_outputs[key] = infer_out["outputs"]
      del infer_out["outputs"]
    else:
      decoded_ids = ret["outputs_" + key]
      if beam_size > 1:
        if top_beams == 1:
          decoded_ids = decoded_ids[:, 0, :]
        else:
          decoded_ids = decoded_ids[:, :top_beams, :]
      # remove the extra dimension that was added to appease the shape
      # invariants
      new_outputs[key] = tf.squeeze(decoded_ids, axis=-1)
      del infer_out["outputs_" + key]
  infer_out["outputs"] = new_outputs

  return infer_out
def transformer_encoder(encoder_input, encoder_self_attention_bias, hparams, name="encoder", nonpadding=None, save_weights_to=None, make_image_summary=True, losses=None): """A stack of transformer layers. Args: encoder_input: a Tensor encoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model name: a string nonpadding: optional Tensor with shape [batch_size, encoder_length] indicating what positions are not padding. This must either be passed in, which we do for "packed" datasets, or inferred from encoder_self_attention_bias. The knowledge about padding is used for pad_remover(efficiency) and to mask out padding in convolutional layers. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). make_image_summary: Whether to make an attention image summary. losses: optional list onto which to append extra training losses Returns: y: a Tensors """ x = encoder_input attention_dropout_broadcast_dims = ( common_layers.comma_separated_string_to_integer_list( getattr(hparams, "attention_dropout_broadcast_dims", ""))) with tf.variable_scope(name): if nonpadding is not None: padding = 1.0 - nonpadding else: padding = common_attention.attention_bias_to_padding( encoder_self_attention_bias) nonpadding = 1.0 - padding pad_remover = None if hparams.use_pad_remover and not common_layers.is_on_tpu(): pad_remover = expert_utils.PadRemover(padding) for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): # sg: imdb comments y = common_attention.multihead_attention( common_layers.layer_preprocess( x, hparams), # added layer norm None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, # 128 hparams.attention_value_channels or hparams.hidden_size, # 128 hparams.hidden_size, # 128 hparams.num_heads, # 4 hparams.attention_dropout, # 0.1 attention_type=hparams. self_attention_type, # 'dot_product' save_weights_to=save_weights_to, max_relative_position=hparams. max_relative_position, # 0 make_image_summary=make_image_summary, dropout_broadcast_dims=attention_dropout_broadcast_dims, max_length=hparams.get("max_length")) # 256 x = common_layers.layer_postprocess(x, y, hparams) with tf.variable_scope("ffn"): y = transformer_ffn_layer(common_layers.layer_preprocess( x, hparams), hparams, pad_remover, conv_padding="SAME", nonpadding_mask=nonpadding, losses=losses) x = common_layers.layer_postprocess(x, y, hparams) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def transformer_encoder_gate(encoder_input,
                             encoder_self_attention_bias,
                             hparams,
                             name="encoder"):
  """A stack of transformer layers with tanh-gated residual streams.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensor
  """
  x = encoder_input
  with tf.variable_scope(name):
    pad_remover = None
    if hparams.use_pad_remover:
      pad_remover = expert_utils.PadRemover(
          common_attention.attention_bias_to_padding(
              encoder_self_attention_bias))
    for layer in xrange(hparams.num_encoder_layers or
                        hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position)
          x = common_layers.layer_postprocess(x, y, hparams)
          # Gate the residual stream with a tanh-activated width-1
          # convolution.  The gate lives inside the "self_attention" scope
          # so its variable name does not collide with the FFN gate below.
          gate_filter = tf.get_variable(
              "gate_layer_%d" % layer,
              [1, hparams.hidden_size, hparams.hidden_size],
              tf.float32,
              initializer=tf.contrib.layers.xavier_initializer())
          gate_x = tf.tanh(tf.nn.conv1d(x, gate_filter, 1, "SAME"))
          x *= gate_x
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams), hparams,
              pad_remover)
          x = common_layers.layer_postprocess(x, y, hparams)
          gate_filter = tf.get_variable(
              "gate_layer_%d" % layer,
              [1, hparams.hidden_size, hparams.hidden_size],
              tf.float32,
              initializer=tf.contrib.layers.xavier_initializer())
          gate_x = tf.tanh(tf.nn.conv1d(x, gate_filter, 1, "SAME"))
          x *= gate_x
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
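# The tanh convolutional gate above, in isolation: a width-1 conv1d produces
# a per-position, per-channel gate in (-1, 1) that rescales the residual
# stream. Sizes below are illustrative.
def _example_conv_gate():
  hidden = 8
  x = tf.random_normal([2, 10, hidden])  # [batch, length, hidden]
  gate_filter = tf.get_variable(
      "example_gate", [1, hidden, hidden],
      initializer=tf.contrib.layers.xavier_initializer())
  return x * tf.tanh(tf.nn.conv1d(x, gate_filter, 1, "SAME"))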
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None):
  """A stack of transformer layers with configurable per-layer types.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in
      convolutional layers.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover:
      pad_remover = expert_utils.PadRemover(padding)
    sequence_length = usr_utils.get_length_from_nonpadding(nonpadding)
    for layer in xrange(hparams.num_encoder_layers or
                        hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        for layer_type in _iter_layer_types(hparams.encoder_layer_types,
                                            layer):
          if layer_type == "self_att":
            with tf.variable_scope("self_attention"):
              y = model_helper.multihead_attention_qkv(
                  common_layers.layer_preprocess(x, hparams),
                  None,
                  None,
                  encoder_self_attention_bias,
                  hparams.attention_key_channels or hparams.hidden_size,
                  hparams.attention_value_channels or hparams.hidden_size,
                  hparams.hidden_size,
                  hparams.num_heads,
                  hparams.attention_dropout,
                  attention_type=hparams.encoder_self_attention_type,
                  attention_order=hparams.attention_order,
                  max_relative_position=hparams.max_relative_position)
              x = common_layers.layer_postprocess(x, y, hparams)
          elif layer_type == "rnn":
            with tf.variable_scope("recurrent"):
              y = transformer_rnn_layer(
                  common_layers.layer_preprocess(x, hparams),
                  sequence_length, hparams)
              x = common_layers.layer_postprocess(x, y, hparams)
          elif layer_type == "birnn":
            with tf.variable_scope("recurrent"):
              y = transformer_rnn_layer(
                  common_layers.layer_preprocess(x, hparams),
                  sequence_length, hparams, bidirectional=True)
              x = common_layers.layer_postprocess(x, y, hparams)
          else:
            tf.logging.warn("Ignoring '%s' in encoder_layer_types" %
                            layer_type)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def universal_transformer_encoder(encoder_input,
                                  encoder_self_attention_bias,
                                  hparams,
                                  name="encoder",
                                  nonpadding=None,
                                  save_weights_to=None,
                                  make_image_summary=True):
  """Universal Transformer encoder function.

  Prepares all the arguments and the inputs and passes them to a
  universal_transformer_layer to encode the encoder_input.  (A standalone
  sketch of the shared-layer recurrence follows this function.)

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor as the output of the encoder
    extra_output: which can be used to pass extra information to the body
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    ffn_unit = functools.partial(
        universal_transformer_util.transformer_encoder_ffn_unit,
        hparams=hparams,
        nonpadding_mask=nonpadding,
        pad_remover=pad_remover)
    attention_unit = functools.partial(
        universal_transformer_util.transformer_encoder_attention_unit,
        hparams=hparams,
        encoder_self_attention_bias=encoder_self_attention_bias,
        attention_dropout_broadcast_dims=attention_dropout_broadcast_dims,
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)
    x, extra_output = universal_transformer_layer(
        x, hparams, ffn_unit, attention_unit, pad_remover=pad_remover)
    if hparams.get("use_memory_as_last_state", False):
      x = extra_output  # which is memory
    return common_layers.layer_preprocess(x, hparams), extra_output
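# Hedged sketch (concept only): a Universal Transformer applies one shared
# layer function repeatedly over depth instead of a stack of distinct
# layers. This toy NumPy loop shows the recurrence-in-depth idea; the real
# universal_transformer_layer additionally handles step/position signals
# and adaptive computation time, which are omitted here.
import numpy as np

def universal_encode_np(x, shared_step_fn, num_steps):
  """x: [batch, length, hidden]; shared_step_fn is reused at every step."""
  for step in range(num_steps):
    x = shared_step_fn(x, step)  # same parameters at every depth
  return x

rng = np.random.default_rng(0)
w = rng.normal(scale=0.1, size=(8, 8))        # one shared weight matrix
step = lambda x, t: np.tanh(x @ w) + x        # residual "layer", tied over steps
out = universal_encode_np(rng.normal(size=(2, 5, 8)), step, num_steps=6)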
def hierarchical_attention_network_encoder(
    encoder_input,
    encoder_self_attention_bias,
    contexts,
    context_self_attention_biases,
    features,
    hparams,
    name="hierarchical_attention_network_encoder",
    save_weights_to=None,
    make_image_summary=True,
    losses=None):
  """Hierarchical attention network encoder.

  Encodes the inputs and a set of context sequences, abstracts the contexts
  at the word level and the sentence level via attention, and fuses the
  result with the source encoding through a learned gate.
  """
  input_x = encoder_input
  context_xs = {}
  for context_name in contexts:
    context_xs[context_name] = contexts[context_name]
  context_paddings = {}
  context_nonpaddings = {}
  context_pad_removers = {}
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    input_padding = common_attention.attention_bias_to_padding(
        encoder_self_attention_bias)
    input_nonpadding = 1.0 - input_padding
    for context_name in context_self_attention_biases:
      context_paddings[context_name] = (
          common_attention.attention_bias_to_padding(
              context_self_attention_biases[context_name]))
      context_nonpaddings[context_name] = (
          1.0 - context_paddings[context_name])
    input_pad_remover = None
    for context_name in context_paddings:
      context_pad_removers[context_name] = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      input_pad_remover = expert_utils.PadRemover(input_padding)
      for context_name in context_paddings:
        context_pad_removers[context_name] = expert_utils.PadRemover(
            context_paddings[context_name])
    # Copy hparams except num_hidden_layers -> num_hidden_layers - 1.
    temp_hparam = tf.contrib.training.HParams()
    for key, val in hparams.values().items():
      temp_hparam.add_hparam(key, val)
    temp_hparam.set_hparam("num_hidden_layers",
                           hparams.num_hidden_layers - 1)
    encoder_output = transformer_with_contexts_layers.transformer_encoder(
        input_x,
        encoder_self_attention_bias,
        temp_hparam,
        nonpadding=features_to_nonpadding(features, "inputs"),
        save_weights_to=save_weights_to,
        make_image_summary=make_image_summary)
    context_encoded_outputs = {}
    for context_name in context_xs:
      context_encoded_outputs[context_name] = (
          transformer_with_contexts_layers.transformer_encoder(
              context_xs[context_name],
              context_self_attention_biases[context_name],
              hparams,
              nonpadding=features_to_nonpadding(features, context_name),
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary))
    with tf.variable_scope('word_abstraction', reuse=tf.AUTO_REUSE):
      # q_w = f_w(h_t)
      encoder_word_level_query = common_layers.dense(
          encoder_output, hparams.hidden_size)
      encoder_word_level_abstraction = {}
      for context_name in context_encoded_outputs:
        # s^j
        encoder_word_level_abstraction[context_name] = (
            transformer_with_contexts_layers.multihead_attention(
                common_layers.layer_preprocess(encoder_word_level_query,
                                               hparams),
                context_encoded_outputs[context_name],
                context_self_attention_biases[context_name],
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                attention_type=hparams.self_attention_type,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                max_relative_position=hparams.max_relative_position,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d")))
      sentence_information = tf.concat([
          encoder_word_level_abstraction[context_name]
          for context_name in encoder_word_level_abstraction
      ], axis=1)
    with tf.variable_scope('sentence_abstraction', reuse=tf.AUTO_REUSE):
      # q_s = f_s(h_t)
      encoder_sentence_level_query = common_layers.dense(
          encoder_output, hparams.hidden_size)
      context_padding = common_attention.embedding_to_padding(
          sentence_information)
      ignore_padding = common_attention.attention_bias_ignore_padding(
          context_padding)
      # MultiHead(q_s, s^j), [batch, encoder_length, hidden_dim]
      contextual_information = (
          transformer_with_contexts_layers.multihead_attention(
              common_layers.layer_preprocess(encoder_sentence_level_query,
                                             hparams),
              sentence_information,
              ignore_padding,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              max_relative_position=hparams.max_relative_position,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d")))
      contextual_information = common_layers.dense_relu_dense(
          contextual_information, hparams.filter_size, hparams.hidden_size)
    with tf.variable_scope('context_gating', reuse=tf.AUTO_REUSE):
      # Gated fusion of source encoding and contextual information (a
      # standalone sketch of this gate follows the function).
      gate_lambda = tf.nn.sigmoid(
          common_layers.dense(contextual_information, hparams.hidden_size) +
          common_layers.dense(encoder_output, hparams.hidden_size))
      encoder_output = gate_lambda * encoder_output + (
          1 - gate_lambda) * contextual_information
    return common_layers.layer_preprocess(encoder_output, hparams)
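# Hedged sketch: the context gate above is a learned convex combination of
# the source encoding h and the contextual summary c,
#   lambda = sigmoid(W1 c + W2 h),  output = lambda * h + (1 - lambda) * c.
# Standalone NumPy rendering for clarity; the weight names are illustrative.
import numpy as np

def sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))

def context_gate_np(h, c, w1, w2):
  """h, c: [batch, length, hidden]; w1, w2: [hidden, hidden]."""
  lam = sigmoid(c @ w1 + h @ w2)    # per-position, per-channel gate
  return lam * h + (1.0 - lam) * c  # lam -> 1 keeps the source encoding

rng = np.random.default_rng(0)
h = rng.normal(size=(2, 5, 8))
c = rng.normal(size=(2, 5, 8))
w1 = rng.normal(scale=0.1, size=(8, 8))
w2 = rng.normal(scale=0.1, size=(8, 8))
fused = context_gate_np(h, c, w1, w2)  # same shape as h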
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True,
                        losses=None,
                        attn_bias_for_padding=None):
  """A stack of transformer layers.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses
    attn_bias_for_padding: Padded attention bias in case a unidirectional
      encoder is being used where future attention is masked.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      attention_bias = encoder_self_attention_bias
      if attn_bias_for_padding is not None:
        attention_bias = attn_bias_for_padding
      padding = common_attention.attention_bias_to_padding(attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          if layer < hparams.get("num_area_layers", 0):
            max_area_width = hparams.get("max_area_width", 1)
            max_area_height = hparams.get("max_area_height", 1)
            memory_height = hparams.get("memory_height", 1)
          else:
            max_area_width = 1
            max_area_height = 1
            memory_height = 1
          # hard_attention_k > 0 requests hard top-k attention (a
          # standalone sketch follows this function).
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"),
              hard_attention_k=hparams.get("hard_attention_k", 0),
              gumbel_noise_weight=hparams.get("gumbel_noise_weight", 0.0),
              max_area_width=max_area_width,
              max_area_height=max_area_height,
              memory_height=memory_height,
              area_key_mode=hparams.get("area_key_mode", "none"),
              area_value_mode=hparams.get("area_value_mode", "none"),
              training=(hparams.get("mode", tf.estimator.ModeKeys.TRAIN) ==
                        tf.estimator.ModeKeys.TRAIN))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams),
              hparams,
              pad_remover,
              conv_padding="SAME",
              nonpadding_mask=nonpadding,
              losses=losses)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
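# Hedged sketch: `hard_attention_k` above requests hard top-k attention,
# i.e. each query attends only to its k highest-scoring keys. This NumPy
# toy shows one common realization (mask all but the top-k logits before
# the softmax); the library's exact variant may differ.
import numpy as np

def top_k_attention_np(logits, k):
  """logits: [queries, keys] -> softmax over only the top-k keys."""
  kth = np.sort(logits, axis=-1)[:, -k][:, None]  # k-th largest per query
  masked = np.where(logits >= kth, logits, -1e9)  # drop everything else
  e = np.exp(masked - masked.max(axis=-1, keepdims=True))
  return e / e.sum(axis=-1, keepdims=True)

weights = top_k_attention_np(np.array([[2.0, 1.0, 0.5, -0.3]]), k=2)
# -> weight only on the two largest logits; the other positions are ~0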
def transformer_encoder(encoder_input,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder",
                        nonpadding=None,
                        save_weights_to=None,
                        make_image_summary=True):
  """A stack of transformer layers with sparsity-aware attention.

  Args:
    encoder_input: a Tensor
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This must either be
      passed in, which we do for "packed" datasets, or inferred from
      encoder_self_attention_bias.  The knowledge about padding is used
      for pad_remover (efficiency) and to mask out padding in
      convolutional layers.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = encoder_input
  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
      value=hparams.num_encoder_layers or hparams.num_hidden_layers)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DROPOUT,
      value=hparams.attention_dropout)
  mlperf_log.transformer_print(
      key=mlperf_log.MODEL_HP_ATTENTION_DENSE,
      value={
          "use_bias": "false",
          "num_heads": hparams.num_heads,
          "hidden_size": hparams.hidden_size
      })
  with tf.variable_scope(name):
    if nonpadding is not None:
      padding = 1.0 - nonpadding
    else:
      padding = common_attention.attention_bias_to_padding(
          encoder_self_attention_bias)
      nonpadding = 1.0 - padding
    pad_remover = None
    if hparams.use_pad_remover and not common_layers.is_xla_compiled():
      pad_remover = expert_utils.PadRemover(padding)
    for layer in range(hparams.num_encoder_layers or
                       hparams.num_hidden_layers):
      initial_sparsity = None
      if hparams.get("load_masks_from"):
        initial_sparsity = hparams.get("initial_sparsity")
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          # Sparsity-aware attention; a standalone pruning sketch follows
          # this function.
          y = sparse_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              encoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              sparsity_technique=hparams.get("sparsity_technique"),
              threshold=hparams.get("log_alpha_threshold"),
              training=hparams.get("mode") == tf_estimator.ModeKeys.TRAIN,
              clip_alpha=hparams.get("clip_log_alpha"),
              initial_sparsity=initial_sparsity,
              split_heads=hparams.get("split_heads"))
          x = common_layers.layer_postprocess(x, y, hparams)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams), hparams,
              pad_remover)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NORM,
        value={"hidden_size": hparams.hidden_size})
    return common_layers.layer_preprocess(x, hparams)
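# Hedged sketch: the sparsity knobs above (sparsity_technique, thresholds,
# initial_sparsity) prune the attention weight matrices. As one concrete
# instance, magnitude pruning keeps the largest-magnitude fraction of a
# weight matrix and zeroes the rest; this NumPy helper is illustrative
# only and does not mirror the sparse_attention internals.
import numpy as np

def magnitude_prune_np(w, sparsity):
  """Zero out the smallest-|w| entries so a `sparsity` fraction is zero."""
  k = int(np.round(sparsity * w.size))
  if k == 0:
    return w.copy()
  threshold = np.partition(np.abs(w).ravel(), k - 1)[k - 1]
  return np.where(np.abs(w) > threshold, w, 0.0)

rng = np.random.default_rng(0)
w = rng.normal(size=(4, 4))
w_sparse = magnitude_prune_np(w, sparsity=0.5)  # ~half the entries zeroed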
def transformer_encoder(encoder_input,
                        raw_inputs,
                        encoder_self_attention_bias,
                        hparams,
                        name="encoder"):
  """A stack of transformer layers with positional signals and
  configurable per-layer types.

  Args:
    encoder_input: a Tensor
    raw_inputs: the raw input tokens, used to derive sequence lengths and
      positional signals
    encoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string

  Returns:
    y: a Tensor
  """
  x = encoder_input
  with tf.variable_scope(name):
    raw_encoder_input = tf.squeeze(raw_inputs, axis=[-2, -1])
    sequence_length = usr_utils.get_length_from_raw(
        raw_encoder_input)  # Used for RNNs
    pos_signals = generate_positional_signals(raw_encoder_input, hparams)
    pos_embeddings = generate_positional_embeddings(
        pos_signals, hparams.encoder_pos, hparams)
    attention_pos_embeddings = generate_positional_embeddings(
        pos_signals, hparams.encoder_attention_pos, hparams)
    # Integrate positional embeddings either additively or through a
    # feed-forward projection (a standalone sketch of both modes follows
    # this function).
    if "sum" in hparams.pos_integration:
      x = x + pos_embeddings
    elif "ffn" in hparams.pos_integration:
      with tf.variable_scope("pos_ffn"):
        x = tf.concat([x, pos_embeddings], axis=2)
        x = transformer_ffn_layer(x, hparams)
    pad_remover = None
    if hparams.use_pad_remover:
      pad_remover = expert_utils.PadRemover(
          common_attention.attention_bias_to_padding(
              encoder_self_attention_bias))
    for layer in xrange(hparams.num_encoder_layers or
                        hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        for layer_type in _iter_layer_types(hparams.encoder_layer_types,
                                            layer):
          if layer_type == "self_att":
            with tf.variable_scope("self_attention"):
              y = model_helper.multihead_attention_qkv(
                  common_layers.layer_preprocess(x, hparams),
                  None,
                  None,
                  encoder_self_attention_bias,
                  hparams.attention_key_channels or hparams.hidden_size,
                  hparams.attention_value_channels or hparams.hidden_size,
                  hparams.hidden_size,
                  hparams.num_heads,
                  hparams.attention_dropout,
                  attention_type=hparams.encoder_self_attention_type,
                  attention_order=hparams.attention_order,
                  max_relative_position=hparams.max_relative_position)
              x = common_layers.layer_postprocess(x, y, hparams)
          elif layer_type == "rnn":
            with tf.variable_scope("recurrent"):
              y = transformer_rnn_layer(
                  common_layers.layer_preprocess(x, hparams),
                  sequence_length, hparams)
              x = common_layers.layer_postprocess(x, y, hparams)
          elif layer_type == "birnn":
            with tf.variable_scope("recurrent"):
              y = transformer_rnn_layer(
                  common_layers.layer_preprocess(x, hparams),
                  sequence_length, hparams, bidirectional=True)
              x = common_layers.layer_postprocess(x, y, hparams)
          elif (layer_type == "pos_self_att" and
                attention_pos_embeddings is not None):
            with tf.variable_scope("pos_self_attention"):
              y = model_helper.multihead_attention_qkv(
                  attention_pos_embeddings,  # Query
                  attention_pos_embeddings,  # Key
                  common_layers.layer_preprocess(x, hparams),  # Value
                  encoder_self_attention_bias,
                  hparams.attention_key_channels or hparams.hidden_size,
                  hparams.attention_value_channels or hparams.hidden_size,
                  hparams.hidden_size,
                  hparams.num_heads,
                  hparams.attention_dropout,
                  attention_type=hparams.pos_self_attention_type,
                  attention_order=hparams.attention_order,
                  max_relative_position=hparams.max_relative_position)
              x = common_layers.layer_postprocess(x, y, hparams)
          else:
            tf.logging.warn("Ignoring '%s' in encoder_layer_types" %
                            layer_type)
        with tf.variable_scope("ffn"):
          y = transformer_ffn_layer(
              common_layers.layer_preprocess(x, hparams), hparams,
              pad_remover)
          x = common_layers.layer_postprocess(x, y, hparams)
    # If normalization is done in layer_preprocess, then it should also be
    # done on the output, since the output can grow very large, being the
    # sum of a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
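# Hedged sketch: the two positional-integration modes above either add the
# positional embeddings to the input ("sum") or concatenate and project
# them ("ffn"). A minimal NumPy rendering of both paths, with a single
# ReLU projection standing in for transformer_ffn_layer (an assumption,
# not the library's exact layer):
import numpy as np

def integrate_positions_np(x, pos, mode, w_proj=None):
  """x, pos: [batch, length, hidden]; w_proj: [2*hidden, hidden]."""
  if mode == "sum":
    return x + pos                              # additive integration
  elif mode == "ffn":
    concat = np.concatenate([x, pos], axis=-1)  # widen to 2*hidden
    return np.maximum(concat @ w_proj, 0.0)     # project back down
  raise ValueError("unknown pos_integration mode: %s" % mode)

rng = np.random.default_rng(0)
x = rng.normal(size=(2, 5, 8))
pos = rng.normal(size=(2, 5, 8))
w = rng.normal(scale=0.1, size=(16, 8))
summed = integrate_positions_np(x, pos, "sum")
projected = integrate_positions_np(x, pos, "ffn", w)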