def encode(self, inputs, contexts, target_space, hparams, features=None,
           losses=None):
  inputs = common_layers.flatten4d3d(inputs)
  _contexts = {}
  for context_name in contexts:
    _contexts[context_name] = common_layers.flatten4d3d(
        contexts[context_name])

  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))
  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  context_inputs = {}
  self_ctxt_attention_biases = {}
  encoder_decoder_ctxt_attention_biases = {}
  for context_name in _contexts:
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      (context_input, self_ctxt_attention_bias,
       encoder_decoder_ctxt_attention_bias) = (
           transformer_prepare_encoder(
               _contexts[context_name], target_space, hparams,
               features=features))
      context_input = tf.nn.dropout(
          context_input, 1.0 - hparams.layer_prepostprocess_dropout)
      context_inputs[context_name] = context_input
      self_ctxt_attention_biases[context_name] = self_ctxt_attention_bias
      encoder_decoder_ctxt_attention_biases[
          context_name] = encoder_decoder_ctxt_attention_bias

  encoder_output = discourse_aware_transformer_encoder_with_context(
      encoder_input,
      self_attention_bias,
      context_inputs,
      self_ctxt_attention_biases,
      features,
      hparams,
      save_weights_to=self.attention_weights,
      losses=losses)

  return encoder_output, self_attention_bias
def encode(self, stories, questions, target_space, hparams,
           unused_features=None):
  """Encode transformer inputs.

  Args:
    stories: story inputs, concatenated with `questions` along the length
      (second) axis before encoding.
    questions: question inputs with the same batch and hidden dimensions as
      `stories`.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    unused_features: optionally pass the entire features dictionary as well.
      This is needed now for "packed" datasets.

  Returns:
    encoder_output: Encoder representation.
      [batch_size, input_length, hidden_dim]
  """
  inputs = tf.concat([stories, questions], axis=1)
  # inputs = common_layers.flatten4d3d(inputs)
  (encoder_input, encoder_self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer.transformer_encoder(
      encoder_input,
      encoder_self_attention_bias,
      hparams,
      # nonpadding=features_to_nonpadding(features, "inputs"),
      save_weights_to=self.attention_weights)
  return encoder_output
def encode(self, inputs, target_space, hparams, features=None):
  """Encode transformer inputs.

  Args:
    inputs: Transformer inputs [batch_size, input_length, input_height,
      hidden_dim] which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.

  Returns:
    Tuple of:
      encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
      encoder_extra_output: extra encoder output used in some variants of the
        model (e.g. in ACT, to pass the ponder-time to body)
  """
  inputs = common_layers.flatten4d3d(inputs)

  (encoder_input, self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  (encoder_output,
   encoder_extra_output) = r_transformer_util.r_transformer_encoder(
       encoder_input,
       self_attention_bias,
       hparams,
       nonpadding=transformer.features_to_nonpadding(features, "inputs"),
       save_weights_to=self.attention_weights)

  return encoder_output, encoder_extra_output
def encode_no_lookup(self, embedded_inputs, inputs_mask):
  """Encoder step for transformer given already-embedded inputs.

  Args:
    embedded_inputs: float tensor with shape
      [batch_size, input_length, emb_size].
    inputs_mask: tensor with shape [batch_size, input_length]

  Returns:
    Tuple of:
      encoder_output: float tensor with shape
        [batch_size, input_length, hidden_size]
      encoder_extra_output: extra output from the Universal Transformer
        encoder (e.g. ACT ponder statistics)
  """
  (encoder_input, self_attention_bias, _) = (
      t2t_transformer.transformer_prepare_encoder(
          embedded_inputs, self.target_space, self.hparams))
  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - self.hparams.layer_prepostprocess_dropout)
  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          self.hparams,
          nonpadding=inputs_mask,
          save_weights_to=self.model.attention_weights))
  return encoder_output, encoder_extra_output
def transformer_text_encoder(inputs, target_space, hparams, name=None):
  """Transformer text encoder over inputs with unmasked full attention.

  Args:
    inputs: Tensor of shape [batch, length, 1, hparams.hidden_size].
    target_space: int. Used for encoding inputs under a target space id.
    hparams: tf.contrib.training.HParams.
    name: string, variable scope.

  Returns:
    encoder_output: Tensor of shape [batch, length, hparams.hidden_size].
    ed: Tensor of shape [batch, 1, 1, length]. Encoder-decoder attention bias
      for any padded tokens.
  """
  with tf.variable_scope(name, default_name="transformer_text_encoder"):
    inputs = common_layers.flatten4d3d(inputs)
    [
        encoder_input,
        encoder_self_attention_bias,
        ed,
    ] = transformer.transformer_prepare_encoder(
        inputs, target_space=target_space, hparams=hparams)
    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
    encoder_output = transformer.transformer_encoder(
        encoder_input, encoder_self_attention_bias, hparams)
    return encoder_output, ed
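# Hedged usage sketch for transformer_text_encoder above (TF1 graph mode,
# tensor2tensor installed; the placeholder shape and target_space value are
# assumptions for illustration, not part of the original snippet):
import tensorflow as tf
from tensor2tensor.models import transformer

hparams = transformer.transformer_base()
# Already-embedded tokens in the 4D layout T2T uses before flatten4d3d.
tokens = tf.placeholder(tf.float32, [None, None, 1, hparams.hidden_size])
encoder_output, ed = transformer_text_encoder(
    tokens, target_space=0, hparams=hparams, name="text_enc")
# `ed` can be fed to transformer.transformer_decoder as the
# encoder-decoder attention bias.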
def body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_self_attention_bias,
   encoder_decoder_attention_bias) = (
       transformer.transformer_prepare_encoder(inputs, target_space, hparams))
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = tf.nn.dropout(
      decoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  encoder_output = transformer_revnet_encoder(
      encoder_input, encoder_self_attention_bias, hparams)

  decoder_output = transformer_revnet_decoder(
      decoder_input, encoder_output, decoder_self_attention_bias,
      encoder_decoder_attention_bias, hparams)
  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
def transformer_text_encoder(x, space_id, hparams,
                             name="transformer_text_encoder"):
  """Transformer text encoder over inputs with unmasked full attention.

  Args:
    x: Tensor of shape [batch, length, 1, hparams.hidden_size].
    space_id: int, id.
    hparams: tf.contrib.training.HParams.
    name: string, variable scope.

  Returns:
    encoder_output: Tensor of shape [batch, length, hparams.hidden_size].
    ed: Tensor of shape [batch, 1, 1, length]. Encoder-decoder attention bias
      for any padded tokens.
  """
  with tf.variable_scope(name):
    x = common_layers.flatten4d3d(x)
    (encoder_input, encoder_self_attention_bias,
     ed) = transformer.transformer_prepare_encoder(x, space_id, hparams)
    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
    encoder_output = transformer.transformer_encoder(
        encoder_input, encoder_self_attention_bias, hparams)
    return encoder_output, ed
def transformer_encoder_ht(inputs, target_space, hparams, features=None,
                           losses=None):
  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))

  # encoder_input = tf.nn.dropout(encoder_input,
  #                               1.0 - hparams.layer_prepostprocess_dropout)

  encoder_output = transformer.transformer_encoder(
      encoder_input,
      self_attention_bias,
      hparams,
      # nonpadding=transformer.features_to_nonpadding(features, "inputs"),
      nonpadding=None,
      save_weights_to=None,
      losses=losses)

  # encoder_output = tf.expand_dims(encoder_output, 2)

  return encoder_output
def transformer_text_encoder(x, space_id, hparams,
                             name="transformer_text_encoder"):
  """Transformer text encoder over inputs with unmasked full attention.

  Args:
    x: Tensor of shape [batch, length, 1, hparams.hidden_size].
    space_id: int, id.
    hparams: tf.contrib.training.HParams.
    name: string, variable scope.

  Returns:
    encoder_output: Tensor of shape [batch, length, hparams.hidden_size].
    ed: Tensor of shape [batch, 1, 1, length]. Encoder-decoder attention bias
      for any padded tokens.
  """
  with tf.variable_scope(name):
    x = common_layers.flatten4d3d(x)
    (encoder_input, encoder_self_attention_bias,
     ed) = transformer.transformer_prepare_encoder(x, space_id, hparams)
    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
    return transformer.transformer_encoder(encoder_input,
                                           encoder_self_attention_bias,
                                           hparams), ed
def universal_transformer_encoder(inputs, target_space, hparams,
                                  features=None, make_image_summary=False):
  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "inputs"),
          save_weights_to=None,
          make_image_summary=make_image_summary))
  if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
    ponder_times, remainders = encoder_extra_output
    act_loss = hparams.act_loss_weight * tf.reduce_mean(
        ponder_times + remainders)
    return encoder_output, act_loss
  else:
    return encoder_output, tf.constant(0.0, tf.float32)
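# Hedged usage sketch for the function above (TF1 graph mode; assumes the
# ACT-enabled Universal Transformer hparams registered in tensor2tensor, so
# the second return value is a real ponder penalty rather than 0.0):
import tensorflow as tf
from tensor2tensor.models.research import universal_transformer

hparams = universal_transformer.adaptive_universal_transformer_base()
inputs = tf.placeholder(tf.float32, [None, None, hparams.hidden_size])
encoder_output, act_loss = universal_transformer_encoder(
    inputs, target_space=tf.constant(0), hparams=hparams)
task_loss = tf.constant(0.0)  # stand-in for the real training objective
total_loss = task_loss + act_loss  # ACT ponder cost folded into the loss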
def body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_self_attention_bias,
   encoder_decoder_attention_bias) = (
       transformer.transformer_prepare_encoder(inputs, target_space, hparams))
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = tf.nn.dropout(decoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  encoder_output = transformer_revnet_encoder(
      encoder_input, encoder_self_attention_bias, hparams)

  decoder_output = transformer_revnet_decoder(
      decoder_input, encoder_output, decoder_self_attention_bias,
      encoder_decoder_attention_bias, hparams)
  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
def encode(x, x_space, hparams, name):
  """Transformer preparations and encoder."""
  with tf.variable_scope(name):
    (encoder_input, encoder_self_attention_bias,
     ed) = transformer.transformer_prepare_encoder(x, x_space, hparams)
    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
    return transformer.transformer_encoder(
        encoder_input, encoder_self_attention_bias, hparams), ed
def create_t2t_transformer_encoder(
    x_in: "tf.Tensor",
    mask: "tf.Tensor",
    attention_weights: Dict[Text, "tf.Tensor"],
    hparams: "HParams",
    C2: float,
    is_training: "tf.Tensor",
) -> "tf.Tensor":
    """Create t2t transformer encoder."""
    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
        x = create_tf_fnn(
            x_in,
            [hparams.hidden_size],
            hparams.layer_prepostprocess_dropout,
            C2,
            is_training,
            layer_name_suffix="pre_embed",
            activation=None,
            use_bias=False,
            kernel_initializer=tf.random_normal_initializer(
                0.0, hparams.hidden_size**-0.5),
        )
        if hparams.multiply_embedding_mode == "sqrt_depth":
            x *= hparams.hidden_size**0.5
        x *= tf.expand_dims(mask, -1)

        (
            x,
            self_attention_bias,
            encoder_decoder_attention_bias,
        ) = transformer_prepare_encoder(x, None, hparams)

        x *= tf.expand_dims(mask, -1)
        x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout)

        attn_bias_for_padding = None
        # Otherwise the encoder will just use encoder_self_attention_bias.
        if hparams.unidirectional_encoder:
            attn_bias_for_padding = encoder_decoder_attention_bias

        x = transformer_encoder(
            x,
            self_attention_bias,
            hparams,
            nonpadding=mask,
            save_weights_to=attention_weights,
            attn_bias_for_padding=attn_bias_for_padding,
        )
        x *= tf.expand_dims(mask, -1)

        return tf.nn.dropout(tf.nn.relu(x),
                             1.0 - hparams.layer_prepostprocess_dropout)
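# Hedged usage sketch for create_t2t_transformer_encoder (TF1 graph mode).
# The feature width, C2 value, and hparams set are assumptions, and the
# create_tf_fnn helper from the snippet above must be in scope:
import tensorflow as tf
from tensor2tensor.models.transformer import transformer_base

hparams = transformer_base()
x_in = tf.placeholder(tf.float32, [None, None, 128])  # [batch, time, feats]
mask = tf.placeholder(tf.float32, [None, None])       # 1.0 = real token
is_training = tf.placeholder_with_default(False, shape=())
attention_weights = {}  # filled with attention maps by the encoder

encoded = create_t2t_transformer_encoder(
    x_in, mask, attention_weights, hparams, C2=0.002, is_training=is_training)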
def encode(self, inputs, target_space, hparams, features=None, losses=None,
           **kwargs):
  """Encode Universal Transformer inputs.

  It is similar to "transformer.encode", but it uses
  "universal_transformer_util.universal_transformer_encoder" instead of
  "transformer.transformer_encoder".

  Args:
    inputs: Transformer inputs [batch_size, input_length, input_height,
      hidden_dim] which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: Unused.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
      encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
      encoder_extra_output: which is extra encoder output used in some
        variants of the model (e.g. in ACT, to pass the ponder-time to body)
  """
  del losses

  inputs = common_layers.flatten4d3d(inputs)

  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))

  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "inputs"),
          save_weights_to=self.attention_weights))

  return encoder_output, encoder_decoder_attention_bias, encoder_extra_output
def encoder(name, hparams, inputs, target_space):
  """Compute encoder outputs and attention bias."""
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = (transformer_prepare_encoder(
         inputs, target_space, hparams))
    encoder_input = tf.nn.dropout(
        encoder_input, rate=hparams.layer_prepostprocess_dropout)
    encoder_output = transformer_encoder(encoder_input,
                                         encoder_self_attention_bias, hparams)
  return encoder_output, encoder_decoder_attention_bias
def transformer_text_encoder(inputs, space_id, hparams,
                             name="transformer_text_enc"):
  """Transformer text encoder."""
  with tf.variable_scope(name):
    x = common_layers.flatten4d3d(inputs)
    (encoder_input, encoder_self_attention_bias,
     ed) = transformer.transformer_prepare_encoder(x, space_id, hparams)
    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
    return transformer.transformer_encoder(encoder_input,
                                           encoder_self_attention_bias,
                                           hparams), ed
def encode(self, inputs, target_space, hparams, features=None, losses=None):
  """Encode inputs using _encoder().

  This performs the same way as transformer.Transformer.encode with the
  encoder portion replaced with _encoder().

  Args:
    inputs: Input [batch_size, input_length, input_height, hidden_dim] tensor
      which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: Hyperparameters for model.
    features: Optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: Unused list of losses.

  Returns:
    Tuple of:
      encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]

  Raises:
    ValueError: If encoder type not found.
  """
  inputs = common_layers.flatten4d3d(inputs)

  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))

  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  encoder_output = self._encoder(
      encoder_input,
      self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, "inputs"),
      save_weights_to=self.attention_weights)

  return encoder_output, encoder_decoder_attention_bias
def te_encode(input_seq, hparams, target_space, features, name):
  input_seq = common_layers.flatten4d3d(input_seq)
  (encoder_input, encoder_self_attention_bias, _) = (
      transformer_prepare_encoder(input_seq, target_space, hparams))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer_encoder(
      encoder_input,
      encoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "input_seq"))
  encoder_output = tf.expand_dims(encoder_output, 2)
  return encoder_output
def _prepare_encoder(self, inputs, target_space):
  """Process the transformer encoder inputs."""
  inputs = common_layers.flatten4d3d(inputs)

  output = transformer.transformer_prepare_encoder(
      inputs,
      target_space,
      self._hparams,
      features=None,
  )
  enco_input, enco_self_att_bias, enco_deco_att_bias = output

  enco_input = tf.nn.dropout(
      enco_input, 1.0 - self._hparams.layer_prepostprocess_dropout)

  return enco_input, enco_self_att_bias, enco_deco_att_bias
def encode(self, inputs, target_space, hparams, features=None, losses=None):
  """Encode Universal Transformer inputs.

  It is similar to "transformer.encode", but it uses
  "universal_transformer_util.universal_transformer_encoder" instead of
  "transformer.transformer_encoder".

  Args:
    inputs: Transformer inputs [batch_size, input_length, input_height,
      hidden_dim] which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: Unused.

  Returns:
    Tuple of:
      encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
      encoder_extra_output: which is extra encoder output used in some
        variants of the model (e.g. in ACT, to pass the ponder-time to body)
  """
  del losses

  inputs = common_layers.flatten4d3d(inputs)

  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))

  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)

  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "inputs"),
          save_weights_to=self.attention_weights))

  return encoder_output, encoder_decoder_attention_bias, encoder_extra_output
def encode(self, features, input_key):
  hparams = self._hparams
  inputs = common_layers.flatten4d3d(features[input_key])

  (encoder_input, encoder_self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, problem.SpaceID.EN_TOK,
                                              hparams))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer.transformer_encoder(
      encoder_input,
      encoder_self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, input_key))
  encoder_output = tf.reduce_mean(encoder_output, axis=1)

  return encoder_output
def sim_encode(inputs, target_space, hparams, features):
  # inputs = tf.Print(inputs, [tf.shape(inputs)], "input", summarize=10)
  inputs = common_layers.flatten4d3d(inputs)
  (encoder_input, encoder_self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer.transformer_encoder(
      encoder_input,
      encoder_self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, "inputs"))
  positional_mean = tf.nn.l2_normalize(tf.reduce_mean(encoder_output, 1), 1)
  # out_norm = tf.norm(positional_mean)
  # positional_mean = tf.Print(positional_mean, [out_norm],
  #                            "enc_out: (should be b_size**0.5) ",
  #                            summarize=10)
  # positional_mean = tf.Print(positional_mean, [tf.shape(positional_mean)],
  #                            "enc_out: (should be (b_size, h_size)) ",
  #                            summarize=10)
  return positional_mean
def model_fn_body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features.get("inputs")
  target_space = features.get("target_space_id")

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_attention_bias,
   _) = transformer.transformer_prepare_encoder(inputs, target_space, hparams)
  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams)

  # We need masks of the form batch size x input sequences.
  # Biases seem to be of the form batch_size x 1 x input sequences x vec dim.
  # Squeeze out dim one, and get the first element of each vector.
  encoder_mask = tf.squeeze(encoder_attention_bias, [1])[:, :, 0]
  decoder_mask = tf.squeeze(decoder_self_attention_bias, [1])[:, :, 0]

  def residual_fn(x, y):
    return common_layers.layer_norm(
        x + tf.nn.dropout(y, 1.0 - hparams.residual_dropout))

  encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
  decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)

  encoder_output = alt_transformer_encoder(encoder_input, residual_fn,
                                           encoder_mask, hparams)

  decoder_output = alt_transformer_decoder(decoder_input, encoder_output,
                                           residual_fn, decoder_mask,
                                           encoder_attention_bias, hparams)

  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
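# Hedged sketch of the bias-to-mask trick the comment above describes: T2T
# attention biases store 0.0 for real positions and a large negative value
# (about -1e9) for padding, so slicing one row of the bias and thresholding
# it recovers a per-token mask. Shapes here are illustrative assumptions.
import numpy as np

bias = np.zeros((1, 1, 1, 4), dtype=np.float32)  # [batch, 1, 1, length]
bias[0, 0, 0, 3] = -1e9                          # last token is padding
mask = (bias[:, 0, 0, :] > -1.0).astype(np.float32)
print(mask)  # [[1. 1. 1. 0.]]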
def universal_transformer_encoder(inputs, target_space, hparams,
                                  features=None, make_image_summary=False):
  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))
  encoder_input = tf.nn.dropout(encoder_input,
                                1.0 - hparams.layer_prepostprocess_dropout)
  (encoder_output, encoder_extra_output) = (
      universal_transformer_util.universal_transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          nonpadding=transformer.features_to_nonpadding(features, "inputs"),
          save_weights_to=None,
          make_image_summary=make_image_summary))
  # encoder_output = tf.expand_dims(encoder_output, 2)
  return encoder_output
def body(self, features):
  hparams = self._hparams
  inputs = features["inputs"]
  target_space = features["target_space_id"]

  inputs = common_layers.flatten4d3d(inputs)

  (encoder_input, encoder_self_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))

  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)
  encoder_output = transformer.transformer_encoder(
      encoder_input,
      encoder_self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, "inputs"))
  encoder_output = encoder_output[:, :1, :]
  encoder_output = tf.expand_dims(encoder_output, 2)

  return encoder_output
def model_fn_body(self, features):
  hparams = self._hparams
  targets = features["targets"]
  inputs = features.get("inputs")
  target_space = features.get("target_space_id")

  inputs = common_layers.flatten4d3d(inputs)
  targets = common_layers.flatten4d3d(targets)

  (encoder_input, encoder_attention_bias, _) = (
      transformer.transformer_prepare_encoder(inputs, target_space, hparams))
  (decoder_input, _) = (transformer.transformer_prepare_decoder(targets,
                                                                hparams))

  encoder_mask = bias_to_mask(encoder_attention_bias)

  def residual_fn(x, y):
    return common_layers.layer_norm(
        x + tf.nn.dropout(y, 1.0 - hparams.residual_dropout))

  encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
  decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)

  encoder_output = alt_transformer_encoder(encoder_input, residual_fn,
                                           encoder_mask, hparams)

  decoder_output = alt_transformer_decoder(decoder_input, encoder_output,
                                           residual_fn,
                                           encoder_attention_bias, hparams)

  decoder_output = tf.expand_dims(decoder_output, 2)

  return decoder_output
def transformer_text_encoder(x, space_id, hparams,
                             name="transformer_text_encoder"):
  """Transformer text encoder over inputs with unmasked full attention.

  Args:
    x: Tensor of shape [batch, length, 1, hidden_dim], flattened to 3D before
      encoding.
    space_id: int, id.
    hparams: HParams, hyperparameters.
    name: string, variable scope.

  Returns:
    x: Tensor of shape [batch, length, hidden_dim].
    ed: Tensor, attention bias for padded tokens in the input, shape
      [batch, 1, 1, length].
  """
  with tf.variable_scope(name):
    x = common_layers.flatten4d3d(x)
    (encoder_input, encoder_self_attention_bias,
     ed) = transformer.transformer_prepare_encoder(x, space_id, hparams)
    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
    return transformer.transformer_encoder(encoder_input,
                                           encoder_self_attention_bias,
                                           hparams), ed
def main():
  FLAGS = Args()

  # Enable TF Eager execution
  tfe = tf.contrib.eager
  tfe.enable_eager_execution()

  batch_inputs = input_generator()

  # initialize translation model
  hparams_set = FLAGS.hparams_set
  Modes = tf.estimator.ModeKeys
  hparams = trainer_lib.create_hparams(
      hparams_set, data_dir=FLAGS.data_dir, problem_name=FLAGS.problem)
  translate_model = registry.model(FLAGS.model)(hparams, Modes.EVAL)

  # recover parameters and conduct recurrent conduction
  ckpt_dir = tf.train.latest_checkpoint(FLAGS.model_dir)
  with tfe.restore_variables_on_create(ckpt_dir):
    with variable_scope.EagerVariableStore().as_default():
      features = {'inputs': batch_inputs}
      with tf.variable_scope('universal_transformer/body'):
        input_tensor = tf.convert_to_tensor(features['inputs'])
        input_tensor = common_layers.flatten4d3d(input_tensor)
        encoder_input, self_attention_bias, _ = (
            transformer.transformer_prepare_encoder(
                input_tensor, tf.convert_to_tensor([0]),
                translate_model.hparams, features=None))

      with tf.variable_scope('universal_transformer/body/encoder'):
        ffn_unit = functools.partial(
            universal_transformer_util.transformer_encoder_ffn_unit,
            hparams=translate_model.hparams)
        attention_unit = functools.partial(
            universal_transformer_util.transformer_encoder_attention_unit,
            hparams=translate_model.hparams,
            encoder_self_attention_bias=None,
            attention_dropout_broadcast_dims=[],
            save_weights_to={},
            make_image_summary=True)

      storing_list = []
      transformed_state = encoder_input
      for step_index in range(1024):
        storing_list.append(transformed_state.numpy())
        with tf.variable_scope(
            'universal_transformer/body/encoder/universal_transformer_{}'
            .format(FLAGS.ut_type)):
          transformed_state = universal_transformer_util.step_preprocess(
              transformed_state,
              tf.convert_to_tensor(step_index % FLAGS.step_num),
              translate_model.hparams)
        with tf.variable_scope(
            'universal_transformer/body/encoder/universal_transformer_{}/'
            'rec_layer_0'.format(FLAGS.ut_type)):
          transformed_new_state = ffn_unit(attention_unit(transformed_state))
        with tf.variable_scope('universal_transformer/body/encoder'):
          if (step_index + 1) % FLAGS.step_num == 0:
            transformed_new_state = common_layers.layer_preprocess(
                transformed_new_state, translate_model.hparams)
          if step_index == 5:
            print(transformed_new_state)
        transformed_state = transformed_new_state

      storing_list = np.asarray(storing_list)
      np.save(FLAGS.save_dir, storing_list)
def main():
  FLAGS = Args()

  # Enable TF Eager execution
  tfe = tf.contrib.eager
  tfe.enable_eager_execution()

  # sample sentence
  input_str = ('Twas brillig, and the slithy toves Did gyre and gimble in '
               'the wade; All mimsy were the borogoves, And the mome raths '
               'outgrabe.')

  # convert sentence into index in vocab
  wmt_problem = problems.problem(FLAGS.problem)
  encoders = wmt_problem.feature_encoders(FLAGS.data_dir)
  inputs = encoders["inputs"].encode(input_str) + [1]  # add EOS id
  batch_inputs = tf.reshape(inputs, [1, -1, 1])  # Make it 3D.
  features = {"inputs": batch_inputs}

  # initialize translation model
  hparams_set = FLAGS.hparams_set
  Modes = tf.estimator.ModeKeys
  hparams = trainer_lib.create_hparams(
      hparams_set, data_dir=FLAGS.data_dir, problem_name=FLAGS.problem)
  translate_model = registry.model(FLAGS.model)(hparams, Modes.EVAL)

  # recover parameters and conduct recurrent conduction
  ckpt_dir = tf.train.latest_checkpoint(FLAGS.model_dir)
  with tfe.restore_variables_on_create(ckpt_dir):
    with variable_scope.EagerVariableStore().as_default():
      with tf.variable_scope('universal_transformer'):
        # Convert word index to word embedding
        features = translate_model.bottom(features)
      with tf.variable_scope('universal_transformer/body'):
        input_tensor = tf.convert_to_tensor(features['inputs'])
        input_tensor = common_layers.flatten4d3d(input_tensor)
        encoder_input, self_attention_bias, _ = (
            transformer.transformer_prepare_encoder(
                input_tensor, tf.convert_to_tensor([0]),
                translate_model.hparams, features=None))

      with tf.variable_scope('universal_transformer/body/encoder'):
        ffn_unit = functools.partial(
            universal_transformer_util.transformer_encoder_ffn_unit,
            hparams=translate_model.hparams)
        attention_unit = functools.partial(
            universal_transformer_util.transformer_encoder_attention_unit,
            hparams=translate_model.hparams,
            encoder_self_attention_bias=None,
            attention_dropout_broadcast_dims=[],
            save_weights_to={},
            make_image_summary=True)

      storing_list = []
      transformed_state = encoder_input
      for step_index in range(1024):
        storing_list.append(transformed_state.numpy())
        with tf.variable_scope(
            'universal_transformer/body/encoder/universal_transformer_{}'
            .format(FLAGS.ut_type)):
          transformed_state = universal_transformer_util.step_preprocess(
              transformed_state,
              tf.convert_to_tensor(step_index % FLAGS.step_num),
              translate_model.hparams)
        with tf.variable_scope(
            'universal_transformer/body/encoder/universal_transformer_{}/'
            'rec_layer_0'.format(FLAGS.ut_type)):
          transformed_new_state = ffn_unit(attention_unit(transformed_state))
        with tf.variable_scope('universal_transformer/body/encoder'):
          if (step_index + 1) % FLAGS.step_num == 0:
            transformed_new_state = common_layers.layer_preprocess(
                transformed_new_state, translate_model.hparams)
          if step_index == 5:
            print(transformed_new_state)
        transformed_state = transformed_new_state

      storing_list = np.asarray(storing_list)
      np.save(FLAGS.save_dir, storing_list)
def encode(self, inputs, target_space, hparams, features=None, losses=None,
           **kwargs):
  """Encode Universal Transformer inputs.

  It is similar to "transformer.encode", but it uses
  "universal_transformer_util.universal_transformer_encoder" instead of
  "transformer.transformer_encoder".

  Args:
    inputs: Transformer inputs [batch_size, input_length, input_height,
      hidden_dim] which will be flattened along the two spatial dimensions.
    target_space: scalar, target space ID.
    hparams: hyperparameters for model.
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.
    losses: Unused.
    **kwargs: additional arguments to pass to encoder_function

  Returns:
    Tuple of:
      encoder_output: Encoder representation.
        [batch_size, input_length, hidden_dim]
      encoder_decoder_attention_bias: Bias and mask weights for
        encoder-decoder attention. [batch_size, input_length]
      encoder_extra_output: which is extra encoder output used in some
        variants of the model (e.g. in ACT, to pass the ponder-time to body)
  """
  #### DEBUG ####
  # with open("invertible_UT_params.json", "w") as f:
  #   json.dump(dict(hparams.__dict__), f,
  #             default=lambda o: '<not serializable>', sort_keys=True,
  #             indent=4, separators=(',', ': '))
  # sys.exit()

  del losses

  inputs = common_layers.flatten4d3d(inputs)

  encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
      transformer.transformer_prepare_encoder(
          inputs, target_space, hparams, features=features))

  encoder_input = tf.nn.dropout(
      encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)

  (encoder_output, encoder_extra_output) = (invertible_UT_encoder(
      encoder_input,
      self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, "inputs"),
      save_weights_to=self.attention_weights))

  for var in tf.trainable_variables():
    print(var)

  return encoder_output, encoder_decoder_attention_bias, encoder_extra_output
def encode(self, encoder_input, target_space, hparams):
  dir_path = os.path.dirname(os.path.realpath(__file__))
  config_file = os.path.join(dir_path, "config.yml")
  with open(config_file) as f:
    config = yaml.safe_load(f)
  enc_name = config["model_params"].split('_')[0][3:]

  if enc_name == "simple":
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = (
         transformer.transformer_prepare_encoder(
             encoder_input, target_space, hparams))
    encoder_input = tf.nn.dropout(
        encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = transformer.transformer_encoder(
        encoder_input, encoder_self_attention_bias, hparams)
  else:
    (encoder_input, encoder_self_attention_bias_slices,
     encoder_decoder_attention_bias_slices) = (
         parallel_transformer_prepare_encoder(
             encoder_input, target_space, hparams))
    encoder_input = tf.nn.dropout(
        encoder_input, 1.0 - hparams.layer_prepostprocess_dropout)
    encoder_output = getattr(encode_fn, enc_name)(
        encoder_input, encoder_self_attention_bias_slices, hparams, "encoder")
    encoder_decoder_attention_bias = tf.stack(
        encoder_decoder_attention_bias_slices)
    encoder_decoder_attention_bias = tf.reduce_mean(
        encoder_decoder_attention_bias, 0)

  return encoder_output, encoder_decoder_attention_bias
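# Hedged sketch of the naming convention the snippet above appears to assume
# in config.yml: "model_params" starts with "enc<name>", so stripping the
# first three characters of the first underscore-separated token yields the
# encoder name. The example values are hypothetical.
for model_params in ["encsimple_base", "encparallel_big"]:
  enc_name = model_params.split('_')[0][3:]
  print(model_params, "->", enc_name)  # -> simple, parallel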
def vae_transformer_internal(inputs, targets, target_space, hparams):
  """VAE Transformer, main step used for training."""
  with tf.variable_scope("vae_transformer"):
    is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN

    # Prepare inputs, targets, and k.
    inputs = common_layers.flatten4d3d(inputs)
    targets = common_layers.flatten4d3d(targets)
    k = 2**hparams.num_compress_steps
    _, targets = common_layers.pad_to_same_length(
        inputs, targets, final_length_divisible_by=k)

    # Transformer preparations and encoder.
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = transformer.transformer_prepare_encoder(
         inputs, target_space, hparams)
    residual_fn = transformer.get_residual_fn(hparams)
    encoder_input = tf.nn.dropout(encoder_input,
                                  1.0 - hparams.residual_dropout)
    encoder_output = transformer.transformer_encoder(
        encoder_input, residual_fn, encoder_self_attention_bias, hparams)

    def get_decoder_autoregressive():
      """Decoder input for autoregressive computation."""
      (a, b) = transformer.transformer_prepare_decoder(targets, hparams)
      return (a, b, tf.constant(0.0))

    # 10% of the time we compress all-zeros, as will be at decoding start.
    prob_targets = 0.9 if is_training else 1.0
    to_compress = tf.cond(
        tf.less(tf.random_uniform([]), prob_targets),
        lambda: targets, lambda: tf.zeros_like(targets))
    z, kl_loss = compress_vae(to_compress, hparams, "vae")

    # Decompress.
    for i in range(hparams.num_compress_steps):
      j = hparams.num_hidden_layers - i - 1
      z = decompress(z, hparams, "decompress_%d" % j)

    def get_decoder_from_vae():
      """Decoder input computed by VAE."""
      # Return decoder stuff.
      (a, b) = transformer.transformer_prepare_decoder(
          tf.squeeze(z, axis=2), hparams)
      return (a, b, kl_loss)

    # Randomize decoder inputs.
    prob_do_vae = common_layers.inverse_exp_decay(40000) * 0.7
    step = tf.to_float(tf.contrib.framework.get_global_step())
    if not is_training:
      prob_do_vae = tf.cond(
          tf.less(step, 40000.0),
          lambda: tf.constant(0.0), lambda: tf.constant(1.0))
    (decoder_input, decoder_self_attention_bias, kl_loss2) = tf.cond(
        tf.less(tf.random_uniform([]), prob_do_vae),
        get_decoder_from_vae, get_decoder_autoregressive)

    # Transformer decoder.
    decoder_output = transformer.transformer_decoder(
        decoder_input, encoder_output, residual_fn,
        decoder_self_attention_bias, encoder_decoder_attention_bias, hparams)
    decoder_output = tf.expand_dims(decoder_output, 2)

    cond_self = tf.cond(
        tf.less(step, 30000.0),
        lambda: tf.constant(1.0), lambda: tf.constant(0.0))
    prob_self = 0.4 if is_training else cond_self
    (ret, kl_loss) = tf.cond(
        tf.less(tf.random_uniform([]), prob_self),
        lambda: (z, kl_loss), lambda: (decoder_output, kl_loss2))

    kl_loss *= common_layers.inverse_exp_decay(50000) * 2.0
    return ret, kl_loss
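# Hedged numpy sketch of the warm-up schedule used above:
# common_layers.inverse_exp_decay(max_step) rises exponentially from a small
# minimum toward 1.0 at max_step. The formula below mirrors tensor2tensor's
# common_layers implementation; treat the exact constants as an assumption.
import numpy as np

def inverse_exp_decay(step, max_step, min_value=0.01):
  inv_base = np.exp(np.log(min_value) / max_step)
  return inv_base ** max(max_step - step, 0.0)

for step in [0, 10000, 20000, 40000]:
  # prob_do_vae ramps from near 0 to 0.7 over the first 40k steps
  print(step, 0.7 * inverse_exp_decay(step, 40000))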
def encode_decode_task(features, hparams, train, attention_weights=None):
  """Model core graph for the one-shot action.

  Args:
    features: a dictionary that contains "inputs", a tensor of shape
      [batch_size, num_tokens]; "verb_id_seq", of shape
      [batch_size, num_actions]; and "object_spans" and "param_span" tensors
      of shape [batch_size, num_actions, 2]. 0 is used for padding or
      non-existent values.
    hparams: the general hyperparameters for the model.
    train: the train mode (unused).
    attention_weights: the dict to keep attention weights for analysis.

  Returns:
    loss_dict: the losses for training.
    prediction_dict: the predictions for action tuples.
    areas: the area encodings of the task.
    scope: the embedding scope.
  """
  del train
  input_embeddings, scope = common_embed.embed_tokens(
      features["task"], hparams.task_vocab_size, hparams.hidden_size, hparams)
  with tf.variable_scope("encode_decode", reuse=tf.AUTO_REUSE):
    encoder_nonpadding = tf.minimum(tf.to_float(features["task"]), 1.0)
    input_embeddings = tf.multiply(
        tf.expand_dims(encoder_nonpadding, 2), input_embeddings)
    encoder_input, self_attention_bias, encoder_decoder_attention_bias = (
        transformer.transformer_prepare_encoder(
            input_embeddings, None, hparams, features=None))
    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1.0 - hparams.layer_prepostprocess_dropout)
    if hparams.instruction_encoder == "transformer":
      encoder_output = transformer.transformer_encoder(
          encoder_input,
          self_attention_bias,
          hparams,
          save_weights_to=attention_weights,
          make_image_summary=not common_layers.is_xla_compiled())
    else:
      raise ValueError("Unsupported instruction encoder %s" %
                       (hparams.instruction_encoder))
    span_rep = hparams.get("span_rep", "area")
    area_encodings, area_starts, area_ends = area_utils.compute_sum_image(
        encoder_output, max_area_width=hparams.max_span)
    current_shape = tf.shape(area_encodings)
    if span_rep == "area":
      area_encodings, _, _ = area_utils.compute_sum_image(
          encoder_output, max_area_width=hparams.max_span)
    elif span_rep == "basic":
      area_encodings = area_utils.compute_alternative_span_rep(
          encoder_output,
          input_embeddings,
          max_area_width=hparams.max_span,
          hidden_size=hparams.hidden_size,
          advanced=False)
    elif span_rep == "coref":
      area_encodings = area_utils.compute_alternative_span_rep(
          encoder_output,
          input_embeddings,
          max_area_width=hparams.max_span,
          hidden_size=hparams.hidden_size,
          advanced=True)
    else:
      raise ValueError("Unsupported span representation %s" % span_rep)
    areas = {}
    areas["encodings"] = area_encodings
    areas["starts"] = area_starts
    areas["ends"] = area_ends
    with tf.control_dependencies([
        tf.print("encoder_output", tf.shape(encoder_output)),
        tf.assert_equal(current_shape, tf.shape(area_encodings),
                        summarize=100)
    ]):
      paddings = tf.cast(tf.less(self_attention_bias, -1), tf.int32)
      padding_sum, _, _ = area_utils.compute_sum_image(
          tf.expand_dims(tf.squeeze(paddings, [1, 2]), 2),
          max_area_width=hparams.max_span)
    num_areas = common_layers.shape_list(area_encodings)[1]
    area_paddings = tf.reshape(
        tf.minimum(tf.to_float(padding_sum), 1.0), [-1, num_areas])
    areas["bias"] = area_paddings
    decoder_nonpadding = tf.to_float(
        tf.greater(features["verb_refs"][:, :, 1],
                   features["verb_refs"][:, :, 0]))
    if hparams.instruction_encoder == "lstm":
      hparams_decoder = copy.copy(hparams)
      hparams_decoder.set_hparam("pos", "none")
    else:
      hparams_decoder = hparams
    decoder_input, decoder_self_attention_bias = _prepare_decoder_input(
        area_encodings, decoder_nonpadding, features, hparams_decoder,
        embed_scope=scope)
    decoder_input = tf.nn.dropout(
        decoder_input, keep_prob=1.0 - hparams.layer_prepostprocess_dropout)
    if hparams.instruction_decoder == "transformer":
      decoder_output = transformer.transformer_decoder(
          decoder_input=decoder_input,
          encoder_output=encoder_output,
          decoder_self_attention_bias=decoder_self_attention_bias,
          encoder_decoder_attention_bias=encoder_decoder_attention_bias,
          hparams=hparams_decoder)
    else:
      raise ValueError("Unsupported instruction decoder %s" %
                       (hparams.instruction_decoder))
    return decoder_output, decoder_nonpadding, areas, scope