def decoder(name, latents, hparams, decoder_self_attention_bias, **kwargs):
  """Compute final hidden states for p(y|z,x)."""
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    decoder_input = drop_2d(latents, hparams.mode, hparams.decoder_2d_dropout)
    if hparams.pos_attn:
      decoder_input = gops.positional_attention(
          "pos_attn", decoder_input, decoder_self_attention_bias, hparams)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    if common_layers.shape_list(latents)[-1] != hparams.hidden_size:
      decoder_input = gops.dense("lat2hid", latents, hparams.hidden_size)
    decoder_output = transformer_decoder_layers(
        "block",
        n_layers=hparams.n_decoder_layers,
        decoder_input=decoder_input,
        hparams=hparams,
        decoder_self_attention_bias=decoder_self_attention_bias,
        **kwargs)
    batch_size, targets_length = common_layers.shape_list(decoder_output)[:2]
    decoder_output = tf.reshape(
        decoder_output,
        [batch_size, targets_length, 1, hparams.hidden_size])  # Expand since t2t expects 4d tensors.
    return decoder_output
def transformer_prepare_decoder(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(
          common_layers.shape_list(targets)[1]))
  if features and "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias += common_attention.attention_bias_same_segment(
        targets_segmentation, targets_segmentation)
  else:
    targets_position = None
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(targets)[1])
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    if targets_position is not None:
      decoder_input = common_attention.add_timing_signal_1d_given_position(
          decoder_input, targets_position)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
def cond_prior(name, hparams, decoder_input, targets_mask, output_size,
               decoder_self_attention_bias, init_scale=0.0, **kwargs):
  """Compute hidden states for parameters for conditional prior."""
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
    decoder_input = tf.nn.dropout(
        decoder_input, rate=hparams.layer_prepostprocess_dropout)
    decoder_output = transformer_decoder_layers(
        "block",
        n_layers=hparams.n_posterior_layers,
        decoder_input=decoder_input,
        hparams=hparams,
        decoder_self_attention_bias=decoder_self_attention_bias,
        **kwargs)
    decoder_output = gops.dense_weightnorm(
        "h2o_out", decoder_output, output_size, targets_mask,
        init_scale=init_scale, init=False)
    return decoder_output
def attention_lm_moe_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
      to implement masked attention and possibly biases for diagonal alignments
    pad_remover (expert_utils.PadRemover): a utility object to remove padding
  """
  targets_pad_mask = common_attention.embedding_to_padding(targets)
  with tf.name_scope("pad_remover"):
    pad_remover = expert_utils.PadRemover(targets_pad_mask)

  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepended(targets_pad_mask))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  decoder_input = common_layers.shift_left_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias, pad_remover)
def attention_lm_moe_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
      to implement masked attention and possibly biases for diagonal alignments
    pad_remover (expert_utils.PadRemover): a utility object to remove padding
  """
  targets_pad_mask = common_attention.embedding_to_padding(targets)
  with tf.name_scope("pad_remover"):
    # Because of the shift_right, the <eos> token will be considered as
    # padding. In practice it doesn't really matter: due to the triangular
    # mask, this token should never be attended to.
    pad_remover = expert_utils.PadRemover(targets_pad_mask)

  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepend_inputs_full_attention(
            targets_pad_mask))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias, pad_remover)
def transformer_prepare_encoder2(encoder_input, target_space, hparams,
                                 emb_name):
  """The same as the existing module, except the embedding can be named."""
  # Compute bias.
  ishape_static = encoder_input.shape.as_list()
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(encoder_input)[1])

  # Append target_space_id embedding to encoder_input.
  id_values = [
      value for attr, value in vars(problem.SpaceID).items()
      if not attr.startswith("__")
  ]
  id_cur = int(max(id_values) + 1)
  emb_target_space = common_layers.embedding(
      target_space, id_cur, ishape_static[-1], name=emb_name)
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space

  # Position embedding.
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def prepare_image_question_encoder(image_feat, question, hparams):
  """Prepare encoder.

  Args:
    image_feat: a Tensor.
    question: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
  """
  encoder_input = tf.concat([image_feat, question], axis=1)
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  # Usual case - not a packed dataset.
  if hparams.pos == "timing":
    question = common_attention.add_timing_signal_1d(question)
  elif hparams.pos == "emb":
    question = common_attention.add_positional_embedding(
        question, hparams.max_length, "inputs_positional_embedding", None)
  encoder_input = tf.concat([image_feat, question], axis=1)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def transformer_prepare_encoder(inputs, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: [batch_size, input_length, hidden_dim]
    hparams: hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
      [batch_size, input_length, hidden_dim]
    encoder_self_attention_bias: a bias tensor for use in encoder
      self-attention [batch_size, input_length]
    top_layer_attention_bias: a bias tensor for use in top layer
      classification [batch_size, input_length]
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  top_layer_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(inputs)[1])
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias, top_layer_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  encoder_decoder_attention_bias = ignore_padding
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
def __init__(self, input, word_size=500000, embedding_dim=30, batch_num=10,
             use_position_encoding=False, use_diy=True):
  self.input = input
  self.word_size = word_size
  self.embedding_dim = embedding_dim
  self.batch_num = batch_num
  with tf.name_scope("word_embedding"):
    self.word_W = tf.get_variable(
        name="word_W",
        shape=[word_size, embedding_dim],
        initializer=tf.orthogonal_initializer())
    self.embedding_input = tf.nn.embedding_lookup(self.word_W, self.input)
  if use_position_encoding:
    if use_diy:
      self.postional_encoding_add = self.positional_encoding_layer(
          input=self.embedding_input)
    else:
      self.postional_encoding_add = common_attention.add_timing_signal_1d(
          self.embedding_input)
  else:
    self.postional_encoding_add = self.embedding_input
  self.output = tf.reduce_mean(
      self.multi_encoding_layer(self.postional_encoding_add), axis=-1)
def decode_inputs_to_outputs(self, decoder_embed_inputs, encoder_outputs,
                             encoder_attn_bias, rule_id_input_placeholder,
                             mem_contexts, mem_outputs, global_step):
  if self.hparams.pos == 'timing':
    decoder_embed_inputs = common_attention.add_timing_signal_1d(
        decoder_embed_inputs)
    print('Use positional encoding in decoder text.')
  decoder_attn_bias = common_attention.attention_bias_lower_triangle(
      tf.shape(decoder_embed_inputs)[1])
  decoder_embed_inputs = tf.nn.dropout(
      decoder_embed_inputs, 1.0 - self.hparams.layer_prepostprocess_dropout)

  if 'rule' in self.model_config.memory:
    decoder_output, contexts = transformer.transformer_decoder2(
        decoder_embed_inputs, encoder_outputs, decoder_attn_bias,
        encoder_attn_bias, self.hparams)
    # encoder_gate_w = tf.get_variable('encoder_gate_w', shape=(
    #     1, self.model_config.dimension, 1))
    # encoder_gate_b = tf.get_variable('encoder_gate_b', shape=(1, 1, 1))
    # encoder_gate = tf.tanh(encoder_gate_b + tf.nn.conv1d(encoder_outputs, encoder_gate_w, 1, 'SAME'))
    # encoder_context_outputs = tf.expand_dims(tf.reduce_mean(encoder_outputs * encoder_gate, axis=1), axis=1)
    cur_context = contexts[0]  # tf.concat(contexts, axis=-1)
    cur_mem_contexts = tf.stack(
        self.embedding_fn(rule_id_input_placeholder, mem_contexts), axis=1)
    cur_mem_outputs = tf.stack(
        self.embedding_fn(rule_id_input_placeholder, mem_outputs), axis=1)
    bias = tf.expand_dims(
        -1e9 * tf.to_float(
            tf.equal(tf.stack(rule_id_input_placeholder, axis=1), 0)),
        axis=1)
    weights = tf.nn.softmax(
        bias + tf.matmul(cur_context, cur_mem_contexts, transpose_b=True))
    mem_output = tf.matmul(weights, cur_mem_outputs)
    temp_output = tf.concat((decoder_output, mem_output), axis=-1)
    w = tf.get_variable(
        'w_ffn',
        shape=(1, self.model_config.dimension * 2, self.model_config.dimension))
    # b = tf.get_variable('b_ffn', shape=(1, 1, self.model_config.dimension))
    mem_output = tf.nn.conv1d(temp_output, w, 1, 'SAME')
    g = tf.greater(
        global_step,
        tf.constant(2 * self.model_config.memory_prepare_step, dtype=tf.int64))
    final_output = tf.cond(g, lambda: mem_output, lambda: decoder_output)
    return final_output, decoder_output, cur_context
  else:
    decoder_output = transformer.transformer_decoder(
        decoder_embed_inputs, encoder_outputs, decoder_attn_bias,
        encoder_attn_bias, self.hparams)
    final_output = decoder_output
    return final_output, decoder_output, None
def preprocess_firstP(firstP):
  firstP = self._shard_features({"firstP": firstP})["firstP"]
  firstP_modality = self._problem_hparams.input_modality["firstP"]
  with tf.variable_scope(firstP_modality.name):
    firstP = firstP_modality.targets_bottom_sharded(firstP, dp)[0]
  firstP = common_layers.flatten4d3d(firstP)
  if hparams.pos == "timing":
    firstP = common_attention.add_timing_signal_1d(firstP)
  return firstP
def transformer_prepare_decoder_right(targets, hparams, features=None):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a bias tensor for use in decoder self-attention
  """
  if hparams.causal_decoder_self_attention:
    # Causal attention.
    if hparams.prepend_mode == "prepend_inputs_full_attention":
      decoder_self_attention_bias = (
          common_attention.attention_bias_prepend_inputs_full_attention(
              common_attention.embedding_to_padding(targets)))
    else:
      decoder_self_attention_bias = (
          common_attention.attention_bias_local(
              common_layers.shape_list(targets)[1], 0, -1))
  else:
    # Full attention.
    decoder_padding = common_attention.embedding_to_padding(targets)
    decoder_self_attention_bias = (
        common_attention.attention_bias_ignore_padding(decoder_padding))

  if features and "targets_segmentation" in features:
    # "Packed" dataset - keep the examples from seeing each other.
    targets_segmentation = features["targets_segmentation"]
    targets_position = features["targets_position"]
    decoder_self_attention_bias += common_attention.attention_bias_same_segment(
        targets_segmentation, targets_segmentation)
  else:
    targets_position = None
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(targets)[1])
  decoder_input = shift_left_3d(targets)
  if hparams.pos == "timing":
    if targets_position is not None:
      decoder_input = common_attention.add_timing_signal_1d_given_position(
          decoder_input, targets_position)
    else:
      decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  elif hparams.pos == "emb":
    decoder_input = common_attention.add_positional_embedding(
        decoder_input, hparams.max_length, "targets_positional_embedding",
        targets_position)
  if hparams.activation_dtype == "bfloat16":
    decoder_self_attention_bias = tf.cast(decoder_self_attention_bias,
                                          tf.bfloat16)
  return (decoder_input, decoder_self_attention_bias)
def prepare_decoder(targets, target_space_emb):
  """Prepare decoder."""
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  target_space_emb = tf.reshape(target_space_emb, [1, 1, -1])
  target_space_emb = tf.tile(target_space_emb, [tf.shape(targets)[0], 1, 1])
  decoder_input = common_layers.shift_right_3d(
      targets, pad_value=target_space_emb)
  decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
def transformer_prepare_encoder(inputs, target_space, hparams, features=None):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: a Tensor.
    target_space: a Tensor.
    hparams: run hyperparameters
    features: optionally pass the entire features dictionary as well. This is
      needed now for "packed" datasets.

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ishape_static = inputs.shape.as_list()
  encoder_input = inputs
  if features and "inputs_segmentation" in features:
    # Packed dataset. Keep the examples from seeing each other.
    inputs_segmentation = features["inputs_segmentation"]
    inputs_position = features["inputs_position"]
    targets_segmentation = features["targets_segmentation"]
    encoder_self_attention_bias = common_attention.attention_bias_same_segment(
        inputs_segmentation, inputs_segmentation)
    encoder_decoder_attention_bias = (
        common_attention.attention_bias_same_segment(
            targets_segmentation, inputs_segmentation))
  else:
    # Usual case - not a packed dataset.
    encoder_padding = common_attention.embedding_to_padding(encoder_input)
    ignore_padding = common_attention.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding
    inputs_position = None
  if hparams.proximity_bias:
    encoder_self_attention_bias += common_attention.attention_bias_proximal(
        common_layers.shape_list(inputs)[1])
  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      target_space, 32, ishape_static[-1], name="target_space_embedding")
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input += emb_target_space
  if hparams.pos == "timing":
    if inputs_position is not None:
      encoder_input = common_attention.add_timing_signal_1d_given_position(
          encoder_input, inputs_position)
    else:
      encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  return (encoder_input, encoder_self_attention_bias,
          encoder_decoder_attention_bias)
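# The encoder-side snippets above all follow the same pattern: build an
# "ignore padding" attention bias from the embedded inputs, then add the
# sinusoidal timing signal when hparams.pos == "timing". Below is a minimal
# standalone sketch of that pattern (a hypothetical helper, not taken from any
# of the repos above; assumes TF1 graph mode and the tensor2tensor layers the
# snippets import).
from tensor2tensor.layers import common_attention


def prepare_encoder_sketch(inputs):
  """inputs: [batch, length, hidden]. Returns (encoder_input, self_attn_bias)."""
  # Positions whose embeddings are all zeros are treated as padding and get a
  # large negative bias so attention never looks at them.
  encoder_padding = common_attention.embedding_to_padding(inputs)
  self_attn_bias = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  # Add the sinusoidal position signal to the encoder input.
  encoder_input = common_attention.add_timing_signal_1d(inputs)
  return encoder_input, self_attn_bias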
def decode(self, decoder_inputs, timestep):
  """One step of decoding.

  Args:
    decoder_inputs: targets of shape [batch_size, sequence_length,
      hidden_size]. The sequence is shifted right by one.
    timestep: used for timestep encoding during ACT.

  Returns:
    decoder_outputs: the result of passing decoder_inputs through the decoder
      layers. The input shape is preserved.
  """
  with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
    # Positional encoding.
    x = common_attention.add_timing_signal_1d(decoder_inputs)
    # Timestep encoding.
    x = common_attention.add_timing_signal_1d_given_position(x, timestep)
    # Decoder-decoder attention.
    y = common_attention.multihead_attention(
        query_antecedent=x,
        memory_antecedent=None,
        bias=self.decoder_attention_bias,
        total_key_depth=self.hparams.hidden_size,
        total_value_depth=self.hparams.hidden_size,
        output_depth=self.hparams.hidden_size,
        num_heads=self.hparams.num_heads,
        dropout_rate=self.hparams.attention_dropout)
    # Residual connection and dropout.
    x = common_layers.layer_postprocess(x, y, self.hparams)
    # Layer norm.
    x = common_layers.layer_norm(x)
    # Encoder-decoder attention.
    y = common_attention.multihead_attention(
        query_antecedent=x,
        memory_antecedent=self.encoder_outputs,
        bias=self.encoder_attention_bias,
        total_key_depth=self.hparams.hidden_size,
        total_value_depth=self.hparams.hidden_size,
        output_depth=self.hparams.hidden_size,
        num_heads=self.hparams.num_heads,
        dropout_rate=self.hparams.attention_dropout)
    # Residual connection and dropout.
    x = common_layers.layer_postprocess(x, y, self.hparams)
    # Layer norm.
    x = common_layers.layer_norm(x)
    # Transition function as a fully connected layer.
    y = tf.layers.dense(x, self.hparams.hidden_size, name="transition")
    # Residual connection and dropout.
    x = common_layers.layer_postprocess(x, y, self.hparams)
    # Layer norm.
    x = common_layers.layer_norm(x)
    return x
def transformer_prepare_decoder(targets, hparams):
  """Copied from tensor2tensor.models.transformer."""
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  if hparams.proximity_bias:
    decoder_self_attention_bias += common_attention.attention_bias_proximal(
        tf.shape(targets)[1])
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
def decode_inputs_to_outputs(self, kword_input, abstr_outputs, abstr_bias,
                             hist_vector=None):
  if self.hparams.pos == 'timing':
    kword_input = common_attention.add_timing_signal_1d(kword_input)
  kword_tribias = common_attention.attention_bias_lower_triangle(
      tf.shape(kword_input)[1])
  kword_input = tf.nn.dropout(
      kword_input, 1.0 - self.hparams.layer_prepostprocess_dropout)
  kword_output = transformer.transformer_decoder(
      kword_input, abstr_outputs, kword_tribias, abstr_bias, self.hparams,
      hist_vector=hist_vector)
  return kword_output
def decode(cond_vec, cond_add, gold, c, ed, hparams):
  """Transformer decoder."""
  drop_gold = tf.nn.dropout(gold, 1.0 - hparams.layer_prepostprocess_dropout)
  decoder_input = common_layers.shift_right(drop_gold, pad_value=cond_vec)
  if cond_add is not None:
    decoder_input += cond_add
  decoder_input = tf.squeeze(decoder_input, axis=2)
  decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  bias = common_attention.attention_bias_lower_triangle(tf.shape(gold)[1])
  if c is not None and len(c.get_shape()) > 3:
    c = tf.squeeze(c, axis=2)
  return transformer.transformer_decoder(decoder_input, c, bias, ed, hparams)
def attend(x, source, hparams, name):
  with tf.variable_scope(name):
    x = tf.squeeze(x, axis=2)
    if len(source.get_shape()) > 3:
      source = tf.squeeze(source, axis=2)
    source = common_attention.add_timing_signal_1d(source)
    y = common_attention.multihead_attention(
        common_layers.layer_preprocess(x, hparams), source, None,
        hparams.attention_key_channels or hparams.hidden_size,
        hparams.attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
    res = common_layers.layer_postprocess(x, y, hparams)
    return tf.expand_dims(res, axis=2)
def build_graph(self, inputs, masks):
  with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
    if self.input_mapping:
      inputs = tf.layers.conv1d(inputs, filters=self.filters, kernel_size=1,
                                padding='SAME', name='input_mapping')
    outputs = inputs
    for i in range(self.num_blocks):
      with tf.variable_scope('block{}'.format(i + 1)):
        outputs = add_timing_signal_1d(outputs)
        for j in range(self.num_layers):
          with tf.variable_scope('conv{}'.format(j + 1)):

            def fn(x):
              output = tf.layers.separable_conv1d(
                  layer_norm(x, name='ln1_{}'.format(j + 1)),
                  filters=self.filters,
                  kernel_size=self.kernel_size,
                  padding='SAME',
                  name='conv{}'.format(j + 1))
              if j % 2 == 0:
                output = tf.nn.dropout(output, self.keep_prob)
              return output

            outputs = layer_dropout(
                x=outputs,
                fn=fn,
                keep_prob=1 - (j + 1) / self.num_layers * (1 - self.keep_prob))
        outputs = tf.nn.dropout(
            outputs + multihead_self_attention(
                layer_norm(outputs, name='ln2_{}'.format(i)),
                masks, self.num_heads),
            self.keep_prob)
        res = outputs
        outputs = layer_norm(outputs, name='ln3_{}'.format(i + 1))
        # outputs = tf.layers.conv1d(outputs, filters=self.filters, kernel_size=1, padding='SAME', kernel_initializer=initializer_relu(), name='ffn1')
        outputs = tf.nn.relu(
            tf.layers.conv1d(outputs, filters=self.filters, kernel_size=1,
                             padding='SAME',
                             kernel_initializer=initializer_relu(),
                             name='ffn1'))
        outputs = tf.layers.conv1d(outputs, filters=self.filters,
                                   kernel_size=1, padding='SAME', name='ffn2')
        outputs = tf.nn.dropout(res + outputs, self.keep_prob)
    return outputs
def transformer_prepare_delibdecoder(self, inputs, hparams):
  """Prepare one shard of the model for the deliberation decoder.

  Args:
    inputs: a Tensor.
    hparams: run hyperparameters

  Returns:
    firstPdecoder_input: a Tensor, bottom of the decoder stack
    firstP_delib_attention_bias: a bias tensor for use in self-attention
  """
  firstPdecoder_input = inputs
  firstPdecoder_padding = common_attention.embedding_to_padding(
      firstPdecoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      firstPdecoder_padding)
  firstP_delib_attention_bias = ignore_padding
  if hparams.pos == "timing":
    firstPdecoder_input = common_attention.add_timing_signal_1d(
        firstPdecoder_input)
  return (firstPdecoder_input, firstP_delib_attention_bias)
def attend(x, source, hparams, name):
  """Attend function."""
  with tf.variable_scope(name):
    # x = tf.squeeze(x, axis=2)
    x, xshape, _ = cia.maybe_reshape_4d_to_3d(x)
    if len(source.get_shape()) > 3:
      source = tf.squeeze(source, axis=2)
    source = common_attention.add_timing_signal_1d(source)
    y = common_attention.multihead_attention(
        common_layers.layer_preprocess(x, hparams), source, None,
        hparams.attention_key_channels or hparams.hidden_size,
        hparams.attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, hparams.num_heads, hparams.attention_dropout)
    res = common_layers.layer_postprocess(x, y, hparams)
    return tf.reshape(res, xshape)
def transformer_decoder_block(name, n_layers, x, x_mask, output_size, init,
                              **kwargs):
  """A transformation block composed of transformer decoder layers.

  Args:
    name: variable scope.
    n_layers: number of transformer layers.
    x: input to transformation.
    x_mask: mask.
    output_size: output dimensionality.
    init: data-dependent init for weightnorm parameters.
    **kwargs: contains hparams, encoder_output,
      encoder_decoder_attention_bias and decoder_self_attention_bias.

  Returns:
    outputs: Tensor of shape [batch_size, length, output_size].
  """
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    hparams = kwargs.pop("hparams")
    disable_dropout = kwargs.pop("disable_dropout")
    if disable_dropout:
      hparams = copy.deepcopy(hparams)
      hparams.attention_dropout = 0.0
      hparams.layer_prepostprocess_dropout = 0.0
      hparams.relu_dropout = 0.0
    n_channels = common_layers.shape_list(x)[-1]
    if n_channels != hparams.hidden_size:
      hparams = copy.deepcopy(hparams)
      hparams.hidden_size = n_channels
    outputs = common_attention.add_timing_signal_1d(x)
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
      for layer_idx in range(n_layers):
        outputs = transformer_decoder_layer(
            decoder_input=outputs,
            layer_idx=layer_idx,
            hparams=hparams,
            **kwargs)
      outputs = common_layers.layer_preprocess(outputs, hparams)
    outputs = dense_weightnorm(
        "h2o", outputs, output_size, x_mask, init_scale=0.0, init=init)
  return outputs
def transformer_fast_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_position_forward_mask: mask Tensor for position-forward. [1, t, 1]
  """
  length = tf.shape(targets)[1]
  decoder_position_forward_mask = 1. / tf.expand_dims(
      tf.expand_dims(tf.to_float(tf.range(length)) + 1., 0), -1)  # [1, t, 1]
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_position_forward_mask)
def add_vanilla_transformer_layer(x, num_layers, name):
  """Passes the input through num_layers of vanilla transformer layers.

  Args:
    x: input
    num_layers: number of layers
    name: string, prefix of layer names

  Returns:
    output of vanilla_transformer_layer
  """
  if hparams.add_position_timing_signal:
    # In case of add_position_timing_signal=true, we set hparams.pos=None
    # and add position timing signal at the beginning of each step, so for
    # the vanilla transformer, we need to add timing signal here.
    x = common_attention.add_timing_signal_1d(x)
  for layer in range(num_layers):
    with tf.variable_scope(name + "layer_%d" % layer):
      x = ffn_unit(attention_unit(x))
  return x
def decode_syntax_template(self, trg_syntax_emb):
  with tf.variable_scope('syntax_decoder', reuse=tf.AUTO_REUSE):
    trg_syntax_emb = common_attention.add_timing_signal_1d(trg_syntax_emb)
    trg_syntax_emb = self.update_embedding(trg_syntax_emb)
    trg_syntax_length = tf.shape(trg_syntax_emb)[1]
    trg_self_attention_bias = common_attention.attention_bias_lower_triangle(
        trg_syntax_length)
    trg_syntax_outputs = transformer.transformer_decoder(
        decoder_input=trg_syntax_emb,
        decoder_self_attention_bias=trg_self_attention_bias,
        encoder_output=self.shared_tensors['src_outputs'],
        encoder_decoder_attention_bias=self.shared_tensors['src_bias'],
        hparams=self.hparams,
        external_output=self.shared_tensors['template_prev_simp_outputs'],
        external_bias=self.shared_tensors['template_simp_bias'])
    return trg_syntax_outputs
def attention_lm_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
      to implement masked attention and possibly biases for diagonal alignments
  """
  decoder_self_attention_bias = (
      common_attention.attention_bias_lower_triangle(tf.shape(targets)[1]))
  decoder_input = common_layers.shift_left_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
def long_answer_prepare_decoder(inputs, targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    inputs: a Tensor.
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
  """
  decoder_input = tf.concat([
      length_embedding(targets, hparams), inputs,
      common_layers.shift_left_3d(targets)
  ], 1)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return decoder_input
def transformer_prepare_encoder(inputs, target_space, hparams):
  """Prepare one shard of the model for the encoder.

  Args:
    inputs: Tensor with shape [batch, memory_length, depth]
    target_space: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
    encoder_decoder_attention_bias: a bias tensor for use in encoder-decoder
      attention
  """
  ignore_padding = get_ignore_padding(inputs)
  encoder_self_attention_bias = ignore_padding

  # Bias for self-attention to encourage attention to close positions.
  if hparams.proximity_bias:
    encoder_self_attention_bias += comm_attn.attention_bias_proximal(
        length=tf.shape(inputs)[1])

  # Append target_space_id embedding to inputs.
  emb_target_space = common_layers.embedding(
      x=target_space,
      vocab_size=32,
      dense_size=inputs.shape.as_list()[-1],
      name='target_space_embedding')
  emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
  encoder_input = inputs + emb_target_space

  if hparams.pos == 'timing':
    encoder_input = comm_attn.add_timing_signal_1d(encoder_input)

  # Putting this here since it is always called immediately after...
  encoder_input = with_dropout(encoder_input, hparams)

  return EncoderState(input=encoder_input,
                      self_attn_bias=encoder_self_attention_bias,
                      decoder_attn_bias=ignore_padding,
                      output=None)
def prepare_question_encoder(inputs, hparams):
  """Prepare question encoder.

  Args:
    inputs: a Tensor.
    hparams: run hyperparameters

  Returns:
    encoder_input: a Tensor, bottom of encoder stack
    encoder_self_attention_bias: a bias tensor for use in encoder self-attention
  """
  encoder_input = inputs
  # Usual case - not a packed dataset.
  encoder_padding = common_attention.embedding_to_padding(encoder_input)
  ignore_padding = common_attention.attention_bias_ignore_padding(
      encoder_padding)
  encoder_self_attention_bias = ignore_padding
  if hparams.pos == "timing":
    encoder_input = common_attention.add_timing_signal_1d(encoder_input)
  elif hparams.pos == "emb":
    encoder_input = common_attention.add_positional_embedding(
        encoder_input, hparams.max_length, "inputs_positional_embedding", None)
  return (encoder_input, encoder_self_attention_bias)
def attention_lm_prepare_decoder(targets, hparams):
  """Prepare one shard of the model for the decoder.

  Args:
    targets: a Tensor.
    hparams: run hyperparameters

  Returns:
    decoder_input: a Tensor, bottom of decoder stack
    decoder_self_attention_bias: a Tensor, containing large negative values
      to implement masked attention and possibly biases for diagonal alignments
  """
  if hparams.prepend_mode == "prepend_inputs_full_attention":
    decoder_self_attention_bias = (
        common_attention.attention_bias_prepended(
            common_attention.embedding_to_padding(targets)))
  else:
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(
            common_layers.shape_list(targets)[1]))
  decoder_input = common_layers.shift_right_3d(targets)
  if hparams.pos == "timing":
    decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return (decoder_input, decoder_self_attention_bias)
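# The decoder-side snippets share a second recurring pattern: shift the
# targets right by one position, build a lower-triangular (causal)
# self-attention bias, and add the timing signal. Below is a minimal
# standalone sketch of that pattern (a hypothetical helper, not taken from any
# of the repos above; assumes TF1 graph mode and the same tensor2tensor
# layers).
from tensor2tensor.layers import common_attention, common_layers


def prepare_decoder_sketch(targets):
  """targets: [batch, length, hidden]. Returns (decoder_input, self_attn_bias)."""
  # Large negative values above the diagonal mask out future positions.
  self_attn_bias = common_attention.attention_bias_lower_triangle(
      common_layers.shape_list(targets)[1])
  # Shift right so position t only conditions on targets < t.
  decoder_input = common_layers.shift_right_3d(targets)
  # Add the sinusoidal position signal (the hparams.pos == "timing" branch).
  decoder_input = common_attention.add_timing_signal_1d(decoder_input)
  return decoder_input, self_attn_bias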