def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, 1, hidden_dim].
        "targets": Target decoder outputs.
            [batch_size, decoder_length, 1, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams

  losses = []

  # Decoder-only variant: the encoder path is disabled, so there is no
  # encoder output or encoder-decoder attention bias.
  encoder_output, encoder_decoder_attention_bias = (None, None)

  # Train as a language model on "inputs" when present, otherwise on
  # "targets".
  if "inputs" in features:
    targets = features["inputs"]
    target_feature_name = "inputs"
  else:
    targets = features["targets"]
    target_feature_name = "targets"

  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, target_feature_name),
      losses=losses)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, {"extra_loss": tf.add_n(losses)}
  return ret
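# ---------------------------------------------------------------------------
# Minimal sketch (not part of the model above) of the reshape performed by
# common_layers.flatten4d3d, assuming TensorFlow 1.x: the
# [batch, length, 1, hidden] feature tensors are collapsed to
# [batch, length, hidden] by merging axes 1 and 2. The helper name below is
# illustrative, not a tensor2tensor API.
import tensorflow as tf

def flatten4d3d_sketch(x):
  """[batch, length, 1, hidden] -> [batch, length, hidden]."""
  shape = tf.shape(x)
  return tf.reshape(x, [shape[0], shape[1] * shape[2], shape[3]])

# Usage: a [2, 5, 1, 8] targets tensor becomes [2, 5, 8] before decoding,
# and targets_shape remembers the original 4-D shape for the final reshape.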
def body(self, features):
  """Dual Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "wav_inputs"/"txt_inputs": Transformer inputs.
            [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams
  losses = []

  assert self.has_input, "problems for dual-transformer must have inputs"
  if self.has_input:
    inputs1 = features["wav_inputs"]
    inputs2 = features["txt_inputs"]
    target_space = features["target_space_id"]
    (wav_encoder_output, wav_enc_dec_attention_bias,
     txt_encoder_output, txt_enc_dec_attention_bias) = self.dual_encode(
         inputs1, inputs2, target_space, hparams, features=features,
         losses=losses)
  else:
    (wav_encoder_output, wav_enc_dec_attention_bias,
     txt_encoder_output, txt_enc_dec_attention_bias) = (None, None, None,
                                                        None)

  targets = features["targets"]
  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)

  decoder_output = self.dual_decode(
      decoder_input,
      wav_encoder_output,
      txt_encoder_output,
      wav_enc_dec_attention_bias,
      txt_enc_dec_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "targets"),
      losses=losses)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, {"extra_loss": tf.add_n(losses)}
  return ret
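# ---------------------------------------------------------------------------
# Hedged sketch of how each per-encoder attention bias returned by
# dual_encode can be derived from a padding mask (TensorFlow 1.x). This
# mirrors what common_attention.attention_bias_ignore_padding computes;
# `make_ignore_padding_bias` itself is an illustrative name, not a
# tensor2tensor API.
import tensorflow as tf

def make_ignore_padding_bias(padding):
  """padding: [batch, length] floats, 1.0 at padded positions.

  Returns [batch, 1, 1, length]: a large negative bias at padded positions,
  broadcastable over heads and query positions in dot-product attention.
  """
  return tf.expand_dims(tf.expand_dims(padding * -1e9, axis=1), axis=1)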
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, 1, hidden_dim].
        "targets": Target decoder outputs.
            [batch_size, decoder_length, 1, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams
  losses = []

  if self.has_input:
    # The encoder path below is unreachable; it is kept for when a context
    # transformer encoder is implemented.
    raise AttributeError("Context transformer encoder not implemented")
    inputs = features["inputs"]
    target_space = features["target_space_id"]
    encoder_output, encoder_decoder_attention_biases = self.encode(
        inputs, target_space, hparams, features=features, losses=losses)
  else:
    encoder_output, encoder_decoder_attention_biases = (None, None)

  targets = features["targets"]
  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)
  # Expand the causal self-attention bias with the segment ids in
  # "targets_seg" (e.g. so packed examples stay separated).
  decoder_self_attention_biases = expand_bias_modes(
      decoder_self_attention_bias, features["targets_seg"])

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_biases,
      decoder_self_attention_biases,
      hparams,
      nonpadding=features_to_nonpadding(features, "targets"),
      losses=losses)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, {"extra_loss": tf.add_n(losses)}
  return ret
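# ---------------------------------------------------------------------------
# Hedged sketch of segment-aware masking of the kind expand_bias_modes appears
# to derive from "targets_seg" (its real implementation is not shown here):
# positions belonging to different segments of a packed example receive a
# large negative attention bias. TensorFlow 1.x; values are illustrative.
import tensorflow as tf

segments = tf.constant([[1, 1, 2, 2]])  # [batch, length] segment ids
same_segment = tf.to_float(
    tf.equal(segments[:, :, None], segments[:, None, :]))  # [batch, len, len]
segment_bias = (1.0 - same_segment) * -1e9  # 0 within a segment, -1e9 across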
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams

  if self.has_input:
    inputs = features["inputs"]
    target_space = features["target_space_id"]
    encoder_output, encoder_decoder_attention_bias = self.encode(
        inputs, target_space, hparams, features=features)
  else:
    encoder_output, encoder_decoder_attention_bias = (None, None)

  targets = features["targets"]
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "targets"))

  # Write (targets, decoder_output) pairs into the sentence cache as a graph
  # side effect; the returned flag only exists to anchor the op in the graph.
  self.cache_flag = tf.py_func(
      self.sentence_cache.AddMultipleEntries,
      [features["targets_raw"], decoder_output],
      tf.float32,
  )
  self.cache_flag = tf.cast(self.cache_flag, tf.float32)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  self.cache_flag.set_shape((1,))
  # Adding 0 * cache_flag makes the cache write a dependency of the output
  # without changing its value.
  return decoder_output + 0 * self.cache_flag
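# ---------------------------------------------------------------------------
# Standalone sketch of the tf.py_func side-effect pattern used above
# (TensorFlow 1.x): a py_func op only runs when something depends on it, so
# its result is folded into the output with a zero multiplier. `add_to_cache`
# is a made-up stand-in for sentence_cache.AddMultipleEntries.
import numpy as np
import tensorflow as tf

def add_to_cache(keys, values):
  # Real code would write (key, value) pairs into a cache here.
  return np.float32(0.0)

targets_raw = tf.zeros([2, 5], tf.int32)
decoder_output = tf.zeros([2, 5, 8], tf.float32)
cache_flag = tf.py_func(add_to_cache, [targets_raw, decoder_output],
                        tf.float32)
cache_flag.set_shape(())
output = decoder_output + 0 * cache_flag  # keeps the cache write in the graph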
def body(self, features):
  """Transformer main model_fn with additional context features.

  Every feature whose name contains "context" (but not "raw") is passed to
  the encoder alongside the regular inputs.
  """
  hparams = self._hparams
  losses = []

  contexts = {}
  for feature_name in features:
    if "context" in feature_name and "raw" not in feature_name:
      contexts[feature_name] = features[feature_name]

  inputs = features["inputs"]
  target_space = features["target_space_id"]
  encoder_output, encoder_decoder_attention_bias = self.encode(
      inputs,
      contexts,
      target_space,
      hparams=hparams,
      features=features,
      losses=losses)

  targets = features["targets"]
  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams=hparams,
      nonpadding=features_to_nonpadding(features, "targets"),
      losses=losses)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, {"extra_loss": tf.add_n(losses)}
  return ret
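# ---------------------------------------------------------------------------
# Tiny usage sketch of the context-feature filter above: any feature whose
# name contains "context" but not "raw" is routed to the encoder. Plain
# Python; the feature names are made up for illustration.
features = {
    "inputs": ..., "targets": ...,
    "context_0": ..., "context_0_raw": ..., "context_1": ...,
}
contexts = {k: v for k, v in features.items()
            if "context" in k and "raw" not in k}
assert sorted(contexts) == ["context_0", "context_1"]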
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams

  if self.has_input:
    inputs = features["inputs"]
    target_space = features["target_space_id"]
    encoder_output, encoder_decoder_attention_bias = self.encode(
        inputs, target_space, hparams, features=features)
  else:
    encoder_output, encoder_decoder_attention_bias = (None, None)

  targets = features["targets"]
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "targets"))

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights)
    return decoder_output, {"attention_loss": attention_loss}

  return decoder_output
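# ---------------------------------------------------------------------------
# Sketch of the mask that features_to_nonpadding derives, to the best of my
# reading of tensor2tensor: when a "<key>_segmentation" feature exists (as in
# packed examples), non-zero segment ids mark real tokens; otherwise it
# returns None and padding is inferred elsewhere. TensorFlow 1.x.
import tensorflow as tf

def features_to_nonpadding_sketch(features, key="targets"):
  segmentation = features.get(key + "_segmentation")
  if segmentation is not None:
    return tf.minimum(tf.to_float(segmentation), 1.0)  # 1.0 at real tokens
  return None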
def body(self, features):
  """Universal Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id"

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams
  if hparams.add_position_timing_signal:
    # Turning off addition of positional embedding in the encoder/decoder
    # preparation as we do it in the beginning of each step.
    hparams.pos = None

  if self.has_input:
    inputs = features["inputs"]
    target_space = features["target_space_id"]
    (encoder_output, encoder_decoder_attention_bias,
     enc_extra_output) = self.encode(
         inputs, target_space, hparams, features=features)
  else:
    (encoder_output, encoder_decoder_attention_bias,
     enc_extra_output) = (None, None, (None, None))

  targets = features["targets"]
  targets = common_layers.flatten4d3d(targets)

  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams, features=features)

  decoder_output, dec_extra_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, "targets"))

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  if hparams.recurrence_type == "act" and hparams.act_loss_weight != 0:
    if self.has_input:
      enc_ponder_times, enc_remainders = enc_extra_output
      enc_act_loss = (
          hparams.act_loss_weight *
          tf.reduce_mean(enc_ponder_times + enc_remainders))
    else:
      enc_act_loss = 0.0

    (dec_ponder_times, dec_remainders) = dec_extra_output
    dec_act_loss = (
        hparams.act_loss_weight *
        tf.reduce_mean(dec_ponder_times + dec_remainders))
    act_loss = enc_act_loss + dec_act_loss
    tf.contrib.summary.scalar("act_loss", act_loss)
    return decoder_output, {"act_loss": act_loss}

  return decoder_output
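# ---------------------------------------------------------------------------
# Worked sketch of the ACT penalty computed above (TensorFlow 1.x): ponder
# times and remainders are per-position tensors (here [batch, length]), and
# the loss is their scaled mean. Values below are made up for illustration.
import tensorflow as tf

act_loss_weight = 0.01
ponder_times = tf.constant([[2.0, 3.0], [1.0, 4.0]])
remainders = tf.constant([[0.3, 0.1], [0.7, 0.2]])
act_loss = act_loss_weight * tf.reduce_mean(ponder_times + remainders)
# mean of [2.3, 3.1, 1.7, 4.2] is 2.825, so act_loss evaluates to 0.02825.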
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, 1, hidden_dim].
        "targets": Target decoder outputs.
            [batch_size, decoder_length, 1, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams

  losses = []

  if self.has_input:
    # use melody-only as input features
    inputs = features["melody"]
    target_space = features["target_space_id"]
    encoder_output, encoder_decoder_attention_bias = self.encode(
        inputs, target_space, hparams, features=features, losses=losses)
  else:
    encoder_output, encoder_decoder_attention_bias = (None, None)

  targets = features["targets"]
  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = self._prepare_decoder_fn(
      targets, hparams, features=features)

  # Not all subclasses of Transformer support keyword arguments related to
  # recurrent memory, so only pass these arguments if memory is enabled.
  decode_kwargs = {}
  if self.recurrent_memory_by_layer is not None:
    # TODO(kitaev): The chunk_number feature currently has the same shape as
    # "targets", but this is only for the purposes of sharing sharding code.
    # In fact every token within an example must have the same chunk number.
    chunk_number_each_token = tf.squeeze(features["chunk_number"], (-1, -2))
    chunk_number_each_example = chunk_number_each_token[:, 0]
    # Uncomment the code below to verify that tokens within a batch share the
    # same chunk number:
    # with tf.control_dependencies([
    #     tf.assert_equal(chunk_number_each_token,
    #                     chunk_number_each_example[:, None])
    # ]):
    #   chunk_number_each_example = tf.identity(chunk_number_each_example)
    decode_kwargs = dict(
        recurrent_memory_by_layer=self.recurrent_memory_by_layer,
        chunk_number=chunk_number_each_example,
    )

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "targets"),
      losses=losses,
      **decode_kwargs)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, {"extra_loss": tf.add_n(losses)}
  return ret
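# ---------------------------------------------------------------------------
# Shape sketch of the chunk-number handling above (TensorFlow 1.x): the
# feature arrives shaped like "targets" ([batch, length, 1, 1]) purely to
# share sharding code, then is reduced to one scalar per example.
import tensorflow as tf

chunk_number = tf.zeros([4, 7, 1, 1], tf.int32)  # same shape as "targets"
chunk_number_each_token = tf.squeeze(chunk_number, (-1, -2))  # [4, 7]
chunk_number_each_example = chunk_number_each_token[:, 0]     # [4]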
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, 1, hidden_dim].
        "targets": Target decoder outputs.
            [batch_size, decoder_length, 1, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams

  losses = []

  if self.has_input:
    inputs = features["inputs"]
    target_space = features["target_space_id"]
    encoder_output, encoder_decoder_attention_bias = self.encode(
        inputs, target_space, hparams, features=features, losses=losses)
  else:
    encoder_output, encoder_decoder_attention_bias = (None, None)

  targets = features["targets"]
  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)

  (left_decoder_input,
   left_decoder_self_attention_bias) = transformer_prepare_decoder(
       targets, hparams, features=features)
  (right_decoder_input,
   right_decoder_self_attention_bias) = transformer_prepare_decoder_right(
       targets, hparams, features=features)

  nonpadding = features_to_nonpadding(features, "targets")

  with tf.variable_scope("left_decoder"):
    left_decoder_output = self.decode(
        left_decoder_input,
        encoder_output,
        encoder_decoder_attention_bias,
        left_decoder_self_attention_bias,
        hparams,
        nonpadding=nonpadding,
        losses=losses)

  with tf.variable_scope("right_decoder"):
    right_decoder_output = self.decode(
        right_decoder_input,
        encoder_output,
        encoder_decoder_attention_bias,
        right_decoder_self_attention_bias,
        hparams,
        nonpadding=nonpadding,
        losses=losses)

  decoder_output = transformer_bidirectional_joint_decoder(
      tf.squeeze(left_decoder_output, axis=2),
      tf.squeeze(right_decoder_output, axis=2),
      encoder_output,
      encoder_decoder_attention_bias,
      hparams,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights,
      losses=losses)
  decoder_output = tf.expand_dims(decoder_output, axis=2)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, {"extra_loss": tf.add_n(losses)}
  return ret
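# ---------------------------------------------------------------------------
# Shape bookkeeping sketch for the bidirectional join above (TensorFlow 1.x):
# both decoder outputs are [batch, length, 1, hidden]; axis 2 is squeezed
# before the joint decoder and restored afterwards so the final reshape to
# targets_shape still lines up.
import tensorflow as tf

left_decoder_output = tf.zeros([2, 5, 1, 8])
left_3d = tf.squeeze(left_decoder_output, axis=2)  # [2, 5, 8]
joined = left_3d                                   # joint decoder runs in 3-D
restored = tf.expand_dims(joined, axis=2)          # [2, 5, 1, 8]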
def body(self, features):
  """Transformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  tf.logging.info("Using PgScratch BODY function.")
  hparams = self._hparams

  losses = {}
  inputs = features["inputs"]
  target_space = features["target_space_id"]
  # encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
  # encoder_decoder_attention_bias: <tf.float32>[batch_size, input_length]
  encoder_output, encoder_decoder_attention_bias = self.encode(
      inputs, target_space, hparams, features=features, losses=losses)

  with tf.variable_scope("knowledge"):
    with tf.name_scope("knowledge_encoding"):
      # Encode knowledge.
      # <tf.float32>[batch_size, triple_num, emb_dim]
      fact_embedding, fact_lengths = self.encode_knowledge_bottom(features)
      tf.logging.info("Encoded knowledge")

    with tf.name_scope("knowledge_selection_and_loss"):
      # Compute knowledge selection and loss.
      (triple_logits, avg_triple_selection_loss, knowledge_encoder_output,
       transe_loss) = self.compute_knowledge_selection_and_loss(
           features, encoder_output, fact_embedding, fact_lengths,
           hparams.margin, hparams.num_negative_samples)
      losses["kb_loss"] = avg_triple_selection_loss
      losses["transe_loss"] = transe_loss

    if hparams.attend_kb:
      tf.logging.info("ATTEND_KB is ACTIVE")
      with tf.name_scope("knowledge_attention"):
        knowledge_padding = tf.zeros_like(triple_logits, dtype=tf.float32)
        knowledge_attention_bias = (
            common_attention.attention_bias_ignore_padding(knowledge_padding))
        encoder_output = tf.concat(
            [knowledge_encoder_output, encoder_output], 1)
        encoder_decoder_attention_bias = tf.concat(
            [knowledge_attention_bias, encoder_decoder_attention_bias], -1)
    else:
      tf.logging.info("ATTEND_KB is INACTIVE")

  targets = features["targets"]
  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)

  (decoder_input,
   decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
       targets, hparams, features=features)

  decode_kwargs = {}
  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=transformer.features_to_nonpadding(features, "targets"),
      losses=losses,
      **decode_kwargs)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  ret = tf.reshape(decoder_output, targets_shape)
  if losses:
    return ret, losses
  return ret
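# ---------------------------------------------------------------------------
# Hedged sketch of the knowledge-attention concat above (TensorFlow 1.x): the
# selected triples get an all-zero bias (they are never masked, matching the
# tf.zeros_like padding above) and are prepended along the length axis, so
# the decoder attends over [knowledge; source] as one memory. Shapes are
# made up for illustration.
import tensorflow as tf

batch, triple_num, src_len, hidden = 2, 3, 5, 8
knowledge_encoder_output = tf.zeros([batch, triple_num, hidden])
encoder_output = tf.zeros([batch, src_len, hidden])
knowledge_attention_bias = tf.zeros([batch, 1, 1, triple_num])
encoder_decoder_attention_bias = tf.zeros([batch, 1, 1, src_len])

memory = tf.concat([knowledge_encoder_output, encoder_output], 1)
# memory: [2, 8, 8] -- triples first, then source tokens
bias = tf.concat(
    [knowledge_attention_bias, encoder_decoder_attention_bias], -1)
# bias: [2, 1, 1, 8] -- aligned with the concatenated memory length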
def body(self, features):
  """CopyTransformer main model_fn.

  Args:
    features: Map of features to the model. Should contain the following:
        "inputs": Transformer inputs.
            [batch_size, input_length, hidden_dim]
        "targets": Target decoder outputs.
            [batch_size, decoder_length, hidden_dim]
        "targets_*": Additional decoder outputs to generate, for copying and
            pointing; [batch_size, decoder_length]
        "target_space_id": A scalar int from data_generators.problem.SpaceID.

  Returns:
    Final decoder representation. [batch_size, decoder_length, hidden_dim]
  """
  hparams = self._hparams

  losses = []

  inputs = features["inputs"]
  target_space = features["target_space_id"]
  encoder_output, encoder_decoder_attention_bias = self.encode(
      inputs, target_space, hparams, features=features, losses=losses)

  if "targets_actions" in features:
    targets = features["targets_actions"]
  else:
    tf.logging.warn(
        "CopyTransformer must be used with a SemanticParsing problem with a "
        "ShiftReduceGrammar; bad things will happen otherwise")
    targets = features["targets"]

  targets_shape = common_layers.shape_list(targets)
  targets = common_layers.flatten4d3d(targets)
  decoder_input, decoder_self_attention_bias = transformer_prepare_decoder(
      targets, hparams, features=features)

  decoder_output = self.decode(
      decoder_input,
      encoder_output,
      encoder_decoder_attention_bias,
      decoder_self_attention_bias,
      hparams,
      nonpadding=features_to_nonpadding(features, "targets"),
      losses=losses)

  expected_attentions = features.get("expected_attentions")
  if expected_attentions is not None:
    attention_loss = common_attention.encoder_decoder_attention_loss(
        expected_attentions, self.attention_weights,
        hparams.expected_attention_loss_type,
        hparams.expected_attention_loss_multiplier)
    return decoder_output, {"attention_loss": attention_loss}

  decoder_output = tf.reshape(decoder_output, targets_shape)

  body_output = dict()
  target_modality = (self._problem_hparams.target_modality
                     if self._problem_hparams else {"targets": None})

  assert hparams.pointer_layer in ("attentive", "decaying_attentive")

  for key, modality in target_modality.items():
    if isinstance(modality, CopyModality):
      with tf.variable_scope("copy_layer/" + key):
        if hparams.pointer_layer == "decaying_attentive":
          output_layer = DecayingAttentivePointerLayer(encoder_output)
        else:
          output_layer = AttentivePointerLayer(encoder_output)
        scores = output_layer(decoder_output)
        scores += encoder_decoder_attention_bias
        body_output[key] = scores
    else:
      body_output[key] = decoder_output

  if losses:
    return body_output, {"extra_loss": tf.add_n(losses)}
  return body_output
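# ---------------------------------------------------------------------------
# Hedged sketch of an attentive pointer layer of the kind used above; the
# real AttentivePointerLayer may differ. Pointing scores are dot products of
# projected decoder states against encoder states; adding the encoder padding
# bias (as done above) keeps padded source positions from being pointed at.
# TensorFlow 1.x; the layer name "pointer_query" is made up.
import tensorflow as tf

def attentive_pointer_scores(decoder_output, encoder_output):
  """decoder_output: [batch, dec_len, hidden];
  encoder_output: [batch, enc_len, hidden].
  Returns [batch, dec_len, enc_len] unnormalized pointing scores."""
  hidden = encoder_output.shape[-1].value
  queries = tf.layers.dense(
      decoder_output, hidden, use_bias=False, name="pointer_query")
  return tf.matmul(queries, encoder_output, transpose_b=True)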