def encode(self, inputs, attention_bias):
    """Generate continuous representations for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length].

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    length = tf.shape(embedded_inputs)[1]
    pos_encoding = positional_encoding(length, self.params.hidden_size)
    encoder_inputs = embedded_inputs + pos_encoding

    # Dropout is only applied in training mode.
    if self.is_train:
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, rate=1 - self.params.keep_prob)

    return self.encoder_stack(encoder_inputs, input_mask=attention_bias)
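
# NOTE: `positional_encoding` is referenced by `encode` but not defined in this
# section. The sketch below shows one plausible implementation of the standard
# sinusoidal encoding from "Attention Is All You Need"; the name
# `positional_encoding_sketch` and the timescale defaults are assumptions, and
# the codebase's actual helper may differ.
import math

import tensorflow as tf


def positional_encoding_sketch(length, hidden_size,
                               min_timescale=1.0, max_timescale=1.0e4):
    """Returns a [length, hidden_size] float tensor of sinusoidal signals."""
    position = tf.cast(tf.range(length), tf.float32)
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        max(num_timescales - 1, 1))
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
    # Outer product of positions and inverse timescales, then concatenate the
    # sine and cosine channels along the hidden dimension.
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)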
def forward(self, inputs, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      targets: None or int tensor with shape [batch_size, target_length].

    Returns:
      If targets is defined, returns logits for each word in the target
      sequence: float tensor with shape [batch_size, target_length, vocab_size].
      If targets is None, the output sequence is generated one token at a time
      and a dictionary is returned:
        {outputs: [batch_size, decoded_length],
         scores: [batch_size, float]}
    """
    # Calculate the attention bias used by the encoder self-attention and
    # decoder multi-headed attention layers.
    attention_bias = get_input_mask(inputs)

    # Run the inputs through the encoder stack to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias)

    # Generate the output sequence if targets is None, or return logits if the
    # target sequence is known.
    if targets is None:
        return self.predict(encoder_outputs, attention_bias)
    else:
        logits = self.decode(targets, encoder_outputs, attention_bias)
        return logits
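
# NOTE: `get_input_mask` is likewise not defined in this section. The sketch
# below shows one common way to build the additive attention bias described in
# the docstrings above, assuming padding positions carry token id 0; the name
# `get_input_mask_sketch`, the `padding_id` argument, and the -1e9 constant are
# assumptions rather than this codebase's actual API.
_NEG_INF = -1.0e9


def get_input_mask_sketch(inputs, padding_id=0):
    """Returns an additive bias of shape [batch_size, 1, 1, input_length].

    Padding positions receive a large negative value so that they contribute
    (approximately) zero weight after the attention softmax.
    """
    padding = tf.cast(tf.equal(inputs, padding_id), tf.float32)  # 1.0 at pads
    attention_bias = padding * _NEG_INF
    return tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1)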