Example 1
    def call(self, inputs, return_all_layers=True, **kwargs):
        """Implements call() for the layer.

    Args:
      inputs: packed inputs.
      return_all_layers: bool, whether to return outputs of all layers inside
        encoders.
    Returns:
      Output tensor of the last layer or a list of output tensors.
    """
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        input_tensor = unpacked_inputs[0]
        attention_mask = unpacked_inputs[1]
        output_tensor = input_tensor

        all_layer_outputs = []
        all_layer_attentions = []
        for i in range(self.num_hidden_layers):
            output_tensor, attention_score = self.shared_layers[i](
                output_tensor, attention_mask, **kwargs)
            all_layer_outputs.append(output_tensor)
            all_layer_attentions.append(attention_score)
        if return_all_layers:
            return all_layer_outputs, all_layer_attentions

        return all_layer_outputs[-1], all_layer_attentions[-1]
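
As a hedged illustration of the loop above (not the library's actual encoder), the sketch below applies one shared stand-in block num_hidden_layers times and collects every intermediate output, which is what return_all_layers toggles between; the Dense stand-in and all shapes are assumptions made for the example.

    import tensorflow as tf

    # Toy stand-in for one shared transformer layer; sizes are arbitrary.
    num_hidden_layers = 3
    shared_block = tf.keras.layers.Dense(8, activation="relu")

    output_tensor = tf.random.uniform([2, 5, 8])  # [batch, seq_len, hidden]
    all_layer_outputs = []
    for _ in range(num_hidden_layers):
        # The same weights are reused on every iteration, as in the shared_layers loop.
        output_tensor = shared_block(output_tensor)
        all_layer_outputs.append(output_tensor)

    # return_all_layers=True hands back the whole list; False only the last entry.
    print(len(all_layer_outputs), all_layer_outputs[-1].shape)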
Example 2
    def call(self, inputs):
        """Implements call() for the layer."""
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        lm_output = unpacked_inputs[0]
        sentence_output = unpacked_inputs[1]
        lm_label_ids = unpacked_inputs[2]
        lm_label_ids = tf.keras.backend.reshape(lm_label_ids, [-1])
        lm_label_ids_one_hot = tf.keras.backend.one_hot(
            lm_label_ids, self.config.vocab_size)
        lm_label_weights = tf.keras.backend.cast(unpacked_inputs[3],
                                                 tf.float32)
        lm_label_weights = tf.keras.backend.reshape(lm_label_weights, [-1])
        lm_per_example_loss = -tf.keras.backend.sum(
            lm_output * lm_label_ids_one_hot, axis=[-1])
        numerator = tf.keras.backend.sum(lm_label_weights *
                                         lm_per_example_loss)
        denominator = tf.keras.backend.sum(lm_label_weights) + 1e-5
        mask_label_loss = numerator / denominator

        sentence_labels = unpacked_inputs[4]
        sentence_labels = tf.keras.backend.reshape(sentence_labels, [-1])
        sentence_label_one_hot = tf.keras.backend.one_hot(sentence_labels, 2)
        per_example_loss_sentence = -tf.keras.backend.sum(
            sentence_label_one_hot * sentence_output, axis=-1)
        sentence_loss = tf.keras.backend.mean(per_example_loss_sentence)
        loss = mask_label_loss + sentence_loss
        # TODO(hongkuny): Avoid the hack and switch to add_loss.
        final_loss = tf.fill(tf.keras.backend.shape(per_example_loss_sentence),
                             loss)

        self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
                          lm_per_example_loss, sentence_output,
                          sentence_labels, per_example_loss_sentence)
        return final_loss
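
The masked-LM part of this loss can be checked on toy numbers. The sketch below reproduces the one-hot / negative-log-likelihood arithmetic with plain tf ops; all shapes, labels, and weights are invented for illustration.

    import tensorflow as tf

    vocab_size = 6
    # Dummy log-probabilities for 4 masked positions, plus dummy labels and weights.
    lm_output = tf.nn.log_softmax(tf.random.uniform([4, vocab_size]), axis=-1)
    lm_label_ids = tf.constant([1, 3, 0, 2])
    lm_label_weights = tf.constant([1.0, 1.0, 0.0, 0.0])  # padding positions get weight 0

    lm_label_ids_one_hot = tf.one_hot(lm_label_ids, vocab_size)
    lm_per_example_loss = -tf.reduce_sum(lm_output * lm_label_ids_one_hot, axis=-1)
    numerator = tf.reduce_sum(lm_label_weights * lm_per_example_loss)
    denominator = tf.reduce_sum(lm_label_weights) + 1e-5  # guards against all-zero weights
    mask_label_loss = numerator / denominator
    print(float(mask_label_loss))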
Example 3
 def call(self, inputs, **kwargs):
     """Implements call() for the layer."""
     (input_tensor, attention_mask) = tf_utils.unpack_inputs(inputs)
     attention_output, attention_score = self.attention_layer(
         from_tensor=input_tensor,
         to_tensor=input_tensor,
         attention_mask=attention_mask,
         **kwargs)
     attention_output = self.attention_output_dense(attention_output)
     attention_output = self.attention_dropout(attention_output,
                                               training=kwargs.get(
                                                   'training', False))
     # Use float32 in keras layer norm and the gelu activation in the
     # intermediate dense layer for numeric stability
     attention_output = self.attention_layer_norm(input_tensor +
                                                  attention_output)
     if self.float_type == tf.float16:
         attention_output = tf.cast(attention_output, tf.float16)
     intermediate_output = self.intermediate_dense(attention_output)
     if self.float_type == tf.float16:
         intermediate_output = tf.cast(intermediate_output, tf.float16)
     layer_output = self.output_dense(intermediate_output)
     layer_output = self.output_dropout(layer_output,
                                        training=kwargs.get(
                                            'training', False))
     # Use float32 in keras layer norm for numeric stability
     layer_output = self.output_layer_norm(layer_output + attention_output)
     if self.float_type == tf.float16:
         layer_output = tf.cast(layer_output, tf.float16)
     return layer_output, attention_score
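
A minimal sketch of the residual + layer-norm + cast pattern used twice in this block, assuming mixed precision with float16 activations; the tensors and sizes are dummies.

    import tensorflow as tf

    float_type = tf.float16
    # The layer norm is kept in float32 for numeric stability, as noted in the comments above.
    layer_norm = tf.keras.layers.LayerNormalization(axis=-1, dtype=tf.float32)

    input_tensor = tf.random.uniform([2, 5, 8])
    sublayer_output = tf.random.uniform([2, 5, 8])

    normed = layer_norm(input_tensor + sublayer_output)  # computed in float32
    if float_type == tf.float16:
        normed = tf.cast(normed, tf.float16)  # back to the model's compute dtype
    print(normed.dtype)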
Example 4
    def call(self, inputs, mode="bert", **kwargs):
        """Implements call() for the layer.

    Args:
      inputs: packed input tensors.
      mode: string, `bert` or `encoder`.
    Returns:
      Output tensor of the last layer for BERT training (mode=`bert`) which
      is a float Tensor of shape [batch_size, seq_length, hidden_size] or
      a list of output tensors for encoder usage (mode=`encoder`).
    """
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        input_word_ids = unpacked_inputs[0]
        input_mask = unpacked_inputs[1]
        input_type_ids = unpacked_inputs[2]
        word_embeddings = self.embedding_lookup(input_word_ids)
        embedding_tensor = self.embedding_postprocessor(
            word_embeddings=word_embeddings, token_type_ids=input_type_ids)
        if self.float_type == tf.float16:
            embedding_tensor = tf.cast(embedding_tensor, tf.float16)
        attention_mask = None
        if input_mask is not None:
            attention_mask = create_attention_mask_from_input_mask(
                input_word_ids, input_mask)
        # if mode == "encoder":
        #   return self.encoder(
        #       embedding_tensor, attention_mask, return_all_layers=True)

        sequence_output, attention_scores = self.encoder(
            embedding_tensor, attention_mask, return_all_layers=True)
        first_token_tensor = tf.squeeze(sequence_output[-1][:, 0:1, :], axis=1)
        pooled_output = self.pooler_transform(first_token_tensor)
        return (pooled_output, sequence_output, attention_scores,
                embedding_tensor)
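
The pooling step at the end can be reproduced in isolation: take the hidden state of the first ([CLS]) token of the last layer and run it through a dense transform. The dense layer and all sizes below are placeholders, not the model's real configuration.

    import tensorflow as tf

    sequence_output = tf.random.uniform([2, 7, 16])  # [batch, seq_len, hidden]
    pooler_transform = tf.keras.layers.Dense(16, activation="tanh")

    first_token_tensor = tf.squeeze(sequence_output[:, 0:1, :], axis=1)  # [batch, hidden]
    pooled_output = pooler_transform(first_token_tensor)
    print(pooled_output.shape)  # (2, 16)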
Example 5
    def call(self, inputs, **kwargs):
        """Implements call() for the layer."""
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        word_embeddings = unpacked_inputs[0]
        token_type_ids = unpacked_inputs[1]
        input_shape = tf_utils.get_shape_list(word_embeddings, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]
        width = input_shape[2]

        output = word_embeddings
        if self.use_type_embeddings:
            flat_token_type_ids = tf.reshape(token_type_ids, [-1])
            one_hot_ids = tf.one_hot(flat_token_type_ids,
                                     depth=self.token_type_vocab_size,
                                     dtype=self.dtype)
            token_type_embeddings = tf.matmul(one_hot_ids,
                                              self.type_embeddings)
            token_type_embeddings = tf.reshape(token_type_embeddings,
                                               [batch_size, seq_length, width])
            output += token_type_embeddings

        if self.use_position_embeddings:
            position_embeddings = tf.expand_dims(
                tf.slice(self.position_embeddings, [0, 0], [seq_length, width]),
                axis=0)
            output += position_embeddings

        output = self.output_layer_norm(output)
        output = self.output_dropout(output,
                                     training=kwargs.get('training', False))

        return output
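
The token-type lookup here is done as a one-hot matmul rather than a gather (Example 7 below does the same lookup with tf.gather). A small standalone sketch with made-up sizes:

    import tensorflow as tf

    token_type_vocab_size, width = 2, 4
    type_embeddings = tf.random.uniform([token_type_vocab_size, width])
    token_type_ids = tf.constant([[0, 0, 1], [1, 0, 0]])  # [batch, seq_len]
    batch_size, seq_length = token_type_ids.shape

    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size, dtype=tf.float32)
    token_type_embeddings = tf.matmul(one_hot_ids, type_embeddings)  # selects the same rows a gather would
    token_type_embeddings = tf.reshape(token_type_embeddings, [batch_size, seq_length, width])
    print(token_type_embeddings.shape)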
Example 6
    def call(self, inputs):
        """Implements call() for the layer."""
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        lm_output = unpacked_inputs[0]
        sentence_output = unpacked_inputs[1]
        lm_label_ids = unpacked_inputs[2]
        lm_label_weights = tf.keras.backend.cast(unpacked_inputs[3],
                                                 tf.float32)
        sentence_labels = unpacked_inputs[4]

        mask_label_loss = losses.weighted_sparse_categorical_crossentropy_loss(
            labels=lm_label_ids,
            predictions=lm_output,
            weights=lm_label_weights)
        sentence_loss = losses.weighted_sparse_categorical_crossentropy_loss(
            labels=sentence_labels, predictions=sentence_output)
        loss = mask_label_loss + sentence_loss
        batch_shape = tf.slice(tf.keras.backend.shape(sentence_labels), [0],
                               [1])
        # TODO(hongkuny): Avoid the hack and switch to add_loss.
        final_loss = tf.fill(batch_shape, loss)

        self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
                          mask_label_loss, sentence_output, sentence_labels,
                          sentence_loss)
        return final_loss
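
The tf.fill workaround flagged by the TODO can be seen on its own: the scalar total loss is tiled to one entry per batch element so that downstream Keras machinery receives a per-example tensor. The labels and loss value below are dummies.

    import tensorflow as tf

    sentence_labels = tf.constant([0, 1, 1, 0])
    loss = tf.constant(2.5)

    batch_shape = tf.slice(tf.shape(sentence_labels), [0], [1])  # == [4]
    final_loss = tf.fill(batch_shape, loss)
    print(final_loss.numpy())  # [2.5 2.5 2.5 2.5]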
Example 7
    def call(self, inputs):
        """Implements call() for the layer."""
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        word_embeddings = unpacked_inputs[0]
        token_type_ids = unpacked_inputs[1]
        input_shape = tf_utils.get_shape_list(word_embeddings, expected_rank=3)
        batch_size = input_shape[0]
        seq_length = input_shape[1]
        width = input_shape[2]

        output = word_embeddings
        if self.use_type_embeddings:
            flat_token_type_ids = tf.reshape(token_type_ids, [-1])
            token_type_embeddings = tf.gather(self.type_embeddings,
                                              flat_token_type_ids)
            token_type_embeddings = tf.reshape(token_type_embeddings,
                                               [batch_size, seq_length, width])
            output += token_type_embeddings

        if self.use_position_embeddings:
            position_embeddings = tf.expand_dims(
                tf.slice(self.position_embeddings, [0, 0], [seq_length, width]),
                axis=0)
            output += position_embeddings

        output = self.output_layer_norm(output)
        output = self.output_dropout(output)

        return output
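
The position-embedding step shared by Examples 5 and 7 can be illustrated separately: slice the first seq_length rows of the full position table and let the leading axis added by expand_dims broadcast over the batch. Sizes below are arbitrary.

    import tensorflow as tf

    max_position, width = 10, 4
    position_table = tf.random.uniform([max_position, width])  # full learned table (dummy)
    word_embeddings = tf.random.uniform([2, 6, width])          # [batch, seq_len, width]
    seq_length = word_embeddings.shape[1]

    position_embeddings = tf.expand_dims(
        tf.slice(position_table, [0, 0], [seq_length, width]), axis=0)  # [1, seq_len, width]
    output = word_embeddings + position_embeddings  # broadcasts across the batch
    print(output.shape)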
Example 8
    def call(self, inputs, **kwargs):
        """Implements call() for the layer."""
        (from_tensor, to_tensor,
         attention_mask) = tf_utils.unpack_inputs(inputs)

        # Scalar dimensions referenced here:
        #   B = batch size (number of sequences)
        #   F = `from_tensor` sequence length
        #   T = `to_tensor` sequence length
        #   N = `num_attention_heads`
        #   H = `size_per_head`
        # `query_tensor` = [B, F, N, H]
        query_tensor = self.query_dense(from_tensor)

        # `key_tensor` = [B, T, N, H]
        key_tensor = self.key_dense(to_tensor)

        # `value_tensor` = [B, T, N, H]
        value_tensor = self.value_dense(to_tensor)

        # Take the dot product between "query" and "key" to get the raw
        # attention scores.
        attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_tensor,
                                     query_tensor)
        attention_scores = tf.multiply(
            attention_scores, 1.0 / math.sqrt(float(self.size_per_head)))

        if attention_mask is not None:
            # `attention_mask` = [B, 1, F, T]
            attention_mask = tf.expand_dims(attention_mask, axis=[1])

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            adder = (1.0 - tf.cast(attention_mask,
                                   attention_scores.dtype)) * -10000.0

            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_scores += adder

        # Normalize the attention scores to probabilities.
        # `attention_probs` = [B, N, F, T]
        attention_probs = tf.nn.softmax(attention_scores)
        # reshape to [b*n, f, t]
        shapes = attention_scores.shape
        attention_scores = tf.reshape(attention_scores,
                                      [-1, shapes[2], shapes[3]])
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_probs_dropout(attention_probs,
                                                       training=kwargs.get(
                                                           'training', False))

        # `context_tensor` = [B, F, N, H]
        context_tensor = tf.einsum("BNFT,BTNH->BFNH", attention_probs,
                                   value_tensor)

        return context_tensor, attention_scores
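
The attention arithmetic above can be rerun end to end on toy tensors: einsum dot product, 1/sqrt(H) scaling, the additive -10000.0 mask, softmax, and the weighted sum over values. Head count, sequence lengths, and the mask are all made up.

    import math
    import tensorflow as tf

    B, F, T, N, H = 2, 3, 3, 2, 4
    query_tensor = tf.random.uniform([B, F, N, H])
    key_tensor = tf.random.uniform([B, T, N, H])
    value_tensor = tf.random.uniform([B, T, N, H])
    attention_mask = tf.constant([[[1, 1, 0]] * F, [[1, 1, 1]] * F], dtype=tf.float32)  # [B, F, T]

    attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_tensor, query_tensor)
    attention_scores *= 1.0 / math.sqrt(float(H))
    adder = (1.0 - tf.expand_dims(attention_mask, axis=1)) * -10000.0  # [B, 1, F, T]
    attention_probs = tf.nn.softmax(attention_scores + adder)          # [B, N, F, T]
    context_tensor = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_tensor)
    print(context_tensor.shape)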
Example 9
  def call(self, inputs):
    # """Implements call() for the layer."""
    # unpacked_inputs = tf_utils.unpack_inputs(inputs)
    # lm_output = unpacked_inputs[0]
    # sentence_output = unpacked_inputs[1]
    # lm_label_ids = unpacked_inputs[2]
    # lm_label_weights = unpacked_inputs[3]
    # sentence_labels = unpacked_inputs[4]
    
    # lm_label_weights = tf.cast(lm_label_weights, tf.float32)
    # lm_output = tf.cast(lm_output, tf.float32)
    # sentence_output = tf.cast(sentence_output, tf.float32)
    # mask_label_loss = losses.loss(
    #     labels=lm_label_ids, predictions=lm_output, weights=lm_label_weights)
    
    # sentence_loss = losses.loss(
    #     labels=sentence_labels, predictions=sentence_output)
    
    # loss = mask_label_loss + sentence_loss
    # batch_shape = tf.slice(tf.shape(sentence_labels), [0], [1])
    # # TODO(hongkuny): Avoids the hack and switches add_loss.
    # final_loss = tf.fill(batch_shape, loss)
    # print(batch_shape, final_loss)
    # self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
    #                   mask_label_loss, sentence_output, sentence_labels,
    #                   sentence_loss)
    # return final_loss
    """Implements call() for the layer."""
    unpacked_inputs = tf_utils.unpack_inputs(inputs)
    lm_output = unpacked_inputs[0]
    sentence_output = unpacked_inputs[1]
    lm_label_ids = unpacked_inputs[2]
    lm_label_ids = tf.keras.backend.reshape(lm_label_ids, [-1])
    lm_label_ids_one_hot = tf.keras.backend.one_hot(lm_label_ids,
                                                    self.config.vocab_size)
    lm_label_weights = tf.keras.backend.cast(unpacked_inputs[3], tf.float32)
    lm_label_weights = tf.keras.backend.reshape(lm_label_weights, [-1])
    lm_per_example_loss = -tf.keras.backend.sum(
        lm_output * lm_label_ids_one_hot, axis=[-1])
    numerator = tf.keras.backend.sum(lm_label_weights * lm_per_example_loss)
    denominator = tf.keras.backend.sum(lm_label_weights) + 1e-5
    mask_label_loss = numerator / denominator

    sentence_labels = unpacked_inputs[4]
    sentence_labels = tf.keras.backend.reshape(sentence_labels, [-1])
    sentence_label_one_hot = tf.keras.backend.one_hot(sentence_labels, 2)
    per_example_loss_sentence = -tf.keras.backend.sum(
        sentence_label_one_hot * sentence_output, axis=-1)
    sentence_loss = tf.keras.backend.mean(per_example_loss_sentence)
    loss = mask_label_loss + sentence_loss
    # TODO(hongkuny): Avoid the hack and switch to add_loss.
    final_loss = tf.fill(
        tf.keras.backend.shape(per_example_loss_sentence), loss)
    
    self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
                      lm_per_example_loss, sentence_output, sentence_labels,
                      per_example_loss_sentence)
    return final_loss
Example 10
    def call(self, inputs):
        """Implements call() for the layer."""
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        pooled_output = unpacked_inputs[0]
        sequence_output = unpacked_inputs[1]
        masked_lm_positions = unpacked_inputs[2]

        mask_lm_input_tensor = tf_utils.gather_indexes(sequence_output, masked_lm_positions)
        lm_output = self.lm_dense(mask_lm_input_tensor)
        lm_output = self.lm_layer_norm(lm_output)
        lm_output = tf.matmul(lm_output, self.embedding_table, transpose_b=True)
        lm_output = tf.nn.bias_add(lm_output, self.output_bias)
        lm_output = tf.nn.log_softmax(lm_output, axis=-1)
        
        logits = tf.matmul(pooled_output, self.next_seq_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, self.next_seq_bias)
        sentence_output = tf.nn.log_softmax(logits, axis=-1)
        return (lm_output, sentence_output, logits)
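
The gather step at the top of this head pulls out the hidden vectors at the masked positions. A rough standalone approximation is below, using tf.gather with batch_dims=1; the real tf_utils.gather_indexes may differ in output shape and implementation details, and all shapes here are invented.

    import tensorflow as tf

    sequence_output = tf.random.uniform([2, 5, 8])       # [batch, seq_len, hidden]
    masked_lm_positions = tf.constant([[1, 3], [0, 4]])  # [batch, num_masked]

    # One hidden vector per masked position.
    mask_lm_input_tensor = tf.gather(sequence_output, masked_lm_positions, batch_dims=1)
    print(mask_lm_input_tensor.shape)  # (2, 2, 8)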
Example 11
    def call(self, inputs, return_all_layers=False, **kwargs):
        """Implements call() for the layer.

    Args:
      inputs: packed inputs.
      return_all_layers: bool, whether to return outputs of all layers inside
        encoders.
    Returns:
      Output tensor of the last layer or a list of output tensors.
    """
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        input_tensor = unpacked_inputs[0]
        attention_mask = unpacked_inputs[1]
        output_tensor = input_tensor

        all_layer_outputs = []
        for layer in self.layers:
            output_tensor = layer(output_tensor, attention_mask, **kwargs)
            all_layer_outputs.append(output_tensor)

        if return_all_layers:
            return all_layer_outputs

        return all_layer_outputs[-1]
Example 12
    def call(self, inputs, **kwargs):
        """Implements call() for the layer."""
        unpacked_inputs = tf_utils.unpack_inputs(inputs)
        sequence_output = unpacked_inputs[0]
        p_mask = unpacked_inputs[1]
        cls_index = unpacked_inputs[2]
        start_positions = unpacked_inputs[3]

        _, seq_len, _ = sequence_output.shape.as_list()
        sequence_output = tf.transpose(sequence_output, [1, 0, 2])

        start_logits = self.start_logits_proj_layer(sequence_output)
        start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

        if kwargs.get("training", False):
            # during training, compute the end logits based on the
            # ground truth of the start position
            start_positions = tf.reshape(start_positions, [-1])
            start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum(
                'lbh,bl->bh', sequence_output, start_index)
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = self.end_logits_proj_layer0(
                tf.concat([sequence_output, start_features], axis=-1))

            end_logits = self.end_logits_layer_norm(end_logits)

            end_logits = self.end_logits_proj_layer1(end_logits)
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        else:
            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=self.start_n_top)
            start_index = tf.one_hot(
                start_top_index, depth=seq_len, axis=-1, dtype=tf.float32)
            start_features = tf.einsum(
                'lbh,bkl->bkh', sequence_output, start_index)
            end_input = tf.tile(sequence_output[:, :, None], [
                                1, 1, self.start_n_top, 1])
            start_features = tf.tile(start_features[None], [seq_len, 1, 1, 1])
            end_input = tf.concat([end_input, start_features], axis=-1)
            end_logits = self.end_logits_proj_layer0(end_input)
            end_logits = tf.reshape(end_logits, [seq_len, -1, self.hidden_size])
            end_logits = self.end_logits_layer_norm(end_logits)

            end_logits = tf.reshape(end_logits,
                                    [seq_len, -1, self.start_n_top, self.hidden_size])

            end_logits = self.end_logits_proj_layer1(end_logits)
            end_logits = tf.reshape(
                end_logits, [seq_len, -1, self.start_n_top])
            end_logits = tf.transpose(end_logits, [1, 2, 0])
            end_logits_masked = end_logits * (
                1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
            end_top_log_probs, end_top_index = tf.nn.top_k(
                end_log_probs, k=self.end_n_top)
            end_top_log_probs = tf.reshape(end_top_log_probs,
                                           [-1, self.start_n_top * self.end_n_top])
            end_top_index = tf.reshape(end_top_index,
                                       [-1, self.start_n_top * self.end_n_top])

        # an additional layer to predict answerability

        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum('lbh,bl->bh', sequence_output, cls_index)

        # get the representation of START
        start_p = tf.nn.softmax(start_logits_masked,
                                axis=-1, name='softmax_start')
        start_feature = tf.einsum('lbh,bl->bh', sequence_output, start_p)

        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = self.answer_class_proj_layer0(ans_feature)
        ans_feature = self.ans_feature_dropout(
            ans_feature, training=kwargs.get('training', False))
        cls_logits = self.answer_class_proj_layer1(ans_feature)
        cls_logits = tf.squeeze(cls_logits, -1)

        if kwargs.get("training", False):
            return (start_log_probs, end_log_probs, cls_logits)
        else:
            return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
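
The masking idiom applied to the start and end logits above can be checked numerically: positions with p_mask == 1 are pushed to -1e30, so the softmax gives them effectively zero probability. The logits and mask below are made-up values.

    import tensorflow as tf

    logits = tf.constant([[2.0, 1.0, 0.5, 3.0]])
    p_mask = tf.constant([[0.0, 0.0, 1.0, 1.0]])  # 1.0 marks positions to exclude

    logits_masked = logits * (1 - p_mask) - 1e30 * p_mask
    log_probs = tf.nn.log_softmax(logits_masked, -1)
    print(tf.exp(log_probs).numpy())  # masked positions come out ~0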