Example 1
    def __init__(self,
                 intermediate_size,
                 intermediate_activation=tensor_utils.get_activation("gelu"),
                 dropout_rate=0.0,
                 name="span_prediction_head",
                 **kwargs):
        """Constructor for SpanPredictionHead.

    Args:
      intermediate_size: Dimension of the intermediate MLP representation. If
        None, only a single linear layer will be applied.
      intermediate_activation: activation function for MLP
      dropout_rate: dropout rate
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.
    """
        super(SpanPredictionHead, self).__init__(name=name, **kwargs)
        if intermediate_size is not None:
            self._intermediate_dense = tf.keras.layers.Dense(intermediate_size)
            self._intermediate_activation = tf.keras.layers.Activation(
                intermediate_activation)
            self._output_dropout = tf.keras.layers.Dropout(dropout_rate)
            self._output_layer_norm = tf.keras.layers.LayerNormalization()
        else:
            self._intermediate_dense = None
            self._intermediate_activation = None
            self._output_dropout = None
            self._output_layer_norm = None
        self._logits_dense = tf.keras.layers.Dense(2)
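
For orientation, here is a minimal, self-contained sketch of a head with this structure being built and applied. Only the constructor is shown above, so the `call` ordering (dense, activation, dropout, layer norm, then the 2-unit logits layer) and the input shape are assumptions.

import tensorflow as tf

class SpanHeadSketch(tf.keras.layers.Layer):
    """Hypothetical stand-in mirroring the constructor above."""

    def __init__(self, intermediate_size=None, dropout_rate=0.0, **kwargs):
        super().__init__(**kwargs)
        self._has_mlp = intermediate_size is not None
        if self._has_mlp:
            self._intermediate_dense = tf.keras.layers.Dense(intermediate_size)
            self._intermediate_activation = tf.keras.layers.Activation("gelu")
            self._output_dropout = tf.keras.layers.Dropout(dropout_rate)
            self._output_layer_norm = tf.keras.layers.LayerNormalization()
        # Final projection to 2 units: start and end logits per token.
        self._logits_dense = tf.keras.layers.Dense(2)

    def call(self, sequence_output, training=False):
        x = sequence_output
        if self._has_mlp:
            x = self._intermediate_activation(self._intermediate_dense(x))
            x = self._output_dropout(x, training=training)
            x = self._output_layer_norm(x)
        return self._logits_dense(x)  # [batch, seq_len, 2]

# Illustrative usage on a dummy [batch, seq_len, hidden] tensor.
logits = SpanHeadSketch(intermediate_size=1024)(tf.zeros([2, 128, 768]))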
Example 2
    def __init__(self,
                 output_weights,
                 hidden_size,
                 name='language_model_loss',
                 activation='gelu',
                 initializer_range=0.02,
                 **kwargs):
        """Constructor for LanguageModelLoss.

    Args:
      output_weights: Embedding table, shared with the input word embeddings.
      hidden_size: Size of the input hidden representation.
      name: (Optional) name of the layer.
      activation: The non-linear activation function (function or string) used
        in the 1-layer MLP decoder. Default is "gelu".
      initializer_range: The standard deviation of the truncated normal
        initializer used to initialize all weight matrices.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: If the shape of `output_weights` is not statically known.
    """
        super(LanguageModelLoss, self).__init__(name=name, **kwargs)
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.output_weights = output_weights
        self.activation = activation
        self.initializer_range = initializer_range
        self.hidden_size = hidden_size

        self.vocab_size, self.embedding_size = tensor_utils.get_shape_list(
            self.output_weights, expected_rank=2, name='word embeddings table')
        if self.vocab_size is None:
            raise ValueError('The first (vocab size) dimension of '
                             '`output_weights` must be statically known.')

        self.linear_fn = tf.keras.layers.Dense(
            self.embedding_size,
            activation=tensor_utils.get_activation(self.activation),
            use_bias=True,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self.initializer_range))
        self.layer_norm = tf.keras.layers.LayerNormalization(axis=-1,
                                                             epsilon=0.001)
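
The key points of this constructor are the tied output projection (logits are computed against the same embedding table used for the input word embeddings) and the static-shape check on the vocabulary dimension. Below is a minimal sketch of that pattern in plain TensorFlow; the sizes and the final matmul are chosen purely for illustration and are not taken from the excerpt above.

import tensorflow as tf

# Hypothetical embedding table standing in for `output_weights`.
embedding_table = tf.Variable(
    tf.random.truncated_normal([30522, 128], stddev=0.02))

vocab_size, embedding_size = embedding_table.shape.as_list()
if vocab_size is None:
    raise ValueError("The vocab dimension must be statically known.")

# 1-layer MLP decoder followed by layer norm, as in the constructor above.
linear_fn = tf.keras.layers.Dense(
    embedding_size,
    activation="gelu",
    use_bias=True,
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
layer_norm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=0.001)

# Logits over the vocabulary via the (transposed) tied embedding table.
hidden = tf.zeros([4, 768])  # [batch, hidden_size]; hidden_size assumed 768
logits = tf.matmul(layer_norm(linear_fn(hidden)), embedding_table,
                   transpose_b=True)  # shape [4, 30522]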
Example 3
    def __init__(self,
                 hidden_size,
                 num_hidden_layers,
                 num_attention_heads,
                 intermediate_size=None,
                 hidden_act=tensor_utils.get_activation('gelu'),
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 initializer_range=0.02,
                 share_kv_projections=False,
                 num_cross_attention_heads=None,
                 enable_default_side_input=False,
                 name='transformer_layers',
                 **kwargs):
        """Init.

    Args:
      hidden_size: Size of the output hidden dimension.  Must match the input
        hidden dimension size.
      num_hidden_layers: Number of Transformer layers.  Each layer includes both
        an attention sublayer and a feed-forward sublayer.
      num_attention_heads: Number of attention heads. Must evenly divide
        `hidden_size`.
      intermediate_size: The size of the "intermediate" (i.e. feed-forward)
        layers. Defaults to 4 * hidden_size.
      hidden_act: The non-linear activation function in the intermediate layers.
      hidden_dropout_prob: The dropout probability for the attention and
        feed-forward residual blocks. Must be between 0.0 and 1.0.
      attention_probs_dropout_prob: Dropout probability for attention
        probabilities. Must be between 0.0 and 1.0.
      initializer_range: The standard deviation of the truncated normal
        initializer for initializing weight matrices not created by
        `linear_make_fn`. If zero, the scale of the truncated normal initializer
        will be tuned automatically according to the distribution of the inputs.
      share_kv_projections: If True, key and value projections will be shared
        between main-to-main and main-to-side components. This results in 1
        key projection per layer instead of 2 (and similarly for value
        projections). Only relevant for fused side attention,
        NOT cross attention over the side input (when num_cross_attention_heads
        is not None).
      num_cross_attention_heads: If not None, a cross-attention layer over the
        side inputs is added. In this case, side inputs will NOT be used in
        `FusedSideAttention`. Must be greater than or equal to 0, where 0
        means that the cross-attention layer will have a single attention
        head WITHOUT projection matrices.
      enable_default_side_input: Add a default side input, which acts like a
        no-op attention, effectively allowing the attention weights to sum to
        less than 1. Currently only available for cross attention over side
        inputs.
      name: Name of the layer.
      **kwargs: Forwarded to super.
    """
        super(TransformerWithSideInputLayers, self).__init__(name=name,
                                                             **kwargs)

        if intermediate_size is None:
            intermediate_size = 4 * hidden_size

        if num_cross_attention_heads is not None:
            # This prevents allocating extra parameters for fused side
            # attention, since the side input will not be used there anyway.
            share_kv_projections = True

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.share_kv_projections = share_kv_projections
        self.num_cross_attention_heads = num_cross_attention_heads
        self.enable_default_side_input = enable_default_side_input

        if (self.enable_default_side_input
                and self.num_cross_attention_heads is None):
            raise ValueError('`enable_default_side_input` is only supported '
                             'when `num_cross_attention_heads` is specified.')
        if (self.num_cross_attention_heads is not None
                and self.num_cross_attention_heads < 0):
            raise ValueError('If `num_cross_attention_heads` is specified, '
                             'it must be non-negative.')

        self.initializer = tf.keras.initializers.TruncatedNormal(
            stddev=initializer_range)

        self.attention_layers = []
        self.cross_attention_layers = []
        self.feed_forward_layers = []
        for i in range(num_hidden_layers):
            self.attention_layers.append(
                wrappers.ResidualBlock(
                    inner_layer=attention.FusedSideAttention(
                        hidden_size=self.hidden_size,
                        num_heads=self.num_attention_heads,
                        att_dropout_prob=self.attention_probs_dropout_prob,
                        share_kv_projections=self.share_kv_projections,
                        initializer=self.initializer),
                    dropout_probability=self.hidden_dropout_prob,
                    use_pre_activation_order=False,
                    name='attention_layer_%d' % i))

            if self.num_cross_attention_heads is not None:
                self.cross_attention_layers.append(
                    wrappers.ResidualBlock(
                        inner_layer=attention.SideAttention(
                            hidden_size=self.hidden_size,
                            num_heads=self.num_cross_attention_heads,
                            att_dropout_prob=self.attention_probs_dropout_prob,
                            initializer=self.initializer,
                            enable_default_side_input=(
                                self.enable_default_side_input)),
                        dropout_probability=self.hidden_dropout_prob,
                        use_pre_activation_order=False,
                        name='cross_attention_layer_%d' % i))

            self.feed_forward_layers.append(
                wrappers.ResidualBlock(
                    dropout_probability=self.hidden_dropout_prob,
                    use_pre_activation_order=False,
                    inner_intermediate_size=self.intermediate_size,
                    inner_activation=self.hidden_act,
                    inner_kernel_initializer=self.initializer,
                    name='feed_forward_layer_%d' % i))
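
A constructor-usage sketch for the layer stack above; the hyperparameter values are illustrative only, and the class is assumed to be importable together with its `attention` and `wrappers` dependencies. Note that passing `enable_default_side_input=True` without `num_cross_attention_heads` would raise the ValueError defined in the constructor.

# Illustrative values; intermediate_size defaults to 4 * hidden_size.
layer_stack = TransformerWithSideInputLayers(
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    initializer_range=0.02,
    share_kv_projections=False,     # forced to True when cross attention is on
    num_cross_attention_heads=4,    # adds cross attention over side inputs
    enable_default_side_input=True)  # only valid with cross attention enabled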
Example 4
    def __init__(self,
                 config,
                 num_layers_override,
                 num_cross_attention_heads,
                 enable_default_side_input=False,
                 use_one_hot_embeddings=False,
                 name="read_it_twice_decoder",
                 **kwargs):
        """Constructor for ReadItTwiceDecoderModel.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      num_layers_override: int. Number of Transformer layers.
      num_cross_attention_heads: int. Number of cross-attention heads.
      enable_default_side_input: Add a default side input, which acts like a
        no-op attention, effectively allowing the attention weights to sum to
        less than 1.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(ReadItTwiceDecoderModel, self).__init__(name=name, **kwargs)

        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.num_layers_override = num_layers_override
        self.num_cross_attention_heads = num_cross_attention_heads
        self.enable_default_side_input = enable_default_side_input

        if config.embedding_size is None:
            config = dataclasses.replace(config,
                                         embedding_size=config.hidden_size)
        self.config = config

        self.token_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=config.embedding_size,
            projection_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="token_emb_lookup")

        self.token_embedding_norm = tf.keras.layers.LayerNormalization(
            axis=-1, epsilon=1e-12, name="emb_layer_norm")
        self.token_embedding_dropout = tf.keras.layers.Dropout(
            rate=config.hidden_dropout_prob)

        self.position_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.max_seq_length,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="position_emb_lookup_long")
        # Call layers to force variable initialization.
        self.position_embedding(tf.ones([1, 1], tf.int32))

        self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=self.num_layers_override,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=False,
            num_cross_attention_heads=self.num_cross_attention_heads,
            enable_default_side_input=self.enable_default_side_input)
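
As a usage sketch, the decoder is built from an existing `ReadItTwiceBertConfig`; the `config` instance below is assumed to be constructed elsewhere, and the argument values are illustrative.

# `config` is a model_config.ReadItTwiceBertConfig, assumed to be built elsewhere.
decoder = ReadItTwiceDecoderModel(
    config=config,
    num_layers_override=2,          # number of Transformer layers in the decoder
    num_cross_attention_heads=4,    # cross-attention heads over the side input
    enable_default_side_input=False,
    use_one_hot_embeddings=False)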
Example 5
    def __init__(self,
                 config,
                 use_one_hot_embeddings=False,
                 name="read_it_twice_bert",
                 **kwargs):
        """Constructor for ReadItTwiceBertModel.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(ReadItTwiceBertModel, self).__init__(name=name, **kwargs)

        self.use_one_hot_embeddings = use_one_hot_embeddings

        if config.cross_attention_top_k is not None:
            assert config.second_read_type == "cross_attend_once", (
                "`cross_attention_top_k` is only supported when "
                "`second_read_type` is \"cross_attend_once\".")

        if config.embedding_size is None:
            config = dataclasses.replace(config,
                                         embedding_size=config.hidden_size)

        self.config = config

        self.token_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=config.embedding_size,
            projection_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="token_emb_lookup")

        self.token_embedding_norm = tf.keras.layers.LayerNormalization(
            axis=-1, epsilon=1e-12, name="emb_layer_norm")
        self.token_embedding_dropout = tf.keras.layers.Dropout(
            rate=config.hidden_dropout_prob)

        self.position_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.max_seq_length,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="position_emb_lookup_long")
        # Call layers to force variable initialization.
        self.position_embedding(tf.ones([1, 1], tf.int32))

        if config.cross_attention_pos_emb_mode is not None:
            # We would end up adding block position embeddings multiple times.
            assert config.summary_postprocessing_type not in [
                "pos", "transformer"
            ]

        if config.second_read_type == "from_scratch":
            share_kv_projections_first_read = config.share_kv_projections
        else:
            # Summaries are not going to be used by the first read model anyway.
            share_kv_projections_first_read = True

        self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=share_kv_projections_first_read,
            name="transformer_layers")
        # grad_checkpointing_period=config.grad_checkpointing_period)

        self.summary_extraction = SummaryExtraction(
            config=config, use_one_hot_embeddings=use_one_hot_embeddings)

        if config.second_read_type == "new_layers":
            if config.second_read_num_new_layers is None:
                raise ValueError("Must specify `second_read_num_new_layers`"
                                 "when `second_read_type` is new_layers")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=config.share_kv_projections,
                name="transformer_layers")
        elif config.second_read_type == "cross_attend_once":
            if config.second_read_num_new_layers is None:
                raise ValueError(
                    "Must specify `second_read_num_new_layers`"
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_num_cross_attention_heads is None:
                raise ValueError(
                    "Must specify `second_read_num_cross_attention_heads`"
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_enable_default_side_input is None:
                raise ValueError(
                    "Must specify `second_read_enable_default_side_input`"
                    "when `second_read_type` is cross_attend_once")

            self.cross_attention_layer = readtwice_layers.ResidualBlock(
                inner_layer=readtwice_layers.SideAttention(
                    hidden_size=config.hidden_size,
                    num_heads=config.second_read_num_cross_attention_heads,
                    att_dropout_prob=0,
                    initializer=tf.keras.initializers.TruncatedNormal(
                        stddev=config.initializer_range),
                    top_k_attention=config.cross_attention_top_k,
                    pos_embed_mode=config.cross_attention_pos_emb_mode,
                    pos_embed_size=config.max_num_blocks_per_document,
                    use_one_hot_embeddings=use_one_hot_embeddings,
                    enable_default_side_input=(
                        config.second_read_enable_default_side_input)),
                dropout_probability=config.hidden_dropout_prob,
                use_pre_activation_order=False,
                name="cross_attention_layer")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=True,
                name="transformer_layers")
        elif config.second_read_type == "new_layers_cross_attention":
            if config.second_read_num_new_layers is None:
                raise ValueError(
                    "Must specify `second_read_num_new_layers`"
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_num_cross_attention_heads is None:
                raise ValueError(
                    "Must specify `second_read_num_cross_attention_heads`"
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_enable_default_side_input is None:
                raise ValueError(
                    "Must specify `second_read_enable_default_side_input`"
                    "when `second_read_type` is cross_attend_once")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=True,
                num_cross_attention_heads=(
                    config.second_read_num_cross_attention_heads),
                enable_default_side_input=(
                    config.second_read_enable_default_side_input),
                name="transformer_layers")
        else:
            if config.second_read_type != "from_scratch":
                raise ValueError("Unknown `second_read_type`: '{}'".format(
                    config.second_read_type))
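
The branching above determines which extra config fields each `second_read_type` requires. A short summary as a hedged usage sketch; the `config` instance is assumed to be constructed elsewhere.

# `config.second_read_type` selects how the second read is built here:
#   "from_scratch"               - no additional second-read layers are created
#                                  in this constructor.
#   "new_layers"                 - requires `second_read_num_new_layers`.
#   "cross_attend_once"          - also requires
#                                  `second_read_num_cross_attention_heads` and
#                                  `second_read_enable_default_side_input`.
#   "new_layers_cross_attention" - same required fields as "cross_attend_once".
model = ReadItTwiceBertModel(config=config, use_one_hot_embeddings=False)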
Example 6
    def __init__(self,
                 config,
                 use_one_hot_embeddings,
                 name="summary_extraction",
                 **kwargs):
        """Constructor for SummaryExtraction.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      use_one_hot_embeddings: bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(SummaryExtraction, self).__init__(name=name, **kwargs)
        self.mode = config.summary_mode
        self.hidden_size = config.hidden_size
        self.postprocessing_type = config.summary_postprocessing_type
        self.use_sparse_memory_attention = config.use_sparse_memory_attention

        self.embedding_norm = None

        if self.mode == "cls":
            pass
        elif self.mode == "text_block":
            self.text_block_extract_every_x = config.text_block_extract_every_x
            assert self.text_block_extract_every_x is not None
            self.extraction_linear = tf.keras.layers.Dense(
                config.hidden_size,
                activation=None,
                kernel_initializer=tf.keras.initializers.TruncatedNormal(
                    stddev=config.initializer_range),
                name="entity_pool_linear")
        elif self.mode == "entity":
            self.extraction_linear = tf.keras.layers.Dense(
                config.hidden_size,
                activation=None,
                kernel_initializer=tf.keras.initializers.TruncatedNormal(
                    stddev=config.initializer_range),
                name="entity_pool_linear")
        else:
            raise ValueError("Unknown summary mode: {}".format(self.mode))

        if self.postprocessing_type == "none":
            self.postprocessing = None
        elif self.postprocessing_type == "linear":
            self.postprocessing = tf.keras.layers.Dense(
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=tf.keras.initializers.TruncatedNormal(
                    stddev=config.initializer_range),
                name="cls_pool")
        elif self.postprocessing_type in ["pos", "transformer"]:
            self.position_embedding = readtwice_layers.EmbeddingLookup(
                vocab_size=config.max_num_blocks_per_document,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                use_one_hot_lookup=use_one_hot_embeddings,
                name="block_position_emb_lookup")
            # Call layers to force variable initialization.
            self.position_embedding(tf.ones([1, 1], tf.int32))
            self.embedding_norm = tf.keras.layers.LayerNormalization(
                axis=-1, epsilon=1e-12, name="summary_emb_layer_norm")
            self.embedding_dropout = tf.keras.layers.Dropout(
                rate=config.hidden_dropout_prob)

            if self.postprocessing_type == "transformer":
                if config.summary_postprocessing_num_layers is None:
                    raise ValueError(
                        "Must specify `postprocessing_num_layers`"
                        "when `postprocessing_type` is \"transformer\"")

                self.postprocessing = readtwice_layers.TransformerWithSideInputLayers(
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.summary_postprocessing_num_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    hidden_act=tensor_utils.get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        config.attention_probs_dropout_prob),
                    initializer_range=config.initializer_range,
                    share_kv_projections=True)
        else:
            raise ValueError("Unknown summary type: {}".format(
                self.postprocessing_type))
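
The two config switches this constructor depends on are summarized below as a hedged usage note; the `config` instance is again assumed to come from elsewhere.

# Valid settings, per the branches above:
#   config.summary_mode                in {"cls", "text_block", "entity"}
#   config.summary_postprocessing_type in {"none", "linear", "pos", "transformer"}
# "transformer" postprocessing additionally requires
# `config.summary_postprocessing_num_layers` to be set.
summary_extraction = SummaryExtraction(config=config,
                                       use_one_hot_embeddings=False)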