    def test_embedding_lookup_1d_ids(self, use_one_hot_lookup):
        embedding_table = tf.constant([
            [1.0, -1.0],  #
            [1.1, -1.1],  #
            [1.2, -1.2],  #
            [1.3, -1.3],  #
            [1.4, -1.4],  #
        ])
        vocab_size, embedding_size = embedding_table.shape.as_list()

        input_ids = tf.constant([1, 0, 0, 3])
        input_mask = tf.constant([1, 1, 0, 1])

        layer = readtwice_layers.EmbeddingLookup(
            vocab_size=vocab_size,
            embedding_size=embedding_size,
            use_one_hot_lookup=use_one_hot_lookup)
        layer.build(None)  # Shapes are unused so we pass None.
        layer.embedding_table = embedding_table

        expected = [
            [1.1, -1.1],  #
            [1.0, -1.0],  #
            [0.0, 0.0],  #
            [1.3, -1.3],  #
        ]
        result = layer(input_ids, input_mask=input_mask)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(expected, result)

    def test_embedding_lookup_with_projection(self):
        # Create an embedding table with width != projection_size
        embedding_table = tf.constant([
            [1.0, -1.0, 0.5],  #
            [1.1, -1.1, -0.4],  #
            [1.2, -1.2, -0.5],  #
            [1.3, -1.3, 0.8],  #
            [1.4, -1.4, 0.9],  #
        ])

        projection_size = 2  # Different from embedding_size (3).
        vocab_size, embedding_size = embedding_table.shape.as_list()
        input_ids = tf.constant([
            [3, 2, 1],  #
            [4, 0, 4],  #
        ])

        input_mask = tf.constant([
            [1, 0, 0],  #
            [0, 0, 1],  #
        ])

        layer = readtwice_layers.EmbeddingLookup(
            vocab_size=vocab_size,
            embedding_size=embedding_size,
            projection_size=projection_size,
            use_one_hot_lookup=True)

        layer.build(None)  # Shapes are unused so we pass None.
        layer.embedding_table = embedding_table

        # Dense layer to use for projection. Note that we use a non-zero bias
        # initializer here to verify that the bias term doesn't leak through
        # to the masked positions after projection.
        layer.embedding_projection = tf.keras.layers.Dense(
            units=projection_size,
            activation=None,
            use_bias=True,
            kernel_initializer='ones',
            bias_initializer='ones')

        expected = [
            [
                [1.8, 1.8],  # [1.3, -1.3, 0.8] * ones kernel + 1 (bias).
                [0., 0.],  #
                [0., 0.],  #
            ],  #
            [
                [0., 0.],  #
                [0., 0.],  #
                [1.9, 1.9],  # [1.4, -1.4, 0.9] * ones kernel + 1 (bias).
            ],  #
        ]
        result = layer(input_ids, input_mask)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(expected, result)

    def test_embedding_lookup_random_init_no_mask(self, use_one_hot_lookup):
        vocab_size = 5
        embedding_size = 2

        input_ids = tf.constant([1, 0, 0, 3])
        input_size = input_ids.shape.as_list()[0]

        layer = readtwice_layers.EmbeddingLookup(
            vocab_size=vocab_size,
            embedding_size=embedding_size,
            use_one_hot_lookup=use_one_hot_lookup)

        result = layer(input_ids)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(result)
        self.assertAllEqual([input_size, embedding_size], result.shape)

    def test_embedding_lookup_no_projection(self, projection_size):
        # Create an embedding table with width = projection_size
        embedding_table = tf.constant([
            [1.0, -1.0, 0.5],  #
            [1.1, -1.1, -0.5],  #
            [1.2, -1.2, -0.2],  #
            [1.3, -1.3, 0.3],  #
            [1.4, -1.4, 0.4],  #
        ])
        vocab_size, embedding_size = embedding_table.shape.as_list()

        input_ids = tf.constant([
            [3, 2, 1],  #
            [4, 0, 4],  #
        ])

        layer = readtwice_layers.EmbeddingLookup(
            vocab_size=vocab_size,
            embedding_size=embedding_size,
            projection_size=projection_size,
            use_one_hot_lookup=True)

        layer.build(None)  # Shapes are unused so we pass None.
        layer.embedding_table = embedding_table

        expected = [
            [
                [1.3, -1.3, 0.3],  #
                [1.2, -1.2, -0.2],  #
                [1.1, -1.1, -0.5],  #
            ],  #
            [
                [1.4, -1.4, 0.4],  #
                [1.0, -1.0, 0.5],  #
                [1.4, -1.4, 0.4],  #
            ],  #
        ]
        result = layer(input_ids)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(expected, result)
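A note on what the tests above actually pin down: ids are looked up in the embedding table (via a one-hot matmul or tf.gather), an optional dense projection maps the result to `projection_size`, and masked positions come out as zeros even when the projection has a non-zero bias. The following is a minimal sketch of that contract under those assumptions, not the actual readtwice_layers.EmbeddingLookup implementation:

import tensorflow as tf


def embedding_lookup_sketch(embedding_table, input_ids, input_mask=None,
                            projection=None, use_one_hot_lookup=False):
    """Illustrative stand-in for the behavior the tests above expect."""
    vocab_size = embedding_table.shape[0]
    if use_one_hot_lookup:
        # One-hot matmul lookup; tensordot handles both 1-D and 2-D input_ids.
        one_hot_ids = tf.one_hot(input_ids, depth=vocab_size,
                                 dtype=embedding_table.dtype)
        embeddings = tf.tensordot(one_hot_ids, embedding_table, axes=1)
    else:
        embeddings = tf.gather(embedding_table, input_ids)
    if projection is not None:
        # E.g. a tf.keras.layers.Dense(projection_size), as in the test above.
        embeddings = projection(embeddings)
    if input_mask is not None:
        # Zero out masked positions *after* the projection, so a non-zero bias
        # cannot leak into them (exactly what the projection test checks).
        embeddings *= tf.cast(input_mask, embeddings.dtype)[..., tf.newaxis]
    return embeddings

With the table, ids, and mask from the first test, this sketch reproduces the `expected` values above.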
Example #5
    def __init__(self,
                 config,
                 num_layers_override,
                 num_cross_attention_heads,
                 enable_default_side_input=False,
                 use_one_hot_embeddings=False,
                 name="read_it_twice_decoder",
                 **kwargs):
        """Constructor for ReadItTwiceDecoderModel.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      num_layers_override: int. Number of Transformer layers.
      num_cross_attention_heads: int. Number of cross-attention heads.
      enable_default_side_input: Add a default side input, which acts like a
        no-op attention, effectively allowing attention weights to sum to
        something less than 1.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(ReadItTwiceDecoderModel, self).__init__(name=name, **kwargs)

        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.num_layers_override = num_layers_override
        self.num_cross_attention_heads = num_cross_attention_heads
        self.enable_default_side_input = enable_default_side_input

        if config.embedding_size is None:
            config = dataclasses.replace(config,
                                         embedding_size=config.hidden_size)
        self.config = config

        self.token_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=config.embedding_size,
            projection_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="token_emb_lookup")

        self.token_embedding_norm = tf.keras.layers.LayerNormalization(
            axis=-1, epsilon=1e-12, name="emb_layer_norm")
        self.token_embedding_dropout = tf.keras.layers.Dropout(
            rate=config.hidden_dropout_prob)

        self.position_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.max_seq_length,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="position_emb_lookup_long")
        # Call layers to force variable initialization.
        self.position_embedding(tf.ones([1, 1], tf.int32))

        self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=self.num_layers_override,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=False,
            num_cross_attention_heads=self.num_cross_attention_heads,
            enable_default_side_input=self.enable_default_side_input)
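A hedged construction sketch for the decoder. The config fields below are the ones this constructor reads; the values, and the assumption that model_config.ReadItTwiceBertConfig can be built directly like this, are illustrative only:

config = model_config.ReadItTwiceBertConfig(  # hypothetical field values
    vocab_size=30522,
    hidden_size=768,
    embedding_size=None,  # Replaced by hidden_size inside the constructor.
    max_seq_length=512,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",  # Assumed to be accepted by tensor_utils.get_activation.
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    initializer_range=0.02)

decoder = ReadItTwiceDecoderModel(
    config=config,
    num_layers_override=4,
    num_cross_attention_heads=12,
    enable_default_side_input=True,
    use_one_hot_embeddings=False)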
Example #6
    def __init__(self,
                 config,
                 use_one_hot_embeddings=False,
                 name="read_it_twice_bert",
                 **kwargs):
        """Constructor for ReadItTwiceBertModel.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(ReadItTwiceBertModel, self).__init__(name=name, **kwargs)

        self.use_one_hot_embeddings = use_one_hot_embeddings

        if config.cross_attention_top_k is not None:
            assert config.second_read_type == "cross_attend_once"

        if config.embedding_size is None:
            config = dataclasses.replace(config,
                                         embedding_size=config.hidden_size)

        self.config = config

        self.token_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=config.embedding_size,
            projection_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="token_emb_lookup")

        self.token_embedding_norm = tf.keras.layers.LayerNormalization(
            axis=-1, epsilon=1e-12, name="emb_layer_norm")
        self.token_embedding_dropout = tf.keras.layers.Dropout(
            rate=config.hidden_dropout_prob)

        self.position_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.max_seq_length,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="position_emb_lookup_long")
        # Call layers to force variable initialization.
        self.position_embedding(tf.ones([1, 1], tf.int32))

        if config.cross_attention_pos_emb_mode is not None:
            # We would end up adding block position embeddings multiple times.
            assert config.summary_postprocessing_type not in [
                "pos", "transformer"
            ]

        if config.second_read_type == "from_scratch":
            share_kv_projections_first_read = config.share_kv_projections
        else:
            # Summaries are not going to be used by the first read model anyway.
            share_kv_projections_first_read = True

        self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=share_kv_projections_first_read,
            name="transformer_layers")
        # grad_checkpointing_period=config.grad_checkpointing_period)

        self.summary_extraction = SummaryExtraction(
            config=config, use_one_hot_embeddings=use_one_hot_embeddings)

        if config.second_read_type == "new_layers":
            if config.second_read_num_new_layers is None:
                raise ValueError("Must specify `second_read_num_new_layers`"
                                 "when `second_read_type` is new_layers")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=config.share_kv_projections,
                name="transformer_layers")
        elif config.second_read_type == "cross_attend_once":
            if config.second_read_num_new_layers is None:
                raise ValueError(
                    "Must specify `second_read_num_new_layers`"
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_num_cross_attention_heads is None:
                raise ValueError(
                    "Must specify `second_read_num_cross_attention_heads`"
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_enable_default_side_input is None:
                raise ValueError(
                    "Must specify `second_read_enable_default_side_input`"
                    "when `second_read_type` is cross_attend_once")

            self.cross_attention_layer = readtwice_layers.ResidualBlock(
                inner_layer=readtwice_layers.SideAttention(
                    hidden_size=config.hidden_size,
                    num_heads=config.second_read_num_cross_attention_heads,
                    att_dropout_prob=0,
                    initializer=tf.keras.initializers.TruncatedNormal(
                        stddev=config.initializer_range),
                    top_k_attention=config.cross_attention_top_k,
                    pos_embed_mode=config.cross_attention_pos_emb_mode,
                    pos_embed_size=config.max_num_blocks_per_document,
                    use_one_hot_embeddings=use_one_hot_embeddings,
                    enable_default_side_input=(
                        config.second_read_enable_default_side_input)),
                dropout_probability=config.hidden_dropout_prob,
                use_pre_activation_order=False,
                name="cross_attention_layer")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=True,
                name="transformer_layers")
        elif config.second_read_type == "new_layers_cross_attention":
            if config.second_read_num_new_layers is None:
                raise ValueError(
                    "Must specify `second_read_num_new_layers` "
                    "when `second_read_type` is new_layers_cross_attention")
            if config.second_read_num_cross_attention_heads is None:
                raise ValueError(
                    "Must specify `second_read_num_cross_attention_heads` "
                    "when `second_read_type` is new_layers_cross_attention")
            if config.second_read_enable_default_side_input is None:
                raise ValueError(
                    "Must specify `second_read_enable_default_side_input` "
                    "when `second_read_type` is new_layers_cross_attention")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=True,
                num_cross_attention_heads=(
                    config.second_read_num_cross_attention_heads),
                enable_default_side_input=(
                    config.second_read_enable_default_side_input),
                name="transformer_layers")
        else:
            if config.second_read_type != "from_scratch":
                raise ValueError("Unknown `second_read_type`: '{}'".format(
                    config.second_read_type))
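To summarize the validation above, each `second_read_type` requires a different set of config fields. A hedged sketch (base_config is a hypothetical existing ReadItTwiceBertConfig; values are made up):

# "from_scratch"               - no extra fields are validated here.
# "new_layers"                 - second_read_num_new_layers.
# "cross_attend_once"          - second_read_num_new_layers,
#                                second_read_num_cross_attention_heads,
#                                second_read_enable_default_side_input
#                                (cross_attention_top_k is only allowed here).
# "new_layers_cross_attention" - the same three fields as "cross_attend_once".
config = dataclasses.replace(
    base_config,
    second_read_type="cross_attend_once",
    second_read_num_new_layers=2,
    second_read_num_cross_attention_heads=8,
    second_read_enable_default_side_input=True,
    cross_attention_top_k=16)
model = ReadItTwiceBertModel(config=config, use_one_hot_embeddings=False)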
Example #7
    def __init__(self,
                 config,
                 use_one_hot_embeddings,
                 name="summary_extraction",
                 **kwargs):
        """Constructor for SummaryExtraction.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      use_one_hot_embeddings: bool. Whether to use one-hot word embeddings or
        tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(SummaryExtraction, self).__init__(name=name, **kwargs)
        self.mode = config.summary_mode
        self.hidden_size = config.hidden_size
        self.postprocessing_type = config.summary_postprocessing_type
        self.use_sparse_memory_attention = config.use_sparse_memory_attention

        self.embedding_norm = None

        if self.mode == "cls":
            pass
        elif self.mode == "text_block":
            self.text_block_extract_every_x = config.text_block_extract_every_x
            assert self.text_block_extract_every_x is not None
            self.extraction_linear = tf.keras.layers.Dense(
                config.hidden_size,
                activation=None,
                kernel_initializer=(tf.truncated_normal_initializer(
                    stddev=config.initializer_range)),
                name="entity_pool_linear")
        elif self.mode == "entity":
            self.extraction_linear = tf.keras.layers.Dense(
                config.hidden_size,
                activation=None,
                kernel_initializer=(tf.truncated_normal_initializer(
                    stddev=config.initializer_range)),
                name="entity_pool_linear")
        else:
            raise ValueError("Unknown summary mode: {}".format(self.mode))

        if self.postprocessing_type == "none":
            self.postprocessing = None
        elif self.postprocessing_type == "linear":
            self.postprocessing = tf.keras.layers.Dense(
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=(tf.truncated_normal_initializer(
                    stddev=config.initializer_range)),
                name="cls_pool")
        elif self.postprocessing_type in ["pos", "transformer"]:
            self.position_embedding = readtwice_layers.EmbeddingLookup(
                vocab_size=config.max_num_blocks_per_document,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                use_one_hot_lookup=use_one_hot_embeddings,
                name="block_position_emb_lookup")
            # Call layers to force variable initialization.
            self.position_embedding(tf.ones([1, 1], tf.int32))
            self.embedding_norm = tf.keras.layers.LayerNormalization(
                axis=-1, epsilon=1e-12, name="summary_emb_layer_norm")
            self.embedding_dropout = tf.keras.layers.Dropout(
                rate=config.hidden_dropout_prob)

            if self.postprocessing_type == "transformer":
                if config.summary_postprocessing_num_layers is None:
                    raise ValueError(
                        "Must specify `postprocessing_num_layers`"
                        "when `postprocessing_type` is \"transformer\"")

                self.postprocessing = readtwice_layers.TransformerWithSideInputLayers(
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.summary_postprocessing_num_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    hidden_act=tensor_utils.get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        config.attention_probs_dropout_prob),
                    initializer_range=config.initializer_range,
                    share_kv_projections=True)
        else:
            raise ValueError("Unknown summary type: {}".format(
                self.postprocessing_type))
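Finally, a hedged construction sketch showing one valid combination of the summary options handled above (base_config and the field values are assumptions):

# summary_mode: "cls", "text_block" (also set text_block_extract_every_x),
# or "entity".
# summary_postprocessing_type: "none", "linear", "pos", or "transformer"
# ("transformer" also needs summary_postprocessing_num_layers).
config = dataclasses.replace(
    base_config,
    summary_mode="entity",
    summary_postprocessing_type="transformer",
    summary_postprocessing_num_layers=2)
summary_extraction = SummaryExtraction(
    config=config, use_one_hot_embeddings=False)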