    def test_transformer_with_side_inputs_layers(
            self, attention_probs_dropout_prob, side_seq_len,
            share_kv_projections, num_cross_attention_heads,
            enable_default_side_input):
        tf.compat.v1.random.set_random_seed(1234)
        np.random.seed(1234)

        batch_size = 5
        main_seq_len = 11
        seq_len = main_seq_len
        num_heads = 7
        hidden_size = 21
        num_layers = 2

        # We use `placeholder_with_default` to simulate the TF v1 situation where
        # the static `batch_size` is unknown.
        inputs = tf.compat.v1.placeholder_with_default(
            np.random.normal(size=[batch_size, seq_len, hidden_size]).astype(
                np.float32),
            shape=[None, None, hidden_size])
        if side_seq_len > 0:
            seq_len += side_seq_len
            side_inputs = tf.compat.v1.placeholder_with_default(
                np.random.normal(size=[side_seq_len, hidden_size]).astype(
                    np.float32),
                shape=[None, hidden_size])
        else:
            side_inputs = None

        att_mask = tf.compat.v1.placeholder_with_default(
            np.random.binomial(n=1,
                               p=0.9,
                               size=[batch_size, main_seq_len, seq_len]),
            shape=[None, None, None])

        layer = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            share_kv_projections=share_kv_projections,
            num_cross_attention_heads=num_cross_attention_heads,
            enable_default_side_input=enable_default_side_input)

        result = layer(inputs, side_input=side_inputs, att_mask=att_mask)

        static_batch_size = inputs.shape.as_list()[0]
        self.assertAllEqual([static_batch_size, main_seq_len, hidden_size],
                            result.shape.as_list())

        self.assertNotEmpty(layer.feed_forward_layers)

    def test_transformer_with_multiple_side_inputs_layers(self):
        tf.compat.v1.random.set_random_seed(1234)
        np.random.seed(1234)

        batch_size = 5
        main_seq_len = 11
        seq_len = main_seq_len
        num_heads = 7
        hidden_size = 21
        num_layers = 3

        # We use `placeholder_with_default` to simulate the TF v1 situation where
        # the static `batch_size` is unknown.
        inputs = tf.compat.v1.placeholder_with_default(
            np.random.normal(size=[batch_size, seq_len, hidden_size]).astype(
                np.float32),
            shape=[None, seq_len, hidden_size])

        side_inputs = []
        for _ in range(num_layers):
            side_seq_len = np.random.randint(1, 10)
            side_input = tf.compat.v1.placeholder_with_default(
                np.random.normal(
                    size=[batch_size, side_seq_len, hidden_size]).astype(
                        np.float32),
                shape=[None, side_seq_len, hidden_size])
            side_inputs.append(side_input)

        layer = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            attention_probs_dropout_prob=0.1,
            share_kv_projections=False)

        result = layer(inputs, side_input=side_inputs)

        static_batch_size = inputs.shape.as_list()[0]
        self.assertAllEqual([static_batch_size, main_seq_len, hidden_size],
                            result.shape.as_list())

        self.assertNotEmpty(layer.feed_forward_layers)
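
Read together, the two tests exercise both call conventions for `side_input`: a
single tensor shared by every layer, or a Python list with one entry per hidden
layer. The sketch below mirrors those conventions; it assumes `readtwice_layers`,
`numpy` and TensorFlow are imported as in the tests, and the shapes, dropout value,
and omission of `att_mask` are illustrative only.

num_layers = 2
layer = readtwice_layers.TransformerWithSideInputLayers(
    hidden_size=21,
    num_hidden_layers=num_layers,
    num_attention_heads=7,
    attention_probs_dropout_prob=0.0,
    share_kv_projections=False)

inputs = tf.constant(np.random.normal(size=[5, 11, 21]).astype(np.float32))

# (a) One side input shared by all layers: [side_seq_len, hidden_size],
#     as in the first test above.
shared_side = tf.constant(np.random.normal(size=[3, 21]).astype(np.float32))
outputs_shared = layer(inputs, side_input=shared_side)

# (b) One side input per hidden layer: a list of
#     [batch_size, side_seq_len, hidden_size] tensors, as in the second test.
per_layer_side = [
    tf.constant(np.random.normal(size=[5, 4, 21]).astype(np.float32))
    for _ in range(num_layers)
]
outputs_per_layer = layer(inputs, side_input=per_layer_side)
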
Example #3
    def __init__(self,
                 config,
                 use_one_hot_embeddings=False,
                 name="read_it_twice_bert",
                 **kwargs):
        """Constructor for ReadItTwiceBertModel.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(ReadItTwiceBertModel, self).__init__(name=name, **kwargs)

        self.use_one_hot_embeddings = use_one_hot_embeddings

        if config.cross_attention_top_k is not None:
            assert config.second_read_type == "cross_attend_once"

        if config.embedding_size is None:
            config = dataclasses.replace(config,
                                         embedding_size=config.hidden_size)

        self.config = config

        self.token_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=config.embedding_size,
            projection_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="token_emb_lookup")

        self.token_embedding_norm = tf.keras.layers.LayerNormalization(
            axis=-1, epsilon=1e-12, name="emb_layer_norm")
        self.token_embedding_dropout = tf.keras.layers.Dropout(
            rate=config.hidden_dropout_prob)

        self.position_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.max_seq_length,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="position_emb_lookup_long")
        # Call layers to force variable initialization.
        self.position_embedding(tf.ones([1, 1], tf.int32))

        if config.cross_attention_pos_emb_mode is not None:
            # We would end up adding block position embeddings multiple times.
            assert config.summary_postprocessing_type not in [
                "pos", "transformer"
            ]

        if config.second_read_type == "from_scratch":
            share_kv_projections_first_read = config.share_kv_projections
        else:
            # Summaries are not going to be used by the first read model anyway.
            share_kv_projections_first_read = True

        self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=share_kv_projections_first_read,
            name="transformer_layers")
        # grad_checkpointing_period=config.grad_checkpointing_period)

        self.summary_extraction = SummaryExtraction(
            config=config, use_one_hot_embeddings=use_one_hot_embeddings)

        if config.second_read_type == "new_layers":
            if config.second_read_num_new_layers is None:
                raise ValueError("Must specify `second_read_num_new_layers` "
                                 "when `second_read_type` is new_layers")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=config.share_kv_projections,
                name="transformer_layers")
        elif config.second_read_type == "cross_attend_once":
            if config.second_read_num_new_layers is None:
                raise ValueError(
                    "Must specify `second_read_num_new_layers` "
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_num_cross_attention_heads is None:
                raise ValueError(
                    "Must specify `second_read_num_cross_attention_heads` "
                    "when `second_read_type` is cross_attend_once")
            if config.second_read_enable_default_side_input is None:
                raise ValueError(
                    "Must specify `second_read_enable_default_side_input` "
                    "when `second_read_type` is cross_attend_once")

            self.cross_attention_layer = readtwice_layers.ResidualBlock(
                inner_layer=readtwice_layers.SideAttention(
                    hidden_size=config.hidden_size,
                    num_heads=config.second_read_num_cross_attention_heads,
                    att_dropout_prob=0,
                    initializer=tf.keras.initializers.TruncatedNormal(
                        stddev=config.initializer_range),
                    top_k_attention=config.cross_attention_top_k,
                    pos_embed_mode=config.cross_attention_pos_emb_mode,
                    pos_embed_size=config.max_num_blocks_per_document,
                    use_one_hot_embeddings=use_one_hot_embeddings,
                    enable_default_side_input=(
                        config.second_read_enable_default_side_input)),
                dropout_probability=config.hidden_dropout_prob,
                use_pre_activation_order=False,
                name="cross_attention_layer")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=True,
                name="transformer_layers")
        elif config.second_read_type == "new_layers_cross_attention":
            if config.second_read_num_new_layers is None:
                raise ValueError(
                    "Must specify `second_read_num_new_layers` "
                    "when `second_read_type` is new_layers_cross_attention")
            if config.second_read_num_cross_attention_heads is None:
                raise ValueError(
                    "Must specify `second_read_num_cross_attention_heads` "
                    "when `second_read_type` is new_layers_cross_attention")
            if config.second_read_enable_default_side_input is None:
                raise ValueError(
                    "Must specify `second_read_enable_default_side_input` "
                    "when `second_read_type` is new_layers_cross_attention")

            self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers(
                hidden_size=config.hidden_size,
                num_hidden_layers=config.second_read_num_new_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                hidden_act=tensor_utils.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                share_kv_projections=True,
                num_cross_attention_heads=(
                    config.second_read_num_cross_attention_heads),
                enable_default_side_input=(
                    config.second_read_enable_default_side_input),
                name="transformer_layers")
        else:
            if config.second_read_type != "from_scratch":
                raise ValueError("Unknown `second_read_type`: '{}'".format(
                    config.second_read_type))
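
The branch on `second_read_type` above determines which extra layers the model owns.
A hedged construction sketch, assuming an existing `config` of type
`model_config.ReadItTwiceBertConfig` whose remaining fields already hold sensible
values; the layer and head counts below are illustrative:

import dataclasses

# "from_scratch": the second read reuses `transformer_with_side_inputs`,
# so no additional transformer stack is created.
model_scratch = ReadItTwiceBertModel(
    dataclasses.replace(config, second_read_type="from_scratch"))

# "new_layers": a separate stack of `second_read_num_new_layers` layers.
model_new = ReadItTwiceBertModel(
    dataclasses.replace(
        config,
        second_read_type="new_layers",
        second_read_num_new_layers=2))
# model_new.second_read_transformer is now defined.

# "cross_attend_once": one SideAttention residual block followed by a new
# transformer stack with shared key/value projections.
model_cross = ReadItTwiceBertModel(
    dataclasses.replace(
        config,
        second_read_type="cross_attend_once",
        second_read_num_new_layers=2,
        second_read_num_cross_attention_heads=4,
        second_read_enable_default_side_input=True))
# Both cross_attention_layer and second_read_transformer are created here.
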
Example #4
    def __init__(self,
                 config,
                 num_layers_override,
                 num_cross_attention_heads,
                 enable_default_side_input=False,
                 use_one_hot_embeddings=False,
                 name="read_it_twice_decoder",
                 **kwargs):
        """Constructor for ReadItTwiceDecoderModel.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      num_layers_override: int. Number of Transformer layers.
      num_cross_attention_heads: int. Number of cross-attention heads.
      enable_default_side_input: Add a default side input, which acts like a
        no-op attention, effectively allowing the attention weights to sum to
        less than 1.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(ReadItTwiceDecoderModel, self).__init__(name=name, **kwargs)

        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.num_layers_override = num_layers_override
        self.num_cross_attention_heads = num_cross_attention_heads
        self.enable_default_side_input = enable_default_side_input

        if config.embedding_size is None:
            config = dataclasses.replace(config,
                                         embedding_size=config.hidden_size)
        self.config = config

        self.token_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.vocab_size,
            embedding_size=config.embedding_size,
            projection_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="token_emb_lookup")

        self.token_embedding_norm = tf.keras.layers.LayerNormalization(
            axis=-1, epsilon=1e-12, name="emb_layer_norm")
        self.token_embedding_dropout = tf.keras.layers.Dropout(
            rate=config.hidden_dropout_prob)

        self.position_embedding = readtwice_layers.EmbeddingLookup(
            vocab_size=config.max_seq_length,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            use_one_hot_lookup=use_one_hot_embeddings,
            name="position_emb_lookup_long")
        # Call layers to force variable initialization.
        self.position_embedding(tf.ones([1, 1], tf.int32))

        self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers(
            hidden_size=config.hidden_size,
            num_hidden_layers=self.num_layers_override,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            hidden_act=tensor_utils.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            share_kv_projections=False,
            num_cross_attention_heads=self.num_cross_attention_heads,
            enable_default_side_input=self.enable_default_side_input)
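
A corresponding construction sketch for the decoder, again assuming an existing
`config`; the override values below are purely illustrative:

decoder = ReadItTwiceDecoderModel(
    config=config,
    num_layers_override=4,
    num_cross_attention_heads=4,
    enable_default_side_input=True,
    use_one_hot_embeddings=False)
# The decoder's transformer stack has `num_layers_override` layers and
# per-layer cross-attention with `num_cross_attention_heads` heads,
# independent of config.num_hidden_layers.
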
Example #5
    def __init__(self,
                 config,
                 use_one_hot_embeddings,
                 name="summary_extraction",
                 **kwargs):
        """Constructor for SummaryExtraction.

    Args:
      config: `model_config.ReadItTwiceBertConfig` instance.
      use_one_hot_embeddings: bool. Whether to use one-hot word embeddings or
        tf.nn.embedding_lookup() for the word embeddings.
      name: (Optional) name of the layer.
      **kwargs: Forwarded to super.

    Raises:
      ValueError: The config is invalid.
    """
        super(SummaryExtraction, self).__init__(name=name, **kwargs)
        self.mode = config.summary_mode
        self.hidden_size = config.hidden_size
        self.postprocessing_type = config.summary_postprocessing_type
        self.use_sparse_memory_attention = config.use_sparse_memory_attention

        self.embedding_norm = None

        if self.mode == "cls":
            pass
        elif self.mode == "text_block":
            self.text_block_extract_every_x = config.text_block_extract_every_x
            assert self.text_block_extract_every_x is not None
            self.extraction_linear = tf.keras.layers.Dense(
                config.hidden_size,
                activation=None,
                kernel_initializer=(tf.truncated_normal_initializer(
                    stddev=config.initializer_range)),
                name="entity_pool_linear")
        elif self.mode == "entity":
            self.extraction_linear = tf.keras.layers.Dense(
                config.hidden_size,
                activation=None,
                kernel_initializer=(tf.truncated_normal_initializer(
                    stddev=config.initializer_range)),
                name="entity_pool_linear")
        else:
            raise ValueError("Unknown summary mode: {}".format(self.mode))

        if self.postprocessing_type == "none":
            self.postprocessing = None
        elif self.postprocessing_type == "linear":
            self.postprocessing = tf.keras.layers.Dense(
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=(tf.truncated_normal_initializer(
                    stddev=config.initializer_range)),
                name="cls_pool")
        elif self.postprocessing_type in ["pos", "transformer"]:
            self.position_embedding = readtwice_layers.EmbeddingLookup(
                vocab_size=config.max_num_blocks_per_document,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                use_one_hot_lookup=use_one_hot_embeddings,
                name="block_position_emb_lookup")
            # Call layers to force variable initialization.
            self.position_embedding(tf.ones([1, 1], tf.int32))
            self.embedding_norm = tf.keras.layers.LayerNormalization(
                axis=-1, epsilon=1e-12, name="summary_emb_layer_norm")
            self.embedding_dropout = tf.keras.layers.Dropout(
                rate=config.hidden_dropout_prob)

            if self.postprocessing_type == "transformer":
                if config.summary_postprocessing_num_layers is None:
                    raise ValueError(
                        "Must specify `summary_postprocessing_num_layers` "
                        "when `summary_postprocessing_type` is \"transformer\"")

                self.postprocessing = readtwice_layers.TransformerWithSideInputLayers(
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.summary_postprocessing_num_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    hidden_act=tensor_utils.get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        config.attention_probs_dropout_prob),
                    initializer_range=config.initializer_range,
                    share_kv_projections=True)
        else:
            raise ValueError("Unknown summary postprocessing type: {}".format(
                self.postprocessing_type))
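
Finally, a hedged sketch of how the summary configuration above might be selected,
assuming an existing `config` dataclass instance; the chosen modes are illustrative:

import dataclasses

# summary_mode: "cls" (no extraction layer is added), "text_block", or
# "entity" (the latter two add a Dense pooling layer).
# summary_postprocessing_type: "none", "linear", "pos", or "transformer";
# "pos" and "transformer" also add block-position embeddings.
extraction = SummaryExtraction(
    config=dataclasses.replace(
        config,
        summary_mode="entity",
        summary_postprocessing_type="transformer",
        summary_postprocessing_num_layers=1),
    use_one_hot_embeddings=False)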