def __init__(self,
             intermediate_size,
             intermediate_activation=tensor_utils.get_activation("gelu"),
             dropout_rate=0.0,
             name="span_prediction_head",
             **kwargs):
  """Constructor for SpanPredictionHead.

  Args:
    intermediate_size: Dimension of the intermediate representation of the
      MLP. If None, only a single linear layer will be applied.
    intermediate_activation: Activation function for the MLP.
    dropout_rate: Dropout rate.
    name: (Optional) name of the layer.
    **kwargs: Forwarded to super.
  """
  super(SpanPredictionHead, self).__init__(name=name, **kwargs)

  if intermediate_size is not None:
    self._intermediate_dense = tf.keras.layers.Dense(intermediate_size)
    self._intermediate_activation = tf.keras.layers.Activation(
        intermediate_activation)
    self._output_dropout = tf.keras.layers.Dropout(dropout_rate)
    self._output_layer_norm = tf.keras.layers.LayerNormalization()
  else:
    self._intermediate_dense = None
    self._intermediate_activation = None
    self._output_dropout = None
    self._output_layer_norm = None

  self._logits_dense = tf.keras.layers.Dense(2)
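# Usage sketch (illustrative, not part of the original code): construction
# only needs the arguments above; all sizes below are hypothetical. The final
# Dense(2) layer produces two logits per position.
#
#   head = SpanPredictionHead(intermediate_size=1024, dropout_rate=0.1)
#   shallow_head = SpanPredictionHead(intermediate_size=None)  # single linear layer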
def __init__(self,
             output_weights,
             hidden_size,
             name='language_model_loss',
             activation='gelu',
             initializer_range=0.02,
             **kwargs):
  """Constructor for LanguageModelLoss.

  Args:
    output_weights: Embeddings table.
    hidden_size: Input size.
    name: (Optional) name of the layer.
    activation: The non-linear activation function (function or string) in
      the 1-layer MLP decoder. Default is "gelu".
    initializer_range: The stdev of the truncated_normal_initializer for
      initializing all weight matrices.
    **kwargs: Forwarded to super.

  Raises:
    ValueError: Shape of the output_weights is not statically known.
  """
  super(LanguageModelLoss, self).__init__(name=name, **kwargs)

  # The output weights are the same as the input embeddings, but there is
  # an output-only bias for each token.
  self.output_weights = output_weights
  self.activation = activation
  self.initializer_range = initializer_range
  self.hidden_size = hidden_size
  self.vocab_size, self.embedding_size = tensor_utils.get_shape_list(
      self.output_weights, expected_rank=2, name='word embeddings table')
  if self.vocab_size is None:
    raise ValueError('`output_weights[0]` must be statically known.')

  self.linear_fn = tf.keras.layers.Dense(
      self.embedding_size,
      activation=tensor_utils.get_activation(self.activation),
      use_bias=True,
      kernel_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=self.initializer_range))
  self.layer_norm = tf.keras.layers.LayerNormalization(
      axis=-1, epsilon=0.001)
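# Usage sketch (illustrative, not part of the original code): `output_weights`
# is the word-embedding table, so the loss layer ties its output projection to
# the input embeddings. The variable name `embedding_table` and the shapes
# below are hypothetical; the table must have a statically known shape.
#
#   embedding_table = tf.Variable(  # [vocab_size, embedding_size]
#       tf.random.truncated_normal([30522, 128], stddev=0.02))
#   lm_loss_layer = LanguageModelLoss(
#       output_weights=embedding_table, hidden_size=768)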
def __init__(self,
             hidden_size,
             num_hidden_layers,
             num_attention_heads,
             intermediate_size=None,
             hidden_act=tensor_utils.get_activation('gelu'),
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             initializer_range=0.02,
             share_kv_projections=False,
             num_cross_attention_heads=None,
             enable_default_side_input=False,
             name='transformer_layers',
             **kwargs):
  """Init.

  Args:
    hidden_size: Size of the output hidden dimension. Must match the input
      hidden dimension size.
    num_hidden_layers: Number of Transformer layers. Each layer includes both
      an attention sublayer and a feed-forward sublayer.
    num_attention_heads: Number of attention heads. Must evenly divide
      `hidden_size`.
    intermediate_size: The size of the "intermediate" (i.e. feed-forward)
      layers. Defaults to 4 * hidden_size.
    hidden_act: The non-linear activation function in the intermediate
      layers.
    hidden_dropout_prob: The dropout probability for the attention and
      feed-forward residual blocks. Must be between 0.0 and 1.0.
    attention_probs_dropout_prob: Dropout probability for attention
      probabilities. Must be between 0.0 and 1.0.
    initializer_range: The standard deviation of the truncated normal
      initializer for initializing weight matrices not created by
      `linear_make_fn`. If zero, the scale of the truncated normal
      initializer will be tuned automatically according to the distribution
      of the inputs.
    share_kv_projections: If True, key and value projections will be shared
      between main-to-main and main-to-side components. This results in 1 key
      projection per layer instead of 2 (and similarly for value
      projections). Only relevant for fused side attention, NOT cross
      attention over the side input (when `num_cross_attention_heads` is not
      None).
    num_cross_attention_heads: If not None, adds a cross-attention layer over
      side inputs. In this case, side inputs will NOT be used in the
      `FusedSideAttention`. Must be greater than or equal to 0, where 0 means
      that the cross-attention layer will have a single attention head
      WITHOUT projection matrices.
    enable_default_side_input: Add a default side input, which acts like a
      no-op attention, effectively allowing attention weights to sum up to
      something less than 1. Currently only available for the cross attention
      over side inputs.
    name: Name of the layer.
    **kwargs: Forwarded to super.
  """
  super(TransformerWithSideInputLayers, self).__init__(name=name, **kwargs)

  if intermediate_size is None:
    intermediate_size = 4 * hidden_size
  if num_cross_attention_heads is not None:
    # This prevents allocating extra parameters for fused side attention,
    # since the side input will not be used there anyway.
    share_kv_projections = True

  self.hidden_size = hidden_size
  self.num_hidden_layers = num_hidden_layers
  self.num_attention_heads = num_attention_heads
  self.intermediate_size = intermediate_size
  self.hidden_act = hidden_act
  self.hidden_dropout_prob = hidden_dropout_prob
  self.attention_probs_dropout_prob = attention_probs_dropout_prob
  self.share_kv_projections = share_kv_projections
  self.num_cross_attention_heads = num_cross_attention_heads
  self.enable_default_side_input = enable_default_side_input

  if (self.enable_default_side_input and
      self.num_cross_attention_heads is None):
    raise ValueError('`enable_default_side_input` is only used when '
                     '`num_cross_attention_heads` is enabled.')
  if (self.num_cross_attention_heads is not None and
      self.num_cross_attention_heads < 0):
    raise ValueError('If `num_cross_attention_heads` is specified '
                     'it must be non-negative.')

  self.initializer = tf.keras.initializers.TruncatedNormal(
      stddev=initializer_range)
  self.attention_layers = []
  self.cross_attention_layers = []
  self.feed_forward_layers = []
  for i in range(num_hidden_layers):
    self.attention_layers.append(
        wrappers.ResidualBlock(
            inner_layer=attention.FusedSideAttention(
                hidden_size=self.hidden_size,
                num_heads=self.num_attention_heads,
                att_dropout_prob=self.attention_probs_dropout_prob,
                share_kv_projections=self.share_kv_projections,
                initializer=self.initializer),
            dropout_probability=self.hidden_dropout_prob,
            use_pre_activation_order=False,
            name='attention_layer_%d' % i))
    if self.num_cross_attention_heads is not None:
      self.cross_attention_layers.append(
          wrappers.ResidualBlock(
              inner_layer=attention.SideAttention(
                  hidden_size=self.hidden_size,
                  num_heads=self.num_cross_attention_heads,
                  att_dropout_prob=self.attention_probs_dropout_prob,
                  initializer=self.initializer,
                  enable_default_side_input=self.enable_default_side_input),
              dropout_probability=self.hidden_dropout_prob,
              use_pre_activation_order=False,
              name='cross_attention_layer_%d' % i))
    self.feed_forward_layers.append(
        wrappers.ResidualBlock(
            dropout_probability=self.hidden_dropout_prob,
            use_pre_activation_order=False,
            inner_intermediate_size=self.intermediate_size,
            inner_activation=self.hidden_act,
            inner_kernel_initializer=self.initializer,
            name='feed_forward_layer_%d' % i))
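# Usage sketch (illustrative, not part of the original code): a stack with
# fused side attention only, and one that also builds a cross-attention
# sublayer over the side input. All sizes below are hypothetical.
#
#   encoder = TransformerWithSideInputLayers(
#       hidden_size=768, num_hidden_layers=12, num_attention_heads=12)
#   encoder_with_cross_attention = TransformerWithSideInputLayers(
#       hidden_size=768, num_hidden_layers=2, num_attention_heads=12,
#       num_cross_attention_heads=12, enable_default_side_input=True)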
def __init__(self,
             config,
             num_layers_override,
             num_cross_attention_heads,
             enable_default_side_input=False,
             use_one_hot_embeddings=False,
             name="read_it_twice_decoder",
             **kwargs):
  """Constructor for ReadItTwiceDecoderModel.

  Args:
    config: `model_config.ReadItTwiceBertConfig` instance.
    num_layers_override: int. Number of Transformer layers.
    num_cross_attention_heads: int. Number of cross-attention heads.
    enable_default_side_input: Add a default side input, which acts like a
      no-op attention, effectively allowing attention weights to sum up to
      something less than 1.
    use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
      embeddings or tf.nn.embedding_lookup() for the word embeddings.
    name: (Optional) name of the layer.
    **kwargs: Forwarded to super.

  Raises:
    ValueError: The config is invalid.
  """
  super(ReadItTwiceDecoderModel, self).__init__(name=name, **kwargs)

  self.use_one_hot_embeddings = use_one_hot_embeddings
  self.num_layers_override = num_layers_override
  self.num_cross_attention_heads = num_cross_attention_heads
  self.enable_default_side_input = enable_default_side_input

  if config.embedding_size is None:
    config = dataclasses.replace(config, embedding_size=config.hidden_size)
  self.config = config

  self.token_embedding = readtwice_layers.EmbeddingLookup(
      vocab_size=config.vocab_size,
      embedding_size=config.embedding_size,
      projection_size=config.hidden_size,
      initializer_range=config.initializer_range,
      use_one_hot_lookup=use_one_hot_embeddings,
      name="token_emb_lookup")

  self.token_embedding_norm = tf.keras.layers.LayerNormalization(
      axis=-1, epsilon=1e-12, name="emb_layer_norm")
  self.token_embedding_dropout = tf.keras.layers.Dropout(
      rate=config.hidden_dropout_prob)

  self.position_embedding = readtwice_layers.EmbeddingLookup(
      vocab_size=config.max_seq_length,
      embedding_size=config.hidden_size,
      initializer_range=config.initializer_range,
      use_one_hot_lookup=use_one_hot_embeddings,
      name="position_emb_lookup_long")
  # Call layers to force variable initialization.
  self.position_embedding(tf.ones([1, 1], tf.int32))

  self.transformer_with_side_inputs = (
      readtwice_layers.TransformerWithSideInputLayers(
          hidden_size=config.hidden_size,
          num_hidden_layers=self.num_layers_override,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          hidden_act=tensor_utils.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          share_kv_projections=False,
          num_cross_attention_heads=self.num_cross_attention_heads,
          enable_default_side_input=self.enable_default_side_input))
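# Usage sketch (illustrative, not part of the original code): the decoder
# reuses the shared config and only overrides the layer count and the
# cross-attention setup. `config` below is assumed to be a
# `model_config.ReadItTwiceBertConfig` instance; the numbers are hypothetical.
#
#   decoder = ReadItTwiceDecoderModel(
#       config=config,
#       num_layers_override=2,
#       num_cross_attention_heads=12,
#       enable_default_side_input=True)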
def __init__(self, config, use_one_hot_embeddings=False, name="read_it_twice_bert", **kwargs): """Constructor for ReadItTwiceBertModel. Args: config: `model_config.ReadItTwiceBertConfig` instance. use_one_hot_embeddings: (optional) bool. Whether to use one-hot word embeddings or tf.nn.embedding_lookup() for the word embeddings. name: (Optional) name of the layer. **kwargs: Forwarded to super. Raises: ValueError: The config is invalid. """ super(ReadItTwiceBertModel, self).__init__(name=name, **kwargs) self.use_one_hot_embeddings = use_one_hot_embeddings if config.cross_attention_top_k is not None: assert config.second_read_type == "cross_attend_once" if config.embedding_size is None: config = dataclasses.replace(config, embedding_size=config.hidden_size) self.config = config self.token_embedding = readtwice_layers.EmbeddingLookup( vocab_size=config.vocab_size, embedding_size=config.embedding_size, projection_size=config.hidden_size, initializer_range=config.initializer_range, use_one_hot_lookup=use_one_hot_embeddings, name="token_emb_lookup") self.token_embedding_norm = tf.keras.layers.LayerNormalization( axis=-1, epsilon=1e-12, name="emb_layer_norm") self.token_embedding_dropout = tf.keras.layers.Dropout( rate=config.hidden_dropout_prob) self.position_embedding = readtwice_layers.EmbeddingLookup( vocab_size=config.max_seq_length, embedding_size=config.hidden_size, initializer_range=config.initializer_range, use_one_hot_lookup=use_one_hot_embeddings, name="position_emb_lookup_long") # Call layers to force variable initialization. self.position_embedding(tf.ones([1, 1], tf.int32)) if config.cross_attention_pos_emb_mode is not None: # We would end up adding block position embeddings multiple times. assert config.summary_postprocessing_type not in [ "pos", "transformer" ] if config.second_read_type == "from_scratch": share_kv_projections_first_read = config.share_kv_projections else: # Summaries are not going to be used by the first read model anyway. share_kv_projections_first_read = True self.transformer_with_side_inputs = readtwice_layers.TransformerWithSideInputLayers( hidden_size=config.hidden_size, num_hidden_layers=config.num_hidden_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=tensor_utils.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, share_kv_projections=share_kv_projections_first_read, name="transformer_layers") # grad_checkpointing_period=config.grad_checkpointing_period) self.summary_extraction = SummaryExtraction( config=config, use_one_hot_embeddings=use_one_hot_embeddings) if config.second_read_type == "new_layers": if config.second_read_num_new_layers is None: raise ValueError("Must specify `second_read_num_new_layers`" "when `second_read_type` is new_layers") self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers( hidden_size=config.hidden_size, num_hidden_layers=config.second_read_num_new_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=tensor_utils.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. 
attention_probs_dropout_prob, initializer_range=config.initializer_range, share_kv_projections=config.share_kv_projections, name="transformer_layers") elif config.second_read_type == "cross_attend_once": if config.second_read_num_new_layers is None: raise ValueError( "Must specify `second_read_num_new_layers`" "when `second_read_type` is cross_attend_once") if config.second_read_num_cross_attention_heads is None: raise ValueError( "Must specify `second_read_num_cross_attention_heads`" "when `second_read_type` is cross_attend_once") if config.second_read_enable_default_side_input is None: raise ValueError( "Must specify `second_read_enable_default_side_input`" "when `second_read_type` is cross_attend_once") self.cross_attention_layer = readtwice_layers.ResidualBlock( inner_layer=readtwice_layers.SideAttention( hidden_size=config.hidden_size, num_heads=config.second_read_num_cross_attention_heads, att_dropout_prob=0, initializer=tf.keras.initializers.TruncatedNormal( stddev=config.initializer_range), top_k_attention=config.cross_attention_top_k, pos_embed_mode=config.cross_attention_pos_emb_mode, pos_embed_size=config.max_num_blocks_per_document, use_one_hot_embeddings=use_one_hot_embeddings, enable_default_side_input=config. second_read_enable_default_side_input), dropout_probability=config.hidden_dropout_prob, use_pre_activation_order=False, name="cross_attention_layer") self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers( hidden_size=config.hidden_size, num_hidden_layers=config.second_read_num_new_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=tensor_utils.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, share_kv_projections=True, name="transformer_layers") elif config.second_read_type == "new_layers_cross_attention": if config.second_read_num_new_layers is None: raise ValueError( "Must specify `second_read_num_new_layers`" "when `second_read_type` is cross_attend_once") if config.second_read_num_cross_attention_heads is None: raise ValueError( "Must specify `second_read_num_cross_attention_heads`" "when `second_read_type` is cross_attend_once") if config.second_read_enable_default_side_input is None: raise ValueError( "Must specify `second_read_enable_default_side_input`" "when `second_read_type` is cross_attend_once") self.second_read_transformer = readtwice_layers.TransformerWithSideInputLayers( hidden_size=config.hidden_size, num_hidden_layers=config.second_read_num_new_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=tensor_utils.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, share_kv_projections=True, num_cross_attention_heads=( config.second_read_num_cross_attention_heads), enable_default_side_input=( config.second_read_enable_default_side_input), name="transformer_layers") else: if config.second_read_type != "from_scratch": raise ValueError("Unknown `second_read_type`: '{}'".format( config.second_read_type))
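# Usage sketch (illustrative, not part of the original code): the
# `second_read_type` field of the config selects which of the branches above
# is built. The fields below mirror the ones referenced in this constructor;
# the concrete values (and the elided remaining fields) are hypothetical.
#
#   config = model_config.ReadItTwiceBertConfig(
#       ...,  # vocab_size, hidden_size, num_hidden_layers, etc.
#       second_read_type="cross_attend_once",
#       second_read_num_new_layers=2,
#       second_read_num_cross_attention_heads=12,
#       second_read_enable_default_side_input=True)
#   model = ReadItTwiceBertModel(config=config, use_one_hot_embeddings=False)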
def __init__(self, config, use_one_hot_embeddings, name="summary_extraction", **kwargs): """Constructor for SummaryExtraction. Args: config: `model_config.ReadItTwiceBertConfig` instance. use_one_hot_embeddings: (optional) bool. Whether to use one-hot word embeddings or tf.nn.embedding_lookup() for the word embeddings. name: (Optional) name of the layer. **kwargs: Forwarded to super. Raises: ValueError: The config is invalid. """ super(SummaryExtraction, self).__init__(name=name, **kwargs) self.mode = config.summary_mode self.hidden_size = config.hidden_size self.postprocessing_type = config.summary_postprocessing_type self.use_sparse_memory_attention = (config.use_sparse_memory_attention) self.embedding_norm = None if self.mode == "cls": pass elif self.mode == "text_block": self.text_block_extract_every_x = config.text_block_extract_every_x assert self.text_block_extract_every_x is not None self.extraction_linear = tf.keras.layers.Dense( config.hidden_size, activation=None, kernel_initializer=(tf.truncated_normal_initializer( stddev=config.initializer_range)), name="entity_pool_linear") elif self.mode == "entity": self.extraction_linear = tf.keras.layers.Dense( config.hidden_size, activation=None, kernel_initializer=(tf.truncated_normal_initializer( stddev=config.initializer_range)), name="entity_pool_linear") else: raise ValueError("Unknown summary mode: {}".format(self.mode)) if self.postprocessing_type == "none": self.postprocessing = None elif self.postprocessing_type == "linear": self.postprocessing = tf.keras.layers.Dense( config.hidden_size, activation=tf.tanh, kernel_initializer=(tf.truncated_normal_initializer( stddev=config.initializer_range)), name="cls_pool") elif self.postprocessing_type in ["pos", "transformer"]: self.position_embedding = readtwice_layers.EmbeddingLookup( vocab_size=config.max_num_blocks_per_document, embedding_size=config.hidden_size, initializer_range=config.initializer_range, use_one_hot_lookup=use_one_hot_embeddings, name="block_position_emb_lookup") # Call layers to force variable initialization. self.position_embedding(tf.ones([1, 1], tf.int32)) self.embedding_norm = tf.keras.layers.LayerNormalization( axis=-1, epsilon=1e-12, name="summary_emb_layer_norm") self.embedding_dropout = tf.keras.layers.Dropout( rate=config.hidden_dropout_prob) if self.postprocessing_type == "transformer": if config.summary_postprocessing_num_layers is None: raise ValueError( "Must specify `postprocessing_num_layers`" "when `postprocessing_type` is \"transformer\"") self.postprocessing = readtwice_layers.TransformerWithSideInputLayers( hidden_size=config.hidden_size, num_hidden_layers=config.summary_postprocessing_num_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, hidden_act=tensor_utils.get_activation(config.hidden_act), hidden_dropout_prob=config.hidden_dropout_prob, attention_probs_dropout_prob=config. attention_probs_dropout_prob, initializer_range=config.initializer_range, share_kv_projections=True) else: raise ValueError("Unknown summary type: {}".format( self.postprocessing_type))