Example #1
def dense_layer_3d_proj(input_tensor,
												hidden_size,
												head_size,
												initializer,
												activation,
												name=None):
	"""A dense layer with 3D kernel for projection.
	Args:
		input_tensor: float Tensor of shape [batch, from_seq_length,
			num_attention_heads, size_per_head].
		hidden_size: The size of the output hidden layer.
		head_size: The size of each attention head.
		initializer: Kernel initializer.
		activation: Activation function.
		name: The name scope of this layer.
	Returns:
		float logits Tensor.
	"""
	input_shape = albert_utils_official.get_shape_list(input_tensor)
	num_attention_heads = input_shape[2]
	with tf.variable_scope(name):
		w = tf.get_variable(
				name="kernel",
				shape=[num_attention_heads * head_size, hidden_size],
				initializer=initializer)
		w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
		b = tf.get_variable(
				name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
		ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
		ret += b
	if activation is not None:
		return activation(ret)
	else:
		return ret
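A minimal usage sketch for the function above, assuming TF 1.x and that the module defining it (including albert_utils_official) is in scope; the sizes are illustrative only.

import tensorflow as tf  # TF 1.x graph-mode API, as assumed by the code above

# Hypothetical sizes: batch=2, seq=8, heads=12, head_size=64, hidden=768.
context = tf.random.normal([2, 8, 12, 64])      # [batch, seq, heads, head_size]
projected = dense_layer_3d_proj(
    context,
    hidden_size=768,
    head_size=64,
    initializer=tf.truncated_normal_initializer(stddev=0.02),
    activation=None,
    name="output")
# projected has shape [2, 8, 768]: the per-head outputs merged back to hidden size.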
Example #2
def dense_layer_2d(input_tensor,
									 output_size,
									 initializer,
									 activation,
									 num_attention_heads=1,
									 name=None):
	"""A dense layer with 2D kernel.
	Args:
		input_tensor: Float tensor with rank 3.
		output_size: The size of output dimension.
		initializer: Kernel initializer.
		activation: Activation function.
		num_attention_heads: Number of attention heads in the attention layer.
		name: The name scope of this layer.
	Returns:
		float logits Tensor.
	"""
	del num_attention_heads  # unused
	input_shape = albert_utils_official.get_shape_list(input_tensor)
	hidden_size = input_shape[2]
	with tf.variable_scope(name):
		w = tf.get_variable(
				name="kernel",
				shape=[hidden_size, output_size],
				initializer=initializer)
		b = tf.get_variable(
				name="bias", shape=[output_size], initializer=tf.zeros_initializer)
		ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
		ret += b
	if activation is not None:
		return activation(ret)
	else:
		return ret
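The snippet below sketches how dense_layer_2d is used, e.g. for the embedding-to-hidden mapping inside transformer_model further down. TF 1.x and the surrounding module are assumed, and the sizes are made up.

import tensorflow as tf  # TF 1.x

embeddings = tf.random.normal([2, 8, 128])      # [batch, seq, embedding_size]
mapped = dense_layer_2d(
    embeddings,
    output_size=768,
    initializer=tf.truncated_normal_initializer(stddev=0.02),
    activation=None,
    name="embedding_hidden_mapping_in")
# mapped has shape [2, 8, 768].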
Example #3
def dot_product_attention(q, k, v, bias, dropout_rate=0.0):
    """Dot-product attention.
	Args:
		q: Tensor with shape [..., length_q, depth_k].
		k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
			match with q.
		v: Tensor with shape [..., length_kv, depth_v]. Leading dimensions must
			match with q.
		bias: bias Tensor (see attention_bias())
		dropout_rate: a float.
	Returns:
		Tensor with shape [..., length_q, depth_v].
	"""
    logits = tf.matmul(q, k, transpose_b=True)  # [..., length_q, length_kv]
    logits = tf.multiply(
        logits,
        1.0 / math.sqrt(float(albert_utils_official.get_shape_list(q)[-1])))
    if bias is not None:
        # `attention_mask` = [B, T]
        from_shape = albert_utils_official.get_shape_list(q)
        if len(from_shape) == 4:
            broadcast_ones = tf.ones([from_shape[0], 1, from_shape[2], 1],
                                     tf.float32)
        elif len(from_shape) == 5:
            # from_shape = [B, N, block_num, block_size, depth]
            broadcast_ones = tf.ones(
                [from_shape[0], 1, from_shape[2], from_shape[3], 1],
                tf.float32)

        bias = tf.matmul(broadcast_ones,
                         tf.cast(bias, tf.float32),
                         transpose_b=True)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - bias) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        logits += adder
    else:
        adder = 0.0

    attention_probs = tf.nn.softmax(logits, name="attention_probs")
    attention_probs = dropout(attention_probs, dropout_rate)
    return tf.matmul(attention_probs, v)
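A sketch of a direct call, assuming TF 1.x and that the module's helpers (get_shape_list, dropout) are in scope. The mask is shaped the way attention_layer (Example #10) reshapes it, [batch, 1, to_seq, 1], with 1.0 for positions to attend to.

import tensorflow as tf  # TF 1.x

B, N, F, T, H = 2, 12, 8, 8, 64                 # illustrative sizes
q = tf.random.normal([B, N, F, H])
k = tf.random.normal([B, N, T, H])
v = tf.random.normal([B, N, T, H])
mask = tf.ones([B, 1, T, 1])                    # 1.0 = attend, 0.0 = masked
context = dot_product_attention(q, k, v, mask, dropout_rate=0.0)
# context has shape [B, N, F, H].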
Example #4
    def build_encoder(self, input_ids, input_mask, hidden_dropout_prob,
                      attention_probs_dropout_prob, **kargs):
        reuse = kargs["reuse"]
        input_shape = albert_utils_official.get_shape_list(
            input_ids, expected_rank=[2, 3])
        batch_size = input_shape[0]
        seq_length = input_shape[1]
        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)
        with tf.variable_scope(self.config.get("scope", "bert"), reuse=reuse):
            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].

                if kargs.get('attention_type',
                             'efficient_attention') == 'normal_attention':
                    tf.logging.info("****** normal attention *******")
                    transformer_model = albert_modules_official.transformer_model
                else:
                    tf.logging.info("****** normal attention *******")
                    transformer_model = albert_modules_official.transformer_model
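                # NOTE: both branches above currently resolve to the standard
                # transformer_model, and the call below references the module
                # function directly rather than this local variable.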

                [
                    self.all_encoder_layers, self.all_attention_scores
                ] = albert_modules_official.transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=input_mask,
                    hidden_size=self.config.hidden_size,
                    num_hidden_layers=self.config.num_hidden_layers,
                    num_hidden_groups=self.config.num_hidden_groups,
                    num_attention_heads=self.config.num_attention_heads,
                    intermediate_size=self.config.intermediate_size,
                    inner_group_num=self.config.inner_group_num,
                    intermediate_act_fn=albert_modules_official.get_activation(
                        self.config.hidden_act),
                    hidden_dropout_prob=self.config.hidden_dropout_prob,
                    attention_probs_dropout_prob=self.config.
                    attention_probs_dropout_prob,
                    initializer_range=self.config.initializer_range,
                    do_return_all_layers=True)
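The comment above mentions converting a 2D mask [batch_size, seq_length] into a 3D mask [batch_size, seq_length, seq_length]; in the code the 2D input_mask is passed straight through, which works because attention_layer (Example #10) only reshapes it to [batch_size, 1, seq_length, 1]. For reference, a minimal sketch of the 2D-to-3D expansion the comment describes, written with plain TF ops (not necessarily this repo's helper):

import tensorflow as tf  # TF 1.x

def expand_input_mask(input_mask):
    # input_mask: int32 [batch, seq_length] -> float32 [batch, seq_length, seq_length].
    seq_length = tf.shape(input_mask)[1]
    to_mask = tf.cast(tf.expand_dims(input_mask, axis=1), tf.float32)   # [B, 1, T]
    return tf.tile(to_mask, tf.stack([1, seq_length, 1]))               # [B, F, T]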
Example #5
def embedding_lookup(input_ids,
										 vocab_size,
										 embedding_size=128,
										 initializer_range=0.02,
										 word_embedding_name="word_embeddings",
										 use_one_hot_embeddings=False):
	"""Looks up words embeddings for id tensor.
	Args:
		input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
			ids.
		vocab_size: int. Size of the embedding vocabulary.
		embedding_size: int. Width of the word embeddings.
		initializer_range: float. Embedding initialization range.
		word_embedding_name: string. Name of the embedding table.
		use_one_hot_embeddings: bool. If True, use one-hot method for word
			embeddings. If False, use `tf.nn.embedding_lookup()`.
	Returns:
		float Tensor of shape [batch_size, seq_length, embedding_size].
	"""
	# This function assumes that the input is of shape [batch_size, seq_length,
	# num_inputs].
	#
	# If the input is a 2D tensor of shape [batch_size, seq_length], we
	# reshape to [batch_size, seq_length, 1].
	if input_ids.shape.ndims == 2:
		input_ids = tf.expand_dims(input_ids, axis=[-1])

	embedding_table = tf.get_variable(
			name=word_embedding_name,
			shape=[vocab_size, embedding_size],
			initializer=create_initializer(initializer_range))

	if use_one_hot_embeddings:
		flat_input_ids = tf.reshape(input_ids, [-1])
		one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
		output = tf.matmul(one_hot_input_ids, embedding_table)
	else:
		output = tf.nn.embedding_lookup(embedding_table, input_ids)

	input_shape = albert_utils_official.get_shape_list(input_ids)

	output = tf.reshape(output,
											input_shape[0:-1] + [input_shape[-1] * embedding_size])
	return (output, embedding_table)
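A minimal call sketch, assuming TF 1.x and that the module above (including create_initializer) is in scope; vocabulary size and ids are illustrative.

import tensorflow as tf  # TF 1.x

input_ids = tf.constant([[5, 17, 2, 0], [8, 8, 1, 0]], dtype=tf.int32)  # [2, 4]
embeddings, embedding_table = embedding_lookup(
    input_ids,
    vocab_size=30000,
    embedding_size=128,
    use_one_hot_embeddings=False)
# embeddings has shape [2, 4, 128]; embedding_table is the [30000, 128] variable.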
Example #6
def gumbel_embedding_lookup(input_ids,
                            vocab_size,
                            embedding_size=128,
                            initializer_range=0.02,
                            word_embedding_name="word_embeddings",
                            use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

	Args:
		input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
			ids.
		vocab_size: int. Size of the embedding vocabulary.
		embedding_size: int. Width of the word embeddings.
		initializer_range: float. Embedding initialization range.
		word_embedding_name: string. Name of the embedding table.
		use_one_hot_embeddings: bool. If True, use one-hot method for word
			embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
			for TPUs.

	Returns:
		float Tensor of shape [batch_size, seq_length, embedding_size].
	"""
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].

    input_shape = albert_utils_official.get_shape_list(input_ids,
                                                       expected_rank=[3])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range))

    output = tf.einsum("abc,cd->abd", tf.cast(input_ids, tf.float32),
                       embedding_table)

    return (output, embedding_table)
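Unlike embedding_lookup, this variant expects soft token distributions of shape [batch, seq_length, vocab_size] (e.g. Gumbel-softmax samples) rather than integer ids. A sketch, assuming TF 1.x and the module above in scope:

import tensorflow as tf  # TF 1.x

vocab_size = 30000                                               # illustrative
soft_ids = tf.nn.softmax(tf.random.normal([2, 4, vocab_size]))   # [batch, seq, vocab]
soft_embeddings, embedding_table = gumbel_embedding_lookup(
    soft_ids,
    vocab_size=vocab_size,
    embedding_size=128)
# soft_embeddings has shape [2, 4, 128]: vocab-weighted mixtures of embedding rows.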
Example #7
def dense_layer_3d(input_tensor,
									 num_attention_heads,
									 head_size,
									 initializer,
									 activation,
									 name=None):
	"""A dense layer with 3D kernel.
	Args:
		input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
		num_attention_heads: Number of attention heads.
		head_size: The size per attention head.
		initializer: Kernel initializer.
		activation: Activation function.
		name: The name scope of this layer.
	Returns:
		float logits Tensor.
	"""

	input_shape = albert_utils_official.get_shape_list(input_tensor)
	hidden_size = input_shape[2]

	with tf.variable_scope(name):
		w = tf.get_variable(
				name="kernel",
				shape=[hidden_size, num_attention_heads * head_size],
				initializer=initializer)
		w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
		b = tf.get_variable(
				name="bias",
				shape=[num_attention_heads * head_size],
				initializer=tf.zeros_initializer)
		b = tf.reshape(b, [num_attention_heads, head_size])
		ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
		ret += b
	if activation is not None:
		return activation(ret)
	else:
		return ret
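A usage sketch mirroring how attention_layer (Example #10) builds its query projection; TF 1.x and the surrounding module are assumed.

import tensorflow as tf  # TF 1.x

hidden_states = tf.random.normal([2, 8, 768])    # [batch, seq, hidden]
q = dense_layer_3d(
    hidden_states,
    num_attention_heads=12,
    head_size=64,
    initializer=tf.truncated_normal_initializer(stddev=0.02),
    activation=None,
    name="query")
# q has shape [2, 8, 12, 64], i.e. [batch, seq, heads, head_size].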
Example #8
    def build_embedder(self, input_ids, token_type_ids, hidden_dropout_prob,
                       attention_probs_dropout_prob, **kargs):

        reuse = kargs["reuse"]
        input_shape = albert_utils_official.get_shape_list(
            input_ids, expected_rank=[2, 3])
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        if self.config.get('embedding_scope', None):
            embedding_scope = self.config['embedding_scope']
            other_embedding_scope = self.config.get("scope", "bert")
            tf.logging.info(
                "==using embedding scope of original model_config.embedding_scope: %s==",
                embedding_scope)
        else:
            embedding_scope = self.config.get("scope", "bert")
            other_embedding_scope = self.config.get("scope", "bert")
            tf.logging.info(
                "==using embedding scope of original model_config.scope: %s==",
                embedding_scope)

        with tf.variable_scope(embedding_scope, reuse=reuse):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output_word, self.embedding_table
                 ) = albert_modules_official.embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=self.config.vocab_size,
                     embedding_size=self.config.embedding_size,
                     initializer_range=self.config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=self.config.use_one_hot_embeddings)

                if kargs.get("perturbation", None):
                    self.embedding_output_word += kargs["perturbation"]
                    tf.logging.info(
                        " add word perturbation for robust learning ")

        with tf.variable_scope(other_embedding_scope, reuse=reuse):
            with tf.variable_scope("embeddings"):

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = albert_modules_official.embedding_postprocessor(
                    input_tensor=self.embedding_output_word,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=self.config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=self.config.initializer_range,
                    max_position_embeddings=self.config.
                    max_position_embeddings,
                    dropout_prob=hidden_dropout_prob,
                    token_type_ratio=self.config.get("token_type_ratio", 1.0))
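The same two-step pipeline can be sketched outside the class with the module-level functions from Examples #5 and #11 (embedding_lookup, then embedding_postprocessor); TF 1.x is assumed and the sizes are illustrative.

import tensorflow as tf  # TF 1.x

input_ids = tf.constant([[5, 17, 2, 0]], dtype=tf.int32)   # [1, 4]
token_type_ids = tf.zeros_like(input_ids)

with tf.variable_scope("bert"):
    with tf.variable_scope("embeddings"):
        word_emb, _ = embedding_lookup(
            input_ids, vocab_size=30000, embedding_size=128)
        emb_out = embedding_postprocessor(
            word_emb,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=2,
            use_position_embeddings=True,
            max_position_embeddings=512,
            dropout_prob=0.1)
# emb_out has shape [1, 4, 128].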
Example #9
def transformer_model(input_tensor,
											attention_mask=None,
											hidden_size=768,
											num_hidden_layers=12,
											num_hidden_groups=12,
											num_attention_heads=12,
											intermediate_size=3072,
											inner_group_num=1,
											intermediate_act_fn="gelu",
											hidden_dropout_prob=0.1,
											attention_probs_dropout_prob=0.1,
											initializer_range=0.02,
											do_return_all_layers=False):
	"""Multi-headed, multi-layer Transformer from "Attention is All You Need".
	This is almost an exact implementation of the original Transformer encoder.
	See the original paper:
	https://arxiv.org/abs/1706.03762
	Also see:
	https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
	Args:
		input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
		attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length],
			with 1 for positions that can be attended to and 0 for positions that
			should not be (it is passed down to `attention_layer`, which reshapes
			it to [batch_size, 1, seq_length, 1]).
		hidden_size: int. Hidden size of the Transformer.
		num_hidden_layers: int. Number of layers (blocks) in the Transformer.
		num_hidden_groups: int. Number of groups for the hidden layers; parameters
			in the same group are shared.
		num_attention_heads: int. Number of attention heads in the Transformer.
		intermediate_size: int. The size of the "intermediate" (a.k.a., feed
			forward) layer.
		inner_group_num: int. Number of inner repetitions of attention and ffn.
		intermediate_act_fn: function. The non-linear activation function to apply
			to the output of the intermediate/feed-forward layer.
		hidden_dropout_prob: float. Dropout probability for the hidden layers.
		attention_probs_dropout_prob: float. Dropout probability of the attention
			probabilities.
		initializer_range: float. Range of the initializer (stddev of truncated
			normal).
		do_return_all_layers: Whether to also return all layers or just the final
			layer.
	Returns:
		A tuple of (layer outputs, attention scores). If `do_return_all_layers`
		is True, each element is a list with one entry per layer; otherwise only
		the final layer's output, a float Tensor of shape [batch_size,
		seq_length, hidden_size], and its attention scores are returned.
	Raises:
		ValueError: A Tensor shape or parameter is invalid.
	"""
	if hidden_size % num_attention_heads != 0:
		raise ValueError(
				"The hidden size (%d) is not a multiple of the number of attention "
				"heads (%d)" % (hidden_size, num_attention_heads))

	attention_head_size = hidden_size // num_attention_heads
	input_shape = albert_utils_official.get_shape_list(input_tensor, expected_rank=3)
	input_width = input_shape[2]

	all_layer_outputs = []
	all_attention_scores = []
	if input_width != hidden_size:
		prev_output = dense_layer_2d(
				input_tensor, hidden_size, create_initializer(initializer_range),
				None, name="embedding_hidden_mapping_in")
	else:
		prev_output = input_tensor
	with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
		for layer_idx in range(num_hidden_layers):
			group_idx = int(layer_idx / num_hidden_layers * num_hidden_groups)
			with tf.variable_scope("group_%d" % group_idx):
				with tf.name_scope("layer_%d" % layer_idx):
					layer_output = prev_output
					for inner_group_idx in range(inner_group_num):
						with tf.variable_scope("inner_group_%d" % inner_group_idx):
							[layer_output, attention_scores] = attention_ffn_block(
									layer_output, hidden_size, attention_mask,
									num_attention_heads, attention_head_size,
									attention_probs_dropout_prob, intermediate_size,
									intermediate_act_fn, initializer_range, hidden_dropout_prob)
							prev_output = layer_output
							all_layer_outputs.append(layer_output)
							all_attention_scores.append(attention_scores)
	if do_return_all_layers:
		return all_layer_outputs, all_attention_scores
	else:
		return all_layer_outputs[-1], all_attention_scores[-1]
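A call sketch matching how build_encoder (Example #4) invokes it: a 2D [batch, seq_length] mask is passed through, and ALBERT-style parameter sharing is controlled by num_hidden_groups and inner_group_num. TF 1.x and the module above are assumed; tf.nn.relu stands in for the usual gelu activation obtained via get_activation.

import tensorflow as tf  # TF 1.x

embeddings = tf.random.normal([2, 8, 128])       # [batch, seq, embedding_size]
input_mask = tf.ones([2, 8], dtype=tf.int32)     # 1 = real token, 0 = padding
all_layers, all_scores = transformer_model(
    input_tensor=embeddings,
    attention_mask=input_mask,
    hidden_size=768,
    num_hidden_layers=12,
    num_hidden_groups=1,                         # one shared parameter group
    num_attention_heads=12,
    intermediate_size=3072,
    inner_group_num=1,
    intermediate_act_fn=tf.nn.relu,              # stand-in for gelu
    do_return_all_layers=True)
# all_layers[-1] has shape [2, 8, 768].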
Example #10
def attention_layer(from_tensor,
										to_tensor,
										attention_mask=None,
										num_attention_heads=1,
										query_act=None,
										key_act=None,
										value_act=None,
										attention_probs_dropout_prob=0.0,
										initializer_range=0.02,
										batch_size=None,
										from_seq_length=None,
										to_seq_length=None):
	"""Performs multi-headed attention from `from_tensor` to `to_tensor`.
	Args:
		from_tensor: float Tensor of shape [batch_size, from_seq_length,
			from_width].
		to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
		attention_mask: (optional) int32 Tensor of shape [batch_size,
			to_seq_length]; it is reshaped internally to [batch_size, 1,
			to_seq_length, 1]. The values should be 1 or 0. The attention scores
			will effectively be set to -infinity for any positions in the mask
			that are 0, and will be unchanged for positions that are 1.
		num_attention_heads: int. Number of attention heads.
		query_act: (optional) Activation function for the query transform.
		key_act: (optional) Activation function for the key transform.
		value_act: (optional) Activation function for the value transform.
		attention_probs_dropout_prob: (optional) float. Dropout probability of the
			attention probabilities.
		initializer_range: float. Range of the weight initializer.
		batch_size: (Optional) int. If the input is 2D, this might be the batch size
			of the 3D version of the `from_tensor` and `to_tensor`.
		from_seq_length: (Optional) If the input is 2D, this might be the seq length
			of the 3D version of the `from_tensor`.
		to_seq_length: (Optional) If the input is 2D, this might be the seq length
			of the 3D version of the `to_tensor`.
	Returns:
		float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
			size_per_head].
	Raises:
		ValueError: Any of the arguments or tensor shapes are invalid.
	"""
	from_shape = albert_utils_official.get_shape_list(from_tensor, expected_rank=[2, 3])
	to_shape = albert_utils_official.get_shape_list(to_tensor, expected_rank=[2, 3])
	size_per_head = int(from_shape[2]/num_attention_heads)

	if len(from_shape) != len(to_shape):
		raise ValueError(
				"The rank of `from_tensor` must match the rank of `to_tensor`.")

	if len(from_shape) == 3:
		batch_size = from_shape[0]
		from_seq_length = from_shape[1]
		to_seq_length = to_shape[1]
	elif len(from_shape) == 2:
		if (batch_size is None or from_seq_length is None or to_seq_length is None):
			raise ValueError(
					"When passing in rank 2 tensors to attention_layer, the values "
					"for `batch_size`, `from_seq_length`, and `to_seq_length` "
					"must all be specified.")

	# Scalar dimensions referenced here:
	#   B = batch size (number of sequences)
	#   F = `from_tensor` sequence length
	#   T = `to_tensor` sequence length
	#   N = `num_attention_heads`
	#   H = `size_per_head`

	# `query_layer` = [B, F, N, H]
	q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
										 create_initializer(initializer_range), query_act, "query")

	# `key_layer` = [B, T, N, H]
	k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
										 create_initializer(initializer_range), key_act, "key")
	# `value_layer` = [B, T, N, H]
	v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
										 create_initializer(initializer_range), value_act, "value")
	q = tf.transpose(q, [0, 2, 1, 3])
	k = tf.transpose(k, [0, 2, 1, 3])
	v = tf.transpose(v, [0, 2, 1, 3])
	if attention_mask is not None:
		attention_mask = tf.reshape(
				attention_mask, [batch_size, 1, to_seq_length, 1])
		# 'new_embeddings = [B, N, F, H]'
	new_embeddings = dot_product_attention(q, k, v, attention_mask,
																				 attention_probs_dropout_prob)

	return tf.transpose(new_embeddings, [0, 2, 1, 3])
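A self-attention call sketch, assuming TF 1.x and the module above. Note that the mask is reshaped internally to [batch_size, 1, to_seq_length, 1], so a [batch_size, to_seq_length] mask (as passed from build_encoder) is the shape that fits.

import tensorflow as tf  # TF 1.x

layer_input = tf.random.normal([2, 8, 768])       # [batch, seq, hidden]
attention_mask = tf.ones([2, 8], dtype=tf.int32)  # [batch, to_seq_length]
attn_output = attention_layer(
    from_tensor=layer_input,
    to_tensor=layer_input,                        # self-attention
    attention_mask=attention_mask,
    num_attention_heads=12,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02)
# attn_output has shape [2, 8, 12, 64]: [batch, from_seq, heads, size_per_head].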
Example #11
def embedding_postprocessor(input_tensor,
														use_token_type=False,
														token_type_ids=None,
														token_type_vocab_size=16,
														token_type_embedding_name="token_type_embeddings",
														use_position_embeddings=True,
														position_embedding_name="position_embeddings",
														initializer_range=0.02,
														max_position_embeddings=512,
														dropout_prob=0.1,
														token_type_ratio=1.0):
	"""Performs various post-processing on a word embedding tensor.
	Args:
		input_tensor: float Tensor of shape [batch_size, seq_length,
			embedding_size].
		use_token_type: bool. Whether to add embeddings for `token_type_ids`.
		token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
			Must be specified if `use_token_type` is True.
		token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
		token_type_embedding_name: string. The name of the embedding table variable
			for token type ids.
		use_position_embeddings: bool. Whether to add position embeddings for the
			position of each token in the sequence.
		position_embedding_name: string. The name of the embedding table variable
			for positional embeddings.
		initializer_range: float. Range of the weight initialization.
		max_position_embeddings: int. Maximum sequence length that might ever be
			used with this model. This can be longer than the sequence length of
			input_tensor, but cannot be shorter.
		dropout_prob: float. Dropout probability applied to the final output tensor.
		token_type_ratio: float. Scaling factor applied to the token type
			embeddings before they are added to the output.
	Returns:
		float tensor with same shape as `input_tensor`.
	Raises:
		ValueError: One of the tensor shapes or input values is invalid.
	"""
	input_shape = albert_utils_official.get_shape_list(input_tensor, expected_rank=3)
	batch_size = input_shape[0]
	seq_length = input_shape[1]
	width = input_shape[2]

	output = input_tensor

	if use_token_type:
		if token_type_ids is None:
			raise ValueError("`token_type_ids` must be specified if "
											 "`use_token_type` is True.")
		token_type_table = tf.get_variable(
				name=token_type_embedding_name,
				shape=[token_type_vocab_size, width],
				initializer=create_initializer(initializer_range))
		# This vocab will be small so we always do one-hot here, since it is always
		# faster for a small vocabulary.
		flat_token_type_ids = tf.reshape(token_type_ids, [-1])
		one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
		token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
		token_type_embeddings = tf.reshape(token_type_embeddings,
																			 [batch_size, seq_length, width])
		output += token_type_ratio * token_type_embeddings

	if use_position_embeddings:
		assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
		with tf.control_dependencies([assert_op]):
			full_position_embeddings = tf.get_variable(
					name=position_embedding_name,
					shape=[max_position_embeddings, width],
					initializer=create_initializer(initializer_range))
			# Since the position embedding table is a learned variable, we create it
			# using a (long) sequence length `max_position_embeddings`. The actual
			# sequence length might be shorter than this, for faster training of
			# tasks that do not have long sequences.
			#
			# So `full_position_embeddings` is effectively an embedding table
			# for position [0, 1, 2, ..., max_position_embeddings-1], and the current
			# sequence has positions [0, 1, 2, ... seq_length-1], so we can just
			# perform a slice.
			# position_embeddings = tf.slice(full_position_embeddings, [0, 0],
			# 															 [seq_length, -1])
			
			flat_pos_ids = tf.range(seq_length, dtype=tf.int32)
			one_hot_pos_ids = tf.one_hot(flat_pos_ids, depth=max_position_embeddings)
			position_embeddings = tf.matmul(one_hot_pos_ids, full_position_embeddings)

			num_dims = len(output.shape.as_list())

			# Only the last two dimensions are relevant (`seq_length` and `width`), so
			# we broadcast among the first dimensions, which is typically just
			# the batch size.
			position_broadcast_shape = []
			for _ in range(num_dims - 2):
				position_broadcast_shape.append(1)
			position_broadcast_shape.extend([seq_length, width])
			position_embeddings = tf.reshape(position_embeddings,
																			 position_broadcast_shape)
			output += position_embeddings

	output = layer_norm_and_dropout(output, dropout_prob)
	return output
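A small numpy check (illustrative sizes only) of the one-hot trick used above for position embeddings: multiplying one-hot position ids by the full table selects its first seq_length rows, matching the commented-out tf.slice.

import numpy as np

max_position, width, seq_length = 512, 128, 8
table = np.random.randn(max_position, width).astype(np.float32)
one_hot = np.eye(max_position, dtype=np.float32)[:seq_length]   # [seq, max_position]
via_matmul = one_hot @ table                                     # [seq, width]
via_slice = table[:seq_length]                                   # tf.slice equivalent
assert np.allclose(via_matmul, via_slice)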