def build(self, input_shape):
    input_rank = len(input_shape)
    input_shape = tf.TensorShape(input_shape)
    free_input_dims = input_rank - self._num_summed_dimensions
    output_dims = len(self._output_shape)
    self._einsum_string = self._build_einsum_string(
        free_input_dims, self._num_summed_dimensions, output_dims)

    # This is only saved for testing purposes.
    self._kernel_shape = (
        input_shape[free_input_dims:].concatenate(self._output_shape))

    with tf.variable_scope(self._name):
        self._kernel = tf.get_variable(
            'kernel',
            shape=self._kernel_shape,
            initializer=self._kernel_initializer,
            regularizer=self._kernel_regularizer,
            constraint=self._kernel_constraint,
            dtype=self.dtype,
            trainable=True)
        if self._use_bias:
            self._bias = tf.get_variable(
                'bias',
                shape=self._output_shape,
                initializer=self._bias_initializer,
                regularizer=self._bias_regularizer,
                constraint=self._bias_constraint,
                dtype=self.dtype,
                trainable=True)
        else:
            self._bias = None

    super(DenseEinsum, self).build(input_shape)
def __init__(self,
             is_training,
             input_tensor,
             label_ids,
             sample_weight=None,
             scope='mrc',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[2],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)

        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, 2])
        logits = tf.transpose(logits, [0, 2, 1])
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        self.probs[name] = probs

        start_one_hot_labels = tf.one_hot(
            label_ids[:, 0], depth=seq_length, dtype=tf.float32)
        end_one_hot_labels = tf.one_hot(
            label_ids[:, 1], depth=seq_length, dtype=tf.float32)
        start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
        end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
        per_example_loss = (
            -0.5 * tf.reduce_sum(
                start_one_hot_labels * start_log_probs, axis=-1)
            - 0.5 * tf.reduce_sum(
                end_one_hot_labels * end_log_probs, axis=-1))
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses[name] = per_example_loss

        start_preds = tf.expand_dims(
            tf.argmax(logits[:, 0, :], axis=-1), axis=-1)
        end_preds = tf.expand_dims(
            tf.argmax(logits[:, 1, :], axis=-1), axis=-1)
        self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
def lm_loss(hidden,
            target,
            n_token,
            d_model,
            initializer,
            lookup_table=None,
            tie_weight=False,
            bi_data=True,
            use_tpu=False):
    '''doc.'''
    with tf.variable_scope('lm_loss'):
        if tie_weight:
            assert lookup_table is not None, \
                'lookup_table cannot be None for tie_weight'
            softmax_w = lookup_table
        else:
            softmax_w = tf.get_variable(
                'weight', [n_token, d_model],
                dtype=hidden.dtype, initializer=initializer)

        softmax_b = tf.get_variable(
            'bias', [n_token], dtype=hidden.dtype,
            initializer=tf.zeros_initializer())

        logits = tf.einsum('ibd,nd->ibn', hidden, softmax_w) + softmax_b
        preds = tf.argmax(logits, axis=-1)

        if use_tpu:
            one_hot_target = tf.one_hot(target, n_token, dtype=logits.dtype)
            loss = -tf.reduce_sum(
                tf.nn.log_softmax(logits) * one_hot_target, -1)
        else:
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=target, logits=logits)

        return loss, preds
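# Minimal usage sketch (illustrative, not part of the original module): wiring
# `lm_loss` into a graph to confirm output shapes. Assumes the module-level
# `tf` is a TF1-style API (e.g. `tensorflow.compat.v1`); the helper name and
# dimensions below are made up for the example.
def _example_lm_loss():
    seq_len, batch_size, d_model, n_token = 8, 4, 16, 100
    hidden = tf.random_normal([seq_len, batch_size, d_model])   # 'ibd' layout
    target = tf.zeros([seq_len, batch_size], dtype=tf.int32)    # token ids
    loss, preds = lm_loss(
        hidden, target, n_token, d_model,
        initializer=tf.truncated_normal_initializer(stddev=0.02),
        use_tpu=False)
    # loss and preds both have shape [seq_len, batch_size]
    return loss, preds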
def __init__(self,
             is_training,
             input_tensor,
             input_mask,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/sequence',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    batch_size = tf.shape(input_tensor)[0]
    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)

        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_token_losses = -tf.reduce_mean(
            one_hot_labels * log_probs, axis=-1)

        input_mask = tf.concat([
            tf.zeros((batch_size, 1), dtype=tf.float32),
            tf.cast(input_mask[:, 2:], dtype=tf.float32),
            tf.zeros((batch_size, 1), dtype=tf.float32)
        ], axis=-1)
        per_token_losses *= input_mask
        per_example_loss = tf.reduce_mean(per_token_losses, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def embedding_postprocessor(self,
                            input_tensor,
                            position_ids,
                            batch_size,
                            max_seq_length,
                            hidden_size,
                            use_token_type=False,
                            segment_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name='token_type_embeddings',
                            use_position_embeddings=True,
                            position_embedding_name='position_embeddings',
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            dtype=tf.float32,
                            trainable=True):
    output = input_tensor

    if use_token_type:
        if segment_ids is None:
            raise ValueError(
                'segment_ids must be specified if use_token_type is True.')
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)

        # This vocab will be small so we always do one-hot here,
        # since it is always faster for a small vocabulary.
        flat_segment_ids = tf.reshape(segment_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_segment_ids, depth=token_type_vocab_size, dtype=dtype)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(
            token_type_embeddings, [batch_size, max_seq_length, hidden_size])
        output += token_type_embeddings

    if use_position_embeddings:
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, hidden_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)
        output += tf.gather(full_position_embeddings, position_ids)

    output = util.layer_norm_and_dropout(
        output, dropout_prob, trainable=trainable)
    return output
def __init__(self,
             is_training,
             input_tensor,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(
                sample_weight, dtype=tf.float32) * per_example_loss

        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(thresh, float), (
                '`tsa_thresh` must be a float between 0 and 1.')
            uncertainty = tf.reduce_sum(
                self.probs['probs'] * tf.log(self.probs['probs']), axis=-1)
            uncertainty /= tf.log(1 / label_size)
            per_example_loss = tf.cast(
                tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable(
            'w', [1, nx, nf],
            initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
            start + [nf])
        return c
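# Minimal usage sketch (illustrative, not from the original file): `conv1d`
# is a position-wise linear layer that maps [..., nx] -> [..., nf] with a
# shared weight, relying on the module-level `shape_list` helper. The scope
# name and shapes below are arbitrary.
def _example_conv1d():
    x = tf.random_normal([2, 10, 64])       # [batch, seq, nx]
    h = conv1d(x, scope='c_fc', nf=256)     # -> [batch, seq, 256]
    return h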
def __init__(self,
             is_training,
             input_tensor,
             input_mask,
             label_ids,
             label_size=5,
             sample_weight=None,
             scope='cls/sequence',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)

        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        with tf.variable_scope('crf'):
            input_length = tf.reduce_sum(input_mask, axis=-1)
            per_example_loss, transition_matrix = \
                contrib.crf.crf_log_likelihood(
                    inputs=logits,
                    tag_indices=label_ids,
                    sequence_lengths=input_length)
            per_example_loss = -per_example_loss
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses[name] = per_example_loss
            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs['logits'] = logits
            self.probs['transition_matrix'] = transition_matrix
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    '''Normalize to mean = 0, std = 1, then do a diagonal affine transform.'''
    with tf.variable_scope(scope):
        n_state = x.shape[-1].value
        g = tf.get_variable(
            'g', [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable(
            'b', [n_state], initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)
        s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
        x = (x - u) * tf.rsqrt(s + epsilon)
        x = x * g + b
        return x
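# Minimal usage sketch (illustrative): `norm` above is GPT-2-style layer
# normalization over the last axis with a learned gain `g` and bias `b`.
# The scope name and tensor shape are placeholders.
def _example_norm():
    h = tf.random_normal([2, 10, 768])
    return norm(h, scope='ln_example')   # same shape, normalized per position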
def __init__(self,
             is_training,
             input_tensor,
             label_ids,
             label_size=2,
             sample_weight=None,
             label_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probs = tf.nn.sigmoid(logits, name='probs')

        self.probs['probs'] = probs
        self.preds['preds'] = tf.greater(probs, 0.5)

        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=tf.cast(label_ids, dtype=tf.float32))
        if label_weight is not None:
            label_weight = tf.constant(label_weight, dtype=tf.float32)
            label_weight = tf.reshape(label_weight, [1, label_size])
            per_example_loss *= label_weight
        per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   use_einsum,
                   name=None,
                   trainable=True):
    """A dense layer with 3D kernel.

    Args:
      input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
      num_attention_heads: Number of attention heads.
      head_size: The size per attention head.
      initializer: Kernel initializer.
      activation: Activation function.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      name: The name scope of this layer.

    Returns:
      float logits Tensor.
    """
    input_shape = util.get_shape_list(input_tensor)
    hidden_size = input_shape[2]

    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[hidden_size, num_attention_heads * head_size],
            initializer=initializer,
            trainable=trainable)
        w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
        b = tf.get_variable(
            name="bias",
            shape=[num_attention_heads * head_size],
            initializer=tf.zeros_initializer,
            trainable=trainable)
        b = tf.reshape(b, [num_attention_heads, head_size])
        if use_einsum:
            ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
        else:
            ret = einsum_via_matmul(input_tensor, w, 1)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
def crf_log_likelihood(input_tensor,
                       tag_indices,
                       sequence_lengths,
                       transition_params=None):
    '''Computes the log-likelihood of tag sequences in a CRF.

    Args:
      input_tensor: A [batch_size, max_seq_len, num_tags] tensor of unary
        potentials to use as input to the CRF layer.
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices for
        which we compute the log-likelihood.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] transition matrix,
        if available.

    Returns:
      log_likelihood: A [batch_size] `Tensor` containing the log-likelihood
        of each example, given the sequence of tag indices.
      transition_params: A [num_tags, num_tags] transition matrix. This is
        either provided by the caller or created in this function.
    '''
    # Get shape information.
    num_tags = util.get_shape_list(input_tensor)[2]

    # Get the transition matrix if not provided.
    if transition_params is None:
        transition_params = tf.get_variable(
            'transitions', [num_tags, num_tags])

    sequence_scores = crf_sequence_score(
        input_tensor, tag_indices, sequence_lengths, transition_params)
    log_norm = crf_log_norm(input_tensor, sequence_lengths, transition_params)

    # Normalize the scores to get the log-likelihood per example.
    log_likelihood = sequence_scores - log_norm
    return log_likelihood, transition_params
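# Minimal usage sketch (illustrative): turning the CRF log-likelihood into a
# per-example training loss, mirroring how the CRF sequence-labeling head
# above consumes it. Argument names are placeholders.
def _example_crf_loss(logits, label_ids, input_mask):
    # logits: [batch, max_seq_len, num_tags], label_ids: [batch, max_seq_len]
    sequence_lengths = tf.reduce_sum(input_mask, axis=-1)
    log_likelihood, transition_params = crf_log_likelihood(
        input_tensor=logits,
        tag_indices=label_ids,
        sequence_lengths=sequence_lengths)
    per_example_loss = -log_likelihood          # [batch]
    return tf.reduce_mean(per_example_loss), transition_params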
def embedding_lookup(x, n_token, d_embed, initializer, use_tpu=True,
                     scope='embedding', tilda_embeddings=None,
                     reuse=None, dtype=tf.float32):
    '''TPU and GPU embedding_lookup function.'''
    if tilda_embeddings is not None:
        lookup_table = tilda_embeddings
    else:
        with tf.variable_scope(scope, reuse=reuse):
            lookup_table = tf.get_variable(
                'lookup_table', [n_token, d_embed],
                dtype=dtype, initializer=initializer)

    if use_tpu:
        one_hot_idx = tf.one_hot(x, n_token, dtype=dtype)
        if one_hot_idx.shape.ndims == 2:
            return (tf.einsum('in,nd->id', one_hot_idx, lookup_table),
                    lookup_table)
        else:
            return (tf.einsum('ibn,nd->ibd', one_hot_idx, lookup_table),
                    lookup_table)
    else:
        return tf.nn.embedding_lookup(lookup_table, x), lookup_table
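# Minimal usage sketch (illustrative) for the embedding_lookup above: the same
# call works on TPU (one-hot matmul) and GPU (`tf.nn.embedding_lookup`); only
# `use_tpu` changes. Sizes below are arbitrary.
def _example_xlnet_embedding_lookup():
    x = tf.zeros([16, 4], dtype=tf.int32)       # [seq_len, batch] token ids
    emb, table = embedding_lookup(
        x, n_token=32000, d_embed=128,
        initializer=tf.truncated_normal_initializer(stddev=0.02),
        use_tpu=False)
    # emb: [16, 4, 128], table: [32000, 128]
    return emb, table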
def embedding_lookup(self,
                     input_ids,
                     vocab_size,
                     batch_size,
                     max_seq_length,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     dtype=tf.float32,
                     trainable=True,
                     tilda_embeddings=None):
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    if tilda_embeddings is not None:
        embedding_table = tilda_embeddings
    else:
        embedding_table = tf.get_variable(
            name=word_embedding_name,
            shape=[vocab_size, embedding_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)

    flat_input_ids = tf.reshape(input_ids, [-1])
    output = tf.gather(
        embedding_table, flat_input_ids, name='embedding_look_up')
    output = tf.reshape(
        output, [batch_size, max_seq_length, embedding_size])

    return (output, embedding_table)
def embedding_preprocessor(self,
                           input_values,
                           batch_size=None,
                           embedding_size=128,
                           initializer_range=0.02,
                           name='cls_embedding',
                           dtype=tf.float32,
                           trainable=True):
    with tf.variable_scope(name):
        input_values = util.layer_norm(input_values, trainable=trainable)
        linear_output = tf.layers.dense(
            input_values,
            embedding_size,
            activation=None,
            name='dense',
            kernel_initializer=util.create_initializer(initializer_range),
            trainable=trainable)

        cls_embedding = tf.get_variable(
            name='cls',
            shape=[1, 1, embedding_size],
            initializer=util.create_initializer(initializer_range),
            dtype=dtype,
            trainable=trainable)
        cls_output = tf.tile(cls_embedding, [batch_size, 1, 1])

        output = tf.concat([cls_output, linear_output], axis=1)
        return output
def __init__(self,
             is_training,
             input_tensor,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(
                sample_weight, dtype=tf.float32) * per_example_loss

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def head_projection(h, d_model, n_head, d_head, kernel_initializer, name):
    '''Project hidden states to a specific head with a 4D-shape.'''
    proj_weight = tf.get_variable(
        '{}/kernel'.format(name), [d_model, n_head, d_head],
        dtype=h.dtype, initializer=kernel_initializer)
    head = tf.einsum('ibh,hnd->ibnd', h, proj_weight)
    return head
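# Minimal usage sketch (illustrative): projecting 'ibh' hidden states into a
# per-head tensor of shape [seq_len, batch, n_head, d_head]. The shapes and
# the 'q' name below are placeholders; `d_model` must match the last
# dimension of `h`.
def _example_head_projection():
    h = tf.random_normal([10, 2, 512])          # [seq_len, batch, d_model]
    initializer = tf.random_normal_initializer(stddev=0.02)
    q = head_projection(h, d_model=512, n_head=8, d_head=64,
                        kernel_initializer=initializer, name='q')
    return q                                    # [10, 2, 8, 64]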
def _get_logits(pooled_output, hidden_size, scope, trainable):
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(
                bert_config.initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        logits = tf.matmul(pooled_output, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        return logits
def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   use_einsum,
                   num_attention_heads=1,
                   name=None,
                   trainable=True):
    """A dense layer with 2D kernel.

    Args:
      input_tensor: Float tensor with rank 3.
      output_size: The size of output dimension.
      initializer: Kernel initializer.
      activation: Activation function.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      num_attention_heads: number of attention head in attention layer.
      name: The name scope of this layer.

    Returns:
      float logits Tensor.
    """
    del num_attention_heads  # unused
    input_shape = util.get_shape_list(input_tensor)
    hidden_size = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[hidden_size, output_size],
            initializer=initializer,
            trainable=trainable)
        b = tf.get_variable(
            name="bias",
            shape=[output_size],
            initializer=tf.zeros_initializer,
            trainable=trainable)
        if use_einsum:
            ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
        else:
            ret = tf.matmul(input_tensor, w)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
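# Minimal usage sketch (illustrative): applying dense_layer_2d as a
# position-wise feed-forward projection. Both the einsum and matmul code
# paths produce a [batch, seq, output_size] tensor; the scope name and
# sizes below are placeholders.
def _example_dense_layer_2d():
    x = tf.random_normal([2, 10, 768])
    y = dense_layer_2d(
        x, output_size=3072,
        initializer=tf.truncated_normal_initializer(stddev=0.02),
        activation=tf.nn.relu, use_einsum=True, name='ffn_example')
    return y   # [2, 10, 3072]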
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    '''Looks up word embeddings for an id tensor.

    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing
        word ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use one-hot method for word
        embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is
        better for TPUs.

    Returns:
      float Tensor of shape [batch_size, seq_length, embedding_size].
    '''
    # This function assumes that the input is of shape [batch_size,
    # seq_length, num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    original_dims = input_ids.shape.ndims
    if original_dims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=util.create_initializer(initializer_range))

    if original_dims == 3:
        input_shape = util.get_shape_list(input_ids)
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        output = tf.matmul(input_ids, embedding_table)
        output = tf.reshape(
            output, [input_shape[0], input_shape[1], embedding_size])
    else:
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output = tf.nn.embedding_lookup(embedding_table, input_ids)

        input_shape = util.get_shape_list(input_ids)
        output = tf.reshape(
            output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return output, embedding_table
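# Minimal usage sketch (illustrative) for the embedding_lookup above: the
# common 2D-id path, which returns a [batch_size, seq_length, embedding_size]
# tensor plus the embedding table. Vocabulary and sequence sizes are
# placeholders.
def _example_word_embeddings():
    input_ids = tf.zeros([8, 128], dtype=tf.int32)
    output, table = embedding_lookup(
        input_ids, vocab_size=30522, embedding_size=768,
        use_one_hot_embeddings=False)
    # output: [8, 128, 768], table: [30522, 768]
    return output, table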
def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        head_size,
                        initializer,
                        activation,
                        use_einsum,
                        name=None):
    """A dense layer with 3D kernel for projection.

    Args:
      input_tensor: float Tensor of shape [batch, from_seq_length,
        num_attention_heads, size_per_head].
      hidden_size: The size of hidden layer.
      head_size: The size of head.
      initializer: Kernel initializer.
      activation: Activation function.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      name: The name scope of this layer.

    Returns:
      float logits Tensor.
    """
    input_shape = util.get_shape_list(input_tensor)
    num_attention_heads = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[num_attention_heads * head_size, hidden_size],
            initializer=initializer)
        w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
        b = tf.get_variable(
            name="bias",
            shape=[hidden_size],
            initializer=tf.zeros_initializer)
        if use_einsum:
            ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
        else:
            ret = einsum_via_matmul(input_tensor, w, 2)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
def ln(inputs, epsilon=1e-8, scope='ln'):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.

    inputs: A tensor with 2 or more dimensions, where the first dimension
      has `batch_size`.
    epsilon: A floating number. A very small number for preventing
      division-by-zero errors.
    scope: Optional scope for `variable_scope`.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable(
            'beta', params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable(
            'gamma', params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** 0.5)
        outputs = gamma * normalized + beta

    return outputs
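# Minimal usage sketch (illustrative): `ln` normalizes over the last axis and,
# thanks to `tf.AUTO_REUSE`, can be called repeatedly under the same scope
# without raising a reuse error. The scope name and shape are placeholders.
def _example_ln():
    x = tf.random_normal([2, 10, 512])
    y = ln(x, scope='ln_example')       # same shape as `x`
    return y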
def _forward(target_ids, target_mask, target_max_seq_length):

    with tf.variable_scope('decoder'):

        # shared embedding
        dec = tf.nn.embedding_lookup(embedding_table, target_ids)
        dec *= hidden_size ** 0.5  # scale
        dec += positional_encoding(dec, target_max_seq_length)
        dec = util.dropout(dec, dropout_rate)

        # blocks
        for i in range(num_blocks):
            with tf.variable_scope('block_%s' % i):

                # masked self-attention
                dec = multihead_attention(
                    queries=dec,
                    keys=dec,
                    values=dec,
                    key_masks=target_mask,
                    num_heads=num_attention_heads,
                    dropout_rate=dropout_rate,
                    training=is_training,
                    causality=True,
                    scope='masked_self_attention')

                # vanilla attention
                dec = multihead_attention(
                    queries=dec,
                    keys=memory,
                    values=memory,
                    key_masks=source_mask,
                    num_heads=num_attention_heads,
                    dropout_rate=dropout_rate,
                    training=is_training,
                    causality=False,
                    scope='vanilla_attention')

                # feed forward
                dec = ff(dec, num_units=[4 * hidden_size, hidden_size])

    # final linear projection (embedding weights are shared)
    with tf.variable_scope('cls'):
        output_bias = tf.get_variable(
            'output_bias',
            shape=[vocab_size],
            initializer=tf.zeros_initializer())
        dec = tf.reshape(dec, [-1, hidden_size])
        logits = tf.matmul(dec, embedding_table, transpose_b=True)
        logits = tf.reshape(logits, [-1, target_max_seq_length, vocab_size])
        logits = tf.nn.bias_add(logits, output_bias)

    return logits
def summarize_sequence(summary_type, hidden, d_model, n_head, d_head,
                       dropout, dropatt, input_mask, is_training,
                       initializer, scope=None, reuse=None, use_proj=True):
    '''Different classification tasks may or may not share the same
    parameters to summarize the sequence features.

    If shared, one can keep the `scope` to the default value `None`.
    Otherwise, one should specify a different `scope` for each task.
    '''
    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable(
                'summary_bias', [d_model],
                dtype=hidden.dtype, initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(
                summary_bias, hidden, hidden, input_mask,
                d_model, n_head, d_head, dropout, dropatt,
                is_training, initializer, residual=False)
            summary = summary[0]
        else:
            raise ValueError('Unsupported summary type %s' % summary_type)

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(
                summary, d_model,
                activation=tf.tanh,
                kernel_initializer=initializer,
                name='summary')

        # dropout
        summary = tf.layers.dropout(
            summary, dropout, training=is_training, name='dropout')

    return summary
def _cls_fcn(self,
             prev_output,
             label_size,
             hidden_size=768,
             initializer_range=0.02,
             dtype=tf.float32,
             trainable=True):
    with tf.variable_scope('output'):
        cls_output_weights = tf.get_variable(
            'output_weights', [hidden_size, label_size],
            initializer=tf.truncated_normal_initializer(
                stddev=initializer_range),
            dtype=dtype,
            trainable=trainable)
        cls_output_bias = tf.get_variable(
            'output_bias', [label_size],
            initializer=tf.zeros_initializer(),
            dtype=dtype,
            trainable=trainable)

        cls_logits = tf.matmul(prev_output[:, 0, :], cls_output_weights)
        cls_output = tf.nn.bias_add(cls_logits, cls_output_bias)

    return cls_output
def _forward(input_ids, past=None):
    batch, sequence = shape_list(input_ids)

    if tilda_embeddings is None:
        wte = tf.get_variable(
            'word_embeddings', [hparams.n_vocab, hparams.n_embed],
            initializer=tf.random_normal_initializer(stddev=0.02))
    else:
        wte = tilda_embeddings
    wpe = tf.get_variable(
        'wpe', [hparams.n_ctx, hparams.n_embed],
        initializer=tf.random_normal_initializer(stddev=0.01))
    past_length = 0 if past is None else tf.shape(past)[-2]
    h = (tf.gather(wte, input_ids) +
         tf.gather(wpe, positions_for(input_ids, past_length)))

    # stacked transformer layers
    presents = []
    pasts = tf.unstack(past, axis=1) if past is not None else \
        [None] * hparams.n_layer
    assert len(pasts) == hparams.n_layer
    for layer, past in enumerate(pasts):
        h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
        presents.append(present)
    present = tf.stack(presents, axis=1)
    h = norm(h, 'ln_f')

    # Language model loss. Do tokens <n predict token n?
    h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed])
    logits = tf.matmul(h_flat, wte, transpose_b=True)
    logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
    return logits, present
def post_attention(h, attn_vec, d_model, n_head, d_head, dropout,
                   is_training, kernel_initializer, residual=True):
    '''Post-attention processing.'''
    # post-attention projection (back to `d_model`)
    proj_o = tf.get_variable(
        'o/kernel', [d_model, n_head, d_head],
        dtype=h.dtype, initializer=kernel_initializer)
    attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)

    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
    if residual:
        output = util.layer_norm(attn_out + h, name='LayerNorm')
    else:
        output = util.layer_norm(attn_out, name='LayerNorm')

    return output
def get_token_embeddings(vocab_size, num_units, zero_pad=True):
    '''Constructs a token embedding matrix.

    Note that the row of index 0 is set to zeros.

    vocab_size: scalar. V.
    num_units: embedding dimensionality. E.
    zero_pad: Boolean. If True, all the values of the first row (id = 0)
      are set to a constant zero. To apply query/key masks easily, zero
      padding is turned on.

    Returns:
      weight variable: (V, E)
    '''
    with tf.variable_scope('shared_weight_matrix'):
        embeddings = tf.get_variable(
            'weight_mat',
            dtype=tf.float32,
            shape=(vocab_size, num_units),
            initializer=xavier_initializer())
        if zero_pad:
            embeddings = tf.concat(
                (tf.zeros(shape=[1, num_units]), embeddings[1:, :]), 0)
    return embeddings
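# Minimal usage sketch (illustrative): build the shared (V, E) matrix once,
# then index it; with `zero_pad=True`, row 0 stays all-zero so padding ids
# embed to zeros. Sizes and ids below are placeholders.
def _example_token_embeddings():
    embeddings = get_token_embeddings(vocab_size=32000, num_units=512,
                                      zero_pad=True)
    token_ids = tf.constant([[5, 7, 0, 0]], dtype=tf.int32)   # 0 = padding
    return tf.nn.embedding_lookup(embeddings, token_ids)      # [1, 4, 512]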
def __init__(self,
             hparams,
             is_training,
             input_ids,
             sample_weight=None,
             scope='model',
             given=1,
             use_tilda_embedding=False,
             **kwargs):
    super().__init__()

    batch_size = util.get_shape_list(input_ids, expected_rank=2)[0]
    max_seq_length = hparams.n_predict

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):

        def _forward(input_ids, past=None):
            batch, sequence = shape_list(input_ids)

            if tilda_embeddings is None:
                wte = tf.get_variable(
                    'word_embeddings', [hparams.n_vocab, hparams.n_embed],
                    initializer=tf.random_normal_initializer(stddev=0.02))
            else:
                wte = tilda_embeddings
            wpe = tf.get_variable(
                'wpe', [hparams.n_ctx, hparams.n_embed],
                initializer=tf.random_normal_initializer(stddev=0.01))
            past_length = 0 if past is None else tf.shape(past)[-2]
            h = (tf.gather(wte, input_ids) +
                 tf.gather(wpe, positions_for(input_ids, past_length)))

            # stacked transformer layers
            presents = []
            pasts = tf.unstack(past, axis=1) if past is not None else \
                [None] * hparams.n_layer
            assert len(pasts) == hparams.n_layer
            for layer, past in enumerate(pasts):
                h, present = block(
                    h, 'h%d' % layer, past=past, hparams=hparams)
                presents.append(present)
            present = tf.stack(presents, axis=1)
            h = norm(h, 'ln_f')

            # Language model loss. Do tokens <n predict token n?
            h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed])
            logits = tf.matmul(h_flat, wte, transpose_b=True)
            logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
            return logits, present

        # convert to labels
        label_ids = tf.concat(
            [input_ids[:, 1:], tf.zeros([batch_size, 1], dtype=tf.int32)],
            axis=-1)

        # forward once
        if is_training:
            (logits, _) = _forward(input_ids)
            self.preds['LM'] = tf.argmax(logits, axis=-1)

        # forward loop
        else:
            input_ids = input_ids[:, 0:given]

            for cur_length in range(given, max_seq_length + 1):
                (logits, _) = _forward(input_ids)

                pred_ids = tf.argmax(
                    logits[:, cur_length - 1:cur_length, :], axis=-1)
                pred_ids = tf.cast(pred_ids, tf.int32)
                input_ids = tf.concat([input_ids, pred_ids], axis=-1)

            self.preds['LM'] = input_ids

        # loss
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=hparams.n_vocab)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        label_mask = tf.cast(tf.not_equal(label_ids, 0), tf.float32)
        per_example_loss = \
            tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \
            tf.reduce_sum(label_mask, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses['LM'] = per_example_loss
def __init__(self,
             vocab_size,
             is_training,
             input_ids,
             input_mask,
             segment_ids,
             sample_weight=None,
             reduced_size=64,
             topic_size=1024,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             bias=0,
             scope='vae',
             trainable=True,
             **kwargs):
    super().__init__()

    # freeze parameters
    config = Config(
        vocab_size,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):

            (self.embedding_output, self.embedding_table) = \
                self.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    tilda_embeddings=tilda_embeddings,
                    trainable=trainable)

            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                hidden_size=config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope('encoder'):

            # stacked transformer
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, seq_length)
            self.all_encoder_layers = self.transformer_model(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=util.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=\
                    config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                trainable=trainable)

            # projection
            with tf.variable_scope('projection'):
                transformer_output = tf.layers.dense(
                    self.all_encoder_layers[-1],
                    reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                transformer_output = tf.reshape(
                    transformer_output, [batch_size, -1])

                input_length = tf.reduce_sum(input_mask, axis=-1)
                input_length = tf.cast(input_length, tf.float32)
                input_length_1d = tf.reshape(input_length, [batch_size])
                input_length_2d = tf.reshape(input_length, [batch_size, 1])

                broadcast_mask = tf.sequence_mask(
                    tf.multiply(input_length_1d, reduced_size),
                    seq_length * reduced_size,
                    dtype=tf.float32)
                broadcast_mask = tf.multiply(
                    broadcast_mask, seq_length / input_length_2d)
                transformer_output *= broadcast_mask

                # latent space
                miu = tf.layers.dense(
                    transformer_output,
                    topic_size,
                    activation='tanh',
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    name='miu',
                    trainable=trainable)
                sigma = tf.layers.dense(
                    transformer_output,
                    topic_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    name='sigma',
                    trainable=trainable)

                self.probs['miu'] = miu
                self.probs['sigma'] = sigma

        with tf.variable_scope('decoder'):
            with tf.variable_scope('projection'):

                # reparametrization
                if is_training:
                    noise = tf.random_normal([batch_size, topic_size])
                else:
                    noise = tf.random_uniform(
                        [batch_size, topic_size],
                        minval=-bias, maxval=bias)
                decoder_input = miu + tf.exp(sigma) * noise

                # projection
                decoder_input = tf.layers.dense(
                    decoder_input,
                    seq_length * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                intermediate_input = tf.reshape(
                    decoder_input, [-1, seq_length, reduced_size])
                intermediate_input = util.layer_norm(
                    intermediate_input, trainable=trainable)
                intermediate_input = util.dropout(
                    intermediate_input, config.hidden_dropout_prob)

            # MLP
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    intermediate_input,
                    4 * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
            with tf.variable_scope('output'):
                decoder_output = tf.layers.dense(
                    intermediate_output,
                    config.hidden_size,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                decoder_output = util.layer_norm(
                    decoder_output, trainable=trainable)
                decoder_output = util.dropout(
                    decoder_output, config.hidden_dropout_prob)
            self.all_decoder_layers = [intermediate_output, decoder_output]
            self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(
                    input_tensor, trainable=trainable)

            output_weights = self.embedding_table
            output_bias = tf.get_variable(
                'output_bias',
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)
            flatten_input_tensor = tf.reshape(
                input_tensor, [-1, config.hidden_size])

            logits = tf.matmul(
                flatten_input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(
                logits, [batch_size, seq_length, config.vocab_size])

            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)
            self.preds['preds'] = tf.argmax(probs, axis=-1)

            one_hot_labels = tf.one_hot(
                input_ids, depth=config.vocab_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                lm_log_probs * one_hot_labels, axis=[-1])
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = (
                tf.reduce_mean(per_example_loss) +
                tf.reduce_mean(tf.square(miu)) +
                tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss