def _get_discriminator_output(self, inputs, sample_weight, discriminator,
                              labels):
    '''Discriminator binary classifier.'''
    with tf.variable_scope('discriminator_predictions'):
        hidden = tf.layers.dense(
            discriminator.get_sequence_output(),
            units=self.bert_config.hidden_size,
            activation=util.get_activation(self.bert_config.hidden_act),
            kernel_initializer=util.create_initializer(
                self.bert_config.initializer_range))
        logits = tf.squeeze(tf.layers.dense(hidden, units=1), -1)

        weights = tf.cast(inputs.input_mask, tf.float32)
        labelsf = tf.cast(labels, tf.float32)
        losses = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labelsf) * weights
        per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                            (1e-6 + tf.reduce_sum(weights, axis=-1)))
        if sample_weight is not None:
            sample_weight = tf.cast(sample_weight, dtype=tf.float32)
            per_example_loss *= sample_weight
        loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights))

        probs = tf.nn.sigmoid(logits)
        preds = tf.cast(tf.greater(probs, 0.5), tf.int32)

        DiscOutput = collections.namedtuple(
            'DiscOutput',
            ['loss', 'per_example_loss', 'probs', 'preds', 'labels'])
        return DiscOutput(
            loss=loss, per_example_loss=per_example_loss, probs=probs,
            preds=preds, labels=labels)
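
# A minimal, self-contained sketch (not used by the model) of the masked
# token-level sigmoid loss computed in `_get_discriminator_output`, with toy
# tensors standing in for the discriminator logits, the replaced-token labels
# and the input mask.
def _example_masked_sigmoid_loss():
    logits = tf.constant([[2.0, -1.0, 0.5, 0.0]])   # [batch=1, seq_len=4]
    labels = tf.constant([[1.0, 0.0, 1.0, 0.0]])    # 1 = token was replaced
    weights = tf.constant([[1.0, 1.0, 1.0, 0.0]])   # 0 = padding position
    losses = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=logits, labels=labels) * weights
    # Average over real (non-padding) tokens only.
    per_example_loss = (tf.reduce_sum(losses, axis=-1) /
                        (1e-6 + tf.reduce_sum(weights, axis=-1)))
    return per_example_loss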
def scatter_update(sequence, updates, positions):
    '''Scatter-update a sequence.

    Args:
      sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor.
      updates: A [batch_size, n_positions] or [batch_size, n_positions, depth]
        tensor of values to write.
      positions: A [batch_size, n_positions] tensor of indices into `sequence`.

    Returns:
      A tuple of two tensors. First is a [batch_size, seq_len] or
      [batch_size, seq_len, depth] tensor of `sequence` with elements at
      `positions` replaced by the values in `updates`. Updates to index 0 are
      ignored. If there are duplicated positions the update is only applied
      once. Second is a [batch_size, seq_len] mask tensor of which inputs were
      updated.
    '''
    shape = util.get_shape_list(sequence, expected_rank=[2, 3])
    depth_dimension = (len(shape) == 3)
    if depth_dimension:
        B, L, D = shape
    else:
        B, L = shape
        D = 1
        sequence = tf.expand_dims(sequence, -1)
    N = util.get_shape_list(positions)[1]

    shift = tf.expand_dims(L * tf.range(B), -1)
    flat_positions = tf.reshape(positions + shift, [-1, 1])
    flat_updates = tf.reshape(updates, [-1, D])
    updates = tf.scatter_nd(flat_positions, flat_updates, [B * L, D])
    updates = tf.reshape(updates, [B, L, D])

    flat_updates_mask = tf.ones([B * N], tf.int32)
    updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, [B * L])
    updates_mask = tf.reshape(updates_mask, [B, L])
    not_first_token = tf.concat(
        [tf.zeros((B, 1), tf.int32), tf.ones((B, L - 1), tf.int32)], -1)
    updates_mask *= not_first_token
    updates_mask_3d = tf.expand_dims(updates_mask, -1)

    # account for duplicate positions
    if sequence.dtype == tf.float32:
        updates_mask_3d = tf.cast(updates_mask_3d, tf.float32)
        updates /= tf.maximum(1.0, updates_mask_3d)
    else:
        assert sequence.dtype == tf.int32
        updates = tf.divide(updates, tf.maximum(1, updates_mask_3d))
        updates = tf.cast(updates, tf.int32)
    updates_mask = tf.minimum(updates_mask, 1)
    updates_mask_3d = tf.minimum(updates_mask_3d, 1)

    updated_sequence = (((1 - updates_mask_3d) * sequence) +
                        (updates_mask_3d * updates))
    if not depth_dimension:
        updated_sequence = tf.squeeze(updated_sequence, -1)

    return updated_sequence, updates_mask
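
# Illustrative usage of `scatter_update` (a sketch only; token ids and
# positions below are made up). With these inputs the first tensor returned
# is [[11, 99, 13, 98, 15]] and the mask is [[0, 1, 0, 1, 0]].
def _example_scatter_update():
    sequence = tf.constant([[11, 12, 13, 14, 15]])   # [batch=1, seq_len=5]
    updates = tf.constant([[99, 98]])                # values to write
    positions = tf.constant([[1, 3]])                # where to write them
    return scatter_update(sequence, updates, positions)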
def _single_seq_fn():
    # Single-timestep case: gather the unary score of the given tag for each
    # example, zeroing out examples whose sequence length is <= 0.
    batch_size = tf.shape(inputs, out_type=tag_indices.dtype)[0]
    example_inds = tf.reshape(
        tf.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
    sequence_scores = tf.gather_nd(
        tf.squeeze(inputs, [1]),
        tf.concat([example_inds, tag_indices], axis=1))
    sequence_scores = tf.where(tf.less_equal(sequence_lengths, 0),
                               tf.zeros_like(sequence_scores),
                               sequence_scores)
    return sequence_scores
def crf_log_norm(inputs, sequence_lengths, transition_params):
    '''Computes the normalization for a CRF.

    Args:
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
        to use as input to the CRF layer.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] transition matrix.

    Returns:
      log_norm: A [batch_size] vector of normalizers for a CRF.
    '''
    # Split up the first and rest of the inputs in preparation for the forward
    # algorithm.
    first_input = tf.slice(inputs, [0, 0, 0], [-1, 1, -1])
    first_input = tf.squeeze(first_input, [1])

    # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp
    # over the 'initial state' (the unary potentials).
    def _single_seq_fn():
        log_norm = tf.reduce_logsumexp(first_input, [1])
        # Mask `log_norm` of the sequences with length <= zero.
        log_norm = tf.where(tf.less_equal(sequence_lengths, 0),
                            tf.zeros_like(log_norm),
                            log_norm)
        return log_norm

    def _multi_seq_fn():
        '''Forward computation of alpha values.'''
        rest_of_input = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])

        # Compute the alpha values in the forward algorithm in order to get
        # the partition function.
        forward_cell = CrfForwardRnnCell(transition_params)
        # Sequence length is not allowed to be less than zero.
        sequence_lengths_less_one = tf.maximum(
            tf.constant(0, dtype=sequence_lengths.dtype),
            sequence_lengths - 1)
        _, alphas = rnn.dynamic_rnn(
            cell=forward_cell,
            inputs=rest_of_input,
            sequence_length=sequence_lengths_less_one,
            initial_state=first_input,
            dtype=tf.float32)
        log_norm = tf.reduce_logsumexp(alphas, [1])
        # Mask `log_norm` of the sequences with length <= zero.
        log_norm = tf.where(tf.less_equal(sequence_lengths, 0),
                            tf.zeros_like(log_norm),
                            log_norm)
        return log_norm

    return smart.smart_cond(
        pred=tf.equal(
            util.get_shape_list(inputs)[1] or tf.shape(inputs)[1], 1),
        true_fn=_single_seq_fn,
        false_fn=_multi_seq_fn)
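
# A small usage sketch (not part of the library): computes the CRF log
# partition function for a toy batch of unary potentials and a fixed
# transition matrix; all values below are illustrative.
def _example_crf_log_norm():
    inputs = tf.constant(
        [[[1.0, 0.5, -0.3],
          [0.2, 1.1, 0.0],
          [0.7, -0.2, 0.4]]])          # [batch=1, seq_len=3, num_tags=3]
    transition_params = tf.constant(
        [[0.1, 0.2, 0.3],
         [0.0, -0.1, 0.2],
         [0.3, 0.1, 0.0]])             # [num_tags, num_tags]
    sequence_lengths = tf.constant([3], dtype=tf.int32)
    # Returns a [batch_size] vector of log normalizers.
    return crf_log_norm(inputs, sequence_lengths, transition_params)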
def regression_loss(hidden, labels, initializer, scope, reuse=None,
                    return_logits=False):
    '''Per-example squared-error loss from a single-unit projection of
    `hidden`.'''
    with tf.variable_scope(scope, reuse=reuse):
        logits = tf.layers.dense(
            hidden, 1, kernel_initializer=initializer, name='logit')
        logits = tf.squeeze(logits, axis=-1)
        loss = tf.square(logits - labels)
        if return_logits:
            return loss, logits
        return loss
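
# Illustrative call (a sketch; in practice `hidden` would come from the
# encoder's pooled output and `initializer` from util.create_initializer).
def _example_regression_loss():
    hidden = tf.zeros([2, 8])                 # [batch_size, hidden_size]
    labels = tf.constant([0.5, -1.2])         # float regression targets
    initializer = tf.truncated_normal_initializer(stddev=0.02)
    loss, logits = regression_loss(
        hidden, labels, initializer, scope='regression_example',
        return_logits=True)
    return loss, logits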
def __init__(self,
             albert_config,
             is_training,
             input_ids,
             input_mask=None,
             segment_ids=None,
             scope='bert',
             drop_pooler=False,
             trainable=True,
             **kwargs):
    """Constructor for AlbertModel.

    Args:
      albert_config: `AlbertConfig` instance.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      segment_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      scope: (optional) variable scope. Defaults to "bert".
      drop_pooler: (optional) bool. If True, skip the pooler dense layer and
        use the first token's hidden state as the pooled output.
      trainable: (optional) bool. Whether the model variables are trainable.

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    albert_config = copy.deepcopy(albert_config)
    if not is_training:
        albert_config.hidden_dropout_prob = 0.0
        albert_config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if segment_ids is None:
        segment_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.word_embedding_output,
             self.output_embedding_table) = embedding_lookup(
                 input_ids=input_ids,
                 vocab_size=albert_config.vocab_size,
                 embedding_size=albert_config.embedding_size,
                 initializer_range=albert_config.initializer_range,
                 word_embedding_name="word_embeddings",
                 tilda_embeddings=tilda_embeddings,
                 trainable=trainable)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.word_embedding_output,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=albert_config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=albert_config.initializer_range,
                max_position_embeddings=albert_config.max_position_embeddings,
                dropout_prob=albert_config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope("encoder"):
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=input_mask,
                hidden_size=albert_config.hidden_size,
                num_hidden_layers=albert_config.num_hidden_layers,
                num_hidden_groups=albert_config.num_hidden_groups,
                num_attention_heads=albert_config.num_attention_heads,
                intermediate_size=albert_config.intermediate_size,
                inner_group_num=albert_config.inner_group_num,
                intermediate_act_fn=util.get_activation(
                    albert_config.hidden_act),
                hidden_dropout_prob=albert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    albert_config.attention_probs_dropout_prob),
                initializer_range=albert_config.initializer_range,
                do_return_all_layers=True,
                use_einsum=False,
                trainable=trainable)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state
            # corresponding to the first token. We assume that this has been
            # pre-trained.
            first_token_tensor = tf.squeeze(
                self.sequence_output[:, 0:1, :], axis=1)
            # trick: ignore the fully connected layer
            if drop_pooler:
                self.pooled_output = first_token_tensor
            else:
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    albert_config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=util.create_initializer(
                        albert_config.initializer_range),
                    trainable=trainable)
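
# Minimal construction sketch for the model defined above (assumes an
# `AlbertConfig` instance and int32 id tensors built elsewhere; the names used
# here are illustrative only).
def _example_build_albert(albert_config, input_ids, input_mask, segment_ids):
    model = AlbertModel(
        albert_config=albert_config,
        is_training=False,            # disables dropout
        input_ids=input_ids,          # int32 [batch_size, seq_length]
        input_mask=input_mask,        # int32 [batch_size, seq_length]
        segment_ids=segment_ids,      # int32 [batch_size, seq_length]
        drop_pooler=False)
    sequence_output = model.sequence_output   # [batch_size, seq_length, hidden_size]
    pooled_output = model.pooled_output       # [batch_size, hidden_size]
    return sequence_output, pooled_output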