Ejemplo n.º 1
0
 def _get_discriminator_output(self, inputs, sample_weight, discriminator,
                               labels):
     '''Build the discriminator's per-token binary classification head.

     Args:
       inputs: Object exposing `input_mask`; the mask is cast to float32 and
         used as the per-token loss weights.
       sample_weight: Optional per-example weights. When given, they are cast
         to float32 and multiplied into `per_example_loss` only; the scalar
         `loss` is not affected by them.
       discriminator: Model exposing `get_sequence_output()`, whose result
         feeds the classification head.
       labels: Per-token binary targets; cast to float32 for the loss and
         returned unchanged in the output.

     Returns:
       A `DiscOutput` namedtuple with fields `loss` (masked loss summed over
       everything and divided by the total mask weight), `per_example_loss`
       (the same ratio taken along the last axis, scaled by `sample_weight`
       when provided), `probs` (sigmoid of the logits), `preds` (int32, 1
       where `probs > 0.5`) and `labels`.
     '''
     config = self.bert_config
     with tf.variable_scope('discriminator_predictions'):
         # Hidden projection followed by a single-unit layer; squeezing the
         # trailing axis leaves one logit per position.
         projected = tf.layers.dense(
             discriminator.get_sequence_output(),
             units=config.hidden_size,
             activation=util.get_activation(config.hidden_act),
             kernel_initializer=util.create_initializer(
                 config.initializer_range))
         token_logits = tf.squeeze(tf.layers.dense(projected, units=1), -1)

         token_weights = tf.cast(inputs.input_mask, tf.float32)
         float_labels = tf.cast(labels, tf.float32)
         cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
             logits=token_logits, labels=float_labels)
         masked_losses = cross_entropy * token_weights

         # The 1e-6 term keeps the division finite when a mask sums to zero.
         example_totals = tf.reduce_sum(masked_losses, axis=-1)
         example_weights = tf.reduce_sum(token_weights, axis=-1)
         per_example_loss = example_totals / (1e-6 + example_weights)
         if sample_weight is not None:
             per_example_loss = per_example_loss * tf.cast(
                 sample_weight, dtype=tf.float32)

         batch_loss = (tf.reduce_sum(masked_losses) /
                       (1e-6 + tf.reduce_sum(token_weights)))
         probs = tf.nn.sigmoid(token_logits)
         preds = tf.cast(tf.greater(probs, 0.5), tf.int32)

         field_names = [
             'loss', 'per_example_loss', 'probs', 'preds', 'labels']
         DiscOutput = collections.namedtuple('DiscOutput', field_names)
         return DiscOutput(batch_loss, per_example_loss, probs, preds, labels)
Ejemplo n.º 2
0
def scatter_update(sequence, updates, positions):
    '''Write `updates` into `sequence` at the given per-row `positions`.

    Args:
      sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth]
        tensor of dtype float32 or int32.
      updates: A tensor holding batch_size*n_positions(*depth) values, i.e.
        one (depth-sized) entry per entry of `positions`.
      positions: A [batch_size, n_positions] tensor of indices along seq_len.

    Returns:
      A tuple `(updated_sequence, updates_mask)`. `updated_sequence` has the
      shape of `sequence`, with the elements at `positions` replaced by the
      matching values of `updates`. Updates aimed at index 0 are ignored.
      When a position is repeated, the scattered values are summed and then
      divided by the repeat count, so the update is applied only once.
      `updates_mask` is a [batch_size, seq_len] int32 tensor that is 1 where
      an element was updated and 0 elsewhere.
    '''
    dims = util.get_shape_list(sequence, expected_rank=[2, 3])
    has_depth = len(dims) == 3
    if has_depth:
        batch_size, seq_len, depth = dims
    else:
        # Work on a rank-3 view; the extra axis is squeezed away at the end.
        batch_size, seq_len = dims
        depth = 1
        sequence = tf.expand_dims(sequence, -1)
    n_positions = util.get_shape_list(positions)[1]

    # Offset each row's positions so they index into the flattened
    # [batch_size * seq_len] layout.
    row_offsets = tf.expand_dims(seq_len * tf.range(batch_size), -1)
    flat_index = tf.reshape(positions + row_offsets, [-1, 1])
    scattered = tf.scatter_nd(flat_index,
                              tf.reshape(updates, [-1, depth]),
                              [batch_size * seq_len, depth])
    scattered = tf.reshape(scattered, [batch_size, seq_len, depth])

    # Count how many updates landed on every slot.
    hit_counts = tf.scatter_nd(flat_index,
                               tf.ones([batch_size * n_positions], tf.int32),
                               [batch_size * seq_len])
    hit_counts = tf.reshape(hit_counts, [batch_size, seq_len])
    # Zero out the first column so that updates to index 0 are dropped.
    skip_first = tf.concat([
        tf.zeros((batch_size, 1), tf.int32),
        tf.ones((batch_size, seq_len - 1), tf.int32),
    ], -1)
    hit_counts = hit_counts * skip_first
    hit_counts_3d = tf.expand_dims(hit_counts, -1)

    # scatter_nd sums duplicated positions; divide by the hit count to
    # undo that.
    if sequence.dtype == tf.float32:
        hit_counts_3d = tf.cast(hit_counts_3d, tf.float32)
        scattered = scattered / tf.maximum(1.0, hit_counts_3d)
    else:
        assert sequence.dtype == tf.int32
        scattered = tf.cast(
            tf.divide(scattered, tf.maximum(1, hit_counts_3d)), tf.int32)

    # Clamp the counts to {0, 1} so they act as a selection mask.
    updates_mask = tf.minimum(hit_counts, 1)
    select = tf.minimum(hit_counts_3d, 1)

    kept = (1 - select) * sequence
    replaced = select * scattered
    updated_sequence = kept + replaced
    if not has_depth:
        updated_sequence = tf.squeeze(updated_sequence, -1)

    return updated_sequence, updates_mask
Ejemplo n.º 3
0
 def _single_seq_fn():
     '''Pick each example's unary score when axis 1 of `inputs` has size one.

     Reads `inputs`, `tag_indices` and `sequence_lengths` from the enclosing
     scope. Axis 1 of `inputs` is squeezed away, then each row is indexed
     with the matching row of `tag_indices`. Entries whose sequence length
     is <= 0 are replaced by zero.
     '''
     index_dtype = tag_indices.dtype
     num_examples = tf.shape(inputs, out_type=index_dtype)[0]
     # Column of row numbers, paired below with the tag index of each row.
     row_ids = tf.reshape(tf.range(num_examples, dtype=index_dtype), [-1, 1])

     squeezed_inputs = tf.squeeze(inputs, [1])
     gather_index = tf.concat([row_ids, tag_indices], axis=1)
     scores = tf.gather_nd(squeezed_inputs, gather_index)

     is_empty = tf.less_equal(sequence_lengths, 0)
     return tf.where(is_empty, tf.zeros_like(scores), scores)
Ejemplo n.º 4
0
def crf_log_norm(inputs, sequence_lengths, transition_params):
    '''Compute the log-normalizer of a CRF.

    Args:
        inputs: A [batch_size, max_seq_len, num_tags] tensor of unary
          potentials fed to the CRF layer.
        sequence_lengths: A [batch_size] vector of true sequence lengths.
        transition_params: A [num_tags, num_tags] transition matrix.

    Returns:
        log_norm: A [batch_size] vector of normalizers for a CRF. Entries
          whose sequence length is <= 0 are set to zero.
    '''
    # The first time step seeds the forward algorithm; the remaining steps
    # (if any) are consumed by the recurrent cell below.
    initial_scores = tf.squeeze(
        tf.slice(inputs, [0, 0, 0], [-1, 1, -1]), [1])

    def _zero_if_empty(values):
        # Mask the result for sequences with length <= zero.
        return tf.where(tf.less_equal(sequence_lengths, 0),
                        tf.zeros_like(values), values)

    def _single_step():
        # With max_seq_len == 1 the forward algorithm is skipped: the
        # normalizer is the logsumexp of the unary potentials.
        return _zero_if_empty(tf.reduce_logsumexp(initial_scores, [1]))

    def _forward_pass():
        '''Run the forward algorithm to obtain the partition function.'''
        remaining_inputs = tf.slice(inputs, [0, 1, 0], [-1, -1, -1])
        cell = CrfForwardRnnCell(transition_params)

        # The first step is already in the initial state, so the RNN runs
        # for length - 1 steps, clamped at zero.
        zero = tf.constant(0, dtype=sequence_lengths.dtype)
        remaining_lengths = tf.maximum(zero, sequence_lengths - 1)

        _, final_alphas = rnn.dynamic_rnn(
            cell=cell,
            inputs=remaining_inputs,
            sequence_length=remaining_lengths,
            initial_state=initial_scores,
            dtype=tf.float32)
        return _zero_if_empty(tf.reduce_logsumexp(final_alphas, [1]))

    # Use the shape entry reported by the helper when it is truthy,
    # otherwise fall back to the dynamic shape.
    max_seq_len = util.get_shape_list(inputs)[1] or tf.shape(inputs)[1]
    return smart.smart_cond(pred=tf.equal(max_seq_len, 1),
                            true_fn=_single_step,
                            false_fn=_forward_pass)
Ejemplo n.º 5
0
def regression_loss(hidden, labels, initializer, scope, reuse=None,
                    return_logits=False):
    '''Squared-error loss on a single-unit dense projection of `hidden`.

    Args:
      hidden: Input tensor fed to the dense layer.
      labels: Regression targets, subtracted from the squeezed logits.
      initializer: Kernel initializer for the dense layer.
      scope: Variable scope under which the `logit` layer is created.
      reuse: Forwarded to `tf.variable_scope`.
      return_logits: When true, the logits are returned alongside the loss.

    Returns:
      The element-wise squared error, or a `(loss, logits)` tuple when
      `return_logits` is true.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        projected = tf.layers.dense(hidden,
                                    1,
                                    kernel_initializer=initializer,
                                    name='logit')
        # Drop the trailing unit axis produced by the single-unit layer.
        logits = tf.squeeze(projected, axis=-1)
        squared_error = tf.square(logits - labels)

    return (squared_error, logits) if return_logits else squared_error
Ejemplo n.º 6
0
    def __init__(self,
                 albert_config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 segment_ids=None,
                 scope='bert',
                 drop_pooler=False,
                 trainable=True,
                 **kwargs):
        """Constructor for AlbertModel.

        Builds the embedding, encoder and pooler sub-graphs and stores their
        outputs on the instance (`word_embedding_output`,
        `output_embedding_table`, `embedding_output`, `all_encoder_layers`,
        `sequence_output`, `pooled_output`).

        Args:
          albert_config: `AlbertConfig` instance. It is deep-copied, so the
            caller's object is not modified.
          is_training: bool. true for training model, false for eval model.
            When false, both dropout probabilities of the copied config are
            set to 0.0.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape
            [batch_size, seq_length]. Defaults to all ones.
          segment_ids: (optional) int32 Tensor of shape
            [batch_size, seq_length]. Defaults to all zeros.
          scope: (optional) variable scope. Defaults to "bert".
          drop_pooler: (optional) bool. When true, `pooled_output` is the
            first token's hidden state with no dense layer applied.
          trainable: (optional) bool. Forwarded to the embedding lookup, the
            embedding post-processor, the transformer and the pooler layer.
          **kwargs: Only `use_tilda_embedding` is read. When truthy, the
            existing `tilda_embeddings` variable is fetched with
            `tf.get_variable` under `tf.variable_scope('', reuse=True)` and
            passed to the word embedding lookup.

        Raises:
          ValueError: presumably raised by the helper functions when the
            config or one of the input tensor shapes is invalid; nothing in
            this constructor raises directly -- TODO confirm.
        """
        # Work on a private copy so disabling dropout does not leak out.
        albert_config = copy.deepcopy(albert_config)
        if not is_training:
            albert_config.hidden_dropout_prob = 0.0
            albert_config.attention_probs_dropout_prob = 0.0

        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if segment_ids is None:
            segment_ids = tf.zeros(shape=[batch_size, seq_length],
                                   dtype=tf.int32)

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        use_tilda_embedding = kwargs.get('use_tilda_embedding')
        if use_tilda_embedding:
            # reuse=True: the variable must already exist; it is looked up,
            # not created.
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.word_embedding_output,
                 self.output_embedding_table) = embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=albert_config.vocab_size,
                     embedding_size=albert_config.embedding_size,
                     initializer_range=albert_config.initializer_range,
                     word_embedding_name="word_embeddings",
                     tilda_embeddings=tilda_embeddings,
                     trainable=trainable)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.word_embedding_output,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=albert_config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=albert_config.initializer_range,
                    max_position_embeddings=albert_config.
                    max_position_embeddings,
                    dropout_prob=albert_config.hidden_dropout_prob,
                    trainable=trainable)

            with tf.variable_scope("encoder"):
                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                # `use_einsum` is fixed to False here; it is not a
                # constructor argument.
                self.all_encoder_layers = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=input_mask,
                    hidden_size=albert_config.hidden_size,
                    num_hidden_layers=albert_config.num_hidden_layers,
                    num_hidden_groups=albert_config.num_hidden_groups,
                    num_attention_heads=albert_config.num_attention_heads,
                    intermediate_size=albert_config.intermediate_size,
                    inner_group_num=albert_config.inner_group_num,
                    intermediate_act_fn=util.get_activation(
                        albert_config.hidden_act),
                    hidden_dropout_prob=albert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=albert_config.
                    attention_probs_dropout_prob,
                    initializer_range=albert_config.initializer_range,
                    do_return_all_layers=True,
                    use_einsum=False,
                    trainable=trainable)

            # The last returned layer is used as the sequence output.
            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)

                # trick: ignore the fully connected layer
                if drop_pooler:
                    self.pooled_output = first_token_tensor
                else:
                    self.pooled_output = tf.layers.dense(
                        first_token_tensor,
                        albert_config.hidden_size,
                        activation=tf.tanh,
                        kernel_initializer=util.create_initializer(
                            albert_config.initializer_range),
                        trainable=trainable)