Example #1
    def build(self, input_shape):
        input_rank = len(input_shape)
        input_shape = tf.TensorShape(input_shape)
        free_input_dims = input_rank - self._num_summed_dimensions
        output_dims = len(self._output_shape)

        self._einsum_string = self._build_einsum_string(
            free_input_dims, self._num_summed_dimensions, output_dims)

        # This is only saved for testing purposes.
        self._kernel_shape = (input_shape[free_input_dims:].concatenate(
            self._output_shape))

        with tf.variable_scope(self._name):
            self._kernel = tf.get_variable(
                'kernel',
                shape=self._kernel_shape,
                initializer=self._kernel_initializer,
                regularizer=self._kernel_regularizer,
                constraint=self._kernel_constraint,
                dtype=self.dtype,
                trainable=True)
            if self._use_bias:
                self._bias = tf.get_variable(
                    'bias',
                    shape=self._output_shape,
                    initializer=self._bias_initializer,
                    regularizer=self._bias_regularizer,
                    constraint=self._bias_constraint,
                    dtype=self.dtype,
                    trainable=True)
            else:
                self._bias = None
        super(DenseEinsum, self).build(input_shape)
Example #2
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 sample_weight=None,
                 scope='mrc',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        seq_length = input_tensor.shape.as_list()[-2]
        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[2],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)

            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, 2])
            logits = tf.transpose(logits, [0, 2, 1])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            self.probs[name] = probs

            start_one_hot_labels = tf.one_hot(label_ids[:, 0],
                                              depth=seq_length,
                                              dtype=tf.float32)
            end_one_hot_labels = tf.one_hot(label_ids[:, 1],
                                            depth=seq_length,
                                            dtype=tf.float32)
            start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
            end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
            per_example_loss = (
                -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs,
                                     axis=-1) - 0.5 *
                tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
            if sample_weight is not None:
                per_example_loss *= sample_weight

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses[name] = per_example_loss

            start_preds = tf.expand_dims(tf.argmax(logits[:, 0, :], axis=-1),
                                         axis=-1)
            end_preds = tf.expand_dims(tf.argmax(logits[:, 1, :], axis=-1),
                                       axis=-1)
            self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
Example #3
def lm_loss(hidden, target, n_token, d_model, initializer, lookup_table=None,
            tie_weight=False, bi_data=True, use_tpu=False):
    '''doc.'''

    with tf.variable_scope('lm_loss'):
        if tie_weight:
            assert lookup_table is not None, \
                'lookup_table cannot be None for tie_weight'
            softmax_w = lookup_table
        else:
            softmax_w = tf.get_variable(
                'weight', [n_token, d_model],
                dtype=hidden.dtype, initializer=initializer)

        softmax_b = tf.get_variable(
            'bias', [n_token], dtype=hidden.dtype,
            initializer=tf.zeros_initializer())

        logits = tf.einsum('ibd,nd->ibn', hidden, softmax_w) + softmax_b
        preds = tf.argmax(logits, axis=-1)

        if use_tpu:
            one_hot_target = tf.one_hot(target, n_token, dtype=logits.dtype)
            loss = -tf.reduce_sum(
                tf.nn.log_softmax(logits) * one_hot_target, -1)
        else:
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=target, logits=logits)

        return loss, preds
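
A minimal usage sketch for lm_loss above (TensorFlow 1.x assumed; the shapes and sizes are illustrative, and the sequence-major 'ibd' layout matches the einsum string in the function):

import tensorflow as tf

# Illustrative sizes: seq_len=16, batch=8, d_model=64, vocabulary of 100 tokens.
hidden = tf.zeros([16, 8, 64])               # [seq_len, batch, d_model]
target = tf.zeros([16, 8], dtype=tf.int32)   # token ids to predict
initializer = tf.random_normal_initializer(stddev=0.02)

loss, preds = lm_loss(hidden, target, n_token=100, d_model=64,
                      initializer=initializer, tie_weight=False)
# `loss` is per position, shape [seq_len, batch]; reduce it before optimizing.
total_loss = tf.reduce_mean(loss)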
Example #4
    def __init__(self,
                 is_training,
                 input_tensor,
                 input_mask,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/sequence',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        batch_size = tf.shape(input_tensor)[0]
        seq_length = input_tensor.shape.as_list()[-2]
        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)

            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, label_size])

            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_token_losses = -tf.reduce_mean(one_hot_labels * log_probs,
                                               axis=-1)
            input_mask = tf.concat(
                [tf.zeros((batch_size, 1), dtype=tf.float32),
                 tf.cast(input_mask[:, 2:], dtype=tf.float32),
                 tf.zeros((batch_size, 1), dtype=tf.float32)],
                axis=-1)
            per_token_losses *= input_mask
            per_example_loss = tf.reduce_mean(per_token_losses, axis=-1)
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.losses[name] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Example #5
    def embedding_postprocessor(self,
                                input_tensor,
                                position_ids,
                                batch_size,
                                max_seq_length,
                                hidden_size,
                                use_token_type=False,
                                segment_ids=None,
                                token_type_vocab_size=16,
                                token_type_embedding_name=\
                                    'token_type_embeddings',
                                use_position_embeddings=True,
                                position_embedding_name='position_embeddings',
                                initializer_range=0.02,
                                max_position_embeddings=512,
                                dropout_prob=0.1,
                                dtype=tf.float32,
                                trainable=True):
        output = input_tensor

        if use_token_type:
            if segment_ids is None:
                raise ValueError(
                    'segment_ids must be specified if use_token_type is True.')
            token_type_table = tf.get_variable(
                name=token_type_embedding_name,
                shape=[token_type_vocab_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)

            # This vocab will be small so we always do one-hot here,
            # since it is always faster for a small vocabulary.
            flat_segment_ids = tf.reshape(segment_ids, [-1])
            one_hot_ids = tf.one_hot(flat_segment_ids,
                                     depth=token_type_vocab_size,
                                     dtype=dtype)
            token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
            token_type_embeddings = tf.reshape(
                token_type_embeddings,
                [batch_size, max_seq_length, hidden_size])
            output += token_type_embeddings

        if use_position_embeddings:
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, hidden_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)
            output += tf.gather(full_position_embeddings, position_ids)

        output = util.layer_norm_and_dropout(output,
                                             dropout_prob,
                                             trainable=trainable)
        return output
Example #6
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(
                label_ids, depth=label_size, dtype=tf.float32)
            per_example_loss = - tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(
                    sample_weight, dtype=tf.float32) * per_example_loss
            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(thresh, float), (
                    '`tsa_thresh` must be a float between 0 and 1.')
                uncertainty = tf.reduce_sum(self.probs['probs'] * tf.log(
                    self.probs['probs']), axis=-1)
                uncertainty /= tf.log(1 / label_size)
                per_example_loss = tf.cast(
                    tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                    per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Example #7
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable(
            'w', [1, nx, nf],
            initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(
            tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b,
            start + [nf])
        return c
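
A minimal usage sketch for conv1d above (TensorFlow 1.x assumed; `shape_list` is a helper from the surrounding module, so a small stand-in is defined here for when the snippets are pasted into one file):

import tensorflow as tf

def shape_list(x):
    '''Stand-in helper: static dimensions where known, dynamic otherwise.'''
    static, dynamic = x.shape.as_list(), tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]

x = tf.zeros([2, 10, 768])             # [batch, seq_len, n_embed]
h = conv1d(x, scope='c_fc', nf=3072)   # position-wise projection to 3072 units
# h has shape [2, 10, 3072]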
Example #8
    def __init__(self,
                 is_training,
                 input_tensor,
                 input_mask,
                 label_ids,
                 label_size=5,
                 sample_weight=None,
                 scope='cls/sequence',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        seq_length = input_tensor.shape.as_list()[-2]
        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)

            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, label_size])

            with tf.variable_scope('crf'):
                input_length = tf.reduce_sum(input_mask, axis=-1)
                per_example_loss, transition_matrix = \
                    contrib.crf.crf_log_likelihood(
                        inputs=logits,
                        tag_indices=label_ids,
                        sequence_lengths=input_length)
                per_example_loss = -per_example_loss
                if sample_weight is not None:
                    per_example_loss *= tf.cast(sample_weight,
                                                dtype=tf.float32)
                self.total_loss = tf.reduce_mean(per_example_loss)
                self.losses[name] = per_example_loss
                self.preds[name] = tf.argmax(logits, axis=-1)
                self.probs['logits'] = logits
                self.probs['transition_matrix'] = transition_matrix
Example #9
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    '''Normalize to mean = 0, std = 1, then do a diagonal affine transform.'''
    with tf.variable_scope(scope):
        n_state = x.shape[-1].value
        g = tf.get_variable('g', [n_state],
                            initializer=tf.constant_initializer(1))
        b = tf.get_variable('b', [n_state],
                            initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)
        s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
        x = (x - u) * tf.rsqrt(s + epsilon)
        x = x * g + b
        return x
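
A minimal usage sketch for norm above (TensorFlow 1.x assumed; `x.shape[-1].value` requires a statically known last dimension):

import tensorflow as tf

h = tf.zeros([2, 10, 768])   # [batch, seq_len, n_embed]
h = norm(h, 'ln_1')          # layer norm with learned per-feature gain g and bias b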
Example #10
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 label_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.sigmoid(logits, name='probs')

            self.probs['probs'] = probs
            self.preds['preds'] = tf.greater(probs, 0.5)

            per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits,
                labels=tf.cast(label_ids, dtype=tf.float32))
            if label_weight is not None:
                label_weight = tf.constant(label_weight, dtype=tf.float32)
                label_weight = tf.reshape(label_weight, [1, label_size])
                per_example_loss *= label_weight
            per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
            if sample_weight is not None:
                per_example_loss *= sample_weight

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Example #11
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   head_size,
                   initializer,
                   activation,
                   use_einsum,
                   name=None,
                   trainable=True):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    head_size: The size per attention head.
    initializer: Kernel initializer.
    activation: Activation function.
    use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """

  input_shape = util.get_shape_list(input_tensor)
  hidden_size = input_shape[2]

  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[hidden_size, num_attention_heads * head_size],
        initializer=initializer,
        trainable=trainable)
    w = tf.reshape(w, [hidden_size, num_attention_heads, head_size])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * head_size],
        initializer=tf.zeros_initializer,
        trainable=trainable)
    b = tf.reshape(b, [num_attention_heads, head_size])
    if use_einsum:
      ret = tf.einsum("BFH,HND->BFND", input_tensor, w)
    else:
      ret = einsum_via_matmul(input_tensor, w, 1)
    ret += b
  if activation is not None:
    return activation(ret)
  else:
    return ret
Example #12
def crf_log_likelihood(input_tensor,
                       tag_indices,
                       sequence_lengths,
                       transition_params=None):
    ''' Computes the log-likelihood of tag sequences in a CRF.

    Args:
        input_tensor: A [batch_size, max_seq_len, num_tags] tensor of unary
          potentials to use as input to the CRF layer.
        tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we
          compute the log-likelihood.
        sequence_lengths: A [batch_size] vector of true sequence lengths.
        transition_params: A [num_tags, num_tags] transition matrix, if available.

    Returns:
        log_likelihood: A [batch_size] `Tensor` containing the log-likelihood of
          each example, given the sequence of tag indices.
        transition_params: A [num_tags, num_tags] transition matrix. This is either
          provided by the caller or created in this function.
    '''
    # Get shape information.
    num_tags = util.get_shape_list(input_tensor)[2]

    # Get the transition matrix if not provided.
    if transition_params is None:
        transition_params = tf.get_variable('transitions',
                                            [num_tags, num_tags])

    sequence_scores = crf_sequence_score(input_tensor, tag_indices,
                                         sequence_lengths, transition_params)
    log_norm = crf_log_norm(input_tensor, sequence_lengths, transition_params)

    # Normalize the scores to get the log-likelihood per example.
    log_likelihood = sequence_scores - log_norm
    return log_likelihood, transition_params
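
A minimal usage sketch (TensorFlow 1.x assumed; `util.get_shape_list`, `crf_sequence_score` and `crf_log_norm` referenced above are taken to be available in the same module):

import tensorflow as tf

# Illustrative sizes: batch=4, max_seq_len=12, num_tags=5.
unary_potentials = tf.zeros([4, 12, 5])
tag_indices = tf.zeros([4, 12], dtype=tf.int32)
sequence_lengths = tf.fill([4], 12)

log_likelihood, transition_params = crf_log_likelihood(
    unary_potentials, tag_indices, sequence_lengths)
loss = tf.reduce_mean(-log_likelihood)   # negative log-likelihood as training loss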
Example #13
def embedding_lookup(x,
                     n_token,
                     d_embed,
                     initializer,
                     use_tpu=True,
                     scope='embedding',
                     tilda_embeddings=None,
                     reuse=None,
                     dtype=tf.float32):
    '''TPU and GPU embedding_lookup function.'''
    if tilda_embeddings is not None:
        lookup_table = tilda_embeddings
    else:
        with tf.variable_scope(scope, reuse=reuse):
            lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
                                           dtype=dtype,
                                           initializer=initializer)
    if use_tpu:
        one_hot_idx = tf.one_hot(x, n_token, dtype=dtype)
        if one_hot_idx.shape.ndims == 2:
            return (tf.einsum('in,nd->id', one_hot_idx,
                              lookup_table), lookup_table)
        else:
            return (tf.einsum('ibn,nd->ibd', one_hot_idx,
                              lookup_table), lookup_table)
    else:
        return tf.nn.embedding_lookup(lookup_table, x), lookup_table
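
A minimal usage sketch contrasting the two lookup paths above (TensorFlow 1.x assumed; sizes are illustrative):

import tensorflow as tf

ids = tf.zeros([16, 8], dtype=tf.int32)   # [seq_len, batch] token ids
init = tf.random_normal_initializer(stddev=0.02)

# GPU/CPU path: gather rows from the lookup table.
emb, table = embedding_lookup(ids, n_token=32000, d_embed=512,
                              initializer=init, use_tpu=False)
# TPU path: one-hot matmul against the same table (reuse the variable scope).
emb_tpu, _ = embedding_lookup(ids, n_token=32000, d_embed=512,
                              initializer=init, use_tpu=True, reuse=True)
# Both return a [16, 8, 512] embedding tensor plus the [32000, 512] table.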
Example #14
    def embedding_lookup(self,
                         input_ids,
                         vocab_size,
                         batch_size,
                         max_seq_length,
                         embedding_size=128,
                         initializer_range=0.02,
                         word_embedding_name='word_embeddings',
                         dtype=tf.float32,
                         trainable=True,
                         tilda_embeddings=None):
        if input_ids.shape.ndims == 2:
            input_ids = tf.expand_dims(input_ids, axis=[-1])

        if tilda_embeddings is not None:
            embedding_table = tilda_embeddings
        else:
            embedding_table = tf.get_variable(
                name=word_embedding_name,
                shape=[vocab_size, embedding_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)

        flat_input_ids = tf.reshape(input_ids, [-1])
        output = tf.gather(embedding_table,
                           flat_input_ids,
                           name='embedding_look_up')
        output = tf.reshape(output,
                            [batch_size, max_seq_length, embedding_size])

        return (output, embedding_table)
Example #15
    def embedding_preprocessor(self,
                               input_values,
                               batch_size=None,
                               embedding_size=128,
                               initializer_range=0.02,
                               name='cls_embedding',
                               dtype=tf.float32,
                               trainable=True):

        with tf.variable_scope(name):
            input_values = util.layer_norm(input_values, trainable=trainable)
            linear_output = tf.layers.dense(
                input_values,
                embedding_size,
                activation=None,
                name='dense',
                kernel_initializer=util.create_initializer(initializer_range),
                trainable=trainable)

            cls_embedding = tf.get_variable(
                name='cls',
                shape=[1, 1, embedding_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)
            cls_output = tf.tile(cls_embedding, [batch_size, 1, 1])

        output = tf.concat([cls_output, linear_output], axis=1)
        return output
Example #16
    def __init__(self,
                 is_training,
                 input_tensor,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 name='',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight,
                                           dtype=tf.float32) * per_example_loss

            self.losses[name] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Example #17
def head_projection(h, d_model, n_head, d_head, kernel_initializer, name):
    '''Project hidden states to a specific head with a 4D-shape.'''
    proj_weight = tf.get_variable('{}/kernel'.format(name),
                                  [d_model, n_head, d_head],
                                  dtype=h.dtype,
                                  initializer=kernel_initializer)
    head = tf.einsum('ibh,hnd->ibnd', h, proj_weight)

    return head
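
A minimal usage sketch for head_projection above (TensorFlow 1.x assumed; the 'ibh' layout is sequence-major, matching the einsum string):

import tensorflow as tf

h = tf.zeros([10, 2, 512])   # [seq_len, batch, d_model]
init = tf.random_normal_initializer(stddev=0.02)
q_head = head_projection(h, d_model=512, n_head=8, d_head=64,
                         kernel_initializer=init, name='q')
# q_head has shape [seq_len, batch, n_head, d_head] = [10, 2, 8, 64]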
Example #18
        def _get_logits(pooled_output, hidden_size, scope, trainable):
            with tf.variable_scope(scope):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[label_size, hidden_size],
                    initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[label_size],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                logits = tf.matmul(pooled_output,
                                   output_weights,
                                   transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)
                return logits
Example #19
def dense_layer_2d(input_tensor,
                   output_size,
                   initializer,
                   activation,
                   use_einsum,
                   num_attention_heads=1,
                   name=None,
                   trainable=True):
    """A dense layer with 2D kernel.

  Args:
    input_tensor: Float tensor with rank 3.
    output_size: The size of output dimension.
    initializer: Kernel initializer.
    activation: Activation function.
    use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers.
    num_attention_heads: Number of attention heads in the attention layer.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
    del num_attention_heads  # unused
    input_shape = util.get_shape_list(input_tensor)
    hidden_size = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(name="kernel",
                            shape=[hidden_size, output_size],
                            initializer=initializer,
                            trainable=trainable)
        b = tf.get_variable(name="bias",
                            shape=[output_size],
                            initializer=tf.zeros_initializer,
                            trainable=trainable)
        if use_einsum:
            ret = tf.einsum("BFH,HO->BFO", input_tensor, w)
        else:
            ret = tf.matmul(input_tensor, w)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
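
A minimal usage sketch for dense_layer_2d above (TensorFlow 1.x assumed; a stand-in for the module-level `util.get_shape_list` helper is provided for when the snippets are pasted into one file, and `use_einsum=True` avoids the `einsum_via_matmul` fallback):

import types
import tensorflow as tf

# Stand-in for the `util` module the function above expects (static shapes only).
util = types.SimpleNamespace(get_shape_list=lambda t: t.shape.as_list())

x = tf.zeros([2, 10, 768])   # [batch, seq_len, hidden]
init = tf.truncated_normal_initializer(stddev=0.02)
out = dense_layer_2d(x, output_size=3072, initializer=init,
                     activation=tf.nn.relu, use_einsum=True, name='ffn')
# out has shape [2, 10, 3072]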
Example #20
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    '''Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
      for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  '''
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    original_dims = input_ids.shape.ndims
    if original_dims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=util.create_initializer(initializer_range))

    if original_dims == 3:
        input_shape = util.get_shape_list(input_ids)
        # Keep the reshape result so the matmul below receives a rank-2 tensor.
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        output = tf.matmul(input_ids, embedding_table)
        output = tf.reshape(output,
                            [input_shape[0], input_shape[1], embedding_size])
    else:
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output = tf.nn.embedding_lookup(embedding_table, input_ids)

        input_shape = util.get_shape_list(input_ids)

        output = tf.reshape(
            output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return output, embedding_table
Example #21
def dense_layer_3d_proj(input_tensor,
                        hidden_size,
                        head_size,
                        initializer,
                        activation,
                        use_einsum,
                        name=None):
    """A dense layer with 3D kernel for projection.

  Args:
    input_tensor: float Tensor of shape [batch,from_seq_length,
      num_attention_heads, size_per_head].
    hidden_size: The size of hidden layer.
    head_size: The size of head.
    initializer: Kernel initializer.
    activation: Activation function.
    use_einsum: bool. Whether to use einsum or reshape+matmul for dense layers.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """
    input_shape = util.get_shape_list(input_tensor)
    num_attention_heads = input_shape[2]
    with tf.variable_scope(name):
        w = tf.get_variable(
            name="kernel",
            shape=[num_attention_heads * head_size, hidden_size],
            initializer=initializer)
        w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
        b = tf.get_variable(name="bias",
                            shape=[hidden_size],
                            initializer=tf.zeros_initializer)
        if use_einsum:
            ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
        else:
            ret = einsum_via_matmul(input_tensor, w, 2)
        ret += b
    if activation is not None:
        return activation(ret)
    else:
        return ret
Example #22
def ln(inputs, epsilon=1e-8, scope='ln'):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
    inputs: A tensor with 2 or more dimensions, where the first dimension has `batch_size`.
    epsilon: A floating number. A very small number for preventing ZeroDivision Error.
    scope: Optional scope for `variable_scope`.

    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable('beta', params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable('gamma', params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** 0.5)
        outputs = gamma * normalized + beta

    return outputs
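
A minimal usage sketch for ln above (TensorFlow 1.x assumed; because of `reuse=tf.AUTO_REUSE`, calls that share a scope also share beta and gamma):

import tensorflow as tf

x = tf.zeros([8, 32, 512])
y1 = ln(x)                  # normalizes over the last dimension under scope 'ln'
y2 = ln(x, scope='ln_2')    # a second, independent pair of beta/gamma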
Example #23
            def _forward(target_ids, target_mask, target_max_seq_length):

                with tf.variable_scope('decoder'):

                    # shared embedding
                    dec = tf.nn.embedding_lookup(embedding_table, target_ids)
                    dec *= hidden_size ** 0.5  # scale
                    dec += positional_encoding(dec, target_max_seq_length)
                    dec = util.dropout(dec, dropout_rate)

                    # blocks
                    for i in range(num_blocks):
                        with tf.variable_scope('block_%s' % i):

                            # masked self-attention
                            dec = multihead_attention(
                                queries=dec,
                                keys=dec,
                                values=dec,
                                key_masks=target_mask,
                                num_heads=num_attention_heads,
                                dropout_rate=dropout_rate,
                                training=is_training,
                                causality=True,
                                scope='masked_self_attention')

                            # vanilla attention
                            dec = multihead_attention(
                                queries=dec,
                                keys=memory,
                                values=memory,
                                key_masks=source_mask,
                                num_heads=num_attention_heads,
                                dropout_rate=dropout_rate,
                                training=is_training,
                                causality=False,
                                scope='vanilla_attention')

                            # feed forward
                            dec = ff(
                                dec, num_units=[4 * hidden_size, hidden_size])

                # final linear projection (embedding weights are shared)
                with tf.variable_scope('cls'):
                    output_bias = tf.get_variable(
                        'output_bias', shape=[vocab_size],
                        initializer=tf.zeros_initializer())
                    dec = tf.reshape(dec, [-1, hidden_size])
                    logits = tf.matmul(dec, embedding_table, transpose_b=True)
                    logits = tf.reshape(
                        logits, [-1, target_max_seq_length, vocab_size])
                    logits = tf.nn.bias_add(logits, output_bias)

                return logits
Example #24
def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
                       dropatt, input_mask, is_training, initializer,
                       scope=None, reuse=None, use_proj=True):

    '''
    Different classification tasks may or may not share the same parameters
    to summarize the sequence features.

    If shared, one can keep the `scope` to the default value `None`.
    Otherwise, one should specify a different `scope` for each task.
    '''

    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable('summary_bias', [d_model],
                                           dtype=hidden.dtype,
                                           initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(
                summary_bias, hidden, hidden, input_mask,
                d_model, n_head, d_head, dropout, dropatt,
                is_training, initializer, residual=False)
            summary = summary[0]
        else:
            raise ValueError('Unsupported summary type %s' % summary_type)

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(
                summary,
                d_model,
                activation=tf.tanh,
                kernel_initializer=initializer,
                name='summary')

        # dropout
        summary = tf.layers.dropout(
            summary, dropout, training=is_training,
            name='dropout')

    return summary
Example #25
    def _cls_fcn(self,
                 prev_output,
                 label_size,
                 hidden_size=768,
                 initializer_range=0.02,
                 dtype=tf.float32,
                 trainable=True):

        with tf.variable_scope('output'):
            cls_output_weights = tf.get_variable(
                'output_weights', [hidden_size, label_size],
                initializer=tf.truncated_normal_initializer(
                    stddev=initializer_range),
                dtype=dtype,
                trainable=trainable)
            cls_output_bias = tf.get_variable(
                'output_bias', [label_size],
                initializer=tf.zeros_initializer(),
                dtype=dtype,
                trainable=trainable)
            cls_logits = tf.matmul(prev_output[:, 0, :], cls_output_weights)
            cls_output = tf.nn.bias_add(cls_logits, cls_output_bias)

        return cls_output
Example #26
            def _forward(input_ids, past=None):
                batch, sequence = shape_list(input_ids)

                if tilda_embeddings is None:
                    wte = tf.get_variable(
                        'word_embeddings', [hparams.n_vocab, hparams.n_embed],
                        initializer=tf.random_normal_initializer(stddev=0.02))
                else:
                    wte = tilda_embeddings
                wpe = tf.get_variable(
                    'wpe', [hparams.n_ctx, hparams.n_embed],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                past_length = 0 if past is None else tf.shape(past)[-2]
                h = (tf.gather(wte, input_ids) +
                     tf.gather(wpe, positions_for(input_ids, past_length)))

                # stacked transformer layers
                presents = []
                pasts = tf.unstack(past, axis=1) if past is not None else \
                    [None] * hparams.n_layer
                assert len(pasts) == hparams.n_layer
                for layer, past in enumerate(pasts):
                    h, present = block(h,
                                       'h%d' % layer,
                                       past=past,
                                       hparams=hparams)
                    presents.append(present)
                present = tf.stack(presents, axis=1)
                h = norm(h, 'ln_f')

                # Language model loss.  Do tokens <n predict token n?
                h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed])
                logits = tf.matmul(h_flat, wte, transpose_b=True)
                logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])

                return logits, present
Example #27
def post_attention(h, attn_vec, d_model, n_head, d_head, dropout, is_training,
                   kernel_initializer, residual=True):
    '''Post-attention processing.'''
    # post-attention projection (back to `d_model`)
    proj_o = tf.get_variable(
        'o/kernel', [d_model, n_head, d_head],
        dtype=h.dtype, initializer=kernel_initializer)
    attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)

    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
    if residual:
        output = util.layer_norm(
            attn_out + h, name='LayerNorm')
    else:
        output = util.layer_norm(
            attn_out, name='LayerNorm')

    return output
Example #28
def get_token_embeddings(vocab_size, num_units, zero_pad=True):
    '''Constructs token embedding matrix.
    Note that the row for id 0 is set to zeros.
    vocab_size: scalar. V.
    num_units: embedding dimensionality. E.
    zero_pad: Boolean. If True, all values of the first row (id = 0) are constant
      zeros, so that query/key masks can be applied easily.

    Returns
    weight variable: (V, E)
    '''
    with tf.variable_scope('shared_weight_matrix'):
        embeddings = tf.get_variable(
            'weight_mat',
            dtype=tf.float32,
            shape=(vocab_size, num_units),
            initializer=xavier_initializer())
        if zero_pad:
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                    embeddings[1:, :]), 0)
    return embeddings
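
A minimal usage sketch for get_token_embeddings above (TensorFlow 1.x assumed; `xavier_initializer` is taken to come from `tf.contrib.layers`, so the import below is an assumption about the source module):

import tensorflow as tf
from tensorflow.contrib.layers import xavier_initializer

embeddings = get_token_embeddings(vocab_size=32000, num_units=512)  # (V, E)
token_ids = tf.constant([[2, 5, 0], [7, 1, 0]])                     # 0 = padding id
embedded = tf.nn.embedding_lookup(embeddings, token_ids)            # [2, 3, 512]
# With zero_pad=True, id 0 maps to an all-zero row, so padded positions
# contribute nothing once query/key masks are applied downstream.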
Example #29
    def __init__(self,
                 hparams,
                 is_training,
                 input_ids,
                 sample_weight=None,
                 scope='model',
                 given=1,
                 use_tilda_embedding=False,
                 **kwargs):
        super().__init__()

        batch_size = util.get_shape_list(input_ids, expected_rank=2)[0]
        max_seq_length = hparams.n_predict

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):

            def _forward(input_ids, past=None):
                batch, sequence = shape_list(input_ids)

                if tilda_embeddings is None:
                    wte = tf.get_variable(
                        'word_embeddings', [hparams.n_vocab, hparams.n_embed],
                        initializer=tf.random_normal_initializer(stddev=0.02))
                else:
                    wte = tilda_embeddings
                wpe = tf.get_variable(
                    'wpe', [hparams.n_ctx, hparams.n_embed],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                past_length = 0 if past is None else tf.shape(past)[-2]
                h = (tf.gather(wte, input_ids) +
                     tf.gather(wpe, positions_for(input_ids, past_length)))

                # stacked transformer layers
                presents = []
                pasts = tf.unstack(past, axis=1) if past is not None else \
                    [None] * hparams.n_layer
                assert len(pasts) == hparams.n_layer
                for layer, past in enumerate(pasts):
                    h, present = block(h,
                                       'h%d' % layer,
                                       past=past,
                                       hparams=hparams)
                    presents.append(present)
                present = tf.stack(presents, axis=1)
                h = norm(h, 'ln_f')

                # Language model loss.  Do tokens <n predict token n?
                h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed])
                logits = tf.matmul(h_flat, wte, transpose_b=True)
                logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])

                return logits, present

            # convert to labels
            label_ids = tf.concat(
                [input_ids[:, 1:],
                 tf.zeros([batch_size, 1], dtype=tf.int32)],
                axis=-1)

            # forward once
            if is_training:
                (logits, _) = _forward(input_ids)

                self.preds['LM'] = tf.argmax(logits, axis=-1)

            # forward loop
            else:
                input_ids = input_ids[:, 0:given]

                for cur_length in range(given, max_seq_length + 1):
                    (logits, _) = _forward(input_ids)

                    pred_ids = tf.argmax(logits[:,
                                                cur_length - 1:cur_length, :],
                                         axis=-1)
                    pred_ids = tf.cast(pred_ids, tf.int32)
                    input_ids = tf.concat([input_ids, pred_ids], axis=-1)

                self.preds['LM'] = input_ids

            # loss
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids, depth=hparams.n_vocab)
            per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                            axis=-1)
            label_mask = tf.cast(tf.not_equal(label_ids, 0), tf.float32)
            per_example_loss = \
                tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \
                tf.reduce_sum(label_mask, axis=-1)
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses['LM'] = per_example_loss
Example #30
    def __init__(self,
                 vocab_size,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 sample_weight=None,
                 reduced_size=64,
                 topic_size=1024,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 bias=0,
                 scope='vae',
                 trainable=True,
                 **kwargs):
        super().__init__()

        # freeze parameters
        config = Config(vocab_size,
                        hidden_size=hidden_size,
                        num_hidden_layers=num_hidden_layers,
                        num_attention_heads=num_attention_heads)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        use_tilda_embedding = kwargs.get('use_tilda_embedding')
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                (self.embedding_output, self.embedding_table) = \
                    self.embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=config.vocab_size,
                        batch_size=batch_size,
                        max_seq_length=seq_length,
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        tilda_embeddings=tilda_embeddings,
                        trainable=trainable)
                self.embedding_output = self.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    hidden_size=config.hidden_size,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    trainable=trainable)

            with tf.variable_scope('encoder'):

                # stacked transformer
                attention_mask = self.create_attention_mask_from_input_mask(
                    input_mask, batch_size, seq_length)
                self.all_encoder_layers = self.transformer_model(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=util.get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=\
                        config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    trainable=trainable)

                # projection
                with tf.variable_scope('projection'):
                    transformer_output = tf.layers.dense(
                        self.all_encoder_layers[-1],
                        reduced_size,
                        activation=util.gelu,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        trainable=trainable)
                    transformer_output = tf.reshape(transformer_output,
                                                    [batch_size, -1])
                    input_length = tf.reduce_sum(input_mask, axis=-1)
                    input_length = tf.cast(input_length, tf.float32)
                    input_length_1d = tf.reshape(input_length, [batch_size])
                    input_length_2d = tf.reshape(input_length, [batch_size, 1])

                    broadcast_mask = tf.sequence_mask(
                        tf.multiply(input_length_1d, reduced_size),
                        seq_length * reduced_size,
                        dtype=tf.float32)
                    broadcast_mask = tf.multiply(broadcast_mask,
                                                 seq_length / input_length_2d)
                    transformer_output *= broadcast_mask

                    # latent space
                    miu = tf.layers.dense(
                        transformer_output,
                        topic_size,
                        activation='tanh',
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        name='miu',
                        trainable=trainable)
                    sigma = tf.layers.dense(
                        transformer_output,
                        topic_size,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        name='sigma',
                        trainable=trainable)
                    self.probs['miu'] = miu
                    self.probs['sigma'] = sigma

            with tf.variable_scope('decoder'):
                with tf.variable_scope('projection'):

                    # reparameterization
                    if is_training:
                        noise = tf.random_normal([batch_size, topic_size])
                    else:
                        noise = tf.random_uniform([batch_size, topic_size],
                                                  minval=-bias,
                                                  maxval=bias)
                    decoder_input = miu + tf.exp(sigma) * noise

                    # projection
                    decoder_input = tf.layers.dense(
                        decoder_input,
                        seq_length * reduced_size,
                        activation=util.gelu,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        trainable=trainable)
                    intermediate_input = tf.reshape(
                        decoder_input, [-1, seq_length, reduced_size])
                    intermediate_input = util.layer_norm(intermediate_input,
                                                         trainable=trainable)
                    intermediate_input = util.dropout(
                        intermediate_input, config.hidden_dropout_prob)

                # MLP
                with tf.variable_scope('intermediate'):
                    intermediate_output = tf.layers.dense(
                        intermediate_input,
                        4 * reduced_size,
                        activation=util.gelu,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                with tf.variable_scope('output'):
                    decoder_output = tf.layers.dense(
                        intermediate_output,
                        config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                    decoder_output = util.layer_norm(decoder_output,
                                                     trainable=trainable)
                    decoder_output = util.dropout(decoder_output,
                                                  config.hidden_dropout_prob)
                self.all_decoder_layers = [intermediate_output, decoder_output]
                self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(input_tensor,
                                               trainable=trainable)
            output_weights = self.embedding_table
            output_bias = tf.get_variable('output_bias',
                                          shape=[config.vocab_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)
            flatten_input_tensor = tf.reshape(input_tensor,
                                              [-1, config.hidden_size])

            logits = tf.matmul(flatten_input_tensor,
                               output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            logits = tf.reshape(logits,
                                [batch_size, seq_length, config.vocab_size])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)

            self.preds['preds'] = tf.argmax(probs, axis=-1)
            one_hot_labels = tf.one_hot(input_ids,
                                        depth=config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(lm_log_probs * one_hot_labels,
                                              axis=[-1])
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = (tf.reduce_mean(per_example_loss) +
                               tf.reduce_mean(tf.square(miu)) +
                               tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss