Esempio n. 1
0
    def embedding_lookup(self,
                         input_ids,
                         vocab_size,
                         batch_size,
                         max_seq_length,
                         embedding_size=128,
                         initializer_range=0.02,
                         word_embedding_name='word_embeddings',
                         dtype=tf.float32,
                         trainable=True,
                         tilda_embeddings=None):
        '''Maps token ids to their embedding vectors.

        Args:
          input_ids: Tensor of token ids. It is flattened before the lookup,
            so the result can only be reshaped if it holds
            batch_size * max_seq_length ids.
          vocab_size: Number of rows of the table created when
            `tilda_embeddings` is not supplied.
          batch_size: First dimension of the returned embeddings.
          max_seq_length: Second dimension of the returned embeddings.
          embedding_size: Last dimension of the returned embeddings (and the
            width of a newly created table).
          initializer_range: Forwarded to `util.create_initializer` when a
            new table is created.
          word_embedding_name: Variable name of a newly created table.
          dtype: Dtype of a newly created table.
          trainable: Whether a newly created table is trainable.
          tilda_embeddings: Optional existing table; when given, it is used
            as-is and no variable is created.

        Returns:
          A tuple `(output, embedding_table)`; `output` is reshaped to
          [batch_size, max_seq_length, embedding_size].
        '''
        # Rank-2 ids get a trailing axis (they are flattened again below).
        if input_ids.shape.ndims == 2:
            input_ids = tf.expand_dims(input_ids, axis=[-1])

        # Prefer the externally supplied table; otherwise create our own.
        embedding_table = tilda_embeddings
        if embedding_table is None:
            embedding_table = tf.get_variable(
                name=word_embedding_name,
                shape=[vocab_size, embedding_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)

        token_ids = tf.reshape(input_ids, [-1])
        gathered = tf.gather(embedding_table,
                             token_ids,
                             name='embedding_look_up')
        output_shape = [batch_size, max_seq_length, embedding_size]
        return (tf.reshape(gathered, output_shape), embedding_table)
Esempio n. 2
0
def gather_positions(sequence, positions):
    '''Picks, for every batch row, the entries at the requested positions.

    Args:
      sequence: A [batch_size, seq_length] or
          [batch_size, seq_length, depth] tensor of values
      positions: A [batch_size, n_positions] tensor of indices

    Returns: A [batch_size, n_positions] or
      [batch_size, n_positions, depth] tensor holding, for each row, the
      values found at that row's indices
    '''
    dims = util.get_shape_list(sequence, expected_rank=[2, 3])
    has_depth = len(dims) == 3
    if has_depth:
        batch, length, depth = dims
    else:
        # Give rank-2 input a unit depth so both cases share one code path.
        batch, length = dims
        depth = 1
        sequence = tf.expand_dims(sequence, -1)

    # Row b starts at index b * length once the sequence is flattened.
    row_offsets = tf.expand_dims(length * tf.range(batch), -1)
    flat_indices = tf.reshape(positions + row_offsets, [-1])
    flat_values = tf.reshape(sequence, [batch * length, depth])
    picked = tf.gather(flat_values, flat_indices)

    out_shape = [batch, -1, depth] if has_depth else [batch, -1]
    return tf.reshape(picked, out_shape)
Esempio n. 3
0
def crf_binary_score(tag_indices, sequence_lengths, transition_params):
    ''' Sums the transition (binary) potentials along each tag sequence.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      binary_scores: A [batch_size] vector of binary scores.
    '''
    num_tags = transition_params.get_shape()[0]
    max_seq_len = tf.shape(tag_indices)[1]
    num_transitions = max_seq_len - 1

    # Transition t goes from the tag at step t to the tag at step t + 1, so
    # drop the last column for the sources and the first for the targets.
    from_tags = tf.slice(tag_indices, [0, 0], [-1, num_transitions])
    to_tags = tf.slice(tag_indices, [0, 1], [-1, num_transitions])

    # Look the pairs up in the row-major flattened transition matrix.
    flat_indices = from_tags * num_tags + to_tags
    flat_params = tf.reshape(transition_params, [-1])
    transition_scores = tf.gather(flat_params, flat_indices)

    # Only transitions that end inside the true sequence length count.
    step_masks = tf.sequence_mask(sequence_lengths,
                                  maxlen=max_seq_len,
                                  dtype=tf.float32)
    transition_masks = tf.slice(step_masks, [0, 1], [-1, -1])
    return tf.reduce_sum(transition_scores * transition_masks, 1)
Esempio n. 4
0
def crf_unary_score(tag_indices, sequence_lengths, inputs):
    ''' Sums the per-step (unary) potentials of each tag sequence.

    Args:
      tag_indices: A [batch_size, max_seq_len] matrix of tag indices.
      sequence_lengths: A [batch_size] vector of true sequence lengths.
      inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials.

    Returns:
      unary_scores: A [batch_size] vector of unary scores.
    '''
    input_shape = tf.shape(inputs)
    batch_size = input_shape[0]
    max_seq_len = input_shape[1]
    num_tags = input_shape[2]

    flat_potentials = tf.reshape(inputs, [-1])

    # Index of tag 0 for every (batch row, time step) in the flattened
    # potentials; adding the tag index then selects the wanted potential.
    row_offsets = tf.expand_dims(
        tf.range(batch_size) * max_seq_len * num_tags, 1)
    step_offsets = tf.expand_dims(tf.range(max_seq_len) * num_tags, 0)
    offsets = row_offsets + step_offsets
    # Match the index dtype of tag_indices (int32 or int64).
    if tag_indices.dtype == tf.int64:
        offsets = tf.cast(offsets, tf.int64)
    flat_indices = tf.reshape(offsets + tag_indices, [-1])

    step_scores = tf.reshape(tf.gather(flat_potentials, flat_indices),
                             [batch_size, max_seq_len])

    # Zero out the steps beyond each sequence's true length.
    step_masks = tf.sequence_mask(sequence_lengths,
                                  maxlen=tf.shape(tag_indices)[1],
                                  dtype=tf.float32)
    return tf.reduce_sum(step_scores * step_masks, 1)
Esempio n. 5
0
    def embedding_postprocessor(
            self,
            input_tensor,
            position_ids,
            batch_size,
            max_seq_length,
            hidden_size,
            use_token_type=False,
            segment_ids=None,
            token_type_vocab_size=16,
            token_type_embedding_name='token_type_embeddings',
            use_position_embeddings=True,
            position_embedding_name='position_embeddings',
            initializer_range=0.02,
            max_position_embeddings=512,
            dropout_prob=0.1,
            dtype=tf.float32,
            trainable=True):
        '''Adds token-type and position embeddings, then normalizes.

        Args:
          input_tensor: Embeddings the extra embeddings are added to.
          position_ids: Indices gathered from the position embedding table.
          batch_size: First dimension of the token-type embeddings.
          max_seq_length: Second dimension of the token-type embeddings.
          hidden_size: Width of both embedding tables.
          use_token_type: Whether to add token-type embeddings.
          segment_ids: Token-type ids; required when `use_token_type` is
            True.
          token_type_vocab_size: Number of rows of the token-type table.
          token_type_embedding_name: Variable name of the token-type table.
          use_position_embeddings: Whether to add position embeddings.
          position_embedding_name: Variable name of the position table.
          initializer_range: Forwarded to `util.create_initializer`.
          max_position_embeddings: Number of rows of the position table.
          dropout_prob: Forwarded to `util.layer_norm_and_dropout`.
          dtype: Dtype of the created variables and the one-hot encoding.
          trainable: Whether the created variables are trainable.

        Returns:
          The result of `util.layer_norm_and_dropout` applied to the summed
          embeddings.

        Raises:
          ValueError: If `use_token_type` is True but `segment_ids` is None.
        '''
        if use_token_type and segment_ids is None:
            raise ValueError(
                'segment_ids must be specified if use_token_type is True.')

        output = input_tensor

        if use_token_type:
            type_table = tf.get_variable(
                name=token_type_embedding_name,
                shape=[token_type_vocab_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)

            # The type vocabulary is small, so a one-hot matmul is used for
            # the lookup instead of a gather.
            one_hot_types = tf.one_hot(tf.reshape(segment_ids, [-1]),
                                       depth=token_type_vocab_size,
                                       dtype=dtype)
            type_embeddings = tf.reshape(
                tf.matmul(one_hot_types, type_table),
                [batch_size, max_seq_length, hidden_size])
            output += type_embeddings

        if use_position_embeddings:
            position_table = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, hidden_size],
                initializer=util.create_initializer(initializer_range),
                dtype=dtype,
                trainable=trainable)
            output += tf.gather(position_table, position_ids)

        return util.layer_norm_and_dropout(output,
                                           dropout_prob,
                                           trainable=trainable)
Esempio n. 6
0
        def gather_indexes(sequence_tensor, positions):
            '''Gathers the vectors found at `positions` in every batch row.

            `sequence_tensor` is unpacked as
            (batch_size, seq_length, width); the result is the flat list of
            selected `width`-sized vectors, one per entry of `positions`.
            '''
            batch_size, seq_length, width = util.get_shape_list(
                sequence_tensor, 3)[:3]

            # Row b starts at index b * seq_length in the flattened tensor.
            row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
            row_starts = tf.reshape(row_starts, [-1, 1])
            flat_indices = tf.reshape(positions + row_starts, [-1])
            flat_vectors = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
            return tf.gather(flat_vectors, flat_indices)
Esempio n. 7
0
            def _forward(input_ids, past=None):
                '''Runs the transformer stack and projects to vocab logits.

                Args:
                  input_ids: Token ids, unpacked as (batch, sequence).
                  past: Optional cached state; it is unstacked along axis 1
                    into one entry per layer.

                Returns:
                  A tuple `(logits, present)` with `logits` reshaped to
                  [batch, sequence, hparams.n_vocab] and `present` the
                  per-layer states stacked along axis 1.
                '''
                batch, sequence = shape_list(input_ids)

                # Token embeddings: reuse the supplied table when present.
                if tilda_embeddings is not None:
                    wte = tilda_embeddings
                else:
                    wte = tf.get_variable(
                        'word_embeddings', [hparams.n_vocab, hparams.n_embed],
                        initializer=tf.random_normal_initializer(stddev=0.02))
                wpe = tf.get_variable(
                    'wpe', [hparams.n_ctx, hparams.n_embed],
                    initializer=tf.random_normal_initializer(stddev=0.01))

                if past is None:
                    past_length = 0
                else:
                    past_length = tf.shape(past)[-2]
                token_embeddings = tf.gather(wte, input_ids)
                position_embeddings = tf.gather(
                    wpe, positions_for(input_ids, past_length))
                h = token_embeddings + position_embeddings

                # One cached state (or None) per transformer layer.
                if past is not None:
                    pasts = tf.unstack(past, axis=1)
                else:
                    pasts = [None] * hparams.n_layer
                assert len(pasts) == hparams.n_layer

                presents = []
                for layer_index, layer_past in enumerate(pasts):
                    h, layer_present = block(h,
                                             'h%d' % layer_index,
                                             past=layer_past,
                                             hparams=hparams)
                    presents.append(layer_present)
                present = tf.stack(presents, axis=1)
                h = norm(h, 'ln_f')

                # Output projection shares weights with the token embeddings.
                flat_hidden = tf.reshape(h, [batch * sequence, hparams.n_embed])
                flat_logits = tf.matmul(flat_hidden, wte, transpose_b=True)
                logits = tf.reshape(flat_logits,
                                    [batch, sequence, hparams.n_vocab])

                return logits, present
Esempio n. 8
0
    def __init__(self,
                 is_training,
                 input_tensor,
                 n_wide_features,
                 wide_features,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Builds a classifier that attends from deep to wide features.

        The deep representation attends over the embedded wide features;
        the attended vector is added back, layer-normalized and fed to a
        softmax classification layer.

        Args:
          is_training: Python bool; dropout is only applied when True.
          input_tensor: Deep representation, used as [B, H].
          n_wide_features: Number of valid wide features per example, used
            as the lengths of the attention mask.
          wide_features: Wide feature ids, used as [B, N] indices into the
            feature embedding table.
          label_ids: Gold label ids, one-hot encoded with `label_size`.
          label_size: Number of classes.
          sample_weight: Optional per-example loss weights.
          scope: Variable scope of the output layer.
          hidden_dropout_prob: Dropout rate before the output layer.
          initializer_range: Forwarded to `util.create_initializer`.
          trainable: Whether the created variables are trainable.
          **kwargs: Forwarded to the parent constructor. `tsa_thresh`
            (float), if present, masks out the loss of examples whose
            normalized prediction entropy is not above the threshold.

        Sets `self.preds['preds']`, `self.probs['probs']`,
        `self.losses['losses']` and `self.total_loss` (the dicts are
        presumably created by the parent constructor).
        '''
        super().__init__(**kwargs)

        hidden_size = input_tensor.shape.as_list()[-1]
        feature_size = wide_features.shape.as_list()[-1]
        with tf.variable_scope('wide'):
            feature_embeddings = tf.get_variable(
                name='feature_embeddings',
                shape=[feature_size + 1, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            wide_output = tf.gather(feature_embeddings,
                                    wide_features)  # [B, N, H]

        with tf.variable_scope('wide_and_deep'):
            deep_output = tf.expand_dims(input_tensor, -1)  # [B, H, 1]
            attention_scores = tf.matmul(wide_output, deep_output)  # [B, N, 1]
            attention_scores = tf.transpose(attention_scores,
                                            [0, 2, 1])  # [B, 1, N]
            attention_scores = tf.multiply(attention_scores,
                                           1.0 / math.sqrt(hidden_size))
            feature_mask = tf.cast(
                tf.sequence_mask(n_wide_features, feature_size),
                tf.float32)  # [B, N]
            feature_mask = tf.expand_dims(feature_mask, 1)  # [B, 1, N]
            # Push masked slots towards zero probability in the softmax.
            attention_scores += (1.0 - feature_mask) * -10000.0
            attention_matrix = tf.nn.softmax(attention_scores, axis=-1)
            attention_output = tf.matmul(attention_matrix,
                                         wide_output)  # [B, 1, H]
            attention_output = attention_output[:, 0, :]  # [B, H]
            # NOTE: no dropout is applied to attention_output here (the
            # call was commented out in the original code).
            input_tensor = util.layer_norm(attention_output + input_tensor,
                                           trainable=trainable)

        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight,
                                           dtype=tf.float32) * per_example_loss
            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(
                    thresh,
                    float), ('`tsa_thresh` must be a float between 0 and 1.')
                # Entropy normalized by log(label_size). The log-softmax
                # output is reused instead of tf.log(probs): a probability
                # that underflows to 0 would give 0 * -inf = NaN, and the
                # comparison below would then silently drop the example.
                uncertainty = tf.reduce_sum(self.probs['probs'] * log_probs,
                                            axis=-1)
                uncertainty /= tf.log(1 / label_size)
                # Keep the loss only for sufficiently uncertain examples.
                per_example_loss = tf.cast(
                    tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                    per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)
Esempio n. 9
0
    def __init__(self,
                 vocab_size,
                 filter_sizes,
                 num_channels,
                 is_training,
                 input_ids,
                 scope='text_cnn',
                 embedding_size=256,
                 dropout_prob=0.1,
                 trainable=True,
                 **kwargs):
        '''Builds a TextCNN encoder: embedding, parallel convs, max-pool.

        Args:
          vocab_size: Number of rows of the word embedding table.
          filter_sizes: Convolution window heights, either a list of
            integers (or integer strings) or one comma-separated string.
          num_channels: Number of output channels per filter size.
          is_training: Python bool; dropout is only applied when True.
          input_ids: Rank-2 tensor of token ids, [batch, max_seq_length].
          scope: Variable scope that wraps all created variables.
          embedding_size: Width of the word embeddings.
          dropout_prob: Dropout rate applied to the pooled features.
          trainable: Whether the created variables are trainable.
          **kwargs: `use_tilda_embedding`, if truthy, reuses the existing
            `tilda_embeddings` variable as the embedding table.

        Sets `self.pooled_output`, reshaped to
        [batch, num_channels * len(filter_sizes)].
        '''
        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        max_seq_length = input_shape[1]

        if isinstance(filter_sizes, str):
            filter_sizes = filter_sizes.split(',')
        assert isinstance(filter_sizes, list), (
            '`filter_sizes` should be a list of integers or a string '
            'seperated with commas.')
        # Sizes parsed from a string are still strings; convert once so the
        # filter shapes, scope names and pooling windows all use integers.
        filter_sizes = [int(filter_size) for filter_size in filter_sizes]

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        use_tilda_embedding = kwargs.get('use_tilda_embedding')
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                if tilda_embeddings is not None:
                    embedding_table = tilda_embeddings
                else:
                    embedding_table = tf.get_variable(
                        name='word_embeddings',
                        shape=[vocab_size, embedding_size],
                        initializer=util.create_initializer(0.02),
                        dtype=tf.float32,
                        trainable=trainable)

                flat_input_ids = tf.reshape(input_ids, [-1])
                output = tf.gather(
                    embedding_table, flat_input_ids, name='embedding_look_up')
                output = tf.reshape(
                    output, [batch_size, max_seq_length, embedding_size])

                # Add a trailing channel axis for conv2d.
                output_expanded = tf.expand_dims(output, -1)

            # Create a convolution + maxpool layer for each filter size
            pooled_outputs = []
            for filter_size in filter_sizes:
                with tf.variable_scope('conv_%s' % filter_size):

                    # Convolution Layer
                    filter_shape = [
                        filter_size, embedding_size, 1, num_channels]
                    # The first positional argument of the initializer is
                    # `mean`, so the spread must be passed as `stddev`.
                    W = tf.get_variable(
                        name='W',
                        shape=filter_shape,
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.1),
                        dtype=tf.float32,
                        trainable=trainable)
                    b = tf.get_variable(
                        name='b',
                        shape=[num_channels],
                        initializer=tf.constant_initializer(0.1),
                        dtype=tf.float32,
                        trainable=trainable)
                    conv = tf.nn.conv2d(
                        output_expanded, W,
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name='conv')

                    # Apply nonlinearity
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                    # Max-pool over every window position of the sequence.
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, max_seq_length - filter_size + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name='pool')
                    pooled_outputs.append(pooled)

            num_channels_total = num_channels * len(filter_sizes)
            h_pool = tf.concat(pooled_outputs, 3)
            h_pool_flat = tf.reshape(h_pool, [batch_size, num_channels_total])

            with tf.name_scope('dropout'):
                # Dropout is a training-only regularizer; disable it at
                # inference time.
                self.pooled_output = util.dropout(
                    h_pool_flat, dropout_prob if is_training else 0.0)
Esempio n. 10
0
    def __init__(self,
                 bert_config,
                 is_training,
                 input_tensor,
                 input_mask,
                 sem_features,
                 label_ids,
                 max_seq_length,
                 feature_size,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 **kwargs):
        '''Builds a classifier enriched with self-attended sem features.

        The semantic feature ids are embedded and passed through one
        self-attention layer; the first position of the attention output
        is added to `input_tensor`, layer-normalized and fed to a softmax
        classification layer.

        Args:
          bert_config: Config object; `num_attention_heads` is read.
          is_training: Python bool; dropout is only applied when True.
          input_tensor: Pooled representation; its last dimension is the
            hidden size.
          input_mask: Mask used to build the attention mask.
          sem_features: Semantic feature ids, used as indices into the
            feature embedding table.
          label_ids: Gold label ids, one-hot encoded with `label_size`.
          max_seq_length: Sequence length passed to the attention layer.
          feature_size: Number of feature ids; three extra embedding rows
            are reserved for [PAD], [CLS] and [SEP].
          label_size: Number of classes.
          sample_weight: Optional per-example loss weights.
          scope: Variable scope of the output layer.
          hidden_dropout_prob: Dropout rate of the attention probabilities
            and before the output layer.
          initializer_range: Forwarded to `util.create_initializer`.
          trainable: Whether the created variables are trainable.
          **kwargs: Forwarded to the parent constructor. `tsa_thresh`
            (float), if present, masks out the loss of examples whose
            normalized prediction entropy is not above the threshold.

        Sets `self.preds['preds']`, `self.probs['probs']`,
        `self.losses['losses']` and `self.total_loss` (the dicts are
        presumably created by the parent constructor).
        '''
        super().__init__(**kwargs)

        input_shape = util.get_shape_list(input_tensor)
        batch_size = input_shape[0]
        hidden_size = input_shape[-1]
        with tf.variable_scope('sem'):
            feature_embeddings = tf.get_variable(
                name='feature_embeddings',
                shape=[feature_size + 3,
                       hidden_size],  # for [PAD], [CLS], [SEP]
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            sem_output = tf.gather(feature_embeddings,
                                   sem_features)  # [B, N, H]

            with tf.variable_scope('self'):
                attention_mask = BERTEncoder.create_attention_mask_from_input_mask(
                    input_mask, batch_size, max_seq_length)
                # A single attention layer is built, so its output is used
                # directly; nothing needs to be concatenated.
                (attention_output, _) = BERTEncoder.attention_layer(
                    from_tensor=sem_output,
                    to_tensor=sem_output,
                    attention_mask=attention_mask,
                    num_attention_heads=bert_config.num_attention_heads,
                    size_per_head=(hidden_size //
                                   bert_config.num_attention_heads),
                    attention_probs_dropout_prob=hidden_dropout_prob
                    if is_training else 0.0,
                    initializer_range=initializer_range,
                    do_return_2d_tensor=False,
                    batch_size=batch_size,
                    from_max_seq_length=max_seq_length,
                    to_max_seq_length=max_seq_length,
                    trainable=trainable)

            # Keep only the first position of the attended sequence.
            attention_output = attention_output[:, 0, :]  # [B, H]
            input_tensor = util.layer_norm(attention_output + input_tensor,
                                           trainable=trainable)

        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            self.preds['preds'] = tf.argmax(logits, axis=-1)
            self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids,
                                        depth=label_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
                                              axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight,
                                           dtype=tf.float32) * per_example_loss
            thresh = kwargs.get('tsa_thresh')
            if thresh is not None:
                assert isinstance(
                    thresh,
                    float), ('`tsa_thresh` must be a float between 0 and 1.')
                # Entropy normalized by log(label_size). The log-softmax
                # output is reused instead of tf.log(probs): a probability
                # that underflows to 0 would give 0 * -inf = NaN, and the
                # comparison below would then silently drop the example.
                uncertainty = tf.reduce_sum(self.probs['probs'] * log_probs,
                                            axis=-1)
                uncertainty /= tf.log(1 / label_size)
                # Keep the loss only for sufficiently uncertain examples.
                per_example_loss = tf.cast(
                    tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                    per_example_loss

            self.losses['losses'] = per_example_loss
            self.total_loss = tf.reduce_mean(per_example_loss)