Example 1
    def _cls_fcn(self,
                 prev_output,
                 label_size,
                 hidden_size=768,
                 initializer_range=0.02,
                 dtype=tf.float32,
                 trainable=True):

        with tf.variable_scope('output'):
            cls_output_weights = tf.get_variable(
                'output_weights', [hidden_size, label_size],
                initializer=tf.truncated_normal_initializer(
                    stddev=initializer_range),
                dtype=dtype,
                trainable=trainable)
            cls_output_bias = tf.get_variable(
                'output_bias', [label_size],
                initializer=tf.zeros_initializer(),
                dtype=dtype,
                trainable=trainable)
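            # project the first ([CLS]) token representation to label logits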
            cls_logits = tf.matmul(prev_output[:, 0, :], cls_output_weights)
            cls_output = tf.nn.bias_add(cls_logits, cls_output_bias)

        return cls_output
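A minimal standalone sketch of the same pattern, assuming a TF 1.x graph session and made-up shapes (batch of 4, sequence length 128, hidden size 768, 2 labels); the names below are illustrative and not from the original project:

import numpy as np
import tensorflow as tf

# Stand-in for an encoder output of shape [batch, seq_len, hidden].
sequence_output = tf.placeholder(tf.float32, [None, 128, 768])

with tf.variable_scope('output'):
    output_weights = tf.get_variable(
        'output_weights', [768, 2],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        'output_bias', [2], initializer=tf.zeros_initializer())
    # Only the first ([CLS]) position feeds the classifier.
    cls_logits = tf.nn.bias_add(
        tf.matmul(sequence_output[:, 0, :], output_weights), output_bias)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    logits = sess.run(
        cls_logits, {sequence_output: np.zeros((4, 128, 768), np.float32)})
    print(logits.shape)  # (4, 2)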
Example 2
    def __init__(self,
                 vocab_size,
                 is_training,
                 input_ids,
                 input_mask,
                 segment_ids,
                 sample_weight=None,
                 reduced_size=64,
                 topic_size=1024,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 bias=0,
                 scope='vae',
                 trainable=True,
                 **kwargs):
        super().__init__()

        # freeze parameters
        config = Config(vocab_size,
                        hidden_size=hidden_size,
                        num_hidden_layers=num_hidden_layers,
                        num_attention_heads=num_attention_heads)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        use_tilda_embedding = kwargs.get('use_tilda_embedding')
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                (self.embedding_output, self.embedding_table) = \
                    self.embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=config.vocab_size,
                        batch_size=batch_size,
                        max_seq_length=seq_length,
                        embedding_size=config.hidden_size,
                        initializer_range=config.initializer_range,
                        word_embedding_name='word_embeddings',
                        tilda_embeddings=tilda_embeddings,
                        trainable=trainable)
                self.embedding_output = self.embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    hidden_size=config.hidden_size,
                    use_token_type=True,
                    segment_ids=segment_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob,
                    trainable=trainable)

            with tf.variable_scope('encoder'):

                # stacked transformer
                attention_mask = self.create_attention_mask_from_input_mask(
                    input_mask, batch_size, seq_length)
                self.all_encoder_layers = self.transformer_model(
                    input_tensor=self.embedding_output,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=util.get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=\
                        config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    trainable=trainable)

                # projection
                with tf.variable_scope('projection'):
                    transformer_output = tf.layers.dense(
                        self.all_encoder_layers[-1],
                        reduced_size,
                        activation=util.gelu,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        trainable=trainable)
                    transformer_output = tf.reshape(transformer_output,
                                                    [batch_size, -1])
                    input_length = tf.reduce_sum(input_mask, axis=-1)
                    input_length = tf.cast(input_length, tf.float32)
                    input_length_1d = tf.reshape(input_length, [batch_size])
                    input_length_2d = tf.reshape(input_length, [batch_size, 1])

                    broadcast_mask = tf.sequence_mask(
                        tf.multiply(input_length_1d, reduced_size),
                        seq_length * reduced_size,
                        dtype=tf.float32)
                    broadcast_mask = tf.multiply(broadcast_mask,
                                                 seq_length / input_length_2d)
                    transformer_output *= broadcast_mask

                    # latent space
                    miu = tf.layers.dense(
                        transformer_output,
                        topic_size,
                        activation='tanh',
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        name='miu',
                        trainable=trainable)
                    sigma = tf.layers.dense(
                        transformer_output,
                        topic_size,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        name='sigma',
                        trainable=trainable)
                    self.probs['miu'] = miu
                    self.probs['sigma'] = sigma

            with tf.variable_scope('decoder'):
                with tf.variable_scope('projection'):

                    # reparameterization
                    if is_training:
                        noise = tf.random_normal([batch_size, topic_size])
                    else:
                        noise = tf.random_uniform([batch_size, topic_size],
                                                  minval=-bias,
                                                  maxval=bias)
                    decoder_input = miu + tf.exp(sigma) * noise

                    # projection
                    decoder_input = tf.layers.dense(
                        decoder_input,
                        seq_length * reduced_size,
                        activation=util.gelu,
                        kernel_initializer=tf.truncated_normal_initializer(
                            stddev=config.initializer_range),
                        trainable=trainable)
                    intermediate_input = tf.reshape(
                        decoder_input, [-1, seq_length, reduced_size])
                    intermediate_input = util.layer_norm(intermediate_input,
                                                         trainable=trainable)
                    intermediate_input = util.dropout(
                        intermediate_input, config.hidden_dropout_prob)

                # MLP
                with tf.variable_scope('intermediate'):
                    intermediate_output = tf.layers.dense(
                        intermediate_input,
                        4 * reduced_size,
                        activation=util.gelu,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                with tf.variable_scope('output'):
                    decoder_output = tf.layers.dense(
                        intermediate_output,
                        config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            config.initializer_range),
                        trainable=trainable)
                    decoder_output = util.layer_norm(decoder_output,
                                                     trainable=trainable)
                    decoder_output = util.dropout(decoder_output,
                                                  config.hidden_dropout_prob)
                self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(input_tensor,
                                               trainable=trainable)
            output_weights = self.embedding_table
            output_bias = tf.get_variable('output_bias',
                                          shape=[config.vocab_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)
            flatten_input_tensor = tf.reshape(input_tensor,
                                              [-1, config.hidden_size])

            logits = tf.matmul(flatten_input_tensor,
                               output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            logits = tf.reshape(logits,
                                [batch_size, seq_length, config.vocab_size])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)

            self.preds['preds'] = tf.argmax(probs, axis=-1)
            one_hot_labels = tf.one_hot(input_ids,
                                        depth=config.vocab_size,
                                        dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(lm_log_probs * one_hot_labels,
                                              axis=[-1])
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = (tf.reduce_mean(per_example_loss) +
                               tf.reduce_mean(tf.square(miu)) +
                               tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss
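The decoder samples its input with the reparameterization trick and the total loss adds a penalty on `miu` and `sigma`; a minimal sketch of just those two pieces, with toy sizes and zero tensors standing in for the dense projections above:

import tensorflow as tf

# Toy batch/topic sizes; `miu` and `sigma` stand in for the two dense
# projections computed inside the encoder's 'projection' scope.
batch_size, topic_size = 8, 1024
miu = tf.zeros([batch_size, topic_size])
sigma = tf.zeros([batch_size, topic_size])

# Reparameterization: z = miu + exp(sigma) * eps, with eps ~ N(0, 1) at training time.
noise = tf.random_normal([batch_size, topic_size])
latent = tf.add(miu, tf.exp(sigma) * noise)

# Regularizer added to the reconstruction term in `self.total_loss`.
regularizer = (tf.reduce_mean(tf.square(miu)) +
               tf.reduce_mean(tf.exp(sigma) - sigma - 1))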
Example 3
def create_initializer(initializer_range=0.02):
    '''Creates a truncated_normal_initializer with the given range.'''
    return tf.truncated_normal_initializer(stddev=initializer_range)
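Hypothetical usage, e.g. wiring the helper into a dense layer (the helper is repeated here so the snippet runs on its own):

import tensorflow as tf

def create_initializer(initializer_range=0.02):
    '''Creates a truncated_normal_initializer with the given range.'''
    return tf.truncated_normal_initializer(stddev=initializer_range)

# A dense projection whose kernel is drawn from the configured
# truncated normal distribution.
inputs = tf.placeholder(tf.float32, [None, 768])
projected = tf.layers.dense(
    inputs, units=256, kernel_initializer=create_initializer(0.02))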
Example 4
    def __init__(self,
                 vocab_size,
                 filter_sizes,
                 num_channels,
                 is_training,
                 input_ids,
                 scope='text_cnn',
                 embedding_size=256,
                 dropout_prob=0.1,
                 trainable=True,
                 **kwargs):

        input_shape = util.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        max_seq_length = input_shape[1]

        if isinstance(filter_sizes, str):
            filter_sizes = [int(size) for size in filter_sizes.split(',')]
        assert isinstance(filter_sizes, list), (
            '`filter_sizes` should be a list of integers or a string '
            'separated with commas.')

        # Tilda embeddings for SMART algorithm
        tilda_embeddings = None
        use_tilda_embedding = kwargs.get('use_tilda_embedding')
        if use_tilda_embedding:
            with tf.variable_scope('', reuse=True):
                tilda_embeddings = tf.get_variable('tilda_embeddings')

        with tf.variable_scope(scope):
            with tf.variable_scope('embeddings'):

                if tilda_embeddings is not None:
                    embedding_table = tilda_embeddings
                else:
                    embedding_table = tf.get_variable(
                        name='word_embeddings',
                        shape=[vocab_size, embedding_size],
                        initializer=util.create_initializer(0.02),
                        dtype=tf.float32,
                        trainable=trainable)

                flat_input_ids = tf.reshape(input_ids, [-1])
                output = tf.gather(
                    embedding_table, flat_input_ids, name='embedding_look_up')
                output = tf.reshape(
                    output, [batch_size, max_seq_length, embedding_size])

                output_expanded = tf.expand_dims(output, -1)

            # Create a convolution + maxpool layer for each filter size
            pooled_outputs = []
            for i, filter_size in enumerate(filter_sizes):
                with tf.variable_scope('conv_%s' % filter_size):

                    # Convolution Layer
                    filter_shape = [filter_size, embedding_size, 1, num_channels]
                    W = tf.get_variable(
                        name='W',
                        shape=filter_shape,
                        initializer=tf.truncated_normal_initializer(stddev=0.1),
                        dtype=tf.float32,
                        trainable=trainable)
                    b = tf.get_variable(
                        name='b',
                        shape=[num_channels],
                        initializer=tf.constant_initializer(0.1),
                        dtype=tf.float32,
                        trainable=trainable)
                    conv = tf.nn.conv2d(
                        output_expanded, W,
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name='conv')

                    # Apply nonlinearity
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                    # Maxpooling over the outputs
                    pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, max_seq_length - int(filter_size) + 1, 1, 1],
                        strides=[1, 1, 1, 1],
                        padding='VALID',
                        name='pool')
                    pooled_outputs.append(pooled)

            num_channels_total = num_channels * len(filter_sizes)
            h_pool = tf.concat(pooled_outputs, 3)
            h_pool_flat = tf.reshape(h_pool, [batch_size, num_channels_total])

            with tf.name_scope('dropout'):
                self.pooled_output = util.dropout(h_pool_flat, dropout_prob)
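One convolution + max-pool branch in isolation, assuming toy sizes; the class above builds one such branch per filter size and concatenates the pooled outputs:

import tensorflow as tf

# Hypothetical sizes for a single branch.
batch_size, max_seq_length, embedding_size, num_channels = 4, 32, 256, 128
filter_size = 3

embedded = tf.placeholder(
    tf.float32, [batch_size, max_seq_length, embedding_size])
embedded_expanded = tf.expand_dims(embedded, -1)  # add a channel dimension

W = tf.get_variable(
    'W', [filter_size, embedding_size, 1, num_channels],
    initializer=tf.truncated_normal_initializer(stddev=0.1))
b = tf.get_variable(
    'b', [num_channels], initializer=tf.constant_initializer(0.1))

# Convolution over the full embedding width, then ReLU and max-pool
# over the remaining sequence positions.
conv = tf.nn.conv2d(
    embedded_expanded, W, strides=[1, 1, 1, 1], padding='VALID')
h = tf.nn.relu(tf.nn.bias_add(conv, b))
pooled = tf.nn.max_pool(
    h,
    ksize=[1, max_seq_length - filter_size + 1, 1, 1],
    strides=[1, 1, 1, 1],
    padding='VALID')  # shape: [batch, 1, 1, num_channels]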