def _cls_fcn(self, prev_output, label_size, hidden_size=768,
             initializer_range=0.02, dtype=tf.float32, trainable=True):
    with tf.variable_scope('output'):
        cls_output_weights = tf.get_variable(
            'output_weights', [hidden_size, label_size],
            initializer=tf.truncated_normal_initializer(
                stddev=initializer_range),
            dtype=dtype,
            trainable=trainable)
        cls_output_bias = tf.get_variable(
            'output_bias', [label_size],
            initializer=tf.zeros_initializer(),
            dtype=dtype,
            trainable=trainable)

        # classify from the first ([CLS]) position of the sequence output
        cls_logits = tf.matmul(prev_output[:, 0, :], cls_output_weights)
        cls_output = tf.nn.bias_add(cls_logits, cls_output_bias)
    return cls_output
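# A minimal sketch of how this classification head might be wired up.
# `sequence_output` (the final encoder layer, [batch_size, seq_length,
# hidden_size]) and `num_labels` are illustrative names, not defined in this
# file; the head itself reads only the first ([CLS]) position.
#
#   logits = self._cls_fcn(
#       prev_output=sequence_output,
#       label_size=num_labels)            # -> [batch_size, num_labels]
#   probs = tf.nn.softmax(logits, axis=-1)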
def __init__(self,
             vocab_size,
             is_training,
             input_ids,
             input_mask,
             segment_ids,
             sample_weight=None,
             reduced_size=64,
             topic_size=1024,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             bias=0,
             scope='vae',
             trainable=True,
             **kwargs):
    super().__init__()

    # freeze parameters
    config = Config(
        vocab_size,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):

            (self.embedding_output, self.embedding_table) = \
                self.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    tilda_embeddings=tilda_embeddings,
                    trainable=trainable)

            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                hidden_size=config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope('encoder'):

            # stacked transformer
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, seq_length)
            self.all_encoder_layers = self.transformer_model(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=util.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=\
                    config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                trainable=trainable)

            # projection
            with tf.variable_scope('projection'):
                transformer_output = tf.layers.dense(
                    self.all_encoder_layers[-1], reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                transformer_output = tf.reshape(
                    transformer_output, [batch_size, -1])

                # zero out positions beyond each sequence's true length and
                # upweight the remaining ones by seq_length / input_length
                input_length = tf.reduce_sum(input_mask, axis=-1)
                input_length = tf.cast(input_length, tf.float32)
                input_length_1d = tf.reshape(input_length, [batch_size])
                input_length_2d = tf.reshape(input_length, [batch_size, 1])

                broadcast_mask = tf.sequence_mask(
                    tf.multiply(input_length_1d, reduced_size),
                    seq_length * reduced_size,
                    dtype=tf.float32)
                broadcast_mask = tf.multiply(
                    broadcast_mask, seq_length / input_length_2d)
                transformer_output *= broadcast_mask

                # latent space
                miu = tf.layers.dense(
                    transformer_output, topic_size,
                    activation='tanh',
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    name='miu',
                    trainable=trainable)
                sigma = tf.layers.dense(
                    transformer_output, topic_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    name='sigma',
                    trainable=trainable)
                self.probs['miu'] = miu
                self.probs['sigma'] = sigma

        with tf.variable_scope('decoder'):
            with tf.variable_scope('projection'):

                # reparameterization
                if is_training:
                    noise = tf.random_normal([batch_size, topic_size])
                else:
                    noise = tf.random_uniform(
                        [batch_size, topic_size],
                        minval=-bias, maxval=bias)
                decoder_input = miu + tf.exp(sigma) * noise

                # projection
                decoder_input = tf.layers.dense(
                    decoder_input,
                    seq_length * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                intermediate_input = tf.reshape(
                    decoder_input, [-1, seq_length, reduced_size])
                intermediate_input = util.layer_norm(
                    intermediate_input, trainable=trainable)
                intermediate_input = util.dropout(
                    intermediate_input, config.hidden_dropout_prob)

            # MLP
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    intermediate_input,
                    4 * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)

            with tf.variable_scope('output'):
                decoder_output = tf.layers.dense(
                    intermediate_output,
                    config.hidden_size,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                decoder_output = util.layer_norm(
                    decoder_output, trainable=trainable)
                decoder_output = util.dropout(
                    decoder_output, config.hidden_dropout_prob)
            self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(
                    input_tensor, trainable=trainable)

            # tie the output projection to the input word embeddings
            output_weights = self.embedding_table
            output_bias = tf.get_variable(
                'output_bias',
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)
            flatten_input_tensor = tf.reshape(
                input_tensor, [-1, config.hidden_size])

            logits = tf.matmul(
                flatten_input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(
                logits, [batch_size, seq_length, config.vocab_size])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)
            self.preds['preds'] = tf.argmax(probs, axis=-1)

            one_hot_labels = tf.one_hot(
                input_ids, depth=config.vocab_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                lm_log_probs * one_hot_labels, axis=[-1])
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = (
                tf.reduce_mean(per_example_loss) +
                tf.reduce_mean(tf.square(miu)) +
                tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss
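# Summary of the latent step above (comments only, no new ops).
# The encoder produces `miu` and `sigma`, each [batch_size, topic_size].
# Reparameterization as implemented here:
#     z = miu + exp(sigma) * eps
# where eps ~ N(0, I) during training and eps ~ Uniform(-bias, bias) at
# inference, so bias=0 makes decoding deterministic with z = miu.
# The total loss combines the token-level reconstruction term with the two
# latent regularizers computed above:
#     L = mean(per_example_loss) + mean(miu^2) + mean(exp(sigma) - sigma - 1)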
def create_initializer(initializer_range=0.02):
    '''Creates a `truncated_normal_initializer` with the given range.'''
    return tf.truncated_normal_initializer(stddev=initializer_range)
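# Example usage, mirroring how the models above pass it to dense layers
# (`hidden_states` and `intermediate_size` are illustrative names only):
#
#   layer_output = tf.layers.dense(
#       hidden_states,
#       intermediate_size,
#       kernel_initializer=create_initializer(0.02))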
def __init__(self,
             vocab_size,
             filter_sizes,
             num_channels,
             is_training,
             input_ids,
             scope='text_cnn',
             embedding_size=256,
             dropout_prob=0.1,
             trainable=True,
             **kwargs):
    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    max_seq_length = input_shape[1]

    if isinstance(filter_sizes, str):
        filter_sizes = filter_sizes.split(',')
    assert isinstance(filter_sizes, list), (
        '`filter_sizes` should be a list of integers or a string '
        'separated with commas.')
    filter_sizes = [int(size) for size in filter_sizes]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):

            if tilda_embeddings is not None:
                embedding_table = tilda_embeddings
            else:
                embedding_table = tf.get_variable(
                    name='word_embeddings',
                    shape=[vocab_size, embedding_size],
                    initializer=util.create_initializer(0.02),
                    dtype=tf.float32,
                    trainable=trainable)

            flat_input_ids = tf.reshape(input_ids, [-1])
            output = tf.gather(
                embedding_table, flat_input_ids, name='embedding_look_up')
            output = tf.reshape(
                output, [batch_size, max_seq_length, embedding_size])

        output_expanded = tf.expand_dims(output, -1)

        # Create a convolution + maxpool layer for each filter size
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.variable_scope('conv_%s' % filter_size):

                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_channels]
                W = tf.get_variable(
                    name='W',
                    shape=filter_shape,
                    initializer=tf.truncated_normal_initializer(stddev=0.1),
                    dtype=tf.float32,
                    trainable=trainable)
                b = tf.get_variable(
                    name='b',
                    shape=[num_channels],
                    initializer=tf.constant_initializer(0.1),
                    dtype=tf.float32,
                    trainable=trainable)
                conv = tf.nn.conv2d(
                    output_expanded, W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='conv')

                # Apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                # Maxpooling over the outputs
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, max_seq_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name='pool')
                pooled_outputs.append(pooled)

        num_channels_total = num_channels * len(filter_sizes)
        h_pool = tf.concat(pooled_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [batch_size, num_channels_total])

        with tf.name_scope('dropout'):
            self.pooled_output = util.dropout(h_pool_flat, dropout_prob)
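# Tensor shapes through the pipeline above, for a single filter size f
# (all derived from the ops as written):
#   input_ids        [batch_size, max_seq_length]
#   output_expanded  [batch_size, max_seq_length, embedding_size, 1]
#   conv (VALID)     [batch_size, max_seq_length - f + 1, 1, num_channels]
#   pooled           [batch_size, 1, 1, num_channels]
#   h_pool_flat      [batch_size, num_channels * len(filter_sizes)]
# `self.pooled_output` is the dropout-regularized sentence representation
# that a downstream classification layer can consume.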