def _forward(self, is_training, split_placeholders, **kwargs):
    split_placeholders = _expand_features(self, split_placeholders)

    input_k = tf.transpose(split_placeholders['input_k'], [1, 0])
    input_q = tf.transpose(split_placeholders['input_q'], [1, 0])
    seg_id = tf.transpose(split_placeholders['seg_id'], [1, 0])
    perm_mask = tf.transpose(split_placeholders['perm_mask'], [1, 2, 0])
    target = split_placeholders['target']
    target_mask = split_placeholders['target_mask']

    target_mapping = None
    if 'target_mapping' in split_placeholders:
        target_mapping = tf.transpose(
            split_placeholders['target_mapping'], [1, 2, 0])

    model = XLNet(
        xlnet_config=self.xlnet_config,
        is_training=is_training,
        input_ids=input_k,
        seg_ids=seg_id,
        input_mask=None,
        mems=self._mems,
        perm_mask=perm_mask,
        target=target,
        target_mask=target_mask,
        target_mapping=target_mapping,
        inp_q=input_q,
        sample_weight=split_placeholders.get('sample_weight'),
        **kwargs)

    (total_loss, losses, probs, preds) = model.get_forward_outputs()
    return (total_loss, losses, probs, preds)
def _forward(self, is_training, split_placeholders, **kwargs):
    input_ids = tf.transpose(split_placeholders['input_ids'], [1, 0])
    input_mask = tf.transpose(split_placeholders['input_mask'], [1, 0])
    segment_ids = tf.transpose(split_placeholders['segment_ids'], [1, 0])

    encoder = XLNetEncoder(
        xlnet_config=self.xlnet_config,
        is_training=is_training,
        input_ids=input_ids,
        seg_ids=segment_ids,
        input_mask=input_mask,
        trainable=True,
        **kwargs)
    encoder_output = encoder.get_sequence_output()

    decoder = SeqCLSDecoder(
        is_training=is_training,
        input_tensor=encoder_output,
        input_mask=split_placeholders['input_mask'],
        label_ids=split_placeholders['label_ids'],
        label_size=self.label_size,
        sample_weight=split_placeholders.get('sample_weight'),
        scope='cls/sequence',
        name='cls',
        trainable=True,
        **kwargs)

    (total_loss, losses, probs, preds) = decoder.get_forward_outputs()
    return (total_loss, losses, probs, preds)
def create_projection_matrix(m, d, seed=0, scaling=0, struct_mode=False):
    r'''Constructs the matrix of random projections.

    Constructs a matrix of random orthogonal projections. Each projection
    vector has direction chosen uniformly at random and either deterministic
    length \sqrt{d} or length taken from the \chi(d) distribution (in the
    latter case marginal distributions of the projections are d-dimensional
    Gaussian vectors with associated identity covariance matrix).

    Args:
      m: number of random projections.
      d: dimensionality of each random projection.
      seed: random seed used to construct projections.
      scaling: 1 if all the random projections need to be renormalized to
        have length \sqrt{d}, 0 if the lengths of random projections should
        follow the \chi(d) distribution.
      struct_mode: if True then products of Givens rotations will be used to
        construct the random orthogonal matrix. This bypasses Gram-Schmidt
        orthogonalization.

    Returns:
      The matrix of random projections of the shape [m, d].
    '''
    nb_full_blocks = int(m / d)
    block_list = []
    current_seed = seed
    for _ in range(nb_full_blocks):
        if struct_mode:
            q = create_products_of_givens_rotations(d, seed)
        else:
            unstructured_block = tf.random_normal((d, d), seed=current_seed)
            q, _ = tf.linalg.qr(unstructured_block)
            q = tf.transpose(q)
        block_list.append(q)
        current_seed += 1
    remaining_rows = m - nb_full_blocks * d
    if remaining_rows > 0:
        if struct_mode:
            q = create_products_of_givens_rotations(d, seed)
        else:
            unstructured_block = tf.random_normal((d, d), seed=current_seed)
            q, _ = tf.linalg.qr(unstructured_block)
            q = tf.transpose(q)
        block_list.append(q[0:remaining_rows])
    final_matrix = tf.concat(block_list, axis=0)
    current_seed += 1

    if scaling == 0:
        multiplier = tf.norm(
            tf.random_normal((m, d), seed=current_seed), axis=1)
    elif scaling == 1:
        multiplier = 1 / tf.math.rsqrt(float(d)) * tf.ones((m))
    else:
        raise ValueError('Scaling must be one of {0, 1}. Was %s' % scaling)

    return tf.matmul(tf.linalg.diag(multiplier), final_matrix)
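# Illustrative usage sketch (not part of the original source; assumes
# `import tensorflow as tf` and the TF1-style graph runtime used throughout
# this file). Draws a 256 x 64 random-feature matrix whose 64-dim rows are
# orthogonal within each block and whose norms follow chi(d) (scaling=0):
#
#     >>> proj = create_projection_matrix(m=256, d=64, seed=1, scaling=0)
#     >>> proj.shape.as_list()
#     [256, 64]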
def __init__(self,
             is_training,
             input_tensor,
             label_ids,
             sample_weight=None,
             scope='mrc',
             name='',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[2],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, 2])
        logits = tf.transpose(logits, [0, 2, 1])
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        self.probs[name] = probs

        start_one_hot_labels = tf.one_hot(
            label_ids[:, 0], depth=seq_length, dtype=tf.float32)
        end_one_hot_labels = tf.one_hot(
            label_ids[:, 1], depth=seq_length, dtype=tf.float32)
        start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
        end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
        per_example_loss = (
            -0.5 * tf.reduce_sum(
                start_one_hot_labels * start_log_probs, axis=-1) -
            0.5 * tf.reduce_sum(
                end_one_hot_labels * end_log_probs, axis=-1))
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses[name] = per_example_loss

        start_preds = tf.expand_dims(
            tf.argmax(logits[:, 0, :], axis=-1), axis=-1)
        end_preds = tf.expand_dims(
            tf.argmax(logits[:, 1, :], axis=-1), axis=-1)
        self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         max_seq_length, width):
    output_tensor = tf.reshape(
        input_tensor,
        [batch_size, max_seq_length, num_attention_heads, width])
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor
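# Shape sketch (assumed toy sizes, not from the original source): the packed
# projection [batch_size, max_seq_length, num_heads * width] is reshaped and
# transposed into the per-head layout expected by the batched tf.matmul:
#
#     >>> x = tf.zeros([8, 128, 12 * 64])                  # [B, L, N*H]
#     >>> transpose_for_scores(x, 8, 12, 128, 64).shape.as_list()
#     [8, 12, 128, 64]                                     # [B, N, L, H]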
def scaled_dot_product_attention(Q, K, V, key_masks,
                                 causality=False,
                                 dropout_rate=0.,
                                 training=True,
                                 scope='scaled_dot_product_attention'):
    '''See 3.2.1.

    Q: Packed queries. 3d tensor. [N, T_q, d_k].
    K: Packed keys. 3d tensor. [N, T_k, d_k].
    V: Packed values. 3d tensor. [N, T_k, d_v].
    key_masks: A 2d tensor with shape of [N, key_seqlen].
    causality: If True, applies masking for future blinding.
    dropout_rate: A floating point number in [0, 1].
    training: boolean for controlling dropout.
    scope: Optional scope for `variable_scope`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        d_k = Q.get_shape().as_list()[-1]

        # dot product
        outputs = tf.matmul(Q, tf.transpose(K, [0, 2, 1]))  # (N, T_q, T_k)

        # scale
        outputs /= d_k ** 0.5

        # key masking
        outputs = mask(outputs, key_masks=key_masks, type='key')

        # causality or future blinding masking
        if causality:
            outputs = mask(outputs, type='future')

        # softmax
        outputs = tf.nn.softmax(outputs)
        attention = tf.transpose(outputs, [0, 2, 1])
        tf.summary.image('attention', tf.expand_dims(attention[:1], -1))

        # # query masking
        # outputs = mask(outputs, Q, K, type='query')

        # dropout
        outputs = tf.layers.dropout(
            outputs, rate=dropout_rate, training=training)

        # weighted sum (context vectors)
        outputs = tf.matmul(outputs, V)  # (N, T_q, d_v)

    return outputs
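# Shape sketch (assumed toy sizes; relies on the `mask` helper defined
# elsewhere in this module preserving the score shape): the output keeps the
# query length T_q and the value depth d_v.
#
#     >>> q = tf.zeros([4, 10, 64])    # [N, T_q, d_k]
#     >>> k = tf.zeros([4, 20, 64])    # [N, T_k, d_k]
#     >>> v = tf.zeros([4, 20, 32])    # [N, T_k, d_v]
#     >>> key_masks = tf.ones([4, 20])
#     >>> scaled_dot_product_attention(
#     ...     q, k, v, key_masks, training=False).shape.as_list()
#     [4, 10, 32]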
def get_sequence_output(self):
    '''
    Returns:
      float32 Tensor in shape [bsz, len, d_model]: the last-layer hidden
      representation of XLNet, transposed from its internal
      [len, bsz, d_model] layout to batch-major order.
    '''
    seq_length, _, hidden_size = self.output.shape.as_list()
    return tf.transpose(self.output, [1, 0, 2])
def favor_attention(query, key, value, kernel_transformation, causal,
                    projection_matrix=None):
    '''Computes FAVOR normalized attention.

    Args:
      query: query tensor.
      key: key tensor.
      value: value tensor.
      kernel_transformation: transformation used to get finite kernel
        features.
      causal: whether attention is causal or not.
      projection_matrix: projection matrix to be used.

    Returns:
      FAVOR normalized attention.
    '''
    query_prime = kernel_transformation(
        query, True, projection_matrix)  # [B,L,H,M]
    key_prime = kernel_transformation(
        key, False, projection_matrix)  # [B,L,H,M]
    query_prime = tf.transpose(query_prime, [1, 0, 2, 3])  # [L,B,H,M]
    key_prime = tf.transpose(key_prime, [1, 0, 2, 3])  # [L,B,H,M]
    value = tf.transpose(value, [1, 0, 2, 3])  # [L,B,H,D]

    if causal:
        av_attention = causal_numerator(query_prime, key_prime, value)
        attention_normalizer = causal_denominator(query_prime, key_prime)
    else:
        av_attention = noncausal_numerator(query_prime, key_prime, value)
        attention_normalizer = noncausal_denominator(query_prime, key_prime)

    av_attention = tf.transpose(av_attention, [1, 0, 2, 3])
    attention_normalizer = tf.transpose(attention_normalizer, [1, 0, 2])
    attention_normalizer = tf.expand_dims(attention_normalizer,
                                          len(attention_normalizer.shape))
    return av_attention / attention_normalizer
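# Sketch of the non-causal branch (an illustration of the linear-attention
# identity, not the original helper implementations): with kernel features
# query_prime, key_prime of shape [L,B,H,M] and value of shape [L,B,H,D],
# both the numerator and the normalizer can be computed with einsums that
# are linear in the sequence length L, never materializing the L x L
# attention matrix.
#
#     >>> kv = tf.einsum('lbhm,lbhd->bhmd', key_prime, value)          # [B,H,M,D]
#     >>> numerator = tf.einsum('lbhm,bhmd->lbhd', query_prime, kv)    # [L,B,H,D]
#     >>> k_sum = tf.reduce_sum(key_prime, axis=0)                     # [B,H,M]
#     >>> normalizer = tf.einsum('lbhm,bhm->lbh', query_prime, k_sum)  # [L,B,H]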
def merge_heads(x):
    # Reverse of split_heads.
    return merge_states(tf.transpose(x, [0, 2, 1, 3]))
def split_heads(x):
    # From [batch, sequence, features] to
    # [batch, heads, sequence, features].
    return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3])
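# Round-trip sketch (assumed shapes for illustration; relies on the
# split_states / merge_states helpers defined elsewhere in this module and
# on hparams.n_head, here taken to be 12): merge_heads inverts split_heads.
#
#     >>> x = tf.zeros([8, 128, 768])          # [batch, sequence, features]
#     >>> split_heads(x).shape.as_list()
#     [8, 12, 128, 64]                         # [batch, heads, sequence, features]
#     >>> merge_heads(split_heads(x)).shape.as_list()
#     [8, 128, 768]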
def __init__(self,
             bert_config,
             is_training,
             input_tensor,
             sa_mask,
             label_ids,
             sample_weight=None,
             scope='sanet',
             alpha=0.5,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    shape = util.get_shape_list(input_tensor)
    batch_size = shape[0]
    seq_length = shape[1]
    hidden_size = shape[2]
    sa_mask = tf.reshape(sa_mask, [batch_size, seq_length, seq_length])

    with tf.variable_scope(scope):
        with tf.variable_scope('sentence_attention'):
            (sa_output, _) = self.attention_layer(
                from_tensor=input_tensor,
                to_tensor=input_tensor,
                attention_mask=sa_mask,
                num_attention_heads=bert_config.num_attention_heads,
                size_per_head=\
                    hidden_size // bert_config.num_attention_heads,
                attention_probs_dropout_prob=\
                    bert_config.hidden_dropout_prob,
                initializer_range=bert_config.initializer_range,
                do_return_2d_tensor=False,
                batch_size=batch_size,
                from_max_seq_length=seq_length,
                to_max_seq_length=seq_length,
                trainable=trainable)

        with tf.variable_scope('cls/mrc'):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[2],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = alpha * sa_output + (1 - alpha) * input_tensor
            output_layer = tf.reshape(output_layer, [-1, hidden_size])
            logits = tf.matmul(
                output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [-1, seq_length, 2])
            logits = tf.transpose(logits, [0, 2, 1])
            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            self.probs['probs'] = probs
            self.preds['preds'] = tf.argmax(logits, axis=-1)

            start_one_hot_labels = tf.one_hot(
                label_ids[:, 0], depth=seq_length, dtype=tf.float32)
            end_one_hot_labels = tf.one_hot(
                label_ids[:, 1], depth=seq_length, dtype=tf.float32)
            start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
            end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
            per_example_loss = (
                -0.5 * tf.reduce_sum(
                    start_one_hot_labels * start_log_probs, axis=-1) -
                0.5 * tf.reduce_sum(
                    end_one_hot_labels * end_log_probs, axis=-1))
            if sample_weight is not None:
                per_example_loss *= sample_weight

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses['losses'] = per_example_loss
def attention_layer(self,
                    from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=12,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_max_seq_length=None,
                    to_max_seq_length=None,
                    dtype=tf.float32,
                    trainable=True):

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             max_seq_length, width):
        output_tensor = tf.reshape(
            input_tensor,
            [batch_size, max_seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = from_tensor sequence length
    #   T = to_tensor sequence length
    #   N = num_attention_heads
    #   H = size_per_head

    from_tensor_2d = util.reshape_to_matrix(from_tensor)
    to_tensor_2d = util.reshape_to_matrix(to_tensor)

    # query_layer = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name='query',
        kernel_initializer=util.create_initializer(initializer_range),
        trainable=trainable)

    # key_layer = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name='key',
        kernel_initializer=util.create_initializer(initializer_range),
        trainable=trainable)

    # value_layer = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name='value',
        kernel_initializer=util.create_initializer(initializer_range),
        trainable=trainable)

    # query_layer = [B, N, F, H]
    query_layer = transpose_for_scores(
        query_layer, batch_size, num_attention_heads, from_max_seq_length,
        size_per_head)

    # key_layer = [B, N, T, H]
    key_layer = transpose_for_scores(
        key_layer, batch_size, num_attention_heads, to_max_seq_length,
        size_per_head)

    # Take the dot product between 'query' and 'key' to get the raw
    # attention scores.
    # attention_scores = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(
        attention_scores, 1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # attention_mask = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        adder = (1.0 - tf.cast(attention_mask, dtype)) * -10000.0
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # attention_probs = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores, axis=-1)

    # This is actually dropping out entire tokens to attend to,
    # which might seem a bit unusual, but is taken from the original
    # Transformer paper.
    attention_probs = util.dropout(
        attention_probs, attention_probs_dropout_prob)

    # value_layer = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_max_seq_length, num_attention_heads, size_per_head])

    # value_layer = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # context_layer = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # context_layer = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # context_layer = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_max_seq_length,
             num_attention_heads * size_per_head])
    else:
        # context_layer = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_max_seq_length,
             num_attention_heads * size_per_head])

    return (context_layer, attention_scores)
def __init__(self,
             bert_config,
             is_training,
             sketchy_encoder,
             intensive_encoder,
             query_mask,
             label_ids,
             has_answer,
             sample_weight=None,
             scope='retro_reader',
             matching_mechanism='cross-attention',
             beta_1=0.5,
             beta_2=0.5,
             threshold=1.0,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    # verifier
    with tf.variable_scope(scope):

        # sketchy reading module
        with tf.variable_scope('sketchy/prediction'):
            sketchy_output = sketchy_encoder.get_pooled_output()
            hidden_size = sketchy_output.shape.as_list()[-1]

            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(
                    bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[2],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                sketchy_output, bert_config.hidden_dropout_prob \
                if is_training else 0.0)
            logits = tf.matmul(
                output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(
                has_answer, depth=2, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(
                    sample_weight, dtype=tf.float32) * per_example_loss

            self.losses['sketchy_losses'] = per_example_loss
            sketchy_loss = tf.reduce_mean(per_example_loss)

            score_ext = logits[:, 1] - logits[:, 0]

        # intensive reading module
        with tf.variable_scope('intensive'):
            H = intensive_encoder.get_sequence_output()
            H_Q = H * tf.cast(
                tf.expand_dims(query_mask, axis=-1), tf.float32)
            (batch_size, max_seq_length, hidden_size) = \
                util.get_shape_list(H)

            # cross-attention
            if matching_mechanism == 'cross-attention':
                with tf.variable_scope('cross_attention'):
                    attention_mask = \
                        self.create_attention_mask_from_input_mask(
                            query_mask, batch_size, max_seq_length)
                    (H_prime, _) = self.attention_layer(
                        from_tensor=H,
                        to_tensor=H_Q,
                        attention_mask=attention_mask,
                        num_attention_heads=\
                            bert_config.num_attention_heads,
                        size_per_head=\
                            hidden_size // bert_config.num_attention_heads,
                        attention_probs_dropout_prob=\
                            bert_config.hidden_dropout_prob,
                        initializer_range=bert_config.initializer_range,
                        do_return_2d_tensor=False,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        trainable=trainable)

            # matching-attention
            elif matching_mechanism == 'matching-attention':
                with tf.variable_scope('matching_attention'):
                    output_weights = tf.get_variable(
                        'output_weights',
                        shape=[hidden_size, hidden_size],
                        initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=trainable)
                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[hidden_size],
                        initializer=tf.zeros_initializer(),
                        trainable=trainable)
                    trans = tf.matmul(
                        H_Q,
                        tf.tile(
                            tf.expand_dims(output_weights, axis=0),
                            [batch_size, 1, 1]),
                        transpose_b=True)
                    trans = tf.nn.bias_add(trans, output_bias)
                    M = tf.nn.softmax(
                        tf.matmul(H, trans, transpose_b=True), axis=-1)
                    H_prime = tf.matmul(M, H_Q)

            with tf.variable_scope('prediction'):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                output_layer = util.dropout(
                    H_prime, bert_config.hidden_dropout_prob \
                    if is_training else 0.0)
                output_layer = tf.reshape(
                    output_layer,
                    [batch_size * max_seq_length, hidden_size])
                logits = tf.matmul(
                    output_layer, output_weights, transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)
                logits = tf.reshape(
                    logits, [batch_size, max_seq_length, 2])
                logits = tf.transpose(logits, [0, 2, 1])
                probs = tf.nn.softmax(logits, axis=-1, name='probs')
                self.probs['mrc_probs'] = probs
                self.preds['mrc_preds'] = tf.argmax(logits, axis=-1)

                start_one_hot_labels = tf.one_hot(
                    label_ids[:, 0], depth=max_seq_length,
                    dtype=tf.float32)
                end_one_hot_labels = tf.one_hot(
                    label_ids[:, 1], depth=max_seq_length,
                    dtype=tf.float32)
                start_log_probs = tf.nn.log_softmax(
                    logits[:, 0, :], axis=-1)
                end_log_probs = tf.nn.log_softmax(
                    logits[:, 1, :], axis=-1)
                per_example_loss = (
                    -0.5 * tf.reduce_sum(
                        start_one_hot_labels * start_log_probs, axis=-1) -
                    0.5 * tf.reduce_sum(
                        end_one_hot_labels * end_log_probs, axis=-1))
                if sample_weight is not None:
                    per_example_loss *= sample_weight

                intensive_loss = tf.reduce_mean(per_example_loss)
                self.losses['intensive_losses'] = per_example_loss

                score_has = tf.norm(
                    probs[:, 0, 1:] + probs[:, 1, 1:], np.inf, axis=-1)
                score_null = probs[:, 0, 0] + probs[:, 1, 0]
                score_diff = score_has - score_null

        # rear verification
        v = beta_1 * score_diff + beta_2 * score_ext
        self.preds['verifier_preds'] = \
            tf.cast(tf.greater(v, threshold), tf.int32)
        self.probs['verifier_probs'] = v

        self.total_loss = sketchy_loss + intensive_loss
def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
    '''
    Sample a permutation of the factorization order, and create an
    attention mask accordingly.

    Args:
      inputs: int64 Tensor in shape [seq_len], input ids.
      targets: int64 Tensor in shape [seq_len], target ids.
      is_masked: bool Tensor in shape [seq_len]. True means being selected
        for partial prediction.
      perm_size: the length of the longest permutation. Could be set to be
        reuse_len. Should not be larger than reuse_len or there will be
        data leaks.
      seq_len: int, sequence length.
    '''
    batch_size = tf.shape(inputs)[0]

    # Generate permutation indices
    index = tf.range(seq_len, dtype=tf.int64)
    index = tf.reshape(index, [-1, perm_size])
    index = tf.transpose(index)
    index = tf.random_shuffle(index)
    index = tf.transpose(index)
    index = tf.reshape(index, [1, -1])
    index = tf.tile(index, [batch_size, 1])

    # `perm_mask` and `target_mask`
    # non-functional tokens
    non_func_tokens = tf.logical_not(
        tf.logical_or(tf.equal(inputs, SEP_ID), tf.equal(inputs, CLS_ID)))
    non_mask_tokens = tf.logical_and(
        tf.logical_not(is_masked), non_func_tokens)
    masked_or_func_tokens = tf.logical_not(non_mask_tokens)

    # Set the permutation indices of non-masked (& non-functional) tokens
    # to the smallest index (-1):
    # (1) they can be seen by all other positions
    # (2) they cannot see masked positions, so there won't be information
    #     leak
    smallest_index = -tf.ones([batch_size, seq_len], dtype=tf.int64)
    rev_index = tf.where(non_mask_tokens, smallest_index, index)

    # Create `target_mask`: non-functional and masked tokens
    # 1: use mask as input and have loss
    # 0: use token (or [SEP], [CLS]) as input and do not have loss
    target_tokens = tf.logical_and(masked_or_func_tokens, non_func_tokens)
    target_mask = tf.cast(target_tokens, tf.float32)

    # Create `perm_mask`
    # `target_tokens` cannot see themselves
    self_rev_index = tf.where(target_tokens, rev_index, rev_index + 1)

    # 1: cannot attend if i <= j and j is not non-masked
    #    (masked_or_func_tokens)
    # 0: can attend if i > j or j is non-masked
    perm_mask = tf.logical_and(
        self_rev_index[:, :, None] <= rev_index[:, None, :],
        tf.expand_dims(masked_or_func_tokens, axis=-1))

    # new target: [next token] for LM and [curr token] (self) for PLM
    new_targets = tf.concat([inputs[:, 0:1], targets[:, :-1]], axis=1)

    # construct inputs_k
    inputs_k = inputs

    # construct inputs_q
    inputs_q = target_mask

    return perm_mask, new_targets, target_mask, inputs_k, inputs_q
def __init__(self,
             is_training,
             input_tensor,
             n_wide_features,
             wide_features,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    feature_size = wide_features.shape.as_list()[-1]
    with tf.variable_scope('wide'):
        feature_embeddings = tf.get_variable(
            name='feature_embeddings',
            shape=[feature_size + 1, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        wide_output = tf.gather(
            feature_embeddings, wide_features)  # [B, N, H]

    with tf.variable_scope('wide_and_deep'):
        deep_output = tf.expand_dims(input_tensor, -1)  # [B, H, 1]
        attention_scores = tf.matmul(
            wide_output, deep_output)  # [B, N, 1]
        attention_scores = tf.transpose(
            attention_scores, [0, 2, 1])  # [B, 1, N]
        attention_scores = tf.multiply(
            attention_scores, 1.0 / math.sqrt(hidden_size))
        feature_mask = tf.cast(
            tf.sequence_mask(n_wide_features, feature_size),
            tf.float32)  # [B, N]
        feature_mask = tf.expand_dims(feature_mask, 1)  # [B, 1, N]
        attention_scores += (1.0 - feature_mask) * -10000.0
        attention_matrix = tf.nn.softmax(attention_scores, axis=-1)
        attention_output = tf.matmul(
            attention_matrix, wide_output)  # [B, 1, H]
        attention_output = attention_output[:, 0, :]  # [B, H]
        # attention_output = util.dropout(
        #     attention_output, hidden_dropout_prob)
        input_tensor = util.layer_norm(
            attention_output + input_tensor, trainable=trainable)

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(
                sample_weight, dtype=tf.float32) * per_example_loss
        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(thresh, float), (
                '`tsa_thresh` must be a float between 0 and 1.')
            uncertainty = tf.reduce_sum(
                self.probs['probs'] * tf.log(self.probs['probs']),
                axis=-1)
            uncertainty /= tf.log(1 / label_size)
            per_example_loss = tf.cast(
                tf.greater(uncertainty, thresh), dtype=tf.float32) * \
                per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    '''Performs multi-headed attention from `from_tensor` to `to_tensor`.

    This is an implementation of multi-headed attention based on 'Attention
    is all you Need'. If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.

    This function first projects `from_tensor` into a 'query' tensor and
    `to_tensor` into 'key' and 'value' tensors. These are (effectively) a
    list of tensors of length `num_attention_heads`, where each tensor is of
    shape [batch_size, seq_length, size_per_head].

    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor and returned.

    In practice, the multi-headed attention is done with transposes and
    reshapes rather than actual separate tensors.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any
        positions in the mask that are 0, and will be unchanged for
        positions that are 1.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      do_return_2d_tensor: bool. If True, the output will be of shape
        [batch_size * from_seq_length, num_attention_heads * size_per_head].
        If False, the output will be of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head].
      batch_size: (Optional) int. If the input is 2D, this might be the
        batch size of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `to_tensor`.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape [batch_size * from_seq_length,
        num_attention_heads * size_per_head]).

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    '''

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = util.get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = util.get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            'The rank of `from_tensor` must match the rank of `to_tensor`.')

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or
                to_seq_length is None):
            raise ValueError(
                'When passing in rank 2 tensors to attention_layer, the '
                'values for `batch_size`, `from_seq_length`, and '
                '`to_seq_length` must all be specified.')

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    from_tensor_2d = util.reshape_to_matrix(from_tensor)
    to_tensor_2d = util.reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name='query',
        kernel_initializer=util.create_initializer(initializer_range))

    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name='key',
        kernel_initializer=util.create_initializer(initializer_range))

    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name='value',
        kernel_initializer=util.create_initializer(initializer_range))

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(
        query_layer, batch_size, num_attention_heads, from_seq_length,
        size_per_head)

    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(
        key_layer, batch_size, num_attention_heads, to_seq_length,
        size_per_head)

    # Take the dot product between 'query' and 'key' to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(
        attention_scores, 1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and
        # 0.0 for masked positions, this operation will create a tensor
        # which is 0.0 for positions we want to attend and -10000.0 for
        # masked positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this
        # is effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = util.dropout(
        attention_probs, attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length,
             num_attention_heads * size_per_head])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length,
             num_attention_heads * size_per_head])

    return context_layer, attention_probs
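# Illustrative call (assumed dimensions, not from the original source; run
# inside a fresh variable scope so the 'query'/'key'/'value' layers get
# unique names): self-attention over a [2, 16, 768] batch with 12 heads of
# size 64.
#
#     >>> x = tf.zeros([2, 16, 768])
#     >>> context, probs = attention_layer(
#     ...     from_tensor=x, to_tensor=x,
#     ...     num_attention_heads=12, size_per_head=64)
#     >>> context.shape.as_list(), probs.shape.as_list()
#     ([2, 16, 768], [2, 12, 16, 16])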
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    use_einsum=True):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length]. The values should be 1 or 0. The attention scores will
        effectively be set to -infinity for any positions in the mask that
        are 0, and will be unchanged for positions that are 1.
      num_attention_heads: int. Number of attention heads.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      batch_size: (Optional) int. If the input is 2D, this might be the
        batch size of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `to_tensor`.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads, size_per_head].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    """
    from_shape = util.get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = util.get_shape_list(to_tensor, expected_rank=[2, 3])
    size_per_head = int(from_shape[2] / num_attention_heads)

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or
                to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the "
                "values for `batch_size`, `from_seq_length`, and "
                "`to_seq_length` must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                       util.create_initializer(initializer_range), query_act,
                       use_einsum, "query")

    # `key_layer` = [B, T, N, H]
    k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       util.create_initializer(initializer_range), key_act,
                       use_einsum, "key")

    # `value_layer` = [B, T, N, H]
    v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       util.create_initializer(initializer_range), value_act,
                       use_einsum, "value")

    q = tf.transpose(q, [0, 2, 1, 3])
    k = tf.transpose(k, [0, 2, 1, 3])
    v = tf.transpose(v, [0, 2, 1, 3])

    if attention_mask is not None:
        attention_mask = tf.reshape(
            attention_mask, [batch_size, 1, to_seq_length, 1])

    # 'new_embeddings = [B, N, F, H]'
    new_embeddings = dot_product_attention(q, k, v, attention_mask,
                                           attention_probs_dropout_prob)

    return tf.transpose(new_embeddings, [0, 2, 1, 3])