import collections

import tensorflow as tf


def mask(inputs, key_masks=None, type=None):
    '''Masks paddings on keys or queries to inputs.

    inputs: 3d tensor. (h*N, T_q, T_k)
    key_masks: 2d tensor. (N, T_k)
    type: string. 'key' | 'future'

    e.g.,
    >> inputs = tf.zeros([4, 2, 3], dtype=tf.float32)
    >> key_masks = tf.constant([[0., 0., 1.],
                                [0., 1., 1.]])
    >> mask(inputs, key_masks=key_masks, type='key')
    array([[[ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09],
            [ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09]],

           [[ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09],
            [ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09]],

           [[ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09],
            [ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09]],

           [[ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09],
            [ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09]]], dtype=float32)
    '''
    padding_num = -2 ** 32 + 1
    if type in ('k', 'key', 'keys'):
        # Broadcast the key mask over the attention heads, then push padded
        # key positions to a large negative value so softmax gives them
        # (close to) zero weight.
        key_masks = tf.to_float(key_masks)
        key_masks = tf.tile(
            key_masks,
            [tf.shape(inputs)[0] // tf.shape(key_masks)[0], 1])  # (h*N, seqlen)
        key_masks = tf.expand_dims(key_masks, 1)  # (h*N, 1, seqlen)
        outputs = inputs + key_masks * padding_num
    # elif type in ('q', 'query', 'queries'):
    #     # Generate masks
    #     masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
    #     masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
    #     masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)
    #
    #     # Apply masks to inputs
    #     outputs = inputs * masks
    elif type in ('f', 'future', 'right'):
        # Lower-triangular mask: query position i may only attend to key
        # positions <= i.
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(
            diag_vals).to_dense()  # (T_q, T_k)
        future_masks = tf.tile(
            tf.expand_dims(tril, 0),
            [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)
        paddings = tf.ones_like(future_masks) * padding_num
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
    else:
        raise ValueError('Check if you entered type correctly!')

    return outputs
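
# A minimal usage sketch (not part of the original module): it runs the
# 'future' branch of mask() on a toy 3x3 score tensor to show how entries
# above the diagonal are pushed to a large negative value before softmax.
# Assumes a TF 1.x runtime (consistent with tf.to_float above); the helper
# name `_demo_future_mask` is illustrative only.
def _demo_future_mask():
    scores = tf.reshape(
        tf.range(9, dtype=tf.float32), [1, 3, 3])  # (N=1, T_q=3, T_k=3)
    causal = mask(scores, type='future')
    with tf.Session() as sess:
        # Entries above the diagonal become ~ -4.29e+09, so a softmax over
        # the last axis assigns zero weight to future positions.
        print(sess.run(causal))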
def __init__(self,
             is_training,
             input_tensor,
             is_supervised,
             is_expanded,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             global_step=None,
             num_train_steps=None,
             uda_softmax_temp=-1,
             uda_confidence_thresh=-1,
             tsa_schedule='linear',
             **kwargs):
    super().__init__(**kwargs)

    is_supervised = tf.cast(is_supervised, tf.float32)
    is_expanded = tf.cast(is_expanded, tf.float32)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        with tf.variable_scope('sup_loss'):

            # reshape
            sup_ori_log_probs = tf.boolean_mask(
                log_probs, mask=(1.0 - is_expanded), axis=0)
            sup_log_probs = tf.boolean_mask(
                sup_ori_log_probs, mask=is_supervised, axis=0)
            sup_label_ids = tf.boolean_mask(
                label_ids, mask=is_supervised, axis=0)

            self.preds['preds'] = tf.argmax(sup_ori_log_probs, axis=-1)

            one_hot_labels = tf.one_hot(
                sup_label_ids, depth=label_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                one_hot_labels * sup_log_probs, axis=-1)
            loss_mask = tf.ones_like(per_example_loss, dtype=tf.float32)
            correct_label_probs = tf.reduce_sum(
                one_hot_labels * tf.exp(sup_log_probs), axis=-1)

            # Training Signal Annealing: drop supervised examples the model
            # already predicts above the scheduled probability threshold.
            if is_training and tsa_schedule:
                tsa_start = 1.0 / label_size
                tsa_threshold = get_tsa_threshold(
                    tsa_schedule, global_step, num_train_steps,
                    tsa_start, end=1)

                larger_than_threshold = tf.greater(
                    correct_label_probs, tsa_threshold)
                loss_mask = loss_mask * (
                    1 - tf.cast(larger_than_threshold, tf.float32))

            loss_mask = tf.stop_gradient(loss_mask)
            per_example_loss = per_example_loss * loss_mask
            if sample_weight is not None:
                sup_sample_weight = tf.boolean_mask(
                    sample_weight, mask=is_supervised, axis=0)
                per_example_loss *= tf.cast(
                    sup_sample_weight, dtype=tf.float32)
            sup_loss = (tf.reduce_sum(per_example_loss) /
                        tf.maximum(tf.reduce_sum(loss_mask), 1))
            self.losses['supervised'] = per_example_loss

        with tf.variable_scope('unsup_loss'):

            # reshape
            ori_log_probs = tf.boolean_mask(
                sup_ori_log_probs, mask=(1.0 - is_supervised), axis=0)
            aug_log_probs = tf.boolean_mask(
                log_probs, mask=is_expanded, axis=0)
            sup_ori_logits = tf.boolean_mask(
                logits, mask=(1.0 - is_expanded), axis=0)
            ori_logits = tf.boolean_mask(
                sup_ori_logits, mask=(1.0 - is_supervised), axis=0)

            # Optionally sharpen the target distribution with a softmax
            # temperature before stopping gradients through it.
            unsup_loss_mask = 1
            if uda_softmax_temp != -1:
                tgt_ori_log_probs = tf.nn.log_softmax(
                    ori_logits / uda_softmax_temp, axis=-1)
                tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
            else:
                tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

            # Optionally mask out unlabeled examples whose highest predicted
            # probability falls below the confidence threshold.
            if uda_confidence_thresh != -1:
                largest_prob = tf.reduce_max(tf.exp(ori_log_probs), axis=-1)
                unsup_loss_mask = tf.cast(
                    tf.greater(largest_prob, uda_confidence_thresh),
                    tf.float32)
                unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

            per_example_loss = kl_for_log_probs(
                tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
            if sample_weight is not None:
                unsup_sample_weight = tf.boolean_mask(
                    sample_weight, mask=(1.0 - is_supervised), axis=0)
                per_example_loss *= tf.cast(
                    unsup_sample_weight, dtype=tf.float32)
            unsup_loss = tf.reduce_mean(per_example_loss)
            self.losses['unsupervised'] = per_example_loss

        self.total_loss = sup_loss + unsup_loss
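
# The helpers referenced above, `get_tsa_threshold` and `kl_for_log_probs`,
# are defined elsewhere in the repo and not shown in this excerpt. The
# sketches below are hedged reconstructions following the UDA paper
# (Xie et al., 2020); the schedule names ('linear', 'exp', 'log') and the
# scale constant are assumptions, not the repo's exact definitions.
def get_tsa_threshold(schedule, global_step, num_train_steps, start, end):
    # Training Signal Annealing: the probability threshold grows from
    # `start` to `end` as training progresses.
    progress = (tf.cast(global_step, tf.float32) /
                tf.cast(num_train_steps, tf.float32))
    if schedule == 'linear':
        coeff = progress
    elif schedule == 'exp':
        coeff = tf.exp((progress - 1) * 5.0)
    elif schedule == 'log':
        coeff = 1 - tf.exp(-progress * 5.0)
    else:
        raise ValueError('Unknown tsa_schedule: %s' % schedule)
    return coeff * (end - start) + start


def kl_for_log_probs(log_p, log_q):
    # KL(p || q) from log-probabilities: sum_i p_i * (log p_i - log q_i).
    p = tf.exp(log_p)
    neg_ent = tf.reduce_sum(p * log_p, axis=-1)
    neg_cross_ent = tf.reduce_sum(p * log_q, axis=-1)
    return neg_ent - neg_cross_ent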
def dynamic_transformer_model(self,
                              is_training,
                              input_tensor,
                              input_mask,
                              batch_size,
                              max_seq_length,
                              label_size,
                              attention_mask=None,
                              hidden_size=768,
                              num_hidden_layers=12,
                              num_attention_heads=12,
                              intermediate_size=3072,
                              intermediate_act_fn=util.gelu,
                              hidden_dropout_prob=0.1,
                              attention_probs_dropout_prob=0.1,
                              initializer_range=0.02,
                              dtype=tf.float32,
                              cls_model='self-attention',
                              cls_hidden_size=128,
                              cls_num_attention_heads=2,
                              speed=0.1,
                              ignore_cls=None):
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size (%d) is not a multiple of the number of '
            'attention heads (%d)' % (hidden_size, num_attention_heads))
    attention_head_size = int(hidden_size / num_attention_heads)

    if ignore_cls is None:
        ignore_cls = []
    keep_cls = list(range(num_hidden_layers + 1))
    keep_cls = [
        cls_idx for cls_idx in keep_cls if cls_idx not in ignore_cls]

    all_layer_outputs = []
    all_layer_cls_outputs = collections.OrderedDict()
    prev_output = input_tensor
    prev_mask = input_mask
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_%d' % layer_idx):

            # build child classifier
            if is_training or layer_idx not in ignore_cls:
                with tf.variable_scope('distill'):

                    # FCN + Self_Attention + FCN + FCN
                    if cls_model == 'self-attention-paper':
                        cls_output = self._cls_self_attention_paper(
                            prev_output,
                            batch_size,
                            max_seq_length,
                            label_size,
                            attention_mask=attention_mask,
                            cls_hidden_size=cls_hidden_size,
                            cls_num_attention_heads=cls_num_attention_heads,
                            attention_probs_dropout_prob=\
                                attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            dtype=tf.float32,
                            trainable=True)

                    # Self_Attention + FCN
                    elif cls_model == 'self-attention':
                        cls_output = self._cls_self_attention(
                            prev_output,
                            batch_size,
                            max_seq_length,
                            label_size,
                            attention_mask=attention_mask,
                            cls_hidden_size=cls_hidden_size,
                            cls_num_attention_heads=cls_num_attention_heads,
                            attention_probs_dropout_prob=\
                                attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            dtype=tf.float32,
                            trainable=True)

                    # FCN
                    elif cls_model == 'fcn':
                        cls_output = self._cls_fcn(
                            prev_output,
                            label_size,
                            hidden_size=hidden_size,
                            initializer_range=initializer_range,
                            dtype=tf.float32,
                            trainable=True)

                    else:
                        raise ValueError(
                            'Invalid `cls_model = %s`. Pick one from '
                            '`self-attention-paper`, `self-attention` '
                            'and `fcn`' % cls_model)

                    # distill core: normalized entropy of the child
                    # classifier's prediction, which lies in [0, 1]
                    layer_cls_output = tf.nn.softmax(
                        cls_output, axis=-1, name='cls_%d' % layer_idx)
                    uncertainty = tf.reduce_sum(
                        layer_cls_output * tf.log(layer_cls_output), axis=-1)
                    uncertainty /= tf.log(1 / label_size)

                # branching only in inference
                if not is_training:

                    # last output
                    if layer_idx == keep_cls[-1]:
                        all_layer_outputs.append(prev_output)
                        all_layer_cls_outputs[layer_idx] = layer_cls_output
                        return (all_layer_outputs, all_layer_cls_outputs)

                    mask = tf.less(uncertainty, speed)
                    unfinished_mask = (
                        tf.ones_like(mask, dtype=dtype) -
                        tf.cast(mask, dtype=dtype))
                    prev_output = tf.boolean_mask(
                        prev_output, mask=unfinished_mask, axis=0)
                    prev_mask = tf.boolean_mask(
                        prev_mask, mask=unfinished_mask, axis=0)
                all_layer_cls_outputs[layer_idx] = layer_cls_output

                # new attention mask
                input_shape = util.get_shape_list(prev_output)
                batch_size = input_shape[0]
                max_seq_length = input_shape[1]
                attention_mask = \
                    self.create_attention_mask_from_input_mask(
                        prev_mask, batch_size, max_seq_length, dtype=dtype)

            # original stream
            with tf.variable_scope('attention'):
                attention_heads = []
                with tf.variable_scope('self'):
                    (attention_head, _) = self.attention_layer(
                        from_tensor=prev_output,
                        to_tensor=prev_output,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=\
                            attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=False,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        dtype=dtype,
                        trainable=False)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    attention_output = tf.concat(attention_heads, axis=-1)

                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=False)
                    attention_output = util.dropout(
                        attention_output, hidden_dropout_prob)
                    attention_output = util.layer_norm(
                        attention_output + prev_output, trainable=False)

            # The activation is only applied to the `intermediate`
            # hidden layer.
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=util.create_initializer(
                        initializer_range),
                    trainable=False)

            # Down-project back to hidden_size then add the residual.
            with tf.variable_scope('output'):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=util.create_initializer(
                        initializer_range),
                    trainable=False)
                layer_output = util.dropout(
                    layer_output, hidden_dropout_prob)
                layer_output = util.layer_norm(
                    layer_output + attention_output, trainable=False)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    return (all_layer_outputs, all_layer_cls_outputs)
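
# A hedged sketch (not part of the repo) of the early-exit criterion used
# above: the uncertainty is the entropy of the child classifier's softmax
# output normalized by log(label_size), so it lies in [0, 1]. A sample exits
# at the first layer where uncertainty < `speed`; a larger `speed` therefore
# trades accuracy for faster inference. NumPy is used here only to keep the
# arithmetic easy to check.
import numpy as np

def normalized_uncertainty(probs, label_size):
    # Same formula as in the loop: sum(p * log p) / log(1 / label_size).
    return np.sum(probs * np.log(probs), axis=-1) / np.log(1.0 / label_size)

confident = np.array([0.96, 0.02, 0.02])  # peaked distribution -> low uncertainty
uncertain = np.array([0.4, 0.3, 0.3])     # flat distribution -> high uncertainty
print(normalized_uncertainty(confident, 3))  # ~0.18, exits early if speed > 0.18
print(normalized_uncertainty(uncertain, 3))  # ~0.99, continues to deeper layers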