def __init__(self, is_training, input_tensor, input_mask, label_ids, label_size=2, sample_weight=None, scope='cls/sequence', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    batch_size = tf.shape(input_tensor)[0]
    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32)
        per_token_losses = -tf.reduce_mean(one_hot_labels * log_probs, axis=-1)

        input_mask = tf.concat([
            tf.zeros((batch_size, 1), dtype=tf.float32),
            tf.cast(input_mask[:, 2:], dtype=tf.float32),
            tf.zeros((batch_size, 1), dtype=tf.float32)], axis=-1)
        per_token_losses *= input_mask

        per_example_loss = tf.reduce_mean(per_token_losses, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    '''Normalize to mean = 0, std = 1, then do a diagonal affine transform.'''
    with tf.variable_scope(scope):
        n_state = x.shape[-1].value
        g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)
        s = tf.reduce_mean(tf.square(x - u), axis=axis, keepdims=True)
        x = (x - u) * tf.rsqrt(s + epsilon)
        x = x * g + b
        return x
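A minimal usage sketch of `norm` in TF1 graph mode; the shapes, scope name, and session setup below are illustrative only, not part of the library.

# Hypothetical usage sketch (TF1 graph mode); shapes and names are made up.
import tensorflow as tf

x = tf.random_normal([8, 128, 768])        # [batch, seq, hidden]
y = norm(x, scope='ln_example')            # per-feature mean 0 / var 1, then scale & shift
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(tf.reduce_mean(y)))     # close to the learned bias `b` (0 at init)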
def __init__(self, is_training, input_tensor, label_ids, label_size=2, sample_weight=None, label_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        probs = tf.nn.sigmoid(logits, name='probs')
        self.probs['probs'] = probs
        self.preds['preds'] = tf.greater(probs, 0.5)

        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(label_ids, dtype=tf.float32))
        if label_weight is not None:
            label_weight = tf.constant(label_weight, dtype=tf.float32)
            label_weight = tf.reshape(label_weight, [1, label_size])
            per_example_loss *= label_weight
        per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def __init__(self, is_training, input_tensor, label_ids, sample_weight=None, scope='mrc', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[2],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, 2])
        logits = tf.transpose(logits, [0, 2, 1])

        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        self.probs[name] = probs

        start_one_hot_labels = tf.one_hot(label_ids[:, 0], depth=seq_length, dtype=tf.float32)
        end_one_hot_labels = tf.one_hot(label_ids[:, 1], depth=seq_length, dtype=tf.float32)
        start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
        end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
        per_example_loss = (
            -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs, axis=-1)
            - 0.5 * tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
        if sample_weight is not None:
            per_example_loss *= sample_weight

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses[name] = per_example_loss

        start_preds = tf.expand_dims(tf.argmax(logits[:, 0, :], axis=-1), axis=-1)
        end_preds = tf.expand_dims(tf.argmax(logits[:, 1, :], axis=-1), axis=-1)
        self.preds[name] = tf.concat([start_preds, end_preds], axis=-1)
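A small post-processing sketch showing how a [start, end] prediction from this head can be turned back into answer tokens; the token list and prediction values are made up for illustration.

# Hypothetical post-processing of `self.preds[name]` for one example.
tokens = ['[CLS]', 'who', 'wrote', 'it', '[SEP]', 'it', 'was', 'written', 'by', 'poe', '[SEP]']
start, end = 9, 9                                   # one row of the [start, end] predictions
answer = tokens[start:end + 1] if 0 < start <= end else []
print(answer)                                       # ['poe']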
def _get_pred_loss(self, teacher_logits, student_logits, weights):
    teacher_probs = tf.nn.softmax(teacher_logits, axis=-1)
    teacher_probs = tf.stop_gradient(teacher_probs)
    student_log_probs = tf.nn.log_softmax(student_logits, axis=-1)
    pred_loss = (
        - tf.reduce_sum(teacher_probs * student_log_probs, axis=-1) *
        tf.reshape(weights, [-1, 1]))
    pred_loss = tf.reduce_mean(pred_loss)
    return pred_loss
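A small NumPy illustration of the soft-label cross-entropy computed above; the probability values are made up.

# Soft-label cross-entropy on made-up values: the student is penalised against the
# teacher's full distribution, not just the argmax label.
import numpy as np

teacher_probs = np.array([0.7, 0.2, 0.1])
student_log_probs = np.log(np.array([0.5, 0.3, 0.2]))
print(-np.sum(teacher_probs * student_log_probs))   # ~0.89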
def __init__(self, is_training, input_tensor, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss

        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(thresh, float), (
                '`tsa_thresh` must be a float between 0 and 1.')
            uncertainty = tf.reduce_sum(
                self.probs['probs'] * tf.log(self.probs['probs']), axis=-1)
            uncertainty /= tf.log(1 / label_size)
            per_example_loss = tf.cast(
                tf.greater(uncertainty, thresh), dtype=tf.float32) * per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
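The `tsa_thresh` branch above keeps only examples whose normalized entropy exceeds the threshold; a small NumPy illustration with made-up probabilities:

# Normalized entropy used by the `tsa_thresh` filter: sum(p*log p) / log(1/K) is 0
# for a one-hot distribution and 1 for a uniform one, so confident (low-entropy)
# examples fall below the threshold and get zero loss.
import numpy as np

def normalized_entropy(p):
    return np.sum(p * np.log(p)) / np.log(1 / len(p))

print(normalized_entropy(np.array([0.97, 0.03])))   # ~0.19, confident -> dropped at thresh=0.5
print(normalized_entropy(np.array([0.55, 0.45])))   # ~0.99, uncertain -> kept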
def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout, dropatt, input_mask, is_training, initializer, scope=None, reuse=None, use_proj=True):
    '''
    Different classification tasks may or may not share the same parameters
    to summarize the sequence features.

    If shared, one can keep the `scope` to the default value `None`.
    Otherwise, one should specify a different `scope` for each task.
    '''
    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable('summary_bias', [d_model],
                                           dtype=hidden.dtype,
                                           initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(summary_bias, hidden, hidden, input_mask,
                                     d_model, n_head, d_head, dropout, dropatt,
                                     is_training, initializer, residual=False)
            summary = summary[0]
        else:
            raise ValueError('Unsupported summary type %s' % summary_type)

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(
                summary, d_model,
                activation=tf.tanh,
                kernel_initializer=initializer,
                name='summary')

        # dropout
        summary = tf.layers.dropout(
            summary, dropout, training=is_training, name='dropout')

    return summary
def __init__(self, is_training, input_tensor, input_mask, label_ids, label_size=5, sample_weight=None, scope='cls/sequence', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    seq_length = input_tensor.shape.as_list()[-2]
    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        logits = tf.reshape(logits, [-1, seq_length, label_size])

        with tf.variable_scope('crf'):
            input_length = tf.reduce_sum(input_mask, axis=-1)
            per_example_loss, transition_matrix = contrib.crf.crf_log_likelihood(
                inputs=logits,
                tag_indices=label_ids,
                sequence_lengths=input_length)
            per_example_loss = -per_example_loss
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses[name] = per_example_loss
            self.preds[name] = tf.argmax(logits, axis=-1)
            self.probs['logits'] = logits
            self.probs['transition_matrix'] = transition_matrix
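The head exports both `logits` and `transition_matrix`, so a Viterbi decode can be run outside the graph instead of relying on the per-token argmax. A hedged sketch assuming TF1's `tf.contrib.crf.viterbi_decode` and NumPy arrays fetched from a session run:

# Hedged sketch of offline Viterbi decoding; array shapes are assumptions about the
# exported tensors, and the actual decoding helper in this codebase may differ.
import numpy as np
from tensorflow.contrib import crf

def viterbi_tags(logits, transition_matrix, length):
    # logits: [seq_length, label_size] for one example; length: its true length
    tags, _ = crf.viterbi_decode(logits[:length], transition_matrix)
    return tags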
def __init__(self, is_training, input_tensor, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', name='', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs):
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds[name] = tf.argmax(logits, axis=-1)
        self.probs[name] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss

        self.losses[name] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def _cls_forward(self, is_training, input_tensor, input_mask, label_ids, bert_config, batch_size, max_seq_length, prob, scope, name, sample_weight=None, hidden_dropout_prob=0.1, initializer_range=0.02):
    with tf.variable_scope(scope):
        logits = tf.layers.dense(
            input_tensor, 2,
            kernel_initializer=util.create_initializer(bert_config.initializer_range),
            trainable=True)

        # loss
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=2)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

        input_mask = tf.cast(input_mask, tf.float32)
        per_token_loss *= input_mask / tf.reduce_sum(input_mask, keepdims=True, axis=-1)
        per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

        if prob != 0:
            self.total_loss += tf.reduce_mean(per_example_loss)
        self.losses[name + '_loss'] = per_example_loss
        self.preds[name + '_preds'] = tf.argmax(logits, axis=-1)
def __init__(self, is_training, input_tensor, n_wide_features, wide_features, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs): super().__init__(**kwargs) hidden_size = input_tensor.shape.as_list()[-1] feature_size = wide_features.shape.as_list()[-1] with tf.variable_scope('wide'): feature_embeddings = tf.get_variable( name='feature_embeddings', shape=[feature_size + 1, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) wide_output = tf.gather(feature_embeddings, wide_features) # [B, N, H] with tf.variable_scope('wide_and_deep'): deep_output = tf.expand_dims(input_tensor, -1) # [B, H, 1] attention_scores = tf.matmul(wide_output, deep_output) # [B, N, 1] attention_scores = tf.transpose(attention_scores, [0, 2, 1]) # [B, 1, N] attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(hidden_size)) feature_mask = tf.cast( tf.sequence_mask(n_wide_features, feature_size), tf.float32) # [B, N] feature_mask = tf.expand_dims(feature_mask, 1) # [B, 1, N] attention_scores += (1.0 - feature_mask) * -10000.0 attention_matrix = tf.nn.softmax(attention_scores, axis=-1) attention_output = tf.matmul(attention_matrix, wide_output) # [B, 1, H] attention_output = attention_output[:, 0, :] # [B, H] # attention_output = util.dropout( # attention_output, hidden_dropout_prob) input_tensor = util.layer_norm(attention_output + input_tensor, trainable=trainable) with tf.variable_scope(scope): output_weights = tf.get_variable( 'output_weights', shape=[label_size, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) output_bias = tf.get_variable('output_bias', shape=[label_size], initializer=tf.zeros_initializer(), trainable=trainable) output_layer = util.dropout( input_tensor, hidden_dropout_prob if is_training else 0.0) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) self.preds['preds'] = tf.argmax(logits, axis=-1) self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs') log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) if sample_weight is not None: per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss thresh = kwargs.get('tsa_thresh') if thresh is not None: assert isinstance( thresh, float), ('`tsa_thresh` must be a float between 0 and 1.') uncertainty = tf.reduce_sum(self.probs['probs'] * tf.log(self.probs['probs']), axis=-1) uncertainty /= tf.log(1 / label_size) per_example_loss = tf.cast( tf.greater(uncertainty, thresh), dtype=tf.float32) * \ per_example_loss self.losses['losses'] = per_example_loss self.total_loss = tf.reduce_mean(per_example_loss)
def __init__(self, is_training, input_tensor, is_supervised, is_expanded, label_ids, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, global_step=None, num_train_steps=None, uda_softmax_temp=-1, uda_confidence_thresh=-1, tsa_schedule='linear', **kwargs):
    super().__init__(**kwargs)

    is_supervised = tf.cast(is_supervised, tf.float32)
    is_expanded = tf.cast(is_expanded, tf.float32)

    hidden_size = input_tensor.shape.as_list()[-1]
    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        output_layer = util.dropout(
            input_tensor, hidden_dropout_prob if is_training else 0.0)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        with tf.variable_scope('sup_loss'):
            # reshape
            sup_ori_log_probs = tf.boolean_mask(log_probs, mask=(1.0 - is_expanded), axis=0)
            sup_log_probs = tf.boolean_mask(sup_ori_log_probs, mask=is_supervised, axis=0)
            sup_label_ids = tf.boolean_mask(label_ids, mask=is_supervised, axis=0)

            self.preds['preds'] = tf.argmax(sup_ori_log_probs, axis=-1)

            one_hot_labels = tf.one_hot(sup_label_ids, depth=label_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * sup_log_probs, axis=-1)

            loss_mask = tf.ones_like(per_example_loss, dtype=tf.float32)
            correct_label_probs = tf.reduce_sum(one_hot_labels * tf.exp(sup_log_probs), axis=-1)

            if is_training and tsa_schedule:
                tsa_start = 1.0 / label_size
                tsa_threshold = get_tsa_threshold(
                    tsa_schedule, global_step, num_train_steps, tsa_start, end=1)

                larger_than_threshold = tf.greater(correct_label_probs, tsa_threshold)
                loss_mask = loss_mask * (1 - tf.cast(larger_than_threshold, tf.float32))

            loss_mask = tf.stop_gradient(loss_mask)
            per_example_loss = per_example_loss * loss_mask
            if sample_weight is not None:
                sup_sample_weight = tf.boolean_mask(sample_weight, mask=is_supervised, axis=0)
                per_example_loss *= tf.cast(sup_sample_weight, dtype=tf.float32)
            sup_loss = (tf.reduce_sum(per_example_loss) /
                        tf.maximum(tf.reduce_sum(loss_mask), 1))
            self.losses['supervised'] = per_example_loss

        with tf.variable_scope('unsup_loss'):
            # reshape
            ori_log_probs = tf.boolean_mask(sup_ori_log_probs, mask=(1.0 - is_supervised), axis=0)
            aug_log_probs = tf.boolean_mask(log_probs, mask=is_expanded, axis=0)
            sup_ori_logits = tf.boolean_mask(logits, mask=(1.0 - is_expanded), axis=0)
            ori_logits = tf.boolean_mask(sup_ori_logits, mask=(1.0 - is_supervised), axis=0)

            unsup_loss_mask = 1
            if uda_softmax_temp != -1:
                tgt_ori_log_probs = tf.nn.log_softmax(ori_logits / uda_softmax_temp, axis=-1)
                tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
            else:
                tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

            if uda_confidence_thresh != -1:
                largest_prob = tf.reduce_max(tf.exp(ori_log_probs), axis=-1)
                unsup_loss_mask = tf.cast(
                    tf.greater(largest_prob, uda_confidence_thresh), tf.float32)
                unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

            per_example_loss = kl_for_log_probs(
                tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
            if sample_weight is not None:
                unsup_sample_weight = tf.boolean_mask(
                    sample_weight, mask=(1.0 - is_supervised), axis=0)
                per_example_loss *= tf.cast(unsup_sample_weight, dtype=tf.float32)
            unsup_loss = tf.reduce_mean(per_example_loss)
            self.losses['unsupervised'] = per_example_loss

    self.total_loss = sup_loss + unsup_loss
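The `get_tsa_threshold` helper called above is not shown here; a minimal sketch of the linear/log/exp annealing schedules described in the UDA paper, which the call appears to assume (the real helper in this codebase may differ in details):

# Hedged sketch of a training-signal-annealing schedule consistent with the call above.
def get_tsa_threshold(schedule, global_step, num_train_steps, start, end):
    t = tf.cast(global_step, tf.float32) / tf.cast(num_train_steps, tf.float32)
    if schedule == 'linear':
        coeff = t
    elif schedule == 'exp':
        coeff = tf.exp((t - 1) * 5)     # releases examples slowly at first
    elif schedule == 'log':
        coeff = 1 - tf.exp(-t * 5)      # releases most examples early
    else:
        raise ValueError('Unsupported schedule: %s' % schedule)
    return coeff * (end - start) + start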
def __init__(self, tiny_bert_config, bert_config, is_training, input_ids, input_mask, segment_ids, label_ids=None, sample_weight=None, scope='bert', name='', dtype=tf.float32, use_tilda_embedding=False, drop_pooler=False, label_size=2, trainable=True, **kwargs):
    super(TinyBERTCLSDistillor, self).__init__()

    def _get_logits(pooled_output, hidden_size, scope, trainable):
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)
            logits = tf.matmul(pooled_output, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            return logits

    student = BERTEncoder(
        bert_config=tiny_bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        scope='tiny/bert',
        use_tilda_embedding=use_tilda_embedding,
        drop_pooler=drop_pooler,
        trainable=True,
        **kwargs)
    student_logits = _get_logits(
        student.get_pooled_output(), tiny_bert_config.hidden_size,
        'tiny/cls/seq_relationship', True)

    if is_training:
        teacher = BERTEncoder(
            bert_config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            scope=scope,
            use_tilda_embedding=False,
            drop_pooler=drop_pooler,
            trainable=False,
            **kwargs)
        teacher_logits = _get_logits(
            teacher.get_pooled_output(), bert_config.hidden_size,
            'cls/seq_relationship', False)

        weights = 1.0
        if sample_weight is not None:
            weights = tf.cast(sample_weight, dtype=tf.float32)

        # embedding loss
        teacher_embedding = teacher.get_embedding_output()
        student_embedding = student.get_embedding_output()
        with tf.variable_scope('embedding_loss'):
            linear_trans = tf.layers.dense(
                student_embedding, bert_config.hidden_size,
                kernel_initializer=util.create_initializer(bert_config.initializer_range))
            embedding_loss = tf.losses.mean_squared_error(
                linear_trans, teacher_embedding,
                weights=tf.reshape(weights, [-1, 1, 1]))

        # attention loss
        teacher_attention_scores = teacher.get_attention_scores()
        student_attention_scores = student.get_attention_scores()
        num_teacher_hidden_layers = bert_config.num_hidden_layers
        num_student_hidden_layers = tiny_bert_config.num_hidden_layers
        num_projections = int(num_teacher_hidden_layers / num_student_hidden_layers)
        attention_losses = []
        for i in range(num_student_hidden_layers):
            attention_losses.append(tf.losses.mean_squared_error(
                teacher_attention_scores[num_projections * i + num_projections - 1],
                student_attention_scores[i],
                weights=tf.reshape(weights, [-1, 1, 1, 1])))
        attention_loss = tf.add_n(attention_losses)

        # hidden loss
        teacher_hidden_layers = teacher.all_encoder_layers
        student_hidden_layers = student.all_encoder_layers
        num_teacher_hidden_layers = bert_config.num_hidden_layers
        num_student_hidden_layers = tiny_bert_config.num_hidden_layers
        num_projections = int(num_teacher_hidden_layers / num_student_hidden_layers)
        with tf.variable_scope('hidden_loss'):
            hidden_losses = []
            for i in range(num_student_hidden_layers):
                hidden_losses.append(tf.losses.mean_squared_error(
                    teacher_hidden_layers[num_projections * i + num_projections - 1],
                    tf.layers.dense(
                        student_hidden_layers[i], bert_config.hidden_size,
                        kernel_initializer=util.create_initializer(
                            bert_config.initializer_range)),
                    weights=tf.reshape(weights, [-1, 1, 1])))
            hidden_loss = tf.add_n(hidden_losses)

        # prediction loss
        teacher_probs = tf.nn.softmax(teacher_logits, axis=-1)
        student_log_probs = tf.nn.log_softmax(student_logits, axis=-1)
        pred_loss = (
            -tf.reduce_sum(teacher_probs * student_log_probs, axis=-1) *
            tf.reshape(weights, [-1, 1]))
        pred_loss = tf.reduce_mean(pred_loss)

        # sum up
        distill_loss = (embedding_loss + attention_loss + hidden_loss + pred_loss)
        self.total_loss = distill_loss
        self.losses['distill'] = tf.reshape(distill_loss, [1])

    else:
        student_probs = tf.nn.softmax(student_logits, axis=-1, name='probs')
        self.probs[name] = student_probs
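A worked example of the layer mapping used in the attention and hidden losses above: with a 12-layer teacher and a 4-layer student, `num_projections` is 3, so student layer i distills from teacher layer 3*i + 2 (0-indexed).

# Layer mapping illustration for the distillation losses above (made-up layer counts).
num_teacher_hidden_layers, num_student_hidden_layers = 12, 4
num_projections = num_teacher_hidden_layers // num_student_hidden_layers
print([num_projections * i + num_projections - 1
       for i in range(num_student_hidden_layers)])   # [2, 5, 8, 11]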
def __init__(self, vocab_size, is_training, source_ids, target_ids, sos_id, sample_weight=None, hidden_size=768, num_blocks=6, num_attention_heads=12, scope='transformer', use_label_smoothing=False, use_tilda_embedding=False, trainable=True, **kwargs):
    super().__init__()

    dropout_rate = 0.0
    if is_training:
        dropout_rate = 0.1

    source_shape = util.get_shape_list(source_ids, expected_rank=2)
    target_shape = util.get_shape_list(target_ids, expected_rank=2)
    batch_size = source_shape[0]
    source_max_seq_length = source_shape[1]
    target_max_seq_length = target_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        source_mask = tf.math.equal(source_ids, 0)

        # embedding
        with tf.variable_scope('embeddings'):
            (enc, embedding_table) = embedding_lookup(
                input_ids=source_ids,
                vocab_size=vocab_size,
                batch_size=batch_size,
                max_seq_length=source_max_seq_length,
                embedding_size=hidden_size,
                word_embedding_name='word_embeddings',
                tilda_embeddings=tilda_embeddings)
            enc *= hidden_size ** 0.5  # scale
            enc += positional_encoding(enc, source_max_seq_length)
            enc = util.dropout(enc, dropout_rate)

        with tf.variable_scope('encoder'):
            # stacked multi-attention layers
            for i in range(num_blocks):
                with tf.variable_scope('block_%s' % i):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc, keys=enc, values=enc,
                        key_masks=source_mask,
                        num_heads=num_attention_heads,
                        dropout_rate=dropout_rate,
                        training=is_training,
                        causality=False,
                        scope='self_attention')
                    # feed forward
                    enc = ff(enc, num_units=[hidden_size * 4, hidden_size])
            memory = enc

        def _forward(target_ids, target_mask, target_max_seq_length):
            with tf.variable_scope('decoder'):
                # shared embedding
                dec = tf.nn.embedding_lookup(embedding_table, target_ids)
                dec *= hidden_size ** 0.5  # scale
                dec += positional_encoding(dec, target_max_seq_length)
                dec = util.dropout(dec, dropout_rate)

                # blocks
                for i in range(num_blocks):
                    with tf.variable_scope('block_%s' % i):
                        # masked self-attention
                        dec = multihead_attention(
                            queries=dec, keys=dec, values=dec,
                            key_masks=target_mask,
                            num_heads=num_attention_heads,
                            dropout_rate=dropout_rate,
                            training=is_training,
                            causality=True,
                            scope='masked_self_attention')
                        # vanilla attention
                        dec = multihead_attention(
                            queries=dec, keys=memory, values=memory,
                            key_masks=source_mask,
                            num_heads=num_attention_heads,
                            dropout_rate=dropout_rate,
                            training=is_training,
                            causality=False,
                            scope='vanilla_attention')
                        # feed forward
                        dec = ff(dec, num_units=[4 * hidden_size, hidden_size])

            # final linear projection (embedding weights are shared)
            with tf.variable_scope('cls'):
                output_bias = tf.get_variable(
                    'output_bias', shape=[vocab_size],
                    initializer=tf.zeros_initializer())
                dec = tf.reshape(dec, [-1, hidden_size])
                logits = tf.matmul(dec, embedding_table, transpose_b=True)
                logits = tf.reshape(logits, [-1, target_max_seq_length, vocab_size])
                logits = tf.nn.bias_add(logits, output_bias)

            return logits

        # convert to labels
        label_ids = tf.concat(
            [target_ids[:, 1:], tf.zeros([batch_size, 1], dtype=tf.int32)], axis=-1)

        # forward once
        if is_training:
            target_mask = tf.math.equal(target_ids, 0)  # (N, T2)
            logits = _forward(target_ids, target_mask, target_max_seq_length)
            self.preds['MT'] = tf.argmax(logits, axis=-1)

        # forward loop
        else:
            target_mask_base = tf.zeros([batch_size, 1], dtype=tf.int32)
            target_ids = tf.ones([batch_size, 1], dtype=tf.int32) * sos_id

            for cur_length in range(1, target_max_seq_length + 1):
                target_mask = tf.tile(target_mask_base, [1, cur_length])
                logits = _forward(target_ids, target_mask, cur_length)

                pred_ids = tf.argmax(logits[:, cur_length-1:cur_length, :], axis=-1)
                pred_ids = tf.cast(pred_ids, tf.int32)
                target_ids = tf.concat([target_ids, pred_ids], axis=-1)

            self.preds['MT'] = target_ids[:, 1:]

        # loss
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_ids, depth=vocab_size)
        if use_label_smoothing:
            one_hot_labels = label_smoothing(one_hot_labels)
        per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)

        label_mask = tf.cast(tf.not_equal(label_ids, 0), tf.float32)
        per_example_loss = \
            tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \
            tf.reduce_sum(label_mask, axis=-1)
        if sample_weight is not None:
            per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses['MT'] = per_example_loss
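The `positional_encoding` helper is referenced above but not shown; a sketch assuming the standard sinusoidal encoding from "Attention Is All You Need" (the version in this codebase may mask padded positions or differ in other details):

# Hedged sketch of a sinusoidal positional encoding compatible with the calls above.
import numpy as np
import tensorflow as tf

def positional_encoding(inputs, max_seq_length):
    d_model = inputs.shape.as_list()[-1]
    pos = np.arange(max_seq_length)[:, None]
    i = np.arange(d_model)[None, :]
    angles = pos / np.power(10000, (2 * (i // 2)) / d_model)
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])
    enc = tf.convert_to_tensor(angles, dtype=inputs.dtype)   # [T, d_model]
    return tf.expand_dims(enc, 0)                            # broadcast over batch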
def __init__(self, bert_config, is_training, input_ids, input_mask, segment_ids, sample_weight=None, scope='bert', dtype=tf.float32, drop_pooler=False, cls_model='self-attention', label_size=2, speed=0.1, ignore_cls='0', **kwargs):
    super(FastBERTCLSDistillor, self).__init__()

    if not ignore_cls:
        ignore_cls = []
    if isinstance(ignore_cls, str):
        ignore_cls = ignore_cls.replace(' ', '').split(',')
        ignore_cls = list(map(int, ignore_cls))
    elif isinstance(ignore_cls, list):
        ignore_cls = list(map(int, ignore_cls))
    else:
        raise ValueError(
            '`ignore_cls` should be a list of child-classifier ids or '
            'a string separated with commas.')

    if not speed:
        raise ValueError(
            '`speed` should be a float number between `0` and `1`.')

    bert_config = copy.deepcopy(bert_config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    max_seq_length = input_shape[1]

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):
            (self.embedding_output, self.embedding_table) = self.embedding_lookup(
                input_ids=input_ids,
                vocab_size=bert_config.vocab_size,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                embedding_size=bert_config.hidden_size,
                initializer_range=bert_config.initializer_range,
                word_embedding_name='word_embeddings',
                dtype=dtype,
                trainable=False,
                tilda_embeddings=None)

            # Add positional embeddings and token type embeddings,
            # then layer normalize and perform dropout.
            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                hidden_size=bert_config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=bert_config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=bert_config.max_position_embeddings,
                dropout_prob=bert_config.hidden_dropout_prob,
                dtype=dtype,
                trainable=False)

        with tf.variable_scope('encoder'):
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, max_seq_length, dtype=dtype)

            # stacked transformers
            (self.all_encoder_layers, self.all_cls_layers) = self.dynamic_transformer_model(
                is_training,
                input_tensor=self.embedding_output,
                input_mask=input_mask,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                label_size=label_size,
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=bert_config.num_hidden_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                intermediate_act_fn=util.get_activation(bert_config.hidden_act),
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
                initializer_range=bert_config.initializer_range,
                dtype=dtype,
                cls_model=cls_model,
                speed=speed,
                ignore_cls=ignore_cls)

        self.sequence_output = self.all_encoder_layers[-1]
        with tf.variable_scope('pooler'):
            first_token_tensor = self.sequence_output[:, 0, :]

            # trick: ignore the fully connected layer
            if drop_pooler:
                self.pooled_output = first_token_tensor
            else:
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    bert_config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=False)

    # teacher classifier
    if bert_config.num_hidden_layers not in ignore_cls:
        with tf.variable_scope('cls/seq_relationship'):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, bert_config.hidden_size],
                initializer=util.create_initializer(bert_config.initializer_range),
                trainable=False)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=False)

            logits = tf.matmul(self.pooled_output, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.softmax(logits, axis=-1)

    # distillation
    if is_training:
        losses = []
        for cls_probs in self.all_cls_layers.values():
            # KL-Divergence
            per_example_loss = tf.reduce_sum(
                cls_probs * (tf.log(cls_probs) - tf.log(probs)), axis=-1)
            if sample_weight is not None:
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)
            loss = tf.reduce_mean(per_example_loss)
            losses.append(loss)
        distill_loss = tf.add_n(losses)
        self.total_loss = distill_loss
        self.losses['losses'] = distill_loss

    else:
        if bert_config.num_hidden_layers not in ignore_cls:
            self.all_cls_layers[bert_config.num_hidden_layers] = probs
        self.probs['probs'] = tf.concat(
            list(self.all_cls_layers.values()), axis=0, name='probs')
def __init__(self, bert_config, is_training, encoder, masked_lm_positions, masked_lm_ids, masked_lm_weights, next_sentence_labels, sample_weight=None, scope_lm='cls/predictions', scope_cls='cls/seq_relationship', trainable=True, use_nsp_loss=True, **kwargs): super(BERTDecoder, self).__init__(**kwargs) def gather_indexes(sequence_tensor, positions): sequence_shape = util.get_shape_list(sequence_tensor, 3) batch_size = sequence_shape[0] seq_length = sequence_shape[1] width = sequence_shape[2] flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) flat_positions = tf.reshape(positions + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape(sequence_tensor, [batch_size * seq_length, width]) output_tensor = tf.gather(flat_sequence_tensor, flat_positions) return output_tensor scalar_losses = [] # masked language modeling input_tensor = gather_indexes(encoder.get_sequence_output(), masked_lm_positions) with tf.variable_scope(scope_lm): with tf.variable_scope('transform'): input_tensor = tf.layers.dense( input_tensor, units=bert_config.hidden_size, activation=util.get_activation(bert_config.hidden_act), kernel_initializer=util.create_initializer( bert_config.initializer_range)) input_tensor = util.layer_norm(input_tensor) output_bias = tf.get_variable('output_bias', shape=[bert_config.vocab_size], initializer=tf.zeros_initializer(), trainable=trainable) logits = tf.matmul(input_tensor, encoder.get_embedding_table(), transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) probs = tf.nn.softmax(logits, axis=-1, name='MLM_probs') log_probs = tf.nn.log_softmax(logits, axis=-1) label_ids = tf.reshape(masked_lm_ids, [-1]) if sample_weight is not None: sample_weight = tf.expand_dims(tf.cast(sample_weight, dtype=tf.float32), axis=-1) masked_lm_weights *= sample_weight label_weights = tf.reshape(masked_lm_weights, [-1]) one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) per_example_loss = label_weights * per_example_loss numerator = tf.reduce_sum(per_example_loss) denominator = tf.reduce_sum(label_weights) + 1e-5 loss = numerator / denominator scalar_losses.append(loss) self.losses['MLM_losses'] = per_example_loss self.preds['MLM_preds'] = tf.argmax(probs, axis=-1) # next sentence prediction with tf.variable_scope(scope_cls): output_weights = tf.get_variable( 'output_weights', shape=[2, bert_config.hidden_size], initializer=util.create_initializer( bert_config.initializer_range), trainable=trainable) output_bias = tf.get_variable('output_bias', shape=[2], initializer=tf.zeros_initializer(), trainable=trainable) logits = tf.matmul(encoder.get_pooled_output(), output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) probs = tf.nn.softmax(logits, axis=-1, name='probs') log_probs = tf.nn.log_softmax(logits, axis=-1) labels = tf.reshape(next_sentence_labels, [-1]) one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) if sample_weight is not None: per_example_loss = (tf.cast(sample_weight, dtype=tf.float32) * per_example_loss) loss = tf.reduce_mean(per_example_loss) if use_nsp_loss: scalar_losses.append(loss) self.losses['NSP_losses'] = per_example_loss self.probs['NSP_probs'] = probs self.preds['NSP_preds'] = tf.argmax(probs, axis=-1) self.total_loss = tf.add_n(scalar_losses)
def __init__(self, bert_config, is_training, input_tensor, sa_mask, label_ids, sample_weight=None, scope='sanet', alpha=0.5, hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs): super().__init__(**kwargs) shape = util.get_shape_list(input_tensor) batch_size = shape[0] seq_length = shape[1] hidden_size = shape[2] sa_mask = tf.reshape(sa_mask, [batch_size, seq_length, seq_length]) with tf.variable_scope(scope): with tf.variable_scope('sentence_attention'): (sa_output, _) = self.attention_layer( from_tensor=input_tensor, to_tensor=input_tensor, attention_mask=sa_mask, num_attention_heads=bert_config.num_attention_heads, size_per_head=\ hidden_size // bert_config.num_attention_heads, attention_probs_dropout_prob=\ bert_config.hidden_dropout_prob, initializer_range=bert_config.initializer_range, do_return_2d_tensor=False, batch_size=batch_size, from_max_seq_length=seq_length, to_max_seq_length=seq_length, trainable=trainable) with tf.variable_scope('cls/mrc'): output_weights = tf.get_variable( 'output_weights', shape=[2, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) output_bias = tf.get_variable( 'output_bias', shape=[2], initializer=tf.zeros_initializer(), trainable=trainable) output_layer = alpha * sa_output + (1 - alpha) * input_tensor output_layer = tf.reshape(output_layer, [-1, hidden_size]) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) logits = tf.reshape(logits, [-1, seq_length, 2]) logits = tf.transpose(logits, [0, 2, 1]) probs = tf.nn.softmax(logits, axis=-1, name='probs') self.probs['probs'] = probs self.preds['preds'] = tf.argmax(logits, axis=-1) start_one_hot_labels = tf.one_hot(label_ids[:, 0], depth=seq_length, dtype=tf.float32) end_one_hot_labels = tf.one_hot(label_ids[:, 1], depth=seq_length, dtype=tf.float32) start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1) end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1) per_example_loss = ( -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs, axis=-1) - 0.5 * tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1)) if sample_weight is not None: per_example_loss *= sample_weight self.total_loss = tf.reduce_mean(per_example_loss) self.losses['losses'] = per_example_loss
def __init__(self, bert_config, is_training, sketchy_encoder, intensive_encoder, query_mask, label_ids, has_answer, sample_weight=None, scope='retro_reader', matching_mechanism='cross-attention', beta_1=0.5, beta_2=0.5, threshold=1.0, trainable=True, **kwargs):
    super().__init__(**kwargs)

    # verifier
    with tf.variable_scope(scope):

        # sketchy reading module
        with tf.variable_scope('sketchy/prediction'):
            sketchy_output = sketchy_encoder.get_pooled_output()
            hidden_size = sketchy_output.shape.as_list()[-1]

            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[2],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(
                sketchy_output,
                bert_config.hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(has_answer, depth=2, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss

            self.losses['sketchy_losses'] = per_example_loss
            sketchy_loss = tf.reduce_mean(per_example_loss)

            score_ext = logits[:, 1] - logits[:, 0]

        # intensive reading module
        with tf.variable_scope('intensive'):
            H = intensive_encoder.get_sequence_output()
            H_Q = H * tf.cast(tf.expand_dims(query_mask, axis=-1), tf.float32)
            (batch_size, max_seq_length, hidden_size) = util.get_shape_list(H)

            # cross-attention
            if matching_mechanism == 'cross-attention':
                with tf.variable_scope('cross_attention'):
                    attention_mask = self.create_attention_mask_from_input_mask(
                        query_mask, batch_size, max_seq_length)
                    (H_prime, _) = self.attention_layer(
                        from_tensor=H,
                        to_tensor=H_Q,
                        attention_mask=attention_mask,
                        num_attention_heads=bert_config.num_attention_heads,
                        size_per_head=hidden_size // bert_config.num_attention_heads,
                        attention_probs_dropout_prob=bert_config.hidden_dropout_prob,
                        initializer_range=bert_config.initializer_range,
                        do_return_2d_tensor=False,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        trainable=trainable)

            # matching-attention
            elif matching_mechanism == 'matching-attention':
                with tf.variable_scope('matching_attention'):
                    output_weights = tf.get_variable(
                        'output_weights',
                        shape=[hidden_size, hidden_size],
                        initializer=util.create_initializer(bert_config.initializer_range),
                        trainable=trainable)
                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[hidden_size],
                        initializer=tf.zeros_initializer(),
                        trainable=trainable)
                    trans = tf.matmul(
                        H_Q,
                        tf.tile(tf.expand_dims(output_weights, axis=0), [batch_size, 1, 1]),
                        transpose_b=True)
                    trans = tf.nn.bias_add(trans, output_bias)
                    M = tf.nn.softmax(tf.matmul(H, trans, transpose_b=True), axis=-1)
                    H_prime = tf.matmul(M, H_Q)

            with tf.variable_scope('prediction'):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                output_layer = util.dropout(
                    H_prime,
                    bert_config.hidden_dropout_prob if is_training else 0.0)
                output_layer = tf.reshape(
                    output_layer, [batch_size * max_seq_length, hidden_size])
                logits = tf.matmul(output_layer, output_weights, transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)
                logits = tf.reshape(logits, [batch_size, max_seq_length, 2])
                logits = tf.transpose(logits, [0, 2, 1])

                probs = tf.nn.softmax(logits, axis=-1, name='probs')
                self.probs['mrc_probs'] = probs
                self.preds['mrc_preds'] = tf.argmax(logits, axis=-1)

                start_one_hot_labels = tf.one_hot(
                    label_ids[:, 0], depth=max_seq_length, dtype=tf.float32)
                end_one_hot_labels = tf.one_hot(
                    label_ids[:, 1], depth=max_seq_length, dtype=tf.float32)
                start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
                end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)
                per_example_loss = (
                    -0.5 * tf.reduce_sum(start_one_hot_labels * start_log_probs, axis=-1)
                    - 0.5 * tf.reduce_sum(end_one_hot_labels * end_log_probs, axis=-1))
                if sample_weight is not None:
                    per_example_loss *= sample_weight

                intensive_loss = tf.reduce_mean(per_example_loss)
                self.losses['intensive_losses'] = per_example_loss

                score_has = tf.norm(probs[:, 0, 1:] + probs[:, 1, 1:], np.inf, axis=-1)
                score_null = probs[:, 0, 0] + probs[:, 1, 0]
                score_diff = score_has - score_null

        # rear verification
        v = beta_1 * score_diff + beta_2 * score_ext
        self.preds['verifier_preds'] = tf.cast(tf.greater(v, threshold), tf.int32)
        self.probs['verifier_probs'] = v

        self.total_loss = sketchy_loss + intensive_loss
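A worked example of the rear verification above, with made-up scores: an example is predicted answerable only when the combined evidence from the sketchy and intensive readers clears the threshold.

# Rear verification on made-up scores (beta_1, beta_2, threshold as in the defaults).
beta_1, beta_2, threshold = 0.5, 0.5, 1.0
score_diff, score_ext = 1.4, 1.0        # intensive and sketchy scores for one example
v = beta_1 * score_diff + beta_2 * score_ext
print(v, int(v > threshold))            # 1.2 1  -> predicted as having an answer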
def __init__(self, bert_config, is_training, dilated_ids, label_ids, max_seq_length, spad_id=1, loop=3, sample_weight=None, scope='dilated', use_tilda_embedding=False, **kwargs): super().__init__() dilated_mask = tf.cast(tf.not_equal(dilated_ids, 0), tf.float32) shape = util.get_shape_list(dilated_ids, expected_rank=2) batch_size = shape[0] dilated_seq_length = shape[1] # Tilda embeddings for SMART algorithm tilda_embeddings = None if use_tilda_embedding: with tf.variable_scope('', reuse=True): tilda_embeddings = tf.get_variable('tilda_embeddings') with tf.variable_scope(scope): # forward once if is_training: logits = self._bert_forward(bert_config, dilated_ids, dilated_mask, batch_size, dilated_seq_length, tilda_embeddings=tilda_embeddings) self.preds['LM'] = tf.argmax(logits, axis=-1) # LM loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) input_length = tf.reduce_sum(dilated_mask, axis=-1) * 2 label_mask = tf.sequence_mask(input_length, max_seq_length * 2, dtype=tf.float32) per_example_loss = \ tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \ tf.reduce_sum(label_mask, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) self.total_loss = tf.reduce_mean(per_example_loss) self.losses['LM'] = per_example_loss # forward loop else: def _forward(dilated_ids, dilated_mask): logits = self._bert_forward( bert_config, dilated_ids, dilated_mask, batch_size, dilated_seq_length, tilda_embeddings=tilda_embeddings) output_ids = tf.argmax(logits, axis=-1) output_ids = tf.cast(output_ids, dtype=tf.int32) # special padding (using `spad` token) equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32) equal_zero = tf.reduce_sum(equal_zero, axis=-1) right_pad = spad_id * tf.sequence_mask( equal_zero, dilated_seq_length, dtype=tf.int32) paded = tf.concat([output_ids, right_pad], axis=-1) # extract ids of length `max_seq_length` flattened_padded = tf.reshape(paded, [-1]) is_valid = tf.cast(tf.greater(flattened_padded, 0), dtype=tf.int32) flattened_valid = tf.boolean_mask(flattened_padded, is_valid) valid = tf.reshape(flattened_valid, [batch_size, dilated_seq_length]) cutted_valid = valid[:, :max_seq_length] # replace `spad` token with `pad` non_spad_mask = tf.cast(tf.not_equal( cutted_valid, spad_id), dtype=tf.int32) output_ids = cutted_valid * non_spad_mask output_length = tf.reduce_sum(non_spad_mask, axis=-1) # dilate reshaped_ids = tf.reshape(output_ids, [batch_size, max_seq_length, 1]) reshaped_mask = tf.reshape( tf.sequence_mask(output_length, max_seq_length, dtype=tf.int32), [batch_size, max_seq_length, 1]) concat_ids = tf.concat( [reshaped_ids, tf.zeros_like(reshaped_ids)], axis=-1) concat_mask = tf.concat([ reshaped_mask, tf.zeros_like(reshaped_mask, dtype=tf.int32) ], axis=-1) dilated_ids = tf.reshape(concat_ids, [batch_size, max_seq_length * 2]) dilated_mask = tf.reshape(concat_mask, [batch_size, max_seq_length * 2]) return dilated_ids, dilated_mask for _ in range(loop): dilated_ids, dilated_mask = _forward( dilated_ids, dilated_mask) self.preds['LM'] = dilated_ids
def __init__(self, vocab_size, is_training, input_ids, input_mask, segment_ids, sample_weight=None, reduced_size=64, topic_size=1024, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, bias=0, scope='vae', trainable=True, **kwargs):
    super().__init__()

    # freeze parameters
    config = Config(vocab_size,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):
            (self.embedding_output, self.embedding_table) = self.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                batch_size=batch_size,
                max_seq_length=seq_length,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                tilda_embeddings=tilda_embeddings,
                trainable=trainable)

            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                hidden_size=config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope('encoder'):
            # stacked transformer
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, seq_length)
            self.all_encoder_layers = self.transformer_model(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=util.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                trainable=trainable)

        # projection
        with tf.variable_scope('projection'):
            transformer_output = tf.layers.dense(
                self.all_encoder_layers[-1], reduced_size,
                activation=util.gelu,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=config.initializer_range),
                trainable=trainable)
            transformer_output = tf.reshape(transformer_output, [batch_size, -1])

            input_length = tf.reduce_sum(input_mask, axis=-1)
            input_length = tf.cast(input_length, tf.float32)
            input_length_1d = tf.reshape(input_length, [batch_size])
            input_length_2d = tf.reshape(input_length, [batch_size, 1])

            broadcast_mask = tf.sequence_mask(
                tf.multiply(input_length_1d, reduced_size),
                seq_length * reduced_size,
                dtype=tf.float32)
            broadcast_mask = tf.multiply(broadcast_mask, seq_length / input_length_2d)
            transformer_output *= broadcast_mask

            # latent space
            miu = tf.layers.dense(
                transformer_output, topic_size,
                activation='tanh',
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=config.initializer_range),
                name='miu',
                trainable=trainable)
            sigma = tf.layers.dense(
                transformer_output, topic_size,
                kernel_initializer=tf.truncated_normal_initializer(
                    stddev=config.initializer_range),
                name='sigma',
                trainable=trainable)
            self.probs['miu'] = miu
            self.probs['sigma'] = sigma

        with tf.variable_scope('decoder'):
            with tf.variable_scope('projection'):

                # reparameterization
                if is_training:
                    noise = tf.random_normal([batch_size, topic_size])
                else:
                    noise = tf.random_uniform(
                        [batch_size, topic_size], minval=-bias, maxval=bias)
                decoder_input = miu + tf.exp(sigma) * noise

                # projection
                decoder_input = tf.layers.dense(
                    decoder_input, seq_length * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                intermediate_input = tf.reshape(
                    decoder_input, [-1, seq_length, reduced_size])
                intermediate_input = util.layer_norm(
                    intermediate_input, trainable=trainable)
                intermediate_input = util.dropout(
                    intermediate_input, config.hidden_dropout_prob)

            # MLP
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    intermediate_input, 4 * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=util.create_initializer(config.initializer_range),
                    trainable=trainable)

            with tf.variable_scope('output'):
                decoder_output = tf.layers.dense(
                    intermediate_output, config.hidden_size,
                    kernel_initializer=util.create_initializer(config.initializer_range),
                    trainable=trainable)
                decoder_output = util.layer_norm(decoder_output, trainable=trainable)
                decoder_output = util.dropout(decoder_output, config.hidden_dropout_prob)

            self.all_decoder_layers = [intermediate_output, decoder_output]
            self.all_decoder_layers = [decoder_output]

        # reconstruction
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    decoder_output,
                    units=config.hidden_size,
                    activation=util.get_activation(config.hidden_act),
                    kernel_initializer=util.create_initializer(config.initializer_range),
                    trainable=trainable)
                input_tensor = util.layer_norm(input_tensor, trainable=trainable)

            output_weights = self.embedding_table
            output_bias = tf.get_variable(
                'output_bias',
                shape=[config.vocab_size],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            flatten_input_tensor = tf.reshape(input_tensor, [-1, config.hidden_size])
            logits = tf.matmul(flatten_input_tensor, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            logits = tf.reshape(logits, [batch_size, seq_length, config.vocab_size])

            probs = tf.nn.softmax(logits, axis=-1, name='probs')
            lm_log_probs = tf.nn.log_softmax(logits, axis=-1)
            self.preds['preds'] = tf.argmax(probs, axis=-1)

            one_hot_labels = tf.one_hot(
                input_ids, depth=config.vocab_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(lm_log_probs * one_hot_labels, axis=[-1])
            if sample_weight is not None:
                per_example_loss *= tf.expand_dims(sample_weight, axis=-1)

            self.total_loss = (
                tf.reduce_mean(per_example_loss) +
                tf.reduce_mean(tf.square(miu)) +
                tf.reduce_mean(tf.exp(sigma) - sigma - 1))
            self.losses['losses'] = per_example_loss
def _lm_forward(self, is_training, input_tensor, input_mask, label_ids, bert_config, batch_size, max_seq_length, prob, scope, name, sample_weight=None, hidden_dropout_prob=0.1, initializer_range=0.02): with tf.variable_scope(scope): with tf.variable_scope('verifier'): logits = tf.layers.dense( input_tensor, 2, kernel_initializer=util.create_initializer( bert_config.initializer_range), trainable=True) verifier_label_ids = tf.cast(tf.greater(label_ids, 0), tf.int32) # loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(verifier_label_ids, depth=2) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) input_mask = tf.cast(input_mask, tf.float32) per_token_loss *= input_mask / tf.reduce_sum( input_mask, keepdims=True, axis=-1) per_example_loss = tf.reduce_sum(per_token_loss, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) if prob != 0: self.total_loss += tf.reduce_mean(per_example_loss) verifier_loss = per_example_loss verifier_preds = tf.argmax(logits, axis=-1) with tf.variable_scope('prediction'): with tf.variable_scope('intermediate'): logits = tf.layers.dense( input_tensor, bert_config.hidden_size * 4, kernel_initializer=util.create_initializer( bert_config.initializer_range), activation=util.gelu, trainable=True) with tf.variable_scope('output'): logits = tf.layers.dense( logits, bert_config.hidden_size, kernel_initializer=util.create_initializer( bert_config.initializer_range), trainable=True) flattened = tf.reshape( logits, [batch_size * max_seq_length, bert_config.hidden_size]) logits = tf.matmul(flattened, self.embedding_table, transpose_b=True) logits = tf.reshape( logits, [-1, max_seq_length, bert_config.vocab_size]) # loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) input_mask *= tf.cast(verifier_preds, tf.float32) per_token_loss *= input_mask / ( tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6) per_example_loss = tf.reduce_sum(per_token_loss, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) if prob != 0: self.total_loss += tf.reduce_mean(per_example_loss) self.losses[name + '_loss'] = verifier_loss self.preds[name + '_preds'] = \ tf.argmax(logits, axis=-1) * verifier_preds
def __init__(self, hparams, is_training, input_ids, sample_weight=None, scope='model', given=1, use_tilda_embedding=False, **kwargs): super().__init__() batch_size = util.get_shape_list(input_ids, expected_rank=2)[0] max_seq_length = hparams.n_predict # Tilda embeddings for SMART algorithm tilda_embeddings = None if use_tilda_embedding: with tf.variable_scope('', reuse=True): tilda_embeddings = tf.get_variable('tilda_embeddings') with tf.variable_scope(scope): def _forward(input_ids, past=None): batch, sequence = shape_list(input_ids) if tilda_embeddings is None: wte = tf.get_variable( 'word_embeddings', [hparams.n_vocab, hparams.n_embed], initializer=tf.random_normal_initializer(stddev=0.02)) else: wte = tilda_embeddings wpe = tf.get_variable( 'wpe', [hparams.n_ctx, hparams.n_embed], initializer=tf.random_normal_initializer(stddev=0.01)) past_length = 0 if past is None else tf.shape(past)[-2] h = (tf.gather(wte, input_ids) + tf.gather(wpe, positions_for(input_ids, past_length))) # stacked transformer layers presents = [] pasts = tf.unstack(past, axis=1) if past is not None else \ [None] * hparams.n_layer assert len(pasts) == hparams.n_layer for layer, past in enumerate(pasts): h, present = block(h, 'h%d' % layer, past=past, hparams=hparams) presents.append(present) present = tf.stack(presents, axis=1) h = norm(h, 'ln_f') # Language model loss. Do tokens <n predict token n? h_flat = tf.reshape(h, [batch * sequence, hparams.n_embed]) logits = tf.matmul(h_flat, wte, transpose_b=True) logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab]) return logits, present # convert to labels label_ids = tf.concat( [input_ids[:, 1:], tf.zeros([batch_size, 1], dtype=tf.int32)], axis=-1) # forward once if is_training: (logits, _) = _forward(input_ids) self.preds['LM'] = tf.argmax(logits, axis=-1) # forward loop else: input_ids = input_ids[:, 0:given] for cur_length in range(given, max_seq_length + 1): (logits, _) = _forward(input_ids) pred_ids = tf.argmax(logits[:, cur_length - 1:cur_length, :], axis=-1) pred_ids = tf.cast(pred_ids, tf.int32) input_ids = tf.concat([input_ids, pred_ids], axis=-1) self.preds['LM'] = input_ids # loss log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=hparams.n_vocab) per_token_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) label_mask = tf.cast(tf.not_equal(label_ids, 0), tf.float32) per_example_loss = \ tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \ tf.reduce_sum(label_mask, axis=-1) if sample_weight is not None: per_example_loss *= tf.expand_dims(sample_weight, axis=-1) self.total_loss = tf.reduce_mean(per_example_loss) self.losses['LM'] = per_example_loss
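The `positions_for` helper called above is not shown; a hedged sketch consistent with the reference GPT-2 implementation, where position ids start at `past_length` and are tiled across the batch (the helper actually used here may differ):

# Hedged sketch of `positions_for`; tensor shapes follow the call site above.
def positions_for(tokens, past_length):
    batch_size = tf.shape(tokens)[0]
    nsteps = tf.shape(tokens)[1]
    positions = past_length + tf.range(nsteps)                       # [nsteps]
    return tf.tile(tf.expand_dims(positions, 0), [batch_size, 1])    # [batch, nsteps]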
def __init__(self, bert_config, is_training, input_tensor, input_mask, sem_features, label_ids, max_seq_length, feature_size, label_size=2, sample_weight=None, scope='cls/seq_relationship', hidden_dropout_prob=0.1, initializer_range=0.02, trainable=True, **kwargs): super().__init__(**kwargs) input_shape = util.get_shape_list(input_tensor) batch_size = input_shape[0] hidden_size = input_shape[-1] with tf.variable_scope('sem'): feature_embeddings = tf.get_variable( name='feature_embeddings', shape=[feature_size + 3, hidden_size], # for [PAD], [CLS], [SEP] initializer=util.create_initializer(initializer_range), trainable=trainable) sem_output = tf.gather(feature_embeddings, sem_features) # [B, N, H] attention_heads = [] with tf.variable_scope('self'): attention_mask = BERTEncoder.create_attention_mask_from_input_mask( input_mask, batch_size, max_seq_length) (attention_head, _) = BERTEncoder.attention_layer( from_tensor=sem_output, to_tensor=sem_output, attention_mask=attention_mask, num_attention_heads=bert_config.num_attention_heads, size_per_head=(hidden_size // bert_config.num_attention_heads), attention_probs_dropout_prob=hidden_dropout_prob if is_training else 0.0, initializer_range=initializer_range, do_return_2d_tensor=False, batch_size=batch_size, from_max_seq_length=max_seq_length, to_max_seq_length=max_seq_length, trainable=trainable) attention_heads.append(attention_head) if len(attention_heads) == 1: attention_output = attention_heads[0] else: attention_output = tf.concat(attention_heads, axis=-1) attention_output = attention_output[:, 0, :] # [B, H] input_tensor = util.layer_norm(attention_output + input_tensor, trainable=trainable) with tf.variable_scope(scope): output_weights = tf.get_variable( 'output_weights', shape=[label_size, hidden_size], initializer=util.create_initializer(initializer_range), trainable=trainable) output_bias = tf.get_variable('output_bias', shape=[label_size], initializer=tf.zeros_initializer(), trainable=trainable) output_layer = util.dropout( input_tensor, hidden_dropout_prob if is_training else 0.0) logits = tf.matmul(output_layer, output_weights, transpose_b=True) logits = tf.nn.bias_add(logits, output_bias) self.preds['preds'] = tf.argmax(logits, axis=-1) self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs') log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=label_size, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) if sample_weight is not None: per_example_loss = tf.cast(sample_weight, dtype=tf.float32) * per_example_loss thresh = kwargs.get('tsa_thresh') if thresh is not None: assert isinstance( thresh, float), ('`tsa_thresh` must be a float between 0 and 1.') uncertainty = tf.reduce_sum(self.probs['probs'] * tf.log(self.probs['probs']), axis=-1) uncertainty /= tf.log(1 / label_size) per_example_loss = tf.cast( tf.greater(uncertainty, thresh), dtype=tf.float32) * \ per_example_loss self.losses['losses'] = per_example_loss self.total_loss = tf.reduce_mean(per_example_loss)