def classification_loss(hidden, labels, n_class, initializer, scope,
                        reuse=None, return_logits=False):
    """Project `hidden` to class logits and return the cross-entropy loss.

    A dense layer named 'logit' is created (or reused, depending on `reuse`)
    inside the variable scope `scope`. Separate classification tasks should
    pass separate scope names so that each one gets its own projection
    parameters. The exception is transfer learning, where sharing the
    classification weights is the point.

    Args:
        hidden: Features fed to the dense layer; its dtype is also used for
            the one-hot targets.
        labels: Integer class ids, one-hot encoded with depth `n_class`.
        n_class: Number of output units of the dense layer.
        initializer: Kernel initializer for the dense layer.
        scope: Name of the variable scope that holds the layer.
        reuse: Forwarded unchanged to `tf.variable_scope`.
        return_logits: If true, the logits are returned alongside the loss.

    Returns:
        The negative log-likelihood summed over the last (class) axis only;
        no averaging over examples is done here. When `return_logits` is
        set, the tuple `(loss, logits)` is returned instead.
    """
    with tf.variable_scope(scope, reuse=reuse):
        logits = tf.layers.dense(
            hidden, n_class, kernel_initializer=initializer, name='logit')

        target_dist = tf.one_hot(labels, n_class, dtype=hidden.dtype)
        log_probs = tf.nn.log_softmax(logits)
        loss = -tf.reduce_sum(log_probs * target_dist, -1)

    if return_logits:
        return loss, logits
    return loss
def compute_loss(log_probs, positions, depth=seq_length):
    """Return the mean negative log-probability at the target positions.

    Args:
        log_probs: Log-probabilities whose last axis is indexed by position.
        positions: Integer target indices, one-hot encoded with `depth`
            entries as float32.
        depth: Size of the one-hot encoding. The default is taken from
            `seq_length`, a name bound outside this function and evaluated
            once when the function is defined.
            NOTE(review): `seq_length` is not visible in this part of the
            file - presumably this helper lives inside a scope that defines
            it; confirm.

    Returns:
        A scalar: the per-example negative log-likelihood (sum over the last
        axis of the one-hot-selected log-probs), averaged over all examples.
    """
    target_mask = tf.one_hot(positions, depth=depth, dtype=tf.float32)
    per_example_nll = -tf.reduce_sum(target_mask * log_probs, axis=-1)
    return tf.reduce_mean(per_example_nll)
def kl_div_loss(student_logits, teacher_logits, temperature=1):
    """Kullback-Leibler divergence of the student from the teacher.

    With P the teacher distribution and Q the student distribution,

        D_kl(P || Q) = sum(P * log(P / Q))
                     = sum(teacher * (log teacher - log student))

    Both sets of logits are divided by `temperature` before the softmax.

    Args:
        student_logits: Unnormalized student scores; classes on the last axis.
        teacher_logits: Unnormalized teacher scores; classes on the last axis.
        temperature: Divisor applied to both logits before normalization.

    Returns:
        A scalar: the divergence summed over the last axis and averaged over
        the remaining axes.
    """
    teacher_scaled = teacher_logits / temperature
    student_scaled = student_logits / temperature

    teacher_probs = tf.nn.softmax(teacher_scaled)
    # log(P / Q) written as a difference of log-softmaxes.
    log_ratio = (tf.nn.log_softmax(teacher_scaled)
                 - tf.nn.log_softmax(student_scaled))

    per_example_kl = tf.reduce_sum(teacher_probs * log_ratio, -1)
    return tf.reduce_mean(per_example_kl)
def get_race_loss(FLAGS, features, is_training):
    """Build the XLNet graph and loss for 4-way multiple-choice QA (e.g. RACE).

    Each example contributes four candidate sequences. They are laid out
    side by side along the batch axis, encoded by XLNet, scored with a
    single-unit dense layer, and the four scores of an example are
    normalized with a softmax against the gold label.

    Args:
        FLAGS: Run flags. Reads `model_config_path`, `summary_type` and
            `use_summ_proj`, and is also passed to `xlnet.create_run_config`.
        features: Dict with "input_ids", "segment_ids", "input_mask" and
            "label_ids". The first three are reshaped to
            [batch, 4, -1], so the four choices of an example must be
            contiguous.
        is_training: Passed to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`: the batch-mean loss, the
        loss per example, and the scores reshaped to [batch, 4].
    """
    n_choice = 4
    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _to_length_major(feature):
        # [batch, 4, len] -> [len, batch, 4] -> [len, batch * 4]
        grouped = tf.reshape(feature, [bsz_per_core, n_choice, -1])
        length_first = tf.transpose(grouped, [2, 0, 1])
        return tf.reshape(length_first, [-1, bsz_per_core * n_choice])

    inp, seg_id, inp_mask = [
        _to_length_major(features[key])
        for key in ("input_ids", "segment_ids", "input_mask")
    ]
    label = tf.reshape(features["label_ids"], [bsz_per_core])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)

    summary = xlnet_model.get_pooled_out(
        FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        # One score per (example, choice) pair, regrouped per example.
        flat_scores = tf.layers.dense(
            summary, 1, kernel_initializer=xlnet_model.get_initializer())
        logits = tf.reshape(flat_scores, [bsz_per_core, n_choice])

        gold = tf.one_hot(label, n_choice)
        choice_log_probs = tf.nn.log_softmax(logits)
        per_example_loss = -tf.reduce_sum(choice_log_probs * gold, -1)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits
def get_decomposed_qa_outputs(FLAGS, features, is_training):
    """Build span and answer-class outputs on the decomposed transformer.

    Question and context ids are fed separately to
    `transformer_xl_decomposed`; the last element of its result is used as
    the sequence representation for a start/end span head and an
    answer-class head.

    Args:
        FLAGS: Run flags. Reads `max_seq_length`, `max_first_length`,
            `model_config_path`, `sep_layer`, `dropout` and `num_classes`,
            and is also passed to `xlnet.create_run_config`.
        features: Dict with "question_ids" and "context_ids", both
            batch-major (they are transposed with [1, 0] before use).
        is_training: Python bool; controls dropout and which log-prob
            entries are added to the result.

    Returns:
        A dict with "upper_outputs", "start_logits", "end_logits" and
        "cls_logits". When `is_training` is true it additionally holds
        "start_log_probs", "end_log_probs" and "cls_log_probs".
    """
    q_tokens = features["question_ids"]
    ctx_tokens = features["context_ids"]

    total_len = FLAGS.max_seq_length
    # The question segment is max_first_length plus two extra positions.
    q_len = FLAGS.max_first_length + 2
    ctx_len = total_len - q_len

    # Index used to pick the CLS representation: the number of non-zero
    # question ids, offset by the context length (context comes first in the
    # concatenated sequence below).
    # NOTE(review): if every question id in a row is non-zero and the
    # question width equals q_len, this index equals total_len and the
    # one-hot row below is all zeros - confirm inputs rule that out.
    q_nonzero = tf.cast(tf.cast(q_tokens, tf.bool), tf.int32)
    cls_position = tf.reshape(
        tf.reduce_sum(q_nonzero, axis=1) + ctx_len, [-1])

    # Switch both inputs to length-major layout.
    q_tokens = tf.transpose(q_tokens, [1, 0])
    ctx_tokens = tf.transpose(ctx_tokens, [1, 0])

    q_attn_mask = get_attention_mask(q_tokens, q_len)
    c_attn_mask = get_attention_mask(ctx_tokens, ctx_len)
    joint_tokens = tf.concat([ctx_tokens, q_tokens], axis=0)
    qc_attn_mask = get_attention_mask(joint_tokens, total_len)

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    initializer = xlnet._get_initializer(run_config)

    # mem_len, reuse_len, bi_data and same_length from the run config are
    # deliberately not forwarded to the decomposed transformer.
    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        upper_outputs = transformer_xl_decomposed(
            n_token=xlnet_config.n_token,
            initializer=initializer,
            attn_type="bi",
            n_layer=xlnet_config.n_layer,
            d_model=xlnet_config.d_model,
            n_head=xlnet_config.n_head,
            d_head=xlnet_config.d_head,
            d_inner=xlnet_config.d_inner,
            ff_activation=xlnet_config.ff_activation,
            untie_r=xlnet_config.untie_r,
            is_training=run_config.is_training,
            use_bfloat16=run_config.use_bfloat16,
            use_tpu=run_config.use_tpu,
            dropout=run_config.dropout,
            dropatt=run_config.dropatt,
            clamp_len=run_config.clamp_len,
            ctx_ids=ctx_tokens,
            q_ids=q_tokens,
            q_seq_len=q_len,
            ctx_seq_len=ctx_len,
            sep_layer=FLAGS.sep_layer,
            q_attn_mask=q_attn_mask,
            c_attn_mask=c_attn_mask,
            qc_attn_mask=qc_attn_mask,
        )
    output = upper_outputs[-1]

    return_dict = {'upper_outputs': upper_outputs}

    with tf.variable_scope("logits"):
        # span_scores: seq, batch_size, 2
        span_scores = tf.layers.dense(
            output, 2, kernel_initializer=initializer)
        # -> 2, batch_size, seq
        span_scores = tf.transpose(span_scores, [2, 1, 0])
        # start_logits / end_logits: batch_size, seq
        start_logits, end_logits = tf.unstack(span_scores, axis=0)

        # Unlike the non-decomposed head, no position mask is applied
        # before normalizing.
        start_log_probs = tf.nn.log_softmax(start_logits, -1)
        end_log_probs = tf.nn.log_softmax(end_logits, -1)

    return_dict["start_logits"] = start_logits
    return_dict["end_logits"] = end_logits
    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs

    # An additional layer to predict the answer class, 0: span, 1: yes, 2: no.
    with tf.variable_scope("answer_class"):
        # Select the CLS representation with a one-hot over positions.
        cls_selector = tf.one_hot(
            cls_position, total_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_selector)

        pooled = tf.layers.dense(
            cls_feature,
            xlnet_config.d_model,
            activation=tf.tanh,
            kernel_initializer=initializer,
            name='pooler')
        pooled = tf.layers.dropout(
            pooled, FLAGS.dropout, training=is_training)

        # hotpot has 3 classes, squad 2.0 has 2 classes.
        cls_logits = tf.layers.dense(
            pooled,
            FLAGS.num_classes,
            kernel_initializer=initializer,
            name="cls")
        cls_log_probs = tf.nn.log_softmax(cls_logits, -1)

    return_dict["cls_logits"] = cls_logits
    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs

    return return_dict
def get_qa_outputs(FLAGS, features, is_training):
    """Build span and answer-class outputs for span-extraction QA (e.g. SQuAD).

    Runs XLNet over the input ids and adds two heads on the sequence output:
    a start/end span head whose logits are masked by the segment ids, and an
    answer-class head fed from the CLS position.

    Args:
        FLAGS: Run flags. Reads `model_config_path`, `dropout` and
            `num_classes`, and is also passed to `xlnet.create_run_config`.
        features: Dict with batch-major "input_ids" and "segment_ids" (both
            are transposed with [1, 0] before being fed to the model).
            Zero-valued input ids are treated as padding; positions with a
            non-zero segment id are excluded from span prediction.
        is_training: Python bool; controls dropout and which entries are
            returned.

    Returns:
        A dict. When `is_training` is true: "start_log_probs",
        "end_log_probs", "cls_log_probs" and "cls_logits". Otherwise:
        "start_logits", "end_logits" and "cls_logits".
    """
    token_ids = features["input_ids"]
    segment_ids = features["segment_ids"]

    nonzero_int = tf.cast(tf.cast(token_ids, tf.bool), tf.int32)
    # Index used to pick the CLS representation: count of non-zero ids.
    cls_position = tf.reshape(tf.reduce_sum(nonzero_int, axis=1), [-1])
    # 1.0 where the segment id is non-zero; those positions get masked out.
    p_mask = tf.cast(tf.cast(segment_ids, tf.bool), tf.float32)

    # Switch to length-major layout for the model.
    token_ids = tf.transpose(token_ids, [1, 0])
    # 1.0 marks padding (zero ids) in the model's input mask.
    pad_mask = tf.transpose(1 - tf.cast(nonzero_int, tf.float32), [1, 0])
    segment_ids = tf.transpose(segment_ids, [1, 0])
    seq_len = tf.shape(token_ids)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=pad_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    def _masked_log_softmax(scores):
        # Push masked positions to a very large negative value so they get
        # (numerically) zero probability.
        masked = scores * (1 - p_mask) - 1e30 * p_mask
        return tf.nn.log_softmax(masked, -1)

    return_dict = {}

    with tf.variable_scope("logits"):
        # span_scores: seq, batch_size, 2
        span_scores = tf.layers.dense(
            output, 2, kernel_initializer=initializer)
        # -> 2, batch_size, seq
        span_scores = tf.transpose(span_scores, [2, 1, 0])
        # start_logits / end_logits: batch_size, seq
        start_logits, end_logits = tf.unstack(span_scores, axis=0)

        start_log_probs = _masked_log_softmax(start_logits)
        end_log_probs = _masked_log_softmax(end_logits)

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_logits"] = start_logits
        return_dict["end_logits"] = end_logits

    # An additional layer to predict the answer class, 0: span, 1: yes, 2: no.
    with tf.variable_scope("answer_class"):
        # Select the CLS representation with a one-hot over positions.
        cls_selector = tf.one_hot(
            cls_position, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_selector)

        pooled = tf.layers.dense(
            cls_feature,
            xlnet_config.d_model,
            activation=tf.tanh,
            kernel_initializer=initializer,
            name='pooler')
        pooled = tf.layers.dropout(
            pooled, FLAGS.dropout, training=is_training)

        # hotpot has 3 classes, squad 2.0 has 2 classes.
        cls_logits = tf.layers.dense(
            pooled,
            FLAGS.num_classes,
            kernel_initializer=initializer,
            name="cls")
        cls_log_probs = tf.nn.log_softmax(cls_logits, -1)

    if is_training:
        return_dict["cls_log_probs"] = cls_log_probs
    return_dict["cls_logits"] = cls_logits

    return return_dict