def _create_model(self, input_ids, input_masks, segment_ids, label_ids, label_list, mode): """Creates XLNet-NER model""" model = xlnet.XLNetModel(xlnet_config=self.model_config, run_config=xlnet.create_run_config( mode == tf.estimator.ModeKeys.TRAIN, True, FLAGS), input_ids=tf.transpose(input_ids, perm=[1, 0]), input_mask=tf.transpose(input_masks, perm=[1, 0]), seg_ids=tf.transpose(segment_ids, perm=[1, 0])) initializer = model.get_initializer() with tf.variable_scope("ner", reuse=tf.AUTO_REUSE): result = tf.transpose(model.get_sequence_output(), perm=[1, 0, 2]) result_mask = tf.cast(tf.expand_dims(1 - input_masks, axis=-1), dtype=tf.float32) dense_layer = tf.keras.layers.Dense( units=len(label_list), activation=None, use_bias=True, kernel_initializer=initializer, bias_initializer=tf.zeros_initializer, kernel_regularizer=None, bias_regularizer=None, trainable=True) dropout_layer = tf.keras.layers.Dropout( rate=0.1, seed=np.random.randint(10000)) result = dense_layer(result) if mode == tf.estimator.ModeKeys.TRAIN: result = dropout_layer(result) masked_predict = result * result_mask + MIN_FLOAT * (1 - result_mask) predict_ids = tf.cast(tf.argmax(tf.nn.softmax(masked_predict, axis=-1), axis=-1), dtype=tf.int32) loss = tf.constant(0.0, dtype=tf.float32) if mode in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL ] and label_ids is not None: with tf.variable_scope("loss", reuse=tf.AUTO_REUSE): label = tf.cast(label_ids, dtype=tf.float32) label_mask = tf.cast(1 - input_masks, dtype=tf.float32) masked_label = tf.cast(label * label_mask, dtype=tf.int32) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=masked_label, logits=masked_predict) loss = tf.reduce_sum( cross_entropy * label_mask) / tf.reduce_sum( tf.reduce_max(label_mask, axis=-1)) return loss, predict_ids
def __init__(self, FLAGS=FLAGS, n_class=2, is_training=False): import xlnet, modeling import tensorflow as tf init_log() logging.info("Init semantic model ...") self.sp = spm.SentencePieceProcessor() self.sp.Load(FLAGS.spiece_model_file) tf.logging.set_verbosity(tf.logging.INFO) tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32 self.input_ids = tf.placeholder(dtype=tf.int64, shape=[None, None], name="input_ids") self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="segment_ids") self.input_mask = tf.placeholder(dtype=tf_float, shape=[None, None], name="input_mask") self.label_ids = tf.placeholder(dtype=tf.int64, shape=[None], name="label_ids") bsz_per_core = tf.shape(self.input_ids)[0] inp = tf.transpose(self.input_ids, [1, 0]) seg_id = tf.transpose(self.segment_ids, [1, 0]) inp_mask = tf.transpose(self.input_mask, [1, 0]) label = tf.reshape(self.label_ids, [bsz_per_core]) self.sess = tf.Session() xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) self.summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): if FLAGS.cls_scope is not None and FLAGS.cls_scope: cls_scope = "classification_{}".format(FLAGS.cls_scope) else: cls_scope = "classification_{}".format(FLAGS.task_name.lower()) per_example_loss, logits = modeling.classification_loss( hidden=self.summary, labels=label, n_class=n_class, initializer=xlnet_model.get_initializer(), scope=cls_scope, return_logits=True) total_loss = tf.reduce_mean(per_example_loss) num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) xlnet_model.saver.restore(self.sess, FLAGS.init_checkpoint) #### load pretrained models #scaffold_fn = model_utils.init_from_checkpoint(FLAGS) logging.info("Init semantic model finished ...")
def get_classification_outputs(FLAGS, features, is_training): """Loss for downstream classification tasks.""" input_ids = features["input_ids"] seg_id = features["segment_ids"] input_mask_int = tf.cast(tf.cast(input_ids, tf.bool), tf.int32) input_mask = 1 - tf.cast(input_mask_int, tf.float32) num_choices = FLAGS.num_choices batch_size = tf.shape(features["input_ids"])[0] def _transform_features(feature): out = tf.reshape(feature, [batch_size, num_choices, -1]) out = tf.transpose(out, [2, 0, 1]) out = tf.reshape(out, [-1, batch_size * num_choices]) return out if num_choices: input_ids = _transform_features(input_ids) seg_id = _transform_features(seg_id) input_mask = _transform_features(input_mask) else: input_ids = tf.transpose(input_ids, [1, 0]) seg_id = tf.transpose(seg_id, [1, 0]) input_mask = tf.transpose(input_mask, [1, 0]) xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel( xlnet_config=xlnet_config, run_config=run_config, input_ids=input_ids, seg_ids=seg_id, input_mask=input_mask) summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) initializer = xlnet_model.get_initializer() return_dict = {} with tf.variable_scope("model", reuse=tf.AUTO_REUSE): with tf.variable_scope("answer_class"): # race has 4 classes, # boolq has 2 classes if num_choices: num_classes = 1 else: num_classes = FLAGS.num_classes cls_logits = tf.layers.dense(summary, num_classes, kernel_initializer=initializer, name="cls") if num_choices: cls_logits = tf.reshape(cls_logits, [batch_size, num_choices]) cls_log_probs = tf.nn.log_softmax(cls_logits, -1) if is_training: return_dict["cls_log_probs"] = cls_log_probs return_dict["cls_logits"] = cls_logits return return_dict
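# Sanity check (illustrative, plain NumPy, independent of the graph code
# above): _transform_features turns the packed [batch, num_choices * seq]
# input into the length-major [seq, batch * num_choices] layout that
# XLNetModel expects.
import numpy as np

batch_size, num_choices, seq = 2, 4, 3
feature = np.arange(batch_size * num_choices * seq).reshape(
    batch_size, num_choices * seq)
out = feature.reshape(batch_size, num_choices, -1)   # [B, C, S]
out = out.transpose(2, 0, 1)                         # [S, B, C]
out = out.reshape(-1, batch_size * num_choices)      # [S, B * C]
print(out.shape)  # (3, 8)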
def xlnet_layer(self): xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.xlnet_config) run_config = xlnet.create_run_config(self.is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=self.input_ids, seg_ids=self.segment_ids, input_mask=self.input_mask) self.embedded = xlnet_model.get_sequence_output() self.model_inputs = tf.nn.dropout(self.embedded, self.dropout)
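# The attributes consumed by xlnet_layer (self.input_ids, self.segment_ids,
# self.input_mask, self.dropout) are assumed to be created elsewhere in the
# class; a minimal sketch of that setup, already transposed to the
# [seq_len, batch_size] layout XLNetModel expects (names and shapes here are
# illustrative, not part of the original code):
def build_placeholders(self):
    input_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length], name="input_ids")
    segment_ids = tf.placeholder(tf.int32, [None, FLAGS.max_seq_length], name="segment_ids")
    input_mask = tf.placeholder(tf.float32, [None, FLAGS.max_seq_length], name="input_mask")
    self.input_ids = tf.transpose(input_ids, [1, 0])
    self.segment_ids = tf.transpose(segment_ids, [1, 0])
    self.input_mask = tf.transpose(input_mask, [1, 0])
    # keep_prob fed to tf.nn.dropout in xlnet_layer
    self.dropout = tf.placeholder(tf.float32, name="dropout_keep_prob")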
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" unique_ids = features["unique_ids"] inp = tf.transpose(features["input_ids"], [1, 0]) seg_id = tf.transpose(features["segment_ids"], [1, 0]) inp_mask = tf.transpose(features["input_mask"], [1, 0]) xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) # no need for dropout in prediction mode xlnet_config.dropout = 0.0 xlnet_config.dropatt = 0.0 run_config = xlnet.create_run_config(False, True, FLAGS) # no need for dropout in prediction mode run_config.dropout = 0.0 run_config.dropatt = 0.0 xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) # Check model parameters num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) # load pretrained models scaffold_fn = init_from_checkpoint(FLAGS) # Get a sequence output seq_out = xlnet_model.get_sequence_output() tokens = tf.transpose(seq_out, [1, 0, 2]) predictions = { "unique_id": unique_ids, 'tokens': tokens, 'input_mask': tf.transpose(inp_mask, [1, 0]) } if FLAGS.use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) return output_spec
def get_multi_task_weight_loss( FLAGS, features, is_training): bsz_per_core = tf.shape(features["input_ids"])[0] inp = tf.transpose(features["input_ids"], [1, 0]) seg_id = tf.transpose(features["segment_ids"], [1, 0]) inp_mask = tf.transpose(features["input_mask"], [1, 0]) label = tf.reshape(features["label_ids"], [bsz_per_core, 7]) target = label[:, 0] aux = label[:, 1:] weight = tf.reshape(features["weight"], [bsz_per_core]) xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel( xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): with tf.variable_scope("regression_label"): target_logits = tf.layers.dense( summary, 1, kernel_initializer=xlnet_model.get_initializer(), name='target') target_logits = tf.squeeze(target_logits) target_loss = tf.losses.sigmoid_cross_entropy(target, target_logits, weights=weight) with tf.variable_scope("regression_aux"): aux_logits = tf.layers.dense( summary, 6, kernel_initializer=xlnet_model.get_initializer(), name='aux' ) aux_loss = tf.losses.sigmoid_cross_entropy(aux, aux_logits) per_example_loss = target_loss + aux_loss total_loss = tf.reduce_mean(per_example_loss) return total_loss, per_example_loss, target_logits
def get_decomposed_classification_outputs(FLAGS, features, is_training): seq1_ids = features["seq1_ids"] seq2_ids = features["seq2_ids"] seq_len = FLAGS.max_seq_length first_seq_len = FLAGS.max_first_length + 2 second_seq_len = seq_len - first_seq_len seq1_attn_mask = get_attention_mask(seq1_ids, first_seq_len) seq2_attn_mask = get_attention_mask(seq2_ids, second_seq_len) seq_attn_mask = get_attention_mask(tf.concat([seq2_ids, seq1_ids], axis=0), seq_len) xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) initializer = xlnet._get_initializer(run_config) tfm_args = dict( n_token=xlnet_config.n_token, initializer=initializer, attn_type="bi", n_layer=xlnet_config.n_layer, d_model=xlnet_config.d_model, n_head=xlnet_config.n_head, d_head=xlnet_config.d_head, d_inner=xlnet_config.d_inner, ff_activation=xlnet_config.ff_activation, untie_r=xlnet_config.untie_r, is_training=run_config.is_training, use_bfloat16=run_config.use_bfloat16, use_tpu=run_config.use_tpu, dropout=run_config.dropout, dropatt=run_config.dropatt, # mem_len=run_config.mem_len, # reuse_len=run_config.reuse_len, # bi_data=run_config.bi_data, clamp_len=run_config.clamp_len, # same_length=run_config.same_length, ctx_ids=seq2_ids, q_ids=seq1_ids, q_seq_len=first_seq_len, ctx_seq_len=second_seq_len, sep_layer=FLAGS.sep_layer, q_attn_mask=seq1_attn_mask, c_attn_mask=seq2_attn_mask, qc_attn_mask=seq_attn_mask, ) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): upper_outputs = transformer_xl_decomposed(**tfm_args)
def main(_): tf.logging.set_verbosity(tf.logging.INFO) tpu_config = model_utils.configure_tpu(FLAGS) model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(False, True, FLAGS) model_builder = XLNetModelBuilder( default_model_config=model_config, default_run_config=run_config, default_init_checkpoint=FLAGS.init_checkpoint, use_tpu=FLAGS.use_tpu) model_fn = model_builder.get_model_fn(model_config, run_config, FLAGS.init_checkpoint, FLAGS.model_type) # If TPU is not available, this will fall back to normal Estimator on CPU or GPU. estimator = tf.contrib.tpu.TPUEstimator(use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=tpu_config, export_to_tpu=FLAGS.use_tpu, train_batch_size=1) tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file, lower_case=FLAGS.lower_case) example_converter = XLNetExampleConverter( label_list=[], max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) features = example_converter.convert_examples_to_features( [PaddingInputExample()]) input_fn = XLNetInputBuilder.get_input_builder(features, FLAGS.max_seq_length, True, False) estimator.train(input_fn, max_steps=1) tf.gfile.MakeDirs(FLAGS.export_dir) serving_input_fn = XLNetInputBuilder.get_serving_input_fn( FLAGS.max_seq_length) estimator.export_savedmodel(FLAGS.export_dir, serving_input_fn, as_text=False)
def get_classification_loss(FLAGS, features, n_class, is_training): """Loss for downstream classification tasks.""" bsz_per_core = tf.shape(features['input_ids'])[0] inp = tf.transpose(features['input_ids'], [1, 0]) seg_id = tf.transpose(features['segment_ids'], [1, 0]) inp_mask = tf.transpose(features['input_mask'], [1, 0]) label = tf.reshape(features['label_ids'], [bsz_per_core]) xlnet_config = xlnet.XLNetConfig(json_path = FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel( xlnet_config = xlnet_config, run_config = run_config, input_ids = inp, seg_ids = seg_id, input_mask = inp_mask, ) summary = xlnet_model.get_pooled_out( FLAGS.summary_type, FLAGS.use_summ_proj ) with tf.variable_scope('model', reuse = tf.AUTO_REUSE): if FLAGS.cls_scope is not None and FLAGS.cls_scope: cls_scope = 'classification_{}'.format(FLAGS.cls_scope) else: cls_scope = 'classification_{}'.format(FLAGS.task_name.lower()) per_example_loss, logits = modeling.classification_loss( hidden = summary, labels = label, n_class = n_class, initializer = xlnet_model.get_initializer(), scope = cls_scope, return_logits = True, ) total_loss = tf.reduce_mean(per_example_loss) return total_loss, per_example_loss, logits
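# A minimal sketch of wiring get_classification_loss into an Estimator
# model_fn. model_utils.get_train_op is the helper used elsewhere in this
# code base; the n_class value here is illustrative.
def classification_model_fn(features, labels, mode, params):
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    total_loss, per_example_loss, logits = get_classification_loss(
        FLAGS, features, n_class=2, is_training=is_training)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={"logits": logits})
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=total_loss, train_op=train_op)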
def get_ner_loss(FLAGS, features, is_training, num_labels):
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label_id = tf.transpose(features["label_ids"], [1, 0])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    output_layer = xlnet_model.get_sequence_output()

    def hidden2tag(hiddenlayer, numclass):
        linear = tf.keras.layers.Dense(numclass, activation=None)
        return linear(hiddenlayer)

    def softmax_layer(logits, labels, num_labels, mask):
        logits = tf.reshape(logits, [-1, num_labels])
        labels = tf.reshape(labels, [-1])
        mask = tf.cast(mask, dtype=tf.float32)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        # Per-token losses (Reduction.NONE) so that the mask weighting below
        # is applied element-wise rather than to an already-reduced scalar.
        loss = tf.losses.softmax_cross_entropy(
            logits=logits,
            onehot_labels=one_hot_labels,
            reduction=tf.losses.Reduction.NONE)
        loss *= tf.reshape(mask, [-1])
        loss = tf.reduce_sum(loss)
        total_size = tf.reduce_sum(mask)
        total_size += 1e-12  # to avoid division by 0 for all-0 weights
        loss /= total_size
        # Predictions are not masked here; padded positions can be filtered
        # out at prediction time.
        probabilities = tf.math.softmax(logits, axis=-1)
        predict = tf.math.argmax(probabilities, axis=-1)
        return loss, predict

    if is_training:
        output_layer = tf.keras.layers.Dropout(rate=0.1)(output_layer)
    logits = hidden2tag(output_layer, num_labels)
    # 128 is the hard-coded max sequence length.
    logits = tf.reshape(logits, [-1, 128, num_labels])
    loss, predict = softmax_layer(logits, label_id, num_labels, inp_mask)
    return loss, logits, predict

def __init__(self, ckpt_num=156000, is_training=False): #init_log() self.logs = {} batch_size = 1 logging.info("Init query weight model ...") self.sp = Tokenizer() self.lm = language_model() self.xgb_model = xgb.Booster(model_file=conf.rank_model) #self.xgb_dict = parse_xgb_dict(conf.rank_model + '.txt') tf.logging.set_verbosity(tf.logging.INFO) tf_float = tf.bfloat16 if FLAGS.use_bfloat16 else tf.float32 self.input_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size, FLAGS.seq_len], name="input_ids") self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[batch_size, FLAGS.seq_len], name="segment_ids") self.input_mask = tf.placeholder(dtype=tf_float, shape=[batch_size, FLAGS.seq_len], name="input_mask") self.label_ids = tf.placeholder(dtype=tf.int64, shape=[batch_size], name="label_ids") inp = tf.transpose(self.input_ids, [1, 0]) seg_id = tf.transpose(self.segment_ids, [1, 0]) inp_mask = tf.transpose(self.input_mask, [1, 0]) self.sess = tf.Session() xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) self.output, self.attn_prob, self.attention_out = xlnet_model.output_encode, xlnet_model.attn_prob, xlnet_model.attention_out num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) tf.logging.info('#params: {}'.format(num_params)) xlnet_model.saver.restore( self.sess, FLAGS.init_checkpoint + "/model.ckpt-" + str(ckpt_num)) #### load pretrained models # scaffold_fn = model_utils.init_from_checkpoint(FLAGS) logging.info("Init query weight model finished ...")
def get_race_loss(FLAGS, features, is_training): """Loss for downstream multi-choice QA tasks such as RACE.""" bsz_per_core = tf.shape(features['input_ids'])[0] def _transform_features(feature): out = tf.reshape(feature, [bsz_per_core, 4, -1]) out = tf.transpose(out, [2, 0, 1]) out = tf.reshape(out, [-1, bsz_per_core * 4]) return out inp = _transform_features(features['input_ids']) seg_id = _transform_features(features['segment_ids']) inp_mask = _transform_features(features['input_mask']) label = tf.reshape(features['label_ids'], [bsz_per_core]) xlnet_config = xlnet.XLNetConfig(json_path = FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel( xlnet_config = xlnet_config, run_config = run_config, input_ids = inp, seg_ids = seg_id, input_mask = inp_mask, ) summary = xlnet_model.get_pooled_out( FLAGS.summary_type, FLAGS.use_summ_proj ) with tf.variable_scope('logits'): logits = tf.layers.dense( summary, 1, kernel_initializer = xlnet_model.get_initializer() ) logits = tf.reshape(logits, [bsz_per_core, 4]) one_hot_target = tf.one_hot(label, 4) per_example_loss = -tf.reduce_sum( tf.nn.log_softmax(logits) * one_hot_target, -1 ) total_loss = tf.reduce_mean(per_example_loss) return total_loss, per_example_loss, logits
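# Sketch: turning the RACE logits into a predicted choice and a streaming
# accuracy metric (evaluation-time usage; `features` and FLAGS follow the
# function above, the rest is illustrative).
total_loss, per_example_loss, logits = get_race_loss(
    FLAGS, features, is_training=False)
predicted_choice = tf.argmax(logits, axis=-1, output_type=tf.int32)
label = tf.cast(tf.reshape(features['label_ids'], [-1]), tf.int32)
accuracy = tf.metrics.accuracy(labels=label, predictions=predicted_choice)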
def attention_net(input_x, input_val, is_training=True, scope='AttenNet', config=AttentionConfig()): debug_info = {} with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): bsz = tf.shape(input_x)[0] qlen = tf.shape(input_x)[1] segment_ids = tf.zeros([bsz, qlen], dtype=tf.int32) used = tf.ones([bsz, qlen], dtype=tf.int32) #tf.sign(tf.abs(input_x)) length = tf.reduce_sum(used, reduction_indices=1) lengths = tf.cast(length, tf.int32) # attention mask def cond(i, _length, _output): return tf.less(i, tf.shape(_length)[0]) def body(i, _length, _output): return [i + 1, _length, _output.write(i, tf.concat([tf.zeros([_length[i]]), tf.ones(qlen - _length[i])], axis=-1))] Out = tf.TensorArray(size=0, dtype=tf.float32, dynamic_size=True, clear_after_read=False) res = tf.while_loop(cond, body, [0, lengths, Out]) input_mask = tf.convert_to_tensor(res[-1].stack()) #_input_mask_ = tf.equal(0, input_x) #input_mask_ = tf.cast(_input_mask_, tf.float32) inp = tf.transpose(input_x, [1, 0]) seg_id = tf.transpose(segment_ids, [1, 0]) inp_mask = tf.transpose(input_mask, [1, 0]) # XLNetConfig contains hyperparameters that are specific to a model checkpoint. xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) # RunConfig contains hyperparameters that could be different between pretraining and finetuning. run_config = xlnet.create_run_config(is_training=is_training, is_finetune=True, FLAGS=FLAGS) # Construct an XLNet model xlnet_model = xlnet.XLNetModel( xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, input_vals=input_val, seg_ids=seg_id, input_mask=inp_mask) # Get a summary of the sequence using the last hidden state summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) # Get a sequence output seq_out_ = xlnet_model.get_sequence_output() seq_out = tf.transpose(seq_out_, [1, 0, 2]) debug_info['input_x']=input_x;debug_info['segment_ids']=segment_ids;debug_info['input_mask']=input_mask debug_info['summary']=summary; #debug_info['input_mask_']=input_mask_ debug_info['lengths']=lengths return summary, debug_info
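# Note: the while_loop above builds, for each example, a row that is 0 for the
# first `length` positions and 1 for the remaining (padded) positions. A more
# compact equivalent (sketch with toy values) is tf.sequence_mask:
lengths = tf.constant([2, 4])  # true lengths
qlen = 5                       # padded length
input_mask = 1.0 - tf.sequence_mask(lengths, maxlen=qlen, dtype=tf.float32)
# -> [[0., 0., 1., 1., 1.],
#     [0., 0., 0., 0., 1.]]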
def create_model(FLAGS, input_ids, input_mask, segment_ids, labels, is_training=True): bsz_per_core = tf.shape(input_ids)[0] inp = tf.transpose(input_ids, [1, 0]) seg_id = tf.transpose(segment_ids, [1, 0]) inp_mask = tf.transpose(input_mask, [1, 0]) label = tf.reshape(labels, [bsz_per_core]) xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): if FLAGS.cls_scope is not None and FLAGS.cls_scope: cls_scope = "classification_{}".format(FLAGS.cls_scope) else: cls_scope = "classification_{}".format(FLAGS.task_name.lower()) per_example_loss, logits = modeling.classification_loss( hidden=summary, labels=label, n_class=FLAGS.num_labels, initializer=xlnet_model.get_initializer(), scope=cls_scope, return_logits=True) total_loss = tf.reduce_mean(per_example_loss) return total_loss, per_example_loss, logits
def get_jigsaw_loss(FLAGS, features, n_output, is_training): """Loss for jigsaw task.""" inp = tf.transpose(features["input_ids"], [1, 0]) seg_id = tf.transpose(features["segment_ids"], [1, 0]) inp_mask = tf.transpose(features["input_mask"], [1, 0]) label = features["label_ids"] xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): if FLAGS.cls_scope is not None and FLAGS.cls_scope: cls_scope = "classification_{}".format(FLAGS.cls_scope) else: cls_scope = "classification_{}".format(FLAGS.task_name.lower()) per_example_loss, logits = jigsaw_loss( hidden=summary, labels=label, n_output=n_output, initializer=xlnet_model.get_initializer(), scope=cls_scope, return_logits=True) total_loss = tf.reduce_mean(per_example_loss) return total_loss, per_example_loss, logits
def create_model(FLAGS, features, is_training, num_labels):
    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    inp = features["input_ids"]
    seg_id = features["segment_ids"]
    inp_mask = features["input_mask"]
    label_list = features["label_list"]

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    output = xlnet_model.get_sequence_output()

    with tf.variable_scope("loss"):
        if is_training:
            output = tf.nn.dropout(output, keep_prob=0.9)
        logits = tf.layers.dense(output, num_labels, activation=None)
        logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])

        log_prob = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(label_list, depth=num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_prob, axis=-1)

        # input_mask is 1 on padding, so (1 - mask) keeps only real tokens.
        valid_mask = 1.0 - tf.cast(inp_mask, dtype=tf.float32)
        per_example_loss *= valid_mask
        loss = tf.reduce_mean(per_example_loss)
        return loss, per_example_loss, logits
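# Hypothetical helper (not part of the original code): converting the logits
# returned above into per-token label predictions while zeroing out padded
# positions. In this code base input_mask is 1 on padding, so (1 - mask)
# selects real tokens.
def predict_ids_from_logits(logits, input_mask):
    pred = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [batch, seq_len]
    valid = tf.cast(1 - input_mask, tf.int32)                # 1 for real tokens
    return pred * valid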
def main(_): tf.logging.set_verbosity(tf.logging.INFO) processor = NerProcessor(data_dir=FLAGS.data_dir, task_name=FLAGS.task_name.lower()) label_list = processor.get_labels() tpu_config = model_utils.configure_tpu(FLAGS) model_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(False, True, FLAGS) model_builder = XLNetModelBuilder( default_model_config=model_config, default_run_config=run_config, default_init_checkpoint=FLAGS.init_checkpoint, use_tpu=FLAGS.use_tpu) model_fn = model_builder.get_model_fn(model_config, run_config, FLAGS.init_checkpoint, label_list) # If TPU is not available, this will fall back to normal Estimator on CPU or GPU. estimator = tf.contrib.tpu.TPUEstimator( use_tpu=FLAGS.use_tpu, model_fn=model_fn, config=tpu_config, export_to_tpu=FLAGS.use_tpu, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) tokenizer = XLNetTokenizer(sp_model_file=FLAGS.spiece_model_file, lower_case=FLAGS.lower_case) example_converter = XLNetExampleConverter( label_list=label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) if FLAGS.do_train: train_examples = processor.get_train_examples() tf.logging.info("***** Run training *****") tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", FLAGS.train_steps) train_features = example_converter.convert_examples_to_features( train_examples) train_input_fn = XLNetInputBuilder.get_input_builder( train_features, FLAGS.max_seq_length, True, True) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps) if FLAGS.do_eval: eval_examples = processor.get_dev_examples() tf.logging.info("***** Run evaluation *****") tf.logging.info(" Num examples = %d", len(eval_examples)) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) eval_features = example_converter.convert_examples_to_features( eval_examples) eval_input_fn = XLNetInputBuilder.get_input_builder( eval_features, FLAGS.max_seq_length, False, False) result = estimator.evaluate(input_fn=eval_input_fn) precision = result["precision"] recall = result["recall"] f1_score = 2.0 * precision * recall / (precision + recall) tf.logging.info("***** Evaluation result *****") tf.logging.info(" Precision (token-level) = %s", str(precision)) tf.logging.info(" Recall (token-level) = %s", str(recall)) tf.logging.info(" F1 score (token-level) = %s", str(f1_score)) if FLAGS.do_predict: predict_examples = processor.get_test_examples() tf.logging.info("***** Run prediction *****") tf.logging.info(" Num examples = %d", len(predict_examples)) tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) predict_features = example_converter.convert_examples_to_features( predict_examples) predict_input_fn = XLNetInputBuilder.get_input_builder( predict_features, FLAGS.max_seq_length, False, False) result = estimator.predict(input_fn=predict_input_fn) predict_recorder = XLNetPredictRecorder( output_dir=FLAGS.output_dir, label_list=label_list, max_seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) predicts = [{ "input_ids": feature.input_ids, "input_masks": feature.input_masks, "label_ids": feature.label_ids, "predict_ids": predict["predict"].tolist() } for feature, predict in zip(predict_features, result)] predict_recorder.record(predicts) if FLAGS.do_export: tf.logging.info("***** Running exporting *****") tf.gfile.MakeDirs(FLAGS.export_dir) serving_input_fn = 
XLNetInputBuilder.get_serving_input_fn( FLAGS.max_seq_length) estimator.export_savedmodel(FLAGS.export_dir, serving_input_fn, as_text=False)
def get_qa_outputs(FLAGS, features, is_training): """Loss for downstream span-extraction QA tasks such as SQuAD.""" input_ids = features["input_ids"] seg_id = features["segment_ids"] input_mask_int = tf.cast(tf.cast(input_ids, tf.bool), tf.int32) cls_index = tf.reshape(tf.reduce_sum(input_mask_int, axis=1), [-1]) p_mask = tf.cast(tf.cast(seg_id, tf.bool), tf.float32) input_ids = tf.transpose(input_ids, [1, 0]) input_mask = 1 - tf.cast(input_mask_int, tf.float32) input_mask = tf.transpose(input_mask, [1, 0]) seg_id = tf.transpose(seg_id, [1, 0]) seq_len = tf.shape(input_ids)[0] xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel( xlnet_config=xlnet_config, run_config=run_config, input_ids=input_ids, seg_ids=seg_id, input_mask=input_mask) output = xlnet_model.get_sequence_output() initializer = xlnet_model.get_initializer() return_dict = {} with tf.variable_scope("logits"): # logits: seq, batch_size, 2 logits = tf.layers.dense(output, 2, kernel_initializer=initializer) # logits: 2, batch_size, seq logits = tf.transpose(logits, [2, 1, 0]) # start_logits: batch_size, seq # end_logits: batch_size, seq start_logits, end_logits = tf.unstack(logits, axis=0) start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask start_log_probs = tf.nn.log_softmax(start_logits_masked, -1) end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask end_log_probs = tf.nn.log_softmax(end_logits_masked, -1) if is_training: return_dict["start_log_probs"] = start_log_probs return_dict["end_log_probs"] = end_log_probs else: return_dict["start_logits"] = start_logits return_dict["end_logits"] = end_logits # an additional layer to predict answer class, 0: span, 1:yes, 2:no with tf.variable_scope("answer_class"): # get the representation of CLS cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32) cls_feature = tf.einsum("lbh,bl->bh", output, cls_index) ans_feature = tf.layers.dense(cls_feature, xlnet_config.d_model, activation=tf.tanh, kernel_initializer=initializer, name='pooler') ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout, training=is_training) # hotpot has 3 classes, # squad 2.0 has 2 classes cls_logits = tf.layers.dense(ans_feature, FLAGS.num_classes, kernel_initializer=initializer, name="cls") cls_log_probs = tf.nn.log_softmax(cls_logits, -1) if is_training: return_dict["cls_log_probs"] = cls_log_probs return_dict["cls_logits"] = cls_logits return return_dict
def _create_model(self, input_ids, input_masks, segment_ids, token_label_ids, sent_label_ids, token_label_list, sent_label_list, mode): """Creates XLNet-NLU model""" model = xlnet.XLNetModel(xlnet_config=self.model_config, run_config=xlnet.create_run_config( mode == tf.estimator.ModeKeys.TRAIN, True, FLAGS), input_ids=tf.transpose(input_ids, perm=[1, 0]), input_mask=tf.transpose(input_masks, perm=[1, 0]), seg_ids=tf.transpose(segment_ids, perm=[1, 0])) initializer = model.get_initializer() with tf.variable_scope("token", reuse=tf.AUTO_REUSE): token_result = tf.transpose(model.get_sequence_output(), perm=[1, 0, 2]) token_result_mask = tf.cast(tf.expand_dims(1 - input_masks, axis=-1), dtype=tf.float32) token_dense_layer = tf.keras.layers.Dense( units=len(token_label_list), activation=None, use_bias=True, kernel_initializer=initializer, bias_initializer=tf.zeros_initializer, kernel_regularizer=None, bias_regularizer=None, trainable=True) token_dropout_layer = tf.keras.layers.Dropout( rate=0.1, seed=np.random.randint(10000)) token_result = token_dense_layer(token_result) if mode == tf.estimator.ModeKeys.TRAIN: token_result = token_dropout_layer(token_result) masked_token_predict = token_result * token_result_mask + MIN_FLOAT * ( 1 - token_result_mask) token_predict_ids = tf.cast(tf.argmax(tf.nn.softmax( masked_token_predict, axis=-1), axis=-1), dtype=tf.int32) with tf.variable_scope("sent", reuse=tf.AUTO_REUSE): sent_result = model.get_pooled_out("last") sent_result_mask = tf.cast(tf.reduce_max(1 - input_masks, axis=-1, keepdims=True), dtype=tf.float32) sent_dense_layer = tf.keras.layers.Dense( units=len(sent_label_list), activation=None, use_bias=True, kernel_initializer=initializer, bias_initializer=tf.zeros_initializer, kernel_regularizer=None, bias_regularizer=None, trainable=True) sent_dropout_layer = tf.keras.layers.Dropout( rate=0.1, seed=np.random.randint(10000)) sent_result = sent_dense_layer(sent_result) if mode == tf.estimator.ModeKeys.TRAIN: sent_result = sent_dropout_layer(sent_result) masked_sent_predict = sent_result * sent_result_mask + MIN_FLOAT * ( 1 - sent_result_mask) sent_predict_ids = tf.cast(tf.argmax(tf.nn.softmax( masked_sent_predict, axis=-1), axis=-1), dtype=tf.int32) loss = tf.constant(0.0, dtype=tf.float32) if mode not in [ tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL ]: return loss, token_predict_ids, sent_predict_ids if token_label_ids is not None: with tf.variable_scope("token_loss", reuse=tf.AUTO_REUSE): token_label = tf.cast(token_label_ids, dtype=tf.float32) token_label_mask = tf.cast(1 - input_masks, dtype=tf.float32) masked_token_label = tf.cast(token_label * token_label_mask, dtype=tf.int32) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=masked_token_label, logits=masked_token_predict) token_loss = tf.reduce_sum( cross_entropy * token_label_mask) / tf.reduce_sum( tf.reduce_max(token_label_mask, axis=-1)) loss = loss + token_loss if sent_label_ids is not None: with tf.variable_scope("sent_loss", reuse=tf.AUTO_REUSE): sent_label = tf.cast(sent_label_ids, dtype=tf.float32) sent_label_mask = tf.cast(tf.reduce_max(1 - input_masks, axis=-1), dtype=tf.float32) masked_sent_label = tf.cast(sent_label * sent_label_mask, dtype=tf.int32) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=masked_sent_label, logits=masked_sent_predict) sent_loss = tf.reduce_sum( cross_entropy * sent_label_mask) / tf.reduce_sum( tf.reduce_max(sent_label_mask, axis=-1)) loss = loss + sent_loss return loss, token_predict_ids, 
sent_predict_ids
def get_crf_outputs(FLAGS, features, is_training): """Loss for downstream span-extraction QA tasks such as SQuAD.""" inp = tf.transpose(features["input_ids"], [1, 0]) seg_id = tf.transpose(features["segment_ids"], [1, 0]) inp_mask = tf.transpose(features["input_mask"], [1, 0]) if FLAGS.label_mode == "normal": label = features["label_ids"] elif FLAGS.label_mode == "X": label = features["label_x_id"] elif FLAGS.label_mode == "gather": label = features["label_gather"] else: raise ValueError("unsupport label mode {}".format(FLAGS.label_mode)) if FLAGS.label_mask == "normal": mask = 1 - features["input_mask"] re_mask = features["label_mask_x"] elif FLAGS.label_mask == "X": mask = features["label_mask_x"] re_mask = features["label_mask_x"] elif FLAGS.label_mask == "gather": mask = features["label_mask_gather"] re_mask = features["label_mask_gather"] else: raise ValueError("unsupport mask mode {}".format(FLAGS.label_mode)) max_seq_length = FLAGS.max_seq_length batch_size = tf.shape(inp)[1] if FLAGS.label_mode == "X": classes = FLAGS.crf_classes + 1 else: classes = FLAGS.crf_classes xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask) output = xlnet_model.get_sequence_output() initializer = xlnet_model.get_initializer() with tf.variable_scope("crf_layer"): output = tf.transpose(output, [1, 0, 2]) start_logits = tf.layers.dense(output, classes, kernel_initializer=initializer) if FLAGS.label_mode == "gather": flat_offsets = tf.reshape( tf.range(0, batch_size, dtype=tf.int32) * max_seq_length, [-1, 1]) flat_positions = tf.reshape(features["label_index"] + flat_offsets, [-1]) flat_sequence_tensor = tf.reshape( start_logits, [batch_size * max_seq_length, classes]) start_logits = tf.gather(flat_sequence_tensor, flat_positions) start_logits = tf.reshape(start_logits, [batch_size, max_seq_length, classes]) if FLAGS.no_crf: logits = tf.nn.log_softmax(start_logits) one_hot_target = tf.one_hot(label, classes) per_example_loss = -tf.reduce_sum(logits * one_hot_target, -1) numerator = tf.reduce_sum(tf.reshape(mask * per_example_loss, [-1])) denominator = tf.reduce_sum(tf.reshape(mask, [-1])) + 1e-5 total_loss = numerator / denominator logits = tf.argmax(logits, axis=-1) else: seq_len = tf.reduce_sum(tf.cast(mask, tf.int64), axis=-1) transition_params = tf.get_variable( 'trans', [classes, classes], dtype=tf.float32, initializer=tf.zeros_initializer) per_example_loss, transition_params = tf.contrib.crf.crf_log_likelihood( start_logits, label, seq_len, transition_params=transition_params) logits, tf_score = tf.contrib.crf.crf_decode( start_logits, transition_params, seq_len) per_example_loss = -per_example_loss total_loss = tf.reduce_mean(per_example_loss) return total_loss, per_example_loss, logits, label, re_mask
def get_ner_loss(FLAGS, features, is_training, lengths):
    """Loss for downstream sequence labelling such as NER."""
    bsz_per_core = tf.shape(features["input_ids"])[0]

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 1, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 1])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    labels = tf.reshape(features["label_ids"],
                        [bsz_per_core, FLAGS.max_seq_length])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    # TODO: the pooled summary is currently unused.
    summary = xlnet_model.get_pooled_out(FLAGS.summary_type,
                                         FLAGS.use_summ_proj)
    embedded_chars = xlnet_model.get_sequence_output()
    embedding_dims = 768  # hard-coded hidden size; should match d_model
    num_labels = 2        # hard-coded label count

    # logits = project_crf_layer(embedded_chars)
    # loss, trans = crf_layer(logits)
    from tensorflow.contrib.layers.python.layers import initializers
    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W",
                            shape=[embedding_dims, num_labels],
                            dtype=tf.float32,
                            initializer=initializers.xavier_initializer())
        b = tf.get_variable("b",
                            shape=[num_labels],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        # [batch_size * seq_length, embedding_dims]
        output = tf.reshape(embedded_chars, shape=[-1, embedding_dims])
        pred = tf.tanh(tf.nn.xw_plus_b(output, W, b))
        logits = tf.reshape(pred, [-1, FLAGS.max_seq_length, num_labels])

        # Only used by the commented-out crf_log_likelihood variant below.
        trans1 = tf.get_variable("transitions",
                                 shape=[num_labels, num_labels],
                                 initializer=initializers.xavier_initializer())

        # CRF
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits, tag_indices=labels, sequence_lengths=lengths)
        # log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
        #     inputs=logits,
        #     tag_indices=labels,
        #     transition_params=trans1,
        #     sequence_lengths=lengths)

        # CRF decode: pred_ids is the single most probable tag path.
        pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                                transition_params=trans,
                                                sequence_length=lengths)
        # return (loss, logits, trans, pred_ids)

        # Note: the loss returned below is a per-token softmax cross-entropy,
        # not the negative CRF log-likelihood computed above.
        one_hot_target = tf.one_hot(labels, num_labels)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
def get_ner_loss(FLAGS, features, is_training):  # , lengths):
    """Loss for downstream sequence labelling such as NER."""
    bsz_per_core = tf.shape(features["input_ids"])[0]

    input_ids = features["input_ids"]
    # Sequence lengths: assumes the padding token id is 0, so tf.sign(|ids|)
    # is 1 on real tokens and 0 on padding.
    used = tf.sign(tf.abs(input_ids))
    # A vector of size [batch_size] holding the true length of each sequence
    # in the current batch.
    lengths = tf.reduce_sum(used, reduction_indices=1)

    def _transform_features(feature):
        out = tf.reshape(feature, [bsz_per_core, 1, -1])
        out = tf.transpose(out, [2, 0, 1])
        out = tf.reshape(out, [-1, bsz_per_core * 1])
        return out

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    labels = tf.reshape(features["label_ids"],
                        [bsz_per_core, FLAGS.max_seq_length])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)
    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    embedded_chars = xlnet_model.get_sequence_output()
    embedding_dims = embedded_chars.shape[-1]
    num_labels = 17

    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W",
                            shape=[embedding_dims, num_labels],
                            dtype=tf.float32,
                            initializer=initializers.xavier_initializer())
        b = tf.get_variable("b",
                            shape=[num_labels],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        # [batch_size * seq_length, embedding_dims]
        x = tf.reshape(embedded_chars, shape=[-1, embedding_dims])
        pred = tf.nn.xw_plus_b(x, W, b)
        logits = tf.reshape(pred, [-1, FLAGS.max_seq_length, num_labels])

        # trans1 = tf.get_variable(
        #     "transitions",
        #     shape=[num_labels, num_labels],
        #     initializer=initializers.xavier_initializer())

        # CRF
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits, tag_indices=labels, sequence_lengths=lengths)
        # log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
        #     inputs=logits,
        #     tag_indices=labels,
        #     transition_params=trans1,
        #     sequence_lengths=lengths)

        # CRF decode: pred_ids is the single most probable tag path.
        pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                                transition_params=trans,
                                                sequence_length=lengths)
        # return (loss, logits, trans, pred_ids)

        # Note: the loss returned below is a per-token softmax cross-entropy,
        # not the negative CRF log-likelihood computed above.
        one_hot_target = tf.one_hot(labels, num_labels)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)

        return total_loss, per_example_loss, logits
def model_fn(features, labels, mode, params):
    #### Training or Evaluation
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    #### Get loss from inputs
    #********************************************************************************************#
    bsz_per_core = tf.shape(features["input_ids"])[0]

    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    label_ids = features["label_ids"]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)

    # summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)
    # Sequence-level embeddings, shaped [batch_size, seq_length, embedding_size].
    xlnet_model_out = xlnet_model.get_sequence_output()
    embedding = tf.transpose(xlnet_model_out, [1, 0, 2])
    max_seq_length = embedding.shape[1].value

    # Compute the true sequence lengths (assumes padding id 0).
    used = tf.sign(tf.abs(features["input_ids"]))
    # A vector of size [batch_size] holding the length of each sequence in the batch.
    lengths = tf.reduce_sum(used, reduction_indices=1)

    # Add the CRF output layer.
    blstm_crf = BLSTM_CRF(embedded_chars=embedding,
                          hidden_unit=10,
                          cell_type="lstm",
                          num_layers=1,
                          dropout_rate=0.5,
                          initializers=initializers,
                          num_labels=n_class,
                          seq_length=max_seq_length,
                          labels=label_ids,
                          lengths=lengths,
                          is_training=is_training)
    total_loss, logits, trans, pred_ids = blstm_crf.add_blstm_crf_layer(
        crf_only=True)
    #********************************************************************************************#

    #### Check model parameters
    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
    tf.logging.info('#params: {}'.format(num_params))

    #### Load pretrained models
    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)

    #### Evaluation mode
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(label_ids, pred_ids):
            return {
                "eval_loss":
                tf.metrics.mean_squared_error(labels=label_ids,
                                              predictions=pred_ids),
            }

        eval_metrics = metric_fn(features["label_ids"], pred_ids)
        eval_spec = tf.estimator.EstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               eval_metric_ops=eval_metrics)
        return eval_spec
    elif mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "logits": logits,
            "labels": label_ids,
            "pred_ids": pred_ids,
            "input_mask": features["input_mask"]
        }
        output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                 predictions=predictions)
        return output_spec

    #### Configuring the optimizer
    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)

    monitor_dict = {}
    monitor_dict["lr"] = learning_rate

    #### Constructing the training EstimatorSpec with the new cache.
    train_spec = tf.estimator.EstimatorSpec(mode=mode,
                                            loss=total_loss,
                                            train_op=train_op)
    return train_spec
def get_decomposed_qa_outputs(FLAGS, features, is_training): question_ids = features["question_ids"] context_ids = features["context_ids"] seq_len = FLAGS.max_seq_length q_seq_len = FLAGS.max_first_length + 2 ctx_seq_len = seq_len - q_seq_len q_mask_int = tf.cast(tf.cast(question_ids, tf.bool), tf.int32) cls_index = tf.reshape( tf.reduce_sum(q_mask_int, axis=1) + ctx_seq_len, [-1]) # 0 for mask out # q_zeros = tf.zeros_like(question_ids) # p_ids = tf.concat([context_ids, q_zeros], axis=1) # p_mask = tf.cast(tf.cast(p_ids, tf.bool), tf.float32) question_ids = tf.transpose(question_ids, [1, 0]) context_ids = tf.transpose(context_ids, [1, 0]) q_attn_mask = get_attention_mask(question_ids, q_seq_len) c_attn_mask = get_attention_mask(context_ids, ctx_seq_len) qc_attn_mask = get_attention_mask( tf.concat([context_ids, question_ids], axis=0), seq_len) xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(is_training, True, FLAGS) initializer = xlnet._get_initializer(run_config) tfm_args = dict( n_token=xlnet_config.n_token, initializer=initializer, attn_type="bi", n_layer=xlnet_config.n_layer, d_model=xlnet_config.d_model, n_head=xlnet_config.n_head, d_head=xlnet_config.d_head, d_inner=xlnet_config.d_inner, ff_activation=xlnet_config.ff_activation, untie_r=xlnet_config.untie_r, is_training=run_config.is_training, use_bfloat16=run_config.use_bfloat16, use_tpu=run_config.use_tpu, dropout=run_config.dropout, dropatt=run_config.dropatt, # mem_len=run_config.mem_len, # reuse_len=run_config.reuse_len, # bi_data=run_config.bi_data, clamp_len=run_config.clamp_len, # same_length=run_config.same_length, ctx_ids=context_ids, q_ids=question_ids, q_seq_len=q_seq_len, ctx_seq_len=ctx_seq_len, sep_layer=FLAGS.sep_layer, q_attn_mask=q_attn_mask, c_attn_mask=c_attn_mask, qc_attn_mask=qc_attn_mask, ) with tf.variable_scope("model", reuse=tf.AUTO_REUSE): upper_outputs = transformer_xl_decomposed(**tfm_args) output = upper_outputs[-1] return_dict = {'upper_outputs': upper_outputs} with tf.variable_scope("logits"): # logits: seq, batch_size, 2 logits = tf.layers.dense(output, 2, kernel_initializer=initializer) # logits: 2, batch_size, seq logits = tf.transpose(logits, [2, 1, 0]) # start_logits: batch_size, seq # end_logits: batch_size, seq start_logits, end_logits = tf.unstack(logits, axis=0) # start_logits_masked = start_logits * p_mask - 1e30 * (1 - p_mask) # start_log_probs = tf.nn.log_softmax(start_logits_masked, -1) start_log_probs = tf.nn.log_softmax(start_logits, -1) # end_logits_masked = end_logits * p_mask - 1e30 * (1 - p_mask) # end_log_probs = tf.nn.log_softmax(end_logits_masked, -1) end_log_probs = tf.nn.log_softmax(end_logits, -1) return_dict["start_logits"] = start_logits return_dict["end_logits"] = end_logits if is_training: return_dict["start_log_probs"] = start_log_probs return_dict["end_log_probs"] = end_log_probs # an additional layer to predict answer class, 0: span, 1:yes, 2:no with tf.variable_scope("answer_class"): # get the representation of CLS cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32) cls_feature = tf.einsum("lbh,bl->bh", output, cls_index) ans_feature = tf.layers.dense(cls_feature, xlnet_config.d_model, activation=tf.tanh, kernel_initializer=initializer, name='pooler') ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout, training=is_training) # hotpot has 3 classes, # squad 2.0 has 2 classes cls_logits = tf.layers.dense(ans_feature, FLAGS.num_classes, kernel_initializer=initializer, name="cls") 
cls_log_probs = tf.nn.log_softmax(cls_logits, -1) return_dict["cls_logits"] = cls_logits if is_training: return_dict["cls_log_probs"] = cls_log_probs return return_dict
def prediction_graph_memory(): """Gets features and return predicted tokens) features: Dict[str:tf.train.features] Contains following features: input_k seg_id input_mask """ features = { "input_k": tf.placeholder(tf.int32, (None, None)), "seg_id": tf.placeholder(tf.int32, (None, None)), "input_mask": tf.placeholder(tf.float32, (None, None)) } # Building prediction graph # Transforming features for batch channel on last axis inp = tf.transpose(features["input_k"], [1, 0]) seg_id = tf.transpose(features["seg_id"], [1, 0]) inp_mask = tf.transpose(features["input_mask"], [1, 0]) # Model config xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(False, True, FLAGS) run_config.mem_len = FLAGS.max_mem_length perm_mask = _create_mask(tf.shape(inp)[0], 0)[:, :, None] # Getting the hidden states for the prompts xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=inp, seg_ids=seg_id, input_mask=inp_mask, perm_mask=perm_mask) # getting memory mems = xlnet_model.get_new_memory() latest_tokens = None prev_tokens = None prev_confs = None batch_size = tf.shape(mems[0])[1] def cond(*_): """Dummy condition since we stop based on iteration""" return True def body(mems, latest_tokens, mem_mask, prev_tokens, prev_confs): """The main body of sampling loop. mem: cache memory--calculated hidden states of previous tokens latest_tokens: latest sampled tokens mem_mask: masking for setting previous memory zero. Used for padding prev_tokens: all the previous tokens including latest_tokens prev_confs: confidences of respective tokens in prev_tokens """ # get dummy input token and permutation mask input_k, seg_id, perm_mask, input_q, target_mapping = \ inputs_and_mask(latest_tokens, batch_size) input_k = tf.transpose(input_k, (1, 0)) input_q = tf.transpose(input_q, (1, 0)) seg_id = tf.transpose(seg_id, (1, 0)) perm_mask = tf.transpose(perm_mask, (1, 2, 0)) # Set the hidden state of the padded tokens to be zero[ for i, mem in enumerate(mems): mems[i] = (1 - mem_mask[:, :, None]) * mems[i] # Get logits xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=input_k, seg_ids=seg_id, perm_mask=perm_mask, mems=mems, input_mask=None, inp_q=input_q, target_mapping=target_mapping) logits = get_logits(xlnet_model, xlnet_config) # Getting new memory new_mems = xlnet_model.get_new_memory() # sample a token logits = tf.transpose(logits, (1, 0, 2)) sampled_tokens, confs = sample_token(logits) sampled_tokens = sampled_tokens[:, -1, :] # Last token confs = confs[:, -1, :] # Last token prev_tokens = sampled_tokens if prev_tokens is None \ else tf.concat([prev_tokens, sampled_tokens], axis=1) prev_confs = confs if prev_confs is None \ else tf.concat([prev_confs, confs], axis=1) # Cache the memory of the the last latest_tokens if latest_tokens is not None: merged_mems = [] for i, mem in enumerate(mems): merged_mems.append( tf.concat([mems[i][1:], new_mems[i][-2:-1]], axis=0)) mem_mask = tf.concat( [mem_mask[1:], tf.zeros_like(mem_mask[:1])], axis=0) return [ merged_mems, sampled_tokens, mem_mask, prev_tokens, prev_confs ] return [mems, sampled_tokens, mem_mask, prev_tokens, prev_confs] mems, latest_tokens, mem_mask, prev_tokens, prev_confs = body( mems, latest_tokens, inp_mask, prev_tokens, prev_confs) args = tf.while_loop( cond=cond, body=body, maximum_iterations=FLAGS.num_toks_pred - 1, loop_vars=[mems, latest_tokens, mem_mask, prev_tokens, prev_confs], shape_invariants=[[ tf.TensorShape([None, None, None]) for 
_ in range(len(mems)) ], tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None, None])]) predicted_tokens, predicted_confs = args[-2:] return (predicted_tokens, predicted_confs), features
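# Sketch of driving the prediction graph (illustrative; `prompt_ids`,
# `prompt_seg_ids` and `prompt_mask` are assumed to be already-encoded NumPy
# arrays of shape [batch, prompt_len], and the checkpoint path comes from the
# same FLAGS used above):
(pred_tokens, pred_confs), feats = prediction_graph_memory()
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, FLAGS.init_checkpoint)
    tokens, confs = sess.run(
        [pred_tokens, pred_confs],
        feed_dict={feats["input_k"]: prompt_ids,
                   feats["seg_id"]: prompt_seg_ids,
                   feats["input_mask"]: prompt_mask})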
def prediction_graph_no_memory(): """Builds graphs and returns prediction and input features. Output: predictions: Tuple(Tensors) Currently returns sampled tokens and confidences features: Dict[str:tf.train.features] Contains following features: input_k seg_id input_mask """ features = { "input_k": tf.placeholder(tf.int32, (None, None)), "seg_id": tf.placeholder(tf.int32, (None, None)), "input_mask": tf.placeholder(tf.float32, (None, None)) } # Building prediction graph # Transforming features for batch channel on last axis inp = tf.transpose(features["input_k"], [1, 0]) seg_id = tf.transpose(features["seg_id"], [1, 0]) inp_mask = tf.transpose(features["input_mask"], [1, 0]) # Model config xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) run_config = xlnet.create_run_config(False, True, FLAGS) run_config.mem_len = FLAGS.max_mem_length perm_mask = _create_mask(tf.shape(inp)[0], 0)[:, :, None] # Getting the hidden states for the prompts prev_tokens = None prev_conf = None # target mapping seq_len = tf.shape(inp)[0] batch_size = tf.shape(inp)[-1] target_mapping = tf.concat( [tf.zeros((1, seq_len - 1, batch_size)), tf.ones((1, 1, batch_size))], axis=1) def cond(*_): """Dummy condition since we stop based on iteration""" return True def recalc(inp, inp_mask, seg_id, perm_mask): """Augment the inputs for the new token. Appends 1 row or columns accordingly""" input_q = tf.zeros_like(inp, dtype=tf.float32) inp = tf.pad(inp, tf.convert_to_tensor([[0, 1], [0, 0]]), constant_values=0) inp_mask = tf.pad(inp_mask, tf.convert_to_tensor([[0, 1], [0, 0]]), constant_values=0) seg_id = tf.pad(seg_id, tf.convert_to_tensor([[0, 1], [0, 0]]), constant_values=0) col = tf.ones(tf.shape(perm_mask)[0:1], dtype=tf.float32) perm_mask = tf.concat([perm_mask, col[:, None, None]], axis=1) row = tf.concat([ tf.zeros(tf.shape(perm_mask)[1:2] - 1, dtype=tf.float32), tf.ones([1], dtype=tf.float32) ], axis=0) perm_mask = tf.concat([perm_mask, row[None, :, None]], axis=0) input_q = tf.pad(input_q, tf.convert_to_tensor([[0, 1], [0, 0]]), constant_values=1) return inp[1:], inp_mask[1:], perm_mask[1:, 1:], input_q[1:], seg_id[1:] def body(inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf): """The main body of sampling loop. inp: input ids inp_mask: input masks for paddings, etc. seg_id: segment id. Zeros here. 
perm_mask: permutation mask to pass to transformer prev_tokens: all the previous tokens including latest_tokens prev_conf: confidences of respective tokens in prev_tokens """ # get dummy input token and permutation mask input_k, input_mask, perm_mask, input_q, seg_id = recalc( inp, inp_mask, seg_id, perm_mask) # Get logits xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=input_k, seg_ids=seg_id, input_mask=inp_mask, perm_mask=perm_mask, inp_q=input_q, target_mapping=target_mapping) logits = get_logits(xlnet_model, xlnet_config) # sample a token logits = tf.transpose(logits, (1, 0, 2)) sampled_tokens, confidences = sample_token(logits) sampled_tokens = sampled_tokens[:, -1, :] # Last token confidences = confidences[:, -1, :] prev_tokens = sampled_tokens if prev_tokens is None \ else tf.concat([prev_tokens, sampled_tokens], axis=1) prev_conf = confidences if prev_conf is None \ else tf.concat([prev_conf, confidences], axis=1) input_k = tf.concat( [input_k[:-1], tf.transpose(sampled_tokens, (1, 0))], axis=0) perm_mask = _create_mask(tf.shape(input_k)[0], 0)[:, :, None] return [input_k, input_mask, seg_id, perm_mask, prev_tokens, prev_conf] inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf = body( inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf) args = tf.while_loop( cond=cond, body=body, maximum_iterations=FLAGS.num_toks_pred - 1, loop_vars=[inp, inp_mask, seg_id, perm_mask, prev_tokens, prev_conf], shape_invariants=[ tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None, None, None]), tf.TensorShape([None, None]), tf.TensorShape([None, None]), ]) predicted_tokens, predicted_confs = args[-2:] return (predicted_tokens, predicted_confs), features
def get_qa_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    cls_index = tf.reshape(features["cls_index"], [-1])

    seq_len = tf.shape(inp)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}

    # invalid position mask such as query and special symbols (PAD, SEP, CLS)
    p_mask = features["p_mask"]

    # logit of the start position
    with tf.variable_scope("start_logits"):
        start_logits = tf.layers.dense(output, 1,
                                       kernel_initializer=initializer)
        start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

    # logit of the end position
    with tf.variable_scope("end_logits"):
        if is_training:
            # during training, compute the end logits based on the
            # ground truth of the start position
            start_positions = tf.reshape(features["start_positions"], [-1])
            start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum("lbh,bl->bh", output, start_index)
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = tf.layers.dense(
                tf.concat([output, start_features], axis=-1),
                xlnet_config.d_model,
                kernel_initializer=initializer,
                activation=tf.tanh,
                name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)

            end_logits = tf.layers.dense(end_logits, 1,
                                         kernel_initializer=initializer,
                                         name="dense_1")
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        else:
            # during inference, compute the end logits based on beam search
            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=FLAGS.start_n_top)
            start_index = tf.one_hot(start_top_index, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum("lbh,bkl->bkh", output, start_index)
            end_input = tf.tile(output[:, :, None],
                                [1, 1, FLAGS.start_n_top, 1])
            start_features = tf.tile(start_features[None],
                                     [seq_len, 1, 1, 1])
            end_input = tf.concat([end_input, start_features], axis=-1)
            end_logits = tf.layers.dense(end_input, xlnet_config.d_model,
                                         kernel_initializer=initializer,
                                         activation=tf.tanh,
                                         name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)
            end_logits = tf.layers.dense(end_logits, 1,
                                         kernel_initializer=initializer,
                                         name="dense_1")
            end_logits = tf.reshape(end_logits,
                                    [seq_len, -1, FLAGS.start_n_top])
            end_logits = tf.transpose(end_logits, [1, 2, 0])
            end_logits_masked = end_logits * (
                1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
            end_top_log_probs, end_top_index = tf.nn.top_k(
                end_log_probs, k=FLAGS.end_n_top)
            end_top_log_probs = tf.reshape(
                end_top_log_probs,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])
            end_top_index = tf.reshape(
                end_top_index,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_top_log_probs"] = start_top_log_probs
        return_dict["start_top_index"] = start_top_index
        return_dict["end_top_log_probs"] = end_top_log_probs
        return_dict["end_top_index"] = end_top_index

    # an additional layer to predict answerability
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

        # get the representation of START
        start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                                name="softmax_start")
        start_feature = tf.einsum("lbh,bl->bh", output, start_p)

        # note(zhiliny): no dependency on end_feature so that we can obtain
        # one single `cls_logits` for each sample
        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = tf.layers.dense(ans_feature, xlnet_config.d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name="dense_0")
        ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout,
                                        training=is_training)
        cls_logits = tf.layers.dense(ans_feature, 1,
                                     kernel_initializer=initializer,
                                     name="dense_1",
                                     use_bias=False)
        cls_logits = tf.squeeze(cls_logits, -1)

        return_dict["cls_logits"] = cls_logits

    return return_dict
def get_qa_outputs(FLAGS, features, is_training):
    """Loss for downstream span-extraction QA tasks such as SQuAD."""
    outputs = []
    scopes = []

    with tf.variable_scope("preprocess_input", reuse=tf.AUTO_REUSE):
        inp = tf.transpose(features["input_ids"], [1, 0])
        seg_id = tf.transpose(features["segment_ids"], [1, 0])
        inp_mask = tf.transpose(features["input_mask"], [1, 0])
        cls_index = tf.reshape(features["cls_index"], [-1])

        seq_len = tf.shape(inp)[0]

        outputs.append(inp)
        scopes.append(tf.get_variable_scope().name)

    xlnet_config = xlnet.XLNetConfig(
        json_path="xl_net/xlnet_large/xlnet_config.json")
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    output, outputs1, scopes1 = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    outputs = outputs + outputs1
    scopes = scopes + scopes1

    return_dict = {}
    # invalid position mask such as query and special symbols (PAD, SEP, CLS)

    # logit of the start position
    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        start_logits = tf.layers.dense(
            output, 1, kernel_initializer=initializer)
        start_logits_masked = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

        # logit of the end position
        if True:
            # during training, compute the end logits based on the
            # ground truth of the start position
            start_positions = tf.reshape(features["start_positions"], [-1])
            start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            start_features = tf.einsum("lbh,bl->bh", output, start_index)
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = tf.layers.dense(
                tf.concat([output, start_features], axis=-1),
                xlnet_config.d_model,
                kernel_initializer=initializer,
                activation=tf.tanh,
                name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(
                end_logits, begin_norm_axis=-1)
            end_logits = tf.layers.dense(
                end_logits, 1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_log_probs = tf.nn.log_softmax(end_logits, -1)

        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs

        # an additional layer to predict answerability
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

        # get the representation of START
        start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                                name="softmax_start")
        start_feature = tf.einsum("lbh,bl->bh", output, start_p)

        # note(zhiliny): no dependency on end_feature so that we can obtain
        # one single `cls_logits` for each sample
        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = tf.layers.dense(
            ans_feature,
            xlnet_config.d_model,
            activation=tf.tanh,
            kernel_initializer=initializer,
            name="dense_0")
        ans_feature = tf.layers.dropout(ans_feature, 0.1,
                                        training=is_training)
        cls_logits = tf.layers.dense(
            ans_feature, 1,
            kernel_initializer=initializer,
            name="dense_1",
            use_bias=False)
        cls_logits = tf.squeeze(cls_logits, -1)

        return_dict["cls_logits"] = cls_logits

        outputs.append(cls_logits)
        scopes.append(tf.get_variable_scope().name)

    return return_dict, outputs, scopes
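# --- Hedged sketch (not part of the original source) ------------------------
# Example wiring for the variant above, which returns the intermediate tensors
# it collected (`outputs`) alongside the variable-scope name each was created
# in (`scopes`). The placeholder shapes and the logging loop are assumptions;
# the function itself only requires a `features` dict with the keys it reads.
features = {
    "input_ids": tf.placeholder(tf.int32, [None, None]),
    "segment_ids": tf.placeholder(tf.int32, [None, None]),
    "input_mask": tf.placeholder(tf.float32, [None, None]),
    "cls_index": tf.placeholder(tf.int32, [None]),
    "start_positions": tf.placeholder(tf.int32, [None]),
}
return_dict, outputs, scopes = get_qa_outputs(FLAGS, features,
                                              is_training=False)
for scope, tensor in zip(scopes, outputs):
    # one entry per collected intermediate, tagged with its originating scope
    tf.logging.info("collected %s from scope '%s'", tensor.name, scope)
# -----------------------------------------------------------------------------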
def two_stream_loss(FLAGS, features, labels, mems, is_training):
    """Pretraining loss with two-stream attention Transformer-XL."""

    #### Unpack input
    mem_name = "mems"
    mems = mems.get(mem_name, None)

    inp_k = tf.transpose(features["input_k"], [1, 0])
    inp_q = tf.transpose(features["input_q"], [1, 0])

    seg_id = tf.transpose(features["seg_id"], [1, 0])

    inp_mask = None
    perm_mask = tf.transpose(features["perm_mask"], [1, 2, 0])

    if FLAGS.num_predict is not None:
        # [num_predict x tgt_len x bsz]
        target_mapping = tf.transpose(features["target_mapping"], [1, 2, 0])
    else:
        target_mapping = None

    # target for LM loss
    tgt = tf.transpose(features["target"], [1, 0])

    # target mask for LM loss
    tgt_mask = tf.transpose(features["target_mask"], [1, 0])

    # construct xlnet config and save to model_dir
    xlnet_config = xlnet.XLNetConfig(FLAGS=FLAGS)
    xlnet_config.to_json(os.path.join(FLAGS.model_dir, "config.json"))

    # construct run config from FLAGS
    run_config = xlnet.create_run_config(is_training, False, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp_k,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask,
                                   mems=mems,
                                   perm_mask=perm_mask,
                                   target_mapping=target_mapping,
                                   inp_q=inp_q)

    output = xlnet_model.get_sequence_output()
    new_mems = {mem_name: xlnet_model.get_new_memory()}
    lookup_table = xlnet_model.get_embedding_table()

    initializer = xlnet_model.get_initializer()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # LM loss
        lm_loss = modeling.lm_loss(hidden=output,
                                   target=tgt,
                                   n_token=xlnet_config.n_token,
                                   d_model=xlnet_config.d_model,
                                   initializer=initializer,
                                   lookup_table=lookup_table,
                                   tie_weight=True,
                                   bi_data=run_config.bi_data,
                                   use_tpu=run_config.use_tpu)

    #### Quantity to monitor
    monitor_dict = {}

    if FLAGS.use_bfloat16:
        tgt_mask = tf.cast(tgt_mask, tf.float32)
        lm_loss = tf.cast(lm_loss, tf.float32)

    total_loss = tf.reduce_sum(lm_loss * tgt_mask) / tf.reduce_sum(tgt_mask)
    monitor_dict["total_loss"] = total_loss

    return total_loss, new_mems, monitor_dict
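# --- Hedged sketch (not part of the original source) ------------------------
# Minimal illustration of how the (total_loss, new_mems, monitor_dict) triple
# returned above could be driven from a training step. The optimizer choice,
# learning rate, and helper name are assumptions, not the original training
# setup; `mems` must be a dict (two_stream_loss reads mems["mems"]).
def build_pretrain_train_op(FLAGS, features, mems, is_training=True):
    total_loss, new_mems, monitor_dict = two_stream_loss(
        FLAGS, features, labels=None, mems=mems, is_training=is_training)

    optimizer = tf.train.AdamOptimizer(learning_rate=1e-5)
    grads_and_vars = optimizer.compute_gradients(total_loss)
    train_op = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())

    # new_mems must be fetched each step and fed back as the next `mems`
    return train_op, new_mems, monitor_dict
# -----------------------------------------------------------------------------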
def main(_):
    assert tf.gfile.Exists(FLAGS.init_checkpoint)

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    processor = SubLocProcessor()
    labels = processor.get_labels()

    train_examples = processor.get_train_examples(FLAGS.data_dir)
    test_examples = processor.get_test_examples(FLAGS.data_dir)

    train_file_path = os.path.join(
        FLAGS.output_dir, get_basename(FLAGS.max_seq_length, "train"))
    test_file_path = os.path.join(
        FLAGS.output_dir, get_basename(FLAGS.max_seq_length, "test"))

    def tokenize_fn(text):
        text = preprocess_text(text)
        return encode_ids(text)

    # Create TF-Record for train examples
    file_based_convert_examples_to_features(train_examples, labels,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            train_file_path)
    # Create TF-Record for test examples
    file_based_convert_examples_to_features(test_examples, labels,
                                            FLAGS.max_seq_length, tokenize_fn,
                                            test_file_path)

    train_set = get_dataset(train_file_path, FLAGS.max_seq_length,
                            FLAGS.batch_size)
    train_iter = train_set.make_one_shot_iterator()
    example = train_iter.get_next()

    inp = tf.transpose(example["input_ids"], [1, 0])
    seg_id = tf.transpose(example["segment_ids"], [1, 0])
    inp_mask = tf.transpose(example["input_mask"], [1, 0])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(False, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config,
                                   run_config=run_config,
                                   input_ids=inp,
                                   seg_ids=seg_id,
                                   input_mask=inp_mask)
    output = xlnet_model.get_sequence_output()

    init_from_checkpoint(FLAGS)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            while True:
                outs = sess.run(output)
                print(outs.shape)
        except tf.errors.OutOfRangeError:
            tf.logging.info("DONE")
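# --- Hedged sketch (not part of the original source) ------------------------
# main() relies on a few helpers defined elsewhere (preprocess_text,
# encode_ids, get_basename). Plausible minimal versions, assuming a
# module-level SentencePiece processor loaded from FLAGS.spiece_model_file;
# the exact signatures and the TF-Record naming scheme are assumptions.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load(FLAGS.spiece_model_file)

def preprocess_text(text, lower=False):
    """Light normalisation before SentencePiece encoding."""
    text = text.strip().replace("``", '"').replace("''", '"')
    return text.lower() if lower else text

def encode_ids(text):
    """Encode normalised text into SentencePiece ids."""
    return sp.EncodeAsIds(text)

def get_basename(max_seq_length, split):
    """File name for the cached TF-Record of a given split."""
    return "{}.seq_len_{}.tf_record".format(split, max_seq_length)
# -----------------------------------------------------------------------------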