def main(_):
    """Builds the model and runs."""
    tf.logging.set_verbosity(tf.logging.INFO)

    tx.utils.maybe_create_dir(FLAGS.output_dir)

    bert_pretrain_dir = 'bert_pretrained_models/%s' % FLAGS.config_bert_pretrain

    # Loads BERT model configuration
    if FLAGS.config_format_bert == "json":
        bert_config = model_utils.transform_bert_to_texar_config(
            os.path.join(bert_pretrain_dir, 'bert_config.json'))
    elif FLAGS.config_format_bert == 'texar':
        bert_config = importlib.import_module(
            'bert_config_lib.config_model_%s' % FLAGS.config_bert_pretrain)
    else:
        raise ValueError('Unknown config_format_bert.')

    # Loads data
    processors = {
        "cola": data_utils.ColaProcessor,
        "mnli": data_utils.MnliProcessor,
        "mrpc": data_utils.MrpcProcessor,
        "xnli": data_utils.XnliProcessor,
        'sst': data_utils.SSTProcessor
    }
    processor = processors[FLAGS.task.lower()]()

    num_classes = len(processor.get_labels())
    num_train_data = len(processor.get_train_examples(config_data.data_dir))

    tokenizer = tokenization.FullTokenizer(
        vocab_file=os.path.join(bert_pretrain_dir, 'vocab.txt'),
        do_lower_case=FLAGS.do_lower_case)

    train_dataset = data_utils.get_dataset(
        processor, tokenizer, config_data.data_dir,
        config_data.max_seq_length, config_data.train_batch_size,
        mode='train', output_dir=FLAGS.output_dir)
    eval_dataset = data_utils.get_dataset(
        processor, tokenizer, config_data.data_dir,
        config_data.max_seq_length, config_data.eval_batch_size,
        mode='eval', output_dir=FLAGS.output_dir)
    test_dataset = data_utils.get_dataset(
        processor, tokenizer, config_data.data_dir,
        config_data.max_seq_length, config_data.test_batch_size,
        mode='test', output_dir=FLAGS.output_dir)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_dataset
    })
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    segment_ids = batch["segment_ids"]
    batch_size = tf.shape(input_ids)[0]
    input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)),
                                 axis=1)

    # Builds BERT
    with tf.variable_scope('bert'):
        embedder = tx.modules.WordEmbedder(vocab_size=bert_config.vocab_size,
                                           hparams=bert_config.embed)
        word_embeds = embedder(input_ids)

        # Creates segment embeddings for each type of tokens
        segment_embedder = tx.modules.WordEmbedder(
            vocab_size=bert_config.type_vocab_size,
            hparams=bert_config.segment_embed)
        segment_embeds = segment_embedder(segment_ids)

        input_embeds = word_embeds + segment_embeds

        # The BERT model (a TransformerEncoder)
        encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder)
        output = encoder(input_embeds, input_length)

        # Builds layers for downstream classification, which are also
        # initialized with the BERT pre-trained checkpoint.
        with tf.variable_scope("pooler"):
            # Uses the projection of the first-step hidden vector of the BERT
            # output as the representation of the sentence
            bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
            bert_sent_output = tf.layers.dense(bert_sent_hidden,
                                               config_downstream.hidden_dim,
                                               activation=tf.tanh)
            output = tf.layers.dropout(bert_sent_output, rate=0.1,
                                       training=tx.global_mode_train())

    # Adds the final classification layer
    logits = tf.layers.dense(
        output, num_classes,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
    preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization
    loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"],
                                                  logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler
    static_lr = config_downstream.lr['static_lr']
    num_train_steps = int(num_train_data / config_data.train_batch_size *
                          config_data.max_train_epoch)
    num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    lr = model_utils.get_lr(global_step, num_train_steps, num_warmup_steps,
                            static_lr)  # lr is a Tensor

    train_op = tx.core.get_train_op(loss,
                                    global_step=global_step,
                                    learning_rate=lr,
                                    hparams=config_downstream.opt)

    # Train/eval/test routine
    def _run(sess, mode):
        fetches = {
            'accu': accu,
            'batch_size': batch_size,
            'step': global_step,
            'loss': loss,
        }

        if mode == 'train':
            fetches['train_op'] = train_op
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'train'),
                        tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                    }
                    rets = sess.run(fetches, feed_dict)
                    if rets['step'] % 50 == 0:
                        tf.logging.info('step:%d loss:%f' %
                                        (rets['step'], rets['loss']))
                    if rets['step'] == num_train_steps:
                        break
                except tf.errors.OutOfRangeError:
                    break

        if mode == 'eval':
            cum_acc = 0.0
            nsamples = 0
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'eval'),
                        tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                    }
                    rets = sess.run(fetches, feed_dict)
                    cum_acc += rets['accu'] * rets['batch_size']
                    nsamples += rets['batch_size']
                except tf.errors.OutOfRangeError:
                    break
            tf.logging.info('dev accu: {}'.format(cum_acc / nsamples))

        if mode == 'test':
            _all_preds = []
            while True:
                try:
                    feed_dict = {
                        iterator.handle: iterator.get_handle(sess, 'test'),
                        tx.context.global_mode():
                            tf.estimator.ModeKeys.PREDICT,
                    }
                    _preds = sess.run(preds, feed_dict=feed_dict)
                    _all_preds.extend(_preds.tolist())
                except tf.errors.OutOfRangeError:
                    break
            output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
            with tf.gfile.GFile(output_file, "w") as writer:
                writer.write('\n'.join(str(p) for p in _all_preds))

    with tf.Session() as sess:
        # Loads pretrained BERT model parameters
        init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt')
        model_utils.init_bert_checkpoint(init_checkpoint)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            iterator.restart_dataset(sess, 'train')
            _run(sess, mode='train')
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            iterator.restart_dataset(sess, 'eval')
            _run(sess, mode='eval')

        if FLAGS.do_test:
            iterator.restart_dataset(sess, 'test')
            _run(sess, mode='test')
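# The schedule returned by model_utils.get_lr above is not shown in this
# file. As a point of reference only, a minimal sketch of a BERT-style
# schedule (linear warm-up to `static_lr`, then linear decay towards zero)
# could look like the helper below. The name `_warmup_linear_decay_lr` and
# its exact shape are assumptions, not the project's actual implementation;
# it relies on the module-level `import tensorflow as tf`.
def _warmup_linear_decay_lr(global_step, num_train_steps, num_warmup_steps,
                            static_lr):
    """Hypothetical sketch: returns a scalar learning-rate Tensor."""
    step = tf.cast(global_step, tf.float32)
    warmup = float(max(1, num_warmup_steps))
    total = float(max(1, num_train_steps))
    warmup_lr = static_lr * step / warmup                       # ramp up from 0
    decay_lr = static_lr * tf.maximum(0.0, 1.0 - step / total)  # decay to 0
    return tf.cond(step < warmup, lambda: warmup_lr, lambda: decay_lr)

# Example usage (mirrors the call to model_utils.get_lr above):
#     lr = _warmup_linear_decay_lr(global_step, num_train_steps,
#                                  num_warmup_steps, static_lr)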
# The BERT model (a TransformerEncoder)
encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder)
encoder_output = encoder(input_embeds, src_input_length)

# Builds layers for downstream classification, which are also initialized
# with the BERT pre-trained checkpoint.
with tf.variable_scope("pooler"):
    # Uses the projection of the first-step hidden vector of the BERT output
    # as the representation of the sentence
    bert_sent_hidden = tf.squeeze(encoder_output[:, 0:1, :], axis=1)
    bert_sent_output = tf.layers.dense(
        bert_sent_hidden, config_downstream.hidden_dim, activation=tf.tanh)
    output = tf.layers.dropout(
        bert_sent_output, rate=0.1, training=tx.global_mode_train())

# Loads pretrained BERT model parameters
print("loading the bert pretrained weights")
init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt')
model_utils.init_bert_checkpoint(init_checkpoint)

# Decoder part and MLE loss
tgt_embedding = tf.concat(
    [tf.zeros(shape=[1, embedder.dim]), embedder.embedding[1:, :]], axis=0)
decoder = tx.modules.TransformerDecoder(embedding=tgt_embedding,
                                        hparams=dcoder_config)

# For training
outputs = decoder(
    memory=encoder_output,
    memory_sequence_length=src_input_length,
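# The decoder call above is truncated in this fragment and its remaining
# arguments are not reproduced here. For orientation only, a teacher-forced
# training pass plus MLE loss in Texar-TF typically looks like the sketch
# below; `tgt_labels` and `tgt_length` are assumed target-side tensors that
# do not appear in this fragment, and the argument list may differ from the
# original file.
#
#     outputs = decoder(
#         memory=encoder_output,
#         memory_sequence_length=src_input_length,
#         inputs=embedder(tgt_input_ids),
#         decoding_strategy='train_greedy',
#         mode=tf.estimator.ModeKeys.TRAIN)
#     mle_loss = tx.losses.sequence_sparse_softmax_cross_entropy(
#         labels=tgt_labels,
#         logits=outputs.logits,
#         sequence_length=tgt_length)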
def main(_):
    """Builds the model and runs."""
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    tx.utils.maybe_create_dir(FLAGS.output_dir)

    bert_pretrain_dir = '%s' % FLAGS.config_bert_pretrain

    # Loads BERT model configuration
    if FLAGS.config_format_bert == "json":
        bert_config = model_utils.transform_bert_to_texar_config(
            os.path.join(bert_pretrain_dir, 'bert_config.json'))
    elif FLAGS.config_format_bert == 'texar':
        bert_config = importlib.import_module(
            'bert_config_lib.config_model_%s' % FLAGS.config_bert_pretrain)
    else:
        raise ValueError('Unknown config_format_bert.')

    num_classes = config_data.num_classes

    # The processor/tokenizer-based pipeline below is kept for reference but
    # is replaced by the TFRecord datasets that follow.
    """
    train_dataset = data_utils.get_dataset(
        processor, tokenizer, config_data.data_dir,
        config_data.max_seq_length, config_data.train_batch_size,
        mode='train', output_dir=FLAGS.output_dir,
        is_distributed=FLAGS.distributed)
    eval_dataset = data_utils.get_dataset(
        processor, tokenizer, config_data.data_dir,
        config_data.max_seq_length, config_data.eval_batch_size,
        mode='eval', output_dir=FLAGS.output_dir)
    test_dataset = data_utils.get_dataset(
        processor, tokenizer, config_data.data_dir,
        config_data.max_seq_length, config_data.test_batch_size,
        mode='test', output_dir=FLAGS.output_dir)
    """
    train_dataset = tx.data.TFRecordData(hparams=config_data.train_hparam)
    eval_dataset = tx.data.TFRecordData(hparams=config_data.eval_hparam)
    test_dataset_1 = tx.data.TFRecordData(hparams=config_data.test_hparam_1)
    test_dataset_2 = tx.data.TFRecordData(hparams=config_data.test_hparam_2)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset,
        'eval': eval_dataset,
        'test1': test_dataset_1,
        'test2': test_dataset_2
    })
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    batch_size = tf.shape(input_ids)[0]
    input_length = tf.reduce_sum(1 - tf.to_int32(tf.equal(input_ids, 0)),
                                 axis=1)

    # Builds BERT
    with tf.variable_scope('bert'):
        embedder = tx.modules.WordEmbedder(vocab_size=bert_config.vocab_size,
                                           hparams=bert_config.embed)
        word_embeds = embedder(input_ids)

        # Segment embeddings for each token type are disabled here; only word
        # embeddings are used.
        """
        segment_embedder = tx.modules.WordEmbedder(
            vocab_size=bert_config.type_vocab_size,
            hparams=bert_config.segment_embed)
        segment_embeds = segment_embedder(segment_ids)
        input_embeds = word_embeds + segment_embeds
        """
        input_embeds = word_embeds

        # The BERT model (a TransformerEncoder)
        encoder = tx.modules.TransformerEncoder(hparams=bert_config.encoder)
        output = encoder(input_embeds, input_length)

        # Builds layers for downstream classification, which are also
        # initialized with the BERT pre-trained checkpoint.
        with tf.variable_scope("pooler"):
            # Uses the projection of the first-step hidden vector of the BERT
            # output as the representation of the sentence
            bert_sent_hidden = tf.squeeze(output[:, 0:1, :], axis=1)
            bert_sent_output = tf.layers.dense(bert_sent_hidden,
                                               config_downstream.hidden_dim,
                                               activation=tf.nn.relu)
            output = tf.layers.dropout(bert_sent_output, rate=0.05,
                                       training=tx.global_mode_train())

    # Adds the final classification layer
    logits = tf.layers.dense(
        output, num_classes,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.02))
    preds = tf.argmax(logits, axis=-1, output_type=tf.int32)
    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization
    loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"],
                                                  logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler; the warm-up schedule is disabled
    # here and a constant learning rate is used instead.
    static_lr = config_downstream.lr['static_lr']
    # num_train_steps = int(num_train_data / config_data.train_batch_size)
    # num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    # lr = model_utils.get_lr(global_step, num_train_steps,
    #                         num_warmup_steps, static_lr)  # lr is a Tensor
    lr = static_lr
    # lr = tf.train.exponential_decay(
    #     learning_rate=0.1, global_step=global_step, decay_steps=1000,
    #     decay_rate=0.9, staircase=False)

    opt = tx.core.get_optimizer(global_step=global_step,
                                learning_rate=lr,
                                hparams=config_downstream.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt)

    global best_dev_accu
    best_dev_accu = 0.

    # Train/eval/test routine
    def _is_head():
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess):
        """Trains on the training set and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')

        cum_acc = 0.0
        cum_loss = 0.0
        nsamples = 0
        fetches = {
            'train_op': train_op,
            'accu': accu,
            'loss': loss,
            'batch_size': batch_size,
            'step': global_step
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }
                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                cum_acc += rets['accu'] * rets['batch_size']
                cum_loss += rets['loss'] * rets['batch_size']
                nsamples += rets['batch_size']

                dis_steps = config_data.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    tf.logging.info('step:%d; loss:%f' % (step, rets['loss']))

                eval_steps = config_data.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _eval_epoch(sess)
            except tf.errors.OutOfRangeError:
                break

        tf.logging.info(
            'train accu: {0:.4f}; loss: {1:.4f}; nsamples: {2:.4f}'.format(
                cum_acc / nsamples, cum_loss / nsamples, nsamples))

    def _eval_epoch(sess):
        """Evaluates on the dev set."""
        global best_dev_accu
        iterator.restart_dataset(sess, 'eval')

        cum_acc = 0.0
        cum_loss = 0.0
        nsamples = 0
        fetches = {
            'accu': accu,
            'loss': loss,
            'batch_size': batch_size,
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'eval'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                }
                rets = sess.run(fetches, feed_dict)
                cum_acc += rets['accu'] * rets['batch_size']
                cum_loss += rets['loss'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break

        if (cum_acc / nsamples) > best_dev_accu:
            best_dev_accu = cum_acc / nsamples
            saver.save(sess, ckpt_best, global_step=global_step)
            print('updated best dev accu: {}'.format(best_dev_accu))

        tf.logging.info(
            'eval accu: {0:.4f}; loss: {1:.4f}; nsamples: {2:.4f}'.format(
                cum_acc / nsamples, cum_loss / nsamples, nsamples))

    def _test_epoch(sess):
        """Makes predictions on the test set."""
        # Note: the iterator defines 'test1'/'test2' splits; the first one is
        # used here.
        iterator.restart_dataset(sess, 'test1')

        _all_preds = []
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test1'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                _preds = sess.run(preds, feed_dict=feed_dict)
                _all_preds.extend(_preds.tolist())
            except tf.errors.OutOfRangeError:
                break

        output_file = os.path.join(FLAGS.output_dir, "results.tsv")
        with tf.gfile.GFile(output_file, "w") as writer:
            writer.write('\n'.join(str(p) for p in _all_preds))
        print('content score is: {}'.format(sum(_all_preds) / len(_all_preds)))

    def _pred_epoch(sess, which_score):
        """Makes predictions on the test split `test{which_score}`."""
        iterator.restart_dataset(sess, 'test{}'.format(which_score))

        _all_preds = []
        while True:
            try:
                feed_dict = {
                    iterator.handle:
                        iterator.get_handle(sess,
                                            'test{}'.format(which_score)),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                _preds = sess.run(preds, feed_dict=feed_dict)
                _all_preds.extend(_preds.tolist())
            except tf.errors.OutOfRangeError:
                break

        output_file = os.path.join(FLAGS.output_dir,
                                   "rule_{}.tsv".format(which_score))
        # with tf.gfile.GFile(output_file, "w") as writer:
        #     writer.write('\n'.join(str(p) for p in _all_preds))
        with tf.gfile.GFile(output_file, "w") as writer:
            writer.write('content score_{0} is : {1:.4f}'.format(
                which_score, sum(_all_preds) / len(_all_preds)))
        if which_score == 1:
            print('content score_{0} is : {1:.4f}'.format(
                which_score, sum(_all_preds) / len(_all_preds)))
        else:
            print('content score_{0} is : {1:.4f}'.format(
                which_score, 1 - sum(_all_preds) / len(_all_preds)))

    # Loads pretrained BERT model parameters
    # init_checkpoint = os.path.join(bert_pretrain_dir, 'bert_model.ckpt')
    # model_utils.init_bert_checkpoint(init_checkpoint)

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for i in range(config_data.max_train_epoch):
                _train_epoch(sess)
            saver.save(sess, ckpt_model, global_step=global_step)

        if FLAGS.do_eval:
            _eval_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)

        if FLAGS.do_pred:
            _pred_epoch(sess, 1)
            _pred_epoch(sess, 2)