def run_bert(strategy,
             input_meta_data,
             train_input_fn=None,
             eval_input_fn=None):
  """Run BERT training."""
  if FLAGS.model_type == 'bert':
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  else:
    assert FLAGS.model_type == 'albert'
    bert_config = modeling.AlbertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.mode == 'export_only':
    # Since the Keras ModelCheckpoint callback used with the Keras
    # compile/fit() API internally uses model.save_weights() to save
    # checkpoints, we must use model.load_weights() when Keras compile/fit()
    # is used.
    export_classifier(FLAGS.model_export_path, input_meta_data,
                      FLAGS.use_keras_compile_fit, bert_config,
                      FLAGS.model_dir)
    return

  if FLAGS.mode != 'train_and_eval':
    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  epochs = FLAGS.num_train_epochs
  train_data_size = input_meta_data['train_data_size']
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

  if not strategy:
    raise ValueError('Distribution strategy has not been specified.')

  trained_model = run_bert_classifier(
      strategy,
      bert_config,
      input_meta_data,
      FLAGS.model_dir,
      epochs,
      steps_per_epoch,
      FLAGS.steps_per_loop,
      eval_steps,
      warmup_steps,
      FLAGS.learning_rate,
      FLAGS.init_checkpoint,
      train_input_fn,
      eval_input_fn,
      run_eagerly=FLAGS.run_eagerly,
      use_keras_compile_fit=FLAGS.use_keras_compile_fit)

  if FLAGS.model_export_path:
    # See the note above: with compile/fit(), checkpoints are saved via
    # model.save_weights(), so they must be restored via model.load_weights().
    model_saving_utils.export_bert_model(
        FLAGS.model_export_path,
        model=trained_model,
        restore_model_using_load_weights=FLAGS.use_keras_compile_fit)
  return trained_model
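As a usage sketch: run_bert expects a distribution strategy plus dataset callables built from the module's flags. The driver below is an illustrative assumption rather than code from this file; the MirroredStrategy choice and the input_pipeline.create_classifier_dataset pipeline are assumed stand-ins.

import functools
import json

import tensorflow as tf

def main_sketch():
  # Hypothetical driver; flag names follow this module's FLAGS conventions.
  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  strategy = tf.distribute.MirroredStrategy()  # one valid strategy choice
  max_seq_length = input_meta_data['max_seq_length']
  train_input_fn = functools.partial(
      input_pipeline.create_classifier_dataset,  # assumed input pipeline
      FLAGS.train_data_path, max_seq_length, FLAGS.train_batch_size,
      is_training=True)
  eval_input_fn = functools.partial(
      input_pipeline.create_classifier_dataset,
      FLAGS.eval_data_path, max_seq_length, FLAGS.eval_batch_size,
      is_training=False)
  run_bert(strategy, input_meta_data, train_input_fn, eval_input_fn)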
def _run_and_report_benchmark(self,
                              use_ds=True,
                              enable_xla=False,
                              run_eagerly=False):
  """Runs the benchmark and reports various metrics."""
  keras_utils.set_config_v2(enable_xla)
  start_time_sec = time.time()
  self._train_squad(use_ds=use_ds, run_eagerly=run_eagerly)
  wall_time_sec = time.time() - start_time_sec

  summary = self._read_training_summary_from_file()

  super(BertSquadBenchmarkReal, self)._report_benchmark(
      stats=summary,
      wall_time_sec=wall_time_sec,
      min_accuracy=0,
      max_accuracy=1)
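For context, a helper like this is usually invoked from per-configuration benchmark methods. The method below is a hypothetical sketch; _setup() and _get_model_dir() are assumed helpers from the usual benchmark base class, not shown in this excerpt.

def benchmark_1_gpu(self):
  """Hypothetical benchmark method: single GPU, XLA off, graph mode."""
  self._setup()  # assumed: resets flags and reporting state
  FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')  # assumed helper
  self._run_and_report_benchmark(use_ds=True, enable_xla=False,
                                 run_eagerly=False)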
def run_bert(strategy, input_meta_data):
  """Run BERT training."""
  if FLAGS.mode == 'export_only':
    export_classifier(FLAGS.model_export_path, input_meta_data)
    return

  if FLAGS.mode != 'train_and_eval':
    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  epochs = FLAGS.num_train_epochs
  train_data_size = input_meta_data['train_data_size']
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

  if not strategy:
    raise ValueError('Distribution strategy has not been specified.')

  # Runs the customized training loop.
  logging.info('Training using customized TF 2.0 training loop with '
               'distributed strategy.')
  use_remote_tpu = (FLAGS.strategy_type == 'tpu' and FLAGS.tpu)
  trained_model = run_customized_training(
      strategy,
      bert_config,
      input_meta_data,
      FLAGS.model_dir,
      epochs,
      steps_per_epoch,
      FLAGS.steps_per_loop,
      eval_steps,
      warmup_steps,
      FLAGS.learning_rate,
      FLAGS.init_checkpoint,
      use_remote_tpu=use_remote_tpu,
      run_eagerly=FLAGS.run_eagerly)

  if FLAGS.model_export_path:
    with tf.device(model_training_utils.get_primary_cpu_task(use_remote_tpu)):
      model_saving_utils.export_bert_model(
          FLAGS.model_export_path, model=trained_model)
  return trained_model
def _run_bert_classifier(self, callbacks=None, use_ds=True, enable_xla=False):
  """Starts BERT classification task."""
  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  epochs = self.num_epochs if self.num_epochs else FLAGS.num_train_epochs
  if self.num_steps_per_epoch:
    steps_per_epoch = self.num_steps_per_epoch
  else:
    train_data_size = input_meta_data['train_data_size']
    steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  warmup_steps = int(epochs * steps_per_epoch * 0.1)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy='mirrored' if use_ds else 'off',
      num_gpus=self.num_gpus)

  # TODO(hongkuny): Enable XLA once we are confident with its performance.
  keras_utils.set_config_v2(enable_xla)
  steps_per_loop = 1

  run_classifier.run_customized_training(
      strategy,
      bert_config,
      input_meta_data,
      FLAGS.model_dir,
      epochs,
      steps_per_epoch,
      steps_per_loop,
      eval_steps,
      warmup_steps,
      FLAGS.learning_rate,
      FLAGS.init_checkpoint,
      custom_callbacks=callbacks)
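Because the helper forwards callbacks into the custom training loop, a caller can time training with a Keras callback that hooks the per-batch events. The _TimerCallback below is an illustrative sketch (it assumes the loop invokes on_batch_begin/on_batch_end as its per-step hooks), not part of the benchmark file.

import time

import tensorflow as tf

class _TimerCallback(tf.keras.callbacks.Callback):
  """Illustrative callback that records per-batch wall time."""

  def __init__(self):
    super(_TimerCallback, self).__init__()
    self.batch_start = None
    self.batch_times = []

  def on_batch_begin(self, batch, logs=None):
    self.batch_start = time.time()

  def on_batch_end(self, batch, logs=None):
    self.batch_times.append(time.time() - self.batch_start)

# Usage inside a benchmark method:
#   timer = _TimerCallback()
#   self._run_bert_classifier(callbacks=[timer])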
def run_bert(strategy, input_meta_data):
  """Run BERT training."""
  if FLAGS.mode == 'export_only':
    export_classifier(FLAGS.model_export_path, input_meta_data)
    return

  if FLAGS.mode != 'train_and_eval':
    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  epochs = FLAGS.num_train_epochs
  train_data_size = input_meta_data['train_data_size']
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

  if not strategy:
    raise ValueError('Distribution strategy has not been specified.')

  trained_model = run_bert_classifier(
      strategy,
      bert_config,
      input_meta_data,
      FLAGS.model_dir,
      epochs,
      steps_per_epoch,
      FLAGS.steps_per_loop,
      eval_steps,
      warmup_steps,
      FLAGS.learning_rate,
      FLAGS.init_checkpoint,
      run_eagerly=FLAGS.run_eagerly)

  if FLAGS.model_export_path:
    model_saving_utils.export_bert_model(
        FLAGS.model_export_path, model=trained_model)
  return trained_model
def run_bert_pretrain(strategy):
  """Runs BERT pre-training."""
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  # Padding for divisibility by 8.
  # if bert_config.vocab_size % 8 != 0:
  #   bert_config.vocab_size += 8 - bert_config.vocab_size % 8
  if strategy:
    logging.info('Training using customized TF 2.0 training loop with '
                 'distributed strategy.')

  keras_utils.set_config_v2(FLAGS.enable_xla)

  # Runs the customized training loop.
  return run_customized_training(
      strategy,
      bert_config,
      FLAGS.max_seq_length,
      FLAGS.max_predictions_per_seq,
      FLAGS.model_dir,
      FLAGS.num_steps_per_epoch,
      FLAGS.steps_per_loop,
      FLAGS.num_train_epochs,
      FLAGS.learning_rate * hvd.size()
      if FLAGS.use_horovod else FLAGS.learning_rate,
      FLAGS.warmup_steps,
      FLAGS.input_files,
      FLAGS.train_batch_size)
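The conditional learning-rate expression above applies the standard linear scaling rule for synchronous data-parallel training. With illustrative numbers (assumed, not flag defaults):

# Illustrative arithmetic only; the values are assumptions, not defaults.
base_learning_rate = 1e-4
num_workers = 8  # what hvd.size() would report for an 8-process job
scaled_lr = base_learning_rate * num_workers  # 8e-4 reaches the optimizer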
def train_squad(strategy,
                input_meta_data,
                custom_callbacks=None,
                run_eagerly=False):
  """Runs BERT SQuAD training."""
  if strategy:
    logging.info('Training using customized training loop with distribution'
                 ' strategy.')

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  use_float16 = common_flags.use_float16()
  if use_float16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
    tf.keras.mixed_precision.experimental.set_policy(policy)

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  epochs = FLAGS.num_train_epochs
  num_train_examples = input_meta_data['train_data_size']
  max_seq_length = input_meta_data['max_seq_length']
  steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size)
  warmup_steps = int(epochs * num_train_examples * 0.1 /
                     FLAGS.train_batch_size)
  train_input_fn = functools.partial(
      input_pipeline.create_squad_dataset,
      FLAGS.train_data_path,
      max_seq_length,
      FLAGS.train_batch_size,
      is_training=True)

  def _get_squad_model():
    """Gets the SQuAD model and optimizer."""
    squad_model, core_model = bert_models.squad_model(
        bert_config,
        max_seq_length,
        float_type=tf.float16 if use_float16 else tf.float32)
    squad_model.optimizer = optimization.create_optimizer(
        FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
    if use_float16:
      # Wraps the optimizer with a LossScaleOptimizer. This is done
      # automatically in compile() with the 'mixed_float16' policy, but since
      # we do not call compile(), we must wrap the optimizer manually.
      squad_model.optimizer = (
          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
              squad_model.optimizer,
              loss_scale=common_flags.get_loss_scale()))
    return squad_model, core_model

  # The original BERT model does not scale the loss by
  # 1/num_replicas_in_sync. It could be an accident. So, in order to use the
  # same hyperparameters, we do the same thing here by keeping each replica
  # loss as it is.
  loss_fn = get_loss_fn(loss_factor=1.0)
  use_remote_tpu = (FLAGS.strategy_type == 'tpu' and FLAGS.tpu)

  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_squad_model,
      loss_fn=loss_fn,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=FLAGS.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      init_checkpoint=FLAGS.init_checkpoint,
      use_remote_tpu=use_remote_tpu,
      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks)
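The manual LossScaleOptimizer wrapping matters only because compile() is never called. A minimal standalone sketch of the same pattern, assuming a plain Adam optimizer and dynamic loss scaling (the function above instead reads the scale from common_flags.get_loss_scale()):

import tensorflow as tf

# Minimal sketch: manual loss scaling with the TF 2.x experimental API.
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# Outside compile()/fit() the wrapper must be applied by hand, so that the
# loss is scaled up for the float16 backward pass and the gradients are
# unscaled again before apply_gradients().
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
    optimizer, loss_scale='dynamic')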
def train_squad(strategy,
                input_meta_data,
                custom_callbacks=None,
                run_eagerly=False):
  """Runs BERT SQuAD training."""
  if strategy:
    logging.info('Training using customized training loop with distribution'
                 ' strategy.')

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  use_float16 = common_flags.use_float16()
  if use_float16:
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

  bert_config = MODEL_CLASSES[FLAGS.model_type][0].from_json_file(
      FLAGS.bert_config_file)
  epochs = FLAGS.num_train_epochs
  num_train_examples = input_meta_data['train_data_size']
  max_seq_length = input_meta_data['max_seq_length']
  steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size)
  warmup_steps = int(epochs * num_train_examples * 0.1 /
                     FLAGS.train_batch_size)
  train_input_fn = get_dataset_fn(
      FLAGS.train_data_path,
      max_seq_length,
      FLAGS.train_batch_size,
      is_training=True)

  def _get_squad_model():
    """Gets the SQuAD model and optimizer."""
    squad_model, core_model = bert_models.squad_model(
        bert_config,
        max_seq_length,
        float_type=tf.float16 if use_float16 else tf.float32,
        hub_module_url=FLAGS.hub_module_url)
    squad_model.optimizer = optimization.create_optimizer(
        FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
    if use_float16:
      # Wraps the optimizer with a LossScaleOptimizer. This is done
      # automatically in compile() with the 'mixed_float16' policy, but since
      # we do not call compile(), we must wrap the optimizer manually.
      squad_model.optimizer = (
          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
              squad_model.optimizer,
              loss_scale=common_flags.get_loss_scale()))
    if FLAGS.fp16_implementation == 'graph_rewrite':
      # Note: when flags_obj.fp16_implementation == 'graph_rewrite', the dtype
      # determined by flags_core.get_tf_dtype(flags_obj) is 'float32', which
      # ensures tf.compat.v2.keras.mixed_precision and
      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
      # double up.
      squad_model.optimizer = (
          tf.train.experimental.enable_mixed_precision_graph_rewrite(
              squad_model.optimizer))
    return squad_model, core_model

  # The original BERT model does not scale the loss by
  # 1/num_replicas_in_sync. It could be an accident. So, in order to use the
  # same hyperparameters, we do the same thing here by keeping each replica
  # loss as it is.
  loss_fn = get_loss_fn(
      loss_factor=1.0 /
      strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)

  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_squad_model,
      loss_fn=loss_fn,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=FLAGS.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      init_checkpoint=FLAGS.init_checkpoint,
      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks)
def train_squad(strategy,
                input_meta_data,
                bert_config,
                custom_callbacks=None,
                run_eagerly=False):
  """Runs BERT SQuAD training."""
  if strategy:
    logging.info('Training using customized training loop with distribution'
                 ' strategy.')

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  use_float16 = common_flags.use_float16()
  if use_float16:
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

  epochs = FLAGS.num_train_epochs
  num_train_examples = input_meta_data['train_data_size']
  max_seq_length = input_meta_data['max_seq_length']
  steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size)
  warmup_steps = int(epochs * num_train_examples * 0.1 /
                     FLAGS.train_batch_size)
  train_input_fn = get_dataset_fn(
      FLAGS.train_data_path,
      max_seq_length,
      FLAGS.train_batch_size,
      is_training=True)

  def _get_squad_model():
    """Gets the SQuAD model and optimizer."""
    squad_model, core_model = bert_models.squad_model(
        bert_config,
        max_seq_length,
        hub_module_url=FLAGS.hub_module_url,
        hub_module_trainable=FLAGS.hub_module_trainable)
    squad_model.optimizer = optimization.create_optimizer(
        FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
    if use_float16:
      # Wraps the optimizer with a LossScaleOptimizer. This is done
      # automatically in compile() with the 'mixed_float16' policy, but since
      # we do not call compile(), we must wrap the optimizer manually.
      squad_model.optimizer = (
          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
              squad_model.optimizer,
              loss_scale=common_flags.get_loss_scale()))
    if FLAGS.fp16_implementation == 'graph_rewrite':
      # Note: when flags_obj.fp16_implementation == 'graph_rewrite', the dtype
      # determined by flags_core.get_tf_dtype(flags_obj) is 'float32', which
      # ensures tf.compat.v2.keras.mixed_precision and
      # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
      # double up.
      squad_model.optimizer = (
          tf.train.experimental.enable_mixed_precision_graph_rewrite(
              squad_model.optimizer))
    return squad_model, core_model

  # The original BERT model does not scale the loss by
  # 1/num_replicas_in_sync. It could be an accident. So, in order to use the
  # same hyperparameters, we do the same thing here by keeping each replica
  # loss as it is.
  loss_fn = get_loss_fn(
      loss_factor=1.0 /
      strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)

  # With explicit allreduce, apply_gradients() no longer implicitly
  # all-reduces gradients; users all-reduce gradients manually and pass the
  # all-reduced grads_and_vars to apply_gradients(). For now,
  # clip_by_global_norm is applied before the manual allreduce to keep the
  # math unchanged.
  def clip_by_global_norm_callback(grads_and_vars):
    grads, variables = zip(*grads_and_vars)
    (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    return zip(clipped_grads, variables)

  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_squad_model,
      loss_fn=loss_fn,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=FLAGS.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      init_checkpoint=FLAGS.init_checkpoint,
      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks,
      explicit_allreduce=True,
      pre_allreduce_callbacks=[clip_by_global_norm_callback])
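The pre-allreduce callback's contract is simply grads_and_vars in, grads_and_vars out. A toy demonstration with assumed values, lifting the callback body out of train_squad so it can run standalone:

import tensorflow as tf

def clip_by_global_norm_callback(grads_and_vars):
  # Same body as the nested callback above, lifted out for demonstration.
  grads, variables = zip(*grads_and_vars)
  clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)
  return zip(clipped_grads, variables)

v1 = tf.Variable([1.0, 2.0])
v2 = tf.Variable([3.0])
grads_and_vars = [(tf.constant([3.0, 4.0]), v1), (tf.constant([12.0]), v2)]
# Global norm is sqrt(3**2 + 4**2 + 12**2) = 13, so every gradient is
# scaled by 1/13 to reach clip_norm=1.0 before allreduce sees it.
clipped = list(clip_by_global_norm_callback(grads_and_vars))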
def train_squad(strategy,
                input_meta_data,
                bert_config,
                custom_callbacks=None,
                run_eagerly=False):
  """Runs BERT SQuAD training."""
  if strategy:
    logging.info('Training using customized training loop with distribution'
                 ' strategy.')

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)
  performance.set_mixed_precision_policy(common_flags.dtype())

  epochs = FLAGS.num_train_epochs
  num_train_examples = input_meta_data['train_data_size']
  max_seq_length = input_meta_data['max_seq_length']
  steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size)
  warmup_steps = int(epochs * num_train_examples * 0.1 /
                     FLAGS.train_batch_size)
  train_input_fn = get_dataset_fn(
      FLAGS.train_data_path,
      max_seq_length,
      FLAGS.train_batch_size,
      is_training=True)

  def _get_squad_model():
    """Gets the SQuAD model and optimizer."""
    squad_model, core_model = bert_models.squad_model(
        bert_config,
        max_seq_length,
        hub_module_url=FLAGS.hub_module_url,
        hub_module_trainable=FLAGS.hub_module_trainable)
    optimizer = optimization.create_optimizer(FLAGS.learning_rate,
                                              steps_per_epoch * epochs,
                                              warmup_steps)
    squad_model.optimizer = performance.configure_optimizer(
        optimizer,
        use_float16=common_flags.use_float16(),
        use_graph_rewrite=common_flags.use_graph_rewrite())
    return squad_model, core_model

  # If explicit_allreduce is True, apply_gradients() no longer implicitly
  # all-reduces gradients; users all-reduce gradients manually and pass the
  # all-reduced grads_and_vars to apply_gradients(). Here,
  # clip_by_global_norm is applied to the already all-reduced gradients.
  def clip_by_global_norm_callback(grads_and_vars):
    grads, variables = zip(*grads_and_vars)
    (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    return zip(clipped_grads, variables)

  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_squad_model,
      loss_fn=get_loss_fn(),
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=FLAGS.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      init_checkpoint=FLAGS.init_checkpoint,
      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks,
      explicit_allreduce=False,
      post_allreduce_callbacks=[clip_by_global_norm_callback])
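performance.configure_optimizer consolidates the float16 and graph-rewrite wrapping that the earlier variants did inline. A rough sketch of what it is assumed to do, inferred from those variants rather than taken from the actual implementation:

import tensorflow as tf

def configure_optimizer_sketch(optimizer,
                               use_float16=False,
                               use_graph_rewrite=False):
  """Assumed behavior of performance.configure_optimizer; illustrative."""
  if use_float16:
    # Mirrors the manual LossScaleOptimizer wrapping in earlier variants.
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, loss_scale='dynamic')
  if use_graph_rewrite:
    # Mirrors the fp16_implementation == 'graph_rewrite' branch.
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer)
  return optimizer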
def run_bert(strategy,
             input_meta_data,
             model_config,
             train_input_fn=None,
             eval_input_fn=None,
             init_checkpoint=None,
             custom_callbacks=None):
  """Run BERT training."""
  if FLAGS.mode == 'export_only':
    # Since the Keras ModelCheckpoint callback used with the Keras
    # compile/fit() API internally uses model.save_weights() to save
    # checkpoints, we must use model.load_weights() when Keras compile/fit()
    # is used.
    export_classifier(FLAGS.model_export_path, input_meta_data,
                      FLAGS.use_keras_compile_fit, model_config,
                      FLAGS.model_dir)
    return

  if FLAGS.mode != 'train_and_eval':
    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)
  performance.set_mixed_precision_policy(common_flags.dtype())

  epochs = FLAGS.num_train_epochs
  train_data_size = input_meta_data['train_data_size']
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

  if not strategy:
    raise ValueError('Distribution strategy has not been specified.')

  if not custom_callbacks:
    custom_callbacks = []

  if FLAGS.log_steps:
    custom_callbacks.append(
        keras_utils.TimeHistory(
            batch_size=FLAGS.train_batch_size,
            log_steps=FLAGS.log_steps,
            logdir=FLAGS.model_dir))

  trained_model = run_bert_classifier(
      strategy,
      model_config,
      input_meta_data,
      FLAGS.model_dir,
      epochs,
      steps_per_epoch,
      FLAGS.steps_per_loop,
      eval_steps,
      warmup_steps,
      FLAGS.learning_rate,
      init_checkpoint or FLAGS.init_checkpoint,
      train_input_fn,
      eval_input_fn,
      run_eagerly=FLAGS.run_eagerly,
      use_keras_compile_fit=FLAGS.use_keras_compile_fit,
      custom_callbacks=custom_callbacks)

  if FLAGS.model_export_path:
    # See the note above: with compile/fit(), checkpoints are saved via
    # model.save_weights(), so they must be restored via model.load_weights().
    model_saving_utils.export_bert_model(
        FLAGS.model_export_path,
        model=trained_model,
        restore_model_using_load_weights=FLAGS.use_keras_compile_fit)
  return trained_model
def predict_squad(strategy, input_meta_data):
  """Makes predictions for a SQuAD dataset."""
  keras_utils.set_config_v2(FLAGS.enable_xla)
  config_cls, squad_lib, tokenizer_cls = MODEL_CLASSES[FLAGS.model_type]
  bert_config = config_cls.from_json_file(FLAGS.bert_config_file)
  if tokenizer_cls == tokenization.FullTokenizer:
    tokenizer = tokenizer_cls(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  else:
    assert tokenizer_cls == tokenization.FullSentencePieceTokenizer
    tokenizer = tokenizer_cls(sp_model_file=FLAGS.sp_model_file)
  doc_stride = input_meta_data['doc_stride']
  max_query_length = input_meta_data['max_query_length']
  # Whether data should be in Ver 2.0 format.
  version_2_with_negative = input_meta_data.get('version_2_with_negative',
                                                False)
  eval_examples = squad_lib.read_squad_examples(
      input_file=FLAGS.predict_file,
      is_training=False,
      version_2_with_negative=version_2_with_negative)

  eval_writer = squad_lib.FeatureWriter(
      filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'),
      is_training=False)
  eval_features = []

  def _append_feature(feature, is_padding):
    if not is_padding:
      eval_features.append(feature)
    eval_writer.process_feature(feature)

  # TPU requires a fixed batch size for all batches, therefore the number
  # of examples must be a multiple of the batch size, or else examples
  # will get dropped. So we pad with fake examples which are ignored
  # later on.
  kwargs = dict(
      examples=eval_examples,
      tokenizer=tokenizer,
      max_seq_length=input_meta_data['max_seq_length'],
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=False,
      output_fn=_append_feature,
      batch_size=FLAGS.predict_batch_size)

  # squad_lib_sp requires one more argument 'do_lower_case'.
  if squad_lib == squad_lib_sp:
    kwargs['do_lower_case'] = FLAGS.do_lower_case

  dataset_size = squad_lib.convert_examples_to_features(**kwargs)
  eval_writer.close()

  logging.info('***** Running predictions *****')
  logging.info('  Num orig examples = %d', len(eval_examples))
  logging.info('  Num split examples = %d', len(eval_features))
  logging.info('  Batch size = %d', FLAGS.predict_batch_size)

  num_steps = int(dataset_size / FLAGS.predict_batch_size)
  if FLAGS.benchmark and num_steps > 1000:
    num_steps = 1000
  all_results = predict_squad_customized(strategy, input_meta_data,
                                         bert_config, eval_writer.filename,
                                         num_steps)

  if FLAGS.benchmark:
    return

  output_prediction_file = os.path.join(FLAGS.model_dir, 'predictions.json')
  output_nbest_file = os.path.join(FLAGS.model_dir, 'nbest_predictions.json')
  output_null_log_odds_file = os.path.join(FLAGS.model_dir, 'null_odds.json')

  squad_lib.write_predictions(
      eval_examples,
      eval_features,
      all_results,
      FLAGS.n_best_size,
      FLAGS.max_answer_length,
      FLAGS.do_lower_case,
      output_prediction_file,
      output_nbest_file,
      output_null_log_odds_file,
      verbose=FLAGS.verbose_logging)

  if FLAGS.eval_script:
    eval_out = subprocess.check_output([
        sys.executable, FLAGS.eval_script, FLAGS.predict_file,
        output_prediction_file
    ])
    scores = str(eval_out).strip()
    exact_match = float(scores.split(":")[1].split(",")[0])
    if version_2_with_negative:
      f1 = float(scores.split(":")[2].split(",")[0])
    else:
      f1 = float(scores.split(":")[2].split("}")[0])
    dllogging = input_meta_data['dllogging']
    dllogging.logger.log(step=(), data={"f1": f1},
                         verbosity=Verbosity.DEFAULT)
    dllogging.logger.log(step=(), data={"exact_match": exact_match},
                         verbosity=Verbosity.DEFAULT)
    print(str(eval_out))
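The string slicing around eval_out assumes the SQuAD eval script prints a dict such as {"exact_match": ..., "f1": ...} on stdout. A toy check of that parsing under an assumed v1.1-style output:

# Illustrative only: an assumed evaluate-v1.1 style stdout line.
eval_out = b'{"exact_match": 81.0, "f1": 88.5}'
scores = str(eval_out).strip()
exact_match = float(scores.split(":")[1].split(",")[0])  # -> 81.0
f1 = float(scores.split(":")[2].split("}")[0])           # -> 88.5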
def train_squad(strategy,
                input_meta_data,
                custom_callbacks=None,
                run_eagerly=False):
  """Runs BERT SQuAD training."""
  if strategy:
    logging.info('Training using customized training loop with distribution'
                 ' strategy.')

  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)

  use_float16 = common_flags.use_float16()
  if use_float16:
    tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

  bert_config = MODEL_CLASSES[FLAGS.model_type][0].from_json_file(
      FLAGS.bert_config_file)
  epochs = FLAGS.num_train_epochs
  num_train_examples = input_meta_data['train_data_size']
  max_seq_length = input_meta_data['max_seq_length']
  global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
  if FLAGS.use_horovod:
    global_batch_size *= hvd.size()
  steps_per_epoch = int(num_train_examples / global_batch_size)
  warmup_steps = int(epochs * num_train_examples * 0.1 / global_batch_size)
  train_input_fn = get_dataset_fn(
      FLAGS.train_data_path,
      max_seq_length,
      FLAGS.train_batch_size,
      is_training=True,
      use_horovod=FLAGS.use_horovod)

  if FLAGS.benchmark:
    steps_per_epoch = 800
    epochs = 1

  def _get_squad_model():
    """Gets the SQuAD model and optimizer."""
    squad_model, core_model = bert_models.squad_model(
        bert_config,
        max_seq_length,
        float_type=tf.float16 if FLAGS.use_fp16 else tf.float32,
        hub_module_url=FLAGS.hub_module_url)
    learning_rate = (FLAGS.learning_rate * hvd.size()
                     if FLAGS.use_horovod else FLAGS.learning_rate)
    squad_model.optimizer = optimization.create_optimizer(
        learning_rate, steps_per_epoch * epochs, warmup_steps,
        FLAGS.optimizer_type)
    if FLAGS.use_fp16:
      squad_model.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
          squad_model.optimizer, dynamic=True)
    return squad_model, core_model

  # The original BERT model does not scale the loss by
  # 1/num_replicas_in_sync. It could be an accident. So, in order to use the
  # same hyperparameters, we do the same thing here by keeping each replica
  # loss as it is.
  loss_fn = get_loss_fn(
      loss_factor=1.0 / strategy.num_replicas_in_sync
      if FLAGS.scale_loss and strategy else 1.0)

  params = {'dllogging': input_meta_data['dllogging'], 'FLAGS': FLAGS}

  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_squad_model,
      loss_fn=loss_fn,
      model_dir=FLAGS.model_dir,
      steps_per_epoch=steps_per_epoch,
      num_accumulative_step=FLAGS.num_accumulation_steps,
      steps_per_loop=FLAGS.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      init_checkpoint=FLAGS.init_checkpoint,
      hvd=hvd if FLAGS.use_horovod else None,
      run_eagerly=run_eagerly,
      custom_callbacks=custom_callbacks,
      params=params)
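Running this variant across GPUs presumes standard Horovod bootstrapping before train_squad is called. A typical sketch (assumed; the excerpt does not show main()):

import horovod.tensorflow as hvd
import tensorflow as tf

# Assumed bootstrapping in main(): initialize Horovod, then pin each
# process to a single GPU before building the model or dataset.
hvd.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')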