def _get_squad_model():
  """Get Squad model and optimizer."""
  squad_model, core_model = bert_models.squad_model(
      bert_config,
      max_seq_length,
      float_type=tf.float16 if use_float16 else tf.float32,
      hub_module_url=FLAGS.hub_module_url)
  squad_model.optimizer = optimization.create_optimizer(
      FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
  if use_float16:
    # Wraps the optimizer with a LossScaleOptimizer. This is done
    # automatically in compile() with the "mixed_float16" policy, but since
    # we do not call compile(), we must wrap the optimizer manually.
    squad_model.optimizer = (
        tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            squad_model.optimizer, loss_scale=common_flags.get_loss_scale()))
  if FLAGS.fp16_implementation == 'graph_rewrite':
    # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
    # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32',
    # which will ensure tf.compat.v2.keras.mixed_precision and
    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
    # double up.
    squad_model.optimizer = (
        tf.train.experimental.enable_mixed_precision_graph_rewrite(
            squad_model.optimizer))
  return squad_model, core_model
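
# An illustrative, self-contained sketch (not part of the original script)
# of why the manual LossScaleOptimizer wrapping above is needed: with a
# custom training loop, the loss must be scaled before backprop and the
# gradients unscaled before apply_gradients(). The toy model and data below
# are hypothetical; this uses the stable (non-experimental)
# LossScaleOptimizer API available in TF >= 2.4.
def _demo_manual_loss_scaling():
  import tensorflow as tf

  toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
      tf.keras.optimizers.SGD(0.01), dynamic=True)

  @tf.function
  def train_step(x, y):
    with tf.GradientTape() as tape:
      loss = tf.reduce_mean(tf.square(toy_model(x) - y))
      scaled_loss = optimizer.get_scaled_loss(loss)  # scale up before backprop
    scaled_grads = tape.gradient(scaled_loss, toy_model.trainable_variables)
    grads = optimizer.get_unscaled_gradients(scaled_grads)  # undo the scaling
    optimizer.apply_gradients(zip(grads, toy_model.trainable_variables))
    return loss

  return train_step(tf.ones([4, 8]), tf.ones([4, 1]))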
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
  """Make predictions using a Bert-based squad model."""
  primary_cpu_task = '/job:worker' if FLAGS.tpu else ''

  with tf.device(primary_cpu_task):
    predict_dataset = input_pipeline.create_squad_dataset(
        predict_tfrecord_path,
        input_meta_data['max_seq_length'],
        FLAGS.predict_batch_size,
        is_training=False)
    predict_iterator = iter(
        strategy.experimental_distribute_dataset(predict_dataset))

    with strategy.scope():
      # Prediction always uses float32, even if training uses mixed
      # precision.
      tf.keras.mixed_precision.experimental.set_policy('float32')
      squad_model, _ = bert_models.squad_model(
          bert_config,
          input_meta_data['max_seq_length'],
          float_type=tf.float32)

    checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    logging.info('Restoring checkpoints from %s', checkpoint_path)
    checkpoint = tf.train.Checkpoint(model=squad_model)
    checkpoint.restore(checkpoint_path).expect_partial()

    @tf.function
    def predict_step(iterator):
      """Predicts on distributed devices."""

      def _replicated_step(inputs):
        """Replicated prediction calculation."""
        x, _ = inputs
        unique_ids, start_logits, end_logits = squad_model(x, training=False)
        return dict(
            unique_ids=unique_ids,
            start_logits=start_logits,
            end_logits=end_logits)

      outputs = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
      return tf.nest.map_structure(strategy.experimental_local_results,
                                   outputs)

    all_results = []
    for _ in range(num_steps):
      predictions = predict_step(predict_iterator)
      for result in get_raw_results(predictions):
        all_results.append(result)
      if len(all_results) % 100 == 0:
        logging.info('Made predictions for %d records.', len(all_results))
    return all_results
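
# An illustrative, self-contained sketch (not part of the original script)
# of the pattern used above: distribute a dataset, run one tf.function
# prediction step per batch, and unwrap the per-replica results. Everything
# here is hypothetical (toy model, toy data); `strategy.run` is the current
# name for the older `experimental_run_v2` used above.
def _demo_distributed_predict():
  import numpy as np
  import tensorflow as tf

  strategy = tf.distribute.MirroredStrategy()
  with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(2)])

  dataset = tf.data.Dataset.from_tensor_slices(
      np.ones((8, 4), dtype=np.float32)).batch(4)
  iterator = iter(strategy.experimental_distribute_dataset(dataset))

  @tf.function
  def predict_step(it):
    def _replicated_step(x):
      return model(x, training=False)
    outputs = strategy.run(_replicated_step, args=(next(it),))
    # Unwrap PerReplica values into a tuple of per-device tensors.
    return strategy.experimental_local_results(outputs)

  results = []
  for _ in range(2):  # two batches of four examples each
    results.extend(predict_step(iterator))
  return results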
def _get_squad_model():
  """Get Squad model and optimizer."""
  squad_model, core_model = bert_models.squad_model(
      bert_config,
      max_seq_length,
      float_type=tf.float16 if FLAGS.use_fp16 else tf.float32,
      hub_module_url=FLAGS.hub_module_url)
  # Scale the base learning rate by the number of Horovod workers (linear
  # scaling rule) when training is distributed.
  learning_rate = (
      FLAGS.learning_rate * hvd.size()
      if FLAGS.use_horovod else FLAGS.learning_rate)
  squad_model.optimizer = optimization.create_optimizer(
      learning_rate, steps_per_epoch * epochs, warmup_steps,
      FLAGS.optimizer_type)
  if FLAGS.use_fp16:
    # Wrap the optimizer with a dynamic-loss-scale LossScaleOptimizer for
    # fp16 training with a custom training loop.
    squad_model.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
        squad_model.optimizer, dynamic=True)
  return squad_model, core_model
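
# An illustrative sketch (not part of the original script) of the linear
# learning-rate scaling applied in the Horovod branch above: with N workers
# each seeing the same per-worker batch size, the effective global batch
# grows by N, so the base rate is multiplied by hvd.size(). The helper name
# and default values below are hypothetical.
def _demo_linear_lr_scaling(base_lr=5e-5, num_workers=8):
  # Mirrors `FLAGS.learning_rate * hvd.size()` from _get_squad_model;
  # e.g. 5e-5 with 8 workers yields 4e-4.
  return base_lr * num_workers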
def export_squad(model_export_path, input_meta_data):
  """Exports a trained model as a `SavedModel` for inference.

  Args:
    model_export_path: a string specifying the path to the SavedModel
      directory.
    input_meta_data: dictionary containing meta data about input and model.

  Raises:
    ValueError: if model_export_path is an empty string or None.
  """
  if not model_export_path:
    raise ValueError('Export path is not specified: %s' % model_export_path)
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
  squad_model, _ = bert_models.squad_model(
      bert_config, input_meta_data['max_seq_length'], float_type=tf.float32)
  model_saving_utils.export_bert_model(
      model_export_path, model=squad_model, checkpoint_dir=FLAGS.model_dir)
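
# An illustrative sketch (not part of the original script) of consuming the
# export: the SavedModel can be reloaded for serving without any of the
# training code. The path and the use of the default serving signature are
# assumptions, not guarantees of this exporter.
def _demo_load_saved_model(export_dir="/tmp/squad_export"):
  import tensorflow as tf

  loaded = tf.saved_model.load(export_dir)
  # Keras-exported SavedModels typically expose a "serving_default"
  # signature; adjust if the export used a custom signature name.
  infer = loaded.signatures["serving_default"]
  return infer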
def export_squad(model_export_path, input_meta_data):
  """Exports a trained model as a `SavedModel` for inference.

  Args:
    model_export_path: a string specifying the path to the SavedModel
      directory.
    input_meta_data: dictionary containing meta data about input and model.

  Raises:
    ValueError: if model_export_path is an empty string or None.
  """
  if not model_export_path:
    raise ValueError('Export path is not specified: %s' % model_export_path)
  bert_config = MODEL_CLASSES[FLAGS.model_type][0].from_json_file(
      FLAGS.bert_config_file)
  squad_model, _ = bert_models.squad_model(
      bert_config, input_meta_data['max_seq_length'], float_type=tf.float32)
  model_saving_utils.export_bert_model(
      model_export_path + '/savedmodel',
      model=squad_model,
      checkpoint_dir=FLAGS.model_dir)

  model_name = FLAGS.triton_model_name

  model_folder = model_export_path + "/triton_models/" + model_name
  version_folder = model_folder + "/" + str(FLAGS.triton_model_version)
  final_model_folder = version_folder + "/model.savedmodel"

  if not os.path.exists(version_folder):
    os.makedirs(version_folder)

  if not os.path.exists(final_model_folder):
    os.rename(model_export_path + '/savedmodel', final_model_folder)
    print("Model saved to dir", final_model_folder)
  else:
    if FLAGS.triton_model_overwrite:
      shutil.rmtree(final_model_folder)
      os.rename(model_export_path + '/savedmodel', final_model_folder)
      print("WARNING: Existing model was overwritten. Model dir: {}".format(
          final_model_folder))
    else:
      print("ERROR: Could not save Triton model. Folder already exists. "
            "Use '--triton_model_overwrite=True' if you would like to "
            "overwrite an existing model. Model dir: {}".format(
                final_model_folder))
      return

  config_filename = os.path.join(model_folder, "config.pbtxt")
  if os.path.exists(config_filename) and not FLAGS.triton_model_overwrite:
    print("ERROR: Could not save Triton model config. Config file already "
          "exists. Use '--triton_model_overwrite=True' if you would like to "
          "overwrite an existing model config. Model config: {}".format(
              config_filename))
    return

  config_template = r"""
name: "{model_name}"
platform: "tensorflow_savedmodel"
max_batch_size: {max_batch_size}
input [
    {{
        name: "input_mask"
        data_type: TYPE_INT32
        dims: {seq_length}
    }},
    {{
        name: "input_type_ids"
        data_type: TYPE_INT32
        dims: {seq_length}
    }},
    {{
        name: "input_word_ids"
        data_type: TYPE_INT32
        dims: {seq_length}
    }}
]
output [
    {{
        name: "end_positions"
        data_type: TYPE_FP32
        dims: {seq_length}
    }},
    {{
        name: "start_positions"
        data_type: TYPE_FP32
        dims: {seq_length}
    }}
]
{dynamic_batching}
instance_group [
    {{
        count: {engine_count}
        kind: KIND_GPU
        gpus: [{gpu_list}]
    }}
]"""

  batching_str = ""
  max_batch_size = FLAGS.triton_max_batch_size

  if FLAGS.triton_dyn_batching_delay > 0:
    # Use only full and half-full batches as preferred sizes.
    pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]

    batching_str = r"""
dynamic_batching {{
    preferred_batch_size: [{0}]
    max_queue_delay_microseconds: {1}
}}""".format(
        ", ".join([str(x) for x in pref_batch_size]),
        int(FLAGS.triton_dyn_batching_delay * 1000.0))

  config_values = {
      "model_name": model_name,
      "max_batch_size": max_batch_size,
      "seq_length": input_meta_data['max_seq_length'],
      "dynamic_batching": batching_str,
      "gpu_list": ", ".join([
          x.name.split(":")[-1]
          for x in tf.config.list_physical_devices('GPU')
      ]),
      "engine_count": FLAGS.triton_engine_count
  }

  with open(config_filename, "w") as file:
    final_config_str = config_template.format_map(config_values)
    file.write(final_config_str)
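
# An illustrative sketch (not part of the original script) of the model
# repository layout Triton Inference Server expects, which is exactly what
# the function above assembles:
#   <repo>/<model_name>/config.pbtxt
#   <repo>/<model_name>/<version>/model.savedmodel/
# The helper name below is hypothetical; it just recomputes the same paths
# with os.path.join.
def _demo_triton_paths(export_path, model_name, version):
  import os

  model_folder = os.path.join(export_path, "triton_models", model_name)
  version_folder = os.path.join(model_folder, str(version))
  final_model_folder = os.path.join(version_folder, "model.savedmodel")
  return model_folder, version_folder, final_model_folder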
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
  """Make predictions using a Bert-based squad model."""
  predict_dataset_fn = get_dataset_fn(
      predict_tfrecord_path,
      input_meta_data['max_seq_length'],
      FLAGS.predict_batch_size,
      is_training=False,
      use_horovod=False)
  if strategy:
    predict_iterator = iter(
        strategy.experimental_distribute_datasets_from_function(
            predict_dataset_fn))
  else:
    predict_iterator = iter(predict_dataset_fn())

  with distribution_utils.get_strategy_scope(strategy):
    squad_model, _ = bert_models.squad_model(
        bert_config,
        input_meta_data['max_seq_length'],
        float_type=tf.float16 if FLAGS.use_fp16 else tf.float32)

  if FLAGS.init_checkpoint:
    checkpoint = tf.train.Checkpoint(model=squad_model)
    checkpoint.restore(FLAGS.init_checkpoint).expect_partial()

  checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
  logging.info('Restoring checkpoints from %s', checkpoint_path)
  checkpoint = tf.train.Checkpoint(model=squad_model)
  checkpoint.restore(checkpoint_path).expect_partial()

  @tf.function
  def predict_step(iterator):
    """Predicts on distributed devices."""

    def _replicated_step(inputs):
      """Replicated prediction calculation."""
      x, _ = inputs
      unique_ids = x.pop('unique_ids')
      if FLAGS.benchmark:
        # Smuggle the start timestamp out through the unique_ids field so
        # the host can compute per-step latency after synchronization.
        t0 = tf.timestamp()
        unique_ids = t0
      start_logits, end_logits = squad_model(x, training=False)
      return dict(
          unique_ids=unique_ids,
          start_logits=start_logits,
          end_logits=end_logits)

    def tuple_fun(x):
      return (x,)

    if strategy:
      outputs = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
      map_func = strategy.experimental_local_results
    else:
      outputs = _replicated_step(next(iterator))
      map_func = tuple_fun
    return tf.nest.map_structure(map_func, outputs)

  all_results = []
  time_list = []
  eval_start_time = time.time()
  elapsed_secs = 0

  for step_idx in range(num_steps):
    predictions = predict_step(predict_iterator)
    if FLAGS.benchmark:
      # Pull a tensor to the CPU to force device synchronization before
      # reading the clock.
      t0 = predictions['unique_ids'][0]
      start_logits = predictions['start_logits'][0]
      start_logits.numpy()
      elapsed_secs = time.time() - t0.numpy()
      # Drop the first 4 (arbitrary) startup iterations from perf
      # measurements.
      if step_idx > 3:
        time_list.append(elapsed_secs)
      continue

    for result in get_raw_results(predictions):
      all_results.append(result)
    if len(all_results) % 100 == 0:
      logging.info('Made predictions for %d records.', len(all_results))

  eval_time_elapsed = time.time() - eval_start_time
  logging.info("-----------------------------")
  logging.info("Summary Inference Statistics")
  logging.info("Batch size = %d", FLAGS.predict_batch_size)
  logging.info("Sequence Length = %d", input_meta_data['max_seq_length'])
  logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
  logging.info("Total Inference Time = %0.2f for Sentences = %d",
               eval_time_elapsed, num_steps * FLAGS.predict_batch_size)

  if FLAGS.benchmark:
    eval_time_wo_overhead = sum(time_list)
    time_list.sort()
    num_sentences = (num_steps - 4) * FLAGS.predict_batch_size

    avg = np.mean(time_list)
    cf_50 = max(time_list[:int(len(time_list) * 0.50)])
    cf_90 = max(time_list[:int(len(time_list) * 0.90)])
    cf_95 = max(time_list[:int(len(time_list) * 0.95)])
    cf_99 = max(time_list[:int(len(time_list) * 0.99)])
    cf_100 = max(time_list[:int(len(time_list) * 1)])
    ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

    logging.info(
        "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
        eval_time_wo_overhead, (num_steps - 4) * FLAGS.predict_batch_size)
    logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
    logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
    logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
    logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
    logging.info("Latency Confidence Level 100 (ms) = %0.2f", cf_100 * 1000)
    logging.info("Latency Average (ms) = %0.2f", avg * 1000)
    logging.info("Throughput Average (sentences/sec) = %0.2f",
                 ss_sentences_per_second)
    dllogging = input_meta_data['dllogging']
    dllogging.logger.log(
        step=(),
        data={"throughput_val": ss_sentences_per_second},
        verbosity=Verbosity.DEFAULT)

  logging.info("-----------------------------")

  return all_results
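
# An illustrative sketch (not part of the original script) of the latency
# statistics above: each cf_XX takes the max of the fastest XX% of the
# sorted per-step times, a simple empirical percentile estimate. The helper
# name and sample data below are hypothetical; NumPy's interpolating
# percentile gives a close, though not identical, value.
def _demo_latency_percentiles(time_list):
  import numpy as np

  times = sorted(time_list)
  cf_95_manual = max(times[:int(len(times) * 0.95)])
  cf_95_numpy = np.percentile(times, 95)
  return cf_95_manual, cf_95_numpy

# e.g. _demo_latency_percentiles([0.011, 0.010, 0.012, 0.010, 0.013])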