def _run_and_report_benchmark(self, force_gpu_memory_alloc=True):
    """Runs BERT pretraining once and reports wall time and throughput.

    Args:
      force_gpu_memory_alloc: When true, each physical GPU is given a virtual
        device configuration with a fixed memory limit before the run starts.
    """
    if force_gpu_memory_alloc:
        # Pin the GPU memory allocation so that we always take the same
        # amount of GPU memory as when running in Cloud (see b/151435951).
        physical_gpus = tf.config.experimental.list_physical_devices("GPU")
        if physical_gpus:
            try:
                for gpu in physical_gpus:
                    virtual_config = (
                        tf.config.experimental.VirtualDeviceConfiguration(
                            memory_limit=14700))
                    tf.config.experimental.set_virtual_device_configuration(
                        gpu, [virtual_config])
                logical_gpus = tf.config.experimental.list_logical_devices(
                    "GPU")
                print(len(physical_gpus), "Physical GPUs,",
                      len(logical_gpus), "Logical GPUs")
            except RuntimeError as err:
                # Virtual devices must be set before GPUs have been
                # initialized.
                print(err)

    # Assemble the strategy arguments; only the TPU/GPU-specific ones differ.
    strategy_kwargs = {"distribution_strategy": FLAGS.distribution_strategy}
    if FLAGS.tpu:
        strategy_kwargs.update(tpu_address=FLAGS.tpu,
                               tpu_zone="europe-west4-a")
    else:
        strategy_kwargs.update(all_reduce_alg=FLAGS.all_reduce_alg,
                               num_gpus=FLAGS.num_gpus)
    strategy = distribution_utils.get_distribution_strategy(**strategy_kwargs)

    # Time the whole pretraining call.
    started_at = time.time()
    run_pretraining.run_bert_pretrain(strategy, [self.timer_callback])
    wall_time_sec = time.time() - started_at

    if self.timer_callback:
        examples_per_sec = self.timer_callback.get_examples_per_sec(
            FLAGS.train_batch_size * FLAGS.steps_per_loop)
    else:
        logging.error(
            "exp_per_second not calculated because timer_callback is missing")
        examples_per_sec = 0.0
    metrics = [{"name": "exp_per_second", "value": examples_per_sec}]

    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(iters=-1,
                          wall_time=wall_time_sec,
                          metrics=metrics,
                          extras={"flags": flags_str})
def _run_bert_classifier(self, callbacks=None, use_ds=True):
    """Starts BERT classification task.

    Args:
      callbacks: Forwarded to `run_classifier.run_bert_classifier` as
        `custom_callbacks`.
      use_ds: When no TPU is set, selects the 'mirrored' strategy if true and
        'off' otherwise.
    """
    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        input_meta_data = json.loads(meta_file.read().decode('utf-8'))

    bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Instance-level overrides win over flag / metadata derived values.
    epochs = self.num_epochs or FLAGS.num_train_epochs
    steps_per_epoch = self.num_steps_per_epoch or int(
        input_meta_data['train_data_size'] / FLAGS.train_batch_size)
    warmup_steps = int(epochs * steps_per_epoch * 0.1)
    eval_steps = int(
        math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

    if self.tpu:
        strategy = distribution_utils.get_distribution_strategy(
            distribution_strategy='tpu', tpu_address=self.tpu)
    else:
        ds_name = 'mirrored' if use_ds else 'off'
        strategy = distribution_utils.get_distribution_strategy(
            distribution_strategy=ds_name, num_gpus=self.num_gpus)

    steps_per_loop = 50
    max_seq_length = input_meta_data['max_seq_length']

    train_input_fn = run_classifier.get_dataset_fn(
        FLAGS.train_data_path,
        max_seq_length,
        FLAGS.train_batch_size,
        is_training=True)
    eval_input_fn = run_classifier.get_dataset_fn(
        FLAGS.eval_data_path,
        max_seq_length,
        FLAGS.eval_batch_size,
        is_training=False)

    run_classifier.run_bert_classifier(
        strategy,
        bert_config,
        input_meta_data,
        FLAGS.model_dir,
        epochs,
        steps_per_epoch,
        steps_per_loop,
        eval_steps,
        warmup_steps,
        FLAGS.learning_rate,
        FLAGS.init_checkpoint,
        train_input_fn,
        eval_input_fn,
        custom_callbacks=callbacks)
def main(_):
    """Builds the strategy and datasets from flags, then calls `run_bert`."""
    # Users should always run this script under TF 2.x
    tf.enable_v2_behavior()

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        raw_meta = meta_file.read()
    input_meta_data = json.loads(raw_meta.decode('utf-8'))

    # Fall back to a fixed directory when no model directory was given.
    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert20/'

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)

    max_seq_length = input_meta_data['max_seq_length']
    train_input_fn = get_dataset_fn(
        FLAGS.train_data_path,
        max_seq_length,
        FLAGS.train_batch_size,
        is_training=True)
    eval_input_fn = get_dataset_fn(
        FLAGS.eval_data_path,
        max_seq_length,
        FLAGS.eval_batch_size,
        is_training=False)

    bert_config = bert_configs.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    run_bert(strategy, input_meta_data, bert_config, train_input_fn,
             eval_input_fn)
def __init__(self, strategy_type=None, strategy_config=None):
    """Configures the cluster and creates the distribution strategy.

    Args:
      strategy_type: Passed through as the `distribution_strategy` argument
        of `distribution_utils.get_distribution_strategy`.
      strategy_config: Object whose `worker_hosts`, `task_index`, `num_gpus`,
        `all_reduce_alg`, `num_packs` and `tpu` attributes are read here.
        NOTE(review): the default is None, yet the attributes are read
        unconditionally -- confirm that callers always supply a config.
    """
    config = strategy_config
    distribution_utils.configure_cluster(config.worker_hosts,
                                         config.task_index)
    strategy_kwargs = dict(
        distribution_strategy=strategy_type,
        num_gpus=config.num_gpus,
        all_reduce_alg=config.all_reduce_alg,
        num_packs=config.num_packs,
        tpu_address=config.tpu,
    )
    self._strategy = distribution_utils.get_distribution_strategy(
        **strategy_kwargs)
def _get_distribution_strategy(self, ds_type='mirrored'):
    """Gets the distribution strategy.

    Args:
      ds_type: String, the distribution strategy type to be used. Can be
        'mirrored', 'multi_worker_mirrored', 'tpu' and 'off'.

    Returns:
      The strategy object produced by
      `distribution_utils.get_distribution_strategy`.
    """
    # A configured TPU address takes precedence over the requested type.
    wants_tpu = self.tpu or ds_type == 'tpu'
    if wants_tpu:
        return distribution_utils.get_distribution_strategy(
            distribution_strategy='tpu', tpu_address=self.tpu)

    if ds_type == 'multi_worker_mirrored':
        # Configures cluster spec for multi-worker distribution strategy.
        distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                             FLAGS.task_index)

    return distribution_utils.get_distribution_strategy(
        distribution_strategy=ds_type,
        num_gpus=self.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg)
def main(_):
    """Dispatches to SQuAD export, training and/or prediction by `FLAGS.mode`."""
    # Users should always run this script under TF 2.x
    tf.enable_v2_behavior()

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        raw_meta = meta_file.read()
    input_meta_data = json.loads(raw_meta.decode('utf-8'))

    # Export needs no distribution strategy, so handle it first and stop.
    if FLAGS.mode == 'export_only':
        export_squad(FLAGS.model_export_path, input_meta_data)
        return

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                             FLAGS.task_index)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)

    training_modes = ('train', 'train_and_predict')
    prediction_modes = ('predict', 'train_and_predict')

    if FLAGS.mode in training_modes:
        train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
    if FLAGS.mode in prediction_modes:
        predict_squad(strategy, input_meta_data)
def run(flags_obj):
    """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

    Args:
      flags_obj: An object containing parsed flag values.

    Raises:
      ValueError: If the resolved dtype compares equal to 'fp16', which is
        not currently supported.

    Returns:
      The training and eval stats produced by `common.build_stats`.
    """
    # Local import so this function stays self-contained regardless of the
    # module-level import block.
    import contextlib

    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads)
    common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    # NOTE(review): this compares the helper's result with the string 'fp16';
    # confirm the helper's return type makes the comparison meaningful.
    if dtype == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    if strategy:
        # flags_obj.enable_get_next_as_optional controls whether enabling
        # get_next_as_optional behavior in DistributedIterator. If true, last
        # partial batch can be supported.
        strategy.extended.experimental_enable_get_next_as_optional = (
            flags_obj.enable_get_next_as_optional)

    strategy_scope = distribution_utils.get_strategy_scope(strategy)

    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=cifar_preprocessing.HEIGHT,
            width=cifar_preprocessing.WIDTH,
            num_channels=cifar_preprocessing.NUM_CHANNELS,
            num_classes=cifar_preprocessing.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj),
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = cifar_preprocessing.input_fn

    train_input_dataset = input_fn(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=cifar_preprocessing.parse_record,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        dtype=dtype,
        # Setting drop_remainder to avoid the partial batch logic in
        # normalization layer, which triggers tf.where and leads to extra
        # memory copy of input sizes between host and GPU.
        drop_remainder=(not flags_obj.enable_get_next_as_optional))

    eval_input_dataset = None
    if not flags_obj.skip_eval:
        eval_input_dataset = input_fn(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=cifar_preprocessing.parse_record)

    steps_per_epoch = (cifar_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)

    # Constant learning rate by default; a piecewise-constant schedule built
    # from LR_SCHEDULE when the tensor learning rate is requested.
    lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        initial_learning_rate = (
            common.BASE_LEARNING_RATE * flags_obj.batch_size / 128)
        lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=[p[1] * steps_per_epoch for p in LR_SCHEDULE],
            values=[initial_learning_rate] +
            [p[0] * initial_learning_rate for p in LR_SCHEDULE])

    with strategy_scope:
        optimizer = common.get_optimizer(lr_schedule)
        model = resnet_cifar_model.resnet56(
            classes=cifar_preprocessing.NUM_CLASSES)

        compile_kwargs = dict(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=(['sparse_categorical_accuracy']
                     if flags_obj.report_accuracy_metrics else None),
            run_eagerly=flags_obj.run_eagerly)
        # TODO(b/138957587): Remove when force_v2_in_keras_compile is no
        # longer a valid arg for this model. Also remove as a valid flag.
        if flags_obj.force_v2_in_keras_compile is not None:
            compile_kwargs['experimental_run_tf_function'] = (
                flags_obj.force_v2_in_keras_compile)
        model.compile(**compile_kwargs)

    train_epochs = flags_obj.train_epochs

    callbacks = common.get_callbacks(steps_per_epoch)

    if not flags_obj.use_tensor_lr:
        lr_callback = LearningRateBatchScheduler(
            schedule=learning_rate_schedule,
            batch_size=flags_obj.batch_size,
            steps_per_epoch=steps_per_epoch)
        callbacks.append(lr_callback)

    # If multiple epochs, ignore the train_steps flag.
    if train_epochs <= 1 and flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
        train_epochs = 1

    num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase
            # when not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    # The ExitStack guarantees that the optional device scope is left through
    # the regular context-manager protocol, including when fit/evaluate raise.
    with contextlib.ExitStack() as device_stack:
        if not strategy and flags_obj.explicit_gpu_placement:
            # TODO(b/135607227): Add device scope automatically in Keras
            # training loop when not using distribution strategy.
            device_stack.enter_context(tf.device('/device:GPU:0'))

        history = model.fit(train_input_dataset,
                            epochs=train_epochs,
                            steps_per_epoch=steps_per_epoch,
                            callbacks=callbacks,
                            validation_steps=num_eval_steps,
                            validation_data=validation_data,
                            validation_freq=flags_obj.epochs_between_evals,
                            verbose=2)
        eval_output = None
        if not flags_obj.skip_eval:
            eval_output = model.evaluate(eval_input_dataset,
                                         steps=num_eval_steps,
                                         verbose=2)

    stats = common.build_stats(history, eval_output, callbacks)
    return stats