def construct_estimator(num_gpus, model_dir, params, batch_size, eval_batch_size): """Construct either an Estimator or TPUEstimator for NCF. Args: num_gpus: The number of gpus (Used to select distribution strategy) model_dir: The model directory for the estimator params: The params dict for the estimator batch_size: The mini-batch size for training. eval_batch_size: The batch size used during evaluation. Returns: An Estimator or TPUEstimator. """ if params["use_tpu"]: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=params["tpu"], zone=params["tpu_zone"], project=params["tpu_gcp_project"], ) tpu_config = tf.contrib.tpu.TPUConfig( iterations_per_loop=100, num_shards=8) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=model_dir, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False), tpu_config=tpu_config) tpu_params = {k: v for k, v in params.items() if k != "batch_size"} train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=neumf_model.neumf_model_fn, use_tpu=True, train_batch_size=batch_size, params=tpu_params, config=run_config) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=neumf_model.neumf_model_fn, use_tpu=False, train_batch_size=1, predict_batch_size=eval_batch_size, params=tpu_params, config=run_config) return train_estimator, eval_estimator distribution = distribution_utils.get_distribution_strategy(num_gpus=num_gpus) run_config = tf.estimator.RunConfig(train_distribute=distribution) params["eval_batch_size"] = eval_batch_size estimator = tf.estimator.Estimator(model_fn=neumf_model.neumf_model_fn, model_dir=model_dir, config=run_config, params=params) return estimator, estimator
def construct_estimator(model_dir, params): """Construct either an Estimator or TPUEstimator for NCF. Args: model_dir: The model directory for the estimator params: The params dict for the estimator Returns: An Estimator or TPUEstimator. """ if params["use_tpu"]: # Some of the networking libraries are quite chatty. for name in ["googleapiclient.discovery", "googleapiclient.discovery_cache", "oauth2client.transport"]: logging.getLogger(name).setLevel(logging.ERROR) tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=params["tpu"], zone=params["tpu_zone"], project=params["tpu_gcp_project"], coordinator_name="coordinator" ) tf.logging.info("Issuing reset command to TPU to ensure a clean state.") tf.Session.reset(tpu_cluster_resolver.get_master()) # Estimator looks at the master it connects to for MonitoredTrainingSession # by reading the `TF_CONFIG` environment variable, and the coordinator # is used by StreamingFilesDataset. tf_config_env = { "session_master": tpu_cluster_resolver.get_master(), "eval_session_master": tpu_cluster_resolver.get_master(), "coordinator": tpu_cluster_resolver.cluster_spec() .as_dict()["coordinator"] } os.environ['TF_CONFIG'] = json.dumps(tf_config_env) distribution = tf.contrib.distribute.TPUStrategy( tpu_cluster_resolver, steps_per_run=100) else: distribution = distribution_utils.get_distribution_strategy( num_gpus=params["num_gpus"]) run_config = tf.estimator.RunConfig(train_distribute=distribution, eval_distribute=distribution) model_fn = neumf_model.neumf_model_fn if params["use_xla_for_gpu"]: tf.logging.info("Using XLA for GPU for training and evaluation.") model_fn = xla.estimator_model_fn(model_fn) estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir, config=run_config, params=params) return estimator
def construct_estimator(flags_obj, params, schedule_manager): """Construct an estimator from either Estimator or TPUEstimator. Args: flags_obj: The FLAGS object parsed from command line. params: A dict of run specific parameters. schedule_manager: A schedule.Manager object containing the run schedule. Returns: An estimator object to be used for training and eval. """ if not params["use_tpu"]: distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_core.get_num_gpus(flags_obj), all_reduce_alg=flags_obj.all_reduce_alg) return tf.estimator.Estimator( model_fn=model_fn, model_dir=flags_obj.model_dir, params=params, config=tf.estimator.RunConfig(train_distribute=distribution_strategy)) tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=flags_obj.tpu, zone=flags_obj.tpu_zone, project=flags_obj.tpu_gcp_project ) tpu_config = tf.contrib.tpu.TPUConfig( iterations_per_loop=schedule_manager.single_iteration_train_steps, num_shards=flags_obj.num_tpu_shards) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=flags_obj.model_dir, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=True), tpu_config=tpu_config) return tf.contrib.tpu.TPUEstimator( model_fn=model_fn, use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL, train_batch_size=schedule_manager.batch_size, eval_batch_size=schedule_manager.batch_size, params={ # TPUEstimator needs to populate batch_size itself due to sharding. key: value for key, value in params.items() if key != "batch_size"}, config=run_config)
def get_distribution_strategy(params): """Returns the distribution strategy to use.""" if params["turn_off_distribution_strategy"]: return None if params["use_tpu"]: # Some of the networking libraries are quite chatty. for name in ["googleapiclient.discovery", "googleapiclient.discovery_cache", "oauth2client.transport"]: logging.getLogger(name).setLevel(logging.ERROR) tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( tpu=params["tpu"], zone=params["tpu_zone"], project=params["tpu_gcp_project"], coordinator_name="coordinator" ) logging.info("Issuing reset command to TPU to ensure a clean state.") tf.Session.reset(tpu_cluster_resolver.get_master()) # Estimator looks at the master it connects to for MonitoredTrainingSession # by reading the `TF_CONFIG` environment variable, and the coordinator # is used by StreamingFilesDataset. tf_config_env = { "session_master": tpu_cluster_resolver.get_master(), "eval_session_master": tpu_cluster_resolver.get_master(), "coordinator": tpu_cluster_resolver.cluster_spec() .as_dict()["coordinator"] } os.environ['TF_CONFIG'] = json.dumps(tf_config_env) distribution = tf.distribute.experimental.TPUStrategy( tpu_cluster_resolver, steps_per_run=100) else: distribution = distribution_utils.get_distribution_strategy( num_gpus=params["num_gpus"]) return distribution
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ if flags_obj.enable_eager: tf.enable_eager_execution() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError('dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) if flags_obj.use_synthetic_data: input_fn = keras_common.get_synth_input_fn( height=cifar_main.HEIGHT, width=cifar_main.WIDTH, num_channels=cifar_main.NUM_CHANNELS, num_classes=cifar_main.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj)) else: input_fn = cifar_main.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) strategy = distribution_utils.get_distribution_strategy( num_gpus=flags_obj.num_gpus, turn_off_distribution_strategy=flags_obj.turn_off_distribution_strategy) strategy_scope = keras_common.get_strategy_scope(strategy) with strategy_scope: optimizer = keras_common.get_optimizer() model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, cifar_main.NUM_IMAGES['train']) train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (cifar_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=[ time_callback, lr_callback, tensorboard_callback ], validation_steps=num_eval_steps, validation_data=validation_data, verbose=1) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=1) stats = keras_common.build_stats(history, eval_output, time_callback) return stats
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: common.set_gpu_thread_mode_and_count(flags_obj) if flags_obj.data_delay_prefetch: common.data_delay_prefetch() common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'float16': policy = tf.keras.mixed_precision.experimental.Policy( 'infer_float32_vars') tf.keras.mixed_precision.experimental.set_policy(policy) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # Configures cluster spec for distribution strategy. num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, num_workers=num_workers, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional) strategy_scope = distribution_utils.get_strategy_scope(strategy) # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = common.get_synth_input_fn( height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, num_channels=imagenet_preprocessing.NUM_CHANNELS, num_classes=imagenet_preprocessing.NUM_CLASSES, dtype=dtype, drop_remainder=True) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_preprocessing.input_fn # When `enable_xla` is True, we always drop the remainder of the batches # in the dataset, as XLA-GPU doesn't support dynamic shapes. drop_remainder = flags_obj.enable_xla train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=imagenet_preprocessing.parse_record, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, drop_remainder=drop_remainder, tf_data_experimental_slack=flags_obj.tf_data_experimental_slack, ) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=imagenet_preprocessing.parse_record, dtype=dtype, drop_remainder=drop_remainder) lr_schedule = 0.1 if flags_obj.use_tensor_lr: lr_schedule = common.PiecewiseConstantDecayWithWarmup( batch_size=flags_obj.batch_size, epoch_size=imagenet_preprocessing.NUM_IMAGES['train'], warmup_epochs=LR_SCHEDULE[0][1], boundaries=list(p[1] for p in LR_SCHEDULE[1:]), multipliers=list(p[0] for p in LR_SCHEDULE), compute_lr_on_cpu=True) with strategy_scope: optimizer = common.get_optimizer(lr_schedule) if dtype == 'float16': # TODO(reedwm): Remove manually wrapping optimizer once mixed precision # can be enabled with a single line of code. optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128)) if flags_obj.use_trivial_model: model = trivial_model.trivial_model( imagenet_preprocessing.NUM_CLASSES, dtype) else: model = resnet_model.resnet50( num_classes=imagenet_preprocessing.NUM_CLASSES, dtype=dtype) # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer # a valid arg for this model. Also remove as a valid flag. if flags_obj.force_v2_in_keras_compile is not None: model.compile( loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=(['sparse_categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly, experimental_run_tf_function=flags_obj. force_v2_in_keras_compile) else: model.compile( loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=(['sparse_categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly) callbacks = common.get_callbacks( learning_rate_schedule, imagenet_preprocessing.NUM_IMAGES['train']) train_steps = (imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None if not strategy and flags_obj.explicit_gpu_placement: # TODO(b/135607227): Add device scope automatically in Keras training loop # when not using distribition strategy. no_dist_strat_device = tf.device('/device:GPU:0') no_dist_strat_device.__enter__() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps // 15, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=1) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) if not strategy and flags_obj.explicit_gpu_placement: no_dist_strat_device.__exit__() stats = common.build_stats(history, eval_output, callbacks) return stats
def train_and_eval( params: base_configs.ExperimentConfig, strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]: """Runs the train and eval path using compile/fit.""" logging.info('Running train and eval.') # Note: for TPUs, strategy and scope should be created before the dataset strategy = strategy_override or distribution_utils.get_distribution_strategy( distribution_strategy=params.runtime.distribution_strategy, all_reduce_alg=params.runtime.all_reduce_alg, num_gpus=params.runtime.num_gpus, tpu_address=params.runtime.tpu) strategy_scope = distribution_utils.get_strategy_scope(strategy) logging.info('Detected %d devices.', strategy.num_replicas_in_sync if strategy else 1) label_smoothing = params.model.loss.label_smoothing one_hot = label_smoothing and label_smoothing > 0 builders = _get_dataset_builders(params, strategy, one_hot) datasets = [ builder.build(strategy) if builder else None for builder in builders ] # Unpack datasets and builders based on train/val/test splits train_builder, validation_builder = builders # pylint: disable=unbalanced-tuple-unpacking train_dataset, validation_dataset = datasets train_epochs = params.train.epochs train_steps = params.train.steps or train_builder.num_steps validation_steps = params.evaluation.steps or validation_builder.num_steps initialize(params, train_builder) logging.info('Global batch size: %d', train_builder.global_batch_size) with strategy_scope: model_params = params.model.model_params.as_dict() model = get_models()[params.model.name](**model_params) learning_rate = optimizer_factory.build_learning_rate( params=params.model.learning_rate, batch_size=train_builder.global_batch_size, train_steps=train_steps) optimizer = optimizer_factory.build_optimizer( optimizer_name=params.model.optimizer.name, base_learning_rate=learning_rate, params=params.model.optimizer.as_dict()) metrics_map = _get_metrics(one_hot) metrics = [metrics_map[metric] for metric in params.train.metrics] if one_hot: loss_obj = tf.keras.losses.CategoricalCrossentropy( label_smoothing=params.model.loss.label_smoothing) else: loss_obj = tf.keras.losses.SparseCategoricalCrossentropy() model.compile( optimizer=optimizer, loss=loss_obj, metrics=metrics, experimental_steps_per_execution=params.train.steps_per_loop) initial_epoch = 0 if params.train.resume_checkpoint: initial_epoch = resume_from_checkpoint(model=model, model_dir=params.model_dir, train_steps=train_steps) callbacks = custom_callbacks.get_callbacks( model_checkpoint=params.train.callbacks. enable_checkpoint_and_export, include_tensorboard=params.train.callbacks.enable_tensorboard, time_history=params.train.callbacks.enable_time_history, track_lr=params.train.tensorboard.track_lr, write_model_weights=params.train.tensorboard.write_model_weights, initial_step=initial_epoch * train_steps, batch_size=train_builder.global_batch_size, log_steps=params.train.time_history.log_steps, model_dir=params.model_dir) serialize_config(params=params, model_dir=params.model_dir) if params.evaluation.skip_eval: validation_kwargs = {} else: validation_kwargs = { 'validation_data': validation_dataset, 'validation_steps': validation_steps, 'validation_freq': params.evaluation.epochs_between_evals, } history = model.fit(train_dataset, epochs=train_epochs, steps_per_epoch=train_steps, initial_epoch=initial_epoch, callbacks=callbacks, verbose=2, **validation_kwargs) validation_output = None if not params.evaluation.skip_eval: validation_output = model.evaluate(validation_dataset, steps=validation_steps, verbose=2) # TODO(dankondratyuk): eval and save final test accuracy stats = common.build_stats(history, validation_output, callbacks) return stats
def custom_main(custom_callbacks=None): """Run classification or regression. Args: custom_callbacks: list of tf.keras.Callbacks passed to training loop. """ gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader: input_meta_data = json.loads(reader.read().decode('utf-8')) label_type = LABEL_TYPES_MAP[input_meta_data.get('label_type', 'int')] if not FLAGS.model_dir: FLAGS.model_dir = '/tmp/bert20/' bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.mode == 'export_only': export_classifier(FLAGS.model_export_path, input_meta_data, bert_config, FLAGS.model_dir) return strategy = distribution_utils.get_distribution_strategy( distribution_strategy=FLAGS.distribution_strategy, num_gpus=FLAGS.num_gpus, tpu_address=FLAGS.tpu) eval_input_fn = get_dataset_fn( FLAGS.eval_data_path, input_meta_data['max_seq_length'], FLAGS.eval_batch_size, is_training=False, label_type=label_type) if FLAGS.mode == 'predict': with strategy.scope(): classifier_model = bert_models.classifier_model( bert_config, input_meta_data['num_labels'])[0] checkpoint = tf.train.Checkpoint(model=classifier_model) latest_checkpoint_file = ( FLAGS.predict_checkpoint_path or tf.train.latest_checkpoint(FLAGS.model_dir)) assert latest_checkpoint_file logging.info('Checkpoint file %s found and restoring from ' 'checkpoint', latest_checkpoint_file) checkpoint.restore( latest_checkpoint_file).assert_existing_objects_matched() preds, _ = get_predictions_and_labels( strategy, classifier_model, eval_input_fn, return_probs=True) output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv') with tf.io.gfile.GFile(output_predict_file, 'w') as writer: logging.info('***** Predict results *****') for probabilities in preds: output_line = '\t'.join( str(class_probability) for class_probability in probabilities) + '\n' writer.write(output_line) return if FLAGS.mode != 'train_and_eval': raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode) train_input_fn = get_dataset_fn( FLAGS.train_data_path, input_meta_data['max_seq_length'], FLAGS.train_batch_size, is_training=True, label_type=label_type) run_bert( strategy, input_meta_data, bert_config, train_input_fn, eval_input_fn, custom_callbacks=custom_callbacks)
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ config = keras_common.get_config_proto() # TODO(tobyboyd): Remove eager flag when tf 1.0 testing ends. # Eager is default in tf 2.0 and should not be toggled if not keras_common.is_v2_0(): if flags_obj.enable_eager: tf.compat.v1.enable_eager_execution(config=config) else: sess = tf.Session(config=config) tf.keras.backend.set_session(sess) # TODO(haoyuzhang): Set config properly in TF2.0 when the config API is ready. dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError('dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = keras_common.get_synth_input_fn( height=cifar_main.HEIGHT, width=cifar_main.WIDTH, num_channels=cifar_main.NUM_CHANNELS, num_classes=cifar_main.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj)) else: distribution_utils.undo_set_up_synthetic_data() input_fn = cifar_main.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus) strategy_scope = distribution_utils.MaybeDistributionScope(strategy) with strategy_scope: optimizer = keras_common.get_optimizer() model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, cifar_main.NUM_IMAGES['train']) train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (cifar_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=[ time_callback, lr_callback, tensorboard_callback ], validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = keras_common.build_stats(history, eval_output, time_callback) return stats
def run_executor(params, mode, checkpoint_path=None, train_input_fn=None, eval_input_fn=None, callbacks=None, prebuilt_strategy=None): """Runs the object detection model on distribution strategy defined by the user.""" if params.architecture.use_bfloat16: policy = tf.compat.v2.keras.mixed_precision.experimental.Policy( 'mixed_bfloat16') tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy) model_builder = model_factory.model_generator(params) if prebuilt_strategy is not None: strategy = prebuilt_strategy else: strategy_config = params.strategy_config distribution_utils.configure_cluster(strategy_config.worker_hosts, strategy_config.task_index) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=params.strategy_type, num_gpus=strategy_config.num_gpus, all_reduce_alg=strategy_config.all_reduce_alg, num_packs=strategy_config.num_packs, tpu_address=strategy_config.tpu) num_workers = int(strategy.num_replicas_in_sync + 7) // 8 is_multi_host = (int(num_workers) >= 2) if mode == 'train': def _model_fn(params): return model_builder.build_model(params, mode=ModeKeys.TRAIN) logging.info( 'Train num_replicas_in_sync %d num_workers %d is_multi_host %s', strategy.num_replicas_in_sync, num_workers, is_multi_host) dist_executor = DetectionDistributedExecutor( strategy=strategy, params=params, model_fn=_model_fn, loss_fn=model_builder.build_loss_fn, is_multi_host=is_multi_host, predict_post_process_fn=model_builder.post_processing, trainable_variables_filter=model_builder. make_filter_trainable_variables_fn()) if is_multi_host: train_input_fn = functools.partial( train_input_fn, batch_size=params.train.batch_size // strategy.num_replicas_in_sync) return dist_executor.train( train_input_fn=train_input_fn, model_dir=params.model_dir, iterations_per_loop=params.train.iterations_per_loop, total_steps=params.train.total_steps, init_checkpoint=model_builder.make_restore_checkpoint_fn(), custom_callbacks=callbacks, save_config=True) elif mode == 'eval' or mode == 'eval_once': def _model_fn(params): return model_builder.build_model(params, mode=ModeKeys.PREDICT_WITH_GT) logging.info( 'Eval num_replicas_in_sync %d num_workers %d is_multi_host %s', strategy.num_replicas_in_sync, num_workers, is_multi_host) if is_multi_host: eval_input_fn = functools.partial( eval_input_fn, batch_size=params.eval.batch_size // strategy.num_replicas_in_sync) dist_executor = DetectionDistributedExecutor( strategy=strategy, params=params, model_fn=_model_fn, loss_fn=model_builder.build_loss_fn, is_multi_host=is_multi_host, predict_post_process_fn=model_builder.post_processing, trainable_variables_filter=model_builder. make_filter_trainable_variables_fn()) if mode == 'eval': results = dist_executor.evaluate_from_model_dir( model_dir=params.model_dir, eval_input_fn=eval_input_fn, eval_metric_fn=model_builder.eval_metrics, eval_timeout=params.eval.eval_timeout, min_eval_interval=params.eval.min_eval_interval, total_steps=params.train.total_steps) else: # Run evaluation once for a single checkpoint. if not checkpoint_path: raise ValueError('checkpoint_path cannot be empty.') if tf.io.gfile.isdir(checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) summary_writer = executor.SummaryWriter(params.model_dir, 'eval') results, _ = dist_executor.evaluate_checkpoint( checkpoint_path=checkpoint_path, eval_input_fn=eval_input_fn, eval_metric_fn=model_builder.eval_metrics, summary_writer=summary_writer) for k, v in results.items(): logging.info('Final eval metric %s: %f', k, v) return results else: raise ValueError('Mode not found: %s.' % mode)
def construct_estimator(num_gpus, model_dir, iterations, params, batch_size, eval_batch_size): """Construct either an Estimator or TPUEstimator for NCF. Args: num_gpus: The number of gpus (Used to select distribution strategy) model_dir: The model directory for the estimator iterations: Estimator iterations params: The params dict for the estimator batch_size: The mini-batch size for training. eval_batch_size: The batch size used during evaluation. Returns: An Estimator or TPUEstimator. """ if params["use_tpu"]: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=params["tpu"], zone=params["tpu_zone"], project=params["tpu_gcp_project"], ) tf.logging.info("Issuing reset command to TPU to ensure a clean state.") tf.Session.reset(tpu_cluster_resolver.get_master()) tpu_config = tf.contrib.tpu.TPUConfig( iterations_per_loop=iterations, num_shards=8) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=model_dir, save_checkpoints_secs=600, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False), tpu_config=tpu_config) tpu_params = {k: v for k, v in params.items() if k != "batch_size"} train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=neumf_model.neumf_model_fn, use_tpu=True, train_batch_size=batch_size, eval_batch_size=eval_batch_size, params=tpu_params, config=run_config) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=neumf_model.neumf_model_fn, use_tpu=True, train_batch_size=1, eval_batch_size=eval_batch_size, params=tpu_params, config=run_config) return train_estimator, eval_estimator distribution = distribution_utils.get_distribution_strategy(num_gpus=num_gpus) run_config = tf.estimator.RunConfig(train_distribute=distribution, eval_distribute=distribution) params["eval_batch_size"] = eval_batch_size model_fn = neumf_model.neumf_model_fn if params["use_xla_for_gpu"]: tf.logging.info("Using XLA for GPU for training and evaluation.") model_fn = xla.estimator_model_fn(model_fn) estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir, config=run_config, params=params) return estimator, estimator
def run_deep_speech(_): """Run deep speech training and eval loop.""" tf.compat.v1.set_random_seed(flags_obj.seed) # Data preprocessing tf.compat.v1.logging.info("Data preprocessing...") train_speech_dataset = generate_dataset(flags_obj.train_data_dir) eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir) # Number of label classes. Label string is "[a-z]' -" num_classes = len(train_speech_dataset.speech_labels) # Use distribution strategy for multi-gpu training num_gpus = flags_core.get_num_gpus(flags_obj) distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus=num_gpus) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy) estimator = tf.estimator.Estimator( model_fn=model_fn, model_dir=flags_obj.model_dir, config=run_config, params={ "num_classes": num_classes, } ) # Benchmark logging run_params = { "batch_size": flags_obj.batch_size, "train_epochs": flags_obj.train_epochs, "rnn_hidden_size": flags_obj.rnn_hidden_size, "rnn_hidden_layers": flags_obj.rnn_hidden_layers, "rnn_type": flags_obj.rnn_type, "is_bidirectional": flags_obj.is_bidirectional, "use_bias": flags_obj.use_bias } dataset_name = "LibriSpeech" benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info("deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) per_replica_batch_size = distribution_utils.per_replica_batch_size( flags_obj.batch_size, num_gpus) def input_fn_train(): return dataset.input_fn( per_replica_batch_size, train_speech_dataset) def input_fn_eval(): return dataset.input_fn( per_replica_batch_size, eval_speech_dataset) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1, total_training_cycle) # Perform batch_wise dataset shuffling train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle( train_speech_dataset.entries, cycle_index, flags_obj.sortagrad, flags_obj.batch_size) estimator.train(input_fn=input_fn_train, hooks=train_hooks) # Evaluation tf.logging.info("Starting to evaluate...") eval_results = evaluate_model( estimator, eval_speech_dataset.speech_labels, eval_speech_dataset.entries, input_fn_eval) # Log the WER and CER results. benchmark_logger.log_evaluation_result(eval_results) tf.logging.info( "Iteration {}: WER = {:.2f}, CER = {:.2f}".format( cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY])) # If some evaluation threshold is met if model_helpers.past_stop_threshold( flags_obj.wer_threshold, eval_results[_WER_KEY]): break
def resnet_main(flags_obj, model_function, input_function, dataset_name, percent, model_class, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. Returns: Dict of results of the run. Contains the keys `eval_results` and `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5. `train_hooks` is a list the instances of hooks used during training. """ model_helpers.apply_clean(flags.FLAGS) # Ensures flag override logic is only executed if explicitly triggered. if flags_obj.tf_gpu_thread_mode: override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) # Configures cluster spec for distribution strategy. num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) # Creates session config. allow_soft_placement = True, is required for # multi-GPU and is not harmful for other modes. session_config = tf.compat.v1.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_core.get_num_gpus(flags_obj), num_workers=num_workers, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) # Creates a `RunConfig` that checkpoints every 24 hours which essentially # results in checkpoints determined only by `epochs_between_evals`. run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config, save_checkpoints_secs=60 * 60 * 24, save_checkpoints_steps=None) # Initializes model with all but the dense layer from pretrained ResNet. if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None params = { 'resnet_size': int(flags_obj.resnet_size), 'data_format': 'channels_last', 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj, default_for_fp16=128), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune, 'num_workers': num_workers, 'adv_train': False, 'attack': False, } classifier = tf.compat.v1.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params=params) params['adv_train'] = True classifier_adv = tf.compat.v1.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params=params) params['adv_train'] = False params['attack'] = True classifier_attack = tf.compat.v1.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params=params) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, 'num_workers': num_workers, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs, input_context=None): return input_function( is_training=True, percent=percent, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_replica_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, dtype=flags_core.get_tf_dtype(flags_obj), datasets_num_private_threads=flags_obj. datasets_num_private_threads, input_context=input_context) def input_fn_eval(): return input_function( is_training=False, percent=0, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_replica_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, dtype=flags_core.get_tf_dtype(flags_obj)) def input_fn_eval_attack(): return input_function( is_training=False, percent=100, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_replica_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, dtype=flags_core.get_tf_dtype(flags_obj)) train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else flags_obj.train_epochs) # tf.compat.v1.logging.info(tf.global_variables()) use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1 if use_train_and_evaluate: train_spec = tf.estimator.TrainSpec( input_fn=lambda input_context=None: input_fn_train( train_epochs, input_context=input_context), hooks=train_hooks, max_steps=flags_obj.max_train_steps) eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval) tf.compat.v1.logging.info('Starting to train and evaluate.') tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec) # tf.estimator.train_and_evalute doesn't return anything in multi-worker # case. eval_results = {} else: if train_epochs == 0: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals) schedule = [ flags_obj.epochs_between_evals for _ in range(int(n_loops)) ] schedule[-1] = train_epochs - sum(schedule[:-1]) # over counting. for cycle_index, num_train_epochs in enumerate(schedule): tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) if num_train_epochs: # Since we are calling classifier.train immediately in each loop, the # value of num_train_epochs in the lambda function will not be changed # before it is used. So it is safe to ignore the pylint error here # pylint: disable=cell-var-from-loop if flags_obj.adv_train: classifier_adv.train( input_fn=lambda input_context=None: input_fn_train( num_train_epochs, input_context=input_context), hooks=train_hooks, max_steps=flags_obj.max_train_steps) else: classifier.train( input_fn=lambda input_context=None: input_fn_train( num_train_epochs, input_context=input_context), hooks=train_hooks, max_steps=flags_obj.max_train_steps) # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, # which will iterate forever. Passing steps=flags_obj.max_train_steps # allows the eval (which is generally unimportant in those circumstances) # to terminate. Note that eval will run for max_train_steps each loop, # regardless of the global_step count. tf.compat.v1.logging.info('Starting to evaluate clean.') eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) tf.compat.v1.logging.info('Starting to evaluate adv.') eval_results_adv = classifier_adv.evaluate( input_fn=input_fn_eval, steps=flags_obj.max_train_steps) tf.compat.v1.logging.info('Starting to evaluate attack.') eval_results_attack = classifier_attack.evaluate( input_fn=input_fn_eval_attack, steps=flags_obj.max_train_steps) print( '########################## clean #############################' ) benchmark_logger.log_evaluation_result(eval_results) print( '########################## adv #############################') benchmark_logger.log_evaluation_result(eval_results_adv) print( '########################## attack #############################' ) benchmark_logger.log_evaluation_result(eval_results_attack) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. export_dtype = flags_core.get_tf_dtype(flags_obj) if flags_obj.image_bytes_as_serving_input: input_receiver_fn = functools.partial(image_bytes_serving_input_fn, shape, dtype=export_dtype) else: input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size, dtype=export_dtype) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True) stats = {} stats['eval_results'] = eval_results stats['eval_atttack_results'] = eval_results_attack stats['eval_adv_results'] = eval_results_adv stats['train_hooks'] = train_hooks return stats
def run(flags_obj): """Run ResNet ImageNet training and eval loop using custom training loops. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_xla=flags_obj.enable_xla) performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj)) if tf.config.list_physical_devices('GPU'): if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj. datasets_num_private_threads) common.set_cudnn_batchnorm_mode() # TODO(anj-s): Set data_format without using Keras. data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.config.list_physical_devices('GPU') else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations( flags_obj) steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps) logging.info( 'Training %d epochs, each epoch has %d steps, ' 'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps, train_epochs * per_epoch_steps, eval_steps) time_callback = keras_utils.TimeHistory( flags_obj.batch_size, flags_obj.log_steps, logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None) with distribution_utils.get_strategy_scope(strategy): runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback, per_epoch_steps) eval_interval = flags_obj.epochs_between_evals * per_epoch_steps checkpoint_interval = (per_epoch_steps if flags_obj.enable_checkpoint_and_export else None) summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None checkpoint_manager = tf.train.CheckpointManager( runnable.checkpoint, directory=flags_obj.model_dir, max_to_keep=10, step_counter=runnable.global_step, checkpoint_interval=checkpoint_interval) resnet_controller = controller.Controller( strategy, runnable.train, runnable.evaluate if not flags_obj.skip_eval else None, global_step=runnable.global_step, steps_per_loop=steps_per_loop, train_steps=per_epoch_steps * train_epochs, checkpoint_manager=checkpoint_manager, summary_interval=summary_interval, eval_steps=eval_steps, eval_interval=eval_interval) time_callback.on_train_begin() resnet_controller.train(evaluate=not flags_obj.skip_eval) time_callback.on_train_end() stats = build_stats(runnable, time_callback) return stats
def __init__(self, flags_obj): """Init function of TransformerMain. Args: flags_obj: Object containing parsed flag values, i.e., FLAGS. Raises: ValueError: if not using static batch for input data on TPU. """ self.flags_obj = flags_obj self.predict_model = None # Add flag-defined parameters to params object num_gpus = flags_core.get_num_gpus(flags_obj) self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus) params["num_gpus"] = num_gpus params["use_ctl"] = flags_obj.use_ctl params["data_dir"] = flags_obj.data_dir params["model_dir"] = flags_obj.model_dir params["static_batch"] = flags_obj.static_batch params["max_length"] = flags_obj.max_length params["decode_batch_size"] = flags_obj.decode_batch_size params["decode_max_length"] = flags_obj.decode_max_length params["padded_decode"] = flags_obj.padded_decode params["num_parallel_calls"] = ( flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE) params["use_synthetic_data"] = flags_obj.use_synthetic_data params["batch_size"] = flags_obj.batch_size or params["default_batch_size"] params["repeat_dataset"] = None params["dtype"] = flags_core.get_tf_dtype(flags_obj) params["enable_metrics_in_training"] = flags_obj.enable_metrics_in_training if params["dtype"] == tf.float16: # TODO(reedwm): It's pretty ugly to set the global policy in a constructor # like this. What if multiple instances of TransformerTask are created? # We should have a better way in the tf.keras.mixed_precision API of doing # this. loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16="dynamic") policy = tf.compat.v2.keras.mixed_precision.experimental.Policy( "mixed_float16", loss_scale=loss_scale) tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy) self.distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=num_gpus, tpu_address=flags_obj.tpu or "") if self.use_tpu: params["num_replicas"] = self.distribution_strategy.num_replicas_in_sync if not params["static_batch"]: raise ValueError("TPU requires static batch for input data.") else: print("Running transformer with num_gpus =", num_gpus) if self.distribution_strategy: print("For training, using distribution strategy: ", self.distribution_strategy) else: print("Not using any distribution strategy.")
def test_mirrored_strategy(self): ds = distribution_utils.get_distribution_strategy(5) self.assertEquals(ds.num_replicas_in_sync, 5) self.assertEquals(len(ds.extended.worker_devices), 5) for device in ds.extended.worker_devices: self.assertIn('GPU', device)
def test_one_device_strategy_gpu(self): ds = distribution_utils.get_distribution_strategy(1) self.assertEquals(ds.num_replicas_in_sync, 1) self.assertEquals(len(ds.extended.worker_devices), 1) self.assertIn('GPU', ds.extended.worker_devices[0])
def run_keras_model_benchmark(_): """Run the benchmark on keras model.""" # Ensure a valid model name was supplied via command line argument if FLAGS.model not in MODELS.keys(): raise AssertionError("The --model command line argument should " "be a key in the `MODELS` dictionary.") # Check if eager execution is enabled if FLAGS.eager: tf.logging.info("Eager execution is enabled...") tf.enable_eager_execution() # Load the model tf.logging.info("Benchmark on {} model...".format(FLAGS.model)) keras_model = MODELS[FLAGS.model] # Get dataset dataset_name = "ImageNet" if FLAGS.use_synthetic_data: tf.logging.info("Using synthetic dataset...") dataset_name += "_Synthetic" train_dataset = dataset.generate_synthetic_input_dataset( FLAGS.model, FLAGS.batch_size) val_dataset = dataset.generate_synthetic_input_dataset( FLAGS.model, FLAGS.batch_size) model = keras_model(weights=None) else: tf.logging.info("Using CIFAR-10 dataset...") dataset_name = "CIFAR-10" ds = dataset.Cifar10Dataset(FLAGS.batch_size) train_dataset = ds.train_dataset val_dataset = ds.test_dataset model = keras_model(weights=None, input_shape=ds.input_shape, classes=ds.num_classes) num_gpus = flags_core.get_num_gpus(FLAGS) distribution = None # Use distribution strategy if FLAGS.dist_strat: distribution = distribution_utils.get_distribution_strategy( distribution_strategy=FLAGS.distribution_strategy, num_gpus=num_gpus) elif num_gpus > 1: # Run with multi_gpu_model # If eager execution is enabled, only one GPU is utilized even if multiple # GPUs are provided. if FLAGS.eager: tf.logging.warning( "{} GPUs are provided, but only one GPU is utilized as " "eager execution is enabled.".format(num_gpus)) model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus) # Adam optimizer and some other optimizers doesn't work well with # distribution strategy (b/113076709) # Use GradientDescentOptimizer here optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"], distribute=distribution) # Create benchmark logger for benchmark logging run_params = { "batch_size": FLAGS.batch_size, "synthetic_data": FLAGS.use_synthetic_data, "train_epochs": FLAGS.train_epochs, "num_train_images": FLAGS.num_train_images, "num_eval_images": FLAGS.num_eval_images, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info(model_name=FLAGS.model, dataset_name=dataset_name, run_params=run_params, test_id=FLAGS.benchmark_test_id) # Create callbacks that log metric values about the training and evaluation callbacks = model_callbacks.get_model_callbacks( FLAGS.callbacks, batch_size=FLAGS.batch_size, metric_logger=benchmark_logger) # Train and evaluate the model history = model.fit(train_dataset, epochs=FLAGS.train_epochs, callbacks=callbacks, validation_data=val_dataset, steps_per_epoch=int( np.ceil(FLAGS.num_train_images / FLAGS.batch_size)), validation_steps=int( np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))) tf.logging.info("Logging the evaluation results...") for epoch in range(FLAGS.train_epochs): eval_results = { "accuracy": history.history["val_acc"][epoch], "loss": history.history["val_loss"][epoch], tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(FLAGS.num_eval_images / FLAGS.batch_size) } benchmark_logger.log_evaluation_result(eval_results) # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads ) common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError( 'dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.config.list_physical_devices('GPU') else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional) strategy_scope = distribution_utils.get_strategy_scope(strategy) if flags_obj.use_synthetic_data: synthetic_util.set_up_synthetic_data() input_fn = common.get_synth_input_fn( height=cifar_preprocessing.HEIGHT, width=cifar_preprocessing.WIDTH, num_channels=cifar_preprocessing.NUM_CHANNELS, num_classes=cifar_preprocessing.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj), drop_remainder=True) else: synthetic_util.undo_set_up_synthetic_data() input_fn = cifar_preprocessing.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, parse_record_fn=cifar_preprocessing.parse_record, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, # Setting drop_remainder to avoid the partial batch logic in normalization # layer, which triggers tf.where and leads to extra memory copy of input # sizes between host and GPU. drop_remainder=(not flags_obj.enable_get_next_as_optional)) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, parse_record_fn=cifar_preprocessing.parse_record) steps_per_epoch = (cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) lr_schedule = 0.1 if flags_obj.use_tensor_lr: initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128 lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay( boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE), values=[initial_learning_rate] + list(p[0] * initial_learning_rate for p in LR_SCHEDULE)) with strategy_scope: optimizer = common.get_optimizer(lr_schedule) model = resnet_cifar_model.resnet56( classes=cifar_preprocessing.NUM_CLASSES) model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=(['sparse_categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly) train_epochs = flags_obj.train_epochs callbacks = common.get_callbacks() if not flags_obj.use_tensor_lr: lr_callback = LearningRateBatchScheduler( schedule=learning_rate_schedule, batch_size=flags_obj.batch_size, steps_per_epoch=steps_per_epoch) callbacks.append(lr_callback) # if mutliple epochs, ignore the train_steps flag. if train_epochs <= 1 and flags_obj.train_steps: steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch) train_epochs = 1 num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None if not strategy and flags_obj.explicit_gpu_placement: # TODO(b/135607227): Add device scope automatically in Keras training loop # when not using distribition strategy. no_dist_strat_device = tf.device('/device:GPU:0') no_dist_strat_device.__enter__() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) if not strategy and flags_obj.explicit_gpu_placement: no_dist_strat_device.__exit__() stats = common.build_stats(history, eval_output, callbacks) return stats
def run(flags_obj): """Run ResNet ImageNet training and eval loop using custom training loops. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # TODO(anj-s): Set data_format without using Keras. data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, num_workers=distribution_utils.configure_cluster(), all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) train_ds, test_ds = get_input_dataset(flags_obj, strategy) per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations( flags_obj) steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps) logging.info( "Training %d epochs, each epoch has %d steps, " "total steps: %d; Eval %d steps", train_epochs, per_epoch_steps, train_epochs * per_epoch_steps, eval_steps) time_callback = keras_utils.TimeHistory(flags_obj.batch_size, flags_obj.log_steps) with distribution_utils.get_strategy_scope(strategy): model = resnet_model.resnet50( num_classes=imagenet_preprocessing.NUM_CLASSES, batch_size=flags_obj.batch_size, use_l2_regularizer=not flags_obj.single_l2_loss_op) lr_schedule = common.PiecewiseConstantDecayWithWarmup( batch_size=flags_obj.batch_size, epoch_size=imagenet_preprocessing.NUM_IMAGES['train'], warmup_epochs=common.LR_SCHEDULE[0][1], boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), multipliers=list(p[0] for p in common.LR_SCHEDULE), compute_lr_on_cpu=True) optimizer = common.get_optimizer(lr_schedule) if flags_obj.fp16_implementation == 'graph_rewrite': if not flags_obj.use_tf_function: raise ValueError( '--fp16_implementation=graph_rewrite requires ' '--use_tf_function to be true') loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128) optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer, loss_scale) train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32) training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( 'training_accuracy', dtype=tf.float32) test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32) test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( 'test_accuracy', dtype=tf.float32) trainable_variables = model.trainable_variables def step_fn(inputs): """Per-Replica StepFn.""" images, labels = inputs with tf.GradientTape() as tape: logits = model(images, training=True) prediction_loss = tf.keras.losses.sparse_categorical_crossentropy( labels, logits) loss = tf.reduce_sum(prediction_loss) * (1.0 / flags_obj.batch_size) num_replicas = tf.distribute.get_strategy( ).num_replicas_in_sync if flags_obj.single_l2_loss_op: filtered_variables = [ tf.reshape(v, (-1, )) for v in trainable_variables if 'bn' not in v.name ] l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.nn.l2_loss( tf.concat(filtered_variables, axis=0)) loss += (l2_loss / num_replicas) else: loss += (tf.reduce_sum(model.losses) / num_replicas) # Scale the loss if flags_obj.dtype == "fp16": loss = optimizer.get_scaled_loss(loss) grads = tape.gradient(loss, trainable_variables) # Unscale the grads if flags_obj.dtype == "fp16": grads = optimizer.get_unscaled_gradients(grads) optimizer.apply_gradients(zip(grads, trainable_variables)) train_loss.update_state(loss) training_accuracy.update_state(labels, logits) @tf.function def train_steps(iterator, steps): """Performs distributed training steps in a loop.""" for _ in tf.range(steps): strategy.experimental_run_v2(step_fn, args=(next(iterator), )) def train_single_step(iterator): if strategy: strategy.experimental_run_v2(step_fn, args=(next(iterator), )) else: return step_fn(next(iterator)) def test_step(iterator): """Evaluation StepFn.""" def step_fn(inputs): images, labels = inputs logits = model(images, training=False) loss = tf.keras.losses.sparse_categorical_crossentropy( labels, logits) loss = tf.reduce_sum(loss) * (1.0 / flags_obj.batch_size) test_loss.update_state(loss) test_accuracy.update_state(labels, logits) if strategy: strategy.experimental_run_v2(step_fn, args=(next(iterator), )) else: step_fn(next(iterator)) if flags_obj.use_tf_function: train_single_step = tf.function(train_single_step) test_step = tf.function(test_step) train_iter = iter(train_ds) time_callback.on_train_begin() for epoch in range(train_epochs): train_loss.reset_states() training_accuracy.reset_states() steps_in_current_epoch = 0 while steps_in_current_epoch < per_epoch_steps: time_callback.on_batch_begin(steps_in_current_epoch + epoch * per_epoch_steps) steps = _steps_to_run(steps_in_current_epoch, per_epoch_steps, steps_per_loop) if steps == 1: train_single_step(train_iter) else: # Converts steps to a Tensor to avoid tf.function retracing. train_steps(train_iter, tf.convert_to_tensor(steps, dtype=tf.int32)) time_callback.on_batch_end(steps_in_current_epoch + epoch * per_epoch_steps) steps_in_current_epoch += steps logging.info('Training loss: %s, accuracy: %s%% at epoch %d', train_loss.result().numpy(), training_accuracy.result().numpy(), epoch + 1) if (not flags_obj.skip_eval and (epoch + 1) % flags_obj.epochs_between_evals == 0): test_loss.reset_states() test_accuracy.reset_states() test_iter = iter(test_ds) for _ in range(eval_steps): test_step(test_iter) logging.info('Test loss: %s, accuracy: %s%% at epoch: %d', test_loss.result().numpy(), test_accuracy.result().numpy(), epoch + 1) time_callback.on_train_end() eval_result = None train_result = None if not flags_obj.skip_eval: eval_result = [ test_loss.result().numpy(), test_accuracy.result().numpy() ] train_result = [ train_loss.result().numpy(), training_accuracy.result().numpy() ] stats = build_stats(train_result, eval_result, time_callback) return stats
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config( enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: common.set_gpu_thread_mode_and_count(flags_obj) common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError('dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, num_workers=distribution_utils.configure_cluster(), all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional ) strategy_scope = distribution_utils.get_strategy_scope(strategy) if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = common.get_synth_input_fn( height=cifar_preprocessing.HEIGHT, width=cifar_preprocessing.WIDTH, num_channels=cifar_preprocessing.NUM_CHANNELS, num_classes=cifar_preprocessing.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj), drop_remainder=True) else: distribution_utils.undo_set_up_synthetic_data() input_fn = cifar_preprocessing.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=cifar_preprocessing.parse_record, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, # Setting drop_remainder to avoid the partial batch logic in normalization # layer, which triggers tf.where and leads to extra memory copy of input # sizes between host and GPU. drop_remainder=(not flags_obj.enable_get_next_as_optional)) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=cifar_preprocessing.parse_record) with strategy_scope: optimizer = common.get_optimizer() model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES) # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer # a valid arg for this model. Also remove as a valid flag. if flags_obj.force_v2_in_keras_compile is not None: model.compile( loss='categorical_crossentropy', optimizer=optimizer, metrics=(['categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly, experimental_run_tf_function=flags_obj.force_v2_in_keras_compile) else: model.compile( loss='categorical_crossentropy', optimizer=optimizer, metrics=(['categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly) callbacks = common.get_callbacks( learning_rate_schedule, cifar_preprocessing.NUM_IMAGES['train']) train_steps = cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None if not strategy and flags_obj.explicit_gpu_placement: # TODO(b/135607227): Add device scope automatically in Keras training loop # when not using distribition strategy. no_dist_strat_device = tf.device('/device:GPU:0') no_dist_strat_device.__enter__() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) if not strategy and flags_obj.explicit_gpu_placement: no_dist_strat_device.__exit__() stats = common.build_stats(history, eval_output, callbacks) return stats
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. """ if flags_obj.enable_eager: tf.enable_eager_execution() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError('dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # pylint: disable=protected-access if flags_obj.use_synthetic_data: input_fn = keras_common.get_synth_input_fn( height=imagenet_main.DEFAULT_IMAGE_SIZE, width=imagenet_main.DEFAULT_IMAGE_SIZE, num_channels=imagenet_main.NUM_CHANNELS, num_classes=imagenet_main.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj)) else: input_fn = imagenet_main.input_fn train_input_dataset = input_fn(is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) eval_input_dataset = input_fn(is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) strategy = distribution_utils.get_distribution_strategy( num_gpus=flags_obj.num_gpus, turn_off_distribution_strategy=flags_obj.turn_off_distribution_strategy) strategy_scope = keras_common.get_strategy_scope(strategy) with strategy_scope: optimizer = keras_common.get_optimizer() model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES) model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['sparse_categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, imagenet_main.NUM_IMAGES['train']) train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=[ time_callback, lr_callback, tensorboard_callback ], validation_steps=num_eval_steps, validation_data=validation_data, verbose=1) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=1) stats = keras_common.build_stats(history, eval_output, time_callback) return stats
def run_ncf(_): """Run NCF training and eval with Keras.""" keras_utils.set_session_config(enable_xla=FLAGS.enable_xla) if FLAGS.seed is not None: print("Setting tf seed") tf.random.set_seed(FLAGS.seed) model_helpers.apply_clean(FLAGS) if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras": policy = tf.keras.mixed_precision.experimental.Policy( "mixed_float16", loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic")) tf.keras.mixed_precision.experimental.set_policy(policy) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=FLAGS.distribution_strategy, num_gpus=FLAGS.num_gpus, tpu_address=FLAGS.tpu) params = ncf_common.parse_flags(FLAGS) params["distribute_strategy"] = strategy params["use_tpu"] = (FLAGS.distribution_strategy == "tpu") if params["use_tpu"] and not params["keras_use_ctl"]: logging.error("Custom training loop must be used when using TPUStrategy.") return batch_size = params["batch_size"] time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps) callbacks = [time_callback] producer, input_meta_data = None, None generate_input_online = params["train_dataset_path"] is None if generate_input_online: # Start data producing thread. num_users, num_items, _, _, producer = ncf_common.get_inputs(params) producer.start() per_epoch_callback = IncrementEpochCallback(producer) callbacks.append(per_epoch_callback) else: assert params["eval_dataset_path"] and params["input_meta_data_path"] with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader: input_meta_data = json.loads(reader.read().decode("utf-8")) num_users = input_meta_data["num_users"] num_items = input_meta_data["num_items"] params["num_users"], params["num_items"] = num_users, num_items if FLAGS.early_stopping: early_stopping_callback = CustomEarlyStopping( "val_HR_METRIC", desired_value=FLAGS.hr_threshold) callbacks.append(early_stopping_callback) (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = \ (ncf_input_pipeline.create_ncf_input_data( params, producer, input_meta_data, strategy)) steps_per_epoch = None if generate_input_online else num_train_steps with distribution_utils.get_strategy_scope(strategy): keras_model = _get_keras_model(params) optimizer = tf.keras.optimizers.Adam( learning_rate=params["learning_rate"], beta_1=params["beta1"], beta_2=params["beta2"], epsilon=params["epsilon"]) if FLAGS.fp16_implementation == "graph_rewrite": optimizer = \ tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite( optimizer, loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic")) elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]: # When keras_use_ctl is False, instead Model.fit() automatically applies # loss scaling so we don't need to create a LossScaleOptimizer. optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, tf.keras.mixed_precision.experimental.global_policy().loss_scale) if params["keras_use_ctl"]: train_loss, eval_results = run_ncf_custom_training( params, strategy, keras_model, optimizer, callbacks, train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps, generate_input_online=generate_input_online) else: keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly) if not FLAGS.ml_perf: # Create Tensorboard summary and checkpoint callbacks. summary_dir = os.path.join(FLAGS.model_dir, "summaries") summary_callback = tf.keras.callbacks.TensorBoard(summary_dir) checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint") checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( checkpoint_path, save_weights_only=True) callbacks += [summary_callback, checkpoint_callback] history = keras_model.fit( train_input_dataset, epochs=FLAGS.train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_data=eval_input_dataset, validation_steps=num_eval_steps, verbose=2) logging.info("Training done. Start evaluating") eval_loss_and_metrics = keras_model.evaluate( eval_input_dataset, steps=num_eval_steps, verbose=2) logging.info("Keras evaluation is done.") # Keras evaluate() API returns scalar loss and metric values from # evaluation as a list. Here, the returned list would contain # [evaluation loss, hr sum, hr count]. eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2] # Format evaluation result into [eval loss, eval hit accuracy]. eval_results = [eval_loss_and_metrics[0], eval_hit_rate] if history and history.history: train_history = history.history train_loss = train_history["loss"][-1] stats = build_stats(train_loss, eval_results, time_callback) return stats
def resnet_main( flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ model_helpers.apply_clean(flags.FLAGS) # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config) # initialize our model with all but the dense layer from pretrained resnet if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=flags_obj.epochs_between_evals, num_gpus=flags_core.get_num_gpus(flags_obj)) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
def __init__(self, flags_obj): """Init function of TransformerMain. Args: flags_obj: Object containing parsed flag values, i.e., FLAGS. Raises: ValueError: if not using static batch for input data on TPU. """ self.flags_obj = flags_obj self.predict_model = None # Add flag-defined parameters to params object num_gpus = flags_core.get_num_gpus(flags_obj) self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus) params["num_gpus"] = num_gpus params["use_ctl"] = flags_obj.use_ctl params["data_dir"] = flags_obj.data_dir params["model_dir"] = flags_obj.model_dir params["static_batch"] = flags_obj.static_batch params["max_length"] = flags_obj.max_length params["decode_batch_size"] = flags_obj.decode_batch_size params["decode_max_length"] = flags_obj.decode_max_length params["padded_decode"] = flags_obj.padded_decode params["max_io_parallelism"] = (flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE) params["use_synthetic_data"] = flags_obj.use_synthetic_data params["batch_size"] = flags_obj.batch_size or params[ "default_batch_size"] params["repeat_dataset"] = None params["dtype"] = flags_core.get_tf_dtype(flags_obj) params["enable_tensorboard"] = flags_obj.enable_tensorboard params[ "enable_metrics_in_training"] = flags_obj.enable_metrics_in_training params["steps_between_evals"] = flags_obj.steps_between_evals params["enable_checkpointing"] = flags_obj.enable_checkpointing self.distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu or "") if self.use_tpu: params[ "num_replicas"] = self.distribution_strategy.num_replicas_in_sync else: logging.info("Running transformer with num_gpus = %d", num_gpus) if self.distribution_strategy: logging.info("For training, using distribution strategy: %s", self.distribution_strategy) else: logging.info("Not using any distribution strategy.") performance.set_mixed_precision_policy( params["dtype"], flags_core.get_loss_scale(flags_obj, default_for_fp16="dynamic"))
def _get_distribution_strategy(self, use_ds=True): """Gets the distribution strategy.""" return distribution_utils.get_distribution_strategy( distribution_strategy='mirrored' if use_ds else 'off', num_gpus=self.num_gpus)
def construct_estimator(model_dir, params): """Construct either an Estimator or TPUEstimator for NCF. Args: model_dir: The model directory for the estimator params: The params dict for the estimator Returns: An Estimator or TPUEstimator. """ if params["use_tpu"]: # Some of the networking libraries are quite chatty. for name in [ "googleapiclient.discovery", "googleapiclient.discovery_cache", "oauth2client.transport" ]: logging.getLogger(name).setLevel(logging.ERROR) tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=params["tpu"], zone=params["tpu_zone"], project=params["tpu_gcp_project"], coordinator_name="coordinator") tf.logging.info( "Issuing reset command to TPU to ensure a clean state.") tf.Session.reset(tpu_cluster_resolver.get_master()) # Estimator looks at the master it connects to for MonitoredTrainingSession # by reading the `TF_CONFIG` environment variable, and the coordinator # is used by StreamingFilesDataset. tf_config_env = { "session_master": tpu_cluster_resolver.get_master(), "eval_session_master": tpu_cluster_resolver.get_master(), "coordinator": tpu_cluster_resolver.cluster_spec().as_dict()["coordinator"] } os.environ['TF_CONFIG'] = json.dumps(tf_config_env) distribution = tf.contrib.distribute.TPUStrategy(tpu_cluster_resolver, steps_per_run=100) else: distribution = distribution_utils.get_distribution_strategy( num_gpus=params["num_gpus"]) run_config = tf.estimator.RunConfig(train_distribute=distribution, eval_distribute=distribution) model_fn = neumf_model.neumf_model_fn if params["use_xla_for_gpu"]: tf.logging.info("Using XLA for GPU for training and evaluation.") model_fn = xla.estimator_model_fn(model_fn) estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir, config=run_config, params=params) return estimator
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ # TODO(tobyboyd): Remove eager flag when tf 1.0 testing ends. # Eager is default in tf 2.0 and should not be toggled if keras_common.is_v2_0(): keras_common.set_config_v2() else: config = keras_common.get_config_proto_v1() if flags_obj.enable_eager: tf.compat.v1.enable_eager_execution(config=config) else: sess = tf.Session(config=config) tf.keras.backend.set_session(sess) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_common.set_gpu_thread_mode_and_count(flags_obj) if flags_obj.data_prefetch_with_slack: keras_common.data_prefetch_with_slack() keras_common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'float16': policy = tf.keras.mixed_precision.experimental.Policy('infer_float32_vars') tf.keras.mixed_precision.experimental.set_policy(policy) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, num_workers=distribution_utils.configure_cluster(), all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) strategy_scope = distribution_utils.get_strategy_scope(strategy) # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = keras_common.get_synth_input_fn( height=imagenet_main.DEFAULT_IMAGE_SIZE, width=imagenet_main.DEFAULT_IMAGE_SIZE, num_channels=imagenet_main.NUM_CHANNELS, num_classes=imagenet_main.NUM_CLASSES, dtype=dtype, drop_remainder=True) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_main.input_fn # When `enable_xla` is True, we always drop the remainder of the batches # in the dataset, as XLA-GPU doesn't support dynamic shapes. drop_remainder = flags_obj.enable_xla train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, drop_remainder=drop_remainder) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras, dtype=dtype, drop_remainder=drop_remainder) lr_schedule = 0.1 if flags_obj.use_tensor_lr: lr_schedule = keras_common.PiecewiseConstantDecayWithWarmup( batch_size=flags_obj.batch_size, epoch_size=imagenet_main.NUM_IMAGES['train'], warmup_epochs=LR_SCHEDULE[0][1], boundaries=list(p[1] for p in LR_SCHEDULE[1:]), multipliers=list(p[0] for p in LR_SCHEDULE), compute_lr_on_cpu=True) with strategy_scope: optimizer = keras_common.get_optimizer(lr_schedule) if dtype == 'float16': # TODO(reedwm): Remove manually wrapping optimizer once mixed precision # can be enabled with a single line of code. optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, loss_scale=flags_core.get_loss_scale(flags_obj)) if flags_obj.enable_xla and not flags_obj.enable_eager: # TODO(b/129861005): Fix OOM issue in eager mode when setting # `batch_size` in keras.Input layer. if strategy and strategy.num_replicas_in_sync > 1: # TODO(b/129791381): Specify `input_layer_batch_size` value in # DistributionStrategy multi-replica case. input_layer_batch_size = None else: input_layer_batch_size = flags_obj.batch_size else: input_layer_batch_size = None if flags_obj.use_trivial_model: model = trivial_model.trivial_model(imagenet_main.NUM_CLASSES, dtype) else: model = resnet_model.resnet50( num_classes=imagenet_main.NUM_CLASSES, dtype=dtype, batch_size=input_layer_batch_size) model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=(['sparse_categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), cloning=flags_obj.clone_model_in_keras_dist_strat) callbacks = keras_common.get_callbacks( learning_rate_schedule, imagenet_main.NUM_IMAGES['train']) train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = keras_common.build_stats(history, eval_output, callbacks) return stats
def __init__( self, uri='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', model_dir=None, seq_len=128, dropout_rate=0.1, initializer_range=0.02, learning_rate=3e-5, distribution_strategy='mirrored', num_gpus=-1, tpu='', trainable=True, do_lower_case=True, is_tf2=True, convert_from_saved_model_tf2=False): """Initialze an instance with model paramaters. Args: uri: TF-Hub path/url to Bert module. model_dir: The location of the model checkpoint files. seq_len: Length of the sequence to feed into the model. dropout_rate: The rate for dropout. initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. learning_rate: The initial learning rate for Adam. distribution_strategy: A string specifying which distribution strategy to use. Accepted values are 'off', 'one_device', 'mirrored', 'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case insensitive. 'off' means not to use Distribution Strategy; 'tpu' means to use TPUStrategy using `tpu_address`. num_gpus: How many GPUs to use at each worker with the DistributionStrategies API. The default is -1, which means utilize all available GPUs. tpu: TPU address to connect to. trainable: boolean, whether pretrain layer is trainable. do_lower_case: boolean, whether to lower case the input text. Should be True for uncased models and False for cased models. is_tf2: boolean, whether the hub module is in TensorFlow 2.x format. convert_from_saved_model_tf2: Convert to TFLite from saved_model in TF 2.x. """ if compat.get_tf_behavior() not in self.compat_tf_versions: raise ValueError('Incompatible versions. Expect {}, but got {}.'.format( self.compat_tf_versions, compat.get_tf_behavior())) self.seq_len = seq_len self.dropout_rate = dropout_rate self.initializer_range = initializer_range self.learning_rate = learning_rate self.trainable = trainable self.model_dir = model_dir if self.model_dir is None: self.model_dir = tempfile.mkdtemp() num_gpus = get_num_gpus(num_gpus) self.strategy = distribution_utils.get_distribution_strategy( distribution_strategy=distribution_strategy, num_gpus=num_gpus, tpu_address=tpu) self.tpu = tpu self.uri = uri self.do_lower_case = do_lower_case self.is_tf2 = is_tf2 self.bert_config = bert_configs.BertConfig( 0, initializer_range=self.initializer_range, hidden_dropout_prob=self.dropout_rate) self.convert_from_saved_model_tf2 = convert_from_saved_model_tf2 self.is_built = False
def resnet_main(flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ model_helpers.apply_clean(flags.FLAGS) # Ensures flag override logic is only executed if explicitly triggered. if flags_obj.tf_gpu_thread_mode: override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) # Creates session config. allow_soft_placement = True, is required for # multi-GPU and is not harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) # Creates a `RunConfig` that checkpoints every 24 hours which essentially # results in checkpoints determined only by `epochs_between_evals`. run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config, save_checkpoints_secs=60 * 60 * 24) # Initializes model with all but the dense layer from pretrained ResNet. if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, dtype=flags_core.get_tf_dtype(flags_obj), datasets_num_private_threads=flags_obj. datasets_num_private_threads, num_parallel_batches=flags_obj.datasets_num_parallel_batches) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, dtype=flags_core.get_tf_dtype(flags_obj)) if flags_obj.eval_only or not flags_obj.train_epochs: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals) schedule = [ flags_obj.epochs_between_evals for _ in range(int(n_loops)) ] schedule[-1] = flags_obj.train_epochs - sum( schedule[:-1]) # over counting. for cycle_index, num_train_epochs in enumerate(schedule): tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) if num_train_epochs: classifier.train(input_fn=lambda: input_fn_train(num_train_epochs), hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. export_dtype = flags_core.get_tf_dtype(flags_obj) if flags_obj.image_bytes_as_serving_input: input_receiver_fn = functools.partial(image_bytes_serving_input_fn, shape, dtype=export_dtype) else: input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size, dtype=export_dtype) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True)
def run_mnist(flags_obj): """Run MNIST training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_helpers.apply_clean(flags_obj) model_function = model_fn session_config = tf.compat.v1.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_core.get_num_gpus(flags_obj), all_reduce_alg=flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig(train_distribute=distribution_strategy, session_config=session_config) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') mnist_classifier = tf.estimator.Estimator(model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'data_format': data_format, }) # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" # When choosing shuffle buffer sizes, larger sizes result in better # randomness, while smaller sizes use less memory. MNIST is a small # enough dataset that we can easily shuffle the full epoch. ds = dataset.train(flags_obj.data_dir) ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. ds = ds.repeat(flags_obj.epochs_between_evals) return ds def eval_input_fn(): return dataset.test(flags_obj.data_dir).batch( flags_obj.batch_size).make_one_shot_iterator().get_next() # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks(flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.compat.v1.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn, strip_default_attrs=True)
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager) dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError('dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus) strategy_scope = distribution_utils.get_strategy_scope(strategy) if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = keras_common.get_synth_input_fn( height=cifar_main.HEIGHT, width=cifar_main.WIDTH, num_channels=cifar_main.NUM_CHANNELS, num_classes=cifar_main.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj)) else: distribution_utils.undo_set_up_synthetic_data() input_fn = cifar_main.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) with strategy_scope: optimizer = keras_common.get_optimizer() model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES) model.compile(loss='categorical_crossentropy', optimizer=optimizer, run_eagerly=flags_obj.run_eagerly, metrics=['categorical_accuracy']) callbacks = keras_common.get_callbacks( learning_rate_schedule, cifar_main.NUM_IMAGES['train']) train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (cifar_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None if not strategy and flags_obj.explicit_gpu_placement: # TODO(b/135607227): Add device scope automatically in Keras training loop # when not using distribition strategy. no_dist_strat_device = tf.device('/device:GPU:0') no_dist_strat_device.__enter__() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) if not strategy and flags_obj.explicit_gpu_placement: no_dist_strat_device.__exit__() stats = keras_common.build_stats(history, eval_output, callbacks) return stats
def test_mirrored_strategy(self): ds = distribution_utils.get_distribution_strategy(num_gpus=5) self.assertEquals(ds.num_replicas_in_sync, 5) self.assertEquals(len(ds.extended.worker_devices), 5) for device in ds.extended.worker_devices: self.assertIn('GPU', device)
def run_keras_model_benchmark(_): """Run the benchmark on keras model.""" # Ensure a valid model name was supplied via command line argument if FLAGS.model not in MODELS.keys(): raise AssertionError("The --model command line argument should " "be a key in the `MODELS` dictionary.") # Check if eager execution is enabled if FLAGS.eager: tf.logging.info("Eager execution is enabled...") tf.enable_eager_execution() # Load the model tf.logging.info("Benchmark on {} model...".format(FLAGS.model)) keras_model = MODELS[FLAGS.model] model = keras_model(weights=None) # Get dataset dataset_name = "ImageNet" if FLAGS.use_synthetic_data: tf.logging.info("Using synthetic dataset...") dataset_name += "_Synthetic" train_dataset = dataset.generate_synthetic_input_dataset( FLAGS.model, FLAGS.batch_size) val_dataset = dataset.generate_synthetic_input_dataset( FLAGS.model, FLAGS.batch_size) else: raise ValueError("Only synthetic dataset is supported!") num_gpus = flags_core.get_num_gpus(FLAGS) distribution = None # Use distribution strategy if FLAGS.dist_strat: distribution = distribution_utils.get_distribution_strategy( num_gpus=num_gpus) elif num_gpus > 1: # Run with multi_gpu_model # If eager execution is enabled, only one GPU is utilized even if multiple # GPUs are provided. if FLAGS.eager: tf.logging.warning( "{} GPUs are provided, but only one GPU is utilized as " "eager execution is enabled.".format(num_gpus)) model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus) # Adam optimizer and some other optimizers doesn't work well with # distribution strategy (b/113076709) # Use GradientDescentOptimizer here optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"], distribute=distribution) # Create benchmark logger for benchmark logging run_params = { "batch_size": FLAGS.batch_size, "synthetic_data": FLAGS.use_synthetic_data, "train_epochs": FLAGS.train_epochs, "num_train_images": FLAGS.num_train_images, "num_eval_images": FLAGS.num_eval_images, } benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info( model_name=FLAGS.model, dataset_name=dataset_name, run_params=run_params, test_id=FLAGS.benchmark_test_id) # Create callbacks that log metric values about the training and evaluation callbacks = model_callbacks.get_model_callbacks( FLAGS.callbacks, batch_size=FLAGS.batch_size, metric_logger=benchmark_logger) # Train and evaluate the model history = model.fit( train_dataset, epochs=FLAGS.train_epochs, callbacks=callbacks, validation_data=val_dataset, steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)), validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size)) ) tf.logging.info("Logging the evaluation results...") for epoch in range(FLAGS.train_epochs): eval_results = { "accuracy": history.history["val_acc"][epoch], "loss": history.history["val_loss"][epoch], tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil( FLAGS.num_eval_images/FLAGS.batch_size) } benchmark_logger.log_evaluation_result(eval_results) # Clear the session explicitly to avoid session delete error tf.keras.backend.clear_session()
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ config = keras_common.get_config_proto() # TODO(tobyboyd): Remove eager flag when tf 1.0 testing ends. # Eager is default in tf 2.0 and should not be toggled if not keras_common.is_v2_0(): if flags_obj.enable_eager: tf.compat.v1.enable_eager_execution(config=config) else: sess = tf.Session(config=config) tf.keras.backend.set_session(sess) # TODO(haoyuzhang): Set config properly in TF2.0 when the config API is ready. dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'float16': policy = tf.keras.mixed_precision.experimental.Policy('infer_float32_vars') tf.keras.mixed_precision.experimental.set_policy(policy) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = keras_common.get_synth_input_fn( height=imagenet_main.DEFAULT_IMAGE_SIZE, width=imagenet_main.DEFAULT_IMAGE_SIZE, num_channels=imagenet_main.NUM_CHANNELS, num_classes=imagenet_main.NUM_CLASSES, dtype=dtype) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_main.input_fn train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype) eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras, dtype=dtype) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus) strategy_scope = keras_common.get_strategy_scope(strategy) with strategy_scope: optimizer = keras_common.get_optimizer() if dtype == 'float16': # TODO(reedwm): Remove manually wrapping optimizer once mixed precision # can be enabled with a single line of code. optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( optimizer, loss_scale=flags_core.get_loss_scale(flags_obj)) model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES, dtype=dtype) model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['sparse_categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, imagenet_main.NUM_IMAGES['train']) train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=[ time_callback, lr_callback, tensorboard_callback ], validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = keras_common.build_stats(history, eval_output, time_callback) return stats
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. NotImplementedError: If some features are not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config( enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads) common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) performance.set_mixed_precision_policy( flags_core.get_tf_dtype(flags_obj), flags_core.get_loss_scale(flags_obj, default_for_fp16=128)) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.config.list_physical_devices('GPU') else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # Configures cluster spec for distribution strategy. _ = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional ) strategy_scope = distribution_utils.get_strategy_scope(strategy) # pylint: disable=protected-access if flags_obj.use_synthetic_data: input_fn = common.get_synth_input_fn( height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, num_channels=imagenet_preprocessing.NUM_CHANNELS, num_classes=imagenet_preprocessing.NUM_CLASSES, dtype=dtype, drop_remainder=True) else: input_fn = imagenet_preprocessing.input_fn # When `enable_xla` is True, we always drop the remainder of the batches # in the dataset, as XLA-GPU doesn't support dynamic shapes. drop_remainder = flags_obj.enable_xla # Current resnet_model.resnet50 input format is always channel-last. # We use keras_application mobilenet model which input format is depends on # the keras beckend image data format. # This use_keras_image_data_format flags indicates whether image preprocessor # output format should be same as the keras backend image data format or just # channel-last format. use_keras_image_data_format = (flags_obj.model == 'mobilenet') train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, parse_record_fn=imagenet_preprocessing.get_parse_record_fn( use_keras_image_data_format=use_keras_image_data_format), datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, drop_remainder=drop_remainder, tf_data_experimental_slack=flags_obj.tf_data_experimental_slack, training_dataset_cache=flags_obj.training_dataset_cache, ) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, parse_record_fn=imagenet_preprocessing.get_parse_record_fn( use_keras_image_data_format=use_keras_image_data_format), dtype=dtype, drop_remainder=drop_remainder) lr_schedule = common.PiecewiseConstantDecayWithWarmup( batch_size=flags_obj.batch_size, epoch_size=imagenet_preprocessing.NUM_IMAGES['train'], warmup_epochs=common.LR_SCHEDULE[0][1], boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), multipliers=list(p[0] for p in common.LR_SCHEDULE), compute_lr_on_cpu=True) steps_per_epoch = ( imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) with strategy_scope: if flags_obj.optimizer == 'resnet50_default': optimizer = common.get_optimizer(lr_schedule) elif flags_obj.optimizer == 'mobilenet_default': initial_learning_rate = \ flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size optimizer = tf.keras.optimizers.SGD( learning_rate=tf.keras.optimizers.schedules.ExponentialDecay( initial_learning_rate, decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay, decay_rate=flags_obj.lr_decay_factor, staircase=True), momentum=0.9) if flags_obj.fp16_implementation == 'graph_rewrite': # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32' # which will ensure tf.compat.v2.keras.mixed_precision and # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double # up. optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer) # TODO(hongkuny): Remove trivial model usage and move it to benchmark. if flags_obj.use_trivial_model: model = test_utils.trivial_model(imagenet_preprocessing.NUM_CLASSES) elif flags_obj.model == 'resnet50_v1.5': model = resnet_model.resnet50( num_classes=imagenet_preprocessing.NUM_CLASSES) elif flags_obj.model == 'mobilenet': # TODO(kimjaehong): Remove layers attribute when minimum TF version # support 2.0 layers by default. model = tf.keras.applications.mobilenet.MobileNet( weights=None, classes=imagenet_preprocessing.NUM_CLASSES, layers=tf.keras.layers) if flags_obj.pretrained_filepath: model.load_weights(flags_obj.pretrained_filepath) if flags_obj.pruning_method == 'polynomial_decay': if dtype != tf.float32: raise NotImplementedError( 'Pruning is currently only supported on dtype=tf.float32.') pruning_params = { 'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay( initial_sparsity=flags_obj.pruning_initial_sparsity, final_sparsity=flags_obj.pruning_final_sparsity, begin_step=flags_obj.pruning_begin_step, end_step=flags_obj.pruning_end_step, frequency=flags_obj.pruning_frequency), } model = tfmot.sparsity.keras.prune_low_magnitude(model, **pruning_params) elif flags_obj.pruning_method: raise NotImplementedError( 'Only polynomial_decay is currently supported.') model.compile( loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=(['sparse_categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly) train_epochs = flags_obj.train_epochs callbacks = common.get_callbacks( steps_per_epoch=steps_per_epoch, pruning_method=flags_obj.pruning_method, enable_checkpoint_and_export=flags_obj.enable_checkpoint_and_export, model_dir=flags_obj.model_dir) # if mutliple epochs, ignore the train_steps flag. if train_epochs <= 1 and flags_obj.train_steps: steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch) train_epochs = 1 num_eval_steps = ( imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None if not strategy and flags_obj.explicit_gpu_placement: # TODO(b/135607227): Add device scope automatically in Keras training loop # when not using distribition strategy. no_dist_strat_device = tf.device('/device:GPU:0') no_dist_strat_device.__enter__() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) if flags_obj.pruning_method: model = tfmot.sparsity.keras.strip_pruning(model) if flags_obj.enable_checkpoint_and_export: if dtype == tf.bfloat16: logging.warning('Keras model.save does not support bfloat16 dtype.') else: # Keras model.save assumes a float32 input designature. export_path = os.path.join(flags_obj.model_dir, 'saved_model') model.save(export_path, include_optimizer=False) if not strategy and flags_obj.explicit_gpu_placement: no_dist_strat_device.__exit__() stats = common.build_stats(history, eval_output, callbacks) return stats
def run_deep_speech(_): """Run deep speech training and eval loop.""" tf.set_random_seed(flags_obj.seed) # Data preprocessing tf.logging.info("Data preprocessing...") train_speech_dataset = generate_dataset(flags_obj.train_data_dir) eval_speech_dataset = generate_dataset(flags_obj.eval_data_dir) # Number of label classes. Label string is "[a-z]' -" num_classes = len(train_speech_dataset.speech_labels) # Use distribution strategy for multi-gpu training num_gpus = flags_core.get_num_gpus(flags_obj) distribution_strategy = distribution_utils.get_distribution_strategy(num_gpus) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy) estimator = tf.estimator.Estimator( model_fn=model_fn, model_dir=flags_obj.model_dir, config=run_config, params={ "num_classes": num_classes, } ) # Benchmark logging run_params = { "batch_size": flags_obj.batch_size, "train_epochs": flags_obj.train_epochs, "rnn_hidden_size": flags_obj.rnn_hidden_size, "rnn_hidden_layers": flags_obj.rnn_hidden_layers, "rnn_type": flags_obj.rnn_type, "is_bidirectional": flags_obj.is_bidirectional, "use_bias": flags_obj.use_bias } dataset_name = "LibriSpeech" benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info("deep_speech", dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) per_device_batch_size = distribution_utils.per_device_batch_size( flags_obj.batch_size, num_gpus) def input_fn_train(): return dataset.input_fn( per_device_batch_size, train_speech_dataset) def input_fn_eval(): return dataset.input_fn( per_device_batch_size, eval_speech_dataset) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info("Starting a training cycle: %d/%d", cycle_index + 1, total_training_cycle) # Perform batch_wise dataset shuffling train_speech_dataset.entries = dataset.batch_wise_dataset_shuffle( train_speech_dataset.entries, cycle_index, flags_obj.sortagrad, flags_obj.batch_size) estimator.train(input_fn=input_fn_train, hooks=train_hooks) # Evaluation tf.logging.info("Starting to evaluate...") eval_results = evaluate_model( estimator, eval_speech_dataset.speech_labels, eval_speech_dataset.entries, input_fn_eval) # Log the WER and CER results. benchmark_logger.log_evaluation_result(eval_results) tf.logging.info( "Iteration {}: WER = {:.2f}, CER = {:.2f}".format( cycle_index + 1, eval_results[_WER_KEY], eval_results[_CER_KEY])) # If some evaluation threshold is met if model_helpers.past_stop_threshold( flags_obj.wer_threshold, eval_results[_WER_KEY]): break
def test_one_device_strategy_gpu(self): ds = distribution_utils.get_distribution_strategy(num_gpus=1) self.assertEquals(ds.num_replicas_in_sync, 1) self.assertEquals(len(ds.extended.worker_devices), 1) self.assertIn('GPU', ds.extended.worker_devices[0])
def run_ncf(_): """Run NCF training and eval with Keras.""" keras_utils.set_session_config(enable_xla=FLAGS.enable_xla) if FLAGS.seed is not None: print("Setting tf seed") tf.random.set_seed(FLAGS.seed) params = ncf_common.parse_flags(FLAGS) model_helpers.apply_clean(flags.FLAGS) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=FLAGS.distribution_strategy, num_gpus=FLAGS.num_gpus, tpu_address=FLAGS.tpu) params["distribute_strategy"] = strategy if not keras_utils.is_v2_0() and strategy is not None: logging.error( "NCF Keras only works with distribution strategy in TF 2.0") return if (params["keras_use_ctl"] and (not keras_utils.is_v2_0() or strategy is None)): logging.error( "Custom training loop only works with tensorflow 2.0 and dist strat." ) return if params["use_tpu"] and not params["keras_use_ctl"]: logging.error( "Custom training loop must be used when using TPUStrategy.") return batch_size = params["batch_size"] time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps) callbacks = [time_callback] producer, input_meta_data = None, None generate_input_online = params["train_dataset_path"] is None if generate_input_online: # Start data producing thread. num_users, num_items, _, _, producer = ncf_common.get_inputs(params) producer.start() per_epoch_callback = IncrementEpochCallback(producer) callbacks.append(per_epoch_callback) else: assert params["eval_dataset_path"] and params["input_meta_data_path"] with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader: input_meta_data = json.loads(reader.read().decode("utf-8")) num_users = input_meta_data["num_users"] num_items = input_meta_data["num_items"] params["num_users"], params["num_items"] = num_users, num_items if FLAGS.early_stopping: early_stopping_callback = CustomEarlyStopping( "val_HR_METRIC", desired_value=FLAGS.hr_threshold) callbacks.append(early_stopping_callback) use_remote_tpu = params["use_tpu"] and FLAGS.tpu primary_cpu_task = tpu_lib.get_primary_cpu_task(use_remote_tpu) with tf.device(primary_cpu_task): (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = \ (ncf_input_pipeline.create_ncf_input_data( params, producer, input_meta_data, strategy)) steps_per_epoch = None if generate_input_online else num_train_steps with distribution_utils.get_strategy_scope(strategy): keras_model = _get_keras_model(params) optimizer = tf.keras.optimizers.Adam( learning_rate=params["learning_rate"], beta_1=params["beta1"], beta_2=params["beta2"], epsilon=params["epsilon"]) if FLAGS.dtype == "fp16": optimizer = \ tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite( optimizer, loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic")) if params["keras_use_ctl"]: train_loss, eval_results = run_ncf_custom_training( params, strategy, keras_model, optimizer, callbacks, train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps, generate_input_online=generate_input_online) else: # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer # a valid arg for this model. Also remove as a valid flag. if FLAGS.force_v2_in_keras_compile is not None: keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly, experimental_run_tf_function=FLAGS. force_v2_in_keras_compile) else: keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly) history = keras_model.fit(train_input_dataset, epochs=FLAGS.train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_data=eval_input_dataset, validation_steps=num_eval_steps, verbose=2) logging.info("Training done. Start evaluating") eval_loss_and_metrics = keras_model.evaluate( eval_input_dataset, steps=num_eval_steps, verbose=2) logging.info("Keras evaluation is done.") # Keras evaluate() API returns scalar loss and metric values from # evaluation as a list. Here, the returned list would contain # [evaluation loss, hr sum, hr count]. eval_hit_rate = eval_loss_and_metrics[ 1] / eval_loss_and_metrics[2] # Format evaluation result into [eval loss, eval hit accuracy]. eval_results = [eval_loss_and_metrics[0], eval_hit_rate] if history and history.history: train_history = history.history train_loss = train_history["loss"][-1] stats = build_stats(train_loss, eval_results, time_callback) return stats
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ if flags_obj.enable_eager: tf.enable_eager_execution() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError( 'dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') per_device_batch_size = distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) if flags_obj.use_synthetic_data: input_fn = keras_common.get_synth_input_fn( height=cifar_main.HEIGHT, width=cifar_main.WIDTH, num_channels=cifar_main.NUM_CHANNELS, num_classes=cifar_main.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj)) else: input_fn = cifar_main.input_fn train_input_dataset = input_fn(is_training=True, data_dir=flags_obj.data_dir, batch_size=per_device_batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) eval_input_dataset = input_fn(is_training=False, data_dir=flags_obj.data_dir, batch_size=per_device_batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) strategy = distribution_utils.get_distribution_strategy( flags_obj.num_gpus, flags_obj.turn_off_distribution_strategy) strategy_scope = keras_common.get_strategy_scope(strategy) with strategy_scope: optimizer = keras_common.get_optimizer() model = resnet_cifar_model.resnet56(classes=cifar_main.NUM_CLASSES) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, cifar_main.NUM_IMAGES['train']) train_steps = cifar_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (cifar_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit( train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=[time_callback, lr_callback, tensorboard_callback], validation_steps=num_eval_steps, validation_data=validation_data, verbose=1) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=1) stats = keras_common.build_stats(history, eval_output) return stats
def construct_estimator(num_gpus, model_dir, params, batch_size, eval_batch_size): """Construct either an Estimator or TPUEstimator for NCF. Args: num_gpus: The number of gpus (Used to select distribution strategy) model_dir: The model directory for the estimator params: The params dict for the estimator batch_size: The mini-batch size for training. eval_batch_size: The batch size used during evaluation. Returns: An Estimator or TPUEstimator. """ if params["use_tpu"]: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu=params["tpu"], zone=params["tpu_zone"], project=params["tpu_gcp_project"], ) tf.logging.info("Issuing reset command to TPU to ensure a clean state.") tf.Session.reset(tpu_cluster_resolver.get_master()) tpu_config = tf.contrib.tpu.TPUConfig( iterations_per_loop=100, num_shards=8) run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=model_dir, session_config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False), tpu_config=tpu_config) tpu_params = {k: v for k, v in params.items() if k != "batch_size"} train_estimator = tf.contrib.tpu.TPUEstimator( model_fn=neumf_model.neumf_model_fn, use_tpu=True, train_batch_size=batch_size, params=tpu_params, config=run_config) eval_estimator = tf.contrib.tpu.TPUEstimator( model_fn=neumf_model.neumf_model_fn, use_tpu=False, train_batch_size=1, predict_batch_size=eval_batch_size, params=tpu_params, config=run_config) return train_estimator, eval_estimator distribution = distribution_utils.get_distribution_strategy(num_gpus=num_gpus) run_config = tf.estimator.RunConfig(train_distribute=distribution) params["eval_batch_size"] = eval_batch_size estimator = tf.estimator.Estimator(model_fn=neumf_model.neumf_model_fn, model_dir=model_dir, config=run_config, params=params) return estimator, estimator
def run_mnist(flags_obj): """Run MNIST training and eval loop. Args: flags_obj: An object containing parsed flag values. """ model_helpers.apply_clean(flags_obj) model_function = model_fn session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') mnist_classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'data_format': data_format, }) # Set up training and evaluation input functions. def train_input_fn(): """Prepare data for training.""" # When choosing shuffle buffer sizes, larger sizes result in better # randomness, while smaller sizes use less memory. MNIST is a small # enough dataset that we can easily shuffle the full epoch. ds = dataset.train(flags_obj.data_dir) ds = ds.cache().shuffle(buffer_size=50000).batch(flags_obj.batch_size) # Iterate through the dataset a set number (`epochs_between_evals`) of times # during each training session. ds = ds.repeat(flags_obj.epochs_between_evals) return ds def eval_input_fn(): return dataset.test(flags_obj.data_dir).batch( flags_obj.batch_size).make_one_shot_iterator().get_next() # Set up hook that outputs training logs every 100 steps. train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) # Train and evaluate model. for _ in range(flags_obj.train_epochs // flags_obj.epochs_between_evals): mnist_classifier.train(input_fn=train_input_fn, hooks=train_hooks) eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) print('\nEvaluation results:\n\t%s\n' % eval_results) if model_helpers.past_stop_threshold(flags_obj.stop_threshold, eval_results['accuracy']): break # Export the model if flags_obj.export_dir is not None: image = tf.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn, strip_default_attrs=True)
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. """ if flags_obj.enable_eager: tf.compat.v1.enable_eager_execution() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError( 'dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = keras_common.get_synth_input_fn( height=imagenet_main.DEFAULT_IMAGE_SIZE, width=imagenet_main.DEFAULT_IMAGE_SIZE, num_channels=imagenet_main.NUM_CHANNELS, num_classes=imagenet_main.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj)) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_main.input_fn train_input_dataset = input_fn(is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) eval_input_dataset = input_fn(is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=parse_record_keras) strategy = distribution_utils.get_distribution_strategy( num_gpus=flags_obj.num_gpus, turn_off_distribution_strategy=flags_obj.turn_off_distribution_strategy ) strategy_scope = keras_common.get_strategy_scope(strategy) with strategy_scope: optimizer = keras_common.get_optimizer() model = resnet_model.resnet50(num_classes=imagenet_main.NUM_CLASSES) model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['sparse_categorical_accuracy']) time_callback, tensorboard_callback, lr_callback = keras_common.get_callbacks( learning_rate_schedule, imagenet_main.NUM_IMAGES['train']) train_steps = imagenet_main.NUM_IMAGES['train'] // flags_obj.batch_size train_epochs = flags_obj.train_epochs if flags_obj.train_steps: train_steps = min(flags_obj.train_steps, train_steps) train_epochs = 1 num_eval_steps = (imagenet_main.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit( train_input_dataset, epochs=train_epochs, steps_per_epoch=train_steps, callbacks=[time_callback, lr_callback, tensorboard_callback], validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = keras_common.build_stats(history, eval_output, time_callback) return stats
def resnet_main( flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. Returns: Dict of results of the run. """ model_helpers.apply_clean(flags.FLAGS) # Ensures flag override logic is only executed if explicitly triggered. if flags_obj.tf_gpu_thread_mode: override_flags_and_set_envars_for_gpu_thread_pool(flags_obj) # Creates session config. allow_soft_placement = True, is required for # multi-GPU and is not harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) # Creates a `RunConfig` that checkpoints every 24 hours which essentially # results in checkpoints determined only by `epochs_between_evals`. run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config, save_checkpoints_secs=60*60*24) # Initializes model with all but the dense layer from pretrained ResNet. if flags_obj.pretrained_model_checkpoint_path is not None: warm_start_settings = tf.estimator.WarmStartSettings( flags_obj.pretrained_model_checkpoint_path, vars_to_warm_start='^(?!.*dense)') else: warm_start_settings = None classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, warm_start_from=warm_start_settings, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj), 'fine_tune': flags_obj.fine_tune }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(num_epochs): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=num_epochs, dtype=flags_core.get_tf_dtype(flags_obj), datasets_num_private_threads=flags_obj.datasets_num_private_threads, num_parallel_batches=flags_obj.datasets_num_parallel_batches) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1, dtype=flags_core.get_tf_dtype(flags_obj)) if flags_obj.eval_only or not flags_obj.train_epochs: # If --eval_only is set, perform a single loop with zero train epochs. schedule, n_loops = [0], 1 else: # Compute the number of times to loop while training. All but the last # pass will train for `epochs_between_evals` epochs, while the last will # train for the number needed to reach `training_epochs`. For instance if # train_epochs = 25 and epochs_between_evals = 10 # schedule will be set to [10, 10, 5]. That is to say, the loop will: # Train for 10 epochs and then evaluate. # Train for another 10 epochs and then evaluate. # Train for a final 5 epochs (to reach 25 epochs) and then evaluate. n_loops = math.ceil(flags_obj.train_epochs / flags_obj.epochs_between_evals) schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))] schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1]) # over counting. for cycle_index, num_train_epochs in enumerate(schedule): tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops)) if num_train_epochs: classifier.train(input_fn=lambda: input_fn_train(num_train_epochs), hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. export_dtype = flags_core.get_tf_dtype(flags_obj) if flags_obj.image_bytes_as_serving_input: input_receiver_fn = functools.partial( image_bytes_serving_input_fn, shape, dtype=export_dtype) else: input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size, dtype=export_dtype) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn, strip_default_attrs=True) return eval_results
def resnet_main( flags_obj, model_function, input_function, dataset_name, shape=None): """Shared main loop for ResNet Models. Args: flags_obj: An object containing parsed flags. See define_resnet_flags() for details. model_function: the function that instantiates the Model and builds the ops for train/eval. This will be passed directly into the estimator. input_function: the function that processes the dataset and returns a dataset that the estimator can train on. This will be wrapped with all the relevant flags for running and passed to estimator. dataset_name: the name of the dataset for training and evaluation. This is used for logging purpose. shape: list of ints representing the shape of the images used for training. This is only used if flags_obj.export_dir is passed. """ model_helpers.apply_clean(flags.FLAGS) # Using the Winograd non-fused algorithms provides a small performance boost. os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' # Create session config based on values of inter_op_parallelism_threads and # intra_op_parallelism_threads. Note that we default to having # allow_soft_placement = True, which is required for multi-GPU and not # harmful for other modes. session_config = tf.ConfigProto( inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads, intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads, allow_soft_placement=True) distribution_strategy = distribution_utils.get_distribution_strategy( flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg) run_config = tf.estimator.RunConfig( train_distribute=distribution_strategy, session_config=session_config) classifier = tf.estimator.Estimator( model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config, params={ 'resnet_size': int(flags_obj.resnet_size), 'data_format': flags_obj.data_format, 'batch_size': flags_obj.batch_size, 'resnet_version': int(flags_obj.resnet_version), 'loss_scale': flags_core.get_loss_scale(flags_obj), 'dtype': flags_core.get_tf_dtype(flags_obj) }) run_params = { 'batch_size': flags_obj.batch_size, 'dtype': flags_core.get_tf_dtype(flags_obj), 'resnet_size': flags_obj.resnet_size, 'resnet_version': flags_obj.resnet_version, 'synthetic_data': flags_obj.use_synthetic_data, 'train_epochs': flags_obj.train_epochs, } if flags_obj.use_synthetic_data: dataset_name = dataset_name + '-synthetic' benchmark_logger = logger.get_benchmark_logger() benchmark_logger.log_run_info('resnet', dataset_name, run_params, test_id=flags_obj.benchmark_test_id) train_hooks = hooks_helper.get_train_hooks( flags_obj.hooks, model_dir=flags_obj.model_dir, batch_size=flags_obj.batch_size) def input_fn_train(): return input_function( is_training=True, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=flags_obj.epochs_between_evals, num_gpus=flags_core.get_num_gpus(flags_obj)) def input_fn_eval(): return input_function( is_training=False, data_dir=flags_obj.data_dir, batch_size=distribution_utils.per_device_batch_size( flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)), num_epochs=1) total_training_cycle = (flags_obj.train_epochs // flags_obj.epochs_between_evals) for cycle_index in range(total_training_cycle): tf.logging.info('Starting a training cycle: %d/%d', cycle_index, total_training_cycle) classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=flags_obj.max_train_steps) tf.logging.info('Starting to evaluate.') # flags_obj.max_train_steps is generally associated with testing and # profiling. As a result it is frequently called with synthetic data, which # will iterate forever. Passing steps=flags_obj.max_train_steps allows the # eval (which is generally unimportant in those circumstances) to terminate. # Note that eval will run for max_train_steps each loop, regardless of the # global_step count. eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags_obj.max_train_steps) benchmark_logger.log_evaluation_result(eval_results) if model_helpers.past_stop_threshold( flags_obj.stop_threshold, eval_results['accuracy']): break if flags_obj.export_dir is not None: # Exports a saved model for the given classifier. input_receiver_fn = export.build_tensor_serving_input_receiver_fn( shape, batch_size=flags_obj.batch_size) classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)