def _get_run_config(mode, model_dir, use_xla, use_dali, use_cpu, gpu_memory_fraction, gpu_id=0, seed=None):
    """Assemble the tf.estimator.RunConfig for the requested execution mode.

    Summaries are only written for train/validation, checkpointing is enabled
    (rank 0 only) when mode == 'train', and step-count logging is disabled.
    """
    allowed_modes = ('train', 'validation', 'benchmark', 'inference')
    if mode not in allowed_modes:
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)

    # Derive a distinct, reproducible graph seed per Horovod rank.
    rng_seed = None if seed is None else 2 * (seed + hvd.rank())

    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        tf_random_seed=rng_seed,
        save_summary_steps=100 if mode in ['train', 'validation'] else 1e9,  # disabled in benchmark mode
        save_checkpoints_steps=None,
        save_checkpoints_secs=None,
        session_config=Runner._get_session_config(
            mode=mode,
            use_xla=use_xla,
            use_dali=use_dali,
            use_cpu=use_cpu,
            gpu_memory_fraction=gpu_memory_fraction,
            gpu_id=gpu_id),
        keep_checkpoint_max=5,
        keep_checkpoint_every_n_hours=1e6,  # disabled
        log_step_count_steps=1e9,
        train_distribute=None,
        device_fn=None,
        protocol=None,
        eval_distribute=None,
        experimental_distribute=None)

    if mode == 'train':
        # Only the rank-0 worker writes checkpoints during distributed training.
        run_config = run_config.replace(
            save_checkpoints_steps=1000 if hvd.rank() == 0 else None,
            keep_checkpoint_every_n_hours=3)

    return run_config
def training_data_fn():
    """Select the training input_fn: DALI, TFRecords, or synthetic data.

    Reads `filenames` / `idx_filenames` / `batch_size` from the enclosing
    scope and hyper-parameters from `self.run_hparams`.
    """
    hparams = self.run_hparams
    shared_kwargs = dict(
        batch_size=batch_size,
        height=hparams.height,
        width=hparams.width,
    )

    if hparams.use_dali and hparams.data_idx_dir is not None:
        if hvd.rank() == 0:
            print("Using DALI input... ")
        return data_utils.get_dali_input_fn(
            filenames=filenames,
            idx_filenames=idx_filenames,
            training=True,
            distort_color=hparams.distort_colors,
            num_threads=hparams.num_preprocessing_threads,
            deterministic=hparams.seed is not None,
            **shared_kwargs)

    if hparams.data_dir is not None:
        return data_utils.get_tfrecords_input_fn(
            filenames=filenames,
            training=True,
            distort_color=hparams.distort_colors,
            num_threads=hparams.num_preprocessing_threads,
            deterministic=hparams.seed is not None,
            **shared_kwargs)

    if hvd.rank() == 0:
        print("Using Synthetic Data ...")
    return data_utils.get_synth_input_fn(
        num_channels=hparams.n_channels,
        data_format=hparams.input_format,
        num_classes=hparams.n_classes,
        dtype=hparams.dtype,
        **shared_kwargs)
def __init__(self, filenames, idx_filenames, height, width, batch_size, num_threads, dtype=tf.uint8, dali_cpu=True, deterministic=False, training=False):
    """Build a DALI HybridPipe and expose its output as TF tensors.

    After construction, `self.images` and `self.labels` hold the tensors
    produced by the DALI/TF iterator for one batch.
    """
    local_gpu = hvd.local_rank()
    worker_rank = hvd.rank()
    world_size = hvd.size()

    pipeline = HybridPipe(
        tfrec_filenames=filenames,
        tfrec_idx_filenames=idx_filenames,
        height=height,
        width=width,
        batch_size=batch_size,
        num_threads=num_threads,
        device_id=local_gpu,
        shard_id=worker_rank,
        num_gpus=world_size,
        deterministic=deterministic,
        dali_cpu=dali_cpu,
        training=training)

    dali_iterator = dali_tf.DALIIterator()
    with tf.device("/gpu:0"):
        self.images, self.labels = dali_iterator(
            pipeline=pipeline,
            shapes=[(batch_size, height, width, 3), (batch_size, 1)],
            dtypes=[tf.float32, tf.int64],
            device_id=local_gpu)
def __init__(self, tfrec_filenames, tfrec_idx_filenames, height, width, batch_size, num_threads, device_id, shard_id, num_gpus, deterministic=False, dali_cpu=True, training=True):
    """Configure the DALI ops for this pipeline: sharded TFRecord reading,
    image decode, resize, and crop/mirror/normalize.

    Args:
        tfrec_filenames: TFRecord data files to read.
        tfrec_idx_filenames: DALI index files matching the TFRecords.
        height, width: output crop size passed to CropMirrorNormalize.
        batch_size, num_threads, device_id: forwarded to the Pipeline base class.
        shard_id, num_gpus: dataset sharding across distributed workers.
        deterministic: when True, fix the pipeline seed (per Horovod rank).
        dali_cpu: run decode/resize on CPU instead of GPU ("mixed"/"gpu").
        training: enables random-crop decode and exact (width, height) resize.
    """
    kwargs = dict()
    if deterministic:
        # Distinct, reproducible seed per Horovod rank.
        kwargs['seed'] = 7 * (1 + hvd.rank())
    super(HybridPipe, self).__init__(batch_size, num_threads, device_id, **kwargs)
    self.training = training
    # Sharded, shuffled TFRecord reader. Bbox fields are declared in the
    # schema but only the encoded image and class label are used downstream
    # in this pipeline (define_graph is outside this block).
    self.input = dali.ops.TFRecordReader(
        path=tfrec_filenames,
        index_path=tfrec_idx_filenames,
        random_shuffle=True,
        shard_id=shard_id,
        num_shards=num_gpus,
        initial_fill=10000,
        features={
            'image/encoded': dali.tfrecord.FixedLenFeature((), dali.tfrecord.string, ""),
            'image/class/label': dali.tfrecord.FixedLenFeature([1], dali.tfrecord.int64, -1),
            'image/class/text': dali.tfrecord.FixedLenFeature([], dali.tfrecord.string, ''),
            'image/object/bbox/xmin': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
            'image/object/bbox/ymin': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
            'image/object/bbox/xmax': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0),
            'image/object/bbox/ymax': dali.tfrecord.VarLenFeature(dali.tfrecord.float32, 0.0)
        })
    if self.training:
        # Training path: random area/aspect-ratio crop fused into decode,
        # then resize to the exact (width, height) target.
        self.decode = dali.ops.ImageDecoderRandomCrop(
            device="cpu" if dali_cpu else "mixed",
            output_type=dali.types.RGB,
            random_aspect_ratio=[0.75, 1.33],
            random_area=[0.05, 1.0],
            num_attempts=100)
        self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu", resize_x=width, resize_y=height)
    else:
        # Validation path: plain decode, then resize shorter side to 256.
        self.decode = dali.ops.ImageDecoder(
            device="cpu" if dali_cpu else "mixed",
            output_type=dali.types.RGB)
        # Make sure that every image > 224 for CropMirrorNormalize
        self.resize = dali.ops.Resize(device="cpu" if dali_cpu else "gpu", resize_shorter=256)
    # GPU-side crop + optional mirror + per-channel mean/std normalization,
    # emitting float NHWC output.
    self.normalize = dali.ops.CropMirrorNormalize(
        device="gpu",
        output_dtype=dali.types.FLOAT,
        crop=(height, width),
        image_type=dali.types.RGB,
        mean=[123.68, 116.28, 103.53],
        std=[58.395, 57.120, 57.385],
        output_layout=dali.types.NHWC)
    self.cast_float = dali.ops.Cast(device="gpu", dtype=dali.types.FLOAT)
    # Per-sample coin flip — presumably feeds the mirror input of
    # CropMirrorNormalize in define_graph (not visible here); verify there.
    self.mirror = dali.ops.CoinFlip()
    self.iter = 0
def evaluate(
    self,
    iter_unit,
    num_iter,
    batch_size,
    warmup_steps=50,
    log_every_n_steps=1,
    is_benchmark=False,
    export_dir=None,
    quantize=False,
    symmetric=False,
    use_qdq=False,
    use_final_conv=False,
):
    """Run single-GPU model evaluation and log accuracy/latency metrics.

    Args:
        iter_unit: 'epoch' or 'batch' — how `num_iter` is interpreted.
        num_iter: number of units to evaluate (used directly as steps when
            no data_dir is configured, i.e. benchmark/synthetic mode).
        batch_size: global batch size (single worker — multi-GPU is rejected).
        warmup_steps: steps skipped by the benchmark logging hook.
        log_every_n_steps: logging period for the benchmark hook.
        is_benchmark: allows running without `data_dir` (synthetic data).
        export_dir: when set, the evaluated model is exported as a SavedModel.
        quantize, symmetric, use_qdq, use_final_conv: forwarded to the
            estimator via `run_params`.

    Raises:
        ValueError: on unknown `iter_unit`, or missing `data_dir` outside
            benchmark mode.
        RuntimeError: when called from a non-zero Horovod rank.
    """
    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)
    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for evaluation!')
    # Evaluation is single-process only.
    if hvd.rank() != 0:
        raise RuntimeError('Multi-GPU inference is not supported')

    estimator_params = {
        'quantize': quantize,
        'symmetric': symmetric,
        'use_qdq': use_qdq,
        'use_final_conv': use_final_conv
    }
    image_classifier = self._get_estimator(
        mode='validation',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    if self.run_hparams.data_dir is not None:
        # Derive file list and step/epoch counts from the TFRecord dataset.
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="validation",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=batch_size,
        )
    else:
        # Synthetic/benchmark mode: run exactly num_iter steps.
        num_epochs = 1
        num_decay_steps = -1
        num_steps = num_iter

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir,
            mode="validation")

    eval_hooks = []
    if hvd.rank() == 0:
        # Collect throughput/latency statistics during evaluation.
        self.eval_logging_hook = hooks.BenchmarkLoggingHook(
            global_batch_size=batch_size,
            warmup_steps=warmup_steps,
            logging_steps=log_every_n_steps)
        eval_hooks.append(self.eval_logging_hook)
        print('Starting Model Evaluation...')
        print("Evaluation Epochs", num_epochs)
        print("Evaluation Steps", num_steps)
        print("Decay Steps", num_decay_steps)
        print("Global Batch Size", batch_size)

    def evaluation_data_fn():
        # Input selection mirrors training: DALI > TFRecords > synthetic.
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                print("Using DALI input... ")
            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=False,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=False,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        else:
            print("Using Synthetic Data ...\n")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        eval_results = image_classifier.evaluate(
            input_fn=evaluation_data_fn,
            steps=num_steps,
            hooks=eval_hooks,
        )
        eval_throughput = self.eval_logging_hook.mean_throughput.value()
        if len(self.eval_logging_hook.latencies) > 0:
            # Latencies are recorded in seconds; convert to ms and report
            # mean plus p90/p95/p99 quantiles.
            eval_latencies = np.array(self.eval_logging_hook.latencies) * 1000
            eval_latencies_q = np.quantile(eval_latencies, q=[0.9, 0.95, 0.99])
            eval_latencies_mean = np.mean(eval_latencies)
            additional_metrics = {
                'eval_latency_avg': eval_latencies_mean,
                'eval_latency_p90': eval_latencies_q[0],
                'eval_latency_p95': eval_latencies_q[1],
                'eval_latency_p99': eval_latencies_q[2],
            }
        else:
            additional_metrics = {}
        dllogger.log(data={
            'top1_accuracy': float(eval_results['top1_accuracy']),
            'top5_accuracy': float(eval_results['top5_accuracy']),
            'eval_throughput': eval_throughput,
            **additional_metrics
        }, step=tuple())

        if export_dir is not None:
            # Export the evaluated model as a SavedModel (dynamic batch size).
            dllogger.log(data={'export_dir': export_dir}, step=tuple())
            input_receiver_fn = data_utils.get_serving_input_receiver_fn(
                batch_size=None,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                dtype=self.run_hparams.dtype)
            self.exported_path = image_classifier.export_savedmodel(export_dir, input_receiver_fn)
    except KeyboardInterrupt:
        print("Keyboard interrupt")
    print('Model evaluation finished')
def __init__(
        self,
        # ========= Model HParams ========= #
        n_classes=1001,
        architecture='resnet50',
        input_format='NHWC',    # NCHW or NHWC
        compute_format='NCHW',  # NCHW or NHWC
        dtype=tf.float32,       # tf.float32 or tf.float16
        n_channels=3,
        height=224,
        width=224,
        distort_colors=False,
        model_dir=None,
        log_dir=None,
        data_dir=None,
        data_idx_dir=None,
        weight_init="fan_out",
        # ======= Optimization HParams ======== #
        use_xla=False,
        use_tf_amp=False,
        use_dali=False,
        use_cpu=False,
        gpu_memory_fraction=1.0,
        gpu_id=0,
        # ======== Debug Flags ======== #
        debug_verbosity=0,
        seed=None):
    """Validate the configuration, set TF performance environment flags,
    bundle hyper-parameters into `self.run_hparams`, and build the ResNet
    model object.

    Raises:
        ValueError: on unsupported dtype, compute/input format, or channel
            count.
        RuntimeError: when TF-AMP is combined with explicit FP16 (see below).
    """
    if dtype not in [tf.float32, tf.float16]:
        raise ValueError(
            "Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)" % dtype)
    if compute_format not in ["NHWC", 'NCHW']:
        raise ValueError(
            "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])" % compute_format)
    if input_format not in ["NHWC", 'NCHW']:
        raise ValueError(
            "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])" % input_format)
    if n_channels not in [1, 3]:
        raise ValueError(
            "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))" % n_channels)

    # Distinct, reproducible seed per Horovod rank (matches _get_run_config).
    tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None

    # ============================================
    # Optimsation Flags - Do not remove
    # ============================================
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '2'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
    os.environ["TF_XLA_FLAGS"] = (os.environ.get("TF_XLA_FLAGS", "") + " --tf_xla_enable_lazy_compilation=false")

    # ============================================
    # TF-AMP Setup - Do not remove
    # ============================================
    if dtype == tf.float16:
        # Explicit FP16 and the AMP graph rewrite are mutually exclusive.
        if use_tf_amp:
            raise RuntimeError("TF AMP can not be activated for FP16 precision")
    elif use_tf_amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
    else:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "0"
    # =================================================

    # BUG FIX: width/height were swapped here (width=height, height=width).
    # Harmless for the default square 224x224 inputs, wrong for any
    # rectangular size; train()/evaluate() consume these keys by name.
    model_hparams = tf.contrib.training.HParams(
        width=width,
        height=height,
        n_channels=n_channels,
        n_classes=n_classes,
        dtype=dtype,
        input_format=input_format,
        compute_format=compute_format,
        distort_colors=distort_colors,
        seed=tf_seed)

    # DALI does its own multi-threaded preprocessing, so fewer TF threads.
    num_preprocessing_threads = 10 if not use_dali else 4

    run_config_performance = tf.contrib.training.HParams(
        num_preprocessing_threads=num_preprocessing_threads,
        use_tf_amp=use_tf_amp,
        use_xla=use_xla,
        use_dali=use_dali,
        use_cpu=use_cpu,
        gpu_memory_fraction=gpu_memory_fraction,
        gpu_id=gpu_id)

    run_config_additional = tf.contrib.training.HParams(
        model_dir=model_dir,
        log_dir=log_dir if hvd.rank() == 0 else None,  # only rank 0 logs
        data_dir=data_dir,
        data_idx_dir=data_idx_dir,
        num_preprocessing_threads=num_preprocessing_threads)

    self.run_hparams = Runner._build_hparams(model_hparams, run_config_additional, run_config_performance)

    model_name = architecture
    architecture = resnet.model_architectures[architecture]

    self._model = resnet.ResnetModel(
        model_name=model_name,
        n_classes=model_hparams.n_classes,
        layers_count=architecture["layers"],
        layers_depth=architecture["widths"],
        expansions=architecture["expansions"],
        input_format=model_hparams.input_format,
        compute_format=model_hparams.compute_format,
        dtype=model_hparams.dtype,
        weight_init=weight_init,
        use_dali=use_dali,
        use_cpu=use_cpu,
        cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
        use_se=architecture['use_se'] if 'use_se' in architecture else False,
        se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1)

    if self.run_hparams.seed is not None:
        tf.set_random_seed(self.run_hparams.seed)

    self.training_logging_hook = None
    self.eval_logging_hook = None
def train(self,
          iter_unit,
          num_iter,
          run_iter,
          batch_size,
          warmup_steps=50,
          weight_decay=1e-4,
          lr_init=0.1,
          lr_warmup_epochs=5,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=256,
          label_smoothing=0.0,
          mixup=0.0,
          use_cosine_lr=False,
          use_static_loss_scaling=False,
          is_benchmark=False,
          quantize=False,
          symmetric=False,
          quant_delay=0,
          finetune_checkpoint=None,
          use_final_conv=False,
          use_qdq=False):
    """Train the model for (up to) `num_iter` units via tf.estimator.

    Args:
        iter_unit: 'epoch' or 'batch' — how `num_iter`/`run_iter` are
            interpreted.
        num_iter: total training length in `iter_unit` units.
        run_iter: length of this invocation; -1 means "the full num_steps"
            (the actual value is later clamped by the restored global_step).
        batch_size: per-GPU batch size (global = batch_size * hvd.size()).
        warmup_steps, log_every_n_steps: logging-hook configuration.
        weight_decay, lr_init, lr_warmup_epochs, momentum, loss_scale,
        label_smoothing, mixup, use_cosine_lr: optimizer/schedule params
            forwarded to the estimator.
        use_static_loss_scaling: static vs automatic AMP loss scaling;
            forced to False for FP32 training.
        is_benchmark: allows running without `data_dir` and switches to the
            benchmark logging hook.
        quantize, symmetric, quant_delay, use_qdq, use_final_conv,
        finetune_checkpoint: quantization / fine-tuning options forwarded
            to the estimator.

    Raises:
        ValueError: on unknown `iter_unit`, or missing `data_dir` outside
            benchmark mode.
    """
    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)
    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        # Mixed precision: choose static vs automatic loss scaling.
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        # Derive file list and step/epoch counts from the TFRecord dataset.
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="train",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size,
        )
        steps_per_epoch = num_steps / num_epochs
    else:
        # Synthetic/benchmark mode: one "epoch" of num_iter steps.
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if run_iter == -1:
        run_iter = num_steps
    else:
        run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir,
            mode="train")

    training_hooks = []
    if hvd.rank() == 0:
        print('Starting Model Training...')
        print("Training Epochs", num_epochs)
        print("Total Steps", num_steps)
        print("Steps per Epoch", steps_per_epoch)
        print("Decay Steps", num_decay_steps)
        print("Weight Decay Factor", weight_decay)
        print("Init Learning Rate", lr_init)
        print("Momentum", momentum)
        print("Num GPUs", num_gpus)
        print("Per-GPU Batch Size", batch_size)

        # Only rank 0 logs progress/throughput.
        if is_benchmark:
            self.training_logging_hook = hooks.BenchmarkLoggingHook(
                global_batch_size=global_batch_size,
                warmup_steps=warmup_steps,
                logging_steps=log_every_n_steps)
        else:
            self.training_logging_hook = hooks.TrainingLoggingHook(
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                steps_per_epoch=steps_per_epoch,
                logging_steps=log_every_n_steps)
        training_hooks.append(self.training_logging_hook)

    if hvd.size() > 1:
        # Broadcast rank-0 initial weights to all workers.
        bcast_hook = hvd.hvd_global_object.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())
    training_hooks.append(hooks.TrainingPartitionHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'mixup': mixup,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr,
        'use_final_conv': use_final_conv,
        'quantize': quantize,
        'use_qdq': use_qdq,
        'symmetric': symmetric,
        'quant_delay': quant_delay
    }

    if finetune_checkpoint:
        estimator_params['finetune_checkpoint'] = finetune_checkpoint

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    def training_data_fn():
        # Input selection: DALI > TFRecords > synthetic.
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                print("Using DALI input... ")
            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)
        else:
            if hvd.rank() == 0:
                print("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        # Resume support: read the restored global_step; a fresh model_dir
        # has no variables yet, which raises ValueError.
        current_step = image_classifier.get_variable_value("global_step")
    except ValueError:
        current_step = 0

    # Clamp this invocation so total training never exceeds num_steps.
    run_iter = max(0, min(run_iter, num_steps - current_step))
    print("Current step:", current_step)

    if run_iter > 0:
        try:
            image_classifier.train(
                input_fn=training_data_fn,
                steps=run_iter,
                hooks=training_hooks,
            )
        except KeyboardInterrupt:
            print("Keyboard interrupt")

    if hvd.rank() == 0:
        if run_iter > 0:
            print('Ending Model Training ...')
            train_throughput = self.training_logging_hook.mean_throughput.value()
            dllogger.log(data={'train_throughput': train_throughput}, step=tuple())
        else:
            print('Model already trained required number of steps. Skipped')
def __init__(
    self,
    model_name,
    n_classes,
    layers_count,
    layers_depth,
    expansions,
    compute_format='NCHW',
    input_format='NHWC',
    weight_init='fan_out',
    dtype=tf.float32,
    use_dali=False,
    use_cpu=False,
    cardinality=1,
    use_se=False,
    se_ratio=1,
):
    """Bundle all ResNet hyper-parameters into HParams groups.

    Args:
        model_name: architecture name (e.g. as listed in
            resnet.model_architectures).
        n_classes: number of output classes.
        layers_count, layers_depth, expansions: per-stage block counts,
            widths, and expansion factors of the architecture.
        compute_format, input_format: 'NCHW' or 'NHWC' layouts for compute
            and input tensors.
        weight_init: variance-scaling mode for conv/dense kernels
            (e.g. 'fan_out').
        dtype: tf.float32 or tf.float16 compute dtype.
        use_dali, use_cpu: input-pipeline / device flags carried in hparams.
        cardinality, use_se, se_ratio: ResNeXt / Squeeze-Excite options.
    """
    # Core architecture description consumed by the model-building code.
    self.model_hparams = tf.contrib.training.HParams(
        n_classes=n_classes,
        compute_format=compute_format,
        input_format=input_format,
        dtype=dtype,
        layers_count=layers_count,
        layers_depth=layers_depth,
        expansions=expansions,
        model_name=model_name,
        use_dali=use_dali,
        use_cpu=use_cpu,
        cardinality=cardinality,
        use_se=use_se,
        se_ratio=se_ratio
    )

    # Batch-norm configuration: identity-initialized scale/offset and
    # moving statistics.
    self.batch_norm_hparams = tf.contrib.training.HParams(
        decay=0.9,
        epsilon=1e-5,
        scale=True,
        center=True,
        param_initializers={
            'beta': tf.constant_initializer(0.0),
            'gamma': tf.constant_initializer(1.0),
            'moving_mean': tf.constant_initializer(0.0),
            'moving_variance': tf.constant_initializer(1.0)
        },
    )

    # He-style (scale=2.0) truncated-normal variance scaling for conv kernels.
    self.conv2d_hparams = tf.contrib.training.HParams(
        kernel_initializer=tf.compat.v1.variance_scaling_initializer(
            scale=2.0, distribution='truncated_normal', mode=weight_init
        ),
        bias_initializer=tf.constant_initializer(0.0)
    )

    # Same initialization scheme for the dense (classifier) layer.
    self.dense_hparams = tf.contrib.training.HParams(
        kernel_initializer=tf.compat.v1.variance_scaling_initializer(
            scale=2.0, distribution='truncated_normal', mode=weight_init
        ),
        bias_initializer=tf.constant_initializer(0.0)
    )

    # Only rank 0 prints the configuration summary.
    if hvd.rank() == 0:
        print("Model HParams:")
        print("Name", model_name)
        print("Number of classes", n_classes)
        print("Compute_format", compute_format)
        print("Input_format", input_format)
        print("dtype", str(dtype))
from utils import hvd_wrapper as hvd import tensorflow as tf import os import warnings warnings.simplefilter("ignore") if __name__ == "__main__": tf.logging.set_verbosity(tf.logging.ERROR) FLAGS = parse_cmdline(model_architectures.keys()) hvd.init(True) if hvd.rank() == 0: log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename) os.makedirs(FLAGS.results_dir, exist_ok=True) dllogger.init(backends=[ dllogger.JSONStreamBackend( verbosity=dllogger.Verbosity.VERBOSE, filename=log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.log(data=vars(FLAGS), step='PARAMETER') runner = Runner( # ========= Model HParams ========= # n_classes=1001,