def train(self,
          iter_unit,
          num_iter,
          run_iter,
          batch_size,
          warmup_steps=50,
          weight_decay=1e-4,
          lr_init=0.1,
          lr_warmup_epochs=5,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=256,
          label_smoothing=0.0,
          mixup=0.0,
          use_cosine_lr=False,
          use_static_loss_scaling=False,
          is_benchmark=False,
          quantize=False,
          symmetric=False,
          quant_delay=0,
          finetune_checkpoint=None,
          use_final_conv=False,
          use_qdq=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="train",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size,
        )

        steps_per_epoch = num_steps / num_epochs
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if run_iter == -1:
        run_iter = num_steps
    else:
        run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

    training_hooks = []

    if hvd.rank() == 0:
        print('Starting Model Training...')
        print("Training Epochs", num_epochs)
        print("Total Steps", num_steps)
        print("Steps per Epoch", steps_per_epoch)
        print("Decay Steps", num_decay_steps)
        print("Weight Decay Factor", weight_decay)
        print("Init Learning Rate", lr_init)
        print("Momentum", momentum)
        print("Num GPUs", num_gpus)
        print("Per-GPU Batch Size", batch_size)

        if is_benchmark:
            self.training_logging_hook = hooks.BenchmarkLoggingHook(
                global_batch_size=global_batch_size, warmup_steps=warmup_steps)
        else:
            self.training_logging_hook = hooks.TrainingLoggingHook(
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                steps_per_epoch=steps_per_epoch)
        training_hooks.append(self.training_logging_hook)

    if hvd_utils.is_using_hvd():
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())
    training_hooks.append(hooks.TrainingPartitionHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'mixup': mixup,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr,
        'use_final_conv': use_final_conv,
        'quantize': quantize,
        'use_qdq': use_qdq,
        'symmetric': symmetric,
        'quant_delay': quant_delay
    }

    if finetune_checkpoint:
        estimator_params['finetune_checkpoint'] = finetune_checkpoint

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    def training_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                print("Using DALI input...")

            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)

        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)

        else:
            if hvd.rank() == 0:
                print("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        current_step = image_classifier.get_variable_value("global_step")
    except ValueError:
        current_step = 0

    run_iter = max(0, min(run_iter, num_steps - current_step))
    print("Current step:", current_step)

    if run_iter > 0:
        try:
            image_classifier.train(
                input_fn=training_data_fn,
                steps=run_iter,
                hooks=training_hooks,
            )
        except KeyboardInterrupt:
            print("Keyboard interrupt")

    if hvd.rank() == 0:
        if run_iter > 0:
            print('Ending Model Training ...')
            train_throughput = self.training_logging_hook.mean_throughput.value()
            train_time = self.training_logging_hook.train_time
            dllogger.log(data={'train_throughput': train_throughput}, step=tuple())
            dllogger.log(data={'Total Training time': train_time}, step=tuple())
        else:
            print('Model already trained for the required number of steps. Skipping.')
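
# Hedged usage sketch (not part of the original file): the `run_iter` plumbing
# above lets one invocation train only part of the total schedule and then exit,
# so an outer wrapper can restart the process between chunks. The helper below
# mirrors that resolution logic as a standalone, testable function.
def resolve_run_iter(run_iter, iter_unit, steps_per_epoch, num_steps, current_step):
    """-1 means 'train to the end'; epochs are converted to steps; the result
    is clamped to the steps remaining after `current_step`."""
    if run_iter == -1:
        run_iter = num_steps
    elif iter_unit == "epoch":
        run_iter = steps_per_epoch * run_iter
    return max(0, min(run_iter, num_steps - current_step))

assert resolve_run_iter(-1, "epoch", 100, 9000, 0) == 9000
assert resolve_run_iter(10, "epoch", 100, 9000, 8600) == 400  # clamped to what's left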
def train(
    self,
    iter_unit,
    num_iter,
    batch_size,
    warmup_steps=50,
    weight_decay=1e-4,
    learning_rate_init=0.1,
    momentum=0.9,
    log_every_n_steps=10,
    is_benchmark=False
):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp:
        if hvd.rank() == 0:
            LOGGER.log("TF Loss Auto Scaling is activated - Experimental Feature")
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"

    num_gpus = hvd.size()
    global_batch_size = batch_size * num_gpus

    num_samples = data_utils.get_num_images(self.run_hparams.data_dir, mode="train")
    scaled_learning_rate = learning_rate_init * global_batch_size

    if iter_unit == 'epoch':
        num_steps = (num_samples // global_batch_size) * num_iter
        num_epochs = num_iter
        steps_per_epoch = num_steps / num_epochs
    else:
        num_steps = num_iter
        num_epochs = math.ceil(num_steps / (num_samples // global_batch_size))
        steps_per_epoch = num_steps

    training_hooks = []

    if hvd.rank() == 0:
        LOGGER.log('Starting Model Training...')
        LOGGER.log("Training Epochs", num_epochs)
        LOGGER.log("Total Steps", num_steps)
        LOGGER.log("Steps per Epoch", steps_per_epoch)
        LOGGER.log("Init Learning Rate", learning_rate_init)
        LOGGER.log("Scaled Learning Rate", scaled_learning_rate)
        LOGGER.log("Momentum", momentum)
        LOGGER.log("Num GPUs", num_gpus)
        LOGGER.log("Per-GPU Batch Size", batch_size)

    training_logging_hook = hooks.TrainingLoggingHook(
        global_batch_size=global_batch_size,
        log_every=log_every_n_steps
    )
    training_hooks.append(training_logging_hook)

    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    training_hooks.append(bcast_hook)

    estimator_params = {
        'batch_size': batch_size,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'learning_rate': scaled_learning_rate,
        'log_dir': self.run_hparams.log_dir,
    }

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla
    )

    def training_data_fn():
        return data_utils.get_input_fn(
            data_dir=self.run_hparams.data_dir,
            mode='train',
            batch_size=batch_size,
            label_output_scale=self.run_hparams.label_output_scale
        )

    try:
        image_classifier.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if hvd.rank() == 0:
        LOGGER.log('Ending Model Training ...')
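
# Worked example (illustrative values, not from the original file) of the step
# accounting and the linear learning-rate scaling used in the variant above:
num_samples = 1281167            # assumed ImageNet-1k train-split size
batch_size, num_gpus = 128, 8    # per-GPU batch, hvd.size()
global_batch_size = batch_size * num_gpus                    # 1024
num_iter = 90                    # iter_unit == 'epoch'
num_steps = (num_samples // global_batch_size) * num_iter    # 1251 * 90 = 112590
scaled_learning_rate = 0.1 * global_batch_size               # 102.4, grows with batch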
def train(
    self,
    iter_unit,
    num_iter,
    batch_size,
    warmup_steps=50,
    weight_decay=1e-4,
    lr_init=0.1,
    lr_warmup_epochs=5,
    momentum=0.9,
    log_every_n_steps=1,
    loss_scale=256,
    label_smoothing=0.0,
    use_cosine_lr=False,
    use_static_loss_scaling=False,
    is_benchmark=False
):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            LOGGER.log("TF Loss Auto Scaling is activated")
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="train",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size,
        )

        steps_per_epoch = num_steps / num_epochs
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir,
            mode="train"
        )

    training_hooks = []

    if hvd.rank() == 0:
        LOGGER.log('Starting Model Training...')
        LOGGER.log("Training Epochs", num_epochs)
        LOGGER.log("Total Steps", num_steps)
        LOGGER.log("Steps per Epoch", steps_per_epoch)
        LOGGER.log("Decay Steps", num_decay_steps)
        LOGGER.log("Weight Decay Factor", weight_decay)
        LOGGER.log("Init Learning Rate", lr_init)
        LOGGER.log("Momentum", momentum)
        LOGGER.log("Num GPUs", num_gpus)
        LOGGER.log("Per-GPU Batch Size", batch_size)

        if is_benchmark:
            benchmark_logging_hook = hooks.BenchmarkLoggingHook(
                log_file_path=os.path.join(self.run_hparams.log_dir, "training_benchmark.json"),
                global_batch_size=global_batch_size,
                log_every=log_every_n_steps,
                warmup_steps=warmup_steps
            )
            training_hooks.append(benchmark_logging_hook)
        else:
            training_logging_hook = hooks.TrainingLoggingHook(
                log_file_path=os.path.join(self.run_hparams.log_dir, "training.json"),
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                log_every=log_every_n_steps
            )
            training_hooks.append(training_logging_hook)

    if hvd_utils.is_using_hvd():
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())

    # NVTX
    nvtx_callback = NVTXHook(skip_n_steps=1, name='Train')
    training_hooks.append(nvtx_callback)

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr
    }

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction
    )

    def training_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                LOGGER.log("Using DALI input...")

            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True
            )

        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True
            )

        else:
            if hvd.rank() == 0:
                LOGGER.log("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        image_classifier.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if hvd.rank() == 0:
        LOGGER.log('Ending Model Training ...')
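
# Minimal sketch of what a synthetic input_fn can look like. Assumption:
# data_utils.get_synth_input_fn is not shown in this file, so this stand-in
# only illustrates the idea of benchmarking without a real data pipeline.
import tensorflow as tf

def synth_input_fn(batch_size, height, width, num_channels, num_classes, dtype=tf.float32):
    image = tf.random.uniform([batch_size, height, width, num_channels], dtype=dtype)
    label = tf.random.uniform([batch_size], maxval=num_classes, dtype=tf.int32)
    # One random batch, repeated forever: input cost is ~zero, so throughput
    # numbers reflect the model, not the loader.
    return tf.data.Dataset.from_tensors((image, label)).repeat()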
def train(self,
          iter_unit,
          num_iter,
          batch_size,
          warmup_steps=10,
          weight_decay=2e-4,
          learning_rate_init=0.1,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=1024,
          use_auto_loss_scaling=False,
          is_benchmark=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_fast_math or self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_auto_loss_scaling:
            LOGGER.log("TF Loss Auto Scaling is activated - Experimental Feature")
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
    else:
        use_auto_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
    global_batch_size = batch_size * num_gpus
    num_epochs = num_iter

    # training_hooks = [tf_debug.LocalCLIDebugHook(), tf_debug.TensorBoardDebugHook("0.0.0.0:6006")]
    training_hooks = []

    num_samples = 50000  # hard-coded training-set size (matches CIFAR-10)
    num_steps = (num_samples // global_batch_size) * num_iter
    steps_per_epoch = num_steps / num_epochs
    num_decay_steps = num_steps

    if hvd.rank() == 0:
        LOGGER.log('Starting Model Training...')
        LOGGER.log("Training Epochs", num_epochs)
        LOGGER.log("Total Steps", num_steps)
        LOGGER.log("Steps per Epoch", steps_per_epoch)
        # LOGGER.log("Decay Steps", num_decay_steps)
        LOGGER.log("Weight Decay Factor", weight_decay)
        LOGGER.log("Init Learning Rate", learning_rate_init)
        LOGGER.log("Momentum", momentum)
        LOGGER.log("Num GPUs", num_gpus)
        LOGGER.log("Per-GPU Batch Size", batch_size)

    if is_benchmark:
        benchmark_logging_hook = hooks.BenchmarkLoggingHook(
            log_file_path=os.path.join(self.run_hparams.log_dir, "training_benchmark.json"),
            global_batch_size=global_batch_size,
            log_every=log_every_n_steps,
            warmup_steps=warmup_steps)
        training_hooks.append(benchmark_logging_hook)
    else:
        training_logging_hook = hooks.TrainingLoggingHook(
            log_file_path=os.path.join(self.run_hparams.log_dir, "training.json"),
            global_batch_size=global_batch_size,
            num_steps=num_steps,
            num_samples=num_samples,
            num_epochs=num_epochs,
            log_every=log_every_n_steps)
        training_hooks.append(training_logging_hook)

    if hvd_utils.is_using_hvd():
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'learning_rate_init': learning_rate_init,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': not use_auto_loss_scaling
    }

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla)

    def training_data_fn():
        if self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                data_dir=self.run_hparams.data_dir,
                num_epochs=num_iter,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                # distort_color=self.run_hparams.distort_colors,
                # num_threads=self.run_hparams.num_preprocessing_threads,
                datasets_num_private_threads=None
                # deterministic=False if self.run_hparams.seed is None else True
            )
        else:
            if hvd.rank() == 0:
                LOGGER.log("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        image_classifier.train(
            input_fn=training_data_fn,
            steps=num_steps,
            hooks=training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if hvd.rank() == 0:
        LOGGER.log('Ending Model Training ...')

        # Export + TF-TRT conversion only on rank 0 so that multi-GPU jobs do
        # not write the same SavedModel several times.
        input_receiver_fn = data_utils.build_serving_input_receiver_fn(
            shape=[32, 32, 3],
            dtype=tf.float32,
            batch_size=None)

        image_classifier.export_savedmodel(
            export_dir_base=self.run_hparams.model_dir,
            serving_input_receiver_fn=input_receiver_fn)

        start_time = time.time()
        trt_graph = trt.create_inference_graph(
            input_graph_def=None,
            outputs=None,
            # Picks the first export subdirectory (assumes a single export).
            input_saved_model_dir=os.path.join(
                self.run_hparams.model_dir,
                next(os.walk(self.run_hparams.model_dir))[1][0]),
            input_saved_model_tags=['serve'],
            max_batch_size=128,
            max_workspace_size_bytes=1 << 20,
            precision_mode="FP16")
        print("time(s) to run tf-trt: %d" % (time.time() - start_time))
        print("number of nodes after conversion: %d" % len(trt_graph.node))
        print("number of trt engine nodes: %d" %
              len([1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp']))

        # with tf.gfile.GFile("test.pb", "wb") as f:
        #     f.write(trt_graph.SerializeToString())

        tf.io.write_graph(trt_graph, self.run_hparams.model_dir, 'model.graphdef', as_text=False)
        print("write graph to: %s/model.graphdef" % self.run_hparams.model_dir)
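
# Hedged sketch (not in the original file): reading the serialized TF-TRT graph
# written above back in for inference with the TF1-style API. The tensor names
# "input:0" and "softmax:0" are hypothetical -- read the real ones from the
# exported SavedModel signature (e.g. `saved_model_cli show --dir ... --all`).
import os
import numpy as np
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt  # side effect: registers TRTEngineOp kernels (TF1)

model_dir = "/results/model_dir"  # assumption: wherever run_hparams.model_dir points

graph_def = tf.compat.v1.GraphDef()
with tf.io.gfile.GFile(os.path.join(model_dir, "model.graphdef"), "rb") as f:
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")
    with tf.compat.v1.Session(graph=graph) as sess:
        images = np.random.rand(128, 32, 32, 3).astype(np.float32)  # CIFAR-shaped batch
        probs = sess.run("softmax:0", feed_dict={"input:0": images})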