def after_run(self, run_context, run_values):
    # results[0] is the global step; results[1] (present only when the allreduce
    # was fetched in before_run) is the summed stop-signal count across ranks.
    self.global_step = run_values.results[0] + 1

    if hvd.size() > 1 and len(run_values.results) == 2:
        if run_values.results[1] > 0:
            run_context.request_stop()
    elif self.signal_recieved:
        run_context.request_stop()
def before_run(self, run_context):
    fetches = [tf.train.get_global_step()]
    feed_dict = None

    # Every `sync_freq` steps, allreduce the local stop-signal flag so that all
    # ranks agree on whether training should be interrupted.
    if hvd.size() > 1 and (self.global_step % self.sync_freq) == 0:
        fetches += [self.allreduce_op]
        feed_dict = {self.input_op: int(self.signal_recieved)}

    return tf.train.SessionRunArgs(fetches, feed_dict=feed_dict)
def _get_session_config(mode, use_xla, use_dali, use_cpu, gpu_memory_fraction, gpu_id=0):
    if mode not in ["train", 'validation', 'benchmark', 'inference']:
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)

    config = tf.ConfigProto()

    if not use_cpu:
        # Limit available GPU memory (tune the size)
        if use_dali:
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction)
            config = tf.ConfigProto(gpu_options=gpu_options)
            config.gpu_options.allow_growth = False
        else:
            config.gpu_options.allow_growth = True

        config.allow_soft_placement = True
        config.log_device_placement = False
        config.gpu_options.visible_device_list = str(gpu_id)
        config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        if hvd.size() > 1:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
            config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    if use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    if mode == 'train':
        if not use_cpu:
            config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

        config.inter_op_parallelism_threads = max(
            2, (multiprocessing.cpu_count() // max(hvd.size(), 8) - 2))

    return config
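# Hypothetical usage sketch (not part of the original code; the argument values
# below are illustrative assumptions): the ConfigProto returned above would
# typically be handed to the Estimator through a RunConfig.
session_config = _get_session_config(
    mode="train",
    use_xla=True,
    use_dali=False,
    use_cpu=False,
    gpu_memory_fraction=0.7,
)
run_config = tf.estimator.RunConfig(session_config=session_config)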
def __init__(self, filenames, idx_filenames, height, width, batch_size, num_threads,
             dtype=tf.uint8, dali_cpu=True, deterministic=False, training=False):
    device_id = hvd.local_rank()
    shard_id = hvd.rank()
    num_gpus = hvd.size()

    pipe = HybridPipe(
        tfrec_filenames=filenames,
        tfrec_idx_filenames=idx_filenames,
        height=height,
        width=width,
        batch_size=batch_size,
        num_threads=num_threads,
        device_id=device_id,
        shard_id=shard_id,
        num_gpus=num_gpus,
        deterministic=deterministic,
        dali_cpu=dali_cpu,
        training=training)

    daliop = dali_tf.DALIIterator()

    with tf.device("/gpu:0"):
        self.images, self.labels = daliop(
            pipeline=pipe,
            shapes=[(batch_size, height, width, 3), (batch_size, 1)],
            dtypes=[tf.float32, tf.int64],
            device_id=device_id)
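# Hypothetical usage sketch (the wrapper class name `DALIPreprocessor` and the
# file-list variables are assumptions; the keyword names come from the __init__
# above): an Estimator input_fn can simply return the tensors produced by the
# DALI iterator.
def dali_input_fn():
    preproc = DALIPreprocessor(
        filenames=train_files,          # assumed list of TFRecord shards
        idx_filenames=train_idx_files,  # assumed matching DALI index files
        height=224, width=224,
        batch_size=256, num_threads=4,
        training=True)
    return preproc.images, preproc.labels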
def begin(self):
    # Build the allreduce op used to propagate the stop signal across ranks.
    if hvd.size() > 1:
        with tf.device("/cpu:0"):
            self.input_op = tf.placeholder(tf.int32, shape=())
            self.allreduce_op = hvd.hvd_global_object.allreduce(
                self.input_op,
                op=hvd.hvd_global_object.Sum,
                name="signal_handler_all_reduce")
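# Minimal self-contained sketch of the same graceful-stop idea, written against
# stock horovod.tensorflow instead of the project's `hvd.hvd_global_object`
# wrapper. The class name, the `sync_freq` default, and the SIGTERM handler are
# assumptions for illustration; only the begin/before_run/after_run flow mirrors
# the hook methods above.
import signal

import tensorflow as tf
import horovod.tensorflow as hvd


class GracefulStopHook(tf.estimator.SessionRunHook):
    def __init__(self, sync_freq=10):
        self.sync_freq = sync_freq
        self.global_step = 0
        self.signal_received = False
        signal.signal(signal.SIGTERM, lambda *_: setattr(self, "signal_received", True))

    def begin(self):
        # Allreduce op that sums the per-rank stop flags.
        with tf.device("/cpu:0"):
            self.input_op = tf.compat.v1.placeholder(tf.int32, shape=())
            self.allreduce_op = hvd.allreduce(self.input_op, op=hvd.Sum)

    def before_run(self, run_context):
        fetches = [tf.compat.v1.train.get_global_step()]
        feed_dict = None
        if self.global_step % self.sync_freq == 0:
            fetches.append(self.allreduce_op)
            feed_dict = {self.input_op: int(self.signal_received)}
        return tf.estimator.SessionRunArgs(fetches, feed_dict=feed_dict)

    def after_run(self, run_context, run_values):
        self.global_step = run_values.results[0] + 1
        # Stop every rank as soon as any rank has seen a signal.
        if len(run_values.results) == 2 and run_values.results[1] > 0:
            run_context.request_stop()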
def train(self,
          iter_unit,
          num_iter,
          run_iter,
          batch_size,
          warmup_steps=50,
          weight_decay=1e-4,
          lr_init=0.1,
          lr_warmup_epochs=5,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=256,
          label_smoothing=0.0,
          mixup=0.0,
          use_cosine_lr=False,
          use_static_loss_scaling=False,
          is_benchmark=False,
          quantize=False,
          symmetric=False,
          quant_delay=0,
          finetune_checkpoint=None,
          use_final_conv=False,
          use_qdq=False):

    if iter_unit not in ["epoch", "batch"]:
        raise ValueError('`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = runner_utils.parse_tfrecords_dataset(
            data_dir=self.run_hparams.data_dir,
            mode="train",
            iter_unit=iter_unit,
            num_iter=num_iter,
            global_batch_size=global_batch_size,
        )
        steps_per_epoch = num_steps / num_epochs
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if run_iter == -1:
        run_iter = num_steps
    else:
        run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

    training_hooks = []

    if hvd.rank() == 0:
        print('Starting Model Training...')
        print("Training Epochs", num_epochs)
        print("Total Steps", num_steps)
        print("Steps per Epoch", steps_per_epoch)
        print("Decay Steps", num_decay_steps)
        print("Weight Decay Factor", weight_decay)
        print("Init Learning Rate", lr_init)
        print("Momentum", momentum)
        print("Num GPUs", num_gpus)
        print("Per-GPU Batch Size", batch_size)

        if is_benchmark:
            self.training_logging_hook = hooks.BenchmarkLoggingHook(
                global_batch_size=global_batch_size,
                warmup_steps=warmup_steps,
                logging_steps=log_every_n_steps)
        else:
            self.training_logging_hook = hooks.TrainingLoggingHook(
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                steps_per_epoch=steps_per_epoch,
                logging_steps=log_every_n_steps)
        training_hooks.append(self.training_logging_hook)

    if hvd.size() > 1:
        bcast_hook = hvd.hvd_global_object.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())
    training_hooks.append(hooks.TrainingPartitionHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'mixup': mixup,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr,
        'use_final_conv': use_final_conv,
        'quantize': quantize,
        'use_qdq': use_qdq,
        'symmetric': symmetric,
        'quant_delay': quant_delay
    }

    if finetune_checkpoint:
        estimator_params['finetune_checkpoint'] = finetune_checkpoint

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    def training_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                print("Using DALI input... ")

            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)

        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)

        else:
            if hvd.rank() == 0:
                print("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        current_step = image_classifier.get_variable_value("global_step")
    except ValueError:
        current_step = 0

    run_iter = max(0, min(run_iter, num_steps - current_step))
    print("Current step:", current_step)

    if run_iter > 0:
        try:
            image_classifier.train(
                input_fn=training_data_fn,
                steps=run_iter,
                hooks=training_hooks,
            )
        except KeyboardInterrupt:
            print("Keyboard interrupt")

    if hvd.rank() == 0:
        if run_iter > 0:
            print('Ending Model Training ...')
            train_throughput = self.training_logging_hook.mean_throughput.value()
            dllogger.log(data={'train_throughput': train_throughput}, step=tuple())
        else:
            print('Model has already been trained for the requested number of steps. Skipping.')
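# Hypothetical invocation sketch (the Runner construction and the hyperparameter
# values are assumptions; only the keyword names are taken from train() above):
runner = Runner(...)  # construction details not shown in this excerpt
runner.train(
    iter_unit="epoch",
    num_iter=90,
    run_iter=-1,      # -1 means "run all remaining steps"
    batch_size=256,
    lr_init=0.256,
    use_cosine_lr=True,
)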
def _get_global_batch_size(worker_batch_size):
    return worker_batch_size * hvd.size()
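# Example (illustrative assumption): with 8 Horovod workers and a per-worker
# batch size of 256, _get_global_batch_size(256) returns 8 * 256 = 2048.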
def __call__(self, features, labels, mode, params):
    if mode == tf.estimator.ModeKeys.TRAIN:
        mandatory_params = [
            "batch_size", "lr_init", "num_gpus", "steps_per_epoch",
            "momentum", "weight_decay", "loss_scale", "label_smoothing"
        ]
        for p in mandatory_params:
            if p not in params:
                raise RuntimeError("Parameter {} is missing.".format(p))

    if mode == tf.estimator.ModeKeys.TRAIN and not self.model_hparams.use_dali:
        with tf.device('/cpu:0'):
            # Stage inputs on the host
            cpu_prefetch_op, (features, labels) = self._stage([features, labels])

        if not self.model_hparams.use_cpu:
            with tf.device('/gpu:0'):
                # Stage inputs to the device
                gpu_prefetch_op, (features, labels) = self._stage([features, labels])

    main_device = "/gpu:0" if not self.model_hparams.use_cpu else "/cpu:0"

    with tf.device(main_device):
        if features.dtype != self.model_hparams.dtype:
            features = tf.cast(features, self.model_hparams.dtype)

        # Subtract mean per channel and enforce values between [-1, 1]
        if not self.model_hparams.use_dali:
            features = normalized_inputs(features)

        mixup = 0
        eta = 0

        if mode == tf.estimator.ModeKeys.TRAIN:
            eta = params['label_smoothing']
            mixup = params['mixup']

        if mode != tf.estimator.ModeKeys.PREDICT:
            n_cls = self.model_hparams.n_classes
            one_hot_smoothed_labels = tf.one_hot(labels, n_cls,
                                                 on_value=1 - eta + eta / n_cls,
                                                 off_value=eta / n_cls)
            if mixup != 0:
                print("Using mixup training with beta=", params['mixup'])
                beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])

                feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])
                reversed_feature_coefficients = tf.subtract(
                    tf.ones(shape=feature_coefficients.shape), feature_coefficients
                )
                rotated_features = tf.reverse(features, axis=[0])
                features = feature_coefficients * features + reversed_feature_coefficients * rotated_features

                label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])
                rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])
                reversed_label_coefficients = tf.subtract(
                    tf.ones(shape=label_coefficients.shape), label_coefficients
                )
                one_hot_smoothed_labels = (label_coefficients * one_hot_smoothed_labels +
                                           reversed_label_coefficients * rotated_labels)

        # Update Global Step
        global_step = tf.train.get_or_create_global_step()
        tf.identity(global_step, name="global_step_ref")

        tf.identity(features, name="features_ref")

        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.identity(labels, name="labels_ref")

        probs, logits = self.build_model(
            features,
            training=mode == tf.estimator.ModeKeys.TRAIN,
            reuse=False,
            use_final_conv=params['use_final_conv']
        )

        if params['use_final_conv']:
            logits = tf.squeeze(logits, axis=[-2, -1])

        y_preds = tf.argmax(logits, axis=1, output_type=tf.int32)

        # Check the output dtypes; they should be FP32 during training
        assert (probs.dtype == tf.float32)
        assert (logits.dtype == tf.float32)
        assert (y_preds.dtype == tf.int32)

        tf.identity(logits, name="logits_ref")
        tf.identity(probs, name="probs_ref")
        tf.identity(y_preds, name="y_preds_ref")

        # if mode == tf.estimator.ModeKeys.TRAIN:
        #     assert (len(tf.trainable_variables()) == 161)
        # else:
        #     assert (len(tf.trainable_variables()) == 0)

        if mode == tf.estimator.ModeKeys.TRAIN and params['quantize']:
            dllogger.log(data={"QUANTIZATION AWARE TRAINING ENABLED": True}, step=tuple())
            if params['symmetric']:
                dllogger.log(data={"MODE": "USING SYMMETRIC MODE"}, step=tuple())
                tf.contrib.quantize.experimental_create_training_graph(
                    tf.get_default_graph(),
                    symmetric=True,
                    use_qdq=params['use_qdq'],
                    quant_delay=params['quant_delay']
                )
            else:
"USING ASSYMETRIC MODE"}, step=tuple()) tf.contrib.quantize.create_training_graph( tf.get_default_graph(), quant_delay=params['quant_delay'], use_qdq=params['use_qdq'] ) # Fix for restoring variables during fine-tuning of Resnet if 'finetune_checkpoint' in params.keys(): train_vars = tf.trainable_variables() train_var_dict = {} for var in train_vars: train_var_dict[var.op.name] = var dllogger.log(data={"Restoring variables from checkpoint": params['finetune_checkpoint']}, step=tuple()) tf.train.init_from_checkpoint(params['finetune_checkpoint'], train_var_dict) if mode == tf.estimator.ModeKeys.PREDICT: predictions = {'classes': y_preds, 'probabilities': probs} return tf.estimator.EstimatorSpec( mode=mode, predictions=predictions, export_outputs={'predict': tf.estimator.export.PredictOutput(predictions)} ) else: with tf.device(main_device): if mode == tf.estimator.ModeKeys.TRAIN: acc_top1 = tf.nn.in_top_k(predictions=logits, targets=labels, k=1) acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5) else: acc_top1, acc_top1_update_op = tf.metrics.mean( tf.nn.in_top_k(predictions=logits, targets=labels, k=1) ) acc_top5, acc_top5_update_op = tf.metrics.mean( tf.nn.in_top_k(predictions=logits, targets=labels, k=5) ) tf.identity(acc_top1, name="acc_top1_ref") tf.identity(acc_top5, name="acc_top5_ref") predictions = { 'classes': y_preds, 'probabilities': probs, 'accuracy_top1': acc_top1, 'accuracy_top5': acc_top5 } cross_entropy = tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=one_hot_smoothed_labels) assert (cross_entropy.dtype == tf.float32) tf.identity(cross_entropy, name='cross_entropy_loss_ref') def loss_filter_fn(name): """we don't need to compute L2 loss for BN and bias (eq. to add a cste)""" return all( [ tensor_name not in name.lower() # for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"] for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"] ] ) filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)] if len(filtered_params) != 0: l2_loss_per_vars = [tf.nn.l2_loss(v) for v in filtered_params] l2_loss = tf.multiply(tf.add_n(l2_loss_per_vars), params["weight_decay"]) else: l2_loss = tf.zeros(shape=(), dtype=tf.float32) assert (l2_loss.dtype == tf.float32) tf.identity(l2_loss, name='l2_loss_ref') total_loss = tf.add(cross_entropy, l2_loss, name="total_loss") assert (total_loss.dtype == tf.float32) tf.identity(total_loss, name='total_loss_ref') tf.summary.scalar('cross_entropy', cross_entropy) tf.summary.scalar('l2_loss', l2_loss) tf.summary.scalar('total_loss', total_loss) if mode == tf.estimator.ModeKeys.TRAIN: with tf.device("/cpu:0"): learning_rate = learning_rate_scheduler( lr_init=params["lr_init"], lr_warmup_epochs=params["lr_warmup_epochs"], global_step=global_step, batch_size=params["batch_size"], num_batches_per_epoch=params["steps_per_epoch"], num_decay_steps=params["num_decay_steps"], num_gpus=params["num_gpus"], use_cosine_lr=params["use_cosine_lr"] ) tf.identity(learning_rate, name='learning_rate_ref') tf.summary.scalar('learning_rate', learning_rate) optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"]) if params["apply_loss_scaling"]: optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"]) if hvd.size() > 1: optimizer = hvd.hvd_global_object.DistributedOptimizer(optimizer) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if mode != tf.estimator.ModeKeys.TRAIN: update_ops += [acc_top1_update_op, 
                deterministic = True
                gate_gradients = (tf.compat.v1.train.Optimizer.GATE_OP
                                  if deterministic else tf.compat.v1.train.Optimizer.GATE_NONE)

                backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                if self.model_hparams.use_dali:
                    train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                elif self.model_hparams.use_cpu:
                    train_ops = tf.group(
                        backprop_op, cpu_prefetch_op, update_ops, name='train_ops'
                    )
                else:
                    train_ops = tf.group(
                        backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops'
                    )

                return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)

            elif mode == tf.estimator.ModeKeys.EVAL:
                eval_metrics = {
                    "top1_accuracy": (acc_top1, acc_top1_update_op),
                    "top5_accuracy": (acc_top5, acc_top5_update_op)
                }

                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    predictions=predictions,
                    loss=total_loss,
                    eval_metric_ops=eval_metrics
                )

            else:
                raise NotImplementedError('Unknown mode {}'.format(mode))
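# Hypothetical wiring sketch (the class name `ResnetModel`, its constructor, and
# the model_dir value are assumptions; only the (features, labels, mode, params)
# signature comes from __call__ above). Any callable with that signature can be
# used directly as an Estimator model_fn.
model_fn = ResnetModel()  # assumed constructor; real arguments not shown in this excerpt
classifier = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir="/tmp/resnet_ckpts",   # assumed checkpoint directory
    params=estimator_params,         # e.g. the dict built in train() above
)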
            loss_scale=FLAGS.static_loss_scale,
            label_smoothing=FLAGS.label_smoothing,
            mixup=FLAGS.mixup,
            use_static_loss_scaling=(FLAGS.static_loss_scale != -1),
            use_cosine_lr=FLAGS.cosine_lr,
            is_benchmark=FLAGS.mode == 'training_benchmark',
            use_final_conv=FLAGS.use_final_conv,
            quantize=FLAGS.quantize,
            symmetric=FLAGS.symmetric,
            quant_delay=FLAGS.quant_delay,
            use_qdq=FLAGS.use_qdq,
            finetune_checkpoint=FLAGS.finetune_checkpoint)

    if FLAGS.mode in ["train_and_evaluate", 'evaluate', 'inference_benchmark']:
        if FLAGS.mode == 'inference_benchmark' and hvd.size() > 1:
            raise NotImplementedError("Only single GPU inference is implemented.")

        elif hvd.rank() == 0:
            runner.evaluate(
                iter_unit=FLAGS.iter_unit if FLAGS.mode != "train_and_evaluate" else "epoch",
                num_iter=FLAGS.num_iter if FLAGS.mode != "train_and_evaluate" else 1,
                warmup_steps=FLAGS.warmup_steps,
                batch_size=FLAGS.batch_size,
                log_every_n_steps=FLAGS.display_every,
                is_benchmark=FLAGS.mode == 'inference_benchmark',
                export_dir=FLAGS.export_dir,
                quantize=FLAGS.quantize,
                symmetric=FLAGS.symmetric,
                use_final_conv=FLAGS.use_final_conv,
                use_qdq=FLAGS.use_qdq)
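# Minimal sketch of the rank-0 guard used above, written against stock
# horovod.tensorflow (an assumption; the excerpt goes through a project wrapper).
# Horovod must be initialized once per process before size()/rank() are queried,
# and evaluation is restricted to rank 0 so only one worker reads the validation
# set and writes results.
import horovod.tensorflow as hvd

hvd.init()
if hvd.rank() == 0:
    print("Evaluation runs on rank 0 only; %d workers total." % hvd.size())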