def evaluation_data_fn():
    """Build the evaluation input function.

    Returns the real-data pipeline when not benchmarking or when a data
    directory is configured; otherwise falls back to synthetic data.
    Relies on `self`, `batch_size` and `is_benchmark` from the enclosing scope.
    """
    # Both branches previously repeated the exact same 10 keyword arguments;
    # build them once so the real and synthetic pipelines cannot silently drift.
    common_kwargs = dict(
        batch_size=batch_size,
        training=False,
        input_shape=list(self.run_hparams.input_shape) + [self.run_hparams.n_channels],
        mask_shape=list(self.run_hparams.mask_shape) + [self.run_hparams.n_channels],
        num_threads=64,
        use_gpu_prefetch=True,
        normalize_data_method="zero_centered",
        only_defective_images=False,
        augment_data=False,
        seed=self.run_hparams.seed,
    )

    if not is_benchmark or self.run_hparams.data_dir is not None:
        return self.dataset.dataset_fn(**common_kwargs)

    LOGGER.log("Using Synthetic Data ...")
    return self.dataset.synth_dataset_fn(**common_kwargs)
def evaluation_data_fn():
    """Return the eval input_fn: TFRecords when data is configured, synthetic otherwise."""
    # Guard clause: no data directory means synthetic input.
    if self.run_hparams.data_dir is None:
        LOGGER.log("Using Synthetic Data ...\n")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )

    return data_utils.get_tfrecords_input_fn(
        filenames=filenames,
        batch_size=batch_size,
        height=self.run_hparams.height,
        width=self.run_hparams.width,
        training=False,
        distort_color=self.run_hparams.distort_colors,
        num_threads=self.run_hparams.num_preprocessing_threads,
        # Equivalent to: False if seed is None else True
        deterministic=self.run_hparams.seed is not None,
    )
def validate(model, criterion, valate_dataset, iteration, collate_fn, distributed_run, args):
    """Handles all the validation scoring and printing.

    Runs one pass over the validation set, accumulating the (optionally
    all-reduced) per-batch loss, then logs the batch-averaged value.
    `iteration` is unused here but kept for caller compatibility.
    """
    with evaluating(model), torch.no_grad():
        val_loader = DataLoader(valate_dataset, num_workers=1, shuffle=False,
                                batch_size=args.batch_size // len(args.validation_anchor_dirs),
                                pin_memory=False,
                                collate_fn=collate_fn)

        val_loss = 0.0
        num_batches = 0
        for i, batch in enumerate(val_loader):
            x, y, num_frames = batch_to_gpu(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                # Average the loss across all workers before accumulating.
                reduced_val_loss = reduce_tensor(loss.data, args.world_size).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            num_batches = i + 1

        # BUG FIX: log the batch-averaged validation loss — the original logged
        # only the *last* batch's loss. Also guard against an empty loader,
        # where `i`/`reduced_val_loss` would be unbound (NameError).
        if num_batches > 0:
            val_loss /= num_batches
        LOGGER.log(key="val_iter_loss", value=val_loss)
def validate(model, criterion, valset, iteration, batch_size, world_size, collate_fn,
             distributed_run, rank, batch_to_gpu, fp16_run):
    """Handles all the validation scoring and printing.

    One pass over `valset` (distributed-sharded when `distributed_run`),
    optionally in fp16, accumulating the per-batch loss and logging the
    batch-averaged value. `iteration` and `rank` are unused here but kept
    for caller compatibility.
    """
    with evaluating(model), torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, num_workers=1, shuffle=False,
                                sampler=val_sampler, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        num_batches = 0
        for i, batch in enumerate(val_loader):
            x, y, len_x = batch_to_gpu(batch)
            if fp16_run:
                # fp16 path: inputs cast down, loss computed back in fp32.
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, world_size).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            num_batches = i + 1

        # BUG FIX: log the batch-averaged validation loss — the original logged
        # only the *last* batch's loss. Also guard against an empty loader,
        # where `i`/`reduced_val_loss` would be unbound (NameError).
        if num_batches > 0:
            val_loss /= num_batches
        LOGGER.log(key="val_iter_loss", value=val_loss)
def predict(self):
    """Perform prediction with the runner's classifier.

    Rank 0 only: runs inference, converts per-pixel logits into {0, 255}
    uint8 masks, writes them as a multipage TIFF plus a pickle of the raw
    logits under `<model_dir>/pred`.
    """
    if hvd.rank() == 0:
        LOGGER.log("Begin predict...")

        begin = time.time()
        pred = self._classifier.predict(input_fn=self._dataset.test_fn)
        predictions = [p['logits'] for p in pred]
        print('Inference took: {} sec'.format(time.time() - begin))

        # argmax over the class axis -> binary mask scaled to {0, 255}
        binary_masks = [np.argmax(p, axis=-1).astype(np.uint8) * 255 for p in predictions]
        multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                         for mask in binary_masks]

        output_dir = os.path.join(self._model_dir, 'pred')
        # exist_ok avoids the racy exists()-then-makedirs() of the original.
        os.makedirs(output_dir, exist_ok=True)

        multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                              compression="tiff_deflate",
                              save_all=True,
                              append_images=multipage_tif[1:])

        # BUG FIX: the original leaked the file handle via
        # pickle.dump(..., open(...)); close it deterministically.
        with open(os.path.join(output_dir, 'predictions.pkl'), 'wb') as f:
            pickle.dump(predictions, f)

        LOGGER.log("Predict finished")
def training_data_fn():
    """Return the training input_fn, preferring real TFRecords over synthetic data."""
    # Guard clause: nothing on disk means generated tensors.
    if self.run_hparams.data_dir is None:
        if hvd.rank() == 0:
            LOGGER.log("Using Synthetic Data ...")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )

    return data_utils.get_tfrecords_input_fn(
        data_dir=self.run_hparams.data_dir,
        num_epochs=num_iter,
        batch_size=batch_size,
        height=self.run_hparams.height,
        width=self.run_hparams.width,
        training=True,
        datasets_num_private_threads=None,
    )
def end(self, session):
    """Log a final throughput/time summary and persist it as JSON at session end."""
    try:
        # Last value of the 100-point moving average of per-step speeds.
        avg_processing_speed = float(
            ProfilerHook.moving_average(self._processing_speed_arr, n=100)[-1])
    # BUG FIX: bare `except:` also traps SystemExit/KeyboardInterrupt;
    # narrow to Exception so process-control signals still propagate.
    except Exception:
        # Fallback (e.g. too few samples for the moving average): plain mean.
        avg_processing_speed = float(np.mean(self._processing_speed_arr))

    total_processing_time = time.time() - self._start_training_time
    total_processing_hours, rem = divmod(total_processing_time, 3600)
    total_processing_minutes, total_processing_seconds = divmod(rem, 60)

    LOGGER.log("Final Summary:\n"
               "\t[*] Average Imgs/sec: %d\n"
               "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
               (avg_processing_speed, total_processing_hours,
                total_processing_minutes, total_processing_seconds))

    perf_dict = {
        'throughput': str(avg_processing_speed),
        'processing_time': str(total_processing_time)
    }
    perf_filename = "performances_%s.json" % ("train" if self._is_training else "eval")
    with open(os.path.join(self._sample_dir, "..", perf_filename), 'w') as f:
        json.dump(perf_dict, f)
def _get_session_config(mode, use_xla):
    """Build a tf.ConfigProto tuned for the given execution mode.

    Raises ValueError for any mode outside train/validation/benchmark.
    """
    if mode not in ("train", "validation", "benchmark"):
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark')" % mode)

    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.allow_growth = True
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    if hvd_utils.is_using_hvd():
        # Under Horovod, pin each process to its local-rank GPU.
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        LOGGER.log("XLA is activated - Experimental Feature")
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
        if hvd_utils.is_using_hvd():
            # Leave headroom: split CPUs across ranks, minus two, floor of two.
            config.inter_op_parallelism_threads = max(
                2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
def after_create_session(self, session, coord):
    """Log the trainable-parameter count once the session exists, then start the wall clock."""
    count_tensor = tf.get_default_graph().get_tensor_by_name(
        "trainable_parameters_count_ref:0")
    LOGGER.log("# Total Trainable Parameters:", int(session.run(count_tensor)))
    self._start_training_time = time.time()
def adjust_learning_rate(optimizer, epoch, args):
    """Apply the cosine-decay LR for this epoch to every param group, logging any change."""
    new_lr = cosine_decay(args.init_lr, args.final_lr, epoch, args.epochs)
    current_lr = optimizer.param_groups[0]['lr']

    if current_lr != new_lr:
        LOGGER.log_event("learning_rate changed",
                         value=str(current_lr) + " -> " + str(new_lr))

    # The new LR is written unconditionally, exactly as before.
    for group in optimizer.param_groups:
        group['lr'] = new_lr
def before_run(self, run_context):
    """Mark iteration start and request the global step plus loss/LR tensors."""
    LOGGER.iteration_start()

    fetches = [
        tf.train.get_global_step(),
        'cross_entropy_loss_ref:0',
        'l2_loss_ref:0',
        'total_loss_ref:0',
        'learning_rate_ref:0',
    ]
    run_args = tf.train.SessionRunArgs(fetches=fetches)

    # Stamp the start time last so it sits as close to the run as possible.
    self.t0 = time.time()
    return run_args
def before_run(self, run_context):
    """Mark iteration start and request the UNet loss tensors for this step."""
    LOGGER.iteration_start()

    fetches = [
        'UNet/cross_loss_ref:0',
        'UNet/dice_loss_ref:0',
        'UNet/total_loss_ref:0',
    ]
    run_args = tf.train.SessionRunArgs(fetches=fetches)

    # Stamp the start time last so it sits as close to the run as possible.
    self._t0 = time.time()
    return run_args
def _log_hparams(classname, layername, **kwargs):
    """Pretty-print a layer's hyperparameters (only rank 0 logs under Horovod)."""
    lines = ["%s: `%s`" % (classname, layername)]
    lines.extend("\t[*] {}: {}".format(key, value)
                 for key, value in sorted(kwargs.items()))
    message = "\n".join(lines) + "\n"

    if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
        LOGGER.log(message)
def train(self):
    """Perform training with the runner's classifier; Ctrl-C stops gracefully."""
    LOGGER.log("Begin training...")
    try:
        self._classifier.train(
            input_fn=self._dataset.train_fn,
            steps=self._max_steps,
            hooks=self._training_hooks,
        )
    except KeyboardInterrupt:
        print("Keyboard interrupt")
    LOGGER.log("Training finished")
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
    """Build a tf.ConfigProto for `mode`, optionally capping GPU memory for DALI.

    Raises ValueError for any mode outside train/validation/benchmark/inference.
    """
    if mode not in ("train", "validation", "benchmark", "inference"):
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')" % mode)

    if use_dali:
        # DALI keeps its own GPU pool, so TF's share must be capped up front.
        LOGGER.log(
            "DALI is activated, GPU memory fraction used for training is limited to",
            gpu_memory_fraction)
        config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction))
        config.gpu_options.allow_growth = False
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.visible_device_list = str(gpu_id)

    if hvd_utils.is_using_hvd():
        # Horovod overrides the visible device with the local rank's GPU.
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        LOGGER.log("XLA is activated - Experimental Feature")
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
        if hvd_utils.is_using_hvd():
            config.inter_op_parallelism_threads = max(
                2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
def main(_):
    """
    Starting point of the application
    """
    flags = PARSER.parse_args()
    params = _cmd_params(flags)
    tf.logging.set_verbosity(tf.logging.ERROR)

    # Optimization flags
    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    if params['use_amp']:
        # AMP rewrites the graph itself, so the model must be declared FP32.
        assert params['dtype'] == tf.float32, "TF-AMP requires FP32 precision"
        LOGGER.log("TF AMP is activated - Experimental Feature")
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

    runner = Runner(params)

    # NOTE(review): these are substring tests against params['exec_mode'], and
    # 'train_and predict' (with a space) appears twice. If the CLI declares the
    # mode as 'train_and_predict' (underscored), these branches can never match
    # on it — confirm against PARSER's exec_mode choices before changing.
    if 'train' in params['exec_mode'] \
            or 'train_and predict' in params['exec_mode']:
        runner.train()

    if 'train_and predict' in params['exec_mode'] \
            or 'predict' in params['exec_mode']:
        runner.predict()

    if 'benchmark' in params['exec_mode']:
        runner.benchmark()
def after_run(self, run_context, run_values):
    """Record the step's losses and throughput once past the warmup period."""
    cross_loss, dice_loss, total_loss = run_values.results

    elapsed = time.time() - self._t0
    # Per-worker throughput scaled by the Horovod world size.
    ips = self._global_batch_size / elapsed * hvd.size()

    if self._current_step >= self._warmup_steps:
        LOGGER.log("iteration", int(self._current_step))
        LOGGER.log("loss", float(cross_loss))
        LOGGER.log("dice_loss", float(dice_loss))
        LOGGER.log("total_loss", float(total_loss))
        self._perf.record(ips)

    # Paired with LOGGER.iteration_start() in before_run, every step.
    LOGGER.iteration_stop()
    self._current_step += 1
def end(self, session):
    """Log a final throughput/time summary when the session ends."""
    try:
        # Last value of the 100-point moving average of per-step speeds.
        avg_processing_speed = float(
            ProfilerHook.moving_average(self._processing_speed_arr, n=100)[-1])
    # BUG FIX: bare `except:` also traps SystemExit/KeyboardInterrupt;
    # narrow to Exception so process-control signals still propagate.
    except Exception:
        # Fallback (e.g. too few samples for the moving average): plain mean.
        avg_processing_speed = float(np.mean(self._processing_speed_arr))

    total_processing_time = time.time() - self._start_training_time
    total_processing_hours, rem = divmod(total_processing_time, 3600)
    total_processing_minutes, total_processing_seconds = divmod(rem, 60)

    LOGGER.log(
        "Final Summary:\n"
        "\t[*] Average Imgs/sec: %d\n"
        "\t[*] Total Processing Time: %dh %02dm %02ds\n" %
        (avg_processing_speed, total_processing_hours,
         total_processing_minutes, total_processing_seconds)
    )
def after_run(self, run_context, run_values):
    """Log step index, throughput and cross-entropy for the step just finished."""
    global_step, cross_entropy = run_values.results

    step_duration = time.time() - self.t0
    LOGGER.log("iteration", int(self.current_step))
    LOGGER.log("imgs_per_sec", float(self.global_batch_size / step_duration))
    LOGGER.log("cross_entropy", float(cross_entropy))
    LOGGER.iteration_stop()

    self.current_step += 1
def training_data_fn():
    """Return the training input_fn: DALI > TFRecords > synthetic, by availability."""
    # Equivalent to: False if seed is None else True (shared by both real pipelines).
    deterministic = self.run_hparams.seed is not None

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        if hvd.rank() == 0:
            LOGGER.log("Using DALI input... ")
        return data_utils.get_dali_input_fn(
            filenames=filenames,
            idx_filenames=idx_filenames,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=True,
            distort_color=self.run_hparams.distort_colors,
            num_threads=self.run_hparams.num_preprocessing_threads,
            deterministic=deterministic,
        )

    if self.run_hparams.data_dir is not None:
        return data_utils.get_tfrecords_input_fn(
            filenames=filenames,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=True,
            distort_color=self.run_hparams.distort_colors,
            num_threads=self.run_hparams.num_preprocessing_threads,
            deterministic=deterministic,
        )

    # Last resort: generated tensors.
    if hvd.rank() == 0:
        LOGGER.log("Using Synthetic Data ...")
    return data_utils.get_synth_input_fn(
        batch_size=batch_size,
        height=self.run_hparams.height,
        width=self.run_hparams.width,
        num_channels=self.run_hparams.n_channels,
        data_format=self.run_hparams.input_format,
        num_classes=self.run_hparams.n_classes,
        dtype=self.run_hparams.dtype,
    )
def __init__(self, out_dir, global_batch_size, log_every=10, warmup_steps=20):
    """Configure dllogger backends and per-metric averaging meters for UNet training."""
    LOGGER.set_model_name('UNet_TF')

    # NOTE: 'dlloger_out.json' (sic) is kept byte-for-byte — downstream
    # tooling may already depend on this filename.
    backends = [
        dllg.JsonBackend(log_file=os.path.join(out_dir, 'dlloger_out.json'),
                         logging_scope=dllg.Scope.TRAIN_ITER,
                         iteration_interval=1),
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.Scope.TRAIN_ITER,
                           iteration_interval=log_every),
    ]
    LOGGER.set_backends(backends)

    self._perf = dllg.AverageMeter()
    # Each loss component gets its own averaging meter.
    for metric_name in ('loss', 'dice_loss', 'total_loss'):
        LOGGER.register_metric(metric_name,
                               meter=dllg.AverageMeter(),
                               metric_scope=dllg.Scope.TRAIN_ITER)

    self._warmup_steps = warmup_steps
    self._global_batch_size = global_batch_size
    self._current_step = 0
def _build_hparams(*args):
    """Merge several HParams objects into one, warning instead of overwriting on duplicates."""
    merged = tf.contrib.training.HParams()

    for candidate in args:
        if not isinstance(candidate, tf.contrib.training.HParams):
            raise ValueError("Non valid HParams argument object detected:", candidate)

        for key, val in candidate.values().items():
            try:
                merged.add_hparam(name=key, value=val)
            except ValueError:
                # First writer wins; just report the collision.
                LOGGER.log(
                    "the parameter `{}` already exists - existing value: {} and duplicated value: {}"
                    .format(key, merged.get(key), val))

    return merged
def adjust_learning_rate(epoch, optimizer, learning_rate, anneal_steps, anneal_factor):
    """Step-anneal the learning rate: each passed milestone raises the decay power.

    For anneal_factor == 0.3 the decay alternates x0.1 / x0.3 per milestone
    pair; otherwise it is a plain anneal_factor ** power schedule.
    """
    # Count how many anneal milestones this epoch has already passed.
    power = 0
    if anneal_steps is not None:
        power = sum(1 for step in anneal_steps if epoch >= int(step))

    if anneal_factor == 0.3:
        lr = learning_rate * ((0.1 ** (power // 2)) * (1.0 if power % 2 == 0 else 0.3))
    else:
        lr = learning_rate * (anneal_factor ** power)

    if optimizer.param_groups[0]['lr'] != lr:
        LOGGER.log_event("learning_rate changed",
                         value=str(optimizer.param_groups[0]['lr']) + " -> " + str(lr))

    # The new LR is written unconditionally, exactly as before.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
def __init__(self, params):
    """Initialize Horovod, the tf.Estimator classifier, the dataset and training hooks."""
    hvd.init()  # must precede any hvd.rank()/hvd.size() queries below
    LOGGER.log(str(params))

    self._model_dir = params['model_dir']
    self._max_steps = params['max_steps']

    self._classifier = tf.estimator.Estimator(
        model_fn=_model_fn,
        model_dir=self._model_dir,
        params=params,
        config=tf.estimator.RunConfig(
            tf_random_seed=None,
            session_config=self._get_session_config(),
            # Only rank 0 writes checkpoints (once, at max_steps).
            save_checkpoints_steps=self._max_steps if hvd.rank() == 0 else None,
            keep_checkpoint_max=1))

    self._dataset = Dataset(data_dir=params['data_dir'],
                            batch_size=params['batch_size'],
                            augment=params['augment'],
                            gpu_id=hvd.rank(),
                            num_gpus=hvd.size(),
                            seed=params['seed'])

    self._training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    if params['benchmark'] and hvd.rank() == 0:
        self._training_hooks.append(
            ProfilerHook(self._model_dir,
                         params['batch_size'],
                         log_every=params['log_every'],
                         warmup_steps=params['warmup_steps']))
def __init__(self, global_batch_size, log_every=10):
    """Set up the stdout dllogger backend and the train-iteration metrics."""
    LOGGER.set_model_name('SuctionAffordancePredictor')
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=log_every)
    ])

    # Train-iteration scope metrics: raw step index plus averaged rates/losses.
    LOGGER.register_metric("iteration", metric_scope=dllg.TRAIN_ITER_SCOPE)
    for metric_name in ("imgs_per_sec", "cross_entropy"):
        LOGGER.register_metric(metric_name,
                               meter=dllg.AverageMeter(),
                               metric_scope=dllg.TRAIN_ITER_SCOPE)

    self.global_batch_size = global_batch_size
    self.current_step = 0
    self.current_epoch = 0
def learning_rate_scheduler(lr_init, lr_warmup_epochs, global_step, batch_size,
                            num_batches_per_epoch, num_decay_steps, num_gpus,
                            use_cosine_lr):
    """Build a warmup + (cosine | piecewise-step) learning-rate schedule tensor.

    In replicated mode gradients are summed rather than averaged, which with
    sgd/momentum inflates the effective LR by num_gpus; the reference LR is
    then scaled linearly against a 256-sample batch.
    """
    rescaled_lr = (lr_init * num_gpus) * (batch_size / 256.0)

    if use_cosine_lr:
        LOGGER.log("Using cosine learning rate schedule")
        lr = tf.train.cosine_decay(rescaled_lr, global_step, num_decay_steps)
    else:
        LOGGER.log("Using step learning rate schedule")
        # Decay x10 at epochs 30/60/80/90.
        boundaries = [int(num_batches_per_epoch * x) for x in [30, 60, 80, 90]]
        values = [rescaled_lr * v for v in [1e0, 1e-1, 1e-2, 1e-3, 1e-4]]
        lr = tf.train.piecewise_constant(global_step, boundaries, values)

    # Linear warmup from 0 to rescaled_lr over the first lr_warmup_epochs.
    warmup_steps = int(num_batches_per_epoch * lr_warmup_epochs)
    warmup_lr = (rescaled_lr * tf.cast(global_step, tf.float32) /
                 tf.cast(warmup_steps, tf.float32))

    return tf.cond(global_step < warmup_steps, lambda: warmup_lr, lambda: lr)
def predict(self, to_predict):
    """Run inference over the files named in `to_predict`, logging class + probability."""
    estimator_params = {}

    if to_predict is not None:
        filenames = runner_utils.parse_inference_input(to_predict)

    image_classifier = self._get_estimator(
        mode='inference',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction)

    inference_hooks = []

    def inference_data_fn():
        # Closure over `filenames` resolved above.
        return data_utils.get_inference_input_fn(
            filenames=filenames,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_threads=self.run_hparams.num_preprocessing_threads)

    try:
        inference_results = image_classifier.predict(
            input_fn=inference_data_fn,
            predict_keys=None,
            hooks=inference_hooks,
            yield_single_examples=True)

        for result in inference_results:
            LOGGER.log(result['classes'],
                       str(result['probabilities'][result['classes']]))
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    LOGGER.log('Ending Inference ...')
def evaluation_data_fn():
    """Build the input function used during evaluation.

    Uses TFRecords when a data directory is configured; otherwise falls
    back to synthetic data. Relies on `self`, `batch_size` and `num_iter`
    from the enclosing scope.
    """
    if self.run_hparams.data_dir is not None:
        # NOTE(review): `training=True` in an *evaluation* data fn looks
        # suspicious — the sibling evaluation_data_fn variants in this file
        # pass training=False. Confirm whether training-mode preprocessing
        # is intended here before changing.
        return data_utils.get_tfrecords_input_fn(
            data_dir=self.run_hparams.data_dir,
            num_epochs=num_iter,
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            training=True,
            datasets_num_private_threads=None)
    else:
        LOGGER.log("Using Synthetic Data ...\n")
        return data_utils.get_synth_input_fn(
            batch_size=batch_size,
            height=self.run_hparams.height,
            width=self.run_hparams.width,
            num_channels=self.run_hparams.n_channels,
            data_format=self.run_hparams.input_format,
            num_classes=self.run_hparams.n_classes,
            dtype=self.run_hparams.dtype,
        )
def before_run(self, run_context):
    """Mark iteration start and fetch the global step plus the cross-entropy loss."""
    LOGGER.iteration_start()

    # (A larger debug fetch list — logits/labels/inputs — used to live here
    # as a no-op string literal; removed as dead code.)
    run_args = tf.train.SessionRunArgs(fetches=[
        tf.train.get_global_step(),
        'cross_entropy_loss_ref:0',
    ])

    self.t0 = time.time()
    return run_args
def after_run(self, run_context, run_values):
    """Log the iteration index and total images/sec once past the warmup period."""
    step_duration = time.time() - self.t0
    ips = self.global_batch_size / step_duration

    if self.current_step >= self.warmup_steps:
        LOGGER.log("iteration", int(self.current_step))
        LOGGER.log("total_ips", float(ips))

    # Paired with LOGGER.iteration_start() in before_run, every step.
    LOGGER.iteration_stop()
    self.current_step += 1