def create_validation_estimator(infer_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    batch_size = params['batch_size']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    log_dir = params['log_dir']
    precision = params['precision']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    num_iter = params['num_iter']
    checkpoint_secs = params['checkpoint_secs']
    display_every = params['display_every']
    iter_unit = params['iter_unit']
    dali_cpu = params['dali_cpu']

    # Determinism is not fully supported by all TF ops.
    deterministic = False
    if deterministic:
        tf.set_random_seed(2 * (1 + hvd.rank()))
        random.seed(3 * (1 + hvd.rank()))
        np.random.seed(2)

    # Treat empty strings as "not provided".
    log_dir = None if log_dir == "" else log_dir
    data_dir = None if data_dir == "" else data_dir
    data_idx_dir = None if data_idx_dir == "" else data_idx_dir
    if data_dir is None:
        raise ValueError("data_dir must be specified")
    if log_dir is None:
        raise ValueError("log_dir must be specified")

    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_eval_samples = _get_num_records(eval_filenames)

    eval_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        eval_idx_filenames = sorted(
            tf.gfile.Glob(filename_pattern % 'validation'))
    else:
        raise ValueError("data_idx_dir must be specified")

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    # Clamp so a large hvd.size() cannot produce a non-positive thread count.
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    classifier_eval = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model': infer_func,
            'format': image_format,
            'dtype': tf.float16 if precision == 'fp16' else tf.float32,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'learning_rate_power': learning_rate_power,
            'decay_steps': None,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'larc_eta': larc_eta,
            'larc_mode': larc_mode,
            'deterministic': deterministic,
            'n_classes': 1000,
            'dali_cpu': dali_cpu,
        },
        config=tf.estimator.RunConfig(
            tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None,
            session_config=config,
            save_checkpoints_secs=None,
            save_checkpoints_steps=None,
            keep_checkpoint_every_n_hours=3))

    if not deterministic:
        num_preproc_threads = 4
    else:
        num_preproc_threads = 1

    input_fn = lambda: nvutils.image_set(
        eval_filenames, batch_size, image_height, image_width,
        training=False, distort_color=False, deterministic=deterministic,
        dali_cpu=dali_cpu, idx_filenames=eval_idx_filenames,
        num_threads=num_preproc_threads)

    return classifier_eval, input_fn, (num_eval_samples / batch_size)
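
# --- Illustrative usage (not part of the original script) ---
# A minimal sketch, assuming `params` is the same flag dictionary that train()
# and validate() below consume, of how the (estimator, input_fn, steps) triple
# returned above can drive a one-off evaluation. Note that the third element is
# a float (num_eval_samples / batch_size); it is cast to an int here before
# being handed to Estimator.evaluate().
def _example_evaluate_once(infer_func, params):
    classifier_eval, eval_input_fn, eval_steps = create_validation_estimator(
        infer_func, params)
    if hvd.rank() == 0:
        results = classifier_eval.evaluate(input_fn=eval_input_fn,
                                           steps=int(eval_steps))
        print('top1: {:.2f}%  top5: {:.2f}%'.format(
            results['top1_accuracy'] * 100, results['top5_accuracy'] * 100))
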
x = graph.get_tensor_by_name(x_tensor_name)
class_ids = graph.get_tensor_by_name(c_tensor_name)
probabilities = graph.get_tensor_by_name(p_tensor_name)

class_ids_, probs_ = None, None
total = 0
if data_dir is not None:
    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_preproc_threads = 10
    dataset = nvutils.image_set(
        eval_filenames, batch_size, 224, 224,
        training=False, distort_color=False, deterministic=False,
        num_threads=num_preproc_threads)
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()
    try:
        while True:
            value_, _ = sess.run(next_element)
            tclass_ids_, tprobs_ = sess.run([class_ids, probabilities],
                                            {x: value_})
            total += tclass_ids_.shape[0]
            if class_ids_ is None:
                class_ids_ = tclass_ids_
                probs_ = tprobs_
            else:
                # The original text is truncated at this branch; the lines below
                # are a reconstruction under the assumption that per-batch
                # outputs are accumulated across the validation set.
                class_ids_ = np.concatenate([class_ids_, tclass_ids_], axis=0)
                probs_ = np.concatenate([probs_, tprobs_], axis=0)
    except tf.errors.OutOfRangeError:
        # End of the one-shot validation dataset.
        pass
def train(infer_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    batch_size = params['batch_size']
    distort_color = params['distort_color']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    log_dir = params['log_dir']
    precision = params['precision']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    num_iter = params['num_iter']
    checkpoint_secs = params['checkpoint_secs']
    display_every = params['display_every']
    iter_unit = params['iter_unit']
    dali_cpu = params['dali_cpu']
    epoch_evaluation = params['epoch_evaluation']
    use_xla = params['use_xla']

    # Determinism is not fully supported by all TF ops.
    # Disabling until remaining wrinkles can be ironed out.
    deterministic = False
    if deterministic:
        tf.set_random_seed(2 * (1 + hvd.rank()))
        random.seed(3 * (1 + hvd.rank()))
        np.random.seed(2)

    # Treat empty strings as "not provided".
    log_dir = None if log_dir == "" else log_dir
    data_dir = None if data_dir == "" else data_dir
    data_idx_dir = None if data_idx_dir == "" else data_idx_dir

    global_batch_size = batch_size * hvd.size()
    if data_dir is not None:
        filename_pattern = os.path.join(data_dir, '%s-*')
        train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))
        num_training_samples = _get_num_records(train_filenames)
    else:
        num_training_samples = global_batch_size

    train_idx_filenames = None
    if data_idx_dir is not None:
        filename_pattern = os.path.join(data_idx_dir, '%s-*')
        train_idx_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))

    if iter_unit.lower() == 'epoch':
        # num_iter is a number of epochs; convert it to optimizer steps.
        nstep = num_training_samples * num_iter // global_batch_size
        num_epochs = num_iter
        decay_steps = nstep
    else:
        # num_iter is already a number of steps; derive the epoch count and
        # keep the learning-rate decay horizon at the usual 90-epoch schedule.
        nstep = num_iter
        num_epochs = max(nstep * global_batch_size // num_training_samples, 1)
        decay_steps = 90 * num_training_samples // global_batch_size
    nstep_per_epoch = num_training_samples // global_batch_size

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    config = tf.ConfigProto(gpu_options=gpu_options)
    if use_xla:
        config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    classifier = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model': infer_func,
            'format': image_format,
            'dtype': tf.float16 if precision == 'fp16' else tf.float32,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'learning_rate_power': learning_rate_power,
            'decay_steps': decay_steps,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'larc_eta': larc_eta,
            'larc_mode': larc_mode,
            'deterministic': deterministic,
            'n_classes': 1000,
            'dali_cpu': dali_cpu,
        },
        config=tf.estimator.RunConfig(
            tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None,
            session_config=config,
            save_checkpoints_secs=checkpoint_secs if hvd.rank() == 0 else None,
            save_checkpoints_steps=nstep if hvd.rank() == 0 else None,
            keep_checkpoint_every_n_hours=3))

    print("Training")

    if not deterministic:
        num_preproc_threads = 4
    else:
        num_preproc_threads = 1
    training_hooks = [hvd.BroadcastGlobalVariablesHook(0),
                      _PrefillStagingAreasHook()]
    if hvd.rank() == 0:
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, num_training_samples,
                               display_every))

    input_func = lambda: nvutils.image_set(
        train_filenames, batch_size, image_height, image_width,
        training=True, distort_color=distort_color,
        deterministic=deterministic, num_threads=num_preproc_threads,
        dali_cpu=dali_cpu, idx_filenames=train_idx_filenames)

    if epoch_evaluation:
        classifier_eval, eval_input_func, eval_steps = create_validation_estimator(
            infer_func, params)

    try:
        if epoch_evaluation:
            # Train one epoch at a time and validate after each epoch.
            for i in range(num_epochs):
                classifier.train(input_fn=input_func,
                                 steps=nstep // num_epochs,
                                 hooks=training_hooks)
                if hvd.rank() == 0:
                    eval_result = classifier_eval.evaluate(
                        input_fn=eval_input_func, steps=eval_steps)
                    print('epoch {} top1: {}%'.format(
                        i, eval_result['top1_accuracy'] * 100))
                    print('epoch {} top5: {}%'.format(
                        i, eval_result['top5_accuracy'] * 100))
        else:
            classifier.train(input_fn=input_func, max_steps=nstep,
                             hooks=training_hooks)
    except KeyboardInterrupt:
        print("Keyboard interrupt")
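
# --- Illustrative usage (not part of the original script) ---
# A minimal sketch of a driver that wires Horovod and a network-construction
# callable into train(). The name `my_resnet50` and every value in the dict
# below are hypothetical placeholders; in this repo they normally come from the
# command-line flags and the model definition supplied by the caller.
def _example_train_driver(my_resnet50):
    hvd.init()  # must run before train() calls hvd.rank()/hvd.size()
    example_params = {
        'image_width': 224, 'image_height': 224, 'image_format': 'channels_last',
        'batch_size': 256, 'distort_color': False,
        'data_dir': '/data/imagenet/tfrecord',      # hypothetical path
        'data_idx_dir': '/data/imagenet/dali_idx',  # hypothetical path
        'log_dir': '/results/checkpoints',          # hypothetical path
        'precision': 'fp16', 'momentum': 0.9,
        'learning_rate_init': 0.1, 'learning_rate_power': 2.0,
        'weight_decay': 1e-4, 'loss_scale': 128.0,
        'larc_eta': None, 'larc_mode': 'clip',
        'num_iter': 90, 'iter_unit': 'epoch',
        'checkpoint_secs': None, 'display_every': 100,
        'dali_cpu': False, 'epoch_evaluation': True, 'use_xla': False,
    }
    train(my_resnet50, example_params)
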
def validate(infer_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    batch_size = params['batch_size']
    data_dir = params['data_dir']
    log_dir = params['log_dir']
    precision = params['precision']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    num_iter = params['num_iter']
    checkpoint_secs = params['checkpoint_secs']
    display_every = params['display_every']
    iter_unit = params['iter_unit']
    use_dali = params['use_dali']

    # Determinism is not fully supported by all TF ops.
    # Disabling until remaining wrinkles can be ironed out.
    deterministic = False
    if deterministic:
        tf.set_random_seed(2 * (1 + hvd.rank()))
        random.seed(3 * (1 + hvd.rank()))
        np.random.seed(2)

    # Treat empty strings as "not provided".
    log_dir = None if log_dir == "" else log_dir
    data_dir = None if data_dir == "" else data_dir
    if data_dir is None:
        raise ValueError("data_dir must be specified")
    if log_dir is None:
        raise ValueError("log_dir must be specified")

    filename_pattern = os.path.join(data_dir, '%s-*')
    eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    # Clamp so a large hvd.size() cannot produce a non-positive thread count.
    config.inter_op_parallelism_threads = max(2, 40 // hvd.size() - 2)

    # HACK TESTING
    classifier = tf.estimator.Estimator(
        model_fn=_cnn_model_function,
        model_dir=log_dir,
        params={
            'model': infer_func,
            'format': image_format,
            'dtype': tf.float16 if precision == 'fp16' else tf.float32,
            'momentum': momentum,
            'learning_rate_init': learning_rate_init,
            'learning_rate_power': learning_rate_power,
            'decay_steps': None,
            'weight_decay': weight_decay,
            'loss_scale': loss_scale,
            'larc_eta': larc_eta,
            'larc_mode': larc_mode,
            'deterministic': deterministic,
            'n_classes': 1000,
            'use_dali': False,
        },
        config=tf.estimator.RunConfig(
            tf_random_seed=2 * (1 + hvd.rank()) if deterministic else None,
            session_config=config,
            save_checkpoints_secs=None,
            save_checkpoints_steps=None,
            keep_checkpoint_every_n_hours=3))

    if not deterministic and not use_dali:
        num_preproc_threads = 10
    elif not deterministic and use_dali:
        num_preproc_threads = 2
    elif deterministic:
        num_preproc_threads = 1

    if hvd.rank() == 0:
        print("Evaluating")
        try:
            eval_result = classifier.evaluate(
                input_fn=lambda: nvutils.image_set(
                    eval_filenames, batch_size, image_height, image_width,
                    training=False, distort_color=False,
                    deterministic=deterministic,
                    num_threads=num_preproc_threads))
            print('Top-1 accuracy:', eval_result['top1_accuracy'] * 100, '%')
            print('Top-5 accuracy:', eval_result['top5_accuracy'] * 100, '%')
        except KeyboardInterrupt:
            print("Keyboard interrupt")
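
# --- Illustrative sketch (not part of the original script) ---
# The 'top1_accuracy' / 'top5_accuracy' keys read above are produced by
# _cnn_model_function, which is defined elsewhere and not shown here. The
# helper below is only a guess at how eval metrics with those names are
# commonly built for a TF1 EstimatorSpec, using tf.nn.in_top_k on the logits;
# the real model function may compute them differently.
def _example_topk_eval_metrics(logits, labels):
    # Per-example hit indicators, streamed into means over the whole
    # validation set by tf.metrics.mean.
    top1 = tf.cast(tf.nn.in_top_k(logits, labels, k=1), tf.float32)
    top5 = tf.cast(tf.nn.in_top_k(logits, labels, k=5), tf.float32)
    return {
        'top1_accuracy': tf.metrics.mean(top1),
        'top5_accuracy': tf.metrics.mean(top5),
    }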