def imagenet_model_fn(features, labels, mode, params): """Our model_fn for ResNet to be used with our Estimator.""" # Warmup and higher lr may not be valid for fine tuning with small batches # and smaller numbers of training images. if params['fine_tune'] or ('disable_warmup' in params and params['disable_warmup']): warmup = False base_lr = .1 else: warmup = True base_lr = .128 # According to https://arxiv.org/abs/1706.02677 and our internal experiments, # the best accuracy results for more than 16 devices are achieved when base_lr == 0.1 if horovod_enabled() and hvd.size() > 16: base_lr = .1 # Used for ResNeXt101-32x4d if params['use_cosine_lr']: base_lr = .256 if horovod_enabled(): total_batch_size = params['batch_size'] * hvd.size() else: total_batch_size = params['batch_size'] * params.get('num_workers', 1) learning_rate_fn = resnet_run_loop.learning_rate_with_decay( batch_size=total_batch_size, batch_denom=256, num_images=NUM_IMAGES['train'], boundary_epochs=[30, 60, 80, 90], train_epochs=params['train_epochs'], decay_rates=[1, 0.1, 0.01, 0.001, 1e-4], warmup=warmup, warmup_epochs=params['warmup_epochs'], base_lr=base_lr, use_cosine_lr=params['use_cosine_lr']) return resnet_run_loop.resnet_model_fn( features=features, labels=labels, mode=mode, model_class=ImagenetModel, resnet_size=params['resnet_size'], weight_decay=flags.FLAGS.weight_decay, learning_rate_fn=learning_rate_fn, momentum=flags.FLAGS.momentum, data_format=params['data_format'], resnet_version=params['resnet_version'], loss_scale=params['loss_scale'], loss_filter_fn=None, model_type=params['model_type'], dtype=params['dtype'], fine_tune=params['fine_tune'], label_smoothing=flags.FLAGS.label_smoothing)
def prepare_model_dir(params):
  """Resolve, create, and optionally clean the checkpoint directory.

  Returns None (checkpointing disabled) in benchmark mode, or on non-zero
  workers when per-worker logging is off. Otherwise creates
  `<params.model_dir>/model_checkpoint` (with a per-worker subdirectory when
  `log_all_workers` is set under Horovod) and, for a fresh training run,
  empties it first.

  Args:
    params: Parsed-arguments namespace; reads `benchmark`, `log_all_workers`,
      `model_dir`, `exec_mode`, and `resume_training`.

  Returns:
    The model directory path as a string, or None when this worker should
    not checkpoint.
  """
  worker_id = hvd_rank() if horovod_enabled() else 0
  if params.benchmark or (not params.log_all_workers and worker_id != 0):
    return None

  model_dir = os.path.join(params.model_dir, "model_checkpoint")
  if params.log_all_workers and horovod_enabled():
    model_dir = os.path.join(model_dir, f'worker_{worker_id}')

  os.makedirs(model_dir, exist_ok=True)
  if ('train' in params.exec_mode) and (not params.resume_training):
    # Clear stale checkpoints in-process instead of shelling out to
    # `rm -rf {dir}/*`: no shell dependency, removal failures raise instead of
    # being silently ignored, and paths containing spaces or glob
    # metacharacters are handled correctly. Unlike the old glob, this also
    # removes dot-files.
    import shutil
    for entry in os.scandir(model_dir):
      if entry.is_dir(follow_symlinks=False):
        shutil.rmtree(entry.path)
      else:
        os.remove(entry.path)
  return model_dir
def __init__(self, dump_root, tensor_debug_mode, circular_buffer_size,
             op_regex, output_regex=None):
  """Set up a tfdbg DebugEventsWriter for selective tensor dumping.

  Args:
    dump_root: Base directory for debug-event files; under Horovod each rank
      writes to its own `rank_<n>` subdirectory to avoid clobbering.
    tensor_debug_mode: Name of a `debug_event_pb2.TensorDebugMode` enum value.
    circular_buffer_size: Size of the writer's circular buffer.
    op_regex: Pattern (str or compiled) selecting which ops to dump.
    output_regex: Optional pattern (str or compiled) filtering op outputs;
      left as-is (possibly None) when not a string.
  """
  self._dump_root = dump_root
  if horovod_enabled():
    self._dump_root = os.path.join(self._dump_root, f"rank_{hvd_rank()}")
  self._tensor_debug_mode = debug_event_pb2.TensorDebugMode.Value(
      tensor_debug_mode)
  self._circular_buffer_size = circular_buffer_size
  # Accept either pre-compiled patterns or plain strings.
  self._op_regex = re.compile(op_regex) if isinstance(op_regex, str) else op_regex
  self._output_regex = re.compile(output_regex) if isinstance(
      output_regex, str) else output_regex
  self._tfdbg_run_id = ''
  self._dump_op_counter = 0

  debug_writer_args = {
      "dump_root": self._dump_root,
      "circular_buffer_size": self._circular_buffer_size
  }
  # TF 2.2's DebugEventsWriter does not accept `tfdbg_run_id`; only pass it
  # on other versions.
  if not tf.__version__.startswith("2.2"):
    debug_writer_args["tfdbg_run_id"] = self._tfdbg_run_id

  self._writer = debug_events_writer.DebugEventsWriter(**debug_writer_args)
def input_fn(params):
  """The actual input function.

  Builds a TFRecord pipeline over `input_file` (from the enclosing scope).
  In deterministic mode, reads single-threaded with a fixed batch size and
  no shuffling so runs are reproducible. Otherwise, training shards across
  Horovod workers, repeats indefinitely, and shuffles; evaluation reads
  sequentially.
  """
  # On TPU the batch size is injected by the TPUEstimator via params;
  # otherwise use the closure-captured `bsz`.
  if use_tpu:
    batch_size = params["batch_size"]
  else:
    batch_size = bsz

  if FLAGS.deterministic_run:
    d = tf.data.TFRecordDataset(input_file)
    # num_parallel_calls=1 keeps record decoding order deterministic.
    d = d.apply(
        tf.data.experimental.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            num_parallel_calls=1,
            drop_remainder=True))
    return d

  # For training, we want a lot of parallel reading and shuffling.
  # For eval, we want no shuffling and parallel reading doesn't matter.
  d = tf.data.TFRecordDataset(input_file)
  if is_training:
    if horovod_enabled():
      # Shard before repeat/shuffle so each worker sees a distinct subset.
      d = d.shard(hvd.size(), hvd.rank())
    d = d.repeat()
    d = d.shuffle(buffer_size=100)

  d = d.apply(
      tf.data.experimental.map_and_batch(
          lambda record: _decode_record(record, name_to_features),
          batch_size=batch_size,
          drop_remainder=drop_remainder))
  return d
def before_run(self, run_context):
  """Request loss/LR tensors before each run call; no-op on non-zero ranks."""
  if horovod_enabled() and hvd_rank() != 0:
    return
  self.t0 = time.time()

  # Assemble the fetch list incrementally instead of enumerating all four
  # combinations of (accumulation, fp16/amp).
  fetches = ['step_update:0']
  if self.num_accumulation_steps > 1:
    fetches.append('update_step:0')
  fetches += ['total_loss:0', 'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0']
  if FLAGS.manual_fp16 or FLAGS.amp:
    fetches.append('loss_scale:0')
  return tf.estimator.SessionRunArgs(fetches=fetches)
def get_global_batch_size(batch_size):
  """Return the cached global batch size, computing it on the first call.

  The per-worker `batch_size` is scaled by the Horovod world size when
  Horovod is active; later calls return the cached value and ignore the
  argument.
  """
  global global_batch_size
  if global_batch_size is None:
    scale = hvd_size() if horovod_enabled() else 1
    global_batch_size = batch_size * scale
  return global_batch_size
def hvd_info(msg):
  """Log `msg` at INFO level, prefixed with this worker's rank and hostname."""
  hvd_try_init()
  hostname = socket.gethostname()
  if horovod_enabled():
    prefix = 'hvd rank{}/{} in {}'.format(hvd.rank(), hvd.size(), hostname)
  else:
    prefix = '{}'.format(hostname)
  tf.logging.info('{}: {}'.format(prefix, msg))
def input_fn(params):
  """The actual input function.

  Builds the BERT pretraining pipeline from `input_files` (enclosing scope):
  parallel, sharded, shuffled file reads for training; a single repeated
  pass for evaluation. Each record is decoded into the masked-LM /
  next-sentence feature dict below.
  """
  batch_size = params["batch_size"]

  # Fixed-length feature spec for one pretraining example.
  name_to_features = {
      "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "masked_lm_positions": tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
      "masked_lm_ids": tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
      "masked_lm_weights": tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
      "next_sentence_labels": tf.io.FixedLenFeature([1], tf.int64),
  }

  # For training, we want a lot of parallel reading and shuffling.
  # For eval, we want no shuffling and parallel reading doesn't matter.
  if is_training:
    d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
    if horovod_enabled():
      # Shard by file so each worker reads a disjoint subset.
      d = d.shard(hvd_size(), hvd_rank())
    d = d.repeat()
    d = d.shuffle(buffer_size=len(input_files))

    # `cycle_length` is the number of parallel files that get read.
    cycle_length = min(num_cpu_threads, len(input_files))

    # `sloppy` mode means that the interleaving is not exact. This adds
    # even more randomness to the training pipeline.
    d = d.apply(
        tf.data.experimental.parallel_interleave(
            tf.data.TFRecordDataset,
            sloppy=is_training,
            cycle_length=cycle_length))
    d = d.shuffle(buffer_size=100)
  else:
    d = tf.data.TFRecordDataset(input_files)
    # Since we evaluate for a fixed number of steps we don't want to encounter
    # out-of-range exceptions.
    d = d.repeat()

  # We must `drop_remainder` on training because the TPU requires fixed
  # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
  # and we *don't* want to drop the remainder, otherwise we wont cover
  # every sample.
  d = d.apply(
      tf.data.experimental.map_and_batch(
          lambda record: _decode_record(record, name_to_features),
          batch_size=batch_size,
          num_parallel_batches=num_cpu_threads,
          drop_remainder=True if is_training else False))
  return d
def _calculate_mean_and_var(self, x, axes, keep_dims):
  """Compute mean and variance of `x` over `axes`, synchronized across workers.

  When more than one Horovod shard is active, local sums and squared sums are
  all-reduced so every worker gets the *global* batch statistics
  (cross-replica batch norm); otherwise falls back to plain local moments.

  Args:
    x: Input tensor; fp16 inputs are upcast to fp32 for the reduction.
    axes: Reduction axes. NOTE(review): the multi-shard branch assumes axis 0
      is the batch axis and that `axes` covers the remaining reduced dims —
      confirm against callers.
    keep_dims: Whether to retain reduced dimensions in the outputs.

  Returns:
    A `(mean, variance)` pair, cast back to fp16 if `x` was fp16.
  """
  with ops.name_scope('moments', values=[x, axes]):
    # The dynamic range of fp16 is too limited to support the collection of
    # sufficient statistics. As a workaround we simply perform the operations
    # on 32-bit floats before converting the mean and variance back to fp16
    y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
    if horovod_enabled():
      num_shards = hvd.size()
    else:
      num_shards = 1
    if num_shards > 1:
      # Accumulate per-worker sufficient statistics, then all-reduce the raw
      # sums (average=False) so the global mean/variance can be formed from
      # the true global totals.
      local_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
      local_squared_sum = math_ops.reduce_sum(math_ops.square(y),
                                              axis=axes,
                                              keepdims=True)
      batch_size = math_ops.cast(array_ops.shape_v2(y)[0], dtypes.float32)
      y_sum = hvd.allreduce(local_sum, average=False)
      y_squared_sum = hvd.allreduce(local_squared_sum, average=False)
      global_batch_size = batch_size * num_shards

      # Total element count per reduced position: product of the reduced
      # non-batch dims times the global batch size. NOTE(review): indexes
      # shape positions 1..len(axes)-1, which presumes `axes` are the leading
      # dims after batch — verify for data formats where that doesn't hold.
      axes_vals = [(array_ops.shape_v2(y))[i] for i in range(1, len(axes))]
      multiplier = math_ops.cast(math_ops.reduce_prod(axes_vals),
                                 dtypes.float32)
      multiplier = multiplier * global_batch_size

      mean = y_sum / multiplier
      y_squared_mean = y_squared_sum / multiplier
      # var = E(x^2) - E(x)^2
      variance = y_squared_mean - math_ops.square(mean)
    else:
      # Compute true mean while keeping the dims for proper broadcasting.
      mean = math_ops.reduce_mean(y, axes, keepdims=True, name='mean')
      # sample variance, not unbiased variance
      # Note: stop_gradient does not change the gradient that gets
      # backpropagated to the mean from the variance calculation,
      # because that gradient is zero
      variance = math_ops.reduce_mean(
          math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
          axes,
          keepdims=True,
          name='variance')
    if not keep_dims:
      mean = array_ops.squeeze(mean, axes)
      variance = array_ops.squeeze(variance, axes)
    if x.dtype == dtypes.float16:
      return (math_ops.cast(mean, dtypes.float16),
              math_ops.cast(variance, dtypes.float16))
    else:
      return (mean, variance)
def hvd_try_init():
  """Initialize Horovod at most once; quieten TF logging on non-zero ranks."""
  global IS_HVD_INIT
  if IS_HVD_INIT or not horovod_enabled():
    return
  hvd_init()
  IS_HVD_INIT = True

  tf.get_logger().propagate = False
  # Only rank 0 logs at INFO; other ranks are restricted to WARN to avoid
  # duplicated output.
  level = 'INFO' if hvd.rank() == 0 else 'WARN'
  tf.logging.set_verbosity(level)
def get_logger(params):
  """Initialize dllogger: worker 0 logs to stdout plus an optional JSON file."""
  worker_id = hvd_rank() if horovod_enabled() else 0
  backends = []
  if worker_id == 0:
    backends.append(StdOutBackend(Verbosity.VERBOSE))
    if params.log_dir:
      os.makedirs(params.log_dir, exist_ok=True)
      backends.append(
          JSONStreamBackend(Verbosity.VERBOSE, f"{params.log_dir}/log.json"))
  logger.init(backends=backends)
  return logger
def main():
  """ Starting point of the application.

  Parses arguments, optionally initializes Horovod, configures framework
  flags, sets up logging (console/JSON plus optional TensorBoard writers),
  builds the UNet model and dataset, and dispatches to train / evaluate /
  predict according to `params.exec_mode`.
  """
  params = parse_args(description="UNet-medical")
  if params.use_horovod:
    hvd_init()
  set_flags(params)

  model_dir = prepare_model_dir(params)
  params.model_dir = model_dir
  logger = get_logger(params)

  tb_logger = None
  if params.tensorboard_logging:
    log_dir = params.log_dir
    # With per-worker logging, give each worker its own TB directory.
    if horovod_enabled() and params.log_all_workers:
      log_dir = os.path.join(log_dir, f'worker_{hvd_rank()}')
    tb_logger = namedtuple('TBSummaryWriters', 'train_writer eval_writer')(
        tf.summary.create_file_writer(log_dir),
        tf.summary.create_file_writer(os.path.join(log_dir, 'eval')))

  model = Unet()

  # Each worker reads its own shard (hpu_id / num_hpus drive the sharding).
  dataset = Dataset(data_dir=params.data_dir,
                    batch_size=params.batch_size,
                    fold=params.fold,
                    augment=params.augment,
                    hpu_id=hvd_rank() if horovod_enabled() else 0,
                    num_hpus=hvd_size() if horovod_enabled() else 1,
                    seed=params.seed)

  # exec_mode may request several phases (e.g. 'train_and_evaluate').
  if 'train' in params.exec_mode:
    with dump_callback(params.dump_config):
      train(params, model, dataset, logger, tb_logger)

  if 'evaluate' in params.exec_mode:
    evaluate(params, model, dataset, logger, tb_logger)

  if 'predict' in params.exec_mode:
    predict(params, model, dataset, logger)
def _configure_learning_rate(num_samples_per_epoch, global_step): """Configures the learning rate. Args: num_samples_per_epoch: The number of samples in each epoch of training. global_step: The global_step tensor. Returns: A `Tensor` representing the learning rate. Raises: ValueError: if """ # Note: when num_clones is > 1, this will actually have each clone to go # over each epoch FLAGS.num_epochs_per_decay times. This is different # behavior from sync replicas and is expected to produce different results. steps_per_epoch = num_samples_per_epoch / FLAGS.batch_size / FLAGS.num_workers if FLAGS.sync_replicas: steps_per_epoch /= FLAGS.replicas_to_aggregate decay_steps = int(steps_per_epoch * FLAGS.num_epochs_per_decay) if FLAGS.learning_rate_decay_type == 'exponential': learning_rate = tf.train.exponential_decay( FLAGS.learning_rate, global_step, decay_steps, FLAGS.learning_rate_decay_factor, staircase=True, name='exponential_decay_learning_rate') elif FLAGS.learning_rate_decay_type == 'fixed': learning_rate = tf.constant(FLAGS.learning_rate, name='fixed_learning_rate') elif FLAGS.learning_rate_decay_type == 'polynomial': learning_rate = tf.train.polynomial_decay( FLAGS.learning_rate, global_step, decay_steps, FLAGS.end_learning_rate, power=1.0, cycle=False, name='polynomial_decay_learning_rate') else: raise ValueError('learning_rate_decay_type [%s] was not recognized' % FLAGS.learning_rate_decay_type) if FLAGS.warmup_epochs: warmup_lr = (FLAGS.learning_rate * tf.cast(global_step, tf.float32) / (steps_per_epoch * FLAGS.warmup_epochs)) learning_rate = tf.minimum(warmup_lr, learning_rate) if horovod_enabled(): learning_rate = learning_rate * hvd.size() return learning_rate
def hvd_info_rank0(msg, with_head=True):
  """Log `msg` from rank 0 only, optionally prefixed with rank/host info."""
  hvd_try_init()
  if not is_rank0():
    return
  if not with_head:
    tf.logging.info(msg)
    return
  if horovod_enabled():
    head = 'hvd only rank{}/{} in {}'.format(
        hvd.rank(), hvd.size(), socket.gethostname())
  else:
    head = '{}'.format(socket.gethostname())
  tf.logging.info('{}: {}'.format(head, msg))
def update(accum_vars):
  """Apply accumulated gradients to `tvars`, after bumping the global step.

  When post-accumulation allreduce is enabled under Horovod, each accumulated
  gradient is all-reduced first; IndexedSlices are densified before the
  reduction.
  """
  with tf.control_dependencies([global_step.assign(new_global_step)]):
    if allreduce_post_accumulation and horovod_enabled():
      reduced = []
      for grad in accum_vars:
        if isinstance(grad, tf.IndexedSlices):
          reduced.append(hvd.allreduce(tf.convert_to_tensor(value=grad)))
        else:
          reduced.append(hvd.allreduce(grad))
      accum_vars = reduced
    return optimizer.apply_gradients(list(zip(accum_vars, tvars)),
                                     global_step=global_step)
def eval_end(self):
  """See base class."""
  # Average accuracy across workers when distributed eval is on; loss is
  # reported per-worker either way.
  accuracy = self.test_accuracy.result()
  if self.flags_obj.use_distributed_eval and horovod_enabled():
    accuracy = hvd.allreduce(accuracy)
  return {
      'test_loss': self.test_loss.result(),
      'test_accuracy': accuracy
  }
def train_step(features, labels, warmup_batch=False):
  """Run one UNet training step: forward, loss, backward, optimizer update.

  Loss is cross-entropy + dice + L2 weight decay (batch-norm variables are
  excluded from the decay term). Under Horovod the tape is wrapped for
  gradient averaging, and on the warmup batch variables are broadcast from
  rank 0 so all workers start in sync.
  """
  with tf.GradientTape() as tape:
    output_map = model(features)
    crossentropy_loss, dice_loss = partial_losses(output_map, labels)
    added_losses = tf.add(crossentropy_loss, dice_loss, name="total_loss_ref")
    loss = added_losses + params.weight_decay * tf.add_n(
        [tf.nn.l2_loss(v) for v in model.trainable_variables
         if 'batch_normalization' not in v.name])

  if horovod_enabled():
    tape = hvd.DistributedGradientTape(tape)
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  # Note: broadcast should be done after the first gradient step to ensure
  # optimizer initialization.
  if horovod_enabled() and warmup_batch:
    hvd.broadcast_variables(model.variables, root_rank=0)
    hvd.broadcast_variables(optimizer.variables(), root_rank=0)

  # Track the two loss components in their metric accumulators.
  ce_loss(crossentropy_loss)
  f1_loss(dice_loss)
  return loss
def step_fn(inputs):
  """Function to run on the device.

  One ResNet training step: forward pass, loss (prediction + optional L2),
  backward pass, and optimizer update — via Horovod's DistributedGradientTape
  when enabled, otherwise via explicit-allreduce minimization.
  """
  images, labels = inputs
  if self.one_hot:
    labels = tf.cast(labels, tf.int32)
    labels = tf.one_hot(labels, 1001)
    labels = tf.squeeze(labels)
  with tf.GradientTape() as tape:
    logits = self.model(images, training=True)
    prediction_loss = self.get_prediction_loss(labels, logits)
    # Normalize by the global batch size so the summed per-example losses
    # average correctly across replicas.
    loss = tf.reduce_sum(prediction_loss) * (1.0 / self.flags_obj.batch_size)
    # LARS applies weight decay inside the optimizer, so only add the L2 term
    # here for non-LARS runs.
    if not self.use_lars_optimizer:
      num_replicas = self.strategy.num_replicas_in_sync
      if self.flags_obj.single_l2_loss_op:
        # Single fused L2 op over all non-BN trainables.
        l2_loss = self.flags_obj.weight_decay * tf.add_n([
            tf.nn.l2_loss(v)
            for v in self.model.trainable_variables
            if ('bn' not in v.name)
        ])
        loss += (l2_loss / num_replicas)
      else:
        loss += (tf.reduce_sum(self.model.losses) / num_replicas)

  if horovod_enabled():
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, self.model.trainable_variables)
    grads_and_vars = zip(grads, self.model.trainable_variables)
    # Gradients are already averaged by the distributed tape, so disable
    # Keras's own aggregation.
    self.optimizer.apply_gradients(
        grads_and_vars, experimental_aggregate_gradients=False)
    # Broadcast from rank 0 exactly once, after the first step has created
    # the optimizer slots.
    tf.cond(self.global_step == 1,
            lambda: hvd.broadcast_variables(
                self.model.variables + self.optimizer.variables(),
                root_rank=0),
            lambda: tf.constant(True))
  else:
    grad_utils.minimize_using_explicit_allreduce(
        tape, self.optimizer, loss, self.model.trainable_variables)

  self.train_loss.update_state(loss)
  self.train_accuracy.update_state(labels, logits)
def _configure_optimizer(learning_rate):
  """Configures the optimizer used for training.

  Args:
    learning_rate: A scalar or `Tensor` learning rate.

  Returns:
    An instance of an optimizer, wrapped in Horovod's DistributedOptimizer
    when Horovod is enabled.

  Raises:
    ValueError: if FLAGS.optimizer is not recognized.
  """
  # Dispatch table of zero-arg factories; only the selected optimizer is
  # actually constructed.
  factories = {
      'adadelta': lambda: tf.train.AdadeltaOptimizer(
          learning_rate, rho=FLAGS.adadelta_rho, epsilon=FLAGS.opt_epsilon),
      'adagrad': lambda: tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value),
      'adam': lambda: tf.train.AdamOptimizer(
          learning_rate, beta1=FLAGS.adam_beta1, beta2=FLAGS.adam_beta2,
          epsilon=FLAGS.opt_epsilon),
      'ftrl': lambda: tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2),
      'momentum': lambda: tf.train.MomentumOptimizer(
          learning_rate, momentum=FLAGS.momentum, name='Momentum'),
      'rmsprop': lambda: tf.train.RMSPropOptimizer(
          learning_rate, decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.rmsprop_momentum, epsilon=FLAGS.opt_epsilon),
      'sgd': lambda: tf.train.GradientDescentOptimizer(learning_rate),
  }
  if FLAGS.optimizer not in factories:
    raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
  optimizer = factories[FLAGS.optimizer]()

  if horovod_enabled():
    optimizer = hvd.DistributedOptimizer(optimizer)
  return optimizer
def _moments(self, inputs, reduction_axes, keep_dims):
  """Compute mean/variance, synchronized across Horovod workers.

  Overrides the base `_moments`: local shard statistics are combined into
  group statistics via allreduce using Var[X] = E[X^2] - E[X]^2.
  """
  shard_mean, shard_variance = super(SyncBatchNormalization, self)._moments(
      inputs, reduction_axes, keep_dims=keep_dims)

  replica_count = hvd.size() if horovod_enabled() else 1
  if replica_count <= 1:
    return (shard_mean, shard_variance)

  # Recover the local E[X^2], average both moments across the group, then
  # re-derive the group variance.
  local_mean_sq = tf.math.square(shard_mean)
  local_second_moment = shard_variance + local_mean_sq
  group_mean = hvd.allreduce(shard_mean)
  group_second_moment = hvd.allreduce(local_second_moment)
  group_variance = group_second_moment - tf.math.square(group_mean)
  return (group_mean, group_variance)
def get_mllog_mlloger():
  """Build the MLPerf logger for ResNet, with per-rank file and namespace."""
  from mlperf_logging import mllog
  from mlperf_compliance import tf_mlperf_log

  rank_str = str(hvd.rank()) if horovod_enabled() else "0"
  mllogger = mllog.get_mllogger()
  # One log file and namespace per worker to keep rank output separated.
  mllog.config(filename="resnet50v1.5.log-" + rank_str)
  mllog.config(
      default_namespace="worker" + rank_str,
      default_stack_offset=1,
      default_clear_line=False,
      root_dir=os.path.normpath(
          os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))
  return mllogger, mllog, tf_mlperf_log
def get_mllog_mlloger(output_dir=None):
  """Build the MLPerf logger, writing per-rank result files under `output_dir`.

  Args:
    output_dir: Directory for `result_rank_<n>.txt` files; defaults to
      './log'. Created if it does not already exist.

  Returns:
    A `(mllogger, mllog)` pair.
  """
  from mlperf_logging import mllog

  str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
  mllogger = mllog.get_mllogger()
  mllogger.propagate = False
  mllog.propagate = False
  if output_dir is None:
    output_dir = './log'
  output_dir = os.path.normpath(output_dir)
  # Ensure the destination exists — mllog.config presumably opens the log
  # file eagerly and would fail on a missing directory (TODO confirm).
  os.makedirs(output_dir, exist_ok=True)
  # os.path.join instead of string concatenation: portable separators and no
  # double-slash artifacts.
  filenames = os.path.join(output_dir, "result_rank_" + str_hvd_rank + ".txt")
  mllog.config(filename=filenames)
  workername = "worker" + str_hvd_rank
  mllog.config(
      default_namespace=workername,
      default_stack_offset=1,
      default_clear_line=False,
      root_dir=os.path.normpath(
          os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))
  return mllogger, mllog
def train_step(images, labels, step):
  """Run one training step and update the train loss/accuracy metrics.

  Under Horovod the gradient tape is wrapped for averaging and, on the very
  first step, variables are broadcast from rank 0 (after the optimizer slots
  exist); otherwise explicit-allreduce minimization is used.
  """
  with tf.GradientTape() as tape:
    predictions = model(images, training=True)
    loss = loss_object(labels, predictions)

  if horovod_enabled():
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables),
                              experimental_aggregate_gradients=True)
    # Broadcast once, only on step 0, so all workers start from rank 0's
    # weights and optimizer state.
    tf.cond(
        step == 0,
        lambda: hvd.broadcast_variables(
            model.variables + optimizer.variables(), root_rank=0),
        lambda: tf.constant(True))
  else:
    grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                 model.trainable_variables)

  train_loss(loss)
  train_accuracy(labels, predictions)
def set_flags(params):
  """Configure framework-level settings from parsed arguments.

  Side effects: sets TF logging/bf16 environment variables, loads the Habana
  module (unless disabled), seeds numpy and TF RNGs, optionally enables XLA
  JIT, and sizes the TF threading pools around the HPU worker count.
  """
  if params.tf_verbosity:
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(params.tf_verbosity)

  if not params.no_hpu:
    # Import deferred so CPU-only runs don't require the Habana stack.
    from habana_frameworks.tensorflow import load_habana_module
    load_habana_module()
    # NOTE(review): bf16 conversion nested under the HPU branch — presumably
    # only meaningful on HPU; confirm intended scope.
    if params.dtype == 'bf16':
      os.environ['TF_BF16_CONVERSION'] = params.bf16_config_path

  np.random.seed(params.seed)
  tf.random.set_seed(params.seed)

  if params.use_xla:
    tf.config.optimizer.set_jit(True)

  # Reserve one CPU thread per local HPU; hand the rest to inter-op work.
  per_hpu_thread_count = 1
  num_hpus = hvd_size() if horovod_enabled() else 1
  cpu_count = multiprocessing.cpu_count()
  total_hpu_thread_count = per_hpu_thread_count * num_hpus

  # 0 lets TF pick the intra-op pool size.
  tf.config.threading.set_intra_op_parallelism_threads(0)
  tf.config.threading.set_inter_op_parallelism_threads(
      cpu_count - total_hpu_thread_count)
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             dtype=tf.float32,
             datasets_num_private_threads=None,
             parse_record_fn=parse_record,
             input_context=None,
             drop_remainder=False,
             tf_data_experimental_slack=False,
             experimental_preloading=False,
             dataset_fn=None):
  """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    dtype: Data type to use for images/features
    datasets_num_private_threads: Number of private threads for tf.data.
    parse_record_fn: Function to use for parsing the records.
    input_context: A `tf.distribute.InputContext` object passed in by
      `tf.distribute.Strategy`.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.
    experimental_preloading: Whether to prefetch batches onto the HPU device.
    dataset_fn: Optional factory producing the file dataset; overrides the
      default filename listing from `data_dir`.

  Returns:
    A dataset that can be used for iteration.
  """
  if dataset_fn is None:
    filenames = get_filenames(is_training, data_dir)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
  else:
    dataset = dataset_fn()

  # Shard by file per Horovod worker before any shuffling.
  if is_training and horovod_enabled():
    dataset = dataset.shard(hvd.size(), hvd.rank())

  if input_context:
    tf.compat.v1.logging.info(
        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d' %
        (input_context.input_pipeline_id, input_context.num_input_pipelines))
    dataset = dataset.shard(input_context.num_input_pipelines,
                            input_context.input_pipeline_id)

  if is_training:
    # Shuffle the input files
    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

  # Convert to individual records.
  # cycle_length = 10 means that up to 10 files will be read and deserialized in
  # parallel. You may want to increase this number if you have a large number of
  # CPU cores.
  dataset = dataset.interleave(
      tf.data.TFRecordDataset,
      cycle_length=10,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  return resnet_run_loop.process_record_dataset(
      dataset=dataset,
      is_training=is_training,
      batch_size=batch_size,
      shuffle_buffer=_SHUFFLE_BUFFER,
      parse_record_fn=parse_record_fn,
      num_epochs=num_epochs,
      dtype=dtype,
      datasets_num_private_threads=datasets_num_private_threads,
      drop_remainder=drop_remainder,
      tf_data_experimental_slack=tf_data_experimental_slack,
      experimental_preloading=experimental_preloading)
def adjust_batch_size(batch_size):
  """Scale the per-worker batch size to the global size under Horovod."""
  if not horovod_enabled():
    return batch_size
  return batch_size * comm_size()
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           num_epochs=1,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           drop_remainder=False,
                           tf_data_experimental_slack=False,
                           experimental_preloading=False):
  """Given a Dataset with raw records, return an iterator over the records.

  Args:
    dataset: A Dataset representing raw records
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: The number of epochs to repeat the dataset.
    dtype: Data type to use for images/features.
    datasets_num_private_threads: Number of threads for a private
      threadpool created for all datasets computation.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.
    experimental_preloading: Whether to prefetch batches onto the HPU device
      instead of using host-side prefetching.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
  # Defines a specific size thread pool for tf.data operations.
  if datasets_num_private_threads:
    options = tf.data.Options()
    options.experimental_threading.private_threadpool_size = (
        datasets_num_private_threads)
    dataset = dataset.with_options(options)
    tf.compat.v1.logging.info('datasets_num_private_threads: %s',
                              datasets_num_private_threads)

  if not experimental_preloading:
    # Disable intra-op parallelism to optimize for throughput instead of latency.
    options = tf.data.Options()
    options.experimental_threading.max_intra_op_parallelism = 1
    dataset = dataset.with_options(options)

    # Prefetches a batch at a time to smooth out the time taken to load input
    # files for shuffling and processing.
    dataset = dataset.prefetch(buffer_size=batch_size)

  if is_training:
    # Shuffles records before repeating to respect epoch boundaries.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
  else:
    # Cap evaluation at the validation-set size so repeat() below cannot
    # inflate the eval pass.
    dataset = dataset.take(imagenet_main.NUM_IMAGES['validation'])

  if horovod_enabled():
    # Repeats the dataset. Due to sharding in multinode, training is related
    # directly to the number of max iterations not to number of epochs.
    dataset = dataset.repeat()
  else:
    # Repeats the dataset for the number of epochs to train.
    dataset = dataset.repeat(num_epochs)

  # Fixed parallelism under Horovod keeps per-worker CPU use predictable;
  # otherwise let tf.data autotune.
  num_parallel_calls = 16 if horovod_enabled() else tf.data.experimental.AUTOTUNE

  # Parses the raw records into images and labels.
  dataset = dataset.map(
      lambda value: parse_record_fn(value, is_training, dtype),
      num_parallel_calls=num_parallel_calls,
      deterministic=False)

  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

  # Operations between the final prefetch and the get_next call to the iterator
  # will happen synchronously during run time. We prefetch here again to
  # background all of the above processing work and keep it out of the
  # critical training path. Setting buffer_size to tf.contrib.data.AUTOTUNE
  # allows DistributionStrategies to adjust how many batches to fetch based
  # on how many devices are present.
  if experimental_preloading:
    device = "/device:HPU:0"
    dataset = dataset.apply(tf.data.experimental.prefetch_to_device(device))
  else:
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

  if tf_data_experimental_slack:
    options = tf.data.Options()
    options.experimental_slack = True
    dataset = dataset.with_options(options)

  return dataset
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This
      is used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and
    accuracy_top_5. `train_hooks` is a list the instances of hooks used during
    training.
  """
  experimental_preloading = flags_obj.experimental_preloading

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=not experimental_preloading)

  if horovod_enabled():
    # The Scoped Allocator Optimization is enabled by default unless disabled
    # by a flag. It coalesces the HorovodAllreduce ops' buffers.
    if not condition_env_var('TF_DISABLE_SCOPED_ALLOCATOR', default=False):
      from tensorflow.core.protobuf import rewriter_config_pb2  # pylint: disable=import-error
      session_config.graph_options.rewrite_options.scoped_allocator_optimization = rewriter_config_pb2.RewriterConfig.ON
      enable_op = session_config.graph_options.rewrite_options.scoped_allocator_opts.enable_op
      # Replace whatever default op list is present with HorovodAllreduce only.
      del enable_op[:]
      enable_op.append("HorovodAllreduce")

  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      num_workers=num_workers,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      log_step_count_steps=flags_obj.display_steps,
      save_checkpoints_secs=None,
      save_checkpoints_steps=flags_obj.save_checkpoint_steps)

  # Initializes model with all but the dense layer from pretrained ResNet.
  # NOTE(review): warm-start from a pretrained checkpoint is disabled here;
  # the commented-out code below shows the intended behavior.
  # if flags_obj.pretrained_model_checkpoint_path is not None:
  #   warm_start_settings = tf.estimator.WarmStartSettings(
  #       flags_obj.pretrained_model_checkpoint_path,
  #       vars_to_warm_start='^(?!.*dense)')
  # else:
  #   warm_start_settings = None
  warm_start_settings = None

  model_dir=flags_obj.model_dir

  # Each Horovod rank writes checkpoints/summaries to its own subdirectory.
  if horovod_enabled():
    model_dir="{}/rank_{}".format(flags_obj.model_dir, hvd.rank())

  if experimental_preloading:
    SelectedEstimator = HabanaEstimator
  else:
    SelectedEstimator = tf.estimator.Estimator

  if flags.FLAGS.is_mlperf_enabled:
    # Pick the largest eval batch size <= train batch size that divides the
    # validation set evenly, so no eval examples are dropped.
    for eval_batch_size in range(flags_obj.batch_size, 1, -1):
      if imagenet_main.NUM_IMAGES['validation'] % eval_batch_size == 0:
        break
  else:
    eval_batch_size = flags_obj.batch_size

  classifier = SelectedEstimator(
      model_fn=model_function, model_dir=model_dir, config=run_config,
      warm_start_from=warm_start_settings, params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'model_type': flags_obj.model_type,
          'loss_scale': flags_core.get_loss_scale(flags_obj,
                                                  default_for_fp16=128),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'num_workers': num_workers,
          'train_epochs': flags_obj.train_epochs,
          'warmup_epochs': flags_obj.warmup_epochs,
          'use_cosine_lr': flags_obj.use_cosine_lr,
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'model_type': flags_obj.model_type,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
      'num_workers': num_workers,
  }
  if flags.FLAGS.is_mlperf_enabled:
    run_params['eval_batch_size'] = eval_batch_size

  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=model_dir,
      batch_size=flags_obj.batch_size)

  if flags.FLAGS.is_mlperf_enabled:
    _log_cache = []

    def formatter(x):
      """Abuse side effects to get tensors out of the model_fn."""
      # Keep only the most recent logged tensor dict in _log_cache.
      if _log_cache:
        _log_cache.pop()
      _log_cache.append(x.copy())
      return str(x)

    # every_n_iter is effectively "never"; the hook only fires at_end.
    compliance_hook = tf.estimator.LoggingTensorHook(
      tensors={_NUM_EXAMPLES_NAME: _NUM_EXAMPLES_NAME},
      every_n_iter=int(1e10),
      at_end=True,
      formatter=formatter)
  else:
    compliance_hook = None

  if horovod_enabled():
    # Optional profiling hooks, enabled via environment variables.
    if "tf_profiler_hook" not in flags_obj.hooks and os.environ.get("TF_RANGE_TRACE", False):
      from TensorFlow.common.utils import RangeTFProfilerHook
      # Start profiling ~100 steps into the second epoch.
      begin = (imagenet_main.NUM_IMAGES["train"] // (flags_obj.batch_size * hvd.size()) + 100)
      train_hooks.append(RangeTFProfilerHook(begin,20, "./rank-{}".format(hvd.rank())))

    if "synapse_logger_hook" not in flags_obj.hooks and "range" == os.environ.get("HABANA_SYNAPSE_LOGGER", "False").lower():
      from TensorFlow.common.horovod_helpers import SynapseLoggerHook
      begin = (imagenet_main.NUM_IMAGES["train"] // (flags_obj.batch_size * hvd.size()) + 100)
      end = begin + 100
      print("Begin: {}".format(begin))
      print("End: {}".format(end))
      train_hooks.append(SynapseLoggerHook(list(range(begin, end)), False))
    # Broadcast initial variables from rank 0 so all workers start in sync.
    train_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

  def input_fn_train(num_epochs, input_context=None):
    # Training input pipeline; batch size is per-replica.
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_dl_type(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        input_context=input_context, experimental_preloading=experimental_preloading)

  def input_fn_eval():
    # Evaluation input pipeline; always a single epoch.
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_replica_batch_size(
            eval_batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_dl_type(flags_obj),
        experimental_preloading=experimental_preloading)

  # Zero train epochs means eval-only mode.
  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                  flags_obj.train_epochs)

  max_train_steps = flags_obj.max_train_steps
  global_batch_size = flags_obj.batch_size * (hvd.size() if horovod_enabled() else 1)
  steps_per_epoch = (imagenet_main.NUM_IMAGES['train'] // global_batch_size)
  if max_train_steps is None:
    # train_offset epochs (warm-up training before the main loop) count
    # toward the global step budget.
    max_train_steps = steps_per_epoch * (train_epochs + flags_obj.train_offset)

  max_eval_steps = flags_obj.max_eval_steps
  if max_eval_steps is None:
    # Ceiling division: enough steps to cover the whole validation set.
    max_eval_steps = (imagenet_main.NUM_IMAGES['validation'] + eval_batch_size - 1) // eval_batch_size

  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
  if use_train_and_evaluate:
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda input_context=None: input_fn_train(
            train_epochs, input_context=input_context),
        hooks=train_hooks,
        max_steps=max_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
    tf.compat.v1.logging.info('Starting to train and evaluate.')
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
    # tf.estimator.train_and_evalute doesn't return anything in multi-worker
    # case.
    eval_results = {}
  else:
    if train_epochs == 0:
      # If --eval_only is set, perform a single loop with zero train epochs.
      schedule, n_loops = [0], 1
    else:
      # Compute the number of times to loop while training. All but the last
      # pass will train for `epochs_between_evals` epochs, while the last will
      # train for the number needed to reach `training_epochs`. For instance if
      #   train_epochs = 25 and epochs_between_evals = 10
      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
      #   Train for 10 epochs and then evaluate.
      #   Train for another 10 epochs and then evaluate.
      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.event(key=mllog.constants.CACHE_CLEAR)
        mllogger.start(key=mllog.constants.RUN_START)
        mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE, value=global_batch_size)

    final_step = 0

    if flags.FLAGS.is_mlperf_enabled:
      success = False
      if flags_obj.train_offset > 0:
        # Run train_offset epochs of training before the main
        # train/eval loop (no evaluation during this phase).
        final_step += flags_obj.train_offset * steps_per_epoch
        mllogger.event(key=mllog.constants.FIRST_EPOCH_NUM, value=1, metadata={'number of epochs before main loop: ': flags_obj.train_offset})
        for i in range(flags_obj.train_offset):
          mllogger.event(key=mllog.constants.EPOCH_NUM, value=i+1)
        classifier.train(
            input_fn=lambda input_context=None: input_fn_train(
                flags_obj.train_offset, input_context=input_context),
            hooks=train_hooks + [compliance_hook],
            max_steps=max_train_steps if max_train_steps < final_step else final_step)

    for cycle_index, num_train_epochs in enumerate(schedule):
      tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
                                int(n_loops))
      if flags.FLAGS.is_mlperf_enabled:
        mllogger.start(key=mllog.constants.BLOCK_START, value=cycle_index+1)
        mllogger.event(key=mllog.constants.FIRST_EPOCH_NUM, value=cycle_index*flags_obj.epochs_between_evals + flags_obj.train_offset + 1)
        mllogger.event(key=mllog.constants.EPOCH_COUNT, value=flags_obj.epochs_between_evals)

        for j in range(flags_obj.epochs_between_evals):
          mllogger.event(key=mllog.constants.EPOCH_NUM,
                         value=cycle_index * flags_obj.epochs_between_evals + j + flags_obj.train_offset + 1)

      if num_train_epochs:
        # Since we are calling classifier.train immediately in each loop, the
        # value of num_train_epochs in the lambda function will not be changed
        # before it is used. So it is safe to ignore the pylint error here
        # pylint: disable=cell-var-from-loop
        final_step += num_train_epochs * steps_per_epoch
        classifier.train(
            input_fn=lambda input_context=None: input_fn_train(
                num_train_epochs, input_context=input_context),
            hooks=train_hooks + [compliance_hook] if compliance_hook is not None else train_hooks,
            max_steps=max_train_steps if max_train_steps < final_step else final_step)
        if flags.FLAGS.is_mlperf_enabled:
          mllogger.end(key=mllog.constants.BLOCK_STOP, value=cycle_index+1)

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.start(key=mllog.constants.EVAL_START)

      # max_eval_steps is associated with testing and profiling.
      # As a result it is frequently called with synthetic data, which will
      # iterate forever. Passing steps=max_eval_steps allows the eval
      # (which is generally unimportant in those circumstances) to terminate.
      # Note that eval will run for max_eval_steps each loop, regardless of
      # the global_step count.
      if flags_obj.get_flag_value("return_before_eval", False):
        return {}
      if flags_obj.get_flag_value("disable_eval", False):
        eval_results = None
        continue
      tf.compat.v1.logging.info('Starting to evaluate.')
      eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                         steps=max_eval_steps)

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.event(key=mllog.constants.EVAL_SAMPLES, value=int(eval_results[_NUM_EXAMPLES_NAME]))
        # NOTE(review): 'valdiation_epoch' is a typo for 'validation_epoch'
        # (local name only); the metadata keys below also embed ': ' in the
        # key string — verify downstream MLPerf log parsers expect this.
        valdiation_epoch = (cycle_index + 1) * flags_obj.epochs_between_evals + flags_obj.train_offset
        mllogger.event(key=mllog.constants.EVAL_ACCURACY, value=float(eval_results['accuracy']), metadata={'epoch_num: ': valdiation_epoch})
        mllogger.end(key=mllog.constants.EVAL_STOP, metadata={'epoch_num: ' : valdiation_epoch})
        if flags_obj.stop_threshold:
          success = bool(eval_results['accuracy'] >= flags_obj.stop_threshold)

      benchmark_logger.log_evaluation_result(eval_results)

      if flags_obj.stop_threshold:
        if horovod_enabled():
          # Stop when ANY worker passed the threshold: allreduce(sum) of the
          # per-worker 0/1 indicator, compared against zero.
          past_treshold = tf.cast(model_helpers.past_stop_threshold(
              flags_obj.stop_threshold, eval_results['accuracy']),
              tf.float32)
          global_past_treshold = tf.math.greater(
              hvd.allreduce(past_treshold, op=hvd.Sum),
              tf.zeros(1, tf.float32))
          # NOTE(review): a fresh tf.compat.v1.Session is created per
          # evaluation here just to run the allreduce — confirm this is
          # intentional and does not leak sessions across cycles.
          if global_past_treshold.eval(session=tf.compat.v1.Session()):
            break
        else:
          if model_helpers.past_stop_threshold(
              flags_obj.stop_threshold, eval_results['accuracy']):
            break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)

  stats = {}
  stats['eval_results'] = eval_results
  stats['train_hooks'] = train_hooks

  if flags.FLAGS.is_mlperf_enabled:
    mllogger.event(key=mllog.constants.RUN_STOP, value={"success": success})
    mllogger.end(key=mllog.constants.RUN_STOP)

  return stats
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
                    loss_filter_fn=None, model_type=resnet_model.DEFAULT_MODEL_TYPE,
                    dtype=resnet_model.DEFAULT_DTYPE,
                    fine_tune=False, label_smoothing=0.0):
  """Shared functionality for different resnet model_fns.

  Initializes the ResnetModel representing the model layers
  and uses that model to build the necessary EstimatorSpecs for
  the `mode` in question. For training, this means building losses,
  the optimizer, and the train op that get passed into the EstimatorSpec.
  For evaluation and prediction, the EstimatorSpec is returned without
  a train op, but with the necessary parameters for the given mode.

  Args:
    features: tensor representing input images
    labels: tensor representing class labels for all input images
    mode: current estimator mode; should be one of
      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
    model_class: a class representing a TensorFlow model that has a __call__
      function. We assume here that this is a subclass of ResnetModel.
    resnet_size: A single integer for the size of the ResNet model.
    weight_decay: weight decay loss rate used to regularize learned variables.
    learning_rate_fn: function that returns the current learning rate given
      the current global_step
    momentum: momentum term used for optimization
    data_format: Input format ('channels_last', 'channels_first', or None).
      If set to None, the format is dependent on whether a GPU is available.
    resnet_version: Integer representing which version of the ResNet network
      to use. See README for details. Valid values: [1, 2]
    loss_scale: The factor to scale the loss for numerical stability.
      A detailed summary is present in the arg parser help text.
    loss_filter_fn: function that takes a string variable name and returns
      True if the var should be included in loss calculation, and False
      otherwise. If None, batch_normalization variables will be excluded
      from the loss.
    dtype: the TensorFlow dtype to use for calculations.
    fine_tune: If True only train the dense layers(final layers).
    label_smoothing: If greater than 0 then smooth the labels.

  Returns:
    EstimatorSpec parameterized according to the input params and the
    current mode.
  """
  # Uncomment the following lines if you want to write images to summary,
  # we turned it off for performance reason

  # Generate a summary node for the images
  # tf.compat.v1.summary.image('images',
  #     (features, tf.cast(features, tf.float32)) [features.dtype == tf.bfloat16],
  #     max_outputs=6)

  if features.dtype != tf.bfloat16:
    # Checks that features/images have same data type being used for calculations.
    assert features.dtype == dtype

  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
                      model_type=model_type, dtype=dtype)

  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

  # This acts as a no-op if the logits are already in fp32 (provided logits are
  # not a SparseTensor). If dtype is is low precision, logits must be cast to
  # fp32 for numerical stability.
  logits = tf.cast(logits, tf.float32)

  if flags.FLAGS.is_mlperf_enabled:
    # Running metric counting the number of examples seen, surfaced to the
    # MLPerf compliance hook under _NUM_EXAMPLES_NAME.
    num_examples_metric = tf_mlperf_log.sum_metric(tensor=tf.shape(input=logits)[0], name=_NUM_EXAMPLES_NAME)

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    # Return the predictions and the specification for serving a SavedModel
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'predict': tf.estimator.export.PredictOutput(predictions)
        })

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  labels = tf.cast(labels, tf.int32)
  if label_smoothing != 0.0:
    # 1001 classes — presumably ImageNet's 1000 labels plus a background
    # class; TODO confirm this matches the dataset's label space.
    one_hot_labels = tf.one_hot(labels, 1001)
    cross_entropy = tf.compat.v1.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=one_hot_labels,
        label_smoothing=label_smoothing)
  else:
    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)

  # Create a tensor named cross_entropy for logging purposes.
  tf.identity(cross_entropy, name='cross_entropy')
  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

  # If no loss_filter_fn is passed, assume we want the default behavior,
  # which is that batch_normalization variables are excluded from loss.
  def exclude_batch_norm(name):
    return 'batch_normalization' not in name
  loss_filter_fn = loss_filter_fn or exclude_batch_norm

  # Add weight decay to the loss.
  l2_loss = weight_decay * tf.add_n(
      # loss is computed using fp32 for numerical stability.
      [
          tf.nn.l2_loss(tf.cast(v, tf.float32))
          for v in tf.compat.v1.trainable_variables()
          if loss_filter_fn(v.name)
      ])
  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
  loss = cross_entropy + l2_loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.compat.v1.train.get_or_create_global_step()

    learning_rate = learning_rate_fn(global_step)

    # Create a tensor named learning_rate for logging purposes
    tf.identity(learning_rate, name='learning_rate')
    tf.compat.v1.summary.scalar('learning_rate', learning_rate)

    if flags.FLAGS.enable_lars:
      tf.compat.v1.logging.info('Using LARS Optimizer.')
      optimizer = lars.LARSOptimizer(
          learning_rate,
          momentum=momentum,
          weight_decay=weight_decay,
          skip_list=['batch_normalization', 'bias'])

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.event(key=mllog.constants.OPT_NAME, value=mllog.constants.LARS)
        mllogger.event(key=mllog.constants.LARS_EPSILON, value=0.0)
        mllogger.event(key=mllog.constants.LARS_OPT_WEIGHT_DECAY, value=weight_decay)
    else:
      optimizer = tf.compat.v1.train.MomentumOptimizer(
          learning_rate=learning_rate,
          momentum=momentum
      )

    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
    if fp16_implementation == 'graph_rewrite':
      # Loss scaling is handled by the mixed-precision graph rewrite itself.
      optimizer = (
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
              optimizer, loss_scale=loss_scale))

    if horovod_enabled():
      # Wrap the optimizer so gradients are allreduced across workers.
      optimizer = hvd.DistributedOptimizer(optimizer)

    def _dense_grad_filter(gvs):
      """Only apply gradient updates to the final layer.

      This function is used for fine tuning.

      Args:
        gvs: list of tuples with gradients and variable info
      Returns:
        filtered gradients so that only the dense layer remains
      """
      return [(g, v) for g, v in gvs if 'dense' in v.name]

    if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
      # When computing fp16 gradients, often intermediate tensor values are
      # so small, they underflow to 0. To avoid this, we multiply the loss by
      # loss_scale to make these tensor values loss_scale times bigger.
      scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)

      if fine_tune:
        scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)

      # Once the gradient computation is complete we can scale the gradients
      # back to the correct scale before passing them to the optimizer.
      unscaled_grad_vars = [(grad / loss_scale, var)
                            for grad, var in scaled_grad_vars]
      minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
    else:
      # NOTE(review): this branch also multiplies by loss_scale and divides
      # the gradients back, even when loss_scale == 1 (no-op) or when the
      # graph rewrite already applies loss scaling — mathematically
      # equivalent, but verify the interaction with 'graph_rewrite' mode.
      grad_vars = optimizer.compute_gradients(loss*loss_scale)
      if fine_tune:
        grad_vars = _dense_grad_filter(grad_vars)
      grad_vars = [(grad / loss_scale, var) for grad, var in grad_vars]
      minimize_op = optimizer.apply_gradients(grad_vars, global_step)

    # Group in the batch-norm moving-average updates (UPDATE_OPS) so they run
    # with every train step.
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    if flags.FLAGS.is_mlperf_enabled:
      train_op = tf.group(minimize_op, update_ops, num_examples_metric[1])
    else:
      train_op = tf.group(minimize_op, update_ops)
  else:
    train_op = None

  accuracy = tf.compat.v1.metrics.accuracy(labels, predictions['classes'])
  accuracy_top_5 = tf.compat.v1.metrics.mean(
      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
  metrics = {'accuracy': accuracy,
             'accuracy_top_5': accuracy_top_5}
  if flags.FLAGS.is_mlperf_enabled:
    metrics.update({_NUM_EXAMPLES_NAME: num_examples_metric})

  # Create a tensor named train_accuracy for logging purposes
  tf.identity(accuracy[1], name='train_accuracy')
  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])

  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op,
      eval_metric_ops=metrics)
def main(argv):
  """Program entry point.

  Initializes Horovod and/or the Habana (HPU) runtime according to FLAGS,
  builds the HParams, and then runs the configured training schedule.

  Args:
    argv: command-line arguments after flag parsing; argv[1:] may carry
      extra hparam overrides (see set_hparams_from_args).
  """
  # TF1-style graph-mode execution with resource variables.
  tf.disable_v2_behavior()
  tf.enable_resource_variables()

  if FLAGS.use_hpu and FLAGS.recipe_cache:
    prepare_recipe_cache()

  if FLAGS.use_horovod:
    if FLAGS.use_hpu:
      from TensorFlow.common.horovod_helpers import hvd_init, horovod_enabled, hvd
      hvd_init()
      assert horovod_enabled()
      if FLAGS.recipe_cache:
        # Other ranks should wait for recipe cache to be removed.
        # This operation can't be done before hvd_init.
        from mpi4py import MPI
        MPI.COMM_WORLD.Barrier()
    else:
      # GPU path: pin each rank to its own visible device.
      import horovod.tensorflow as hvd
      hvd.init()
      assert hvd.size() > 1
      os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())

  if FLAGS.use_hpu:
    if FLAGS.use_bf16:
      # Path to the bf16 ops conversion config consumed by the HPU runtime.
      os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    # Default dynamic shapes off unless the user set the env var explicitly.
    dyn_shapes_flag = 'TF_ENABLE_DYNAMIC_SHAPES'
    if dyn_shapes_flag not in os.environ:
      os.environ[dyn_shapes_flag] = 'false'

    from habana_frameworks.tensorflow import load_habana_module  # noqa
    load_habana_module()

  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # If we just have to print the registry, do that and exit early.
  maybe_log_registry_and_exit()

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  # NOTE(review): hparams is only assigned when schedule != "run_std_server";
  # the code below relies on run_std_server() not returning in that mode —
  # confirm, otherwise the later hparams accesses would raise NameError.
  if FLAGS.schedule != "run_std_server":
    hparams = create_hparams()
  if FLAGS.gpu_automatic_mixed_precision:
    setattr(hparams, "gpu_automatic_mixed_precision", True)
  if FLAGS.deterministic_dataset:
    hparams.add_hparam("deterministic_dataset", True)

  hparams.add_hparam("use_horovod", FLAGS.use_horovod)
  hparams.add_hparam("use_hpu", FLAGS.use_hpu)
  if FLAGS.use_horovod:
    hparams.add_hparam("hvd_worker_id", hvd.rank())
    hparams.add_hparam("hvd_size", hvd.size())

  if FLAGS.schedule == "run_std_server":
    run_std_server()
  trainer_lib.set_random_seed(FLAGS.random_seed)

  if FLAGS.generate_data:
    generate_data()

  exp_fn = create_experiment_fn()
  exp = exp_fn(create_run_config(hparams), hparams)
  # Only the chief worker persists run metadata.
  if is_chief():
    save_metadata(hparams)

  with dump_callback():
    execute_schedule(exp)