def imagenet_model_fn(features, labels, mode, params):
    """Our model_fn for ResNet to be used with our Estimator."""

    # Warmup and higher lr may not be valid for fine tuning with small batches
    # and smaller numbers of training images.
    if params['fine_tune'] or ('disable_warmup' in params
                               and params['disable_warmup']):
        warmup = False
        base_lr = .1
    else:
        warmup = True
        base_lr = .128

    # According to https://arxiv.org/abs/1706.02677 and our internal experiments,
    # the best accuracy results for more than 16 devices are achieved when base_lr == 0.1
    if horovod_enabled() and hvd.size() > 16:
        base_lr = .1

    # Used for ResNeXt101-32x4d
    if params['use_cosine_lr']:
        base_lr = .256

    if horovod_enabled():
        total_batch_size = params['batch_size'] * hvd.size()
    else:
        total_batch_size = params['batch_size'] * params.get('num_workers', 1)

    learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
        batch_size=total_batch_size,
        batch_denom=256,
        num_images=NUM_IMAGES['train'],
        boundary_epochs=[30, 60, 80, 90],
        train_epochs=params['train_epochs'],
        decay_rates=[1, 0.1, 0.01, 0.001, 1e-4],
        warmup=warmup,
        warmup_epochs=params['warmup_epochs'],
        base_lr=base_lr,
        use_cosine_lr=params['use_cosine_lr'])

    return resnet_run_loop.resnet_model_fn(
        features=features,
        labels=labels,
        mode=mode,
        model_class=ImagenetModel,
        resnet_size=params['resnet_size'],
        weight_decay=flags.FLAGS.weight_decay,
        learning_rate_fn=learning_rate_fn,
        momentum=flags.FLAGS.momentum,
        data_format=params['data_format'],
        resnet_version=params['resnet_version'],
        loss_scale=params['loss_scale'],
        loss_filter_fn=None,
        model_type=params['model_type'],
        dtype=params['dtype'],
        fine_tune=params['fine_tune'],
        label_smoothing=flags.FLAGS.label_smoothing)
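A quick numeric sketch of the schedule these parameters describe, assuming `learning_rate_with_decay` applies the usual linear-scaling rule (`base_lr * batch_size / batch_denom`) with step decay at the boundary epochs; the helper below is illustrative only, not the library implementation:

def sketch_lr(epoch, batch_size, base_lr=0.128, batch_denom=256,
              boundary_epochs=(30, 60, 80, 90),
              decay_rates=(1, 0.1, 0.01, 0.001, 1e-4)):
    """Piecewise-constant LR after linear scaling by the global batch size."""
    scaled_lr = base_lr * batch_size / batch_denom
    return scaled_lr * decay_rates[sum(epoch >= b for b in boundary_epochs)]

print(sketch_lr(epoch=0, batch_size=1024))   # 0.512
print(sketch_lr(epoch=65, batch_size=1024))  # 0.00512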
Example #2
def prepare_model_dir(params):
    worker_id = hvd_rank() if horovod_enabled() else 0
    if params.benchmark or (not params.log_all_workers and worker_id != 0):
        return None

    model_dir = os.path.join(params.model_dir, "model_checkpoint")
    if params.log_all_workers and horovod_enabled():
        model_dir = os.path.join(model_dir, f'worker_{worker_id}')

    os.makedirs(model_dir, exist_ok=True)
    if ('train' in params.exec_mode) and (not params.resume_training):
        os.system('rm -rf {}/*'.format(model_dir))
    return model_dir
Example #3
    def __init__(self,
                 dump_root,
                 tensor_debug_mode,
                 circular_buffer_size,
                 op_regex,
                 output_regex=None):
        self._dump_root = dump_root
        if horovod_enabled():
            self._dump_root = os.path.join(self._dump_root,
                                           f"rank_{hvd_rank()}")
        self._tensor_debug_mode = debug_event_pb2.TensorDebugMode.Value(
            tensor_debug_mode)
        self._circular_buffer_size = circular_buffer_size
        self._op_regex = re.compile(op_regex) if isinstance(op_regex,
                                                            str) else op_regex
        self._output_regex = re.compile(output_regex) if isinstance(
            output_regex, str) else output_regex
        self._tfdbg_run_id = ''
        self._dump_op_counter = 0

        debug_writer_args = {
            "dump_root": self._dump_root,
            "circular_buffer_size": self._circular_buffer_size
        }

        if not tf.__version__.startswith("2.2"):
            debug_writer_args["tfdbg_run_id"] = self._tfdbg_run_id

        self._writer = debug_events_writer.DebugEventsWriter(
            **debug_writer_args)
Example #4
    def input_fn(params):
        """The actual input function."""
        if use_tpu:
            batch_size = params["batch_size"]
        else:
            batch_size = bsz

        if FLAGS.deterministic_run:
            d = tf.data.TFRecordDataset(input_file)
            d = d.apply(
                tf.data.experimental.map_and_batch(
                    lambda record: _decode_record(record, name_to_features),
                    batch_size=batch_size,
                    num_parallel_calls=1,
                    drop_remainder=True))
            return d
        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            if horovod_enabled():
                d = d.shard(hvd.size(), hvd.rank())
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.apply(
            tf.data.experimental.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

        return d
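The `shard(hvd.size(), hvd.rank())` call above gives each worker a disjoint, interleaved slice of the records before repeating and shuffling. A tiny standalone illustration with a hypothetical 3-rank setup (no Horovod needed to run it):

import tensorflow as tf

base = tf.data.Dataset.range(10)
for rank in range(3):
    shard = base.shard(num_shards=3, index=rank)
    print(rank, list(shard.as_numpy_iterator()))
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7]
# 2 [2, 5, 8]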
    def before_run(self, run_context):
        if horovod_enabled() and hvd_rank() != 0:
            return
        self.t0 = time.time()
        if self.num_accumulation_steps <= 1:
            if FLAGS.manual_fp16 or FLAGS.amp:
                return tf.estimator.SessionRunArgs(fetches=[
                    'step_update:0', 'total_loss:0', 'learning_rate:0',
                    'nsp_loss:0', 'mlm_loss:0', 'loss_scale:0'
                ])
            else:
                return tf.estimator.SessionRunArgs(fetches=[
                    'step_update:0', 'total_loss:0', 'learning_rate:0',
                    'nsp_loss:0', 'mlm_loss:0'
                ])
        else:
            if FLAGS.manual_fp16 or FLAGS.amp:
                return tf.estimator.SessionRunArgs(fetches=[
                    'step_update:0', 'update_step:0', 'total_loss:0',
                    'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0',
                    'loss_scale:0'
                ])
            else:
                return tf.estimator.SessionRunArgs(fetches=[
                    'step_update:0', 'update_step:0', 'total_loss:0',
                    'learning_rate:0', 'nsp_loss:0', 'mlm_loss:0'
                ])
Example #6
def get_global_batch_size(batch_size):
    global global_batch_size
    if global_batch_size is None:
        global_batch_size = batch_size
        if horovod_enabled():
            global_batch_size = batch_size * hvd_size()
    return global_batch_size
Example #7
def hvd_info(msg):
    hvd_try_init()
    if horovod_enabled():
        head = 'hvd rank{}/{} in {}'.format(hvd.rank(), hvd.size(),
                                            socket.gethostname())
    else:
        head = '{}'.format(socket.gethostname())
    tf.logging.info('{}: {}'.format(head, msg))
    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        name_to_features = {
            "input_ids":
            tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask":
            tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids":
            tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels":
            tf.io.FixedLenFeature([1], tf.int64),
        }

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        if is_training:
            d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
            if horovod_enabled():
                d = d.shard(hvd_size(), hvd_rank())
            d = d.repeat()
            d = d.shuffle(buffer_size=len(input_files))

            # `cycle_length` is the number of parallel files that get read.
            cycle_length = min(num_cpu_threads, len(input_files))

            # `sloppy` mode means that the interleaving is not exact. This adds
            # even more randomness to the training pipeline.
            d = d.apply(
                tf.data.experimental.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy=is_training,
                    cycle_length=cycle_length))
            d = d.shuffle(buffer_size=100)
        else:
            d = tf.data.TFRecordDataset(input_files)
            # Since we evaluate for a fixed number of steps we don't want to encounter
            # out-of-range exceptions.
            d = d.repeat()

        # We must `drop_remainder` on training because the TPU requires fixed
        # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
        # and we *don't* want to drop the remainder, otherwise we won't cover
        # every sample.
        d = d.apply(
            tf.data.experimental.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                num_parallel_batches=num_cpu_threads,
                drop_remainder=True if is_training else False))
        return d
    def _calculate_mean_and_var(self, x, axes, keep_dims):

        with ops.name_scope('moments', values=[x, axes]):
            # The dynamic range of fp16 is too limited to support the collection of
            # sufficient statistics. As a workaround we simply perform the operations
            # on 32-bit floats before converting the mean and variance back to fp16
            y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x

            if horovod_enabled():
                num_shards = hvd.size()
            else:
                num_shards = 1

            if num_shards > 1:
                local_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
                local_squared_sum = math_ops.reduce_sum(math_ops.square(y), axis=axes, keepdims=True)
                batch_size = math_ops.cast(array_ops.shape_v2(y)[0], dtypes.float32)
                # y_sum, y_squared_sum, global_batch_size = (
                #     replica_ctx.all_reduce(reduce_util.ReduceOp.SUM, [
                #         local_sum, local_squared_sum, batch_size]))

                # hvd_info(f'local_sum {local_sum.shape}, local_squared_sum {local_squared_sum.shape}')

                y_sum = hvd.allreduce(local_sum, average=False)
                y_squared_sum = hvd.allreduce(local_squared_sum, average=False)

                global_batch_size = batch_size * num_shards
                axes_vals = [(array_ops.shape_v2(y))[i] for i in range(1, len(axes))]
                multiplier = math_ops.cast(math_ops.reduce_prod(axes_vals), dtypes.float32)
                multiplier = multiplier * global_batch_size

                mean = y_sum / multiplier
                y_squared_mean = y_squared_sum / multiplier
                # var = E(x^2) - E(x)^2
                variance = y_squared_mean - math_ops.square(mean)
            else:
                # Compute true mean while keeping the dims for proper broadcasting.
                mean = math_ops.reduce_mean(y, axes, keepdims=True, name='mean')
                # sample variance, not unbiased variance
                # Note: stop_gradient does not change the gradient that gets
                #       backpropagated to the mean from the variance calculation,
                #       because that gradient is zero
                variance = math_ops.reduce_mean(
                    math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
                    axes,
                    keepdims=True,
                    name='variance')
            if not keep_dims:
                mean = array_ops.squeeze(mean, axes)
                variance = array_ops.squeeze(variance, axes)
            if x.dtype == dtypes.float16:
                return (math_ops.cast(mean, dtypes.float16),
                        math_ops.cast(variance, dtypes.float16))
            else:
                return (mean, variance)
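The multi-shard branch above computes global statistics from all-reduced per-worker sums via Var[X] = E[X^2] - E[X]^2. A small NumPy check of that aggregation (hypothetical helper, no Horovod involved):

import numpy as np

def global_moments(shards):
    """Combine per-shard sums into a global mean/variance, mirroring the allreduce path."""
    total = sum(s.size for s in shards)
    y_sum = sum(s.sum() for s in shards)            # plays the role of hvd.allreduce(local_sum, average=False)
    y_sq_sum = sum((s ** 2).sum() for s in shards)  # plays the role of the squared-sum allreduce
    mean = y_sum / total
    return mean, y_sq_sum / total - mean ** 2       # Var[X] = E[X^2] - E[X]^2

shards = [np.random.randn(8, 4) for _ in range(4)]
mean, var = global_moments(shards)
full = np.concatenate(shards)
assert np.allclose([mean, var], [full.mean(), full.var()])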
Example #10
def hvd_try_init():
    global IS_HVD_INIT
    if not IS_HVD_INIT and horovod_enabled():
        hvd_init()
        IS_HVD_INIT = True

        tf.get_logger().propagate = False
        if hvd.rank() == 0:
            tf.logging.set_verbosity('INFO')
        else:
            tf.logging.set_verbosity('WARN')
Example #11
def get_logger(params):
    backends = []
    worker_id = hvd_rank() if horovod_enabled() else 0
    if worker_id == 0:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            log_file = f"{params.log_dir}/log.json"
            backends += [JSONStreamBackend(Verbosity.VERBOSE, log_file)]
    logger.init(backends=backends)
    return logger
Example #12
def main():
    """
    Starting point of the application
    """
    params = parse_args(description="UNet-medical")
    if params.use_horovod:
        hvd_init()
    set_flags(params)

    model_dir = prepare_model_dir(params)
    params.model_dir = model_dir
    logger = get_logger(params)

    tb_logger = None
    if params.tensorboard_logging:
        log_dir = params.log_dir
        if horovod_enabled() and params.log_all_workers:
            log_dir = os.path.join(log_dir, f'worker_{hvd_rank()}')
        tb_logger = namedtuple('TBSummaryWriters', 'train_writer eval_writer')(
            tf.summary.create_file_writer(log_dir),
            tf.summary.create_file_writer(os.path.join(log_dir, 'eval')))

    model = Unet()

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold=params.fold,
                      augment=params.augment,
                      hpu_id=hvd_rank() if horovod_enabled() else 0,
                      num_hpus=hvd_size() if horovod_enabled() else 1,
                      seed=params.seed)

    if 'train' in params.exec_mode:
        with dump_callback(params.dump_config):
            train(params, model, dataset, logger, tb_logger)

    if 'evaluate' in params.exec_mode:
        evaluate(params, model, dataset, logger, tb_logger)

    if 'predict' in params.exec_mode:
        predict(params, model, dataset, logger)
def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

  Args:
    num_samples_per_epoch: The number of samples in each epoch of training.
    global_step: The global_step tensor.

  Returns:
    A `Tensor` representing the learning rate.

  Raises:
    ValueError: if `FLAGS.learning_rate_decay_type` is not recognized.
  """
    # Note: when num_clones is > 1, this will actually have each clone to go
    # over each epoch FLAGS.num_epochs_per_decay times. This is different
    # behavior from sync replicas and is expected to produce different results.
    steps_per_epoch = num_samples_per_epoch / FLAGS.batch_size / FLAGS.num_workers
    if FLAGS.sync_replicas:
        steps_per_epoch /= FLAGS.replicas_to_aggregate

    decay_steps = int(steps_per_epoch * FLAGS.num_epochs_per_decay)

    if FLAGS.learning_rate_decay_type == 'exponential':
        learning_rate = tf.train.exponential_decay(
            FLAGS.learning_rate,
            global_step,
            decay_steps,
            FLAGS.learning_rate_decay_factor,
            staircase=True,
            name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
        learning_rate = tf.constant(FLAGS.learning_rate,
                                    name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
        learning_rate = tf.train.polynomial_decay(
            FLAGS.learning_rate,
            global_step,
            decay_steps,
            FLAGS.end_learning_rate,
            power=1.0,
            cycle=False,
            name='polynomial_decay_learning_rate')
    else:
        raise ValueError('learning_rate_decay_type [%s] was not recognized' %
                         FLAGS.learning_rate_decay_type)

    if FLAGS.warmup_epochs:
        warmup_lr = (FLAGS.learning_rate * tf.cast(global_step, tf.float32) /
                     (steps_per_epoch * FLAGS.warmup_epochs))
        learning_rate = tf.minimum(warmup_lr, learning_rate)
    if horovod_enabled():
        learning_rate = learning_rate * hvd.size()
    return learning_rate
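The warmup clause above ramps the rate linearly with the global step and lets the decayed schedule take over once the ramp crosses it. A plain-Python sketch with the FLAGS values replaced by illustrative literals:

base_lr, steps_per_epoch, warmup_epochs = 0.1, 100, 5

def lr_at(step, decayed_lr):
    warmup_lr = base_lr * step / (steps_per_epoch * warmup_epochs)
    return min(warmup_lr, decayed_lr)

print(lr_at(step=250, decayed_lr=0.1))  # 0.05 -> still warming up
print(lr_at(step=900, decayed_lr=0.1))  # 0.1  -> warmup done, schedule takes over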
Example #14
def hvd_info_rank0(msg, with_head=True):
    hvd_try_init()
    if is_rank0():
        if with_head:
            if horovod_enabled():
                head = 'hvd only rank{}/{} in {}'.format(
                    hvd.rank(), hvd.size(), socket.gethostname())
            else:
                head = '{}'.format(socket.gethostname())
            tf.logging.info('{}: {}'.format(head, msg))
        else:
            tf.logging.info(msg)
Example #15
        def update(accum_vars):
            with tf.control_dependencies([global_step.assign(new_global_step)
                                          ]):
                if allreduce_post_accumulation and horovod_enabled():
                    accum_vars = [
                        hvd.allreduce(tf.convert_to_tensor(value=accum_var))
                        if isinstance(accum_var, tf.IndexedSlices) else
                        hvd.allreduce(accum_var) for accum_var in accum_vars
                    ]

                return optimizer.apply_gradients(list(zip(accum_vars, tvars)),
                                                 global_step=global_step)
Example #16
  def eval_end(self):
    """See base class."""

    if self.flags_obj.use_distributed_eval and horovod_enabled():
      test_accuracy = hvd.allreduce(self.test_accuracy.result())
    else:
      test_accuracy = self.test_accuracy.result()

    return {
        'test_loss': self.test_loss.result(),
        'test_accuracy': test_accuracy
    }
Example #17
    def train_step(features, labels, warmup_batch=False):
        with tf.GradientTape() as tape:
            output_map = model(features)
            crossentropy_loss, dice_loss = partial_losses(output_map, labels)
            added_losses = tf.add(crossentropy_loss, dice_loss, name="total_loss_ref")
            loss = added_losses + params.weight_decay * tf.add_n(
                [tf.nn.l2_loss(v) for v in model.trainable_variables
                 if 'batch_normalization' not in v.name])

        if horovod_enabled():
            tape = hvd.DistributedGradientTape(tape)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if horovod_enabled() and warmup_batch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        ce_loss(crossentropy_loss)
        f1_loss(dice_loss)
        return loss
Example #18
    def step_fn(inputs):
      """Function to run on the device."""
      images, labels = inputs
      if self.one_hot:
        labels = tf.cast(labels, tf.int32)
        labels = tf.one_hot(labels, 1001)
        labels = tf.squeeze(labels)

      with tf.GradientTape() as tape:
        logits = self.model(images, training=True)

        prediction_loss = self.get_prediction_loss(labels, logits)

        loss = tf.reduce_sum(prediction_loss) * (1.0 /
                                                 self.flags_obj.batch_size)

        if not self.use_lars_optimizer:
          num_replicas = self.strategy.num_replicas_in_sync

          if self.flags_obj.single_l2_loss_op:
            l2_loss = self.flags_obj.weight_decay * tf.add_n([
                tf.nn.l2_loss(v)
                for v in self.model.trainable_variables
                if ('bn' not in v.name)
            ])

            loss += (l2_loss / num_replicas)
          else:
            loss += (tf.reduce_sum(self.model.losses) / num_replicas)

      if horovod_enabled():
        tape = hvd.DistributedGradientTape(tape)
        grads = tape.gradient(loss, self.model.trainable_variables)
        grads_and_vars = zip(grads, self.model.trainable_variables)

        self.optimizer.apply_gradients(
          grads_and_vars, experimental_aggregate_gradients=False)

        tf.cond(self.global_step == 1,
          lambda: hvd.broadcast_variables(self.model.variables + self.optimizer.variables(),
                                          root_rank=0),
          lambda: tf.constant(True))
      else:
        grad_utils.minimize_using_explicit_allreduce(
          tape, self.optimizer, loss, self.model.trainable_variables)

      self.train_loss.update_state(loss)
      self.train_accuracy.update_state(labels, logits)
def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

  Args:
    learning_rate: A scalar or `Tensor` learning rate.

  Returns:
    An instance of an optimizer.

  Raises:
    ValueError: if FLAGS.optimizer is not recognized.
  """
    if FLAGS.optimizer == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate,
                                               rho=FLAGS.adadelta_rho,
                                               epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(
            learning_rate,
            initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate,
                                           beta1=FLAGS.adam_beta1,
                                           beta2=FLAGS.adam_beta2,
                                           epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
        optimizer = tf.train.FtrlOptimizer(
            learning_rate,
            learning_rate_power=FLAGS.ftrl_learning_rate_power,
            initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
            l1_regularization_strength=FLAGS.ftrl_l1,
            l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=FLAGS.momentum,
                                               name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                              decay=FLAGS.rmsprop_decay,
                                              momentum=FLAGS.rmsprop_momentum,
                                              epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
        raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
    if horovod_enabled():
        optimizer = hvd.DistributedOptimizer(optimizer)
    return optimizer
Example #20
  def _moments(self, inputs, reduction_axes, keep_dims):
    """Compute the mean and variance: it overrides the original _moments."""
    shard_mean, shard_variance = super(SyncBatchNormalization, self)._moments(
      inputs, reduction_axes, keep_dims=keep_dims)

    num_shards = hvd.size() if horovod_enabled() else 1
    if num_shards > 1:
      # Compute variance using: Var[X]= E[X^2] - E[X]^2.
      shard_square_of_mean = tf.math.square(shard_mean)
      shard_mean_of_square = shard_variance + shard_square_of_mean
      group_mean = hvd.allreduce(shard_mean)
      group_mean_of_square = hvd.allreduce(shard_mean_of_square)
      group_variance = group_mean_of_square - tf.math.square(group_mean)
      return (group_mean, group_variance)
    else:
      return (shard_mean, shard_variance)
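Since `hvd.allreduce` averages by default, the group statistics above are exact only when every replica contributes an equally sized shard. A NumPy check of the shard-statistics form of Var[X] = E[X^2] - E[X]^2 under that assumption:

import numpy as np

shards = [np.random.randn(16) for _ in range(4)]           # equal shard sizes assumed
shard_means = np.array([s.mean() for s in shards])
shard_vars = np.array([s.var() for s in shards])

group_mean = shard_means.mean()                            # stands in for hvd.allreduce(shard_mean)
group_mean_of_square = (shard_vars + shard_means ** 2).mean()
group_variance = group_mean_of_square - group_mean ** 2

full = np.concatenate(shards)
assert np.allclose([group_mean, group_variance], [full.mean(), full.var()])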
Example #21
def get_mllog_mlloger():
    from mlperf_logging import mllog
    from mlperf_compliance import tf_mlperf_log

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    filenames = "resnet50v1.5.log-" + str_hvd_rank
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))

    return mllogger, mllog, tf_mlperf_log
Example #22
def get_mllog_mlloger(output_dir=None):
    from mlperf_logging import mllog

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    mllogger.propagate = False
    mllog.propagate = False
    if output_dir is None:
        output_dir = './log'
    filenames = os.path.normpath(output_dir) + "/result_rank_" + str_hvd_rank + ".txt"
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))

    return mllogger, mllog
Example #23
def train_step(images, labels, step):
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)

    if horovod_enabled():
        tape = hvd.DistributedGradientTape(tape)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables),
                                  experimental_aggregate_gradients=True)
        tf.cond(
            step == 0, lambda: hvd.broadcast_variables(
                model.variables + optimizer.variables(), root_rank=0),
            lambda: tf.constant(True))
    else:
        grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                     model.trainable_variables)

    train_loss(loss)
    train_accuracy(labels, predictions)
Example #24
def set_flags(params):
    if params.tf_verbosity:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(params.tf_verbosity)

    if not params.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        if params.dtype == 'bf16':
            os.environ['TF_BF16_CONVERSION'] = params.bf16_config_path

    np.random.seed(params.seed)
    tf.random.set_seed(params.seed)

    if params.use_xla:
        tf.config.optimizer.set_jit(True)

    per_hpu_thread_count = 1
    num_hpus = hvd_size() if horovod_enabled() else 1
    cpu_count = multiprocessing.cpu_count()
    total_hpu_thread_count = per_hpu_thread_count * num_hpus

    tf.config.threading.set_intra_op_parallelism_threads(0)
    tf.config.threading.set_inter_op_parallelism_threads(cpu_count - total_hpu_thread_count)
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             dtype=tf.float32,
             datasets_num_private_threads=None,
             parse_record_fn=parse_record,
             input_context=None,
             drop_remainder=False,
             tf_data_experimental_slack=False,
             experimental_preloading=False,
             dataset_fn=None):
    """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    dtype: Data type to use for images/features
    datasets_num_private_threads: Number of private threads for tf.data.
    parse_record_fn: Function to use for parsing the records.
    input_context: A `tf.distribute.InputContext` object passed in by
      `tf.distribute.Strategy`.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.
    experimental_preloading: Whether to prefetch batches directly onto the
      accelerator device (passed through to `process_record_dataset`).
    dataset_fn: Optional callable that returns the source dataset; if None,
      the TFRecord files under `data_dir` are used.

  Returns:
    A dataset that can be used for iteration.
  """
    if dataset_fn is None:
        filenames = get_filenames(is_training, data_dir)
        dataset = tf.data.Dataset.from_tensor_slices(filenames)
    else:
        dataset = dataset_fn()

    if is_training and horovod_enabled():
        dataset = dataset.shard(hvd.size(), hvd.rank())

    if input_context:
        tf.compat.v1.logging.info(
            'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d'
            % (input_context.input_pipeline_id,
               input_context.num_input_pipelines))
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Convert to individual records.
    # cycle_length = 10 means that up to 10 files will be read and deserialized in
    # parallel. You may want to increase this number if you have a large number of
    # CPU cores.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record_fn,
        num_epochs=num_epochs,
        dtype=dtype,
        datasets_num_private_threads=datasets_num_private_threads,
        drop_remainder=drop_remainder,
        tf_data_experimental_slack=tf_data_experimental_slack,
        experimental_preloading=experimental_preloading)
Example #26
def adjust_batch_size(batch_size):
    if horovod_enabled():
        return batch_size * comm_size()

    return batch_size
Example #27
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           num_epochs=1,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           drop_remainder=False,
                           tf_data_experimental_slack=False,
                           experimental_preloading=False):
  """Given a Dataset with raw records, return an iterator over the records.

  Args:
    dataset: A Dataset representing raw records
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: The number of epochs to repeat the dataset.
    dtype: Data type to use for images/features.
    datasets_num_private_threads: Number of threads for a private
      threadpool created for all datasets computation.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.
    experimental_preloading: If True, prefetch batches directly onto the
      accelerator device instead of using host-side prefetching.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
  # Defines a specific size thread pool for tf.data operations.
  if datasets_num_private_threads:
    options = tf.data.Options()
    options.experimental_threading.private_threadpool_size = (
        datasets_num_private_threads)
    dataset = dataset.with_options(options)
    tf.compat.v1.logging.info('datasets_num_private_threads: %s',
                              datasets_num_private_threads)


  if not experimental_preloading:
    # Disable intra-op parallelism to optimize for throughput instead of latency.
    options = tf.data.Options()
    options.experimental_threading.max_intra_op_parallelism = 1
    dataset = dataset.with_options(options)
    # Prefetches a batch at a time to smooth out the time taken to load input
    # files for shuffling and processing.
    dataset = dataset.prefetch(buffer_size=batch_size)

  if is_training:
    # Shuffles records before repeating to respect epoch boundaries.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
  else:
    dataset = dataset.take(imagenet_main.NUM_IMAGES['validation'])

  if horovod_enabled():
    # Repeat the dataset indefinitely. With multi-node sharding, training
    # length is governed by the maximum number of iterations rather than by
    # the epoch count.
    dataset = dataset.repeat()
  else:
    # Repeats the dataset for the number of epochs to train.
    dataset = dataset.repeat(num_epochs)

  num_parallel_calls = 16 if horovod_enabled() else tf.data.experimental.AUTOTUNE

  # Parses the raw records into images and labels.
  dataset = dataset.map(
      lambda value: parse_record_fn(value, is_training, dtype),
      num_parallel_calls=num_parallel_calls, deterministic=False)
  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

  # Operations between the final prefetch and the get_next call to the iterator
  # will happen synchronously during run time. We prefetch here again to
  # background all of the above processing work and keep it out of the
  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
  # allows DistributionStrategies to adjust how many batches to fetch based
  # on how many devices are present.

  if experimental_preloading:
    device = "/device:HPU:0"
    dataset = dataset.apply(tf.data.experimental.prefetch_to_device(device))
  else:
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    if tf_data_experimental_slack:
      options = tf.data.Options()
      options.experimental_slack = True
      dataset = dataset.with_options(options)

  return dataset
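The function above applies `tf.data.Options` piecemeal; the same knobs can be set on any dataset. A standalone snippet combining the threading and slack options used here (arbitrary values, same older `experimental_threading` attribute as the code above):

import tensorflow as tf

ds = tf.data.Dataset.range(1000).batch(32)

options = tf.data.Options()
options.experimental_threading.private_threadpool_size = 4   # dedicated tf.data threads
options.experimental_threading.max_intra_op_parallelism = 1  # favour throughput over latency
options.experimental_slack = True                            # let prefetch run slightly ahead
ds = ds.with_options(options)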
Example #28
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
     Dict of results of the run.  Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
    `train_hooks` is a list of the hook instances used during training.
  """

  experimental_preloading = flags_obj.experimental_preloading

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=not experimental_preloading)

  if horovod_enabled():
    # The Scoped Allocator Optimization is enabled by default unless disabled by a flag.
    if not condition_env_var('TF_DISABLE_SCOPED_ALLOCATOR', default=False):
      from tensorflow.core.protobuf import rewriter_config_pb2  # pylint: disable=import-error
      session_config.graph_options.rewrite_options.scoped_allocator_optimization = rewriter_config_pb2.RewriterConfig.ON
      enable_op = session_config.graph_options.rewrite_options.scoped_allocator_opts.enable_op
      del enable_op[:]
      enable_op.append("HorovodAllreduce")

  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      num_workers=num_workers,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      log_step_count_steps=flags_obj.display_steps,
      save_checkpoints_secs=None,
      save_checkpoints_steps=flags_obj.save_checkpoint_steps)

  # Initializes model with all but the dense layer from pretrained ResNet.
  # if flags_obj.pretrained_model_checkpoint_path is not None:
  #   warm_start_settings = tf.estimator.WarmStartSettings(
  #       flags_obj.pretrained_model_checkpoint_path,
  #       vars_to_warm_start='^(?!.*dense)')
  # else:
  #   warm_start_settings = None
  warm_start_settings = None

  model_dir=flags_obj.model_dir

  if horovod_enabled():
    model_dir="{}/rank_{}".format(flags_obj.model_dir, hvd.rank())

  if experimental_preloading:
    SelectedEstimator = HabanaEstimator
  else:
    SelectedEstimator = tf.estimator.Estimator

  if flags.FLAGS.is_mlperf_enabled:
    for eval_batch_size in range(flags_obj.batch_size, 1, -1):
      if imagenet_main.NUM_IMAGES['validation'] % eval_batch_size == 0:
        break
  else:
    eval_batch_size = flags_obj.batch_size

  classifier = SelectedEstimator(
      model_fn=model_function, model_dir=model_dir, config=run_config,
      warm_start_from=warm_start_settings, params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'model_type': flags_obj.model_type,
          'loss_scale': flags_core.get_loss_scale(flags_obj,
                                                  default_for_fp16=128),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'num_workers': num_workers,
          'train_epochs': flags_obj.train_epochs,
          'warmup_epochs': flags_obj.warmup_epochs,
          'use_cosine_lr': flags_obj.use_cosine_lr,
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'model_type': flags_obj.model_type,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
      'num_workers': num_workers,
  }
  if flags.FLAGS.is_mlperf_enabled:
    run_params['eval_batch_size'] = eval_batch_size

  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=model_dir,
      batch_size=flags_obj.batch_size)

  if flags.FLAGS.is_mlperf_enabled:
    _log_cache = []
    def formatter(x):
      """Abuse side effects to get tensors out of the model_fn."""
      if _log_cache:
        _log_cache.pop()
      _log_cache.append(x.copy())
      return str(x)

    compliance_hook = tf.estimator.LoggingTensorHook(
      tensors={_NUM_EXAMPLES_NAME: _NUM_EXAMPLES_NAME},
      every_n_iter=int(1e10),
      at_end=True,
      formatter=formatter)
  else:
    compliance_hook = None

  if horovod_enabled():

    if "tf_profiler_hook" not in flags_obj.hooks and os.environ.get("TF_RANGE_TRACE", False):
      from TensorFlow.common.utils import RangeTFProfilerHook
      begin = (imagenet_main.NUM_IMAGES["train"] // (flags_obj.batch_size * hvd.size()) + 100)
      train_hooks.append(RangeTFProfilerHook(begin,20, "./rank-{}".format(hvd.rank())))

    if "synapse_logger_hook" not in flags_obj.hooks and "range" == os.environ.get("HABANA_SYNAPSE_LOGGER", "False").lower():
      from TensorFlow.common.horovod_helpers import SynapseLoggerHook
      begin = (imagenet_main.NUM_IMAGES["train"] // (flags_obj.batch_size * hvd.size()) + 100)
      end = begin + 100
      print("Begin: {}".format(begin))
      print("End: {}".format(end))
      train_hooks.append(SynapseLoggerHook(list(range(begin, end)), False))
    train_hooks.append(hvd.BroadcastGlobalVariablesHook(0))


  def input_fn_train(num_epochs, input_context=None):
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_replica_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_dl_type(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        input_context=input_context, experimental_preloading=experimental_preloading)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_replica_batch_size(
            eval_batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_dl_type(flags_obj), experimental_preloading=experimental_preloading)

  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                  flags_obj.train_epochs)

  max_train_steps = flags_obj.max_train_steps
  global_batch_size = flags_obj.batch_size * (hvd.size() if horovod_enabled() else 1)
  steps_per_epoch = (imagenet_main.NUM_IMAGES['train'] // global_batch_size)
  if max_train_steps is None:
    max_train_steps = steps_per_epoch * (train_epochs + flags_obj.train_offset)

  max_eval_steps = flags_obj.max_eval_steps
  if max_eval_steps is None:
    max_eval_steps = (imagenet_main.NUM_IMAGES['validation'] + eval_batch_size - 1) // eval_batch_size

  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
  if use_train_and_evaluate:
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda input_context=None: input_fn_train(
            train_epochs, input_context=input_context),
        hooks=train_hooks,
        max_steps=max_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
    tf.compat.v1.logging.info('Starting to train and evaluate.')
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
    # tf.estimator.train_and_evaluate doesn't return anything in multi-worker
    # case.
    eval_results = {}
  else:
    if train_epochs == 0:
      # If --eval_only is set, perform a single loop with zero train epochs.
      schedule, n_loops = [0], 1
    else:
      # Compute the number of times to loop while training. All but the last
      # pass will train for `epochs_between_evals` epochs, while the last will
      # train for the number needed to reach `training_epochs`. For instance if
      #   train_epochs = 25 and epochs_between_evals = 10
      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
      #   Train for 10 epochs and then evaluate.
      #   Train for another 10 epochs and then evaluate.
      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

    if flags.FLAGS.is_mlperf_enabled:
      mllogger.event(key=mllog.constants.CACHE_CLEAR)
      mllogger.start(key=mllog.constants.RUN_START)
      mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE,
                     value=global_batch_size)

    final_step = 0

    if flags.FLAGS.is_mlperf_enabled:
      success = False
      if flags_obj.train_offset > 0:
        final_step += flags_obj.train_offset * steps_per_epoch
        mllogger.event(key=mllog.constants.FIRST_EPOCH_NUM, value=1, metadata={'number of epochs before main loop: ': flags_obj.train_offset})
        for i in range(flags_obj.train_offset):
          mllogger.event(key=mllog.constants.EPOCH_NUM, value=i+1)
        classifier.train(
              input_fn=lambda input_context=None: input_fn_train(
              flags_obj.train_offset, input_context=input_context),
              hooks=train_hooks + [compliance_hook],
              max_steps=max_train_steps if max_train_steps < final_step else final_step)

    for cycle_index, num_train_epochs in enumerate(schedule):
      tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
                                int(n_loops))
      if flags.FLAGS.is_mlperf_enabled:
        mllogger.start(key=mllog.constants.BLOCK_START, value=cycle_index+1)
        mllogger.event(key=mllog.constants.FIRST_EPOCH_NUM, value=cycle_index*flags_obj.epochs_between_evals + flags_obj.train_offset + 1)
        mllogger.event(key=mllog.constants.EPOCH_COUNT, value=flags_obj.epochs_between_evals)

        for j in range(flags_obj.epochs_between_evals):
          mllogger.event(key=mllog.constants.EPOCH_NUM,
                         value=cycle_index  * flags_obj.epochs_between_evals + j +  flags_obj.train_offset + 1)

      if num_train_epochs:
        # Since we are calling classifier.train immediately in each loop, the
        # value of num_train_epochs in the lambda function will not be changed
        # before it is used. So it is safe to ignore the pylint error here
        # pylint: disable=cell-var-from-loop
        final_step += num_train_epochs * steps_per_epoch
        classifier.train(
            input_fn=lambda input_context=None: input_fn_train(
                num_train_epochs, input_context=input_context),
            hooks=train_hooks + [compliance_hook] if compliance_hook is not None else train_hooks,
            max_steps=max_train_steps if max_train_steps < final_step else final_step)
        if flags.FLAGS.is_mlperf_enabled:
            mllogger.end(key=mllog.constants.BLOCK_STOP, value=cycle_index+1)

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.start(key=mllog.constants.EVAL_START)
      # max_eval_steps is associated with testing and profiling.
      # As a result it is frequently called with synthetic data,
      # which will iterate forever. Passing steps=max_eval_steps
      # allows the eval (which is generally unimportant in those circumstances)
      # to terminate. Note that eval will run for max_eval_steps each loop,
      # regardless of the global_step count.
      if flags_obj.get_flag_value("return_before_eval", False):
        return {}
      if flags_obj.get_flag_value("disable_eval", False):
        eval_results = None
        continue
      tf.compat.v1.logging.info('Starting to evaluate.')
      eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                         steps=max_eval_steps)

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.event(key=mllog.constants.EVAL_SAMPLES, value=int(eval_results[_NUM_EXAMPLES_NAME]))
        validation_epoch = (cycle_index + 1) * flags_obj.epochs_between_evals + flags_obj.train_offset
        mllogger.event(key=mllog.constants.EVAL_ACCURACY, value=float(eval_results['accuracy']), metadata={'epoch_num: ': validation_epoch})
        mllogger.end(key=mllog.constants.EVAL_STOP, metadata={'epoch_num: ': validation_epoch})
        if flags_obj.stop_threshold:
          success = bool(eval_results['accuracy'] >= flags_obj.stop_threshold)

      benchmark_logger.log_evaluation_result(eval_results)

      if flags_obj.stop_threshold:
        if horovod_enabled():
          past_threshold = tf.cast(model_helpers.past_stop_threshold(
              flags_obj.stop_threshold, eval_results['accuracy']), tf.float32)
          global_past_threshold = tf.math.greater(
              hvd.allreduce(past_threshold, op=hvd.Sum), tf.zeros(1, tf.float32))
          if global_past_threshold.eval(session=tf.compat.v1.Session()):
            break
        else:
          if model_helpers.past_stop_threshold(
              flags_obj.stop_threshold, eval_results['accuracy']):
            break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)

  stats = {}
  stats['eval_results'] = eval_results
  stats['train_hooks'] = train_hooks

  if flags.FLAGS.is_mlperf_enabled:
    mllogger.event(key=mllog.constants.RUN_STOP, value={"success": success})
    mllogger.end(key=mllog.constants.RUN_STOP)

  return stats
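The eval schedule computed in the loop above, extracted so it can be checked on its own (matches the [10, 10, 5] example in the comment):

import math

train_epochs, epochs_between_evals = 25, 10
n_loops = math.ceil(train_epochs / epochs_between_evals)
schedule = [epochs_between_evals for _ in range(int(n_loops))]
schedule[-1] = train_epochs - sum(schedule[:-1])  # last pass only trains the remainder
print(schedule)  # [10, 10, 5]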
Example #29
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
                    loss_filter_fn=None, model_type=resnet_model.DEFAULT_MODEL_TYPE,
                    dtype=resnet_model.DEFAULT_DTYPE,
                    fine_tune=False, label_smoothing=0.0):
  """Shared functionality for different resnet model_fns.

  Initializes the ResnetModel representing the model layers
  and uses that model to build the necessary EstimatorSpecs for
  the `mode` in question. For training, this means building losses,
  the optimizer, and the train op that get passed into the EstimatorSpec.
  For evaluation and prediction, the EstimatorSpec is returned without
  a train op, but with the necessary parameters for the given mode.

  Args:
    features: tensor representing input images
    labels: tensor representing class labels for all input images
    mode: current estimator mode; should be one of
      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
    model_class: a class representing a TensorFlow model that has a __call__
      function. We assume here that this is a subclass of ResnetModel.
    resnet_size: A single integer for the size of the ResNet model.
    weight_decay: weight decay loss rate used to regularize learned variables.
    learning_rate_fn: function that returns the current learning rate given
      the current global_step
    momentum: momentum term used for optimization
    data_format: Input format ('channels_last', 'channels_first', or None).
      If set to None, the format is dependent on whether a GPU is available.
    resnet_version: Integer representing which version of the ResNet network to
      use. See README for details. Valid values: [1, 2]
    loss_scale: The factor to scale the loss for numerical stability. A detailed
      summary is present in the arg parser help text.
    loss_filter_fn: function that takes a string variable name and returns
      True if the var should be included in loss calculation, and False
      otherwise. If None, batch_normalization variables will be excluded
      from the loss.
    dtype: the TensorFlow dtype to use for calculations.
    fine_tune: If True, only train the dense (final) layers.
    label_smoothing: If greater than 0 then smooth the labels.

  Returns:
    EstimatorSpec parameterized according to the input params and the
    current mode.
  """
  # Uncomment the following lines if you want to write images to summary,
  # we turned it off for performance reason

  # Generate a summary node for the images
  # tf.compat.v1.summary.image('images',
  #     (features, tf.cast(features, tf.float32)) [features.dtype == tf.bfloat16],
  #     max_outputs=6)

  if features.dtype != tf.bfloat16:
    # Checks that features/images have same data type being used for calculations.
    assert features.dtype == dtype

  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
                      model_type=model_type, dtype=dtype)

  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

  # This acts as a no-op if the logits are already in fp32 (provided logits are
  # not a SparseTensor). If dtype is low precision, logits must be cast to
  # fp32 for numerical stability.
  logits = tf.cast(logits, tf.float32)

  if flags.FLAGS.is_mlperf_enabled:
    num_examples_metric = tf_mlperf_log.sum_metric(tensor=tf.shape(input=logits)[0], name=_NUM_EXAMPLES_NAME)

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    # Return the predictions and the specification for serving a SavedModel
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'predict': tf.estimator.export.PredictOutput(predictions)
        })

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  labels = tf.cast(labels, tf.int32)

  if label_smoothing != 0.0:
    one_hot_labels = tf.one_hot(labels, 1001)
    cross_entropy = tf.compat.v1.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=one_hot_labels,
        label_smoothing=label_smoothing)
  else:
    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)

  # Create a tensor named cross_entropy for logging purposes.
  tf.identity(cross_entropy, name='cross_entropy')
  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

  # If no loss_filter_fn is passed, assume we want the default behavior,
  # which is that batch_normalization variables are excluded from loss.
  def exclude_batch_norm(name):
    return 'batch_normalization' not in name
  loss_filter_fn = loss_filter_fn or exclude_batch_norm

  # Add weight decay to the loss.
  l2_loss = weight_decay * tf.add_n(
      # loss is computed using fp32 for numerical stability.
      [
          tf.nn.l2_loss(tf.cast(v, tf.float32))
          for v in tf.compat.v1.trainable_variables()
          if loss_filter_fn(v.name)
      ])
  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
  loss = cross_entropy + l2_loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.compat.v1.train.get_or_create_global_step()

    learning_rate = learning_rate_fn(global_step)

    # Create a tensor named learning_rate for logging purposes
    tf.identity(learning_rate, name='learning_rate')
    tf.compat.v1.summary.scalar('learning_rate', learning_rate)

    if flags.FLAGS.enable_lars:
      tf.compat.v1.logging.info('Using LARS Optimizer.')
      optimizer = lars.LARSOptimizer(
          learning_rate,
          momentum=momentum,
          weight_decay=weight_decay,
          skip_list=['batch_normalization', 'bias'])

      if flags.FLAGS.is_mlperf_enabled:
        mllogger.event(key=mllog.constants.OPT_NAME, value=mllog.constants.LARS)
        mllogger.event(key=mllog.constants.LARS_EPSILON, value=0.0)
        mllogger.event(key=mllog.constants.LARS_OPT_WEIGHT_DECAY, value=weight_decay)
    else:
      optimizer = tf.compat.v1.train.MomentumOptimizer(
          learning_rate=learning_rate,
          momentum=momentum
      )

    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
    if fp16_implementation == 'graph_rewrite':
      optimizer = (
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
              optimizer, loss_scale=loss_scale))

    if horovod_enabled():
      optimizer = hvd.DistributedOptimizer(optimizer)

    def _dense_grad_filter(gvs):
      """Only apply gradient updates to the final layer.

      This function is used for fine tuning.

      Args:
        gvs: list of tuples with gradients and variable info
      Returns:
        filtered gradients so that only the dense layer remains
      """
      return [(g, v) for g, v in gvs if 'dense' in v.name]

    if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
      # When computing fp16 gradients, often intermediate tensor values are
      # so small, they underflow to 0. To avoid this, we multiply the loss by
      # loss_scale to make these tensor values loss_scale times bigger.
      scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)

      if fine_tune:
        scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)

      # Once the gradient computation is complete we can scale the gradients
      # back to the correct scale before passing them to the optimizer.
      unscaled_grad_vars = [(grad / loss_scale, var)
                            for grad, var in scaled_grad_vars]
      minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
    else:
      grad_vars = optimizer.compute_gradients(loss*loss_scale)
      if fine_tune:
        grad_vars = _dense_grad_filter(grad_vars)
      grad_vars = [(grad / loss_scale, var) for grad, var in grad_vars]
      minimize_op = optimizer.apply_gradients(grad_vars, global_step)

    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    if flags.FLAGS.is_mlperf_enabled:
      train_op = tf.group(minimize_op, update_ops, num_examples_metric[1])
    else:
      train_op = tf.group(minimize_op, update_ops)
  else:
    train_op = None

  accuracy = tf.compat.v1.metrics.accuracy(labels, predictions['classes'])
  accuracy_top_5 = tf.compat.v1.metrics.mean(
      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
  metrics = {'accuracy': accuracy,
             'accuracy_top_5': accuracy_top_5}
  if flags.FLAGS.is_mlperf_enabled:
    metrics.update({_NUM_EXAMPLES_NAME: num_examples_metric})

  # Create a tensor named train_accuracy for logging purposes
  tf.identity(accuracy[1], name='train_accuracy')
  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])

  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op,
      eval_metric_ops=metrics)
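A numeric sketch of the manual loss-scaling trick used in the TRAIN branch above: multiplying the loss scales every gradient by the same factor (keeping small fp16 values representable), and dividing the gradients afterwards restores their original magnitude.

loss_scale = 128.0
true_grad = 3e-6                       # a gradient that could underflow in fp16
scaled_grad = true_grad * loss_scale   # what compute_gradients(loss * loss_scale) produces
recovered = scaled_grad / loss_scale   # unscaling before apply_gradients
assert recovered == true_grad          # exact here: loss_scale is a power of two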
Example #30
def main(argv):
  tf.disable_v2_behavior()
  tf.enable_resource_variables()

  if FLAGS.use_hpu and FLAGS.recipe_cache:
    prepare_recipe_cache()

  if FLAGS.use_horovod:
    if FLAGS.use_hpu:
      from TensorFlow.common.horovod_helpers import hvd_init, horovod_enabled, hvd
      hvd_init()
      assert horovod_enabled()
      if FLAGS.recipe_cache:
        # Other ranks should wait for recipe cache to be removed.
        # This operation can't be done before hvd_init.
        from mpi4py import MPI
        MPI.COMM_WORLD.Barrier()
    else:
      import horovod.tensorflow as hvd
      hvd.init()
      assert hvd.size() > 1
      os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())

  if FLAGS.use_hpu:
    if FLAGS.use_bf16:
      os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    dyn_shapes_flag = 'TF_ENABLE_DYNAMIC_SHAPES'
    if dyn_shapes_flag not in os.environ:
      os.environ[dyn_shapes_flag] = 'false'

    from habana_frameworks.tensorflow import load_habana_module  # noqa
    load_habana_module()

  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # If we just have to print the registry, do that and exit early.
  maybe_log_registry_and_exit()

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  if FLAGS.schedule != "run_std_server":
    hparams = create_hparams()
  if FLAGS.gpu_automatic_mixed_precision:
    setattr(hparams, "gpu_automatic_mixed_precision", True)
  if FLAGS.deterministic_dataset:
    hparams.add_hparam("deterministic_dataset", True)

  hparams.add_hparam("use_horovod", FLAGS.use_horovod)
  hparams.add_hparam("use_hpu", FLAGS.use_hpu)
  if FLAGS.use_horovod:
    hparams.add_hparam("hvd_worker_id", hvd.rank())
    hparams.add_hparam("hvd_size", hvd.size())

  if FLAGS.schedule == "run_std_server":
    run_std_server()
  trainer_lib.set_random_seed(FLAGS.random_seed)

  if FLAGS.generate_data:
    generate_data()

  exp_fn = create_experiment_fn()
  exp = exp_fn(create_run_config(hparams), hparams)
  if is_chief():
    save_metadata(hparams)

  with dump_callback():
    execute_schedule(exp)