Example #1
  def __init__(self):
    self._strategy = tf.distribute.get_strategy()
    self.init_async()

    self.global_step = tf.Variable(
        0,
        dtype=tf.int64,
        name='global_step',
        trainable=False,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
    self.eval_global_step = tf.Variable(
        0,
        dtype=tf.int64,
        name='eval_global_step',
        trainable=False,
        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)

    train_dataset = self.distribute_dataset(dataset_fn)
    orbit.StandardTrainer.__init__(
        self, train_dataset, options=orbit.StandardTrainerOptions())

    validation_dataset = self.distribute_dataset(dataset_fn)
    orbit.StandardEvaluator.__init__(
        self,
        validation_dataset,
        options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True))
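
The snippet above relies on a `distribute_dataset` helper (and a `dataset_fn` variable) defined elsewhere in the class. Below is a minimal sketch of what such a helper might look like, assuming the strategy is held in `self._strategy`; the actual helper in the surrounding class may differ.

  def distribute_dataset(self, dataset_or_fn, *args, **kwargs):
    """Distributes a dataset, or a dataset-producing callable, across replicas."""
    if callable(dataset_or_fn):
      # make_distributed_dataset builds the dataset per worker/replica through
      # the strategy's distribute_datasets_from_function machinery.
      return orbit.utils.make_distributed_dataset(
          self._strategy, dataset_or_fn, *args, **kwargs)
    # Otherwise assume an already-built tf.data.Dataset.
    return self._strategy.experimental_distribute_dataset(dataset_or_fn)
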
Example #2
    def __init__(self,
                 config: ExperimentConfig,
                 task: base_task.Task,
                 model: tf.keras.Model,
                 optimizer: tf.optimizers.Optimizer,
                 train: bool = True,
                 evaluate: bool = True,
                 train_dataset: Optional[
                     Union[tf.data.Dataset,
                           tf.distribute.DistributedDataset]] = None,
                 validation_dataset: Optional[
                     Union[tf.data.Dataset,
                           tf.distribute.DistributedDataset]] = None,
                 checkpoint_exporter=None):
        """Initialize common trainer for TensorFlow models.

    Args:
      config: An `ExperimentConfig` instance specifying experiment config.
      task: A base_task.Task instance.
      model: The model instance, e.g. a tf.keras.Model instance.
      optimizer: tf.optimizers.Optimizer instance.
      train: bool, whether or not this trainer will be used for training.
        default to True.
      evaluate: bool, whether or not this trainer will be used for evaluation.
        default to True.
      train_dataset: a dataset object created for training. With tf.distribute,
        it needs to be a `DistributedDataset`.
      validation_dataset: a dataset object created for evaluation. With
        tf.distribute, it needs to be a `DistributedDataset`. The evaluator will
        create a dataset iterator for each eval round, so the dataset does not
        need to repeat.
      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
        interface.
    """
        # Gets the current distribution strategy. If not inside any strategy scope,
        # it gets a single-replica no-op strategy.
        self._strategy = tf.distribute.get_strategy()
        self._validate_params(config,
                              check_train_data=train_dataset is None,
                              check_validation_data=validation_dataset is None)
        self._config = config
        self._task = task
        self._model = model
        self._optimizer = optimizer
        self._checkpoint_exporter = checkpoint_exporter
        self._recovery = None
        # Runtime options are only applied to train_step;
        # eval_step uses the defaults.
        self._runtime_options = get_runtime_options(config)

        # Creates a shadow copy of the weights to store the moving average of
        # the weights.
        if isinstance(self._optimizer, optimization.ExponentialMovingAverage
                      ) and not self._optimizer.has_shadow_copy:
            self._optimizer.shadow_copy(self._model)

        # global_step increases by 1 after each training iteration.
        # We should have global_step.numpy() == self.optimizer.iterations.numpy()
        # when there is only 1 optimizer.
        self._global_step = orbit.utils.create_global_step()
        if hasattr(self.model, "checkpoint_items"):
            checkpoint_items = self.model.checkpoint_items
        else:
            checkpoint_items = {}
        self._checkpoint = tf.train.Checkpoint(global_step=self.global_step,
                                               model=self.model,
                                               optimizer=self.optimizer,
                                               **checkpoint_items)

        self._train_loss = tf.keras.metrics.Mean("training_loss",
                                                 dtype=tf.float32)
        self._validation_loss = tf.keras.metrics.Mean("validation_loss",
                                                      dtype=tf.float32)
        model_metrics = model.metrics if hasattr(model, "metrics") else []
        self._train_metrics = self.task.build_metrics(
            training=True) + model_metrics
        self._validation_metrics = self.task.build_metrics(
            training=False) + model_metrics

        self.init_async()

        if train:
            train_dataset = train_dataset or self.distribute_dataset(
                self.task.build_inputs, self.config.task.train_data)
            orbit.StandardTrainer.__init__(
                self,
                train_dataset,
                options=orbit.StandardTrainerOptions(
                    use_tf_while_loop=config.trainer.train_tf_while_loop,
                    use_tf_function=config.trainer.train_tf_function,
                    use_tpu_summary_optimization=config.trainer.
                    allow_tpu_summary))

        if evaluate:
            validation_dataset = validation_dataset or self.distribute_dataset(
                self.task.build_inputs, self.config.task.validation_data)
            orbit.StandardEvaluator.__init__(
                self,
                validation_dataset,
                options=orbit.StandardEvaluatorOptions(
                    use_tf_function=config.trainer.eval_tf_function,
                    use_tf_while_loop=config.trainer.eval_tf_while_loop))
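
Once constructed, a trainer like the one above is typically driven by an `orbit.Controller`. The following is a hedged sketch of that wiring, not part of the class: `trainer`, `model_dir`, and the step counts are placeholders, and the class is assumed to expose `strategy`, `global_step`, and `checkpoint` properties (as the code above suggests). Check the argument names against the installed Orbit version.

# Hypothetical driver code for the trainer defined above.
checkpoint_manager = tf.train.CheckpointManager(
    trainer.checkpoint, directory=model_dir, max_to_keep=3)
controller = orbit.Controller(
    strategy=trainer.strategy,
    trainer=trainer,
    evaluator=trainer,  # The class mixes in StandardTrainer and StandardEvaluator.
    global_step=trainer.global_step,
    steps_per_loop=100,
    checkpoint_manager=checkpoint_manager)
controller.train_and_evaluate(
    train_steps=10000, eval_steps=100, eval_interval=1000)
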
Example #3
    def __init__(self, flags_obj, time_callback, epoch_steps):
        self.strategy = tf.distribute.get_strategy()
        self.flags_obj = flags_obj
        self.dtype = flags_core.get_tf_dtype(flags_obj)
        self.time_callback = time_callback

        # Input pipeline related
        batch_size = flags_obj.batch_size
        if batch_size % self.strategy.num_replicas_in_sync != 0:
            raise ValueError(
                'Batch size must be divisible by the number of replicas: {}'.
                format(self.strategy.num_replicas_in_sync))

        # Auto-rebatching is not supported by the
        # `distribute_datasets_from_function()` API, which is required when
        # cloning the dataset to multiple workers in eager mode, so we use the
        # per-replica batch size.
        self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)

        if self.flags_obj.use_synthetic_data:
            self.input_fn = common.get_synth_input_fn(
                height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
                width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
                num_channels=imagenet_preprocessing.NUM_CHANNELS,
                num_classes=imagenet_preprocessing.NUM_CLASSES,
                dtype=self.dtype,
                drop_remainder=True)
        else:
            self.input_fn = imagenet_preprocessing.input_fn

        self.model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            use_l2_regularizer=not flags_obj.single_l2_loss_op)

        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)
        self.optimizer = common.get_optimizer(lr_schedule)
        # Make sure the optimizer's iterations variable is created inside the
        # strategy scope.
        self.global_step = self.optimizer.iterations

        use_graph_rewrite = flags_obj.fp16_implementation == 'graph_rewrite'
        if use_graph_rewrite and not flags_obj.use_tf_function:
            raise ValueError('--fp16_implementation=graph_rewrite requires '
                             '--use_tf_function to be true')
        self.optimizer = performance.configure_optimizer(
            self.optimizer,
            use_float16=self.dtype == tf.float16,
            use_graph_rewrite=use_graph_rewrite,
            loss_scale=flags_core.get_loss_scale(flags_obj,
                                                 default_for_fp16=128))

        self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'train_accuracy', dtype=tf.float32)
        self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
        self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
            'test_accuracy', dtype=tf.float32)

        self.checkpoint = tf.train.Checkpoint(model=self.model,
                                              optimizer=self.optimizer)

        # Handling epochs.
        self.epoch_steps = epoch_steps
        self.epoch_helper = orbit.utils.EpochHelper(epoch_steps,
                                                    self.global_step)
        train_dataset = orbit.utils.make_distributed_dataset(
            self.strategy,
            self.input_fn,
            is_training=True,
            data_dir=self.flags_obj.data_dir,
            batch_size=self.batch_size,
            parse_record_fn=imagenet_preprocessing.parse_record,
            datasets_num_private_threads=self.flags_obj.
            datasets_num_private_threads,
            dtype=self.dtype,
            drop_remainder=True)
        orbit.StandardTrainer.__init__(
            self,
            train_dataset,
            options=orbit.StandardTrainerOptions(
                use_tf_while_loop=flags_obj.use_tf_while_loop,
                use_tf_function=flags_obj.use_tf_function))
        if not flags_obj.skip_eval:
            eval_dataset = orbit.utils.make_distributed_dataset(
                self.strategy,
                self.input_fn,
                is_training=False,
                data_dir=self.flags_obj.data_dir,
                batch_size=self.batch_size,
                parse_record_fn=imagenet_preprocessing.parse_record,
                dtype=self.dtype)
            orbit.StandardEvaluator.__init__(
                self,
                eval_dataset,
                options=orbit.StandardEvaluatorOptions(
                    use_tf_function=flags_obj.use_tf_function))
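
Example #3 only shows the constructor; the `orbit.StandardTrainer` contract also expects a `train_step` that runs one step on every replica. The sketch below is illustrative only: the loss computation is a placeholder rather than the actual ResNet training step, and it ignores loss scaling for float16.

    def train_step(self, iterator):
        def step_fn(inputs):
            images, labels = inputs
            with tf.GradientTape() as tape:
                logits = self.model(images, training=True)
                per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
                    labels, logits, from_logits=True)
                # Average over the global batch so replicas sum to the true mean.
                loss = tf.nn.compute_average_loss(
                    per_example_loss, global_batch_size=self.flags_obj.batch_size)
            grads = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_variables))
            self.train_loss.update_state(loss)
            self.train_accuracy.update_state(labels, logits)

        self.strategy.run(step_fn, args=(next(iterator),))
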
Example #4
  def __init__(
      self,
      config: ExperimentConfig,
      prog_task: base_task.Task,  # Also implements ProgressivePolicy.
      ckpt_dir: str = '',
      train: bool = True,
      evaluate: bool = True,
      checkpoint_exporter: Any = None):
    """Initialize common trainer for TensorFlow models.

    Args:
      config: An `ExperimentConfig` instance specifying experiment config.
      prog_task: An instance both implemented policies.ProgressivePolicy and
        base_task.Task.
      ckpt_dir: Checkpoint directory.
      train: bool, whether or not this trainer will be used for training.
        default to True.
      evaluate: bool, whether or not this trainer will be used for evaluation.
        default to True.
      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
        interface.
    """
    # Gets the current distribution strategy. If not inside any strategy scope,
    # it gets a single-replica no-op strategy.
    self._strategy = tf.distribute.get_strategy()
    self._config = config
    self._task = prog_task

    # Directory for non-progressive checkpoint
    self._export_ckpt_dir = os.path.join(ckpt_dir, 'exported_ckpts')
    tf.io.gfile.makedirs(self._export_ckpt_dir)

    # Receives other checkpoint exporters, e.g. the best-checkpoint exporter.
    # TODO(lehou): unify the checkpoint exporting logic, although the default
    # setting does not use checkpoint_exporter.
    self._checkpoint_exporter = checkpoint_exporter

    self._global_step = orbit.utils.create_global_step()

    self._checkpoint = utils.CheckpointWithHooks(
        before_load_hook=self._update_pt_stage_from_ckpt,
        global_step=self.global_step,
        **self._task.cur_checkpoint_items)

    self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
    self._validation_loss = tf.keras.metrics.Mean(
        'validation_loss', dtype=tf.float32)
    self._train_metrics = self.task.build_metrics(
        training=True) + self.model.metrics
    self._validation_metrics = self.task.build_metrics(
        training=False) + self.model.metrics

    if train:
      orbit.StandardTrainer.__init__(
          self,
          None,  # We manage train_dataset ourselves, not via StandardTrainer.
          options=orbit.StandardTrainerOptions(
              use_tf_while_loop=config.trainer.train_tf_while_loop,
              use_tf_function=config.trainer.train_tf_function))

    if evaluate:
      orbit.StandardEvaluator.__init__(
          self,
          None,  # We manage the eval dataset ourselves, not via StandardEvaluator.
          options=orbit.StandardEvaluatorOptions(
              use_tf_function=config.trainer.eval_tf_function))
Example #5
    def __init__(self,
                 config: ExperimentConfig,
                 task: base_task.Task,
                 model: tf.keras.Model,
                 optimizer: tf.optimizers.Optimizer,
                 train: bool = True,
                 evaluate: bool = True,
                 checkpoint_exporter=None):
        """Initialize common trainer for TensorFlow models.

    Args:
      config: An `ExperimentConfig` instance specifying experiment config.
      task: A base_task.Task instance.
      model: The model instance, e.g. a tf.keras.Model instance.
      optimizer: tf.optimizers.Optimizer instance.
      train: bool, whether or not this trainer will be used for training.
        default to True.
      evaluate: bool, whether or not this trainer will be used for evaluation.
        default to True.
      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
        interface.
    """
        # Gets the current distribution strategy. If not inside any strategy scope,
        # it gets a single-replica no-op strategy.
        self._strategy = tf.distribute.get_strategy()
        self._validate_params(config)
        self._config = config
        self._task = task
        self._model = model
        self._checkpoint_exporter = checkpoint_exporter
        self._optimizer = optimizer

        # global_step increases by 1 after each training iteration.
        # We should have global_step.numpy() == self.optimizer.iterations.numpy()
        # when there is only 1 optimizer.
        self._global_step = orbit.utils.create_global_step()
        if hasattr(self.model, "checkpoint_items"):
            checkpoint_items = self.model.checkpoint_items
        else:
            checkpoint_items = {}
        self._checkpoint = tf.train.Checkpoint(global_step=self.global_step,
                                               model=self.model,
                                               optimizer=self.optimizer,
                                               **checkpoint_items)

        self._train_loss = tf.keras.metrics.Mean("training_loss",
                                                 dtype=tf.float32)
        self._validation_loss = tf.keras.metrics.Mean("validation_loss",
                                                      dtype=tf.float32)
        self._train_metrics = self.task.build_metrics(
            training=True) + self.model.metrics
        self._validation_metrics = self.task.build_metrics(
            training=False) + self.model.metrics

        if train:
            train_dataset = orbit.utils.make_distributed_dataset(
                self.strategy, self.task.build_inputs,
                self.config.task.train_data)
            orbit.StandardTrainer.__init__(
                self,
                train_dataset,
                options=orbit.StandardTrainerOptions(
                    use_tf_while_loop=config.trainer.train_tf_while_loop,
                    use_tf_function=config.trainer.train_tf_function,
                    use_tpu_summary_optimization=config.trainer.
                    allow_tpu_summary))

        if evaluate:
            eval_dataset = orbit.utils.make_distributed_dataset(
                self.strategy, self.task.build_inputs,
                self.config.task.validation_data)
            orbit.StandardEvaluator.__init__(
                self,
                eval_dataset,
                options=orbit.StandardEvaluatorOptions(
                    use_tf_function=config.trainer.eval_tf_function))
Example #6
    self._validation_metrics = self.task.build_metrics(
        training=False) + self.model.metrics

    if train:
      orbit.StandardTrainer.__init__(
          self,
          None,  # We manage train_dataset ourselves, not via StandardTrainer.
          options=orbit.StandardTrainerOptions(
              use_tf_while_loop=config.trainer.train_tf_while_loop,
              use_tf_function=config.trainer.train_tf_function))

    if evaluate:
      orbit.StandardEvaluator.__init__(
          self,
          None,  # We manage the eval dataset ourselves, not via StandardEvaluator.
          options=orbit.StandardEvaluatorOptions(
              use_tf_function=config.trainer.eval_tf_function))

  @property
  def model(self):
    return self._task.cur_model

  @property
  def optimizer(self):
    return self._task.cur_optimizer

  # override
  @property
  def train_dataset(self):
    """Overriding StandardTrainer.train_dataset."""
    return self._task.cur_train_dataset
Example #7
    def __init__(self,
                 config: ExperimentConfig,
                 task: base_task.Task,
                 train: bool = True,
                 evaluate: bool = True,
                 model=None,
                 optimizer=None,
                 checkpoint_exporter=None):
        """Initialize common trainer for TensorFlow models.

    Args:
      config: An `ExperimentConfig` instance specifying experiment config.
      task: A base_task.Task instance.
      train: bool, whether or not this trainer will be used for training.
        default to True.
      evaluate: bool, whether or not this trainer will be used for evaluation.
        default to True.
      model: tf.keras.Model instance. If provided, it will be used instead of
        building model using task.build_model(). Default to None.
      optimizer: tf.keras.optimizers.Optimizer instance. If provided, it will
        used instead of the optimizer from config. Default to None.
      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
        interface.
    """
        # Gets the current distribution strategy. If not inside any strategy scope,
        # it gets a single-replica no-op strategy.
        self._strategy = tf.distribute.get_strategy()
        self._config = config
        self._task = task

        self._model = model or task.build_model()

        if optimizer is None:
            opt_factory = optimization.OptimizerFactory(
                config.trainer.optimizer_config)
            self._optimizer = opt_factory.build_optimizer(
                opt_factory.build_learning_rate())
        else:
            self._optimizer = optimizer

        self._checkpoint_exporter = checkpoint_exporter

        # Configures the optimizer when loss_scale is set in the runtime
        # config. This helps avoid overflow/underflow in float16 computations.
        if config.runtime.loss_scale:
            self._optimizer = performance.configure_optimizer(
                self._optimizer,
                use_float16=config.runtime.mixed_precision_dtype == 'float16',
                loss_scale=config.runtime.loss_scale)

        # global_step increases by 1 after each training iteration.
        # We should have global_step.numpy() == self.optimizer.iterations.numpy()
        # when there is only 1 optimizer.
        self._global_step = orbit.utils.create_global_step()
        if hasattr(self.model, 'checkpoint_items'):
            checkpoint_items = self.model.checkpoint_items
        else:
            checkpoint_items = {}
        self._checkpoint = tf.train.Checkpoint(global_step=self.global_step,
                                               model=self.model,
                                               optimizer=self.optimizer,
                                               **checkpoint_items)

        self._train_loss = tf.keras.metrics.Mean('training_loss',
                                                 dtype=tf.float32)
        self._validation_loss = tf.keras.metrics.Mean('validation_loss',
                                                      dtype=tf.float32)
        self._train_metrics = self.task.build_metrics(
            training=True) + self.model.metrics
        self._validation_metrics = self.task.build_metrics(
            training=False) + self.model.metrics

        if train:
            train_dataset = orbit.utils.make_distributed_dataset(
                self.strategy, self.task.build_inputs,
                self.config.task.train_data)
            orbit.StandardTrainer.__init__(
                self,
                train_dataset,
                options=orbit.StandardTrainerOptions(
                    use_tf_while_loop=config.trainer.train_tf_while_loop,
                    use_tf_function=config.trainer.train_tf_function,
                    use_tpu_summary_optimization=config.trainer.
                    allow_tpu_summary))

        if evaluate:
            eval_dataset = orbit.utils.make_distributed_dataset(
                self.strategy, self.task.build_inputs,
                self.config.task.validation_data)
            orbit.StandardEvaluator.__init__(
                self,
                eval_dataset,
                options=orbit.StandardEvaluatorOptions(
                    use_tf_function=config.trainer.eval_tf_function))
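
Several of these constructors accept a `checkpoint_exporter` that only needs to provide a `maybe_export_checkpoint` interface (the model garden's best-checkpoint exporter is one implementation). The toy class below is purely a hypothetical illustration of that idea; its method signature is an assumption, not the official API, and it assumes `os` and `tf` are imported as in the examples above.

class ToyCheckpointExporter:
    """Hypothetical exporter: saves a checkpoint whenever a metric improves."""

    def __init__(self, export_dir, metric_name='validation_loss',
                 higher_is_better=False):
        self._export_dir = export_dir
        self._metric_name = metric_name
        self._higher_is_better = higher_is_better
        self._best = None

    def maybe_export_checkpoint(self, checkpoint, eval_logs, global_step):
        """Saves `checkpoint` if the monitored metric improved."""
        value = float(eval_logs[self._metric_name])
        improved = self._best is None or (
            value > self._best if self._higher_is_better else value < self._best)
        if improved:
            self._best = value
            # global_step could be used to tag the exported checkpoint name.
            checkpoint.save(os.path.join(self._export_dir, 'best_ckpt'))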