def __init__(self):
  self._strategy = tf.distribute.get_strategy()
  self.init_async()

  self.global_step = tf.Variable(
      0,
      dtype=tf.int64,
      name='global_step',
      trainable=False,
      aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
  self.eval_global_step = tf.Variable(
      0,
      dtype=tf.int64,
      name='eval_global_step',
      trainable=False,
      aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)

  train_dataset = self.distribute_dataset(dataset_fn)
  orbit.StandardTrainer.__init__(
      self, train_dataset, options=orbit.StandardTrainerOptions())

  validation_dataset = self.distribute_dataset(dataset_fn)
  orbit.StandardEvaluator.__init__(
      self,
      validation_dataset,
      options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True))

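# A minimal sketch, not from the source, of the module-level `dataset_fn`
# referenced above. With tf.distribute, such a function typically accepts a
# `tf.distribute.InputContext` and returns a per-replica `tf.data.Dataset`;
# the feature shapes and batch size here are placeholders.
import tensorflow as tf

def dataset_fn(input_context=None):
  del input_context  # Single-host sketch; per-replica sharding omitted.
  features = tf.random.uniform([128, 4])
  labels = tf.random.uniform([128, 1])
  return tf.data.Dataset.from_tensor_slices(
      (features, labels)).repeat().batch(16)
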
def __init__(self,
             config: ExperimentConfig,
             task: base_task.Task,
             model: tf.keras.Model,
             optimizer: tf.optimizers.Optimizer,
             train: bool = True,
             evaluate: bool = True,
             train_dataset: Optional[Union[
                 tf.data.Dataset, tf.distribute.DistributedDataset]] = None,
             validation_dataset: Optional[Union[
                 tf.data.Dataset, tf.distribute.DistributedDataset]] = None,
             checkpoint_exporter=None):
  """Initializes a common trainer for TensorFlow models.

  Args:
    config: An `ExperimentConfig` instance specifying the experiment config.
    task: A `base_task.Task` instance.
    model: The model instance, e.g. a `tf.keras.Model` instance.
    optimizer: A `tf.optimizers.Optimizer` instance.
    train: bool, whether or not this trainer will be used for training.
      Defaults to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      Defaults to True.
    train_dataset: A dataset object created for training. With tf.distribute,
      it needs to be a `DistributedDataset`.
    validation_dataset: A dataset object created for evaluation. With
      tf.distribute, it needs to be a `DistributedDataset`. The evaluator
      creates a dataset iterator for each eval round, so the dataset does not
      need to repeat.
    checkpoint_exporter: An object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._validate_params(
      config,
      check_train_data=train_dataset is None,
      check_validation_data=validation_dataset is None)
  self._config = config
  self._task = task
  self._model = model
  self._optimizer = optimizer
  self._checkpoint_exporter = checkpoint_exporter
  self._recovery = None
  # Runtime options are only applied to train_step.
  # We use the defaults for eval_step.
  self._runtime_options = get_runtime_options(config)

  # Creates a shadow copy of the weights to store the moving average of the
  # weights.
  if isinstance(self._optimizer, optimization.ExponentialMovingAverage
               ) and not self._optimizer.has_shadow_copy:
    self._optimizer.shadow_copy(self._model)

  # global_step increases by 1 after each training iteration.
  # We should have global_step.numpy() == self.optimizer.iterations.numpy()
  # when there is only 1 optimizer.
  self._global_step = orbit.utils.create_global_step()
  if hasattr(self.model, "checkpoint_items"):
    checkpoint_items = self.model.checkpoint_items
  else:
    checkpoint_items = {}
  self._checkpoint = tf.train.Checkpoint(
      global_step=self.global_step,
      model=self.model,
      optimizer=self.optimizer,
      **checkpoint_items)

  self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      "validation_loss", dtype=tf.float32)
  model_metrics = model.metrics if hasattr(model, "metrics") else []
  self._train_metrics = self.task.build_metrics(
      training=True) + model_metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + model_metrics

  self.init_async()

  if train:
    train_dataset = train_dataset or self.distribute_dataset(
        self.task.build_inputs, self.config.task.train_data)
    orbit.StandardTrainer.__init__(
        self,
        train_dataset,
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function,
            use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

  if evaluate:
    validation_dataset = validation_dataset or self.distribute_dataset(
        self.task.build_inputs, self.config.task.validation_data)
    orbit.StandardEvaluator.__init__(
        self,
        validation_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function,
            use_tf_while_loop=config.trainer.eval_tf_while_loop))

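# Hedged usage sketch, not from the source: it assumes the __init__ above
# belongs to a class named `Trainer` (name assumed) that mixes in
# orbit.StandardTrainer and orbit.StandardEvaluator, and that `config` is an
# ExperimentConfig and `task` a base_task.Task built elsewhere. Construction
# happens inside the strategy scope so that model, optimizer, and metric
# variables are created on the right devices.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = task.build_model()
  opt_factory = optimization.OptimizerFactory(config.trainer.optimizer_config)
  optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())
  trainer = Trainer(
      config=config,
      task=task,
      model=model,
      optimizer=optimizer,
      train=True,
      evaluate=True)
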
def __init__(self, flags_obj, time_callback, epoch_steps):
  self.strategy = tf.distribute.get_strategy()
  self.flags_obj = flags_obj
  self.dtype = flags_core.get_tf_dtype(flags_obj)
  self.time_callback = time_callback

  # Input pipeline related.
  batch_size = flags_obj.batch_size
  if batch_size % self.strategy.num_replicas_in_sync != 0:
    raise ValueError(
        'Batch size must be divisible by the number of replicas: {}'.format(
            self.strategy.num_replicas_in_sync))
  # As auto rebatching is not supported in the
  # `distribute_datasets_from_function()` API, which is required when cloning
  # the dataset to multiple workers in eager mode, we use the per-replica
  # batch size.
  self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)

  if self.flags_obj.use_synthetic_data:
    self.input_fn = common.get_synth_input_fn(
        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
        num_channels=imagenet_preprocessing.NUM_CHANNELS,
        num_classes=imagenet_preprocessing.NUM_CLASSES,
        dtype=self.dtype,
        drop_remainder=True)
  else:
    self.input_fn = imagenet_preprocessing.input_fn

  self.model = resnet_model.resnet50(
      num_classes=imagenet_preprocessing.NUM_CLASSES,
      use_l2_regularizer=not flags_obj.single_l2_loss_op)

  lr_schedule = common.PiecewiseConstantDecayWithWarmup(
      batch_size=flags_obj.batch_size,
      epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
      warmup_epochs=common.LR_SCHEDULE[0][1],
      boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
      multipliers=list(p[0] for p in common.LR_SCHEDULE),
      compute_lr_on_cpu=True)
  self.optimizer = common.get_optimizer(lr_schedule)
  # Make sure the iterations variable is created inside the strategy scope.
  self.global_step = self.optimizer.iterations

  use_graph_rewrite = flags_obj.fp16_implementation == 'graph_rewrite'
  if use_graph_rewrite and not flags_obj.use_tf_function:
    raise ValueError('--fp16_implementation=graph_rewrite requires '
                     '--use_tf_function to be true')
  self.optimizer = performance.configure_optimizer(
      self.optimizer,
      use_float16=self.dtype == tf.float16,
      use_graph_rewrite=use_graph_rewrite,
      loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128))

  self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
  self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
      'train_accuracy', dtype=tf.float32)
  self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
  self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
      'test_accuracy', dtype=tf.float32)

  self.checkpoint = tf.train.Checkpoint(
      model=self.model, optimizer=self.optimizer)

  # Handling epochs.
  self.epoch_steps = epoch_steps
  self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)

  train_dataset = orbit.utils.make_distributed_dataset(
      self.strategy,
      self.input_fn,
      is_training=True,
      data_dir=self.flags_obj.data_dir,
      batch_size=self.batch_size,
      parse_record_fn=imagenet_preprocessing.parse_record,
      datasets_num_private_threads=self.flags_obj.datasets_num_private_threads,
      dtype=self.dtype,
      drop_remainder=True)
  orbit.StandardTrainer.__init__(
      self,
      train_dataset,
      options=orbit.StandardTrainerOptions(
          use_tf_while_loop=flags_obj.use_tf_while_loop,
          use_tf_function=flags_obj.use_tf_function))

  if not flags_obj.skip_eval:
    eval_dataset = orbit.utils.make_distributed_dataset(
        self.strategy,
        self.input_fn,
        is_training=False,
        data_dir=self.flags_obj.data_dir,
        batch_size=self.batch_size,
        parse_record_fn=imagenet_preprocessing.parse_record,
        dtype=self.dtype)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=flags_obj.use_tf_function))

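# Worked example of the per-replica batch size computed above, assuming a
# hypothetical 8-replica strategy: the global batch size must divide evenly
# across replicas because distribute_datasets_from_function() does not
# rebatch automatically.
global_batch_size = 1024
num_replicas = 8
assert global_batch_size % num_replicas == 0
per_replica_batch_size = global_batch_size // num_replicas  # 128 per replica.
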
def __init__(self,
             config: ExperimentConfig,
             prog_task: base_task.Task,  # also implements ProgressivePolicy.
             ckpt_dir: str = '',
             train: bool = True,
             evaluate: bool = True,
             checkpoint_exporter: Any = None):
  """Initializes a common trainer for TensorFlow models.

  Args:
    config: An `ExperimentConfig` instance specifying the experiment config.
    prog_task: An instance that implements both policies.ProgressivePolicy
      and base_task.Task.
    ckpt_dir: Checkpoint directory.
    train: bool, whether or not this trainer will be used for training.
      Defaults to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      Defaults to True.
    checkpoint_exporter: An object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._config = config
  self._task = prog_task

  # Directory for the non-progressive checkpoints.
  self._export_ckpt_dir = os.path.join(ckpt_dir, 'exported_ckpts')
  tf.io.gfile.makedirs(self._export_ckpt_dir)
  # Receives other checkpoint exporters, e.g. a best checkpoint exporter.
  # TODO(lehou): unify the checkpoint exporting logic, although the default
  # setting does not use checkpoint_exporter.
  self._checkpoint_exporter = checkpoint_exporter

  self._global_step = orbit.utils.create_global_step()

  self._checkpoint = utils.CheckpointWithHooks(
      before_load_hook=self._update_pt_stage_from_ckpt,
      global_step=self.global_step,
      **self._task.cur_checkpoint_items)

  self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      'validation_loss', dtype=tf.float32)
  self._train_metrics = self.task.build_metrics(
      training=True) + self.model.metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics

  if train:
    orbit.StandardTrainer.__init__(
        self,
        None,  # Manage train_dataset ourselves, not via StandardTrainer.
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function))

  if evaluate:
    orbit.StandardEvaluator.__init__(
        self,
        None,  # Manage the eval dataset ourselves, not via StandardEvaluator.
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function))

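# For reference, a minimal sketch of the step counter that
# orbit.utils.create_global_step() is used for above; it follows the explicit
# pattern shown in the first snippet (a non-trainable int64 variable
# aggregated on the first replica only). The equivalence is an assumption
# based on that snippet, not a statement about Orbit internals.
global_step = tf.Variable(
    0,
    dtype=tf.int64,
    name='global_step',
    trainable=False,
    aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
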
def __init__(self,
             config: ExperimentConfig,
             task: base_task.Task,
             model: tf.keras.Model,
             optimizer: tf.optimizers.Optimizer,
             train: bool = True,
             evaluate: bool = True,
             checkpoint_exporter=None):
  """Initializes a common trainer for TensorFlow models.

  Args:
    config: An `ExperimentConfig` instance specifying the experiment config.
    task: A `base_task.Task` instance.
    model: The model instance, e.g. a `tf.keras.Model` instance.
    optimizer: A `tf.optimizers.Optimizer` instance.
    train: bool, whether or not this trainer will be used for training.
      Defaults to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      Defaults to True.
    checkpoint_exporter: An object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._validate_params(config)
  self._config = config
  self._task = task
  self._model = model
  self._checkpoint_exporter = checkpoint_exporter
  self._optimizer = optimizer

  # global_step increases by 1 after each training iteration.
  # We should have global_step.numpy() == self.optimizer.iterations.numpy()
  # when there is only 1 optimizer.
  self._global_step = orbit.utils.create_global_step()
  if hasattr(self.model, "checkpoint_items"):
    checkpoint_items = self.model.checkpoint_items
  else:
    checkpoint_items = {}
  self._checkpoint = tf.train.Checkpoint(
      global_step=self.global_step,
      model=self.model,
      optimizer=self.optimizer,
      **checkpoint_items)

  self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      "validation_loss", dtype=tf.float32)
  self._train_metrics = self.task.build_metrics(
      training=True) + self.model.metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics

  if train:
    train_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs, self.config.task.train_data)
    orbit.StandardTrainer.__init__(
        self,
        train_dataset,
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function,
            use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

  if evaluate:
    eval_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs,
        self.config.task.validation_data)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function))

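# Hypothetical sketch, not from the source, of the `checkpoint_items` hook the
# __init__ above probes with hasattr(): a model may expose extra trackables
# (here an assumed `encoder` sub-model) to be saved in the tf.train.Checkpoint
# alongside `model` and `optimizer`.
class ModelWithCheckpointItems(tf.keras.Model):  # Class name is an assumption.

  def __init__(self, encoder: tf.keras.Model):
    super().__init__()
    self.encoder = encoder

  @property
  def checkpoint_items(self):
    return {'encoder': self.encoder}
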
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics

  if train:
    orbit.StandardTrainer.__init__(
        self,
        None,  # Manage train_dataset ourselves, not via StandardTrainer.
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function))

  if evaluate:
    orbit.StandardEvaluator.__init__(
        self,
        None,  # Manage the eval dataset ourselves, not via StandardEvaluator.
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function))

@property
def model(self):
  return self._task.cur_model

@property
def optimizer(self):
  return self._task.cur_optimizer

# override
@property
def train_dataset(self):
  """Overrides StandardTrainer.train_dataset."""
  return self._task.cur_train_dataset

def __init__(self,
             config: ExperimentConfig,
             task: base_task.Task,
             train: bool = True,
             evaluate: bool = True,
             model=None,
             optimizer=None,
             checkpoint_exporter=None):
  """Initializes a common trainer for TensorFlow models.

  Args:
    config: An `ExperimentConfig` instance specifying the experiment config.
    task: A `base_task.Task` instance.
    train: bool, whether or not this trainer will be used for training.
      Defaults to True.
    evaluate: bool, whether or not this trainer will be used for evaluation.
      Defaults to True.
    model: A `tf.keras.Model` instance. If provided, it is used instead of
      building a model via `task.build_model()`. Defaults to None.
    optimizer: A `tf.keras.optimizers.Optimizer` instance. If provided, it is
      used instead of the optimizer from the config. Defaults to None.
    checkpoint_exporter: An object that has the `maybe_export_checkpoint`
      interface.
  """
  # Gets the current distribution strategy. If not inside any strategy scope,
  # it gets a single-replica no-op strategy.
  self._strategy = tf.distribute.get_strategy()
  self._config = config
  self._task = task
  self._model = model or task.build_model()

  if optimizer is None:
    opt_factory = optimization.OptimizerFactory(
        config.trainer.optimizer_config)
    self._optimizer = opt_factory.build_optimizer(
        opt_factory.build_learning_rate())
  else:
    self._optimizer = optimizer
  self._checkpoint_exporter = checkpoint_exporter

  # Configures the optimizer when loss_scale is set in the runtime config.
  # This helps avoid overflow/underflow in float16 computations.
  if config.runtime.loss_scale:
    self._optimizer = performance.configure_optimizer(
        self._optimizer,
        use_float16=config.runtime.mixed_precision_dtype == 'float16',
        loss_scale=config.runtime.loss_scale)

  # global_step increases by 1 after each training iteration.
  # We should have global_step.numpy() == self.optimizer.iterations.numpy()
  # when there is only 1 optimizer.
  self._global_step = orbit.utils.create_global_step()
  if hasattr(self.model, 'checkpoint_items'):
    checkpoint_items = self.model.checkpoint_items
  else:
    checkpoint_items = {}
  self._checkpoint = tf.train.Checkpoint(
      global_step=self.global_step,
      model=self.model,
      optimizer=self.optimizer,
      **checkpoint_items)

  self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
  self._validation_loss = tf.keras.metrics.Mean(
      'validation_loss', dtype=tf.float32)
  self._train_metrics = self.task.build_metrics(
      training=True) + self.model.metrics
  self._validation_metrics = self.task.build_metrics(
      training=False) + self.model.metrics

  if train:
    train_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs, self.config.task.train_data)
    orbit.StandardTrainer.__init__(
        self,
        train_dataset,
        options=orbit.StandardTrainerOptions(
            use_tf_while_loop=config.trainer.train_tf_while_loop,
            use_tf_function=config.trainer.train_tf_function,
            use_tpu_summary_optimization=config.trainer.allow_tpu_summary))

  if evaluate:
    eval_dataset = orbit.utils.make_distributed_dataset(
        self.strategy, self.task.build_inputs,
        self.config.task.validation_data)
    orbit.StandardEvaluator.__init__(
        self,
        eval_dataset,
        options=orbit.StandardEvaluatorOptions(
            use_tf_function=config.trainer.eval_tf_function))

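# Rough sketch, as an assumption rather than the repo's actual helper, of what
# performance.configure_optimizer() amounts to above when a fixed loss_scale
# is set for float16 runs (stock Keras API, TF 2.4+): wrap the optimizer so
# gradients are scaled up before backprop and unscaled before updates.
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    optimizer, dynamic=False, initial_scale=128)
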