def train_dataset_fn(input_context):
  train_input = inputs.train_input(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config,
      model=detection_model,
      input_context=input_context,
  )
  train_input = train_input.repeat()
  return train_input
def train_dataset_fn(input_context):
  """Callable to create train input."""
  # Create the inputs.
  train_input = inputs.train_input(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config,
      model=detection_model,
      input_context=input_context)
  train_input = train_input.repeat()
  return train_input
def train_dataset_fn(
    input_context: tf.distribute.InputContext) -> tf.data.Dataset:
  """Callable to create train input."""
  train_input = inputs.train_input(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config,
      model=model,
      input_context=input_context,
  )
  train_input = train_input.repeat()
  return train_input
def train_dataset_fn(input_context):
  """Callable to create train input."""
  batch_size = input_context.get_per_replica_batch_size(
      train_config.batch_size)
  # Create the inputs.
  train_input = inputs.train_input(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config,
      model=detection_model,
      params={'batch_size': batch_size})
  train_input = train_input.repeat()
  return train_input.shard(input_context.num_input_pipelines,
                           input_context.input_pipeline_id)
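# The train_dataset_fn variants above all follow the per-replica
# input-function pattern from tf.distribute: the strategy invokes the callable
# once per input pipeline and hands it an InputContext for sharding and
# per-replica batching. A minimal usage sketch (an assumption, not from the
# source; it presumes the enclosing scope has already built the configs and
# model the callables close over):
def distribute_train_input(strategy, train_dataset_fn):
  # The strategy supplies the input_context consumed by the callables above,
  # giving each replica its own input pipeline.
  return strategy.experimental_distribute_datasets_from_function(
      train_dataset_fn)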
def get_train_input(config_path):
  """Gets the tf.data.Dataset that inputs training batches.

  Args:
    config_path: str, path to the edited config file.

  Returns:
    dataset: tf.data.Dataset outputting augmented training batches.
  """
  # Parse the config.
  configs = get_configs_from_pipeline_file(config_path)
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  # Get the dataset.
  dataset = train_input(train_config, train_input_config, configs['model'])
  return dataset
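# A quick smoke test for get_train_input (a sketch under two assumptions: the
# config path is a placeholder, and train_input yields (features, labels)
# tuples as in the Object Detection API):
if __name__ == '__main__':
  dataset = get_train_input('path/to/pipeline.config')
  features, labels = next(iter(dataset))
  # Print the shape of every tensor in the features structure.
  print(tf.nest.map_structure(lambda t: t.shape, features))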
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
  """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `checkpoint_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training
      steps is set from the `TrainConfig` proto.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    export_to_tpu: When use_tpu and export_to_tpu are true,
      `export_savedmodel()` exports a metagraph for serving on TPU besides the
      one on CPU. If export_to_tpu is not provided, we will look for it in
      hparams too.
    checkpoint_every_n: Checkpoint every n training steps.
    **kwargs: Additional keyword arguments for configuration override.
  """
  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']

  configs = get_configs_from_pipeline_file(pipeline_config_path,
                                           config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(configs,
                                               hparams,
                                               kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # update train_steps from config but only when non-zero value is provided
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  # Read export_to_tpu from hparams if not passed.
  if export_to_tpu is None:
    export_to_tpu = hparams.get('export_to_tpu', False)
  tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                  export_to_tpu)

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(
        'mixed_bfloat16')

  # Parse the checkpoint fine tuning configs
  if hparams.load_pretrained:
    fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
  else:
    fine_tune_checkpoint_path = None
  load_all_detection_checkpoint_vars = (
      train_config.load_all_detection_checkpoint_vars)

  # TODO(kaftan) (or anyone else): move this piece of config munging to
  ## utils/config_util.py
  if not train_config.fine_tune_checkpoint_type:
    # train_config.from_detection_checkpoint field is deprecated. For
    # backward compatibility, set train_config.fine_tune_checkpoint_type
    # based on train_config.from_detection_checkpoint.
    if train_config.from_detection_checkpoint:
      train_config.fine_tune_checkpoint_type = 'detection'
    else:
      train_config.fine_tune_checkpoint_type = 'classification'
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

  # Write the as-run pipeline config to disk.
  if save_final_config:
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # Build the model, optimizer, and training input
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = model_builder.build(model_config=model_config,
                                          is_training=True)

    # Create the inputs.
    train_input = inputs.train_input(train_config=train_config,
                                     train_input_config=train_input_config,
                                     model_config=model_config,
                                     model=detection_model)
    train_input = strategy.experimental_distribute_dataset(
        train_input.repeat())

    global_step = tf.compat.v2.Variable(0,
                                        trainable=False,
                                        dtype=tf.compat.v2.dtypes.int64,
                                        name='global_step')
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                           '/train')
  with summary_writer.as_default():
    with strategy.scope():
      # Load a fine-tuning checkpoint.
      if fine_tune_checkpoint_path:
        load_fine_tune_checkpoint(detection_model, fine_tune_checkpoint_path,
                                  fine_tune_checkpoint_type,
                                  load_all_detection_checkpoint_vars,
                                  train_input, unpad_groundtruth_tensors)

      ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                           model=detection_model)
      manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                     model_dir,
                                                     max_to_keep=7)
      ## Maybe re-enable checkpoint restoration depending on how it works:
      # ckpt.restore(manager.latest_checkpoint)

      def train_step_fn(features, labels):
        return eager_train_step(
            detection_model,
            features,
            labels,
            unpad_groundtruth_tensors,
            optimizer,
            learning_rate=learning_rate_fn(),
            add_regularization_loss=add_regularization_loss,
            clip_gradients_value=clip_gradients_value,
            use_tpu=use_tpu,
            global_step=global_step,
            num_replicas=strategy.num_replicas_in_sync)

      @tf.function
      def _dist_train_step(data_iterator):
        """A distributed train step."""
        features, labels = data_iterator.next()
        per_replica_losses = strategy.experimental_run_v2(
            train_step_fn, args=(features, labels))
        # TODO(anjalisridhar): explore if it is safe to remove the
        ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_losses,
                                    axis=None)
        return mean_loss

      train_input_iter = iter(train_input)
      for _ in range(train_steps):
        start_time = time.time()

        loss = _dist_train_step(train_input_iter)
        global_step.assign_add(1)
        end_time = time.time()
        if not use_tpu:
          tf.compat.v2.summary.scalar('steps_per_sec',
                                      1.0 / (end_time - start_time),
                                      step=global_step)
        # TODO(kaftan): Remove this print after it is no longer helpful for
        ## debugging.
        print('Finished step', global_step, end_time, loss)
        if int(global_step.value().numpy()) % checkpoint_every_n == 0:
          manager.save()
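# A minimal invocation sketch for the train_loop above (an assumption:
# `hparams` comes from the caller's flag parsing, and both paths are
# placeholders):
def main(hparams):
  train_loop(hparams=hparams,
             pipeline_config_path='path/to/pipeline.config',
             model_dir='path/to/model_dir',
             save_final_config=True,
             checkpoint_every_n=1000)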
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
  """Trains a model using eager + functions."""
  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']

  configs = get_configs_from_pipeline_file(pipeline_config_path,
                                           config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(configs,
                                               hparams,
                                               kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  use_bfloat16 = train_config.use_bfloat16
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # update train_steps from config but only when non-zero value is provided
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  # Read export_to_tpu from hparams if not passed.
  if export_to_tpu is None:
    export_to_tpu = hparams.get('export_to_tpu', False)
  tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                  export_to_tpu)

  # Parse the checkpoint fine tuning configs
  if hparams.load_pretrained:
    fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
  else:
    fine_tune_checkpoint_path = None
  load_all_detection_checkpoint_vars = (
      train_config.load_all_detection_checkpoint_vars)

  # TODO(kaftan) (or anyone else): move this piece of config munging to
  ## utils/config_util.py
  if not train_config.fine_tune_checkpoint_type:
    # train_config.from_detection_checkpoint field is deprecated. For
    # backward compatibility, set train_config.fine_tune_checkpoint_type
    # based on train_config.from_detection_checkpoint.
    if train_config.from_detection_checkpoint:
      train_config.fine_tune_checkpoint_type = 'detection'
    else:
      train_config.fine_tune_checkpoint_type = 'classification'
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

  # Write the as-run pipeline config to disk.
  if save_final_config:
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # TODO(kaftan): Either make strategy a parameter of this method, or
  ## grab it w/ Distribution strategy's get_scope
  # Build the model, optimizer, and training input
  strategy = tf.compat.v2.distribute.MirroredStrategy()
  with strategy.scope():
    detection_model = model_builder.build(model_config=model_config,
                                          is_training=True)

    # Create the inputs.
    train_input = inputs.train_input(train_config=train_config,
                                     train_input_config=train_input_config,
                                     model_config=model_config,
                                     model=detection_model)
    train_input = strategy.experimental_distribute_dataset(
        train_input.repeat())

    global_step = tf.compat.v2.Variable(0,
                                        trainable=False,
                                        dtype=tf.compat.v2.dtypes.int64)
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                           '/train')
  with summary_writer.as_default():
    with strategy.scope():
      # Load a fine-tuning checkpoint.
      if fine_tune_checkpoint_path:
        load_fine_tune_checkpoint(detection_model, fine_tune_checkpoint_path,
                                  fine_tune_checkpoint_type,
                                  load_all_detection_checkpoint_vars,
                                  train_input, unpad_groundtruth_tensors,
                                  use_tpu, use_bfloat16)

      ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                           model=detection_model)
      manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                     model_dir,
                                                     max_to_keep=7)
      ## Maybe re-enable checkpoint restoration depending on how it works:
      # ckpt.restore(manager.latest_checkpoint)

      def train_step_fn(features, labels):
        return eager_train_step(
            detection_model,
            features,
            labels,
            unpad_groundtruth_tensors,
            optimizer,
            learning_rate=learning_rate_fn(),
            use_bfloat16=use_bfloat16,
            add_regularization_loss=add_regularization_loss,
            clip_gradients_value=clip_gradients_value,
            use_tpu=use_tpu,
            global_step=global_step,
            num_replicas=strategy.num_replicas_in_sync)

      @tf.function
      def _dist_train_step(data_iterator):
        """A distributed train step."""
        features, labels = data_iterator.next()
        per_replica_losses = strategy.experimental_run_v2(
            train_step_fn, args=(features, labels))
        # TODO(anjalisridhar): explore if it is safe to remove the
        ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_losses,
                                    axis=None)
        return mean_loss

      train_input_iter = iter(train_input)
      for _ in range(train_steps):
        start_time = time.time()

        loss = _dist_train_step(train_input_iter)
        global_step.assign_add(1)
        end_time = time.time()
        tf.compat.v2.summary.scalar('steps_per_sec',
                                    1.0 / (end_time - start_time),
                                    step=global_step)
        # TODO(kaftan): Remove this print after it is no longer helpful for
        ## debugging.
        tf.print('Finished step', global_step, end_time, loss)
        if int(global_step.value().numpy()) % checkpoint_every_n == 0:
          manager.save()
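# Both train_loop versions leave checkpoint restoration commented out. A
# sketch of re-enabling it with the `ckpt` and `manager` objects already built
# inside the loop:
def maybe_restore(ckpt, manager):
  # Restore the most recent checkpoint if one exists; restore(None) is a
  # documented no-op, so this is safe on a fresh model_dir.
  ckpt.restore(manager.latest_checkpoint)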