def testBuildEmptyOptimizer(self): optimizer_text_proto = """ """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) with self.assertRaises(ValueError): optimizer_builder.build(optimizer_proto)
def testBuildEmptyOptimizer(self): optimizer_text_proto = """ """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) with self.assertRaises(ValueError): optimizer_builder.build(optimizer_proto)
def testBuildEmptyOptimizer(self): optimizer_text_proto = """ """ global_summaries = set([]) optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) with self.assertRaises(ValueError): optimizer_builder.build(optimizer_proto, global_summaries)
def testBuildEmptyOptimizer(self): optimizer_text_proto = """ """ global_summaries = set([]) optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) with self.assertRaises(ValueError): optimizer_builder.build(optimizer_proto, global_summaries)
def testMovingAverageOptimizerUnsupported(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) with self.assertRaises(ValueError): optimizer_builder.build(optimizer_proto)
def testBuildMovingAverageOptimizer(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertIsInstance(optimizer, contrib_opt.MovingAverageOptimizer)
def testBuildAdamOptimizer(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: false """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue(isinstance(optimizer, tf.train.AdamOptimizer))
def testBuildAdamOptimizer(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: false """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue(isinstance(optimizer, tf.train.AdamOptimizer))
def testBuildMomentumOptimizer(self): optimizer_text_proto = """ momentum_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.001 } } momentum_optimizer_value: 0.99 } use_moving_average: false """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue(isinstance(optimizer, tf.train.MomentumOptimizer))
def testBuildMomentumOptimizer(self): optimizer_text_proto = """ momentum_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.001 } } momentum_optimizer_value: 0.99 } use_moving_average: false """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue(isinstance(optimizer, tf.train.MomentumOptimizer))
def testBuildMovingAverageOptimizer(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True """ global_summaries = set([]) optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer = optimizer_builder.build(optimizer_proto, global_summaries) self.assertTrue( isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer))
def testBuildMovingAverageOptimizer(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True """ global_summaries = set([]) optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer = optimizer_builder.build(optimizer_proto, global_summaries) self.assertTrue( isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer))
def testBuildMovingAverageOptimizerWithNonDefaultDecay(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True moving_average_decay: 0.2 """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertIsInstance(optimizer, contrib_opt.MovingAverageOptimizer) # TODO(rathodv): Find a way to not depend on the private members. self.assertAlmostEqual(optimizer._ema._decay, 0.2)
def testBuildMovingAverageOptimizerWithNonDefaultDecay(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True moving_average_decay: 0.2 """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue( isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)) # TODO(rathodv): Find a way to not depend on the private members. self.assertAlmostEqual(optimizer._ema._decay, 0.2)
def testBuildRMSPropOptimizer(self): optimizer_text_proto = """ rms_prop_optimizer: { learning_rate: { exponential_decay_learning_rate { initial_learning_rate: 0.004 decay_steps: 800720 decay_factor: 0.95 } } momentum_optimizer_value: 0.9 decay: 0.9 epsilon: 1.0 } use_moving_average: false """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue(isinstance(optimizer, tf.train.RMSPropOptimizer))
def testBuildMovingAverageOptimizerWithNonDefaultDecay(self): optimizer_text_proto = """ adam_optimizer: { learning_rate: { constant_learning_rate { learning_rate: 0.002 } } } use_moving_average: True moving_average_decay: 0.2 """ global_summaries = set([]) optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer = optimizer_builder.build(optimizer_proto, global_summaries) self.assertTrue( isinstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)) # TODO: Find a way to not depend on the private members. self.assertAlmostEqual(optimizer._ema._decay, 0.2)
def testBuildRMSPropOptimizer(self): optimizer_text_proto = """ rms_prop_optimizer: { learning_rate: { exponential_decay_learning_rate { initial_learning_rate: 0.004 decay_steps: 800720 decay_factor: 0.95 } } momentum_optimizer_value: 0.9 decay: 0.9 epsilon: 1.0 } use_moving_average: false """ optimizer_proto = optimizer_pb2.Optimizer() text_format.Merge(optimizer_text_proto, optimizer_proto) optimizer, _ = optimizer_builder.build(optimizer_proto) self.assertTrue(isinstance(optimizer, tf.train.RMSPropOptimizer))
def train_loop(hparams, pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, export_to_tpu=None, checkpoint_every_n=1000, checkpoint_max_to_keep=7, **kwargs): """Trains a model using eager + functions. This method: 1. Processes the pipeline configs 2. (Optionally) saves the as-run config 3. Builds the model & optimizer 4. Gets the training input data 5. Loads a fine-tuning detection or classification checkpoint if requested 6. Loops over the train data, executing distributed training steps inside tf.functions. 7. Checkpoints the model every `checkpoint_every_n` training steps. 8. Logs the training metrics as TensorBoard summaries. Args: hparams: A `HParams`. pipeline_config_path: A path to a pipeline config file. model_dir: The directory to save checkpoints and summaries to. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. use_tpu: Boolean, whether training and evaluation should run on TPU. save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`. export_to_tpu: When use_tpu and export_to_tpu are true, `export_savedmodel()` exports a metagraph for serving on TPU besides the one on CPU. If export_to_tpu is not provided, we will look for it in hparams too. checkpoint_every_n: Checkpoint every n training steps. checkpoint_max_to_keep: int, the number of most recent checkpoints to keep in the model directory. **kwargs: Additional keyword arguments for configuration override. """ ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps # Read export_to_tpu from hparams if not passed. if export_to_tpu is None: export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy( 'mixed_bfloat16') # Parse the checkpoint fine tuning configs if hparams.load_pretrained: fine_tune_checkpoint_path = train_config.fine_tune_checkpoint else: fine_tune_checkpoint_path = None load_all_detection_checkpoint_vars = ( train_config.load_all_detection_checkpoint_vars) # TODO(kaftan) (or anyone else): move this piece of config munging to ## utils/config_util.py if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version # Write the as-run pipeline config to disk. if save_final_config: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = model_builder.build(model_config=model_config, is_training=True) def train_dataset_fn(input_context): """Callable to create train input.""" # Create the inputs. train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model, input_context=input_context) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) global_step = tf.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step', aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model # Get the appropriate filepath (temporary or not) based on whether the worker # is the chief. summary_writer_filepath = _get_filepath(strategy, os.path.join(model_dir, 'train')) summary_writer = tf.compat.v2.summary.create_file_writer( summary_writer_filepath) if use_tpu: num_steps_per_iteration = 100 else: # TODO(b/135933080) Explore setting to 100 when GPU performance issues # are fixed. num_steps_per_iteration = 1 with summary_writer.as_default(): with strategy.scope(): with tf.compat.v2.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): # Load a fine-tuning checkpoint. if fine_tune_checkpoint_path: load_fine_tune_checkpoint( detection_model, fine_tune_checkpoint_path, fine_tune_checkpoint_type, fine_tune_checkpoint_version, load_all_detection_checkpoint_vars, train_input, unpad_groundtruth_tensors) ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model, optimizer=optimizer) manager_dir = _get_filepath(strategy, model_dir) if not strategy.extended.should_checkpoint: checkpoint_max_to_keep = 1 manager = tf.compat.v2.train.CheckpointManager( ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep) # We use the following instead of manager.latest_checkpoint because # manager_dir does not point to the model directory when we are running # in a worker. latest_checkpoint = tf.train.latest_checkpoint(model_dir) ckpt.restore(latest_checkpoint) def train_step_fn(features, labels): """Single train step.""" loss = eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) global_step.assign_add(1) return loss def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() per_replica_losses = strategy.run(train_step_fn, args=(features, labels)) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) checkpointed_step = int(global_step.value()) logged_step = global_step.value() last_step_time = time.time() for _ in range(global_step.value(), train_steps, num_steps_per_iteration): loss = _dist_train_step(train_input_iter) time_taken = time.time() - last_step_time last_step_time = time.time() tf.compat.v2.summary.scalar('steps_per_sec', num_steps_per_iteration * 1.0 / time_taken, step=global_step) if global_step.value() - logged_step >= 100: tf.logging.info( 'Step {} per-step time {:.3f}s loss={:.3f}'.format( global_step.value(), time_taken / num_steps_per_iteration, loss)) logged_step = global_step.value() if ((int(global_step.value()) - checkpointed_step) >= checkpoint_every_n): manager.save() checkpointed_step = int(global_step.value()) # Remove the checkpoint directories of the non-chief workers that # MultiWorkerMirroredStrategy forces us to save during sync distributed # training. _clean_temporary_directories(strategy, manager_dir) _clean_temporary_directories(strategy, summary_writer_filepath)
def train_loop(hparams, pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, export_to_tpu=None, checkpoint_every_n=1000, **kwargs): """Trains a model using eager + functions. This method: 1. Processes the pipeline configs 2. (Optionally) saves the as-run config 3. Builds the model & optimizer 4. Gets the training input data 5. Loads a fine-tuning detection or classification checkpoint if requested 6. Loops over the train data, executing distributed training steps inside tf.functions. 7. Checkpoints the model every `checkpoint_every_n` training steps. 8. Logs the training metrics as TensorBoard summaries. Args: hparams: A `HParams`. pipeline_config_path: A path to a pipeline config file. model_dir: The directory to save checkpoints and summaries to. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. use_tpu: Boolean, whether training and evaluation should run on TPU. save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`. export_to_tpu: When use_tpu and export_to_tpu are true, `export_savedmodel()` exports a metagraph for serving on TPU besides the one on CPU. If export_to_tpu is not provided, we will look for it in hparams too. checkpoint_every_n: Checkpoint every n training steps. **kwargs: Additional keyword arguments for configuration override. """ ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps # Read export_to_tpu from hparams if not passed. if export_to_tpu is None: export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy( 'mixed_bfloat16') # Parse the checkpoint fine tuning configs if hparams.load_pretrained: fine_tune_checkpoint_path = train_config.fine_tune_checkpoint else: fine_tune_checkpoint_path = None load_all_detection_checkpoint_vars = ( train_config.load_all_detection_checkpoint_vars) # TODO(kaftan) (or anyone else): move this piece of config munging to ## utils/config_util.py if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type # Write the as-run pipeline config to disk. if save_final_config: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = model_builder.build(model_config=model_config, is_training=True) # Create the inputs. train_input = inputs.train_input(train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model) train_input = strategy.experimental_distribute_dataset( train_input.repeat()) global_step = tf.compat.v2.Variable(0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step') optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model summary_writer = tf.compat.v2.summary.create_file_writer(model_dir + '/train') with summary_writer.as_default(): with strategy.scope(): # Load a fine-tuning checkpoint. if fine_tune_checkpoint_path: load_fine_tune_checkpoint(detection_model, fine_tune_checkpoint_path, fine_tune_checkpoint_type, load_all_detection_checkpoint_vars, train_input, unpad_groundtruth_tensors) ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model) manager = tf.compat.v2.train.CheckpointManager(ckpt, model_dir, max_to_keep=7) ## Maybe re-enable checkpoint restoration depending on how it works: # ckpt.restore(manager.latest_checkpoint) def train_step_fn(features, labels): return eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, use_tpu=use_tpu, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" features, labels = data_iterator.next() per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=( features, labels, )) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) return mean_loss train_input_iter = iter(train_input) for _ in range(train_steps): start_time = time.time() loss = _dist_train_step(train_input_iter) global_step.assign_add(1) end_time = time.time() if not use_tpu: tf.compat.v2.summary.scalar('steps_per_sec', 1.0 / (end_time - start_time), step=global_step) # TODO(kaftan): Remove this print after it is no longer helpful for ## debugging. print('Finished step', global_step, end_time, loss) if int(global_step.value().numpy()) % checkpoint_every_n == 0: manager.save()
def train_loop(config_path: str, model_dir: str, config_override: Optional[ pipeline_pb2.TrainEvalPipelineConfig] = None, train_steps: Optional[int] = None, use_tpu: bool = False, save_final_config: bool = False, log_every_n: int = 100, ckpt_every_n: int = 1000, ckpt_max_to_keep: int = 7, record_summaries: bool = True, **kwargs) -> None: """Trains a model using eager + functions. This method: 1. Processes the pipeline configs 2. (Optionally) saves the as-run config 3. Builds the model & optimizer 4. Gets the training input data 5. Loads a fine-tuning detection or classification checkpoint if requested 6. Loops over the train data, executing distributed training steps inside tf.functions. 7. Checkpoints the model every `ckpt_every_n` training steps. 8. Logs the training metrics as TensorBoard summaries. Args: config_path: A path to a pipeline config file. model_dir: The directory to save checkpoints and summaries to. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `config_path`. train_steps: Number of training steps. If None, training steps from `TrainConfig` proto will be adopted. use_tpu: Boolean, whether training and evaluation should run on TPU. save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`. log_every_n: Log total loss every n training steps. ckpt_every_n: Checkpoint every n training steps. ckpt_max_to_keep: int, the number of most recent checkpoints to keep in the model directory. record_summaries: Boolean, whether or not to record summaries. **kwargs: Additional keyword arguments for configuration override. """ # parse config configs = config_util.get_configs_from_pipeline_file( config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu, }) configs = config_util.merge_external_params_with_configs( configs, None, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_gt_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradient_norm = None if train_config.gradient_clipping_by_norm > 0: clip_gradient_norm = train_config.gradient_clipping_by_norm if kwargs['use_bfloat16']: tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') if train_config.load_all_detection_checkpoint_vars: raise ValueError( 'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2') # base checkpoint to fine-tune from config_util.update_fine_tune_checkpoint_type(train_config) base_ckpt = train_config.fine_tune_checkpoint base_ckpt_type = train_config.fine_tune_checkpoint_type base_ckpt_ver = train_config.fine_tune_checkpoint_version # write the as-run pipeline config to disk if save_final_config: pipeline_config_final = config_util.create_pipeline_proto_from_configs( configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # build model, input, optimizer strategy = tf.distribute.get_strategy() with strategy.scope(): # build model model = model_builder.build(model_config=model_config, is_training=True) # build input def train_dataset_fn( input_context: tf.distribute.InputContext) -> tf.data.Dataset: """Callable to create train input.""" train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=model, input_context=input_context, ) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) # build optimizer global_step = tf.Variable( 0, trainable=False, dtype=tf.int64, name='global_step', aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate # prepare for training # get appropriate filepath (temporary or not) based on whether the worker is the chief summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train')) if record_summaries: summary_writer = tf.summary.create_file_writer(summary_log_path) else: summary_writer = tf.summary.create_noop_writer() if use_tpu: num_steps_per_iteration = 100 else: num_steps_per_iteration = 1 with summary_writer.as_default(): with strategy.scope(): with tf.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): # prepare checkpoint manager # (do not use manager.latest_checkpoint as manager_dir is not model_dir while running in worker) ckpt = tf.train.Checkpoint(model=model, step=global_step, optimizer=optimizer) ckpt_max_to_keep = ckpt_max_to_keep if strategy.extended.should_checkpoint else 1 manager_dir = get_filepath(strategy, model_dir) manager = tf.train.CheckpointManager( ckpt, manager_dir, max_to_keep=ckpt_max_to_keep) latest_ckpt = tf.train.latest_checkpoint(model_dir) if latest_ckpt: # load latest checkpoint being trained ckpt.restore(latest_ckpt).expect_partial() elif base_ckpt: # load a pre-trained checkpoint load_base_ckpt(model, base_ckpt, base_ckpt_type, base_ckpt_ver, train_input, unpad_gt_tensors) # get trainable variables train_vars = get_train_vars(model, train_config) # define training step def train_step_fn(features: Dict, labels: Dict): """Single train step.""" loss = eager_train_step( model, train_vars, features, labels, unpad_gt_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradient_norm=clip_gradient_norm, global_step=global_step, num_replicas=strategy.num_replicas_in_sync, ) global_step.assign_add(1) return loss def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() per_replica_losses = strategy.run(train_step_fn, args=(features, labels)) return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): with tf.name_scope(''): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) # save initialized version of checkpoint if int(global_step.value()) == 0: manager.save() ckpt_step = int(global_step.value()) logged_step = global_step.value() # proceed with training last_step_time = time.time() for _ in range(global_step.value(), train_config.num_steps, num_steps_per_iteration): # execute a step (forward pass + backward pass) loss = _dist_train_step(train_input_iter) # log time curr_step = global_step.value() time_taken = time.time() - last_step_time last_step_time = time.time() tf.summary.scalar( 'steps_per_sec', num_steps_per_iteration * 1.0 / time_taken, step=global_step, ) # log loss if curr_step - logged_step >= log_every_n: step_time = time_taken / num_steps_per_iteration step_msg = 'Step {} per-step time {:.3f}s loss={:.3f}'.format( curr_step, step_time, loss) v1.logging.info(step_msg) logged_step = curr_step # save checkpoint regularly if (curr_step - ckpt_step) >= ckpt_every_n: manager.save() ckpt_step = curr_step # remove checkpoint directories of non-chief workers that MultiWorkerMirroredStrategy forces us to save during sync # distributed training. clean_temporary_directories(strategy, manager_dir) clean_temporary_directories(strategy, summary_log_path)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: labels = unstack_batch(labels, unpad_groundtruth_tensors=False) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, sets finetune_checkpoint_type based on # from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] if train_config.add_regularization_loss: regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) if regularization_losses: regularization_loss = tf.add_n(regularization_losses, name='regularization_loss') losses.append(regularization_loss) if not use_tpu: tf.summary.scalar('regularization_loss', regularization_loss) total_loss = tf.add_n(losses, name='total_loss') if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if use_tpu: training_optimizer = tpu_optimizer.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None if mode == tf.estimator.ModeKeys.EVAL: # Detection summaries during eval. class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _get_groundtruth_data(detection_model, class_agnostic) use_original_images = fields.InputDataFields.original_image in features eval_images = ( features[fields.InputDataFields.original_image] if use_original_images else features[fields.InputDataFields.image]) eval_dict = eval_util.result_dict_for_single_example( eval_images[0:1], features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=False) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) if not use_tpu and use_original_images: detection_and_groundtruth = ( vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2)) tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single image. eval_metrics = eval_config.metrics_set if not eval_metrics: eval_metrics = ['coco_detection_metrics'] eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_metrics, category_index.values(), eval_dict, include_metrics_per_category=False) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs)
def train_loop(pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, checkpoint_every_n=1000, checkpoint_max_to_keep=7, record_summaries=True, performance_summary_exporter=None, num_steps_per_iteration=NUM_STEPS_PER_ITERATION, **kwargs): # """Trains a model using eager + functions. config_override = None configs = config_util.get_configs_from_pipeline_file( pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = config_util.merge_external_params_with_configs( configs, None, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors # False add_regularization_loss = train_config.add_regularization_loss # True clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: # Not run clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided train_steps = num_train_steps if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16') if train_config.load_all_detection_checkpoint_vars: raise ValueError('train_pb2.load_all_detection_checkpoint_vars ' 'unsupported in TF2') config_util.update_fine_tune_checkpoint_type(train_config) fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type # 'detection' fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.get_strategy() from object_detection import inputs from object_detection.builders import optimizer_builder from object_detection.utils import variables_helper with strategy.scope(): detection_model = model_builder.build(model_config=model_config, is_training=True) def train_dataset_fn(input_context): """Callable to create train input.""" # Create the inputs. train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model, input_context=input_context) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) global_step = tf.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step', aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: def learning_rate_fn(): return learning_rate # Train the model # Get the appropriate filepath (temporary or not) based on whether the worker # is the chief. summary_writer_filepath = get_filepath(strategy, os.path.join(model_dir, 'train')) if record_summaries: summary_writer = tf.compat.v2.summary.create_file_writer( summary_writer_filepath) else: #summary_writer = tf2.summary.create_noop_writer() summary_writer = tf.summary.create_noop_writer() with summary_writer.as_default(): with strategy.scope(): with tf.compat.v2.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): # Load a fine-tuning checkpoint. if train_config.fine_tune_checkpoint: variables_helper.ensure_checkpoint_supported( train_config.fine_tune_checkpoint, fine_tune_checkpoint_type, model_dir) load_fine_tune_checkpoint( detection_model, train_config.fine_tune_checkpoint, fine_tune_checkpoint_type, fine_tune_checkpoint_version, train_config. run_fine_tune_checkpoint_dummy_computation, train_input, unpad_groundtruth_tensors) ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model, optimizer=optimizer) manager_dir = get_filepath(strategy, model_dir) if not strategy.extended.should_checkpoint: checkpoint_max_to_keep = 1 manager = tf.compat.v2.train.CheckpointManager( ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep) # We use the following instead of manager.latest_checkpoint because # manager_dir does not point to the model directory when we are running # in a worker. latest_checkpoint = tf.train.latest_checkpoint(model_dir) ckpt.restore(latest_checkpoint) def train_step_fn(features, labels): """Single train step.""" loss = eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() if hasattr(tf.distribute.Strategy, 'run'): per_replica_losses = strategy.run(train_step_fn, args=(features, labels)) else: per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=(features, labels)) # TODO(anjalisridhar): explore if it is safe to remove the # num_replicas scaling of the loss and switch this to a ReduceOp.Mean return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): # Following suggestion on yaqs/5402607292645376 with tf.name_scope(''): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) if int(global_step.value()) == 0: manager.save() checkpointed_step = int(global_step.value()) logged_step = global_step.value() last_step_time = time.time() for _ in range(global_step.value(), train_steps, num_steps_per_iteration): loss = _dist_train_step(train_input_iter) time_taken = time.time() - last_step_time last_step_time = time.time() steps_per_sec = num_steps_per_iteration * 1.0 / time_taken tf.compat.v2.summary.scalar('steps_per_sec', steps_per_sec, step=global_step) steps_per_sec_list.append(steps_per_sec) if global_step.value() - logged_step >= 100: tf.logging.info( 'Step {} per-step time {:.3f}s loss={:.3f}'.format( global_step.value(), time_taken / num_steps_per_iteration, loss)) logged_step = global_step.value() if ((int(global_step.value()) - checkpointed_step) >= checkpoint_every_n): manager.save() checkpointed_step = int(global_step.value())
def train_loop( pipeline_config_path, model_dir, val_checkpoint_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, checkpoint_every_n=1000, checkpoint_max_to_keep=7, record_summaries=True, performance_summary_exporter=None, **kwargs): """Trains a model using eager + functions. This method: 1. Processes the pipeline configs 2. (Optionally) saves the as-run config 3. Builds the model & optimizer 4. Gets the training input data 5. Loads a fine-tuning detection or classification checkpoint if requested 6. Loops over the train data, executing distributed training steps inside tf.functions. 7. Checkpoints the model every `checkpoint_every_n` training steps. 8. Logs the training metrics as TensorBoard summaries. Args: pipeline_config_path: A path to a pipeline config file. model_dir: The directory to save checkpoints and summaries to. val_checkpoint_dir: The directory to save validation checkpoint. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. use_tpu: Boolean, whether training and evaluation should run on TPU. save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`. checkpoint_every_n: Checkpoint every n training steps. checkpoint_max_to_keep: int, the number of most recent checkpoints to keep in the model directory. record_summaries: Boolean, whether or not to record summaries. performance_summary_exporter: function for exporting performance metrics. **kwargs: Additional keyword arguments for configuration override. """ print('START train looop function ========================') ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] steps_per_sec_list = [] configs = get_configs_from_pipeline_file( pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs( configs, None, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') if train_config.load_all_detection_checkpoint_vars: raise ValueError('train_pb2.load_all_detection_checkpoint_vars ' 'unsupported in TF2') config_util.update_fine_tune_checkpoint_type(train_config) fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version # Write the as-run pipeline config to disk. if save_final_config: tf.logging.info('Saving pipeline config file to directory {}'.format( model_dir)) pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']( model_config=model_config, is_training=True) def train_dataset_fn(input_context): """Callable to create train input.""" # Create the inputs. train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model, input_context=input_context) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) global_step = tf.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step', aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA) optimizer, (learning_rate,) = optimizer_builder.build( train_config.optimizer, global_step=global_step) # We run the detection_model on dummy inputs in order to ensure that the # model and all its variables have been properly constructed. Specifically, # this is currently necessary prior to (potentially) creating shadow copies # of the model variables for the EMA optimizer. if train_config.optimizer.use_moving_average: _ensure_model_is_built(detection_model, train_input, unpad_groundtruth_tensors) optimizer.shadow_copy(detection_model) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model # Get the appropriate filepath (temporary or not) based on whether the worker # is the chief. summary_writer_filepath = get_filepath(strategy, os.path.join(model_dir, 'train')) if record_summaries: summary_writer = tf.compat.v2.summary.create_file_writer( summary_writer_filepath) else: summary_writer = tf2.summary.create_noop_writer() if use_tpu: num_steps_per_iteration = 100 else: # TODO(b/135933080) Explore setting to 100 when GPU performance issues # are fixed. num_steps_per_iteration = 1 with summary_writer.as_default(): with strategy.scope(): with tf.compat.v2.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): # Load a fine-tuning checkpoint. if train_config.fine_tune_checkpoint: load_fine_tune_checkpoint( detection_model, train_config.fine_tune_checkpoint, fine_tune_checkpoint_type, fine_tune_checkpoint_version, train_config.run_fine_tune_checkpoint_dummy_computation, train_input, unpad_groundtruth_tensors) ckpt = tf.compat.v2.train.Checkpoint( step=global_step, model=detection_model, optimizer=optimizer) val_ckpt = tf.compat.v2.train.Checkpoint( step=global_step, model=detection_model, optimizer=optimizer) manager_dir = get_filepath(strategy, model_dir) val_manager_dir = get_filepath(strategy, val_checkpoint_dir) # if not strategy.extended.should_checkpoint: # checkpoint_max_to_keep = 1 checkpoint_max_to_keep = 1 manager = tf.compat.v2.train.CheckpointManager( ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep) val_manager = tf.compat.v2.train.CheckpointManager( val_ckpt, val_manager_dir, max_to_keep=checkpoint_max_to_keep) model_checkpoint_callback = tfc.ModelCheckpoint(val_manager) early_stopping_callback = tfc.EarlyStopping(min_delta=0.0001, patience=5, mode='min') train_logger_callback = tfc.TrainLogger(model_dir, 'logs.txt') cancellation_point = tfc.CancellationPoint() # We use the following instead of manager.latest_checkpoint because # manager_dir does not point to the model directory when we are running # in a worker. latest_checkpoint = tf.train.latest_checkpoint(model_dir) ckpt.restore(latest_checkpoint) val_ckpt.restore(latest_checkpoint) def train_step_fn(features, labels): """Single train step.""" loss = eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) global_step.assign_add(1) return loss def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() if hasattr(tf.distribute.Strategy, 'run'): per_replica_losses = strategy.run( train_step_fn, args=(features, labels)) else: per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=(features, labels)) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): # Following suggestion on yaqs/5402607292645376 with tf.name_scope(''): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) if int(global_step.value()) == 0: manager.save() checkpointed_step = int(global_step.value()) logged_step = global_step.value() # num_epochs = (train_steps - global_step.value()) // num_steps_per_iteration last_step_time = time.time() for epoch, _ in enumerate(range(global_step.value(), train_steps, num_steps_per_iteration)): loss = _dist_train_step(train_input_iter) time_taken = time.time() - last_step_time last_step_time = time.time() steps_per_sec = num_steps_per_iteration * 1.0 / time_taken tf.compat.v2.summary.scalar( 'steps_per_sec', steps_per_sec, step=global_step) steps_per_sec_list.append(steps_per_sec) if global_step.value() - logged_step >= 100: tf.logging.info( 'Step {} per-step time {:.3f}s loss={:.3f}'.format( global_step.value(), time_taken / num_steps_per_iteration, loss)) manager.save() checkpointed_step = int(global_step.value()) log_metrics = eval_continuously(pipeline_config_path, model_dir=model_dir, checkpoint_dir=model_dir, timeout=20) log_metrics['train_total_loss'] = loss model_checkpoint_callback.step(epoch, log_metrics['Loss/total_loss']) stop_training = early_stopping_callback.step(epoch, log_metrics['Loss/total_loss']) train_logger_callback.log(log_metrics) if stop_training or cancellation_point.check(): break print(log_metrics) logged_step = global_step.value() # Remove the checkpoint directories of the non-chief workers that # MultiWorkerMirroredStrategy forces us to save during sync distributed # training. clean_temporary_directories(strategy, manager_dir) clean_temporary_directories(strategy, summary_writer_filepath) # TODO(pkanwar): add accuracy metrics. if performance_summary_exporter is not None: metrics = { 'steps_per_sec': np.mean(steps_per_sec_list), 'steps_per_sec_p50': np.median(steps_per_sec_list), 'steps_per_sec_max': max(steps_per_sec_list), 'last_batch_loss': float(loss) } mixed_precision = 'bf16' if kwargs['use_bfloat16'] else 'fp32' performance_summary_exporter(metrics, mixed_precision)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) detection_model = detection_model_fn( is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = ( labels[fields.InputDataFields.groundtruth_boxes].get_shape() .as_list()) unpad_groundtruth_tensors = boxes_shape[1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] gt_weights_list = None if fields.InputDataFields.groundtruth_weights in labels: gt_weights_list = labels[fields.InputDataFields.groundtruth_weights] gt_confidences_list = None if fields.InputDataFields.groundtruth_confidences in labels: gt_confidences_list = labels[ fields.InputDataFields.groundtruth_confidences] gt_is_crowd_list = None if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_confidences_list=gt_confidences_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list) preprocessed_images = features[fields.InputDataFields.image] if use_tpu and train_config.use_bfloat16: with tf.contrib.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) for k, v in prediction_dict.items(): if v.dtype == tf.bfloat16: prediction_dict[k] = tf.cast(v, tf.float32) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) def postprocess_wrapper(args): return detection_model.postprocess(args[0], args[1]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if use_tpu and postprocess_on_cpu: detections = tf.contrib.tpu.outside_compilation( postprocess_wrapper, (prediction_dict, features[fields.InputDataFields.true_image_shape])) else: detections = postprocess_wrapper(( prediction_dict, features[fields.InputDataFields.true_image_shape])) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses() if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict['Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.contrib.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = ( train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = ( train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None if train_config.summarize_gradients: summaries = ['gradients', 'gradient_norm', 'global_gradient_norm'] train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: exported_output = exporter_lib.add_output_tensor_nodes(detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = ( fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[fields.InputDataFields .original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, list(category_index.values()), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) # Set policy for mixed-precision training with Keras-based models. if use_tpu and train_config.use_bfloat16: from tensorflow.python.keras.engine import base_layer_utils # pylint: disable=g-import-not-at-top # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0. base_layer_utils.enable_v2_dtype_behavior() tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config. unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes]. get_shape().as_list()) unpad_groundtruth_tensors = boxes_shape[ 1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): provide_groundtruth(detection_model, labels) preprocessed_images = features[fields.InputDataFields.image] side_inputs = detection_model.get_side_inputs(features) if use_tpu and train_config.use_bfloat16: with tf.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) prediction_dict = ops.bfloat16_to_float32_nested( prediction_dict) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) def postprocess_wrapper(args): return detection_model.postprocess(args[0], args[1]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if use_tpu and postprocess_on_cpu: detections = tf.tpu.outside_compilation( postprocess_wrapper, (prediction_dict, features[fields.InputDataFields.true_image_shape])) else: detections = postprocess_wrapper( (prediction_dict, features[fields.InputDataFields.true_image_shape])) if mode == tf.estimator.ModeKeys.TRAIN: load_pretrained = hparams.load_pretrained if hparams else False if train_config.fine_tune_checkpoint and load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): if (mode == tf.estimator.ModeKeys.EVAL and eval_config.use_dummy_loss_in_eval): total_loss = tf.constant(1.0) losses_dict = {'Loss/total_loss': total_loss} else: losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses( ) if use_tpu and train_config.use_bfloat16: regularization_losses = ops.bfloat16_to_float32_nested( regularization_losses) if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict[ 'Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = slim.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None if train_config.summarize_gradients: summaries = [ 'gradients', 'gradient_norm', 'global_gradient_norm' ] train_op = slim.optimizers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: exported_output = exporter_lib.add_output_tensor_nodes(detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[ fields.InputDataFields.original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if fields.InputDataFields.image_additional_channels in features: eval_dict[fields.InputDataFields. image_additional_channels] = features[ fields.InputDataFields.image_additional_channels] if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: keypoint_edges = [(kp.start, kp.end) for kp in eval_config.keypoint_edge] eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False, keypoint_edges=keypoint_edges or None) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, list(category_index.values()), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours ) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.estimator.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def train(create_model_fn, create_tensor_dict_fn, train_config, train_dir, img_root, video_root): detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options] gpu_num = 2 with tf.device('cpu:0'): global_step = slim.create_global_step() input_queue = _create_input_queue(train_config.batch_size*gpu_num, create_tensor_dict_fn, detection_model, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options, img_root, video_root) # inputOp = input_queue.dequeue() (InitBox1, image1, groundtruth_boxes1, groundtruth_classes1, groundtruth_masks ) = _get_inputs(input_queue) reuse_vars = False #task_lossb = [] tower_grads = [] for gpu_id in range(gpu_num): with tf.device(assign_to_device('/gpu:{}'.format(gpu_id), ps_device='/cpu:0')): _groundtruth_boxes1 = groundtruth_boxes1[gpu_id*train_config.batch_size:(gpu_id+1)*train_config.batch_size] _InitBox1 = InitBox1[gpu_id*train_config.batch_size:(gpu_id+1)*train_config.batch_size] _groundtruth_classes1 = groundtruth_classes1[gpu_id*train_config.batch_size:(gpu_id+1)*train_config.batch_size] _image1 = image1[gpu_id*train_config.batch_size:(gpu_id+1)*train_config.batch_size] detection_model = create_model_fn() task_lossa = _create_losses(_groundtruth_boxes1,_InitBox1,_groundtruth_classes1,_image1, detection_model,reuse=reuse_vars) optimizer = optimizer_builder.build(train_config.optimizer, set()) grads = optimizer.compute_gradients(task_lossa) pretrained_regex_list = ['^FeatureExtractor/MobilenetV1/Conv2d'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads, pretrained_regex_list, multiplier=0.0) pretrained_regex_list = ['^InitFeatureExtractor/MobilenetV1/Conv2d_[0-13]'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, pretrained_regex_list, multiplier=0.0) reuse_vars = True tower_grads.append(grads_and_vars) tower_grads = average_gradients(tower_grads) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) train_op = optimizer.apply_gradients(tower_grads,global_step=global_step) update_ops.append(train_op) update_op = tf.group(*update_ops) total_loss = tf.losses.get_total_loss() with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # create initial restore op init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config.from_detection_checkpoint) init_var_map = detection_model.restore_init_map( from_detection_checkpoint=train_config.from_detection_checkpoint) available_var_map = (variables_helper. get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_available_var_map = (variables_helper. get_variables_available_in_checkpoint( init_var_map, train_config.fine_tune_checkpoint)) saver = tf.train.Saver(available_var_map) init_saver = tf.train.Saver(init_available_var_map) def initializer_fn(sess): saver.restore(sess, train_config.fine_tune_checkpoint) init_saver.restore(sess, train_config.fine_tune_checkpoint) init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn # session_config.gpu_options.allow_growth = True keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours training_saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) slim.learning.train(train_tensor,logdir=train_dir,session_config=session_config, init_fn=init_fn, number_of_steps=( train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, saver=training_saver, global_step=global_step)
def train_loop(hparams, pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, export_to_tpu=None, checkpoint_every_n=1000, **kwargs): ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors use_bfloat16 = train_config.use_bfloat16 add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps # Read export_to_tpu from hparams if not passed. if export_to_tpu is None: export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) # Parse the checkpoint fine tuning configs if hparams.load_pretrained: fine_tune_checkpoint_path = train_config.fine_tune_checkpoint else: fine_tune_checkpoint_path = None load_all_detection_checkpoint_vars = ( train_config.load_all_detection_checkpoint_vars) # TODO(kaftan) (or anyone else): move this piece of config munging to ## utils/config_util.py if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type # Write the as-run pipeline config to disk. if save_final_config: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # TODO(kaftan): Either make strategy a parameter of this method, or ## grab it w/ Distribution strategy's get_scope # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.MirroredStrategy() with strategy.scope(): detection_model = model_builder.build(model_config=model_config, is_training=True) # Create the inputs. train_input = inputs.train_input(train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model) train_input = strategy.experimental_distribute_dataset( train_input.repeat()) global_step = tf.compat.v2.Variable(0, trainable=False, dtype=tf.compat.v2.dtypes.int64) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model summary_writer = tf.compat.v2.summary.create_file_writer(model_dir + '/train') with summary_writer.as_default(): with strategy.scope(): # Load a fine-tuning checkpoint. if fine_tune_checkpoint_path: load_fine_tune_checkpoint( detection_model, fine_tune_checkpoint_path, fine_tune_checkpoint_type, load_all_detection_checkpoint_vars, train_input, unpad_groundtruth_tensors, use_tpu, use_bfloat16) ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model) manager = tf.compat.v2.train.CheckpointManager(ckpt, model_dir, max_to_keep=7) ## Maybe re-enable checkpoint restoration depending on how it works: # ckpt.restore(manager.latest_checkpoint) def train_step_fn(features, labels): return eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), use_bfloat16=use_bfloat16, add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, use_tpu=use_tpu, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" features, labels = data_iterator.next() per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=( features, labels, )) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) return mean_loss train_input_iter = iter(train_input) for _ in range(train_steps): start_time = time.time() loss = _dist_train_step(train_input_iter) global_step.assign_add(1) end_time = time.time() tf.compat.v2.summary.scalar('steps_per_sec', 1.0 / (end_time - start_time), step=global_step) # TODO(kaftan): Remove this print after it is no longer helpful for ## debugging. tf.print('Finished step', global_step, end_time, loss) if int(global_step.value().numpy()) % checkpoint_every_n == 0: manager.save()
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. """ detection_model = create_model_fn() #Object for create the detection model data_augmentation_options = [ #for ssd it's ssd random crop preprocessor_builder.build( step) #random_horizontal_flip in the faster rcnn config file for step in train_config.data_augmentation_options ] with tf.Graph().as_default( ): #we need a default graph in order to create the model # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. #global step is needed to keep the records with tf.device(deploy_config.variables_device() ): #suitable device for operation +++On CPU I think global_step = slim.create_global_step( ) #created the global step tensor #The following will create an input Que images ,boxes m targets with tf.device(deploy_config.inputs_device() ): #Device to use to build the inputs ++++on CPU ?? input_queue = _create_input_queue( train_config.batch_size // num_clones, #here batch size/number_clones create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) #random_horizontal_flip # Gather initial summaries. summaries = set(tf.get_collection( tf.GraphKeys.SUMMARIES)) #vreate the summeries global_summaries = set([]) #Creating the loss model_fn = functools.partial( _create_losses, #This will create the losses , It need a object of our model as an argivement create_model_fn=create_model_fn) clones = model_deploy.create_clones( deploy_config, model_fn, [input_queue ]) #creating the clones with respect to t he input model fn first_clone_scope = clones[0].scope # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): #This is important training_optimizer = optimizer_builder.build( train_config.optimizer, #optimization global_summaries ) #will select rms_prop , Adam Here derectly we get the optimizer sync_optimizer = None if train_config.sync_replicas: training_optimizer = tf.SyncReplicasOptimizer( #This is more of synchronising the optimizer because there are repicas doing optimizing training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=train_config.worker_replicas) sync_optimizer = training_optimizer # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: #This is the checkpoint path file init_fn = detection_model.restore_fn( #Re storing the weights from the feature extractors train_config.fine_tune_checkpoint, from_detection_checkpoint=train_config. from_detection_checkpoint ) #This is more of the initializer which is re-stored from check points with tf.device(deploy_config.optimizer_device()): total_loss, grads_and_vars = model_deploy.optimize_clones( #This gives the total loss and also the grad and var pairs (Tuple) clones, training_optimizer, regularization_losses=None) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: #We have not initialized a bias gradient multiplier biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: #Here we are not freezing any may be it's good to freeze the #This will be usefult to go through the variables print("Priting the grad_and_vars to check the tuples ") print(grad_and_vars) grads_and_vars = variables_helper.freeze_gradients_matching_regex( #input to this also grads and vars which means grads_and_vars, train_config.freeze_variables) #This function will output #We are getiing gradients and of their varaibles exept the froxen list # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, #updating the gradinets list global_step=global_step) update_ops.append(grad_updates) #Here the new updated variables update_op = tf.group(*update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add( tf.summary.histogram(model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add( tf.summary.scalar(loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set( tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( #saving the checkpoints keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) slim.learning.train( #Training the network using a compact function train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, sync_optimizer=sync_optimizer, saver=saver)
def fine_tune_pruned_model(pipeline_config, train_dir, pruned_shape, pruned_weights, checkpoint_path, fine_tune =True): task = 0 if task == 0: tf.gfile.MakeDirs(train_dir) if pipeline_config: configs = config_util.get_configs_from_pipeline_file(pipeline_config) if task == 0: tf.gfile.Copy(pipeline_config, os.path.join(train_dir, 'pipeline.config'), overwrite=True) else: print("FAIL") sys.exit(1) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] train_config.fine_tune_checkpoint_type = 'detection' model_c_fn = functools.partial( model_builder.build, model_config=model_config, is_training=True, add_summaries=True, convDict=pruned_shape) def get_next(config): return dataset_util.make_initializable_iterator( dataset_builder.build(config)).get_next() create_input_dict_fn = functools.partial(get_next, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object,), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' create_model_fn = model_c_fn create_tensor_dict_fn = create_input_dict_fn pruned_graph = tf.Graph() with pruned_graph.as_default(): detection_model = model_c_fn() data_augmentation_options = [preprocessor_builder.build(step) for step in train_config.data_augmentation_options] # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=1, clone_on_cpu=False, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() with tf.device(deploy_config.inputs_device()): input_queue = create_input_queue( train_config.batch_size // 1, create_input_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(_create_losses, create_model_fn=model_c_fn, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): training_optimizer, optimizer_summary_vars = optimizer_builder.build(train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) sync_optimizer = None # Create ops required to initialize the model from a given checkpoint. init_fn = None if pruned_weights: init_assign_op, init_feed_dict = slim.assign_from_values(pruned_weights) def initializer_fn(sess): sess.run(init_assign_op, init_feed_dict) init_fn = initializer_fn with tf.device(deploy_config.optimizer_device()): regularization_losses = (None if train_config.add_regularization_loss else []) total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer, regularization_losses=regularization_losses) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients(grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add(tf.summary.histogram(model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add(tf.summary.scalar(loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) with pruned_graph.as_default(): slim.learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=( train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, sync_optimizer=sync_optimizer, saver=saver)
def train_loop(pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, checkpoint_every_n=1000, checkpoint_max_to_keep=7, record_summaries=True, performance_summary_exporter=None, num_steps_per_iteration=NUM_STEPS_PER_ITERATION, **kwargs): get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ "get_configs_from_pipeline_file"] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ "merge_external_params_with_configs"] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ "create_pipeline_proto_from_configs"] steps_per_sec_list = [] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ "train_steps": train_steps, "use_bfloat16": configs["train_config"].use_bfloat16 and use_tpu, }) configs = merge_external_params_with_configs(configs, None, kwargs_dict=kwargs) model_config = configs["model"] train_config = configs["train_config"] train_input_config = configs["train_input_config"] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps if kwargs["use_bfloat16"]: tf.compat.v2.keras.mixed_precision.set_global_policy("mixed_bfloat16") if train_config.load_all_detection_checkpoint_vars: raise ValueError( "train_pb2.load_all_detection_checkpoint_vars unsupported in TF2") config_util.update_fine_tune_checkpoint_type(train_config) fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = MODEL_BUILD_UTIL_MAP["detection_model_fn_base"]( model_config=model_config, is_training=True) def train_dataset_fn(input_context): train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model, input_context=input_context, ) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) global_step = tf.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name="global_step", aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA, ) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if train_config.optimizer.use_moving_average: _ensure_model_is_built(detection_model, train_input, unpad_groundtruth_tensors) optimizer.shadow_copy(detection_model) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate summary_writer_filepath = get_filepath(strategy, os.path.join(model_dir, "train")) # summary_writer = tf.compat.v2.summary.create_file_writer(summary_writer_filepath) summary_writer = tf2.summary.create_noop_writer() with summary_writer.as_default(): with strategy.scope(): with tf2.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): if train_config.fine_tune_checkpoint: load_fine_tune_checkpoint( detection_model, train_config.fine_tune_checkpoint, fine_tune_checkpoint_type, fine_tune_checkpoint_version, train_config. run_fine_tune_checkpoint_dummy_computation, train_input, unpad_groundtruth_tensors, ) # ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model, optimizer=optimizer) # manager_dir = get_filepath(strategy, model_dir) # manager = tf.compat.v2.train.CheckpointManager(ckpt, manager_dir, max_to_keep=1) # latest_checkpoint = tf.train.latest_checkpoint(model_dir) # ckpt.restore(latest_checkpoint) def train_step_fn(features, labels): loss = eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, global_step=global_step, num_replicas=strategy.num_replicas_in_sync, ) global_step.assign_add(1) return loss def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() if hasattr(tf.distribute.Strategy, "run"): per_replica_losses = strategy.run(train_step_fn, args=(features, labels)) else: per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=(features, labels)) return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): with tf.name_scope(""): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) checkpointed_step = int(global_step.value()) logged_step = global_step.value() last_step_time = time.time() for _ in range(global_step.value(), train_steps, num_steps_per_iteration): loss = _dist_train_step(train_input_iter) time_taken = time.time() - last_step_time last_step_time = time.time() steps_per_sec = num_steps_per_iteration * 1.0 / time_taken tf.compat.v2.summary.scalar("steps_per_sec", steps_per_sec, step=global_step) steps_per_sec_list.append(steps_per_sec) if global_step.value() - logged_step >= 100: tf.logging.info( "Step {} per-step time {:.3f}s loss={:.3f}".format( global_step.value(), time_taken / num_steps_per_iteration, loss)) logged_step = global_step.value() if (int(global_step.value()) - checkpointed_step) >= checkpoint_every_n: # manager.save() checkpointed_step = int(global_step.value()) # clean_temporary_directories(strategy, manager_dir) clean_temporary_directories(strategy, summary_writer_filepath)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: labels = unstack_batch(labels, unpad_groundtruth_tensors=False) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: asg_map = detection_model.restore_map( from_detection_checkpoint=train_config.from_detection_checkpoint, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] total_loss = tf.add_n(losses, name='total_loss') if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if use_tpu: training_optimizer = tpu_optimizer.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None if mode == tf.estimator.ModeKeys.EVAL: # Detection summaries during eval. class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _get_groundtruth_data(detection_model, class_agnostic) eval_dict = eval_util.result_dict_for_single_example( tf.expand_dims(features[fields.InputDataFields.original_image][0], 0), features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=False) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) detection_and_groundtruth = vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2) if not use_tpu: tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single image. detection_fields = fields.DetectionResultFields() input_data_fields = fields.InputDataFields() coco_evaluator = coco_evaluation.CocoDetectionEvaluator( category_index.values()) eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( image_id=eval_dict[input_data_fields.key], groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes], groundtruth_classes=eval_dict[input_data_fields.groundtruth_classes], detection_boxes=eval_dict[detection_fields.detection_boxes], detection_scores=eval_dict[detection_fields.detection_scores], detection_classes=eval_dict[detection_fields.detection_classes]) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs)
def eval_continuously( pipeline_config_path, config_override=None, train_steps=None, sample_1_of_n_eval_examples=1, sample_1_of_n_eval_on_train_examples=1, use_tpu=False, override_eval_num_epochs=True, postprocess_on_cpu=False, model_dir=None, checkpoint_dir=None, wait_interval=180, timeout=3600, eval_index=0, save_final_config=False, **kwargs): """Run continuous evaluation of a detection model eagerly. This method builds the model, and continously restores it from the most recent training checkpoint in the checkpoint directory & evaluates it on the evaluation data. Args: pipeline_config_path: A path to a pipeline config file. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. sample_1_of_n_eval_examples: Integer representing how often an eval example should be sampled. If 1, will sample all examples. sample_1_of_n_eval_on_train_examples: Similar to `sample_1_of_n_eval_examples`, except controls the sampling of training data for evaluation. use_tpu: Boolean, whether training and evaluation should run on TPU. override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for eval_input. postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true, postprocess is scheduled on the host cpu. model_dir: Directory to output resulting evaluation summaries to. checkpoint_dir: Directory that contains the training checkpoints. wait_interval: The mimmum number of seconds to wait before checking for a new checkpoint. timeout: The maximum number of seconds to wait for a checkpoint. Execution will terminate if no new checkpoints are found after these many seconds. eval_index: int, If given, only evaluate the dataset at the given index. By default, evaluates dataset at 0'th index. save_final_config: Whether to save the pipeline config file to the model directory. **kwargs: Additional keyword arguments for configuration override. """ get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] configs = get_configs_from_pipeline_file( pipeline_config_path, config_override=config_override) kwargs.update({ 'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) if train_steps is not None: kwargs['train_steps'] = train_steps if override_eval_num_epochs: kwargs.update({'eval_num_epochs': 1}) tf.logging.warning( 'Forced number of epochs for all eval validations to be 1.') configs = merge_external_params_with_configs( configs, None, kwargs_dict=kwargs) if model_dir and save_final_config: tf.logging.info('Saving pipeline config file to directory {}'.format( model_dir)) pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) model_config = configs['model'] train_input_config = configs['train_input_config'] eval_config = configs['eval_config'] eval_input_configs = configs['eval_input_configs'] eval_on_train_input_config = copy.deepcopy(train_input_config) eval_on_train_input_config.sample_1_of_n_examples = ( sample_1_of_n_eval_on_train_examples) if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1: tf.logging.warning('Expected number of evaluation epochs is 1, but ' 'instead encountered `eval_on_train_input_config' '.num_epochs` = ' '{}. Overwriting `num_epochs` to 1.'.format( eval_on_train_input_config.num_epochs)) eval_on_train_input_config.num_epochs = 1 if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') eval_input_config = eval_input_configs[eval_index] strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']( model_config=model_config, is_training=True) eval_input = strategy.experimental_distribute_dataset( inputs.eval_input( eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config, model=detection_model)) global_step = tf.compat.v2.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64) optimizer, _ = optimizer_builder.build( configs['train_config'].optimizer, global_step=global_step) for latest_checkpoint in tf.train.checkpoints_iterator( checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval): ckpt = tf.compat.v2.train.Checkpoint( step=global_step, model=detection_model, optimizer=optimizer) # We run the detection_model on dummy inputs in order to ensure that the # model and all its variables have been properly constructed. Specifically, # this is currently necessary prior to (potentially) creating shadow copies # of the model variables for the EMA optimizer. if eval_config.use_moving_averages: unpad_groundtruth_tensors = (eval_config.batch_size == 1 and not use_tpu) _ensure_model_is_built(detection_model, eval_input, unpad_groundtruth_tensors) optimizer.shadow_copy(detection_model) ckpt.restore(latest_checkpoint).expect_partial() if eval_config.use_moving_averages: optimizer.swap_weights() summary_writer = tf.compat.v2.summary.create_file_writer( os.path.join(model_dir, 'eval', eval_input_config.name)) with summary_writer.as_default(): eval_metrics = eager_eval_loop( detection_model, configs, eval_input, use_tpu=use_tpu, postprocess_on_cpu=postprocess_on_cpu, global_step=global_step, ) return eval_metrics
def train(datasets_dicts, epochs, val_every, iters_cnt, validate_with_eval_model, pipeline_config, num_clones=1, save_cback=None, is_transfer_learning=False): logger.info('Start train') configs = configs_from_pipeline(pipeline_config) model_config = configs['model'] train_config = configs['train_config'] create_model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) detection_model = create_model_fn() def get_next(dataset): return dataset_util.make_initializable_iterator( build_dataset(dataset)).get_next() create_tensor_dict_fn = functools.partial(get_next, datasets_dicts['train']) create_tensor_dict_fn_val = functools.partial(get_next, datasets_dicts['val']) data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=4, clone_on_cpu=False, replica_id=0, num_replicas=1, num_ps_tasks=0, worker_job_name='lonely_worker') # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() with tf.device(deploy_config.inputs_device()): coord = coordinator.Coordinator() input_queue = create_input_queue( train_config.batch_size, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) input_queue_val = create_input_queue( train_config.batch_size, create_tensor_dict_fn_val, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # create validation graph create_model_fn_val = functools.partial( model_builder.build, model_config=model_config, is_training=not validate_with_eval_model) with tf.device(deploy_config.optimizer_device()): training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var, family='LearningRate') train_losses = [] grads_and_vars = [] with slim.arg_scope([slim.model_variable, slim.variable], device='/device:CPU:0'): for curr_dev_id in range(num_clones): with tf.device('/gpu:{}'.format(curr_dev_id)): with tf.name_scope( 'clone_{}'.format(curr_dev_id)) as scope: with tf.variable_scope( tf.get_variable_scope(), reuse=True if curr_dev_id > 0 else None): losses = _create_losses_val( input_queue, create_model_fn, train_config) clones_loss = tf.add_n(losses) clones_loss = tf.divide(clones_loss, 1.0 * num_clones) grads = training_optimizer.compute_gradients( clones_loss) train_losses.append(clones_loss) grads_and_vars.append(grads) if curr_dev_id == 0: update_ops = tf.get_collection( tf.GraphKeys.UPDATE_OPS) val_total_loss = get_val_loss(num_clones, input_queue_val, create_model_fn_val, train_config) with tf.device(deploy_config.optimizer_device()): total_loss = tf.add_n(train_losses) grads_and_vars = model_deploy._sum_clones_gradients(grads_and_vars) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) coord.clear_stop() sess = tf.Session(config=config) saver = tf.train.Saver() graph = ops.get_default_graph() with graph.as_default(): with ops.name_scope('init_ops'): init_op = variables.global_variables_initializer() ready_op = variables.report_uninitialized_variables() local_init_op = control_flow_ops.group( variables.local_variables_initializer(), lookup_ops.tables_initializer()) # graph.finalize() sess.run([init_op, ready_op, local_init_op]) queue_runners = graph.get_collection(ops.GraphKeys.QUEUE_RUNNERS) threads = [] for qr in queue_runners: threads.extend( qr.create_threads(sess, coord=coord, daemon=True, start=True)) logger.info('Start restore') if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars and (not is_transfer_learning))) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) if 'global_step' in available_var_map: del available_var_map['global_step'] init_saver = tf.train.Saver(available_var_map) logger.info('Restoring model weights from previous checkpoint.') init_saver.restore(sess, train_config.fine_tune_checkpoint) logger.info('Model restored.') eval_planner = EvalPlanner(epochs, val_every) progress = sly.Progress('Model training: ', epochs * iters_cnt['train']) best_val_loss = float('inf') epoch_flt = 0 for epoch in range(epochs): logger.info("Before new epoch", extra={'epoch': epoch_flt}) for train_it in range(iters_cnt['train']): total_loss, np_global_step = sess.run( [train_tensor, global_step]) metrics_values_train = { 'loss': total_loss, } progress.iter_done_report() epoch_flt = epoch_float(epoch, train_it + 1, iters_cnt['train']) sly.report_metrics_training(epoch_flt, metrics_values_train) if eval_planner.need_validation(epoch_flt): logger.info("Before validation", extra={'epoch': epoch_flt}) overall_val_loss = 0 for val_it in range(iters_cnt['val']): overall_val_loss += sess.run(val_total_loss) logger.info("Validation in progress", extra={ 'epoch': epoch_flt, 'val_iter': val_it, 'val_iters': iters_cnt['val'] }) metrics_values_val = { 'loss': overall_val_loss / iters_cnt['val'], } sly.report_metrics_validation(epoch_flt, metrics_values_val) logger.info("Validation has been finished", extra={'epoch': epoch_flt}) eval_planner.validation_performed() val_loss = metrics_values_val['loss'] model_is_best = val_loss < best_val_loss if model_is_best: best_val_loss = val_loss logger.info( 'It\'s been determined that current model is the best one for a while.' ) save_cback(saver, sess, model_is_best, opt_data={ 'epoch': epoch_flt, 'val_metrics': metrics_values_val, }) logger.info("Epoch was finished", extra={'epoch': epoch_flt}) coord.request_stop() coord.join(threads)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir, train_steps, to_keep, save_steps, graph_hook_fn=None): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. train_steps: Number of training steps to_keep: Number of checkpoints to keep save_steps: Save after every n seconds graph_hook_fn: Optional function that is called after the inference graph is built (before optimization). This is helpful to perform additional changes to the training graph such as adding FakeQuant ops. The function should modify the default graph. Raises: ValueError: If both num_clones > 1 and train_config.sync_replicas is true. """ detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() if num_clones != 1 and train_config.sync_replicas: raise ValueError('In Synchronous SGD mode num_clones must ', 'be 1. Found num_clones: {}'.format(num_clones)) batch_size = train_config.batch_size // num_clones if train_config.sync_replicas: batch_size //= train_config.replicas_to_aggregate with tf.device(deploy_config.inputs_device()): input_queue = create_input_queue( batch_size, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # Gather initial summaries. # TODO(rathodv): See if summaries can be added/extracted from global tf # collections so that they don't have to be passed around. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(_create_losses, create_model_fn=create_model_fn, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope if graph_hook_fn: with tf.device(deploy_config.variables_device()): graph_hook_fn() # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var, family='LearningRate') sync_optimizer = None if train_config.sync_replicas: training_optimizer = tf.train.SyncReplicasOptimizer( training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=worker_replicas) sync_optimizer = training_optimizer with tf.device(deploy_config.optimizer_device()): regularization_losses = ( None if train_config.add_regularization_loss else []) total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer, regularization_losses=regularization_losses) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add( tf.summary.histogram('ModelVars/' + model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add( tf.summary.scalar('Losses/' + loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('Losses/TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set( tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, max_to_keep=to_keep) # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, fine_tune_checkpoint_type is set based on # from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint, include_global_step=False)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn train_config.num_steps = train_steps slim.learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, save_interval_secs=save_steps, sync_optimizer=sync_optimizer, saver=saver)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = ( labels[fields.InputDataFields.groundtruth_boxes].get_shape() .as_list()) unpad_groundtruth_tensors = True if boxes_shape[1] is not None else False labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=labels[ fields.InputDataFields.groundtruth_weights], groundtruth_is_crowd_list=gt_is_crowd_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] if train_config.add_regularization_loss: regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) if regularization_losses: regularization_loss = tf.add_n(regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict['Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.contrib.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic) use_original_images = fields.InputDataFields.original_image in features eval_images = ( features[fields.InputDataFields.original_image] if use_original_images else features[fields.InputDataFields.image]) eval_dict = eval_util.result_dict_for_single_example( eval_images[0:1], features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) img_summary = None if not use_tpu and use_original_images: detection_and_groundtruth = ( vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2, use_normalized_coordinates=False)) img_summary = tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single example. eval_metrics = eval_config.metrics_set if not eval_metrics: eval_metrics = ['coco_detection_metrics'] eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_metrics, category_index.values(), eval_dict, include_metrics_per_category=eval_config.include_metrics_per_category) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if img_summary is not None: eval_metric_ops['Detections_Left_Groundtruth_Right'] = ( img_summary, tf.no_op()) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) scaffold = tf.train.Scaffold(saver=saver) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config. unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes]. get_shape().as_list()) unpad_groundtruth_tensors = True if boxes_shape[ 1] is not None else False labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[ fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[ fields.InputDataFields.groundtruth_keypoints] gt_weights_list = None if fields.InputDataFields.groundtruth_weights in labels: gt_weights_list = labels[ fields.InputDataFields.groundtruth_weights] if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[ fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) if regularization_losses: regularization_loss = tf.add_n(regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict[ 'Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.contrib.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic) use_original_images = fields.InputDataFields.original_image in features eval_images = (features[fields.InputDataFields.original_image] if use_original_images else features[fields.InputDataFields.image]) eval_dict = eval_util.result_dict_for_single_example( eval_images[0:1], features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) img_summary = None if not use_tpu and use_original_images: detection_and_groundtruth = ( vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2, use_normalized_coordinates=False)) img_summary = tf.summary.image( 'Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, category_index.values(), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if img_summary is not None: eval_metric_ops['Detections_Left_Groundtruth_Right'] = ( img_summary, tf.no_op()) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours ) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def train(create_model_fn, create_tensor_dict_fn, train_config, train_dir, img_root): detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.device('cpu:0'): global_step = slim.create_global_step() input_queue = _create_input_queue(train_config.batch_size, create_tensor_dict_fn, detection_model, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options, img_root) with tf.device('gpu:0'): _create_losses(input_queue, create_model_fn) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) training_optimizer = optimizer_builder.build(train_config.optimizer, set()) # create initial restore op init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config.from_detection_checkpoint) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn # loss and grads total_loss = tf.losses.get_total_loss() grads_and_vars = training_optimizer.compute_gradients( total_loss, tf.trainable_variables()) # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients(grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # create summary summaries = set() for loss_tensor in tf.losses.get_losses(): summaries.add(tf.summary.scalar(loss_tensor.op.name, loss_tensor)) summaries.add(tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) summary_op = tf.summary.merge(list(summaries), name='summary_op') session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # session_config.gpu_options.allow_growth = True keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) slim.learning.train(train_tensor, logdir=train_dir, session_config=session_config, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, saver=saver)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir, graph_hook_fn=None): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. graph_hook_fn: Optional function that is called after the training graph is completely built. This is helpful to perform additional changes to the training graph such as optimizing batchnorm. The function should modify the default graph. """ detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() with tf.device(deploy_config.inputs_device()): input_queue = create_input_queue( train_config.batch_size // num_clones, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # Gather initial summaries. # TODO(rathodv): See if summaries can be added/extracted from global tf # collections so that they don't have to be passed around. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(_create_losses, create_model_fn=create_model_fn, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var, family='LearningRate') sync_optimizer = None if train_config.sync_replicas: training_optimizer = tf.train.SyncReplicasOptimizer( training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=worker_replicas) sync_optimizer = training_optimizer with tf.device(deploy_config.optimizer_device()): regularization_losses = (None if train_config.add_regularization_loss else []) total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer, regularization_losses=regularization_losses) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients(grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') if graph_hook_fn: with tf.device(deploy_config.variables_device()): graph_hook_fn() # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add(tf.summary.histogram('ModelVars/' + model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add(tf.summary.scalar('Losses/' + loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('Losses/TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, fine_tune_checkpoint_type is set based on # from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = (variables_helper. get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn slim.learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=( train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, sync_optimizer=sync_optimizer, saver=saver)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None scaffold = None eval_metric_ops = None if mode == tf.estimator.ModeKeys.TRAIN: # get the optimizer and global step: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) #get the trainable variables #trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) #get the clip_gradients_value clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm total_loss = 0. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): feature_list, label_list = split_features_and_labels( features, labels, train_config.GPU_num) for i in xrange(train_config.GPU_num): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('tower', i)) as scope: loss = tower_loss(scope=scope, features=feature_list[i], labels=label_list[i], detection_model=detection_model, train_config=train_config) tf.get_variable_scope().reuse_variables() grads = training_optimizer.compute_gradients( loss=loss) if isinstance(clip_gradients_value, float): grads = clip_gradients_by_norm( grads, clip_gradients_value) tower_grads.append(grads) total_loss += loss total_loss /= train_config.GPU_num grad_avg = average_gradients(tower_grads) with tf.control_dependencies( tf.get_collection(tf.GraphKeys.UPDATE_OPS)): apply_gradient_op = training_optimizer.apply_gradients( grads_and_vars=grad_avg, global_step=global_step) train_op = apply_gradient_op if train_config.fine_tune_checkpoint: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) elif mode == tf.estimator.ModeKeys.EVAL: detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. #in mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.PREDICT, I explictly set the evaluation and prediction to run on CPU with tf.device('/cpu:1'): # training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer ) boxes_shape = (labels[fields.InputDataFields. groundtruth_boxes].get_shape().as_list()) unpad_groundtruth_tensors = boxes_shape[ 1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) gt_boxes_list = labels[ fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[ fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[ fields.InputDataFields.groundtruth_keypoints] gt_weights_list = None if fields.InputDataFields.groundtruth_weights in labels: gt_weights_list = labels[ fields.InputDataFields.groundtruth_weights] gt_confidences_list = None if fields.InputDataFields.groundtruth_confidences in labels: gt_confidences_list = labels[ fields.InputDataFields.groundtruth_confidences] gt_is_crowd_list = None if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[ fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_confidences_list=gt_confidences_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list) training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) preprocessed_images = features[fields.InputDataFields.image] if use_tpu and train_config.use_bfloat16: with tf.contrib.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) for k, v in prediction_dict.items(): if v.dtype == tf.bfloat16: prediction_dict[k] = tf.cast(v, tf.float32) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses( ) if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict['Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() class_agnostic = ( fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[ fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[ fields.InputDataFields.original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config. max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, category_index.values(), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = { str(k): v for k, v in eval_metric_ops.items() } if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore( ) keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver(variables_to_restore, keep_checkpoint_every_n_hours= keep_checkpoint_every_n_hours) scaffold = tf.train.Scaffold(saver=saver) elif mode == tf.estimator.ModeKeys.PREDICT: detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) #similar to EVAL mode, I run PREDICT on CPU too. with tf.device(':/cpu:1'): preprocessed_images = features[fields.InputDataFields.image] if use_tpu and train_config.use_bfloat16: with tf.contrib.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) for k, v in prediction_dict.items(): if v.dtype == tf.bfloat16: prediction_dict[k] = tf.cast(v, tf.float32) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) exported_output = exporter_lib.add_output_tensor_nodes( detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: #scafold here only contains Saver if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. """ detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = tf.train.create_global_step() with tf.device(deploy_config.inputs_device()): input_queue = create_input_queue( train_config.batch_size // num_clones, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # Gather initial summaries. # TODO(rathodv): See if summaries can be added/extracted from global tf # collections so that they don't have to be passed around. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(_create_losses, create_model_fn=create_model_fn, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): training_optimizer = optimizer_builder.build( train_config.optimizer, global_summaries) sync_optimizer = None if train_config.sync_replicas: training_optimizer = tf.SyncReplicasOptimizer( training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=train_config.worker_replicas) sync_optimizer = training_optimizer # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config. from_detection_checkpoint) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn with tf.device(deploy_config.optimizer_device()): total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer, regularization_losses=None) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add( tf.summary.histogram(model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add( tf.summary.scalar(loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set( tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) session_config.gpu_options.allow_growth = True # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) slim.learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, sync_optimizer=sync_optimizer, saver=saver)
summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(trainer._create_losses, create_model_fn=model, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) print(len(clones)) first_clone_scope = clones[0].scope update_ops = [] with tf.device(deploy_config.optimizer_device()): #momentum optimizer va summaries training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) #restore checkpoint. init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config. from_detection_checkpoint) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess):