def load_fine_tune_checkpoint(model, checkpoint_path, checkpoint_type, load_all_detection_checkpoint_vars, input_dataset, unpad_groundtruth_tensors, use_tpu, use_bfloat16): features, labels = iter(input_dataset).next() def _dummy_computation_fn(features, labels): model._is_training = False # pylint: disable=protected-access tf.keras.backend.set_learning_phase(False) labels = model_lib.unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) return _compute_losses_and_predictions_dicts(model, features, labels, use_tpu=use_tpu, use_bfloat16=use_bfloat16) strategy = tf.compat.v2.distribute.get_strategy() strategy.experimental_run_v2(_dummy_computation_fn, args=( features, labels, )) var_map = model.restore_map(fine_tune_checkpoint_type=checkpoint_type, load_all_detection_checkpoint_vars=( load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, checkpoint_path, include_global_step=False)) tf.train.init_from_checkpoint(checkpoint_path, available_var_map)
def restore_fn(self, checkpoint_path, from_detection_checkpoint=True): """Return callable for loading a checkpoint into the tensorflow graph. Args: checkpoint_path: path to checkpoint to restore. from_detection_checkpoint: whether to restore from a full detection checkpoint (with compatible variable names) or to restore from a classification checkpoint for initialization prior to training. Returns: a callable which takes a tf.Session as input and loads a checkpoint when run. """ variables_to_restore = {} for variable in tf.all_variables(): if variable.op.name.startswith(self._extract_features_scope): var_name = variable.op.name if not from_detection_checkpoint: var_name = ( re.split('^' + self._extract_features_scope + '/', var_name)[-1]) variables_to_restore[var_name] = variable # TODO: Load variables selectively using scopes. variables_to_restore = ( variables_helper.get_variables_available_in_checkpoint( variables_to_restore, checkpoint_path)) saver = tf.train.Saver(variables_to_restore) def restore(sess): saver.restore(sess, checkpoint_path) return restore
def test_return_variables_with_correct_sizes(self): checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt') with tf.Graph().as_default(): bias_variable = tf.Variable(3.0, name='biases') global_step = tf.train.get_or_create_global_step() graph1_variables = [ tf.Variable([[1.0, 2.0], [3.0, 4.0]], name='weights'), bias_variable, global_step ] init_op = tf.global_variables_initializer() saver = tf.train.Saver(graph1_variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) with tf.Graph().as_default(): graph2_variables = [ tf.Variable([1.0, 2.0], name='weights'), # New variable shape. bias_variable, global_step ] out_variables = variables_helper.get_variables_available_in_checkpoint( graph2_variables, checkpoint_path, include_global_step=True) self.assertItemsEqual(out_variables, [bias_variable, global_step])
def restore_fn(self, checkpoint_path, from_detection_checkpoint=True): """Return callable for loading a checkpoint into the tensorflow graph. Args: checkpoint_path: path to checkpoint to restore. from_detection_checkpoint: whether to restore from a full detection checkpoint (with compatible variable names) or to restore from a classification checkpoint for initialization prior to training. Returns: a callable which takes a tf.Session as input and loads a checkpoint when run. """ variables_to_restore = {} for variable in tf.all_variables(): #all the variacles #We only re_stor variables names started with feature extractor . No others this is good if variable.op.name.startswith(self._extract_features_scope): #Select the variables with the feature_extractor title var_name = variable.op.name #This means we will extract the variables scoped by the feature extrators if not from_detection_checkpoint: #Here we are detecting from a detection network var_name = ( ('^' + self._extract_features_scope + '/', var_name)[-1]) #We don't restore weights for classification chckpoints variables_to_restore[var_name] = variable # TODO: Load variables selectively using scopes. variables_to_restore = ( variables_helper.get_variables_available_in_checkpoint( variables_to_restore, checkpoint_path)) #In the SSD we re store the weights for feature extractors saver = tf.train.Saver(variables_to_restore) def restore(sess): saver.restore(sess, checkpoint_path) return restore re.split
def load_fine_tune_checkpoint(model, checkpoint_path, checkpoint_type, load_all_detection_checkpoint_vars, input_dataset, unpad_groundtruth_tensors, use_tpu, use_bfloat16): """Load a fine tuning classification or detection checkpoint. To make sure the model variables are all built, this method first executes the model by computing a dummy loss. (Models might not have built their variables before their first execution) It then loads a variable-name based classification or detection checkpoint that comes from converted TF 1.x slim model checkpoints. This method updates the model in-place and does not return a value. Args: model: A DetectionModel (based on Keras) to load a fine-tuning checkpoint for. checkpoint_path: Directory with checkpoints file or path to checkpoint. checkpoint_type: Whether to restore from a full detection checkpoint (with compatible variable names) or to restore from a classification checkpoint for initialization prior to training. Valid values: `detection`, `classification`. load_all_detection_checkpoint_vars: whether to load all variables (when `fine_tune_checkpoint_type` is `detection`). If False, only variables within the feature extractor scopes are included. Default False. input_dataset: The tf.data Dataset the model is being trained on. Needed to get the shapes for the dummy loss computation. unpad_groundtruth_tensors: A parameter passed to unstack_batch. use_tpu: Whether computation should happen on a TPU. use_bfloat16: Whether computation on a TPU should use bfloat16. """ features, labels = iter(input_dataset).next() def _dummy_computation_fn(features, labels): model._is_training = False # pylint: disable=protected-access tf.keras.backend.set_learning_phase(False) labels = model_lib.unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) return _compute_losses_and_predictions_dicts(model, features, labels, use_tpu=use_tpu, use_bfloat16=use_bfloat16) strategy = tf.compat.v2.distribute.get_strategy() strategy.experimental_run_v2(_dummy_computation_fn, args=( features, labels, )) var_map = model.restore_map(fine_tune_checkpoint_type=checkpoint_type, load_all_detection_checkpoint_vars=( load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, checkpoint_path, include_global_step=False)) tf.train.init_from_checkpoint(checkpoint_path, available_var_map)
def model_fn(self, features, labels, mode): """Define Faster R-CNN model_fn used by TensorFlow Estimator.""" logging.info('Faster R-CNN model function action') self.model = self.trainer.model self.config = self.trainer.config predict_result_dict = self.model(features, labels, mode == tf.estimator.ModeKeys.TRAIN) self.fine_tune_checkpoint_type = self.config.fine_tune_checkpoint_type self.load_all_detection_checkpoint_vars = True asg_map = self.model.restore_map( fine_tune_checkpoint_type=self.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( self.load_all_detection_checkpoint_vars)) self.fine_tune_checkpoint = self.config.fine_tune_checkpoint available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, self.fine_tune_checkpoint, include_global_step=False)) tf.train.init_from_checkpoint(self.fine_tune_checkpoint, available_var_map) losses_dict = self.model.loss( predict_result_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] total_loss = tf.add_n(losses, name='total_loss') train_op = None if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() self.optimizer, self.optimizer_summary_vars = TFOptimizer( self.config.optimizer).get_real_optimizer(global_step) trainable_variables = None trainable_variables = slim.filter_variables( tf.trainable_variables()) clip_gradients_value = None summaries = None train_op = slim.optimizers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=self.optimizer, update_ops=self.model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. eval_metric_ops = None if mode == tf.estimator.ModeKeys.EVAL: eval_metric_ops = self.valid_metrics(predict_result_dict, labels) return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops)
def test_return_all_variables_from_checkpoint(self): variables = [ tf.Variable(1.0, name='weights'), tf.Variable(1.0, name='biases') ] checkpoint_path = os.path.join(self.get_temp_dir(), 'graph.pb') init_op = tf.global_variables_initializer() saver = tf.train.Saver(variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) out_variables = variables_helper.get_variables_available_in_checkpoint( variables, checkpoint_path) self.assertItemsEqual(out_variables, variables)
def get_restore_checkpoint_ops(restore_checkpoints, detection_model, train_config): """Restore checkpoint from saved checkpoints. Args: restore_checkpoints: loaded checkpoints. detection_model: Object detection model built from config file. train_config: a train_pb2.TrainConfig protobuf. Returns: restorers: A list ops to init the model from checkpoints. """ restorers = [] vars_restored = [] for restore_checkpoint in restore_checkpoints: var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, restore_checkpoint)) for var_name, var in six.iteritems(available_var_map): if var in vars_restored: tf.logging.info( 'Variable %s contained in multiple checkpoints', var.op.name) del available_var_map[var_name] else: vars_restored.append(var) # Initialize from ExponentialMovingAverages if possible. available_ema_var_map = {} ckpt_reader = tf.train.NewCheckpointReader(restore_checkpoint) ckpt_vars_to_shape_map = ckpt_reader.get_variable_to_shape_map() for var_name, var in six.iteritems(available_var_map): var_name_ema = var_name + '/ExponentialMovingAverage' if var_name_ema in ckpt_vars_to_shape_map: available_ema_var_map[var_name_ema] = var else: available_ema_var_map[var_name] = var available_var_map = available_ema_var_map init_saver = tf.train.Saver(available_var_map) if list(available_var_map.keys()): restorers.append(init_saver) else: tf.logging.info( 'WARNING: Checkpoint %s has no restorable variables', restore_checkpoint) return restorers
def test_return_variables_available_in_checkpoint(self): checkpoint_path = os.path.join(self.get_temp_dir(), 'graph.pb') weight_variable = tf.Variable(1.0, name='weights') global_step = tf.train.get_or_create_global_step() graph1_variables = [weight_variable, global_step] init_op = tf.global_variables_initializer() saver = tf.train.Saver(graph1_variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) graph2_variables = graph1_variables + [tf.Variable(1.0, name='biases')] out_variables = variables_helper.get_variables_available_in_checkpoint( graph2_variables, checkpoint_path, include_global_step=False) self.assertItemsEqual(out_variables, [weight_variable])
def restore_from_classification_checkpoint_fn( self, checkpoint_path, first_stage_feature_extractor_scope, second_stage_feature_extractor_scope): """Returns callable for loading a checkpoint into the tensorflow graph. Note that this overrides the default implementation in faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for InceptionResnetV2 checkpoints. TODO: revisit whether it's possible to force the `Repeat` namescope as created in `_extract_box_classifier_features` to start counting at 2 (e.g. `Repeat_2`) so that the default restore_fn can be used. Args: checkpoint_path: Path to checkpoint to restore. first_stage_feature_extractor_scope: A scope name for the first stage feature extractor. second_stage_feature_extractor_scope: A scope name for the second stage feature extractor. Returns: a callable which takes a tf.Session as input and loads a checkpoint when run. """ variables_to_restore = {} for variable in tf.global_variables(): if variable.op.name.startswith( first_stage_feature_extractor_scope): var_name = variable.op.name.replace( first_stage_feature_extractor_scope + '/', '') variables_to_restore[var_name] = variable if variable.op.name.startswith( second_stage_feature_extractor_scope): var_name = variable.op.name.replace( second_stage_feature_extractor_scope + '/InceptionResnetV2/Repeat', 'InceptionResnetV2/Repeat_2') var_name = var_name.replace( second_stage_feature_extractor_scope + '/', '') variables_to_restore[var_name] = variable variables_to_restore = ( variables_helper.get_variables_available_in_checkpoint( variables_to_restore, checkpoint_path)) saver = tf.train.Saver(variables_to_restore) def restore(sess): saver.restore(sess, checkpoint_path) return restore
def get_restore_checkpoint_ops(restore_checkpoints, detection_model, train_config): """Restore checkpoint from saved checkpoints. Args: restore_checkpoints: loaded checkpoints. detection_model: Object detection model built from config file. train_config: a train_pb2.TrainConfig protobuf. Returns: restorers: A list ops to init the model from checkpoints. """ restorers = [] vars_restored = [] for restore_checkpoint in restore_checkpoints: var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, restore_checkpoint)) for var_name, var in available_var_map.iteritems(): if var in vars_restored: logging.info('Variable %s contained in multiple checkpoints', var.op.name) del available_var_map[var_name] else: vars_restored.append(var) # Initialize from ExponentialMovingAverages if possible. available_ema_var_map = {} ckpt_reader = tf.train.NewCheckpointReader(restore_checkpoint) ckpt_vars_to_shape_map = ckpt_reader.get_variable_to_shape_map() for var_name, var in available_var_map.iteritems(): var_name_ema = var_name + '/ExponentialMovingAverage' if var_name_ema in ckpt_vars_to_shape_map: available_ema_var_map[var_name_ema] = var else: available_ema_var_map[var_name] = var available_var_map = available_ema_var_map init_saver = tf.train.Saver(available_var_map) if available_var_map.keys(): restorers.append(init_saver) else: logging.info('WARNING: Checkpoint %s has no restorable variables', restore_checkpoint) return restorers
def _get_mtl_init_saver(scope_name): _var_map = detection_model._feature_extractor.mtl_restore_from_classification_checkpoint_fn( scope_name) if train_config.from_detection_checkpoint: _var_map_new = dict() for name, val in _var_map.iteritems(): _var_map_new[detection_model. second_stage_feature_extractor_scope + '/' + name] = val _var_map = _var_map_new _available_var_map = ( variables_helper.get_variables_available_in_checkpoint( _var_map, train_config.fine_tune_checkpoint)) if _available_var_map: return tf.train.Saver(_available_var_map) else: return None
def test_return_all_variables_from_checkpoint_with_partition(self): with tf.Graph().as_default(): partitioner = tf.fixed_size_partitioner(2) variables = [ tf.get_variable( name='weights', shape=(2, 2), partitioner=partitioner), tf.Variable([1.0, 2.0], name='biases') ] checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt') init_op = tf.global_variables_initializer() saver = tf.train.Saver(variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) out_variables = variables_helper.get_variables_available_in_checkpoint( variables, checkpoint_path) self.assertItemsEqual(out_variables, variables)
def test_return_variables_available_an_checkpoint_with_dict_inputs(self): checkpoint_path = os.path.join(self.get_temp_dir(), 'graph.pb') graph1_variables = [ tf.Variable(1.0, name='ckpt_weights'), ] init_op = tf.global_variables_initializer() saver = tf.train.Saver(graph1_variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) graph2_variables_dict = { 'ckpt_weights': tf.Variable(1.0, name='weights'), 'ckpt_biases': tf.Variable(1.0, name='biases') } out_variables = variables_helper.get_variables_available_in_checkpoint( graph2_variables_dict, checkpoint_path) self.assertTrue(isinstance(out_variables, dict)) self.assertItemsEqual(out_variables.keys(), ['ckpt_weights']) self.assertTrue(out_variables['ckpt_weights'].op.name == 'weights')
def test_return_variables_available_in_checkpoint(self): checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt') with tf.Graph().as_default(): weight_variable = tf.Variable(1.0, name='weights') global_step = tf.train.get_or_create_global_step() graph1_variables = [ weight_variable, global_step ] init_op = tf.global_variables_initializer() saver = tf.train.Saver(graph1_variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) with tf.Graph().as_default(): graph2_variables = graph1_variables + [tf.Variable(1.0, name='biases')] out_variables = variables_helper.get_variables_available_in_checkpoint( graph2_variables, checkpoint_path, include_global_step=False) self.assertItemsEqual(out_variables, [weight_variable])
def test_return_variables_available_an_checkpoint_with_dict_inputs(self): checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt') with tf.Graph().as_default(): graph1_variables = [ tf.Variable(1.0, name='ckpt_weights'), ] init_op = tf.global_variables_initializer() saver = tf.train.Saver(graph1_variables) with self.test_session() as sess: sess.run(init_op) saver.save(sess, checkpoint_path) with tf.Graph().as_default(): graph2_variables_dict = { 'ckpt_weights': tf.Variable(1.0, name='weights'), 'ckpt_biases': tf.Variable(1.0, name='biases') } out_variables = variables_helper.get_variables_available_in_checkpoint( graph2_variables_dict, checkpoint_path) self.assertIsInstance(out_variables, dict) self.assertCountEqual(list(out_variables.keys()), ['ckpt_weights']) self.assertEqual(out_variables['ckpt_weights'].op.name, 'weights')
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = ( labels[fields.InputDataFields.groundtruth_boxes].get_shape() .as_list()) unpad_groundtruth_tensors = True if boxes_shape[1] is not None else False labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=labels[ fields.InputDataFields.groundtruth_weights], groundtruth_is_crowd_list=gt_is_crowd_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] if train_config.add_regularization_loss: regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) if regularization_losses: regularization_loss = tf.add_n(regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict['Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.contrib.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic) use_original_images = fields.InputDataFields.original_image in features eval_images = ( features[fields.InputDataFields.original_image] if use_original_images else features[fields.InputDataFields.image]) eval_dict = eval_util.result_dict_for_single_example( eval_images[0:1], features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) img_summary = None if not use_tpu and use_original_images: detection_and_groundtruth = ( vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2, use_normalized_coordinates=False)) img_summary = tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single example. eval_metrics = eval_config.metrics_set if not eval_metrics: eval_metrics = ['coco_detection_metrics'] eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_metrics, category_index.values(), eval_dict, include_metrics_per_category=eval_config.include_metrics_per_category) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if img_summary is not None: eval_metric_ops['Detections_Left_Groundtruth_Right'] = ( img_summary, tf.no_op()) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) scaffold = tf.train.Scaffold(saver=saver) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: labels = unstack_batch(labels, unpad_groundtruth_tensors=False) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: asg_map = detection_model.restore_map( from_detection_checkpoint=train_config.from_detection_checkpoint, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] total_loss = tf.add_n(losses, name='total_loss') if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if use_tpu: training_optimizer = tpu_optimizer.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None if mode == tf.estimator.ModeKeys.EVAL: # Detection summaries during eval. class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _get_groundtruth_data(detection_model, class_agnostic) eval_dict = eval_util.result_dict_for_single_example( tf.expand_dims(features[fields.InputDataFields.original_image][0], 0), features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=False) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) detection_and_groundtruth = vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2) if not use_tpu: tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single image. detection_fields = fields.DetectionResultFields() input_data_fields = fields.InputDataFields() coco_evaluator = coco_evaluation.CocoDetectionEvaluator( category_index.values()) eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( image_id=eval_dict[input_data_fields.key], groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes], groundtruth_classes=eval_dict[input_data_fields.groundtruth_classes], detection_boxes=eval_dict[detection_fields.detection_boxes], detection_scores=eval_dict[detection_fields.detection_scores], detection_classes=eval_dict[detection_fields.detection_classes]) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config. unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes]. get_shape().as_list()) unpad_groundtruth_tensors = True if boxes_shape[ 1] is not None else False labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[ fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[ fields.InputDataFields.groundtruth_keypoints] gt_weights_list = None if fields.InputDataFields.groundtruth_weights in labels: gt_weights_list = labels[ fields.InputDataFields.groundtruth_weights] if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[ fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) if regularization_losses: regularization_loss = tf.add_n(regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict[ 'Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.contrib.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic) use_original_images = fields.InputDataFields.original_image in features eval_images = (features[fields.InputDataFields.original_image] if use_original_images else features[fields.InputDataFields.image]) eval_dict = eval_util.result_dict_for_single_example( eval_images[0:1], features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) img_summary = None if not use_tpu and use_original_images: detection_and_groundtruth = ( vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2, use_normalized_coordinates=False)) img_summary = tf.summary.image( 'Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, category_index.values(), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if img_summary is not None: eval_metric_ops['Detections_Left_Groundtruth_Right'] = ( img_summary, tf.no_op()) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours ) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
update_ops = [] with tf.device(deploy_config.optimizer_device()): #momentum optimizer va summaries training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) #restore checkpoint. init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config. from_detection_checkpoint) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn with tf.device(deploy_config.optimizer_device()): regularization_losses = ( None if train_config.add_regularization_loss else []) #print('regularization_losses', regularization_losses) #tf.add_n sumloss va gradient cho tung variable. total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer,
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None scaffold = None eval_metric_ops = None if mode == tf.estimator.ModeKeys.TRAIN: # get the optimizer and global step: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) #get the trainable variables #trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) #get the clip_gradients_value clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm total_loss = 0. tower_grads = [] with tf.variable_scope(tf.get_variable_scope()): feature_list, label_list = split_features_and_labels( features, labels, train_config.GPU_num) for i in xrange(train_config.GPU_num): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % ('tower', i)) as scope: loss = tower_loss(scope=scope, features=feature_list[i], labels=label_list[i], detection_model=detection_model, train_config=train_config) tf.get_variable_scope().reuse_variables() grads = training_optimizer.compute_gradients( loss=loss) if isinstance(clip_gradients_value, float): grads = clip_gradients_by_norm( grads, clip_gradients_value) tower_grads.append(grads) total_loss += loss total_loss /= train_config.GPU_num grad_avg = average_gradients(tower_grads) with tf.control_dependencies( tf.get_collection(tf.GraphKeys.UPDATE_OPS)): apply_gradient_op = training_optimizer.apply_gradients( grads_and_vars=grad_avg, global_step=global_step) train_op = apply_gradient_op if train_config.fine_tune_checkpoint: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) elif mode == tf.estimator.ModeKeys.EVAL: detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. #in mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.PREDICT, I explictly set the evaluation and prediction to run on CPU with tf.device('/cpu:1'): # training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer ) boxes_shape = (labels[fields.InputDataFields. groundtruth_boxes].get_shape().as_list()) unpad_groundtruth_tensors = boxes_shape[ 1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) gt_boxes_list = labels[ fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[ fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[ fields.InputDataFields.groundtruth_keypoints] gt_weights_list = None if fields.InputDataFields.groundtruth_weights in labels: gt_weights_list = labels[ fields.InputDataFields.groundtruth_weights] gt_confidences_list = None if fields.InputDataFields.groundtruth_confidences in labels: gt_confidences_list = labels[ fields.InputDataFields.groundtruth_confidences] gt_is_crowd_list = None if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[ fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_confidences_list=gt_confidences_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list) training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) preprocessed_images = features[fields.InputDataFields.image] if use_tpu and train_config.use_bfloat16: with tf.contrib.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) for k, v in prediction_dict.items(): if v.dtype == tf.bfloat16: prediction_dict[k] = tf.cast(v, tf.float32) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses( ) if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict['Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() class_agnostic = ( fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[ fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[ fields.InputDataFields.original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config. max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, category_index.values(), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = { str(k): v for k, v in eval_metric_ops.items() } if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore( ) keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver(variables_to_restore, keep_checkpoint_every_n_hours= keep_checkpoint_every_n_hours) scaffold = tf.train.Scaffold(saver=saver) elif mode == tf.estimator.ModeKeys.PREDICT: detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) #similar to EVAL mode, I run PREDICT on CPU too. with tf.device(':/cpu:1'): preprocessed_images = features[fields.InputDataFields.image] if use_tpu and train_config.use_bfloat16: with tf.contrib.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) for k, v in prediction_dict.items(): if v.dtype == tf.bfloat16: prediction_dict[k] = tf.cast(v, tf.float32) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) exported_output = exporter_lib.add_output_tensor_nodes( detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: #scafold here only contains Saver if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir, train_steps, to_keep, save_steps, graph_hook_fn=None): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. train_steps: Number of training steps to_keep: Number of checkpoints to keep save_steps: Save after every n seconds graph_hook_fn: Optional function that is called after the inference graph is built (before optimization). This is helpful to perform additional changes to the training graph such as adding FakeQuant ops. The function should modify the default graph. Raises: ValueError: If both num_clones > 1 and train_config.sync_replicas is true. """ detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() if num_clones != 1 and train_config.sync_replicas: raise ValueError('In Synchronous SGD mode num_clones must ', 'be 1. Found num_clones: {}'.format(num_clones)) batch_size = train_config.batch_size // num_clones if train_config.sync_replicas: batch_size //= train_config.replicas_to_aggregate with tf.device(deploy_config.inputs_device()): input_queue = create_input_queue( batch_size, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # Gather initial summaries. # TODO(rathodv): See if summaries can be added/extracted from global tf # collections so that they don't have to be passed around. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(_create_losses, create_model_fn=create_model_fn, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope if graph_hook_fn: with tf.device(deploy_config.variables_device()): graph_hook_fn() # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var, family='LearningRate') sync_optimizer = None if train_config.sync_replicas: training_optimizer = tf.train.SyncReplicasOptimizer( training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=worker_replicas) sync_optimizer = training_optimizer with tf.device(deploy_config.optimizer_device()): regularization_losses = ( None if train_config.add_regularization_loss else []) total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer, regularization_losses=regularization_losses) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add( tf.summary.histogram('ModelVars/' + model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add( tf.summary.scalar('Losses/' + loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('Losses/TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set( tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, max_to_keep=to_keep) # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, fine_tune_checkpoint_type is set based on # from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint, include_global_step=False)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn train_config.num_steps = train_steps slim.learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, save_interval_secs=save_steps, sync_optimizer=sync_optimizer, saver=saver)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) # Set policy for mixed-precision training with Keras-based models. if use_tpu and train_config.use_bfloat16: from tensorflow.python.keras.engine import base_layer_utils # pylint: disable=g-import-not-at-top # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0. base_layer_utils.enable_v2_dtype_behavior() tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch(labels, unpad_groundtruth_tensors=train_config. unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes]. get_shape().as_list()) unpad_groundtruth_tensors = boxes_shape[ 1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): provide_groundtruth(detection_model, labels) preprocessed_images = features[fields.InputDataFields.image] side_inputs = detection_model.get_side_inputs(features) if use_tpu and train_config.use_bfloat16: with tf.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) prediction_dict = ops.bfloat16_to_float32_nested( prediction_dict) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape], **side_inputs) def postprocess_wrapper(args): return detection_model.postprocess(args[0], args[1]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if use_tpu and postprocess_on_cpu: detections = tf.tpu.outside_compilation( postprocess_wrapper, (prediction_dict, features[fields.InputDataFields.true_image_shape])) else: detections = postprocess_wrapper( (prediction_dict, features[fields.InputDataFields.true_image_shape])) if mode == tf.estimator.ModeKeys.TRAIN: load_pretrained = hparams.load_pretrained if hparams else False if train_config.fine_tune_checkpoint and load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint( train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): if (mode == tf.estimator.ModeKeys.EVAL and eval_config.use_dummy_loss_in_eval): total_loss = tf.constant(1.0) losses_dict = {'Loss/total_loss': total_loss} else: losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses( ) if use_tpu and train_config.use_bfloat16: regularization_losses = ops.bfloat16_to_float32_nested( regularization_losses) if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict[ 'Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = (train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = (train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = slim.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None if train_config.summarize_gradients: summaries = [ 'gradients', 'gradient_norm', 'global_gradient_norm' ] train_op = slim.optimizers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: exported_output = exporter_lib.add_output_tensor_nodes(detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[ fields.InputDataFields.original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if fields.InputDataFields.image_additional_channels in features: eval_dict[fields.InputDataFields. image_additional_channels] = features[ fields.InputDataFields.image_additional_channels] if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index( ) else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: keypoint_edges = [(kp.start, kp.end) for kp in eval_config.keypoint_edge] eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False, keypoint_edges=keypoint_edges or None) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, list(category_index.values()), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours ) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.estimator.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec(mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN # Make sure to set the Keras learning phase. True during training, # False for inference. tf.keras.backend.set_learning_phase(is_training) detection_model = detection_model_fn( is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: # For evaling on train data, it is necessary to check whether groundtruth # must be unpadded. boxes_shape = ( labels[fields.InputDataFields.groundtruth_boxes].get_shape() .as_list()) unpad_groundtruth_tensors = boxes_shape[1] is not None and not use_tpu labels = unstack_batch( labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] gt_weights_list = None if fields.InputDataFields.groundtruth_weights in labels: gt_weights_list = labels[fields.InputDataFields.groundtruth_weights] gt_confidences_list = None if fields.InputDataFields.groundtruth_confidences in labels: gt_confidences_list = labels[ fields.InputDataFields.groundtruth_confidences] gt_is_crowd_list = None if fields.InputDataFields.groundtruth_is_crowd in labels: gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_confidences_list=gt_confidences_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list, groundtruth_weights_list=gt_weights_list, groundtruth_is_crowd_list=gt_is_crowd_list) preprocessed_images = features[fields.InputDataFields.image] if use_tpu and train_config.use_bfloat16: with tf.contrib.tpu.bfloat16_scope(): prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) for k, v in prediction_dict.items(): if v.dtype == tf.bfloat16: prediction_dict[k] = tf.cast(v, tf.float32) else: prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) def postprocess_wrapper(args): return detection_model.postprocess(args[0], args[1]) if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT): if use_tpu and postprocess_on_cpu: detections = tf.contrib.tpu.outside_compilation( postprocess_wrapper, (prediction_dict, features[fields.InputDataFields.true_image_shape])) else: detections = postprocess_wrapper(( prediction_dict, features[fields.InputDataFields.true_image_shape])) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.values()] if train_config.add_regularization_loss: regularization_losses = detection_model.regularization_losses() if regularization_losses: regularization_loss = tf.add_n( regularization_losses, name='regularization_loss') losses.append(regularization_loss) losses_dict['Loss/regularization_loss'] = regularization_loss total_loss = tf.add_n(losses, name='total_loss') losses_dict['Loss/total_loss'] = total_loss if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=is_training) graph_rewriter_fn() # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we # can write learning rate summaries on TPU without host calls. global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: training_optimizer = tf.contrib.tpu.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None include_variables = ( train_config.update_trainable_variables if train_config.update_trainable_variables else None) exclude_variables = ( train_config.freeze_variables if train_config.freeze_variables else None) trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), include_patterns=include_variables, exclude_patterns=exclude_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None if train_config.summarize_gradients: summaries = ['gradients', 'gradient_norm', 'global_gradient_norm'] train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, update_ops=detection_model.updates(), variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: exported_output = exporter_lib.add_output_tensor_nodes(detections) export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(exported_output) } eval_metric_ops = None scaffold = None if mode == tf.estimator.ModeKeys.EVAL: class_agnostic = ( fields.DetectionResultFields.detection_classes not in detections) groundtruth = _prepare_groundtruth_for_eval( detection_model, class_agnostic, eval_input_config.max_number_of_boxes) use_original_images = fields.InputDataFields.original_image in features if use_original_images: eval_images = features[fields.InputDataFields.original_image] true_image_shapes = tf.slice( features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3]) original_image_spatial_shapes = features[fields.InputDataFields .original_image_spatial_shape] else: eval_images = features[fields.InputDataFields.image] true_image_shapes = None original_image_spatial_shapes = None eval_dict = eval_util.result_dict_for_batched_example( eval_images, features[inputs.HASH_KEY], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=True, original_image_spatial_shapes=original_image_spatial_shapes, true_image_shapes=true_image_shapes) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) vis_metric_ops = None if not use_tpu and use_original_images: eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections( category_index, max_examples_to_draw=eval_config.num_visualizations, max_boxes_to_draw=eval_config.max_num_boxes_to_visualize, min_score_thresh=eval_config.min_score_threshold, use_normalized_coordinates=False) vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops( eval_dict) # Eval metrics on a single example. eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_config, list(category_index.values()), eval_dict) for loss_key, loss_tensor in iter(losses_dict.items()): eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor) for var in optimizer_summary_vars: eval_metric_ops[var.op.name] = (var, tf.no_op()) if vis_metric_ops is not None: eval_metric_ops.update(vis_metric_ops) eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()} if eval_config.use_moving_averages: variable_averages = tf.train.ExponentialMovingAverage(0.0) variables_to_restore = variable_averages.variables_to_restore() keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( variables_to_restore, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) scaffold = tf.train.Scaffold(saver=saver) # EVAL executes on CPU, so use regular non-TPU EstimatorSpec. if use_tpu and mode != tf.estimator.ModeKeys.EVAL: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: if scaffold is None: keep_checkpoint_every_n_hours = ( train_config.keep_checkpoint_every_n_hours) saver = tf.train.Saver( sharded=True, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, save_relative_paths=True) tf.add_to_collection(tf.GraphKeys.SAVERS, saver) scaffold = tf.train.Scaffold(saver=saver) return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs, scaffold=scaffold)
def train(create_model_fn, create_tensor_dict_fn, train_config, train_dir, img_root): detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.device('cpu:0'): global_step = slim.create_global_step() input_queue = _create_input_queue(train_config.batch_size, create_tensor_dict_fn, detection_model, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options, img_root) with tf.device('gpu:0'): _create_losses(input_queue, create_model_fn) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) training_optimizer = optimizer_builder.build(train_config.optimizer, set()) # create initial restore op init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config.from_detection_checkpoint) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn # loss and grads total_loss = tf.losses.get_total_loss() grads_and_vars = training_optimizer.compute_gradients( total_loss, tf.trainable_variables()) # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients(grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # create summary summaries = set() for loss_tensor in tf.losses.get_losses(): summaries.add(tf.summary.scalar(loss_tensor.op.name, loss_tensor)) summaries.add(tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) summary_op = tf.summary.merge(list(summaries), name='summary_op') session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # session_config.gpu_options.allow_growth = True keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) slim.learning.train(train_tensor, logdir=train_dir, session_config=session_config, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, saver=saver)
def restore_obj_detection_api_weights(sess, model, pretrained_ckpt_path): """Finds variables pertaining to the object detection api within the model and loads in the pre-trained weights for these variables Args: sess: A TensorFlow session model: detection model object pretrained_ckpt_path: path for the pre-trained weights """ # Returns mapping of the variables used for 2D object detection var_map = model.get_variable_restore_map( fine_tune_checkpoint_type='detection', load_all_detection_checkpoint_vars=True) if model.net_type in [ 'resnet50_16x', 'resnet101_16x', 'resnet101_4x_separate_weights', 'resnet101_4x_squash', 'resnet101_4x_bigmon' ]: # Change to TensorFlow Object Detection API standard variables names full_var_map_cleaned = {} crop_var_map_cleaned = {} for key, var in var_map.items(): # TODO: Remove hard coding of variable scopes full_var_map_cleaned[key.replace( 'FirstStageFeatureExtractor_full/', 'FirstStageFeatureExtractor/')] = var crop_var_map_cleaned[key.replace( 'FirstStageFeatureExtractor_crop/', 'FirstStageFeatureExtractor/')] = var # Find the variables in the checkpoint and match them to the ones in the var_maps available_full_var_map = ( variables_helper.get_variables_available_in_checkpoint( full_var_map_cleaned, pretrained_ckpt_path, include_global_step=False)) available_crop_var_map = ( variables_helper.get_variables_available_in_checkpoint( crop_var_map_cleaned, pretrained_ckpt_path, include_global_step=False)) # Load in weights full_init_saver = tf.train.Saver(available_full_var_map) full_init_saver.restore(sess, pretrained_ckpt_path) crop_init_saver = tf.train.Saver(available_crop_var_map) crop_init_saver.restore(sess, pretrained_ckpt_path) else: available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, pretrained_ckpt_path, include_global_step=False)) # Load in weights init_saver = tf.train.Saver(available_var_map) init_saver.restore(sess, pretrained_ckpt_path) print('Loading in Object Detection API pre-trained weights')
def _load_model( images: "tf.Tensor", filename: Optional[str] = None, url: Optional[str] = None, obj_detection_model: Optional["FasterRCNNMetaArch"] = None, is_training: bool = False, groundtruth_boxes_list: Optional[List["tf.Tensor"]] = None, groundtruth_classes_list: Optional[List["tf.Tensor"]] = None, groundtruth_weights_list: Optional[List["tf.Tensor"]] = None, ) -> Tuple[Dict[str, "tf.Tensor"], ...]: """ Download, extract and load a model from a URL if it not already in the cache. The file at indicated by `url` is downloaded to the path ~/.art/data and given the name `filename`. Files in tar, tar.gz, tar.bz, and zip formats will also be extracted. Then the model is loaded, pipelined and its outputs are returned as a tuple of (predictions, losses, detections). :param images: Input samples of shape (nb_samples, height, width, nb_channels). :param filename: Name of the file. :param url: Download URL. :param is_training: A boolean indicating whether the training version of the computation graph should be constructed. :param groundtruth_boxes_list: A list of 2-D tf.float32 tensors of shape [num_boxes, 4] containing coordinates of the groundtruth boxes. Groundtruth boxes are provided in [y_min, x_min, y_max, x_max] format and also assumed to be normalized and clipped relative to the image window with conditions y_min <= y_max and x_min <= x_max. :param groundtruth_classes_list: A list of 1-D tf.float32 tensors of shape [num_boxes] containing the class targets with the zero index which is assumed to map to the first non-background class. :param groundtruth_weights_list: A list of 1-D tf.float32 tensors of shape [num_boxes] containing weights for groundtruth boxes. :return: A tuple of (predictions, losses, detections): - predictions: a dictionary holding "raw" prediction tensors. - losses: a dictionary mapping loss keys (`Loss/RPNLoss/localization_loss`, `Loss/RPNLoss/objectness_loss`, `Loss/BoxClassifierLoss/localization_loss`, `Loss/BoxClassifierLoss/classification_loss`) to scalar tensors representing corresponding loss values. - detections: a dictionary containing final detection results. """ import tensorflow.compat.v1 as tf # lgtm [py/repeated-import] from object_detection.utils import variables_helper if obj_detection_model is None: from object_detection.utils import config_util from object_detection.builders import model_builder # If obj_detection_model is None, then we need to have parameters filename and url to download, extract # and load the object detection model if filename is None or url is None: # pragma: no cover raise ValueError( "Need input parameters `filename` and `url` to download, " "extract and load the object detection model." ) # Download and extract path = get_file(filename=filename, path=config.ART_DATA_PATH, url=url, extract=True) # Load model config pipeline_config = path + "/pipeline.config" configs = config_util.get_configs_from_pipeline_file(pipeline_config) configs["model"].faster_rcnn.second_stage_batch_size = configs[ "model" ].faster_rcnn.first_stage_max_proposals # Load model obj_detection_model = model_builder.build( model_config=configs["model"], is_training=is_training, add_summaries=False ) # Provide groundtruth if groundtruth_classes_list is not None: groundtruth_classes_list = [ tf.one_hot(groundtruth_class, obj_detection_model.num_classes) for groundtruth_class in groundtruth_classes_list ] obj_detection_model.provide_groundtruth( groundtruth_boxes_list=groundtruth_boxes_list, groundtruth_classes_list=groundtruth_classes_list, groundtruth_weights_list=groundtruth_weights_list, ) # Create model pipeline images *= 255.0 preprocessed_images, true_image_shapes = obj_detection_model.preprocess(images) predictions = obj_detection_model.predict(preprocessed_images, true_image_shapes) losses = obj_detection_model.loss(predictions, true_image_shapes) detections = obj_detection_model.postprocess(predictions, true_image_shapes) # Initialize variables from checkpoint # Get variables to restore variables_to_restore = obj_detection_model.restore_map( fine_tune_checkpoint_type="detection", load_all_detection_checkpoint_vars=True ) # Get variables from checkpoint fine_tune_checkpoint_path = path + "/model.ckpt" vars_in_ckpt = variables_helper.get_variables_available_in_checkpoint( variables_to_restore, fine_tune_checkpoint_path, include_global_step=False ) # Initialize from checkpoint tf.train.init_from_checkpoint(fine_tune_checkpoint_path, vars_in_ckpt) return obj_detection_model, predictions, losses, detections
def train(datasets_dicts, epochs, val_every, iters_cnt, validate_with_eval_model, pipeline_config, num_clones=1, save_cback=None, is_transfer_learning=False): logger.info('Start train') configs = configs_from_pipeline(pipeline_config) model_config = configs['model'] train_config = configs['train_config'] create_model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) detection_model = create_model_fn() def get_next(dataset): return dataset_util.make_initializable_iterator( build_dataset(dataset)).get_next() create_tensor_dict_fn = functools.partial(get_next, datasets_dicts['train']) create_tensor_dict_fn_val = functools.partial(get_next, datasets_dicts['val']) data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=4, clone_on_cpu=False, replica_id=0, num_replicas=1, num_ps_tasks=0, worker_job_name='lonely_worker') # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = slim.create_global_step() with tf.device(deploy_config.inputs_device()): coord = coordinator.Coordinator() input_queue = create_input_queue( train_config.batch_size, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) input_queue_val = create_input_queue( train_config.batch_size, create_tensor_dict_fn_val, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # create validation graph create_model_fn_val = functools.partial( model_builder.build, model_config=model_config, is_training=not validate_with_eval_model) with tf.device(deploy_config.optimizer_device()): training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var, family='LearningRate') train_losses = [] grads_and_vars = [] with slim.arg_scope([slim.model_variable, slim.variable], device='/device:CPU:0'): for curr_dev_id in range(num_clones): with tf.device('/gpu:{}'.format(curr_dev_id)): with tf.name_scope( 'clone_{}'.format(curr_dev_id)) as scope: with tf.variable_scope( tf.get_variable_scope(), reuse=True if curr_dev_id > 0 else None): losses = _create_losses_val( input_queue, create_model_fn, train_config) clones_loss = tf.add_n(losses) clones_loss = tf.divide(clones_loss, 1.0 * num_clones) grads = training_optimizer.compute_gradients( clones_loss) train_losses.append(clones_loss) grads_and_vars.append(grads) if curr_dev_id == 0: update_ops = tf.get_collection( tf.GraphKeys.UPDATE_OPS) val_total_loss = get_val_loss(num_clones, input_queue_val, create_model_fn_val, train_config) with tf.device(deploy_config.optimizer_device()): total_loss = tf.add_n(train_losses) grads_and_vars = model_deploy._sum_clones_gradients(grads_and_vars) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops, name='update_barrier') with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) coord.clear_stop() sess = tf.Session(config=config) saver = tf.train.Saver() graph = ops.get_default_graph() with graph.as_default(): with ops.name_scope('init_ops'): init_op = variables.global_variables_initializer() ready_op = variables.report_uninitialized_variables() local_init_op = control_flow_ops.group( variables.local_variables_initializer(), lookup_ops.tables_initializer()) # graph.finalize() sess.run([init_op, ready_op, local_init_op]) queue_runners = graph.get_collection(ops.GraphKeys.QUEUE_RUNNERS) threads = [] for qr in queue_runners: threads.extend( qr.create_threads(sess, coord=coord, daemon=True, start=True)) logger.info('Start restore') if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config. fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars and (not is_transfer_learning))) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) if 'global_step' in available_var_map: del available_var_map['global_step'] init_saver = tf.train.Saver(available_var_map) logger.info('Restoring model weights from previous checkpoint.') init_saver.restore(sess, train_config.fine_tune_checkpoint) logger.info('Model restored.') eval_planner = EvalPlanner(epochs, val_every) progress = sly.Progress('Model training: ', epochs * iters_cnt['train']) best_val_loss = float('inf') epoch_flt = 0 for epoch in range(epochs): logger.info("Before new epoch", extra={'epoch': epoch_flt}) for train_it in range(iters_cnt['train']): total_loss, np_global_step = sess.run( [train_tensor, global_step]) metrics_values_train = { 'loss': total_loss, } progress.iter_done_report() epoch_flt = epoch_float(epoch, train_it + 1, iters_cnt['train']) sly.report_metrics_training(epoch_flt, metrics_values_train) if eval_planner.need_validation(epoch_flt): logger.info("Before validation", extra={'epoch': epoch_flt}) overall_val_loss = 0 for val_it in range(iters_cnt['val']): overall_val_loss += sess.run(val_total_loss) logger.info("Validation in progress", extra={ 'epoch': epoch_flt, 'val_iter': val_it, 'val_iters': iters_cnt['val'] }) metrics_values_val = { 'loss': overall_val_loss / iters_cnt['val'], } sly.report_metrics_validation(epoch_flt, metrics_values_val) logger.info("Validation has been finished", extra={'epoch': epoch_flt}) eval_planner.validation_performed() val_loss = metrics_values_val['loss'] model_is_best = val_loss < best_val_loss if model_is_best: best_val_loss = val_loss logger.info( 'It\'s been determined that current model is the best one for a while.' ) save_cback(saver, sess, model_is_best, opt_data={ 'epoch': epoch_flt, 'val_metrics': metrics_values_val, }) logger.info("Epoch was finished", extra={'epoch': epoch_flt}) coord.request_stop() coord.join(threads)
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: labels = unstack_batch(labels, unpad_groundtruth_tensors=False) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, sets finetune_checkpoint_type based on # from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' if train_config.fine_tune_checkpoint and hparams.load_pretrained: if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' asg_map = detection_model.restore_map( fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] if train_config.add_regularization_loss: regularization_losses = tf.get_collection( tf.GraphKeys.REGULARIZATION_LOSSES) if regularization_losses: regularization_loss = tf.add_n(regularization_losses, name='regularization_loss') losses.append(regularization_loss) if not use_tpu: tf.summary.scalar('regularization_loss', regularization_loss) total_loss = tf.add_n(losses, name='total_loss') if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if use_tpu: training_optimizer = tpu_optimizer.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None if mode == tf.estimator.ModeKeys.EVAL: # Detection summaries during eval. class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _get_groundtruth_data(detection_model, class_agnostic) use_original_images = fields.InputDataFields.original_image in features eval_images = ( features[fields.InputDataFields.original_image] if use_original_images else features[fields.InputDataFields.image]) eval_dict = eval_util.result_dict_for_single_example( eval_images[0:1], features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=False) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) if not use_tpu and use_original_images: detection_and_groundtruth = ( vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2)) tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single image. eval_metrics = eval_config.metrics_set if not eval_metrics: eval_metrics = ['coco_detection_metrics'] eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators( eval_metrics, category_index.values(), eval_dict, include_metrics_per_category=False) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. """ detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): global_step = tf.train.create_global_step() with tf.device(deploy_config.inputs_device()): input_queue = create_input_queue( train_config.batch_size // num_clones, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options) # Gather initial summaries. # TODO(rathodv): See if summaries can be added/extracted from global tf # collections so that they don't have to be passed around. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) model_fn = functools.partial(_create_losses, create_model_fn=create_model_fn, train_config=train_config) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) with tf.device(deploy_config.optimizer_device()): training_optimizer = optimizer_builder.build( train_config.optimizer, global_summaries) sync_optimizer = None if train_config.sync_replicas: training_optimizer = tf.SyncReplicasOptimizer( training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=train_config.worker_replicas) sync_optimizer = training_optimizer # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config. from_detection_checkpoint) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn with tf.device(deploy_config.optimizer_device()): total_loss, grads_and_vars = model_deploy.optimize_clones( clones, training_optimizer, regularization_losses=None) total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.') # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # Add summaries. for model_var in slim.get_model_variables(): global_summaries.add( tf.summary.histogram(model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add( tf.summary.scalar(loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set( tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) session_config.gpu_options.allow_growth = True # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) slim.learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), save_summaries_secs=120, sync_optimizer=sync_optimizer, saver=saver)
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir, num_examples, total_configs, model_config, is_first_training=True): """Training function for detection models. Args: create_tensor_dict_fn: a function to create a tensor input dictionary. create_model_fn: a function that creates a DetectionModel and generates losses. train_config: a train_pb2.TrainConfig protobuf. master: BNS name of the TensorFlow master to use. task: The task id of this training instance. num_clones: The number of clones to run per machine. worker_replicas: The number of work replicas to train with. clone_on_cpu: True if clones should be forced to run on CPU. ps_tasks: Number of parameter server tasks. worker_job_name: Name of the worker job. is_chief: Whether this replica is the chief replica. train_dir: Directory to write checkpoints and training summaries to. num_examples: The number of examples in dataset for training. total_configs: config list """ detection_model = create_model_fn() data_augmentation_options = [ preprocessor_builder.build(step) for step in train_config.data_augmentation_options ] with tf.Graph().as_default(): # Build a configuration specifying multi-GPU and multi-replicas. deploy_config = model_deploy.DeploymentConfig( num_clones=num_clones, clone_on_cpu=clone_on_cpu, replica_id=task, num_replicas=worker_replicas, num_ps_tasks=ps_tasks, worker_job_name=worker_job_name) # Place the global step on the device storing the variables. with tf.device(deploy_config.variables_device()): if is_first_training: global_step = slim.create_global_step() else: prev_global_step = int( train_config.fine_tune_checkpoint.split('-')[-1]) global_step = variable_scope.get_variable( ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64, initializer=tf.constant(prev_global_step, dtype=dtypes.int64), trainable=False, collections=[ ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP ]) with tf.device(deploy_config.inputs_device()): input_queue = _create_input_queue( train_config.batch_size // num_clones, create_tensor_dict_fn, train_config.batch_queue_capacity, train_config.num_batch_queue_threads, train_config.prefetch_queue_capacity, data_augmentation_options, ignore_options=train_config.ignore_options, mtl_window=model_config.mtl.window, mtl_edgemask=model_config.mtl.edgemask) # Gather initial summaries. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) global_summaries = set([]) kwargs = {} kwargs['mtl'] = model_config.mtl update_schedule = None model_fn = functools.partial( _create_losses, create_model_fn=create_model_fn, show_image_summary=train_config.show_image_summary, update_schedule=update_schedule, **kwargs) clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue]) first_clone_scope = clones[0].scope with tf.device(deploy_config.optimizer_device()): training_optimizer = optimizer_builder.build( train_config.optimizer, global_summaries) sync_optimizer = None if train_config.sync_replicas: # TODO: support syncrhonous update for manual loss update training_optimizer = tf.SyncReplicasOptimizer( training_optimizer, replicas_to_aggregate=train_config.replicas_to_aggregate, total_num_replicas=train_config.worker_replicas) sync_optimizer = training_optimizer # Create ops required to initialize the model from a given checkpoint. init_fn = None if train_config.fine_tune_checkpoint: var_map = detection_model.restore_map( from_detection_checkpoint=train_config. from_detection_checkpoint, restore_box_predictor=train_config.restore_box_predictor, restore_window=train_config.restore_window, restore_edgemask=train_config.restore_edgemask, restore_closeness=train_config.restore_closeness, restore_mtl_refine=train_config.restore_mtl_refine, ) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( var_map, train_config.fine_tune_checkpoint)) init_saver = tf.train.Saver(available_var_map) mtl = model_config.mtl mtl_init_saver_list = [] def _get_mtl_init_saver(scope_name): _var_map = detection_model._feature_extractor.mtl_restore_from_classification_checkpoint_fn( scope_name) if train_config.from_detection_checkpoint: _var_map_new = dict() for name, val in _var_map.iteritems(): _var_map_new[detection_model. second_stage_feature_extractor_scope + '/' + name] = val _var_map = _var_map_new _available_var_map = ( variables_helper.get_variables_available_in_checkpoint( _var_map, train_config.fine_tune_checkpoint)) if _available_var_map: return tf.train.Saver(_available_var_map) else: return None # if mtl.share_second_stage_init and mtl.shared_feature == 'proposal_feature_maps': if mtl.share_second_stage_init and train_config.from_detection_checkpoint == False: if mtl.window: mtl_init_saver_list.append( _get_mtl_init_saver( detection_model.window_box_predictor_scope)) if mtl.closeness: mtl_init_saver_list.append( _get_mtl_init_saver( detection_model.closeness_box_predictor_scope)) if mtl.edgemask: mtl_init_saver_list.append( _get_mtl_init_saver( detection_model.edgemask_predictor_scope)) def initializer_fn(sess): init_saver.restore(sess, train_config.fine_tune_checkpoint) for mtl_init_saver in mtl_init_saver_list: if not mtl_init_saver == None: mtl_init_saver.restore( sess, train_config.fine_tune_checkpoint) init_fn = initializer_fn def _get_trainable_variables(except_scopes=None): trainable_variables = tf.trainable_variables() if except_scopes is None: return trainable_variables for var in tf.trainable_variables(): if any([scope in var.name for scope in except_scopes]): trainable_variables.remove(var) return trainable_variables def _get_update_ops(except_scopes=None): # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by model_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) if except_scopes is None: return update_ops for var in tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope): if any([scope in var.name for scope in except_scopes]): update_ops.remove(var) return update_ops with tf.device(deploy_config.optimizer_device()): def _single_update(): kwargs = {} _training_optimizer = training_optimizer kwargs['var_list'] = None update_ops = _get_update_ops() total_loss, grads_and_vars = model_deploy.optimize_clones( clones, _training_optimizer, regularization_losses=None, **kwargs) # Optionaly multiply gradients by train_config.{grad_multiplier, # divide_grad_by_batch}. if train_config.grad_multiplier or train_config.divide_grad_by_batch: base_multiplier = train_config.grad_multiplier \ if train_config.grad_multiplier else 1.0 batch_divider = float(train_config.batch_size) \ if train_config.divide_grad_by_batch else 1.0 total_multiplier = base_multiplier / batch_divider grads_and_vars = variables_helper.multiply_gradients_by_scalar_multiplier( grads_and_vars, multiplier=total_multiplier) # Optionally multiply bias gradients by train_config.bias_grad_multiplier. if train_config.bias_grad_multiplier: biases_regex_list = ['.*/biases'] grads_and_vars = variables_helper.multiply_gradients_matching_regex( grads_and_vars, biases_regex_list, multiplier=train_config.bias_grad_multiplier) # Optionally freeze some layers by setting their gradients to be zero. if train_config.freeze_variables: grads_and_vars = variables_helper.freeze_gradients_matching_regex( grads_and_vars, train_config.freeze_variables) # Optionally clip gradients if train_config.gradient_clipping_by_norm > 0: with tf.name_scope('clip_grads'): grads_and_vars = slim.learning.clip_gradient_norms( grads_and_vars, train_config.gradient_clipping_by_norm) # Create gradient updates. grad_updates = _training_optimizer.apply_gradients( grads_and_vars, global_step=global_step) # update_ops.append(grad_updates) total_update_ops = update_ops + [grad_updates] update_op = tf.group(*total_update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name=('train_op')) return train_tensor train_tensor = _single_update() # Add summaries. def _get_total_loss_with_collection(collection, add_regularization_losses=True, name="total_loss"): losses = tf.losses.get_losses(loss_collection=collection) if add_regularization_losses: losses += tf.losses.get_regularization_losses() return math_ops.add_n(losses, name=name) for model_var in slim.get_model_variables(): global_summaries.add( tf.summary.histogram(model_var.op.name, model_var)) for loss_tensor in tf.losses.get_losses(): global_summaries.add( tf.summary.scalar(loss_tensor.op.name, loss_tensor)) global_summaries.add( tf.summary.scalar('TotalLoss', tf.losses.get_total_loss())) # Add the summaries from the first clone. These contain the summaries # created by model_fn and either optimize_clones() or _gather_clone_loss(). summaries |= set( tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summaries |= global_summaries # Merge all summaries together. summary_op = tf.summary.merge(list(summaries), name='summary_op') # not contained in global_summaries config_summary_list = select_config_summary_list(total_configs, as_matrix=False) # Soft placement allows placing on CPU ops without GPU implementation. session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) # Save checkpoints regularly. keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours saver = tf.train.Saver( keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours) custom_learning.train( train_tensor, logdir=train_dir, master=master, is_chief=is_chief, global_step=(None if is_first_training else global_step), session_config=session_config, startup_delay_steps=train_config.startup_delay_steps, init_fn=init_fn, summary_op=summary_op, number_of_steps=(train_config.num_steps if train_config.num_steps else None), log_every_n_steps=(train_config.log_every_n_steps if train_config.log_every_n_steps else None), save_summaries_secs=train_config.save_summaries_secs, save_interval_secs=train_config.save_interval_secs, sync_optimizer=sync_optimizer, saver=saver, batch_size=train_config.batch_size, num_examples=num_examples, config_summary_list=config_summary_list)