def testBuildEmptyOptimizer(self):
     """Checks that a blank optimizer config is rejected with ValueError."""
     empty_config = optimizer_pb2.Optimizer()
     # Deliberately blank text proto: no optimizer field gets populated.
     blank_text = """
 """
     text_format.Merge(blank_text, empty_config)
     # Callable form of assertRaises: build() is invoked by the assertion.
     self.assertRaises(ValueError, optimizer_builder.build, empty_config)
コード例 #2
0
 def testBuildEmptyOptimizer(self):
   """Checks that building from a blank optimizer config raises ValueError."""
   blank_text = """
   """
   empty_config = optimizer_pb2.Optimizer()
   text_format.Merge(blank_text, empty_config)
   # Callable form of assertRaises: build() is invoked by the assertion itself.
   self.assertRaises(ValueError, optimizer_builder.build, empty_config)
コード例 #3
0
 def testBuildEmptyOptimizer(self):
     """Checks that a blank optimizer config is rejected with ValueError.

     The builder is called with an empty `global_summaries` set as its second
     positional argument.
     """
     optimizer_text_proto = """
 """
     # `set()` is the idiomatic empty set; `set([])` built a throwaway list.
     global_summaries = set()
     optimizer_proto = optimizer_pb2.Optimizer()
     text_format.Merge(optimizer_text_proto, optimizer_proto)
     with self.assertRaises(ValueError):
         optimizer_builder.build(optimizer_proto, global_summaries)
コード例 #4
0
 def testBuildEmptyOptimizer(self):
   """Checks that a blank optimizer config is rejected with ValueError.

   The builder is called with an empty `global_summaries` set as its second
   positional argument.
   """
   optimizer_text_proto = """
   """
   # `set()` is the idiomatic empty set; `set([])` built a throwaway list.
   global_summaries = set()
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   with self.assertRaises(ValueError):
     optimizer_builder.build(optimizer_proto, global_summaries)
コード例 #5
0
 def testMovingAverageOptimizerUnsupported(self):
     """Checks that build() raises ValueError when use_moving_average is set."""
     config = optimizer_pb2.Optimizer()
     # Adam with a constant learning rate, plus the moving-average flag.
     config_text = """
   adam_optimizer: {
     learning_rate: {
       constant_learning_rate {
         learning_rate: 0.002
       }
     }
   }
   use_moving_average: True
 """
     text_format.Merge(config_text, config)
     # Callable form of assertRaises: build() is invoked by the assertion.
     self.assertRaises(ValueError, optimizer_builder.build, config)
コード例 #6
0
 def testBuildMovingAverageOptimizer(self):
   """Checks that `use_moving_average: True` yields a MovingAverageOptimizer."""
   config = optimizer_pb2.Optimizer()
   config_text = """
     adam_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.002
         }
       }
     }
     use_moving_average: True
   """
   text_format.Merge(config_text, config)
   # build() returns a pair here; only the first element is inspected.
   built_optimizer, _ = optimizer_builder.build(config)
   self.assertIsInstance(built_optimizer, contrib_opt.MovingAverageOptimizer)
 def testBuildAdamOptimizer(self):
     """Checks that an `adam_optimizer` config builds a tf.train.AdamOptimizer."""
     optimizer_text_proto = """
   adam_optimizer: {
     learning_rate: {
       constant_learning_rate {
         learning_rate: 0.002
       }
     }
   }
   use_moving_average: false
 """
     optimizer_proto = optimizer_pb2.Optimizer()
     text_format.Merge(optimizer_text_proto, optimizer_proto)
     # build() returns a pair here; only the optimizer is inspected.
     optimizer, _ = optimizer_builder.build(optimizer_proto)
     # assertIsInstance reports the actual type on failure, unlike
     # assertTrue(isinstance(...)) which only says "False is not true".
     self.assertIsInstance(optimizer, tf.train.AdamOptimizer)
コード例 #8
0
 def testBuildAdamOptimizer(self):
   """Checks that an `adam_optimizer` config builds a tf.train.AdamOptimizer."""
   optimizer_text_proto = """
     adam_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.002
         }
       }
     }
     use_moving_average: false
   """
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   # build() returns a pair here; only the optimizer is inspected.
   optimizer, _ = optimizer_builder.build(optimizer_proto)
   # assertIsInstance reports the actual type on failure, unlike
   # assertTrue(isinstance(...)) which only says "False is not true".
   self.assertIsInstance(optimizer, tf.train.AdamOptimizer)
コード例 #9
0
 def testBuildMomentumOptimizer(self):
   """Checks that a `momentum_optimizer` config builds a MomentumOptimizer."""
   optimizer_text_proto = """
     momentum_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.001
         }
       }
       momentum_optimizer_value: 0.99
     }
     use_moving_average: false
   """
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   # build() returns a pair here; only the optimizer is inspected.
   optimizer, _ = optimizer_builder.build(optimizer_proto)
   # assertIsInstance reports the actual type on failure, unlike
   # assertTrue(isinstance(...)) which only says "False is not true".
   self.assertIsInstance(optimizer, tf.train.MomentumOptimizer)
 def testBuildMomentumOptimizer(self):
     """Checks that a `momentum_optimizer` config builds a MomentumOptimizer."""
     optimizer_text_proto = """
   momentum_optimizer: {
     learning_rate: {
       constant_learning_rate {
         learning_rate: 0.001
       }
     }
     momentum_optimizer_value: 0.99
   }
   use_moving_average: false
 """
     optimizer_proto = optimizer_pb2.Optimizer()
     text_format.Merge(optimizer_text_proto, optimizer_proto)
     # build() returns a pair here; only the optimizer is inspected.
     optimizer, _ = optimizer_builder.build(optimizer_proto)
     # assertIsInstance reports the actual type on failure, unlike
     # assertTrue(isinstance(...)) which only says "False is not true".
     self.assertIsInstance(optimizer, tf.train.MomentumOptimizer)
コード例 #11
0
 def testBuildMovingAverageOptimizer(self):
   """Checks that `use_moving_average: True` yields a MovingAverageOptimizer.

   The builder is called with an empty `global_summaries` set as its second
   positional argument and its return value is used directly as the optimizer.
   """
   optimizer_text_proto = """
     adam_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.002
         }
       }
     }
     use_moving_average: True
   """
   # `set()` is the idiomatic empty set; `set([])` built a throwaway list.
   global_summaries = set()
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
   # assertIsInstance reports the actual type on failure, unlike
   # assertTrue(isinstance(...)) which only says "False is not true".
   self.assertIsInstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)
コード例 #12
0
 def testBuildMovingAverageOptimizer(self):
     """Checks that `use_moving_average: True` yields a MovingAverageOptimizer.

     The builder is called with an empty `global_summaries` set as its second
     positional argument and its return value is used directly as the
     optimizer.
     """
     optimizer_text_proto = """
   adam_optimizer: {
     learning_rate: {
       constant_learning_rate {
         learning_rate: 0.002
       }
     }
   }
   use_moving_average: True
 """
     # `set()` is the idiomatic empty set; `set([])` built a throwaway list.
     global_summaries = set()
     optimizer_proto = optimizer_pb2.Optimizer()
     text_format.Merge(optimizer_text_proto, optimizer_proto)
     optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
     # assertIsInstance reports the actual type on failure, unlike
     # assertTrue(isinstance(...)) which only says "False is not true".
     self.assertIsInstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)
コード例 #13
0
 def testBuildMovingAverageOptimizerWithNonDefaultDecay(self):
   """Checks that `moving_average_decay: 0.2` reaches the built optimizer."""
   config = optimizer_pb2.Optimizer()
   config_text = """
     adam_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.002
         }
       }
     }
     use_moving_average: True
     moving_average_decay: 0.2
   """
   text_format.Merge(config_text, config)
   # build() returns a pair here; only the first element is inspected.
   built_optimizer, _ = optimizer_builder.build(config)
   self.assertIsInstance(built_optimizer, contrib_opt.MovingAverageOptimizer)
   # TODO(rathodv): Find a way to not depend on the private members.
   configured_decay = built_optimizer._ema._decay
   self.assertAlmostEqual(configured_decay, 0.2)
コード例 #14
0
 def testBuildMovingAverageOptimizerWithNonDefaultDecay(self):
   """Checks that `moving_average_decay: 0.2` reaches the built optimizer."""
   optimizer_text_proto = """
     adam_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.002
         }
       }
     }
     use_moving_average: True
     moving_average_decay: 0.2
   """
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   # build() returns a pair here; only the optimizer is inspected.
   optimizer, _ = optimizer_builder.build(optimizer_proto)
   # assertIsInstance reports the actual type on failure, unlike
   # assertTrue(isinstance(...)) which only says "False is not true".
   self.assertIsInstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)
   # TODO(rathodv): Find a way to not depend on the private members.
   self.assertAlmostEqual(optimizer._ema._decay, 0.2)
コード例 #15
0
 def testBuildRMSPropOptimizer(self):
   """Checks that an `rms_prop_optimizer` config builds an RMSPropOptimizer."""
   optimizer_text_proto = """
     rms_prop_optimizer: {
       learning_rate: {
         exponential_decay_learning_rate {
           initial_learning_rate: 0.004
           decay_steps: 800720
           decay_factor: 0.95
         }
       }
       momentum_optimizer_value: 0.9
       decay: 0.9
       epsilon: 1.0
     }
     use_moving_average: false
   """
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   # build() returns a pair here; only the optimizer is inspected.
   optimizer, _ = optimizer_builder.build(optimizer_proto)
   # assertIsInstance reports the actual type on failure, unlike
   # assertTrue(isinstance(...)) which only says "False is not true".
   self.assertIsInstance(optimizer, tf.train.RMSPropOptimizer)
コード例 #16
0
 def testBuildMovingAverageOptimizerWithNonDefaultDecay(self):
   """Checks that `moving_average_decay: 0.2` reaches the built optimizer.

   The builder is called with an empty `global_summaries` set as its second
   positional argument and its return value is used directly as the optimizer.
   """
   optimizer_text_proto = """
     adam_optimizer: {
       learning_rate: {
         constant_learning_rate {
           learning_rate: 0.002
         }
       }
     }
     use_moving_average: True
     moving_average_decay: 0.2
   """
   # `set()` is the idiomatic empty set; `set([])` built a throwaway list.
   global_summaries = set()
   optimizer_proto = optimizer_pb2.Optimizer()
   text_format.Merge(optimizer_text_proto, optimizer_proto)
   optimizer = optimizer_builder.build(optimizer_proto, global_summaries)
   # assertIsInstance reports the actual type on failure, unlike
   # assertTrue(isinstance(...)) which only says "False is not true".
   self.assertIsInstance(optimizer, tf.contrib.opt.MovingAverageOptimizer)
   # TODO: Find a way to not depend on the private members.
   self.assertAlmostEqual(optimizer._ema._decay, 0.2)
 def testBuildRMSPropOptimizer(self):
     """Checks that an `rms_prop_optimizer` config builds an RMSPropOptimizer."""
     optimizer_text_proto = """
   rms_prop_optimizer: {
     learning_rate: {
       exponential_decay_learning_rate {
         initial_learning_rate: 0.004
         decay_steps: 800720
         decay_factor: 0.95
       }
     }
     momentum_optimizer_value: 0.9
     decay: 0.9
     epsilon: 1.0
   }
   use_moving_average: false
 """
     optimizer_proto = optimizer_pb2.Optimizer()
     text_format.Merge(optimizer_text_proto, optimizer_proto)
     # build() returns a pair here; only the optimizer is inspected.
     optimizer, _ = optimizer_builder.build(optimizer_proto)
     # assertIsInstance reports the actual type on failure, unlike
     # assertTrue(isinstance(...)) which only says "False is not true".
     self.assertIsInstance(optimizer, tf.train.RMSPropOptimizer)
コード例 #18
0
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               checkpoint_max_to_keep=7,
               **kwargs):
    """Trains a model using eager + functions.

    This method:
      1. Processes the pipeline configs
      2. (Optionally) saves the as-run config
      3. Builds the model & optimizer
      4. Gets the training input data
      5. Loads a fine-tuning detection or classification checkpoint if
         `hparams.load_pretrained` is truthy and the train config names one
      6. Loops over the train data, executing distributed training steps
         inside tf.functions.
      7. Checkpoints the model every `checkpoint_every_n` training steps.
      8. Logs the training metrics as TensorBoard summaries.

    Args:
      hparams: A `HParams`. This function reads `hparams.load_pretrained` and
        looks up `export_to_tpu` via `hparams.get`.
      pipeline_config_path: A path to a pipeline config file.
      model_dir:
        The directory to save checkpoints and summaries to.
      config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
        override the config from `pipeline_config_path`.
      train_steps: Number of training steps. If None, the number of training
        steps is taken from the `TrainConfig` proto when its `num_steps` is
        non-zero.
      use_tpu: Boolean, whether training and evaluation should run on TPU.
        Here it gates bfloat16 and selects the number of steps per iteration.
      save_final_config: Whether to save final config (obtained after applying
        overrides) to `model_dir`.
      export_to_tpu: When use_tpu and export_to_tpu are true,
        `export_savedmodel()` exports a metagraph for serving on TPU besides
        the one on CPU. If export_to_tpu is not provided, we will look for it
        in hparams too. Within this function the value is only logged.
      checkpoint_every_n:
        Checkpoint every n training steps.
      checkpoint_max_to_keep:
        int, the number of most recent checkpoints to keep in the model
        directory.
      **kwargs: Additional keyword arguments for configuration override.
    """
    # Parse the configs
    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
        'get_configs_from_pipeline_file']
    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = get_configs_from_pipeline_file(pipeline_config_path,
                                             config_override=config_override)
    # Forward the step count and the bfloat16 flag to the config merger.
    # bfloat16 is enabled only when the train config asks for it AND use_tpu
    # is set.
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = merge_external_params_with_configs(configs,
                                                 hparams,
                                                 kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    # Gradient clipping stays disabled (None) unless a positive norm is
    # configured.
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    # NOTE(review): if neither the argument nor train_config.num_steps gives a
    # value, train_steps stays None and the range() call in the training loop
    # below would fail -- confirm callers always supply one.
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    # kwargs['use_bfloat16'] was set above from the train config and use_tpu.
    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(
            'mixed_bfloat16')

    # Parse the checkpoint fine tuning configs
    # The configured fine-tune checkpoint is ignored unless
    # hparams.load_pretrained is truthy.
    if hparams.load_pretrained:
        fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
    else:
        fine_tune_checkpoint_path = None
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    # utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, set train_config.fine_tune_checkpoint_type
        # based on train_config.from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
        else:
            train_config.fine_tune_checkpoint_type = 'classification'
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
    fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = create_pipeline_proto_from_configs(configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # Build the model, optimizer, and training input
    # get_strategy() returns the distribution strategy that is current when
    # train_loop is called; this function does not create one itself.
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        def train_dataset_fn(input_context):
            """Callable to create train input."""
            # Create the inputs.
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=detection_model,
                input_context=input_context)
            # Repeat the dataset; the training loop below decides when to stop
            # based on the step count.
            train_input = train_input.repeat()
            return train_input

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.compat.v2.dtypes.int64,
            name='global_step',
            aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
        # build() returns the optimizer plus a one-element sequence holding
        # the learning rate, which is unpacked here.
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Normalize learning_rate to a zero-argument callable so that
        # train_step_fn can always call learning_rate_fn().
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # Train the model
    # Get the appropriate filepath (temporary or not) based on whether the worker
    # is the chief.
    summary_writer_filepath = _get_filepath(strategy,
                                            os.path.join(model_dir, 'train'))
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)

    # Number of training steps executed by each _dist_train_step call.
    if use_tpu:
        num_steps_per_iteration = 100
    else:
        # TODO(b/135933080) Explore setting to 100 when GPU performance issues
        # are fixed.
        num_steps_per_iteration = 1

    with summary_writer.as_default():
        with strategy.scope():
            # Summaries are recorded only when global_step is a multiple of
            # num_steps_per_iteration.
            with tf.compat.v2.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # Load a fine-tuning checkpoint.
                if fine_tune_checkpoint_path:
                    load_fine_tune_checkpoint(
                        detection_model, fine_tune_checkpoint_path,
                        fine_tune_checkpoint_type,
                        fine_tune_checkpoint_version,
                        load_all_detection_checkpoint_vars, train_input,
                        unpad_groundtruth_tensors)

                # The training checkpoint tracks the step counter, the model
                # and the optimizer.
                ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                     model=detection_model,
                                                     optimizer=optimizer)

                manager_dir = _get_filepath(strategy, model_dir)
                # When the strategy reports that this worker should not
                # checkpoint, keep only one checkpoint in manager_dir
                # (presumably a temporary directory that is cleaned up at the
                # end -- verify against _get_filepath).
                if not strategy.extended.should_checkpoint:
                    checkpoint_max_to_keep = 1
                manager = tf.compat.v2.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)

                # We use the following instead of manager.latest_checkpoint because
                # manager_dir does not point to the model directory when we are running
                # in a worker.
                latest_checkpoint = tf.train.latest_checkpoint(model_dir)
                ckpt.restore(latest_checkpoint)

                def train_step_fn(features, labels):
                    """Single train step."""
                    loss = eager_train_step(
                        detection_model,
                        features,
                        labels,
                        unpad_groundtruth_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradients_value=clip_gradients_value,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync)
                    # The step counter is advanced here, once per train step.
                    global_step.assign_add(1)
                    return loss

                # Pulls one batch from the iterator, runs train_step_fn on it
                # through the strategy and returns the per-replica losses
                # summed together.
                def _sample_and_train(strategy, train_step_fn, data_iterator):
                    features, labels = data_iterator.next()
                    per_replica_losses = strategy.run(train_step_fn,
                                                      args=(features, labels))
                    # TODO(anjalisridhar): explore if it is safe to remove the
                    # num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_losses,
                                           axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""

                    # Run all but the last step discarding the loss; only the
                    # loss of the final step of the iteration is returned.
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            _sample_and_train(strategy, train_step_fn,
                                              data_iterator)

                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)
                # Baselines for deciding when to checkpoint and when to log;
                # both start at the current (possibly restored) global step.
                checkpointed_step = int(global_step.value())
                logged_step = global_step.value()

                last_step_time = time.time()
                # Start from the current global step; every iteration runs
                # num_steps_per_iteration train steps.
                for _ in range(global_step.value(), train_steps,
                               num_steps_per_iteration):

                    loss = _dist_train_step(train_input_iter)

                    # Wall-clock time of the whole iteration, measured from
                    # the end of the previous one.
                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()

                    tf.compat.v2.summary.scalar('steps_per_sec',
                                                num_steps_per_iteration * 1.0 /
                                                time_taken,
                                                step=global_step)

                    # Log once at least 100 steps have passed since the last
                    # log line.
                    if global_step.value() - logged_step >= 100:
                        tf.logging.info(
                            'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                                global_step.value(),
                                time_taken / num_steps_per_iteration, loss))
                        logged_step = global_step.value()

                    # Save once at least checkpoint_every_n steps have passed
                    # since the last save.
                    # NOTE(review): no save happens after the loop ends, so
                    # steps after the last periodic save are not checkpointed.
                    if ((int(global_step.value()) - checkpointed_step) >=
                            checkpoint_every_n):
                        manager.save()
                        checkpointed_step = int(global_step.value())

    # Remove the checkpoint directories of the non-chief workers that
    # MultiWorkerMirroredStrategy forces us to save during sync distributed
    # training.
    _clean_temporary_directories(strategy, manager_dir)
    _clean_temporary_directories(strategy, summary_writer_filepath)
コード例 #19
0
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
    """Trains a model using eager + functions.

    This method:
      1. Processes the pipeline configs
      2. (Optionally) saves the as-run config
      3. Builds the model & optimizer
      4. Gets the training input data
      5. Loads a fine-tuning detection or classification checkpoint if
         `hparams.load_pretrained` is truthy and the train config names one
      6. Loops over the train data, executing distributed training steps
         inside tf.functions.
      7. Checkpoints the model every `checkpoint_every_n` training steps.
      8. Logs the training metrics as TensorBoard summaries.

    Args:
      hparams: A `HParams`. This function reads `hparams.load_pretrained` and
        looks up `export_to_tpu` via `hparams.get`.
      pipeline_config_path: A path to a pipeline config file.
      model_dir:
        The directory to save checkpoints and summaries to.
      config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
        override the config from `pipeline_config_path`.
      train_steps: Number of training steps. If None, the number of training
        steps is taken from the `TrainConfig` proto when its `num_steps` is
        non-zero.
      use_tpu: Boolean, whether training and evaluation should run on TPU.
        Here it gates bfloat16, is forwarded to `eager_train_step`, and
        disables the `steps_per_sec` summary.
      save_final_config: Whether to save final config (obtained after applying
        overrides) to `model_dir`.
      export_to_tpu: When use_tpu and export_to_tpu are true,
        `export_savedmodel()` exports a metagraph for serving on TPU besides
        the one on CPU. If export_to_tpu is not provided, we will look for it
        in hparams too. Within this function the value is only logged.
      checkpoint_every_n:
        Checkpoint every n training steps.
      **kwargs: Additional keyword arguments for configuration override.
    """
    # Parse the configs
    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
        'get_configs_from_pipeline_file']
    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = get_configs_from_pipeline_file(pipeline_config_path,
                                             config_override=config_override)
    # Forward the step count and the bfloat16 flag to the config merger.
    # bfloat16 is enabled only when the train config asks for it AND use_tpu
    # is set.
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = merge_external_params_with_configs(configs,
                                                 hparams,
                                                 kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    # Gradient clipping stays disabled (None) unless a positive norm is
    # configured.
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    # NOTE(review): if neither the argument nor train_config.num_steps gives a
    # value, train_steps stays None and range(train_steps) below would fail --
    # confirm callers always supply one.
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    # kwargs['use_bfloat16'] was set above from the train config and use_tpu.
    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(
            'mixed_bfloat16')

    # Parse the checkpoint fine tuning configs
    # The configured fine-tune checkpoint is ignored unless
    # hparams.load_pretrained is truthy.
    if hparams.load_pretrained:
        fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
    else:
        fine_tune_checkpoint_path = None
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    # utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, set train_config.fine_tune_checkpoint_type
        # based on train_config.from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
        else:
            train_config.fine_tune_checkpoint_type = 'classification'
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = create_pipeline_proto_from_configs(configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # Build the model, optimizer, and training input
    # get_strategy() returns the distribution strategy that is current when
    # train_loop is called; this function does not create one itself.
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        # Create the inputs.
        train_input = inputs.train_input(train_config=train_config,
                                         train_input_config=train_input_config,
                                         model_config=model_config,
                                         model=detection_model)

        # Repeat the dataset and hand it to the strategy for distribution;
        # the loop below stops after train_steps iterations.
        train_input = strategy.experimental_distribute_dataset(
            train_input.repeat())

        global_step = tf.compat.v2.Variable(0,
                                            trainable=False,
                                            dtype=tf.compat.v2.dtypes.int64,
                                            name='global_step')
        # build() returns the optimizer plus a one-element sequence holding
        # the learning rate, which is unpacked here.
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Normalize learning_rate to a zero-argument callable so that
        # train_step_fn can always call learning_rate_fn().
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # Train the model
    summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                             '/train')
    with summary_writer.as_default():
        with strategy.scope():
            # Load a fine-tuning checkpoint.
            if fine_tune_checkpoint_path:
                load_fine_tune_checkpoint(detection_model,
                                          fine_tune_checkpoint_path,
                                          fine_tune_checkpoint_type,
                                          load_all_detection_checkpoint_vars,
                                          train_input,
                                          unpad_groundtruth_tensors)

            # NOTE: only the step counter and the model are tracked here; the
            # optimizer is not part of this checkpoint.
            ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                 model=detection_model)
            manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                           model_dir,
                                                           max_to_keep=7)

            # Restoring from an existing checkpoint is currently disabled, so
            # global_step always starts at 0 in this function.
            # Maybe re-enable checkpoint restoration depending on how it works:
            # ckpt.restore(manager.latest_checkpoint)

            # Runs one train step on a single batch and returns whatever
            # eager_train_step returns (used as the loss below).
            def train_step_fn(features, labels):
                return eager_train_step(
                    detection_model,
                    features,
                    labels,
                    unpad_groundtruth_tensors,
                    optimizer,
                    learning_rate=learning_rate_fn(),
                    add_regularization_loss=add_regularization_loss,
                    clip_gradients_value=clip_gradients_value,
                    use_tpu=use_tpu,
                    global_step=global_step,
                    num_replicas=strategy.num_replicas_in_sync)

            @tf.function
            def _dist_train_step(data_iterator):
                """A distributed train step."""
                features, labels = data_iterator.next()
                per_replica_losses = strategy.experimental_run_v2(
                    train_step_fn, args=(
                        features,
                        labels,
                    ))
                # TODO(anjalisridhar): explore if it is safe to remove the
                # num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                # Despite its name, mean_loss is the SUM over replicas.
                mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)
                return mean_loss

            train_input_iter = iter(train_input)
            for _ in range(train_steps):
                start_time = time.time()

                loss = _dist_train_step(train_input_iter)
                # The step counter is advanced here, in the Python loop,
                # after each distributed step.
                global_step.assign_add(1)
                end_time = time.time()
                # The steps_per_sec summary is skipped when running on TPU.
                if not use_tpu:
                    tf.compat.v2.summary.scalar('steps_per_sec',
                                                1.0 / (end_time - start_time),
                                                step=global_step)
                # TODO(kaftan): Remove this print after it is no longer helpful for
                # debugging.
                print('Finished step', global_step, end_time, loss)
                # Save whenever the step count is a multiple of
                # checkpoint_every_n.
                if int(global_step.value().numpy()) % checkpoint_every_n == 0:
                    manager.save()
コード例 #20
0
ファイル: trainer.py プロジェクト: ymlsam/deepgaau-detector
def train_loop(
        config_path: str,
        model_dir: str,
        config_override: Optional[pipeline_pb2.TrainEvalPipelineConfig] = None,
        train_steps: Optional[int] = None,
        use_tpu: bool = False,
        save_final_config: bool = False,
        log_every_n: int = 100,
        ckpt_every_n: int = 1000,
        ckpt_max_to_keep: int = 7,
        record_summaries: bool = True,
        **kwargs) -> None:
    """Train a detection model with eager execution and tf.functions.

    Steps performed, in order:
    1. Read the pipeline config and merge the external overrides into it.
    2. Optionally write the resulting (as-run) config to `model_dir`.
    3. Build the model, the distributed training input and the optimizer.
    4. Resume from the latest checkpoint found in `model_dir`; if there is
       none, load the configured fine-tune (base) checkpoint, if any.
    5. Run distributed train steps inside a tf.function until the global step
       reaches `train_config.num_steps`.
    6. Save a checkpoint once at least `ckpt_every_n` steps have passed since
       the previous save, and log the loss on the `log_every_n` cadence.
    7. Write a `steps_per_sec` scalar summary after every iteration.

    Args:
        config_path: Path to the pipeline config file.
        model_dir: Directory receiving checkpoints and summaries.
        config_override: Optional pipeline_pb2.TrainEvalPipelineConfig text
            proto overriding the config read from `config_path`.
        train_steps: Number of training steps, forwarded to the config merge
            as the `train_steps` override.
        use_tpu: Whether training runs on TPU. Enables bfloat16 (when the
            train config asks for it) and 100 steps per tf.function call.
        save_final_config: Whether to save the merged config to `model_dir`.
        log_every_n: Minimum number of steps between two loss log lines.
        ckpt_every_n: Minimum number of steps between two checkpoints.
        ckpt_max_to_keep: Number of most recent checkpoints to retain; forced
            to 1 when the strategy reports it should not checkpoint.
        record_summaries: Whether to write summaries or use a no-op writer.
        **kwargs: Extra keyword arguments passed to the config merge.

    Raises:
        ValueError: If `train_config.load_all_detection_checkpoint_vars` is
            set.
    """
    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------
    pipeline_cfgs = config_util.get_configs_from_pipeline_file(
        config_path, config_override=config_override)
    kwargs['train_steps'] = train_steps
    kwargs['use_bfloat16'] = (
        pipeline_cfgs['train_config'].use_bfloat16 and use_tpu)
    pipeline_cfgs = config_util.merge_external_params_with_configs(
        pipeline_cfgs, None, kwargs_dict=kwargs)

    model_cfg = pipeline_cfgs['model']
    train_cfg = pipeline_cfgs['train_config']
    input_cfg = pipeline_cfgs['train_input_config']

    unpad_gt_tensors = train_cfg.unpad_groundtruth_tensors
    add_regularization_loss = train_cfg.add_regularization_loss
    # Gradient clipping is only active for a strictly positive norm.
    configured_norm = train_cfg.gradient_clipping_by_norm
    clip_gradient_norm = configured_norm if configured_norm > 0 else None

    if kwargs['use_bfloat16']:
        tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

    if train_cfg.load_all_detection_checkpoint_vars:
        raise ValueError(
            'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2')

    # Settings of the base checkpoint to fine-tune from.
    config_util.update_fine_tune_checkpoint_type(train_cfg)
    base_ckpt = train_cfg.fine_tune_checkpoint
    base_ckpt_type = train_cfg.fine_tune_checkpoint_type
    base_ckpt_ver = train_cfg.fine_tune_checkpoint_version

    # Persist the as-run pipeline config.
    if save_final_config:
        config_util.save_pipeline_config(
            config_util.create_pipeline_proto_from_configs(pipeline_cfgs),
            model_dir)

    # ------------------------------------------------------------------
    # Model, input pipeline and optimizer (created under the strategy)
    # ------------------------------------------------------------------
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        model = model_builder.build(model_config=model_cfg, is_training=True)

        def train_dataset_fn(
                input_context: tf.distribute.InputContext) -> tf.data.Dataset:
            """Build the repeated training dataset for one input context."""
            dataset = inputs.train_input(
                train_config=train_cfg,
                train_input_config=input_cfg,
                model_config=model_cfg,
                model=model,
                input_context=input_context,
            )
            return dataset.repeat()

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.int64,
            name='global_step',
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        )
        optimizer, learning_rates = optimizer_builder.build(
            train_cfg.optimizer, global_step=global_step)
        (learning_rate, ) = learning_rates

        # Always expose the learning rate as a zero-argument callable.
        learning_rate_fn = (learning_rate if callable(learning_rate) else
                            (lambda: learning_rate))

    # ------------------------------------------------------------------
    # Summary writer and iteration size
    # ------------------------------------------------------------------
    # get_filepath decides the actual location based on the strategy.
    summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train'))
    summary_writer = (tf.summary.create_file_writer(summary_log_path)
                      if record_summaries else
                      tf.summary.create_noop_writer())
    num_steps_per_iteration = 100 if use_tpu else 1

    def _record_this_step():
        """Summaries are recorded only on iteration boundaries."""
        return global_step % num_steps_per_iteration == 0

    with summary_writer.as_default(), strategy.scope(), \
            tf.summary.record_if(_record_this_step):
        # Checkpoint bookkeeping. The latest checkpoint is looked up in
        # model_dir directly rather than through the manager, because
        # manager_dir comes from get_filepath and may differ from model_dir.
        checkpoint = tf.train.Checkpoint(model=model,
                                         step=global_step,
                                         optimizer=optimizer)
        if not strategy.extended.should_checkpoint:
            ckpt_max_to_keep = 1
        manager_dir = get_filepath(strategy, model_dir)
        ckpt_manager = tf.train.CheckpointManager(
            checkpoint, manager_dir, max_to_keep=ckpt_max_to_keep)
        resume_from = tf.train.latest_checkpoint(model_dir)

        if resume_from:
            # Continue an interrupted run.
            checkpoint.restore(resume_from).expect_partial()
        elif base_ckpt:
            # Fresh run: start from the pre-trained checkpoint.
            load_base_ckpt(model, base_ckpt, base_ckpt_type, base_ckpt_ver,
                           train_input, unpad_gt_tensors)

        train_vars = get_train_vars(model, train_cfg)

        def train_step_fn(features: Dict, labels: Dict):
            """Run one train step on one replica and bump the global step."""
            step_loss = eager_train_step(
                model,
                train_vars,
                features,
                labels,
                unpad_gt_tensors,
                optimizer,
                learning_rate=learning_rate_fn(),
                add_regularization_loss=add_regularization_loss,
                clip_gradient_norm=clip_gradient_norm,
                global_step=global_step,
                num_replicas=strategy.num_replicas_in_sync,
            )
            global_step.assign_add(1)

            return step_loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
            """Fetch the next batch, train on it, and sum replica losses."""
            features, labels = data_iterator.next()
            replica_losses = strategy.run(train_step_fn,
                                          args=(features, labels))

            return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                   replica_losses,
                                   axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
            """Run one iteration; only the last step's loss is returned."""
            if num_steps_per_iteration > 1:
                for _ in tf.range(num_steps_per_iteration - 1):
                    with tf.name_scope(''):
                        _sample_and_train(strategy, train_step_fn,
                                          data_iterator)

            return _sample_and_train(strategy, train_step_fn,
                                     data_iterator)

        batches = iter(train_input)

        # A brand-new run gets a checkpoint of the initial state.
        if int(global_step.value()) == 0:
            ckpt_manager.save()

        ckpt_step = int(global_step.value())
        logged_step = global_step.value()

        # ------------------------------------------------------------------
        # Training loop
        # ------------------------------------------------------------------
        tick = time.time()
        for _ in range(global_step.value(), train_cfg.num_steps,
                       num_steps_per_iteration):
            loss = _dist_train_step(batches)

            step_now = global_step.value()
            elapsed = time.time() - tick
            tick = time.time()

            tf.summary.scalar(
                'steps_per_sec',
                num_steps_per_iteration * 1.0 / elapsed,
                step=global_step,
            )

            if step_now - logged_step >= log_every_n:
                v1.logging.info(
                    'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                        step_now, elapsed / num_steps_per_iteration, loss))
                logged_step = step_now

            if (step_now - ckpt_step) >= ckpt_every_n:
                ckpt_manager.save()
                ckpt_step = step_now

    # Remove the checkpoint/summary directories of non-chief workers that
    # MultiWorkerMirroredStrategy forces us to write during synchronous
    # distributed training.
    clean_temporary_directories(strategy, manager_dir)
    clean_temporary_directories(strategy, summary_log_path)
コード例 #21
0
ファイル: model.py プロジェクト: Toyben/models
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    This is a closure: `detection_model_fn`, `train_config`, `eval_config`,
    `eval_input_config`, `use_tpu` and `hparams` are read from the enclosing
    scope rather than passed in.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations. When `use_tpu` is set a `TPUEstimatorSpec` is
        returned instead.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    # Groundtruth is unpadded only during training, and only if configured.
    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      labels = unstack_batch(labels, unpad_groundtruth_tensors=False)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      # Masks and keypoints are optional groundtruth fields.
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list)

    preprocessed_images = features[fields.InputDataFields.image]
    prediction_dict = detection_model.predict(
        preprocessed_images, features[fields.InputDataFields.true_image_shape])
    detections = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])

    if mode == tf.estimator.ModeKeys.TRAIN:
      if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, sets finetune_checkpoint_type based on
        # from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
          train_config.fine_tune_checkpoint_type = 'detection'
        else:
          train_config.fine_tune_checkpoint_type = 'classification'
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        # fine_tune_checkpoint_type is guaranteed to be set at this point by
        # the defaulting above, so it is not re-derived here.
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map, train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:
          # On TPU the checkpoint initialization is deferred into the
          # scaffold function handed to the TPUEstimatorSpec.
          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()
          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      # `.values()` works on both Python 2 and 3; `.itervalues()` exists only
      # on Python 2 dicts.
      losses = list(losses_dict.values())
      if train_config.add_regularization_loss:
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        if regularization_losses:
          regularization_loss = tf.add_n(regularization_losses,
                                         name='regularization_loss')
          losses.append(regularization_loss)
          if not use_tpu:
            tf.summary.scalar('regularization_loss', regularization_loss)
      total_loss = tf.add_n(losses, name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

      if use_tpu:
        training_optimizer = tpu_optimizer.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      if train_config.freeze_variables:
        trainable_variables = tf.contrib.framework.filter_variables(
            tf.trainable_variables(),
            exclude_patterns=train_config.freeze_variables)

      # Gradient clipping is only enabled for a strictly positive norm.
      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      # An empty list is passed as `summaries` on TPU; None otherwise.
      summaries = [] if use_tpu else None
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(detections)
      }

    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
      # Detection summaries during eval.
      class_agnostic = (fields.DetectionResultFields.detection_classes
                        not in detections)
      groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
      use_original_images = fields.InputDataFields.original_image in features
      eval_images = (
          features[fields.InputDataFields.original_image] if use_original_images
          else features[fields.InputDataFields.image])
      # Only the first example of the batch is evaluated/visualized.
      eval_dict = eval_util.result_dict_for_single_example(
          eval_images[0:1],
          features[inputs.HASH_KEY][0],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=False)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      if not use_tpu and use_original_images:
        detection_and_groundtruth = (
            vis_utils.draw_side_by_side_evaluation_image(
                eval_dict, category_index, max_boxes_to_draw=20,
                min_score_thresh=0.2))
        tf.summary.image('Detections_Left_Groundtruth_Right',
                         detection_and_groundtruth)

      # Eval metrics on a single image.
      eval_metrics = eval_config.metrics_set
      if not eval_metrics:
        eval_metrics = ['coco_detection_metrics']
      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
          eval_metrics, category_index.values(), eval_dict,
          include_metrics_per_category=False)

    if use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs)
def train_loop(pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               checkpoint_every_n=1000,
               checkpoint_max_to_keep=7,
               record_summaries=True,
               performance_summary_exporter=None,
               num_steps_per_iteration=NUM_STEPS_PER_ITERATION,
               **kwargs):
    """Trains a model using eager + functions.

    This method:
      1. Processes the pipeline configs.
      2. Builds the model, the distributed training input and the optimizer.
      3. Loads a fine-tuning checkpoint if one is configured, then restores
         the latest checkpoint found in `model_dir`.
      4. Loops over the train data, executing distributed training steps
         inside tf.functions.
      5. Checkpoints the model once at least `checkpoint_every_n` steps have
         passed since the previous checkpoint.
      6. Writes a `steps_per_sec` scalar summary after every iteration.

    Args:
      pipeline_config_path: A path to a pipeline config file.
      model_dir: The directory to save checkpoints and summaries to.
      config_override: Optional override forwarded to
        `config_util.get_configs_from_pipeline_file`.
      train_steps: Number of training steps, forwarded to the config merge
        as the `train_steps` override. See the NOTE(review) in the body about
        how the loop bound is currently derived.
      use_tpu: Boolean; combined with `train_config.use_bfloat16` to compute
        the `use_bfloat16` override.
      save_final_config: Accepted for interface compatibility; not used by
        this implementation.
      checkpoint_every_n: Minimum number of steps between two checkpoints.
      checkpoint_max_to_keep: Number of most recent checkpoints to keep;
        forced to 1 when the strategy reports it should not checkpoint.
      record_summaries: Boolean, whether to write summaries or use a no-op
        writer.
      performance_summary_exporter: Accepted for interface compatibility; not
        used by this implementation.
      num_steps_per_iteration: Number of train steps executed per call of the
        compiled distributed step function.
      **kwargs: Additional keyword arguments for configuration override.

    Raises:
      ValueError: If `train_config.load_all_detection_checkpoint_vars` is set.
    """
    # The caller's `config_override` is passed through unchanged (it used to
    # be reset to None here, which silently discarded it).
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path, config_override=config_override)
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']
    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    # Gradient clipping is only enabled for a strictly positive norm.
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    # NOTE(review): `num_train_steps` is not defined in this function, so the
    # `train_steps` argument is overwritten by what is presumably a
    # module-level name. Confirm it exists and that this is intended.
    train_steps = num_train_steps
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # NOTE(review): the bfloat16 policy is set unconditionally, although a
    # `use_bfloat16` flag is computed above. Confirm this is intended for
    # non-TPU runs.
    tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    if train_config.load_all_detection_checkpoint_vars:
        raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
                         'unsupported in TF2')

    config_util.update_fine_tune_checkpoint_type(train_config)
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
    fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

    # Build the model, optimizer, and training input
    strategy = tf.compat.v2.distribute.get_strategy()
    from object_detection import inputs
    from object_detection.builders import optimizer_builder
    from object_detection.utils import variables_helper
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        def train_dataset_fn(input_context):
            """Callable to create train input."""
            # Create the inputs.
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=detection_model,
                input_context=input_context)
            train_input = train_input.repeat()
            return train_input

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)
        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.compat.v2.dtypes.int64,
            name='global_step',
            aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Always expose the learning rate as a zero-argument callable.
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:

            def learning_rate_fn():
                return learning_rate

    # Train the model
    # Get the appropriate filepath (temporary or not) based on whether the worker
    # is the chief.
    summary_writer_filepath = get_filepath(strategy,
                                           os.path.join(model_dir, 'train'))
    if record_summaries:
        summary_writer = tf.compat.v2.summary.create_file_writer(
            summary_writer_filepath)
    else:
        summary_writer = tf.summary.create_noop_writer()

    with summary_writer.as_default():
        with strategy.scope():
            with tf.compat.v2.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # Load a fine-tuning checkpoint.
                if train_config.fine_tune_checkpoint:
                    variables_helper.ensure_checkpoint_supported(
                        train_config.fine_tune_checkpoint,
                        fine_tune_checkpoint_type, model_dir)
                    load_fine_tune_checkpoint(
                        detection_model,
                        train_config.fine_tune_checkpoint,
                        fine_tune_checkpoint_type,
                        fine_tune_checkpoint_version,
                        train_config.run_fine_tune_checkpoint_dummy_computation,
                        train_input,
                        unpad_groundtruth_tensors)

                ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                     model=detection_model,
                                                     optimizer=optimizer)

                manager_dir = get_filepath(strategy, model_dir)
                if not strategy.extended.should_checkpoint:
                    checkpoint_max_to_keep = 1
                manager = tf.compat.v2.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)

                # We use the following instead of manager.latest_checkpoint because
                # manager_dir does not point to the model directory when we are running
                # in a worker.
                latest_checkpoint = tf.train.latest_checkpoint(model_dir)
                ckpt.restore(latest_checkpoint)

                def train_step_fn(features, labels):
                    """Single train step; returns the loss of this replica."""
                    loss = eager_train_step(
                        detection_model,
                        features,
                        labels,
                        unpad_groundtruth_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradients_value=clip_gradients_value,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync)
                    # The outer loop reads global_step to decide when to log
                    # and checkpoint, so it has to advance once per executed
                    # step.
                    global_step.assign_add(1)
                    # The loss must be returned: it is reduced across
                    # replicas below and formatted into the log line.
                    return loss

                def _sample_and_train(strategy, train_step_fn, data_iterator):
                    """Fetch the next batch, train on it, sum replica losses."""
                    features, labels = data_iterator.next()
                    # Older TF releases only provide `experimental_run_v2`.
                    if hasattr(tf.distribute.Strategy, 'run'):
                        per_replica_losses = strategy.run(train_step_fn,
                                                          args=(features,
                                                                labels))
                    else:
                        per_replica_losses = strategy.experimental_run_v2(
                            train_step_fn, args=(features, labels))
                    # TODO(anjalisridhar): explore if it is safe to remove the
                    # num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_losses,
                                           axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""

                    # Run all but the last step inside a tf.range loop; only
                    # the loss of the final step is returned.
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            # Following suggestion on yaqs/5402607292645376
                            with tf.name_scope(''):
                                _sample_and_train(strategy, train_step_fn,
                                                  data_iterator)

                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)

                # A brand-new run gets a checkpoint of the initial state.
                if int(global_step.value()) == 0:
                    manager.save()

                checkpointed_step = int(global_step.value())
                logged_step = global_step.value()

                last_step_time = time.time()
                for _ in range(global_step.value(), train_steps,
                               num_steps_per_iteration):

                    loss = _dist_train_step(train_input_iter)

                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()
                    steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

                    tf.compat.v2.summary.scalar('steps_per_sec',
                                                steps_per_sec,
                                                step=global_step)

                    # NOTE(review): `steps_per_sec_list` is not defined in
                    # this function; presumably a module-level list. Confirm
                    # it exists.
                    steps_per_sec_list.append(steps_per_sec)

                    # Log the loss once at least 100 steps have passed since
                    # the previous log line.
                    if global_step.value() - logged_step >= 100:
                        tf.logging.info(
                            'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                                global_step.value(),
                                time_taken / num_steps_per_iteration, loss))
                        logged_step = global_step.value()

                    if ((int(global_step.value()) - checkpointed_step) >=
                            checkpoint_every_n):
                        manager.save()
                        checkpointed_step = int(global_step.value())
コード例 #23
0
def train_loop(
    pipeline_config_path,
    model_dir,
    val_checkpoint_dir,
    config_override=None,
    train_steps=None,
    use_tpu=False,
    save_final_config=False,
    checkpoint_every_n=1000,
    checkpoint_max_to_keep=7,
    record_summaries=True,
    performance_summary_exporter=None,
    **kwargs):
  """Trains a model using eager + functions, evaluating periodically.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Every 100 training steps: logs progress, saves a checkpoint to
       `model_dir`, calls `eval_continuously` on `model_dir`, and passes the
       resulting 'Loss/total_loss' to the model-checkpoint and early-stopping
       callbacks. Training stops early when the early-stopping callback or the
       cancellation point asks for it.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    model_dir:
      The directory to save checkpoints and summaries to.
    val_checkpoint_dir:
      The directory backing the checkpoint manager that is handed to
      `tfc.ModelCheckpoint`.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training steps
      is set from the `TrainConfig` proto (when that value is non-zero).
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    checkpoint_every_n:
      Currently unused. Checkpoints are written together with the
      logging/evaluation block, i.e. every 100 training steps.
    checkpoint_max_to_keep:
      Currently overridden to 1 inside this function for both checkpoint
      managers; the passed value has no effect.
    record_summaries: Boolean, whether or not to record summaries.
    performance_summary_exporter: function for exporting performance metrics.
      It is called with a metrics dict and a precision tag ('bf16' or 'fp32'),
      and only if at least one training iteration was executed.
    **kwargs: Additional keyword arguments for configuration override.

  Raises:
    ValueError: If `train_config.load_all_detection_checkpoint_vars` is set.
  """

  print('START train looop function ========================')

  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']
  steps_per_sec_list = []

  configs = get_configs_from_pipeline_file(
      pipeline_config_path, config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # update train_steps from config but only when non-zero value is provided
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

  if train_config.load_all_detection_checkpoint_vars:
    raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
                     'unsupported in TF2')

  config_util.update_fine_tune_checkpoint_type(train_config)
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
  fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

  # Write the as-run pipeline config to disk.
  if save_final_config:
    tf.logging.info('Saving pipeline config file to directory {}'.format(
        model_dir))
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # Build the model, optimizer, and training input
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'](
        model_config=model_config, is_training=True)

    def train_dataset_fn(input_context):
      """Callable to create train input."""
      # Create the inputs.
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=detection_model,
          input_context=input_context)
      train_input = train_input.repeat()
      return train_input

    train_input = strategy.experimental_distribute_datasets_from_function(
        train_dataset_fn)

    global_step = tf.Variable(
        0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step',
        aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    # We run the detection_model on dummy inputs in order to ensure that the
    # model and all its variables have been properly constructed. Specifically,
    # this is currently necessary prior to (potentially) creating shadow copies
    # of the model variables for the EMA optimizer.
    if train_config.optimizer.use_moving_average:
      _ensure_model_is_built(detection_model, train_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    # Normalise the learning rate to a zero-argument callable so the train
    # step can always call it.
    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  # Get the appropriate filepath (temporary or not) based on whether the worker
  # is the chief.
  summary_writer_filepath = get_filepath(strategy,
                                         os.path.join(model_dir, 'train'))
  if record_summaries:
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)
  else:
    summary_writer = tf2.summary.create_noop_writer()

  if use_tpu:
    num_steps_per_iteration = 100
  else:
    # TODO(b/135933080) Explore setting to 100 when GPU performance issues
    # are fixed.
    num_steps_per_iteration = 1

  with summary_writer.as_default():
    with strategy.scope():
      with tf.compat.v2.summary.record_if(
          lambda: global_step % num_steps_per_iteration == 0):
        # Load a fine-tuning checkpoint.
        if train_config.fine_tune_checkpoint:
          load_fine_tune_checkpoint(
              detection_model, train_config.fine_tune_checkpoint,
              fine_tune_checkpoint_type, fine_tune_checkpoint_version,
              train_config.run_fine_tune_checkpoint_dummy_computation,
              train_input, unpad_groundtruth_tensors)

        # Two checkpoint objects over the same state: one for the regular
        # training checkpoints, one for the manager given to the
        # model-checkpoint callback.
        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)
        val_ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        manager_dir = get_filepath(strategy, model_dir)
        val_manager_dir = get_filepath(strategy, val_checkpoint_dir)

        # NOTE: the `checkpoint_max_to_keep` argument is deliberately left
        # overridden here, as in the original code; only one checkpoint is
        # retained per manager.
        checkpoint_max_to_keep = 1
        manager = tf.compat.v2.train.CheckpointManager(
            ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)
        val_manager = tf.compat.v2.train.CheckpointManager(
            val_ckpt, val_manager_dir, max_to_keep=checkpoint_max_to_keep)

        model_checkpoint_callback = tfc.ModelCheckpoint(val_manager)
        early_stopping_callback = tfc.EarlyStopping(min_delta=0.0001, patience=5, mode='min')
        train_logger_callback = tfc.TrainLogger(model_dir, 'logs.txt')
        cancellation_point = tfc.CancellationPoint()

        # We use the following instead of manager.latest_checkpoint because
        # manager_dir does not point to the model directory when we are running
        # in a worker.
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        ckpt.restore(latest_checkpoint)
        val_ckpt.restore(latest_checkpoint)

        def train_step_fn(features, labels):
          """Single train step."""
          loss = eager_train_step(
              detection_model,
              features,
              labels,
              unpad_groundtruth_tensors,
              optimizer,
              learning_rate=learning_rate_fn(),
              add_regularization_loss=add_regularization_loss,
              clip_gradients_value=clip_gradients_value,
              global_step=global_step,
              num_replicas=strategy.num_replicas_in_sync)
          global_step.assign_add(1)
          return loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
          """Pulls one batch and runs one train step on every replica."""
          features, labels = data_iterator.next()
          # `Strategy.run` is preferred when available; fall back to the
          # older `experimental_run_v2` name otherwise.
          if hasattr(tf.distribute.Strategy, 'run'):
            per_replica_losses = strategy.run(
                train_step_fn, args=(features, labels))
          else:
            per_replica_losses = strategy.experimental_run_v2(
                train_step_fn, args=(features, labels))
          # TODO(anjalisridhar): explore if it is safe to remove the
          ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
          return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses, axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
          """A distributed train step."""

          if num_steps_per_iteration > 1:
            for _ in tf.range(num_steps_per_iteration - 1):
              # Following suggestion on yaqs/5402607292645376
              with tf.name_scope(''):
                _sample_and_train(strategy, train_step_fn, data_iterator)

          return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        # Write an initial checkpoint when starting from scratch.
        if int(global_step.value()) == 0:
          manager.save()

        logged_step = global_step.value()

        # Stays None when the loop below does not execute at all (e.g. the
        # restored global step already reached `train_steps`).
        loss = None

        last_step_time = time.time()
        for epoch, _ in enumerate(range(global_step.value(), train_steps,
                                        num_steps_per_iteration)):

          loss = _dist_train_step(train_input_iter)

          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

          tf.compat.v2.summary.scalar(
              'steps_per_sec', steps_per_sec, step=global_step)

          steps_per_sec_list.append(steps_per_sec)

          if global_step.value() - logged_step >= 100:
            tf.logging.info(
                'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                    global_step.value(), time_taken / num_steps_per_iteration,
                    loss))

            # Save before evaluating: the evaluation below is pointed at
            # `model_dir` as its checkpoint directory (presumably it picks up
            # the checkpoint just written -- confirm against
            # `eval_continuously`).
            manager.save()

            log_metrics = eval_continuously(pipeline_config_path, model_dir=model_dir, checkpoint_dir=model_dir, timeout=20)
            log_metrics['train_total_loss'] = loss

            # Both callbacks are driven by the evaluation total loss.
            model_checkpoint_callback.step(epoch, log_metrics['Loss/total_loss'])
            stop_training = early_stopping_callback.step(epoch, log_metrics['Loss/total_loss'])
            train_logger_callback.log(log_metrics)

            if stop_training or cancellation_point.check():
              break

            print(log_metrics)
            logged_step = global_step.value()

  # Remove the checkpoint directories of the non-chief workers that
  # MultiWorkerMirroredStrategy forces us to save during sync distributed
  # training. `val_manager_dir` is obtained through `get_filepath` exactly like
  # the other two paths, so it needs the same clean-up.
  clean_temporary_directories(strategy, manager_dir)
  clean_temporary_directories(strategy, val_manager_dir)
  clean_temporary_directories(strategy, summary_writer_filepath)
  # TODO(pkanwar): add accuracy metrics.
  if performance_summary_exporter is not None:
    if steps_per_sec_list:
      metrics = {
          'steps_per_sec': np.mean(steps_per_sec_list),
          'steps_per_sec_p50': np.median(steps_per_sec_list),
          'steps_per_sec_max': max(steps_per_sec_list),
          'last_batch_loss': float(loss)
      }
      mixed_precision = 'bf16' if kwargs['use_bfloat16'] else 'fp32'
      performance_summary_exporter(metrics, mixed_precision)
    else:
      # Nothing was trained, so there is no loss or throughput to report;
      # previously this path failed on the unbound `loss` / empty list.
      tf.logging.info(
          'No training steps were executed; skipping performance summary '
          'export.')
コード例 #24
0
ファイル: model_lib.py プロジェクト: zhangjiulong/models
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Depending on `mode` this builds the prediction graph, and additionally the
    losses and train op (TRAIN), the losses and evaluation metric ops (EVAL),
    or the export outputs (PREDICT). Configuration (`train_config`,
    `eval_config`, `eval_input_config`, `configs`, `hparams`, `use_tpu`,
    `postprocess_on_cpu`, `detection_model_fn`) is taken from the enclosing
    scope.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations. A `TPUEstimatorSpec` is returned instead when running
        on TPU in a mode other than EVAL.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Make sure to set the Keras learning phase. True during training,
    # False for inference.
    tf.keras.backend.set_learning_phase(is_training)
    detection_model = detection_model_fn(
        is_training=is_training, add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      # For evaling on train data, it is necessary to check whether groundtruth
      # must be unpadded.
      boxes_shape = (
          labels[fields.InputDataFields.groundtruth_boxes].get_shape()
          .as_list())
      unpad_groundtruth_tensors = boxes_shape[1] is not None and not use_tpu
      labels = unstack_batch(
          labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      # Boxes and classes are always read; every other groundtruth field is
      # optional and passed as None when absent from `labels`.
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      gt_weights_list = None
      if fields.InputDataFields.groundtruth_weights in labels:
        gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
      gt_confidences_list = None
      if fields.InputDataFields.groundtruth_confidences in labels:
        gt_confidences_list = labels[
            fields.InputDataFields.groundtruth_confidences]
      gt_is_crowd_list = None
      if fields.InputDataFields.groundtruth_is_crowd in labels:
        gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_confidences_list=gt_confidences_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list,
          groundtruth_weights_list=gt_weights_list,
          groundtruth_is_crowd_list=gt_is_crowd_list)

    preprocessed_images = features[fields.InputDataFields.image]
    if use_tpu and train_config.use_bfloat16:
      with tf.contrib.tpu.bfloat16_scope():
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape])
        # Cast bfloat16 predictions back to float32 for the code downstream.
        for k, v in prediction_dict.items():
          if v.dtype == tf.bfloat16:
            prediction_dict[k] = tf.cast(v, tf.float32)
    else:
      prediction_dict = detection_model.predict(
          preprocessed_images,
          features[fields.InputDataFields.true_image_shape])

    def postprocess_wrapper(args):
      """Unpacks (prediction_dict, true_image_shapes) and postprocesses."""
      return detection_model.postprocess(args[0], args[1])

    if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
      if use_tpu and postprocess_on_cpu:
        detections = tf.contrib.tpu.outside_compilation(
            postprocess_wrapper,
            (prediction_dict,
             features[fields.InputDataFields.true_image_shape]))
      else:
        detections = postprocess_wrapper((
            prediction_dict,
            features[fields.InputDataFields.true_image_shape]))

    if mode == tf.estimator.ModeKeys.TRAIN:
      # `hparams` may be None; treat that as "do not load pretrained weights"
      # instead of failing on the attribute access.
      load_pretrained = hparams.load_pretrained if hparams else False
      if train_config.fine_tune_checkpoint and load_pretrained:
        if not train_config.fine_tune_checkpoint_type:
          # train_config.from_detection_checkpoint field is deprecated. For
          # backward compatibility, set train_config.fine_tune_checkpoint_type
          # based on train_config.from_detection_checkpoint.
          if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
          else:
            train_config.fine_tune_checkpoint_type = 'classification'
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map,
                train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:
          # On TPU the checkpoint initialisation is deferred into the
          # scaffold function handed to the TPUEstimatorSpec.

          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()

          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      losses = list(losses_dict.values())
      if train_config.add_regularization_loss:
        regularization_losses = detection_model.regularization_losses()
        if regularization_losses:
          regularization_loss = tf.add_n(
              regularization_losses, name='regularization_loss')
          losses.append(regularization_loss)
          losses_dict['Loss/regularization_loss'] = regularization_loss
      total_loss = tf.add_n(losses, name='total_loss')
      losses_dict['Loss/total_loss'] = total_loss

      if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=is_training)
        graph_rewriter_fn()

      # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
      # can write learning rate summaries on TPU without host calls.
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

    if mode == tf.estimator.ModeKeys.TRAIN:
      if use_tpu:
        training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      include_variables = (
          train_config.update_trainable_variables
          if train_config.update_trainable_variables else None)
      exclude_variables = (
          train_config.freeze_variables
          if train_config.freeze_variables else None)
      trainable_variables = tf.contrib.framework.filter_variables(
          tf.trainable_variables(),
          include_patterns=include_variables,
          exclude_patterns=exclude_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      # An empty list is passed on TPU and None otherwise; gradient summaries
      # are requested in either case when the config asks for them.
      summaries = [] if use_tpu else None
      if train_config.summarize_gradients:
        summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          update_ops=detection_model.updates(),
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      exported_output = exporter_lib.add_output_tensor_nodes(detections)
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(exported_output)
      }

    eval_metric_ops = None
    scaffold = None
    if mode == tf.estimator.ModeKeys.EVAL:
      class_agnostic = (
          fields.DetectionResultFields.detection_classes not in detections)
      groundtruth = _prepare_groundtruth_for_eval(
          detection_model, class_agnostic,
          eval_input_config.max_number_of_boxes)
      use_original_images = fields.InputDataFields.original_image in features
      if use_original_images:
        eval_images = features[fields.InputDataFields.original_image]
        true_image_shapes = tf.slice(
            features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3])
        original_image_spatial_shapes = features[fields.InputDataFields
                                                 .original_image_spatial_shape]
      else:
        eval_images = features[fields.InputDataFields.image]
        true_image_shapes = None
        original_image_spatial_shapes = None

      eval_dict = eval_util.result_dict_for_batched_example(
          eval_images,
          features[inputs.HASH_KEY],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=True,
          original_image_spatial_shapes=original_image_spatial_shapes,
          true_image_shapes=true_image_shapes)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      vis_metric_ops = None
      if not use_tpu and use_original_images:
        eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
            category_index,
            max_examples_to_draw=eval_config.num_visualizations,
            max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
            min_score_thresh=eval_config.min_score_threshold,
            use_normalized_coordinates=False)
        vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
            eval_dict)

      # Eval metrics on a single example.
      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
          eval_config, list(category_index.values()), eval_dict)
      for loss_key, loss_tensor in losses_dict.items():
        eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
      for var in optimizer_summary_vars:
        eval_metric_ops[var.op.name] = (var, tf.no_op())
      if vis_metric_ops is not None:
        eval_metric_ops.update(vis_metric_ops)
      # Ensure every metric key is a plain string.
      eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

      if eval_config.use_moving_averages:
        variable_averages = tf.train.ExponentialMovingAverage(0.0)
        variables_to_restore = variable_averages.variables_to_restore()
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            variables_to_restore,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
        scaffold = tf.train.Scaffold(saver=saver)

    # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
    if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      if scaffold is None:
        # No moving-average saver was set up above: build the default sharded
        # saver and register it in the SAVERS collection.
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            sharded=True,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            save_relative_paths=True)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
        scaffold = tf.train.Scaffold(saver=saver)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs,
          scaffold=scaffold)
コード例 #25
0
ファイル: model_lib.py プロジェクト: Asharib90/OCR
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        # Set policy for mixed-precision training with Keras-based models.
        if use_tpu and train_config.use_bfloat16:
            from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
            # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
            base_layer_utils.enable_v2_dtype_behavior()
            tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            provide_groundtruth(detection_model, labels)

        preprocessed_images = features[fields.InputDataFields.image]

        side_inputs = detection_model.get_side_inputs(features)

        if use_tpu and train_config.use_bfloat16:
            with tf.tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape],
                    **side_inputs)
                prediction_dict = ops.bfloat16_to_float32_nested(
                    prediction_dict)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = tf.tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            load_pretrained = hparams.load_pretrained if hparams else False
            if train_config.fine_tune_checkpoint and load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            if (mode == tf.estimator.ModeKeys.EVAL
                    and eval_config.use_dummy_loss_in_eval):
                total_loss = tf.constant(1.0)
                losses_dict = {'Loss/total_loss': total_loss}
            else:
                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if use_tpu and train_config.use_bfloat16:
                        regularization_losses = ops.bfloat16_to_float32_nested(
                            regularization_losses)
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                        losses.append(regularization_loss)
                        losses_dict[
                            'Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = tf.tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = slim.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = slim.optimizers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if fields.InputDataFields.image_additional_channels in features:
                eval_dict[fields.InputDataFields.
                          image_additional_channels] = features[
                              fields.InputDataFields.image_additional_channels]

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                keypoint_edges = [(kp.start, kp.end)
                                  for kp in eval_config.keypoint_edge]

                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False,
                    keypoint_edges=keypoint_edges or None)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
コード例 #26
0
def train(create_model_fn, create_tensor_dict_fn, train_config, train_dir, img_root, video_root):
    """Build a two-tower (two-GPU) training graph and run it with slim.

    A single input queue yields batches of ``train_config.batch_size * 2``
    examples. Each GPU tower receives one contiguous ``batch_size`` slice of
    that batch, builds its loss with ``_create_losses`` and computes
    gradients. Gradients of variables matching two hard-coded regexes are
    multiplied by 0.0, the per-tower gradients are combined with
    ``average_gradients`` and applied by the optimizer. Training itself is
    driven by ``slim.learning.train``.

    Args:
      create_model_fn: zero-argument callable returning a detection model.
        It is called once up front and once more per tower.
      create_tensor_dict_fn: forwarded to ``_create_input_queue``.
      train_config: training configuration. Fields read here:
        ``data_augmentation_options``, ``batch_size``,
        ``batch_queue_capacity``, ``num_batch_queue_threads``,
        ``prefetch_queue_capacity``, ``optimizer``, ``fine_tune_checkpoint``,
        ``from_detection_checkpoint``, ``keep_checkpoint_every_n_hours`` and
        ``num_steps``.
      train_dir: passed to ``slim.learning.train`` as ``logdir``.
      img_root: forwarded to ``_create_input_queue``.
      video_root: forwarded to ``_create_input_queue``.
    """
    detection_model = create_model_fn()
    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options]
    gpu_num = 2
    with tf.device('cpu:0'):
        global_step = slim.create_global_step()

        # The queue batch is gpu_num times the configured batch size so that
        # every tower can take a full batch_size slice of it.
        input_queue = _create_input_queue(train_config.batch_size * gpu_num,
                                          create_tensor_dict_fn,
                                          detection_model,
                                          train_config.batch_queue_capacity,
                                          train_config.num_batch_queue_threads,
                                          train_config.prefetch_queue_capacity,
                                          data_augmentation_options,
                                          img_root, video_root)

        # groundtruth_masks is part of the returned tuple but is not used.
        (InitBox1, image1, groundtruth_boxes1, groundtruth_classes1,
         groundtruth_masks) = _get_inputs(input_queue)

        batch_size = train_config.batch_size
        reuse_vars = False
        tower_grads = []
        for gpu_id in range(gpu_num):
            with tf.device(assign_to_device('/gpu:{}'.format(gpu_id), ps_device='/cpu:0')):
                # Contiguous slice of the big batch owned by this tower.
                shard = slice(gpu_id * batch_size, (gpu_id + 1) * batch_size)
                _groundtruth_boxes1 = groundtruth_boxes1[shard]
                _InitBox1 = InitBox1[shard]
                _groundtruth_classes1 = groundtruth_classes1[shard]
                _image1 = image1[shard]

                # A fresh model object and optimizer are built per tower; the
                # ones from the last iteration are used after the loop.
                detection_model = create_model_fn()
                task_lossa = _create_losses(_groundtruth_boxes1, _InitBox1,
                                            _groundtruth_classes1, _image1,
                                            detection_model, reuse=reuse_vars)
                optimizer = optimizer_builder.build(train_config.optimizer,
                                                    set())
                grads = optimizer.compute_gradients(task_lossa)

                # Zero the gradients of the matching variables so they are
                # not updated.
                pretrained_regex_list = ['^FeatureExtractor/MobilenetV1/Conv2d']
                grads_and_vars = variables_helper.multiply_gradients_matching_regex(
                    grads,
                    pretrained_regex_list,
                    multiplier=0.0)
                # NOTE(review): "[0-13]" is a character class matching the
                # single characters 0, 1 and 3, not the numeric range 0..13.
                # Left unchanged on purpose -- confirm which layers are meant
                # to be frozen before altering it.
                pretrained_regex_list = ['^InitFeatureExtractor/MobilenetV1/Conv2d_[0-13]']
                grads_and_vars = variables_helper.multiply_gradients_matching_regex(
                    grads_and_vars,
                    pretrained_regex_list,
                    multiplier=0.0)
                # Passed as reuse=True to _create_losses for later towers.
                reuse_vars = True
                tower_grads.append(grads_and_vars)

        tower_grads = average_gradients(tower_grads)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = optimizer.apply_gradients(tower_grads, global_step=global_step)
        update_ops.append(train_op)
        update_op = tf.group(*update_ops)
        total_loss = tf.losses.get_total_loss()
        # Evaluating train_tensor forces all update ops (including the
        # gradient application) to run first.
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Create the optional restore function for fine-tuning.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            var_map = detection_model.restore_map(
                from_detection_checkpoint=train_config.from_detection_checkpoint)
            init_var_map = detection_model.restore_init_map(
                from_detection_checkpoint=train_config.from_detection_checkpoint)
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    var_map, train_config.fine_tune_checkpoint))
            init_available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    init_var_map, train_config.fine_tune_checkpoint))
            saver = tf.train.Saver(available_var_map)
            init_saver = tf.train.Saver(init_available_var_map)

            def initializer_fn(sess):
                """Restore both variable maps from the fine-tune checkpoint."""
                saver.restore(sess, train_config.fine_tune_checkpoint)
                # Restored once; an identical second restore was redundant.
                init_saver.restore(sess, train_config.fine_tune_checkpoint)
            init_fn = initializer_fn

        keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
        training_saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            session_config=session_config,
            init_fn=init_fn,
            # A num_steps of 0 means "no step limit".
            number_of_steps=(train_config.num_steps
                             if train_config.num_steps else None),
            save_summaries_secs=120,
            saver=training_saver,
            global_step=global_step)
コード例 #27
0
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
    """Run an eager training loop under a ``MirroredStrategy``.

    Loads and merges the pipeline configs, builds the model, the distributed
    training input and the optimizer inside the strategy scope, optionally
    loads a fine-tuning checkpoint, then runs ``train_steps`` distributed
    steps, writing a ``steps_per_sec`` summary every step and saving a
    checkpoint every ``checkpoint_every_n`` steps.

    Args:
      hparams: hyper-parameter object; ``load_pretrained`` is read as an
        attribute and ``export_to_tpu`` via ``hparams.get``. Also passed to
        the config-merging helper.
      pipeline_config_path: path handed to the pipeline config loader.
      model_dir: directory for checkpoints, the optional final pipeline
        config and the ``/train`` summary sub-directory.
      config_override: forwarded to the pipeline config loader.
      train_steps: number of steps to run. When None and the config's
        ``num_steps`` is non-zero, the config value is used.
      use_tpu: forwarded to the checkpoint-loading and train-step helpers;
        also gates the ``use_bfloat16`` override merged into the configs.
      save_final_config: when true, write the merged pipeline config to
        ``model_dir``.
      export_to_tpu: when None, read from ``hparams``; only logged here.
      checkpoint_every_n: save a checkpoint whenever the global step is a
        multiple of this value.
      **kwargs: extra overrides merged into the configs.
    """
    tf2 = tf.compat.v2

    # Look up all config helpers before using any of them.
    load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
    merge_overrides = MODEL_BUILD_UTIL_MAP['merge_external_params_with_configs']
    build_pipeline_proto = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    # --- Parse and merge the configs -------------------------------------
    configs = load_configs(pipeline_config_path,
                           config_override=config_override)
    kwargs.update(
        train_steps=train_steps,
        use_bfloat16=configs['train_config'].use_bfloat16 and use_tpu)
    configs = merge_overrides(configs, hparams, kwargs_dict=kwargs)

    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    use_bfloat16 = train_config.use_bfloat16
    add_regularization_loss = train_config.add_regularization_loss
    # Gradient clipping is only enabled for a strictly positive norm.
    clip_gradients_value = (train_config.gradient_clipping_by_norm
                            if train_config.gradient_clipping_by_norm > 0
                            else None)

    # Fall back to the configured step count, but only when it is non-zero.
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # export_to_tpu defaults to the hparams value when not passed explicitly.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    # --- Fine-tuning checkpoint settings ---------------------------------
    fine_tune_checkpoint_path = (train_config.fine_tune_checkpoint
                                 if hparams.load_pretrained else None)
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    # utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # from_detection_checkpoint is deprecated; for backward compatibility
        # derive fine_tune_checkpoint_type from it when the latter is unset.
        train_config.fine_tune_checkpoint_type = (
            'detection' if train_config.from_detection_checkpoint
            else 'classification')
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

    # Write the as-run pipeline config to disk.
    if save_final_config:
        config_util.save_pipeline_config(build_pipeline_proto(configs),
                                         model_dir)

    # TODO(kaftan): Either make strategy a parameter of this method, or
    # grab it w/ Distribution strategy's get_scope
    # --- Build the model, training input and optimizer --------------------
    strategy = tf2.distribute.MirroredStrategy()
    with strategy.scope():
        detection_model = model_builder.build(
            model_config=model_config, is_training=True)

        dataset = inputs.train_input(
            train_config=train_config,
            train_input_config=train_input_config,
            model_config=model_config,
            model=detection_model)
        dist_dataset = strategy.experimental_distribute_dataset(
            dataset.repeat())

        global_step = tf2.Variable(
            0, trainable=False, dtype=tf2.dtypes.int64)
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Always expose the learning rate through a zero-argument callable.
        learning_rate_fn = (learning_rate if callable(learning_rate)
                            else (lambda: learning_rate))

    # --- Train the model --------------------------------------------------
    summary_writer = tf2.summary.create_file_writer(model_dir + '/train')
    with summary_writer.as_default(), strategy.scope():
        # Load a fine-tuning checkpoint.
        if fine_tune_checkpoint_path:
            load_fine_tune_checkpoint(
                detection_model, fine_tune_checkpoint_path,
                fine_tune_checkpoint_type,
                load_all_detection_checkpoint_vars, dist_dataset,
                unpad_groundtruth_tensors, use_tpu, use_bfloat16)

        checkpoint = tf2.train.Checkpoint(
            step=global_step, model=detection_model)
        manager = tf2.train.CheckpointManager(
            checkpoint, model_dir, max_to_keep=7)

        # Maybe re-enable checkpoint restoration depending on how it works:
        # checkpoint.restore(manager.latest_checkpoint)

        def train_step_fn(features, labels):
            """Run one training step on a single replica."""
            return eager_train_step(
                detection_model,
                features,
                labels,
                unpad_groundtruth_tensors,
                optimizer,
                learning_rate=learning_rate_fn(),
                use_bfloat16=use_bfloat16,
                add_regularization_loss=add_regularization_loss,
                clip_gradients_value=clip_gradients_value,
                use_tpu=use_tpu,
                global_step=global_step,
                num_replicas=strategy.num_replicas_in_sync)

        @tf.function
        def _dist_train_step(data_iterator):
            """A distributed train step."""
            features, labels = data_iterator.next()
            per_replica_losses = strategy.experimental_run_v2(
                train_step_fn, args=(features, labels))
            # TODO(anjalisridhar): explore if it is safe to remove the
            # num_replicas scaling of the loss and switch this to a
            # ReduceOp.Mean
            return strategy.reduce(
                tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

        batches = iter(dist_dataset)
        for _ in range(train_steps):
            tic = time.time()

            loss = _dist_train_step(batches)
            global_step.assign_add(1)
            toc = time.time()
            tf2.summary.scalar(
                'steps_per_sec', 1.0 / (toc - tic), step=global_step)
            # TODO(kaftan): Remove this print after it is no longer helpful
            # for debugging.
            tf.print('Finished step', global_step, toc, loss)
            if int(global_step.value().numpy()) % checkpoint_every_n == 0:
                manager.save()
コード例 #28
0
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
          num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name,
          is_chief, train_dir):
    """Training function for detection models.

    Builds the input queue, the model clones, the optimizer and the gradient
    update op inside a fresh graph, gathers summaries and then hands control
    to ``slim.learning.train``.

    Args:
      create_tensor_dict_fn: a function to create a tensor input dictionary.
      create_model_fn: a function that creates a DetectionModel and generates
        losses.
      train_config: a train_pb2.TrainConfig protobuf.
      master: BNS name of the TensorFlow master to use.
      task: The task id of this training instance.
      num_clones: The number of clones to run per machine.
      worker_replicas: The number of work replicas to train with.
      clone_on_cpu: True if clones should be forced to run on CPU.
      ps_tasks: Number of parameter server tasks.
      worker_job_name: Name of the worker job.
      is_chief: Whether this replica is the chief replica.
      train_dir: Directory to write checkpoints and training summaries to.
    """
    # This instance is only used to build the checkpoint restore function;
    # the clones build their own models through create_model_fn.
    detection_model = create_model_fn()
    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options
    ]

    with tf.Graph().as_default():
        # Build a configuration specifying multi-GPU and multi-replicas.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=clone_on_cpu,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=ps_tasks,
            worker_job_name=worker_job_name)

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # Build the input queue; each clone gets batch_size // num_clones
        # examples per step.
        with tf.device(deploy_config.inputs_device()):
            input_queue = _create_input_queue(
                train_config.batch_size // num_clones,
                create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set()

        # Each clone builds its losses by calling _create_losses on the
        # shared input queue.
        model_fn = functools.partial(_create_losses,
                                     create_model_fn=create_model_fn)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        first_clone_scope = clones[0].scope

        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by model_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        with tf.device(deploy_config.optimizer_device()):
            training_optimizer = optimizer_builder.build(
                train_config.optimizer, global_summaries)

        sync_optimizer = None
        if train_config.sync_replicas:
            # SyncReplicasOptimizer is exposed under tf.train, and the replica
            # count comes from this function's worker_replicas argument.
            training_optimizer = tf.train.SyncReplicasOptimizer(
                training_optimizer,
                replicas_to_aggregate=train_config.replicas_to_aggregate,
                total_num_replicas=worker_replicas)
            sync_optimizer = training_optimizer

        # Create ops required to initialize the model from a given checkpoint.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            init_fn = detection_model.restore_fn(
                train_config.fine_tune_checkpoint,
                from_detection_checkpoint=(
                    train_config.from_detection_checkpoint))

        with tf.device(deploy_config.optimizer_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones,
                training_optimizer,
                regularization_losses=None)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # Optionally multiply bias gradients by
            # train_config.bias_grad_multiplier.
            if train_config.bias_grad_multiplier:
                biases_regex_list = ['.*/biases']
                grads_and_vars = variables_helper.multiply_gradients_matching_regex(
                    grads_and_vars,
                    biases_regex_list,
                    multiplier=train_config.bias_grad_multiplier)

            # Optionally freeze some layers by setting their gradients to be zero.
            if train_config.freeze_variables:
                # Debug output of the (gradient, variable) pairs before the
                # frozen ones are filtered.
                print("Priting the grad_and_vars to check the tuples ")
                print(grads_and_vars)
                grads_and_vars = variables_helper.freeze_gradients_matching_regex(
                    grads_and_vars,
                    train_config.freeze_variables)

            # Optionally clip gradients
            if train_config.gradient_clipping_by_norm > 0:
                with tf.name_scope('clip_grads'):
                    grads_and_vars = slim.learning.clip_gradient_norms(
                        grads_and_vars, train_config.gradient_clipping_by_norm)

            # Create gradient updates.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)

            # Evaluating train_tensor forces every update op to run first.
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for model_var in slim.get_model_variables():
            global_summaries.add(
                tf.summary.histogram(model_var.op.name, model_var))
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar(loss_tensor.op.name, loss_tensor))
        global_summaries.add(
            tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summaries |= global_summaries

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        # Save checkpoints regularly.
        keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
        saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            startup_delay_steps=train_config.startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            # A num_steps of 0 means "no step limit".
            number_of_steps=(train_config.num_steps
                             if train_config.num_steps else None),
            save_summaries_secs=120,
            sync_optimizer=sync_optimizer,
            saver=saver)
コード例 #29
0
def fine_tune_pruned_model(pipeline_config, train_dir, pruned_shape,
                           pruned_weights, checkpoint_path, fine_tune=True):
    """Fine-tune a pruned detection model as a single local worker.

    Builds a fresh graph holding the model described by `pipeline_config`
    (constructed with `convDict=pruned_shape`), optionally seeds variables
    from `pruned_weights`, and runs `slim.learning.train`.

    Args:
        pipeline_config: Path to the pipeline config file. It is parsed and
            also copied to `<train_dir>/pipeline.config`. A falsy value prints
            "FAIL" and exits the process with status 1.
        train_dir: Directory used as the training logdir; created up front.
        pruned_shape: Forwarded to `model_builder.build` as `convDict`.
        pruned_weights: Passed to `slim.assign_from_values` to build the
            initialisation op run by `init_fn`. When falsy, no `init_fn` is
            installed.
        checkpoint_path: Not referenced by this function; kept so existing
            callers keep working.
        fine_tune: Not referenced by this function; kept so existing callers
            keep working.
    """
    # Fixed deployment parameters: one clone, one replica, no parameter
    # servers, and this process is always the chief.
    num_clones = 1
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    tf.gfile.MakeDirs(train_dir)
    if not pipeline_config:
        print("FAIL")
        sys.exit(1)
    configs = config_util.get_configs_from_pipeline_file(pipeline_config)
    tf.gfile.Copy(pipeline_config,
                  os.path.join(train_dir, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']
    train_config.fine_tune_checkpoint_type = 'detection'

    model_c_fn = functools.partial(
        model_builder.build,
        model_config=model_config,
        is_training=True,
        add_summaries=True,
        convDict=pruned_shape)

    def get_next(config):
        return dataset_util.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    pruned_graph = tf.Graph()
    with pruned_graph.as_default():
        # NOTE(review): the model returned here is never used (the clones
        # build their own through `model_fn`). The call is retained in case
        # model construction has graph side effects -- confirm and remove.
        model_c_fn()
        data_augmentation_options = [
            preprocessor_builder.build(step)
            for step in train_config.data_augmentation_options
        ]

        # Build a configuration specifying multi-GPU and multi-replicas.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=False,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=ps_tasks,
            worker_job_name=worker_job_name)

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        with tf.device(deploy_config.inputs_device()):
            input_queue = create_input_queue(
                train_config.batch_size // num_clones, create_input_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # Snapshot the summaries that exist before the clones are created.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set()

        model_fn = functools.partial(_create_losses,
                                     create_model_fn=model_c_fn,
                                     train_config=train_config)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        first_clone_scope = clones[0].scope

        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by model_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        with tf.device(deploy_config.optimizer_device()):
            training_optimizer, optimizer_summary_vars = (
                optimizer_builder.build(train_config.optimizer))
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var)

        sync_optimizer = None

        # Create the op that initialises the model from the pruned weights.
        init_fn = None
        if pruned_weights:
            init_assign_op, init_feed_dict = slim.assign_from_values(
                pruned_weights)

            def initializer_fn(sess):
                sess.run(init_assign_op, init_feed_dict)

            init_fn = initializer_fn

        with tf.device(deploy_config.optimizer_device()):
            # `None` lets optimize_clones collect regularization losses
            # itself; an empty list disables them.
            regularization_losses = (
                None if train_config.add_regularization_loss else [])
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, training_optimizer,
                regularization_losses=regularization_losses)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

        # Optionally clip gradients.
        if train_config.gradient_clipping_by_norm > 0:
            with tf.name_scope('clip_grads'):
                grads_and_vars = slim.learning.clip_gradient_norms(
                    grads_and_vars, train_config.gradient_clipping_by_norm)

        # Create gradient updates. This must happen whether or not clipping
        # is enabled: previously these statements were nested under the
        # clipping branch, leaving `train_tensor` undefined when
        # gradient_clipping_by_norm was 0.
        grad_updates = training_optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops, name='update_barrier')
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for model_var in slim.get_model_variables():
            global_summaries.add(
                tf.summary.histogram(model_var.op.name, model_var))
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar(loss_tensor.op.name, loss_tensor))
        global_summaries.add(
            tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or
        # _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))
        summaries |= global_summaries

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        # Save checkpoints regularly.
        saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=(
                train_config.keep_checkpoint_every_n_hours))

        # A num_steps of 0 means "no limit", expressed to slim as None.
        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            startup_delay_steps=train_config.startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=train_config.num_steps or None,
            save_summaries_secs=120,
            sync_optimizer=sync_optimizer,
            saver=saver)
コード例 #30
0
ファイル: model_lib_randy.py プロジェクト: randyphoa/models
def train_loop(pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               checkpoint_every_n=1000,
               checkpoint_max_to_keep=7,
               record_summaries=True,
               performance_summary_exporter=None,
               num_steps_per_iteration=NUM_STEPS_PER_ITERATION,
               **kwargs):
    """Train a detection model with an eager, strategy-aware loop.

    This variant writes no summaries (a no-op summary writer is used) and
    saves no checkpoints; it only builds the model and steps the optimizer.

    Args:
        pipeline_config_path: Path handed to the registered
            "get_configs_from_pipeline_file" utility.
        model_dir: Base directory; only used to derive the (unused by the
            no-op writer) summary path that is cleaned up at the end.
        config_override: Forwarded to the config loader.
        train_steps: Number of steps to train for. When None and the train
            config's `num_steps` is non-zero, that value is used instead.
        use_tpu: Only consulted, together with the train config, to decide
            whether the bfloat16 mixed-precision policy is enabled.
        save_final_config: Not read by this function.
        checkpoint_every_n: Step interval for the checkpoint bookkeeping
            counter; no checkpoint is actually written.
        checkpoint_max_to_keep: Not read by this function.
        record_summaries: Not read by this function.
        performance_summary_exporter: Not read by this function.
        num_steps_per_iteration: Training steps executed per call of the
            compiled distributed step.
        **kwargs: Extra overrides merged into the configs.

    Raises:
        ValueError: If `train_config.load_all_detection_checkpoint_vars` is
            set.
    """
    load_configs = MODEL_BUILD_UTIL_MAP["get_configs_from_pipeline_file"]
    merge_overrides = MODEL_BUILD_UTIL_MAP[
        "merge_external_params_with_configs"]
    # Looked up (so a missing registration still fails here) but never called.
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        "create_pipeline_proto_from_configs"]
    steps_per_sec_list = []

    configs = load_configs(pipeline_config_path,
                           config_override=config_override)
    kwargs["train_steps"] = train_steps
    kwargs["use_bfloat16"] = configs["train_config"].use_bfloat16 and use_tpu
    configs = merge_overrides(configs, None, kwargs_dict=kwargs)

    model_config = configs["model"]
    train_config = configs["train_config"]
    train_input_config = configs["train_input_config"]

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    clip_norm = train_config.gradient_clipping_by_norm
    clip_gradients_value = clip_norm if clip_norm > 0 else None

    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    if kwargs["use_bfloat16"]:
        tf.compat.v2.keras.mixed_precision.set_global_policy("mixed_bfloat16")

    if train_config.load_all_detection_checkpoint_vars:
        raise ValueError(
            "train_pb2.load_all_detection_checkpoint_vars unsupported in TF2")

    config_util.update_fine_tune_checkpoint_type(train_config)
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
    fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        build_model = MODEL_BUILD_UTIL_MAP["detection_model_fn_base"]
        detection_model = build_model(model_config=model_config,
                                      is_training=True)

        def train_dataset_fn(input_context):
            # Endlessly repeating training dataset for one input pipeline.
            return inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=detection_model,
                input_context=input_context,
            ).repeat()

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.compat.v2.dtypes.int64,
            name="global_step",
            aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA,
        )
        optimizer, optimizer_vars = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)
        # The builder is expected to hand back exactly one learning rate.
        (learning_rate,) = optimizer_vars

        if train_config.optimizer.use_moving_average:
            _ensure_model_is_built(detection_model, train_input,
                                   unpad_groundtruth_tensors)
            optimizer.shadow_copy(detection_model)

        # Normalise the learning rate to a zero-argument callable.
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            def learning_rate_fn():
                return learning_rate

    summary_writer_filepath = get_filepath(strategy,
                                           os.path.join(model_dir, "train"))
    # File-based summaries are disabled in this variant.
    summary_writer = tf2.summary.create_noop_writer()

    def _is_recording_step():
        return global_step % num_steps_per_iteration == 0

    with summary_writer.as_default(), strategy.scope(), \
            tf2.summary.record_if(_is_recording_step):
        if train_config.fine_tune_checkpoint:
            load_fine_tune_checkpoint(
                detection_model,
                train_config.fine_tune_checkpoint,
                fine_tune_checkpoint_type,
                fine_tune_checkpoint_version,
                train_config.run_fine_tune_checkpoint_dummy_computation,
                train_input,
                unpad_groundtruth_tensors,
            )

        # Checkpoint creation / restoration is disabled in this variant.

        def train_step_fn(features, labels):
            """Run one optimizer step on a replica and bump the global step."""
            step_loss = eager_train_step(
                detection_model,
                features,
                labels,
                unpad_groundtruth_tensors,
                optimizer,
                learning_rate=learning_rate_fn(),
                add_regularization_loss=add_regularization_loss,
                clip_gradients_value=clip_gradients_value,
                global_step=global_step,
                num_replicas=strategy.num_replicas_in_sync,
            )
            global_step.assign_add(1)
            return step_loss

        def _sample_and_train(dist_strategy, step_fn, data_iterator):
            """Pull one batch, run `step_fn` on every replica, sum losses."""
            features, labels = data_iterator.next()
            # Older TF releases only expose `experimental_run_v2`.
            if hasattr(tf.distribute.Strategy, "run"):
                per_replica_losses = dist_strategy.run(
                    step_fn, args=(features, labels))
            else:
                per_replica_losses = dist_strategy.experimental_run_v2(
                    step_fn, args=(features, labels))
            return dist_strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses,
                                        axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
            """Run `num_steps_per_iteration` steps; return the last loss."""
            if num_steps_per_iteration > 1:
                for _ in tf.range(num_steps_per_iteration - 1):
                    with tf.name_scope(""):
                        _sample_and_train(strategy, train_step_fn,
                                          data_iterator)

            return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        checkpointed_step = int(global_step.value())
        logged_step = global_step.value()

        last_step_time = time.time()
        for _ in range(global_step.value(), train_steps,
                       num_steps_per_iteration):
            loss = _dist_train_step(train_input_iter)

            elapsed = time.time() - last_step_time
            last_step_time = time.time()
            throughput = num_steps_per_iteration * 1.0 / elapsed
            tf.compat.v2.summary.scalar("steps_per_sec",
                                        throughput,
                                        step=global_step)
            steps_per_sec_list.append(throughput)

            # Log progress roughly every 100 steps.
            if global_step.value() - logged_step >= 100:
                message = "Step {} per-step time {:.3f}s loss={:.3f}".format(
                    global_step.value(),
                    elapsed / num_steps_per_iteration,
                    loss)
                tf.logging.info(message)
                logged_step = global_step.value()

            # Only the bookkeeping counter advances; nothing is saved.
            current_step = int(global_step.value())
            if current_step - checkpointed_step >= checkpoint_every_n:
                checkpointed_step = current_step

    clean_temporary_directories(strategy, summary_writer_filepath)
コード例 #31
0
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Relies on names from the enclosing scope: `detection_model_fn`,
    `train_config`, `eval_input_config`, `use_tpu` and `hparams`.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` (or a `TPUEstimatorSpec` when `use_tpu` is set) that
        encapsulates the model and its serving configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      labels = unstack_batch(labels, unpad_groundtruth_tensors=False)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      # Masks and keypoints are optional groundtruth fields.
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list)

    preprocessed_images = features[fields.InputDataFields.image]
    prediction_dict = detection_model.predict(
        preprocessed_images, features[fields.InputDataFields.true_image_shape])
    detections = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])

    if mode == tf.estimator.ModeKeys.TRAIN:
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        asg_map = detection_model.restore_map(
            from_detection_checkpoint=train_config.from_detection_checkpoint,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map, train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:
          # On TPU the checkpoint initialisation is deferred to the scaffold
          # function handed to the TPUEstimatorSpec.
          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()
          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      # `dict.itervalues()` only exists on Python 2; `values()` works on both
      # Python 2 and Python 3.
      losses = list(losses_dict.values())
      total_loss = tf.add_n(losses, name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

      if use_tpu:
        training_optimizer = tpu_optimizer.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      if train_config.freeze_variables:
        trainable_variables = tf.contrib.framework.filter_variables(
            tf.trainable_variables(),
            exclude_patterns=train_config.freeze_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      # An empty list disables optimize_loss summaries on TPU; None keeps the
      # library defaults.
      summaries = [] if use_tpu else None
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(detections)
      }

    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
      # Detection summaries during eval.
      class_agnostic = (fields.DetectionResultFields.detection_classes
                        not in detections)
      groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
      eval_dict = eval_util.result_dict_for_single_example(
          tf.expand_dims(features[fields.InputDataFields.original_image][0], 0),
          features[inputs.HASH_KEY][0],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=False)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      detection_and_groundtruth = vis_utils.draw_side_by_side_evaluation_image(
          eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2)
      if not use_tpu:
        tf.summary.image('Detections_Left_Groundtruth_Right',
                         detection_and_groundtruth)

      # Eval metrics on a single image.
      detection_fields = fields.DetectionResultFields()
      input_data_fields = fields.InputDataFields()
      # Materialise the categories: on Python 3 `values()` is a view, not a
      # list.
      coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
          list(category_index.values()))
      eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
          image_id=eval_dict[input_data_fields.key],
          groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes],
          groundtruth_classes=eval_dict[input_data_fields.groundtruth_classes],
          detection_boxes=eval_dict[detection_fields.detection_boxes],
          detection_scores=eval_dict[detection_fields.detection_scores],
          detection_classes=eval_dict[detection_fields.detection_classes])

    if use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs)
コード例 #32
0
def eval_continuously(
    pipeline_config_path,
    config_override=None,
    train_steps=None,
    sample_1_of_n_eval_examples=1,
    sample_1_of_n_eval_on_train_examples=1,
    use_tpu=False,
    override_eval_num_epochs=True,
    postprocess_on_cpu=False,
    model_dir=None,
    checkpoint_dir=None,
    wait_interval=180,
    timeout=3600,
    eval_index=0,
    save_final_config=False,
    **kwargs):
  """Evaluate a detection model eagerly on the next available checkpoint.

  Builds the model, waits for a checkpoint to appear in `checkpoint_dir`,
  restores it, and evaluates it on the selected evaluation dataset. Because
  the function returns from inside the checkpoint loop, only the first
  checkpoint yielded by the iterator is evaluated.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. Only added to the config overrides
      when it is not None.
    sample_1_of_n_eval_examples: Integer representing how often an eval example
      should be sampled. If 1, will sample all examples.
    sample_1_of_n_eval_on_train_examples: Similar to
      `sample_1_of_n_eval_examples`, except controls the sampling of training
      data for evaluation.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for
      eval_input.
    postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true,
      postprocess is scheduled on the host cpu.
    model_dir: Directory to output resulting evaluation summaries to.
    checkpoint_dir: Directory that contains the training checkpoints.
    wait_interval: The minimum number of seconds to wait before checking for a
      new checkpoint.
    timeout: The maximum number of seconds to wait for a checkpoint. Execution
      will terminate if no new checkpoints are found after these many seconds.
    eval_index: int, index of the eval input config to evaluate. By default,
      evaluates the dataset at the 0'th index.
    save_final_config: Whether to save the pipeline config file to the model
      directory.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    The value returned by `eager_eval_loop` for the first checkpoint, or None
    if the checkpoint iterator finishes without yielding any checkpoint.
  """
  load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
  create_pipeline_proto = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']
  merge_overrides = MODEL_BUILD_UTIL_MAP['merge_external_params_with_configs']

  configs = load_configs(pipeline_config_path, config_override=config_override)

  # Collect the external overrides before merging them into the configs.
  kwargs['sample_1_of_n_eval_examples'] = sample_1_of_n_eval_examples
  kwargs['use_bfloat16'] = configs['train_config'].use_bfloat16 and use_tpu
  if train_steps is not None:
    kwargs['train_steps'] = train_steps
  if override_eval_num_epochs:
    kwargs['eval_num_epochs'] = 1
    tf.logging.warning(
        'Forced number of epochs for all eval validations to be 1.')
  configs = merge_overrides(configs, None, kwargs_dict=kwargs)

  if model_dir and save_final_config:
    tf.logging.info(
        'Saving pipeline config file to directory {}'.format(model_dir))
    config_util.save_pipeline_config(create_pipeline_proto(configs), model_dir)

  model_config = configs['model']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_configs = configs['eval_input_configs']

  # Derived "eval on train" input config; it is prepared here but not
  # consumed later in this function.
  eval_on_train_input_config = copy.deepcopy(train_input_config)
  eval_on_train_input_config.sample_1_of_n_examples = (
      sample_1_of_n_eval_on_train_examples)
  if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
    found_epochs = eval_on_train_input_config.num_epochs
    tf.logging.warning('Expected number of evaluation epochs is 1, but '
                       'instead encountered `eval_on_train_input_config'
                       '.num_epochs` = '
                       '{}. Overwriting `num_epochs` to 1.'.format(
                           found_epochs))
    eval_on_train_input_config.num_epochs = 1

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

  eval_input_config = eval_input_configs[eval_index]
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    build_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']
    detection_model = build_model(model_config=model_config, is_training=True)

  eval_dataset = inputs.eval_input(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config,
      model=detection_model)
  eval_input = strategy.experimental_distribute_dataset(eval_dataset)

  global_step = tf.compat.v2.Variable(
      0, trainable=False, dtype=tf.compat.v2.dtypes.int64)

  # The optimizer is built so it can be tracked by the checkpoint object
  # below and, with moving averages, used to shadow-copy and swap weights.
  optimizer, _ = optimizer_builder.build(
      configs['train_config'].optimizer, global_step=global_step)

  checkpoint_stream = tf.train.checkpoints_iterator(
      checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval)
  for latest_checkpoint in checkpoint_stream:
    ckpt = tf.compat.v2.train.Checkpoint(
        step=global_step, model=detection_model, optimizer=optimizer)

    # We run the detection_model on dummy inputs in order to ensure that the
    # model and all its variables have been properly constructed. Specifically,
    # this is currently necessary prior to (potentially) creating shadow copies
    # of the model variables for the EMA optimizer.
    if eval_config.use_moving_averages:
      unpad_groundtruth_tensors = (eval_config.batch_size == 1 and not use_tpu)
      _ensure_model_is_built(detection_model, eval_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    ckpt.restore(latest_checkpoint).expect_partial()

    # Evaluate with the averaged weights rather than the raw ones.
    if eval_config.use_moving_averages:
      optimizer.swap_weights()

    summary_dir = os.path.join(model_dir, 'eval', eval_input_config.name)
    summary_writer = tf.compat.v2.summary.create_file_writer(summary_dir)
    with summary_writer.as_default():
      eval_metrics = eager_eval_loop(
          detection_model,
          configs,
          eval_input,
          use_tpu=use_tpu,
          postprocess_on_cpu=postprocess_on_cpu,
          global_step=global_step)
    # Returning here ends the loop after the first evaluated checkpoint.
    return eval_metrics
コード例 #33
0
def train(datasets_dicts,
          epochs,
          val_every,
          iters_cnt,
          validate_with_eval_model,
          pipeline_config,
          num_clones=1,
          save_cback=None,
          is_transfer_learning=False):
    """Trains a detection model on GPU clones with periodic validation.

    Builds a TF1 graph with one loss tower per clone (clone ``i`` is placed
    on ``/gpu:i``), sums the per-clone gradients, applies them with the
    optimizer described by the pipeline's train config, and then runs
    ``epochs * iters_cnt['train']`` training steps in a plain ``tf.Session``.
    Whenever the ``EvalPlanner`` asks for it, the validation loss is averaged
    over ``iters_cnt['val']`` batches and reported.

    Args:
      datasets_dicts: Mapping with ``'train'`` and ``'val'`` entries; each
        entry is handed to ``build_dataset``.
      epochs: Number of epochs to run.
      val_every: Passed to ``EvalPlanner`` together with ``epochs``
        (presumably the validation period in epochs -- confirm against
        ``EvalPlanner``).
      iters_cnt: Mapping with ``'train'`` (steps per epoch) and ``'val'``
        (batches per validation pass) counts.
      validate_with_eval_model: If true, the validation model is built with
        ``is_training=False``; otherwise with ``is_training=True``.
      pipeline_config: Passed to ``configs_from_pipeline``; the result must
        provide the ``'model'`` and ``'train_config'`` entries.
      num_clones: Number of GPU towers to build.
      save_cback: Optional callable invoked after every validation pass as
        ``save_cback(saver, sess, model_is_best, opt_data=...)``. When
        ``None`` no callback is invoked.
      is_transfer_learning: When true, ``load_all_detection_checkpoint_vars``
        is forced to ``False`` while restoring the fine-tune checkpoint.
    """
    logger.info('Start train')
    configs = configs_from_pipeline(pipeline_config)

    model_config = configs['model']
    train_config = configs['train_config']

    create_model_fn = functools.partial(model_builder.build,
                                        model_config=model_config,
                                        is_training=True)
    # This instance is only used for restore_map() further down; the clone
    # losses are built from create_model_fn via _create_losses_val.
    detection_model = create_model_fn()

    def get_next(dataset):
        return dataset_util.make_initializable_iterator(
            build_dataset(dataset)).get_next()

    create_tensor_dict_fn = functools.partial(get_next,
                                              datasets_dicts['train'])
    create_tensor_dict_fn_val = functools.partial(get_next,
                                                  datasets_dicts['val'])

    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options
    ]

    with tf.Graph().as_default():
        # Build a configuration specifying multi-GPU and multi-replicas.
        # NOTE(review): num_clones is hard-coded to 4 here while the tower
        # loop below uses the `num_clones` argument. In this function the
        # deploy_config is only queried for device names -- confirm the
        # mismatch is intentional.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=4,
            clone_on_cpu=False,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0,
            worker_job_name='lonely_worker')

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        with tf.device(deploy_config.inputs_device()):
            coord = coordinator.Coordinator()
            input_queue = create_input_queue(
                train_config.batch_size, create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

            # NOTE(review): the validation queue receives the same
            # augmentation options as the training queue -- confirm intended.
            input_queue_val = create_input_queue(
                train_config.batch_size, create_tensor_dict_fn_val,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # create validation graph
        create_model_fn_val = functools.partial(
            model_builder.build,
            model_config=model_config,
            is_training=not validate_with_eval_model)

        with tf.device(deploy_config.optimizer_device()):
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var, family='LearningRate')

        train_losses = []
        grads_and_vars = []
        with slim.arg_scope([slim.model_variable, slim.variable],
                            device='/device:CPU:0'):
            for curr_dev_id in range(num_clones):
                with tf.device('/gpu:{}'.format(curr_dev_id)):
                    with tf.name_scope(
                            'clone_{}'.format(curr_dev_id)) as scope:
                        # Every tower after the first reuses the variables
                        # created by the first one.
                        with tf.variable_scope(
                                tf.get_variable_scope(),
                                reuse=True if curr_dev_id > 0 else None):
                            losses = _create_losses_val(
                                input_queue, create_model_fn, train_config)
                            clones_loss = tf.add_n(losses)
                            clones_loss = tf.divide(clones_loss,
                                                    1.0 * num_clones)
                            grads = training_optimizer.compute_gradients(
                                clones_loss)
                            train_losses.append(clones_loss)
                            grads_and_vars.append(grads)
                            # Update ops are collected once, right after the
                            # first tower has been built.
                            if curr_dev_id == 0:
                                update_ops = tf.get_collection(
                                    tf.GraphKeys.UPDATE_OPS)

        val_total_loss = get_val_loss(num_clones, input_queue_val,
                                      create_model_fn_val, train_config)

        with tf.device(deploy_config.optimizer_device()):
            total_loss = tf.add_n(train_losses)
            grads_and_vars = model_deploy._sum_clones_gradients(grads_and_vars)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # Optionally multiply bias gradients by train_config.bias_grad_multiplier.
            if train_config.bias_grad_multiplier:
                biases_regex_list = ['.*/biases']
                grads_and_vars = variables_helper.multiply_gradients_matching_regex(
                    grads_and_vars,
                    biases_regex_list,
                    multiplier=train_config.bias_grad_multiplier)

            # Optionally freeze some layers by setting their gradients to be zero.
            if train_config.freeze_variables:
                grads_and_vars = variables_helper.freeze_gradients_matching_regex(
                    grads_and_vars, train_config.freeze_variables)

            # Optionally clip gradients
            if train_config.gradient_clipping_by_norm > 0:
                with tf.name_scope('clip_grads'):
                    grads_and_vars = slim.learning.clip_gradient_norms(
                        grads_and_vars, train_config.gradient_clipping_by_norm)

            # Create gradient updates.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops, name='update_barrier')
            # Fetching train_tensor forces all update ops to run first.
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        coord.clear_stop()
        sess = tf.Session(config=config)
        saver = tf.train.Saver()

        graph = ops.get_default_graph()
        with graph.as_default():
            with ops.name_scope('init_ops'):
                init_op = variables.global_variables_initializer()
                ready_op = variables.report_uninitialized_variables()
                local_init_op = control_flow_ops.group(
                    variables.local_variables_initializer(),
                    lookup_ops.tables_initializer())

        threads = []
        # Everything that uses the session runs inside try/finally so the
        # queue-runner threads are stopped and the session is released even
        # when restoring or training fails.
        try:
            # graph.finalize()
            sess.run([init_op, ready_op, local_init_op])

            queue_runners = graph.get_collection(ops.GraphKeys.QUEUE_RUNNERS)
            for qr in queue_runners:
                threads.extend(
                    qr.create_threads(sess, coord=coord, daemon=True,
                                      start=True))

            logger.info('Start restore')
            if train_config.fine_tune_checkpoint:
                var_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars
                        and (not is_transfer_learning)))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        var_map, train_config.fine_tune_checkpoint))
                # The step counter is never restored from the checkpoint.
                if 'global_step' in available_var_map:
                    del available_var_map['global_step']
                init_saver = tf.train.Saver(available_var_map)
                logger.info(
                    'Restoring model weights from previous checkpoint.')
                init_saver.restore(sess, train_config.fine_tune_checkpoint)
                logger.info('Model restored.')

            eval_planner = EvalPlanner(epochs, val_every)
            progress = sly.Progress('Model training: ',
                                    epochs * iters_cnt['train'])
            best_val_loss = float('inf')
            epoch_flt = 0

            for epoch in range(epochs):
                logger.info("Before new epoch", extra={'epoch': epoch_flt})
                for train_it in range(iters_cnt['train']):
                    # Use a separate name for the fetched value so the
                    # `total_loss` graph tensor is not shadowed.
                    loss_value, _ = sess.run([train_tensor, global_step])

                    metrics_values_train = {
                        'loss': loss_value,
                    }

                    progress.iter_done_report()
                    epoch_flt = epoch_float(epoch, train_it + 1,
                                            iters_cnt['train'])
                    sly.report_metrics_training(epoch_flt,
                                                metrics_values_train)

                    if eval_planner.need_validation(epoch_flt):
                        logger.info("Before validation",
                                    extra={'epoch': epoch_flt})

                        overall_val_loss = 0
                        for val_it in range(iters_cnt['val']):
                            overall_val_loss += sess.run(val_total_loss)

                            logger.info("Validation in progress",
                                        extra={
                                            'epoch': epoch_flt,
                                            'val_iter': val_it,
                                            'val_iters': iters_cnt['val']
                                        })

                        metrics_values_val = {
                            'loss': overall_val_loss / iters_cnt['val'],
                        }
                        sly.report_metrics_validation(epoch_flt,
                                                      metrics_values_val)
                        logger.info("Validation has been finished",
                                    extra={'epoch': epoch_flt})

                        eval_planner.validation_performed()

                        val_loss = metrics_values_val['loss']
                        model_is_best = val_loss < best_val_loss
                        if model_is_best:
                            best_val_loss = val_loss
                            logger.info(
                                'It\'s been determined that current model is the best one for a while.'
                            )

                        # save_cback is optional (defaults to None), so only
                        # call it when the caller supplied one.
                        if save_cback is not None:
                            save_cback(saver,
                                       sess,
                                       model_is_best,
                                       opt_data={
                                           'epoch': epoch_flt,
                                           'val_metrics': metrics_values_val,
                                       })

                logger.info("Epoch was finished", extra={'epoch': epoch_flt})
        finally:
            # Stop the input threads before closing the session they use.
            coord.request_stop()
            coord.join(threads)
            sess.close()
コード例 #34
0
def train(create_tensor_dict_fn,
          create_model_fn,
          train_config,
          master,
          task,
          num_clones,
          worker_replicas,
          clone_on_cpu,
          ps_tasks,
          worker_job_name,
          is_chief,
          train_dir,
          train_steps,
          to_keep,
          save_steps,
          graph_hook_fn=None):
    """Training function for detection models.

    Builds the cloned training graph, the optimizer, the summaries and the
    checkpoint saver, then hands control to ``slim.learning.train``.

    Note that ``train_config.num_steps`` is overwritten with ``train_steps``.

    Args:
      create_tensor_dict_fn: a function to create a tensor input dictionary.
      create_model_fn: a function that creates a DetectionModel and generates
        losses.
      train_config: a train_pb2.TrainConfig protobuf.
      master: BNS name of the TensorFlow master to use.
      task: The task id of this training instance.
      num_clones: The number of clones to run per machine.
      worker_replicas: The number of work replicas to train with.
      clone_on_cpu: True if clones should be forced to run on CPU.
      ps_tasks: Number of parameter server tasks.
      worker_job_name: Name of the worker job.
      is_chief: Whether this replica is the chief replica.
      train_dir: Directory to write checkpoints and training summaries to.
      train_steps: Number of training steps; a falsy value means no step
        limit is passed to the training loop.
      to_keep: Number of checkpoints to keep (``max_to_keep`` of the saver).
      save_steps: Checkpoint interval, passed as ``save_interval_secs``
        (i.e. seconds, despite the name).
      graph_hook_fn: Optional function that is called after the inference
        graph is built (before optimization). This is helpful to perform
        additional changes to the training graph such as adding FakeQuant
        ops. The function should modify the default graph.

    Raises:
      ValueError: If both num_clones > 1 and train_config.sync_replicas is
        true.
    """

    detection_model = create_model_fn()
    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options
    ]

    with tf.Graph().as_default():
        # Build a configuration specifying multi-GPU and multi-replicas.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=clone_on_cpu,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=ps_tasks,
            worker_job_name=worker_job_name)

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        if num_clones != 1 and train_config.sync_replicas:
            # The two literals are concatenated into ONE message; passing
            # them as separate arguments made the error render as a tuple.
            raise ValueError('In Synchronous SGD mode num_clones must '
                             'be 1. Found num_clones: {}'.format(num_clones))
        # The configured batch size is the total; split it across clones
        # (and across aggregated replicas in synchronous mode).
        batch_size = train_config.batch_size // num_clones
        if train_config.sync_replicas:
            batch_size //= train_config.replicas_to_aggregate

        with tf.device(deploy_config.inputs_device()):
            input_queue = create_input_queue(
                batch_size, create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # Gather initial summaries.
        # TODO(rathodv): See if summaries can be added/extracted from global tf
        # collections so that they don't have to be passed around.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set()

        model_fn = functools.partial(_create_losses,
                                     create_model_fn=create_model_fn,
                                     train_config=train_config)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        first_clone_scope = clones[0].scope

        if graph_hook_fn:
            with tf.device(deploy_config.variables_device()):
                graph_hook_fn()

        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by model_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        with tf.device(deploy_config.optimizer_device()):
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var, family='LearningRate')

        sync_optimizer = None
        if train_config.sync_replicas:
            training_optimizer = tf.train.SyncReplicasOptimizer(
                training_optimizer,
                replicas_to_aggregate=train_config.replicas_to_aggregate,
                total_num_replicas=worker_replicas)
            sync_optimizer = training_optimizer

        with tf.device(deploy_config.optimizer_device()):
            # None lets optimize_clones pick up the regularization losses
            # itself; an empty list disables them.
            regularization_losses = (
                None if train_config.add_regularization_loss else [])
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones,
                training_optimizer,
                regularization_losses=regularization_losses)
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # Optionally multiply bias gradients by train_config.bias_grad_multiplier.
            if train_config.bias_grad_multiplier:
                biases_regex_list = ['.*/biases']
                grads_and_vars = variables_helper.multiply_gradients_matching_regex(
                    grads_and_vars,
                    biases_regex_list,
                    multiplier=train_config.bias_grad_multiplier)

            # Optionally freeze some layers by setting their gradients to be zero.
            if train_config.freeze_variables:
                grads_and_vars = variables_helper.freeze_gradients_matching_regex(
                    grads_and_vars, train_config.freeze_variables)

            # Optionally clip gradients
            if train_config.gradient_clipping_by_norm > 0:
                with tf.name_scope('clip_grads'):
                    grads_and_vars = slim.learning.clip_gradient_norms(
                        grads_and_vars, train_config.gradient_clipping_by_norm)

            # Create gradient updates.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops, name='update_barrier')
            # Fetching train_tensor forces all update ops to run first.
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for model_var in slim.get_model_variables():
            global_summaries.add(
                tf.summary.histogram('ModelVars/' + model_var.op.name,
                                     model_var))
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar('Losses/' + loss_tensor.op.name,
                                  loss_tensor))
        global_summaries.add(
            tf.summary.scalar('Losses/TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summaries |= global_summaries

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        # Save checkpoints regularly.
        keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
        saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            max_to_keep=to_keep)

        # Create ops required to initialize the model from a given checkpoint.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            if not train_config.fine_tune_checkpoint_type:
                # train_config.from_detection_checkpoint field is deprecated. For
                # backward compatibility, fine_tune_checkpoint_type is set based on
                # from_detection_checkpoint.
                if train_config.from_detection_checkpoint:
                    train_config.fine_tune_checkpoint_type = 'detection'
                else:
                    train_config.fine_tune_checkpoint_type = 'classification'
            var_map = detection_model.restore_map(
                fine_tune_checkpoint_type=train_config.
                fine_tune_checkpoint_type,
                load_all_detection_checkpoint_vars=(
                    train_config.load_all_detection_checkpoint_vars))
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    var_map,
                    train_config.fine_tune_checkpoint,
                    include_global_step=False))
            init_saver = tf.train.Saver(available_var_map)

            def initializer_fn(sess):
                init_saver.restore(sess, train_config.fine_tune_checkpoint)

            init_fn = initializer_fn

        # The step budget given by the caller replaces the configured one.
        train_config.num_steps = train_steps
        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            startup_delay_steps=train_config.startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=(train_config.num_steps
                             if train_config.num_steps else None),
            save_summaries_secs=120,
            save_interval_secs=save_steps,
            sync_optimizer=sync_optimizer,
            saver=saver)
コード例 #35
0
ファイル: model_lib.py プロジェクト: ALISCIFP/models
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    This is a closure: besides its arguments it reads `detection_model_fn`,
    `train_config`, `eval_config`, `eval_input_config`, `configs`, `hparams`
    and `use_tpu` from the enclosing scope.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` (or a `TPUEstimatorSpec` when `use_tpu` is set) that
        encapsulates the model and its serving configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      # For evaling on train data, it is necessary to check whether groundtruth
      # must be unpadded.
      boxes_shape = (
          labels[fields.InputDataFields.groundtruth_boxes].get_shape()
          .as_list())
      unpad_groundtruth_tensors = boxes_shape[1] is not None
      labels = unstack_batch(
          labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      # Every optional groundtruth field defaults to None so that a missing
      # key in `labels` never leaves its variable unbound.
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      gt_weights_list = None
      if fields.InputDataFields.groundtruth_weights in labels:
        gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
      gt_is_crowd_list = None
      if fields.InputDataFields.groundtruth_is_crowd in labels:
        gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list,
          groundtruth_weights_list=gt_weights_list,
          groundtruth_is_crowd_list=gt_is_crowd_list)

    preprocessed_images = features[fields.InputDataFields.image]
    prediction_dict = detection_model.predict(
        preprocessed_images, features[fields.InputDataFields.true_image_shape])
    detections = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])

    if mode == tf.estimator.ModeKeys.TRAIN:
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        if not train_config.fine_tune_checkpoint_type:
          # train_config.from_detection_checkpoint field is deprecated. For
          # backward compatibility, set train_config.fine_tune_checkpoint_type
          # based on train_config.from_detection_checkpoint.
          if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
          else:
            train_config.fine_tune_checkpoint_type = 'classification'
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map, train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:
          # On TPU the checkpoint initialization is deferred to scaffold_fn.
          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()
          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      # Snapshot the model losses before the summary entries are added to
      # losses_dict below. `.values()` works on both Python 2 and 3.
      losses = list(losses_dict.values())
      if train_config.add_regularization_loss:
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        if regularization_losses:
          regularization_loss = tf.add_n(regularization_losses,
                                         name='regularization_loss')
          losses.append(regularization_loss)
          losses_dict['Loss/regularization_loss'] = regularization_loss
      total_loss = tf.add_n(losses, name='total_loss')
      losses_dict['Loss/total_loss'] = total_loss

      if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=is_training)
        graph_rewriter_fn()

      # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
      # can write learning rate summaries on TPU without host calls.
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

    if mode == tf.estimator.ModeKeys.TRAIN:
      if use_tpu:
        training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      if train_config.freeze_variables:
        trainable_variables = tf.contrib.framework.filter_variables(
            tf.trainable_variables(),
            exclude_patterns=train_config.freeze_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      summaries = [] if use_tpu else None
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(detections)
      }

    eval_metric_ops = None
    scaffold = None
    if mode == tf.estimator.ModeKeys.EVAL:
      class_agnostic = (fields.DetectionResultFields.detection_classes
                        not in detections)
      groundtruth = _prepare_groundtruth_for_eval(
          detection_model, class_agnostic)
      use_original_images = fields.InputDataFields.original_image in features
      eval_images = (
          features[fields.InputDataFields.original_image] if use_original_images
          else features[fields.InputDataFields.image])
      # Only the first example of the batch is evaluated.
      eval_dict = eval_util.result_dict_for_single_example(
          eval_images[0:1],
          features[inputs.HASH_KEY][0],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=True)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      img_summary = None
      if not use_tpu and use_original_images:
        detection_and_groundtruth = (
            vis_utils.draw_side_by_side_evaluation_image(
                eval_dict, category_index, max_boxes_to_draw=20,
                min_score_thresh=0.2,
                use_normalized_coordinates=False))
        img_summary = tf.summary.image('Detections_Left_Groundtruth_Right',
                                       detection_and_groundtruth)

      # Eval metrics on a single example.
      eval_metrics = eval_config.metrics_set
      if not eval_metrics:
        eval_metrics = ['coco_detection_metrics']
      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
          eval_metrics,
          category_index.values(),
          eval_dict,
          include_metrics_per_category=eval_config.include_metrics_per_category)
      for loss_key, loss_tensor in losses_dict.items():
        eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
      for var in optimizer_summary_vars:
        eval_metric_ops[var.op.name] = (var, tf.no_op())
      if img_summary is not None:
        eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
            img_summary, tf.no_op())
      # Normalize all metric names to `str`. `.items()` works on both
      # Python 2 and 3 (`iteritems()` does not exist on Python 3).
      eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

      if eval_config.use_moving_averages:
        # The averaging object is only used to obtain the
        # variables_to_restore() mapping handed to the Saver.
        variable_averages = tf.train.ExponentialMovingAverage(0.0)
        variables_to_restore = variable_averages.variables_to_restore()
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            variables_to_restore,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
        scaffold = tf.train.Scaffold(saver=saver)

    if use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs,
          scaffold=scaffold)
コード例 #36
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

        Relies on free variables resolved outside this function:
        `detection_model_fn`, `train_config`, `eval_config`,
        `eval_input_config`, `configs`, `hparams` and `use_tpu`.

        Args:
          features: Dictionary of feature tensors, returned from `input_fn`.
          labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
            otherwise None.
          mode: Mode key from tf.estimator.ModeKeys.
          params: Parameter dictionary passed from the estimator.

        Returns:
          An `EstimatorSpec` that encapsulates the model and its serving
            configurations. A `TPUEstimatorSpec` is returned instead when
            `use_tpu` is set and the mode is not EVAL.
        """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(
                labels,
                unpad_groundtruth_tensors=(
                    train_config.unpad_groundtruth_tensors))
        elif mode == tf.estimator.ModeKeys.EVAL:
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[1] is not None
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
            gt_classes_list = labels[
                fields.InputDataFields.groundtruth_classes]
            # Every optional groundtruth field defaults to None so that the
            # provide_groundtruth() call below never reads an unbound local
            # when the corresponding key is absent from `labels`.
            gt_masks_list = None
            if fields.InputDataFields.groundtruth_instance_masks in labels:
                gt_masks_list = labels[
                    fields.InputDataFields.groundtruth_instance_masks]
            gt_keypoints_list = None
            if fields.InputDataFields.groundtruth_keypoints in labels:
                gt_keypoints_list = labels[
                    fields.InputDataFields.groundtruth_keypoints]
            gt_weights_list = None
            if fields.InputDataFields.groundtruth_weights in labels:
                gt_weights_list = labels[
                    fields.InputDataFields.groundtruth_weights]
            gt_is_crowd_list = None
            if fields.InputDataFields.groundtruth_is_crowd in labels:
                gt_is_crowd_list = labels[
                    fields.InputDataFields.groundtruth_is_crowd]
            detection_model.provide_groundtruth(
                groundtruth_boxes_list=gt_boxes_list,
                groundtruth_classes_list=gt_classes_list,
                groundtruth_masks_list=gt_masks_list,
                groundtruth_keypoints_list=gt_keypoints_list,
                groundtruth_weights_list=gt_weights_list,
                groundtruth_is_crowd_list=gt_is_crowd_list)

        preprocessed_images = features[fields.InputDataFields.image]
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape])
        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            detections = detection_model.postprocess(
                prediction_dict,
                features[fields.InputDataFields.true_image_shape])

        if mode == tf.estimator.ModeKeys.TRAIN:
            if train_config.fine_tune_checkpoint and hparams.load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:
                    # On TPU the checkpoint initialisation is deferred into
                    # the scaffold function handed to TPUEstimatorSpec.

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            losses_dict = detection_model.loss(
                prediction_dict,
                features[fields.InputDataFields.true_image_shape])
            losses = list(losses_dict.values())
            if train_config.add_regularization_loss:
                regularization_losses = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                if regularization_losses:
                    regularization_loss = tf.add_n(regularization_losses,
                                                   name='regularization_loss')
                    losses.append(regularization_loss)
                    losses_dict[
                        'Loss/regularization_loss'] = regularization_loss
            total_loss = tf.add_n(losses, name='total_loss')
            losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            # An empty list disables optimize_loss summaries on TPU; None
            # leaves optimize_loss to use its own default set.
            summaries = [] if use_tpu else None
            train_op = tf.contrib.layers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(detections)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic)
            use_original_images = fields.InputDataFields.original_image in features
            eval_images = (features[fields.InputDataFields.original_image]
                           if use_original_images else
                           features[fields.InputDataFields.image])
            eval_dict = eval_util.result_dict_for_single_example(
                eval_images[0:1],
                features[inputs.HASH_KEY][0],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True)

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            img_summary = None
            if not use_tpu and use_original_images:
                detection_and_groundtruth = (
                    vis_utils.draw_side_by_side_evaluation_image(
                        eval_dict,
                        category_index,
                        max_boxes_to_draw=20,
                        min_score_thresh=0.2,
                        use_normalized_coordinates=False))
                img_summary = tf.summary.image(
                    'Detections_Left_Groundtruth_Right',
                    detection_and_groundtruth)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, category_index.values(), eval_dict)
            for loss_key, loss_tensor in losses_dict.items():
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            # Values that are not streaming metrics are paired with a no-op
            # update op so they fit the (value, update_op) metric format.
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if img_summary is not None:
                eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
                    img_summary, tf.no_op())
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
コード例 #37
0
ファイル: trainer_seq.py プロジェクト: zhuzhangliang923/MBMD
def train(create_model_fn, create_tensor_dict_fn, train_config, train_dir,
          img_root):
    """Builds a one-GPU training graph and runs the slim training loop.

    Args:
        create_model_fn: Zero-argument callable returning a detection model;
            the instance built here is handed to the input queue and used
            for `restore_map`.
        create_tensor_dict_fn: Forwarded unchanged to `_create_input_queue`.
        train_config: Training configuration whose fields (batch size, queue
            sizes, optimizer, fine-tune checkpoint, gradient options,
            number of steps, ...) are read below.
        train_dir: Passed to `slim.learning.train` as `logdir`.
        img_root: Forwarded unchanged to `_create_input_queue`.
    """
    detection_model = create_model_fn()
    augmentations = []
    for step in train_config.data_augmentation_options:
        augmentations.append(preprocessor_builder.build(step))

    # The global step and the input pipeline are placed on the CPU.
    with tf.device('cpu:0'):
        global_step = slim.create_global_step()
        input_queue = _create_input_queue(
            train_config.batch_size, create_tensor_dict_fn, detection_model,
            train_config.batch_queue_capacity,
            train_config.num_batch_queue_threads,
            train_config.prefetch_queue_capacity, augmentations, img_root)

    # Loss construction is placed on the first GPU.
    with tf.device('gpu:0'):
        _create_losses(input_queue, create_model_fn)

    pending_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    training_optimizer = optimizer_builder.build(train_config.optimizer, set())

    # Optional restore from a fine-tune checkpoint at session start.
    init_fn = None
    checkpoint_path = train_config.fine_tune_checkpoint
    if checkpoint_path:
        restorable_vars = (
            variables_helper.get_variables_available_in_checkpoint(
                detection_model.restore_map(
                    from_detection_checkpoint=(
                        train_config.from_detection_checkpoint)),
                checkpoint_path))
        init_saver = tf.train.Saver(restorable_vars)

        def _restore_from_checkpoint(sess):
            init_saver.restore(sess, checkpoint_path)

        init_fn = _restore_from_checkpoint

    # Gradients of the total loss with respect to all trainable variables.
    total_loss = tf.losses.get_total_loss()
    grads_and_vars = training_optimizer.compute_gradients(
        total_loss, tf.trainable_variables())

    # Scale bias gradients when a multiplier is configured.
    bias_multiplier = train_config.bias_grad_multiplier
    if bias_multiplier:
        grads_and_vars = variables_helper.multiply_gradients_matching_regex(
            grads_and_vars, ['.*/biases'], multiplier=bias_multiplier)

    # Freeze the variables matching the configured patterns, if any.
    frozen_patterns = train_config.freeze_variables
    if frozen_patterns:
        grads_and_vars = variables_helper.freeze_gradients_matching_regex(
            grads_and_vars, frozen_patterns)

    # Clip gradient norms when a positive threshold is configured.
    clip_norm = train_config.gradient_clipping_by_norm
    if clip_norm > 0:
        with tf.name_scope('clip_grads'):
            grads_and_vars = slim.learning.clip_gradient_norms(
                grads_and_vars, clip_norm)

    # The train tensor yields the loss only after every update op has run.
    pending_ops.append(
        training_optimizer.apply_gradients(grads_and_vars,
                                           global_step=global_step))
    with tf.control_dependencies([tf.group(*pending_ops)]):
        train_tensor = tf.identity(total_loss, name='train_op')

    # One scalar summary per registered loss plus one for the total.
    summaries = {
        tf.summary.scalar(loss_tensor.op.name, loss_tensor)
        for loss_tensor in tf.losses.get_losses()
    }
    summaries.add(tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=(
            train_config.keep_checkpoint_every_n_hours))

    slim.learning.train(
        train_tensor,
        logdir=train_dir,
        session_config=session_config,
        init_fn=init_fn,
        summary_op=summary_op,
        # A falsy num_steps means "no step limit".
        number_of_steps=train_config.num_steps or None,
        save_summaries_secs=120,
        saver=saver)
コード例 #38
0
ファイル: trainer.py プロジェクト: smajida/models
def train(create_tensor_dict_fn,
          create_model_fn,
          train_config,
          master,
          task,
          num_clones,
          worker_replicas,
          clone_on_cpu,
          ps_tasks,
          worker_job_name,
          is_chief,
          train_dir,
          graph_hook_fn=None):
  """Training function for detection models.

  Builds the training graph (input queue, model clones, optimizer, gradient
  post-processing, summaries, savers) inside a fresh `tf.Graph` and then
  hands control to `slim.learning.train`. Nothing is returned.

  Args:
    create_tensor_dict_fn: a function to create a tensor input dictionary.
    create_model_fn: a function that creates a DetectionModel and generates
                     losses.
    train_config: a train_pb2.TrainConfig protobuf.
    master: BNS name of the TensorFlow master to use.
    task: The task id of this training instance.
    num_clones: The number of clones to run per machine.
    worker_replicas: The number of work replicas to train with.
    clone_on_cpu: True if clones should be forced to run on CPU.
    ps_tasks: Number of parameter server tasks.
    worker_job_name: Name of the worker job.
    is_chief: Whether this replica is the chief replica.
    train_dir: Directory to write checkpoints and training summaries to.
    graph_hook_fn: Optional function that is called after the training graph is
      completely built. This is helpful to perform additional changes to the
      training graph such as optimizing batchnorm. The function should modify
      the default graph.
  """

  # Within this function this instance is used only to obtain the checkpoint
  # restore map further below.
  # NOTE(review): it is created before the new graph becomes the default.
  detection_model = create_model_fn()
  data_augmentation_options = [
      preprocessor_builder.build(step)
      for step in train_config.data_augmentation_options]

  with tf.Graph().as_default():
    # Build a configuration specifying multi-GPU and multi-replicas.
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=num_clones,
        clone_on_cpu=clone_on_cpu,
        replica_id=task,
        num_replicas=worker_replicas,
        num_ps_tasks=ps_tasks,
        worker_job_name=worker_job_name)

    # Place the global step on the device storing the variables.
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    # The configured batch size is split across clones with floor division,
    # so any remainder is dropped.
    with tf.device(deploy_config.inputs_device()):
      input_queue = create_input_queue(
          train_config.batch_size // num_clones, create_tensor_dict_fn,
          train_config.batch_queue_capacity,
          train_config.num_batch_queue_threads,
          train_config.prefetch_queue_capacity, data_augmentation_options)

    # Gather initial summaries.
    # TODO(rathodv): See if summaries can be added/extracted from global tf
    # collections so that they don't have to be passed around.
    # This snapshot is taken before the clones are created, so it holds only
    # the summaries registered up to this point.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    global_summaries = set([])

    # Bind the arguments that do not vary per clone; create_clones supplies
    # the remaining positional argument (the input queue).
    model_fn = functools.partial(_create_losses,
                                 create_model_fn=create_model_fn,
                                 train_config=train_config)
    clones = model_deploy.create_clones(deploy_config, model_fn, [input_queue])
    first_clone_scope = clones[0].scope

    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by model_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    with tf.device(deploy_config.optimizer_device()):
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)
      # Each variable returned alongside the optimizer is logged as a scalar
      # under the 'LearningRate' summary family.
      for var in optimizer_summary_vars:
        tf.summary.scalar(var.op.name, var, family='LearningRate')

    # When synchronous replicas are requested the optimizer is wrapped, and
    # the wrapper is also passed to slim.learning.train below.
    sync_optimizer = None
    if train_config.sync_replicas:
      training_optimizer = tf.train.SyncReplicasOptimizer(
          training_optimizer,
          replicas_to_aggregate=train_config.replicas_to_aggregate,
          total_num_replicas=worker_replicas)
      sync_optimizer = training_optimizer

    with tf.device(deploy_config.optimizer_device()):
      # NOTE(review): presumably None makes optimize_clones collect the
      # regularization losses itself while [] disables them; confirm against
      # model_deploy.optimize_clones.
      regularization_losses = (None if train_config.add_regularization_loss
                               else [])
      total_loss, grads_and_vars = model_deploy.optimize_clones(
          clones, training_optimizer,
          regularization_losses=regularization_losses)
      # Fail fast with a descriptive message if the loss is inf or NaN.
      total_loss = tf.check_numerics(total_loss, 'LossTensor is inf or nan.')

      # Optionally multiply bias gradients by train_config.bias_grad_multiplier.
      if train_config.bias_grad_multiplier:
        biases_regex_list = ['.*/biases']
        grads_and_vars = variables_helper.multiply_gradients_matching_regex(
            grads_and_vars,
            biases_regex_list,
            multiplier=train_config.bias_grad_multiplier)

      # Optionally freeze some layers by setting their gradients to be zero.
      if train_config.freeze_variables:
        grads_and_vars = variables_helper.freeze_gradients_matching_regex(
            grads_and_vars, train_config.freeze_variables)

      # Optionally clip gradients
      if train_config.gradient_clipping_by_norm > 0:
        with tf.name_scope('clip_grads'):
          grads_and_vars = slim.learning.clip_gradient_norms(
              grads_and_vars, train_config.gradient_clipping_by_norm)

      # Create gradient updates.
      grad_updates = training_optimizer.apply_gradients(grads_and_vars,
                                                        global_step=global_step)
      update_ops.append(grad_updates)
      update_op = tf.group(*update_ops, name='update_barrier')
      # Evaluating train_tensor yields the loss value, but only after the
      # grouped update ops (including the gradient application) have run.
      with tf.control_dependencies([update_op]):
        train_tensor = tf.identity(total_loss, name='train_op')

    if graph_hook_fn:
      with tf.device(deploy_config.variables_device()):
        graph_hook_fn()

    # Add summaries.
    for model_var in slim.get_model_variables():
      global_summaries.add(tf.summary.histogram('ModelVars/' +
                                                model_var.op.name, model_var))
    for loss_tensor in tf.losses.get_losses():
      global_summaries.add(tf.summary.scalar('Losses/' + loss_tensor.op.name,
                                             loss_tensor))
    global_summaries.add(
        tf.summary.scalar('Losses/TotalLoss', tf.losses.get_total_loss()))

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))
    summaries |= global_summaries

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    # Soft placement allows placing on CPU ops without GPU implementation.
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)

    # Save checkpoints regularly.
    keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
    saver = tf.train.Saver(
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

    # Create ops required to initialize the model from a given checkpoint.
    init_fn = None
    if train_config.fine_tune_checkpoint:
      if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, fine_tune_checkpoint_type is set based on
        # from_detection_checkpoint.
        # Note that this writes the derived value back into train_config.
        if train_config.from_detection_checkpoint:
          train_config.fine_tune_checkpoint_type = 'detection'
        else:
          train_config.fine_tune_checkpoint_type = 'classification'
      var_map = detection_model.restore_map(
          fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
          load_all_detection_checkpoint_vars=(
              train_config.load_all_detection_checkpoint_vars))
      # NOTE(review): judging by the helper's name, this keeps only the
      # variables that exist in the checkpoint; confirm in variables_helper.
      available_var_map = (variables_helper.
                           get_variables_available_in_checkpoint(
                               var_map, train_config.fine_tune_checkpoint))
      init_saver = tf.train.Saver(available_var_map)
      # Callback given to slim.learning.train as init_fn; it restores the
      # fine-tune checkpoint into the supplied session.
      def initializer_fn(sess):
        init_saver.restore(sess, train_config.fine_tune_checkpoint)
      init_fn = initializer_fn

    # number_of_steps is None when num_steps is unset or zero.
    slim.learning.train(
        train_tensor,
        logdir=train_dir,
        master=master,
        is_chief=is_chief,
        session_config=session_config,
        startup_delay_steps=train_config.startup_delay_steps,
        init_fn=init_fn,
        summary_op=summary_op,
        number_of_steps=(
            train_config.num_steps if train_config.num_steps else None),
        save_summaries_secs=120,
        sync_optimizer=sync_optimizer,
        saver=saver)
コード例 #39
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

        Args:
            features: Dictionary of feature tensors, returned from `input_fn`.
            labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
            otherwise None.
            mode: Mode key from tf.estimator.ModeKeys.
            params: Parameter dictionary passed from the estimator.

        Returns:
            An `EstimatorSpec` that encapsulates the model and its serving
            configurations.
        """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))

        scaffold_fn = None
        scaffold = None
        eval_metric_ops = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            # get the optimizer and global step:
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

            #get the trainable variables
            #trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            #get the clip_gradients_value
            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            total_loss = 0.
            tower_grads = []
            with tf.variable_scope(tf.get_variable_scope()):
                feature_list, label_list = split_features_and_labels(
                    features, labels, train_config.GPU_num)
                for i in xrange(train_config.GPU_num):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                            loss = tower_loss(scope=scope,
                                              features=feature_list[i],
                                              labels=label_list[i],
                                              detection_model=detection_model,
                                              train_config=train_config)
                            tf.get_variable_scope().reuse_variables()
                            grads = training_optimizer.compute_gradients(
                                loss=loss)
                            if isinstance(clip_gradients_value, float):
                                grads = clip_gradients_by_norm(
                                    grads, clip_gradients_value)
                            tower_grads.append(grads)
                            total_loss += loss
            total_loss /= train_config.GPU_num
            grad_avg = average_gradients(tower_grads)

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                apply_gradient_op = training_optimizer.apply_gradients(
                    grads_and_vars=grad_avg, global_step=global_step)

            train_op = apply_gradient_op

            if train_config.fine_tune_checkpoint:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        elif mode == tf.estimator.ModeKeys.EVAL:
            detection_model = detection_model_fn(is_training=is_training,
                                                 add_summaries=(not use_tpu))
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            #in mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.PREDICT, I explictly set the evaluation and prediction to run on CPU
            with tf.device('/cpu:1'):
                # training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer )
                boxes_shape = (labels[fields.InputDataFields.
                                      groundtruth_boxes].get_shape().as_list())
                unpad_groundtruth_tensors = boxes_shape[
                    1] is not None and not use_tpu
                labels = unstack_batch(
                    labels,
                    unpad_groundtruth_tensors=unpad_groundtruth_tensors)

                gt_boxes_list = labels[
                    fields.InputDataFields.groundtruth_boxes]
                gt_classes_list = labels[
                    fields.InputDataFields.groundtruth_classes]
                gt_masks_list = None
                if fields.InputDataFields.groundtruth_instance_masks in labels:
                    gt_masks_list = labels[
                        fields.InputDataFields.groundtruth_instance_masks]
                gt_keypoints_list = None
                if fields.InputDataFields.groundtruth_keypoints in labels:
                    gt_keypoints_list = labels[
                        fields.InputDataFields.groundtruth_keypoints]
                gt_weights_list = None
                if fields.InputDataFields.groundtruth_weights in labels:
                    gt_weights_list = labels[
                        fields.InputDataFields.groundtruth_weights]
                gt_confidences_list = None
                if fields.InputDataFields.groundtruth_confidences in labels:
                    gt_confidences_list = labels[
                        fields.InputDataFields.groundtruth_confidences]
                gt_is_crowd_list = None
                if fields.InputDataFields.groundtruth_is_crowd in labels:
                    gt_is_crowd_list = labels[
                        fields.InputDataFields.groundtruth_is_crowd]
                detection_model.provide_groundtruth(
                    groundtruth_boxes_list=gt_boxes_list,
                    groundtruth_classes_list=gt_classes_list,
                    groundtruth_confidences_list=gt_confidences_list,
                    groundtruth_masks_list=gt_masks_list,
                    groundtruth_keypoints_list=gt_keypoints_list,
                    groundtruth_weights_list=gt_weights_list,
                    groundtruth_is_crowd_list=gt_is_crowd_list)

                training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                    train_config.optimizer)

                preprocessed_images = features[fields.InputDataFields.image]
                if use_tpu and train_config.use_bfloat16:
                    with tf.contrib.tpu.bfloat16_scope():
                        prediction_dict = detection_model.predict(
                            preprocessed_images,
                            features[fields.InputDataFields.true_image_shape])
                    for k, v in prediction_dict.items():
                        if v.dtype == tf.bfloat16:
                            prediction_dict[k] = tf.cast(v, tf.float32)
                else:
                    prediction_dict = detection_model.predict(
                        preprocessed_images,
                        features[fields.InputDataFields.true_image_shape])

                detections = detection_model.postprocess(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])

                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                losses.append(regularization_loss)
                losses_dict['Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

                if 'graph_rewriter_config' in configs:
                    graph_rewriter_fn = graph_rewriter_builder.build(
                        configs['graph_rewriter_config'],
                        is_training=is_training)
                    graph_rewriter_fn()

                class_agnostic = (
                    fields.DetectionResultFields.detection_classes
                    not in detections)
                groundtruth = _prepare_groundtruth_for_eval(
                    detection_model, class_agnostic,
                    eval_input_config.max_number_of_boxes)
                use_original_images = fields.InputDataFields.original_image in features
                if use_original_images:
                    eval_images = features[
                        fields.InputDataFields.original_image]
                    true_image_shapes = tf.slice(
                        features[fields.InputDataFields.true_image_shape],
                        [0, 0], [-1, 3])
                    original_image_spatial_shapes = features[
                        fields.InputDataFields.original_image_spatial_shape]
                else:
                    eval_images = features[fields.InputDataFields.image]
                    true_image_shapes = None
                    original_image_spatial_shapes = None

                eval_dict = eval_util.result_dict_for_batched_example(
                    eval_images,
                    features[inputs.HASH_KEY],
                    detections,
                    groundtruth,
                    class_agnostic=class_agnostic,
                    scale_to_absolute=True,
                    original_image_spatial_shapes=original_image_spatial_shapes,
                    true_image_shapes=true_image_shapes)

                if class_agnostic:
                    category_index = label_map_util.create_class_agnostic_category_index(
                    )
                else:
                    category_index = label_map_util.create_category_index_from_labelmap(
                        eval_input_config.label_map_path)
                vis_metric_ops = None
                if not use_tpu and use_original_images:
                    eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                        category_index,
                        max_examples_to_draw=eval_config.num_visualizations,
                        max_boxes_to_draw=eval_config.
                        max_num_boxes_to_visualize,
                        min_score_thresh=eval_config.min_score_threshold,
                        use_normalized_coordinates=False)
                    vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                        eval_dict)

                # Eval metrics on a single example.
                eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                    eval_config, category_index.values(), eval_dict)
                for loss_key, loss_tensor in iter(losses_dict.items()):
                    eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
                for var in optimizer_summary_vars:
                    eval_metric_ops[var.op.name] = (var, tf.no_op())
                if vis_metric_ops is not None:
                    eval_metric_ops.update(vis_metric_ops)
                eval_metric_ops = {
                    str(k): v
                    for k, v in eval_metric_ops.items()
                }

                if eval_config.use_moving_averages:
                    variable_averages = tf.train.ExponentialMovingAverage(0.0)
                    variables_to_restore = variable_averages.variables_to_restore(
                    )
                    keep_checkpoint_every_n_hours = (
                        train_config.keep_checkpoint_every_n_hours)
                    saver = tf.train.Saver(variables_to_restore,
                                           keep_checkpoint_every_n_hours=
                                           keep_checkpoint_every_n_hours)
                    scaffold = tf.train.Scaffold(saver=saver)

        elif mode == tf.estimator.ModeKeys.PREDICT:
            detection_model = detection_model_fn(is_training=is_training,
                                                 add_summaries=(not use_tpu))
            #similar to EVAL mode, I run PREDICT on CPU too.
            with tf.device(':/cpu:1'):
                preprocessed_images = features[fields.InputDataFields.image]

                if use_tpu and train_config.use_bfloat16:
                    with tf.contrib.tpu.bfloat16_scope():
                        prediction_dict = detection_model.predict(
                            preprocessed_images,
                            features[fields.InputDataFields.true_image_shape])
                        for k, v in prediction_dict.items():
                            if v.dtype == tf.bfloat16:
                                prediction_dict[k] = tf.cast(v, tf.float32)
                else:
                    prediction_dict = detection_model.predict(
                        preprocessed_images,
                        features[fields.InputDataFields.true_image_shape])

                detections = detection_model.postprocess(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])

                exported_output = exporter_lib.add_output_tensor_nodes(
                    detections)
                export_outputs = {
                    tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                    tf.estimator.export.PredictOutput(exported_output)
                }

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            # The scaffold built here only contains a Saver.
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)

            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
コード例 #40
0
def train(create_tensor_dict_fn, create_model_fn, train_config, master, task,
          num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name,
          is_chief, train_dir):
    """Training function for detection models.

    Builds a fresh graph holding the global step, the input queue, the model
    clones, the optimizer, the gradient update op and the summary op, then
    hands the resulting train tensor to `slim.learning.train`.

    Args:
      create_tensor_dict_fn: a function to create a tensor input dictionary.
      create_model_fn: a function that creates a DetectionModel and generates
        losses.
      train_config: a train_pb2.TrainConfig protobuf.
      master: BNS name of the TensorFlow master to use.
      task: The task id of this training instance.
      num_clones: The number of clones to run per machine.
      worker_replicas: The number of work replicas to train with.
      clone_on_cpu: True if clones should be forced to run on CPU.
      ps_tasks: Number of parameter server tasks.
      worker_job_name: Name of the worker job.
      is_chief: Whether this replica is the chief replica.
      train_dir: Directory to write checkpoints and training summaries to.
    """

    # This model instance is only used below to obtain the restore map for
    # fine-tuning; the clones build their own model through create_model_fn.
    detection_model = create_model_fn()
    data_augmentation_options = [
        preprocessor_builder.build(step)
        for step in train_config.data_augmentation_options
    ]

    with tf.Graph().as_default():
        # Build a configuration specifying multi-GPU and multi-replicas.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=num_clones,
            clone_on_cpu=clone_on_cpu,
            replica_id=task,
            num_replicas=worker_replicas,
            num_ps_tasks=ps_tasks,
            worker_job_name=worker_job_name)

        # Place the global step on the device storing the variables.
        with tf.device(deploy_config.variables_device()):
            global_step = tf.train.create_global_step()

        # The configured batch size is split evenly across the clones.
        with tf.device(deploy_config.inputs_device()):
            input_queue = create_input_queue(
                train_config.batch_size // num_clones, create_tensor_dict_fn,
                train_config.batch_queue_capacity,
                train_config.num_batch_queue_threads,
                train_config.prefetch_queue_capacity,
                data_augmentation_options)

        # Gather initial summaries.
        # TODO(rathodv): See if summaries can be added/extracted from global tf
        # collections so that they don't have to be passed around.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        global_summaries = set()

        model_fn = functools.partial(_create_losses,
                                     create_model_fn=create_model_fn,
                                     train_config=train_config)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        first_clone_scope = clones[0].scope

        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by model_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # NOTE(review): the builder is handed `global_summaries`, presumably so
        # it can register its own summaries there -- confirm against
        # optimizer_builder.build.
        with tf.device(deploy_config.optimizer_device()):
            training_optimizer = optimizer_builder.build(
                train_config.optimizer, global_summaries)

        sync_optimizer = None
        if train_config.sync_replicas:
            # SyncReplicasOptimizer lives under tf.train, and the total number
            # of replicas is the `worker_replicas` argument of this function
            # (the same value given to DeploymentConfig above).
            training_optimizer = tf.train.SyncReplicasOptimizer(
                training_optimizer,
                replicas_to_aggregate=train_config.replicas_to_aggregate,
                total_num_replicas=worker_replicas)
            sync_optimizer = training_optimizer

        # Create ops required to initialize the model from a given checkpoint.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            var_map = detection_model.restore_map(
                from_detection_checkpoint=train_config.
                from_detection_checkpoint)
            # Only restore the variables that the checkpoint actually provides.
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    var_map, train_config.fine_tune_checkpoint))
            init_saver = tf.train.Saver(available_var_map)

            def initializer_fn(sess):
                init_saver.restore(sess, train_config.fine_tune_checkpoint)

            init_fn = initializer_fn

        with tf.device(deploy_config.optimizer_device()):
            total_loss, grads_and_vars = model_deploy.optimize_clones(
                clones, training_optimizer, regularization_losses=None)
            # Fail fast if the loss diverges.
            total_loss = tf.check_numerics(total_loss,
                                           'LossTensor is inf or nan.')

            # Optionally multiply bias gradients by train_config.bias_grad_multiplier.
            if train_config.bias_grad_multiplier:
                biases_regex_list = ['.*/biases']
                grads_and_vars = variables_helper.multiply_gradients_matching_regex(
                    grads_and_vars,
                    biases_regex_list,
                    multiplier=train_config.bias_grad_multiplier)

            # Optionally freeze some layers by setting their gradients to be zero.
            if train_config.freeze_variables:
                grads_and_vars = variables_helper.freeze_gradients_matching_regex(
                    grads_and_vars, train_config.freeze_variables)

            # Optionally clip gradients
            if train_config.gradient_clipping_by_norm > 0:
                with tf.name_scope('clip_grads'):
                    grads_and_vars = slim.learning.clip_gradient_norms(
                        grads_and_vars, train_config.gradient_clipping_by_norm)

            # Create gradient updates.
            grad_updates = training_optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)
            update_ops.append(grad_updates)

            # Evaluating the train tensor forces every update op to run first.
            update_op = tf.group(*update_ops)
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

        # Add summaries.
        for model_var in slim.get_model_variables():
            global_summaries.add(
                tf.summary.histogram(model_var.op.name, model_var))
        for loss_tensor in tf.losses.get_losses():
            global_summaries.add(
                tf.summary.scalar(loss_tensor.op.name, loss_tensor))
        global_summaries.add(
            tf.summary.scalar('TotalLoss', tf.losses.get_total_loss()))

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        summaries |= global_summaries

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Soft placement allows placing on CPU ops without GPU implementation.
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)

        session_config.gpu_options.allow_growth = True

        # Save checkpoints regularly.
        keep_checkpoint_every_n_hours = train_config.keep_checkpoint_every_n_hours
        saver = tf.train.Saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)

        # A falsy num_steps (e.g. 0) is passed on as None.
        slim.learning.train(
            train_tensor,
            logdir=train_dir,
            master=master,
            is_chief=is_chief,
            session_config=session_config,
            startup_delay_steps=train_config.startup_delay_steps,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=(train_config.num_steps
                             if train_config.num_steps else None),
            save_summaries_secs=120,
            sync_optimizer=sync_optimizer,
            saver=saver)
コード例 #41
0
ファイル: train.py プロジェクト: fmigone/nienluan
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        global_summaries = set([])

        model_fn = functools.partial(trainer._create_losses,
                                     create_model_fn=model,
                                     train_config=train_config)
        clones = model_deploy.create_clones(deploy_config, model_fn,
                                            [input_queue])
        print(len(clones))
        first_clone_scope = clones[0].scope

        update_ops = []
        with tf.device(deploy_config.optimizer_device()):
            #momentum optimizer va summaries
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)
            for var in optimizer_summary_vars:
                tf.summary.scalar(var.op.name, var)

        #restore checkpoint.
        init_fn = None
        if train_config.fine_tune_checkpoint:
            var_map = detection_model.restore_map(
                from_detection_checkpoint=train_config.
                from_detection_checkpoint)
            available_var_map = (
                variables_helper.get_variables_available_in_checkpoint(
                    var_map, train_config.fine_tune_checkpoint))
            init_saver = tf.train.Saver(available_var_map)

            def initializer_fn(sess):