Exemple #1
0
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False):
    """Build the model described by `pipeline_config` and export it.

    The model is built with `is_training=False` and handed to
    `_export_inference_graph`. Afterwards the pipeline config is written to
    `output_directory` with `eval_config.use_moving_averages` set to False.

    Note that `pipeline_config` is modified in place: its
    `eval_config.use_moving_averages` field is overwritten.

    Args:
        input_type: Forwarded unchanged to `_export_inference_graph`.
        pipeline_config: Pipeline config proto; its `model`,
            `graph_rewriter` and `eval_config` fields are read.
        trained_checkpoint_prefix: Forwarded to `_export_inference_graph`.
        output_directory: Forwarded to `_export_inference_graph` and used as
            the destination of the saved pipeline config.
        input_shape: Forwarded to `_export_inference_graph`.
        output_collection_name: Forwarded to `_export_inference_graph`.
        additional_output_tensor_names: Forwarded to
            `_export_inference_graph`.
        write_inference_graph: Forwarded to `_export_inference_graph`.
    """
    model = model_builder.build(pipeline_config.model, is_training=False)

    # A graph rewriter hook is only built when the config carries one.
    rewriter_fn = (
        graph_rewriter_builder.build(pipeline_config.graph_rewriter,
                                     is_training=False)
        if pipeline_config.HasField('graph_rewriter') else None)

    _export_inference_graph(
        input_type,
        model,
        pipeline_config.eval_config.use_moving_averages,
        trained_checkpoint_prefix,
        output_directory,
        additional_output_tensor_names,
        input_shape,
        output_collection_name,
        graph_hook_fn=rewriter_fn,
        write_inference_graph=write_inference_graph)

    # The flag is cleared before the config is written next to the export.
    pipeline_config.eval_config.use_moving_averages = False
    config_util.save_pipeline_config(pipeline_config, output_directory)
def populate_config(settings):
    """Fill the base config file with settings and save new version.

    Reads the pipeline file at `settings['paths']['base_config']`, applies
    the overrides found under `settings['config']`, and writes the resulting
    pipeline config into `settings['dirs']['pipeline']`.

    Args:
        settings: Nested mapping. The keys read are
            `['paths']['base_config']`, `['dirs']['pipeline']` and, under
            `['config']`: `['train_config']['fine_tune_checkpoint']`,
            `['train_config']['num_steps']`,
            `['eval_config']['num_examples']`, `['label_map_path']`,
            `['train_input_reader']['tf_record_input_reader']['input_path']`
            and
            `['eval_input_reader']['tf_record_input_reader']['input_path']`.
    """
    # print(...) with a single string argument prints the same text on
    # Python 2 and Python 3; the former print statements were a SyntaxError
    # on Python 3.
    print('...Reading base config file')
    configs = config_util.get_configs_from_pipeline_file(
        settings['paths']['base_config'])

    print('...Updating config settings')
    overrides = settings['config']
    hparams = tf.contrib.training.HParams(
        **{
            "model.ssd.num_classes":
            1,
            "train_config.fine_tune_checkpoint":
            overrides['train_config']['fine_tune_checkpoint'],
            "train_config.num_steps":
            overrides['train_config']['num_steps'],
            "eval_config.num_examples":
            overrides['eval_config']['num_examples'],
            "label_map_path":
            overrides['label_map_path']
        })
    configs = config_util.merge_external_params_with_configs(configs, hparams)

    # Only the first input path of each reader is replaced; the base config
    # must therefore already contain at least one input path per reader.
    configs['train_input_config'].tf_record_input_reader.input_path[0] = (
        overrides['train_input_reader']['tf_record_input_reader']
        ['input_path'])
    configs['eval_input_config'].tf_record_input_reader.input_path[0] = (
        overrides['eval_input_reader']['tf_record_input_reader']
        ['input_path'])

    print('...Writing new config file')
    pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config,
                                     settings['dirs']['pipeline'])
Exemple #3
0
def create_config_file(input_path, config_params, network_type):
    """Create the training pipeline config for `network_type` and save it.

    The base pipeline file at `input_path` is loaded, the checkpoint path in
    `config_params` is resolved, a network-specific helper applies
    `config_params` to the configs, and the result is saved to
    '/training_dir/model'.

    Note that `config_params['checkpoint_path']` is rewritten in place.

    Args:
        input_path: Path of the base pipeline config file.
        config_params: Mapping of parameters. `checkpoint_path` is either
            None (the default weights under '/weights/<network_type>' are
            used) or the name of a directory under
            '/checkpoints/<network_type>'.
        network_type: One of "ssd_mobilenet", "ssd_inception",
            "ssd_resnet_50", "ssd_fpn", "frcnn_resnet_50",
            "frcnn_resnet_101".

    Raises:
        ValueError: If `network_type` is not one of the supported values.
    """
    configs = config_util.get_configs_from_pipeline_file(input_path)

    if config_params['checkpoint_path'] is not None:
        checkpoint_dir_name = config_params['checkpoint_path']
        listing = os.listdir(
            os.path.join('/checkpoints/' + network_type, checkpoint_dir_name))
        # Use a single ".index" file. Rewriting the path once per match (as
        # the loop previously did) compounded the already-rewritten value
        # when the directory held several checkpoints. Sorting makes the
        # choice deterministic, since os.listdir order is arbitrary.
        index_file = next(
            (name for name in sorted(listing) if name.endswith(".index")),
            None)
        if index_file is not None:
            prefix = index_file.split(".index")[0]
            config_params['checkpoint_path'] = (
                '/checkpoints/' + network_type + '/' + checkpoint_dir_name +
                '/' + prefix)
        # With no ".index" file the value is left untouched, as before.
    else:
        config_params[
            'checkpoint_path'] = '/weights/' + network_type + '/model.ckpt'

    if network_type in ("ssd_mobilenet", "ssd_inception", "ssd_resnet_50",
                        "ssd_fpn"):
        new_configs = config_ssd_mobilenet_inception(configs, config_params)
    elif network_type in ("frcnn_resnet_50", "frcnn_resnet_101"):
        new_configs = config_frcnn_resnet_50_101(configs, config_params)
    else:
        # Previously this fell through with new_configs = None and failed
        # later with an unrelated error inside the proto conversion.
        raise ValueError(
            'Unsupported network type: {}'.format(network_type))

    pipeline_config = config_util.create_pipeline_proto_from_configs(
        new_configs)

    config_util.save_pipeline_config(pipeline_config, '/training_dir/model')
Exemple #4
0
    def train(self,
              epochs=100,
              val_split=0.3,
              clear_folder=False,
              override_pipeline=False,
              eval=False):
        """Prepare the output folder and run training.

        Args:
            epochs: Stored on the instance as `num_steps`.
            val_split: Passed to `_mk_records`.
            clear_folder: When true, the output folder is cleared first.
            override_pipeline: Not used by this method.
            eval: When true, `_train_and_eval` is run instead of `_train`.

        Returns:
            Whatever the parent class `train()` returns.

        Raises:
            Exception: Wraps any exception raised during preparation or
                training; the original is attached as the cause.
        """
        try:
            if clear_folder:
                FileUtil.clear_folder(self._out_folder)

            self.num_steps = epochs
            self._mk_labels_map()
            self._mk_records(val_split)

            # Make sure the export/Servo sub-folder exists.
            export_subdir = os.path.sep.join(["export", "Servo"])
            self._out_folder.joinpath(export_subdir).mkdir(
                exist_ok=True, parents=True)

            # Write the pipeline into the output folder.
            save_pipeline_config(self.pipeline, str(self._out_folder))

            # Start training.
            tf.logging.set_verbosity(tf.logging.INFO)
            run_training = self._train_and_eval if eval else self._train
            run_training()
        except Exception as ex:
            raise Exception("Error training the model : {}".format(ex)) from ex
        return super(TfTrainableModel, self).train()
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False,
                           use_side_inputs=False,
                           side_input_shapes=None,
                           side_input_names=None,
                           side_input_types=None):
  """Exports the inference graph of the model named in the pipeline config.

  After the export, `pipeline_config` is written to `output_directory` with
  `eval_config.use_moving_averages` set to False; the proto passed in is
  modified in place.

  Args:
    input_type: Type of input for the graph. Can be one of ['image_tensor',
      'encoded_image_string_tensor', 'tf_example'].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_prefix: Path to the trained checkpoint file.
    output_directory: Path to write outputs.
    input_shape: Sets a fixed shape for an `image_tensor` input. If not
      specified, will default to [None, None, None, 3].
    output_collection_name: Name of collection to add output tensors to.
      If None, does not add output tensors to a collection.
    additional_output_tensor_names: list of additional output
      tensors to include in the frozen graph.
    write_inference_graph: If true, writes inference graph to disk.
    use_side_inputs: If True, the model requires side_inputs.
    side_input_shapes: List of shapes of the side input tensors,
      required if use_side_inputs is True.
    side_input_names: List of names of the side input tensors,
      required if use_side_inputs is True.
    side_input_types: List of types of the side input tensors,
      required if use_side_inputs is True.
  """
  model = model_builder.build(pipeline_config.model, is_training=False)

  # A graph rewriter hook is only built when the config carries one.
  rewriter_fn = (
      graph_rewriter_builder.build(pipeline_config.graph_rewriter,
                                   is_training=False)
      if pipeline_config.HasField('graph_rewriter') else None)

  _export_inference_graph(
      input_type,
      model,
      pipeline_config.eval_config.use_moving_averages,
      trained_checkpoint_prefix,
      output_directory,
      additional_output_tensor_names,
      input_shape,
      output_collection_name,
      graph_hook_fn=rewriter_fn,
      write_inference_graph=write_inference_graph,
      use_side_inputs=use_side_inputs,
      side_input_shapes=side_input_shapes,
      side_input_names=side_input_names,
      side_input_types=side_input_types)

  # The flag is cleared before the config is written next to the export.
  pipeline_config.eval_config.use_moving_averages = False
  config_util.save_pipeline_config(pipeline_config, output_directory)
Exemple #6
0
 def generate_pipeline_config(self):
     """Write a pipeline config built from the base file plus `self.hparams`.

     Returns:
         The path `<self.prefix>/pipeline.config`.
     """
     base_configs = config_util.get_configs_from_pipeline_file(
         self.base_pipeline_config)
     # The return value is discarded, so this presumably relies on the merge
     # updating `base_configs` in place -- TODO confirm.
     config_util.merge_external_params_with_configs(
         base_configs, tf.contrib.training.HParams(**self.hparams))
     merged_proto = config_util.create_pipeline_proto_from_configs(
         base_configs)
     config_util.save_pipeline_config(merged_proto, self.prefix)
     return os.path.join(self.prefix, 'pipeline.config')
Exemple #7
0
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_dir,
                           output_directory):
  """Exports a checkpoint and SavedModel for the configured model.

  Three things are written under `output_directory`: a copy of the pipeline
  config, a `checkpoint` subdirectory holding the exported checkpoint, and a
  `saved_model` subdirectory holding the SavedModel.

  Args:
    input_type: Type of input for the graph. Can be one of ['image_tensor',
      'encoded_image_string_tensor', 'tf_example'].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_dir: Directory given to a `tf.train.CheckpointManager`;
      its latest checkpoint is the one restored.
    output_directory: Path to write outputs.
  Raises:
    ValueError: if input_type is invalid.
  """
  checkpoint_out_dir = os.path.join(output_directory, 'checkpoint')
  saved_model_out_dir = os.path.join(output_directory, 'saved_model')

  model = model_builder.build(pipeline_config.model, is_training=False)

  checkpoint = tf.train.Checkpoint(model=model)
  source_manager = tf.train.CheckpointManager(
      checkpoint, trained_checkpoint_dir, max_to_keep=1)
  restore_status = checkpoint.restore(
      source_manager.latest_checkpoint).expect_partial()

  modules_by_input_type = {
      'image_tensor': DetectionFromImageModule,
      'encoded_image_string_tensor': DetectionFromEncodedImageModule,
      'tf_example': DetectionFromTFExampleModule,
  }
  if input_type not in modules_by_input_type:
    raise ValueError('Unrecognized `input_type`')
  module_cls = modules_by_input_type[input_type]
  detection_module = module_cls(model)

  # Tracing the concrete function builds the graph and forces the variables
  # to be created; the checkpoint and SavedModel can only be written after
  # that has happened.
  traced_fn = detection_module.__call__.get_concrete_function()
  restore_status.assert_existing_objects_matched()

  export_manager = tf.train.CheckpointManager(
      checkpoint, checkpoint_out_dir, max_to_keep=1)
  export_manager.save(checkpoint_number=0)

  tf.saved_model.save(detection_module,
                      saved_model_out_dir,
                      signatures=traced_fn)

  config_util.save_pipeline_config(pipeline_config, output_directory)
Exemple #8
0
def override_pipeline(pipeline, override_dict, num_classes=0):
    """Apply overrides to a pipeline file and save it to $RESULT_DIR.

    Note that `override_dict` is modified in place: a
    'model.<meta_arch>.num_classes' entry is added to it.

    Args:
        pipeline: Path of the pipeline config file to load.
        override_dict: Mapping of override keys to values, passed to
            `merge_external_params_with_configs` as `kwargs_dict`.
        num_classes: Value stored under the num_classes override key.

    Raises:
        KeyError: If the RESULT_DIR environment variable is not set.
    """
    loaded = config_util.get_configs_from_pipeline_file(pipeline)

    # The override key depends on which model type the config uses.
    num_classes_key = 'model.{}.num_classes'.format(
        loaded["model"].WhichOneof("model"))
    override_dict[num_classes_key] = num_classes

    merged = config_util.merge_external_params_with_configs(
        loaded, kwargs_dict=override_dict)
    config_util.save_pipeline_config(
        config_util.create_pipeline_proto_from_configs(merged),
        os.environ['RESULT_DIR'])
Exemple #9
0
 def write_configuration(self,
                         configuration_pipeline: Dict[str, str]) -> None:
     """Convert the configuration to a pipeline proto and save it.

     The proto is written into `self.path.model_dir`.

     Raises:
         ConfigurationBodyCorrupter: If the proto cannot be created; the
             original error text is passed as `additional_message`.
         ModelTrainingPathNotFound: If saving the proto fails for any
             reason.
     """
     try:
         proto = create_pipeline_proto_from_configs(configuration_pipeline)
     except Exception as e:
         raise ConfigurationBodyCorrupter(additional_message=e.__str__())

     model_dir = self.path.model_dir
     try:
         save_pipeline_config(proto, model_dir)
     except Exception:
         raise ModelTrainingPathNotFound(training_model_path=model_dir)
def update_pipeline_config(tfrecords_dir):
    """Adapt the base model's pipeline.config to the dataset in a directory.

    The base config is read from `FLAGS.base_model_dir`. The number of
    classes, fine-tune checkpoint, train steps, label map and record paths
    are then updated, and the result is saved into `tfrecords_dir`.

    Args:
        tfrecords_dir: Directory holding 'pascal_label_map.pbtxt',
            'train.record' and 'val.record'; the updated config is written
            here as well.

    Returns:
        A tuple of the updated pipeline proto and the path
        `<tfrecords_dir>/pipeline.config`.

    Raises:
        ValueError: If the base model directory name starts with neither
            "ssd" nor "faster_rcnn".
    """
    base_config_path = os.path.join(FLAGS.base_model_dir, 'pipeline.config')
    logger.info('original pipeline.config {}'.format(base_config_path))
    configs = config_util.get_configs_from_pipeline_file(base_config_path)

    # Pick the model section from the (lower-cased) base directory name.
    base_model_name = os.path.basename(
        os.path.normpath(FLAGS.base_model_dir)).lower()
    is_ssd = base_model_name.startswith("ssd")
    if not (is_ssd or base_model_name.startswith("faster_rcnn")):
        raise ValueError(
            'unknown base model {}, we can only handle ssd nor faster_rcnn'.
            format(base_model_name))
    if is_ssd:
        detector_cfg = configs['model'].ssd
        logger.info('found a ssd base model')
    else:
        detector_cfg = configs['model'].faster_rcnn
        logger.info('found a faster_rcnn base model')

    # The class count comes from the label map shipped with the records.
    label_map_path = os.path.join(tfrecords_dir, 'pascal_label_map.pbtxt')
    class_count = len(label_map_util.get_label_map_dict(label_map_path))
    logger.info('num_of_classes from {} to {}'.format(
        detector_cfg.num_classes, class_count))
    detector_cfg.num_classes = class_count

    # Point fine-tuning at the base model checkpoint.
    train_section = configs['train_config']
    train_section.fine_tune_checkpoint = os.path.join(
        FLAGS.base_model_dir, 'model.ckpt')
    logger.info('fine_tune_checkpoint: {}'.format(
        train_section.fine_tune_checkpoint))

    # Train steps, label map and record paths go through the hparams merge.
    overrides = tf.contrib.training.HParams(
        train_steps=FLAGS.num_steps,
        label_map_path=label_map_path,
        train_input_path=os.path.join(tfrecords_dir, 'train.record'),
        eval_input_path=os.path.join(tfrecords_dir, 'val.record'))
    configs = config_util.merge_external_params_with_configs(
        configs, overrides)

    updated_proto = config_util.create_pipeline_proto_from_configs(configs)
    updated_config_path = os.path.join(tfrecords_dir, 'pipeline.config')
    config_util.save_pipeline_config(updated_proto, tfrecords_dir)
    logger.info('updated pipeline.config {}'.format(tfrecords_dir))
    return updated_proto, updated_config_path
Exemple #11
0
  def test_save_pipeline_config(self):
    """Checks that a saved pipeline config reads back unchanged."""
    original = pipeline_pb2.TrainEvalPipelineConfig()
    original.model.faster_rcnn.num_classes = 10
    original.train_config.batch_size = 32
    original.train_input_reader.label_map_path = "path/to/label_map"
    original.eval_config.num_examples = 20
    original.eval_input_reader.queue_capacity = 100

    # Round trip: save to the temp dir, load the file, rebuild the proto.
    config_util.save_pipeline_config(original, self.get_temp_dir())
    saved_path = os.path.join(self.get_temp_dir(), "pipeline.config")
    reloaded = config_util.create_pipeline_proto_from_configs(
        config_util.get_configs_from_pipeline_file(saved_path))

    self.assertEqual(original, reloaded)
    def test_save_pipeline_config(self):
        """Checks that a saved pipeline config reads back unchanged."""
        original = pipeline_pb2.TrainEvalPipelineConfig()
        original.model.faster_rcnn.num_classes = 10
        original.train_config.batch_size = 32
        original.train_input_reader.label_map_path = "path/to/label_map"
        original.eval_config.num_examples = 20
        original.eval_input_reader.queue_capacity = 100

        # Round trip: save to the temp dir, load the file, rebuild the proto.
        config_util.save_pipeline_config(original, self.get_temp_dir())
        saved_path = os.path.join(self.get_temp_dir(), "pipeline.config")
        reloaded = config_util.create_pipeline_proto_from_configs(
            config_util.get_configs_from_pipeline_file(saved_path))

        self.assertEqual(original, reloaded)
def create_model(pipeline_config_path, output_directory, checkpoint_path):
    """Build a single-class SSD model for fine-tuning and restore weights.

    The checkpoint number is parsed from the text after the last '-' in
    `checkpoint_path`. When it is 0, only the feature extractor and the box
    regression parts of the box predictor are restored from
    `checkpoint_path`. When it is greater than 0, the full model is restored
    from the latest checkpoint found in `<output_directory>/checkpoint`.

    The adjusted pipeline config is also saved into `output_directory`.

    Args:
        pipeline_config_path: Path of the pipeline config file to load.
        output_directory: Directory receiving the pipeline config; its
            'checkpoint' subdirectory is managed by the returned manager.
        checkpoint_path: Checkpoint path ending in '-<number>'.

    Returns:
        A tuple `(detection_model, pipeline_proto, ckpt_manager)`.
    """
    tf.keras.backend.clear_session()

    print('Building model and restoring weights for fine-tuning...',
          flush=True)
    managed_checkpoint_dir = os.path.join(output_directory, 'checkpoint')
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    model_config = configs['model']
    model_config.ssd.num_classes = 1
    model_config.ssd.freeze_batchnorm = True
    detection_model = model_builder.build(model_config=model_config,
                                          is_training=True)
    pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_proto, output_directory)

    checkpoint_number = int(checkpoint_path.split('-')[-1])
    print(checkpoint_number)
    if checkpoint_number == 0:
        # Restore everything except the classification head: wrap only the
        # wanted sub-objects in stand-in Checkpoint objects.
        box_predictor = detection_model._box_predictor
        partial_box_predictor = tf.compat.v2.train.Checkpoint(
            _base_tower_layers_for_heads=(
                box_predictor._base_tower_layers_for_heads),
            _box_prediction_head=box_predictor._box_prediction_head,
        )
        partial_model = tf.compat.v2.train.Checkpoint(
            _feature_extractor=detection_model._feature_extractor,
            _box_predictor=partial_box_predictor)
        tf.compat.v2.train.Checkpoint(
            model=partial_model).restore(checkpoint_path).expect_partial()

    full_checkpoint = tf.compat.v2.train.Checkpoint(model=detection_model)
    ckpt_manager = tf.train.CheckpointManager(full_checkpoint,
                                              managed_checkpoint_dir,
                                              max_to_keep=1)
    if checkpoint_number > 0:
        full_checkpoint.restore(ckpt_manager.latest_checkpoint)

    # Run the model once on a dummy image; the outputs are discarded.
    dummy_image, dummy_shapes = detection_model.preprocess(
        tf.zeros([1, 320, 320, 3]))
    dummy_predictions = detection_model.predict(dummy_image, dummy_shapes)
    detection_model.postprocess(dummy_predictions, dummy_shapes)
    print('Weights restored!')
    return detection_model, pipeline_proto, ckpt_manager
Exemple #14
0
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False):
  """Exports the inference graph of the model named in the pipeline config.

  After the export, `pipeline_config` is written to `output_directory` with
  `eval_config.use_moving_averages` set to False; the proto passed in is
  modified in place.

  Args:
    input_type: Type of input for the graph. Can be one of ['image_tensor',
      'encoded_image_string_tensor', 'tf_example'].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_prefix: Path to the trained checkpoint file.
    output_directory: Path to write outputs.
    input_shape: Sets a fixed shape for an `image_tensor` input. If not
      specified, will default to [None, None, None, 3].
    output_collection_name: Name of collection to add output tensors to.
      If None, does not add output tensors to a collection.
    additional_output_tensor_names: list of additional output
      tensors to include in the frozen graph.
    write_inference_graph: If true, writes inference graph to disk.
  """
  model = model_builder.build(pipeline_config.model, is_training=False)

  # A graph rewriter hook is only built when the config carries one.
  rewriter_fn = (
      graph_rewriter_builder.build(pipeline_config.graph_rewriter,
                                   is_training=False)
      if pipeline_config.HasField('graph_rewriter') else None)

  _export_inference_graph(
      input_type,
      model,
      pipeline_config.eval_config.use_moving_averages,
      trained_checkpoint_prefix,
      output_directory,
      additional_output_tensor_names,
      input_shape,
      output_collection_name,
      graph_hook_fn=rewriter_fn,
      write_inference_graph=write_inference_graph)

  # The flag is cleared before the config is written next to the export.
  pipeline_config.eval_config.use_moving_averages = False
  config_util.save_pipeline_config(pipeline_config, output_directory)
Exemple #15
0
    def get_configuration_content(self, network_info: NetworkInformation) -> str:
        """Return the adjusted pipeline config of a network as text.

        The network's `pipeline.config` is loaded from the weights
        directory, adjusted through `_adjust_configuration_content` with the
        network's `checkpoint/ckpt-0` path, written to
        `/tmp/pipeline.config`, and read back as a string.

        Args:
            network_info: Supplies `network_architecture`, the name of the
                sub-folder under `self.path.weights_dir`.

        Returns:
            The text content of the generated pipeline config file.

        Raises:
            ConfigurationPipelineNotFound: If any step fails; the original
                error text is passed as `additional_message`.
        """
        # Bound before the try block so the except handler can always refer
        # to it, even when building the path itself is what failed.
        network_path = None
        try:
            architecture_dir = os.path.join(self.path.weights_dir,
                                            network_info.network_architecture)
            network_path = os.path.join(architecture_dir, "pipeline.config")
            config_file_content: Dict[str, str] = get_configs_from_pipeline_file(network_path)
            checkpoint_path = os.path.join(architecture_dir, 'checkpoint/ckpt-0')
            content: Dict[str, str] = self._adjust_configuration_content(config_file_content=config_file_content,
                                                                         network_path=checkpoint_path)

            # Returning the proto dict directly causes an error, so the file
            # is saved and then read back with the plain Python reader.
            pipeline_config = create_pipeline_proto_from_configs(content)
            save_pipeline_config(pipeline_config, "/tmp/")
            # The context manager closes the handle; it was previously left
            # open until garbage collection.
            with open("/tmp/pipeline.config", "r") as config_file:
                return config_file.read()

        except Exception as e:
            raise ConfigurationPipelineNotFound(additional_message=e.__str__(), pipeline_path=network_path)
Exemple #16
0
def override_pipeline_configs(config_file, overrides, out_dir=""):
    """Apply overrides to a pipeline config file and save the result.

    Two values are always forced before the overrides are applied:
    `train_config.from_detection_checkpoint` is set to True and
    `eval_config.num_examples` to 25000.

    Args:
        config_file: Path of the pipeline config file to load.
        overrides: Mapping of field name to value. "num_classes" and
            "width_height" (a pair) are handled by dedicated helpers; other
            fields go through the `config_util` update helpers.
        out_dir: Directory passed to `save_pipeline_config`.

    Raises:
        ValueError: Re-raised from the generic update for any field other
            than "train_config.fine_tune_checkpoint".
    """
    configs = config_util.get_configs_from_pipeline_file(config_file)

    configs['train_config'].from_detection_checkpoint = True
    configs['eval_config'].num_examples = 25000

    for field, value in overrides.items():
        if field == "num_classes":
            set_number_of_classes(configs['model'], value)
            continue
        if field == "width_height":
            width, height = value[0], value[1]
            set_resizer_width_height(configs['model'], width, height)
            continue
        if config_util._maybe_update_config_with_key_value(
                configs, field, value):
            continue

        try:
            config_util._update_generic(configs, field, value)
        except ValueError:
            # The fine-tune checkpoint is set directly when the generic
            # update rejects it; every other failure propagates.
            if field != "train_config.fine_tune_checkpoint":
                raise
            configs['train_config'].fine_tune_checkpoint = value

    final_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(final_proto, out_dir)
Exemple #17
0
    def patch_pipeline_config(self, model_base_name):
        """Patch the base model's pipeline.config for this training run.

        The base config is read from
        `<path_perm_storage>/model_base_checkpoints/<model_base_name>`, and
        the patched copy is saved under `<path_perm_storage>/patched_config`.
        This sets `self.label_map_path` and `self.patched_config_path`.

        Args:
            model_base_name: Name of the base model folder.
        """
        self.label_map_path = os.path.join(self.src_train_path,
                                           "label_map.pbtxt")

        base_dir = os.path.join(self.path_perm_storage,
                                "model_base_checkpoints", model_base_name)
        configs = config_util.get_configs_from_pipeline_file(
            os.path.join(base_dir, "pipeline.config"))

        configs["model"].ssd.num_classes = self.num_classes

        train_section = configs["train_config"]
        train_section.fine_tune_checkpoint = os.path.join(base_dir, "ckpt-0")
        train_section.batch_size = self.batch_size
        train_section.use_bfloat16 = False

        # Both input readers get the label map and the record files found
        # for their dataset type (training first, then evaluation).
        reader_datasets = (
            ("train_input_config", DatasetType.training),
            ("eval_input_config", DatasetType.evaluation),
        )
        for config_key, dataset_type in reader_datasets:
            input_section = configs[config_key]
            input_section.label_map_path = self.label_map_path
            input_section.tf_record_input_reader.input_path[:] = (
                self.scan_dir_for_records(dataset_type.name))

        patched_proto = config_util.create_pipeline_proto_from_configs(
            configs)
        patched_dir = os.path.join(self.path_perm_storage, "patched_config")
        config_util.save_pipeline_config(patched_proto, patched_dir)
        self.patched_config_path = os.path.join(patched_dir,
                                                "pipeline.config")
        print("Source configuration was patched: {0}".format(
            self.patched_config_path))
def train_loop(
    pipeline_config_path,
    model_dir,
    val_checkpoint_dir,
    config_override=None,
    train_steps=None,
    use_tpu=False,
    save_final_config=False,
    checkpoint_every_n=1000,
    checkpoint_max_to_keep=7,
    record_summaries=True,
    performance_summary_exporter=None,
    **kwargs):
  """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `checkpoint_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    model_dir:
      The directory to save checkpoints and summaries to.
    val_checkpoint_dir:
      The directory to save validation checkpoint.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training steps
      is set from the `TrainConfig` proto.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    checkpoint_every_n:
      Checkpoint every n training steps.
    checkpoint_max_to_keep:
      int, the number of most recent checkpoints to keep in the model directory.
    record_summaries: Boolean, whether or not to record summaries.
    performance_summary_exporter: function for exporting performance metrics.
    **kwargs: Additional keyword arguments for configuration override.
  """

  print('START train looop function ========================')

  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']
  steps_per_sec_list = []

  configs = get_configs_from_pipeline_file(
      pipeline_config_path, config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(
      configs, None, kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # update train_steps from config but only when non-zero value is provided
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

  if train_config.load_all_detection_checkpoint_vars:
    raise ValueError('train_pb2.load_all_detection_checkpoint_vars '
                     'unsupported in TF2')

  config_util.update_fine_tune_checkpoint_type(train_config)
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
  fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

  # Write the as-run pipeline config to disk.
  if save_final_config:
    tf.logging.info('Saving pipeline config file to directory {}'.format(
        model_dir))
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # Build the model, optimizer, and training input
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'](
        model_config=model_config, is_training=True)

    def train_dataset_fn(input_context):
      """Callable to create train input."""
      # Create the inputs.
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=detection_model,
          input_context=input_context)
      train_input = train_input.repeat()
      return train_input

    train_input = strategy.experimental_distribute_datasets_from_function(
        train_dataset_fn)


    global_step = tf.Variable(
        0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step',
        aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
    optimizer, (learning_rate,) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    # We run the detection_model on dummy inputs in order to ensure that the
    # model and all its variables have been properly constructed. Specifically,
    # this is currently necessary prior to (potentially) creating shadow copies
    # of the model variables for the EMA optimizer.
    if train_config.optimizer.use_moving_average:
      _ensure_model_is_built(detection_model, train_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  # Get the appropriate filepath (temporary or not) based on whether the worker
  # is the chief.
  summary_writer_filepath = get_filepath(strategy,
                                         os.path.join(model_dir, 'train'))
  if record_summaries:
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)
  else:
    summary_writer = tf2.summary.create_noop_writer()

  if use_tpu:
    num_steps_per_iteration = 100
  else:
    # TODO(b/135933080) Explore setting to 100 when GPU performance issues
    # are fixed.
    num_steps_per_iteration = 1

  with summary_writer.as_default():
    with strategy.scope():
      with tf.compat.v2.summary.record_if(
          lambda: global_step % num_steps_per_iteration == 0):
        # Load a fine-tuning checkpoint.
        if train_config.fine_tune_checkpoint:
          load_fine_tune_checkpoint(
              detection_model, train_config.fine_tune_checkpoint,
              fine_tune_checkpoint_type, fine_tune_checkpoint_version,
              train_config.run_fine_tune_checkpoint_dummy_computation,
              train_input, unpad_groundtruth_tensors)

        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)
        val_ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        manager_dir = get_filepath(strategy, model_dir)
        val_manager_dir = get_filepath(strategy, val_checkpoint_dir)



        # if not strategy.extended.should_checkpoint:
            # checkpoint_max_to_keep = 1
            
        checkpoint_max_to_keep = 1
        manager = tf.compat.v2.train.CheckpointManager(
            ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)
        val_manager = tf.compat.v2.train.CheckpointManager(
            val_ckpt, val_manager_dir, max_to_keep=checkpoint_max_to_keep)

        model_checkpoint_callback = tfc.ModelCheckpoint(val_manager)
        early_stopping_callback = tfc.EarlyStopping(min_delta=0.0001, patience=5, mode='min')
        train_logger_callback = tfc.TrainLogger(model_dir, 'logs.txt')
        cancellation_point = tfc.CancellationPoint()
        

        # We use the following instead of manager.latest_checkpoint because
        # manager_dir does not point to the model directory when we are running
        # in a worker.
        latest_checkpoint = tf.train.latest_checkpoint(model_dir)
        ckpt.restore(latest_checkpoint)
        val_ckpt.restore(latest_checkpoint)

        def train_step_fn(features, labels):
          """Single train step."""
          loss = eager_train_step(
              detection_model,
              features,
              labels,
              unpad_groundtruth_tensors,
              optimizer,
              learning_rate=learning_rate_fn(),
              add_regularization_loss=add_regularization_loss,
              clip_gradients_value=clip_gradients_value,
              global_step=global_step,
              num_replicas=strategy.num_replicas_in_sync)
          global_step.assign_add(1)
          return loss

        def _sample_and_train(strategy, train_step_fn, data_iterator):
          features, labels = data_iterator.next()
          if hasattr(tf.distribute.Strategy, 'run'):
            per_replica_losses = strategy.run(
                train_step_fn, args=(features, labels))
          else:
            per_replica_losses = strategy.experimental_run_v2(
                train_step_fn, args=(features, labels))
          # TODO(anjalisridhar): explore if it is safe to remove the
          ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
          return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_losses, axis=None)

        @tf.function
        def _dist_train_step(data_iterator):
          """A distributed train step."""

          if num_steps_per_iteration > 1:
            for _ in tf.range(num_steps_per_iteration - 1):
              # Following suggestion on yaqs/5402607292645376
              with tf.name_scope(''):
                _sample_and_train(strategy, train_step_fn, data_iterator)

          return _sample_and_train(strategy, train_step_fn, data_iterator)

        train_input_iter = iter(train_input)

        if int(global_step.value()) == 0:
          manager.save()

        checkpointed_step = int(global_step.value())
        logged_step = global_step.value()

        # num_epochs = (train_steps - global_step.value()) // num_steps_per_iteration

        last_step_time = time.time()
        for epoch, _ in enumerate(range(global_step.value(), train_steps,
                       num_steps_per_iteration)):

          loss = _dist_train_step(train_input_iter)

          time_taken = time.time() - last_step_time
          last_step_time = time.time()
          steps_per_sec = num_steps_per_iteration * 1.0 / time_taken

          tf.compat.v2.summary.scalar(
              'steps_per_sec', steps_per_sec, step=global_step)

          steps_per_sec_list.append(steps_per_sec)

          if global_step.value() - logged_step >= 100:
            tf.logging.info(
                'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                    global_step.value(), time_taken / num_steps_per_iteration,
                    loss))

            manager.save()
            checkpointed_step = int(global_step.value())

            log_metrics = eval_continuously(pipeline_config_path, model_dir=model_dir, checkpoint_dir=model_dir, timeout=20)
            log_metrics['train_total_loss'] = loss

            model_checkpoint_callback.step(epoch, log_metrics['Loss/total_loss'])
            stop_training = early_stopping_callback.step(epoch, log_metrics['Loss/total_loss'])
            train_logger_callback.log(log_metrics)

            if stop_training or cancellation_point.check():
                break
            
            print(log_metrics)
            logged_step = global_step.value()

    

  # Remove the checkpoint directories of the non-chief workers that
  # MultiWorkerMirroredStrategy forces us to save during sync distributed
  # training.
  clean_temporary_directories(strategy, manager_dir)
  clean_temporary_directories(strategy, summary_writer_filepath)
  # TODO(pkanwar): add accuracy metrics.
  if performance_summary_exporter is not None:
    metrics = {
        'steps_per_sec': np.mean(steps_per_sec_list),
        'steps_per_sec_p50': np.median(steps_per_sec_list),
        'steps_per_sec_max': max(steps_per_sec_list),
        'last_batch_loss': float(loss)
    }
    mixed_precision = 'bf16' if kwargs['use_bfloat16'] else 'fp32'
    performance_summary_exporter(metrics, mixed_precision)
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_dir,
                           output_directory,
                           use_side_inputs=False,
                           side_input_shapes='',
                           side_input_types='',
                           side_input_names=''):
  """Exports inference graph for the model specified in the pipeline config.

  Three artifacts are written under `output_directory`: a copy of the pipeline
  config (via `config_util.save_pipeline_config`), a `checkpoint` subdirectory
  holding the re-saved checkpoint, and a `saved_model` subdirectory holding
  the exported SavedModel.

  Args:
    input_type: Type of input for the graph. Must be a key of
      `DETECTION_MODULE_MAP`.
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_dir: Directory holding the trained checkpoint(s); the
      latest checkpoint reported by a `CheckpointManager` on that directory
      is the one restored.
    output_directory: Path to write outputs.
    use_side_inputs: boolean that determines whether side inputs should be
      included in the input signature.
    side_input_shapes: forward-slash-separated list of comma-separated lists
        describing input shapes.
    side_input_types: comma-separated list of the types of the inputs.
    side_input_names: comma-separated list of the names of the inputs.
  Raises:
    ValueError: if `input_type` is not recognized, or if side inputs are
      requested for an input type other than 'image_tensor'.
  """
  # Validate the cheap arguments first so that a bad call fails before the
  # model is built and a checkpoint restore is queued.
  if input_type not in DETECTION_MODULE_MAP:
    raise ValueError('Unrecognized `input_type`')
  if use_side_inputs and input_type != 'image_tensor':
    raise ValueError('Side inputs supported for image_tensor input type only.')

  output_checkpoint_directory = os.path.join(output_directory, 'checkpoint')
  output_saved_model_directory = os.path.join(output_directory, 'saved_model')

  detection_model = INPUT_BUILDER_UTIL_MAP['model_build'](
      pipeline_config.model, is_training=False)

  # Only the model is tracked here, so optimizer/step entries stored in the
  # training checkpoint are deliberately left unmatched (`expect_partial`).
  ckpt = tf.train.Checkpoint(
      model=detection_model)
  manager = tf.train.CheckpointManager(
      ckpt, trained_checkpoint_dir, max_to_keep=1)
  status = ckpt.restore(manager.latest_checkpoint).expect_partial()

  zipped_side_inputs = []
  if use_side_inputs:
    zipped_side_inputs = _combine_side_inputs(side_input_shapes,
                                              side_input_types,
                                              side_input_names)

  detection_module = DETECTION_MODULE_MAP[input_type](detection_model,
                                                      use_side_inputs,
                                                      list(zipped_side_inputs))
  # Getting the concrete function traces the graph and forces variables to
  # be constructed --- only after this can we save the checkpoint and
  # saved model.
  concrete_function = detection_module.__call__.get_concrete_function()
  status.assert_existing_objects_matched()

  exported_checkpoint_manager = tf.train.CheckpointManager(
      ckpt, output_checkpoint_directory, max_to_keep=1)
  exported_checkpoint_manager.save(checkpoint_number=0)

  tf.saved_model.save(detection_module,
                      output_saved_model_directory,
                      signatures=concrete_function)

  config_util.save_pipeline_config(pipeline_config, output_directory)
Exemple #20
0
def create_estimator_and_inputs(run_config,
                                hparams,
                                pipeline_config_path,
                                train_steps=None,
                                sample_1_of_n_eval_examples=1,
                                sample_1_of_n_eval_on_train_examples=1,
                                model_fn_creator=create_model_fn,
                                use_tpu_estimator=False,
                                use_tpu=False,
                                num_shards=1,
                                params=None,
                                override_eval_num_epochs=True,
                                **kwargs):
  """Builds the `Estimator` together with its train/eval/predict inputs.

  The pipeline config is read from `pipeline_config_path`, merged with
  `hparams` and the keyword overrides, and then used to construct the model
  function, the input functions and the estimator itself. When
  `run_config.is_chief` is true, the merged config is also written to the
  estimator's model directory.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    train_steps: Number of training steps. If None, the value is taken from
      the `TrainConfig` proto, provided that value is non-zero.
    sample_1_of_n_eval_examples: Integer representing how often an eval
      example should be sampled. If 1, will sample all examples.
    sample_1_of_n_eval_on_train_examples: Same as
      `sample_1_of_n_eval_examples`, but applied to the training data that is
      used for evaluation.
    model_fn_creator: A function that creates a `model_fn` for `Estimator`.
      It is called here as
      `model_fn_creator(detection_model_fn, configs, hparams, use_tpu)`, where
      `detection_model_fn` returns a `DetectionModel` instance and `configs`
      is the dictionary of pipeline config objects.
    use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False,
      an `Estimator` will be returned.
    use_tpu: Boolean, whether training and evaluation should run on TPU. It
      is forwarded to `model_fn_creator` and to the `TPUEstimator`.
    num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator`
      is True.
    params: Parameter dictionary passed from the estimator. Only used if
      `use_tpu_estimator` is True.
    override_eval_num_epochs: Whether to overwrite the number of epochs to
      1 for eval_input.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    A dictionary with the following fields:
    'estimator': An `Estimator` or `TPUEstimator`.
    'train_input_fn': A training input function.
    'eval_input_fns': A list of all evaluation input functions.
    'eval_input_names': A list of names for each evaluation input.
    'eval_on_train_input_fn': An evaluation-on-train input function.
    'predict_input_fn': A prediction input function.
    'train_steps': Number of training steps. Either directly from input or from
      configuration.
  """
  # Resolve every builder hook before doing any work.
  load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
  merge_overrides = MODEL_BUILD_UTIL_MAP['merge_external_params_with_configs']
  configs_to_proto = MODEL_BUILD_UTIL_MAP['create_pipeline_proto_from_configs']
  make_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn']
  make_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
  make_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']

  configs = load_configs(pipeline_config_path)

  # Fold the explicit arguments into the override dictionary.
  kwargs['train_steps'] = train_steps
  kwargs['sample_1_of_n_eval_examples'] = sample_1_of_n_eval_examples
  if override_eval_num_epochs:
    kwargs['eval_num_epochs'] = 1
    tf.logging.warning(
        'Forced number of epochs for all eval validations to be 1.')
  configs = merge_overrides(configs, hparams, kwargs_dict=kwargs)

  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_configs = configs['eval_input_configs']

  # Evaluation on the training data works on a private copy of the train
  # input config so that the real training input stays untouched.
  eval_on_train_input_config = copy.deepcopy(train_input_config)
  eval_on_train_input_config.sample_1_of_n_examples = (
      sample_1_of_n_eval_on_train_examples)
  if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
    tf.logging.warning(
        ('Expected number of evaluation epochs is 1, but instead '
         'encountered `eval_on_train_input_config.num_epochs` = {}. '
         'Overwriting `num_epochs` to 1.').format(
             eval_on_train_input_config.num_epochs))
    eval_on_train_input_config.num_epochs = 1

  # Fall back to the configured step count, but only if it is non-zero.
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Input functions for TRAIN / EVAL / PREDICT.
  train_input_fn = make_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)

  eval_input_fns = []
  for single_eval_input_config in eval_input_configs:
    eval_input_fns.append(
        make_eval_input_fn(
            eval_config=eval_config,
            eval_input_config=single_eval_input_config,
            model_config=model_config))
  eval_input_names = [
      single_eval_input_config.name
      for single_eval_input_config in eval_input_configs
  ]

  eval_on_train_input_fn = make_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_on_train_input_config,
      model_config=model_config)
  # Prediction reads from the first evaluation input.
  predict_input_fn = make_predict_input_fn(
      model_config=model_config, predict_input_config=eval_input_configs[0])

  export_to_tpu = hparams.get('export_to_tpu', False)
  tf.logging.info('create_estimator_and_inputs: use_tpu %s, export_to_tpu %s',
                  use_tpu, export_to_tpu)
  model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu)

  if not use_tpu_estimator:
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
  else:
    # For each core, only batch size 1 is supported for eval.
    if use_tpu:
      eval_batch_size = num_shards * 1
    else:
      eval_batch_size = 1
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        train_batch_size=train_config.batch_size,
        eval_batch_size=eval_batch_size,
        use_tpu=use_tpu,
        config=run_config,
        # TODO(lzc): Remove conditional after CMLE moves to TF 1.9
        params=params or {})

  # Write the as-run pipeline config to disk (chief only).
  if run_config.is_chief:
    config_util.save_pipeline_config(
        configs_to_proto(configs), estimator.model_dir)

  return {
      'estimator': estimator,
      'train_input_fn': train_input_fn,
      'eval_input_fns': eval_input_fns,
      'eval_input_names': eval_input_names,
      'eval_on_train_input_fn': eval_on_train_input_fn,
      'predict_input_fn': predict_input_fn,
      'train_steps': train_steps,
  }
def eval_continuously(
    pipeline_config_path,
    config_override=None,
    train_steps=None,
    sample_1_of_n_eval_examples=1,
    sample_1_of_n_eval_on_train_examples=1,
    use_tpu=False,
    override_eval_num_epochs=True,
    postprocess_on_cpu=False,
    model_dir=None,
    checkpoint_dir=None,
    wait_interval=180,
    timeout=3600,
    eval_index=0,
    save_final_config=False,
    **kwargs):
  """Eagerly evaluates a detection model on the next available checkpoint.

  The model, optimizer and evaluation input are built once. The function then
  waits on `tf.train.checkpoints_iterator` for a checkpoint in
  `checkpoint_dir`, restores it, runs `eager_eval_loop` and returns its
  result. Despite the name, the function returns after the first checkpoint
  it is handed rather than looping over subsequent ones.

  Args:
    pipeline_config_path: A path to a pipeline config file.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. Only forwarded as a config override
      when it is not None.
    sample_1_of_n_eval_examples: Integer representing how often an eval example
      should be sampled. If 1, will sample all examples.
    sample_1_of_n_eval_on_train_examples: Same as
      `sample_1_of_n_eval_examples`, but applied to the copy of the training
      input config prepared for evaluation.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for
      eval_input.
    postprocess_on_cpu: Forwarded to `eager_eval_loop`; when use_tpu and
      postprocess_on_cpu are true, postprocess is scheduled on the host cpu.
    model_dir: Directory to output resulting evaluation summaries to. The
      summaries go to `<model_dir>/eval/<eval input name>`.
    checkpoint_dir: Directory that contains the training checkpoints.
    wait_interval: The minimum number of seconds to wait before checking for a
      new checkpoint (passed as `min_interval_secs`).
    timeout: The maximum number of seconds to wait for a checkpoint (passed
      as `timeout` to the checkpoint iterator).
    eval_index: int, index of the evaluation dataset to use. By default,
      evaluates dataset at 0'th index.
    save_final_config: Whether to save the pipeline config file to the model
      directory.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    The value returned by `eager_eval_loop` for the first checkpoint yielded
    by the checkpoint iterator. If the iterator finishes without yielding a
    checkpoint, the function falls through and returns None.
  """
  load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
  configs_to_proto = MODEL_BUILD_UTIL_MAP['create_pipeline_proto_from_configs']
  merge_overrides = MODEL_BUILD_UTIL_MAP['merge_external_params_with_configs']

  configs = load_configs(
      pipeline_config_path, config_override=config_override)

  # Fold the explicit arguments into the override dictionary.
  kwargs['sample_1_of_n_eval_examples'] = sample_1_of_n_eval_examples
  kwargs['use_bfloat16'] = configs['train_config'].use_bfloat16 and use_tpu
  if train_steps is not None:
    kwargs['train_steps'] = train_steps
  if override_eval_num_epochs:
    kwargs['eval_num_epochs'] = 1
    tf.logging.warning(
        'Forced number of epochs for all eval validations to be 1.')
  configs = merge_overrides(configs, None, kwargs_dict=kwargs)

  if model_dir and save_final_config:
    tf.logging.info('Saving pipeline config file to directory {}'.format(
        model_dir))
    config_util.save_pipeline_config(configs_to_proto(configs), model_dir)

  model_config = configs['model']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_configs = configs['eval_input_configs']

  # Private copy of the train input config, adjusted for evaluation.
  eval_on_train_input_config = copy.deepcopy(train_input_config)
  eval_on_train_input_config.sample_1_of_n_examples = (
      sample_1_of_n_eval_on_train_examples)
  if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
    tf.logging.warning(
        ('Expected number of evaluation epochs is 1, but instead '
         'encountered `eval_on_train_input_config.num_epochs` = {}. '
         'Overwriting `num_epochs` to 1.').format(
             eval_on_train_input_config.num_epochs))
    eval_on_train_input_config.num_epochs = 1

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

  eval_input_config = eval_input_configs[eval_index]
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    build_detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']
    detection_model = build_detection_model(
        model_config=model_config, is_training=True)

  eval_dataset = inputs.eval_input(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config,
      model=detection_model)
  eval_input = strategy.experimental_distribute_dataset(eval_dataset)

  global_step = tf.compat.v2.Variable(
      0, trainable=False, dtype=tf.compat.v2.dtypes.int64)

  optimizer, _ = optimizer_builder.build(
      configs['train_config'].optimizer, global_step=global_step)

  checkpoint_paths = tf.train.checkpoints_iterator(
      checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval)
  for checkpoint_path in checkpoint_paths:
    ckpt = tf.compat.v2.train.Checkpoint(
        step=global_step, model=detection_model, optimizer=optimizer)

    use_moving_averages = eval_config.use_moving_averages
    if use_moving_averages:
      # Run the model on dummy inputs first so that all of its variables
      # exist before the optimizer creates shadow copies of them for EMA.
      unpad_groundtruth_tensors = (
          eval_config.batch_size == 1 and not use_tpu)
      _ensure_model_is_built(detection_model, eval_input,
                             unpad_groundtruth_tensors)
      optimizer.shadow_copy(detection_model)

    ckpt.restore(checkpoint_path).expect_partial()

    if use_moving_averages:
      optimizer.swap_weights()

    eval_summary_dir = os.path.join(model_dir, 'eval', eval_input_config.name)
    summary_writer = tf.compat.v2.summary.create_file_writer(eval_summary_dir)
    with summary_writer.as_default():
      eval_metrics = eager_eval_loop(
          detection_model,
          configs,
          eval_input,
          use_tpu=use_tpu,
          postprocess_on_cpu=postprocess_on_cpu,
          global_step=global_step)
    # Hand back the metrics of this checkpoint; later checkpoints are not
    # visited.
    return eval_metrics
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               checkpoint_max_to_keep=7,
               **kwargs):
    """Trains a detection model eagerly, with the steps run in tf.functions.

    The function proceeds as follows:
      1. Reads the pipeline config and merges `hparams` / keyword overrides.
      2. Optionally writes the merged config to `model_dir`.
      3. Builds the model, the distributed training input and the optimizer
         under the current distribution strategy.
      4. Loads the fine-tune checkpoint when `hparams.load_pretrained` is
         truthy and the config names one.
      5. Restores the latest checkpoint found in `model_dir`, if any.
      6. Runs distributed training steps until `train_steps` is reached,
         writing a `steps_per_sec` summary and logging the loss once at
         least 100 steps have passed since the previous log line.
      7. Saves a checkpoint whenever at least `checkpoint_every_n` steps have
         passed since the last save. No checkpoint is written
         unconditionally when the loop finishes.
      8. Cleans up the temporary directories used by non-chief workers.

    Args:
      hparams: A `HParams`. `load_pretrained` is read from it, as is
        `export_to_tpu` when that argument is None.
      pipeline_config_path: A path to a pipeline config file.
      model_dir: The directory to save checkpoints and summaries to.
      config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
        override the config from `pipeline_config_path`.
      train_steps: Number of training steps. If None, the value is taken from
        the `TrainConfig` proto, provided that value is non-zero.
      use_tpu: Boolean, whether training and evaluation should run on TPU.
      save_final_config: Whether to save final config (obtained after applying
        overrides) to `model_dir`.
      export_to_tpu: If None, the value is looked up in `hparams`. Within
        this function the value is only logged.
      checkpoint_every_n: Checkpoint every n training steps.
      checkpoint_max_to_keep: int, the number of most recent checkpoints to
        keep in the model directory. Forced to 1 when
        `strategy.extended.should_checkpoint` is false.
      **kwargs: Additional keyword arguments for configuration override.
    """
    ## Parse the configs
    load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
    merge_overrides = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    configs_to_proto = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = load_configs(pipeline_config_path,
                           config_override=config_override)
    kwargs['train_steps'] = train_steps
    kwargs['use_bfloat16'] = configs['train_config'].use_bfloat16 and use_tpu
    configs = merge_overrides(configs, hparams, kwargs_dict=kwargs)

    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    # Gradient clipping is active only for a positive configured norm.
    configured_clip_norm = train_config.gradient_clipping_by_norm
    clip_gradients_value = (
        configured_clip_norm if configured_clip_norm > 0 else None)

    # Fall back to the configured step count, but only if it is non-zero.
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(
            'mixed_bfloat16')

    # Parse the checkpoint fine tuning configs
    fine_tune_checkpoint_path = (
        train_config.fine_tune_checkpoint if hparams.load_pretrained else None)
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    ## utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # `from_detection_checkpoint` is deprecated; for backward
        # compatibility it is translated into `fine_tune_checkpoint_type`.
        train_config.fine_tune_checkpoint_type = (
            'detection'
            if train_config.from_detection_checkpoint else 'classification')
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type
    fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version

    # Write the as-run pipeline config to disk.
    if save_final_config:
        config_util.save_pipeline_config(configs_to_proto(configs), model_dir)

    # Build the model, optimizer, and training input
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        def train_dataset_fn(input_context):
            """Callable to create the repeated train input."""
            dataset = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=detection_model,
                input_context=input_context)
            return dataset.repeat()

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.compat.v2.dtypes.int64,
            name='global_step',
            aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA)
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Always expose the learning rate as a zero-argument callable.
        learning_rate_fn = (
            learning_rate if callable(learning_rate) else
            (lambda: learning_rate))

    ## Train the model
    # Get the appropriate filepath (temporary or not) based on whether the worker
    # is the chief.
    summary_writer_filepath = _get_filepath(strategy,
                                            os.path.join(model_dir, 'train'))
    summary_writer = tf.compat.v2.summary.create_file_writer(
        summary_writer_filepath)

    # TODO(b/135933080) Explore setting to 100 when GPU performance issues
    # are fixed.
    num_steps_per_iteration = 100 if use_tpu else 1

    with summary_writer.as_default():
        with strategy.scope():
            with tf.compat.v2.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # Load a fine-tuning checkpoint.
                if fine_tune_checkpoint_path:
                    load_fine_tune_checkpoint(
                        detection_model, fine_tune_checkpoint_path,
                        fine_tune_checkpoint_type,
                        fine_tune_checkpoint_version,
                        load_all_detection_checkpoint_vars, train_input,
                        unpad_groundtruth_tensors)

                ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                     model=detection_model,
                                                     optimizer=optimizer)

                manager_dir = _get_filepath(strategy, model_dir)
                if not strategy.extended.should_checkpoint:
                    checkpoint_max_to_keep = 1
                manager = tf.compat.v2.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep)

                # We use the following instead of manager.latest_checkpoint because
                # manager_dir does not point to the model directory when we are running
                # in a worker.
                ckpt.restore(tf.train.latest_checkpoint(model_dir))

                def train_step_fn(features, labels):
                    """Single train step."""
                    step_loss = eager_train_step(
                        detection_model,
                        features,
                        labels,
                        unpad_groundtruth_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradients_value=clip_gradients_value,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync)
                    global_step.assign_add(1)
                    return step_loss

                def _sample_and_train(dist_strategy, step_fn, iterator):
                    """Pulls one batch and runs `step_fn` on every replica."""
                    features, labels = iterator.next()
                    per_replica_losses = dist_strategy.run(
                        step_fn, args=(features, labels))
                    # TODO(anjalisridhar): explore if it is safe to remove the
                    ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                    return dist_strategy.reduce(tf.distribute.ReduceOp.SUM,
                                                per_replica_losses,
                                                axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""

                    # All but the last step of an iteration discard their
                    # loss; only the final one is returned.
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            _sample_and_train(strategy, train_step_fn,
                                              data_iterator)

                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)
                checkpointed_step = int(global_step.value())
                logged_step = global_step.value()

                last_step_time = time.time()
                for _ in range(global_step.value(), train_steps,
                               num_steps_per_iteration):

                    loss = _dist_train_step(train_input_iter)

                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()

                    steps_per_sec = num_steps_per_iteration * 1.0 / time_taken
                    tf.compat.v2.summary.scalar('steps_per_sec',
                                                steps_per_sec,
                                                step=global_step)

                    if global_step.value() - logged_step >= 100:
                        per_step_time = time_taken / num_steps_per_iteration
                        tf.logging.info(
                            'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                                global_step.value(), per_step_time, loss))
                        logged_step = global_step.value()

                    current_step = int(global_step.value())
                    if current_step - checkpointed_step >= checkpoint_every_n:
                        manager.save()
                        checkpointed_step = int(global_step.value())

    # Remove the checkpoint directories of the non-chief workers that
    # MultiWorkerMirroredStrategy forces us to save during sync distributed
    # training.
    _clean_temporary_directories(strategy, manager_dir)
    _clean_temporary_directories(strategy, summary_writer_filepath)
def edit_config(model_selected,
                config_output_dir,
                num_steps,
                label_map_path,
                record_dir,
                eval_number,
                annotation_type,
                batch_size=None,
                learning_rate=None,
                resizer_size=None):
    '''
        Edit the essential values of the base pipeline configuration (protobuf) shipped with an
        object-detection/segmentation checkpoint and save the result.

        The configuration is read from `<model_selected>/pipeline.config`, updated in memory and
        written to `config_output_dir`. The fine-tune checkpoint is set to the highest-numbered
        `model.ckpt-<step>` found in `model_selected` (detected through its `.index` file), or to
        `<model_selected>/model.ckpt` when no numbered checkpoint is present.

        For advanced users: the full set of available fields is described by the proto definitions
        inside the proto directory of the object-detection API.

        Args:
            Required:
                model_selected: Directory holding the checkpoint to resume from and its pipeline.config.
                config_output_dir: The directory where the edited protobuf configuration file is saved.
                num_steps: The number of steps you want to train on.
                label_map_path: The path to your label_map.pbtxt file.
                record_dir: The directory containing "train.record" and "eval.record".
                eval_number: The number of images you want to evaluate on.
                annotation_type: "polygon" additionally enables PNG masks through edit_masks();
                                 it is also forwarded to edit_eval_config().

            Optional:
                batch_size: The batch size you want to use. If not provided the existing one is kept.
                learning_rate: The learning rate you want to use for the training. If not provided the existing one is kept.
                                See config_util._update_initial_learning_rate() inside the object_detection folder for
                                in-depth details on what happens when updating it.
                resizer_size: The shape used to update your image resizer, see set_image_resizer().
                              If not provided the existing one is kept.

    '''

    # Only files named exactly "model.ckpt-<digits>.index" identify a checkpoint
    # that can be addressed as "model.ckpt-<digits>" below. Matching on the full
    # pattern avoids crashing on (or picking a step from) unrelated files that
    # merely contain "index" and "-" in their name.
    ckpt_prefix = "model.ckpt-"
    index_suffix = ".index"
    ckpt_ids = []
    for file_name in os.listdir(model_selected):
        if not (file_name.startswith(ckpt_prefix)
                and file_name.endswith(index_suffix)):
            continue
        step = file_name[len(ckpt_prefix):-len(index_suffix)]
        if step.isdigit():
            ckpt_ids.append(int(step))

    if ckpt_ids:
        ckpt_path = os.path.join(model_selected,
                                 "model.ckpt-{}".format(max(ckpt_ids)))
    else:
        # No numbered checkpoint: fall back to the un-numbered prefix.
        ckpt_path = os.path.join(model_selected, "model.ckpt")

    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(model_selected, 'pipeline.config'))
    label_map = label_map_util.load_labelmap(label_map_path)

    config_util._update_train_steps(configs, num_steps)
    update_different_paths(
        configs,
        ckpt_path=ckpt_path,
        label_map_path=label_map_path,
        train_record_path=os.path.join(record_dir, "train.record"),
        eval_record_path=os.path.join(record_dir, "eval.record"))

    # Optional overrides: None means "keep whatever the base config says".
    if learning_rate is not None:
        config_util._update_initial_learning_rate(configs, learning_rate)

    if batch_size is not None:
        config_util._update_batch_size(configs, batch_size)

    if annotation_type == "polygon":
        edit_masks(configs, mask_type="PNG_MASKS")

    if resizer_size is not None:
        set_image_resizer(configs, resizer_size)

    edit_eval_config(configs, annotation_type, eval_number)
    update_num_classes(configs, label_map)
    config_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(config_proto, directory=config_output_dir)
Exemple #24
0
    args = parser.parse_args()

    ds_info = load_data_set_path_dict()[args.dataset]
    if ds_info['data_set_type'] != 'object_detection':
        assert ('Dataset TypeError: Select a dataset for object detection')
    return args


if __name__ == '__main__':
    # Script entry point: write the experiment's pipeline config into the
    # experiment directory, then launch training from that written file.
    args = parse_args()
    exp_dir = args.exp_dir
    banner = '-' * 50

    # train_odm() is pointed at <exp_dir>/pipeline.config; presumably this is
    # the file save_pipeline_config() produces below -- verify against it.
    written_config_path = os.path.join(exp_dir, 'pipeline.config')
    dataset_root = os.path.join(os.environ['DCNN_DATASETS_PATH'],
                                args.dataset)

    save_pipeline_config(
        config_odm_run(pipline_config_path=args.pipeline_config_path,
                       dataset_path=dataset_root,
                       fine_tune_dir=args.fine_tune_dir),
        exp_dir)

    print(banner)
    print('Beginning Training, logging to {}'.format(exp_dir))
    train_odm(model_dir=exp_dir,
              pipeline_config_path=written_config_path,
              num_train_steps=args.num_train_steps,
              num_eval_steps=args.num_eval_steps,
              hparams=None)
    print(banner)
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
    """Trains a model using eager + functions.

    This method:
      1. Processes the pipeline configs
      2. (Optionally) saves the as-run config
      3. Builds the model & optimizer
      4. Gets the training input data
      5. Loads a fine-tuning detection or classification checkpoint if
         requested (only when `hparams.load_pretrained` is truthy)
      6. Loops over the train data, executing distributed training steps
         inside tf.functions.
      7. Checkpoints the model every `checkpoint_every_n` training steps.
      8. Logs `steps_per_sec` as a TensorBoard summary (skipped on TPU) and
         prints the loss after every step.

    Args:
      hparams: A `HParams`. `load_pretrained` is read from it, and
        `export_to_tpu` is looked up in it when the argument is None. It is
        also forwarded to `merge_external_params_with_configs`.
      pipeline_config_path: A path to a pipeline config file.
      model_dir: The directory to save checkpoints and summaries to.
        Summaries go to `model_dir + '/train'`.
      config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
        override the config from `pipeline_config_path`.
      train_steps: Number of training steps. If None, the number of training
        steps is taken from the `TrainConfig` proto when that value is
        non-zero.
      use_tpu: Boolean, whether training and evaluation should run on TPU.
      save_final_config: Whether to save final config (obtained after
        applying overrides) to `model_dir`.
      export_to_tpu: When use_tpu and export_to_tpu are true,
        `export_savedmodel()` exports a metagraph for serving on TPU besides
        the one on CPU. If export_to_tpu is not provided, we will look for it
        in hparams too. In this function the value is only logged.
      checkpoint_every_n: Checkpoint every n training steps.
      **kwargs: Additional keyword arguments for configuration override.
        `train_steps` and `use_bfloat16` entries are overwritten here.

    Returns:
      None.
    """
    # Parse the configs. The helper functions are looked up through
    # MODEL_BUILD_UTIL_MAP rather than called directly.
    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
        'get_configs_from_pipeline_file']
    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = get_configs_from_pipeline_file(pipeline_config_path,
                                             config_override=config_override)
    # bfloat16 is only enabled when the config asks for it AND we run on TPU.
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = merge_external_params_with_configs(configs,
                                                 hparams,
                                                 kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    # A non-positive gradient_clipping_by_norm leaves clipping disabled (None).
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(
            'mixed_bfloat16')

    # Parse the checkpoint fine tuning configs. The configured fine-tune
    # checkpoint is ignored unless hparams.load_pretrained is truthy.
    if hparams.load_pretrained:
        fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
    else:
        fine_tune_checkpoint_path = None
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    # utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, set train_config.fine_tune_checkpoint_type
        # based on train_config.from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
        else:
            train_config.fine_tune_checkpoint_type = 'classification'
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = create_pipeline_proto_from_configs(configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # Build the model, optimizer, and training input
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        # Create the inputs.
        train_input = inputs.train_input(train_config=train_config,
                                         train_input_config=train_input_config,
                                         model_config=model_config,
                                         model=detection_model)

        # Repeat the dataset before distributing it across the strategy.
        train_input = strategy.experimental_distribute_dataset(
            train_input.repeat())

        global_step = tf.compat.v2.Variable(0,
                                            trainable=False,
                                            dtype=tf.compat.v2.dtypes.int64,
                                            name='global_step')
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Normalise the learning rate to a zero-argument callable so the
        # train step can always call learning_rate_fn().
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # Train the model
    summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                             '/train')
    with summary_writer.as_default():
        with strategy.scope():
            # Load a fine-tuning checkpoint.
            if fine_tune_checkpoint_path:
                load_fine_tune_checkpoint(detection_model,
                                          fine_tune_checkpoint_path,
                                          fine_tune_checkpoint_type,
                                          load_all_detection_checkpoint_vars,
                                          train_input,
                                          unpad_groundtruth_tensors)

            # The checkpoint tracks the step counter and the model.
            ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                 model=detection_model)
            manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                           model_dir,
                                                           max_to_keep=7)

            # Maybe re-enable checkpoint restoration depending on how it works:
            # ckpt.restore(manager.latest_checkpoint)

            def train_step_fn(features, labels):
                """Runs one eager train step on a single batch."""
                return eager_train_step(
                    detection_model,
                    features,
                    labels,
                    unpad_groundtruth_tensors,
                    optimizer,
                    learning_rate=learning_rate_fn(),
                    add_regularization_loss=add_regularization_loss,
                    clip_gradients_value=clip_gradients_value,
                    use_tpu=use_tpu,
                    global_step=global_step,
                    num_replicas=strategy.num_replicas_in_sync)

            @tf.function
            def _dist_train_step(data_iterator):
                """A distributed train step."""
                features, labels = data_iterator.next()
                per_replica_losses = strategy.experimental_run_v2(
                    train_step_fn, args=(
                        features,
                        labels,
                    ))
                # TODO(anjalisridhar): explore if it is safe to remove the
                # num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                # NOTE: despite its name, mean_loss is the SUM over replicas.
                mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)
                return mean_loss

            train_input_iter = iter(train_input)
            # NOTE(review): train_steps is still None here when it was not
            # passed and train_config.num_steps is 0; range(None) would then
            # raise TypeError.
            for _ in range(train_steps):
                start_time = time.time()

                loss = _dist_train_step(train_input_iter)
                # The step counter is advanced here, in the Python loop.
                global_step.assign_add(1)
                end_time = time.time()
                if not use_tpu:
                    tf.compat.v2.summary.scalar('steps_per_sec',
                                                1.0 / (end_time - start_time),
                                                step=global_step)
                # TODO(kaftan): Remove this print after it is no longer helpful for
                # debugging.
                print('Finished step', global_step, end_time, loss)
                # Save whenever the step count is a multiple of
                # checkpoint_every_n.
                if int(global_step.value().numpy()) % checkpoint_every_n == 0:
                    manager.save()
Exemple #26
0
def train_loop(config_path: str,
               model_dir: str,
               config_override: Optional[
                   pipeline_pb2.TrainEvalPipelineConfig] = None,
               train_steps: Optional[int] = None,
               use_tpu: bool = False,
               save_final_config: bool = False,
               log_every_n: int = 100,
               ckpt_every_n: int = 1000,
               ckpt_max_to_keep: int = 7,
               record_summaries: bool = True,
               **kwargs) -> None:
    """Trains a model using eager + functions.

    This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Restores the latest checkpoint found in `model_dir`, or otherwise loads
       a fine-tuning detection or classification checkpoint if one is configured
    6. Loops over the train data, executing distributed training steps inside tf.functions.
    7. Checkpoints the model at least every `ckpt_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

    Args:
        config_path: A path to a pipeline config file.
        model_dir: The directory to save checkpoints and summaries to.
        config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `config_path`.
        train_steps: Number of training steps. If None, training steps from `TrainConfig` proto will be adopted.
            The value is only forwarded to `merge_external_params_with_configs`; the loop bound itself is read
            from `train_config.num_steps`.
        use_tpu: Boolean, whether training and evaluation should run on TPU.
        save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`.
        log_every_n: Log total loss every n training steps.
        ckpt_every_n: Checkpoint every n training steps.
        ckpt_max_to_keep: int, the number of most recent checkpoints to keep in the model directory.
        record_summaries: Boolean, whether or not to record summaries.
        **kwargs: Additional keyword arguments for configuration override.

    Raises:
        ValueError: If `train_config.load_all_detection_checkpoint_vars` is set.
    """

    # parse config
    configs = config_util.get_configs_from_pipeline_file(
        config_path, config_override=config_override)
    # bfloat16 is only enabled when the config asks for it AND we run on TPU
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu,
    })
    # NOTE(review): presumably this writes `train_steps` into train_config.num_steps -- confirm in config_util
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)

    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_gt_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    # a non-positive gradient_clipping_by_norm leaves clipping disabled (None)
    clip_gradient_norm = None

    if train_config.gradient_clipping_by_norm > 0:
        clip_gradient_norm = train_config.gradient_clipping_by_norm

    if kwargs['use_bfloat16']:
        tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

    if train_config.load_all_detection_checkpoint_vars:
        raise ValueError(
            'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2')

    # base checkpoint to fine-tune from
    config_util.update_fine_tune_checkpoint_type(train_config)
    base_ckpt = train_config.fine_tune_checkpoint
    base_ckpt_type = train_config.fine_tune_checkpoint_type
    base_ckpt_ver = train_config.fine_tune_checkpoint_version

    # write the as-run pipeline config to disk
    if save_final_config:
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # build model, input, optimizer
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        # build model
        model = model_builder.build(model_config=model_config,
                                    is_training=True)

        # build input
        def train_dataset_fn(
                input_context: tf.distribute.InputContext) -> tf.data.Dataset:
            """Callable to create train input."""
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=model,
                input_context=input_context,
            )
            # repeat the dataset so the iterator does not run out during training
            train_input = train_input.repeat()

            return train_input

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        # build optimizer
        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.int64,
            name='global_step',
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        )
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # normalise the learning rate to a zero-argument callable so the train step can always call it
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # prepare for training

    # get appropriate filepath (temporary or not) based on whether the worker is the chief
    summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train'))

    if record_summaries:
        summary_writer = tf.summary.create_file_writer(summary_log_path)
    else:
        summary_writer = tf.summary.create_noop_writer()

    # on TPU, 100 train steps are run per call of the tf.function; otherwise one step per call
    if use_tpu:
        num_steps_per_iteration = 100
    else:
        num_steps_per_iteration = 1

    with summary_writer.as_default():
        with strategy.scope():
            # only record summaries on steps that are a multiple of num_steps_per_iteration
            with tf.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # prepare checkpoint manager
                # (do not use manager.latest_checkpoint as manager_dir is not model_dir while running in worker)
                ckpt = tf.train.Checkpoint(model=model,
                                           step=global_step,
                                           optimizer=optimizer)
                # keep a single checkpoint when this strategy should not checkpoint
                ckpt_max_to_keep = ckpt_max_to_keep if strategy.extended.should_checkpoint else 1
                manager_dir = get_filepath(strategy, model_dir)
                manager = tf.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=ckpt_max_to_keep)
                latest_ckpt = tf.train.latest_checkpoint(model_dir)

                # resuming an existing run takes precedence over the fine-tune checkpoint
                if latest_ckpt:
                    # load latest checkpoint being trained
                    ckpt.restore(latest_ckpt).expect_partial()
                elif base_ckpt:
                    # load a pre-trained checkpoint
                    load_base_ckpt(model, base_ckpt, base_ckpt_type,
                                   base_ckpt_ver, train_input,
                                   unpad_gt_tensors)

                # get trainable variables
                train_vars = get_train_vars(model, train_config)

                # define training step
                def train_step_fn(features: Dict, labels: Dict):
                    """Single train step."""
                    loss = eager_train_step(
                        model,
                        train_vars,
                        features,
                        labels,
                        unpad_gt_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradient_norm=clip_gradient_norm,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync,
                    )
                    # the step counter is advanced inside the step function itself
                    global_step.assign_add(1)

                    return loss

                def _sample_and_train(strategy, train_step_fn, data_iterator):
                    """Fetches one batch, runs the train step on it and sums the per-replica losses."""
                    features, labels = data_iterator.next()
                    per_replica_losses = strategy.run(train_step_fn,
                                                      args=(features, labels))

                    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_losses,
                                           axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""
                    # run all but the last step in a loop; only the last step's loss is returned
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            with tf.name_scope(''):
                                _sample_and_train(strategy, train_step_fn,
                                                  data_iterator)

                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)

                # save initialized version of checkpoint
                if int(global_step.value()) == 0:
                    manager.save()

                ckpt_step = int(global_step.value())
                logged_step = global_step.value()

                # proceed with training
                # (starts from the current global_step, so a restored run continues where it stopped)
                last_step_time = time.time()
                for _ in range(global_step.value(), train_config.num_steps,
                               num_steps_per_iteration):
                    # execute a step (forward pass + backward pass)
                    loss = _dist_train_step(train_input_iter)

                    # log time
                    curr_step = global_step.value()
                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()

                    tf.summary.scalar(
                        'steps_per_sec',
                        num_steps_per_iteration * 1.0 / time_taken,
                        step=global_step,
                    )

                    # log loss
                    if curr_step - logged_step >= log_every_n:
                        step_time = time_taken / num_steps_per_iteration
                        step_msg = 'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                            curr_step, step_time, loss)
                        v1.logging.info(step_msg)
                        logged_step = curr_step

                    # save checkpoint regularly
                    if (curr_step - ckpt_step) >= ckpt_every_n:
                        manager.save()
                        ckpt_step = curr_step

    # remove checkpoint directories of non-chief workers that MultiWorkerMirroredStrategy forces us to save during sync
    # distributed training.
    clean_temporary_directories(strategy, manager_dir)
    clean_temporary_directories(strategy, summary_log_path)
def set_config(config_path: Union[str, Path],
               checkpoint_path: Union[str, Path],
               tf_records_train_path: Union[str, Path],
               label_map: Dict[str, int],
               label_map_filepath: Union[str, Path],
               batch_size: int,
               max_box_predictions: int,
               max_number_of_boxes: int,
               fine_tune_checkpoint_type: str = 'detection',
               augment_path: str = None,
               min_dimension: int = None,
               max_dimension: int = None,
               total_steps: int = None,
               warmup_steps: int = None,
               num_steps: int = None):
    """Update a CenterNet pipeline config and save it next to `config_path`.

    The config is loaded from `config_path`, modified in memory, and written
    with `save_pipeline_config` into the directory containing `config_path`.
    The label map is also written to `label_map_filepath`.

    Args:
        config_path: Pipeline config file to read; its parent directory
            receives the saved config.
        checkpoint_path: Value for `train_config.fine_tune_checkpoint`.
        tf_records_train_path: Training TFRecord path; it replaces every
            existing `input_path` of the train input reader.
        label_map: Mapping of label name to id; the number of distinct ids
            becomes `num_classes` of the configured model.
        label_map_filepath: Where the label map file is written; also set as
            `train_input_config.label_map_path`.
        batch_size: Value for `train_config.batch_size`.
        max_box_predictions: Value for
            `center_net.object_center_params.max_box_predictions`.
        max_number_of_boxes: Value for `train_config.max_number_of_boxes`.
        fine_tune_checkpoint_type: Value for
            `train_config.fine_tune_checkpoint_type`.
        augment_path: If given, the existing data augmentation options are
            replaced by the ones parsed from it (see the note in the body).
        min_dimension: Optional `keep_aspect_ratio_resizer.min_dimension`.
        max_dimension: Optional `keep_aspect_ratio_resizer.max_dimension`.
        total_steps: Optional `total_steps` of the Adam cosine-decay schedule.
        warmup_steps: Optional `warmup_steps` of the Adam cosine-decay
            schedule.
        num_steps: Optional `train_config.num_steps`.
    """

    def clear_repeated_proto(proto):
        # Delete the whole slice in one go. Popping inside `for _ in proto`
        # shrinks the container while it is being iterated, so the loop ends
        # early and roughly half of the entries would be left behind.
        del proto[:]

    logger.info(f"Set configs {config_path}...")

    configs = get_configs_from_pipeline_file(str(config_path))

    train_len = count_tfrecord_examples(str(tf_records_train_path))
    logger.info(f"Train has {train_len} tf_records.")

    # num_classes is set on whichever model field is populated first.
    num_classes = len(set(label_map.values()))
    _, config_model = configs['model'].ListFields()[0]
    config_model.num_classes = num_classes

    center_net = configs['model'].center_net
    center_net.object_center_params.max_box_predictions = max_box_predictions
    resizer = center_net.image_resizer.keep_aspect_ratio_resizer
    if min_dimension is not None:
        resizer.min_dimension = min_dimension
    if max_dimension is not None:
        resizer.max_dimension = max_dimension

    train_config = configs['train_config']
    train_config.fine_tune_checkpoint_type = fine_tune_checkpoint_type
    train_config.fine_tune_checkpoint = str(checkpoint_path)
    train_config.batch_size = batch_size
    train_config.max_number_of_boxes = max_number_of_boxes

    cosine_decay = (train_config.optimizer.adam_optimizer.learning_rate
                    .cosine_decay_learning_rate)
    if total_steps is not None:
        cosine_decay.total_steps = total_steps
    if warmup_steps is not None:
        cosine_decay.warmup_steps = warmup_steps
    if num_steps is not None:
        train_config.num_steps = num_steps

    if augment_path is not None:
        augment_config = train_config.data_augmentation_options
        clear_repeated_proto(augment_config)
        # NOTE(review): text_format.Merge receives str(augment_path) itself as
        # the proto text, not the contents of a file at that path -- confirm
        # what callers actually pass here.
        augment = text_format.Merge(str(augment_path),
                                    pipeline_pb2.TrainEvalPipelineConfig())
        augment_config.extend(augment.train_config.data_augmentation_options)

    label_map_to_file(label_map=label_map, filepath=label_map_filepath)

    train_input_config = configs['train_input_config']
    train_input_config.label_map_path = str(label_map_filepath)
    input_paths = train_input_config.tf_record_input_reader.input_path
    clear_repeated_proto(input_paths)
    input_paths.append(str(tf_records_train_path))

    pipeline_proto = create_pipeline_proto_from_configs(configs)
    # config_path may be a plain str, which has no `.parent`; go through Path.
    save_pipeline_config(pipeline_proto, str(Path(config_path).parent))
    logger.info(f"Config {config_path} changed")
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
    """Runs an eager detection training loop under a MirroredStrategy.

    The pipeline config is loaded and merged with `hparams` and `kwargs`,
    the model, training input and optimizer are built inside a
    `tf.compat.v2.distribute.MirroredStrategy` scope, an optional fine-tuning
    checkpoint is loaded, and then `train_steps` distributed steps are run.
    Every step writes a `steps_per_sec` summary and prints the loss; a
    checkpoint is saved whenever the global step is a multiple of
    `checkpoint_every_n`.

    Args:
      hparams: Hyperparameter object. It is forwarded to
        `merge_external_params_with_configs`, must provide `get()` (used to
        read `export_to_tpu`) and a `load_pretrained` attribute that decides
        whether `train_config.fine_tune_checkpoint` is loaded.
      pipeline_config_path: Passed to `get_configs_from_pipeline_file`.
      model_dir: Directory given to the checkpoint manager; summaries go to
        `model_dir + '/train'` and the final config is written here when
        `save_final_config` is set.
      config_override: Forwarded to `get_configs_from_pipeline_file`.
      train_steps: Number of loop iterations. When None, a non-zero
        `train_config.num_steps` is used instead.
        NOTE(review): if both are unset, `range(train_steps)` receives None
        and raises TypeError.
      use_tpu: Forwarded to the fine-tune loader and to `eager_train_step`;
        also and-ed with `train_config.use_bfloat16` for the `use_bfloat16`
        override.
      save_final_config: When true, the merged config is saved to `model_dir`.
      export_to_tpu: Read from `hparams` when None. Within this function the
        value is only logged.
      checkpoint_every_n: Checkpoint period, in global steps.
      **kwargs: Extra overrides, passed on as `kwargs_dict`.

    Returns:
      None.
    """

    ## Parse the configs
    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
        'get_configs_from_pipeline_file']
    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = get_configs_from_pipeline_file(pipeline_config_path,
                                             config_override=config_override)
    # Explicit arguments take precedence over same-named entries in kwargs.
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = merge_external_params_with_configs(configs,
                                                 hparams,
                                                 kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    # Values below are read from the merged config, i.e. after overrides.
    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    use_bfloat16 = train_config.use_bfloat16
    add_regularization_loss = train_config.add_regularization_loss
    # Gradient clipping stays disabled (None) unless a positive norm is set.
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    # Parse the checkpoint fine tuning configs
    # hparams.load_pretrained gates whether any fine-tune checkpoint is used.
    if hparams.load_pretrained:
        fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
    else:
        fine_tune_checkpoint_path = None
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    ## utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, set train_config.fine_tune_checkpoint_type
        # based on train_config.from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
        else:
            train_config.fine_tune_checkpoint_type = 'classification'
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = create_pipeline_proto_from_configs(configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # TODO(kaftan): Either make strategy a parameter of this method, or
    ## grab it w/  Distribution strategy's get_scope
    # Build the model, optimizer, and training input
    strategy = tf.compat.v2.distribute.MirroredStrategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        # Create the inputs.
        train_input = inputs.train_input(train_config=train_config,
                                         train_input_config=train_input_config,
                                         model_config=model_config,
                                         model=detection_model)

        # Repeat the dataset so the iterator can serve all train_steps steps.
        train_input = strategy.experimental_distribute_dataset(
            train_input.repeat())

        # The step counter always starts at 0 (see the disabled restore below).
        global_step = tf.compat.v2.Variable(0,
                                            trainable=False,
                                            dtype=tf.compat.v2.dtypes.int64)
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        # Normalize the learning rate to a zero-argument callable.
        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    ## Train the model
    summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                             '/train')
    with summary_writer.as_default():
        with strategy.scope():
            # Load a fine-tuning checkpoint.
            if fine_tune_checkpoint_path:
                load_fine_tune_checkpoint(
                    detection_model, fine_tune_checkpoint_path,
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars, train_input,
                    unpad_groundtruth_tensors, use_tpu, use_bfloat16)

            # Only the step counter and the model are tracked by the
            # checkpoint; the optimizer is not passed to it.
            ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                 model=detection_model)
            manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                           model_dir,
                                                           max_to_keep=7)

            ## Maybe re-enable checkpoint restoration depending on how it works:
            # ckpt.restore(manager.latest_checkpoint)

            def train_step_fn(features, labels):
                """Runs one per-replica step via `eager_train_step`."""
                return eager_train_step(
                    detection_model,
                    features,
                    labels,
                    unpad_groundtruth_tensors,
                    optimizer,
                    learning_rate=learning_rate_fn(),
                    use_bfloat16=use_bfloat16,
                    add_regularization_loss=add_regularization_loss,
                    clip_gradients_value=clip_gradients_value,
                    use_tpu=use_tpu,
                    global_step=global_step,
                    num_replicas=strategy.num_replicas_in_sync)

            @tf.function
            def _dist_train_step(data_iterator):
                """A distributed train step."""
                features, labels = data_iterator.next()
                per_replica_losses = strategy.experimental_run_v2(
                    train_step_fn, args=(
                        features,
                        labels,
                    ))
                # TODO(anjalisridhar): explore if it is safe to remove the
                ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                # NOTE(review): despite the name, this is a SUM reduction;
                # eager_train_step receives num_replicas, presumably to scale
                # the per-replica loss beforehand -- confirm.
                mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)
                return mean_loss

            train_input_iter = iter(train_input)
            for _ in range(train_steps):
                start_time = time.time()

                loss = _dist_train_step(train_input_iter)
                # The step counter is advanced here, outside the traced step.
                global_step.assign_add(1)
                end_time = time.time()
                tf.compat.v2.summary.scalar('steps_per_sec',
                                            1.0 / (end_time - start_time),
                                            step=global_step)
                # TODO(kaftan): Remove this print after it is no longer helpful for
                ## debugging.
                tf.print('Finished step', global_step, end_time, loss)
                if int(global_step.value().numpy()) % checkpoint_every_n == 0:
                    manager.save()
Exemple #29
0
 def save_pipeline(pipeline_dict, out_folder):
     """Convert a configs dictionary to a pipeline proto and save it.

     Args:
         pipeline_dict: Configs dictionary handed to
             `create_pipeline_proto_from_configs`.
         out_folder: Destination handed to `save_pipeline_config`.
     """
     save_pipeline_config(
         create_pipeline_proto_from_configs(pipeline_dict), out_folder)
Exemple #30
0
def create_estimator_and_inputs(run_config,
                                hparams,
                                pipeline_config_path,
                                train_steps=None,
                                eval_steps=None,
                                model_fn_creator=create_model_fn,
                                use_tpu_estimator=False,
                                use_tpu=False,
                                num_shards=1,
                                params=None,
                                **kwargs):
  """Assembles an estimator plus the input functions and step counts it uses.

  Args:
    run_config: A `RunConfig`; given to the estimator and consulted for
      `is_chief` before the final config is written.
    hparams: A `HParams`, merged into the pipeline configs and passed to
      `model_fn_creator`.
    pipeline_config_path: Path of the pipeline config file to load.
    train_steps: Number of training steps. When None, the value of
      `train_config.num_steps` is used.
    eval_steps: Number of evaluation steps per evaluation cycle. When None,
      the value of `eval_config.num_examples` is used.
    model_fn_creator: Callable invoked as
      `model_fn_creator(detection_model_fn, configs, hparams, use_tpu)` that
      returns the `model_fn` for the estimator.
    use_tpu_estimator: If true a `TPUEstimator` is built, otherwise a plain
      `Estimator`.
    use_tpu: Passed to `model_fn_creator`; for a `TPUEstimator` it also
      selects the eval batch size and is forwarded as `use_tpu`.
    num_shards: Number of shards (TPU cores); sets the eval batch size of a
      `TPUEstimator` when `use_tpu` is true.
    params: Parameter dictionary for a `TPUEstimator`; an empty dict is used
      when it is falsy.
    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    A dictionary with the keys 'estimator', 'train_input_fn', 'eval_input_fn',
    'eval_on_train_input_fn', 'predict_input_fn', 'train_steps' and
    'eval_steps'.
  """
  # Resolve every builder hook up front, before any config is touched.
  load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
  merge_overrides = MODEL_BUILD_UTIL_MAP['merge_external_params_with_configs']
  build_pipeline_proto = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']
  make_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn']
  make_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
  make_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']

  configs = merge_overrides(
      load_configs(pipeline_config_path),
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      **kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_config = configs['eval_input_config']

  # Fall back to the step counts stored in the merged config.
  if train_steps is None:
    train_steps = train_config.num_steps
  if eval_steps is None:
    eval_steps = eval_config.num_examples

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Input functions for TRAIN / EVAL / EVAL-on-train / PREDICT, in that order.
  train_input_fn = make_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)
  eval_input_fn = make_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config)
  eval_on_train_input_fn = make_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=train_input_config,
      model_config=model_config)
  predict_input_fn = make_predict_input_fn(model_config=model_config)

  model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu)
  if not use_tpu_estimator:
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
  else:
    # For each core, only batch size 1 is supported for eval.
    if use_tpu:
      eval_batch_size = num_shards * 1
    else:
      eval_batch_size = 1
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        train_batch_size=train_config.batch_size,
        eval_batch_size=eval_batch_size,
        use_tpu=use_tpu,
        config=run_config,
        params=params or {})

  # Only the chief writes the as-run pipeline config to disk.
  if run_config.is_chief:
    config_util.save_pipeline_config(
        build_pipeline_proto(configs), estimator.model_dir)

  return {
      'estimator': estimator,
      'train_input_fn': train_input_fn,
      'eval_input_fn': eval_input_fn,
      'eval_on_train_input_fn': eval_on_train_input_fn,
      'predict_input_fn': predict_input_fn,
      'train_steps': train_steps,
      'eval_steps': eval_steps,
  }
Exemple #31
0
# Script fragment: build an SSD detection model from a pipeline config with an
# overridden class count, save the edited config, and restore only selected
# parts of the model from an existing checkpoint.
# NOTE(review): `num_classes`, `os`, `tf`, `config_util` and `model_builder`
# are not defined in this fragment; they must come from earlier in the script.
print('Building model and restoring weights for fine-tuning...', flush=True)
pipeline_config = 'models/research/object_detection/configs/tf2/ssd_mobilenet_v2_fpnlite_320x320_coco17_tpu-8.config'
checkpoint_path = 'models/research/object_detection/test_data/checkpoint/ckpt-0'

output_directory = 'output/'
# Not used within the lines visible here.
output_checkpoint_dir = os.path.join(output_directory, 'checkpoint')
configs = config_util.get_configs_from_pipeline_file(pipeline_config)

# Override the SSD settings in place before the model is built.
model_config = configs['model']
model_config.ssd.num_classes = num_classes
model_config.ssd.freeze_batchnorm = True
detection_model = model_builder.build(
      model_config=model_config, is_training=True)
# Save new pipeline config
pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
config_util.save_pipeline_config(pipeline_proto, output_directory)
# Checkpoint wrapper that exposes only the box predictor's base tower layers
# and box prediction head, so only those are matched on restore.
fake_box_predictor = tf.compat.v2.train.Checkpoint(
    _base_tower_layers_for_heads=detection_model._box_predictor._base_tower_layers_for_heads,
    # _prediction_heads=detection_model._box_predictor._prediction_heads,
    #    (i.e., the classification head that we *will not* restore)
    _box_prediction_head=detection_model._box_predictor._box_prediction_head,
    )
# Wrapper pairing the feature extractor with the reduced box predictor above.
fake_model = tf.compat.v2.train.Checkpoint(
          _feature_extractor=detection_model._feature_extractor,
          _box_predictor=fake_box_predictor)
ckpt = tf.compat.v2.train.Checkpoint(model=fake_model)
# Only a subset of the checkpoint is mapped, hence expect_partial().
ckpt.restore(checkpoint_path).expect_partial()

# To save checkpoint for TFLite conversion.
exported_ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt_manager = tf.train.CheckpointManager(
Exemple #32
0
def create_estimator_and_inputs(run_config,
                                hparams,
                                pipeline_config_path,
                                train_steps=None,
                                eval_steps=None,
                                model_fn_creator=create_model_fn,
                                use_tpu_estimator=False,
                                use_tpu=False,
                                num_shards=1,
                                params=None,
                                **kwargs):
    """Assembles an estimator plus the input functions and step counts it uses.

    Args:
      run_config: A `RunConfig`; given to the estimator and consulted for
        `is_chief` before the final config is written.
      hparams: A `HParams`, merged into the pipeline configs and passed to
        `model_fn_creator`.
      pipeline_config_path: Path of the pipeline config file to load.
      train_steps: Number of training steps. When None, a non-zero
        `train_config.num_steps` replaces it; otherwise it stays None.
      eval_steps: Number of evaluation steps per evaluation cycle. When None,
        a non-zero `eval_config.num_examples` replaces it; otherwise it stays
        None.
      model_fn_creator: Callable invoked as
        `model_fn_creator(detection_model_fn, configs, hparams, use_tpu)` that
        returns the `model_fn` for the estimator.
      use_tpu_estimator: If true a `TPUEstimator` is built, otherwise a plain
        `Estimator`.
      use_tpu: Passed to `model_fn_creator`; also turns off
        `retain_original_images_in_eval` and, for a `TPUEstimator`, selects
        the eval batch size.
      num_shards: Number of shards (TPU cores); sets the eval batch size of a
        `TPUEstimator` when `use_tpu` is true.
      params: Parameter dictionary for a `TPUEstimator`; an empty dict is used
        when it is falsy.
      **kwargs: Additional keyword arguments for configuration override.

    Returns:
      A dictionary with the keys 'estimator', 'train_input_fn',
      'eval_input_fn', 'eval_on_train_input_fn', 'predict_input_fn',
      'train_steps' and 'eval_steps'.
    """
    # Resolve every builder hook up front, before any config is touched.
    load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
    merge_overrides = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    build_pipeline_proto = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']
    make_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn']
    make_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
    make_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']

    configs = merge_overrides(
        load_configs(pipeline_config_path),
        hparams,
        train_steps=train_steps,
        eval_steps=eval_steps,
        retain_original_images_in_eval=not use_tpu,
        **kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']
    eval_config = configs['eval_config']
    eval_input_config = configs['eval_input_config']

    # A zero in the config means "not provided", so None is kept in that case.
    if train_steps is None:
        if train_config.num_steps != 0:
            train_steps = train_config.num_steps
    if eval_steps is None:
        if eval_config.num_examples != 0:
            eval_steps = eval_config.num_examples

    detection_model_fn = functools.partial(model_builder.build,
                                           model_config=model_config)

    # Input functions for TRAIN / EVAL / EVAL-on-train / PREDICT, in that order.
    train_input_fn = make_train_input_fn(
        train_config=train_config,
        train_input_config=train_input_config,
        model_config=model_config)
    eval_input_fn = make_eval_input_fn(
        eval_config=eval_config,
        eval_input_config=eval_input_config,
        model_config=model_config)
    eval_on_train_input_fn = make_eval_input_fn(
        eval_config=eval_config,
        eval_input_config=train_input_config,
        model_config=model_config)
    predict_input_fn = make_predict_input_fn(
        model_config=model_config, predict_input_config=eval_input_config)

    tf.logging.info('create_estimator_and_inputs: use_tpu %s', use_tpu)
    model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu)
    if not use_tpu_estimator:
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config)
    else:
        # For each core, only batch size 1 is supported for eval.
        if use_tpu:
            eval_batch_size = num_shards * 1
        else:
            eval_batch_size = 1
        estimator = tf.contrib.tpu.TPUEstimator(
            model_fn=model_fn,
            train_batch_size=train_config.batch_size,
            eval_batch_size=eval_batch_size,
            use_tpu=use_tpu,
            config=run_config,
            # TODO(lzc): Remove conditional after CMLE moves to TF 1.9
            params=params or {})

    # Only the chief writes the as-run pipeline config to disk.
    if run_config.is_chief:
        config_util.save_pipeline_config(build_pipeline_proto(configs),
                                         estimator.model_dir)

    return {
        'estimator': estimator,
        'train_input_fn': train_input_fn,
        'eval_input_fn': eval_input_fn,
        'eval_on_train_input_fn': eval_on_train_input_fn,
        'predict_input_fn': predict_input_fn,
        'train_steps': train_steps,
        'eval_steps': eval_steps,
    }
Exemple #33
0
def create_estimator_and_inputs(run_config,
                                hparams=None,
                                pipeline_config_path=None,
                                config_override=None,
                                train_steps=None,
                                sample_1_of_n_eval_examples=1,
                                sample_1_of_n_eval_on_train_examples=1,
                                model_fn_creator=create_model_fn,
                                use_tpu_estimator=False,
                                use_tpu=False,
                                num_shards=1,
                                params=None,
                                override_eval_num_epochs=True,
                                save_final_config=False,
                                postprocess_on_cpu=False,
                                export_to_tpu=None,
                                **kwargs):
    """Assembles an estimator plus its train, eval and predict input functions.

    Args:
      run_config: A `RunConfig`; given to the estimator and consulted for
        `is_chief` before the final config is written.
      hparams: (optional) A `HParams`, merged into the pipeline configs and
        passed to `model_fn_creator`.
      pipeline_config_path: Path of the pipeline config file to load.
      config_override: Override forwarded to `get_configs_from_pipeline_file`
        together with `pipeline_config_path`.
      train_steps: Number of training steps. When None, a non-zero
        `train_config.num_steps` replaces it; otherwise it stays None.
      sample_1_of_n_eval_examples: Sampling rate for eval examples; forwarded
        as a config override only when it is >= 1.
      sample_1_of_n_eval_on_train_examples: Sampling rate written to the
        copied training input config that is used for eval-on-train.
      model_fn_creator: Callable invoked as
        `model_fn_creator(detection_model_fn, configs, hparams, use_tpu,
        postprocess_on_cpu)` that returns the `model_fn` for the estimator.
      use_tpu_estimator: If true a `TPUEstimator` is built, otherwise a plain
        `Estimator`.
      use_tpu: Passed to `model_fn_creator`; also and-ed with
        `train_config.use_bfloat16` for the `use_bfloat16` override and, for a
        `TPUEstimator`, selects the eval batch size.
      num_shards: Number of shards (TPU cores); sets the eval batch size of a
        `TPUEstimator` when `use_tpu` is true.
      params: Parameter dictionary for a `TPUEstimator`; an empty dict is used
        when it is falsy.
      override_eval_num_epochs: When true, `eval_num_epochs` is forced to 1
        and the eval-on-train input config gets `num_epochs = 1`.
      save_final_config: When true (and on the chief), the merged config is
        saved to `estimator.model_dir`.
      postprocess_on_cpu: Forwarded to `model_fn_creator`.
      export_to_tpu: Forwarded to `TPUEstimator`; read from `hparams` when it
        is None and `hparams` is given.
      **kwargs: Additional keyword arguments for configuration override.

    Returns:
      A dictionary with the keys 'estimator', 'train_input_fn',
      'eval_input_fns', 'eval_input_names', 'eval_on_train_input_fn',
      'predict_input_fn' and 'train_steps'.
    """
    # Resolve every builder hook up front, before any config is touched.
    load_configs = MODEL_BUILD_UTIL_MAP['get_configs_from_pipeline_file']
    merge_overrides = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    build_pipeline_proto = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']
    make_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn']
    make_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn']
    make_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn']
    base_model_fn = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']

    configs = load_configs(pipeline_config_path,
                           config_override=config_override)

    # Explicit arguments take precedence over same-named entries in kwargs.
    kwargs['train_steps'] = train_steps
    kwargs['use_bfloat16'] = configs['train_config'].use_bfloat16 and use_tpu
    if sample_1_of_n_eval_examples >= 1:
        kwargs['sample_1_of_n_eval_examples'] = sample_1_of_n_eval_examples
    if override_eval_num_epochs:
        kwargs['eval_num_epochs'] = 1
        tf.logging.warning(
            'Forced number of epochs for all eval validations to be 1.')

    configs = merge_overrides(configs, hparams, kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']
    eval_config = configs['eval_config']
    eval_input_configs = configs['eval_input_configs']

    # Eval-on-train works on a private copy so the training input config
    # itself keeps its sampling rate and epoch count.
    eval_on_train_input_config = copy.deepcopy(train_input_config)
    eval_on_train_input_config.sample_1_of_n_examples = (
        sample_1_of_n_eval_on_train_examples)
    if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
        message = ('Expected number of evaluation epochs is 1, but instead '
                   'encountered `eval_on_train_input_config.num_epochs` = '
                   '{}. Overwriting `num_epochs` to 1.')
        tf.logging.warning(
            message.format(eval_on_train_input_config.num_epochs))
        eval_on_train_input_config.num_epochs = 1

    # A zero in the config means "not provided", so None is kept in that case.
    if train_steps is None:
        if train_config.num_steps != 0:
            train_steps = train_config.num_steps

    detection_model_fn = functools.partial(base_model_fn,
                                           model_config=model_config)

    # Input functions for TRAIN / EVAL / EVAL-on-train / PREDICT, in that order.
    train_input_fn = make_train_input_fn(
        train_config=train_config,
        train_input_config=train_input_config,
        model_config=model_config)
    eval_input_fns = []
    for single_eval_config in eval_input_configs:
        eval_input_fns.append(
            make_eval_input_fn(eval_config=eval_config,
                               eval_input_config=single_eval_config,
                               model_config=model_config))
    eval_input_names = [
        single_eval_config.name for single_eval_config in eval_input_configs
    ]
    eval_on_train_input_fn = make_eval_input_fn(
        eval_config=eval_config,
        eval_input_config=eval_on_train_input_config,
        model_config=model_config)
    # Prediction reuses the first eval input config.
    predict_input_fn = make_predict_input_fn(
        model_config=model_config, predict_input_config=eval_input_configs[0])

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        if hparams is not None:
            export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info(
        'create_estimator_and_inputs: use_tpu %s, export_to_tpu %s', use_tpu,
        export_to_tpu)

    model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu,
                                postprocess_on_cpu)
    if not use_tpu_estimator:
        estimator = tf.estimator.Estimator(model_fn=model_fn,
                                           config=run_config)
    else:
        # For each core, only batch size 1 is supported for eval.
        if use_tpu:
            eval_batch_size = num_shards * 1
        else:
            eval_batch_size = 1
        estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn,
            train_batch_size=train_config.batch_size,
            eval_batch_size=eval_batch_size,
            use_tpu=use_tpu,
            config=run_config,
            export_to_tpu=export_to_tpu,
            eval_on_tpu=False,  # Eval runs on CPU, so disable eval on TPU
            params=params or {})

    # Only the chief writes the as-run pipeline config, and only on request.
    if run_config.is_chief and save_final_config:
        config_util.save_pipeline_config(build_pipeline_proto(configs),
                                         estimator.model_dir)

    return {
        'estimator': estimator,
        'train_input_fn': train_input_fn,
        'eval_input_fns': eval_input_fns,
        'eval_input_names': eval_input_names,
        'eval_on_train_input_fn': eval_on_train_input_fn,
        'predict_input_fn': predict_input_fn,
        'train_steps': train_steps,
    }
def update_pipeline_config(params, eval_type):
    """Load the pipeline config, apply run-specific overrides and save a copy.

    The config is read from ``params.config_mnt/params.config_dir``. The
    lower-cased base name of that path selects which model sub-config is
    edited (``ssd`` or ``faster_rcnn``).

    Args:
        params: Object exposing ``config_mnt``, ``config_dir``, ``label_dir``,
            ``transfer_learning_dir``, ``eval_num_examples``, ``base_mnt``,
            ``source_data_name``, ``batch_size`` and ``num_steps``.
        eval_type: Stem of the evaluation record file; ``'.record'`` is
            appended to it to form the eval input path.

    Returns:
        A tuple ``(updated_pipeline_config, updated_pipeline_config_file)``.
        The second element is the path the config was *read* from
        (``config_mnt/config_dir``); the updated config itself is handed to
        ``config_util.save_pipeline_config`` with the directory
        ``base_mnt/source_data_name/model_config``.

    Raises:
        ValueError: If the config path's base name starts with neither
            ``ssd`` nor ``faster_rcnn``.
    """
    pipeline_path = os.path.join(params.config_mnt, params.config_dir)
    cfg = config_util.get_configs_from_pipeline_file(pipeline_path)

    # The model family is inferred from the name of the config path itself.
    model_name = os.path.basename(os.path.normpath(pipeline_path)).lower()
    print("model name: ", model_name)
    for family in ("ssd", "faster_rcnn"):
        if model_name.startswith(family):
            model_cfg = getattr(cfg['model'], family)
            break
    else:
        raise ValueError(
            'unknown base model {}, we can only handle ssd or faster_rcnn'.
            format(model_name))

    # Number of classes comes from the size of the label map.
    label_map = os.path.join(params.config_mnt, params.label_dir)
    model_cfg.num_classes = len(label_map_util.get_label_map_dict(label_map))

    # Point fine-tuning at the transfer-learning checkpoint.
    training_section = cfg['train_config']
    training_section.fine_tune_checkpoint = os.path.join(
        params.config_mnt, params.transfer_learning_dir, 'model.ckpt')

    # Single evaluation pass over the requested number of examples.
    evaluation_section = cfg['eval_config']
    evaluation_section.max_evals = 1
    evaluation_section.num_examples = int(params.eval_num_examples)

    # Both record files sit in the same tf_records directory.
    records_dir = os.path.join(os.path.sep, params.base_mnt,
                               params.source_data_name, 'tf_records')
    train_record = os.path.join(records_dir, 'train.record')
    print(train_record)

    # Remaining overrides (steps, batch size, label map, input paths) are
    # applied through the hparams merge helper.
    batch_size = int(params.batch_size)
    num_steps = int(params.num_steps)
    overrides = tf.contrib.training.HParams(
        batch_size=batch_size,
        train_steps=num_steps,
        label_map_path=label_map,
        train_input_path=train_record,
        eval_input_path=os.path.join(records_dir, eval_type + '.record'),
    )
    cfg = config_util.merge_external_params_with_configs(cfg, overrides)

    # Log the key hyperparameters to the current run context.
    run_context = Run.get_context()
    run_context.log("Batch Size", batch_size)
    run_context.log("Training Steps", num_steps)

    updated_pipeline_config = config_util.create_pipeline_proto_from_configs(
        cfg)
    print("updated_pipeline_config: ", updated_pipeline_config)
    updated_pipeline_config_file = pipeline_path
    print("updated_pipeline_config_file: ", updated_pipeline_config_file)
    print("dir name: ", os.path.dirname(pipeline_path))

    output_dir = os.path.join(params.base_mnt, params.source_data_name,
                              'model_config')
    config_util.save_pipeline_config(updated_pipeline_config, output_dir)
    return updated_pipeline_config, updated_pipeline_config_file