Code example #1
File: model_lib_randy.py Project: randyphoa/models
 def train_dataset_fn(input_context):
     """Callable to create train input."""
     train_input = inputs.train_input(
         train_config=train_config,
         train_input_config=train_input_config,
         model_config=model_config,
         model=detection_model,
         input_context=input_context,
     )
     train_input = train_input.repeat()
     return train_input
Code example #2
 def train_dataset_fn(input_context):
   """Callable to create train input."""
   # Create the inputs.
   train_input = inputs.train_input(
       train_config=train_config,
       train_input_config=train_input_config,
       model_config=model_config,
       model=detection_model,
       input_context=input_context)
   train_input = train_input.repeat()
   return train_input
Code example #3
File: trainer.py Project: ymlsam/deepgaau-detector
        def train_dataset_fn(
                input_context: tf.distribute.InputContext) -> tf.data.Dataset:
            """Callable to create train input."""
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=model,
                input_context=input_context,
            )
            train_input = train_input.repeat()

            return train_input
Code example #4
    def train_dataset_fn(input_context):
      """Callable to create train input."""
      batch_size = input_context.get_per_replica_batch_size(
          train_config.batch_size)

      # Create the inputs.
      train_input = inputs.train_input(
          train_config=train_config,
          train_input_config=train_input_config,
          model_config=model_config,
          model=detection_model, params={'batch_size': batch_size})
      train_input = train_input.repeat()
      return train_input.shard(input_context.num_input_pipelines,
                               input_context.input_pipeline_id)
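Each `train_dataset_fn` above is a closure that a `tf.distribute` strategy calls once per input pipeline, passing an `InputContext` that describes the replica layout. A minimal sketch of how such a callable is consumed, assuming a TF 2.x version where `experimental_distribute_datasets_from_function` is available (the surrounding setup is illustrative):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

# The strategy invokes train_dataset_fn once per input pipeline and
# passes it the InputContext seen in the examples above.
train_input = strategy.experimental_distribute_datasets_from_function(
    train_dataset_fn)

# The distributed dataset is then driven manually, one batch at a time.
train_iter = iter(train_input)
features, labels = next(train_iter)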
Code example #5
# Imports needed by this standalone helper (both from the TF Object Detection API).
from object_detection.inputs import train_input
from object_detection.utils.config_util import get_configs_from_pipeline_file

def get_train_input(config_path):
  """
  Get the tf.data dataset that provides training batches.

  Args:
    - config_path [str]: path to the edited config file
  Returns:
    - dataset [tf.data.Dataset]: dataset yielding augmented training batches
  """
  # parse config
  configs = get_configs_from_pipeline_file(config_path)
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  # get the dataset
  dataset = train_input(train_config, train_input_config, configs['model'])
  return dataset
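A hypothetical usage of `get_train_input`; the config path is illustrative, and the `(features, labels)` structure assumes the dictionaries produced by the Object Detection API's `inputs.train_input`:

# Build the dataset from a pipeline config and inspect one batch.
dataset = get_train_input('pipeline_new.config')
for features, labels in dataset.take(1):
    print(features['image'].shape)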
Code example #6
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
    """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `checkpoint_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training steps
      is set from the `TrainConfig` proto.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    export_to_tpu: When use_tpu and export_to_tpu are true,
      `export_savedmodel()` exports a metagraph for serving on TPU besides the
      one on CPU. If export_to_tpu is not provided, we will look for it in
      hparams too.
    checkpoint_every_n: Checkpoint every n training steps.
    **kwargs: Additional keyword arguments for configuration override.
  """
    ## Parse the configs
    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
        'get_configs_from_pipeline_file']
    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = get_configs_from_pipeline_file(pipeline_config_path,
                                             config_override=config_override)
    kwargs.update({
        'train_steps': train_steps,
        'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = merge_external_params_with_configs(configs,
                                                 hparams,
                                                 kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(
            'mixed_bfloat16')

    # Parse the checkpoint fine tuning configs
    if hparams.load_pretrained:
        fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
    else:
        fine_tune_checkpoint_path = None
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    ## utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, set train_config.fine_tune_checkpoint_type
        # based on train_config.from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
        else:
            train_config.fine_tune_checkpoint_type = 'classification'
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = create_pipeline_proto_from_configs(configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # Build the model, optimizer, and training input
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        # Create the inputs.
        train_input = inputs.train_input(train_config=train_config,
                                         train_input_config=train_input_config,
                                         model_config=model_config,
                                         model=detection_model)

        train_input = strategy.experimental_distribute_dataset(
            train_input.repeat())

        global_step = tf.compat.v2.Variable(0,
                                            trainable=False,
                                            dtype=tf.compat.v2.dtypes.int64,
                                            name='global_step')
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    ## Train the model
    summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                             '/train')
    with summary_writer.as_default():
        with strategy.scope():
            # Load a fine-tuning checkpoint.
            if fine_tune_checkpoint_path:
                load_fine_tune_checkpoint(detection_model,
                                          fine_tune_checkpoint_path,
                                          fine_tune_checkpoint_type,
                                          load_all_detection_checkpoint_vars,
                                          train_input,
                                          unpad_groundtruth_tensors)

            ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                 model=detection_model)
            manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                           model_dir,
                                                           max_to_keep=7)

            ## Maybe re-enable checkpoint restoration depending on how it works:
            # ckpt.restore(manager.latest_checkpoint)

            def train_step_fn(features, labels):
                return eager_train_step(
                    detection_model,
                    features,
                    labels,
                    unpad_groundtruth_tensors,
                    optimizer,
                    learning_rate=learning_rate_fn(),
                    add_regularization_loss=add_regularization_loss,
                    clip_gradients_value=clip_gradients_value,
                    use_tpu=use_tpu,
                    global_step=global_step,
                    num_replicas=strategy.num_replicas_in_sync)

            @tf.function
            def _dist_train_step(data_iterator):
                """A distributed train step."""
                features, labels = data_iterator.next()
                per_replica_losses = strategy.experimental_run_v2(
                    train_step_fn, args=(
                        features,
                        labels,
                    ))
                # TODO(anjalisridhar): explore if it is safe to remove the
                ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)
                return mean_loss

            train_input_iter = iter(train_input)
            for _ in range(train_steps):
                start_time = time.time()

                loss = _dist_train_step(train_input_iter)
                global_step.assign_add(1)
                end_time = time.time()
                if not use_tpu:
                    tf.compat.v2.summary.scalar('steps_per_sec',
                                                1.0 / (end_time - start_time),
                                                step=global_step)
                # TODO(kaftan): Remove this print after it is no longer helpful for
                ## debugging.
                print('Finished step', global_step, end_time, loss)
                if int(global_step.value().numpy()) % checkpoint_every_n == 0:
                    manager.save()
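For reference, a hedged sketch of calling this `train_loop`; the paths and step counts are illustrative, and `hparams` is assumed to come from the Object Detection API's `model_hparams.create_hparams` helper:

from object_detection import model_hparams

# Train for 1000 steps, checkpointing every 500.
train_loop(
    hparams=model_hparams.create_hparams(None),
    pipeline_config_path='pipeline.config',
    model_dir='/tmp/model_dir',
    train_steps=1000,
    checkpoint_every_n=500)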
Code example #7
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
    """Trains a model using eager + functions (see example #6 for the full docstring)."""
    ## Parse the configs
    get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
        'get_configs_from_pipeline_file']
    merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
        'merge_external_params_with_configs']
    create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
        'create_pipeline_proto_from_configs']

    configs = get_configs_from_pipeline_file(pipeline_config_path,
                                             config_override=config_override)
    kwargs.update({
        'train_steps': train_steps,
        'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
    })
    configs = merge_external_params_with_configs(configs,
                                                 hparams,
                                                 kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
    use_bfloat16 = train_config.use_bfloat16
    add_regularization_loss = train_config.add_regularization_loss
    clip_gradients_value = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

    # update train_steps from config but only when non-zero value is provided
    if train_steps is None and train_config.num_steps != 0:
        train_steps = train_config.num_steps

    # Read export_to_tpu from hparams if not passed.
    if export_to_tpu is None:
        export_to_tpu = hparams.get('export_to_tpu', False)
    tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                    export_to_tpu)

    # Parse the checkpoint fine tuning configs
    if hparams.load_pretrained:
        fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
    else:
        fine_tune_checkpoint_path = None
    load_all_detection_checkpoint_vars = (
        train_config.load_all_detection_checkpoint_vars)
    # TODO(kaftan) (or anyone else): move this piece of config munging to
    ## utils/config_util.py
    if not train_config.fine_tune_checkpoint_type:
        # train_config.from_detection_checkpoint field is deprecated. For
        # backward compatibility, set train_config.fine_tune_checkpoint_type
        # based on train_config.from_detection_checkpoint.
        if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
        else:
            train_config.fine_tune_checkpoint_type = 'classification'
    fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = create_pipeline_proto_from_configs(configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # TODO(kaftan): Either make strategy a parameter of this method, or
    ## grab it w/  Distribution strategy's get_scope
    # Build the model, optimizer, and training input
    strategy = tf.compat.v2.distribute.MirroredStrategy()
    with strategy.scope():
        detection_model = model_builder.build(model_config=model_config,
                                              is_training=True)

        # Create the inputs.
        train_input = inputs.train_input(train_config=train_config,
                                         train_input_config=train_input_config,
                                         model_config=model_config,
                                         model=detection_model)

        train_input = strategy.experimental_distribute_dataset(
            train_input.repeat())

        global_step = tf.compat.v2.Variable(0,
                                            trainable=False,
                                            dtype=tf.compat.v2.dtypes.int64)
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    ## Train the model
    summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                             '/train')
    with summary_writer.as_default():
        with strategy.scope():
            # Load a fine-tuning checkpoint.
            if fine_tune_checkpoint_path:
                load_fine_tune_checkpoint(
                    detection_model, fine_tune_checkpoint_path,
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars, train_input,
                    unpad_groundtruth_tensors, use_tpu, use_bfloat16)

            ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                                 model=detection_model)
            manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                           model_dir,
                                                           max_to_keep=7)

            ## Maybe re-enable checkpoint restoration depending on how it works:
            # ckpt.restore(manager.latest_checkpoint)

            def train_step_fn(features, labels):
                return eager_train_step(
                    detection_model,
                    features,
                    labels,
                    unpad_groundtruth_tensors,
                    optimizer,
                    learning_rate=learning_rate_fn(),
                    use_bfloat16=use_bfloat16,
                    add_regularization_loss=add_regularization_loss,
                    clip_gradients_value=clip_gradients_value,
                    use_tpu=use_tpu,
                    global_step=global_step,
                    num_replicas=strategy.num_replicas_in_sync)

            @tf.function
            def _dist_train_step(data_iterator):
                """A distributed train step."""
                features, labels = data_iterator.next()
                per_replica_losses = strategy.experimental_run_v2(
                    train_step_fn, args=(
                        features,
                        labels,
                    ))
                # TODO(anjalisridhar): explore if it is safe to remove the
                ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
                mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                            per_replica_losses,
                                            axis=None)
                return mean_loss

            train_input_iter = iter(train_input)
            for _ in range(train_steps):
                start_time = time.time()

                loss = _dist_train_step(train_input_iter)
                global_step.assign_add(1)
                end_time = time.time()
                tf.compat.v2.summary.scalar('steps_per_sec',
                                            1.0 / (end_time - start_time),
                                            step=global_step)
                # TODO(kaftan): Remove this print after it is no longer helpful for
                ## debugging.
                tf.print('Finished step', global_step, end_time, loss)
                if int(global_step.value().numpy()) % checkpoint_every_n == 0:
                    manager.save()
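The main difference from example #6 is strategy selection: this variant hard-codes `MirroredStrategy`, while #6 picks up whatever strategy is active via `tf.compat.v2.distribute.get_strategy()`. With the #6 variant, the caller chooses the strategy by entering its scope before calling `train_loop`; a minimal sketch, with illustrative arguments:

import tensorflow as tf
from object_detection import model_hparams

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # Inside this scope, get_strategy() returns `strategy`, so the
    # train_loop from example #6 distributes across its replicas.
    train_loop(model_hparams.create_hparams(None),
               'pipeline.config', '/tmp/model_dir')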