Ejemplo n.º 1
0
 def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode):
   """Smoke-tests an experiment with one train task and two eval tasks.

   Args:
     distribution_strategy: Strategy whose scope is used to create the tasks
       and which is forwarded to the runner.
     flag_mode: Value forwarded as the runner's `mode` argument.
   """
   model_dir = self.get_temp_dir()
   train_task_config = test_utils.FooConfig()
   foo_routine = configs.TaskRoutine(
       task_name='foo', task_config=test_utils.FooConfig(), eval_steps=2)
   bar_routine = configs.TaskRoutine(
       task_name='bar', task_config=test_utils.BarConfig(), eval_steps=3)
   default_config = configs.MultiEvalExperimentConfig(
       task=train_task_config, eval_tasks=(foo_routine, bar_routine))
   # Layer the test-specific overrides on top of the default config.
   experiment_config = params_dict.override_params_dict(
       default_config, self._test_config, is_strict=False)

   with distribution_strategy.scope():
     train_task = task_factory.get_task(experiment_config.task)
     eval_tasks = []
     for routine in experiment_config.eval_tasks:
       eval_tasks.append(
           task_factory.get_task(routine.task_config, name=routine.task_name))

   train_lib.run_experiment_with_multitask_eval(
       distribution_strategy=distribution_strategy,
       train_task=train_task,
       eval_tasks=eval_tasks,
       mode=flag_mode,
       params=experiment_config,
       model_dir=model_dir)
Ejemplo n.º 2
0
  def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval):
    """Runs the 'mock' experiment end to end and checks its outputs.

    Args:
      distribution_strategy: Strategy used to create the task and run it.
      flag_mode: Experiment mode, set as a flag and passed to the runner.
      run_post_eval: Forwarded to the runner; decides whether the returned
        logs are expected to be non-empty.
    """
    model_dir = self.get_temp_dir()
    flag_overrides = {
        'experiment': 'mock',
        'mode': flag_mode,
        'model_dir': model_dir,
        'params_override': json.dumps(self._test_config),
    }
    with flagsaver.flagsaver(**flag_overrides):
      params = train_utils.parse_configuration(flags.FLAGS)
      train_utils.serialize_config(params, model_dir)
      with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

      logs = train_lib.run_experiment(
          distribution_strategy=distribution_strategy,
          task=task,
          mode=flag_mode,
          params=params,
          model_dir=model_dir,
          run_post_eval=run_post_eval)

    # Logs are only populated when a post-training eval was requested.
    check_logs = self.assertNotEmpty if run_post_eval else self.assertEmpty
    check_logs(logs)

    # The serialized config must exist for every mode.
    params_yaml = os.path.join(model_dir, 'params.yaml')
    self.assertNotEmpty(tf.io.gfile.glob(params_yaml))

    # A checkpoint is only expected when the mode is not pure eval.
    if flag_mode == 'eval':
      return
    checkpoint_index = os.path.join(model_dir, 'checkpoint')
    self.assertNotEmpty(tf.io.gfile.glob(checkpoint_index))
Ejemplo n.º 3
0
def main(_):
  """Parses the configuration, builds the task and runs the experiment."""
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir

  # Only modes that train write the yaml config; a pure (continuous) eval job
  # could otherwise race against the train job for the same file.
  if 'train' in FLAGS.mode:
    train_utils.serialize_config(params, model_dir)

  runtime = params.runtime

  # Mixed precision ('mixed_float16' on GPUs, 'mixed_bfloat16' on TPUs) can
  # significantly speed up the model. loss_scale only matters for float16.
  precision_dtype = runtime.mixed_precision_dtype
  if precision_dtype:
    performance.set_mixed_precision_policy(precision_dtype)

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=runtime.distribution_strategy,
      all_reduce_alg=runtime.all_reduce_alg,
      num_gpus=runtime.num_gpus,
      tpu_address=runtime.tpu)
  with strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
Ejemplo n.º 4
0
def load_model(experiment="yolo_custom", config_path=None, model_dir=""):
  """Builds a task and its model, restoring the latest checkpoint if asked.

  Args:
    experiment: Experiment name forwarded to `ParseConfigOptions`.
    config_path: List of config files forwarded as `config_file` to
      `ParseConfigOptions`. `None` (the default) means an empty list.
    model_dir: Directory used as the task's logging dir and searched for the
      latest checkpoint. When it is `None` or empty, no checkpoint is restored
      and `task.initialize(model)` is called instead.

  Returns:
    A `(task, model)` tuple.
  """
  # A fresh list per call avoids sharing one mutable default across calls.
  if config_path is None:
    config_path = []

  parse_options = train_utils.ParseConfigOptions(
      experiment=experiment, config_file=config_path)
  params = train_utils.parse_configuration(parse_options)

  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)

  task = task_factory.get_task(params.task, logging_dir=model_dir)
  model = task.build_model()

  if model_dir is not None and model_dir != "":
    # The optimizer is part of the checkpoint object so its saved state can be
    # matched alongside the model weights.
    optimizer = task.create_optimizer(params.trainer.optimizer_config,
                                      params.runtime)
    ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
    status = ckpt.restore(tf.train.latest_checkpoint(model_dir))

    # Tolerate checkpoint values that are unused, but require every Python
    # object that already exists to have been matched.
    status.expect_partial().assert_existing_objects_matched()
    print(dir(status), status)
  else:
    task.initialize(model)

  return task, model
Ejemplo n.º 5
0
def build_experiment_model(experiment_type):
    """Returns the model built by the task of the given experiment type.

    The experiment config is validated and locked before the task is created.
    """
    exp_config = exp_factory.get_exp_config(experiment_type)
    exp_config.validate()
    exp_config.lock()
    return task_factory.get_task(exp_config.task).build_model()
Ejemplo n.º 6
0
def main(argv):
    """Runs train-and-eval and/or prediction according to `FLAGS.mode`.

    When the mode contains 'train_eval', the experiment config is built,
    serialized to the model dir and a train-and-eval experiment is run. When
    it contains 'predict', a submission file is written, reusing the task from
    the training phase if there was one and otherwise rebuilding it from the
    `params.yaml` stored in the model dir.

    Args:
      argv: Command-line arguments; more than one entry is an error.

    Raises:
      app.UsageError: If more than one command-line argument is given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    superglue_flags.validate_flags(FLAGS, file_exists_fn=tf.io.gfile.exists)
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)

    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        meta_bytes = meta_file.read()
    input_meta_data = json.loads(meta_bytes.decode('utf-8'))

    with strategy.scope():
        task = None

        if 'train_eval' in FLAGS.mode:
            logging.info('Starting training and eval...')
            logging.info('Model dir: %s', FLAGS.model_dir)

            train_config = _get_exp_config(
                input_meta_data=input_meta_data,
                exp_config_files=FLAGS.config_file)
            train_utils.serialize_config(train_config, FLAGS.model_dir)
            task = task_factory.get_task(
                train_config.task, logging_dir=FLAGS.model_dir)
            train_lib.run_experiment(
                distribution_strategy=strategy,
                task=task,
                mode='train_and_eval',
                params=train_config,
                model_dir=FLAGS.model_dir)

        if 'predict' in FLAGS.mode:
            logging.info('Starting predict...')
            if task is None:
                # Predict-only run: no task was created above, so rebuild it
                # from the config previously saved in the model dir.
                saved_params = os.path.join(FLAGS.model_dir, 'params.yaml')
                predict_config = _get_exp_config(
                    input_meta_data=input_meta_data,
                    exp_config_files=[saved_params])
                task = task_factory.get_task(
                    predict_config.task, logging_dir=FLAGS.model_dir)
            _write_submission_file(task, input_meta_data['max_seq_length'])
Ejemplo n.º 7
0
    def testContinuousFinetune(self):
        """Trains into a source dir, then runs continuous finetuning from it.

        Verifies that the returned metrics contain 'best_acc' and that no
        'checkpoint' file is left in the finetune model dir afterwards.
        """
        pretrain_steps = 1
        src_model_dir = self.get_temp_dir()
        optimizer_overrides = {
            'optimizer': {'type': 'sgd'},
            'learning_rate': {'type': 'constant'},
        }
        trainer_overrides = {
            'continuous_eval_timeout': 1,
            'steps_per_loop': 1,
            'train_steps': 1,
            'validation_steps': 1,
            'best_checkpoint_export_subdir': 'best_ckpt',
            'best_checkpoint_eval_metric': 'acc',
            'optimizer_config': optimizer_overrides,
        }
        flag_overrides = {
            'experiment': 'mock',
            'mode': 'continuous_train_and_eval',
            'model_dir': self._model_dir,
            'params_override': {
                'task': {'init_checkpoint': src_model_dir},
                'trainer': trainer_overrides,
            },
        }

        with flagsaver.flagsaver(**flag_overrides):
            # Phase 1: train so that the source dir contains checkpoints.
            pretrain_params = train_utils.parse_configuration(flags.FLAGS)
            strategy = tf.distribute.get_strategy()
            with strategy.scope():
                pretrain_task = task_factory.get_task(
                    pretrain_params.task, logging_dir=src_model_dir)
            train_lib.run_experiment(
                distribution_strategy=strategy,
                task=pretrain_task,
                mode='train',
                params=pretrain_params,
                model_dir=src_model_dir)

            # Phase 2: finetune continuously from the checkpoints above.
            finetune_params = train_utils.parse_configuration(FLAGS)
            eval_metrics = continuous_finetune_lib.run_continuous_finetune(
                FLAGS.mode,
                finetune_params,
                FLAGS.model_dir,
                run_post_eval=True,
                pretrain_steps=pretrain_steps)
            self.assertIn('best_acc', eval_metrics)

            checkpoint_index = os.path.join(FLAGS.model_dir, 'checkpoint')
            self.assertFalse(tf.io.gfile.exists(checkpoint_index))
Ejemplo n.º 8
0
def main(_):
    """Parses the configuration, adjusts it for the mode, and runs the job."""
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir

    # Only modes that train write the yaml config; a pure (continuous) eval
    # job could otherwise race against the train job for the same file.
    if 'train' in FLAGS.mode:
        train_utils.serialize_config(params, model_dir)

    if 'train_and_eval' in FLAGS.mode:
        train_shape = params.task.train_data.feature_shape
        eval_shape = params.task.validation_data.feature_shape
        assert train_shape == eval_shape, (
            f'train {train_shape} != validate {eval_shape}')

    if 'assemblenet' in FLAGS.experiment:
        # Any mode containing 'eval' takes the frame count from the
        # validation data; all other modes take it from the train data.
        if 'eval' in FLAGS.mode:
            source_data = params.task.validation_data
        else:
            source_data = params.task.train_data
        assemblenet_cfg = params.task.model.backbone.assemblenet
        assemblenet_cfg.num_frames = source_data.feature_shape[0]
        shape = source_data.feature_shape
        logging.info('mode %r num_frames %r feature shape %r', FLAGS.mode,
                     assemblenet_cfg.num_frames, shape)

    runtime = params.runtime

    # Mixed precision ('mixed_float16' on GPUs, 'mixed_bfloat16' on TPUs) can
    # significantly speed up the model. loss_scale only matters for float16.
    precision_dtype = runtime.mixed_precision_dtype
    if precision_dtype:
        performance.set_mixed_precision_policy(precision_dtype)

    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=runtime.distribution_strategy,
        all_reduce_alg=runtime.all_reduce_alg,
        num_gpus=runtime.num_gpus,
        tpu_address=runtime.tpu)
    with strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(
        distribution_strategy=strategy,
        task=task,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
Ejemplo n.º 9
0
 def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
     """Builds an instance from the task routines listed in `config`.

     Each routine contributes one task (created through the task factory),
     one eval-steps entry and one weight entry, all keyed by the task name.
     """
     routines = list(config.task_routines)
     tasks = {
         routine.task_name: task_factory.get_task(routine.task_config,
                                                  logging_dir=logging_dir)
         for routine in routines
     }
     eval_steps_by_name = {
         routine.task_name: routine.eval_steps for routine in routines
     }
     weights_by_name = {
         routine.task_name: routine.task_weight for routine in routines
     }
     return cls(tasks,
                task_eval_steps=eval_steps_by_name,
                task_weights=weights_by_name)
Ejemplo n.º 10
0
def main(_):
    """Parses the configuration and runs the experiment with input partitioning.

    The input partition dims are picked from the task config according to
    `FLAGS.mode` and passed to the distribution strategy factory.

    Raises:
      ValueError: In 'train_and_eval' mode, if the products of the train and
        eval input partition dims differ.
    """
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype,
            params.runtime.loss_scale,
            use_experimental_api=True)

    input_partition_dims = None
    if FLAGS.mode == 'train_and_eval':
        # Train and eval share one strategy, so both partitionings must have
        # the same product.
        if np.prod(params.task.train_input_partition_dims) != np.prod(
                params.task.eval_input_partition_dims):
            # The space after "be" was missing, which rendered the message as
            # "can not bepartitioned".
            raise ValueError('Train and eval input partition dims can not be '
                             'partitioned on the same node')
        input_partition_dims = get_computation_shape_for_model_parallelism(
            params.task.train_input_partition_dims)
    elif FLAGS.mode == 'train':
        if params.task.train_input_partition_dims:
            input_partition_dims = get_computation_shape_for_model_parallelism(
                params.task.train_input_partition_dims)
    elif FLAGS.mode == 'eval' or FLAGS.mode == 'continuous_eval':
        if params.task.eval_input_partition_dims:
            input_partition_dims = get_computation_shape_for_model_parallelism(
                params.task.eval_input_partition_dims)

    distribution_strategy = create_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        num_gpus=params.runtime.num_gpus,
        input_partition_dims=input_partition_dims,
        tpu_address=params.runtime.tpu)
    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)
def _build_experiment_model(experiment_type):
    """Builds the model for an experiment type without loading checkpoints.

    Checkpoint/weight paths are cleared for the experiment types that define
    them, which keeps tests fast and avoids failures when the checkpoint files
    are not present at their configured paths.

    Args:
      experiment_type: Model type for the experiment.

    Returns:
      TF/Keras model for the task.
    """
    exp_config = exp_factory.get_exp_config(experiment_type)

    if 'deeplabv3plus_mobilenet_edgetpuv2' in experiment_type:
        backbone_cfg = exp_config.task.model.backbone.mobilenet_edgetpu
        backbone_cfg.pretrained_checkpoint_path = None
    if 'autoseg_edgetpu' in experiment_type:
        exp_config.task.model.model_params.model_weights_path = None

    exp_config.validate()
    exp_config.lock()
    return task_factory.get_task(exp_config.task).build_model()
    def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval):
        """Runs a progressive-trainer experiment, then a continuous eval.

        Args:
          distribution_strategy: Strategy used to create the task and run it.
          flag_mode: Mode of the first run; for 'eval' the test stops after
            checking the returned logs.
          run_post_eval: Forwarded to the runner; decides whether the returned
            logs are expected to be non-empty.
        """
        model_dir = self.get_temp_dir()
        default_config = cfg.ExperimentConfig(
            trainer=prog_trainer_lib.ProgressiveTrainerConfig(),
            task=ProgTaskConfig())
        experiment_config = params_dict.override_params_dict(
            default_config, self._test_config, is_strict=False)

        with distribution_strategy.scope():
            task = task_factory.get_task(
                experiment_config.task, logging_dir=model_dir)

        # Both runs share everything except the mode.
        shared_kwargs = dict(
            distribution_strategy=distribution_strategy,
            task=task,
            params=experiment_config,
            model_dir=model_dir,
            run_post_eval=run_post_eval)

        _, logs = train_lib.run_experiment(mode=flag_mode, **shared_kwargs)

        check_logs = self.assertNotEmpty if run_post_eval else self.assertEmpty
        check_logs(logs)

        if flag_mode == 'eval':
            return

        checkpoint_index = os.path.join(model_dir, 'checkpoint')
        self.assertNotEmpty(tf.io.gfile.glob(checkpoint_index))

        # Tests continuous evaluation on the checkpoint written above.
        _, logs = train_lib.run_experiment(
            mode='continuous_eval', **shared_kwargs)
        print(logs)
Ejemplo n.º 13
0
    def test_recovery(self, distribution_strategy, flag_mode):
        """Checks model weights are unchanged by a run at the loss bound.

        The trainer's `loss_upper_bound` is set to a threshold, the task's
        loss is replaced by one built from that same threshold, and the
        weights returned by the run are compared with those of a model built
        beforehand.
        """
        loss_threshold = 1.0
        model_dir = self.get_temp_dir()
        flag_overrides = {
            'experiment': 'mock',
            'mode': flag_mode,
            'model_dir': model_dir,
            'params_override': json.dumps(self._test_config),
        }
        with flagsaver.flagsaver(**flag_overrides):
            params = train_utils.parse_configuration(flags.FLAGS)
            trainer_cfg = params.trainer
            trainer_cfg.loss_upper_bound = loss_threshold
            trainer_cfg.recovery_max_trials = 1
            train_utils.serialize_config(params, model_dir)
            with distribution_strategy.scope():
                task = task_factory.get_task(
                    params.task, logging_dir=model_dir)

            # Saves a checkpoint for reference and records its weights.
            reference_model = task.build_model()
            reference_ckpt = tf.train.Checkpoint(model=reference_model)
            manager = tf.train.CheckpointManager(
                reference_ckpt, self.get_temp_dir(), max_to_keep=2)
            manager.save()
            weights_before = reference_model.get_weights()

            def build_losses(labels, model_outputs, aux_losses=None):
                # Ignores the real inputs; the loss is the threshold plus
                # whatever auxiliary losses are passed in.
                del labels, model_outputs
                threshold = tf.constant([loss_threshold], tf.float32)
                return threshold + aux_losses

            task.build_losses = build_losses

            trained_model, _ = train_lib.run_experiment(
                distribution_strategy=distribution_strategy,
                task=task,
                mode=flag_mode,
                params=params,
                model_dir=model_dir)
            weights_after = trained_model.get_weights()
            for before, after in zip(weights_before, weights_after):
                self.assertAllEqual(before, after)
def run_continuous_finetune(
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
) -> Mapping[str, Any]:
  """Run modes with continuous training.

  Currently only supports continuous_train_and_eval.

  Args:
    mode: A 'str', specifying the mode.
      continuous_train_and_eval - monitors a checkpoint directory. Once a new
        checkpoint is discovered, loads the checkpoint, finetune the model by
        training it (probably on another dataset or with another task), then
        evaluate the finetuned model.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.

  Returns:
    eval logs: returns the eval metrics logs of the last processed checkpoint
      when run_post_eval is set to True, otherwise returns {}. Also returns {}
      when no checkpoint was processed at all.

  Raises:
    ValueError: If `params.task.init_checkpoint` is still not a directory
      after the waiting period.
  """

  assert mode == 'continuous_train_and_eval', (
      'Only continuous_train_and_eval is supported by continuous_finetune. '
      'Got mode: {}'.format(mode))

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)
  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  retry_times = 0
  while not tf.io.gfile.isdir(params.task.init_checkpoint):
    # Wait for the init_checkpoint directory to be created: up to 60 retries,
    # one minute apart.
    if retry_times >= 60:
      raise ValueError(
          'ExperimentConfig.task.init_checkpoint must be a directory for '
          'continuous_train_and_eval mode.')
    retry_times += 1
    time.sleep(60)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'eval'))

  # Bound up front so that the final return is well defined even when the
  # checkpoint iterator times out without yielding a single checkpoint.
  eval_metrics = {}

  for pretrain_ckpt in tf.train.checkpoints_iterator(
      checkpoint_dir=params.task.init_checkpoint,
      min_interval_secs=10,
      timeout=params.trainer.continuous_eval_timeout):
    with distribution_strategy.scope():
      global_step = train_utils.read_global_step_from_checkpoint(pretrain_ckpt)

    # Replace params.task.init_checkpoint to make sure that we load exactly
    # this pretrain checkpoint; the best-checkpoint export subdir, if any, is
    # suffixed with the pretrain global step.
    if params.trainer.best_checkpoint_export_subdir:
      best_ckpt_subdir = '{}_{}'.format(
          params.trainer.best_checkpoint_export_subdir, global_step)
      params_replaced = params.replace(
          task={'init_checkpoint': pretrain_ckpt},
          trainer={'best_checkpoint_export_subdir': best_ckpt_subdir})
    else:
      params_replaced = params.replace(task={'init_checkpoint': pretrain_ckpt})
    params_replaced.lock()
    logging.info('Running finetuning with params: %s', params_replaced)

    with distribution_strategy.scope():
      task = task_factory.get_task(params_replaced.task, logging_dir=model_dir)

    _, eval_metrics = train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train_and_eval',
        params=params_replaced,
        model_dir=model_dir,
        run_post_eval=True,
        save_summary=False)
    logging.info('Evaluation finished. Pretrain global_step: %d', global_step)
    train_utils.write_json_summary(model_dir, global_step, eval_metrics)

    # Group summaries under "<model dir>_<task class name>".
    # NOTE(review): for a model_dir ending in '/', os.path.dirname yields the
    # whole path without the trailing slash rather than its last component;
    # confirm this is intended.
    if not os.path.basename(model_dir):  # if model_dir.endswith('/')
      summary_grp = os.path.dirname(model_dir) + '_' + task.__class__.__name__
    else:
      summary_grp = os.path.basename(model_dir) + '_' + task.__class__.__name__
    summaries = {}
    for name, value in eval_metrics.items():
      summaries[summary_grp + '/' + name] = value
    train_utils.write_summary(summary_writer, global_step, summaries)

    train_utils.remove_ckpts(model_dir)

  if run_post_eval:
    return eval_metrics
  return {}
Ejemplo n.º 15
0
def main(_):
    """Parses the configuration, sets up the worker cluster and runs the job.

    Optionally loads the Habana module, enables deterministic seeding, and
    configures bf16 conversion before building the cluster spec from the
    `MULTI_HLS_IPS` environment variable and the MPI rank/size.
    """
    # Imported once, unconditionally. A function-level `import os` makes `os`
    # a local name for the whole function, so importing it only inside the
    # conditional branches below left `os` unbound (UnboundLocalError) at the
    # later `os.environ` / `os.path` uses whenever neither branch ran.
    import os

    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)

    if params.runtime.num_hpus > 0:
        #TODO: remove when SW-49334 is fixed [SW-49404]
        os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    if params.task.train_data.deterministic or params.task.validation_data.deterministic:
        # Seed every random source used here with a fixed value.
        os.environ['PYTHONHASHSEED'] = '0'
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        import numpy
        numpy.random.seed(0)
        import tensorflow as tf
        tf.random.set_seed(0)
        tf.compat.v1.set_random_seed(0)
        import random
        random.seed(0)

    if FLAGS.dtype == "bf16":
        print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    hls_addresses = str(os.environ.get("MULTI_HLS_IPS",
                                       "127.0.0.1")).split(",")
    TF_BASE_PORT = 2410
    mpi_rank = comm_rank()
    mpi_size = comm_size()

    # With more than one HPU each worker writes to its own subdirectory.
    if params.runtime.num_hpus > 1:
        model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
    else:
        model_dir = FLAGS.model_dir

    # Prepare a comma-separated list of device addresses: the MPI ranks are
    # split evenly across the hosts, each rank getting its own port.
    ranks_per_host = mpi_size // len(hls_addresses)
    worker_list = [
        address + ':' + str(TF_BASE_PORT + rank)
        for address in hls_addresses
        for rank in range(ranks_per_host)
    ]
    worker_hosts = ",".join(worker_list)
    task_index = mpi_rank

    # Configures cluster spec for distribution strategy.
    distribution_utils.configure_cluster(worker_hosts, task_index)
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        num_hpus=params.runtime.num_hpus,
        tpu_address=params.runtime.tpu)

    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
Ejemplo n.º 16
0
 def test_task_factory(self):
     """The SSL pretrain experiment's task must be a VideoSSLPretrainTask."""
     exp_config = exp_factory.get_exp_config('video_ssl_pretrain_kinetics600')
     built_task = task_factory.get_task(exp_config.task)
     self.assertIs(type(built_task), pretrain.VideoSSLPretrainTask)
Ejemplo n.º 17
0
        (name, self._task_weights.get(name, 1.0)) for name in self.tasks
>>>>>>> upstream/master
    ])

  @classmethod
  def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
    """Builds an instance from the task routines listed in `config`.

    Each routine contributes one task (created through the task factory) plus
    its eval steps, mixing steps and weight, all keyed by the task name.
    """
    # NOTE(review): this method contained unresolved merge-conflict markers.
    # They were resolved in favour of HEAD, which additionally forwards the
    # per-task mixing steps; confirm the constructor accepts
    # `task_mixing_steps`.
    tasks = {}
    task_eval_steps = {}
    task_mixing_steps = {}
    task_weights = {}
    for task_routine in config.task_routines:
      task_name = task_routine.task_name
      tasks[task_name] = task_factory.get_task(
          task_routine.task_config, logging_dir=logging_dir)
      task_eval_steps[task_name] = task_routine.eval_steps
      task_mixing_steps[task_name] = task_routine.mixing_steps
      task_weights[task_name] = task_routine.task_weight
    return cls(
        tasks,
        task_mixing_steps=task_mixing_steps,
        task_eval_steps=task_eval_steps,
        task_weights=task_weights)
Ejemplo n.º 18
0
def run_continuous_finetune(
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    pretrain_steps: Optional[int] = None,
) -> Mapping[str, Any]:
    """Run modes with continuous training.

    Currently only supports continuous_train_and_eval.

    Args:
      mode: A 'str', specifying the mode. continuous_train_and_eval - monitors
        a checkpoint directory. Once a new checkpoint is discovered, loads the
        checkpoint, finetune the model by training it (probably on another
        dataset or with another task), then evaluate the finetuned model.
      params: ExperimentConfig instance.
      model_dir: A 'str', a path to store model checkpoints and summaries.
      run_post_eval: Whether to run post eval once after training, metrics
        logs are returned.
      pretrain_steps: Optional, the number of total training steps for the
        pretraining job.

    Returns:
      eval logs: returns the eval metrics logs of the last processed
        checkpoint when run_post_eval is set to True, otherwise returns {}.
        Also returns {} when no checkpoint was processed at all.

    Raises:
      ValueError: If `params.task.init_checkpoint` is still not a directory
        after the waiting period.
    """

    assert mode == 'continuous_train_and_eval', (
        'Only continuous_train_and_eval is supported by continuous_finetune. '
        'Got mode: {}'.format(mode))

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype, params.runtime.loss_scale)
    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        tpu_address=params.runtime.tpu)

    retry_times = 0
    while not tf.io.gfile.isdir(params.task.init_checkpoint):
        # Wait for the init_checkpoint directory to be created: up to 60
        # retries, one minute apart.
        if retry_times >= 60:
            raise ValueError(
                'ExperimentConfig.task.init_checkpoint must be a directory for '
                'continuous_train_and_eval mode.')
        retry_times += 1
        time.sleep(60)

    summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'eval'))

    global_step = 0
    # Bound up front so that the final return is well defined even when the
    # checkpoint iterator stops without yielding a single checkpoint.
    eval_metrics = {}

    def timeout_fn():
        # Reads the enclosing `global_step`, which the loop below updates for
        # every checkpoint it processes.
        if pretrain_steps and global_step < pretrain_steps:
            # Keeps waiting for another timeout period.
            logging.info(
                'Continue waiting for new checkpoint as current pretrain '
                'global_step=%d and target is %d.', global_step,
                pretrain_steps)
            return False
        # Quits the loop.
        return True

    for pretrain_ckpt in tf.train.checkpoints_iterator(
            checkpoint_dir=params.task.init_checkpoint,
            min_interval_secs=10,
            timeout=params.trainer.continuous_eval_timeout,
            timeout_fn=timeout_fn):
        with distribution_strategy.scope():
            global_step = train_utils.read_global_step_from_checkpoint(
                pretrain_ckpt)
        # Replaces params.task.init_checkpoint to make sure that we load
        # exactly this pretrain checkpoint.
        if params.trainer.best_checkpoint_export_subdir:
            best_ckpt_subdir = '{}_{}'.format(
                params.trainer.best_checkpoint_export_subdir, global_step)
            params_replaced = params.replace(
                task={'init_checkpoint': pretrain_ckpt},
                trainer={'best_checkpoint_export_subdir': best_ckpt_subdir})
        else:
            params_replaced = params.replace(
                task={'init_checkpoint': pretrain_ckpt})
        params_replaced.lock()
        logging.info('Running finetuning with params: %s', params_replaced)

        with distribution_strategy.scope():
            if isinstance(params, configs.MultiEvalExperimentConfig):
                task = task_factory.get_task(params_replaced.task)
                eval_tasks = multitask.MultiTask.from_config(
                    params_replaced.eval_tasks)
                (_, eval_metrics
                 ) = multitask_train_lib.run_experiment_wtih_multitask_eval(
                     distribution_strategy=distribution_strategy,
                     train_task=task,
                     eval_tasks=eval_tasks,
                     mode='train_and_eval',
                     params=params_replaced,
                     model_dir=model_dir,
                     run_post_eval=True,
                     save_summary=False)
            else:
                task = task_factory.get_task(params_replaced.task,
                                             logging_dir=model_dir)
                _, eval_metrics = train_lib.run_experiment(
                    distribution_strategy=distribution_strategy,
                    task=task,
                    mode='train_and_eval',
                    params=params_replaced,
                    model_dir=model_dir,
                    run_post_eval=True,
                    save_summary=False)
        logging.info('Evaluation finished. Pretrain global_step: %d',
                     global_step)
        train_utils.write_json_summary(model_dir, global_step, eval_metrics)

        # Group summaries under "<model dir>_<task name>".
        # NOTE(review): for a model_dir ending in '/', os.path.dirname yields
        # the whole path without the trailing slash rather than its last
        # component; confirm this is intended.
        if not os.path.basename(model_dir):  # if model_dir.endswith('/')
            summary_grp = os.path.dirname(model_dir) + '_' + task.name
        else:
            summary_grp = os.path.basename(model_dir) + '_' + task.name
        summaries = {}
        for name, value in _flatten_dict(eval_metrics).items():
            summaries[summary_grp + '/' + '-'.join(name)] = value
        train_utils.write_summary(summary_writer, global_step, summaries)

        train_utils.remove_ckpts(model_dir)
        # In TF2, the resource life cycle is bound with the python object life
        # cycle. Force trigger python garbage collection here so those resources
        # can be deallocated in time, so it doesn't cause OOM when allocating new
        # objects.
        # TODO(b/169178664): Fix cycle reference in Keras model and revisit to see
        # if we need gc here.
        gc.collect()

    if run_post_eval:
        return eval_metrics
    return {}
Ejemplo n.º 19
0
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
<<<<<<< HEAD
                                           params.runtime.loss_scale)
=======
                                           params.runtime.loss_scale,
                                           use_experimental_api=True)
>>>>>>> upstream/master
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

<<<<<<< HEAD
=======
  train_utils.save_gin_config(FLAGS.mode, model_dir)

>>>>>>> upstream/master
if __name__ == '__main__':
  tfm_flags.define_flags()