Example #1
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):

        extension = os.path.splitext(saving_filepath)[1]

        # Incorporate task type and index into saving_filepath to ensure every
        # worker has a unique path. In a normal use case saving_filepath would be
        # the same for all workers, but we use different ones here just to verify
        # that the chief saves the checkpoint while non-chief workers don't.

        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
            (test_base.get_task_type(), test_base.get_task_index(), extension))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(training_state.checkpoint_exists(saving_filepath),
                             test_base.is_chief())
Example #2
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):
        # Incorporate task type and index into saving_filepath to ensure every
        # worker has a unique path. In a normal use case saving_filepath would be
        # the same for all workers, but we use different ones here just to verify
        # that the chief saves the checkpoint while non-chief workers don't.

        # TODO(b/134551335): Must save to hdf5 until bug with copying
        # MirroredVariables is resolved.
        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d.h5' %
            (test_base.get_task_type(), test_base.get_task_index()))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(os.path.exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(os.path.exists(saving_filepath),
                             test_base.is_chief())
Example #3
        def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            # Incorporate task type and index into saving_filepath to ensure every
            # worker has a unique path. In a normal use case saving_filepath would
            # be the same for all workers, but we use different ones here just to
            # verify that the chief saves summaries while non-chief workers don't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(), 'logfile_%s_%d' %
                (test_base.get_task_type(), test_base.get_task_index()))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(file_io.file_exists(saving_filepath))

            model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

            # For the chief, summaries should be saved under the filepath; for
            # non-chief workers the directory should be empty (though it may have
            # been created). Use `file_io.list_directory()` because the directory
            # may already exist at this point.
            test_obj.assertEqual(bool(file_io.list_directory(saving_filepath)),
                                 test_base.is_chief())
Example #4
  def testSimpleInputFromFnLastPartialBatch(self, strategy):

    def dataset_fn(input_context):
      global_batch_size = 8
      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
      dataset = dataset_ops.DatasetV2.range(14).batch(
          batch_size, drop_remainder=False)
      return dataset.shard(input_context.num_input_pipelines,
                           input_context.input_pipeline_id)

    input_iterator = iter(
        strategy.experimental_distribute_datasets_from_function(dataset_fn))

    @def_function.function
    def run(input_iterator):
      return strategy.run(lambda x: x, args=(next(input_iterator),))

    # Let the complete batch go.
    run(input_iterator)
    # `result` is an incomplete batch
    result = run(input_iterator)

    expected_data_on_worker = [[8, 9, 10, 11], [12, 13]]
    self.assertTrue(
        np.array_equal(
            result.numpy(), expected_data_on_worker[
                multi_worker_test_base.get_task_index()]))
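The per-worker values asserted above follow from the batching and sharding arithmetic: with a global batch size of 8 over two single-replica workers, the per-replica batch size is 4, and `shard(2, pipeline_id)` applied after `batch()` hands alternate batches to each input pipeline. A minimal plain-Python sketch of that arithmetic (no TensorFlow; the helper name `sharded_batches` is just for illustration):

# Sketch (no TensorFlow): reproduce the sharding arithmetic the test above
# relies on, assuming two workers with one replica each.
def sharded_batches(num_elements=14, per_replica_batch=4, num_pipelines=2):
    elements = list(range(num_elements))
    # Equivalent of .batch(per_replica_batch, drop_remainder=False).
    batches = [elements[i:i + per_replica_batch]
               for i in range(0, num_elements, per_replica_batch)]
    # Equivalent of .shard(num_pipelines, pipeline_id) on the batched data:
    # each pipeline keeps every num_pipelines-th batch.
    return {pid: batches[pid::num_pipelines] for pid in range(num_pipelines)}

shards = sharded_batches()
# The first `run` call consumes [0, 1, 2, 3] on worker 0 and [4, 5, 6, 7] on
# worker 1; the second call yields the partial batches asserted above.
assert [shards[0][1], shards[1][1]] == [[8, 9, 10, 11], [12, 13]]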
Example #5
        def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
                test_obj, file_format):

            model, saving_filepath, train_ds, steps = _model_setup(
                test_obj, file_format)
            num_epoch = 2
            extension = os.path.splitext(saving_filepath)[1]

            # Incorporate task type and index into saving_filepath to ensure every
            # worker has a unique path. In a normal use case saving_filepath would
            # be the same for all workers, but we use different ones here just to
            # verify that the chief saves the checkpoint while non-chief workers
            # don't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'checkpoint_%s_%d%s' % (test_base.get_task_type(),
                                        test_base.get_task_index(), extension))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(
                training_state.checkpoint_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      validation_data=train_ds,
                      validation_steps=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              save_weights_only=save_weights_only)
                      ])

            # If it's chief, the model should be saved; if not, the model shouldn't.
            test_obj.assertEqual(
                training_state.checkpoint_exists(saving_filepath),
                test_base.is_chief())

            # For the chief, `write_filepath` simply returns `saving_filepath`, so
            # the checkpoint should exist there; for non-chief workers, the
            # temporary path generated by `write_filepath` should no longer
            # contain a checkpoint, as it has been deleted after saving.
            test_obj.assertEqual(
                training_state.checkpoint_exists(
                    distributed_file_utils.write_filepath(
                        saving_filepath, model._distribution_strategy)),
                test_base.is_chief())
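The last assertion relies on `distributed_file_utils.write_filepath`, which lets non-chief workers write to a worker-specific temporary location instead of the chief's path, so every worker can go through the saving code path without clobbering the real checkpoint. A simplified sketch of that idea (illustrative only, not the actual implementation; the helper name `_temp_write_filepath` and the `workertemp_` naming are assumptions):

import os

# Rough sketch of the redirect-to-temp idea behind `write_filepath`
# (illustrative only): the chief keeps the requested path, every other worker
# gets a private scratch subdirectory next to it.
def _temp_write_filepath(filepath, is_chief, task_id):
    if is_chief:
        return filepath
    dirpath, base = os.path.split(filepath)
    temp_dir = os.path.join(dirpath, 'workertemp_%d' % task_id)  # assumed naming
    os.makedirs(temp_dir, exist_ok=True)
    return os.path.join(temp_dir, base)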
Example #6
  def testSimpleInputFromDatasetLastPartialBatch(self, strategy):
    global_batch_size = 8
    dataset = dataset_ops.DatasetV2.range(14).batch(
        global_batch_size, drop_remainder=False)
    input_iterator = iter(strategy.experimental_distribute_dataset(dataset))

    @def_function.function
    def run(input_iterator):
      return strategy.run(lambda x: x, args=(next(input_iterator),))

    # Let the complete batch go.
    run(input_iterator)

    # `result` is an incomplete batch
    result = run(input_iterator)
    expected_data_on_worker = [[8, 9, 10], [11, 12, 13]]
    self.assertTrue(
        np.array_equal(
            result.numpy(),
            expected_data_on_worker[multi_worker_test_base.get_task_index()]))
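In contrast to the function-based example above, `experimental_distribute_dataset` takes a dataset that is already batched to the global batch size and splits each global batch across the workers, which is why the final partial batch of six elements shows up as [8, 9, 10] and [11, 12, 13]. A plain-Python sketch of the split the assertion expects (the helper name `split_global_batches` is illustrative):

# Sketch (no TensorFlow): split globally-batched data across two workers the
# way the assertion above expects, including the final partial batch.
def split_global_batches(num_elements=14, global_batch=8, num_workers=2):
    elements = list(range(num_elements))
    batches = [elements[i:i + global_batch]
               for i in range(0, num_elements, global_batch)]
    per_worker = []
    for batch in batches:
        per_replica = -(-len(batch) // num_workers)  # ceiling division
        per_worker.append([batch[i:i + per_replica]
                           for i in range(0, len(batch), per_replica)])
    return per_worker

# The second (partial) global batch [8..13] is split as asserted in the test.
assert split_global_batches()[1] == [[8, 9, 10], [11, 12, 13]]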
Example #7
  def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
      model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath):
    # Incorporate task type and index into saving_filepath to ensure every
    # worker has a unique path. In a normal use case saving_filepath would be
    # the same for all workers, but we use different ones here just to verify
    # that the chief saves the checkpoint while non-chief workers don't.
    saving_filepath = os.path.join(
        test_obj.get_temp_dir(), 'checkpoint_%s_%d' %
        (test_base.get_task_type(), test_base.get_task_index()))

    # The saving_filepath shouldn't exist at the beginning (as it's unique).
    test_obj.assertFalse(os.path.exists(saving_filepath))

    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

    # If it's chief, the model should be saved; if not, the model shouldn't.
    test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
Example #8
  def testDatasetFromFunction(self, strategy):
    def dataset_fn(input_context):
      global_batch_size = 10
      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
      d = dataset_ops.DatasetV2.range(100).repeat().batch(batch_size)
      return d.shard(input_context.num_input_pipelines,
                     input_context.input_pipeline_id)

    expected_sum_on_workers = [10, 35]
    input_iterator = iter(
        strategy.experimental_distribute_datasets_from_function(dataset_fn))

    @def_function.function
    def run(iterator):
      return strategy.experimental_local_results(iterator.get_next())

    result = run(input_iterator)
    sum_value = math_ops.reduce_sum(result)
    self.assertEqual(
        sum_value.numpy(),
        expected_sum_on_workers[multi_worker_test_base.get_task_index()])
Example #9
  def testDatasetFromFunction(self, strategy):
    def dataset_fn(input_context):
      global_batch_size = 10
      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
      d = dataset_ops.DatasetV2.range(100).repeat().batch(batch_size)
      return d.shard(input_context.num_input_pipelines,
                     input_context.input_pipeline_id)

    expected_data_on_worker = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
    input_iterator = iter(
        strategy.experimental_distribute_datasets_from_function(dataset_fn))

    @def_function.function
    def run(iterator):
      return strategy.experimental_local_results(iterator.get_next())

    result = run(input_iterator)
    self.assertTrue(
        np.array_equal(
            result[0].numpy(),
            expected_data_on_worker[multi_worker_test_base.get_task_index()]))
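Both variants of `testDatasetFromFunction` above rest on the same arithmetic: a global batch size of 10 over two single-replica workers gives a per-replica batch size of 5, and sharding the batched `range(100)` dataset leaves [0..4] as the first batch on worker 0 and [5..9] on worker 1, hence the expected data and the expected sums 10 and 35. A quick plain-Python check (helper name is illustrative):

# Sketch (no TensorFlow): first per-worker batch and its sum, matching the
# `expected_sum_on_workers` / `expected_data_on_worker` values above.
def first_sharded_batch(pipeline_id, per_replica_batch=5, num_pipelines=2):
    batches = [list(range(i, i + per_replica_batch))
               for i in range(0, 100, per_replica_batch)]
    return batches[pipeline_id::num_pipelines][0]

assert first_sharded_batch(0) == [0, 1, 2, 3, 4]   # sums to 10
assert first_sharded_batch(1) == [5, 6, 7, 8, 9]   # sums to 35
assert sum(first_sharded_batch(1)) == 35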
Example #10
 def wrapped_method(method_to_wrap, name, *arg, **kwargs):
   # Use lock to ensure += operation is thread-safe.
   with self._lock:
     self._task_dict[test_base.get_task_type()][
         test_base.get_task_index()][name] += 1
   method_to_wrap(*arg, **kwargs)
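The `wrapped_method` helper above is meant to be installed over a callback method so that each invocation is counted per task under a lock. A self-contained sketch of that pattern (the `CallCounter` class and its wiring are illustrative assumptions, not the surrounding test code):

import collections
import threading

class CallCounter(object):
    """Tallies method invocations, keyed by (task_type, task_index, method name)."""

    def __init__(self):
        self._lock = threading.Lock()
        self._counts = collections.defaultdict(int)

    def wrap(self, obj, name, task_type, task_index):
        method_to_wrap = getattr(obj, name)

        def wrapped_method(*args, **kwargs):
            # Hold the lock so concurrent worker threads update the count safely.
            with self._lock:
                self._counts[(task_type, task_index, name)] += 1
            return method_to_wrap(*args, **kwargs)

        # Shadow the original method on this instance with the counting wrapper.
        setattr(obj, name, wrapped_method)

A test could, for example, wrap a callback's `on_epoch_end` this way before calling `model.fit` and assert on the per-task counts afterwards.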