def proc_func():
    # `self` here is the enclosing test case, captured by this closure.
    for i in range(5):
        logging.info('(logging) %s-%d, i: %d',
                     multi_worker_test_base.get_task_type(),
                     self._worker_idx(), i)
        print('(print) {}-{}, i: {}'.format(
            multi_worker_test_base.get_task_type(), self._worker_idx(), i),
            flush=True)
        time.sleep(1)
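A minimal sketch (not part of the scraped examples) of how such a `proc_func` is typically launched: `multi_process_runner.run` spawns one process per task in the cluster spec and runs the function in each. The import paths and the `create_cluster_spec` helper are TensorFlow-internal test utilities, assumed here.

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base

def fn():
    # Each spawned process reports its own task type ('chief' or 'worker').
    print(multi_worker_test_base.get_task_type(), flush=True)

multi_process_runner.run(
    fn,
    cluster_spec=multi_worker_test_base.create_cluster_spec(
        has_chief=True, num_workers=2))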
Example 2
        def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in normal use case the
            # saving_filepath will be the same for all workers, but we use different
            # ones here just to test out chief saves summaries but non-chief doesn't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(), 'logfile_%s_%d' %
                (test_base.get_task_type(), test_base.get_task_index()))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(file_io.file_exists(saving_filepath))

            model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

            # If it's chief, the summaries should be saved in the filepath; if not,
            # the directory should be empty (although created). Using
            # `file_io.list_directory()` since the directory may be created at this
            # point.
            test_obj.assertEqual(bool(file_io.list_directory(saving_filepath)),
                                 test_base.is_chief())
Example 3
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):

        extension = os.path.splitext(saving_filepath)[1]

        # Incorporate type/index information and thread id in saving_filepath to
        # ensure every worker has a unique path. Note that in normal use case the
        # saving_filepath will be the same for all workers, but we use different
        # ones here just to test out chief saves checkpoint but non-chief doesn't.

        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
            (test_base.get_task_type(), test_base.get_task_index(), extension))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(training_state.checkpoint_exists(saving_filepath),
                             test_base.is_chief())
Example 4
    def testSimpleInputFromFnLastPartialBatch(self, strategy):
        def dataset_fn(input_context):
            global_batch_size = 8
            batch_size = input_context.get_per_replica_batch_size(
                global_batch_size)
            dataset = dataset_ops.DatasetV2.range(14).batch(
                batch_size, drop_remainder=False)
            return dataset.shard(input_context.num_input_pipelines,
                                 input_context.input_pipeline_id)

        input_iterator = iter(
            strategy.distribute_datasets_from_function(dataset_fn))

        @def_function.function
        def run(input_iterator):
            return strategy.run(lambda x: x, args=(next(input_iterator), ))

        # Let the complete batch go.
        run(input_iterator)
        # `result` is an incomplete batch
        result = run(input_iterator)

        expected_data_on_worker = {'chief': [8, 9, 10, 11], 'worker': [12, 13]}
        self.assertAllEqual(
            expected_data_on_worker[multi_worker_test_base.get_task_type()],
            result.numpy())
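Note on the expected values: range(14) batched by 4 (the per-replica size for a global batch of 8 across two input pipelines) yields the batches [0-3], [4-7], [8-11], [12-13]. Sharding by batch assigns batches 0 and 2 to the chief and batches 1 and 3 to the worker, so the second (partial) step sees [8, 9, 10, 11] on the chief and [12, 13] on the worker.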
Example 5
    def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
            model, test_obj, train_ds, num_epoch, steps, strategy,
            saving_filepath, **kwargs):
        # Incorporate type/index information and thread id in saving_filepath to
        # ensure every worker has a unique path. Note that in normal use case the
        # saving_filepath will be the same for all workers, but we use different
        # ones here just to test out chief saves checkpoint but non-chief doesn't.

        # TODO(b/134551335): Must save to hdf5 until bug with copying
        # MirroredVariables is resolved.
        saving_filepath = os.path.join(
            test_obj.get_temp_dir(), 'checkpoint_%s_%d.h5' %
            (test_base.get_task_type(), test_base.get_task_index()))

        # The saving_filepath shouldn't exist at the beginning (as it's unique).
        test_obj.assertFalse(os.path.exists(saving_filepath))

        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

        # If it's chief, the model should be saved; if not, the model shouldn't.
        test_obj.assertEqual(os.path.exists(saving_filepath),
                             test_base.is_chief())
Example 6
    def testGatherRaiseSparsePerReplicaMultiWorker(self, strategy, pure_eager):
        if strategy.num_replicas_in_sync != 2:
            self.skipTest('Test for two replicas.')
        dense_shape = [5, 2]
        if multi_worker_test_base.get_task_type() == 'chief':
            t0 = _make_indexed_slices(values=[[1., 2.]],
                                      indices=[2],
                                      dense_shape=dense_shape)
        elif multi_worker_test_base.get_task_type() == 'worker':
            t0 = _make_indexed_slices(values=[[3., 4.], [5., 6.]],
                                      indices=[1, 3],
                                      dense_shape=dense_shape)

        def run(value):
            return strategy._gather(value, axis=0)

        with self.assertRaisesRegex(
                NotImplementedError,
                r'gather/all_gather does not support IndexedSlices'):
            if pure_eager:
                run(t0)
            else:
                def_function.function(run)(t0)
Example 7
        def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
                test_obj, file_format):

            model, saving_filepath, train_ds, steps = _model_setup(
                test_obj, file_format)
            num_epoch = 2
            extension = os.path.splitext(saving_filepath)[1]

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in normal use case the
            # saving_filepath will be the same for all workers, but we use different
            # ones here just to test out chief saves checkpoint but non-chief doesn't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'checkpoint_%s_%d%s' % (test_base.get_task_type(),
                                        test_base.get_task_index(), extension))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(
                training_state.checkpoint_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      validation_data=train_ds,
                      validation_steps=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              # `save_weights_only` comes from the enclosing
                              # test's scope; it is not defined in this
                              # snippet.
                              save_weights_only=save_weights_only)
                      ])

            # If it's chief, the model should be saved; if not, the model shouldn't.
            test_obj.assertEqual(
                training_state.checkpoint_exists(saving_filepath),
                test_base.is_chief())

            # On the chief, `write_filepath` simply returns `saving_filepath`,
            # so a checkpoint should exist there. On non-chief workers, the
            # temporary directory generated by `write_filepath` should no
            # longer contain a checkpoint, since it is deleted after training.
            test_obj.assertEqual(
                training_state.checkpoint_exists(
                    distributed_file_utils.write_filepath(
                        saving_filepath, model._distribution_strategy)),
                test_base.is_chief())
Example 8
  def testSimpleInputFromDatasetLastPartialBatch(self, strategy):
    global_batch_size = 8
    dataset = dataset_ops.DatasetV2.range(14).batch(
        global_batch_size, drop_remainder=False)
    input_iterator = iter(strategy.experimental_distribute_dataset(dataset))

    @def_function.function
    def run(input_iterator):
      return strategy.run(lambda x: x, args=(next(input_iterator),))

    # Let the complete batch go.
    run(input_iterator)

    # `result` is an incomplete batch
    result = run(input_iterator)
    expected_data_on_workers = {'chief': [8, 9, 10], 'worker': [11, 12, 13]}
    self.assertAllEqual(
        expected_data_on_workers[multi_worker_test_base.get_task_type()],
        result.numpy(),
    )
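Note: here the dataset is batched at the global size, so range(14).batch(8) yields [0-7] and [8-13], and `experimental_distribute_dataset` splits each batch element-wise across workers. The final partial batch of six elements is split as [8, 9, 10] on the chief and [11, 12, 13] on the worker.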
Example 9
    def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(test_obj):
      model, _, train_ds, steps = _model_setup(test_obj, file_format='')
      num_epoch = 2

      saving_filepath = os.path.join(test_obj.get_temp_dir(),
                                     'logfile_%s' % (test_base.get_task_type()))

      saving_filepath_for_temp = os.path.join(saving_filepath, 'workertemp_1')
      os.mkdir(saving_filepath)
      os.mkdir(saving_filepath_for_temp)

      # Verify that even if `saving_filepath_for_temp` already exists,
      # TensorBoard can still save to the temporary directory.
      test_obj.assertTrue(file_io.file_exists(saving_filepath_for_temp))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])
Example 10
  def testDatasetFromFunction(self, strategy):
    def dataset_fn(input_context):
      global_batch_size = 10
      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
      d = dataset_ops.DatasetV2.range(100).repeat().batch(batch_size)
      return d.shard(input_context.num_input_pipelines,
                     input_context.input_pipeline_id)

    expected_sum_on_workers = {'chief': 10, 'worker': 35}
    input_iterator = iter(
        strategy.distribute_datasets_from_function(dataset_fn))

    @def_function.function
    def run(iterator):
      return strategy.experimental_local_results(iterator.get_next())

    result = run(input_iterator)
    sum_value = math_ops.reduce_sum(result)
    self.assertEqual(
        sum_value.numpy(),
        expected_sum_on_workers[multi_worker_test_base.get_task_type()])
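Note: with a global batch of 10 across two input pipelines, each pipeline batches range(100) by 5 and takes every other batch. The chief (pipeline 0) first yields [0-4], which sums to 10; the worker (pipeline 1) first yields [5-9], which sums to 35.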
Example 11
  def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
      model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath):
    # Incorporate type/index information and thread id in saving_filepath to
    # ensure every worker has a unique path. Note that in normal use case the
    # saving_filepath will be the same for all workers, but we use different
    # ones here just to test out chief saves checkpoint but non-chief doesn't.
    saving_filepath = os.path.join(
        test_obj.get_temp_dir(), 'checkpoint_%s_%d' %
        (test_base.get_task_type(), test_base.get_task_index()))

    # The saving_filepath shouldn't exist at the beginning (as it's unique).
    test_obj.assertFalse(os.path.exists(saving_filepath))

    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

    # If it's chief, the model should be saved; if not, the model shouldn't.
    test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
Example 12
def fn():
    for i in range(5):
        logging.info('%s-%d, i: %d',
                     multi_worker_test_base.get_task_type(),
                     self._worker_idx(), i)
        time.sleep(1)
Example 13
def wrapped_method(method_to_wrap, name, *arg, **kwargs):
    # Use a lock to make the += update thread-safe.
    with self._lock:
        self._task_dict[test_base.get_task_type()][
            test_base.get_task_index()][name] += 1
    method_to_wrap(*arg, **kwargs)
Example 14
def proc_func_that_adds_task_type_in_return_data():
    return multi_worker_test_base.get_task_type()
Example 15
def proc_func_expected_to_seg_fault():
    if multi_worker_test_base.get_task_type() == 'worker':
        time.sleep(10000)
    ctypes.string_at(0)  # Intentionally trigger a segfault.
Example 16
def proc_func_expected_to_exit_with_20():
    if multi_worker_test_base.get_task_type() == 'worker':
        time.sleep(10000)
    sys.exit(20)
Example 17
def proc_func():
    for i in range(50):
        logging.info('(logging) %s-%d, i: %d',
                     multi_worker_test_base.get_task_type(),
                     self._worker_idx(), i)
        time.sleep(1)
Example 18
def proc_func():
    time.sleep(1)
    if multi_worker_test_base.get_task_type() != 'chief':
        raise ValueError
Example 19
def proc_func_that_adds_task_type_in_return_data(test_obj, val):
    test_obj.assertEqual(val, 3)
    return multi_worker_test_base.get_task_type()
Example 20
def proc_func_that_adds_task_type_in_return_data(test_obj):
    test_obj.assertEqual(flags.FLAGS.test_flag, 3)
    return multi_worker_test_base.get_task_type()
Example 21
def wrapped_method(method_to_wrap, name, *arg, **kwargs):
  # Use a lock to make the += update thread-safe.
  with self._lock:
    self._task_dict[test_base.get_task_type()][
        test_base.get_task_index()][name] += 1
  method_to_wrap(*arg, **kwargs)
Example 22
def proc_func_that_adds_task_type_in_return_data(test_obj):
    multi_process_runner.add_return_data(
        multi_worker_test_base.get_task_type())
    test_obj.assertEqual(flags.FLAGS.test_flag, 3)
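A minimal sketch (an assumption, not part of the scraped tests) of how the per-task return data is read back. It uses the zero-argument variant from Example 14 and assumes the internal API in which `multi_process_runner.run` returns a result object whose `return_value` collects what each task returned (newer versions return values directly instead of calling `add_return_data`).

result = multi_process_runner.run(
    proc_func_that_adds_task_type_in_return_data,
    cluster_spec=multi_worker_test_base.create_cluster_spec(
        has_chief=True, num_workers=2))
task_types = list(result.return_value)  # e.g. ['chief', 'worker', 'worker']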