Example #1

def worker_fn():
  enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
  # There may be some delay before the server starts up; the health check
  # should eventually succeed.
  while True:
    try:
      for task in [
          "/job:worker/replica:0/task:0",
          "/job:worker/replica:0/task:1",
      ]:
        context.context().check_collective_ops_peer_health(task)
    except errors.UnavailableError:
      continue
    break
  multi_process_runner.get_barrier().wait()
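
worker_fn above is only meaningful inside processes spawned by multi_process_runner: that is where TF_CONFIG is populated for TFConfigClusterResolver and where get_barrier() returns a usable barrier. A minimal driver sketch, assuming TensorFlow's internal test helpers and a two-worker cluster spec (the driver below is illustrative and not part of the original test; a real test module would also use multi_process_runner.test_main() as its entry point):

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base

# One process is spawned per task in the cluster spec; each one receives a
# TF_CONFIG environment variable describing the cluster and its own task,
# which is what TFConfigClusterResolver() reads inside worker_fn.
cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
multi_process_runner.run(worker_fn, cluster_spec=cluster_spec)
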
Example #2

def worker_step_fn(worker_id):
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
  # Make sure the processes are in sync after updating the cluster.
  multi_process_runner.get_barrier().wait()

  @def_function.function
  def run_reduce():
    with ops.device(self._local_device):
      t_in = array_ops.ones(tensor_shape) * worker_id
      return strategy.reduce(reduce_util.ReduceOp.MEAN, t_in, axis=None)

  t_out = run_reduce()
  # Element values from the workers are
  #     0, 1, ..., (NUM_WORKERS - 1)
  expected_mean = (NUM_WORKERS - 1) / 2
  expected_out = np.ones(tensor_shape) * expected_mean
  self.assertAllClose(t_out, expected_out)
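
As a quick sanity check on the expected value, assuming NUM_WORKERS = 2 (a hypothetical value, not fixed by the snippet):

NUM_WORKERS = 2                      # assumed for illustration
worker_values = range(NUM_WORKERS)   # each worker contributes its worker_id
# The element-wise mean across workers is (0 + 1) / 2 = 0.5,
# which equals (NUM_WORKERS - 1) / 2.
assert sum(worker_values) / NUM_WORKERS == (NUM_WORKERS - 1) / 2
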
Example #3

def worker_step_fn(worker_id, num_dims):
  strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
  # Make sure the processes are in sync after updating the cluster.
  multi_process_runner.get_barrier().wait()
  tensor_shape = [2] * num_dims

  def variable_fn():
    with ops.device(self._local_device):
      # The initial value is broadcast from worker 0 to the others.
      initial_value = (array_ops.ones(tensor_shape) if worker_id == 0 else
                       array_ops.zeros(tensor_shape))
      var = variable_scope.get_variable(name='x', initializer=initial_value)
      return array_ops.identity(var)

  t_out = strategy.extended.call_for_each_replica(variable_fn)
  expected_out = np.ones(tensor_shape)
  self.assertAllClose(t_out, expected_out)
Example #4

def proc_tensorboard_works_with_same_file_path(test_obj, saving_filepath):
  model, _, train_ds, steps = _model_setup(test_obj, file_format='')
  num_epoch = 2

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(file_io.file_exists_v2(saving_filepath))

  multi_process_runner.get_barrier().wait()

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

  multi_process_runner.get_barrier().wait()

  test_obj.assertTrue(file_io.list_directory_v2(saving_filepath))
Example #5

    def _create_multi_worker_mirrored():
        tf_config = cluster_resolver.TFConfigClusterResolver()
        master = tf_config.master()
        if tf_config.rpc_layer:
            # Strip off the rpc_layer suffix.
            master = master[len("%s://" % tf_config.rpc_layer):]
        resolver = cluster_resolver.SimpleClusterResolver(
            cluster_spec=tf_config.cluster_spec(),
            task_type=tf_config.task_type,
            task_id=tf_config.task_id,
            master=master,
            environment=tf_config.environment,
            num_accelerators={"GPU": required_gpus},
            rpc_layer=tf_config.rpc_layer or "grpc",
        )
        # Disable the health check and the coordination service. We don't have
        # a reliable way to shut down the strategy (and thus the strategy health
        # check or coordination service heartbeat) at the end of a test. Turning
        # on the strategy health check or coordination service heartbeat causes
        # some flakiness, since we re-create part of the server when creating a
        # strategy, and our tests are capable of handling failures.
        CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
        context.context().configure_coordination_service(service_type="")
        # Always create the strategy in eager mode so that it starts the server and
        # configures the eager context. The eager context can no longer be
        # configured after initialization.
        with context.eager_mode():
            strategy = CollectiveAllReduceStrategy(cluster_resolver=resolver)

        if not use_merge_call:
            strategy.extended._use_merge_call = lambda: False  # pylint: disable=protected-access
        # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
        # collectives may hang if any worker launches collectives before the chief
        # creates the strategy.
        try:
            multi_process_runner.get_barrier().wait()
        except ValueError:
            # If the creator is called in the main process,
            # multi_process_runner.get_barrier() raises ValueError, which is safe to
            # ignore.
            pass
        return strategy
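
The creator above is meant to run inside each spawned task (where TF_CONFIG is already set) and is a closure over required_gpus and use_merge_call from its enclosing scope. As a rough, hypothetical sketch of how the resulting strategy gets exercised, each worker could run a step and all-reduce the result; the helper name _worker_fn, the two-worker cluster spec, and the driver itself are assumptions, not part of the original:

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base
from tensorflow.python.distribute import reduce_util
from tensorflow.python.framework import constant_op


def _worker_fn():
  strategy = _create_multi_worker_mirrored()

  def step():
    # One replica per worker contributes 1.0.
    return constant_op.constant(1.0)

  per_replica = strategy.run(step)
  # The SUM reduction runs a collective all-reduce across both workers,
  # so every task should see 2.0.
  return strategy.reduce(
      reduce_util.ReduceOp.SUM, per_replica, axis=None).numpy()


result = multi_process_runner.run(
    _worker_fn,
    cluster_spec=multi_worker_test_base.create_cluster_spec(num_workers=2))
print(result.return_value)  # Roughly: [2.0, 2.0]
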
Example #6

def proc_model_checkpoint_works_with_same_file_path(
        test_obj, saving_filepath):
    if multi_process_runner.is_oss():
        test_obj.skipTest('TODO(b/170838633): Failing in OSS')
    model, _, train_ds, steps = _model_setup(test_obj, file_format='')
    num_epoch = 4

    # The saving_filepath shouldn't exist at the beginning (as it's unique).
    test_obj.assertFalse(file_io.file_exists_v2(saving_filepath))
    bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup')

    try:
        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[
                callbacks.ModelCheckpoint(filepath=saving_filepath),
                callbacks.BackupAndRestore(backup_dir=bar_dir),
                InterruptingCallback()
            ])
    except RuntimeError as e:
        if 'Interrupting!' not in str(e):
            raise

    multi_process_runner.get_barrier().wait()
    backup_filepath = os.path.join(bar_dir, 'chief', 'checkpoint')
    test_obj.assertTrue(file_io.file_exists_v2(backup_filepath))
    test_obj.assertTrue(file_io.file_exists_v2(saving_filepath))

    model.fit(x=train_ds,
              epochs=num_epoch,
              steps_per_epoch=steps,
              callbacks=[
                  callbacks.ModelCheckpoint(filepath=saving_filepath),
                  callbacks.BackupAndRestore(backup_dir=bar_dir),
                  AssertCallback()
              ])
    multi_process_runner.get_barrier().wait()
    test_obj.assertFalse(file_io.file_exists_v2(backup_filepath))
    test_obj.assertTrue(file_io.file_exists_v2(saving_filepath))
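
For reference, the same ModelCheckpoint plus BackupAndRestore pairing is available through the public Keras API (TF 2.6+). The tiny model, the paths, and the single-process setup below are illustrative assumptions, not taken from the test above:

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer='sgd', loss='mse')

callbacks = [
    # Writes a checkpoint at the end of every epoch.
    tf.keras.callbacks.ModelCheckpoint(filepath='/tmp/ckpt/weights'),
    # Backs up training state; if fit() is interrupted and re-run with the
    # same backup_dir, training resumes from the last completed epoch, and
    # the backup is deleted once fit() finishes, which is what the
    # assertions on backup_filepath above rely on.
    tf.keras.callbacks.BackupAndRestore(backup_dir='/tmp/backup'),
]
model.fit(tf.zeros([8, 4]), tf.zeros([8, 1]), epochs=4, callbacks=callbacks)
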
Example #7

def enable_collective_ops_with_barrier(cluster_resolver):
  # Wait until every task is ready to (re)configure collective ops, enable
  # them, and then wait again so that no task starts issuing collectives
  # before the whole cluster has finished the setup.
  multi_process_runner.get_barrier().wait()
  enable_collective_ops(cluster_resolver)
  multi_process_runner.get_barrier().wait()
Example #8

def fn_with_barrier():
    return multi_process_runner.get_barrier()
Example #9

def test_barrier_called_in_main_process(self):
    with self.assertRaises(ValueError):
        multi_process_runner.get_barrier()
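
Example #9 shows the main-process behaviour; a slightly fuller sketch contrasting it with the in-subprocess behaviour might look like the following. The two-worker cluster spec, the test class, and _worker_fn are assumptions, not part of the original:

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base
from tensorflow.python.platform import test


def _worker_fn():
  # Inside a spawned task the barrier exists and blocks until every task
  # in the cluster spec has reached it.
  multi_process_runner.get_barrier().wait()
  return 'synced'


class BarrierScopeTest(test.TestCase):

  def test_barrier_scope(self):
    # In the main process there is no barrier to return.
    with self.assertRaises(ValueError):
      multi_process_runner.get_barrier()
    # In spawned workers the same call succeeds.
    result = multi_process_runner.run(
        _worker_fn,
        cluster_spec=multi_worker_test_base.create_cluster_spec(num_workers=2))
    self.assertCountEqual(result.return_value, ['synced', 'synced'])


if __name__ == '__main__':
  multi_process_runner.test_main()
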
Example #10

def fn(model_path, checkpoint_dir):
    global_batch_size = per_worker_batch_size * num_workers
    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    callbacks = [
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(self.get_temp_dir(), 'checkpoint'))
    ]

    multi_worker_dataset = mnist_dataset(global_batch_size)
    if shard_policy:
        options = dataset_ops.Options()
        options.experimental_distribute.auto_shard_policy = shard_policy
        multi_worker_dataset = multi_worker_dataset.with_options(options)

    multi_worker_model.fit(multi_worker_dataset,
                           epochs=2,
                           steps_per_epoch=20,
                           callbacks=callbacks)

    def _is_chief(task_type, task_id):
        return task_type is None or task_type == 'chief' or (
            task_type == 'worker' and task_id == 0)

    def _get_temp_dir(dirpath, task_id):
        base_dirpath = 'workertemp_' + str(task_id)
        temp_dir = os.path.join(dirpath, base_dirpath)
        file_io.recursive_create_dir_v2(temp_dir)
        return temp_dir

    def write_filepath(filepath, task_type, task_id):
        dirpath = os.path.dirname(filepath)
        base = os.path.basename(filepath)
        if not _is_chief(task_type, task_id):
            dirpath = _get_temp_dir(dirpath, task_id)
        return os.path.join(dirpath, base)

    task_type, task_id = (strategy.cluster_resolver.task_type,
                          strategy.cluster_resolver.task_id)
    write_model_path = write_filepath(model_path, task_type, task_id)

    multi_worker_model.save(write_model_path)
    if not _is_chief(task_type, task_id):
        file_io.delete_recursively_v2(os.path.dirname(write_model_path))

    # Make sure chief finishes saving before non-chief's assertions.
    multi_process_runner.get_barrier().wait()

    if not file_io.file_exists_v2(model_path):
        raise RuntimeError()
    if file_io.file_exists_v2(write_model_path) != _is_chief(
            task_type, task_id):
        raise RuntimeError()

    loaded_model = keras.saving.save.load_model(model_path)
    loaded_model.fit(multi_worker_dataset,
                     epochs=2,
                     steps_per_epoch=20)

    checkpoint = tracking_util.Checkpoint(model=multi_worker_model)
    write_checkpoint_dir = write_filepath(checkpoint_dir, task_type,
                                          task_id)
    checkpoint_manager = checkpoint_management.CheckpointManager(
        checkpoint, directory=write_checkpoint_dir, max_to_keep=1)

    checkpoint_manager.save()
    if not _is_chief(task_type, task_id):
        file_io.delete_recursively_v2(write_checkpoint_dir)

    # Make sure chief finishes saving before non-chief's assertions.
    multi_process_runner.get_barrier().wait()

    if not file_io.file_exists_v2(checkpoint_dir):
        raise RuntimeError()
    if file_io.file_exists_v2(write_checkpoint_dir) != _is_chief(
            task_type, task_id):
        raise RuntimeError()

    latest_checkpoint = checkpoint_management.latest_checkpoint(
        checkpoint_dir)
    checkpoint.restore(latest_checkpoint)
    multi_worker_model.fit(multi_worker_dataset,
                           epochs=2,
                           steps_per_epoch=20)

    logging.info('testMultiWorkerTutorial successfully ends')
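
The fn above relies on strategy.cluster_resolver to report task_type and task_id, which come from TF_CONFIG. When the test is driven by multi_process_runner, that variable is set automatically for every spawned task; for reference, an equivalent hand-written configuration for a two-worker cluster would look roughly like this (the addresses are placeholders):

import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': ['localhost:12345', 'localhost:23456'],
    },
    # The second worker would use {'type': 'worker', 'index': 1}.
    'task': {'type': 'worker', 'index': 0},
})
# With this in place, CollectiveAllReduceStrategy() resolves
# task_type='worker' and task_id=0, so _is_chief() treats this task as the
# chief and it saves to the real model_path instead of a temporary directory.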