Example 1
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     master = tf_config.master()
     if tf_config.rpc_layer:
         # Strip off the rpc_layer prefix (e.g. "grpc://").
         master = master[len("%s://" % tf_config.rpc_layer):]
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         master=master,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer or "grpc",
     )
     # Always create the strategy in eager mode so that it starts the server and
     # configures the eager context. The eager context can no longer be
     # configured after initialization.
     with context.eager_mode():
         strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
             cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
Example 2
 def worker_fn():
     enable_collective_ops(
         cluster_resolver_lib.TFConfigClusterResolver())
     # There may be some delay before the server starts up; the health check
     # should eventually succeed.
     while True:
         try:
             for task in [
                     "/job:worker/replica:0/task:0",
                     "/job:worker/replica:0/task:1",
             ]:
                 context.context().check_collective_ops_peer_health(
                     task)
         except errors.UnavailableError:
             continue
         break
     multi_process_runner.barrier().wait()
Example 3
    def worker_step_fn(worker_id):
      strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
      # Make sure the processes are in sync after updating the cluster.
      multi_process_runner.barrier().wait()

      @def_function.function
      def run_reduce():
        with ops.device(self._local_device):
          t_in = array_ops.ones(tensor_shape) * worker_id
          return strategy.reduce(reduce_util.ReduceOp.MEAN, t_in, axis=None)

      t_out = run_reduce()
      # Element values from the workers are
      #     0, 1, ..., (NUM_WORKERS - 1)
      expected_mean = (NUM_WORKERS - 1) / 2
      expected_out = np.ones(tensor_shape) * expected_mean
      self.assertAllClose(t_out, expected_out)
Example 4
    def worker_step_fn(worker_id, num_dims):
      strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
      # Make sure the processes are in sync after updating the cluster.
      multi_process_runner.barrier().wait()
      tensor_shape = [2] * num_dims

      def variable_fn():
        with ops.device(self._local_device):
          # The initial value will be broadcast from worker 0 to the others.
          initial_value = (array_ops.ones(tensor_shape) if worker_id == 0 else
                           array_ops.zeros(tensor_shape))
          var = variable_scope.get_variable(name='x', initializer=initial_value)
          return array_ops.identity(var)

      t_out = strategy.extended.call_for_each_replica(variable_fn)
      expected_out = np.ones(tensor_shape)
      self.assertAllClose(t_out, expected_out)
Example 5
    def proc_tensorboard_works_with_same_file_path(test_obj, saving_filepath):
      model, _, train_ds, steps = _model_setup(test_obj, file_format='')
      num_epoch = 2

      # The saving_filepath shouldn't exist at the beginning (as it's unique).
      test_obj.assertFalse(file_io.file_exists(saving_filepath))

      multi_process_runner.barrier().wait()

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

      multi_process_runner.barrier().wait()

      test_obj.assertTrue(file_io.list_directory(saving_filepath))
Example 6
    def proc_model_checkpoint_works_with_same_file_path(test_obj,
                                                        saving_filepath):
      model, _, train_ds, steps = _model_setup(test_obj, file_format='')
      num_epoch = 4

      # The saving_filepath shouldn't exist at the beginning (as it's unique).
      test_obj.assertFalse(file_io.file_exists(saving_filepath))
      bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup')

      try:
        model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=[
                callbacks.ModelCheckpoint(filepath=saving_filepath),
                callbacks.BackupAndRestore(backup_dir=bar_dir),
                InterruptingCallback()
            ])
      except RuntimeError as e:
        if 'Interrupting!' not in str(e):
          raise

      multi_process_runner.barrier().wait()
      backup_filepath = os.path.join(bar_dir, 'checkpoint')
      test_obj.assertTrue(file_io.file_exists(backup_filepath))
      test_obj.assertTrue(file_io.file_exists(saving_filepath))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[
              callbacks.ModelCheckpoint(filepath=saving_filepath),
              callbacks.BackupAndRestore(backup_dir=bar_dir),
              AssertCallback()
          ])
      multi_process_runner.barrier().wait()
      test_obj.assertFalse(file_io.file_exists(backup_filepath))
      test_obj.assertTrue(file_io.file_exists(saving_filepath))
Example 7
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer,
     )
     strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
Example 8
 def _create_multi_worker_mirrored():
     tf_config = cluster_resolver.TFConfigClusterResolver()
     master = tf_config.master()
     if tf_config.rpc_layer:
         # Strip off the rpc_layer prefix (e.g. "grpc://").
         master = master[len("%s://" % tf_config.rpc_layer):]
     resolver = cluster_resolver.SimpleClusterResolver(
         cluster_spec=tf_config.cluster_spec(),
         task_type=tf_config.task_type,
         task_id=tf_config.task_id,
         master=master,
         environment=tf_config.environment,
         num_accelerators={"GPU": required_gpus},
         rpc_layer=tf_config.rpc_layer or "grpc",
     )
     # Disable the health check. We don't have a reliable way to shut down the
     # strategy (and thus the health check) at the end of a test. Turning on the
     # health check causes some flakiness, since we re-create part of the server
     # when creating a strategy, and our tests are capable of handling failures.
     CollectiveAllReduceExtended._enable_check_health = False  # pylint: disable=protected-access
     # Always create the strategy in eager mode so that it starts the server and
     # configures the eager context. The eager context can no longer be
     # configured after initialization.
     with context.eager_mode():
         strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
             cluster_resolver=resolver)
     # TODO(b/152320929): Wait for the cluster before proceeding, otherwise
     # collectives may hang if any worker launches collectives before the chief
     # creates the strategy.
     try:
         multi_process_runner.barrier().wait()
     except ValueError:
         # If the creator is called in the main process,
         # multi_process_runner.barrier() raises ValueError, which is safe to
         # ignore.
         pass
     return strategy
Example 9
        def worker_fn(attempts):
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
            task_id, attempt = get_attempt(strategy, attempts)

            if attempt == 2 and task_id == 1:
                multi_process_runner.barrier().wait()

            @tf.function
            def replica_fn():
                ctx = tf.distribute.get_replica_context()
                # Use a large tensor; a small tensor may hang regardless, even
                # when the worker recovers.
                value = tf.ones((64, 64))
                ctx.all_reduce(tf.distribute.ReduceOp.SUM, [value, value])

            strategy.run(replica_fn)
            # worker-1 dies here.
            if attempt == 1 and task_id == 1:
                quick_exit(1)
            # Make worker-0 wait for worker-1 to restart before entering the
            # next collective, to simulate a quick recovery of worker-1.
            if attempt == 1 and task_id == 0:
                multi_process_runner.barrier().wait()
            strategy.run(replica_fn)
Example 10
def proc_func_with_barrier():
    return multi_process_runner.barrier()
Example 11
 def test_barrier_called_in_main_process(self):
     with self.assertRaises(ValueError):
         multi_process_runner.barrier()
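
Taken together, Example 10 and Example 11 show the contract of multi_process_runner.barrier(): it returns a shared barrier only inside a subprocess launched by the runner, and raises ValueError when called from the main process. The sketch below illustrates that pattern end to end; it assumes TensorFlow's internal test utilities multi_process_runner.run, multi_process_runner.test_main, and multi_worker_test_base.create_cluster_spec with their usual signatures, and a return_value field on the runner result, so treat it as an outline rather than a verbatim test.

from tensorflow.python.distribute import multi_process_runner
from tensorflow.python.distribute import multi_worker_test_base
from tensorflow.python.platform import test


def fn_with_barrier():
  # Valid only inside a subprocess launched by multi_process_runner; in the
  # main process barrier() raises ValueError (see Example 11).
  multi_process_runner.barrier().wait()
  return 'synced'


class BarrierUsageTest(test.TestCase):

  def test_barrier_in_subprocesses(self):
    # Launch two worker subprocesses; each runs fn_with_barrier and blocks at
    # the barrier until both tasks have reached it.
    result = multi_process_runner.run(
        fn_with_barrier,
        cluster_spec=multi_worker_test_base.create_cluster_spec(num_workers=2))
    self.assertCountEqual(result.return_value, ['synced', 'synced'])


if __name__ == '__main__':
  # Multi-process tests are driven through test_main() so that worker
  # subprocesses can be spawned before the test body runs.
  multi_process_runner.test_main()
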
Example 12
        def proc_func(model_path, checkpoint_dir):
            global_batch_size = per_worker_batch_size * num_workers
            strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
            )
            with strategy.scope():
                multi_worker_model = build_and_compile_cnn_model()

            callbacks = [
                keras.callbacks.ModelCheckpoint(
                    filepath=os.path.join(self.get_temp_dir(), 'checkpoint'))
            ]

            multi_worker_dataset = mnist_dataset(global_batch_size)
            if shard_policy:
                options = dataset_ops.Options()
                options.experimental_distribute.auto_shard_policy = shard_policy
                multi_worker_dataset = multi_worker_dataset.with_options(
                    options)

            multi_worker_model.fit(multi_worker_dataset,
                                   epochs=2,
                                   steps_per_epoch=20,
                                   callbacks=callbacks)

            def _is_chief(task_type, task_id):
                return task_type is None or task_type == 'chief' or (
                    task_type == 'worker' and task_id == 0)

            def _get_temp_dir(dirpath, task_id):
                base_dirpath = 'workertemp_' + str(task_id)
                temp_dir = os.path.join(dirpath, base_dirpath)
                file_io.recursive_create_dir_v2(temp_dir)
                return temp_dir

            def write_filepath(filepath, task_type, task_id):
                dirpath = os.path.dirname(filepath)
                base = os.path.basename(filepath)
                if not _is_chief(task_type, task_id):
                    dirpath = _get_temp_dir(dirpath, task_id)
                return os.path.join(dirpath, base)

            task_type, task_id = (strategy.cluster_resolver.task_type,
                                  strategy.cluster_resolver.task_id)
            write_model_path = write_filepath(model_path, task_type, task_id)

            multi_worker_model.save(write_model_path)
            if not _is_chief(task_type, task_id):
                file_io.delete_recursively_v2(
                    os.path.dirname(write_model_path))

            # Make sure chief finishes saving before non-chief's assertions.
            multi_process_runner.barrier().wait()

            if not file_io.file_exists_v2(model_path):
                raise RuntimeError()
            if file_io.file_exists_v2(write_model_path) != _is_chief(
                    task_type, task_id):
                raise RuntimeError()

            loaded_model = keras.saving.save.load_model(model_path)
            loaded_model.fit(multi_worker_dataset,
                             epochs=2,
                             steps_per_epoch=20)

            checkpoint = tracking_util.Checkpoint(model=multi_worker_model)
            write_checkpoint_dir = write_filepath(checkpoint_dir, task_type,
                                                  task_id)
            checkpoint_manager = checkpoint_management.CheckpointManager(
                checkpoint, directory=write_checkpoint_dir, max_to_keep=1)

            checkpoint_manager.save()
            if not _is_chief(task_type, task_id):
                file_io.delete_recursively_v2(write_checkpoint_dir)

            # Make sure chief finishes saving before non-chief's assertions.
            multi_process_runner.barrier().wait()

            if not file_io.file_exists_v2(checkpoint_dir):
                raise RuntimeError()
            if file_io.file_exists_v2(write_checkpoint_dir) != _is_chief(
                    task_type, task_id):
                raise RuntimeError()

            latest_checkpoint = checkpoint_management.latest_checkpoint(
                checkpoint_dir)
            checkpoint.restore(latest_checkpoint)
            multi_worker_model.fit(multi_worker_dataset,
                                   epochs=2,
                                   steps_per_epoch=20)

            logging.info('testMultiWorkerTutorial successfully ends')
Example 13
def fn_with_barrier():
    return multi_process_runner.barrier()