def run_independent_workers(self,
                              worker_fn,
                              strategy_cls,
                              num_workers,
                              num_ps=None,
                              **kwargs):
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=num_workers, num_ps=num_ps)
    self._barrier = dc._Barrier(num_workers + (num_ps or 0))  # pylint: disable=protected-access

    def _worker_fn(**kwargs):
      """Runs the worker function in a thread."""
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        strategy = get_strategy_object(strategy_cls)
        with strategy.scope():
          return worker_fn(**kwargs)

    threads = self.run_multiple_tasks_in_threads(_worker_fn, cluster_spec,
                                                 **kwargs)
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      threads_to_join = [
          ts for task_type, ts in threads.items() if task_type == 'ps'
      ]
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
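For reference, the cluster spec returned by `create_cluster_spec` is just a dictionary mapping task types to lists of task addresses, with free ports picked at runtime. A minimal hand-written equivalent (the addresses below are purely illustrative) and the `TF_CONFIG` environment variable each task would typically see might look like this:

import json
import os

# Illustrative only: create_cluster_spec picks free ports automatically.
cluster_spec = {
    'worker': ['localhost:12345', 'localhost:23456'],
    'ps': ['localhost:34567'],
}

# Each task is usually driven by a TF_CONFIG of this shape, identifying the
# cluster plus the task's own type and index.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster_spec,
    'task': {'type': 'worker', 'index': 0},
})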
Example 2
  def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
    num_workers = 2
    num_epoch = 2
    cluster_spec = test_base.create_cluster_spec(
        num_workers=num_workers, num_ps=2)
    self._barrier = dc._Barrier(4)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      # TODO(rchao/yuefengz): The following is run by both worker and ps
      # threads. The distribute coordinator should run std server immediately
      # without configuring the session (or building the graph) on PS.
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        batch_size = 64
        steps = 2
        strategy = strategy_cls()
        verification_callback.is_between_graph = \
            strategy.extended.experimental_between_graph

        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))

          # TODO(b/123868066): Verify callback for model.evaluate().
          callbacks_for_fit = nest.flatten(
              kwargs.get('verification_callback', []))
          history = model.fit(
              x=train_ds,
              epochs=num_epoch,
              steps_per_epoch=steps,
              validation_data=val_ds,
              validation_steps=steps,
              callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    threads_to_join = []
    for task_type, ts in threads.items():
      # This test can finish once the worker threads complete, and thus
      # the ps threads don't need to be joined.
      if task_type == 'ps':
        continue
      threads_to_join.extend(ts)
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
Example 3
    def test_process_exists(self):
        def fn():
            time.sleep(100000)

        mpr = multi_process_runner.MultiProcessRunner(
            fn, multi_worker_test_base.create_cluster_spec(num_workers=1))
        mpr.start()
        self.assertTrue(mpr.process_exists('worker', 0))
        mpr.terminate('worker', 0)
        # Worker 0 should exit at some point, or else the test would time out.
        while mpr.process_exists('worker', 0):
            time.sleep(1)
Example 4
    def test_error_reporting_overrides_timeout_reporting(self):
        def fn():
            if self._worker_idx() == 1:
                time.sleep(10000)
            raise ValueError('Worker 0 errored')

        mpr = multi_process_runner.MultiProcessRunner(
            fn, multi_worker_test_base.create_cluster_spec(num_workers=2))
        mpr.start()

        with self.assertRaisesRegex(ValueError, 'Worker 0 errored'):
            mpr.join(timeout=20)
Example 5
    def test_template(self, strategy_cls, file_format):
        num_workers = 2
        num_epoch = 2

        cluster_spec = test_base.create_cluster_spec(num_workers=num_workers,
                                                     test_obj=self)
        self._barrier = dc._Barrier(2)

        def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
            """Simulates an Independent Worker inside of a thread."""
            with test.mock.patch.object(dc, '_run_std_server',
                                        self._make_mock_run_std_server()):
                strategy = get_strategy_object(strategy_cls)
                batch_size = 64
                steps = 2
                train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
                    batch_size, steps)
                with strategy.scope():
                    model = multi_worker_testing_utils.get_mnist_model(
                        (28, 28, 1))

                custom_callable(model,
                                self,
                                train_ds,
                                num_epoch,
                                steps,
                                strategy,
                                saving_filepath=kwargs['saving_filepath'],
                                barrier=kwargs['barrier'],
                                threading_local=kwargs['threading_local'])

        # Pass saving_filepath from the parent thread to ensure every worker has the
        # same filepath to save.
        saving_filepath = os.path.join(self.get_temp_dir(),
                                       'checkpoint.' + file_format)
        barrier = dc._Barrier(2)
        threading_local = threading.local()
        threads = self.run_multiple_tasks_in_threads(
            _independent_worker_fn,
            cluster_spec,
            saving_filepath=saving_filepath,
            barrier=barrier,
            threading_local=threading_local)
        self.assertFalse(training_state.checkpoint_exists(saving_filepath))

        threads_to_join = []
        strategy = get_strategy_object(strategy_cls)
        if strategy.extended.experimental_between_graph:
            for ts in threads.values():
                threads_to_join.extend(ts)
        else:
            threads_to_join = [threads['worker'][0]]
        self.join_independent_workers(threads_to_join)
  def testCheckHealthInvalidPeer(self):

    def worker_fn():
      enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
      context.context().check_collective_ops_peer_health(
          "localhost:12345", timeout_in_ms=1000)

    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
    mpr = multi_process_runner.MultiProcessRunner(worker_fn, cluster_spec)
    mpr.start_single_process("worker", 0)
    with self.assertRaises(errors.InvalidArgumentError):
      mpr.join()
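The `enable_collective_ops` helper used in this worker function is not part of the snippet; a plausible sketch, modeled on the ServerDef construction shown in a later example and assuming the resolver exposes the usual task_type/task_id/rpc_layer attributes, would be:

from tensorflow.core.protobuf import tensorflow_server_pb2
from tensorflow.python.eager import context


def enable_collective_ops(cluster_resolver):
  # Build a ServerDef for the current task from the cluster resolver and
  # enable collective ops on the eager context (a sketch, not the exact helper).
  server_def = tensorflow_server_pb2.ServerDef(
      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
      job_name=cluster_resolver.task_type,
      task_index=cluster_resolver.task_id,
      protocol=cluster_resolver.rpc_layer or 'grpc',
      port=0)
  context.context().enable_collective_ops(server_def)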
    def test_auto_restart_and_timeout(self):
        def proc_func():
            time.sleep(1)
            raise ValueError

        mpr = multi_process_runner.MultiProcessRunner(
            proc_func,
            multi_worker_test_base.create_cluster_spec(num_workers=1),
            auto_restart=True)
        mpr.start()
        with self.assertRaises(multi_process_runner.SubprocessTimeoutError):
            mpr.join(timeout=10)
 def testArbitraryJobName(self):
     cluster_def = multi_worker_test_base.create_cluster_spec(
         num_workers=1, num_ps=1, has_chief=True)
     cluster_def["some_arbitrary_name"] = [
         "localhost:%d" % multi_worker_test_base.pick_unused_port()
     ]
     cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def),
                                              rpc_layer="grpc")
     with self.assertRaisesRegexp(ValueError,
                                  "Disallowed task type found in"):
         parameter_server_strategy_v2.ParameterServerStrategyV2(
             cluster_resolver)
    def test_timeout_none(self):
        def proc_func():
            time.sleep(250)
            raise ValueError('Worker 0 errored')

        mpr = multi_process_runner.MultiProcessRunner(
            proc_func,
            multi_worker_test_base.create_cluster_spec(num_workers=1))

        mpr.start()
        with self.assertRaisesRegex(ValueError, 'Worker 0 errored'):
            mpr.join(timeout=None)
 def test_signal_doesnt_fire_after_process_exits(self):
     mpr = multi_process_runner.MultiProcessRunner(
         proc_func_that_does_nothing,
         multi_worker_test_base.create_cluster_spec(num_workers=1),
         max_run_time=10)
     mpr.start()
     mpr.join()
     with self.assertRaisesRegexp(Queue.Empty, ''):
         # If the signal was fired, another message would have been added to the
         # internal queue, so we verify that it's empty.
         multi_process_runner._resource(
             multi_process_runner.PROCESS_STATUS_QUEUE).get(block=False)
  def test_tf_config(self):
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        has_chief=True, num_workers=2)
    runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
    result = runner.run(fn_that_adds_task_type_in_return_data)

    job_count_dict = {'worker': 2, 'chief': 1}
    for data in result:
      job_count_dict[data] -= 1

    self.assertEqual(job_count_dict['worker'], 0)
    self.assertEqual(job_count_dict['chief'], 0)
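The `fn_that_adds_task_type_in_return_data` function is not shown here; a minimal sketch, assuming each subprocess simply reports its own task type as its return value, could be:

from tensorflow.python.distribute import multi_worker_test_base


def fn_that_adds_task_type_in_return_data():
  # Hypothetical sketch: return this task's type ('chief' or 'worker') so the
  # pool runner collects one entry per process.
  return multi_worker_test_base.get_task_type()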
Example 12
    def test_complete_flow_independent_worker_between_graph(
            self, train_distribute_cls, eval_distribute_cls):
        if (context.num_gpus() < 2 and eval_distribute_cls
                == collective_all_reduce_strategy.CollectiveAllReduceStrategy):
            self.skipTest(
                "`CollectiveAllReduceStrategy` needs at least two towers.")

        train_distribute = self._get_strategy_object(train_distribute_cls)

        if eval_distribute_cls:
            eval_distribute = self._get_strategy_object(eval_distribute_cls,
                                                        eval_strategy=True)
        else:
            eval_distribute = None

        if (train_distribute_cls ==
                parameter_server_strategy.ParameterServerStrategy):
            cluster_spec = multi_worker_test_base.create_cluster_spec(
                num_workers=3, num_ps=2, has_eval=True)
            # 3 workers, 2 ps and 1 evaluator.
            self._barrier = dc._Barrier(6)
        else:
            cluster_spec = multi_worker_test_base.create_cluster_spec(
                num_workers=3, num_ps=0, has_eval=True)
            # 3 workers and 1 evaluator.
            self._barrier = dc._Barrier(4)

        threads = self.run_multiple_tasks_in_threads(
            self._independent_worker_fn, cluster_spec, train_distribute,
            eval_distribute)
        threads_to_join = []
        for task_type, ts in threads.items():
            if task_type == PS:
                continue
            for t in ts:
                threads_to_join.append(t)
        self.join_independent_workers(threads_to_join)

        estimator = self._get_estimator(train_distribute, eval_distribute)
        self._inspect_train_and_eval_events(estimator)
Example 13
  def test_streaming(self):

    def proc_func():
      for i in range(5):
        logging.info('(logging) %s-%d, i: %d',
                     multi_worker_test_base.get_task_type(), self._worker_idx(),
                     i)
        print(
            '(print) {}-{}, i: {}'.format(
                multi_worker_test_base.get_task_type(), self._worker_idx(), i),
            flush=True)
        time.sleep(1)

    mpr = multi_process_runner.MultiProcessRunner(
        proc_func,
        multi_worker_test_base.create_cluster_spec(
            has_chief=True, num_workers=2, num_ps=2, has_eval=True),
        list_stdout=True)
    mpr._dependence_on_chief = False

    mpr.start()
    mpr.start_single_process('worker', 2)
    mpr.start_single_process('ps', 2)
    mpr_result = mpr.join()

    list_to_assert = mpr_result.stdout

    for job in ['chief', 'evaluator']:
      for iteration in range(5):
        self.assertTrue(
            any('(logging) {}-0, i: {}'.format(job, iteration) in line
                for line in list_to_assert))
        self.assertTrue(
            any('(print) {}-0, i: {}'.format(job, iteration) in line
                for line in list_to_assert))

    for job in ['worker', 'ps']:
      for iteration in range(5):
        for task in range(3):
          self.assertTrue(
              any('(logging) {}-{}, i: {}'.format(job, task, iteration) in line
                  for line in list_to_assert))
          self.assertTrue(
              any('(print) {}-{}, i: {}'.format(job, task, iteration) in line
                  for line in list_to_assert))
        task = 3
        self.assertFalse(
            any('(logging) {}-{}, i: {}'.format(job, task, iteration) in line
                for line in list_to_assert))
        self.assertFalse(
            any('(print) {}-{}, i: {}'.format(job, task, iteration) in line
                for line in list_to_assert))
Example 14
  def test_basic_run(self):
    has_chief = False
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        has_chief=has_chief,
        num_workers=CLUSTER_SIZE)
    maintenance_event = multi_process_runner.manager().Event()
    training_finished = multi_process_runner.manager().Event()

    checkpoint_dir = os.path.join(self.get_temp_dir(), 'fh_ckpt')

    if _is_oss():
      rpc_layer = 'grpc'
    else:
      rpc_layer = 'grpc+loas'

    mpr = multi_process_runner.MultiProcessRunner(
        self.worker_fn,
        cluster_spec,
        args=(checkpoint_dir, cluster_spec, maintenance_event,
              training_finished),
        rpc_layer=rpc_layer,
        return_output=True,
        dependence_on_chief=has_chief)

    logging.info('Cluster starting.')
    mpr.start()

    while (not maintenance_event.is_set()) and (not training_finished.is_set()):
      time.sleep(1)

    time.sleep(5)
    if not training_finished.is_set():
      logging.info('restarting workers')
      for worker_id in range(CLUSTER_SIZE):
        mpr.start_single_process('worker', worker_id, cluster_spec)
      logging.info('workers restarted')

    stdout = mpr.join().stdout
    if maintenance_event.is_set():
      all_start_point = []
      for msg in stdout:
        matched_group = re.search(r'.*Start training at (\d+)', msg)

        if matched_group:
          all_start_point.append(int(matched_group.group(1)))

      # remove duplicate logs created due to presence of multiple workers
      start_points = all_start_point[::CLUSTER_SIZE]

      if len(start_points) > 1:
        # assert that after restarting, we don't repeat previous training steps
        self.assertNotEqual(start_points[-1], 0)
Example 15
  def test_preemption_checkpointing(self):
    has_chief = False
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        has_chief=has_chief,
        num_workers=CLUSTER_SIZE)
    training_started_event = multi_process_runner.manager().Event()

    checkpoint_dir = os.path.join(self.get_temp_dir(), 'fh_ckpt')

    if _is_oss():
      rpc_layer = 'grpc'
    else:
      rpc_layer = 'grpc+loas'

    mpr = multi_process_runner.MultiProcessRunner(
        self.worker_fn,
        cluster_spec,
        args=(checkpoint_dir, cluster_spec, [training_started_event]),
        rpc_layer=rpc_layer,
        return_output=True,
        dependence_on_chief=has_chief)

    logging.info('Cluster starting.')
    mpr.start()
    while not training_started_event.is_set():
      time.sleep(1)

    logging.info('sending sigterm')
    killed_worker = random.randrange(0, CLUSTER_SIZE)
    os.kill(mpr.get_process_id('worker', killed_worker), signal.SIGTERM)

    logging.info('sigterm sent')
    time.sleep(5)

    logging.info('restarting workers')
    for worker_id in range(CLUSTER_SIZE):
      mpr.start_single_process('worker', worker_id, cluster_spec)
    logging.info('workers restarted')

    stdout = mpr.join().stdout
    all_start_point = []
    for msg in stdout:
      matched_group = re.search(r'.*Restored training at (\d+)', msg)

      if matched_group:
        all_start_point.append(int(matched_group.group(1)))

    # remove duplicate logs created due to presence of multiple workers
    start_points = all_start_point[::CLUSTER_SIZE]

    # assert that after restarting, we don't repeat previous training steps
    self.assertNotEqual(start_points[-1], 0)
  def test_model_checkpoint_saves_on_chief_but_not_otherwise(
      self, file_format, mode, save_weights_only):

    def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
        test_obj, file_format):

      model, saving_filepath, train_ds, steps = _model_setup(
          test_obj, file_format)
      num_epoch = 2
      extension = os.path.splitext(saving_filepath)[1]

      # Incorporate type/index information and thread id in saving_filepath to
      # ensure every worker has a unique path. Note that in the normal use case
      # the saving_filepath will be the same for all workers, but we use different
      # ones here just to test that the chief saves the checkpoint but non-chief
      # workers don't.
      task_config = _get_task_config()
      saving_filepath = os.path.join(
          test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
          (task_config['type'], task_config['index'], extension))

      # The saving_filepath shouldn't exist at the beginning (as it's unique).
      test_obj.assertFalse(checkpoint_exists(saving_filepath))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          validation_data=train_ds,
          validation_steps=steps,
          callbacks=[
              callbacks.ModelCheckpoint(
                  filepath=saving_filepath, save_weights_only=save_weights_only)
          ])

      # If it's chief, the model should be saved; if not, the model shouldn't.
      test_obj.assertEqual(
          checkpoint_exists(saving_filepath), test_base.is_chief())

      # If it's chief, the model should be saved (`write_filepath` should
      # simply return `saving_filepath`); if not, i.e. for non-chief workers,
      # the temporary path generated by `write_filepath` should no longer
      # contain the checkpoint that has been deleted.
      test_obj.assertEqual(
          checkpoint_exists(
              distributed_file_utils.write_filepath(
                  saving_filepath, model._distribution_strategy)),
          test_base.is_chief())

    multi_process_runner.run(
        proc_model_checkpoint_saves_on_chief_but_not_otherwise,
        cluster_spec=test_base.create_cluster_spec(num_workers=2),
        args=(self, file_format))
Example 17
  def test_multi_process_runner(self):
    mpr_result = multi_process_runner.run(
        proc_func_that_adds_task_type_in_return_data,
        multi_worker_test_base.create_cluster_spec(
            num_workers=2, num_ps=3, has_eval=1))

    job_count_dict = {'worker': 2, 'ps': 3, 'evaluator': 1}
    for data in mpr_result.return_value:
      job_count_dict[data] -= 1

    self.assertEqual(job_count_dict['worker'], 0)
    self.assertEqual(job_count_dict['ps'], 0)
    self.assertEqual(job_count_dict['evaluator'], 0)
  def test_complete_flow_independent_worker_between_graph(
      self, train_distribute_cls, eval_distribute_cls):
    if (context.num_gpus() < 2 and eval_distribute_cls ==
        collective_all_reduce_strategy.CollectiveAllReduceStrategy):
      self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.")

    train_distribute = self._get_strategy_object(train_distribute_cls)

    if eval_distribute_cls:
      eval_distribute = self._get_strategy_object(
          eval_distribute_cls, eval_strategy=True)
    else:
      eval_distribute = None

    if (train_distribute_cls == parameter_server_strategy
        .ParameterServerStrategy):
      cluster_spec = multi_worker_test_base.create_cluster_spec(
          num_workers=3, num_ps=2, has_eval=True)
      # 3 workers, 2 ps and 1 evaluator.
      self._barrier = dc._Barrier(6)
    else:
      cluster_spec = multi_worker_test_base.create_cluster_spec(
          num_workers=3, num_ps=0, has_eval=True)
      # 3 workers and 1 evaluator.
      self._barrier = dc._Barrier(4)

    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                                 cluster_spec, train_distribute,
                                                 eval_distribute)
    threads_to_join = []
    for task_type, ts in threads.items():
      if task_type == PS:
        continue
      for t in ts:
        threads_to_join.append(t)
    self.join_independent_workers(threads_to_join)

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
  def testCheckHealthPeerDown(self):

    def worker_fn():
      enable_collective_ops(cluster_resolver_lib.TFConfigClusterResolver())
      context.context().check_collective_ops_peer_health(
          "/job:worker/replica:0/task:1", timeout_in_ms=1000)

    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
    mpr = multi_process_runner.MultiProcessRunner(worker_fn, cluster_spec)
    mpr.start_single_process("worker", 0)
    with self.assertRaises(
        (errors.UnavailableError, errors.DeadlineExceededError)):
      mpr.join()
Example 20
    def test_exit_code_is_reported_by_subprocess(self):
        def proc_func_expected_to_exit_with_10():
            sys.exit(10)

        mpr = multi_process_runner.MultiProcessRunner(
            proc_func_expected_to_exit_with_10,
            multi_worker_test_base.create_cluster_spec(num_workers=1))
        mpr.start()

        with self.assertRaisesRegex(
                multi_process_runner.UnexpectedSubprocessExitError,
                'Subprocess worker-0 exited with exit code 10'):
            mpr.join()
Example 21
 def testMoreThanOneChief(self):
   cluster_def = multi_worker_test_base.create_cluster_spec(
       num_workers=1, num_ps=1)
   chief_ports = [multi_worker_test_base.pick_unused_port() for _ in range(3)]
   cluster_def["chief"] = ["localhost:%s" % port for port in chief_ports]
   cluster_resolver = SimpleClusterResolver(
       ClusterSpec(cluster_def),
       rpc_layer="grpc",
       task_type="chief",
       task_id=1)
   with self.assertRaisesRegexp(ValueError,
                                "There must be at most one 'chief' job."):
     parameter_server_strategy_v2.ParameterServerStrategyV2(cluster_resolver)
Example 22
    def testSimpleModelIndependentWorkerSync(self, strategy_cls):
        num_workers = 2
        num_epoch = 2

        cluster_spec = test_base.create_cluster_spec(num_workers=num_workers,
                                                     test_obj=self)
        self._barrier = dc._Barrier(2)

        # The verification callback will be shared by multiple threads.
        verification_callback = MultiWorkerVerificationCallback(
            num_epoch=num_epoch, num_worker=num_workers)

        def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
            """Simulates an Independent Worker inside of a thread."""
            with test.mock.patch.object(dc, '_run_std_server',
                                        self._make_mock_run_std_server()):
                strategy = strategy_cls()
                verification_callback.is_between_graph = \
                    strategy.extended.experimental_between_graph
                batch_size = 64
                steps = 2
                train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset(
                    batch_size, steps)
                with strategy.scope():
                    model = multi_worker_testing_utils.get_mnist_model(
                        (28, 28, 1))
                orig_loss, _ = model.evaluate(train_ds, steps=steps)
                callbacks_for_fit = nest.flatten(
                    kwargs.get('verification_callback', []))
                history = model.fit(x=train_ds,
                                    epochs=num_epoch,
                                    steps_per_epoch=steps,
                                    callbacks=callbacks_for_fit)
                self.assertIsInstance(history, keras.callbacks.History)
                trained_loss, _ = model.evaluate(train_ds, steps=steps)
                self.assertLess(trained_loss, orig_loss)

        threads = self.run_multiple_tasks_in_threads(
            _independent_worker_fn,
            cluster_spec,
            verification_callback=verification_callback)

        threads_to_join = []
        strategy = strategy_cls()
        if strategy.extended.experimental_between_graph:
            for ts in threads.values():
                threads_to_join.extend(ts)
        else:
            threads_to_join = [threads['worker'][0]]
        self.join_independent_workers(threads_to_join)
        verification_callback.verify(self)
Example 23
 def runner(self):
     if not self._runner:
         if (_num_total_workers(self.has_chief, self.num_workers) > 1
                 and self.use_pool_runner):
             # Need to create the strategy in the initializer so that collectives are
             # configured before eager context initialization.
             cluster_spec = multi_worker_test_base.create_cluster_spec(
                 has_chief=self.has_chief,
                 num_workers=self.num_workers,
                 num_ps=0,
                 has_eval=False)
             self._runner = multi_process_runner.MultiProcessPoolRunner(
                 cluster_spec, initializer=self._distribution_fn)
     return self._runner
Example 24
    def test_seg_fault_raises_error(self):
        def proc_func_expected_to_seg_fault():
            ctypes.string_at(0)  # Intentionally made seg fault.

        with self.assertRaises(
                multi_process_runner.UnexpectedSubprocessExitError) as cm:
            multi_process_runner.run(
                proc_func_expected_to_seg_fault,
                multi_worker_test_base.create_cluster_spec(num_workers=1),
                list_stdout=True)
        self.assertIn('Subprocess worker-0 exited with exit code',
                      str(cm.exception))
        list_to_assert = cm.exception.mpr_result.stdout
        self.assertTrue(any('SIGSEGV' in line for line in list_to_assert))
Example 25
    def decorator(self, has_chief, num_workers, runner, **kwargs):
        if _num_total_workers(has_chief,
                              num_workers) == 1 or _running_in_worker:
            # We're in a worker process or the test is for a single worker. In
            # either case we execute the test method directly instead of spawning
            # subprocesses.
            test_method(self, **kwargs)
            return

        # We're in the main process. We spawn subprocesses and run the *test* on
        # each of them. Note that we're not directly executing test_method passed to
        # _multi_worker_test, because we need setUp()/tearDown() to be called and
        # all the decorators on the test method to be applied. The conceptual
        # call stack is:
        #   [main process]test.main()
        #     [main process]test_runner.run(test)
        #       [main process]wrapper by combinations.generate()
        #         [main process]_multi_worker_test.decorator()
        #           # A sub process goes through the same code path as the main
        #           # process.
        #           [sub process]_test_runner()
        #             [sub process]test_runner.run(test)
        #               [sub process]wrapper by combinations.generate()
        #                 [sub process]_multi_worker_test.decorator()
        #                   # _running_in_worker is True
        #                   [sub process]test_method()
        test_id = self.id()
        if runner:
            results = runner.run(_test_runner, args=(test_id, ))
        else:
            cluster_spec = multi_worker_test_base.create_cluster_spec(
                has_chief=has_chief,
                num_workers=num_workers,
                num_ps=0,
                has_eval=False)
            results = multi_process_runner.run(_test_runner,
                                               cluster_spec,
                                               args=(test_id, )).return_value

        skip_reason = None
        for result in results:
            if result.status == "failure":
                # We can't tell which worker the return value comes from, so we
                # fail on the first error.
                self.fail(result.message)
                break
            elif result.status == "skipped":
                # Record the skip reason, but do not actually skip the test in case some
                # processes fail instead.
                skip_reason = result.message
        if skip_reason is not None:
            self.skipTest(skip_reason)
Example 26
  def testSimpleModelIndependentWorkerSync(self, strategy_cls):
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        strategy = strategy_cls()
        verification_callback.is_between_graph = \
            strategy.extended.experimental_between_graph
        batch_size = 64
        steps = 2
        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))
        orig_loss, _ = model.evaluate(train_ds, steps=steps)
        callbacks_for_fit = nest.flatten(
            kwargs.get('verification_callback', []))
        history = model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)
        trained_loss, _ = model.evaluate(train_ds, steps=steps)
        self.assertLess(trained_loss, orig_loss)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    threads_to_join = []
    strategy = strategy_cls()
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
Example 27
    def test_timeout_none(self):

        if multi_process_runner.is_oss():
            self.skipTest('Intentionally skipping longer test in OSS.')

        def fn():
            time.sleep(250)
            raise ValueError('Worker 0 errored')

        mpr = multi_process_runner.MultiProcessRunner(
            fn, multi_worker_test_base.create_cluster_spec(num_workers=1))

        mpr.start()
        with self.assertRaisesRegex(ValueError, 'Worker 0 errored'):
            mpr.join(timeout=None)
Example 28
 def testCanonicalizeWithoutDefaultDeviceCollectiveEnabled(self):
     cluster_spec = server_lib.ClusterSpec(
         multi_worker_test_base.create_cluster_spec(has_chief=False,
                                                    num_workers=1,
                                                    num_ps=0,
                                                    has_eval=False))
     server_def = tensorflow_server_pb2.ServerDef(
         cluster=cluster_spec.as_cluster_def(),
         job_name="worker",
         task_index=0,
         protocol="grpc",
         port=0)
     context.context().enable_collective_ops(server_def)
     self.assertEqual(device_util.canonicalize("/cpu:0"),
                      "/job:worker/replica:0/task:0/device:CPU:0")
Example 29
    def test_stdout_captured(self):
        def simple_print_func():
            print('This is something printed.')
            return 'This is returned data.'

        returned_data, std_stream_data = multi_process_runner.run(
            simple_print_func,
            multi_worker_test_base.create_cluster_spec(num_workers=2),
            capture_std_stream=True)
        num_string_std_stream = len(
            [d for d in std_stream_data if d == 'This is something printed.'])
        num_string_returned_data = len(
            [d for d in returned_data if d == 'This is returned data.'])
        self.assertEqual(num_string_std_stream, 2)
        self.assertEqual(num_string_returned_data, 2)
Example 30
    def test_process_that_exits(self):
        def func_to_exit_in_10_sec():
            time.sleep(5)
            mpr._add_return_data('foo')
            time.sleep(20)
            mpr._add_return_data('bar')

        mpr = multi_process_runner.MultiProcessRunner(
            func_to_exit_in_10_sec,
            multi_worker_test_base.create_cluster_spec(num_workers=1),
            max_run_time=10)

        mpr.start()
        returned_data, _ = mpr.join()
        self.assertLen(returned_data, 1)
    def test_process_that_exits(self):
        def func_to_exit_in_25_sec():
            logging.error('foo')
            time.sleep(100)
            logging.error('bar')

        mpr = multi_process_runner.MultiProcessRunner(
            func_to_exit_in_25_sec,
            multi_worker_test_base.create_cluster_spec(num_workers=1),
            list_stdout=True,
            max_run_time=25)

        mpr.start()
        stdout = mpr.join().stdout
        self.assertLen([msg for msg in stdout if 'foo' in msg], 1)
        self.assertLen([msg for msg in stdout if 'bar' in msg], 0)
    def test_stdout_captured(self):
        def simple_print_func():
            print('This is something printed.', flush=True)
            return 'This is returned data.'

        mpr_result = multi_process_runner.run(
            simple_print_func,
            multi_worker_test_base.create_cluster_spec(num_workers=2),
            list_stdout=True)
        std_stream_results = mpr_result.stdout
        return_value = mpr_result.return_value
        self.assertIn('[worker-0]:    This is something printed.\n',
                      std_stream_results)
        self.assertIn('[worker-1]:    This is something printed.\n',
                      std_stream_results)
        self.assertIn('This is returned data.', return_value)
Example 33
    def test_exit_code_is_reported_by_chief_subprocess(self):
        def proc_func_expected_to_exit_with_20():
            if multi_worker_test_base.get_task_type() == 'worker':
                time.sleep(10000)
            sys.exit(20)

        mpr = multi_process_runner.MultiProcessRunner(
            proc_func_expected_to_exit_with_20,
            multi_worker_test_base.create_cluster_spec(has_chief=True,
                                                       num_workers=1))
        mpr.start()

        with self.assertRaisesRegex(
                multi_process_runner.UnexpectedSubprocessExitError,
                'Subprocess chief-0 exited with exit code 20'):
            mpr.join()
Example 34
    def test_auto_restart(self):
        def proc_func(counter):
            counter.value += 1
            if counter.value == 1:
                raise ValueError

        manager = multi_process_runner.manager()
        counter = manager.Value(int, 0)
        mpr = multi_process_runner.MultiProcessRunner(
            proc_func,
            multi_worker_test_base.create_cluster_spec(num_workers=1),
            args=(counter, ),
            auto_restart=True)
        mpr.start()
        mpr.join()
        self.assertEqual(counter.value, 2)
  def test_template(self, strategy_cls):
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        strategy = get_strategy_object(strategy_cls)
        batch_size = 64
        steps = 2
        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))

        custom_callable(
            model,
            self,
            train_ds,
            num_epoch,
            steps,
            strategy,
            saving_filepath=kwargs['saving_filepath'])

    # Pass saving_filepath from the parent thread to ensure every worker has the
    # same filepath to save.
    saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint.h5')
    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn, cluster_spec, saving_filepath=saving_filepath)
    if os.path.exists(saving_filepath):
      os.remove(saving_filepath)

    threads_to_join = []
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
  def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
    train_distribute = self._get_strategy_object(train_distribute_cls)

    if eval_distribute_cls:
      eval_distribute = self._get_strategy_object(eval_distribute_cls)
    else:
      eval_distribute = None

    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    # 3 workers and 1 evaluator.
    self._barrier = dc._Barrier(4)
    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                                 cluster_spec, train_distribute,
                                                 eval_distribute)
    self.join_independent_workers([threads[WORKER][0], threads[EVALUATOR][0]])

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
  def testFaultToleranceInSyncStrategy(self, strategy_cls, file_format,
                                       preemption_callback):
    """Test fault-tolerance with multi-threading using sync dist-strat.

    This test simulates multi-worker training that is interrupted by a
    preemption, using two threads: one representing the chief worker and one a
    non-chief worker, where the non-chief raises an error in the middle of the
    training loop. Upon catching the error, a new thread with a new cluster
    spec is created to simulate the recovered non-chief worker. Meanwhile, the
    chief worker cannot proceed and hangs since the non-chief worker has
    crashed. To simulate a restart of the chief, a new thread is prepared to
    take over as chief with the help of a condition variable. It is expected
    that after the restart of both chief and non-chief workers, the training
    continues from the epoch at which they previously failed. The test
    concludes by verifying that the preemption-interrupted training finishes
    with the same loss and accuracy as if the preemption had not occurred.

    Arguments:
      strategy_cls: The strategy class to use.
      file_format: `h5` or `tf`.
      preemption_callback: The callback to simulate preemption.
    """

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        # Condition variable that blocks the thread that represents the
        # restarted chief.
        cv = kwargs.get('cv', None)
        # `before_restart` is True for the threads that represent the original
        # chief and non-chief worker, and False for threads that represent the
        # restarted chief and non-chief workers.
        before_restart = kwargs['before_restart']
        if kwargs['new_chief']:
          # `new_chief` is only True for the restarted chief thread. It waits
          # until non-chief is preempted and restarted to simulate the causality
          # where chief's restart results from non-chief's failure.
          cv.acquire()
          while not hasattr(cv, 'preempted'):
            cv.wait()
          cv.release()

        # Model building under strategy scope. Following is the code we expect
        # the user runs on every worker.
        strategy = get_strategy_object(strategy_cls)
        batch_size = 64
        steps = 3
        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))

        # Function to start a new thread. This will be called twice in the
        # following code: once for the restart of the non-chief, and once for
        # the restart of the chief as a result of the non-chief's restart (so
        # the training can continue in sync).
        def start_new_thread(new_chief=False):
          new_thread_tf_config = json.loads(os.environ['TF_CONFIG'])
          new_thread_tf_config['cluster']['worker'] = kwargs['reserved_ports']
          return self._run_task_in_thread(
              task_fn=_independent_worker_fn,
              cluster_spec=None,
              task_type=None,
              task_id=None,
              tf_config=new_thread_tf_config,
              before_restart=False,
              cv=cv,
              new_chief=new_chief)

        if test_base.is_chief() and before_restart:
          # The chief starts a new thread (which will be blocked by a condition
          # variable until the non-chief's new thread is started). The thread for
          # the (recovered) chief is started before entering `fit()` because the
          # original chief thread will eventually hang and be ignored.
          start_new_thread(new_chief=True)

        try:

          class CkptSavedEpochAssertingCallback(callbacks.Callback):

            def __init__(self, test_obj):
              super(CkptSavedEpochAssertingCallback, self).__init__()
              self.test_obj = test_obj

            def on_epoch_begin(self, epoch, logs=None):
              # `_ckpt_saved_epoch` attribute is set at the end of every epoch.
              self.test_obj.assertEqual(self.model._ckpt_saved_epoch is None,
                                        epoch == 0)

          callbacks_list = [
              callbacks.ModelCheckpoint(
                  filepath=saving_filepath,
                  save_weights_only=True,
                  load_weights_on_restart=True),
              CkptSavedEpochAssertingCallback(self)
          ]
          if before_restart:
            callbacks_list.append(preemption_callback())

          self.assertIsNone(model._ckpt_saved_epoch)
          history = model.fit(
              x=train_ds,
              epochs=num_epoch,
              steps_per_epoch=steps,
              callbacks=callbacks_list)
          self.assertIsNone(model._ckpt_saved_epoch)

          # The `history` of each training run is collected so the results can
          # be compared against each other. It is expected that the training
          # results (loss and accuracy) are the same with or without preemption.
          self._histories.append(history.history)

        except RuntimeError:
          # pylint: disable=g-assert-in-except
          self.assertTrue(before_restart)
          # Reset the barrier so the new threads simulating recovery can
          # continue.
          self._barrier._counter = 0
          self._barrier._flag = False

          # Now that the non-chief has been preempted, it notifies the thread
          # that simulates the restarted chief to start so they can be back in
          # sync.
          cv.acquire()
          cv.preempted = True
          cv.notify()
          cv.release()

          # At this point we should discard the original non-chief thread, and
          # start the new thread that simulates the restarted non-chief, so we
          # join that thread and return.
          self.join_independent_workers([start_new_thread()])
          return

        # Successful end of a `fit()` call.
        self._successful_thread_ends += 1
        self.assertFalse(before_restart)

    # Common parameters
    num_workers = 2
    num_epoch = 3
    # History list storing the results for preemption and no preemption cases.
    self._histories = []
    # Pass `saving_filepath` from the parent thread to ensure every worker has
    # the same filepath to save.
    saving_filepath = os.path.join(self.get_temp_dir(),
                                   'checkpoint.' + file_format)
    strategy = get_strategy_object(strategy_cls)

    # Case 1: Training for `num_epoch` without preemptions.
    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)
    self._successful_thread_ends = 0
    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        saving_filepath=saving_filepath,
        before_restart=False,
        new_chief=False)
    if os.path.exists(saving_filepath):
      os.remove(saving_filepath)
    threads_to_join = []
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
    self.assertEqual(self._successful_thread_ends, 2)

    # Case 2: Training for `num_epoch` epoch with preemptions.
    # The preemption is simulated at both epoch boundary and batch boundary.
    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    cv = threading.Condition()
    self._barrier = dc._Barrier(2)
    # Ports reserved for new threads simulating recovery.
    reserved_ports = [
        'localhost:%s' % test_base.pick_unused_port()
        for _ in range(num_workers)
    ]
    self._successful_thread_ends = 0
    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        saving_filepath=saving_filepath,
        reserved_ports=reserved_ports,
        before_restart=True,
        cv=cv,
        new_chief=False)
    if os.path.exists(saving_filepath):
      os.remove(saving_filepath)
    threads_to_join = []
    if strategy.extended.experimental_between_graph:
      # Only join the non-chief thread since the first thread for chief will
      # eventually hang and be ignored.
      threads_to_join = [threads['worker'][1]]
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
    self.assertEqual(self._successful_thread_ends, 2)

    def assert_all_elements_are_identical(list_to_check):
      first_item = list_to_check[0]
      for item in list_to_check[1:]:
        self.assertAllClose(first_item, item, rtol=1e-5, atol=1e-5)

    # Important: the results from preemption interrupted and non-interrupted
    # cases should give the same final results.
    assert_all_elements_are_identical(
        [history['acc'][-1] for history in self._histories])
    assert_all_elements_are_identical(
        [history['loss'][-1] for history in self._histories])
    # The length of `self._histories` should be num_workers * num_runs
    # (2 * 2 = 4).
    self.assertLen(self._histories, 4)
  def run_optimizer_comparison_with_simple_bias_model(
      self, strategy_cls, optimizer_class_1, optimizer_class_2):

    def get_input_datasets():
      # Simple training input.
      train_input = [[1]] * 16
      train_label = [[0]] * 16
      ds = dataset_ops.Dataset.from_tensor_slices((train_input, train_label))
      ds = maybe_shard_dataset(ds)
      # TODO(rchao): Investigate to figure out the reason for having 8 workers
      # instead of 2 as expected.
      return ds.batch(8, drop_remainder=True)

    def get_simple_bias_model():

      class Bias(base_layer.Layer):

        def build(self, input_shape):
          self.bias = self.add_variable('bias', (1,), initializer='zeros')

        def call(self, inputs):
          return inputs + self.bias

      model = sequential.Sequential()
      model.add(Bias(input_shape=(1,)))

      return model

    self._lock = threading.Lock()
    cluster_spec = test_base.create_cluster_spec(num_workers=2)
    self._barrier = dc._Barrier(2)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside a thread."""
      # TODO(rchao): Refactor to abstract the common boilerplate out.
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):

        model = get_simple_bias_model()

        initial_weights = model.get_weights()

        def _get_model_results(optimizer, initial_weights):

          # Clear Keras session to reset device assignment
          keras.backend._SESSION.session = None
          strategy = strategy_cls()

          with strategy.scope():
            train_ds = get_input_datasets()
            model = get_simple_bias_model()
            model.set_weights(initial_weights)
            model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

          return {
              'trained_loss_and_accuracy':
                  model.fit(x=train_ds, epochs=20).history,
              'trained_weights':
                  model.get_weights(),
          }

        results1 = _get_model_results(optimizer_class_1(0.01), initial_weights)
        results2 = _get_model_results(optimizer_class_2(0.01), initial_weights)

        for key in results1:
          self.assertAllClose(
              results1[key],
              results2[key],
              atol=1e-5,
              rtol=1e-5,
              msg='Fail to assert {}'.format(key))

    threads = self.run_multiple_tasks_in_threads(_independent_worker_fn,
                                                 cluster_spec)

    threads_to_join = []
    strategy = strategy_cls()
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)