    def test_complete_flow_independent_worker_between_graph(
            self, train_distribute_cls, eval_distribute_cls):
        train_distribute = train_distribute_cls(
            num_gpus_per_worker=context.num_gpus())

        if eval_distribute_cls:
            eval_distribute = eval_distribute_cls()
        else:
            eval_distribute = None

        cluster_spec = multi_worker_test_base.create_cluster_spec(
            num_workers=3, num_ps=2, has_eval=True)
        # 3 workers, 2 ps and 1 evaluator.
        self._barrier = dc._Barrier(6)

        threads = self._run_multiple_tasks_in_threads(cluster_spec,
                                                      train_distribute,
                                                      eval_distribute)
        for task_type, ts in threads.items():
            if task_type == PS:
                continue
            for t in ts:
                t.join()

        estimator = self._get_estimator(train_distribute, eval_distribute)
        self._inspect_train_and_eval_events(estimator)
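
A note on the barrier sizing: the test blocks on one participant per task (3 workers + 2 ps + 1 evaluator = 6) so that no task finishes before the whole cluster is up. `dc._Barrier` is private to the distribute coordinator tests; a minimal stand-in with the same usage, assuming all it needs is a blocking `wait()`, could look like this (a sketch, not the real implementation):

import threading

class _Barrier(object):
  """Blocks each calling thread until all participants have arrived."""

  def __init__(self, num_participants):
    # threading.Barrier is reusable, so repeated wait() rounds also work.
    self._barrier = threading.Barrier(num_participants)

  def wait(self):
    self._barrier.wait()
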
  def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
    num_workers = 2
    num_epoch = 2
    cluster_spec = test_base.create_cluster_spec(
        num_workers=num_workers, num_ps=2)
    self._barrier = dc._Barrier(4)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      # TODO(rchao/yuefengz): The following is run by both worker and ps
      # threads. The distribute coordinator should run std server immediately
      # without configuring the session (or building the graph) on PS.
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        batch_size = 64
        steps = 10
        strategy = strategy_cls(num_gpus_per_worker=context.num_gpus())
        verification_callback.is_between_graph = \
            strategy.extended.experimental_between_graph

        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))

          # TODO(b/123868066): Verify callback for model.evaluate().
          callbacks_for_fit = nest.flatten(
              kwargs.get('verification_callback', []))
          history = model.fit(
              x=train_ds,
              epochs=num_epoch,
              steps_per_epoch=steps,
              validation_data=val_ds,
              validation_steps=steps,
              callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    threads_to_join = []
    for task_type, ts in threads.items():
      # This test can finish once the worker threads complete, and thus
      # the ps threads don't need to be joined.
      if task_type == 'ps':
        continue
      threads_to_join.extend(ts)
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
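
The async test consumes `_mnist_synthetic_dataset`, a helper defined elsewhere in the test module. Judging only by its call sites (batch size, step count, a 28x28x1 model input, and an ignored second return value), a plausible sketch is the following; everything beyond the signature is an assumption:

import numpy as np
import tensorflow as tf

def _mnist_synthetic_dataset(batch_size, steps):
  """Random MNIST-shaped images and integer labels, batched and repeated."""
  num_samples = batch_size * steps
  x = np.random.random((num_samples, 28, 28, 1)).astype(np.float32)
  y = np.random.randint(0, 10, size=(num_samples,)).astype(np.int64)
  ds = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(batch_size)
  # The tests discard the second return value, so a placeholder suffices.
  return ds, None
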
    def test_complete_flow_independent_worker_between_graph(
            self, train_distribute_cls, eval_distribute_cls):
        train_distribute = train_distribute_cls(
            num_gpus_per_worker=context.num_gpus())

        if (context.num_gpus() < 2 and eval_distribute_cls
                == collective_all_reduce_strategy.CollectiveAllReduceStrategy):
            self.skipTest(
                "`CollectiveAllReduceStrategy` needs at least two towers.")

        if eval_distribute_cls:
            eval_distribute = eval_distribute_cls(
                num_gpus_per_worker=context.num_gpus())
        else:
            eval_distribute = None

        if (train_distribute_cls ==
                parameter_server_strategy.ParameterServerStrategy):
            cluster_spec = multi_worker_test_base.create_cluster_spec(
                num_workers=3, num_ps=2, has_eval=True)
            # 3 workers, 2 ps and 1 evaluator.
            self._barrier = dc._Barrier(6)
        else:
            cluster_spec = multi_worker_test_base.create_cluster_spec(
                num_workers=3, num_ps=0, has_eval=True)
            # 3 workers and 1 evaluator.
            self._barrier = dc._Barrier(4)

        threads = self._run_multiple_tasks_in_threads(cluster_spec,
                                                      train_distribute,
                                                      eval_distribute)
        for task_type, ts in threads.items():
            if task_type == PS:
                continue
            for t in ts:
                t.join()

        estimator = self._get_estimator(train_distribute, eval_distribute)
        self._inspect_train_and_eval_events(estimator)
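
Both between-graph variants fan out with `_run_multiple_tasks_in_threads`, which returns a dict mapping task type to its threads. A simplified sketch of that pattern, assuming one thread per task listed in the cluster spec (the real helper also wires up per-task configuration, which is omitted here):

import copy
import threading

def run_multiple_tasks_in_threads(task_fn, cluster_spec, *args, **kwargs):
  """Starts task_fn once per (task_type, task_id); returns threads by type."""
  threads = {}
  for task_type, addresses in cluster_spec.items():
    threads[task_type] = []
    for task_id in range(len(addresses)):
      t = threading.Thread(
          target=task_fn,
          args=(copy.deepcopy(cluster_spec), task_type, task_id) + args,
          kwargs=kwargs)
      t.start()
      threads[task_type].append(t)
  return threads
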
  def test_complete_flow_independent_worker_between_graph(
      self, train_distribute_cls, eval_distribute_cls):
    if (context.num_gpus() < 2 and eval_distribute_cls ==
        collective_all_reduce_strategy.CollectiveAllReduceStrategy):
      self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.")

    train_distribute = self._get_strategy_object(train_distribute_cls)

    if eval_distribute_cls:
      eval_distribute = self._get_strategy_object(eval_distribute_cls)
    else:
      eval_distribute = None

    if (train_distribute_cls == parameter_server_strategy
        .ParameterServerStrategy):
      cluster_spec = multi_worker_test_base.create_cluster_spec(
          num_workers=3, num_ps=2, has_eval=True)
      # 3 workers, 2 ps and 1 evaluator.
      self._barrier = dc._Barrier(6)
    else:
      cluster_spec = multi_worker_test_base.create_cluster_spec(
          num_workers=3, num_ps=0, has_eval=True)
      # 3 workers and 1 evaluator.
      self._barrier = dc._Barrier(4)

    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                                 cluster_spec, train_distribute,
                                                 eval_distribute)
    threads_to_join = []
    for task_type, ts in threads.items():
      if task_type == PS:
        continue
      for t in ts:
        threads_to_join.append(t)
    self.join_independent_workers(threads_to_join)

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
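
This refactored version replaces the direct `strategy_cls(num_gpus_per_worker=...)` calls of the earlier snippets with `self._get_strategy_object`, hiding per-class constructor differences. Only the call sites are visible here, so the branching below is an assumption; a hypothetical shape for the helper:

  def _get_strategy_object(self, strategy_cls):
    # Assumed split: some strategies discover local devices themselves,
    # while others take an explicit GPU count (as in the older snippets).
    if strategy_cls == mirrored_strategy.MirroredStrategy:
      return strategy_cls()
    return strategy_cls(num_gpus_per_worker=context.num_gpus())
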
  def testSimpleModelIndependentWorkerSync(self, strategy_cls):
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        strategy = get_strategy_object(strategy_cls)
        verification_callback.is_between_graph = \
            strategy.extended.experimental_between_graph
        batch_size = 64
        steps = 10
        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))
        orig_loss, _ = model.evaluate(train_ds, steps=steps)
        callbacks_for_fit = nest.flatten(
            kwargs.get('verification_callback', []))
        history = model.fit(
            x=train_ds,
            epochs=num_epoch,
            steps_per_epoch=steps,
            callbacks=callbacks_for_fit)
        self.assertIsInstance(history, keras.callbacks.History)
        trained_loss, _ = model.evaluate(train_ds, steps=steps)
        self.assertLess(trained_loss, orig_loss)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    threads_to_join = []
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
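
The sync test leans on `MultiWorkerVerificationCallback` to check that every worker observed every epoch. A minimal sketch of such a callback, assuming it simply counts epochs under a lock and compares against an expected total in `verify` (the real class likely tracks more state per task):

import threading
from tensorflow import keras

class MultiWorkerVerificationCallback(keras.callbacks.Callback):
  """Counts epoch ends across worker threads; asserts the total afterwards."""

  def __init__(self, num_epoch, num_worker):
    super(MultiWorkerVerificationCallback, self).__init__()
    self._num_epoch = num_epoch
    self._num_worker = num_worker
    self._lock = threading.Lock()
    self._epochs_seen = 0
    self.is_between_graph = None  # Set by each worker before fit().

  def on_epoch_end(self, epoch, logs=None):
    with self._lock:
      self._epochs_seen += 1

  def verify(self, test_case):
    # Between-graph replication runs a Keras client per worker;
    # in-graph replication runs a single client.
    num_clients = self._num_worker if self.is_between_graph else 1
    test_case.assertEqual(self._epochs_seen, self._num_epoch * num_clients)
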
  def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
    train_distribute = self._get_strategy_object(train_distribute_cls)

    if eval_distribute_cls:
      eval_distribute = self._get_strategy_object(eval_distribute_cls)
    else:
      eval_distribute = None

    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    # 3 workers and 1 evaluator.
    self._barrier = dc._Barrier(4)
    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                                 cluster_spec, train_distribute,
                                                 eval_distribute)
    self.join_independent_workers([threads[WORKER][0], threads[EVALUATOR][0]])

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
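
In the in-graph case only the first worker (the single client) and the evaluator do real work, so only those two threads are joined. Going by the explicit `t.join()` loops in the older snippets, `join_independent_workers` is presumably just a join loop:

  def join_independent_workers(self, worker_threads):
    for t in worker_threads:
      t.join()
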
  def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())

    if eval_distribute_cls:
      eval_distribute = eval_distribute_cls()
    else:
      eval_distribute = None

    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    # 3 workers and 1 evaluator.
    self._barrier = dc._Barrier(4)
    threads = self._run_multiple_tasks_in_threads(
        cluster_spec, train_distribute, eval_distribute)
    threads[WORKER][0].join()
    threads[EVALUATOR][0].join()

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
  def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())

    if eval_distribute_cls:
      eval_distribute = eval_distribute_cls(
          num_gpus_per_worker=context.num_gpus())
    else:
      eval_distribute = None

    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    # 3 workers and 1 evaluator.
    self._barrier = dc._Barrier(4)
    threads = self._run_multiple_tasks_in_threads(
        cluster_spec, train_distribute, eval_distribute)
    threads[WORKER][0].join()
    threads[EVALUATOR][0].join()

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
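
All of these tests start from `create_cluster_spec`, and the barrier sizes above mirror its output (e.g. 3 workers + 1 evaluator gives a barrier of 4). A minimal sketch of the shape it must return; the real helper picks free ports rather than the fixed ones assumed here:

def create_cluster_spec(num_workers=1, num_ps=0, has_eval=False):
  """Maps each task type to a list of fake 'host:port' addresses."""
  cluster = {
      'worker': ['localhost:%d' % (2222 + i) for i in range(num_workers)],
  }
  if num_ps:
    cluster['ps'] = ['localhost:%d' % (3222 + i) for i in range(num_ps)]
  if has_eval:
    cluster['evaluator'] = ['localhost:4222']
  return cluster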