def test_complete_flow_indepedent_worker_between_graph(
        self, train_distribute_cls, eval_distribute_cls):
    """Run the train/eval flow on a cluster with parameter servers.

    Builds the train strategy (and the eval strategy when a class is given),
    starts the cluster tasks through `_run_multiple_tasks_in_threads`, joins
    every thread whose task type is not `PS`, and hands a fresh estimator to
    `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class, called with
        `num_gpus_per_worker=context.num_gpus()`.
      eval_distribute_cls: strategy class called without arguments, or a
        falsy value to run with `eval_distribute=None`.
    """
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())
    eval_distribute = eval_distribute_cls() if eval_distribute_cls else None

    # 3 workers, 2 ps and 1 evaluator.
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=2, has_eval=True)
    self._barrier = dc._Barrier(6)

    threads_by_task = self._run_multiple_tasks_in_threads(
        cluster_spec, train_distribute, eval_distribute)

    # Threads of the PS task type are deliberately not joined.
    for task_type, task_threads in threads_by_task.items():
        if task_type != PS:
            for thread in task_threads:
                thread.join()

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
    """Fit a small model from independent worker threads with ps tasks.

    Creates a cluster spec with 2 workers and 2 ps, runs
    `_independent_worker_fn` through `run_multiple_tasks_in_threads`, joins
    every thread whose task type is not 'ps', and finally calls `verify` on
    the shared verification callback.

    Args:
      strategy_cls: strategy class, called with
        `num_gpus_per_worker=context.num_gpus()`.
    """
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(
        num_workers=num_workers, num_ps=2)
    self._barrier = dc._Barrier(4)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
        """Simulates an Independent Worker inside of a thread."""
        # TODO(rchao/yuefengz): The following is run by both worker and ps
        # threads. The distribute coordinator should run std server
        # immediately without configuring the session (or building the
        # graph) on PS.
        std_server_patch = test.mock.patch.object(
            dc, '_run_std_server', self._make_mock_run_std_server())
        with std_server_patch:
            batch_size = 64
            steps = 10

            strategy = strategy_cls(num_gpus_per_worker=context.num_gpus())
            verification_callback.is_between_graph = (
                strategy.extended.experimental_between_graph)

            train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
            val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)

            # NOTE(review): only model construction is placed under the
            # strategy scope here; the flattened original does not show
            # where the scope ends -- confirm.
            with strategy.scope():
                model = _get_model((28, 28, 1))

            # TODO(b/123868066): Verify callback for model.evaluate().
            callbacks_for_fit = nest.flatten(
                kwargs.get('verification_callback', []))
            history = model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                validation_data=val_ds,
                validation_steps=steps,
                callbacks=callbacks_for_fit)
            self.assertIsInstance(history, keras.callbacks.History)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    # This test can finish once the worker threads complete, and thus
    # the ps threads don't need to be joined.
    threads_to_join = [
        thread
        for task_type, task_threads in threads.items()
        if task_type != 'ps'
        for thread in task_threads
    ]
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
def testSimpleModelIndependentWorkerAsync(self, strategy_cls):
    """Fit a small model from independent worker threads with ps tasks.

    Creates a cluster spec with 2 workers and 2 ps, runs
    `_independent_worker_fn` through `run_multiple_tasks_in_threads`, joins
    every thread whose task type is not 'ps', and finally calls `verify` on
    the shared verification callback.

    Args:
      strategy_cls: strategy class, called with
        `num_gpus_per_worker=context.num_gpus()`.
    """
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(
        num_workers=num_workers, num_ps=2)
    self._barrier = dc._Barrier(4)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
        """Simulates an Independent Worker inside of a thread."""
        # TODO(rchao/yuefengz): The following is run by both worker and ps
        # threads. The distribute coordinator should run std server
        # immediately without configuring the session (or building the
        # graph) on PS.
        std_server_patch = test.mock.patch.object(
            dc, '_run_std_server', self._make_mock_run_std_server())
        with std_server_patch:
            batch_size = 64
            steps = 10

            strategy = strategy_cls(num_gpus_per_worker=context.num_gpus())
            verification_callback.is_between_graph = (
                strategy.extended.experimental_between_graph)

            train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
            val_ds, _ = _mnist_synthetic_dataset(batch_size, steps)

            # NOTE(review): only model construction is placed under the
            # strategy scope here; the flattened original does not show
            # where the scope ends -- confirm.
            with strategy.scope():
                model = _get_model((28, 28, 1))

            # TODO(b/123868066): Verify callback for model.evaluate().
            callbacks_for_fit = nest.flatten(
                kwargs.get('verification_callback', []))
            history = model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                validation_data=val_ds,
                validation_steps=steps,
                callbacks=callbacks_for_fit)
            self.assertIsInstance(history, keras.callbacks.History)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    # This test can finish once the worker threads complete, and thus
    # the ps threads don't need to be joined.
    threads_to_join = [
        thread
        for task_type, task_threads in threads.items()
        if task_type != 'ps'
        for thread in task_threads
    ]
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
def test_complete_flow_indepedent_worker_between_graph(
        self, train_distribute_cls, eval_distribute_cls):
    """Run the train/eval flow with independent worker threads.

    The train strategy is built first. The test is then skipped when fewer
    than two GPUs are reported and the eval strategy class is
    `CollectiveAllReduceStrategy`. The cluster has 2 ps tasks only when the
    train strategy class is `ParameterServerStrategy`; otherwise it has none.
    Threads of every task type except `PS` are joined before a fresh
    estimator is handed to `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class, called with
        `num_gpus_per_worker=context.num_gpus()`.
      eval_distribute_cls: strategy class called the same way, or a falsy
        value to run with `eval_distribute=None`.
    """
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())

    if context.num_gpus() < 2:
        if (eval_distribute_cls ==
                collective_all_reduce_strategy.CollectiveAllReduceStrategy):
            self.skipTest(
                "`CollectiveAllReduceStrategy` needs at least two towers.")

    eval_distribute = None
    if eval_distribute_cls:
        eval_distribute = eval_distribute_cls(
            num_gpus_per_worker=context.num_gpus())

    # With parameter servers: 3 workers, 2 ps and 1 evaluator (barrier of 6).
    # Without: 3 workers and 1 evaluator (barrier of 4).
    uses_parameter_servers = (
        train_distribute_cls ==
        parameter_server_strategy.ParameterServerStrategy)
    num_ps, barrier_size = (2, 6) if uses_parameter_servers else (0, 4)
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=num_ps, has_eval=True)
    self._barrier = dc._Barrier(barrier_size)

    threads_by_task = self._run_multiple_tasks_in_threads(
        cluster_spec, train_distribute, eval_distribute)

    # Threads of the PS task type are deliberately not joined.
    for task_type, task_threads in threads_by_task.items():
        if task_type != PS:
            for thread in task_threads:
                thread.join()

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
def test_complete_flow_independent_worker_between_graph(
        self, train_distribute_cls, eval_distribute_cls):
    """Run the train/eval flow with independent worker threads.

    Skips when fewer than two GPUs are reported and the eval strategy class
    is `CollectiveAllReduceStrategy`. Strategy objects come from
    `_get_strategy_object`. The cluster has 2 ps tasks only when the train
    strategy class is `ParameterServerStrategy`. Threads of every task type
    except `PS` are passed to `join_independent_workers` before a fresh
    estimator is handed to `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class for training.
      eval_distribute_cls: strategy class for evaluation, or a falsy value
        to run with `eval_distribute=None`.
    """
    if context.num_gpus() < 2:
        if (eval_distribute_cls ==
                collective_all_reduce_strategy.CollectiveAllReduceStrategy):
            self.skipTest(
                "`CollectiveAllReduceStrategy` needs at least two towers.")

    train_distribute = self._get_strategy_object(train_distribute_cls)
    eval_distribute = (
        self._get_strategy_object(eval_distribute_cls)
        if eval_distribute_cls else None)

    # With parameter servers: 3 workers, 2 ps and 1 evaluator (barrier of 6).
    # Without: 3 workers and 1 evaluator (barrier of 4).
    uses_parameter_servers = (
        train_distribute_cls ==
        parameter_server_strategy.ParameterServerStrategy)
    num_ps, barrier_size = (2, 6) if uses_parameter_servers else (0, 4)
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=num_ps, has_eval=True)
    self._barrier = dc._Barrier(barrier_size)

    threads = self.run_multiple_tasks_in_threads(
        self._independent_worker_fn, cluster_spec, train_distribute,
        eval_distribute)

    # Threads of the PS task type are left out of the join list.
    threads_to_join = []
    for task_type, task_threads in threads.items():
        if task_type != PS:
            threads_to_join.extend(task_threads)
    self.join_independent_workers(threads_to_join)

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
def testSimpleModelIndependentWorkerSync(self, strategy_cls):
    """Fit a small model from independent worker threads without ps tasks.

    Creates a 2-worker cluster spec and runs `_independent_worker_fn` through
    `run_multiple_tasks_in_threads`. Each run evaluates the model, fits it,
    evaluates again and asserts that the loss went down. All threads are
    joined when the strategy reports `experimental_between_graph`; otherwise
    only the first 'worker' thread is joined. The shared verification
    callback is verified last.

    Args:
      strategy_cls: strategy class passed to `get_strategy_object`.
    """
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
        """Simulates an Independent Worker inside of a thread."""
        std_server_patch = test.mock.patch.object(
            dc, '_run_std_server', self._make_mock_run_std_server())
        with std_server_patch:
            strategy = get_strategy_object(strategy_cls)
            verification_callback.is_between_graph = (
                strategy.extended.experimental_between_graph)

            batch_size = 64
            steps = 10
            train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)

            # NOTE(review): only model construction is placed under the
            # strategy scope here; the flattened original does not show
            # where the scope ends -- confirm.
            with strategy.scope():
                model = _get_model((28, 28, 1))

            orig_loss, _ = model.evaluate(train_ds, steps=steps)
            callbacks_for_fit = nest.flatten(
                kwargs.get('verification_callback', []))
            history = model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=callbacks_for_fit)
            self.assertIsInstance(history, keras.callbacks.History)

            # Training is expected to reduce the loss on the same dataset.
            trained_loss, _ = model.evaluate(train_ds, steps=steps)
            self.assertLess(trained_loss, orig_loss)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
        threads_to_join = [
            thread
            for task_threads in threads.values()
            for thread in task_threads
        ]
    else:
        threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
def testSimpleModelIndependentWorkerSync(self, strategy_cls):
    """Fit a small model from independent worker threads without ps tasks.

    Creates a 2-worker cluster spec and runs `_independent_worker_fn` through
    `run_multiple_tasks_in_threads`. Each run evaluates the model, fits it,
    evaluates again and asserts that the loss went down. All threads are
    joined when the strategy reports `experimental_between_graph`; otherwise
    only the first 'worker' thread is joined. The shared verification
    callback is verified last.

    Args:
      strategy_cls: strategy class passed to `get_strategy_object`.
    """
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    # The verification callback will be shared by multiple threads.
    verification_callback = MultiWorkerVerificationCallback(
        num_epoch=num_epoch, num_worker=num_workers)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
        """Simulates an Independent Worker inside of a thread."""
        std_server_patch = test.mock.patch.object(
            dc, '_run_std_server', self._make_mock_run_std_server())
        with std_server_patch:
            strategy = get_strategy_object(strategy_cls)
            verification_callback.is_between_graph = (
                strategy.extended.experimental_between_graph)

            batch_size = 64
            steps = 10
            train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)

            # NOTE(review): only model construction is placed under the
            # strategy scope here; the flattened original does not show
            # where the scope ends -- confirm.
            with strategy.scope():
                model = _get_model((28, 28, 1))

            orig_loss, _ = model.evaluate(train_ds, steps=steps)
            callbacks_for_fit = nest.flatten(
                kwargs.get('verification_callback', []))
            history = model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=callbacks_for_fit)
            self.assertIsInstance(history, keras.callbacks.History)

            # Training is expected to reduce the loss on the same dataset.
            trained_loss, _ = model.evaluate(train_ds, steps=steps)
            self.assertLess(trained_loss, orig_loss)

    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn,
        cluster_spec,
        verification_callback=verification_callback)

    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
        threads_to_join = [
            thread
            for task_threads in threads.values()
            for thread in task_threads
        ]
    else:
        threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)
    verification_callback.verify(self)
def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                   eval_distribute_cls):
    """Run the train/eval flow on a cluster without parameter servers.

    Strategy objects come from `_get_strategy_object`. Only the first
    `WORKER` thread and the first `EVALUATOR` thread are passed to
    `join_independent_workers`, after which a fresh estimator is handed to
    `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class for training.
      eval_distribute_cls: strategy class for evaluation, or a falsy value
        to run with `eval_distribute=None`.
    """
    train_distribute = self._get_strategy_object(train_distribute_cls)
    eval_distribute = (
        self._get_strategy_object(eval_distribute_cls)
        if eval_distribute_cls else None)

    # 3 workers and 1 evaluator.
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    self._barrier = dc._Barrier(4)

    threads = self.run_multiple_tasks_in_threads(
        self._independent_worker_fn, cluster_spec, train_distribute,
        eval_distribute)

    first_worker = threads[WORKER][0]
    first_evaluator = threads[EVALUATOR][0]
    self.join_independent_workers([first_worker, first_evaluator])

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
def test_complete_flow_independent_worker_in_graph(self, train_distribute_cls,
                                                   eval_distribute_cls):
    """Run the train/eval flow on a cluster without parameter servers.

    Strategy objects come from `_get_strategy_object`. Only the first
    `WORKER` thread and the first `EVALUATOR` thread are passed to
    `join_independent_workers`, after which a fresh estimator is handed to
    `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class for training.
      eval_distribute_cls: strategy class for evaluation, or a falsy value
        to run with `eval_distribute=None`.
    """
    train_distribute = self._get_strategy_object(train_distribute_cls)
    eval_distribute = (
        self._get_strategy_object(eval_distribute_cls)
        if eval_distribute_cls else None)

    # 3 workers and 1 evaluator.
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    self._barrier = dc._Barrier(4)

    threads = self.run_multiple_tasks_in_threads(
        self._independent_worker_fn, cluster_spec, train_distribute,
        eval_distribute)

    first_worker = threads[WORKER][0]
    first_evaluator = threads[EVALUATOR][0]
    self.join_independent_workers([first_worker, first_evaluator])

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
                                                  eval_distribute_cls):
    """Run the train/eval flow on a cluster without parameter servers.

    Starts the cluster tasks through `_run_multiple_tasks_in_threads`, joins
    the first `WORKER` thread and then the first `EVALUATOR` thread, and
    hands a fresh estimator to `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class, called with
        `num_gpus_per_worker=context.num_gpus()`.
      eval_distribute_cls: strategy class called without arguments, or a
        falsy value to run with `eval_distribute=None`.
    """
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())
    eval_distribute = eval_distribute_cls() if eval_distribute_cls else None

    # 3 workers and 1 evaluator.
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    self._barrier = dc._Barrier(4)

    threads = self._run_multiple_tasks_in_threads(
        cluster_spec, train_distribute, eval_distribute)

    # Only the first thread of each of these two task types is awaited.
    for task_type in (WORKER, EVALUATOR):
        threads[task_type][0].join()

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)
def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
                                                  eval_distribute_cls):
    """Run the train/eval flow on a cluster without parameter servers.

    Starts the cluster tasks through `_run_multiple_tasks_in_threads`, joins
    the first `WORKER` thread and then the first `EVALUATOR` thread, and
    hands a fresh estimator to `_inspect_train_and_eval_events`.

    Args:
      train_distribute_cls: strategy class, called with
        `num_gpus_per_worker=context.num_gpus()`.
      eval_distribute_cls: strategy class called the same way, or a falsy
        value to run with `eval_distribute=None`.
    """
    train_distribute = train_distribute_cls(
        num_gpus_per_worker=context.num_gpus())

    eval_distribute = None
    if eval_distribute_cls:
        eval_distribute = eval_distribute_cls(
            num_gpus_per_worker=context.num_gpus())

    # 3 workers and 1 evaluator.
    cluster_spec = multi_worker_test_base.create_cluster_spec(
        num_workers=3, num_ps=0, has_eval=True)
    self._barrier = dc._Barrier(4)

    threads = self._run_multiple_tasks_in_threads(
        cluster_spec, train_distribute, eval_distribute)

    # Only the first thread of each of these two task types is awaited.
    for task_type in (WORKER, EVALUATOR):
        threads[task_type][0].join()

    estimator = self._get_estimator(train_distribute, eval_distribute)
    self._inspect_train_and_eval_events(estimator)