def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
    model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath,
    **kwargs):
  extension = os.path.splitext(saving_filepath)[1]

  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' % (
          test_base.get_task_type(), test_base.get_task_index(), extension))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(
      training_state.checkpoint_exists(saving_filepath), test_base.is_chief())
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
    model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath,
    **kwargs):
  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  # TODO(b/134551335): Must save to hdf5 until bug with copying
  # MirroredVariables is resolved.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d.h5' % (
          test_base.get_task_type(), test_base.get_task_index()))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(os.path.exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
  model, _, train_ds, steps = _model_setup(test_obj, file_format='')
  num_epoch = 2

  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves summaries but non-chief doesn't.
  task_config = _get_task_config()
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(),
      'logfile_%s_%d' % (task_config['type'], task_config['index']))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(file_io.file_exists_v2(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

  # If it's chief, the summaries should be saved in the filepath; if not,
  # the directory should be empty (although created). Using
  # `file_io.list_directory()` since the directory may be created at this
  # point.
  test_obj.assertEqual(
      bool(file_io.list_directory_v2(saving_filepath)), test_base.is_chief())
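# A minimal standalone sketch (an assumption, not the original helpers) of what
# `_get_task_config()` and `test_base.is_chief()` boil down to: the worker's
# role comes from the `TF_CONFIG` environment variable, and worker 0 acts as
# chief when the cluster has no dedicated 'chief' task.
import json
import os


def get_task_config():
  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
  return tf_config.get('task', {})


def is_chief():
  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster = tf_config.get('cluster', {})
  task = tf_config.get('task', {})
  task_type = task.get('type', 'worker')
  task_id = int(task.get('index', 0))
  if 'chief' in cluster:
    return task_type == 'chief'
  return task_type == 'worker' and task_id == 0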
def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
    test_obj, file_format):
  model, saving_filepath, train_ds, steps = _model_setup(
      test_obj, file_format)
  num_epoch = 2
  extension = os.path.splitext(saving_filepath)[1]

  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' % (
          test_base.get_task_type(), test_base.get_task_index(), extension))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

  # Note: `save_weights_only` is not defined in this function; it is assumed
  # to be supplied by the enclosing test scope.
  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      validation_data=train_ds,
      validation_steps=steps,
      callbacks=[
          callbacks.ModelCheckpoint(
              filepath=saving_filepath, save_weights_only=save_weights_only)
      ])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(
      training_state.checkpoint_exists(saving_filepath), test_base.is_chief())

  # If it's chief, the model should be saved (`write_filepath` should simply
  # return `saving_filepath`); if not, i.e. for non-chief workers, the
  # temporary path generated by `write_filepath` should no longer contain the
  # checkpoint that has been deleted.
  test_obj.assertEqual(
      training_state.checkpoint_exists(
          distributed_file_utils.write_filepath(
              saving_filepath, model._distribution_strategy)),
      test_base.is_chief())
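# A rough sketch (an assumption about the convention the assertions above rely
# on, not the actual `distributed_file_utils.write_filepath` implementation):
# the chief writes to the user-supplied path, while non-chief workers write to
# a per-worker temporary directory that is cleaned up after saving.
import os


def write_filepath_sketch(filepath, task_type, task_id):
  is_chief = task_type is None or task_type == 'chief' or (
      task_type == 'worker' and task_id == 0)
  if is_chief:
    return filepath  # chief (or single-worker) saves to the real destination
  dirpath = os.path.dirname(filepath)
  base = os.path.basename(filepath)
  temp_dir = os.path.join(dirpath, 'workertemp_%d' % task_id)  # hypothetical
  os.makedirs(temp_dir, exist_ok=True)
  return os.path.join(temp_dir, base)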
def mocked_mkstemp():
  # Only non-chief should call tempfile.mkstemp() inside fit() in sync
  # training.
  assert not test_base.is_chief()
  file_handle, temp_file_name = real_mkstemp()
  extension = os.path.splitext(saving_filepath)[1]
  temp_filepath = temp_file_name + extension
  filepaths.append(temp_filepath)
  return file_handle, temp_file_name
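# A minimal standalone sketch (assumption, not the original test harness) of
# how a closure like `mocked_mkstemp` above is typically wired in: keep a
# reference to the real `tempfile.mkstemp` before patching, record the paths
# the mock hands out, and patch only for the duration of the call under test.
import os
import tempfile
from unittest import mock

real_mkstemp = tempfile.mkstemp          # handle to the unpatched function
filepaths = []                           # temp paths produced while patched
saving_filepath = '/tmp/checkpoint.h5'   # hypothetical checkpoint destination


def mocked_mkstemp():
  file_handle, temp_file_name = real_mkstemp()
  filepaths.append(temp_file_name + os.path.splitext(saving_filepath)[1])
  return file_handle, temp_file_name


with mock.patch.object(tempfile, 'mkstemp', mocked_mkstemp):
  # e.g. model.fit(...) would run here; any mkstemp() call inside is recorded.
  tempfile.mkstemp()

print(filepaths)  # the recorded temporary checkpoint paths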
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
    model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath):
  # Incorporate type/index information and thread id in saving_filepath to
  # ensure every worker has a unique path. Note that in normal use case the
  # saving_filepath will be the same for all workers, but we use different
  # ones here just to test out chief saves checkpoint but non-chief doesn't.
  saving_filepath = os.path.join(
      test_obj.get_temp_dir(), 'checkpoint_%s_%d' % (
          test_base.get_task_type(), test_base.get_task_index()))

  # The saving_filepath shouldn't exist at the beginning (as it's unique).
  test_obj.assertFalse(os.path.exists(saving_filepath))

  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)])

  # If it's chief, the model should be saved; if not, the model shouldn't.
  test_obj.assertEqual(os.path.exists(saving_filepath), test_base.is_chief())
def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
  with test.mock.patch.object(dc, '_run_std_server',
                              self._make_mock_run_std_server()):
    # Condition variable that blocks the thread that represents the
    # restarted chief.
    cv = kwargs.get('cv', None)
    # `before_restart` is True for the threads that represent the original
    # chief and non-chief worker, and False for threads that represent the
    # restarted chief and non-chief workers.
    before_restart = kwargs['before_restart']
    if kwargs['new_chief']:
      # `new_chief` is only True for the restarted chief thread. It waits
      # until non-chief is preempted and restarted to simulate the causality
      # where chief's restart results from non-chief's failure.
      cv.acquire()
      while not hasattr(cv, 'preempted'):
        cv.wait()
      cv.release()

    # Model building under strategy scope. Following is the code we expect
    # the user runs on every worker.
    strategy = get_strategy_object(strategy_cls)
    batch_size = 64
    steps = 3
    train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
    with strategy.scope():
      model = _get_model((28, 28, 1))

    # Function to start a new thread. This will be called twice in the
    # following code: one represents the restart of the non-chief, and one
    # represents the restart of the chief as a result of the restart of the
    # non-chief (so the training can continue in sync).
    def start_new_thread(new_chief=False):
      new_thread_tf_config = json.loads(os.environ['TF_CONFIG'])
      new_thread_tf_config['cluster']['worker'] = kwargs['reserved_ports']
      return self._run_task_in_thread(
          task_fn=_independent_worker_fn,
          cluster_spec=None,
          task_type=None,
          task_id=None,
          tf_config=new_thread_tf_config,
          before_restart=False,
          cv=cv,
          new_chief=new_chief)

    if test_base.is_chief() and before_restart:
      # Chief to start a new thread (that will be blocked by a condition
      # variable until the non-chief's new thread is started). The thread
      # for (recovered) chief is started before entering `fit()` because
      # the original chief thread will eventually hang and be ignored.
      start_new_thread(new_chief=True)

    try:

      class CkptSavedEpochAssertingCallback(callbacks.Callback):

        def __init__(self, test_obj):
          super(CkptSavedEpochAssertingCallback, self).__init__()
          self.test_obj = test_obj

        def on_epoch_begin(self, epoch, logs=None):
          # `_ckpt_saved_epoch` attribute is set at the end of every epoch.
          self.test_obj.assertEqual(self.model._ckpt_saved_epoch is None,
                                    epoch == 0)

      callbacks_list = [
          callbacks.ModelCheckpoint(
              filepath=saving_filepath,
              save_weights_only=True,
              load_weights_on_restart=True),
          CkptSavedEpochAssertingCallback(self)
      ]
      if before_restart:
        callbacks_list.append(preemption_callback())

      self.assertIsNone(model._ckpt_saved_epoch)
      history = model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=callbacks_list)
      self.assertIsNone(model._ckpt_saved_epoch)

      # `history` of the training result is collected to be compared against
      # each other. It is expected that the training results (loss and
      # accuracy) are the same with or without preemption.
      self._histories.append(history.history)

    except RuntimeError:  # pylint: disable=g-assert-in-except
      self.assertTrue(before_restart)
      # Reset the barrier so the new threads simulating recovery can
      # continue.
      self._barrier._counter = 0
      self._barrier._flag = False

      # Now that the non-chief has been preempted, it notifies the thread
      # that simulates the restarted chief to start so they can be back in
      # sync.
      cv.acquire()
      cv.preempted = True
      cv.notify()
      cv.release()

      # At this point we should discard the original non-chief thread, and
      # start the new thread that simulates the restarted non-chief, hence
      # joining the thread and return.
      self.join_independent_workers([start_new_thread()])
      return

    # Successful end of a `fit()` call.
    self._successful_thread_ends += 1
    self.assertFalse(before_restart)
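# A minimal standalone sketch (assumption) of the condition-variable handoff
# used above: the thread playing the "restarted chief" blocks until the
# preempted non-chief sets a `preempted` flag on the condition and notifies.
import threading
import time

cv = threading.Condition()


def restarted_chief():
  with cv:
    while not getattr(cv, 'preempted', False):
      cv.wait()
  print('restarted chief resumes training')


def preempted_non_chief():
  time.sleep(0.1)  # pretend some training happened before preemption
  with cv:
    cv.preempted = True
    cv.notify()


t = threading.Thread(target=restarted_chief)
t.start()
preempted_non_chief()
t.join()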
def on_epoch_begin(self, epoch, logs=None):
  if epoch == 1 and not test_base.is_chief():
    # Simulate preemption at the start of the second epoch.
    raise RuntimeError('Preemption!')
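# A minimal sketch (assumption, not the original test's `preemption_callback`)
# of how a preemption-simulating hook like the method above is typically
# packaged: subclass `tf.keras.callbacks.Callback` and raise from
# `on_epoch_begin` on the chosen worker and epoch.
import tensorflow as tf


class PreemptionCallback(tf.keras.callbacks.Callback):

  def __init__(self, is_chief):
    super().__init__()
    self._is_chief = is_chief

  def on_epoch_begin(self, epoch, logs=None):
    # Simulate preemption at the start of the second epoch on non-chief
    # workers only.
    if epoch == 1 and not self._is_chief:
      raise RuntimeError('Preemption!')

# Usage (hypothetical): model.fit(..., callbacks=[PreemptionCallback(is_chief)])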