def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
        model, test_obj, train_ds, num_epoch, steps, strategy,
        saving_filepath, **kwargs):
    """Assert that `ModelCheckpoint` leaves a checkpoint on the chief only.

    Only the file extension of the incoming `saving_filepath` is used; the
    path actually checked is rebuilt under `test_obj.get_temp_dir()` from the
    task type and task index so that every worker looks at its own file.
    In normal use all workers share one path; distinct paths are used here
    purely to tell the chief's checkpoint apart from a non-chief's.

    Args:
      model: Object whose `fit` method is called to run training.
      test_obj: Test case supplying `get_temp_dir`, `assertFalse` and
        `assertEqual`.
      train_ds: Forwarded to `model.fit` as `x`.
      num_epoch: Forwarded to `model.fit` as `epochs`.
      steps: Forwarded to `model.fit` as `steps_per_epoch`.
      strategy: Not used by this function.
      saving_filepath: Path whose extension is reused for the per-worker
        checkpoint path.
      **kwargs: Accepted and ignored.
    """
    _, extension = os.path.splitext(saving_filepath)

    # Build a per-worker path from the task type and index.
    temp_dir = test_obj.get_temp_dir()
    checkpoint_name = 'checkpoint_%s_%d%s' % (
        test_base.get_task_type(), test_base.get_task_index(), extension)
    saving_filepath = os.path.join(temp_dir, checkpoint_name)

    # The path is unique to this worker, so nothing may exist there yet.
    existed_before = training_state.checkpoint_exists(saving_filepath)
    test_obj.assertFalse(existed_before)

    checkpoint_callback = callbacks.ModelCheckpoint(filepath=saving_filepath)
    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[checkpoint_callback])

    # A checkpoint must exist afterwards exactly when this task is the chief.
    exists_after = training_state.checkpoint_exists(saving_filepath)
    test_obj.assertEqual(exists_after, test_base.is_chief())
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
        model, test_obj, train_ds, num_epoch, steps, strategy,
        saving_filepath, **kwargs):
    """Assert that `ModelCheckpoint` writes an `.h5` file on the chief only.

    The incoming `saving_filepath` is discarded and replaced by a per-worker
    path under `test_obj.get_temp_dir()` built from the task type and task
    index. In normal use all workers share one path; distinct paths are used
    here purely to tell the chief's checkpoint apart from a non-chief's.

    Args:
      model: Object whose `fit` method is called to run training.
      test_obj: Test case supplying `get_temp_dir`, `assertFalse` and
        `assertEqual`.
      train_ds: Forwarded to `model.fit` as `x`.
      num_epoch: Forwarded to `model.fit` as `epochs`.
      steps: Forwarded to `model.fit` as `steps_per_epoch`.
      strategy: Not used by this function.
      saving_filepath: Overwritten before use.
      **kwargs: Accepted and ignored.
    """
    # TODO(b/134551335): Must save to hdf5 until bug with copying
    # MirroredVariables is resolved.
    temp_dir = test_obj.get_temp_dir()
    checkpoint_name = 'checkpoint_%s_%d.h5' % (
        test_base.get_task_type(), test_base.get_task_index())
    saving_filepath = os.path.join(temp_dir, checkpoint_name)

    # The path is unique to this worker, so nothing may exist there yet.
    existed_before = os.path.exists(saving_filepath)
    test_obj.assertFalse(existed_before)

    checkpoint_callback = callbacks.ModelCheckpoint(filepath=saving_filepath)
    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[checkpoint_callback])

    # The file must exist afterwards exactly when this task is the chief.
    exists_after = os.path.exists(saving_filepath)
    test_obj.assertEqual(exists_after, test_base.is_chief())
def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
    """Assert that the `TensorBoard` callback writes summaries on the chief only.

    A model and dataset are obtained from `_model_setup`, trained for two
    epochs with a `TensorBoard` callback pointing at a per-worker log
    directory, and the directory contents are then compared against
    `test_base.is_chief()`.

    Args:
      test_obj: Test case supplying `get_temp_dir`, `assertFalse` and
        `assertEqual`; also handed to `_model_setup`.
    """
    model, _, train_ds, steps = _model_setup(test_obj, file_format='')
    num_epoch = 2

    # In normal use every worker would share one log directory. A distinct
    # one per task type/index is used here so the chief's output can be told
    # apart from that of the other workers.
    temp_dir = test_obj.get_temp_dir()
    log_dir_name = 'logfile_%s_%d' % (
        test_base.get_task_type(), test_base.get_task_index())
    saving_filepath = os.path.join(temp_dir, log_dir_name)

    # The path is unique to this worker, so nothing may exist there yet.
    test_obj.assertFalse(file_io.file_exists(saving_filepath))

    tensorboard_callback = callbacks.TensorBoard(log_dir=saving_filepath)
    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[tensorboard_callback])

    # On the chief the directory should hold summaries; elsewhere it should
    # be empty, although it may have been created. Listing the directory
    # (rather than testing for existence) accounts for that.
    has_summaries = bool(file_io.list_directory(saving_filepath))
    test_obj.assertEqual(has_summaries, test_base.is_chief())
def testSimpleInputFromFnLastPartialBatch(self, strategy):
    """Check the trailing partial batch of a dataset built from a function.

    Each input pipeline batches `range(14)` by its per-replica share of a
    global batch size of 8 (keeping the remainder) and then shards the
    batches. The first step's result is discarded; the second step's result
    is compared with the values expected for this worker's task index.

    Args:
      strategy: Distribution strategy used to distribute the dataset
        function and to run the identity step.
    """

    def dataset_fn(input_context):
        global_batch_size = 8
        per_replica_batch_size = input_context.get_per_replica_batch_size(
            global_batch_size)
        batched = dataset_ops.DatasetV2.range(14).batch(
            per_replica_batch_size, drop_remainder=False)
        return batched.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    distributed_dataset = (
        strategy.experimental_distribute_datasets_from_function(dataset_fn))
    input_iterator = iter(distributed_dataset)

    @def_function.function
    def run(input_iterator):
        return strategy.run(lambda x: x, args=(next(input_iterator),))

    # First step: consume the complete batch and ignore it.
    run(input_iterator)

    # Second step: this one yields the incomplete batch.
    result = run(input_iterator)

    expected_data_on_worker = [[8, 9, 10, 11], [12, 13]]
    actual = result.numpy()
    expected = expected_data_on_worker[
        multi_worker_test_base.get_task_index()]
    self.assertTrue(np.array_equal(actual, expected))
def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
        test_obj, file_format):
    """Assert that `ModelCheckpoint` leaves a checkpoint on the chief only.

    A model, a template checkpoint path, a dataset and a step count come
    from `_model_setup`. Only the extension of the template path is kept;
    the path actually checked is rebuilt per worker from the task type and
    task index. Training runs for two epochs with validation on the same
    dataset.

    Note: `save_weights_only` is neither a parameter nor a local of this
    function; it is looked up from the surrounding scope.

    Args:
      test_obj: Test case supplying `get_temp_dir`, `assertFalse` and
        `assertEqual`; also handed to `_model_setup`.
      file_format: Forwarded to `_model_setup`.
    """
    model, saving_filepath, train_ds, steps = _model_setup(
        test_obj, file_format)
    num_epoch = 2
    _, extension = os.path.splitext(saving_filepath)

    # In normal use every worker would share one checkpoint path. A distinct
    # one per task type/index is used here so the chief's checkpoint can be
    # told apart from that of the other workers.
    temp_dir = test_obj.get_temp_dir()
    checkpoint_name = 'checkpoint_%s_%d%s' % (
        test_base.get_task_type(), test_base.get_task_index(), extension)
    saving_filepath = os.path.join(temp_dir, checkpoint_name)

    # The path is unique to this worker, so nothing may exist there yet.
    existed_before = training_state.checkpoint_exists(saving_filepath)
    test_obj.assertFalse(existed_before)

    checkpoint_callback = callbacks.ModelCheckpoint(
        filepath=saving_filepath, save_weights_only=save_weights_only)
    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        validation_data=train_ds,
        validation_steps=steps,
        callbacks=[checkpoint_callback])

    # A checkpoint must exist afterwards exactly when this task is the chief.
    exists_after = training_state.checkpoint_exists(saving_filepath)
    test_obj.assertEqual(exists_after, test_base.is_chief())

    # Same expectation for the path produced by `write_filepath`: on the
    # chief it is expected to simply be `saving_filepath`; on non-chief
    # workers it is a temporary path whose checkpoint should already have
    # been deleted.
    write_path = distributed_file_utils.write_filepath(
        saving_filepath, model._distribution_strategy)
    write_path_has_checkpoint = training_state.checkpoint_exists(write_path)
    test_obj.assertEqual(write_path_has_checkpoint, test_base.is_chief())
def testSimpleInputFromDatasetLastPartialBatch(self, strategy):
    """Check the trailing partial batch of a distributed dataset.

    `range(14)` is batched with a global batch size of 8 (keeping the
    remainder) and distributed through the strategy. The first step's result
    is discarded; the second step's result is compared with the values
    expected for this worker's task index.

    Args:
      strategy: Distribution strategy used to distribute the dataset and to
        run the identity step.
    """
    global_batch_size = 8
    batched = dataset_ops.DatasetV2.range(14).batch(
        global_batch_size, drop_remainder=False)
    distributed_dataset = strategy.experimental_distribute_dataset(batched)
    input_iterator = iter(distributed_dataset)

    @def_function.function
    def run(input_iterator):
        return strategy.run(lambda x: x, args=(next(input_iterator),))

    # First step: consume the complete batch and ignore it.
    run(input_iterator)

    # Second step: this one yields the incomplete batch.
    result = run(input_iterator)

    expected_data_on_worker = [[8, 9, 10], [11, 12, 13]]
    actual = result.numpy()
    expected = expected_data_on_worker[
        multi_worker_test_base.get_task_index()]
    self.assertTrue(np.array_equal(actual, expected))
def callableForTestModelCheckpointSavesOnChiefButNotOtherwise(
        model, test_obj, train_ds, num_epoch, steps, strategy,
        saving_filepath):
    """Assert that `ModelCheckpoint` writes to its path on the chief only.

    The incoming `saving_filepath` is discarded and replaced by a per-worker
    path under `test_obj.get_temp_dir()` built from the task type and task
    index. In normal use all workers share one path; distinct paths are used
    here purely to tell the chief's checkpoint apart from a non-chief's.

    Args:
      model: Object whose `fit` method is called to run training.
      test_obj: Test case supplying `get_temp_dir`, `assertFalse` and
        `assertEqual`.
      train_ds: Forwarded to `model.fit` as `x`.
      num_epoch: Forwarded to `model.fit` as `epochs`.
      steps: Forwarded to `model.fit` as `steps_per_epoch`.
      strategy: Not used by this function.
      saving_filepath: Overwritten before use.
    """
    temp_dir = test_obj.get_temp_dir()
    checkpoint_name = 'checkpoint_%s_%d' % (
        test_base.get_task_type(), test_base.get_task_index())
    saving_filepath = os.path.join(temp_dir, checkpoint_name)

    # The path is unique to this worker, so nothing may exist there yet.
    existed_before = os.path.exists(saving_filepath)
    test_obj.assertFalse(existed_before)

    checkpoint_callback = callbacks.ModelCheckpoint(filepath=saving_filepath)
    model.fit(
        x=train_ds,
        epochs=num_epoch,
        steps_per_epoch=steps,
        callbacks=[checkpoint_callback])

    # The path must exist afterwards exactly when this task is the chief.
    exists_after = os.path.exists(saving_filepath)
    test_obj.assertEqual(exists_after, test_base.is_chief())
def testDatasetFromFunction(self, strategy):
    """Check the sum of the first batch of a dataset built from a function.

    Each input pipeline batches a repeated `range(100)` by its per-replica
    share of a global batch size of 10 and then shards the batches. The
    local results of the first element are summed and compared with the
    value expected for this worker's task index.

    Args:
      strategy: Distribution strategy used to distribute the dataset
        function and to collect local results.
    """

    def dataset_fn(input_context):
        global_batch_size = 10
        per_replica_batch_size = input_context.get_per_replica_batch_size(
            global_batch_size)
        batched = dataset_ops.DatasetV2.range(100).repeat().batch(
            per_replica_batch_size)
        return batched.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    expected_sum_on_workers = [10, 35]

    distributed_dataset = (
        strategy.experimental_distribute_datasets_from_function(dataset_fn))
    input_iterator = iter(distributed_dataset)

    @def_function.function
    def run(iterator):
        return strategy.experimental_local_results(iterator.get_next())

    result = run(input_iterator)
    sum_value = math_ops.reduce_sum(result)

    actual_sum = sum_value.numpy()
    expected_sum = expected_sum_on_workers[
        multi_worker_test_base.get_task_index()]
    self.assertEqual(actual_sum, expected_sum)
def testDatasetFromFunction(self, strategy):
    """Check the first batch of a dataset built from a function.

    Each input pipeline batches a repeated `range(100)` by its per-replica
    share of a global batch size of 10 and then shards the batches. The
    first local result of the first element is compared with the values
    expected for this worker's task index.

    Args:
      strategy: Distribution strategy used to distribute the dataset
        function and to collect local results.
    """

    def dataset_fn(input_context):
        global_batch_size = 10
        per_replica_batch_size = input_context.get_per_replica_batch_size(
            global_batch_size)
        batched = dataset_ops.DatasetV2.range(100).repeat().batch(
            per_replica_batch_size)
        return batched.shard(
            input_context.num_input_pipelines,
            input_context.input_pipeline_id)

    expected_data_on_worker = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]

    distributed_dataset = (
        strategy.experimental_distribute_datasets_from_function(dataset_fn))
    input_iterator = iter(distributed_dataset)

    @def_function.function
    def run(iterator):
        return strategy.experimental_local_results(iterator.get_next())

    result = run(input_iterator)

    actual = result[0].numpy()
    expected = expected_data_on_worker[
        multi_worker_test_base.get_task_index()]
    self.assertTrue(np.array_equal(actual, expected))
def wrapped_method(method_to_wrap, name, *arg, **kwargs):
    """Count a call under `name` for the current task, then make the call.

    The counter lives in `self._task_dict`, keyed by task type, task index
    and `name`; `self` is taken from the surrounding scope. The return value
    of `method_to_wrap` is not propagated.

    Args:
      method_to_wrap: Callable invoked with `*arg` and `**kwargs`.
      name: Key of the counter to increment.
      *arg: Positional arguments forwarded to `method_to_wrap`.
      **kwargs: Keyword arguments forwarded to `method_to_wrap`.
    """
    # The lock guards only the read-modify-write of the counter.
    with self._lock:
        per_type = self._task_dict[test_base.get_task_type()]
        per_task = per_type[test_base.get_task_index()]
        per_task[name] += 1

    method_to_wrap(*arg, **kwargs)
def wrapped_method(method_to_wrap, name, *arg, **kwargs):
    """Count a call under `name` for the current task, then make the call.

    The counter lives in `self._task_dict`, keyed by task type, task index
    and `name`; `self` is taken from the surrounding scope. The return value
    of `method_to_wrap` is not propagated.

    Args:
      method_to_wrap: Callable invoked with `*arg` and `**kwargs`.
      name: Key of the counter to increment.
      *arg: Positional arguments forwarded to `method_to_wrap`.
      **kwargs: Keyword arguments forwarded to `method_to_wrap`.
    """
    # The lock guards only the read-modify-write of the counter.
    with self._lock:
        per_type = self._task_dict[test_base.get_task_type()]
        per_task = per_type[test_base.get_task_index()]
        per_task[name] += 1

    method_to_wrap(*arg, **kwargs)