def test_csv_logger_callback(self, setup_save_path, model_config, number_epochs):
    """Compare the CSV logs produced by the tnt model and the reference model.

    The tnt model is fitted on every rank with a ``CSVLogger`` callback that
    writes to ``<setup_save_path>/training_tnt.csv``. On the master rank only,
    the reference model is fitted the same way (logging to
    ``training_ref.csv``) and the values read back from both files are
    compared with ``np.allclose`` (``atol=1e-6``). The outcome is then passed
    to ``util.assert_on_all_ranks``.
    """
    train_dataset, val_dataset = train_val_dataset_generator()
    ref_train_dataset, ref_val_dataset = train_val_dataset_generator()

    base_path = os.path.join(setup_save_path, "training")
    tnt_model_runner, reference_model_runner = gen_model_runners(model_config)
    fit_kwargs = dict(epochs=number_epochs, verbose=0, shuffle=False)

    tnt_csv = f"{base_path}_tnt.csv"
    tnt_logger = tf.keras.callbacks.CSVLogger(tnt_csv)
    tnt_model_runner.model.fit(train_dataset,
                               validation_data=val_dataset,
                               callbacks=[tnt_logger],
                               **fit_kwargs)

    # Ranks other than the master have nothing to compare and report success.
    outcome = [True]
    if tnt.is_master_rank():
        ref_csv = f"{base_path}_ref.csv"
        ref_logger = tf.keras.callbacks.CSVLogger(ref_csv)
        reference_model_runner.model.fit(ref_train_dataset,
                                         validation_data=ref_val_dataset,
                                         callbacks=[ref_logger],
                                         **fit_kwargs)

        logged_tnt = util.get_metric_values_from_file(tnt_csv)
        logged_ref = util.get_metric_values_from_file(ref_csv)
        outcome = np.allclose(logged_tnt, logged_ref, atol=1e-6)
    util.assert_on_all_ranks(outcome)
def assert_identical_tnt_and_ref_history(tnt_history, ref_history):
    """Check that two training histories agree for every reference key.

    On the master rank, each key of ``ref_history.history`` is looked up in
    both histories and the two value sequences are compared element-wise with
    ``np.isclose`` (``atol=1e-6``). The per-key outcomes are collapsed into a
    single-element list, which is handed to ``util.assert_on_all_ranks``.
    Other ranks hand over ``[True]``.
    """
    outcome = [True]
    if tnt.is_master_rank():
        per_key_match = [
            all(np.isclose(tnt_history.history[name],
                           ref_history.history[name],
                           atol=1e-6))
            for name in ref_history.history.keys()
        ]
        # all() of an empty list is True, matching the original [True] seed.
        outcome = [all(per_key_match)]
    util.assert_on_all_ranks(outcome)
def test_early_stopping_callback(self, model_config, number_epochs):
    """Check that early stopping ends both trainings after the same epoch count.

    Both models are trained through
    ``train_tnt_and_ref_models_with_callbacks`` with an ``EarlyStopping``
    callback monitoring ``val_loss`` (``min_delta=0.1``, ``patience=1``).
    On the master rank the number of recorded ``val_loss`` entries in the two
    histories must be equal.
    """
    monitored = 'val_loss'
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitored,
                                                      min_delta=0.1,
                                                      patience=1)
    tnt_history, reference_history = train_tnt_and_ref_models_with_callbacks(
        [early_stopping], model_config, number_epochs)

    # Both models are expected to run for the same number of epochs.
    same_epoch_count = [True]
    if tnt.is_master_rank():
        tnt_epochs = len(tnt_history.history[monitored])
        ref_epochs = len(reference_history.history[monitored])
        same_epoch_count = (tnt_epochs == ref_epochs)
    util.assert_on_all_ranks(same_epoch_count)
def test_optimizers_compare_to_reference(self, model_config, optimizer,
                                         micro_batch_size, nbatches,
                                         number_epochs):
    """Compare tnt and reference training histories for a given optimizer.

    Histories come from ``train_tnt_and_reference_models``. On the master rank
    the ``'loss'`` series must agree within ``atol=1e-4`` and the series stored
    under ``metric`` within ``atol=1e-6``. ``metric`` is not bound in this
    function and is resolved from the enclosing scope.
    """
    tnt_history, ref_history = train_tnt_and_reference_models(
        model_config, optimizer, micro_batch_size, nbatches, number_epochs)

    checks = [True, True]
    if tnt.is_master_rank():
        tnt_log = tnt_history.history
        ref_log = ref_history.history
        loss_matches = np.allclose(tnt_log['loss'], ref_log['loss'], atol=1e-4)
        metric_matches = np.allclose(tnt_log[metric], ref_log[metric],
                                     atol=1e-6)
        checks = [loss_matches, metric_matches]
    util.assert_on_all_ranks(checks)
def test_compare_accuracy_against_reference(self, model_runners,
                                            micro_batch_size, number_epochs,
                                            nbatches, test_nbatches,
                                            remainder_samples_per_batch,
                                            last_incomplete_batch_size):
    """Train both runners on MNIST and compare their evaluation results.

    Two dataset pairs are built with identical arguments via
    ``util.train_test_mnist_datasets`` (``shuffle=False``). The reference
    runner is trained first, then the tnt runner, and both are evaluated.
    Each rank logs both results. On the master rank, element 0 of the results
    must agree within ``atol=1e-2`` and element 1 within ``atol=1e-6``; the
    log messages label these as loss and accuracy.
    """
    dataset_kwargs = dict(
        nbatches=nbatches,
        test_nbatches=test_nbatches,
        micro_batch_size=micro_batch_size,
        shuffle=False,
        remainder_samples_per_batch=remainder_samples_per_batch,
        last_incomplete_batch_size=last_incomplete_batch_size,
    )
    train_dataset, test_dataset = util.train_test_mnist_datasets(
        **dataset_kwargs)
    ref_train_dataset, ref_test_dataset = util.train_test_mnist_datasets(
        **dataset_kwargs)

    tnt_model_runner, reference_model_runner = model_runners

    # Training order is kept as in the original: reference first, then tnt.
    reference_model_runner.train_model(ref_train_dataset, number_epochs)
    tnt_model_runner.train_model(train_dataset, number_epochs)

    tnt_loss_accuracy = tnt_model_runner.evaluate_model(test_dataset)
    ref_loss_accuracy = reference_model_runner.evaluate_model(ref_test_dataset)

    rank = tnt.get_rank()
    root_logger = logging.getLogger()
    root_logger.info(
        f"[Rank {rank}] Tarantella[loss, accuracy] = {tnt_loss_accuracy}")
    root_logger.info(
        f"[Rank {rank}] Reference [loss, accuracy] = {ref_loss_accuracy}")

    checks = [True, True]
    if tnt.is_master_rank():
        # Losses might not be identical, hence the looser tolerance.
        loss_close = np.isclose(tnt_loss_accuracy[0], ref_loss_accuracy[0],
                                atol=1e-2)
        accuracy_close = np.isclose(tnt_loss_accuracy[1], ref_loss_accuracy[1],
                                    atol=1e-6)
        checks = [loss_close, accuracy_close]
    util.assert_on_all_ranks(checks)
def test_progbar_logger_callback_inference(self, model_config, number_epochs,
                                           use_explicit_progbarlogger, verbose,
                                           exec_type, capsys):
    """Compare progress-bar output of tnt and reference models at inference.

    Depending on ``exec_type`` (``'evaluate'`` or ``'predict'``), the matching
    method is called on the tnt model and then on the reference model, with
    an explicit ``ProgbarLogger(count_mode='steps')`` callback when
    ``use_explicit_progbarlogger`` is truthy. The output captured through
    ``capsys`` after each run is parsed with ``util.get_metrics_from_stdout``.

    On the master rank the parsed metrics must agree within ``atol=1e-6``.
    On every other rank the output captured after the tnt run must be empty
    on both stdout and stderr.
    """

    def build_callbacks():
        # A fresh callback list per model, as in the original.
        if use_explicit_progbarlogger:
            return [tf.keras.callbacks.ProgbarLogger(count_mode='steps')]
        return []

    def run_inference(model, dataset, callbacks):
        # Any other exec_type value runs nothing, as in the original.
        if exec_type == 'evaluate':
            model.evaluate(dataset, callbacks=callbacks, verbose=verbose)
        elif exec_type == 'predict':
            model.predict(dataset, callbacks=callbacks, verbose=verbose)

    # Only the second element of each generated pair is used below.
    _, test_dataset = train_val_dataset_generator()
    _, ref_test_dataset = train_val_dataset_generator()

    tnt_callbacks = build_callbacks()
    ref_callbacks = build_callbacks()

    tnt_model_runner, ref_model_runner = gen_model_runners(model_config)

    run_inference(tnt_model_runner.model, test_dataset, tnt_callbacks)
    tnt_captured = capsys.readouterr()
    tnt_metrics = util.get_metrics_from_stdout(
        tnt_captured.out, tnt_model_runner.model.metrics_names)

    run_inference(ref_model_runner.model, ref_test_dataset, ref_callbacks)
    ref_captured = capsys.readouterr()
    ref_metrics = util.get_metrics_from_stdout(
        ref_captured.out, ref_model_runner.model.metrics_names)

    if tnt.is_master_rank():
        outcome = all(np.isclose(tnt_metrics, ref_metrics, atol=1e-6))
    else:
        stdout_empty = tnt_captured.out == ""
        stderr_empty = tnt_captured.err == ""
        outcome = all([stdout_empty, stderr_empty])
    util.assert_on_all_ranks(outcome)
def test_tensorboard_callback(self, setup_save_path, model_config,
                              number_epochs):
    """Check that fitting with a TensorBoard callback creates its log folders.

    The tnt model is fitted with ``TensorBoard(log_dir=setup_save_path)``.
    On the master rank, both ``<setup_save_path>/train`` and
    ``<setup_save_path>/validation`` must exist as directories afterwards.
    """
    train_dataset, val_dataset = train_val_dataset_generator()
    tnt_model_runner, _ = gen_model_runners(model_config)

    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=setup_save_path)
    tnt_model_runner.model.fit(train_dataset,
                               validation_data=val_dataset,
                               epochs=number_epochs,
                               callbacks=[tensorboard])

    outcome = [True]
    if tnt.is_master_rank():
        dirs_present = [
            os.path.isdir(os.path.join(setup_save_path, subdir))
            for subdir in ("train", "validation")
        ]
        outcome = [all(dirs_present)]
    util.assert_on_all_ranks(outcome)
def test_sgd_momentum_compare_to_reference(self, model_config, nesterov,
                                           momentum, micro_batch_size,
                                           nbatches, number_epochs):
    """Compare tnt and reference training histories for SGD with momentum.

    ``keras.optimizers.SGD`` is passed to ``train_tnt_and_reference_models``
    together with keyword arguments ``learning_rate=0.01`` and the given
    ``momentum`` and ``nesterov`` values. On the master rank the ``'loss'``
    series must agree within ``atol=1e-4`` and the series stored under
    ``metric`` within ``atol=1e-6``. ``metric`` is not bound in this function
    and is resolved from the enclosing scope.
    """
    sgd_kwargs = dict(learning_rate=0.01, momentum=momentum, nesterov=nesterov)
    tnt_history, ref_history = train_tnt_and_reference_models(
        model_config, keras.optimizers.SGD, micro_batch_size, nbatches,
        number_epochs, sgd_kwargs)

    checks = [True, True]
    if tnt.is_master_rank():
        # (history key, absolute tolerance) pairs, compared in this order.
        tolerances = (('loss', 1e-4), (metric, 1e-6))
        checks = [
            np.allclose(tnt_history.history[key],
                        ref_history.history[key],
                        atol=tolerance)
            for key, tolerance in tolerances
        ]
    util.assert_on_all_ranks(checks)