def test_direct_parquet_train(self):
    """Fit a TorchEstimator directly on prepared Parquet data.

    ``fit_on_parquet`` is exercised once with ``inmemory_cache_all`` disabled
    and once with it enabled; each resulting transformer must keep the row
    count of the input DataFrame.
    """
    with spark_session('test_direct_parquet_train') as spark:
        noisy_df = create_noisy_xor_data(spark)
        callback_backend = CallbackBackend()

        with local_store() as store:
            # Make the store's path getters return its private path
            # attributes regardless of the argument they receive.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            prepared = util.prepare_data(
                callback_backend.num_processes(), store, noisy_df,
                feature_columns=['features'],
                label_columns=['y'],
                validation=0.2)
            with prepared:
                xor_net = create_xor_model()

                for cache_flag in (False, True):
                    estimator = hvd_spark.TorchEstimator(
                        backend=callback_backend,
                        store=store,
                        model=xor_net,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        inmemory_cache_all=cache_flag)

                    fitted = estimator.fit_on_parquet()
                    scored = fitted.transform(noisy_df)
                    assert scored.count() == noisy_df.count()
def test_fit_model(self):
    """Fit a TorchEstimator on XOR data with a sample-weight column.

    After training on two processes, the fitted model is called on a single
    ``[1, 2]`` int32 tensor; the output must have length 1 and dtype float32.
    """
    xor_net = create_xor_model()
    sgd = torch.optim.SGD(xor_net.parameters(), lr=0.1)
    loss_fn = F.binary_cross_entropy

    with spark_session('test_fit_model') as spark:
        xor_df = create_xor_data(spark)

        with local_store() as store:
            estimator_kwargs = dict(
                num_proc=2,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=loss_fn,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                random_seed=1,
                verbose=2,
                sample_weight_col='weight')
            estimator = hvd_spark.TorchEstimator(**estimator_kwargs)

            fitted = estimator.fit(xor_df)

            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_fit_model(self):
    """Fit a TorchEstimator on XOR data, skipping on Python 2 with Gloo.

    The trained model is called on a ``[1, 2]`` int32 tensor and must return
    one float32 prediction.
    """
    on_python2 = sys.version_info < (3, 0, 0)
    if on_python2 and is_gloo_used():
        self.skipTest(
            'Horovod on Spark over Gloo only supported on Python3')

    xor_net = create_xor_model()
    sgd = torch.optim.SGD(xor_net.parameters(), lr=0.1)
    loss_fn = F.binary_cross_entropy

    with spark_session('test_fit_model') as spark:
        xor_df = create_xor_data(spark)

        with local_store() as store:
            estimator = hvd.TorchEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=loss_fn,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2,
                sample_weight_col='weight')

            fitted = estimator.fit(xor_df)

            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_torch_direct_parquet_train(self):
    """Fit a TorchEstimator on prepared Parquet data and check row counts.

    The loss is deliberately supplied through ``setLoss`` rather than the
    constructor so that a single (non-list) loss is exercised.
    """
    with spark_session('test_torch_direct_parquet_train') as spark:
        xor_df = create_xor_data(spark)
        callback_backend = CallbackBackend()

        with local_store() as store:
            # Make the store's path getters return its private path
            # attributes regardless of the argument they receive.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            prepared = util.prepare_data(
                callback_backend.num_processes(), store, xor_df,
                feature_columns=['features'],
                label_columns=['y'])
            with prepared:
                xor_net = create_xor_model()
                sgd = torch.optim.SGD(xor_net.parameters(), lr=0.1)
                bce = nn.BCELoss()

                estimator = hvd_spark.TorchEstimator(
                    backend=callback_backend,
                    store=store,
                    model=xor_net,
                    optimizer=sgd,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                # To make sure that setLoss works with non-list loss.
                estimator.setLoss(bce)

                fitted = estimator.fit_on_parquet()
                scored = fitted.transform(xor_df)
                assert scored.count() == xor_df.count()
def test_fit_model(self):
    """Fit a TorchEstimator on noisy XOR data with a validation split.

    Skipped when ``skip_lightning_tests`` is set. The trained model is called
    on a ``[1, 2]`` int32 tensor and must return one float32 prediction.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    xor_net = create_xor_model()

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                random_seed=1,
                verbose=2)

            fitted = estimator.fit(noisy_df)

            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_fit_model(self):
    """Fit a KerasEstimator on XOR data and predict on a single sample.

    The prediction for a ``[1, 2]`` float32 array must have length 1 and
    dtype float32.
    """
    xor_net = create_xor_model()
    sgd = tf.keras.optimizers.SGD(lr=0.1)
    loss_name = 'binary_crossentropy'

    with spark_session('test_fit_model') as spark:
        xor_df = create_xor_data(spark)

        with local_store() as store:
            estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=loss_name,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2)

            fitted = estimator.fit(xor_df)

            net = fitted.getModel()
            output = net.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(output) == 1
            assert output.dtype == np.float32
def test_train_with_pytorch_infinite_async_data_loader(self):
    """Fit a TorchEstimator using ``PytorchInfiniteAsyncDataLoader``.

    The loader class is handed to the estimator via ``data_loader_class``;
    the trained model must return one float32 prediction for a ``[1, 2]``
    int32 input.
    """
    from horovod.spark.data_loaders.pytorch_data_loaders import PytorchInfiniteAsyncDataLoader

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)
        xor_net = create_xor_model()

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                data_loader_class=PytorchInfiniteAsyncDataLoader)

            fitted = estimator.fit(noisy_df)

            # TODO: Find a way to pass log metrics from remote, and assert
            # based on the logger.
            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_fit_model(self):
    """Fit a KerasEstimator on XOR data, skipping on Python 2 with Gloo.

    The prediction for a ``[1, 2]`` float32 array must have length 1 and
    dtype float32.
    """
    on_python2 = sys.version_info < (3, 0, 0)
    if on_python2 and is_gloo_used():
        self.skipTest(
            'Horovod on Spark over Gloo only supported on Python3')

    xor_net = create_xor_model()
    sgd = tf.keras.optimizers.SGD(lr=0.1)
    loss_name = 'binary_crossentropy'

    with spark_session('test_fit_model') as spark:
        xor_df = create_xor_data(spark)

        with local_store() as store:
            estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=loss_name,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2)

            fitted = estimator.fit(xor_df)

            net = fitted.getModel()
            output = net.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(output) == 1
            assert output.dtype == np.float32
def test_model_override_trainer_args(self):
    """Fit a TorchEstimator with extra ``trainer_args``.

    ``{'stochastic_weight_avg': True}`` is passed through ``trainer_args``.
    Skipped when ``skip_lightning_tests`` is set. The trained model must
    return one float32 prediction for a ``[1, 2]`` int32 input.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)
        xor_net = create_xor_model()

        # The temporary directory is entered but its path is not used below.
        with tempdir() as scratch_dir:
            with local_store() as store:
                estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=xor_net,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    trainer_args={'stochastic_weight_avg': True})

                fitted = estimator.fit(noisy_df)

                # TODO: Find a way to pass log metrics from remote, and
                # assert based on the logger.
                net = fitted.getModel()
                output = net(torch.ones([1, 2], dtype=torch.int32))
                assert len(output) == 1
                assert output.dtype == torch.float32
def test_train_with_inmemory_cache_all(self):
    """Fit a TorchEstimator with ``inmemory_cache_all=True`` on one process.

    Skipped when ``skip_lightning_tests`` is set. The trained model must
    return one float32 prediction for a ``[1, 2]`` int32 input.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)
        xor_net = create_xor_model()

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                # Normally inmem dataloader is for single worker training
                # with small data
                num_proc=1,
                store=store,
                model=xor_net,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                inmemory_cache_all=True)

            fitted = estimator.fit(noisy_df)

            # TODO: Find a way to pass log metrics from remote, and assert
            # based on the logger.
            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    """Fit a KerasEstimator on prepared Parquet data with mocked helpers.

    Args:
        mock_fit_fn: injected mock; its return value is set to
            ``get_mock_fit_fn()``.
        mock_pin_gpu_fn: injected mock; its return value is set to a fresh
            ``mock.Mock()``.

    The transformer produced by ``fit_on_parquet`` must keep the row count of
    the input DataFrame.
    """
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_direct_parquet_train') as spark:
        xor_df = create_xor_data(spark)
        callback_backend = CallbackBackend()

        with local_store() as store:
            # Make the store's path getters return its private path
            # attributes regardless of the argument they receive.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            prepared = util.prepare_data(
                callback_backend.num_processes(), store, xor_df,
                feature_columns=['features'],
                label_columns=['y'])
            with prepared:
                xor_net = create_xor_model()
                sgd = tf.keras.optimizers.SGD(lr=0.1)
                loss_name = 'binary_crossentropy'

                estimator = hvd.KerasEstimator(
                    backend=callback_backend,
                    store=store,
                    model=xor_net,
                    optimizer=sgd,
                    loss=loss_name,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                fitted = estimator.fit_on_parquet()
                scored = fitted.transform(xor_df)
                assert scored.count() == xor_df.count()
def test_fit_model_multiclass(self):
    """Fit a multiclass KerasEstimator on MNIST data for two core counts.

    Runs with 2 cores and with ``constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1``
    cores. For each run every input row must get a prediction, and for the
    first row the predicted label index must point at the largest
    probability.
    """
    mnist_net = create_mnist_model()
    adadelta = tf.keras.optimizers.Adadelta(1.0)
    loss_fn = tf.keras.losses.categorical_crossentropy

    core_counts = [2, constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1]
    for num_cores in core_counts:
        with spark_session('test_fit_model_multiclass', cores=num_cores) as spark:
            mnist_df = create_mnist_data(spark)

            with local_store() as store:
                estimator = hvd.KerasEstimator(
                    num_proc=num_cores,
                    store=store,
                    model=mnist_net,
                    optimizer=adadelta,
                    loss=loss_fn,
                    metrics=['accuracy'],
                    feature_cols=['features'],
                    label_cols=['label_vec'],
                    batch_size=2,
                    epochs=2,
                    verbose=2)

                fitted = estimator.fit(mnist_df).setOutputCols(['label_prob'])
                scored_df = fitted.transform(mnist_df)

                # Add a column holding the index of the highest probability.
                to_label = udf(lambda v: float(np.argmax(v)),
                               returnType=T.DoubleType())
                scored_df = scored_df.withColumn(
                    'label_pred', to_label(scored_df.label_prob))

                rows = scored_df.collect()
                assert len(rows) == mnist_df.count()

                first = rows[0]
                probabilities = first.label_prob.toArray().tolist()
                assert probabilities[int(first.label_pred)] == max(probabilities)
def test_early_stop_callback(self):
    """Fit a TorchEstimator with a Lightning ``EarlyStopping`` callback.

    The callback monitors ``val_loss``. The trained model must return one
    float32 prediction for a ``[1, 2]`` int32 input.
    """
    from pytorch_lightning.callbacks.early_stopping import EarlyStopping

    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)
        model = create_xor_model()

        # A loss improves when it goes down, so the monitored direction must
        # be 'min'; 'max' would count a rising loss as improvement.
        early_stop_callback = EarlyStopping(monitor='val_loss',
                                            min_delta=0.00,
                                            patience=3,
                                            verbose=True,
                                            mode='min')
        callbacks = [early_stop_callback]

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                callbacks=callbacks)

            torch_model = torch_estimator.fit(df)

            # TODO: Find a way to pass log metrics from remote, and assert
            # based on the logger.
            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def test_model_checkpoint_callback(self):
    """Fit a TorchEstimator with a Lightning ``ModelCheckpoint`` callback.

    The callback is given a temporary directory as ``dirpath``. The trained
    model must return one float32 prediction for a ``[1, 2]`` int32 input.
    """
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)
        xor_net = create_xor_model()

        with tempdir() as ckpt_dir:
            callback_list = [ModelCheckpoint(dirpath=ckpt_dir)]

            with local_store() as store:
                estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=xor_net,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callback_list)

                fitted = estimator.fit(noisy_df)

                # TODO: Find a way to pass log metrics from remote, and
                # assert based on the logger.
                net = fitted.getModel()
                output = net(torch.ones([1, 2], dtype=torch.int32))
                assert len(output) == 1
                assert output.dtype == torch.float32
def test_fit_model(self):
    """Fit a TorchEstimator on noisy XOR data with a 0.2 validation split.

    The trained model is called on a ``[1, 2]`` int32 tensor and must return
    one float32 prediction.
    """
    xor_net = create_xor_model()

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2)

            fitted = estimator.fit(noisy_df)

            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_restore_from_checkpoint(self):
    """Check that a second ``fit`` with the same run id reads the checkpoint.

    ``_read_checkpoint`` is wrapped in a pass-through ``Mock``. No checkpoint
    exists before the first ``fit``; one exists afterwards, and after the
    second ``fit`` the wrapped reader must have been called.
    """
    xor_net = create_xor_model()

    with spark_session('test_restore_from_checkpoint') as spark:
        noisy_df = create_noisy_xor_data(spark)
        callback_backend = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                backend=callback_backend,
                store=store,
                model=xor_net,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                run_id=run_id)

            # Spy on the checkpoint reader while still delegating to it.
            real_reader = estimator._read_checkpoint
            estimator._read_checkpoint = Mock(side_effect=real_reader)

            checkpoint_path = store.get_checkpoint_path(run_id)
            assert not store.exists(checkpoint_path)
            estimator._read_checkpoint.assert_not_called()

            estimator.fit(noisy_df)
            assert store.exists(checkpoint_path)

            estimator.fit(noisy_df)
            estimator._read_checkpoint.assert_called()
def test_legacy_fit_model(self):
    """Fit a TorchEstimator with the legacy XOR model.

    An explicit optimizer, loss and sample-weight column are supplied.
    Skipped when ``skip_lightning_tests`` is set. The trained model must
    return one float32 prediction for a ``[1, 2]`` int32 input.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    legacy_net = create_legacy_xor_model()
    sgd = torch.optim.SGD(legacy_net.parameters(), lr=0.1)
    loss_fn = F.binary_cross_entropy

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=legacy_net,
                optimizer=sgd,
                loss=loss_fn,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=4,
                epochs=2,
                verbose=2,
                sample_weight_col='weight')

            fitted = estimator.fit(noisy_df)

            net = fitted.getModel()
            output = net(torch.ones([1, 2], dtype=torch.int32))
            assert len(output) == 1
            assert output.dtype == torch.float32
def test_dummy_callback(self):
    """Fit a TorchEstimator with a custom counting Lightning callback.

    Runs every combination of ``num_proc`` in (1, 2) and ``epochs`` in
    (2, 3). The callback counts epoch-end events and asserts at the end of
    training that the train-epoch count equals ``epochs``.
    """
    from pytorch_lightning.callbacks import Callback

    xor_net = create_xor_model()

    with spark_session('test_fit_model') as spark:
        noisy_df = create_noisy_xor_data(spark)

        for num_proc in (1, 2):
            for epochs in (2, 3):

                class MyDummyCallback(Callback):
                    """Prints on each hook and counts epoch-end events."""

                    def __init__(self):
                        self.epoch_end_counter = 0
                        self.train_epoch_end_counter = 0

                    def on_init_start(self, trainer):
                        print('Starting to init trainer!')

                    def on_init_end(self, trainer):
                        print('Trainer is initialized.')

                    def on_epoch_end(self, trainer, model):
                        print('A epoch ended.')
                        self.epoch_end_counter += 1

                    def on_train_epoch_end(self, trainer, model, unused=None):
                        print('A train epoch ended.')
                        self.train_epoch_end_counter += 1

                    def on_train_end(self, trainer, model):
                        print('Training ends')
                        # `epochs` is the enclosing loop variable.
                        assert self.train_epoch_end_counter == epochs

                callback_list = [MyDummyCallback()]

                with local_store() as store:
                    estimator = hvd_spark.TorchEstimator(
                        num_proc=num_proc,
                        store=store,
                        model=xor_net,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=4,
                        epochs=epochs,
                        verbose=2,
                        callbacks=callback_list)

                    fitted = estimator.fit(noisy_df)

                    # TODO: Find a way to pass log metrics from remote, and
                    # assert based on the logger.
                    net = fitted.getModel()
                    output = net(torch.ones([1, 2], dtype=torch.int32))
                    assert len(output) == 1
                    assert output.dtype == torch.float32
def test_lr_scheduler_callback(self):
    """Fit a TorchEstimator with an LR scheduler and ``LearningRateMonitor``.

    A subclass of ``XOR`` overrides ``configure_optimizers`` to return an
    Adam optimizer plus a named ``LambdaLR`` scheduler. Skipped when
    ``skip_lightning_tests`` is set. The trained model must return one
    float32 prediction for a ``[1, 2]`` int32 input.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    from pytorch_lightning.callbacks import LearningRateMonitor

    class LRTestingModel(XOR):
        def configure_optimizers(self):
            # Optimize this module's own parameters. Going through the
            # enclosing test's `model` variable would tie the optimizer to
            # whatever object that closure happens to reference.
            optimizer = torch.optim.Adam(self.parameters(), lr=0.02)

            # NOTE(review): LambdaLR multiplies the base lr by this value,
            # which is 0 for every epoch below 30 -- confirm that a zero
            # learning rate is intended for this 2-epoch run.
            def lambda_func(epoch):
                return epoch // 30

            lr_scheduler = {
                'scheduler': torch.optim.lr_scheduler.LambdaLR(
                    optimizer, lr_lambda=lambda_func),
                'name': 'my_logging_name'
            }
            return [optimizer], [lr_scheduler]

    model = LRTestingModel()

    with spark_session('test_fit_model') as spark:
        df = create_noisy_xor_data(spark)

        lr_monitor = LearningRateMonitor(logging_interval='step')
        callbacks = [lr_monitor]

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                callbacks=callbacks)

            torch_model = torch_estimator.fit(df)

            # TODO: Find a way to pass log metrics from remote, and assert
            # based on the logger.
            trained_model = torch_model.getModel()
            pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
            assert len(pred) == 1
            assert pred.dtype == torch.float32
def test_torch_direct_parquet_train(self):
    """Fit a TorchEstimator on prepared Parquet data across a config matrix.

    Covers ``validation`` in (None, 'val'), ``inmemory_cache_all`` in
    (False, True) and ``reader_pool_type`` in ('process', 'thread'). Each
    fitted transformer must keep the row count of the input DataFrame.
    """
    with spark_session('test_torch_direct_parquet_train') as spark:
        xor_df = create_xor_data_with_val(spark)
        callback_backend = CallbackBackend()

        with local_store() as store:
            # Make the store's path getters return its private path
            # attributes regardless of the argument they receive.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Make sure we cover validation dataloader as well
            for validation in (None, 'val'):
                # Need validation ratio to split data
                prepared = util.prepare_data(
                    callback_backend.num_processes(), store, xor_df,
                    feature_columns=['features'],
                    label_columns=['y'],
                    validation=validation)
                with prepared:
                    xor_net = create_xor_model()
                    sgd = torch.optim.SGD(xor_net.parameters(), lr=0.1)
                    bce = nn.BCELoss()

                    for cache_flag in (False, True):
                        for pool_type in ('process', 'thread'):
                            estimator = hvd_spark.TorchEstimator(
                                backend=callback_backend,
                                store=store,
                                model=xor_net,
                                optimizer=sgd,
                                input_shapes=[[2]],
                                feature_cols=['features'],
                                label_cols=['y'],
                                batch_size=1,
                                epochs=3,
                                verbose=2,
                                reader_pool_type=pool_type,
                                inmemory_cache_all=cache_flag,
                                validation=validation)

                            # To make sure that setLoss works with non-list
                            # loss.
                            estimator.setLoss(bce)

                            fitted = estimator.fit_on_parquet()
                            scored = fitted.transform(xor_df)
                            assert scored.count() == xor_df.count()
def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    """Fit a KerasEstimator on prepared Parquet data across a config matrix.

    Args:
        mock_fit_fn: injected mock; its return value is set to
            ``get_mock_fit_fn()``.
        mock_pin_gpu_fn: injected mock; its return value is set to a fresh
            ``mock.Mock()``.

    Covers ``validation`` in (None, 'val'), ``inmemory_cache_all`` in
    (False, True) and ``reader_pool_type`` in ('process', 'thread'). Each
    fitted transformer must keep the row count of the input DataFrame.
    """
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_direct_parquet_train') as spark:
        xor_df = create_xor_data_with_val(spark)
        callback_backend = CallbackBackend()

        with local_store() as store:
            # Make the store's path getters return its private path
            # attributes regardless of the argument they receive.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Make sure we cover val dataloader cases
            for validation in (None, 'val'):
                prepared = util.prepare_data(
                    callback_backend.num_processes(), store, xor_df,
                    feature_columns=['features'],
                    label_columns=['y'],
                    validation=validation)
                with prepared:
                    xor_net = create_xor_model()
                    sgd = tf.keras.optimizers.SGD(lr=0.1)
                    loss_name = 'binary_crossentropy'

                    for cache_flag in (False, True):
                        for pool_type in ('process', 'thread'):
                            estimator = hvd.KerasEstimator(
                                backend=callback_backend,
                                store=store,
                                model=xor_net,
                                optimizer=sgd,
                                loss=loss_name,
                                feature_cols=['features'],
                                label_cols=['y'],
                                batch_size=1,
                                epochs=3,
                                reader_pool_type=pool_type,
                                validation=validation,
                                inmemory_cache_all=cache_flag,
                                verbose=2)

                            fitted = estimator.fit_on_parquet()
                            scored = fitted.transform(xor_df)
                            assert scored.count() == xor_df.count()
def test_restore_from_checkpoint(self, mock_fit_fn, mock_pin_gpu_fn):
    """Check that a second Keras ``fit`` with the same run id loads the checkpoint.

    Args:
        mock_fit_fn: injected mock; its return value is set to
            ``get_mock_fit_fn()``.
        mock_pin_gpu_fn: injected mock; its return value is set to a fresh
            ``mock.Mock()``.

    ``_load_model_from_checkpoint`` is wrapped in a pass-through mock. No
    checkpoint exists before the first ``fit``; one exists afterwards, and
    after the second ``fit`` the wrapped loader must have been called.
    """
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    xor_net = create_xor_model()
    sgd = tf.keras.optimizers.SGD(lr=0.1)
    loss_name = 'binary_crossentropy'

    with spark_session('test_restore_from_checkpoint') as spark:
        xor_df = create_xor_data(spark)
        callback_backend = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            estimator = hvd.KerasEstimator(
                backend=callback_backend,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=loss_name,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=3,
                verbose=2,
                run_id=run_id)

            # Spy on the checkpoint loader while still delegating to it.
            real_loader = estimator._load_model_from_checkpoint
            estimator._load_model_from_checkpoint = mock.Mock(
                side_effect=real_loader)

            checkpoint_path = store.get_checkpoint_path(run_id)
            assert not store.exists(checkpoint_path)
            estimator._load_model_from_checkpoint.assert_not_called()

            fitted = estimator.fit(xor_df)

            net = fitted.getModel()
            output = net.predict([np.ones([1, 2], dtype=np.float64)])
            assert len(output) == 1

            assert store.exists(checkpoint_path)

            estimator.fit(xor_df)
            estimator._load_model_from_checkpoint.assert_called()
def test_direct_parquet_train(self):
    """Fit a TorchEstimator on prepared Parquet data across a config matrix.

    Skipped when ``skip_lightning_tests`` is set. Covers ``validation`` in
    (None, 'val'), ``inmemory_cache_all`` in (False, True) and
    ``reader_pool_type`` in ('process', 'thread'). Each fitted transformer
    must keep the row count of the input DataFrame.
    """
    if skip_lightning_tests:
        self.skipTest(
            'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
            'https://github.com/horovod/horovod/pull/3263')

    with spark_session('test_direct_parquet_train') as spark:
        noisy_df = create_noisy_xor_data_with_val(spark)
        callback_backend = CallbackBackend()

        with local_store() as store:
            # Make the store's path getters return its private path
            # attributes regardless of the argument they receive.
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Make sure to cover val dataloader cases
            for validation in (None, 'val'):
                prepared = util.prepare_data(
                    callback_backend.num_processes(), store, noisy_df,
                    feature_columns=['features'],
                    label_columns=['y'],
                    validation=validation)
                with prepared:
                    xor_net = create_xor_model()

                    for cache_flag in (False, True):
                        for pool_type in ('process', 'thread'):
                            estimator = hvd_spark.TorchEstimator(
                                backend=callback_backend,
                                store=store,
                                model=xor_net,
                                input_shapes=[[-1, 2]],
                                feature_cols=['features'],
                                label_cols=['y'],
                                validation=validation,
                                batch_size=1,
                                epochs=3,
                                verbose=2,
                                inmemory_cache_all=cache_flag,
                                reader_pool_type=pool_type)

                            fitted = estimator.fit_on_parquet()
                            scored = fitted.transform(noisy_df)
                            assert scored.count() == noisy_df.count()
def test_legacy_restore_from_checkpoint(self):
    """Check checkpoint restore for the legacy XOR model (currently skipped).

    The test is skipped unconditionally on its first statement. The body
    below is kept so it can be re-enabled: it wraps ``_read_checkpoint`` in a
    pass-through mock, fits twice with the same run id, and expects the
    reader to be called once a checkpoint exists.
    """
    # The adjacent literals previously had no separating space, so the
    # message rendered as "...for checkpointcall back.".
    self.skipTest(
        'There is a bug in current lightning version for checkpoint '
        'call back. Will add this test back when it is solved.')

    model = create_legacy_xor_model()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss = nn.BCELoss()

    with spark_session('test_restore_from_checkpoint') as spark:
        df = create_noisy_xor_data(spark)

        ctx = CallbackBackend()

        run_id = 'run01'
        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                backend=ctx,
                store=store,
                model=model,
                optimizer=optimizer,
                loss=loss,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                run_id=run_id)

            # Spy on the checkpoint reader while still delegating to it.
            torch_estimator._read_checkpoint = mock.Mock(
                side_effect=torch_estimator._read_checkpoint)

            ckpt_path = store.get_checkpoint_path(run_id)
            assert not store.exists(ckpt_path)
            torch_estimator._read_checkpoint.assert_not_called()
            torch_estimator.fit(df)

            assert store.exists(ckpt_path)
            torch_estimator.fit(df)
            torch_estimator._read_checkpoint.assert_called()
def test_model_serialization(self, mock_remote_trainer):
    """Save and reload a KerasEstimator, then fit the reloaded copy.

    Args:
        mock_remote_trainer: injected mock; its return value is replaced by
            a stub that hands back the serialized model it received.

    The backend and store are supplied at ``fit`` time through ``params``.
    The prediction for a ``[1, 2]`` float32 array must have length 1 and
    dtype float32.
    """
    xor_net = create_xor_model()
    sgd = tf.keras.optimizers.SGD(lr=0.1)
    loss_name = 'binary_crossentropy'

    def fake_train(serialized_model, train_rows, val_rows, avg_row_size):
        # Return the incoming serialized model unchanged, with None and 2
        # in the first and last positions of the result tuple.
        return None, serialized_model, 2

    mock_remote_trainer.return_value = fake_train

    with spark_session('test_model_serialization') as spark:
        xor_df = create_xor_data(spark)

        estimator = hvd.KerasEstimator(
            model=xor_net,
            optimizer=sgd,
            loss=loss_name,
            feature_cols=['features'],
            label_cols=['y'],
            batch_size=1,
            epochs=3,
            verbose=2)

        callback_backend = CallbackBackend()
        with local_store() as store:
            with temppath() as saved_path:
                estimator.save(saved_path)
                reloaded = hvd.KerasEstimator.load(saved_path)

            fit_params = {
                reloaded.backend: callback_backend,
                reloaded.store: store
            }
            fitted = reloaded.fit(xor_df, params=fit_params)

            net = fitted.getModel()
            output = net.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(output) == 1
            assert output.dtype == np.float32
def test_terminate_on_nan_flag(self):
    """Check that ``terminate_on_nan=True`` is reported by its getter.

    The estimator is only constructed, never fitted.
    """
    model = create_xor_model()

    with spark_session('test_terminate_on_nan_flag') as spark:
        df = create_noisy_xor_data(spark)

        with local_store() as store:
            torch_estimator = hvd_spark.TorchEstimator(
                num_proc=2,
                store=store,
                model=model,
                input_shapes=[[-1, 2]],
                feature_cols=['features'],
                label_cols=['y'],
                validation=0.2,
                batch_size=4,
                epochs=2,
                verbose=2,
                terminate_on_nan=True,
                profiler="pytorch")

            # Assert truthiness directly instead of comparing with `== True`.
            assert torch_estimator.getTerminateOnNan()
def test_restore_from_checkpoint(self):
    """Check that a second Torch ``fit`` with the same run id loads the checkpoint.

    ``_load_checkpoint`` is wrapped in a pass-through mock. No checkpoint
    exists before the first ``fit``; one exists afterwards, and after the
    second ``fit`` the wrapped loader must have been called.
    """
    xor_net = create_xor_model()
    sgd = torch.optim.SGD(xor_net.parameters(), lr=0.1)
    bce = nn.BCELoss()

    with spark_session('test_restore_from_checkpoint') as spark:
        xor_df = create_xor_data(spark)
        callback_backend = CallbackBackend()
        run_id = 'run01'

        with local_store() as store:
            estimator = hvd_spark.TorchEstimator(
                backend=callback_backend,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=bce,
                input_shapes=[[2]],
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                epochs=1,
                verbose=2,
                run_id=run_id)

            # Spy on the checkpoint loader while still delegating to it.
            real_loader = estimator._load_checkpoint
            estimator._load_checkpoint = mock.Mock(side_effect=real_loader)

            checkpoint_path = store.get_checkpoint_path(run_id)
            assert not store.exists(checkpoint_path)
            estimator._load_checkpoint.assert_not_called()

            estimator.fit(xor_df)
            assert store.exists(checkpoint_path)

            estimator.fit(xor_df)
            estimator._load_checkpoint.assert_called()
def test_fit_model(self):
    """Fit a KerasEstimator and check the GPU / start-method accessors.

    The estimator is built with ``use_gpu=False`` and
    ``mp_start_method='spawn'``; the getters must reflect both, and
    ``setMpStartMethod('forkserver')`` must be visible through the getter
    before fitting. The prediction for a ``[1, 2]`` float32 array must have
    length 1 and dtype float32.
    """
    xor_net = create_xor_model()
    sgd = tf.keras.optimizers.SGD(lr=0.1)
    loss_name = 'binary_crossentropy'

    with spark_session('test_fit_model') as spark:
        xor_df = create_xor_data(spark)

        with local_store() as store:
            estimator = hvd.KerasEstimator(
                num_proc=2,
                store=store,
                model=xor_net,
                optimizer=sgd,
                loss=loss_name,
                feature_cols=['features'],
                label_cols=['y'],
                batch_size=1,
                random_seed=1,
                epochs=3,
                verbose=2,
                use_gpu=False,
                mp_start_method='spawn')

            assert not estimator.getUseGpu()
            assert 'spawn' == estimator.getMpStartMethod()

            estimator.setMpStartMethod('forkserver')
            assert 'forkserver' == estimator.getMpStartMethod()

            fitted = estimator.fit(xor_df)

            net = fitted.getModel()
            output = net.predict([np.ones([1, 2], dtype=np.float32)])
            assert len(output) == 1
            assert output.dtype == np.float32
def test_df_cache(self):
    """Exercise the training-data cache through nested ``prepare_data`` calls.

    Checks that distinct DataFrames get distinct cache keys, that a dataset
    index is taken while its context is open and reused after it closes,
    that a previously prepared DataFrame is served via ``get_dataset``, and
    that a different validation setting yields an uncached key.
    """
    # Clean the cache before starting the test
    util.clear_training_cache()

    # Spy on get_dataset while still delegating to the real method.
    # patch.object puts the original back when the block exits, so the mock
    # does not stay on the module-level cache for later tests.
    get_dataset_spy = mock.Mock(side_effect=util._training_cache.get_dataset)
    with mock.patch.object(util._training_cache, 'get_dataset', get_dataset_spy):
        with spark_session('test_df_cache') as spark:
            with local_store() as store:
                df = create_xor_data(spark)
                df2 = create_xor_data(spark)
                df3 = create_xor_data(spark)

                key = util._training_cache.create_key(df, store, None)
                key2 = util._training_cache.create_key(df2, store, None)
                key3 = util._training_cache.create_key(df3, store, None)

                # All keys are distinct
                assert key != key2
                assert key != key3
                assert key2 != key3

                # The cache should be empty to start
                assert not util._training_cache.is_cached(key, store)
                assert not util._training_cache.is_cached(key2, store)
                assert not util._training_cache.is_cached(key3, store)

                # First insertion into the cache
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx:
                    train_rows, val_rows, metadata, avg_row_size = \
                        util.get_dataset_properties(dataset_idx)
                    util._training_cache.get_dataset.assert_not_called()
                    assert len(util._training_cache._key_to_dataset) == 1
                    assert util._training_cache.is_cached(key, store)
                    assert dataset_idx == 0

                    # The first dataset is still in use, so we assign the
                    # next integer in sequence to this dataset
                    assert not util._training_cache.is_cached(key2, store)
                    with util.prepare_data(num_processes=2,
                                           store=store,
                                           df=df2,
                                           feature_columns=['features'],
                                           label_columns=['y']) as dataset_idx2:
                        util._training_cache.get_dataset.assert_not_called()
                        assert len(util._training_cache._key_to_dataset) == 2
                        assert util._training_cache.is_cached(key2, store)
                        assert dataset_idx2 == 1

                # Even though the first dataset is no longer in use, it is
                # still cached
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx1:
                    train_rows1, val_rows1, metadata1, avg_row_size1 = \
                        util.get_dataset_properties(dataset_idx1)
                    util._training_cache.get_dataset.assert_called()
                    assert train_rows == train_rows1
                    assert val_rows == val_rows1
                    assert metadata == metadata1
                    assert avg_row_size == avg_row_size1
                    assert dataset_idx1 == 0

                # The first dataset is no longer in use, so we can reclaim
                # its dataset index
                assert not util._training_cache.is_cached(key3, store)
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df3,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx3:
                    train_rows3, val_rows3, metadata3, avg_row_size3 = \
                        util.get_dataset_properties(dataset_idx3)
                    assert train_rows == train_rows3
                    assert val_rows == val_rows3
                    assert metadata == metadata3
                    assert avg_row_size == avg_row_size3
                    assert dataset_idx3 == 0

                # Same dataframe, different validation
                bad_key = util._training_cache.create_key(df, store, 0.1)
                assert not util._training_cache.is_cached(bad_key, store)
def test_prepare_data_compress_sparse(self):
    """Check the metadata ``prepare_data`` reports with ``compress_sparse=True``.

    A two-row DataFrame with one float column and three vector columns
    (dense only, sparse only, mixed) is prepared; the metadata returned by
    ``get_dataset_properties`` must equal the expected per-column dict.
    """
    util.clear_training_cache()

    def column_meta(spark_type, sparse_only, fmt, max_size, shape):
        # Build one per-column metadata entry in the expected layout.
        return {
            'spark_data_type': spark_type,
            'is_sparse_vector_only': sparse_only,
            'intermediate_format': fmt,
            'max_size': max_size,
            'shape': shape
        }

    expected_metadata = {
        'float': column_meta(FloatType, False, constants.NOCHANGE, 1, 1),
        'dense': column_meta(DenseVector, False, constants.ARRAY, 2, 2),
        'sparse': column_meta(SparseVector, True, constants.CUSTOM_SPARSE, 1, 2),
        'mixed': column_meta(DenseVector, False, constants.ARRAY, 2, 2),
    }

    # Spy on _get_metadata while still delegating to the real function.
    with mock.patch('horovod.spark.common.util._get_metadata',
                    side_effect=util._get_metadata) as mock_get_metadata:
        with spark_session('test_prepare_data') as spark:
            first_row = [
                0.0,
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                DenseVector([1.0, 1.0])
            ]
            second_row = [
                1.0,
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                SparseVector(2, {1: 1.0})
            ]
            data = [first_row, second_row]

            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            sample_df = create_test_data_from_schema(spark, data, schema)

            with local_store() as store:
                prepared = util.prepare_data(
                    num_processes=2,
                    store=store,
                    df=sample_df,
                    feature_columns=['dense', 'sparse', 'mixed'],
                    label_columns=['float'],
                    compress_sparse=True)
                with prepared as dataset_idx:
                    mock_get_metadata.assert_called()
                    assert dataset_idx == 0

                    properties = util.get_dataset_properties(dataset_idx)
                    train_rows, val_rows, metadata, avg_row_size = properties
                    self.assertDictEqual(metadata, expected_metadata)