def test_legacy_fit_model(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') model = create_legacy_xor_model() optimizer = torch.optim.SGD(model.parameters(), lr=0.1) loss = F.binary_cross_entropy with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, optimizer=optimizer, loss=loss, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], batch_size=4, epochs=2, verbose=2, sample_weight_col='weight') torch_model = torch_estimator.fit(df) trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_restore_from_checkpoint(self): model = create_xor_model() with spark_session('test_restore_from_checkpoint') as spark: df = create_noisy_xor_data(spark) ctx = CallbackBackend() run_id = 'run01' with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( backend=ctx, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, run_id=run_id) torch_estimator._read_checkpoint = Mock( side_effect=torch_estimator._read_checkpoint) ckpt_path = store.get_checkpoint_path(run_id) assert not store.exists(ckpt_path) torch_estimator._read_checkpoint.assert_not_called() torch_estimator.fit(df) assert store.exists(ckpt_path) torch_estimator.fit(df) torch_estimator._read_checkpoint.assert_called()
def test_fit_model(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') model = create_xor_model() with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, random_seed=1, verbose=2) torch_model = torch_estimator.fit(df) trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_train_with_inmemory_cache_all(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) model = create_xor_model() with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc= 1, # Normally inmem dataloader is for single worker training with small data store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, inmemory_cache_all=True) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_early_stop_callback(self): from pytorch_lightning.callbacks.early_stopping import EarlyStopping with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) model = create_xor_model() early_stop_callback = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=3, verbose=True, mode='max') callbacks = [early_stop_callback] with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, callbacks=callbacks) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_direct_parquet_train(self): with spark_session('test_direct_parquet_train') as spark: df = create_noisy_xor_data(spark) backend = CallbackBackend() with local_store() as store: store.get_train_data_path = lambda v=None: store._train_path store.get_val_data_path = lambda v=None: store._val_path with util.prepare_data(backend.num_processes(), store, df, feature_columns=['features'], label_columns=['y'], validation=0.2): model = create_xor_model() for inmemory_cache_all in [False, True]: est = hvd_spark.TorchEstimator( backend=backend, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=1, epochs=3, verbose=2, inmemory_cache_all=inmemory_cache_all) transformer = est.fit_on_parquet() predictions = transformer.transform(df) assert predictions.count() == df.count()
def test_legacy_fit_model(self): model = create_legacy_xor_model() optimizer = torch.optim.SGD(model.parameters(), lr=0.1) loss = F.binary_cross_entropy with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, optimizer=optimizer, loss=loss, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], batch_size=4, epochs=2, verbose=2, sample_weight_col='weight') torch_model = torch_estimator.fit(df) trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_model_checkpoint_callback(self): from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) model = create_xor_model() with tempdir() as dir: checkpoint_callback = ModelCheckpoint(dirpath=dir) callbacks = [checkpoint_callback] with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, callbacks=callbacks) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_fit_model(self): model = create_xor_model() with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2) torch_model = torch_estimator.fit(df) trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_train_with_pytorch_infinite_async_data_loader(self): from horovod.spark.data_loaders.pytorch_data_loaders import PytorchInfiniteAsyncDataLoader with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) model = create_xor_model() with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, data_loader_class=PytorchInfiniteAsyncDataLoader) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_model_override_trainer_args(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) model = create_xor_model() with tempdir() as dir: with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, trainer_args={'stochastic_weight_avg': True}) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_dummy_callback(self): from pytorch_lightning.callbacks import Callback model = create_xor_model() with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) for num_proc in [1, 2]: for epochs in [2, 3]: class MyDummyCallback(Callback): def __init__(self): self.epcoh_end_counter = 0 self.train_epcoh_end_counter = 0 def on_init_start(self, trainer): print('Starting to init trainer!') def on_init_end(self, trainer): print('Trainer is initialized.') def on_epoch_end(self, trainer, model): print('A epoch ended.') self.epcoh_end_counter += 1 def on_train_epoch_end(self, trainer, model, unused=None): print('A train epoch ended.') self.train_epcoh_end_counter += 1 def on_train_end(self, trainer, model): print('Training ends') assert self.train_epcoh_end_counter == epochs dm_callback = MyDummyCallback() callbacks = [dm_callback] with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=num_proc, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=epochs, verbose=2, callbacks=callbacks) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_lr_scheduler_callback(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') from pytorch_lightning.callbacks import LearningRateMonitor class LRTestingModel(XOR): def configure_optimizers(self): optimizer = torch.optim.Adam(model.parameters(), lr=0.02) def lambda_func(epoch): return epoch // 30 lr_scheduler = { 'scheduler': torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_func), 'name': 'my_logging_name' } return [optimizer], [lr_scheduler] model = LRTestingModel() with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) lr_monitor = LearningRateMonitor(logging_interval='step') callbacks = [lr_monitor] with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, callbacks=callbacks) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_legacy_restore_from_checkpoint(self): self.skipTest( 'There is a bug in current lightning version for checkpoint' 'call back. Will add this test back when it is solved.') model = create_legacy_xor_model() optimizer = torch.optim.SGD(model.parameters(), lr=0.1) loss = nn.BCELoss() with spark_session('test_restore_from_checkpoint') as spark: df = create_noisy_xor_data(spark) ctx = CallbackBackend() run_id = 'run01' with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( backend=ctx, store=store, model=model, optimizer=optimizer, loss=loss, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, run_id=run_id) torch_estimator._read_checkpoint = mock.Mock( side_effect=torch_estimator._read_checkpoint) ckpt_path = store.get_checkpoint_path(run_id) assert not store.exists(ckpt_path) torch_estimator._read_checkpoint.assert_not_called() torch_estimator.fit(df) assert store.exists(ckpt_path) torch_estimator.fit(df) torch_estimator._read_checkpoint.assert_called()
def test_terminate_on_nan_flag(self): model = create_xor_model() with spark_session('test_terminate_on_nan_flag') as spark: df = create_noisy_xor_data(spark) with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, verbose=2, terminate_on_nan=True, profiler="pytorch") assert torch_estimator.getTerminateOnNan() == True
def test_train_with_custom_data_module(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') from horovod.spark.data_loaders.pytorch_data_loaders import PytorchAsyncDataLoader class CustomDataModule(pl.LightningDataModule): """Custom DataModule for Lightning Estimator, using PytorchAsyncDataLoader""" def __init__(self, train_dir: str, val_dir: str, has_val: bool = True, train_batch_size: int = 32, val_batch_size: int = 32, shuffle_size: int = 100, num_reader_epochs=None, cur_shard: int = 0, shard_count: int = 1, schema_fields=None, storage_options=None, steps_per_epoch_train: int = 1, steps_per_epoch_val: int = 1, verbose=True, **kwargs): super().__init__() self.train_dir = train_dir self.val_dir = val_dir self.has_val = has_val self.train_batch_size = train_batch_size self.val_batch_size = val_batch_size self.shuffle_size = shuffle_size self.num_reader_epochs = num_reader_epochs self.cur_shard = cur_shard self.shard_count = shard_count self.schema_fields = schema_fields self.storage_options = storage_options self.steps_per_epoch_train = steps_per_epoch_train self.steps_per_epoch_val = steps_per_epoch_val self.verbose = verbose def setup(self, stage=None): # Assign train/val datasets for use in dataloaders from petastorm import make_batch_reader if stage == 'fit' or stage is None: self.train_reader = make_batch_reader( self.train_dir, num_epochs=self.num_reader_epochs, cur_shard=self.cur_shard, shard_count=self.shard_count, hdfs_driver='libhdfs', schema_fields=self.schema_fields, storage_options=self.storage_options) if self.has_val: self.val_reader = make_batch_reader( self.val_dir, num_epochs=self.num_reader_epochs, cur_shard=self.cur_shard, shard_count=self.shard_count, hdfs_driver='libhdfs', schema_fields=self.schema_fields, storage_options=self.storage_options) def teardown(self, stage=None): if stage == "fit" or stage is None: if self.verbose: print("Tear down petastorm readers") self.train_reader.stop() self.train_reader.join() if self.has_val: self.val_reader.stop() self.val_reader.join() def train_dataloader(self): if self.verbose: print("Setup train dataloader") kwargs = dict(reader=self.train_reader, batch_size=self.train_batch_size, name="train dataloader", shuffling_queue_capacity=self.shuffle_size, limit_step_per_epoch=self.steps_per_epoch_train, verbose=self.verbose) return PytorchAsyncDataLoader(**kwargs) def val_dataloader(self): if not self.has_val: return None if self.verbose: print("setup val dataloader") kwargs = dict(reader=self.val_reader, batch_size=self.val_batch_size, name="val dataloader", shuffling_queue_capacity=0, limit_step_per_epoch=self.steps_per_epoch_val, verbose=self.verbose) return PytorchAsyncDataLoader(**kwargs) with spark_session('test_fit_model') as spark: df = create_noisy_xor_data(spark) model = create_xor_model() with local_store() as store: torch_estimator = hvd_spark.TorchEstimator( num_proc=2, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], validation=0.2, batch_size=4, epochs=2, data_module=CustomDataModule, verbose=2) torch_model = torch_estimator.fit(df) # TODO: Find a way to pass log metrics from remote, and assert base on the logger. trained_model = torch_model.getModel() pred = trained_model(torch.ones([1, 2], dtype=torch.int32)) assert len(pred) == 1 assert pred.dtype == torch.float32
def test_direct_parquet_train_with_no_val_column(self): if skip_lightning_tests: self.skipTest( 'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: ' 'https://github.com/horovod/horovod/pull/3263') with spark_session( 'test_direct_parquet_train_with_no_val_column') as spark: df_train = create_noisy_xor_data(spark) df_val = create_noisy_xor_data(spark) def to_petastorm(df): metadata = None if util._has_vector_column(df): to_petastorm = util.to_petastorm_fn(["features", "y"], metadata) df = df.rdd.map(to_petastorm).toDF() return df df_train = to_petastorm(df_train) df_val = to_petastorm(df_val) df_train.show(1) print(df_train.count()) df_val.show(1) print(df_val.count()) backend = CallbackBackend() with local_store() as store: store.get_train_data_path = lambda v=None: store._train_path store.get_val_data_path = lambda v=None: store._val_path print(store.get_train_data_path()) print(store.get_val_data_path()) df_train \ .coalesce(4) \ .write \ .mode('overwrite') \ .parquet(store.get_train_data_path()) df_val \ .coalesce(4) \ .write \ .mode('overwrite') \ .parquet(store.get_val_data_path()) model = create_xor_model() inmemory_cache_all = True reader_pool_type = 'process' est = hvd_spark.TorchEstimator( backend=backend, store=store, model=model, input_shapes=[[-1, 2]], feature_cols=['features'], label_cols=['y'], batch_size=64, epochs=2, verbose=2, inmemory_cache_all=inmemory_cache_all, reader_pool_type=reader_pool_type) # set validation to any random strings would work. est.setValidation("True") transformer = est.fit_on_parquet() predictions = transformer.transform(df_train) assert predictions.count() == df_train.count()