def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_direct_parquet_train') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = tf.keras.optimizers.SGD(lr=0.1)
                loss = 'binary_crossentropy'

                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
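# Illustrative sketch (not part of the original suite): the minimal
# prepare_data() + fit_on_parquet() flow the test above exercises, written as
# it might look in application code. `store`, `train_df`, `model`, and
# `optimizer` are assumed to be supplied by the caller; with no explicit
# backend, the estimator falls back to its default Spark backend.
def _example_keras_fit_on_parquet(store, train_df, model, optimizer):
    with util.prepare_data(num_processes=2,
                           store=store,
                           df=train_df,
                           feature_columns=['features'],
                           label_columns=['y']):
        est = hvd.KerasEstimator(store=store,
                                 model=model,
                                 optimizer=optimizer,
                                 loss='binary_crossentropy',
                                 feature_cols=['features'],
                                 label_cols=['y'],
                                 batch_size=32,
                                 epochs=5)
        # fit_on_parquet() skips DataFrame preparation inside fit() and
        # trains directly on the Parquet files prepare_data() materialized.
        return est.fit_on_parquet()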
def test_torch_direct_parquet_train(self):
    with spark_session('test_torch_direct_parquet_train') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                loss = nn.BCELoss()

                est = hvd_spark.TorchEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                # To make sure that setLoss works with non-list loss.
                est.setLoss(loss)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
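# The setter exercised above accepts either a single loss or a list of losses
# (one per label column). A hedged sketch of the equivalence, assuming a
# hypothetical two-label model:
#
#   est.setLoss(nn.BCELoss())                  # scalar, wrapped internally
#   est.setLoss([nn.BCELoss(), nn.MSELoss()])  # explicit list, one per label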
def test_direct_parquet_train(self):
    with spark_session('test_direct_parquet_train') as spark:
        df = create_noisy_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y'],
                                   validation=0.2):
                model = create_xor_model()

                for inmemory_cache_all in [False, True]:
                    est = hvd_spark.TorchEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        inmemory_cache_all=inmemory_cache_all)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
def test_torch_direct_parquet_train(self):
    with spark_session('test_torch_direct_parquet_train') as spark:
        df = create_xor_data_with_val(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Make sure we cover validation dataloader as well
            for validation in [None, 'val']:
                # Need validation ratio to split data
                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y'],
                                       validation=validation):
                    model = create_xor_model()
                    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                    loss = nn.BCELoss()

                    for inmemory_cache_all in [False, True]:
                        for reader_pool_type in ['process', 'thread']:
                            est = hvd_spark.TorchEstimator(
                                backend=backend,
                                store=store,
                                model=model,
                                optimizer=optimizer,
                                input_shapes=[[2]],
                                feature_cols=['features'],
                                label_cols=['y'],
                                batch_size=1,
                                epochs=3,
                                verbose=2,
                                reader_pool_type=reader_pool_type,
                                inmemory_cache_all=inmemory_cache_all,
                                validation=validation)

                            # To make sure that setLoss works with non-list loss.
                            est.setLoss(loss)

                            transformer = est.fit_on_parquet()
                            predictions = transformer.transform(df)
                            assert predictions.count() == df.count()
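# Design note: the nested loops above sweep a small parameter grid. An
# equivalent, flatter formulation (a sketch only, not a change to the test):
#
#   import itertools
#   for inmemory_cache_all, reader_pool_type in itertools.product(
#           [False, True], ['process', 'thread']):
#       ...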
def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
    mock_fit_fn.return_value = get_mock_fit_fn()
    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_direct_parquet_train') as spark:
        df = create_xor_data_with_val(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Make sure we cover val dataloader cases
            for validation in [None, 'val']:
                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y'],
                                       validation=validation):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    for inmemory_cache_all in [False, True]:
                        for reader_pool_type in ['process', 'thread']:
                            est = hvd.KerasEstimator(
                                backend=backend,
                                store=store,
                                model=model,
                                optimizer=optimizer,
                                loss=loss,
                                feature_cols=['features'],
                                label_cols=['y'],
                                batch_size=1,
                                epochs=3,
                                reader_pool_type=reader_pool_type,
                                validation=validation,
                                inmemory_cache_all=inmemory_cache_all,
                                verbose=2)

                            transformer = est.fit_on_parquet()
                            predictions = transformer.transform(df)
                            assert predictions.count() == df.count()
def test_direct_parquet_train(self):
    if skip_lightning_tests:
        self.skipTest('Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                      'https://github.com/horovod/horovod/pull/3263')

    with spark_session('test_direct_parquet_train') as spark:
        df = create_noisy_xor_data_with_val(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            # Make sure to cover val dataloader cases
            for validation in [None, 'val']:
                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y'],
                                       validation=validation):
                    model = create_xor_model()

                    for inmemory_cache_all in [False, True]:
                        for reader_pool_type in ['process', 'thread']:
                            est = hvd_spark.TorchEstimator(
                                backend=backend,
                                store=store,
                                model=model,
                                input_shapes=[[-1, 2]],
                                feature_cols=['features'],
                                label_cols=['y'],
                                validation=validation,
                                batch_size=1,
                                epochs=3,
                                verbose=2,
                                inmemory_cache_all=inmemory_cache_all,
                                reader_pool_type=reader_pool_type)

                            transformer = est.fit_on_parquet()
                            predictions = transformer.transform(df)
                            assert predictions.count() == df.count()
def _fit(self, df):
    backend = self._get_or_create_backend()
    with util.prepare_data(backend.num_processes(),
                           self.getStore(),
                           df,
                           label_columns=self.getLabelCols(),
                           feature_columns=self.getFeatureCols(),
                           validation=self.getValidation(),
                           sample_weight_col=self.getSampleWeightCol(),
                           compress_sparse=self.getCompressSparseCols(),
                           partitions_per_process=self.getPartitionsPerProcess(),
                           verbose=self.getVerbose()) as dataset_idx:
        train_rows, val_rows, metadata, avg_row_size = \
            util.get_dataset_properties(dataset_idx)
        self._check_metadata_compatibility(metadata)
        return self._fit_on_prepared_data(
            backend, train_rows, val_rows, metadata, avg_row_size, dataset_idx)
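# For contrast with _fit() above: fit_on_parquet() takes the path where the
# data has already been materialized, reading the cached dataset properties
# instead of re-running prepare_data(). A minimal sketch of that flow (names
# follow the helpers used above; the body is illustrative, not the library's
# exact implementation):
def _example_fit_on_prepared(self, dataset_idx):
    backend = self._get_or_create_backend()
    train_rows, val_rows, metadata, avg_row_size = \
        util.get_dataset_properties(dataset_idx)
    self._check_metadata_compatibility(metadata)
    return self._fit_on_prepared_data(
        backend, train_rows, val_rows, metadata, avg_row_size, dataset_idx)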
def test_prepare_data_compress_sparse(self):
    util.clear_training_cache()

    expected_metadata = \
        {
            'float': {
                'spark_data_type': FloatType,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.NOCHANGE,
                'max_size': 1,
                'shape': 1
            },
            'dense': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
            'sparse': {
                'spark_data_type': SparseVector,
                'is_sparse_vector_only': True,
                'intermediate_format': constants.CUSTOM_SPARSE,
                'max_size': 1,
                'shape': 2
            },
            'mixed': {
                'spark_data_type': DenseVector,
                'is_sparse_vector_only': False,
                'intermediate_format': constants.ARRAY,
                'max_size': 2,
                'shape': 2
            },
        }

    with mock.patch('horovod.spark.common.util._get_metadata',
                    side_effect=util._get_metadata) as mock_get_metadata:
        with spark_session('test_prepare_data') as spark:
            data = [
                [0.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                 DenseVector([1.0, 1.0])],
                [1.0, DenseVector([1.0, 1.0]), SparseVector(2, {1: 1.0}),
                 SparseVector(2, {1: 1.0})]
            ]
            schema = StructType([StructField('float', FloatType()),
                                 StructField('dense', VectorUDT()),
                                 StructField('sparse', VectorUDT()),
                                 StructField('mixed', VectorUDT())])
            df = create_test_data_from_schema(spark, data, schema)

            with local_store() as store:
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['dense', 'sparse', 'mixed'],
                                       label_columns=['float'],
                                       compress_sparse=True) as dataset_idx:
                    mock_get_metadata.assert_called()
                    assert dataset_idx == 0

                    train_rows, val_rows, metadata, avg_row_size = \
                        util.get_dataset_properties(dataset_idx)
                    self.assertDictEqual(metadata, expected_metadata)
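# Why the 'sparse' column reports max_size=1 but shape=2: with
# compress_sparse=True only the non-zero entries are stored (the CUSTOM_SPARSE
# intermediate format), while shape keeps the logical vector length. The
# 'mixed' column holds both dense and sparse rows, so it is not
# sparse-vector-only and falls back to the dense ARRAY encoding. A sketch of
# the compressed layout for SparseVector(2, {1: 1.0}), assuming the
# [nnz, indices..., values...] flattening of the intermediate format:
#
#   [1.0, 1.0, 1.0]  # one non-zero entry, at index 1, with value 1.0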
def test_df_cache(self):
    # Clean the cache before starting the test
    util.clear_training_cache()
    util._training_cache.get_dataset = mock.Mock(
        side_effect=util._training_cache.get_dataset)

    with spark_session('test_df_cache') as spark:
        with local_store() as store:
            df = create_xor_data(spark)
            df2 = create_xor_data(spark)
            df3 = create_xor_data(spark)

            key = util._training_cache.create_key(df, store, None)
            key2 = util._training_cache.create_key(df2, store, None)
            key3 = util._training_cache.create_key(df3, store, None)

            # All keys are distinct
            assert key != key2
            assert key != key3
            assert key2 != key3

            # The cache should be empty to start
            assert not util._training_cache.is_cached(key, store)
            assert not util._training_cache.is_cached(key2, store)
            assert not util._training_cache.is_cached(key3, store)

            # First insertion into the cache
            with util.prepare_data(num_processes=2,
                                   store=store,
                                   df=df,
                                   feature_columns=['features'],
                                   label_columns=['y']) as dataset_idx:
                train_rows, val_rows, metadata, avg_row_size = \
                    util.get_dataset_properties(dataset_idx)
                util._training_cache.get_dataset.assert_not_called()
                assert len(util._training_cache._key_to_dataset) == 1
                assert util._training_cache.is_cached(key, store)
                assert dataset_idx == 0

                # The first dataset is still in use, so we assign the next integer
                # in sequence to this dataset
                assert not util._training_cache.is_cached(key2, store)
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df2,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx2:
                    util._training_cache.get_dataset.assert_not_called()
                    assert len(util._training_cache._key_to_dataset) == 2
                    assert util._training_cache.is_cached(key2, store)
                    assert dataset_idx2 == 1

            # Even though the first dataset is no longer in use, it is still cached
            with util.prepare_data(num_processes=2,
                                   store=store,
                                   df=df,
                                   feature_columns=['features'],
                                   label_columns=['y']) as dataset_idx1:
                train_rows1, val_rows1, metadata1, avg_row_size1 = \
                    util.get_dataset_properties(dataset_idx1)
                util._training_cache.get_dataset.assert_called()
                assert train_rows == train_rows1
                assert val_rows == val_rows1
                assert metadata == metadata1
                assert avg_row_size == avg_row_size1
                assert dataset_idx1 == 0

            # The first dataset is no longer in use, so we can reclaim its dataset index
            assert not util._training_cache.is_cached(key3, store)
            with util.prepare_data(num_processes=2,
                                   store=store,
                                   df=df3,
                                   feature_columns=['features'],
                                   label_columns=['y']) as dataset_idx3:
                train_rows3, val_rows3, metadata3, avg_row_size3 = \
                    util.get_dataset_properties(dataset_idx3)
                assert train_rows == train_rows3
                assert val_rows == val_rows3
                assert metadata == metadata3
                assert avg_row_size == avg_row_size3
                assert dataset_idx3 == 0

            # Same dataframe, different validation
            bad_key = util._training_cache.create_key(df, store, 0.1)
            assert not util._training_cache.is_cached(bad_key, store)
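# The cache key ties together the DataFrame, the store, and the validation
# setting, which is why bad_key above (same df, validation=0.1) misses the
# cache. A sketch of the lookup pattern, using the same private helpers the
# test exercises:
#
#   key = util._training_cache.create_key(df, store, validation)
#   if util._training_cache.is_cached(key, store):
#       ...  # prepare_data() will reuse the already-materialized Parquet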
def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
    from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

    def _get_mock_fit_fn(checkpoint_callback_provided):
        def fit(model, train_data, val_data, steps_per_epoch, validation_steps,
                callbacks, verbose):
            returned_model_checkpoint_present = False
            model_checkpoint_present = False
            for callback in callbacks:
                callback.set_model(model)
                callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                if checkpoint_callback_provided and \
                        isinstance(callback, BestModelCheckpoint):
                    self.assertIsNotNone(callback.filepath)
                    self.assertTrue(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'binary_crossentropy')
                    returned_model_checkpoint_present = True

                if not checkpoint_callback_provided and \
                        isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                    self.assertFalse(callback.save_best_only)
                    self.assertEqual(callback.monitor, 'val_loss')
                    model_checkpoint_present = True

            if checkpoint_callback_provided:
                self.assertTrue(returned_model_checkpoint_present)
                self.assertFalse(model_checkpoint_present)
            else:
                self.assertFalse(returned_model_checkpoint_present)
                self.assertTrue(model_checkpoint_present)

            return mock.Mock()

        return fit

    mock_pin_gpu_fn.return_value = mock.Mock()

    with spark_session('test_keras_model_checkpoint_callbacks') as spark:
        df = create_xor_data(spark)

        backend = CallbackBackend()
        with local_store() as store:
            store.get_train_data_path = lambda v=None: store._train_path
            store.get_val_data_path = lambda v=None: store._val_path

            with util.prepare_data(backend.num_processes(),
                                   store,
                                   df,
                                   feature_columns=['features'],
                                   label_columns=['y']):
                model = create_xor_model()
                optimizer = tf.keras.optimizers.SGD(lr=0.1)
                loss = 'binary_crossentropy'

                # Test that when no checkpoint callback is set, the correct default is created
                mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=False)
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()

                # Test that a provided checkpoint callback is correctly attached to the model
                mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=True)
                checkpoint_callback = BestModelCheckpoint(monitor='binary_crossentropy')
                est = hvd.KerasEstimator(
                    backend=backend,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    checkpoint_callback=checkpoint_callback)

                transformer = est.fit_on_parquet()
                predictions = transformer.transform(df)
                assert predictions.count() == df.count()
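# Illustrative sketch (not from the suite above): passing BestModelCheckpoint
# to a KerasEstimator in application code. save_best_only and the monitored
# metric come from the callback, as the mock fit function above verifies;
# `store`, `model`, and `optimizer` are assumed caller-supplied.
#
#   from horovod.tensorflow.keras.callbacks import BestModelCheckpoint
#
#   est = hvd.KerasEstimator(store=store, model=model, optimizer=optimizer,
#                            loss='binary_crossentropy',
#                            feature_cols=['features'], label_cols=['y'],
#                            batch_size=32, epochs=5,
#                            checkpoint_callback=BestModelCheckpoint(
#                                monitor='val_loss', save_best_only=True))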