Ejemplo n.º 1
0
    def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    est = hvd.KerasEstimator(backend=backend,
                                             store=store,
                                             model=model,
                                             optimizer=optimizer,
                                             loss=loss,
                                             feature_cols=['features'],
                                             label_cols=['y'],
                                             batch_size=1,
                                             epochs=3,
                                             verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                assert predictions.count() == df.count()
Ejemplo n.º 2
0
    def test_torch_direct_parquet_train(self):
        with spark_session('test_torch_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                    loss = nn.BCELoss()

                    est = hvd_spark.TorchEstimator(backend=backend,
                                                   store=store,
                                                   model=model,
                                                   optimizer=optimizer,
                                                   input_shapes=[[2]],
                                                   feature_cols=['features'],
                                                   label_cols=['y'],
                                                   batch_size=1,
                                                   epochs=3,
                                                   verbose=2)

                    # To make sure that setLoss works with non-list loss.
                    est.setLoss(loss)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
Ejemplo n.º 3
0
    def test_direct_parquet_train(self):
        with spark_session('test_direct_parquet_train') as spark:
            df = create_noisy_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y'],
                                       validation=0.2):
                    model = create_xor_model()

                    for inmemory_cache_all in [False, True]:
                        est = hvd_spark.TorchEstimator(
                            backend=backend,
                            store=store,
                            model=model,
                            input_shapes=[[-1, 2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            validation=0.2,
                            batch_size=1,
                            epochs=3,
                            verbose=2,
                            inmemory_cache_all=inmemory_cache_all)

                        transformer = est.fit_on_parquet()
                        predictions = transformer.transform(df)
                        assert predictions.count() == df.count()
Ejemplo n.º 4
0
    def test_torch_direct_parquet_train(self):
        with spark_session('test_torch_direct_parquet_train') as spark:
            df = create_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure we cover validation dataloader as well
                for validation in [None, 'val']:
                    # Need validation ratio to split data
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()
                        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                        loss = nn.BCELoss()

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd_spark.TorchEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    optimizer=optimizer,
                                    input_shapes=[[2]],
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    batch_size=1,
                                    epochs=3,
                                    verbose=2,
                                    reader_pool_type=reader_pool_type,
                                    inmemory_cache_all=inmemory_cache_all,
                                    validation=validation)

                                # To make sure that setLoss works with non-list loss.
                                est.setLoss(loss)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Ejemplo n.º 5
0
    def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_direct_parquet_train') as spark:
            df = create_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure we cover val dataloader cases
                for validation in [None, 'val']:
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()
                        optimizer = tf.keras.optimizers.SGD(lr=0.1)
                        loss = 'binary_crossentropy'

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd.KerasEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    optimizer=optimizer,
                                    loss=loss,
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    batch_size=1,
                                    epochs=3,
                                    reader_pool_type=reader_pool_type,
                                    validation=validation,
                                    inmemory_cache_all=inmemory_cache_all,
                                    verbose=2)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Ejemplo n.º 6
0
    def test_direct_parquet_train(self):
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        with spark_session('test_direct_parquet_train') as spark:
            df = create_noisy_xor_data_with_val(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                # Make sure to cover val dataloader cases
                for validation in [None, 'val']:
                    with util.prepare_data(backend.num_processes(),
                                           store,
                                           df,
                                           feature_columns=['features'],
                                           label_columns=['y'],
                                           validation=validation):
                        model = create_xor_model()

                        for inmemory_cache_all in [False, True]:
                            for reader_pool_type in ['process', 'thread']:
                                est = hvd_spark.TorchEstimator(
                                    backend=backend,
                                    store=store,
                                    model=model,
                                    input_shapes=[[-1, 2]],
                                    feature_cols=['features'],
                                    label_cols=['y'],
                                    validation=validation,
                                    batch_size=1,
                                    epochs=3,
                                    verbose=2,
                                    inmemory_cache_all=inmemory_cache_all,
                                    reader_pool_type=reader_pool_type)

                                transformer = est.fit_on_parquet()
                                predictions = transformer.transform(df)
                                assert predictions.count() == df.count()
Ejemplo n.º 7
0
 def _fit(self, df):
     backend = self._get_or_create_backend()
     with util.prepare_data(
             backend.num_processes(),
             self.getStore(),
             df,
             label_columns=self.getLabelCols(),
             feature_columns=self.getFeatureCols(),
             validation=self.getValidation(),
             sample_weight_col=self.getSampleWeightCol(),
             compress_sparse=self.getCompressSparseCols(),
             partitions_per_process=self.getPartitionsPerProcess(),
             verbose=self.getVerbose()) as dataset_idx:
         train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
             dataset_idx)
         self._check_metadata_compatibility(metadata)
         return self._fit_on_prepared_data(backend, train_rows, val_rows,
                                           metadata, avg_row_size,
                                           dataset_idx)
Ejemplo n.º 8
0
    def test_prepare_data_compress_sparse(self):
        util.clear_training_cache()

        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with mock.patch('horovod.spark.common.util._get_metadata',
                        side_effect=util._get_metadata) as mock_get_metadata:
            with spark_session('test_prepare_data') as spark:
                data = [[
                    0.0,
                    DenseVector([1.0, 1.0]),
                    SparseVector(2, {1: 1.0}),
                    DenseVector([1.0, 1.0])
                ],
                        [
                            1.0,
                            DenseVector([1.0, 1.0]),
                            SparseVector(2, {1: 1.0}),
                            SparseVector(2, {1: 1.0})
                        ]]

                schema = StructType([
                    StructField('float', FloatType()),
                    StructField('dense', VectorUDT()),
                    StructField('sparse', VectorUDT()),
                    StructField('mixed', VectorUDT())
                ])

                df = create_test_data_from_schema(spark, data, schema)

                with local_store() as store:
                    with util.prepare_data(
                            num_processes=2,
                            store=store,
                            df=df,
                            feature_columns=['dense', 'sparse', 'mixed'],
                            label_columns=['float'],
                            compress_sparse=True) as dataset_idx:
                        mock_get_metadata.assert_called()
                        assert dataset_idx == 0

                        train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
                            dataset_idx)
                        self.assertDictEqual(metadata, expected_metadata)
Ejemplo n.º 9
0
    def test_df_cache(self):
        # Clean the cache before starting the test
        util.clear_training_cache()
        util._training_cache.get_dataset = mock.Mock(
            side_effect=util._training_cache.get_dataset)

        with spark_session('test_df_cache') as spark:
            with local_store() as store:
                df = create_xor_data(spark)
                df2 = create_xor_data(spark)
                df3 = create_xor_data(spark)

                key = util._training_cache.create_key(df, store, None)
                key2 = util._training_cache.create_key(df2, store, None)
                key3 = util._training_cache.create_key(df3, store, None)

                # All keys are distinct
                assert key != key2
                assert key != key3
                assert key2 != key3

                # The cache should be empty to start
                assert not util._training_cache.is_cached(key, store)
                assert not util._training_cache.is_cached(key2, store)
                assert not util._training_cache.is_cached(key3, store)

                # First insertion into the cache
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx:
                    train_rows, val_rows, metadata, avg_row_size = util.get_dataset_properties(
                        dataset_idx)
                    util._training_cache.get_dataset.assert_not_called()
                    assert len(util._training_cache._key_to_dataset) == 1
                    assert util._training_cache.is_cached(key, store)
                    assert dataset_idx == 0

                    # The first dataset is still in use, so we assign the next integer in sequence to this
                    # dataset
                    assert not util._training_cache.is_cached(key2, store)
                    with util.prepare_data(num_processes=2,
                                           store=store,
                                           df=df2,
                                           feature_columns=['features'],
                                           label_columns=['y'
                                                          ]) as dataset_idx2:
                        util._training_cache.get_dataset.assert_not_called()
                        assert len(util._training_cache._key_to_dataset) == 2
                        assert util._training_cache.is_cached(key2, store)
                        assert dataset_idx2 == 1

                # Even though the first dataset is no longer in use, it is still cached
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx1:
                    train_rows1, val_rows1, metadata1, avg_row_size1 = util.get_dataset_properties(
                        dataset_idx1)
                    util._training_cache.get_dataset.assert_called()
                    assert train_rows == train_rows1
                    assert val_rows == val_rows1
                    assert metadata == metadata1
                    assert avg_row_size == avg_row_size1
                    assert dataset_idx1 == 0

                # The first dataset is no longer in use, so we can reclaim its dataset index
                assert not util._training_cache.is_cached(key3, store)
                with util.prepare_data(num_processes=2,
                                       store=store,
                                       df=df3,
                                       feature_columns=['features'],
                                       label_columns=['y']) as dataset_idx3:
                    train_rows3, val_rows3, metadata3, avg_row_size3 = util.get_dataset_properties(
                        dataset_idx3)
                    assert train_rows == train_rows3
                    assert val_rows == val_rows3
                    assert metadata == metadata3
                    assert avg_row_size == avg_row_size3
                    assert dataset_idx3 == 0

                # Same dataframe, different validation
                bad_key = util._training_cache.create_key(df, store, 0.1)
                assert not util._training_cache.is_cached(bad_key, store)
Ejemplo n.º 10
0
    def test_keras_model_checkpoint_callback(self, mock_fit_fn, mock_pin_gpu_fn):
        from horovod.tensorflow.keras.callbacks import BestModelCheckpoint

        def _get_mock_fit_fn(checkpoint_callback_provided):
            def fit(model, train_data, val_data, steps_per_epoch, validation_steps, callbacks,
                    verbose):
                returned_model_checkpoint_present = False
                model_checkpoint_present = False
                for callback in callbacks:
                    callback.set_model(model)
                    if checkpoint_callback_provided:
                        callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})
                    else:
                        callback.on_epoch_end(0, logs={'binary_crossentropy': 0.3})

                    if checkpoint_callback_provided and isinstance(callback, BestModelCheckpoint):
                        self.assertIsNotNone(callback.filepath)
                        self.assertTrue(callback.save_best_only)
                        self.assertEqual(callback.monitor, 'binary_crossentropy')
                        returned_model_checkpoint_present = True

                    if not checkpoint_callback_provided and isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                        self.assertFalse(callback.save_best_only)
                        self.assertFalse(callback.save_best_only)
                        self.assertEqual(callback.monitor, 'val_loss')
                        model_checkpoint_present = True

                if checkpoint_callback_provided:
                    self.assertTrue(returned_model_checkpoint_present)
                    self.assertFalse(model_checkpoint_present)
                else:
                    self.assertFalse(returned_model_checkpoint_present)
                    self.assertTrue(model_checkpoint_present)

                return mock.Mock()

            return fit

        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_model_chekcpoint_callbacks') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    # Test when the checkpoint callback is not set, the correct one is created
                    mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=False)
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()

                    # Test if checkpoint call back is correctly set to the model
                    mock_fit_fn.return_value = _get_mock_fit_fn(checkpoint_callback_provided=True)
                    checkpoint_callback = BestModelCheckpoint(monitor='binary_crossentropy')
                    est = hvd.KerasEstimator(
                        backend=backend,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        feature_cols=['features'],
                        label_cols=['y'],
                        batch_size=1,
                        epochs=3,
                        verbose=2,
                        checkpoint_callback=checkpoint_callback)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()