Example #1
    def test_fit_model(self):
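        # Trains a Lightning XOR model on Spark with two workers, then checks
        # that the fitted model returns a single float32 prediction.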
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        model = create_xor_model()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    random_seed=1,
                    verbose=2)

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #2
    def test_fit_model(self):
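        # Same XOR fit, but with an explicit SGD optimizer, a BCE loss, and
        # per-sample weights supplied through sample_weight_col.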
        model = create_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = F.binary_cross_entropy

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    random_seed=1,
                    verbose=2,
                    sample_weight_col='weight')

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #3
    def test_legacy_fit_model(self):
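        # Fits a legacy (plain torch.nn.Module) XOR model to verify the
        # estimator still supports non-Lightning models.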
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        model = create_legacy_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = F.binary_cross_entropy

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    sample_weight_col='weight')

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #4
    def test_fit_model(self):
        if sys.version_info < (3, 0, 0) and is_gloo_used():
            self.skipTest(
                'Horovod on Spark over Gloo only supported on Python3')

        model = create_xor_model()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
        loss = F.binary_cross_entropy

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    input_shapes=[[2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2,
                    sample_weight_col='weight')

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #5
    def test_torch_direct_parquet_train(self):
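        # Materializes the DataFrame as Parquet up front with prepare_data,
        # then trains straight from the store via fit_on_parquet().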
        with spark_session('test_torch_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
                    loss = nn.BCELoss()

                    est = hvd_spark.TorchEstimator(backend=backend,
                                                   store=store,
                                                   model=model,
                                                   optimizer=optimizer,
                                                   input_shapes=[[2]],
                                                   feature_cols=['features'],
                                                   label_cols=['y'],
                                                   batch_size=1,
                                                   epochs=3,
                                                   verbose=2)

                    # Make sure that setLoss works with a non-list loss.
                    est.setLoss(loss)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
Example #6
    def test_transform_multi_class(self):
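        # Builds a TorchModel directly (no training) and validates the schema
        # of the transformed DataFrame, including the y__output column.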
        model = create_xor_model(output_dim=2)

        with spark_session('test_transform_multi_class') as spark:
            df = create_xor_data(spark)
            metadata = util._get_metadata(df)

            torch_model = hvd_spark.TorchModel(history=None,
                                               model=model,
                                               input_shapes=[[2]],
                                               feature_columns=['features'],
                                               label_columns=['y'],
                                               _metadata=metadata)
            out_df = torch_model.transform(df)

            expected_types = {
                'x1': LongType,
                'x2': LongType,
                'features': VectorUDT,
                'weight': DoubleType,
                'y': DoubleType,
                'y__output': VectorUDT
            }

            for field in out_df.schema.fields:
                assert type(field.dataType) == expected_types[field.name]
Example #7
    def test_model_override_trainer_args(self):
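        # Uses trainer_args to pass extra options (here stochastic weight
        # averaging) through to the underlying Lightning Trainer.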
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with tempdir() as dir:

                with local_store() as store:
                    torch_estimator = hvd_spark.TorchEstimator(
                        num_proc=2,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=4,
                        epochs=2,
                        verbose=2,
                        trainer_args={'stochastic_weight_avg': True})

                    torch_model = torch_estimator.fit(df)

                    # TODO: Find a way to pass log metrics from the remote workers, and assert based on the logger.
                    trained_model = torch_model.getModel()
                    pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                    assert len(pred) == 1
                    assert pred.dtype == torch.float32
Example #8
    def test_restore_from_checkpoint(self):
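        # The first fit() writes a checkpoint under run_id; the second fit()
        # should find it and go through _read_checkpoint to resume.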

        model = create_xor_model()

        with spark_session('test_restore_from_checkpoint') as spark:
            df = create_noisy_xor_data(spark)

            ctx = CallbackBackend()

            run_id = 'run01'
            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    backend=ctx,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    run_id=run_id)

                torch_estimator._read_checkpoint = Mock(
                    side_effect=torch_estimator._read_checkpoint)

                ckpt_path = store.get_checkpoint_path(run_id)
                assert not store.exists(ckpt_path)
                torch_estimator._read_checkpoint.assert_not_called()
                torch_estimator.fit(df)

                assert store.exists(ckpt_path)
                torch_estimator.fit(df)
                torch_estimator._read_checkpoint.assert_called()
Example #9
    def test_fit_model(self):
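        # Keras counterpart of the XOR fit: trains with two workers and checks
        # the prediction count and dtype of the returned model.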
        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    feature_cols=['features'],
                    label_cols=['y'],
                    batch_size=1,
                    epochs=3,
                    verbose=2)

                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict([np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Example #10
    def test_happy_run_elastic_fault_tolerant_fails(self):
        self.skipTest(
            'elastic horovod does not support shutdown from the spark driver '
            'while elastic driver is waiting for hosts to come up')

        if not gloo_built():
            self.skipTest("Gloo is not available")

        with spark_session('test_happy_run_elastic_fault_tolerant_fails',
                           max_failures=2):
            with tempdir() as dir:
                # These files make the training function fail at the given rank, epoch, and batch.
                # There are as many failures as Spark's max_failures allows (per task / index).
                with open(os.path.sep.join([dir, 'rank_1_epoch_2_batch_4_fail']), 'w'), \
                     open(os.path.sep.join([dir, 'rank_1_epoch_3_batch_1_fail']), 'w'):
                    pass
                res = horovod.spark.run_elastic(
                    fn,
                    args=(2, 5, 5, dir),
                    env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                    num_proc=2,
                    min_num_proc=2,
                    max_num_proc=2,
                    start_timeout=5,
                    verbose=2)
                self.assertListEqual([([0, 4, 0, 4, 1, 4, 0, 4], 0),
                                      ([0, 4, 0, 4, 1, 4, 0, 4], 1)], res)
Example #11
    def test_train_with_inmemory_cache_all(self):
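        # Trains with inmemory_cache_all=True so the single worker keeps the
        # entire dataset cached in memory.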
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=1,  # normally the in-memory data loader is for single-worker training with small data
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    inmemory_cache_all=True)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from the remote workers, and assert based on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #12
    def test_fit_model_multiclass(self):
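        # Trains an MNIST classifier and checks that the predicted class is
        # the argmax of the predicted probability vector.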
        model = create_mnist_model()
        optimizer = tf.keras.optimizers.Adadelta(1.0)
        loss = tf.keras.losses.categorical_crossentropy

        for num_cores in [2, constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1]:
            with spark_session('test_fit_model_multiclass', cores=num_cores) as spark:
                df = create_mnist_data(spark)

                with local_store() as store:
                    keras_estimator = hvd.KerasEstimator(
                        num_proc=num_cores,
                        store=store,
                        model=model,
                        optimizer=optimizer,
                        loss=loss,
                        metrics=['accuracy'],
                        feature_cols=['features'],
                        label_cols=['label_vec'],
                        batch_size=2,
                        epochs=2,
                        verbose=2)

                    keras_model = keras_estimator.fit(df).setOutputCols(['label_prob'])
                    pred_df = keras_model.transform(df)

                    argmax = udf(lambda v: float(np.argmax(v)), returnType=T.DoubleType())
                    pred_df = pred_df.withColumn('label_pred', argmax(pred_df.label_prob))

                    preds = pred_df.collect()
                    assert len(preds) == df.count()

                    row = preds[0]
                    label_prob = row.label_prob.toArray().tolist()
                    assert label_prob[int(row.label_pred)] == max(label_prob)
Example #13
    def test_direct_parquet_train(self):
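        # Direct Parquet training with a validation split, run once with and
        # once without inmemory_cache_all.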
        with spark_session('test_direct_parquet_train') as spark:
            df = create_noisy_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y'],
                                       validation=0.2):
                    model = create_xor_model()

                    for inmemory_cache_all in [False, True]:
                        est = hvd_spark.TorchEstimator(
                            backend=backend,
                            store=store,
                            model=model,
                            input_shapes=[[-1, 2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            validation=0.2,
                            batch_size=1,
                            epochs=3,
                            verbose=2,
                            inmemory_cache_all=inmemory_cache_all)

                        transformer = est.fit_on_parquet()
                        predictions = transformer.transform(df)
                        assert predictions.count() == df.count()
Example #14
    def test_happy_run_elastic_fault_tolerant(self):
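        # Elastic run where marker files kill workers at specific ranks,
        # epochs, and batches; with max_failures=3 the job should still
        # complete and both ranks should return the same allgather result.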
        if skip_lightning_tests:
            self.skipTest(
                'Spark PyTorch Lightning tests conflict with Tensorflow 2.5.x: '
                'https://github.com/horovod/horovod/pull/3263')

        if not gloo_built():
            self.skipTest("Gloo is not available")

        with spark_session('test_happy_run_elastic_fault_tolerant',
                           max_failures=3):
            with tempdir() as dir:
                # These files make the training function fail at the given rank, epoch, and batch.
                with open(os.path.sep.join([dir, 'rank_1_epoch_2_batch_4_fail']), 'w'), \
                     open(os.path.sep.join([dir, 'rank_0_epoch_3_batch_1_fail']), 'w'), \
                     open(os.path.sep.join([dir, 'rank_1_epoch_4_batch_2_fail']), 'w'):
                    pass
                res = horovod.spark.run_elastic(
                    fn,
                    args=(2, 5, 5, dir),
                    env={'HOROVOD_LOG_LEVEL': 'DEBUG'},
                    num_proc=2,
                    min_num_proc=2,
                    max_num_proc=2,
                    start_timeout=5,
                    verbose=2)
                self.assertListEqual([([0, 4, 0, 4, 1, 4, 0, 4], 0),
                                      ([0, 4, 0, 4, 1, 4, 0, 4], 1)], res)
Example #15
    def test_train_with_pytorch_infinite_async_data_loader(self):
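        # Swaps in PytorchInfiniteAsyncDataLoader via data_loader_class to
        # exercise the asynchronous data-loading path.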
        from horovod.spark.data_loaders.pytorch_data_loaders import PytorchInfiniteAsyncDataLoader

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    data_loader_class=PytorchInfiniteAsyncDataLoader)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from the remote workers, and assert based on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #16
    def test_keras_direct_parquet_train(self, mock_fit_fn, mock_pin_gpu_fn):
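        # The fit function and GPU pinning are mocked out, so this only
        # exercises the estimator wiring and the Parquet data path.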
        mock_fit_fn.return_value = get_mock_fit_fn()
        mock_pin_gpu_fn.return_value = mock.Mock()

        with spark_session('test_keras_direct_parquet_train') as spark:
            df = create_xor_data(spark)

            backend = CallbackBackend()
            with local_store() as store:
                store.get_train_data_path = lambda v=None: store._train_path
                store.get_val_data_path = lambda v=None: store._val_path

                with util.prepare_data(backend.num_processes(),
                                       store,
                                       df,
                                       feature_columns=['features'],
                                       label_columns=['y']):
                    model = create_xor_model()
                    optimizer = tf.keras.optimizers.SGD(lr=0.1)
                    loss = 'binary_crossentropy'

                    est = hvd.KerasEstimator(backend=backend,
                                             store=store,
                                             model=model,
                                             optimizer=optimizer,
                                             loss=loss,
                                             feature_cols=['features'],
                                             label_cols=['y'],
                                             batch_size=1,
                                             epochs=3,
                                             verbose=2)

                    transformer = est.fit_on_parquet()
                    predictions = transformer.transform(df)
                    assert predictions.count() == df.count()
Example #17
    def test_early_stop_callback(self):
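        # Attaches a Lightning EarlyStopping callback; the test only verifies
        # that training completes with the callback installed.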
        from pytorch_lightning.callbacks.early_stopping import EarlyStopping

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            early_stop_callback = EarlyStopping(monitor='val_loss',
                                                min_delta=0.00,
                                                patience=3,
                                                verbose=True,
                                                mode='max')
            callbacks = [early_stop_callback]

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2,
                    callbacks=callbacks)

                torch_model = torch_estimator.fit(df)

                # TODO: Find a way to pass log metrics from the remote workers, and assert based on the logger.
                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #18
    def test_model_checkpoint_callback(self):
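        # Attaches a Lightning ModelCheckpoint callback that writes
        # checkpoints into a temporary directory.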
        from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)
            model = create_xor_model()

            with tempdir() as dir:
                checkpoint_callback = ModelCheckpoint(dirpath=dir)
                callbacks = [checkpoint_callback]

                with local_store() as store:
                    torch_estimator = hvd_spark.TorchEstimator(
                        num_proc=2,
                        store=store,
                        model=model,
                        input_shapes=[[-1, 2]],
                        feature_cols=['features'],
                        label_cols=['y'],
                        validation=0.2,
                        batch_size=4,
                        epochs=2,
                        verbose=2,
                        callbacks=callbacks)

                    torch_model = torch_estimator.fit(df)

                    # TODO: Find a way to pass log metrics from the remote workers, and assert based on the logger.
                    trained_model = torch_model.getModel()
                    pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                    assert len(pred) == 1
                    assert pred.dtype == torch.float32
Example #19
    def test_fit_model(self):
        model = create_xor_model()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            with local_store() as store:
                torch_estimator = hvd_spark.TorchEstimator(
                    num_proc=2,
                    store=store,
                    model=model,
                    input_shapes=[[-1, 2]],
                    feature_cols=['features'],
                    label_cols=['y'],
                    validation=0.2,
                    batch_size=4,
                    epochs=2,
                    verbose=2)

                torch_model = torch_estimator.fit(df)

                trained_model = torch_model.getModel()
                pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                assert len(pred) == 1
                assert pred.dtype == torch.float32
Example #20
    def test_transform_multi_class(self):
        # Set output_dim to 2 to mock a multi-class model.
        model = create_xor_model(output_dim=2)

        with spark_session('test_transform_multi_class') as spark:
            df = create_xor_data(spark)
            metadata = util._get_metadata(df)

            torch_model = hvd_spark.TorchModel(history=None,
                                               model=model,
                                               input_shapes=[[2]],
                                               feature_columns=['features'],
                                               label_columns=['y'],
                                               _metadata=metadata)
            out_df = torch_model.transform(df)

            # In a multi-class model the output is a vector, while the label is a scalar.
            expected_types = {
                'x1': IntegerType,
                'x2': IntegerType,
                'features': VectorUDT,
                'weight': FloatType,
                'y': FloatType,
                'y__output': VectorUDT
            }

            for field in out_df.schema.fields:
                assert type(field.dataType) == expected_types[field.name]
Example #21
    def test_fit_model(self):
        if sys.version_info < (3, 0, 0) and is_gloo_used():
            self.skipTest(
                'Horovod on Spark over Gloo only supported on Python3')

        model = create_xor_model()
        optimizer = tf.keras.optimizers.SGD(lr=0.1)
        loss = 'binary_crossentropy'

        with spark_session('test_fit_model') as spark:
            df = create_xor_data(spark)

            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(num_proc=2,
                                                     store=store,
                                                     model=model,
                                                     optimizer=optimizer,
                                                     loss=loss,
                                                     feature_cols=['features'],
                                                     label_cols=['y'],
                                                     batch_size=1,
                                                     epochs=3,
                                                     verbose=2)

                keras_model = keras_estimator.fit(df)

                trained_model = keras_model.getModel()
                pred = trained_model.predict(
                    [np.ones([1, 2], dtype=np.float32)])
                assert len(pred) == 1
                assert pred.dtype == np.float32
Example #22
    def test_get_col_info_error_bad_size(self):
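        # Dense vectors of different lengths in the same column should make
        # util._get_col_info raise a ValueError.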
        with spark_session('test_get_col_info_error_bad_size') as spark:
            data_bad_size = [[DenseVector([1.0, 1.0])], [DenseVector([1.0])]]
            schema = StructType([StructField('data', VectorUDT())])
            df = create_test_data_from_schema(spark, data_bad_size, schema)

            with pytest.raises(ValueError):
                util._get_col_info(df)
Example #23
    def test_happy_run(self):
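        # Each of the two Spark tasks runs a Horovod allgather and should see
        # both ranks: [0, 1] from rank 0 and from rank 1.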
        def fn():
            hvd.init()
            res = hvd.allgather(torch.tensor([hvd.rank()])).tolist()
            return res, hvd.rank()

        with spark_session('test_happy_run'):
            res = horovod.spark.run(fn, env={'PATH': os.environ.get('PATH')}, verbose=0)
            self.assertListEqual([([0, 1], 0), ([0, 1], 1)], res)
Example #24
    def test_get_col_info_error_bad_shape(self):
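        # Sparse vectors with mismatched dimensions in one column should
        # likewise raise a ValueError.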
        with spark_session('test_get_col_info_error_bad_shape') as spark:
            data_bad_shape = [[SparseVector(2, {0: 1.0})],
                              [SparseVector(1, {0: 1.0})]]
            schema = StructType([StructField('data', VectorUDT())])
            df = create_test_data_from_schema(spark, data_bad_shape, schema)

            with pytest.raises(ValueError):
                util._get_col_info(df)
Example #25
    def test_dummy_callback(self):
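        # A counting callback checks that on_train_epoch_end fires exactly
        # `epochs` times for every (num_proc, epochs) combination.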
        from pytorch_lightning.callbacks import Callback
        model = create_xor_model()

        with spark_session('test_fit_model') as spark:
            df = create_noisy_xor_data(spark)

            for num_proc in [1, 2]:
                for epochs in [2, 3]:

                    class MyDummyCallback(Callback):
                        def __init__(self):
                            self.epoch_end_counter = 0
                            self.train_epoch_end_counter = 0

                        def on_init_start(self, trainer):
                            print('Starting to init trainer!')

                        def on_init_end(self, trainer):
                            print('Trainer is initialized.')

                        def on_epoch_end(self, trainer, model):
                            print('An epoch ended.')
                            self.epoch_end_counter += 1

                        def on_train_epoch_end(self, trainer, model, unused=None):
                            print('A train epoch ended.')
                            self.train_epoch_end_counter += 1

                        def on_train_end(self, trainer, model):
                            print('Training ended.')
                            assert self.train_epoch_end_counter == epochs

                    dm_callback = MyDummyCallback()
                    callbacks = [dm_callback]

                    with local_store() as store:
                        torch_estimator = hvd_spark.TorchEstimator(
                            num_proc=num_proc,
                            store=store,
                            model=model,
                            input_shapes=[[-1, 2]],
                            feature_cols=['features'],
                            label_cols=['y'],
                            validation=0.2,
                            batch_size=4,
                            epochs=epochs,
                            verbose=2,
                            callbacks=callbacks)

                        torch_model = torch_estimator.fit(df)

                        # TODO: Find a way to pass log metrics from the remote workers, and assert based on the logger.
                        trained_model = torch_model.getModel()
                        pred = trained_model(torch.ones([1, 2], dtype=torch.int32))
                        assert len(pred) == 1
                        assert pred.dtype == torch.float32
Example #26
    def test_get_available_devices(self):
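        # With two GPUs available, each local rank should be assigned exactly
        # one distinct device index ('0' and '1').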
        def fn():
            hvd.init()
            devices = get_available_devices()
            return devices, hvd.local_rank()

        with spark_session('test_get_available_devices', gpus=2):
            res = horovod.spark.run(fn, env={'PATH': os.environ.get('PATH')}, verbose=0)
            self.assertListEqual([(['0'], 0), (['1'], 1)], res)
Example #27
    def test_spark_run_func_with_non_zero_exit(self):
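        # Injects a mock run_func that returns exit code 1 and expects
        # horovod.spark.run to report it as an mpirun failure.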
        run_func = MagicMock(return_value=1)

        def fn():
            return 1

        with spark_session('test_spark_run_func', cores=4):
            with pytest.raises(Exception, match='^mpirun failed with exit code 1$') as e:
                horovod.spark.run(fn, verbose=0, run_func=run_func)
Example #28
    def test_get_metadata(self):
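        # Expected metadata per column: a mixed dense/sparse column is treated
        # as dense (ARRAY format), while an all-sparse column keeps the custom
        # sparse format.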
        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with spark_session('test_get_metadata') as spark:
            data = [
                [1.0, DenseVector([1.0, 1.0]),
                 SparseVector(2, {0: 1.0}), DenseVector([1.0, 1.0])],
                [1.0, DenseVector([1.0, 1.0]),
                 SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})],
            ]
            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])
            df = create_test_data_from_schema(spark, data, schema)

            metadata = util._get_metadata(df)
            self.assertDictEqual(metadata, expected_metadata)
Example #29
    def test_timeout(self):
        with spark_session('test_timeout'):
            with pytest.raises(
                    Exception,
                    match='^Timed out waiting for Spark tasks to start.'):
                horovod.spark.run(None,
                                  num_proc=4,
                                  start_timeout=5,
                                  env={'PATH': os.environ.get('PATH')},
                                  verbose=0)
Example #30
    def test_mpirun_not_found(self):
        start = time.time()
        with spark_session('test_mpirun_not_found'):
            with pytest.raises(Exception,
                               match='^mpirun failed with exit code 127$'):
                horovod.spark.run(None,
                                  env={'PATH': '/nonexistent'},
                                  verbose=0)
        self.assertLessEqual(time.time() - start, 10,
                             'Failure propagation took too long')