Example no. 1
    def test_predict_xshards(self):
        train_data_shard = XShards.partition({
            "x": np.random.randn(100, 1),
            "y": np.random.randint(0, 1, size=(100,))
        })
        expected = train_data_shard.collect()

        expected = [shard["x"] for shard in expected]

        for x in expected:
            print(x.shape)

        expected = np.concatenate(expected)

        config = {}
        trainer = Estimator(model_creator=identity_model_creator,
                            verbose=True,
                            config=config,
                            workers_per_node=2)

        result = trainer.predict(train_data_shard, batch_size=10).collect()

        result = [shard["prediction"] for shard in result]

        result = np.concatenate(result)

        assert np.allclose(expected, result)
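
The listing does not show identity_model_creator. For the allclose check above to pass, it presumably returns a compiled model whose predictions equal its inputs; a minimal sketch under that assumption (not the test suite's actual helper):

import tensorflow as tf

def identity_model_creator(config):
    # Hypothetical helper: a Keras model that echoes its single input
    # feature, so predict() returns the "x" values unchanged.
    inputs = tf.keras.Input(shape=(1,))
    outputs = tf.keras.layers.Lambda(lambda x: x)(inputs)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer="sgd", loss="mse")
    return model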
Example no. 2
    def test_dataframe(self):

        sc = init_nncontext()
        rdd = sc.range(0, 10)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float64)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"batch_size": 4, "lr": 0.8}
        trainer = Estimator(model_creator=model_creator,
                            verbose=True,
                            config=config,
                            workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example no. 3
    def test_sparkxshards_with_inbalanced_data(self):

        train_data_shard = XShards.partition({
            "x": np.random.randn(100, 1),
            "y": np.random.randint(0, 1, size=(100,))
        })

        def random_pad(data):
            import numpy as np
            import random
            times = random.randint(1, 10)
            data["x"] = np.concatenate([data["x"]] * times)
            data["y"] = np.concatenate([data["y"]] * times)
            return data

        train_data_shard = train_data_shard.transform_shard(random_pad)

        config = {"batch_size": 4, "lr": 0.8}
        trainer = Estimator(model_creator=model_creator,
                            verbose=True,
                            config=config,
                            workers_per_node=2)

        trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
        trainer.evaluate(train_data_shard, steps=25)
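Example no. 4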
    def test_changing_config_during_fit(self):
        train_data_shard = XShards.partition({"x": np.random.randn(100, 1),
                                              "y": np.random.randint(0, 1, size=(100,))})
        config = {
            "lr": 0.8
        }
        trainer = Estimator(
            model_creator=model_creator,
            verbose=True,
            config=config,
            workers_per_node=2)

        trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25, data_config={"batch_size": 8})
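Example no. 5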
    def test_require_batch_size(self):
        train_data_shard = XShards.partition({"x": np.random.randn(100, 1),
                                              "y": np.random.randint(0, 1, size=(100,))})
        config = {
            "lr": 0.8
        }
        trainer = Estimator(
            model_creator=model_creator,
            verbose=True,
            config=config,
            workers_per_node=2)
        with pytest.raises(ray.exceptions.RayTaskError,
                           match=r".*batch_size must be set in config.*"):
            trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
Example no. 6
    def test_horovod_learning_rate_schedule(self):
        import horovod
        major, minor, patch = horovod.__version__.split(".")

        larger_major = int(major) > 0
        larger_minor = int(major) == 0 and int(minor) > 19
        larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2

        if larger_major or larger_minor or larger_patch:
            ray_ctx = RayContext.get()
            batch_size = 32
            workers_per_node = 4
            global_batch_size = batch_size * workers_per_node
            config = {"batch_size": global_batch_size, "lr": 0.8}
            trainer = Estimator(model_creator=simple_model,
                                compile_args_creator=compile_args,
                                verbose=True,
                                config=config,
                                backend="horovod",
                                workers_per_node=workers_per_node)
            import horovod.tensorflow.keras as hvd
            callbacks = [
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                         initial_lr=0.4,
                                                         verbose=True),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=5,
                                                           end_epoch=10,
                                                           multiplier=1.,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=10,
                                                           end_epoch=15,
                                                           multiplier=1e-1,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=15,
                                                           end_epoch=20,
                                                           multiplier=1e-2,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                           multiplier=1e-3,
                                                           initial_lr=0.4),
                LRChecker()
            ]
            for i in range(30):
                trainer.fit(create_train_datasets,
                            epochs=1,
                            callbacks=callbacks)
        else:
            # skip the test on older horovod versions
            pass
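
simple_model and compile_args are not shown in the listing. With the horovod backend, a compile-args creator typically wraps the optimizer in hvd.DistributedOptimizer so gradients are averaged across workers; a sketch under that assumption (not the test suite's actual helper):

import horovod.tensorflow.keras as hvd
import tensorflow as tf

def compile_args(config):
    # Hypothetical compile_args_creator: returns the kwargs passed to
    # model.compile, with the optimizer wrapped for distributed training.
    optimizer = hvd.DistributedOptimizer(tf.keras.optimizers.SGD(config["lr"]))
    return dict(optimizer=optimizer,
                loss="mean_squared_error",
                metrics=["mean_squared_error"])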
Example no. 7
    def test_sparkxshards(self):

        train_data_shard = XShards.partition({
            "x": np.random.randn(100, 1),
            "y": np.random.randint(0, 1, size=(100,))
        })

        config = {"batch_size": 4, "lr": 0.8}
        trainer = Estimator(model_creator=model_creator,
                            verbose=True,
                            config=config,
                            workers_per_node=2)

        trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
        trainer.evaluate(train_data_shard, steps=25)
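Example no. 8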
    def test_auto_shard_tf(self):
        # File 1 contains all 0s, file 2 contains all 1s.
        # If the dataset is sharded by files, every record in a batch is
        # identical on each worker; if it is sharded by records, each batch
        # mixes records from both files. The loss function is constructed
        # so that the former case yields 0 and the latter yields non-zero.

        ray_ctx = RayContext.get()
        trainer = Estimator(
            model_creator=auto_shard_model_creator,
            verbose=True,
            config={"batch_size": 4},
            backend="tf", workers_per_node=2)
        stats = trainer.fit(create_auto_shard_datasets, epochs=1, steps_per_epoch=2)
        assert stats["train_loss"] == 0.0
    def impl_test_fit_and_evaluate(self, backend):
        import tensorflow as tf
        ray_ctx = RayContext.get()
        batch_size = 32
        global_batch_size = batch_size * ray_ctx.num_ray_nodes
        config = {"batch_size": global_batch_size}

        trainer = Estimator(model_creator=simple_model,
                            compile_args_creator=compile_args,
                            verbose=True,
                            config=config,
                            backend=backend)

        # model baseline performance
        start_stats = trainer.evaluate(create_test_dataset,
                                       steps=NUM_TEST_SAMPLES //
                                       global_batch_size)
        print(start_stats)

        def scheduler(epoch):
            if epoch < 2:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.1 * (2 - epoch))

        scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler,
                                                             verbose=1)
        # train for 2 epochs
        trainer.fit(create_train_datasets, epochs=2, callbacks=[scheduler])
        trainer.fit(create_train_datasets, epochs=2, callbacks=[scheduler])

        # model performance after training (should improve)
        end_stats = trainer.evaluate(create_test_dataset,
                                     steps=NUM_TEST_SAMPLES //
                                     global_batch_size)
        print(end_stats)

        # sanity check that training worked
        dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
        dmse = (end_stats["validation_mean_squared_error"] -
                start_stats["validation_mean_squared_error"])
        print(f"dLoss: {dloss}, dMSE: {dmse}")

        assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
Example no. 10
    callbacks = get_lr_schedule_callbacks(initial_lr)

    config = {
        "wd": 0.00005,
        "momentum": 0.9,
        "batch_size": global_batch_size,
        "warmup_epoch": 5,
        "num_worker": args.worker_num,
        "data_dir": args.data_dir,
        "bf16": args.use_bf16,
        "lr": initial_lr,
    }

    trainer = Estimator(model_creator=model_creator,
                        compile_args_creator=compile_args_creator,
                        verbose=True,
                        config=config,
                        backend="horovod")

    if args.benchmark:
        trainer.fit(
            data_creator=train_data_creator
            if not args.use_dummy_data else dummy_data_creator,
            epochs=3,
            steps_per_epoch=20,
            callbacks=callbacks,
        )
    else:
        epoch = 0
        for i in range(5):
            dummy = args.use_dummy_data