def test_predict_xshards(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    expected = train_data_shard.collect()
    expected = [shard["x"] for shard in expected]
    for x in expected:
        print(x.shape)
    expected = np.concatenate(expected)

    config = {}
    trainer = Estimator(model_creator=identity_model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)

    result = trainer.predict(train_data_shard, batch_size=10).collect()
    result = [shard["prediction"] for shard in result]
    result = np.concatenate(result)

    assert np.allclose(expected, result)
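
# NOTE: ``identity_model_creator`` is defined elsewhere in this test module and
# is not part of this excerpt. The stand-in below is only a sketch of what it
# plausibly looks like (the name ``identity_model_creator_sketch`` is
# hypothetical): a compiled Keras model that returns its input unchanged, which
# is why the assertion above can compare predictions against the raw "x" values.
def identity_model_creator_sketch(config):
    import tensorflow as tf
    model = tf.keras.models.Sequential([
        # Pass the (batch, 1) input straight through.
        tf.keras.layers.Lambda(lambda x: x, input_shape=(1,))
    ])
    model.compile(loss="mse")  # compile args don't matter for predict-only use
    return model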
def test_dataframe(self):
    sc = init_nncontext()
    rdd = sc.range(0, 10)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    # Use the builtin ``float`` here: ``np.float`` is deprecated and was
    # removed in NumPy 1.24.
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1).astype(float)),
                            int(np.random.randint(0, 1, size=())))).toDF(
        ["feature", "label"])

    config = {"batch_size": 4, "lr": 0.8}
    trainer = Estimator(model_creator=model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)

    trainer.fit(df, epochs=1, steps_per_epoch=25,
                feature_cols=["feature"],
                label_cols=["label"])
    trainer.evaluate(df, steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
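
# NOTE: ``model_creator`` is likewise defined elsewhere in this module. A
# minimal sketch (hypothetical name and architecture), assuming it builds a
# one-unit linear model and reads the learning rate from ``config["lr"]`` as
# the tests above suggest; the real definition may differ:
def model_creator_sketch(config):
    import tensorflow as tf
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1, input_shape=(1,))
    ])
    model.compile(
        optimizer=tf.keras.optimizers.SGD(learning_rate=config["lr"]),
        loss="mse",
        metrics=["mse"])
    return model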
def test_sparkxshards_with_imbalanced_data(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })

    def random_pad(data):
        import numpy as np
        import random
        # Duplicate each shard a random number of times so that the
        # partitions end up with different sizes.
        times = random.randint(1, 10)
        data["x"] = np.concatenate([data["x"]] * times)
        data["y"] = np.concatenate([data["y"]] * times)
        return data

    train_data_shard = train_data_shard.transform_shard(random_pad)

    config = {"batch_size": 4, "lr": 0.8}
    trainer = Estimator(model_creator=model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)
    trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
    trainer.evaluate(train_data_shard, steps=25)
def test_changing_config_during_fit(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    config = {"lr": 0.8}
    trainer = Estimator(model_creator=model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)
    # batch_size is supplied per-fit via data_config instead of the
    # Estimator-level config.
    trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25,
                data_config={"batch_size": 8})
def test_require_batch_size(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    config = {"lr": 0.8}
    trainer = Estimator(model_creator=model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)
    with pytest.raises(ray.exceptions.RayTaskError,
                       match=r".*batch_size must be set in config.*"):
        trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
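
# NOTE: the "batch_size must be set in config" failure above surfaces as a
# RayTaskError because the check runs inside the remote worker tasks: when
# fitting XShards, each worker batches its own partition with
# ``config["batch_size"]``. A hypothetical sketch of that per-worker step
# (``_shard_to_dataset_sketch`` is not the library's actual function):
def _shard_to_dataset_sketch(data, config):
    import tensorflow as tf
    assert "batch_size" in config, "batch_size must be set in config"
    dataset = tf.data.Dataset.from_tensor_slices((data["x"], data["y"]))
    return dataset.batch(config["batch_size"])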
def test_horovod_learning_rate_schedule(self):
    import horovod
    major, minor, patch = horovod.__version__.split(".")
    # The initial_lr argument of Horovod's learning rate callbacks
    # requires horovod >= 0.19.2.
    larger_major = int(major) > 0
    larger_minor = int(major) == 0 and int(minor) > 19
    larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2
    if larger_major or larger_minor or larger_patch:
        ray_ctx = RayContext.get()
        batch_size = 32
        workers_per_node = 4
        global_batch_size = batch_size * workers_per_node
        config = {"batch_size": global_batch_size, "lr": 0.8}
        trainer = Estimator(model_creator=simple_model,
                            compile_args_creator=compile_args,
                            verbose=True,
                            config=config,
                            backend="horovod",
                            workers_per_node=workers_per_node)
        import horovod.tensorflow.keras as hvd
        callbacks = [
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     initial_lr=0.4,
                                                     verbose=True),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=5,
                                                       end_epoch=10,
                                                       multiplier=1.,
                                                       initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=10,
                                                       end_epoch=15,
                                                       multiplier=1e-1,
                                                       initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=15,
                                                       end_epoch=20,
                                                       multiplier=1e-2,
                                                       initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                       multiplier=1e-3,
                                                       initial_lr=0.4),
            LRChecker()
        ]
        for i in range(30):
            trainer.fit(create_train_datasets, epochs=1, callbacks=callbacks)
    else:
        # Skip the test on older Horovod versions.
        pass
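
# NOTE: ``LRChecker`` is a custom callback defined elsewhere in this module,
# presumably asserting that the live learning rate follows the staged schedule
# above. A hypothetical, much weaker stand-in that merely records the rate
# observed at each epoch end:
import tensorflow as tf

class LRCheckerSketch(tf.keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.seen_lrs = []

    def on_epoch_end(self, epoch, logs=None):
        # Read the optimizer's current learning rate.
        lr = float(tf.keras.backend.get_value(self.model.optimizer.lr))
        self.seen_lrs.append(lr)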
def test_sparkxshards(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    config = {"batch_size": 4, "lr": 0.8}
    trainer = Estimator(model_creator=model_creator,
                        verbose=True,
                        config=config,
                        workers_per_node=2)
    trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
    trainer.evaluate(train_data_shard, steps=25)
def test_auto_shard_tf(self):
    # File 1 contains all 0s and file 2 contains all 1s.
    # If sharding is by file, each model sees identical records within a
    # batch; if sharding is by record, each batch mixes records from both
    # files. The loss function is constructed so that the former case
    # yields 0 and the latter a non-zero value.
    ray_ctx = RayContext.get()
    trainer = Estimator(model_creator=auto_shard_model_creator,
                        verbose=True,
                        config={"batch_size": 4},
                        backend="tf",
                        workers_per_node=2)
    stats = trainer.fit(create_auto_shard_datasets, epochs=1, steps_per_epoch=2)
    assert stats["train_loss"] == 0.0
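
# NOTE: ``create_auto_shard_datasets`` is defined elsewhere. Based on the
# comment in the test above, a plausible (hypothetical) sketch: two line-based
# text files, one of all 0s and one of all 1s, read via a file-level interleave
# so that TF's default AUTO sharding policy can shard by file and give each of
# the two workers a homogeneous batch.
def create_auto_shard_datasets_sketch(config):
    import tensorflow as tf
    # Assumed layout: config["data_dir"] contains exactly the two text files.
    files = tf.data.Dataset.list_files(config["data_dir"] + "/*.txt",
                                       shuffle=False)
    dataset = files.interleave(tf.data.TextLineDataset,
                               cycle_length=2, block_length=1)
    dataset = dataset.map(lambda line: tf.strings.to_number(line))
    dataset = dataset.map(lambda x: (x, x))
    return dataset.batch(config["batch_size"])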
def impl_test_fit_and_evaluate(self, backend):
    import tensorflow as tf
    ray_ctx = RayContext.get()
    batch_size = 32
    global_batch_size = batch_size * ray_ctx.num_ray_nodes
    config = {"batch_size": global_batch_size}
    trainer = Estimator(model_creator=simple_model,
                        compile_args_creator=compile_args,
                        verbose=True,
                        config=config,
                        backend=backend)

    # model baseline performance
    start_stats = trainer.evaluate(create_test_dataset,
                                   steps=NUM_TEST_SAMPLES // global_batch_size)
    print(start_stats)

    def scheduler(epoch):
        if epoch < 2:
            return 0.001
        else:
            return 0.001 * tf.math.exp(0.1 * (2 - epoch))

    lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

    # train (two fit calls of 2 epochs each)
    trainer.fit(create_train_datasets, epochs=2, callbacks=[lr_scheduler])
    trainer.fit(create_train_datasets, epochs=2, callbacks=[lr_scheduler])

    # model performance after training (should improve)
    end_stats = trainer.evaluate(create_test_dataset,
                                 steps=NUM_TEST_SAMPLES // global_batch_size)
    print(end_stats)

    # sanity check that training worked
    dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
    dmse = (end_stats["validation_mean_squared_error"] -
            start_stats["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")
    assert dloss < 0 and dmse < 0, \
        "training sanity check failed. loss increased!"
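
# NOTE: ``create_train_datasets`` and ``create_test_dataset`` are defined
# elsewhere in this module, and NUM_TEST_SAMPLES is a module-level constant.
# A hypothetical sketch of the evaluation dataset, assuming a small random
# regression task (the real definitions may differ):
def create_test_dataset_sketch(config):
    import numpy as np
    import tensorflow as tf
    x = np.random.rand(NUM_TEST_SAMPLES, 1).astype(np.float32)
    y = (2 * x + 1).astype(np.float32)  # assumed linear target
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    return dataset.batch(config["batch_size"])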
callbacks = get_lr_schedule_callbacks(initial_lr)
config = {
    "wd": 0.00005,
    "momentum": 0.9,
    "batch_size": global_batch_size,
    "warmup_epoch": 5,
    "num_worker": args.worker_num,
    "data_dir": args.data_dir,
    "bf16": args.use_bf16,
    "lr": initial_lr,
}
trainer = Estimator(model_creator=model_creator,
                    compile_args_creator=compile_args_creator,
                    verbose=True,
                    config=config,
                    backend="horovod")

if args.benchmark:
    trainer.fit(
        data_creator=train_data_creator if not args.use_dummy_data
        else dummy_data_creator,
        epochs=3,
        steps_per_epoch=20,
        callbacks=callbacks,
    )
else:
    epoch = 0
    for i in range(5):
        dummy = args.use_dummy_data