def test_partition_num_less_than_workers(self):
    sc = init_nncontext()
    rdd = sc.range(200, numSlices=1)
    assert rdd.getNumPartitions() == 1
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    assert df.rdd.getNumPartitions() < trainer.num_workers
    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                validation_data=df, validation_steps=1,
                feature_cols=["feature"], label_cols=["label"])
    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"], label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
def test_parquet_images_training(self):
    from bigdl.orca.learn.tf2 import Estimator
    temp_dir = tempfile.mkdtemp()
    try:
        ParquetDataset.write("file://" + temp_dir, images_generator(), images_schema)
        path = "file://" + temp_dir
        output_types = {"id": tf.string, "image": tf.string, "label": tf.float32}
        output_shapes = {"id": (), "image": (), "label": ()}

        def data_creator(config, batch_size):
            dataset = read_parquet("tf_dataset", path=path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
            dataset = dataset.shuffle(10)
            dataset = dataset.map(lambda data_dict: (data_dict["image"], data_dict["label"]))
            dataset = dataset.map(parse_data_train)
            dataset = dataset.batch(batch_size)
            return dataset

        ray_ctx = RayContext.get()
        trainer = Estimator.from_keras(model_creator=model_creator)
        trainer.fit(data=data_creator, epochs=1, batch_size=2)
    finally:
        shutil.rmtree(temp_dir)
def test_predict_xshards(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    expected = train_data_shard.collect()
    expected = [shard["x"] for shard in expected]
    for x in expected:
        print(x.shape)
    expected = np.concatenate(expected)

    config = {}
    trainer = Estimator.from_keras(model_creator=identity_model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    result_shards = trainer.predict(train_data_shard, batch_size=10).collect()
    result = [shard["prediction"] for shard in result_shards]
    expected_result = [shard["x"] for shard in result_shards]
    result = np.concatenate(result)
    assert np.allclose(expected, result)
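# Note (illustrative, not part of the original test file): identity_model_creator
# is defined elsewhere in the test module. The assertion above (predictions equal
# the input features) only holds if it builds a pass-through model; a minimal
# sketch, assuming single-feature inputs, could look like this:
#
# def identity_model_creator(config):
#     import tensorflow as tf
#     inputs = tf.keras.layers.Input(shape=(1,))
#     # Identity mapping: the model's output is exactly its input.
#     outputs = tf.keras.layers.Lambda(lambda x: x)(inputs)
#     model = tf.keras.Model(inputs, outputs)
#     model.compile(optimizer="sgd", loss="mse")
#     return model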
def test_pandas_dataframe(self):
    def model_creator(config):
        import tensorflow as tf
        input1 = tf.keras.layers.Input(shape=(1,))
        input2 = tf.keras.layers.Input(shape=(1,))
        concatenation = tf.concat([input1, input2], axis=-1)
        outputs = tf.keras.layers.Dense(units=1, activation='softmax')(concatenation)
        model = tf.keras.Model(inputs=[input1, input2], outputs=outputs)
        model.compile(**compile_args(config))
        return model

    file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
    train_data_shard = bigdl.orca.data.pandas.read_csv(file_path)

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=1)
    trainer.fit(train_data_shard, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["user", "item"], label_cols=["label"])
    trainer.evaluate(train_data_shard, batch_size=4, num_steps=25,
                     feature_cols=["user", "item"], label_cols=["label"])
    trainer.predict(train_data_shard, feature_cols=["user", "item"]).collect()
def test_sparkxshards_with_inbalanced_data(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })

    def random_pad(data):
        import numpy as np
        import random
        times = random.randint(1, 10)
        data["x"] = np.concatenate([data["x"]] * times)
        data["y"] = np.concatenate([data["y"]] * times)
        return data

    train_data_shard = train_data_shard.transform_shard(random_pad)

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    trainer.fit(train_data_shard, epochs=1, batch_size=4, steps_per_epoch=25)
    trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
def test_num_part_data_diff_val_data(self):
    sc = init_nncontext()
    rdd = sc.range(200, numSlices=10)
    val_rdd = sc.range(60, numSlices=8)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])
    val_df = val_rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                                    int(np.random.randint(0, 1, size=()))))\
        .toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    assert df.rdd.getNumPartitions() > trainer.num_workers
    assert df.rdd.getNumPartitions() != val_df.rdd.getNumPartitions()
    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                validation_data=val_df, validation_steps=1,
                feature_cols=["feature"], label_cols=["label"])
def test_dataframe_shard_size(self):
    from bigdl.orca import OrcaContext
    # limit each shard to 3 records when the DataFrame is converted to XShards
    OrcaContext._shard_size = 3
    sc = init_nncontext()
    rdd = sc.range(0, 10)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                            int(np.random.randint(0, 1, size=())))).toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["feature"], label_cols=["label"])
    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"], label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
    OrcaContext._shard_size = None
def test_dataframe_with_empty_partition(self):
    from bigdl.orca import OrcaContext
    sc = OrcaContext.get_spark_context()
    rdd = sc.range(0, 10)
    rdd_with_empty = rdd.repartition(4).\
        mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)
    from pyspark.ml.linalg import DenseVector
    df = rdd_with_empty.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                                       int(np.random.randint(0, 1, size=()))))\
        .toDF(["feature", "label"])

    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["feature"], label_cols=["label"])
    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"], label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
def test_save_and_load(self):
    def model_creator(config):
        import tensorflow as tf
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(64, kernel_size=(3, 3), strides=(1, 1),
                                   activation='relu', padding='valid'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
            tf.keras.layers.Conv2D(64, kernel_size=(3, 3), strides=(1, 1),
                                   activation='relu', padding='valid'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid'),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        return model

    def train_data_creator(config, batch_size):
        import tensorflow as tf  # imported locally so the creator also runs on remote workers
        dataset = tf.data.Dataset.from_tensor_slices(
            (np.random.randn(100, 28, 28, 3),
             np.random.randint(0, 10, (100, 1))))
        dataset = dataset.repeat()
        dataset = dataset.shuffle(1000)
        dataset = dataset.batch(batch_size)
        return dataset

    batch_size = 320
    try:
        est = Estimator.from_keras(model_creator=model_creator, workers_per_node=2)
        history = est.fit(train_data_creator,
                          epochs=1,
                          batch_size=batch_size,
                          steps_per_epoch=5)
        print("start saving")
        est.save("/tmp/cifar10_keras.ckpt")
        est.load("/tmp/cifar10_keras.ckpt")
        print("save success")
    finally:
        os.remove("/tmp/cifar10_keras.ckpt")
def test_dataframe_predict(self):
    sc = init_nncontext()
    rdd = sc.parallelize(range(20))
    df = rdd.map(lambda x: ([float(x)] * 5,
                            [int(np.random.randint(0, 2, size=()))])).toDF(["feature", "label"])

    estimator = Estimator.from_keras(model_creator=identity_model_creator,
                                     verbose=True,
                                     config={},
                                     workers_per_node=2)
    result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
    expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
    assert result.selectExpr(expr).first()["error"] == 0
def test_horovod_learning_rate_schedule(self):
    import horovod
    major, minor, patch = horovod.__version__.split(".")
    larger_major = int(major) > 0
    larger_minor = int(major) == 0 and int(minor) > 19
    larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2
    if larger_major or larger_minor or larger_patch:
        ray_ctx = RayContext.get()
        batch_size = 32
        workers_per_node = 4
        global_batch_size = batch_size * workers_per_node
        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=simple_model,
                                       compile_args_creator=compile_args,
                                       verbose=True,
                                       config=config,
                                       backend="horovod",
                                       workers_per_node=workers_per_node)
        import horovod.tensorflow.keras as hvd
        callbacks = [
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     initial_lr=0.4,
                                                     verbose=True),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=5, end_epoch=10,
                                                       multiplier=1., initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=10, end_epoch=15,
                                                       multiplier=1e-1, initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=15, end_epoch=20,
                                                       multiplier=1e-2, initial_lr=0.4),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                       multiplier=1e-3, initial_lr=0.4),
            LRChecker()
        ]
        for i in range(30):
            trainer.fit(create_train_datasets,
                        epochs=1,
                        batch_size=global_batch_size,
                        callbacks=callbacks)
    else:
        # skip the test on horovod versions lower than 0.19.2
        pass
def test_dataframe(self):
    sc = OrcaContext.get_spark_context()
    rdd = sc.range(0, 100)
    spark = OrcaContext.get_spark_session()
    from pyspark.ml.linalg import DenseVector
    df = rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float64)),
                            int(np.random.randint(0, 2, size=())))).toDF(["feature", "label"])

    config = {"lr": 0.2}
    temp_dir = tempfile.mkdtemp()
    try:
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2,
                                       backend="spark",
                                       model_dir=temp_dir)
        res = trainer.fit(df, epochs=5, batch_size=4, steps_per_epoch=25,
                          feature_cols=["feature"],
                          label_cols=["label"],
                          validation_data=df,
                          validation_steps=1)
        print("start saving")
        trainer.save_weights(os.path.join(temp_dir, "cifar10_keras.h5"))
        trainer.load_weights(os.path.join(temp_dir, "cifar10_keras.h5"))
        trainer.save(os.path.join(temp_dir, "a.model"))
        trainer.load(os.path.join(temp_dir, "a.model"))
        res = trainer.evaluate(df, batch_size=4, num_steps=25,
                               feature_cols=["feature"],
                               label_cols=["label"])
        print("validation result: ", res)
        res = trainer.predict(df, feature_cols=["feature"]).collect()
    finally:
        shutil.rmtree(temp_dir)
def test_sparkxshards(self):
    train_data_shard = XShards.partition({
        "x": np.random.randn(100, 1),
        "y": np.random.randint(0, 1, size=(100,))
    })
    config = {"lr": 0.8}
    trainer = Estimator.from_keras(model_creator=model_creator,
                                   verbose=True,
                                   config=config,
                                   workers_per_node=2)
    trainer.fit(train_data_shard, epochs=1, batch_size=4, steps_per_epoch=25)
    trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
def test_auto_shard_tf(self):
    # File 1 contains all 0s, file 2 contains all 1s.
    # If sharded by files, each model sees the same records within a batch.
    # If sharded by records, each batch mixes records from both files.
    # The loss function is constructed such that the former case returns 0
    # and the latter case returns non-zero.
    ray_ctx = RayContext.get()
    trainer = Estimator.from_keras(model_creator=auto_shard_model_creator,
                                   verbose=True,
                                   backend="tf2",
                                   workers_per_node=2)
    stats = trainer.fit(create_auto_shard_datasets, epochs=1, batch_size=4, steps_per_epoch=2)
    assert stats["train_loss"] == 0.0
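# Note (illustrative, not part of the original test file): create_auto_shard_datasets
# is defined elsewhere in the test module. Assuming the two-file layout described in
# the comment above, a dataset creator in the same spirit would read the files
# per-file, so that tf.data auto-sharding can assign whole files to workers:
#
# def create_auto_shard_datasets(config, batch_size):
#     import tensorflow as tf
#     # Hypothetical fixture files: one containing only 0s, the other only 1s.
#     files = ["/tmp/auto_shard/file_0.csv", "/tmp/auto_shard/file_1.csv"]
#     dataset = tf.data.Dataset.from_tensor_slices(files)
#     dataset = dataset.interleave(
#         lambda path: tf.data.TextLineDataset(path), cycle_length=2)
#     dataset = dataset.map(lambda line: tf.strings.to_number(line, out_type=tf.float32))
#     dataset = dataset.map(lambda x: (x, x))  # yield (feature, label) pairs
#     return dataset.batch(batch_size)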
def test_string_input(self):
    def model_creator(config):
        import tensorflow as tf
        vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=10, output_mode='int', output_sequence_length=4)
        model = tf.keras.models.Sequential()
        model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
        model.add(vectorize_layer)
        return model

    from bigdl.orca import OrcaContext
    from pyspark.sql.types import StructType, StructField, StringType
    spark = OrcaContext.get_spark_session()
    schema = StructType([StructField("input", StringType(), True)])
    input_data = [["foo qux bar"], ["qux baz"]]
    input_df = spark.createDataFrame(input_data, schema)
    estimator = Estimator.from_keras(model_creator=model_creator)
    output_df = estimator.predict(input_df, batch_size=1, feature_cols=["input"])
    output = output_df.collect()
    print(output)
def main():
    anchors = yolo_anchors
    anchor_masks = yolo_anchor_masks

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", dest="data_dir",
                        help="Required. The path where data locates.")
    parser.add_argument("--output_data", dest="output_data", default=tempfile.mkdtemp(),
                        help="Required. The path where voc parquet data locates.")
    parser.add_argument("--data_year", dest="data_year", default="2009",
                        help="Required. The voc data year.")
    parser.add_argument("--split_name_train", dest="split_name_train", default="train",
                        help="Required. Split name.")
    parser.add_argument("--split_name_test", dest="split_name_test", default="val",
                        help="Required. Split name.")
    parser.add_argument("--names", dest="names",
                        help="Required. The path where class names locates.")
    parser.add_argument("--weights", dest="weights", default="./checkpoints/yolov3.weights",
                        help="Required. The path where weights locates.")
    parser.add_argument("--checkpoint", dest="checkpoint", default="./checkpoints/yolov3.tf",
                        help="Required. The path where checkpoint locates.")
    parser.add_argument("--checkpoint_folder", dest="checkpoint_folder", default="./checkpoints",
                        help="Required. The path where saved checkpoint locates.")
    parser.add_argument("--epochs", dest="epochs", type=int, default=2,
                        help="Required. epochs.")
    parser.add_argument("--batch_size", dest="batch_size", type=int, default=16,
                        help="Required. batch size.")
    parser.add_argument("--cluster_mode", dest="cluster_mode", default="local",
                        help="Required. Run on local/yarn/k8s/spark-submit mode.")
    parser.add_argument("--class_num", dest="class_num", type=int, default=20,
                        help="Required. class num.")
    parser.add_argument("--worker_num", type=int, default=1,
                        help="The number of slave nodes to be used in the cluster. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--cores", type=int, default=4,
                        help="The number of cpu cores you want to use on each node. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--memory", type=str, default="20g",
                        help="The memory you want to use on each node. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--object_store_memory", type=str, default="10g",
                        help="The object store memory you want to use on each node. "
                             "You can change it depending on your own cluster setting.")
    parser.add_argument("--enable_numa_binding", dest="enable_numa_binding", default=False,
                        help="enable_numa_binding")
    parser.add_argument('--k8s_master', type=str, default="",
                        help="The k8s master. "
                             "It should be k8s://https://<k8s-apiserver-host>: "
                             "<k8s-apiserver-port>.")
    parser.add_argument("--container_image", type=str, default="",
                        help="The runtime k8s image.")
    parser.add_argument('--k8s_driver_host', type=str, default="",
                        help="The k8s driver localhost.")
    parser.add_argument('--k8s_driver_port', type=str, default="",
                        help="The k8s driver port.")
    parser.add_argument('--nfs_mount_path', type=str, default="",
                        help="nfs mount path")

    options = parser.parse_args()

    if options.cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=options.cores,
                          num_nodes=options.worker_num, memory=options.memory,
                          init_ray_on_spark=True,
                          object_store_memory=options.object_store_memory)
    elif options.cluster_mode == "k8s":
        init_orca_context(cluster_mode="k8s", master=options.k8s_master,
                          container_image=options.container_image,
                          init_ray_on_spark=True,
                          enable_numa_binding=options.enable_numa_binding,
                          num_nodes=options.worker_num, cores=options.cores,
                          memory=options.memory,
                          object_store_memory=options.object_store_memory,
                          conf={
                              "spark.driver.host": options.k8s_driver_host,
                              "spark.driver.port": options.k8s_driver_port,
                              "spark.kubernetes.executor.volumes.persistentVolumeClaim."
                              "nfsvolumeclaim.options.claimName": "nfsvolumeclaim",
                              "spark.kubernetes.executor.volumes.persistentVolumeClaim."
                              "nfsvolumeclaim.mount.path": options.nfs_mount_path,
                              "spark.kubernetes.driver.volumes.persistentVolumeClaim."
                              "nfsvolumeclaim.options.claimName": "nfsvolumeclaim",
                              "spark.kubernetes.driver.volumes.persistentVolumeClaim."
                              "nfsvolumeclaim.mount.path": options.nfs_mount_path
                          })
    elif options.cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client", cores=options.cores,
                          num_nodes=options.worker_num, memory=options.memory,
                          init_ray_on_spark=True,
                          enable_numa_binding=options.enable_numa_binding,
                          object_store_memory=options.object_store_memory)
    elif options.cluster_mode == "spark-submit":
        init_orca_context(cluster_mode="spark-submit")

    # convert pretrained yolov3 darknet weights to a TF checkpoint
    yolo = YoloV3(classes=80)
    load_darknet_weights(yolo, options.weights)
    yolo.save_weights(options.checkpoint)

    def model_creator(config):
        model = YoloV3(DEFAULT_IMAGE_SIZE, training=True, classes=options.class_num)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

        model_pretrained = YoloV3(DEFAULT_IMAGE_SIZE, training=True, classes=80)
        model_pretrained.load_weights(options.checkpoint)

        model.get_layer('yolo_darknet').set_weights(
            model_pretrained.get_layer('yolo_darknet').get_weights())
        freeze_all(model.get_layer('yolo_darknet'))

        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        loss = [YoloLoss(anchors[mask], classes=options.class_num)
                for mask in anchor_masks]
        model.compile(optimizer=optimizer, loss=loss, run_eagerly=False)
        return model

    # prepare data
    class_map = {name: idx
                 for idx, name in enumerate(open(options.names).read().splitlines())}
    dataset_path = os.path.join(options.data_dir, "VOCdevkit")
    voc_train_path = os.path.join(options.output_data, "train_dataset")
    voc_val_path = os.path.join(options.output_data, "val_dataset")

    write_parquet(format="voc", voc_root_path=dataset_path,
                  output_path="file://" + voc_train_path,
                  splits_names=[(options.data_year, options.split_name_train)],
                  classes=class_map)
    write_parquet(format="voc", voc_root_path=dataset_path,
                  output_path="file://" + voc_val_path,
                  splits_names=[(options.data_year, options.split_name_test)],
                  classes=class_map)

    output_types = {"image": tf.string, "label": tf.float32, "image_id": tf.string}
    output_shapes = {"image": (), "label": (None, 5), "image_id": ()}

    def train_data_creator(config, batch_size):
        train_dataset = read_parquet(format="tf_dataset", path=voc_train_path,
                                     output_types=output_types,
                                     output_shapes=output_shapes)
        train_dataset = train_dataset.map(
            lambda data_dict: (data_dict["image"], data_dict["label"]))
        train_dataset = train_dataset.map(parse_data_train)
        train_dataset = train_dataset.shuffle(buffer_size=512)
        train_dataset = train_dataset.batch(batch_size)
        train_dataset = train_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return train_dataset

    def val_data_creator(config, batch_size):
        val_dataset = read_parquet(format="tf_dataset", path=voc_val_path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
        val_dataset = val_dataset.map(
            lambda data_dict: (data_dict["image"], data_dict["label"]))
        val_dataset = val_dataset.map(parse_data_train)
        val_dataset = val_dataset.batch(batch_size)
        val_dataset = val_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        return val_dataset

    callbacks = [
        ReduceLROnPlateau(verbose=1),
        EarlyStopping(patience=3, verbose=1),
        ModelCheckpoint(options.checkpoint_folder + '/yolov3_train_{epoch}.tf',
                        verbose=1, save_weights_only=True),
        TensorBoard(log_dir='logs')
    ]

    trainer = Estimator.from_keras(model_creator=model_creator)

    trainer.fit(train_data_creator,
                epochs=options.epochs,
                batch_size=options.batch_size,
                steps_per_epoch=3473 // options.batch_size,
                callbacks=callbacks,
                validation_data=val_data_creator,
                validation_steps=3581 // options.batch_size)
    stop_orca_context()
initial_lr = 0.1 * lr_multiplier
callbacks = get_lr_schedule_callbacks(initial_lr)

config = {
    "wd": 0.00005,
    "momentum": 0.9,
    "warmup_epoch": 5,
    "num_worker": args.worker_num,
    "data_dir": args.data_dir,
    "bf16": args.use_bf16,
    "lr": initial_lr,
}
trainer = Estimator.from_keras(model_creator=model_creator,
                               compile_args_creator=compile_args_creator,
                               verbose=True,
                               config=config,
                               backend="horovod")

if args.benchmark:
    trainer.fit(
        data=train_data_creator if not args.use_dummy_data else dummy_data_creator,
        epochs=3,
        batch_size=global_batch_size,
        steps_per_epoch=20,
        callbacks=callbacks,
    )
else:
    epoch = 0
    for i in range(5):
def impl_test_fit_and_evaluate(self, backend):
    import tensorflow as tf
    ray_ctx = RayContext.get()
    batch_size = 32
    global_batch_size = batch_size * ray_ctx.num_ray_nodes

    if backend == "horovod":
        trainer = Estimator.from_keras(model_creator=simple_model,
                                       compile_args_creator=compile_args,
                                       verbose=True,
                                       config=None,
                                       backend=backend)
    else:
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=None,
                                       backend=backend,
                                       workers_per_node=2)

    # model baseline performance
    start_stats = trainer.evaluate(create_test_dataset,
                                   batch_size=global_batch_size,
                                   num_steps=NUM_TEST_SAMPLES // global_batch_size)
    print(start_stats)

    def scheduler(epoch):
        if epoch < 2:
            return 0.001
        else:
            return 0.001 * tf.math.exp(0.1 * (2 - epoch))

    scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

    # train for 2 epochs, twice
    trainer.fit(create_train_datasets, epochs=2, batch_size=global_batch_size,
                steps_per_epoch=10, callbacks=[scheduler])
    trainer.fit(create_train_datasets, epochs=2, batch_size=global_batch_size,
                steps_per_epoch=10, callbacks=[scheduler])

    # model performance after training (should improve)
    end_stats = trainer.evaluate(create_test_dataset,
                                 batch_size=global_batch_size,
                                 num_steps=NUM_TEST_SAMPLES // global_batch_size)
    print(end_stats)

    # sanity check that training worked
    dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
    dmse = (end_stats["validation_mean_squared_error"] -
            start_stats["validation_mean_squared_error"])
    print(f"dLoss: {dloss}, dMSE: {dmse}")
    assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"