Example #1
    def test_horovod_learning_rate_schedule(self):
        import horovod
        major, minor, patch = horovod.__version__.split(".")

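        # The learning-rate schedule test below only runs on Horovod 0.19.2 or newer; older versions are skipped.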
        larger_major = int(major) > 0
        larger_minor = int(major) == 0 and int(minor) > 19
        larger_patch = int(major) == 0 and int(minor) == 19 and int(patch) >= 2

        if larger_major or larger_minor or larger_patch:
            ray_ctx = RayContext.get()
            batch_size = 32
            workers_per_node = 4
            global_batch_size = batch_size * workers_per_node
            config = {"batch_size": global_batch_size, "lr": 0.8}
            trainer = Estimator(model_creator=simple_model,
                                compile_args_creator=compile_args,
                                verbose=True,
                                config=config,
                                backend="horovod",
                                workers_per_node=workers_per_node)
            import horovod.tensorflow.keras as hvd
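            # Warm up to lr=0.4 over the first 5 epochs, hold it until epoch 10,
            # then multiply the base lr by 1e-1, 1e-2 and 1e-3 at epochs 10, 15 and 20.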
            callbacks = [
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                         initial_lr=0.4,
                                                         verbose=True),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=5,
                                                           end_epoch=10,
                                                           multiplier=1.,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=10,
                                                           end_epoch=15,
                                                           multiplier=1e-1,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=15,
                                                           end_epoch=20,
                                                           multiplier=1e-2,
                                                           initial_lr=0.4),
                hvd.callbacks.LearningRateScheduleCallback(start_epoch=20,
                                                           multiplier=1e-3,
                                                           initial_lr=0.4),
                LRChecker()
            ]
            for i in range(30):
                trainer.fit(create_train_datasets,
                            epochs=1,
                            callbacks=callbacks)
        else:
            # skip the test on older Horovod versions
            pass
Example #2
    def test_changing_config_during_evaluate(self):
        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100, ))
        })

        config = {"lr": 0.8}
        trainer = Estimator(model_creator=model_creator,
                            verbose=True,
                            config=config,
                            workers_per_node=2)

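        # batch_size is supplied per call via data_config rather than in the Estimator config.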
        trainer.evaluate(train_data_shard, data_config={"batch_size": 8})
Example #3
    def test_require_batch_size(self):
        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100, ))
        })
        config = {"lr": 0.8}
        trainer = Estimator(model_creator=model_creator,
                            verbose=True,
                            config=config,
                            workers_per_node=2)
        with pytest.raises(ray.exceptions.RayTaskError,
                           match=r".*batch_size must be set in config.*"):
            trainer.fit(train_data_shard, epochs=1, steps_per_epoch=25)
Example #4
    def test_num_part_data_diff_val_data(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=10)
        val_rdd = sc.range(60, numSlices=8)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])
        val_df = val_rdd.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                                        int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() > trainer.num_workers
        assert df.rdd.getNumPartitions() != val_df.rdd.getNumPartitions()

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=val_df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
Example #5
    def test_dataframe_shard_size(self):
        from zoo.orca import OrcaContext
        OrcaContext._shard_size = 3
        sc = init_nncontext()
        rdd = sc.range(0, 10)
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example #6
    def test_partition_num_less_than_workers(self):
        sc = init_nncontext()
        rdd = sc.range(200, numSlices=1)
        assert rdd.getNumPartitions() == 1
        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd.map(lambda x:
                     (DenseVector(np.random.randn(1, ).astype(np.float)),
                      int(np.random.randint(0, 1, size=())))).toDF(
                          ["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)
        assert df.rdd.getNumPartitions() < trainer.num_workers

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    validation_data=df,
                    validation_steps=1,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example #7
    def test_pandas_dataframe(self):
        def model_creator(config):
            import tensorflow as tf
            input1 = tf.keras.layers.Input(shape=(1, ))
            input2 = tf.keras.layers.Input(shape=(1, ))
            concatenation = tf.concat([input1, input2], axis=-1)
            outputs = tf.keras.layers.Dense(
                units=1, activation='softmax')(concatenation)
            model = tf.keras.Model(inputs=[input1, input2], outputs=outputs)
            model.compile(**compile_args(config))
            return model

        file_path = os.path.join(resource_path, "orca/learn/ncf2.csv")
        train_data_shard = zoo.orca.data.pandas.read_csv(file_path)

        config = {"lr": 0.8}

        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=1)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["user", "item"],
                    label_cols=["label"])
        trainer.evaluate(train_data_shard,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["user", "item"],
                         label_cols=["label"])
        trainer.predict(train_data_shard, feature_cols=["user",
                                                        "item"]).collect()
Example #8
    def test_dataframe_with_empty_partition(self):
        from zoo.orca import OrcaContext
        sc = OrcaContext.get_spark_context()
        rdd = sc.range(0, 10)

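        # Repartition to 4 partitions and drop everything from partition 0 so one partition is empty.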
        rdd_with_empty = rdd.repartition(4).\
            mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)

        from pyspark.sql import SparkSession
        spark = SparkSession(sc)
        from pyspark.ml.linalg import DenseVector
        df = rdd_with_empty.map(lambda x: (DenseVector(np.random.randn(1,).astype(np.float)),
                                           int(np.random.randint(0, 1, size=()))))\
            .toDF(["feature", "label"])

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(df,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25,
                    feature_cols=["feature"],
                    label_cols=["label"])
        trainer.evaluate(df,
                         batch_size=4,
                         num_steps=25,
                         feature_cols=["feature"],
                         label_cols=["label"])
        trainer.predict(df, feature_cols=["feature"]).collect()
Example #9
    def test_parquet_images_training(self):
        from zoo.orca.learn.tf2 import Estimator
        temp_dir = tempfile.mkdtemp()
        try:
            ParquetDataset.write("file://" + temp_dir, images_generator(),
                                 images_schema)
            path = "file://" + temp_dir
            output_types = {
                "id": tf.string,
                "image": tf.string,
                "label": tf.float32
            }
            output_shapes = {"id": (), "image": (), "label": ()}

            def data_creator(config, batch_size):
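                # Read the Parquet images back as a tf.data.Dataset, then shuffle, parse and batch them.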
                dataset = read_parquet("tf_dataset",
                                       input_path=path,
                                       output_types=output_types,
                                       output_shapes=output_shapes)
                dataset = dataset.shuffle(10)
                dataset = dataset.map(lambda data_dict:
                                      (data_dict["image"], data_dict["label"]))
                dataset = dataset.map(parse_data_train)
                dataset = dataset.batch(batch_size)
                return dataset

            ray_ctx = RayContext.get()
            trainer = Estimator.from_keras(model_creator=model_creator)
            trainer.fit(data=data_creator, epochs=1, batch_size=2)
        finally:
            shutil.rmtree(temp_dir)
Example #10
    def test_predict_xshards(self):
        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100, ))
        })
        expected = train_data_shard.collect()

        expected = [shard["x"] for shard in expected]

        for x in expected:
            print(x.shape)

        expected = np.concatenate(expected)

        config = {}
        trainer = Estimator.from_keras(model_creator=identity_model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

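        # With the identity model, each prediction should reproduce the corresponding input "x".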
        result_shards = trainer.predict(train_data_shard,
                                        batch_size=10).collect()

        result = [shard["prediction"] for shard in result_shards]
        expected_result = [shard["x"] for shard in result_shards]

        result = np.concatenate(result)

        assert np.allclose(expected, result)
Example #11
    def test_sparkxshards_with_inbalanced_data(self):

        train_data_shard = XShards.partition({
            "x":
            np.random.randn(100, 1),
            "y":
            np.random.randint(0, 1, size=(100))
        })

        def random_pad(data):
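            # Duplicate each shard's data a random 1-10 times so the shards end up with unbalanced sizes.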
            import numpy as np
            import random
            times = random.randint(1, 10)
            data["x"] = np.concatenate([data["x"]] * times)
            data["y"] = np.concatenate([data["y"]] * times)
            return data

        train_data_shard = train_data_shard.transform_shard(random_pad)

        config = {"lr": 0.8}
        trainer = Estimator.from_keras(model_creator=model_creator,
                                       verbose=True,
                                       config=config,
                                       workers_per_node=2)

        trainer.fit(train_data_shard,
                    epochs=1,
                    batch_size=4,
                    steps_per_epoch=25)
        trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
Example #12
    def test_auto_shard_tf(self):
        # File 1 contains all 0s and file 2 contains all 1s.
        # If the data is sharded by file, each model sees
        # identical records within a batch; if it is sharded
        # by record, each batch contains different records.
        # The loss function is constructed so that the former
        # case returns 0 and the latter returns non-zero.

        ray_ctx = RayContext.get()
        trainer = Estimator(
            model_creator=auto_shard_model_creator,
            verbose=True,
            config={"batch_size": 4},
            backend="tf", workers_per_node=2)
        stats = trainer.fit(create_auto_shard_datasets, epochs=1, steps_per_epoch=2)
        assert stats["train_loss"] == 0.0
Example #13
    def test_save_and_load(self):
        def model_creator(config):
            import tensorflow as tf
            model = tf.keras.Sequential([
                tf.keras.layers.Conv2D(64,
                                       kernel_size=(3, 3),
                                       strides=(1, 1),
                                       activation='relu',
                                       padding='valid'),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
                                             strides=(2, 2),
                                             padding='valid'),
                tf.keras.layers.Conv2D(64,
                                       kernel_size=(3, 3),
                                       strides=(1, 1),
                                       activation='relu',
                                       padding='valid'),
                tf.keras.layers.MaxPooling2D(pool_size=(2, 2),
                                             strides=(2, 2),
                                             padding='valid'),
                tf.keras.layers.Flatten(),
                tf.keras.layers.Dense(10, activation='softmax')
            ])
            model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                          loss='sparse_categorical_crossentropy',
                          metrics=['accuracy'])
            return model

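        # Random 28x28x3 images with integer labels 0-9, repeated, shuffled and batched in memory.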
        def train_data_creator(config, batch_size):
            dataset = tf.data.Dataset.from_tensor_slices(
                (np.random.randn(100, 28, 28,
                                 3), np.random.randint(0, 10, (100, 1))))
            dataset = dataset.repeat()
            dataset = dataset.shuffle(1000)
            dataset = dataset.batch(batch_size)
            return dataset

        batch_size = 320
        try:
            est = Estimator.from_keras(model_creator=model_creator,
                                       workers_per_node=2)

            history = est.fit(train_data_creator,
                              epochs=1,
                              batch_size=batch_size,
                              steps_per_epoch=5)
            print("start saving")
            est.save("/tmp/cifar10_keras.ckpt")
            est.load("/tmp/cifar10_keras.ckpt")
            print("save success")
        finally:
            os.remove("/tmp/cifar10_keras.ckpt")
Example #14
    def test_dataframe_predict(self):
        sc = init_nncontext()
        rdd = sc.parallelize(range(20))
        df = rdd.map(lambda x: ([float(x)] * 5,
                                [int(np.random.randint(0, 2, size=()))])).toDF(
                                    ["feature", "label"])

        estimator = Estimator.from_keras(model_creator=identity_model_creator,
                                         verbose=True,
                                         config={},
                                         workers_per_node=2)
        result = estimator.predict(df, batch_size=4, feature_cols=["feature"])
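        # The identity model echoes the feature vector, so no row should have feature <> prediction.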
        expr = "sum(cast(feature <> to_array(prediction) as int)) as error"
        assert result.selectExpr(expr).first()["error"] == 0
Example #15
    def test_sparkxshards(self):

        train_data_shard = XShards.partition({"x": np.random.randn(100, 1),
                                              "y": np.random.randint(0, 1, size=(100))})

        config = {
            "lr": 0.8
        }
        trainer = Estimator.from_keras(
            model_creator=model_creator,
            verbose=True,
            config=config,
            workers_per_node=2)

        trainer.fit(train_data_shard, epochs=1, batch_size=4, steps_per_epoch=25)
        trainer.evaluate(train_data_shard, batch_size=4, num_steps=25)
Example #16
def main(max_epoch):

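    # Train for max_epoch epochs (60000 samples per epoch, 10000 for validation), then save, restore and re-evaluate.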
    batch_size = 320
    config = {"batch_size": batch_size}
    est = Estimator.from_keras(model_creator,
                               config=config,
                               workers_per_node=2)
    stats = est.fit(train_data_creator,
                    epochs=max_epoch,
                    steps_per_epoch=60000 // batch_size,
                    validation_data_creator=val_data_creator,
                    validation_steps=10000 // batch_size)
    print(stats)
    est.save("/tmp/mnist_keras.ckpt")
    est.restore("/tmp/mnist_keras.ckpt")
    stats = est.evaluate(val_data_creator, steps=10000 // batch_size)
    print(stats)
Example #17
    def test_string_input(self):
        def model_creator(config):
            import tensorflow as tf
            vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
                max_tokens=10, output_mode='int', output_sequence_length=4)
            model = tf.keras.models.Sequential()
            model.add(tf.keras.Input(shape=(1, ), dtype=tf.string))
            model.add(vectorize_layer)
            return model

        from zoo.orca import OrcaContext
        from pyspark.sql.types import StructType, StructField, StringType
        spark = OrcaContext.get_spark_session()
        schema = StructType([StructField("input", StringType(), True)])
        input_data = [["foo qux bar"], ["qux baz"]]
        input_df = spark.createDataFrame(input_data, schema)
        estimator = Estimator.from_keras(model_creator=model_creator)
        output_df = estimator.predict(input_df,
                                      batch_size=1,
                                      feature_cols=["input"])
        output = output_df.collect()
        print(output)
Example #18
    def impl_test_fit_and_evaluate(self, backend):
        import tensorflow as tf
        ray_ctx = RayContext.get()
        batch_size = 32
        global_batch_size = batch_size * ray_ctx.num_ray_nodes
        config = {"batch_size": global_batch_size}

        trainer = Estimator(model_creator=simple_model,
                            compile_args_creator=compile_args,
                            verbose=True,
                            config=config,
                            backend=backend)

        # model baseline performance
        start_stats = trainer.evaluate(create_test_dataset,
                                       steps=NUM_TEST_SAMPLES //
                                       global_batch_size)
        print(start_stats)

        def scheduler(epoch):
            if epoch < 2:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.1 * (2 - epoch))

        scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler,
                                                             verbose=1)
        # train for 2 epochs
        trainer.fit(create_train_datasets, epochs=2, callbacks=[scheduler])

        # model performance after training (should improve)
        end_stats = trainer.evaluate(create_test_dataset,
                                     steps=NUM_TEST_SAMPLES //
                                     global_batch_size)
        print(end_stats)

        # sanity check that training worked
        dloss = end_stats["validation_loss"] - start_stats["validation_loss"]
        dmse = (end_stats["validation_mean_squared_error"] -
                start_stats["validation_mean_squared_error"])
        print(f"dLoss: {dloss}, dMSE: {dmse}")

        assert dloss < 0 and dmse < 0, "training sanity check failed. loss increased!"
Example #19
    lr_schedule = tf.keras.callbacks.LearningRateScheduler(
        lambda epoch: schedule(epoch, lr_multiplier), verbose=1)

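    # Hyperparameters passed through to the model/compile creator functions on each Horovod worker.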
    config = {
        "momentum": 0.9,
        "wd": 0.00005,
        "batch_size": args.batch_size_per_worker,
        "val_batch_size": args.batch_size_per_worker,
        "warmup_epoch": 5,
        "num_worker": args.worker_num,
        "data_dir": args.data_dir,
    }

    trainer = Estimator(model_creator=model_creator,
                        compile_args_creator=compile_args_creator,
                        verbose=True,
                        config=config,
                        backend="horovod")

    results = trainer.fit(
        data_creator=train_data_creator,
        epochs=90,
        validation_data_creator=val_data_creator,
        steps_per_epoch=_NUM_IMAGES['train'] // global_batch_size,
        callbacks=[lr_schedule],
        validation_steps=_NUM_IMAGES['validation'] // global_batch_size,
    )

    print(results)
    stop_orca_context()
Example #20
def main():
    anchors = yolo_anchors
    anchor_masks = yolo_anchor_masks

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        dest="data_dir",
                        help="Required. The path where data locates.")
    parser.add_argument(
        "--output_data",
        dest="output_data",
        default=tempfile.mkdtemp(),
        help="Required. The path where voc parquet data locates.")
    parser.add_argument("--data_year",
                        dest="data_year",
                        default="2009",
                        help="Required. The voc data date.")
    parser.add_argument("--split_name_train",
                        dest="split_name_train",
                        default="train",
                        help="Required. Split name.")
    parser.add_argument("--split_name_test",
                        dest="split_name_test",
                        default="val",
                        help="Required. Split name.")
    parser.add_argument("--names",
                        dest="names",
                        help="Required. The path where class names locates.")
    parser.add_argument("--weights",
                        dest="weights",
                        default="./checkpoints/yolov3.weights",
                        help="Required. The path where weights locates.")
    parser.add_argument("--checkpoint",
                        dest="checkpoint",
                        default="./checkpoints/yolov3.tf",
                        help="Required. The path where checkpoint locates.")
    parser.add_argument(
        "--checkpoint_folder",
        dest="checkpoint_folder",
        default="./checkpoints",
        help="Required. The path where saved checkpoint locates.")
    parser.add_argument("--epochs",
                        dest="epochs",
                        type=int,
                        default=2,
                        help="Required. epochs.")
    parser.add_argument("--batch_size",
                        dest="batch_size",
                        type=int,
                        default=16,
                        help="Required. epochs.")
    parser.add_argument("--cluster_mode",
                        dest="cluster_mode",
                        default="local",
                        help="Required. Run on local/yarn/k8s mode.")
    parser.add_argument("--class_num",
                        dest="class_num",
                        type=int,
                        default=20,
                        help="Required. class num.")
    parser.add_argument(
        "--worker_num",
        type=int,
        default=1,
        help="The number of slave nodes to be used in the cluster."
        "You can change it depending on your own cluster setting.")
    parser.add_argument(
        "--cores",
        type=int,
        default=4,
        help="The number of cpu cores you want to use on each node. "
        "You can change it depending on your own cluster setting.")
    parser.add_argument(
        "--memory",
        type=str,
        default="20g",
        help="The memory you want to use on each node. "
        "You can change it depending on your own cluster setting.")
    parser.add_argument(
        "--object_store_memory",
        type=str,
        default="10g",
        help="The memory you want to use on each node. "
        "You can change it depending on your own cluster setting.")
    parser.add_argument('--k8s_master',
                        type=str,
                        default="",
                        help="The k8s master. "
                        "It should be k8s://https://<k8s-apiserver-host>: "
                        "<k8s-apiserver-port>.")
    parser.add_argument("--container_image",
                        type=str,
                        default="",
                        help="The runtime k8s image. ")
    parser.add_argument('--k8s_driver_host',
                        type=str,
                        default="",
                        help="The k8s driver localhost.")
    parser.add_argument('--k8s_driver_port',
                        type=str,
                        default="",
                        help="The k8s driver port.")

    options = parser.parse_args()

    # convert yolov3 weights
    yolo = YoloV3(classes=80)
    load_darknet_weights(yolo, options.weights)
    yolo.save_weights(options.checkpoint)

    def model_creator(config):
        model = YoloV3(DEFAULT_IMAGE_SIZE,
                       training=True,
                       classes=options.class_num)
        anchors = yolo_anchors
        anchor_masks = yolo_anchor_masks

        model_pretrained = YoloV3(DEFAULT_IMAGE_SIZE,
                                  training=True,
                                  classes=80)
        model_pretrained.load_weights(options.checkpoint)

        model.get_layer('yolo_darknet').set_weights(
            model_pretrained.get_layer('yolo_darknet').get_weights())
        freeze_all(model.get_layer('yolo_darknet'))

        optimizer = tf.keras.optimizers.Adam(lr=1e-3)
        loss = [
            YoloLoss(anchors[mask], classes=options.class_num)
            for mask in anchor_masks
        ]
        model.compile(optimizer=optimizer, loss=loss, run_eagerly=False)
        return model

    # prepare data
    class_map = {
        name: idx
        for idx, name in enumerate(open(options.names).read().splitlines())
    }
    dataset_path = os.path.join(options.data_dir, "VOCdevkit")
    voc_train_path = os.path.join(options.output_data, "train_dataset")
    voc_val_path = os.path.join(options.output_data, "val_dataset")

    write_parquet(format="voc",
                  voc_root_path=dataset_path,
                  output_path="file://" + voc_train_path,
                  splits_names=[(options.data_year, options.split_name_train)],
                  classes=class_map)
    write_parquet(format="voc",
                  voc_root_path=dataset_path,
                  output_path="file://" + voc_val_path,
                  splits_names=[(options.data_year, options.split_name_test)],
                  classes=class_map)

    output_types = {
        "image": tf.string,
        "label": tf.float32,
        "image_id": tf.string
    }
    output_shapes = {"image": (), "label": (None, 5), "image_id": ()}

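    # Each record holds an encoded image string, a variable-length (N, 5) box label and an image id.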
    def train_data_creator(config, batch_size):
        train_dataset = read_parquet(format="tf_dataset",
                                     path=voc_train_path,
                                     output_types=output_types,
                                     output_shapes=output_shapes)
        train_dataset = train_dataset.map(
            lambda data_dict: (data_dict["image"], data_dict["label"]))
        train_dataset = train_dataset.map(parse_data_train)
        train_dataset = train_dataset.shuffle(buffer_size=512)
        train_dataset = train_dataset.batch(batch_size)
        train_dataset = train_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        train_dataset = train_dataset.prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)
        return train_dataset

    def val_data_creator(config, batch_size):
        val_dataset = read_parquet(format="tf_dataset",
                                   path=voc_val_path,
                                   output_types=output_types,
                                   output_shapes=output_shapes)
        val_dataset = val_dataset.map(lambda data_dict:
                                      (data_dict["image"], data_dict["label"]))
        val_dataset = val_dataset.map(parse_data_train)
        val_dataset = val_dataset.batch(batch_size)
        val_dataset = val_dataset.map(lambda x, y: (
            transform_images(x, DEFAULT_IMAGE_SIZE),
            transform_targets(y, anchors, anchor_masks, DEFAULT_IMAGE_SIZE)))
        return val_dataset

    callbacks = [
        ReduceLROnPlateau(verbose=1),
        EarlyStopping(patience=3, verbose=1),
        ModelCheckpoint(options.checkpoint_folder + '/yolov3_train_{epoch}.tf',
                        verbose=1,
                        save_weights_only=True),
        TensorBoard(log_dir='logs')
    ]

    if options.cluster_mode == "local":
        init_orca_context(cluster_mode="local",
                          cores=options.cores,
                          num_nodes=options.worker_num,
                          memory=options.memory,
                          init_ray_on_spark=True,
                          enable_numa_binding=False,
                          object_store_memory=options.object_store_memory)
    elif options.cluster_mode == "k8s":
        init_orca_context(cluster_mode="k8s",
                          master=options.k8s_master,
                          container_image=options.container_image,
                          init_ray_on_spark=True,
                          enable_numa_binding=False,
                          num_nodes=options.worker_num,
                          cores=options.cores,
                          memory=options.memory,
                          object_store_memory=options.object_store_memory,
                          conf={
                              "spark.driver.host": options.driver_host,
                              "spark.driver.port": options.driver_port
                          })
    elif options.cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client",
                          cores=options.cores,
                          num_nodes=options.worker_num,
                          memory=options.memory,
                          init_ray_on_spark=True,
                          enable_numa_binding=False,
                          object_store_memory=options.object_store_memory)

    trainer = Estimator.from_keras(model_creator=model_creator)

    trainer.fit(train_data_creator,
                epochs=options.epochs,
                batch_size=options.batch_size,
                steps_per_epoch=3473 // options.batch_size,
                callbacks=callbacks,
                validation_data=val_data_creator,
                validation_steps=3581 // options.batch_size)
    stop_orca_context()
Example #21
    callbacks = get_lr_schedule_callbacks(initial_lr)

    config = {
        "wd": 0.00005,
        "momentum": 0.9,
        "batch_size": global_batch_size,
        "warmup_epoch": 5,
        "num_worker": args.worker_num,
        "data_dir": args.data_dir,
        "bf16": args.use_bf16,
        "lr": initial_lr,
    }

    trainer = Estimator(model_creator=model_creator,
                        compile_args_creator=compile_args_creator,
                        verbose=True,
                        config=config,
                        backend="horovod")

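    # Benchmark mode: a short run of 3 epochs x 20 steps instead of the full training loop.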
    if args.benchmark:
        trainer.fit(
            data_creator=train_data_creator
            if not args.use_dummy_data else dummy_data_creator,
            epochs=3,
            steps_per_epoch=20,
            callbacks=callbacks,
        )
    else:
        epoch = 0
        for i in range(5):
            dummy = args.use_dummy_data
    initial_lr = 0.1 * lr_multiplier
    callbacks = get_lr_schedule_callbacks(initial_lr)

    config = {
        "wd": 0.00005,
        "momentum": 0.9,
        "warmup_epoch": 5,
        "num_worker": args.worker_num,
        "data_dir": args.data_dir,
        "bf16": args.use_bf16,
        "lr": initial_lr,
    }

    trainer = Estimator.from_keras(model_creator=model_creator,
                                   compile_args_creator=compile_args_creator,
                                   verbose=True,
                                   config=config,
                                   backend="horovod")

    if args.benchmark:
        trainer.fit(
            data=train_data_creator
            if not args.use_dummy_data else dummy_data_creator,
            epochs=3,
            batch_size=global_batch_size,
            steps_per_epoch=20,
            callbacks=callbacks,
        )
    else:
        epoch = 0
        for i in range(5):