Example #1
def train_tf_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # This function is kicked off by the main() function and runs a full training
    # run using Ray AIR.
    from ray.train.tensorflow import TensorflowTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
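
Note: the `train_func` passed as `train_loop_per_worker` in these examples is defined elsewhere in each source file. Below is a minimal sketch of such a per-worker loop, assuming the Ray AIR `session` API; the model and the synthetic data are illustrative and not taken from the original examples.

import numpy as np
import tensorflow as tf

from ray.air import session


def train_func(config: dict):
    # Each worker runs this loop. TensorflowTrainer sets TF_CONFIG, so
    # MultiWorkerMirroredStrategy can discover the other workers.
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        # Model building/compiling must happen inside the strategy scope.
        model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
        model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=config.get("lr", 1e-3)),
            loss="mse",
        )

    # Synthetic data purely for illustration.
    x = np.random.uniform(0, 1, size=(256, 1)).astype("float32")
    y = 2 * x + 1

    for _ in range(config.get("epochs", 1)):
        history = model.fit(x, y, batch_size=config.get("batch_size", 32), verbose=0)
        # Report per-epoch metrics back to the trainer / Tune.
        session.report({"loss": history.history["loss"][-1]})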
Example #2
def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    return results
Example #3
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = TensorflowTrainer(
        train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": epochs
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(f"Results: {results.metrics}")
Example #4
def train_tensorflow_linear(num_workers: int = 2,
                            use_gpu: bool = False) -> Result:
    dataset_pipeline = get_dataset()
    config = {"lr": 1e-3, "batch_size": 32, "epochs": 4}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={"train": dataset_pipeline},
    )
    results = trainer.fit()
    print(results.metrics)
    return results
Example #5
def test_keras_callback():
    epochs = 3
    scaling_config = {"num_workers": 2}
    config = {
        "epochs": epochs,
    }
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={TRAIN_DATASET_KEY: get_dataset()},
    )
    checkpoint = trainer.fit().checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        assert os.path.exists(os.path.join(ckpt_dir, "saved_model.pb"))
Example #6
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    datasets, dataset_configs = get_datasets_and_configs()
    trainer = TensorflowTrainer(
        train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 32,
            "epochs": 4
        },
        datasets=datasets,
        dataset_config=dataset_configs,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(f"Results: {results.metrics}")
    return results
Example #7
def train_tensorflow_mnist(num_workers: int = 2,
                           use_gpu: bool = False,
                           epochs: int = 4) -> Result:
    train_dataset = get_dataset(split_type="train")
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    scaling_config = dict(num_workers=num_workers, use_gpu=use_gpu)
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        datasets={"train": train_dataset},
        scaling_config=scaling_config,
    )

    results = trainer.fit()
    print(results.metrics)
    return results
Example #8
def test_keras_callback_e2e():
    epochs = 3
    config = {
        "epochs": epochs,
    }
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=2),
        datasets={TRAIN_DATASET_KEY: get_dataset()},
    )
    checkpoint = trainer.fit().checkpoint
    checkpoint_dict = checkpoint.to_dict()
    assert MODEL_KEY in checkpoint_dict

    predictor = TensorflowPredictor.from_checkpoint(
        checkpoint, model_definition=build_model)

    items = np.random.uniform(0, 1, size=(10, 1))
    predictor.predict(data=items)
Example #9
def test_tensorflow_linear(ray_start_4_cpus, num_workers):
    def train_func(config):
        result = tensorflow_linear_train_func(config)
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

    epochs = 3
    scaling_config = {"num_workers": num_workers}
    config = {
        "lr": 1e-3,
        "batch_size": 32,
        "epochs": epochs,
    }
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={TRAIN_DATASET_KEY: get_dataset()},
    )
    trainer.fit()
Example #10
def test_report_and_load_using_ml_session(ray_start_4_cpus):
    def train_func():
        if session.get_checkpoint():
            with session.get_checkpoint().as_directory() as checkpoint_dir:
                import tensorflow as tf

                model = tf.keras.models.load_model(checkpoint_dir)
        else:
            model = build_model()

        model.save("my_model", overwrite=True)
        session.report(metrics={"iter": 1},
                       checkpoint=Checkpoint.from_directory("my_model"))

    scaling_config = {"num_workers": 2}
    trainer = TensorflowTrainer(train_loop_per_worker=train_func,
                                scaling_config=scaling_config)
    result = trainer.fit()

    trainer2 = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scaling_config,
        resume_from_checkpoint=result.checkpoint,
    )
    result = trainer2.fit()
    checkpoint = result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        assert os.path.exists(os.path.join(ckpt_dir, "saved_model.pb"))
    assert result.metrics["iter"] == 1
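
Because the workers in Example #10 save the model in TensorFlow SavedModel format, the resulting checkpoint can also be loaded back outside of training. A minimal sketch, assuming the `result` object produced by the run above:

import tensorflow as tf

# The checkpoint directory contains the SavedModel written by the workers.
with result.checkpoint.as_directory() as ckpt_dir:
    restored_model = tf.keras.models.load_model(ckpt_dir)

restored_model.summary()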
Example #11
def test_tensorflow_e2e(ray_start_4_cpus):
    def train_func():
        model = build_model().get_weights()
        train.save_checkpoint(**{MODEL_KEY: model})

    scaling_config = {"num_workers": 2}
    trainer = TensorflowTrainer(train_loop_per_worker=train_func,
                                scaling_config=scaling_config)
    result = trainer.fit()

    class TensorflowScorer:
        def __init__(self):
            self.pred = TensorflowPredictor.from_checkpoint(
                result.checkpoint, build_model)

        def __call__(self, x):
            return self.pred.predict(x, dtype=np.float64)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(TensorflowScorer,
                                              batch_format="pandas",
                                              compute="actors")
    assert predictions.count() == 3
Example #12
def tune_tensorflow_mnist(num_workers, num_samples):
    trainer = TensorflowTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=num_workers))
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            },
        },
        tune_config=TuneConfig(num_samples=num_samples),
    )
    analysis = tuner.fit()
    best_loss = analysis.get_best_result(metric="loss", mode="min")
    best_accuracy = analysis.get_best_result(metric="accuracy", mode="max")
    print(f"Best loss result: {best_loss}")
    print(f"Best accuracy result: {best_accuracy}")
    return analysis
Example #13
def tune_tensorflow_mnist(num_workers: int = 2,
                          num_samples: int = 2,
                          use_gpu: bool = False):
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="accuracy",
                               mode="max"),
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            }
        },
    )
    best_accuracy = tuner.fit().get_best_result().metrics["accuracy"]
    print(f"Best accuracy config: {best_accuracy}")
Example #14
                ),
                batch_size=batch_size,
            ))
        history = multi_worker_model.fit(tf_dataset, callbacks=[Callback()])
        results.append(history.history)
    return results


num_workers = 2
use_gpu = False

config = {"lr": 1e-3, "batch_size": 32, "epochs": 4}

trainer = TensorflowTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=config,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={"train": dataset},
)
result = trainer.fit()
print(result.metrics)
# __air_tf_train_end__

# __air_tf_batchpred_start__
import numpy as np

from ray.train.batch_predictor import BatchPredictor
from ray.train.tensorflow import TensorflowPredictor

batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint,
                                                 TensorflowPredictor,
                                                 model_definition=build_model)
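
The excerpt above ends right after the BatchPredictor is constructed. A typical next step, not part of the original snippet, is to run batch inference over a Ray Dataset; the input data below is illustrative only.

import ray

# Illustrative prediction data; the real example would reuse the same
# feature pipeline as the training dataset.
predict_ds = ray.data.from_items([{"x": i / 100} for i in range(100)])

predictions = batch_predictor.predict(predict_ds)
print(predictions.take(5))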
Example #15
    multi_worker_dataset = mnist_dataset(global_batch_size)

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_cnn_model()

    multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)

# __tf_distributed_end__

if __name__ == "__main__":
    # __tf_single_run_begin__

    train_func()

    # __tf_single_run_end__

    # __tf_trainer_begin__

    from ray.train.tensorflow import TensorflowTrainer
    from ray.air.config import ScalingConfig

    # For GPU Training, set `use_gpu` to True.
    use_gpu = False

    trainer = TensorflowTrainer(
        train_func_distributed,
        scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu),
    )

    trainer.fit()

    # __tf_trainer_end__
Example #16
            callbacks=[KerasCallback()],
            verbose=0,
        )
    return results


num_features = len(train_dataset.schema().names) - 1

trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={
        "batch_size": 128,
        "num_epochs": 50,
        "num_features": num_features,
        "lr": 0.0001,
    },
    scaling_config=ScalingConfig(
        num_workers=2,  # Number of data parallel training workers
        use_gpu=False,
        trainer_resources={"CPU": 0},  # so that the example works on Colab.
    ),
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)

result = trainer.fit()
print(f"Last result: {result.metrics}")
# Last result: {'loss': 8.997025489807129, ...}
# __air_tf_train_end__

# __air_tf_tuner_start__
from ray import tune