import time
from typing import Tuple


def train_tf_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # This function is kicked off by the main() function and runs a full
    # training run using Ray AIR.
    from ray.train.tensorflow import TensorflowTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
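# A minimal invocation sketch. The config keys and the `train_func` referenced
# above are assumed to come from the surrounding benchmark script; the values
# here are illustrative, not part of the original snippet.
time_taken, final_loss = train_tf_ray_air(
    config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    num_workers=4,
    cpus_per_worker=8,
    use_gpu=False,
)
print(f"Took {time_taken:.2f}s with final loss {final_loss}")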
def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    return results
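# Usage sketch; assumes Ray is already initialized (e.g. via ray.init())
# by the caller, which this snippet does not show.
result = train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4)
print(result.metrics)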
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = TensorflowTrainer(
        train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": epochs},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(f"Results: {results.metrics}")
def train_tensorflow_linear(num_workers: int = 2, use_gpu: bool = False) -> Result:
    dataset_pipeline = get_dataset()
    config = {"lr": 1e-3, "batch_size": 32, "epochs": 4}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={"train": dataset_pipeline},
    )
    results = trainer.fit()
    print(results.metrics)
    return results
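# Example invocation; assumes a local Ray instance or cluster is already
# running:
result = train_tensorflow_linear(num_workers=2, use_gpu=False)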
def test_keras_callback():
    epochs = 3
    scaling_config = ScalingConfig(num_workers=2)
    config = {
        "epochs": epochs,
    }
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={TRAIN_DATASET_KEY: get_dataset()},
    )
    checkpoint = trainer.fit().checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        assert os.path.exists(os.path.join(ckpt_dir, "saved_model.pb"))
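# The train_func under test is assumed to fit a Keras model with a Ray report
# callback that writes a SavedModel checkpoint each epoch, roughly as below.
# The callback name is an assumption and is not shown in this snippet:
#
#     from ray.air.integrations.keras import ReportCheckpointCallback
#     model.fit(tf_dataset, epochs=config["epochs"],
#               callbacks=[ReportCheckpointCallback()])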
def train_tensorflow_linear(num_workers=2, use_gpu=False):
    datasets, dataset_configs = get_datasets_and_configs()
    trainer = TensorflowTrainer(
        train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 32, "epochs": 4},
        datasets=datasets,
        dataset_config=dataset_configs,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(f"Results: {results.metrics}")
    return results
def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    train_dataset = get_dataset(split_type="train")
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        datasets={"train": train_dataset},
        scaling_config=scaling_config,
    )
    results = trainer.fit()
    print(results.metrics)
    return results
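# Inside train_func, each worker would typically consume its shard of the
# "train" dataset. A sketch, assuming the ray.air session API used by the
# other snippets here and hypothetical "x"/"y" column names:
#
#     from ray.air import session
#     dataset_shard = session.get_dataset_shard("train")
#     tf_dataset = dataset_shard.to_tf(
#         feature_columns="x", label_columns="y", batch_size=batch_size
#     )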
def test_keras_callback_e2e():
    epochs = 3
    config = {
        "epochs": epochs,
    }
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=2),
        datasets={TRAIN_DATASET_KEY: get_dataset()},
    )
    checkpoint = trainer.fit().checkpoint
    checkpoint_dict = checkpoint.to_dict()
    assert MODEL_KEY in checkpoint_dict

    predictor = TensorflowPredictor.from_checkpoint(
        checkpoint, model_definition=build_model
    )
    items = np.random.uniform(0, 1, size=(10, 1))
    predictor.predict(data=items)
def test_tensorflow_linear(ray_start_4_cpus, num_workers):
    def train_func(config):
        result = tensorflow_linear_train_func(config)
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

    epochs = 3
    scaling_config = ScalingConfig(num_workers=num_workers)
    config = {
        "lr": 1e-3,
        "batch_size": 32,
        "epochs": epochs,
    }
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
        datasets={TRAIN_DATASET_KEY: get_dataset()},
    )
    trainer.fit()
def test_report_and_load_using_ml_session(ray_start_4_cpus):
    def train_func():
        if session.get_checkpoint():
            with session.get_checkpoint().as_directory() as checkpoint_dir:
                import tensorflow as tf

                model = tf.keras.models.load_model(checkpoint_dir)
        else:
            model = build_model()

        model.save("my_model", overwrite=True)
        session.report(
            metrics={"iter": 1},
            checkpoint=Checkpoint.from_directory("my_model"),
        )

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    trainer2 = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scaling_config,
        resume_from_checkpoint=result.checkpoint,
    )
    result = trainer2.fit()
    checkpoint = result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        assert os.path.exists(os.path.join(ckpt_dir, "saved_model.pb"))
    assert result.metrics["iter"] == 1
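# To consume the final checkpoint outside of training (a sketch, assuming
# the TensorFlow SavedModel layout asserted above):
#
#     import tensorflow as tf
#     with result.checkpoint.as_directory() as ckpt_dir:
#         restored_model = tf.keras.models.load_model(ckpt_dir)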
def test_tensorflow_e2e(ray_start_4_cpus):
    def train_func():
        model = build_model().get_weights()
        session.report({}, checkpoint=Checkpoint.from_dict({MODEL_KEY: model}))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    class TensorflowScorer:
        def __init__(self):
            self.pred = TensorflowPredictor.from_checkpoint(
                result.checkpoint, build_model
            )

        def __call__(self, x):
            return self.pred.predict(x, dtype=np.float32)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(
        TensorflowScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
def tune_tensorflow_mnist(num_workers, num_samples):
    trainer = TensorflowTrainer(
        train_func, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            },
        },
        tune_config=TuneConfig(num_samples=num_samples),
    )
    analysis = tuner.fit()
    best_loss = analysis.get_best_result(metric="loss", mode="min")
    best_accuracy = analysis.get_best_result(metric="accuracy", mode="max")
    print(f"Best loss result: {best_loss}")
    print(f"Best accuracy result: {best_accuracy}")
    return analysis
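# Hypothetical entry point; the worker and sample counts below are
# illustrative values, not from the original script:
analysis = tune_tensorflow_mnist(num_workers=2, num_samples=4)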
def tune_tensorflow_mnist(
    num_workers: int = 2, num_samples: int = 2, use_gpu: bool = False
):
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        tune_config=TuneConfig(
            num_samples=num_samples, metric="accuracy", mode="max"
        ),
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 3,
            }
        },
    )
    best_accuracy = tuner.fit().get_best_result().metrics["accuracy"]
    print(f"Best accuracy: {best_accuracy}")
                ),
                batch_size=batch_size,
            )
        )
        history = multi_worker_model.fit(tf_dataset, callbacks=[Callback()])
        results.append(history.history)
    return results


num_workers = 2
use_gpu = False

config = {"lr": 1e-3, "batch_size": 32, "epochs": 4}

trainer = TensorflowTrainer(
    train_loop_per_worker=train_func,
    train_loop_config=config,
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    datasets={"train": dataset},
)
result = trainer.fit()
print(result.metrics)
# __air_tf_train_end__

# __air_tf_batchpred_start__
import numpy as np

from ray.train.batch_predictor import BatchPredictor
from ray.train.tensorflow import TensorflowPredictor

batch_predictor = BatchPredictor.from_checkpoint(
    result.checkpoint, TensorflowPredictor, model_definition=build_model
)
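# Sketch of running batch inference with the predictor built above; the input
# shape (10 rows of one float feature) is an assumption chosen to match the
# toy model in these examples:
import ray

predict_dataset = ray.data.from_numpy(np.random.uniform(0, 1, size=(10, 1)))
predictions = batch_predictor.predict(predict_dataset)
predictions.show()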
    multi_worker_dataset = mnist_dataset(global_batch_size)

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_and_compile_cnn_model()

    multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)
# __tf_distributed_end__


if __name__ == "__main__":
    # __tf_single_run_begin__

    train_func()

    # __tf_single_run_end__

    # __tf_trainer_begin__
    from ray.train.tensorflow import TensorflowTrainer
    from ray.air.config import ScalingConfig

    # For GPU Training, set `use_gpu` to True.
    use_gpu = False

    trainer = TensorflowTrainer(
        train_func_distributed,
        scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu),
    )
    trainer.fit()
    # __tf_trainer_end__
        callbacks=[KerasCallback()],
        verbose=0,
    )
    return results


num_features = len(train_dataset.schema().names) - 1

trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={
        "batch_size": 128,
        "num_epochs": 50,
        "num_features": num_features,
        "lr": 0.0001,
    },
    scaling_config=ScalingConfig(
        num_workers=2,  # Number of data-parallel training workers.
        use_gpu=False,
        trainer_resources={"CPU": 0},  # So that the example works on Colab.
    ),
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# Last result: {'loss': 8.997025489807129, ...}
# __air_tf_train_end__

# __air_tf_tuner_start__
from ray import tune