def train_linear(num_workers=2, use_gpu=False, epochs=3):
    """Run ``train_func`` on a torch-backend ``Trainer`` and return its results.

    Results reported during the run are passed to a ``JsonLoggerCallback``
    and a ``TBXLoggerCallback``.

    Args:
        num_workers: Forwarded to ``Trainer`` as ``num_workers``.
        use_gpu: Forwarded to ``Trainer`` as ``use_gpu``.
        epochs: Stored under the ``"epochs"`` key of the config that is
            handed to ``train_func``.

    Returns:
        The value returned by ``trainer.run``; it is also printed.
    """
    trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}

    trainer.start()
    try:
        results = trainer.run(
            train_func,
            config,
            callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
        )
    finally:
        # Always shut the trainer down, even when the run raises, so a
        # failed run does not skip the cleanup.
        trainer.shutdown()

    print(results)
    return results
def train_linear(num_workers=2):
    """Run ``train_func`` on a ``Trainer`` configured with the gloo backend.

    The trainer is built from ``TorchConfig(backend="gloo")``. Results
    reported during the run are passed to a ``JsonLoggerCallback`` and a
    ``TBXLoggerCallback``.

    Args:
        num_workers: Forwarded to ``Trainer`` as ``num_workers``.

    Returns:
        The value returned by ``trainer.run``; it is also printed.
    """
    trainer = Trainer(TorchConfig(backend="gloo"), num_workers=num_workers)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer.start()
    try:
        results = trainer.run(
            train_func,
            config,
            callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
        )
    finally:
        # Always shut the trainer down, even when the run raises, so a
        # failed run does not skip the cleanup.
        trainer.shutdown()

    print(results)
    return results
def train_linear(num_workers=2, use_gpu=False):
    """Run ``train_func`` on a torch ``Trainer`` with datasets attached.

    The value returned by ``get_datasets()`` is passed to ``trainer.run``
    as ``dataset``. Results reported during the run are passed to a
    ``JsonLoggerCallback`` and a ``TBXLoggerCallback``.

    Args:
        num_workers: Forwarded to ``Trainer`` as ``num_workers``.
        use_gpu: Forwarded to ``Trainer`` as ``use_gpu``.

    Returns:
        The value returned by ``trainer.run``; it is also printed.
    """
    datasets = get_datasets()

    trainer = Trainer("torch", num_workers=num_workers, use_gpu=use_gpu)
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer.start()
    try:
        results = trainer.run(
            train_func,
            config,
            dataset=datasets,
            callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
        )
    finally:
        # Always shut the trainer down, even when the run raises, so a
        # failed run does not skip the cleanup.
        trainer.shutdown()

    print(results)
    return results
def test_TBX(ray_start_4_cpus, tmp_path):
    """Run a small reporting job and validate the ``TBXLoggerCallback`` output.

    A four-worker ``Trainer`` runs a function that makes three
    ``train.report`` calls; the callback is constructed with ``tmp_path``
    and that same directory is then checked by ``_validate_tbx_result``.
    """
    backend_config = TestConfig()
    log_dir = tmp_path

    def train_func():
        # Three reports; the last one also carries a list and a nested dict
        # next to the scalar metric.
        reports = (
            {"episode_reward_mean": 4},
            {"episode_reward_mean": 5},
            {"episode_reward_mean": 6, "score": [1, 2, 3], "hello": {"world": 1}},
        )
        for metrics in reports:
            train.report(**metrics)
        return 1

    tbx_callback = TBXLoggerCallback(log_dir)

    trainer = Trainer(backend_config, num_workers=4)
    trainer.start()
    trainer.run(train_func, callbacks=[tbx_callback])

    _validate_tbx_result(log_dir)
from ray import train
from ray.train import Trainer
from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback


def train_func():
    """Report the epoch index through ``train.report`` for three epochs."""
    for i in range(3):
        train.report(epoch=i)


trainer = Trainer(backend="torch", num_workers=2)
trainer.start()

try:
    # Run the training function, logging all the intermediate results
    # to MLflow and Tensorboard.
    result = trainer.run(
        train_func,
        callbacks=[
            MLflowLoggerCallback(experiment_name="train_experiment"),
            TBXLoggerCallback(),
        ],
    )

    # Print the latest run directory and keep note of it.
    # For example: /home/ray_results/train_2021-09-01_12-00-00/run_001
    print("Run directory:", trainer.latest_run_dir)
finally:
    # Shut down even if the run fails, so the cleanup step is never skipped.
    trainer.shutdown()
"num_hidden": NUM_HIDDEN, "num_layers": NUM_LAYERS, "dropout_every": DROPOUT_EVERY, "dropout_prob": DROPOUT_PROB, "num_features": num_features, } # Create 2 callbacks: one for Tensorboard Logging and one for MLflow # logging. Pass these into Trainer, and all results that are # reported by ``train.report()`` will be logged to these 2 places. # TODO: TBXLoggerCallback should create nonexistent logdir # and should also create 1 directory per file. tbx_runs_dir = os.path.join(dir_path, "runs") os.makedirs(tbx_runs_dir, exist_ok=True) callbacks = [ TBXLoggerCallback(logdir=tbx_runs_dir), MLflowLoggerCallback(experiment_name="cuj-big-data-training", save_artifact=True), ] # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None trainer = Trainer( backend="torch", num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker, ) trainer.start() results = trainer.run(train_func=train_func,
"num_hidden": NUM_HIDDEN, "num_layers": NUM_LAYERS, "dropout_every": DROPOUT_EVERY, "dropout_prob": DROPOUT_PROB, "num_features": num_features, } # Create 2 callbacks: one for Tensorboard Logging and one for MLflow # logging. Pass these into Trainer, and all results that are # reported by ``train.report()`` will be logged to these 2 places. # TODO: TBXLoggerCallback should create nonexistent logdir # and should also create 1 directory per file. tbx_logdir = "./runs" os.makedirs(tbx_logdir, exist_ok=True) callbacks = [ TBXLoggerCallback(logdir=tbx_logdir), MLflowLoggerCallback(experiment_name="cuj-big-data-training", save_artifact=True), ] # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None trainer = Trainer( backend="torch", num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker, ) trainer.start() results = trainer.run(train_func=train_func,
"batch_size": BATCH_SIZE, "num_hidden": NUM_HIDDEN, "num_layers": NUM_LAYERS, "dropout_every": DROPOUT_EVERY, "dropout_prob": DROPOUT_PROB, "num_features": num_features } # Create 2 callbacks: one for Tensorboard Logging and one for MLflow # logging. Pass these into Trainer, and all results that are # reported by ``train.report()`` will be logged to these 2 places. # TODO: TBXLoggerCallback should create nonexistent logdir # and should also create 1 directory per file. tbx_logdir = "./runs" os.makedirs(tbx_logdir, exist_ok=True) callbacks = [TBXLoggerCallback(logdir=tbx_logdir), MLflowCallback(config)] # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker) trainer.start() results = trainer.run(train_func=train_func, config=config, callbacks=callbacks, dataset=datasets) model = results[0] trainer.shutdown()
"use_gpu": use_gpu, "num_epochs": NUM_EPOCHS, "batch_size": BATCH_SIZE, "num_hidden": NUM_HIDDEN, "num_layers": NUM_LAYERS, "dropout_every": DROPOUT_EVERY, "dropout_prob": DROPOUT_PROB, "num_features": num_features } # Create 2 callbacks: one for Tensorboard Logging and one for MLflow # logging. Pass these into Trainer, and all results that are # reported by ``train.report()`` will be logged to these 2 places. # TODO: TBXLoggerCallback should create nonexistent logdir # and should also create 1 directory per file. callbacks = [TBXLoggerCallback(logdir="/tmp"), MLflowCallback(config)] # Remove CPU resource so Datasets can be scheduled. resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu, resources_per_worker=resources_per_worker) trainer.start() results = trainer.run(train_func=train_func, config=config, callbacks=callbacks, dataset=datasets) model = results[0] trainer.shutdown()
if __name__ == "__main__":
    # Command-line options: Ray address, number of workers, GPU toggle.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--address",
        required=False,
        type=str,
        help="the address to use for Ray",
    )
    parser.add_argument(
        "--num-workers",
        "-n",
        type=int,
        default=2,
        help="Sets number of workers for training.",
    )
    parser.add_argument(
        "--use-gpu",
        action="store_true",
        default=False,
        help="Enables GPU training",
    )

    args = parser.parse_args()

    # ``args.address`` is None when --address is omitted; it is passed
    # through to ``ray.init`` unchanged.
    ray.init(address=args.address)

    callbacks = [TorchTensorboardProfilerCallback(), TBXLoggerCallback()]
    trainer = Trainer(
        backend="torch",
        num_workers=args.num_workers,
        use_gpu=args.use_gpu,
    )
    trainer.start()
    try:
        trainer.run(train_func, callbacks=callbacks)
    finally:
        # Shut down even if training raises, so the cleanup step is
        # never skipped.
        trainer.shutdown()