def test_torch_fashion_mnist_gpu(ray_start_4_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    results = trainer.fit()

    result = results.metrics
    assert result[TRAINING_ITERATION] == epochs
def test_torch_linear(ray_start_4_cpus, num_workers):
    epochs = 3

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        linear_train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers),
    )
    results = trainer.fit()

    result = results.metrics
    assert result[TRAINING_ITERATION] == epochs

    loss = list(results.metrics_dataframe["loss"])
    assert len(loss) == epochs
    assert loss[-1] < loss[0]
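# The test above assumes a `linear_train_func` defined elsewhere in this
# module. Below is a minimal sketch of what such a function could look like;
# the toy dataset and the exact model are assumptions, not the canonical
# implementation (and `batch_size` is ignored for brevity). It fits a linear
# target and reports "loss" once per epoch, which is what `test_torch_linear`
# asserts on.
def _linear_train_func_sketch(config):
    import torch
    import torch.nn as nn
    from ray import train

    model = nn.Linear(1, config["hidden_size"])
    model = train.torch.prepare_model(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    loss_fn = nn.MSELoss()

    # Hypothetical toy dataset: y = 2x.
    X = torch.arange(32, dtype=torch.float32).reshape(-1, 1)
    y = 2 * X

    for _ in range(config["epochs"]):
        loss = loss_fn(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # One report per epoch -> TRAINING_ITERATION == epochs.
        train.report(loss=loss.item())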
def torch_fashion_mnist(num_workers, use_gpu, num_samples):
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 2,
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis

    # Check that loss decreases in each trial.
    for path, df in analysis.trial_dataframes.items():
        assert df.loc[1, "loss"] < df.loc[0, "loss"]
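# Hedged usage sketch for the helper above: a test like the following could
# drive it. The fixture name and argument values are assumptions for
# illustration, not a test that exists in this file.
def _test_tune_torch_fashion_mnist_sketch(ray_start_4_cpus):
    torch_fashion_mnist(num_workers=2, use_gpu=False, num_samples=2)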
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    # Patching `torch.cuda.is_available` together with the gloo backend lets
    # this test exercise GPU device assignment on a cluster fixture without
    # real CUDA devices.
    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis

    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
def test_tf_non_distributed(ray_start_4_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""
    trainer = TensorflowTrainer(
        tf_quick_start_train_func, scaling_config=ScalingConfig(num_workers=1)
    )
    trainer.fit()
def test_torch_non_distributed(ray_start_4_cpus):
    """Make sure Ray Train works without torch DDP."""
    trainer = TorchTrainer(
        torch_quick_start_train_func, scaling_config=ScalingConfig(num_workers=1)
    )
    trainer.fit()
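# `tf_quick_start_train_func` and `torch_quick_start_train_func` are defined
# elsewhere; below is a minimal sketch of the torch variant the test above
# could be running. The model, data, and epoch count are illustrative
# assumptions, not the actual quick-start function.
def _torch_quick_start_sketch():
    import torch
    import torch.nn as nn

    # Plain torch code: no DDP wrapping and no `prepare_model` call. The
    # non-distributed test above checks that such a function still runs
    # under the trainer with a single worker.
    model = nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    X, y = torch.randn(16, 4), torch.randn(16, 1)
    for _ in range(3):
        loss = nn.functional.mse_loss(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()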
        MLflowLoggerCallback(
            experiment_name="cuj-big-data-training", save_artifact=True
        ),
    ]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        datasets=datasets,
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            use_gpu=use_gpu,
            resources_per_worker=resources_per_worker,
        ),
        run_config=RunConfig(callbacks=callbacks),
        dataset_config={
            "train": DatasetConfig(
                use_stream_api=True, stream_window_size=-1, global_shuffle=True
            )
        },
    )
    results = trainer.fit()
    state_dict = results.checkpoint.to_dict()["model"]

    def load_model_func():
        num_layers = config["num_layers"]
        num_hidden = config["num_hidden"]
        dropout_every = config["dropout_every"]
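# `load_model_func` above is cut off mid-definition and is left as-is. As a
# hedged sketch of the general pattern, the `state_dict` pulled from the
# checkpoint would typically be restored like this; the architecture below is
# a placeholder assumption, not the example's real network built from
# `num_layers`, `num_hidden`, and `dropout_every`.
def _restore_from_state_dict_sketch(state_dict):
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 1))
    model.load_state_dict(state_dict)  # must match the trained architecture
    model.eval()
    return model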