Example #1
def test_torch_fashion_mnist_gpu(ray_start_4_cpus_2_gpus):
    num_workers = 2
    epochs = 3

    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    results = trainer.fit()

    result = results.metrics

    assert result[TRAINING_ITERATION] == epochs
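
Note: fashion_mnist_train_func is defined elsewhere in the test suite and isn't shown here. A minimal sketch of the shape such a function could take, assuming the dict-based ray.train.report API and the ray.train.torch helpers (the data path and model are illustrative, not the suite's actual definition):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

from ray import train
from ray.train.torch import prepare_data_loader, prepare_model


def fashion_mnist_train_func(config):
    data = datasets.FashionMNIST(
        root="~/data", train=True, download=True, transform=ToTensor()
    )
    loader = prepare_data_loader(DataLoader(data, batch_size=config["batch_size"]))

    model = prepare_model(
        nn.Sequential(
            nn.Flatten(), nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 10)
        )
    )
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])

    for _ in range(config["epochs"]):
        for X, y in loader:
            loss = loss_fn(model(X), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # One report per epoch: TRAINING_ITERATION counts reports, so it
        # ends up equal to config["epochs"], which is what the test asserts.
        train.report({"loss": loss.item()})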
Example #2
def test_torch_linear(ray_start_4_cpus, num_workers):
    epochs = 3

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        linear_train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers),
    )
    results = trainer.fit()

    result = results.metrics
    assert result[TRAINING_ITERATION] == epochs

    loss = list(results.metrics_dataframe["loss"])
    assert len(loss) == epochs
    assert loss[-1] < loss[0]
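
Note: linear_train_func is likewise not shown. A sketch under the same assumptions (synthetic data, dict-based train.report), structured so each epoch appends one "loss" row to metrics_dataframe, which is what the length and monotonicity asserts above rely on:

import torch
import torch.nn as nn

from ray import train
from ray.train.torch import prepare_data_loader, prepare_model


def linear_train_func(config):
    # Synthetic y = 2x + 5 regression data (illustrative values).
    x = torch.arange(0, 10, 0.1, dtype=torch.float32).reshape(-1, 1)
    y = 2 * x + 5
    loader = prepare_data_loader(
        torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(x, y), batch_size=config["batch_size"]
        )
    )

    model = prepare_model(nn.Linear(1, config["hidden_size"]))
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])

    for _ in range(config["epochs"]):
        for xb, yb in loader:
            loss = loss_fn(model(xb), yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # One "loss" report per epoch: len(loss history) == epochs, and the
        # values should trend downward for this convex problem.
        train.report({"loss": loss.item()})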
Example #3
def torch_fashion_mnist(num_workers, use_gpu, num_samples):
    trainer = TorchTrainer(
        fashion_mnist_train_func,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([32, 64, 128]),
                "epochs": 2,
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis

    # Check that loss decreases in each trial.
    for path, df in analysis.trial_dataframes.items():
        assert df.loc[1, "loss"] < df.loc[0, "loss"]
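
Note: tuner.fit() returns a ResultGrid, so the private _experiment_analysis hop above can be avoided. An equivalent check through the public API (same per-epoch "loss" metric assumed):

    result_grid = tuner.fit()
    for result in result_grid:
        df = result.metrics_dataframe
        # Loss at the second report should be below the first.
        assert df.loc[1, "loss"] < df.loc[0, "loss"]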
Example #4
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report({"device_id": train.torch.get_device().index})

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
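
Note: ray.train.torch.get_device() returns the torch.device Ray assigned to the calling worker (CPU when no GPU is allocated), and its .index is what the test reports. A typical, purely illustrative use inside a train function:

import torch
from ray.train.torch import get_device


def train_func():
    # Move the model onto the device Ray assigned to this worker.
    device = get_device()
    model = torch.nn.Linear(1, 1).to(device)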
Example #5
def test_tf_non_distributed(ray_start_4_cpus):
    """Make sure Ray Train works without TF MultiWorkerMirroredStrategy."""

    trainer = TorchTrainer(
        tf_quick_start_train_func, scaling_config=ScalingConfig(num_workers=1)
    )
    trainer.fit()
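
Note: running the TF function through a single-worker TorchTrainer means no TF_CONFIG is set up, so no MultiWorkerMirroredStrategy is involved. The function itself isn't shown; a minimal sketch of a plain, non-distributed Keras loop it could be (names and values are illustrative):

import tensorflow as tf


def tf_quick_start_train_func():
    # Plain Keras fit: no TF_CONFIG, no MultiWorkerMirroredStrategy.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(optimizer="sgd", loss="mse")
    x = tf.random.normal((32, 1))
    model.fit(x, 2 * x, epochs=3, verbose=0)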
Example #6
def test_torch_non_distributed(ray_start_4_cpus):
    """Make sure Ray Train works without torch DDP."""

    trainer = TorchTrainer(
        torch_quick_start_train_func, scaling_config=ScalingConfig(num_workers=1)
    )
    trainer.fit()
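
Note: torch_quick_start_train_func is also defined elsewhere; a minimal sketch of the kind of plain, single-process loop it could be (no process group, no DistributedDataParallel, no prepare_model):

import torch


def torch_quick_start_train_func():
    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = torch.nn.MSELoss()
    for _ in range(3):
        x = torch.randn(32, 1)
        loss = loss_fn(model(x), 2 * x)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()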
Example #7
        MLflowLoggerCallback(
            experiment_name="cuj-big-data-training", save_artifact=True
        ),
    ]

    # Remove CPU resource so Datasets can be scheduled.
    resources_per_worker = {"CPU": 0, "GPU": 1} if use_gpu else None

    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        datasets=datasets,
        scaling_config=ScalingConfig(
            num_workers=num_workers,
            use_gpu=use_gpu,
            resources_per_worker=resources_per_worker,
        ),
        run_config=RunConfig(callbacks=callbacks),
        dataset_config={
            "train": DatasetConfig(
                use_stream_api=True,
                stream_window_size=-1,
                global_shuffle=True,
            )
        },
    )
    results = trainer.fit()
    state_dict = results.checkpoint.to_dict()["model"]

    def load_model_func():
        num_layers = config["num_layers"]
        num_hidden = config["num_hidden"]
        dropout_every = config["dropout_every"]