Example #1
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print("Running Pytorch image model training with "
          f"{data_size_gb}GB data from {data_url}")
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()
    # Enable cross host NCCL for larger scale tests
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)
    dataset = ray.data.read_datasource(ImageFolderDatasource(),
                                       paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "batch_size": 64,
            "num_epochs": num_epochs
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughput_MB_s",
            "perf_metric_value": round(
                num_epochs * data_size_gb * 1024 / total_time_s, 2
            ),
            "perf_metric_type": "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
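Example #1 references train_loop_per_worker and preprocess_image_with_label without defining them. A minimal sketch of what they might look like against the Ray AIR APIs used above; the bodies are illustrative assumptions, not the original code:

from ray.air import session


def preprocess_image_with_label(df):
    # Hypothetical BatchMapper fn: scale raw pixel values into [0, 1].
    df["image"] = df["image"].map(lambda pixels: pixels / 255.0)
    return df


def train_loop_per_worker(config):
    # Hypothetical loop: consume this worker's shard of the "train" dataset.
    shard = session.get_dataset_shard("train")
    for epoch in range(config["num_epochs"]):
        for batch in shard.iter_batches(batch_size=config["batch_size"]):
            pass  # forward/backward pass over `batch` would go here
        session.report({"epoch": epoch})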
Example #2
def test_checkpoint_freq(ray_start_4_cpus):
    # checkpoint_frequency is not supported by TorchTrainer, so fit() should raise.
    trainer = TorchTrainer(
        train_loop_per_worker=lambda config: None,
        scaling_config=ray.air.ScalingConfig(num_workers=1),
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_frequency=2,
            ),
        ),
    )
    with pytest.raises(TuneError):
        trainer.fit()
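Because checkpoint_frequency is rejected here, data-parallel trainers checkpoint from inside the training loop instead. A minimal sketch of that pattern, using the same session/Checkpoint API as Example #15 (the loop body is assumed, not part of this test):

from ray.air import session
from ray.air.checkpoint import Checkpoint


def train_loop_per_worker(config):
    for epoch in range(2):
        # Report a checkpoint explicitly rather than relying on
        # CheckpointConfig(checkpoint_frequency=...).
        session.report(
            {"epoch": epoch},
            checkpoint=Checkpoint.from_dict({"epoch": epoch}),
        )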
Example #3
def test_torch_e2e_state_dict(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1).state_dict()
        train.save_checkpoint(model=model)

    scaling_config = {"num_workers": 2}
    trainer = TorchTrainer(train_loop_per_worker=train_func,
                           scaling_config=scaling_config)
    result = trainer.fit()

    # If loading from a state dict, a model definition must be passed in.
    with pytest.raises(ValueError):
        TorchPredictor.from_checkpoint(result.checkpoint)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(result.checkpoint,
                                                       model=torch.nn.Linear(
                                                           1, 1))

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(TorchScorer,
                                              batch_format="pandas",
                                              compute="actors")
    assert predictions.count() == 3
Example #4
def test_torch_linear(ray_start_4_cpus, num_workers):
    def train_func(config):
        result = linear_train_func(config)
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

    epochs = 3
    scaling_config = {"num_workers": num_workers}
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    trainer.fit()
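linear_train_func comes from the surrounding test utilities; the assertions only require that it returns one result dict per epoch with a decreasing "loss". A hypothetical stand-in:

import torch
from ray import train


def linear_train_func(config):
    # Hypothetical: fit y = x with one linear layer and record the loss
    # once per epoch, which is all the test's assertions rely on.
    model = train.torch.prepare_model(torch.nn.Linear(1, 1))
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    loss_fn = torch.nn.MSELoss()
    X = torch.arange(8, dtype=torch.float32).reshape(-1, 1)
    results = []
    for _ in range(config["epochs"]):
        for start in range(0, len(X), config["batch_size"]):
            batch = X[start:start + config["batch_size"]]
            loss = loss_fn(model(batch), batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        results.append({"loss": loss.item()})
    return results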
Example #5
def train_torch_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # This function is kicked off by the main() function and runs a full training
    # run using Ray AIR.
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
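            # Reserve no CPU for the coordinating trainer actor so that
            # all requested CPUs go to the training workers.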
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
Example #6
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()

    print(results.metrics)
    return results
Example #7
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")
Example #8
def train_linear(num_workers=2, use_gpu=False):
    datasets, dataset_configs = get_datasets_and_configs()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        datasets=datasets,
        dataset_config=dataset_configs,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
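get_datasets_and_configs is defined elsewhere in the source file; a plausible sketch, with the DatasetConfig fields assumed rather than taken from the original:

import ray
from ray.air.config import DatasetConfig


def get_datasets_and_configs():
    # Hypothetical: one training dataset, split across the workers.
    datasets = {"train": ray.data.range_tensor(1000)}
    dataset_configs = {"train": DatasetConfig(split=True)}
    return datasets, dataset_configs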
Example #9
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config={
            "num_workers": num_workers,
            "use_gpu": use_gpu
        },
    )
    result = trainer.fit()

    print(result.metrics)
    return result.metrics
Example #10
def main(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        run_config=RunConfig(callbacks=[
            MLflowLoggerCallback(experiment_name="train_fashion_mnist")
        ]),
    )
    final_results = trainer.fit()

    print("Final metrics: ", final_results.metrics)
Example #11
def train_linear(num_workers=2, use_gpu=False):
    train_dataset, val_dataset = get_datasets()
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={
            "train": train_dataset,
            "validation": val_dataset
        },
    )

    result = trainer.fit()
    print(result.metrics)
    return result
Example #12
def train_gnn(
    num_workers=2, use_gpu=False, epochs=3, global_batch_size=32, dataset="reddit"
):

    per_worker_batch_size = global_batch_size // num_workers

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "num_epochs": epochs,
            "batch_size": per_worker_batch_size,
            "dataset_fn": gen_reddit_dataset
            if dataset == "reddit"
            else gen_fake_dataset(),
        },
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
    )
    result = trainer.fit()
    print(result.metrics)
Example #13
def train_linear(num_workers=1,
                 num_hidden_layers=1,
                 use_auto_transfer=True,
                 epochs=3):
    config = {
        "lr": 1e-2,
        "hidden_size": num_hidden_layers,
        "batch_size": 4096,
        "epochs": epochs,
        "use_auto_transfer": use_auto_transfer,
    }
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(use_gpu=True, num_workers=num_workers),
    )
    results = trainer.fit()

    print(results.metrics)
    return results
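The use_auto_transfer flag presumably flows into ray.train.torch.prepare_data_loader, whose auto_transfer option overlaps host-to-GPU copies with compute. A hedged sketch of how the (assumed) train_func could wire it through:

import torch
from ray import train


def train_func(config):
    # Hypothetical: build a DataLoader and let Ray move batches to the
    # device, asynchronously when auto_transfer is enabled.
    X = torch.randn(config["batch_size"] * 8, 1)
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(X, X),
        batch_size=config["batch_size"],
    )
    loader = train.torch.prepare_data_loader(
        loader, auto_transfer=config["use_auto_transfer"])
    ...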
Example #14
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers + 2)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = TorchTrainer(
            train_func,
            train_loop_config=config,
            scaling_config=ScalingConfig(num_workers=args.num_workers,
                                         use_gpu=args.use_gpu),
        )
        results = trainer.fit()
        print(results.metrics)
    else:
        # Run training locally.
        train_func(config)
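parse_args is not shown; judging from the attributes consumed in main(), a compatible sketch (hypothetical, not the original helper) would be:

import argparse


def parse_args():
    # Parser covering exactly the flags that main() reads.
    parser = argparse.ArgumentParser()
    parser.add_argument("--address", type=str, default=None,
                        help="Address of an existing Ray cluster to join.")
    parser.add_argument("--num-workers", type=int, default=1)
    parser.add_argument("--use-gpu", action="store_true")
    parser.add_argument("--start-local", action="store_true",
                        help="Start a fresh local Ray runtime.")
    return parser.parse_args()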
Example #15
def test_torch_e2e(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1)
        session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model)))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    predict_dataset = ray.data.range(3)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(result.checkpoint)

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predictions = predict_dataset.map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
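Note the contrast with Example #3: because this checkpoint stores the full torch.nn.Module rather than a bare state dict, TorchPredictor.from_checkpoint needs no model argument.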
Example #16
def train_loop_per_worker():
    # A plain Dataset object is returned by default (bulk loading).
    data_shard = train.get_dataset_shard("train")

    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())


my_trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config={"num_workers": 1},
    datasets={
        "train": ray.data.range_tensor(1000),
    },
)
my_trainer.fit()
# __config_4_end__

# __config_5__
import ray
from ray import train
from ray.data import DatasetPipeline
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig


def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = train.get_dataset_shard("train")

    # Use iter_epochs(10) to iterate over 10 epochs of data.
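    # The excerpt is cut off here; a plausible continuation, following the
    # comment above (assumed, not the original text):
    for epoch_pipe in data_shard.iter_epochs(10):
        for batch in epoch_pipe.iter_batches():
            print("Do some training on batch", batch)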
Example #17
    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        train.report(loss=loss)


num_workers = 2
use_gpu = False

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# __air_pytorch_train_end__


# # __air_pytorch_batchpred_start__
# import random
# from ray.air.batch_predictor import BatchPredictor
# from ray.air.predictors.integrations.torch import TorchPredictor

# batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor)

# items = [{"x": random.uniform(0, 1)} for _ in range(10)]
# prediction_dataset = ray.data.from_items(items)

# predictions = batch_predictor.predict(prediction_dataset, dtype=torch.float)
Example #18
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"epoch: {epoch}, loss: {loss.item()}")


# __torch_distributed_end__

if __name__ == "__main__":
    # __torch_single_run_begin__

    train_func()

    # __torch_single_run_end__

    # __torch_trainer_begin__

    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    # For GPU Training, set `use_gpu` to True.
    use_gpu = False

    trainer = TorchTrainer(train_func_distributed,
                           scaling_config=ScalingConfig(num_workers=4,
                                                        use_gpu=use_gpu))

    results = trainer.fit()

    # __torch_trainer_end__
Example #19
def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
trainer.fit()
# __config_scaling_1_end__

# __config_scaling_2__
from ray.air import session
from ray.train.torch import TorchTrainer
import ray.data
from ray.air.config import ScalingConfig
from ray import tune
from ray.tune.tuner import Tuner
from ray.tune.tune_config import TuneConfig


def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")