Example 1
 def test_tuner_with_torch_trainer(self):
     """Test a successful run using torch trainer."""
     shutil.rmtree(os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"),
                   ignore_errors=True)
     # batch_size and epochs here are placeholders; the param_space below
     # overrides them via grid search.
     config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
     scaling_config = ScalingConfig(num_workers=1, use_gpu=False)
     trainer = TorchTrainer(
         train_loop_per_worker=linear_train_func,
         train_loop_config=config,
         scaling_config=scaling_config,
     )
     param_space = {
         "scaling_config":
         ScalingConfig(num_workers=tune.grid_search([1, 2])),
         "train_loop_config": {
             "batch_size": tune.grid_search([4, 8]),
             "epochs": tune.grid_search([5, 10]),
         },
     }
     tuner = Tuner(
         trainable=trainer,
         run_config=RunConfig(name="test_tuner_torch"),
         param_space=param_space,
         tune_config=TuneConfig(mode="min", metric="loss"),
     )
     results = tuner.fit()
     assert len(results) == 8
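The eight results follow directly from the grid: 2 num_workers values × 2 batch_size values × 2 epochs values = 8 trials.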
Example 2
def test_arg_override(ray_start_4_cpus):
    def check_override(self):
        assert self.scaling_config.num_workers == 1
        # Should do deep update.
        assert not self.custom_arg["outer"]["inner"]
        assert self.custom_arg["outer"]["fixed"] == 1
        # Should merge with base config.
        assert self.preprocessor.original

        pg = get_current_placement_group()
        assert len(pg.bundle_specs) == 2  # 1 trainer, 1 worker

    preprocessor = DummyPreprocessor()
    preprocessor.original = True
    scale_config = ScalingConfig(num_workers=4)
    trainer = DummyTrainer(
        check_override,
        custom_arg={"outer": {
            "inner": True,
            "fixed": 1
        }},
        preprocessor=preprocessor,
        scaling_config=scale_config,
    )

    new_config = {
        "custom_arg": {
            "outer": {
                "inner": False
            }
        },
        "scaling_config": ScalingConfig(num_workers=1),
    }

    tune.run(trainer.as_trainable(), config=new_config)
Example 3
def test_scaling_config_pgf_equivalance(trainer_resources,
                                        resources_per_worker_and_use_gpu,
                                        num_workers, placement_strategy):
    resources_per_worker, use_gpu = resources_per_worker_and_use_gpu
    scaling_config = ScalingConfig(
        trainer_resources=trainer_resources,
        num_workers=num_workers,
        resources_per_worker=resources_per_worker,
        use_gpu=use_gpu,
        placement_strategy=placement_strategy,
    )
    pgf = scaling_config.as_placement_group_factory()
    scaling_config_from_pgf = ScalingConfig.from_placement_group_factory(pgf)
    assert scaling_config == scaling_config_from_pgf
    assert scaling_config_from_pgf.as_placement_group_factory() == pgf
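A minimal concrete instance of the round-trip above, assuming the same Ray AIR ScalingConfig API (as_placement_group_factory and from_placement_group_factory):

sc = ScalingConfig(
    trainer_resources={"CPU": 1},
    num_workers=2,
    resources_per_worker={"CPU": 1},
)
pgf = sc.as_placement_group_factory()
# Lossless round-trip, as the parametrized test above asserts in general.
assert ScalingConfig.from_placement_group_factory(pgf) == sc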
Example 4
def test_torch_e2e_state_dict(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1).state_dict()
        session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model)))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    # If loading from a state dict, a model definition must be passed in.
    with pytest.raises(ValueError):
        TorchPredictor.from_checkpoint(result.checkpoint)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(
                result.checkpoint, model=torch.nn.Linear(1, 1)
            )

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
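A checkpoint built from a bare state_dict holds only tensors, not the module definition, so TorchPredictor.from_checkpoint can only rebuild the network when a matching model instance is supplied, as TorchScorer does above.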
Example 5
def test_run(ray_start_4_cpus):
    """Tests that Train can be run without any specific backends."""
    num_workers = 2
    key = "value"
    value = 1
    config = TestConfig()

    def train_func():
        checkpoint = session.get_checkpoint()
        session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint)
        return checkpoint.to_dict()[key]

    checkpoint = Checkpoint.from_dict({
        # this would be set during checkpoint saving
        "_current_checkpoint_id": 1,
        key: value,
    })

    trainer = DataParallelTrainer(
        train_func,
        backend_config=config,
        resume_from_checkpoint=checkpoint,
        scaling_config=ScalingConfig(num_workers=num_workers),
    )
    results = trainer.fit()

    assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key]
Example 6
def test_reuse_checkpoint(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 10}},
    ).restore(trial.local_dir)
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
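The arithmetic behind the asserts: the first run reports iterations 0 through 4, so its final checkpoint holds iter == 4; the restored run resumes at iteration 5 and reports iterations 5 through 9, giving the five rows checked at the end.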
Example 7
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={
            "train": train_dataset,
            "validation": val_dataset
        },
    )

    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples,
                               metric="loss",
                               mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
Example 8
def test_report_and_load_using_ml_session(ray_start_4_cpus):
    def train_func():
        if session.get_checkpoint():
            with session.get_checkpoint().as_directory() as checkpoint_dir:
                import tensorflow as tf

                model = tf.keras.models.load_model(checkpoint_dir)
        else:
            model = build_model()

        model.save("my_model", overwrite=True)
        session.report(metrics={"iter": 1},
                       checkpoint=Checkpoint.from_directory("my_model"))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TensorflowTrainer(train_loop_per_worker=train_func,
                                scaling_config=scaling_config)
    result = trainer.fit()

    trainer2 = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scaling_config,
        resume_from_checkpoint=result.checkpoint,
    )
    result = trainer2.fit()
    checkpoint = result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        assert os.path.exists(os.path.join(ckpt_dir, "saved_model.pb"))
    assert result.metrics["iter"] == 1
Example 9
def train_torch_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # This function is kicked off by the main() function and runs a full training
    # run using Ray AIR.
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
Example 10
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
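Here the first attempt checkpoints iterations 0 and 1, then fails at i == 2; the retry restores iter == 1, resumes at iteration 2, and finishes at 3. The trial therefore accumulates four reports in total, and its last checkpoint holds iter == 3.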
Example 11
    def __init__(
        self, num_workers: int, expect_ds: bool, expect_sizes: Optional[dict], **kwargs
    ):
        def train_loop_per_worker():
            data_shard = session.get_dataset_shard("train")
            if expect_ds:
                assert isinstance(data_shard, Dataset), data_shard
            else:
                assert isinstance(data_shard, DatasetPipeline), data_shard
            for k, v in expect_sizes.items():
                shard = session.get_dataset_shard(k)
                # An expected size of -1 means no shard should exist for this key.
                if v == -1:
                    assert shard is None, shard
                else:
                    if isinstance(shard, DatasetPipeline):
                        assert next(shard.iter_epochs()).count() == v, shard
                    else:
                        assert shard.count() == v, shard

        kwargs.pop("scaling_config", None)
        super().__init__(
            train_loop_per_worker=train_loop_per_worker,
            scaling_config=ScalingConfig(num_workers=num_workers),
            **kwargs,
        )
Example 12
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # Base allocation is num_workers CPUs + 1 for the Trainable itself; the
    # scheduler should have grown at least one trial beyond that.
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
Example 13
def test_horovod_state_dict(ray_start_4_cpus):
    def train_func(config):
        result = hvd_train_func(config)
        assert len(result) == epochs
        assert result[-1] < result[0]

    num_workers = 2
    epochs = 10
    scaling_config = ScalingConfig(num_workers=num_workers)
    config = {"num_epochs": epochs, "save_model_as_dict": True}
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    result = trainer.fit()
    predictor = TorchPredictor.from_checkpoint(result.checkpoint, model=Net())

    # Find some test data to run on.
    test_set = datasets.MNIST(
        "./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    test_dataloader = DataLoader(test_set, batch_size=10)
    test_dataloader_iter = iter(test_dataloader)
    images, labels = next(
        test_dataloader_iter
    )  # only running a batch inference of 10 images
    predicted_labels = run_image_prediction(predictor.model, images)
    assert torch.equal(predicted_labels, labels)
Example 14
            def _reconcile_scaling_config_with_trial_resources(
                    self, scaling_config: ScalingConfig) -> ScalingConfig:
                """
                ResourceChangingScheduler workaround.

                Ensures that the scaling config matches trial resources.

                This should be replaced with RCS returning a ScalingConfig
                in the future.
                """

                trial_resources = self.trial_resources
                # This will be False if the trial resources are still the defaults.
                if not isinstance(trial_resources, PlacementGroupFactory):
                    return scaling_config

                if scaling_config:
                    scaling_config = trainer_cls._validate_scaling_config(
                        scaling_config)
                scaling_config_from_trial_resources = (
                    ScalingConfig.from_placement_group_factory(trial_resources)
                )

                # This check should always pass if ResourceChangingScheduler is not
                # used.
                if scaling_config_from_trial_resources != scaling_config:
                    scaling_config = trainer_cls._validate_scaling_config(
                        scaling_config_from_trial_resources)
                return scaling_config
Example 15
    def __init__(
        self,
        *,
        scaling_config: Optional[ScalingConfig] = None,
        run_config: Optional[RunConfig] = None,
        datasets: Optional[Dict[str, GenDataset]] = None,
        preprocessor: Optional["Preprocessor"] = None,
        resume_from_checkpoint: Optional[Checkpoint] = None,
    ):
        self.scaling_config = (scaling_config if scaling_config is not None
                               else ScalingConfig())
        self.run_config = run_config if run_config is not None else RunConfig()
        self.datasets = datasets if datasets is not None else {}
        self.preprocessor = preprocessor
        self.resume_from_checkpoint = resume_from_checkpoint

        self._validate_attributes()

        if datasets and not self.scaling_config._max_cpu_fraction_per_node:
            logger.warning(
                "When passing `datasets` to a Trainer, it is recommended to "
                "reserve at least 20% of node CPUs for Dataset execution by setting "
                "`_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. "
                "Not doing so can lead to resource contention or hangs. "
                "See https://docs.ray.io/en/master/data/key-concepts.html"
                "#example-datasets-in-tune for more info.")
Example 16
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_func,
        train_loop_config=kwargs,
        scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=num_workers),
    )
    results = trainer.fit()
    print(results.metrics)
Example 17
def test_resources(ray_start_4_cpus):
    def check_cpus(self):
        assert ray.available_resources()["CPU"] == 2

    assert ray.available_resources()["CPU"] == 4
    trainer = DummyTrainer(
        check_cpus, scaling_config=ScalingConfig(trainer_resources={"CPU": 2}))
    trainer.fit()
Example 18
def test_reserved_cpu_warnings(ray_start_4_cpus):
    def train_loop(self):
        pass

    class MockLogger:
        def __init__(self):
            self.warnings = []

        def warning(self, msg):
            self.warnings.append(msg)

        def info(self, msg):
            print(msg)

    old = base_trainer.logger
    try:
        base_trainer.logger = MockLogger()

        # Fraction correctly specified.
        DummyTrainer(
            train_loop,
            scaling_config=ScalingConfig(num_workers=1,
                                         _max_cpu_fraction_per_node=0.9),
            datasets={"train": ray.data.range(10)},
        )
        assert not base_trainer.logger.warnings

        # No datasets, no fraction.
        DummyTrainer(
            train_loop,
            scaling_config=ScalingConfig(num_workers=1),
        )
        assert not base_trainer.logger.warnings

        # Should warn.
        DummyTrainer(
            train_loop,
            scaling_config=ScalingConfig(num_workers=1),
            datasets={"train": ray.data.range(10)},
        )
        assert len(
            base_trainer.logger.warnings) == 1, base_trainer.logger.warnings
        assert "_max_cpu_fraction_per_node" in base_trainer.logger.warnings[0]
    finally:
        base_trainer.logger = old
Example 19
def test_trainable_name_is_overriden_gbdt_trainer(ray_start_4_cpus):
    trainer = DummyGBDTTrainer(
        params={},
        label_column="__values__",
        datasets={"train": ray.data.from_items([1, 2, 3])},
        scaling_config=ScalingConfig(num_workers=1),
    )

    _is_trainable_name_overriden(trainer)
Example 20
def test_scaling_config_validate_config_bad_allowed_keys():
    # Check for keys not present in dict
    scaling_config = {"num_workers": 2}
    with pytest.raises(ValueError) as exc_info:
        ensure_only_allowed_dataclass_keys_updated(
            ScalingConfig(**scaling_config),
            ["BAD_KEY"],
        )
    assert "BAD_KEY" in str(exc_info.value)
    assert "are not present in" in str(exc_info.value)
Example 21
def test_scaling_config(ray_start_4_cpus):
    def train_func():
        assert ray.available_resources()["CPU"] == 1
        session.report({"loss": 1})

    assert ray.available_resources()["CPU"] == 4
    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=ScalingConfig(num_workers=2)
    )
    trainer.fit()
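Of the four CPUs, the two workers take one each and the trainer actor reserves one more (assuming DataParallelTrainer's default trainer_resources of one CPU), leaving exactly the single CPU asserted inside train_func.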
Example 22
def test_scaling_config_validate_config_prohibited_class():
    # Check for prohibited keys
    scaling_config = {"num_workers": 2}
    with pytest.raises(ValueError) as exc_info:
        ensure_only_allowed_dataclass_keys_updated(
            ScalingConfig(**scaling_config),
            ["trainer_resources"],
        )
    assert "num_workers" in str(exc_info.value)
    assert "to be updated" in str(exc_info.value)
Example 23
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()

    print(results.metrics)
    return results
Example 24
def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    return results
Example 25
def test_reserved_cpus(ray_start_4_cpus):
    def train_loop(self):
        ray.data.range(10).show()

    # Will deadlock without reserved CPU fraction.
    scale_config = ScalingConfig(num_workers=1, _max_cpu_fraction_per_node=0.9)
    trainer = DummyTrainer(
        train_loop,
        scaling_config=scale_config,
    )
    tune.run(trainer.as_trainable(), num_samples=4)
Example 26
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")
Example 27
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = TensorflowTrainer(
        train_func,
        train_loop_config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": epochs
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(f"Results: {results.metrics}")
Example 28
def test_validation(ray_start_4_cpus):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    with pytest.raises(KeyError, match=TRAIN_DATASET_KEY):
        XGBoostTrainer(
            scaling_config=ScalingConfig(num_workers=2),
            label_column="target",
            params=params,
            datasets={"valid": valid_dataset},
        )
    with pytest.raises(KeyError, match="dmatrix_params"):
        XGBoostTrainer(
            scaling_config=ScalingConfig(num_workers=2),
            label_column="target",
            params=params,
            dmatrix_params={"data": {}},
            datasets={
                TRAIN_DATASET_KEY: train_dataset,
                "valid": valid_dataset
            },
        )
Example 29
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "num_epochs": kwargs["num_epochs"],
            "log_interval": kwargs["log_interval"],
            "use_cuda": kwargs["use_cuda"],
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(result)
Example 30
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print("Running Pytorch image model training with "
          f"{data_size_gb}GB data from {data_url}")
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()
    # Enable cross-host NCCL for larger-scale tests.
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)
    dataset = ray.data.read_datasource(ImageFolderDatasource(),
                                       paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "batch_size": 64,
            "num_epochs": num_epochs
        },
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name":
            "throughout_MB_s",
            "perf_metric_value":
            round(num_epochs * data_size_gb * 1024 / total_time_s, 2),
            "perf_metric_type":
            "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON",
                                      "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
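The throughput metric is plain arithmetic: total data processed is num_epochs * data_size_gb GB, multiplied by 1024 to convert to MB and divided by the wall-clock seconds.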