Example #1
def __init__(self, model, trainer_kwargs, executable_kwargs):
    self.model = model.cpu()
    self.executable_kwargs = executable_kwargs
    self.trainer = Trainer(**{**get_trainer_kwargs(), **trainer_kwargs})
    self.trainer.start()
    self._validation_field = None
    self._validation_metric = None
Example #2
class RayTrainerV2(BaseTrainer):
    def __init__(self, model, trainer_kwargs, executable_kwargs):
        self.model = model.cpu()
        self.executable_kwargs = executable_kwargs
        self.trainer = Trainer(**{**get_trainer_kwargs(), **trainer_kwargs})
        self.trainer.start()
        self._validation_field = None
        self._validation_metric = None

    def train(
        self,
        training_set: RayDataset,
        validation_set: Optional[RayDataset] = None,
        test_set: Optional[RayDataset] = None,
        **kwargs,
    ):
        executable_kwargs = self.executable_kwargs

        kwargs = {
            "training_set_metadata": training_set.training_set_metadata,
            "features": training_set.features,
            **kwargs,
        }

        dataset = {"train": training_set.pipeline()}
        if validation_set is not None:
            dataset["val"] = validation_set.pipeline(shuffle=False)
        if test_set is not None:
            dataset["test"] = test_set.pipeline(shuffle=False)

        results, self._validation_field, self._validation_metric = self.trainer.run(
            lambda config: train_fn(**config),
            config={
                "executable_kwargs": executable_kwargs,
                "model_ref": ray.put(self.model),
                **kwargs,
            },
            dataset=dataset,
        )[0]

        # load state dict back into the model
        state_dict, *args = results
        self.model.load_state_dict(state_dict)
        results = (self.model, *args)

        return results

    def train_online(self, *args, **kwargs):
        raise NotImplementedError()

    @property
    def validation_field(self):
        return self._validation_field

    @property
    def validation_metric(self):
        return self._validation_metric

    def shutdown(self):
        self.trainer.shutdown()
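
The train_fn passed to self.trainer.run is not shown here. Judging from the config keys assembled above and the way the result is unpacked, its expected shape would be roughly the following sketch (the signature, the ray.get call, and the placeholder return values "combined" and "loss" are assumptions inferred from the call site, not code from the source):

import ray

def train_fn(
    executable_kwargs=None,
    model_ref=None,
    training_set_metadata=None,
    features=None,
    **kwargs,
):
    # Materialize the model that RayTrainerV2 put in the object store.
    model = ray.get(model_ref)

    # ... distributed training loop goes here ...

    # train() unpacks trainer.run(...)[0] into
    # (results, validation_field, validation_metric), then unpacks
    # results into (state_dict, *args).
    results = (model.state_dict(),)
    return results, "combined", "loss"  # placeholder field/metric names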
Example #3
def test_tune_error_legacy(ray_start_4_cpus):
    def train_func(config):
        raise RuntimeError("Error in training function!")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    with pytest.raises(TuneError):
        tune.run(TestTrainable)
Example #4
def test_tune_checkpoint_legacy(ray_start_4_cpus):
    def train_func():
        for i in range(10):
            train.report(test=i)
        train.save_checkpoint(hello="world")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
Example #5
def test_reuse_checkpoint_legacy(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = train.load_checkpoint()
        if ckpt is not None:
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4
    analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path)
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
Example #6
def create_runner(self):
    # Yield a started Trainer and guarantee it is shut down afterwards.
    trainer = Trainer(**{**get_trainer_kwargs(), **self.trainer_kwargs})
    trainer.start()
    try:
        yield trainer
    finally:
        trainer.shutdown()
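
Because create_runner yields a started trainer and shuts it down in a finally block, it is presumably decorated with contextlib.contextmanager (the decorator is cut off in this snippet). Usage would then look like this sketch, where the run call and its arguments are illustrative assumptions:

with self.create_runner() as runner:
    results = runner.run(lambda config: train_fn(**config), config={})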
Example #7
def test_retry_legacy(ray_start_4_cpus):
    def train_func():
        ckpt = train.load_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    analysis = tune.run(TestTrainable, max_failures=3)
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
Example #8
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = Trainer(
        backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    result = trainer.run(
        train_func=train_func,
        config={
            "lr": 1e-3,
            "batch_size": 64,
            "epochs": 4
        },
        callbacks=[JsonLoggerCallback()])
    trainer.shutdown()
    print(f"Loss results: {result}")
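
This snippet leaves out its imports and the train_func it launches. Under Ray's legacy ray.train API, the missing pieces would roughly be the following (a hedged sketch, not code from the source):

from ray import train
from ray.train import Trainer
from ray.train.callbacks import JsonLoggerCallback

def train_func(config):
    # User-defined training loop: reads "lr", "batch_size", and "epochs"
    # from config and reports per-epoch metrics via train.report(...).
    ...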
Example #9
    def batch_evaluation(
        self,
        dataset: RayDataset,
        collect_predictions: bool = False,
        collect_logits: bool = False,
        **kwargs,
    ):
        # We need to be in a Horovod context to collect the aggregated metrics, since it relies on collective
        # communication ops. However, Horovod is not suitable for transforming one big dataset to another. For that
        # we will use Ray Datasets. Therefore, we break this up into two separate steps, and two passes over the
        # dataset. In the future, we can explore ways to combine these into a single step to reduce IO.
        runner = Trainer(**{**get_trainer_kwargs(), **self.trainer_kwargs})
        runner.start()
        try:
            # Collect eval metrics by distributing work across nodes / gpus with Horovod
            datasets = {
                "eval": dataset.pipeline(shuffle=False,
                                         **self.data_loader_kwargs)
            }
            predictor_kwargs = {
                **self.predictor_kwargs,
                "collect_predictions": False,
            }
            eval_stats, _ = runner.run(
                lambda config: eval_fn(**config),
                config={
                    "predictor_kwargs": predictor_kwargs,
                    "model_ref": ray.put(self.model),
                    "training_set_metadata": dataset.training_set_metadata,
                    "features": dataset.features,
                    **kwargs,
                },
                dataset=datasets,
            )[0]
        finally:
            runner.shutdown()

        predictions = None
        if collect_predictions:
            # Collect eval predictions by using Ray Datasets to transform partitions of the data in parallel
            predictions = self.batch_predict(dataset,
                                             collect_logits=collect_logits)

        return eval_stats, predictions
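
As with train_fn in Example #2, eval_fn is not shown. The unpacking eval_stats, _ = runner.run(...)[0] implies it returns a pair whose first element holds the aggregated metrics; a hypothetical sketch:

import ray

def eval_fn(
    predictor_kwargs=None,
    model_ref=None,
    training_set_metadata=None,
    features=None,
    **kwargs,
):
    model = ray.get(model_ref)  # fetch the model from the object store
    eval_stats = {}  # ... aggregate metrics across Horovod workers ...
    return eval_stats, None  # the second element is discarded by the caller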