Example #1
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict,
                        decode_ctx):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)

        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(
            copy.deepcopy(hyperopt_dict["config"]), config)

        hyperopt_dict['config'] = modified_config
        hyperopt_dict['experiment_name'] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

        tune_executor = self

        class RayTuneReportCallback(Callback):
            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if trainer.is_coordinator():
                    with tune.checkpoint_dir(
                            step=progress_tracker.epoch) as checkpoint_dir:
                        checkpoint_model = os.path.join(
                            checkpoint_dir, 'model')
                        shutil.copytree(save_path, checkpoint_model)

                    train_stats, eval_stats = progress_tracker.train_metrics, progress_tracker.vali_metrics
                    stats = eval_stats or train_stats
                    metric_score = tune_executor.get_metric_score_from_eval_stats(
                        stats)[-1]
                    tune.report(parameters=json.dumps(config,
                                                      cls=NumpyEncoder),
                                metric_score=metric_score,
                                training_stats=json.dumps(train_stats,
                                                          cls=NumpyEncoder),
                                eval_stats=json.dumps(eval_stats,
                                                      cls=NumpyEncoder))

        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            callbacks=[RayTuneReportCallback()],
        )

        metric_score = self.get_metric_score(train_stats, eval_stats)
        tune.report(parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                    eval_stats=json.dumps(eval_stats, cls=NumpyEncoder))
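
A point the listing leaves implicit is how _run_experiment is handed to Ray Tune in the first place. Below is a minimal, hypothetical sketch of one way to wire it up using the legacy function-trainable API that matches the tune.checkpoint_dir/tune.report calls above; run_trials, executor, and search_space are illustrative names, not taken from the source.

    from ray import tune

    def run_trials(executor, search_space, hyperopt_dict, decode_ctx, num_samples=10):
        # Close over the non-searchable arguments so Ray Tune only has to
        # supply `config` (and `checkpoint_dir` when resuming a trial).
        def trainable(config, checkpoint_dir=None):
            executor._run_experiment(config, checkpoint_dir, hyperopt_dict, decode_ctx)

        return tune.run(
            trainable,
            config=search_space,
            num_samples=num_samples,
            metric="metric_score",  # the key reported via tune.report() in the example
            mode="min",             # or "max", depending on the metric's goal
        )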
Example #2
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)
        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

        trial_dir = Path(tune.get_trial_dir())
        trial_location = ray.util.get_node_ip_address()

        hyperopt_dict["config"] = modified_config
        hyperopt_dict["experiment_name "] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
        hyperopt_dict["output_directory"] = str(trial_dir)

        tune_executor = self
        if is_using_ray_backend:
            ray_queue = RayQueue(actor_options={"num_cpus": 0})
        else:
            ray_queue = None

        def checkpoint(progress_tracker, save_path):
            with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                checkpoint_model = os.path.join(checkpoint_dir, "model")
                # shutil.copytree(save_path, checkpoint_model)
                # Note: a previous implementation used shutil.copytree() alone,
                # but that copy is not atomic
                if not os.path.isdir(checkpoint_model):
                    copy_id = uuid.uuid4()
                    tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                    assert os.path.exists(save_path)
                    shutil.copytree(save_path, tmp_dst)
                    try:
                        os.rename(tmp_dst, checkpoint_model)
                    except Exception:
                        shutil.rmtree(tmp_dst)

        def report(progress_tracker):
            train_stats = {
                TRAINING: progress_tracker.train_metrics,
                VALIDATION: progress_tracker.vali_metrics,
                TEST: progress_tracker.test_metrics,
            }

            metric_score = tune_executor.get_metric_score(train_stats)
            tune.report(
                parameters=json.dumps(config, cls=NumpyEncoder),
                metric_score=metric_score,
                training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                eval_stats="{}",
                trial_id=tune.get_trial_id(),
                trial_dir=tune.get_trial_dir(),
            )

        class RayTuneReportCallback(Callback):
            def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
                # sync client has to be recreated to avoid issues with serialization
                return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

            def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
                if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                    save_path = Path(save_path)

                    for path in trial_dir.glob("checkpoint*"):
                        if path not in (save_path.parent, checkpoint_dir):
                            shutil.rmtree(path, ignore_errors=True)

                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                        sync_client.wait()

            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if is_using_ray_backend:
                    save_path = Path(save_path)
                    if trial_location != ray.util.get_node_ip_address():
                        sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                        if sync_info is not None:
                            sync_client, remote_checkpoint_dir = sync_info
                            sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                            sync_client.wait()
                    ray_queue.put((progress_tracker, str(save_path)))
                    return

                checkpoint(progress_tracker, save_path)
                report(progress_tracker)

        callbacks = hyperopt_dict.get("callbacks") or []
        hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

        # set tune resources
        if is_using_ray_backend:
            resources = tune.get_trial_resources()
            # check if we are using at least 1 gpu per trial
            use_gpu = bool(self._gpu_resources_per_trial_non_none)
            # get the resources assigned to the current trial
            current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

            hvd_kwargs = {
                "num_workers": int(current_resources),
                "use_gpu": use_gpu,
            }
            hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

            logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

        stats = []

        def _run():
            train_stats, eval_stats = run_experiment(
                **hyperopt_dict,
                model_resume_path=checkpoint_dir,
                parameters=config,
            )
            stats.append((train_stats, eval_stats))

        sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
        if is_using_ray_backend and sync_info is not None:
            # We have to pull the results to the trial actor
            # from worker actors, as the Tune session is running
            # only on the trial actor
            thread = threading.Thread(target=_run)
            thread.daemon = True
            thread.start()

            sync_client, remote_checkpoint_dir = sync_info

            def check_queue():
                qsize = ray_queue.qsize()
                if qsize:
                    results = ray_queue.get_nowait_batch(qsize)
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()
                    for progress_tracker, save_path in results:
                        checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                        report(progress_tracker)

            while thread.is_alive():
                thread.join(timeout=0)
                check_queue()
                time.sleep(0.1)
            thread.join()
            check_queue()
        else:
            # remove threading overhead
            _run()

        if not stats:
            raise RuntimeError("Experiment did not complete.")
        train_stats, eval_stats = stats.pop()

        metric_score = self.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )
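
The checkpoint() helper above replaces a plain shutil.copytree() with a copy-into-temp-then-rename sequence, so a partially copied model directory is never visible under the final checkpoint name. A self-contained sketch of just that pattern, with a hypothetical helper name and made-up paths:

    import os
    import shutil
    import uuid

    def copy_dir_atomically(src: str, dst: str) -> None:
        """Copy src to dst so readers never observe a half-written dst."""
        if os.path.isdir(dst):
            return  # this checkpoint has already been published
        tmp_dst = f"{dst}.{uuid.uuid4()}.tmp"
        shutil.copytree(src, tmp_dst)   # slow, interruptible copy into a temp dir
        try:
            os.rename(tmp_dst, dst)     # atomic publish on the same filesystem
        except OSError:
            shutil.rmtree(tmp_dst)      # another writer won the rename race; clean up

    # e.g. copy_dir_atomically(save_path, os.path.join(checkpoint_dir, "model"))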
Example #3
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict,
                        decode_ctx):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)

        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(
            copy.deepcopy(hyperopt_dict["config"]), config)

        hyperopt_dict['config'] = modified_config
        hyperopt_dict['experiment_name'] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

        tune_executor = self

        class RayTuneReportCallback(Callback):
            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if trainer.is_coordinator():
                    with tune.checkpoint_dir(
                            step=progress_tracker.epoch) as checkpoint_dir:
                        checkpoint_model = os.path.join(
                            checkpoint_dir, 'model')
                        # shutil.copytree(save_path, checkpoint_model)
                        # Note: a previous implementation used shutil.copytree() alone,
                        # but that copy is not atomic
                        if not os.path.isdir(checkpoint_model):
                            copy_id = uuid.uuid4()
                            tmp_dst = "%s.%s.tmp" % (checkpoint_model, copy_id)
                            shutil.copytree(save_path, tmp_dst)
                            try:
                                os.rename(tmp_dst, checkpoint_model)
                            except Exception:
                                shutil.rmtree(tmp_dst)

                    train_stats = {
                        TRAINING: progress_tracker.train_metrics,
                        VALIDATION: progress_tracker.vali_metrics,
                        TEST: progress_tracker.test_metrics,
                    }

                    metric_score = tune_executor.get_metric_score(
                        train_stats, eval_stats=None)
                    tune.report(
                        parameters=json.dumps(config, cls=NumpyEncoder),
                        metric_score=metric_score,
                        training_stats=json.dumps(train_stats[TRAINING],
                                                  cls=NumpyEncoder),
                        eval_stats=json.dumps(train_stats[VALIDATION],
                                              cls=NumpyEncoder))

        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            callbacks=[RayTuneReportCallback()],
        )

        metric_score = self.get_metric_score(train_stats, eval_stats)
        tune.report(parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                    eval_stats=json.dumps(eval_stats, cls=NumpyEncoder))
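
Every example serializes stats with json.dumps(..., cls=NumpyEncoder), but the encoder itself is not shown. The sketch below is an assumption about what such a NumPy-aware encoder typically looks like, not the project's actual implementation:

    import json
    import numpy as np

    class NumpyEncoder(json.JSONEncoder):
        """Fall back to plain Python types for NumPy scalars and arrays."""

        def default(self, obj):
            if isinstance(obj, np.integer):
                return int(obj)
            if isinstance(obj, np.floating):
                return float(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return super().default(obj)

    # json.dumps({"loss": np.float32(0.25)}, cls=NumpyEncoder) -> '{"loss": 0.25}'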
Example #4
    def _run_experiment(
        self,
        config,
        checkpoint_dir,
        hyperopt_dict,
        decode_ctx,
        features_eligible_for_shared_params,
        is_using_ray_backend=False,
    ):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)

        # Some config values may be JSON encoded as strings, so decode them here
        config = self.decode_values(config, decode_ctx)

        # Remove mlflow injected config parameters: https://github.com/ludwig-ai/ludwig/issues/2288
        if "mlflow" in config:
            del config["mlflow"]

        trial_id = tune.get_trial_id()
        trial_dir = Path(tune.get_trial_dir())
        driver_trial_location = ray.util.get_node_ip_address()

        modified_config = substitute_parameters(
            copy.deepcopy(hyperopt_dict["config"]), config,
            features_eligible_for_shared_params)

        hyperopt_dict["config"] = modified_config
        hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
        hyperopt_dict["output_directory"] = str(trial_dir)

        tune_executor = self
        if is_using_ray_backend:
            ray_queue = RayQueue(actor_options={"num_cpus": 0})
        else:
            ray_queue = None

        def report(progress_tracker):
            # The progress tracker's metrics are nested dictionaries of TrainerMetrics: feature_name -> metric_name ->
            # List[TrainerMetric], with one entry per training checkpoint, according to steps_per_checkpoint.
            # We reduce the dictionary of TrainerMetrics to a simple list of floats for interfacing with Ray Tune.
            train_stats = {
                TRAINING: metric_utils.reduce_trainer_metrics_dict(progress_tracker.train_metrics),
                VALIDATION: metric_utils.reduce_trainer_metrics_dict(progress_tracker.validation_metrics),
                TEST: metric_utils.reduce_trainer_metrics_dict(progress_tracker.test_metrics),
            }

            metric_score = tune_executor.get_metric_score(train_stats)
            tune.report(
                parameters=json.dumps(config, cls=NumpyEncoder),
                metric_score=metric_score,
                training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                eval_stats="{}",
                trial_id=tune.get_trial_id(),
                trial_dir=tune.get_trial_dir(),
            )

        class RayTuneReportCallback(Callback):
            def __init__(self):
                super().__init__()
                self.last_steps = 0

            def _get_remote_checkpoint_dir(
                    self) -> Optional[Union[str, Tuple[str, str]]]:
                # sync client has to be recreated to avoid issues with serialization
                return tune_executor._get_remote_checkpoint_dir(trial_dir)

            def _checkpoint_progress(self, trainer, progress_tracker,
                                     save_path) -> None:
                """Checkpoints the progress tracker."""
                if is_using_ray_backend:
                    save_path = Path(save_path)
                    remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                    if remote_checkpoint_dir is not None:
                        sync_client = tune_executor.sync_client
                        sync_client.sync_up(
                            str(save_path.parent.parent.absolute()),
                            remote_checkpoint_dir)
                        sync_client.wait_or_retry()
                    ray_queue.put((progress_tracker, str(save_path)))
                    return
                checkpoint(progress_tracker, save_path)

            def on_trainer_train_setup(self, trainer, save_path,
                                       is_coordinator):
                if (is_using_ray_backend and checkpoint_dir
                        and driver_trial_location != ray.util.get_node_ip_address()):
                    save_path = Path(save_path)

                    for path in trial_dir.glob("checkpoint*"):
                        if path not in (save_path.parent, checkpoint_dir):
                            shutil.rmtree(path, ignore_errors=True)

                    remote_checkpoint_dir = self._get_remote_checkpoint_dir()
                    if remote_checkpoint_dir is not None:
                        sync_client = tune_executor.sync_client
                        sync_client.sync_down(remote_checkpoint_dir,
                                              str(trial_dir.absolute()))
                        sync_client.wait_or_retry()

            def on_eval_end(self, trainer, progress_tracker, save_path):
                progress_tracker.tune_checkpoint_num += 1
                self.last_steps = progress_tracker.steps
                self._checkpoint_progress(trainer, progress_tracker, save_path)
                if not is_using_ray_backend:
                    report(progress_tracker)

            def on_trainer_train_teardown(self, trainer, progress_tracker,
                                          save_path, is_coordinator):
                if is_coordinator and progress_tracker.steps > self.last_steps:
                    # Note: calling tune.report() in both on_eval_end() and here can cause multiprocessing issues
                    # for some Ray samplers if no steps have happened since the last eval.
                    self._checkpoint_progress(trainer, progress_tracker,
                                              save_path)
                    if not is_using_ray_backend:
                        report(progress_tracker)

        callbacks = hyperopt_dict.get("callbacks") or []
        hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

        # set tune resources
        if is_using_ray_backend:
            resources = tune.get_trial_resources()
            # check if we are using at least 1 gpu per trial
            use_gpu = bool(self._gpu_resources_per_trial_non_none)
            # get the resources assigned to the current trial
            num_gpus = resources.required_resources.get("GPU", 0)
            num_cpus = resources.required_resources.get(
                "CPU", 1) if num_gpus == 0 else 0

            hvd_kwargs = {
                "num_workers": int(num_gpus) if use_gpu else 1,
                "use_gpu": use_gpu,
                "resources_per_worker": {
                    "CPU": num_cpus,
                    "GPU": 1 if use_gpu else 0,
                },
            }
            hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

            logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

        stats = []

        def _run():
            train_stats, eval_stats = run_experiment(
                **hyperopt_dict,
                model_resume_path=checkpoint_dir,
                parameters=config,
            )
            stats.append((train_stats, eval_stats))

        if is_using_ray_backend:
            # We have to pull the results to the trial actor
            # from worker actors, as the Tune session is running
            # only on the trial actor
            thread = threading.Thread(target=_run)
            thread.daemon = True
            thread.start()

            if self.sync_config is not None:
                remote_checkpoint_dir = self._get_remote_checkpoint_dir(
                    trial_dir)

            def check_queue():
                qsize = ray_queue.qsize()
                if qsize:
                    results = ray_queue.get_nowait_batch(qsize)
                    if self.sync_client is not None:
                        self.sync_client.sync_down(remote_checkpoint_dir,
                                                   str(trial_dir.absolute()))
                        self.sync_client.wait()
                    for progress_tracker, save_path in results:
                        checkpoint(progress_tracker,
                                   str(trial_dir.joinpath(Path(save_path))))
                        report(progress_tracker)

            while thread.is_alive():
                thread.join(timeout=0)
                check_queue()
                time.sleep(0.1)
            thread.join()
            check_queue()
        else:
            # remove threading overhead
            _run()

        if not stats:
            raise RuntimeError("Experiment did not complete.")
        train_stats, eval_stats = stats.pop()

        metric_score = self.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )
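
Examples #2 and #4 run run_experiment on a daemon thread and drain a queue of (progress_tracker, save_path) results on the calling thread, because, as the in-code comment notes, the Tune session lives only on the trial actor. The sketch below reduces that drain loop to the standard library, replacing the Ray queue and sync client with plain Python purely for illustration:

    import queue
    import threading
    import time

    def run_and_drain(run_fn, handle_result, result_queue):
        # Run the experiment on a background thread so the caller's thread
        # stays free to forward results (e.g. to tune.report).
        thread = threading.Thread(target=run_fn, daemon=True)
        thread.start()

        def drain():
            while True:
                try:
                    handle_result(result_queue.get_nowait())
                except queue.Empty:
                    return

        while thread.is_alive():
            drain()
            time.sleep(0.1)
        thread.join()
        drain()  # pick up anything enqueued just before the thread finished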