Example #1
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict,
                        decode_ctx):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)

        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(
            copy.deepcopy(hyperopt_dict["config"]), config)

        hyperopt_dict['config'] = modified_config
        hyperopt_dict[
            'experiment_name'] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

        tune_executor = self

        class RayTuneReportCallback(Callback):
            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if trainer.is_coordinator():
                    with tune.checkpoint_dir(
                            step=progress_tracker.epoch) as checkpoint_dir:
                        checkpoint_model = os.path.join(
                            checkpoint_dir, 'model')
                        shutil.copytree(save_path, checkpoint_model)

                    train_stats, eval_stats = progress_tracker.train_metrics, progress_tracker.vali_metrics
                    stats = eval_stats or train_stats
                    metric_score = tune_executor.get_metric_score_from_eval_stats(
                        stats)[-1]
                    tune.report(parameters=json.dumps(config,
                                                      cls=NumpyEncoder),
                                metric_score=metric_score,
                                training_stats=json.dumps(train_stats,
                                                          cls=NumpyEncoder),
                                eval_stats=json.dumps(eval_stats,
                                                      cls=NumpyEncoder))

        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            callbacks=[RayTuneReportCallback()],
        )

        metric_score = self.get_metric_score(train_stats, eval_stats)
        tune.report(parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                    eval_stats=json.dumps(eval_stats, cls=NumpyEncoder))
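The RayTuneReportCallback above follows Ray Tune's checkpoint-then-report pattern. Below is a minimal standalone sketch of that pattern, assuming the legacy function-trainable API (tune.checkpoint_dir / tune.report) that these examples target; the random metric is a placeholder for a real evaluation result.

import json
import os
import random

from ray import tune


def trainable(config):
    for epoch in range(config["epochs"]):
        metric = random.random()  # stand-in for a real evaluation metric

        # Save a checkpoint for this step, then report metrics to Tune,
        # mirroring the checkpoint/report order used in the callback above.
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, "state.json"), "w") as f:
                json.dump({"epoch": epoch, "metric": metric}, f)

        tune.report(metric_score=metric)


analysis = tune.run(trainable, config={"epochs": 3}, num_samples=1)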
Example #2
def test_grid_strategy(key):

    hyperopt_test_params = HYPEROPT_PARAMS[key]
    expected_search_space = EXPECTED_SEARCH_SPACE[key]

    goal = hyperopt_test_params["goal"]
    num_samples = hyperopt_test_params["num_samples"]
    tune_sampler_params = hyperopt_test_params["parameters"]

    tune_sampler = RayTuneSampler(goal=goal,
                                  parameters=tune_sampler_params,
                                  num_samples=num_samples)
    search_space = tune_sampler.search_space

    actual_params_keys = search_space.keys()
    expected_params_keys = expected_search_space.keys()

    for param in search_space:
        assert type(search_space[param]) is type(expected_search_space[param])

    assert actual_params_keys == expected_params_keys
    assert tune_sampler.num_samples == num_samples
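The test compares each entry of the generated search space against a fixture by type only. A hedged illustration of what such an expected search space could look like, built from standard Ray Tune sampling primitives; the parameter names and ranges here are invented for illustration and are not the actual fixture contents.

from ray import tune

# Hypothetical fixture: the test asserts only that RayTuneSampler produces
# the same kind of Ray Tune sampling objects as the expected entries.
expected_search_space = {
    "training.learning_rate": tune.loguniform(1e-4, 1e-1),
    "training.batch_size": tune.grid_search([64, 128, 256]),
    "combiner.num_fc_layers": tune.randint(1, 4),
}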
Example #3
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict, decode_ctx, is_using_ray_backend=False):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)
        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(copy.deepcopy(hyperopt_dict["config"]), config)

        trial_dir = Path(tune.get_trial_dir())
        trial_location = ray.util.get_node_ip_address()

        hyperopt_dict["config"] = modified_config
        hyperopt_dict["experiment_name "] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
        hyperopt_dict["output_directory"] = str(trial_dir)

        tune_executor = self
        if is_using_ray_backend:
            ray_queue = RayQueue(actor_options={"num_cpus": 0})
        else:
            ray_queue = None

        def checkpoint(progress_tracker, save_path):
            with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
                checkpoint_model = os.path.join(checkpoint_dir, "model")
                # Note: a previous implementation copied directly with
                # shutil.copytree(), but that copy is not atomic, so copy to a
                # temporary directory first and rename it into place.
                if not os.path.isdir(checkpoint_model):
                    copy_id = uuid.uuid4()
                    tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
                    assert os.path.exists(save_path)
                    shutil.copytree(save_path, tmp_dst)
                    try:
                        os.rename(tmp_dst, checkpoint_model)
                    except Exception:
                        shutil.rmtree(tmp_dst)

        def report(progress_tracker):
            train_stats = {
                TRAINING: progress_tracker.train_metrics,
                VALIDATION: progress_tracker.vali_metrics,
                TEST: progress_tracker.test_metrics,
            }

            metric_score = tune_executor.get_metric_score(train_stats)
            tune.report(
                parameters=json.dumps(config, cls=NumpyEncoder),
                metric_score=metric_score,
                training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                eval_stats="{}",
                trial_id=tune.get_trial_id(),
                trial_dir=tune.get_trial_dir(),
            )

        class RayTuneReportCallback(Callback):
            def _get_sync_client_and_remote_checkpoint_dir(self) -> Optional[Tuple["CommandBasedClient", str]]:
                # sync client has to be recreated to avoid issues with serialization
                return tune_executor._get_sync_client_and_remote_checkpoint_dir(trial_dir)

            def on_trainer_train_setup(self, trainer, save_path, is_coordinator):
                if is_using_ray_backend and checkpoint_dir and trial_location != ray.util.get_node_ip_address():
                    save_path = Path(save_path)

                    for path in trial_dir.glob("checkpoint*"):
                        if path not in (save_path.parent, checkpoint_dir):
                            shutil.rmtree(path, ignore_errors=True)

                    sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                    if sync_info is not None:
                        sync_client, remote_checkpoint_dir = sync_info
                        sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                        sync_client.wait()

            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if is_using_ray_backend:
                    save_path = Path(save_path)
                    if trial_location != ray.util.get_node_ip_address():
                        sync_info = self._get_sync_client_and_remote_checkpoint_dir()
                        if sync_info is not None:
                            sync_client, remote_checkpoint_dir = sync_info
                            sync_client.sync_up(str(save_path.parent.parent.absolute()), remote_checkpoint_dir)
                            sync_client.wait()
                    ray_queue.put((progress_tracker, str(save_path)))
                    return

                checkpoint(progress_tracker, save_path)
                report(progress_tracker)

        callbacks = hyperopt_dict.get("callbacks") or []
        hyperopt_dict["callbacks"] = callbacks + [RayTuneReportCallback()]

        # set tune resources
        if is_using_ray_backend:
            resources = tune.get_trial_resources()
            # check if we are using at least 1 gpu per trial
            use_gpu = bool(self._gpu_resources_per_trial_non_none)
            # get the resources assigned to the current trial
            current_resources = resources.required_resources["GPU" if use_gpu else "CPU"]

            hvd_kwargs = {
                "num_workers": int(current_resources),
                "use_gpu": use_gpu,
            }
            hyperopt_dict["backend"].set_distributed_kwargs(**hvd_kwargs)

            logger.debug(f"Trial horovod kwargs: {hvd_kwargs}")

        stats = []

        def _run():
            train_stats, eval_stats = run_experiment(
                **hyperopt_dict,
                model_resume_path=checkpoint_dir,
                parameters=config,
            )
            stats.append((train_stats, eval_stats))

        sync_info = self._get_sync_client_and_remote_checkpoint_dir(trial_dir)
        if is_using_ray_backend and sync_info is not None:
            # We have to pull the results to the trial actor
            # from worker actors, as the Tune session is running
            # only on the trial actor
            thread = threading.Thread(target=_run)
            thread.daemon = True
            thread.start()

            sync_client, remote_checkpoint_dir = sync_info

            def check_queue():
                qsize = ray_queue.qsize()
                if qsize:
                    results = ray_queue.get_nowait_batch(qsize)
                    sync_client.sync_down(remote_checkpoint_dir, str(trial_dir.absolute()))
                    sync_client.wait()
                    for progress_tracker, save_path in results:
                        checkpoint(progress_tracker, str(trial_dir.joinpath(Path(save_path))))
                        report(progress_tracker)

            while thread.is_alive():
                thread.join(timeout=0)
                check_queue()
                time.sleep(0.1)
            thread.join()
            check_queue()
        else:
            # remove threading overhead
            _run()

        if not stats:
            raise RuntimeError("Experiment did not complete.")
        train_stats, eval_stats = stats.pop()

        metric_score = self.get_metric_score(train_stats)
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats=json.dumps(eval_stats, cls=NumpyEncoder),
            trial_id=tune.get_trial_id(),
            trial_dir=tune.get_trial_dir(),
        )
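Example #3 runs the experiment in a background thread and repeatedly drains a queue so that results produced on worker actors are reported from the trial actor. A minimal sketch of that polling pattern using only the standard library; queue.Queue stands in for Ray's RayQueue, and print stands in for the checkpoint()/report() calls above.

import queue
import threading
import time

results_queue = queue.Queue()  # stands in for Ray's RayQueue


def run_experiment_in_background():
    # Stand-in for run_experiment(): the worker pushes progress into the queue.
    for step in range(3):
        time.sleep(0.2)
        results_queue.put({"step": step})


def drain_queue():
    # Forward any queued progress to the reporting side.
    while not results_queue.empty():
        print("report:", results_queue.get_nowait())


thread = threading.Thread(target=run_experiment_in_background, daemon=True)
thread.start()
while thread.is_alive():
    thread.join(timeout=0)
    drain_queue()
    time.sleep(0.1)
thread.join()
drain_queue()  # final drain after the worker thread finishes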
Example #4
    def _run_experiment(self, config, checkpoint_dir, hyperopt_dict,
                        decode_ctx):
        for gpu_id in ray.get_gpu_ids():
            # Previous trial may not have freed its memory yet, so wait to avoid OOM
            wait_for_gpu(gpu_id)

        # Some config values may be JSON encoded as strings, so decode them here
        config = RayTuneSampler.decode_values(config, decode_ctx)

        trial_id = tune.get_trial_id()
        modified_config = substitute_parameters(
            copy.deepcopy(hyperopt_dict["config"]), config)

        hyperopt_dict['config'] = modified_config
        hyperopt_dict[
            'experiment_name'] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'

        tune_executor = self

        class RayTuneReportCallback(Callback):
            def on_epoch_end(self, trainer, progress_tracker, save_path):
                if trainer.is_coordinator():
                    with tune.checkpoint_dir(
                            step=progress_tracker.epoch) as checkpoint_dir:
                        checkpoint_model = os.path.join(
                            checkpoint_dir, 'model')
                        # Note: a previous implementation copied directly with
                        # shutil.copytree(), but that copy is not atomic, so
                        # copy to a temporary directory first and rename it
                        # into place.
                        if not os.path.isdir(checkpoint_model):
                            copy_id = uuid.uuid4()
                            tmp_dst = "%s.%s.tmp" % (checkpoint_model, copy_id)
                            shutil.copytree(save_path, tmp_dst)
                            try:
                                os.rename(tmp_dst, checkpoint_model)
                            except Exception:
                                shutil.rmtree(tmp_dst)

                    train_stats = {
                        TRAINING: progress_tracker.train_metrics,
                        VALIDATION: progress_tracker.vali_metrics,
                        TEST: progress_tracker.test_metrics,
                    }

                    metric_score = tune_executor.get_metric_score(
                        train_stats, eval_stats=None)
                    tune.report(
                        parameters=json.dumps(config, cls=NumpyEncoder),
                        metric_score=metric_score,
                        training_stats=json.dumps(train_stats[TRAINING],
                                                  cls=NumpyEncoder),
                        eval_stats=json.dumps(train_stats[VALIDATION],
                                              cls=NumpyEncoder))

        train_stats, eval_stats = run_experiment(
            **hyperopt_dict,
            model_resume_path=checkpoint_dir,
            callbacks=[RayTuneReportCallback()],
        )

        metric_score = self.get_metric_score(train_stats, eval_stats)
        tune.report(parameters=json.dumps(config, cls=NumpyEncoder),
                    metric_score=metric_score,
                    training_stats=json.dumps(train_stats, cls=NumpyEncoder),
                    eval_stats=json.dumps(eval_stats, cls=NumpyEncoder))
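Examples #3 and #4 avoid exposing a partially copied checkpoint by copying into a temporary directory and renaming it into place. A standalone sketch of that copy-then-rename pattern; the function name is illustrative.

import os
import shutil
import uuid


def atomic_copytree(src: str, dst: str) -> None:
    """Copy a directory so that `dst` only ever appears fully populated."""
    if os.path.isdir(dst):
        return  # this step was already checkpointed
    tmp_dst = f"{dst}.{uuid.uuid4()}.tmp"
    shutil.copytree(src, tmp_dst)
    try:
        # Renaming a directory is atomic on POSIX filesystems.
        os.rename(tmp_dst, dst)
    except OSError:
        shutil.rmtree(tmp_dst, ignore_errors=True)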