Example #1
    def _get_sync_client_and_remote_checkpoint_dir(
            self,
            trial_dir: Path) -> Optional[Tuple["CommandBasedClient", str]]:
        """Get the Ray sync client and path to remote checkpoint directory."""
        if self.sync_config is None:
            return None

        remote_checkpoint_dir = os.path.join(
            self.sync_config.upload_dir,
            *_get_relative_checkpoints_dir_parts(trial_dir))
        return get_cloud_sync_client(
            remote_checkpoint_dir), remote_checkpoint_dir
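
A minimal usage sketch for the same helper pattern, assuming an older Ray release that still ships ray.tune.sync_client; the bucket name, relative path parts, and local checkpoint path are illustrative, not taken from the source.

import os
from ray.tune.sync_client import get_cloud_sync_client

upload_dir = "s3://my-bucket/experiments"         # assumed upload_dir
relative_parts = ("my_experiment", "trial_0000")  # assumed relative checkpoint dir parts

# Build the remote checkpoint path and obtain a command-based sync client for it,
# mirroring Example #1.
remote_checkpoint_dir = os.path.join(upload_dir, *relative_parts)
sync_client = get_cloud_sync_client(remote_checkpoint_dir)

# Push a local checkpoint directory to the remote location and wait for the
# underlying sync subprocess (e.g. `aws s3 sync`) to finish.
sync_client.sync_up("/tmp/trial_0000/checkpoint_000010", remote_checkpoint_dir)
sync_client.wait()
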
Example #2
    def testCloudSyncExclude(self):
        captured = deque(maxlen=1)
        captured.append("")

        def always_true(*args, **kwargs):
            return True

        def capture_popen(command, *args, **kwargs):
            captured.append(command)

        with patch("subprocess.Popen",
                   capture_popen), patch("distutils.spawn.find_executable",
                                         always_true):
            # S3
            s3_client = get_cloud_sync_client("s3://test-bucket/test-dir")
            s3_client.sync_down("s3://test-bucket/test-dir/remote_source",
                                "local_target")

            self.assertEqual(
                captured[0].strip(),
                "aws s3 sync s3://test-bucket/test-dir/remote_source "
                "local_target --only-show-errors",
            )

            s3_client.sync_down(
                "s3://test-bucket/test-dir/remote_source",
                "local_target",
                exclude=["*/checkpoint_*"],
            )
            self.assertEqual(
                captured[0].strip(),
                "aws s3 sync s3://test-bucket/test-dir/remote_source "
                "local_target --only-show-errors "
                "--exclude '*/checkpoint_*'",
            )

            s3_client.sync_down(
                "s3://test-bucket/test-dir/remote_source",
                "local_target",
                exclude=["*/checkpoint_*", "*.big"],
            )
            self.assertEqual(
                captured[0].strip(),
                "aws s3 sync s3://test-bucket/test-dir/remote_source "
                "local_target --only-show-errors "
                "--exclude '*/checkpoint_*' --exclude '*.big'",
            )

            # GS
            gs_client = get_cloud_sync_client("gs://test-bucket/test-dir")
            gs_client.sync_down("gs://test-bucket/test-dir/remote_source",
                                "local_target")

            self.assertEqual(
                captured[0].strip(),
                "gsutil rsync -r  "
                "gs://test-bucket/test-dir/remote_source "
                "local_target",
            )

            gs_client.sync_down(
                "gs://test-bucket/test-dir/remote_source",
                "local_target",
                exclude=["*/checkpoint_*"],
            )
            self.assertEqual(
                captured[0].strip(),
                "gsutil rsync -r "
                "-x '(.*/checkpoint_.*)' "
                "gs://test-bucket/test-dir/remote_source "
                "local_target",
            )

            gs_client.sync_down(
                "gs://test-bucket/test-dir/remote_source",
                "local_target",
                exclude=["*/checkpoint_*", "*.big"],
            )
            self.assertEqual(
                captured[0].strip(),
                "gsutil rsync -r "
                "-x '(.*/checkpoint_.*)|(.*.big)' "
                "gs://test-bucket/test-dir/remote_source "
                "local_target",
            )
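
The gsutil assertions above show how glob-style exclude patterns are rewritten into a single regular expression passed to gsutil rsync -x. A hedged sketch of that translation (not the library's actual implementation) which reproduces the exact strings asserted in the test:

def glob_to_gsutil_exclude_regex(patterns):
    # Turn each glob-style pattern into a regex group and OR the groups together,
    # e.g. "*/checkpoint_*" -> "(.*/checkpoint_.*)".
    return "|".join("(" + pattern.replace("*", ".*") + ")" for pattern in patterns)

assert glob_to_gsutil_exclude_regex(["*/checkpoint_*"]) == "(.*/checkpoint_.*)"
assert glob_to_gsutil_exclude_regex(
    ["*/checkpoint_*", "*.big"]) == "(.*/checkpoint_.*)|(.*.big)"
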
Example #3
    def _create_storage_client(self):
        """Returns a storage client."""
        return get_cloud_sync_client(self.remote_checkpoint_dir)
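
For context, a self-contained sketch of the kind of class such a helper might belong to; the class name, attribute, bucket, and paths are assumptions, and only get_cloud_sync_client and the sync_down/wait calls mirror the API exercised in the examples above.

from ray.tune.sync_client import get_cloud_sync_client

class CheckpointSyncer:
    """Hypothetical wrapper that syncs checkpoints against cloud storage."""

    def __init__(self, remote_checkpoint_dir: str):
        self.remote_checkpoint_dir = remote_checkpoint_dir

    def _create_storage_client(self):
        """Returns a storage client."""
        return get_cloud_sync_client(self.remote_checkpoint_dir)

    def download(self, local_dir: str):
        # Pull the remote checkpoint directory down, skipping large artifacts;
        # the exclude pattern mirrors the one exercised in Example #2.
        client = self._create_storage_client()
        client.sync_down(self.remote_checkpoint_dir, local_dir, exclude=["*.big"])
        client.wait()

syncer = CheckpointSyncer("s3://my-bucket/checkpoints/trial_0000")
syncer.download("/tmp/checkpoints/trial_0000")
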
Example #4
    def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        resume=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        hyperopt_log_verbosity=3,
        features_eligible_for_shared_params=None,
        **kwargs,
    ) -> RayTuneResults:
        if isinstance(dataset, str) and not has_remote_protocol(
                dataset) and not os.path.isabs(dataset):
            dataset = os.path.abspath(dataset)

        if isinstance(backend, str):
            backend = initialize_backend(backend)

        if gpus is not None:
            raise ValueError(
                "Parameter `gpus` is not supported when using Ray Tune. "
                "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
                "hyperopt config.")

        if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
            # Enforce fractional GPU utilization
            gpu_memory_limit = self.gpu_resources_per_trial

        hyperopt_dict = dict(
            config=config,
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            experiment_name=experiment_name,
            model_name=model_name,
            eval_split=self.split,
            skip_save_training_description=skip_save_training_description,
            skip_save_training_statistics=skip_save_training_statistics,
            skip_save_model=skip_save_model,
            skip_save_progress=skip_save_progress,
            skip_save_log=skip_save_log,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            skip_save_predictions=skip_save_predictions,
            skip_save_eval_stats=skip_save_eval_stats,
            output_directory=output_directory,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
            backend=backend,
            random_seed=random_seed,
            debug=debug,
        )

        mode = "min" if self.goal != MAXIMIZE else "max"
        metric = "metric_score"
        # if random seed not set, use Ludwig seed
        self.search_algorithm.check_for_random_seed(random_seed)
        if self.search_algorithm.search_alg_dict is not None:
            if TYPE not in self.search_algorithm.search_alg_dict:
                candidate_search_algs = list(SEARCH_ALG_IMPORT.keys())
                logger.warning(
                    "WARNING: search_alg type parameter missing, using 'variant_generator' as default. "
                    f"These are possible values for the type parameter: {candidate_search_algs}."
                )
                search_alg = None
            else:
                search_alg_type = self.search_algorithm.search_alg_dict[TYPE]
                search_alg = tune.create_searcher(
                    search_alg_type,
                    metric=metric,
                    mode=mode,
                    **self.search_algorithm.search_alg_dict)
        else:
            search_alg = None

        if self.max_concurrent_trials:
            assert (
                self.max_concurrent_trials > 0
            ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
            if isinstance(search_alg,
                          BasicVariantGenerator) or search_alg is None:
                search_alg = BasicVariantGenerator(
                    max_concurrent=self.max_concurrent_trials)
            elif isinstance(search_alg, ConcurrencyLimiter):
                raise ValueError(
                    "You have specified `max_concurrent_trials`, but the search "
                    "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                    "by setting `max_concurrent_trials=None`.")
            else:
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=self.max_concurrent_trials)

        resources_per_trial = {
            "cpu": self._cpu_resources_per_trial_non_none,
            "gpu": self._gpu_resources_per_trial_non_none,
        }

        def run_experiment_trial(config,
                                 local_hyperopt_dict,
                                 checkpoint_dir=None):
            return self._run_experiment(
                config,
                checkpoint_dir,
                local_hyperopt_dict,
                self.decode_ctx,
                features_eligible_for_shared_params,
                _is_ray_backend(backend),
            )

        tune_config = {}
        tune_callbacks = []
        for callback in callbacks or []:
            run_experiment_trial, tune_config = callback.prepare_ray_tune(
                run_experiment_trial,
                tune_config,
                tune_callbacks,
            )

        if _is_ray_backend(backend):
            # For now, we do not do distributed training on CPU (until spread
            # scheduling is implemented for Ray Train), but we do want to
            # enable it when GPUs are specified.
            resources_per_trial = PlacementGroupFactory(
                [{}] + ([{"CPU": 0, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
                if self._gpu_resources_per_trial_non_none
                else [{}] + [{"CPU": self._cpu_resources_per_trial_non_none}]
            )

        if has_remote_protocol(output_directory):
            run_experiment_trial = tune.durable(run_experiment_trial)
            self.sync_config = tune.SyncConfig(sync_to_driver=False,
                                               upload_dir=output_directory)
            if _ray_114:
                self.sync_client = get_node_to_storage_syncer(
                    SyncConfig(upload_dir=output_directory))
            else:
                self.sync_client = get_cloud_sync_client(output_directory)
            output_directory = None
        elif self.kubernetes_namespace:
            from ray.tune.integration.kubernetes import KubernetesSyncClient, NamespacedKubernetesSyncer

            self.sync_config = tune.SyncConfig(
                sync_to_driver=NamespacedKubernetesSyncer(
                    self.kubernetes_namespace))
            self.sync_client = KubernetesSyncClient(self.kubernetes_namespace)

        run_experiment_trial_params = tune.with_parameters(
            run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
        register_trainable(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            run_experiment_trial_params)

        # Note that resume="AUTO" will attempt to resume the experiment if possible, and
        # otherwise will start a new experiment:
        # https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
        should_resume = "AUTO" if resume is None else resume

        try:
            analysis = tune.run(
                f"trainable_func_f{hash_dict(config).decode('ascii')}",
                name=experiment_name,
                config={
                    **self.search_space,
                    **tune_config,
                },
                scheduler=self.scheduler,
                search_alg=search_alg,
                num_samples=self.num_samples,
                keep_checkpoints_num=1,
                max_failures=1,  # retry a trial failure once
                resources_per_trial=resources_per_trial,
                time_budget_s=self.time_budget_s,
                sync_config=self.sync_config,
                local_dir=output_directory,
                metric=metric,
                mode=mode,
                trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
                trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
                callbacks=tune_callbacks,
                stop=CallbackStopper(callbacks),
                verbose=hyperopt_log_verbosity,
                resume=should_resume,
                log_to_file=True,
            )
        except Exception as e:
            # Explicitly raise a RuntimeError if an error is encountered during a Ray trial.
            # NOTE: Cascading the exception with "raise _ from e" still results in hanging.
            raise RuntimeError(f"Encountered Ray Tune error: {e}")

        if "metric_score" in analysis.results_df.columns:
            ordered_trials = analysis.results_df.sort_values(
                "metric_score", ascending=self.goal != MAXIMIZE)

            # Catch nans in edge case where the trial doesn't complete
            temp_ordered_trials = []
            for kwargs in ordered_trials.to_dict(orient="records"):
                for key in ["parameters", "training_stats", "eval_stats"]:
                    if isinstance(kwargs[key], float):
                        kwargs[key] = {}
                temp_ordered_trials.append(kwargs)

            # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
            # tune.report call(s) but were terminated before reporting eval_stats from post-train
            # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
            # For any such trials, run model evaluation for the best model in that trial & record
            # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
            for trial in temp_ordered_trials:
                if trial["eval_stats"] == "{}" and trial[
                        "training_stats"] != "{}":
                    # Evaluate the best model on the eval_split, which is validation_set
                    if validation_set is not None and validation_set.size > 0:
                        trial_path = trial["trial_dir"]
                        best_model_path = self._get_best_model_path(
                            trial_path, analysis)
                        if best_model_path is not None:
                            self._evaluate_best_model(
                                trial,
                                trial_path,
                                best_model_path,
                                validation_set,
                                data_format,
                                skip_save_unprocessed_output,
                                skip_save_predictions,
                                skip_save_eval_stats,
                                gpus,
                                gpu_memory_limit,
                                allow_parallel_threads,
                                backend,
                                debug,
                            )
                        else:
                            logger.warning(
                                "Skipping evaluation as no model checkpoints were available"
                            )
                    else:
                        logger.warning(
                            "Skipping evaluation as no validation set was provided"
                        )

            ordered_trials = [
                TrialResults.from_dict(load_json_values(kwargs))
                for kwargs in temp_ordered_trials
            ]
        else:
            logger.warning(
                "No trials reported results; check whether the time budget is lower than the epoch latency"
            )
            ordered_trials = []

        return RayTuneResults(ordered_trials=ordered_trials,
                              experiment_analysis=analysis)
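
Example #4 wires these pieces into a full Ray Tune run. A much-simplified, hedged sketch of the same end-to-end pattern on a pre-1.14 Ray API (durable trainable plus a cloud sync client for a remote output directory); the bucket, search space, and trainable are invented for illustration.

from ray import tune
from ray.tune.sync_client import get_cloud_sync_client

def trainable(config):
    # Stand-in objective; reports the same metric key used in Example #4.
    tune.report(metric_score=config["x"] ** 2)

output_directory = "s3://my-bucket/hyperopt-results"  # assumed remote output dir
sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
sync_client = get_cloud_sync_client(output_directory)  # legacy pre-1.14 helper

analysis = tune.run(
    tune.durable(trainable),
    config={"x": tune.uniform(-1.0, 1.0)},
    num_samples=4,
    metric="metric_score",
    mode="min",
    sync_config=sync_config,
    local_dir=None,  # results live only under the remote upload_dir
)
print(analysis.results_df.sort_values("metric_score").head())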