Example #1
    def run(
        self, model_config: Union[ModelConfig,
                                  Dict[str, ModelConfig]]) -> pd.DataFrame:
        """
        Runs the evaluation, returning the performance of the provided
        configuration(s) on every dataset.

        Args:
            model_config: A single configuration for which to obtain performances or a mapping from
                dataset names to model configurations.

        Returns:
            The metrics on individual datasets.
        """
        results = []
        for dataset in self.datasets:
            # Construct the config
            if isinstance(model_config, dict):
                config = Config(model_config[dataset.name()], dataset)
            else:
                config = Config(model_config, dataset)

            # Get the performance and append to results
            performance = self.tracker.get_performance(config)
            df = Performance.to_dataframe(
                [performance]).assign(test_dataset=dataset.name())
            results.append(df)

        return pd.concat(results).set_index("test_dataset")
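
A minimal usage sketch for the method above. The evaluator instance, its tracker, and its datasets are assumed to be constructed elsewhere; `SeasonalNaiveModelConfig` is reused from the later examples, and the remaining names are placeholders, not part of the original code.

# Usage sketch (hypothetical instance names).
single_df = evaluator.run(SeasonalNaiveModelConfig())

per_dataset_df = evaluator.run({
    dataset.name(): SeasonalNaiveModelConfig()
    for dataset in evaluator.datasets
})
# Both frames are indexed by "test_dataset" with one row of metrics per dataset.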
Example #2
    def _fit(self, X: List[Config[ModelConfig]],
             y: npt.NDArray[np.float32]) -> None:
        # For each model configuration, we store all performances, sorted by dataset
        performances = defaultdict(list)
        datasets = set()
        for xx, yy in zip(X, y):
            datasets.add(xx.dataset)
            performances[xx.model].append({
                "performance": yy,
                "dataset": xx.dataset
            })

        # Then, we assign the model performances and dataset features
        self.model_performances_ = {
            model: np.stack([
                p["performance"] for p in sorted(
                    data,
                    key=lambda x: x["dataset"].name(),  # type: ignore
                )
            ])
            for model, data in performances.items()
        }

        # We use the seasonal naive model config here since it is ignored anyway
        if self.use_dataset_features:
            self.dataset_features_ = self.config_transformer.fit_transform([
                Config(SeasonalNaiveModelConfig(), d) for d in sorted(
                    datasets, key=lambda x: x.name())  # type: ignore
            ])
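
The grouping-and-stacking logic in `_fit` can be exercised in isolation. The sketch below substitutes plain strings and floats for the real `Config`, dataset, and performance objects; the model and dataset names are made up for illustration.

from collections import defaultdict

import numpy as np

# Stand-in (model, dataset, performance) observations in arbitrary order.
observations = [
    ("deepar", "m4_hourly", 0.12),
    ("deepar", "electricity", 0.08),
    ("naive", "electricity", 0.20),
    ("naive", "m4_hourly", 0.25),
]

performances = defaultdict(list)
for model, dataset, value in observations:
    performances[model].append({"performance": value, "dataset": dataset})

# One array per model, entries ordered consistently by dataset name.
model_performances = {
    model: np.stack([
        p["performance"] for p in sorted(data, key=lambda x: x["dataset"])
    ])
    for model, data in performances.items()
}
print(model_performances["deepar"])  # [0.08 0.12] -- electricity first, then m4_hourly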
Example #3
    def _get_performance_array(
            self, X: list[Config[ModelConfig]]) -> npt.NDArray[np.float32]:
        return np.array([
            self.tracker.get_performance(
                Config(SeasonalNaiveModelConfig(), x.dataset)).ncrps.mean
            for x in X
        ])[:, None]
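
The trailing `[:, None]` turns the per-config baseline values into a column vector, presumably so it can be concatenated with other feature columns. A standalone numpy illustration:

import numpy as np

ncrps_means = np.array([0.11, 0.09, 0.30], dtype=np.float32)
column = ncrps_means[:, None]  # same as ncrps_means.reshape(-1, 1)
print(column.shape)            # (3, 1): one seasonal-naive baseline value per config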
Example #4
    def recommend(
        self,
        dataset: DatasetConfig,
        candidates: Optional[List[T]] = None,
        max_count: int = 10,
    ) -> List[Recommendation[T]]:
        """
        This method takes a dataset and a set of constraints and outputs a set of recommendations.
        The recommendations provide both the configurations of the recommended model as well as the
        expected performance.

        Args:
            dataset: The configuration of the dataset for which to recommend a model.
            candidates: A list of model configurations that are allowed to be recommended. If
                `None`, any model configuration is permitted.
            max_count: The maximum number of models to recommend.

        Returns:
            The recommendations which (approximately) satisfy the provided constraints.
        """
        model_configs = self.generator.generate(candidates)
        configs = [Config(m, dataset) for m in model_configs]
        performances = self._get_performances(configs)

        # We construct a data frame restricted to the objective metrics, inverting
        # any metrics that are to be maximized so that every objective is minimized.
        df = Performance.to_dataframe(performances)[self.objectives]

        # Then, we perform a nondominated sort
        argsort = argsort_nondominated(
            df.to_numpy(),  # type: ignore
            dim=df.columns.tolist().index(self.focus)
            if self.focus is not None
            else None,
            max_items=max_count,
        )

        # And get the recommendations
        result = []
        for choice in cast(List[int], argsort):
            config = configs[choice]
            recommendation = Recommendation(config.model, performances[choice])
            result.append(recommendation)

        return result
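
The nondominated sort is the multi-objective heart of `recommend`. As a rough sketch of the idea only (not the actual `argsort_nondominated` implementation), the first Pareto front of a metrics-to-minimize matrix can be computed as follows:

import numpy as np

def first_pareto_front(values: np.ndarray) -> list:
    """Indices of rows that no other row dominates (all metrics minimized)."""
    front = []
    for i, row in enumerate(values):
        dominated = np.any(
            np.all(values <= row, axis=1) & np.any(values < row, axis=1)
        )
        if not dominated:
            front.append(i)
    return front

metrics = np.array([[0.10, 5.0], [0.12, 3.0], [0.15, 6.0]])  # e.g. (ncrps, latency)
print(first_pareto_front(metrics))  # [0, 1] -- row 2 is dominated by row 0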
Example #5
    def dataframe(self, std: bool = True) -> pd.DataFrame:
        """
        Returns a dataframe which contains the performance metrics as columns and the
        configurations as multi-index.

        Args:
            std: Whether to include the standard deviation of performance metrics in the dataframe.
        """
        # TODO: implement this for ensembles as well
        index_df = Config.to_dataframe(
            cast(List[Config[ModelConfig]], self.configurations))
        # Reorder columns
        column_order = ["dataset"] + [
            c for c in index_df.columns.tolist() if c != "dataset"
        ]
        index = pd.MultiIndex.from_frame(index_df[column_order])
        df = Performance.to_dataframe(self.performances, std=std)
        df.index = index
        return df.sort_index()
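
A standalone pandas illustration of the indexing step above, using made-up configuration and metric values: the configuration columns become a MultiIndex with `dataset` first, while the metrics stay as columns.

import pandas as pd

index_df = pd.DataFrame({
    "model": ["deepar", "deepar", "naive"],
    "dataset": ["electricity", "m4_hourly", "electricity"],
})
metrics_df = pd.DataFrame({"ncrps": [0.08, 0.12, 0.20], "latency": [3.1, 2.9, 0.2]})

column_order = ["dataset"] + [c for c in index_df.columns.tolist() if c != "dataset"]
metrics_df.index = pd.MultiIndex.from_frame(index_df[column_order])
print(metrics_df.sort_index())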
Example #6
    def __init__(self, directory: Path):
        """
        Args:
            directory: The directory from which to load the pickle files (non-recursively).
        """
        configurations = []
        performances = []
        for file in os.listdir(directory):
            if not file.endswith(".pickle"):
                continue
            with (directory / file).open("rb") as f:
                data = pickle.load(f)
                configurations.extend([
                    Config(frozenset(x["configurations"]), x["dataset"])
                    for x in data
                ])
                performances.extend([x["performance"] for x in data])

        self.performance_map: Dict[Config[EnsembleConfig], Performance] = dict(
            zip(configurations, performances))
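
The constructor implies a specific on-disk layout: each `*.pickle` file in the directory holds a list of records with `configurations`, `dataset`, and `performance` keys. A hedged sketch of producing such a file; the record values and the file name are placeholders, not part of the original code:

import pickle
from pathlib import Path

records = [
    {
        "configurations": ensemble_member_configs,  # placeholder: iterable of ModelConfig
        "dataset": dataset_config,                  # placeholder: DatasetConfig
        "performance": performance,                 # placeholder: Performance
    },
]
with (Path("evaluations") / "ensembles.pickle").open("wb") as f:
    pickle.dump(records, f)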
Example #7
    def get_ensemble_performance(
        self,
        models: List[ModelConfig],
        dataset: DatasetConfig,
        member_performances: Optional[List[Performance]] = None,
        num_samples: int = 10,
    ) -> Performance:
        """
        Estimates the performance of a list of models on a particular dataset.
        For this, actually trained models are sampled for each configuration.

        Args:
            models: The list of models to evaluate.
            dataset: The dataset to evaluate on.
            member_performances: The (predicted) performances of the provided models. Used to weigh
                the ensemble members. If not provided, uses the true performances.
            num_samples: The number of samples for estimating the performance.

        Returns:
            The expected performance of the ensemble.
        """
        if member_performances is None:
            member_performances = [
                self.tracker.get_performance(Config(m, dataset))
                for m in models
            ]

        # First, we need to get the forecasts for all models
        forecasts = [
            self.tracker.get_forecasts(Config(m, dataset)) for m in models
        ]

        # Then, we construct min(#available_choices, num_samples) different ensembles by randomly
        # sampling one trained model per configuration, without replacement across combinations.
        max_choices = np.prod([len(f) for f in forecasts])
        num_choices = min(max_choices, num_samples)
        pool = itertools.product(*[range(len(f)) for f in forecasts])
        model_combinations = random.sample(list(pool), k=num_choices)

        # Then, we evaluate each of the ensembles
        evaluations = []
        for combination in model_combinations:
            ensembled_forecast = ensemble_forecasts(
                [f[i] for i, f in zip(combination, forecasts)],
                self.ensemble_weighting,
                [p.ncrps.mean for p in member_performances],
            )
            evaluation = evaluate_forecasts(
                ensembled_forecast, dataset.data.test().evaluation()
            )
            evaluations.append(evaluation)

        # And eventually, we build the resulting performance object
        performance = Evaluation.performance(evaluations)
        performance.num_gradient_updates = self._combine_metrics(
            member_performances, lambda p: p.num_gradient_updates
        )
        performance.num_model_parameters = self._combine_metrics(
            member_performances, lambda p: p.num_model_parameters
        )
        performance.latency = self._combine_metrics(
            member_performances, lambda p: p.latency
        )
        performance.training_time = self._combine_metrics(
            member_performances, lambda p: p.training_time
        )
        return performance
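
The sampling step in the middle draws distinct index combinations, one trained model per configuration. In isolation, assuming three trained instances of one model and two of another:

import itertools
import math
import random

forecasts = [["m1-seed0", "m1-seed1", "m1-seed2"], ["m2-seed0", "m2-seed1"]]

max_choices = math.prod(len(f) for f in forecasts)    # 3 * 2 = 6 possible ensembles
num_choices = min(max_choices, 10)

pool = itertools.product(*[range(len(f)) for f in forecasts])
combinations = random.sample(list(pool), k=num_choices)
print(combinations)  # e.g. [(2, 0), (0, 1), ...] -- one trained instance per model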
Example #8
def extract_job_infos(
    training_jobs: List[Job],
    validation_metric: Optional[ValidationMetric],
    group_seeds: bool,
    data_path: Union[str, Path] = DEFAULT_DATA_PATH,
) -> List[JobInfo]:
    """
    Returns a list of the job information objects available for all training
    jobs provided.
    """
    # We group the jobs by hyperparameters, excluding the seed
    if group_seeds:
        grouped_jobs = defaultdict(list)
        for job in training_jobs:
            hypers = {
                "model": job.model,
                "dataset": job.dataset,
                **job.hyperparameters,
            }
            grouped_jobs[tuple(sorted(hypers.items()))].append(job)
        all_jobs = grouped_jobs.values()
    else:
        all_jobs = [[job] for job in training_jobs]

    # Then, we can instantiate the info objects by iterating over groups of jobs
    runs = []
    for jobs in tqdm(all_jobs):
        ref_job = jobs[0]
        model_name = ref_job.model
        base_hyperparams = {**ref_job.hyperparameters}

        # First, we reconstruct the training times
        if issubclass(MODEL_REGISTRY[model_name], TrainConfig):
            training_fractions = [1 / 81, 1 / 27] + [
                i / 9 for i in range(1, 10)
            ]
        else:
            training_fractions = [0]

        assert all(
            len(job.metrics) == len(training_fractions) for job in jobs
        ), "Job does not provide sufficiently many models."

        # Then, we iterate over the Hyperband training times
        if len(training_fractions) == 1:
            training_fraction_indices = [0]
        else:
            training_fraction_indices = [0, 1, 2, 4, 10]

        # Then, we iterate over all training times, construct the hyperparameters, and collect
        # the performance metrics
        for i in training_fraction_indices:
            # Create the config object
            hyperparams = {
                **base_hyperparams,
                "training_fraction": training_fractions[i],
            }
            model_config = get_model_config(model_name, **hyperparams)
            config = Config(
                model_config, get_dataset_config(ref_job.dataset, data_path)
            )

            # Get the indices of the models that should be used to derive the performance
            if validation_metric is None or len(training_fractions) == 1:
                # If the model does not require training, or we don't look at the validation
                # performance, we just choose the current index
                choices = [i] * len(jobs)
            else:
                # Otherwise, we get the minimum value for the metric up to this point in time
                choices = [
                    np.argmin(
                        [
                            p["evaluation"][validation_metric]
                            for p in job.metrics
                        ][: i + 1]
                    ).item()
                    for job in jobs
                ]

            # Get the performances of the chosen models
            performances = [
                job.performances[choice] for choice, job in zip(choices, jobs)
            ]

            # And average the performance
            averaged_performance = Performance(
                **{
                    metric: Metric(
                        np.mean(
                            [getattr(p, metric).mean for p in performances]
                        ),
                        np.std(
                            [getattr(p, metric).mean for p in performances]
                        ),
                    )
                    for metric in Performance.metrics()
                }
            )

            # Get validation scores if available
            try:
                val_ncrps = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_ncrps"]
                        for (job, c) in zip(jobs, choices)
                    ]
                )
                val_loss = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_loss"]
                        for (job, c) in zip(jobs, choices)
                    ]
                ).item()
                val_scores = ValidationScores(val_ncrps, val_loss)
            except KeyError:
                val_scores = None

            # Initialize the info object
            runs.append(
                JobInfo(
                    config, averaged_performance, val_scores, jobs, choices
                )
            )

    return runs
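
The checkpoint selection inside the loop picks, per job, the index of the best validation metric observed up to Hyperband step `i`. A standalone illustration of that prefix-argmin pattern with made-up metric values:

import numpy as np

val_metric_per_checkpoint = [0.30, 0.22, 0.25, 0.19, 0.21]

for i in [0, 1, 2, 4]:
    best = int(np.argmin(val_metric_per_checkpoint[: i + 1]))
    print(f"up to checkpoint {i}: use checkpoint {best}")
# up to checkpoint 0: use checkpoint 0
# up to checkpoint 1: use checkpoint 1
# up to checkpoint 2: use checkpoint 1
# up to checkpoint 4: use checkpoint 3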