def run(
    self, model_config: Union[ModelConfig, Dict[str, ModelConfig]]
) -> pd.DataFrame:
    """
    Runs the evaluation, providing a configuration's performances for all datasets.

    Args:
        model_config: A single configuration for which to obtain performances or a mapping
            from dataset names to model configurations.

    Returns:
        The metrics on individual datasets.
    """
    results = []
    for dataset in self.datasets:
        # Construct the config
        if isinstance(model_config, dict):
            config = Config(model_config[dataset.name()], dataset)
        else:
            config = Config(model_config, dataset)

        # Get the performance and append to results
        performance = self.tracker.get_performance(config)
        df = Performance.to_dataframe([performance]).assign(
            test_dataset=dataset.name()
        )
        results.append(df)

    return pd.concat(results).set_index("test_dataset")
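# Usage sketch (illustrative only): `evaluator` stands for an instance of the surrounding
# class, and `config_a` / `config_b` for arbitrary `ModelConfig` instances; these names and
# the dataset keys are assumptions, not part of this module.
#
#     df = evaluator.run(config_a)                    # same config for every dataset
#     df = evaluator.run({"m4_hourly": config_a,      # per-dataset configs, keyed by
#                         "electricity": config_b})   # `dataset.name()`
#     df.loc["m4_hourly"]                             # metrics for a single test dataset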
def _fit(self, X: List[Config[ModelConfig]], y: npt.NDArray[np.float32]) -> None: # For each model configuration, we store all performances, sorted by dataset performances = defaultdict(list) datasets = set() for xx, yy in zip(X, y): datasets.add(xx.dataset) performances[xx.model].append({ "performance": yy, "dataset": xx.dataset }) # Then, we assign the model performances and dataset features self.model_performances_ = { model: np.stack([ p["performance"] for p in sorted( data, key=lambda x: x["dataset"].name(), # type: ignore ) ]) for model, data in performances.items() } # We use the seasonal naive model config here since it is ignored anyway if self.use_dataset_features: self.dataset_features_ = self.config_transformer.fit_transform([ Config(SeasonalNaiveModelConfig(), d) for d in sorted( datasets, key=lambda x: x.name()) # type: ignore ])
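# Shape sketch (illustrative only): after `_fit`, `self.model_performances_` maps each model
# configuration to the stacked per-dataset performance vectors, one row per dataset, with
# rows sorted by `dataset.name()` so that they are aligned across models. The layout below
# is an assumption about how `y` is provided, not a guarantee of this module.
#
#     self.model_performances_[some_model_config]
#     # array([[...],    # first dataset (alphabetically by name)
#     #        [...],    # second dataset
#     #        ...])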
def _get_performance_array(
    self, X: List[Config[ModelConfig]]
) -> npt.NDArray[np.float32]:
    return np.array(
        [
            self.tracker.get_performance(
                Config(SeasonalNaiveModelConfig(), x.dataset)
            ).ncrps.mean
            for x in X
        ]
    )[:, None]
def recommend(
    self,
    dataset: DatasetConfig,
    candidates: Optional[List[T]] = None,
    max_count: int = 10,
) -> List[Recommendation[T]]:
    """
    Takes a dataset and a set of constraints and outputs a set of recommendations. Each
    recommendation provides the configuration of the recommended model as well as its
    expected performance.

    Args:
        dataset: The configuration of the dataset for which to recommend a model.
        candidates: A list of model configurations that are allowed to be recommended. If
            `None`, any model configuration is permitted.
        max_count: The maximum number of models to recommend.

    Returns:
        The recommendations which (approximately) satisfy the provided constraints.
    """
    model_configs = self.generator.generate(candidates)
    configs = [Config(m, dataset) for m in model_configs]
    performances = self._get_performances(configs)

    # We construct a data frame, extracting the performance metrics to minimize. Metrics
    # to maximize are inverted so that lower is always better.
    df = Performance.to_dataframe(performances)[self.objectives]

    # Then, we perform a nondominated sort
    argsort = argsort_nondominated(
        df.to_numpy(),  # type: ignore
        dim=df.columns.tolist().index(self.focus)
        if self.focus is not None
        else None,
        max_items=max_count,
    )

    # And get the recommendations
    result = []
    for choice in cast(List[int], argsort):
        config = configs[choice]
        recommendation = Recommendation(config.model, performances[choice])
        result.append(recommendation)

    return result
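# Usage sketch (illustrative only): `recommender` and `dataset_config` are assumed names for
# an instance of the surrounding class and a `DatasetConfig`, respectively; the attribute
# names on the result are assumptions about `Recommendation`.
#
#     recommendations = recommender.recommend(dataset_config, max_count=5)
#     for rec in recommendations:
#         print(rec.config, rec.performance)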
def dataframe(self, std: bool = True) -> pd.DataFrame:
    """
    Returns a dataframe which contains the performance metrics as columns and the
    configurations as multi-index.

    Args:
        std: Whether to include the standard deviation of performance metrics in the
            dataframe.
    """
    # Should implement this for ensembles as well
    index_df = Config.to_dataframe(
        cast(List[Config[ModelConfig]], self.configurations)
    )

    # Reorder columns
    column_order = ["dataset"] + [
        c for c in index_df.columns.tolist() if c != "dataset"
    ]
    index = pd.MultiIndex.from_frame(index_df[column_order])

    df = Performance.to_dataframe(self.performances, std=std)
    df.index = index
    return df.sort_index()
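# Output sketch (illustrative only): the resulting frame is indexed by the configuration
# columns with the dataset level first, and has one column per performance metric (plus
# standard deviations when `std=True`). Exact column names depend on
# `Performance.to_dataframe`; `tracker` and the dataset name below are assumed.
#
#     df = tracker.dataframe(std=False)
#     df.xs("m4_hourly", level="dataset")  # all configurations evaluated on one dataset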
def __init__(self, directory: Path):
    """
    Args:
        directory: Directory from which to load the files non-recursively.
    """
    configurations = []
    performances = []
    for file in os.listdir(directory):
        if not file.endswith(".pickle"):
            continue
        # Resolve the file relative to the provided directory before opening it
        with (directory / file).open("rb") as f:
            data = pickle.load(f)
            configurations.extend(
                [
                    Config(frozenset(x["configurations"]), x["dataset"])
                    for x in data
                ]
            )
            performances.extend([x["performance"] for x in data])

    self.performance_map: Dict[Config[EnsembleConfig], Performance] = dict(
        zip(configurations, performances)
    )
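# Data layout sketch (illustrative only): each `*.pickle` file in the directory is expected
# to contain a list of dicts with at least the keys used above; the concrete value types are
# assumptions inferred from this constructor, not a documented format.
#
#     [
#         {
#             "configurations": [...],  # ensemble member model configurations
#             "dataset": ...,           # a dataset configuration
#             "performance": ...,       # a `Performance` object
#         },
#         ...
#     ]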
def get_ensemble_performance(
    self,
    models: List[ModelConfig],
    dataset: DatasetConfig,
    member_performances: Optional[List[Performance]] = None,
    num_samples: int = 10,
) -> Performance:
    """
    Estimates the performance of a list of models on a particular dataset. For this,
    actually trained models are sampled for each configuration.

    Args:
        models: The list of models to evaluate.
        dataset: The dataset to evaluate on.
        member_performances: The (predicted) performances of the provided models. Used to
            weigh the ensemble members. If not provided, uses the true performances.
        num_samples: The number of samples for estimating the performance.

    Returns:
        The expected performance of the ensemble.
    """
    if member_performances is None:
        member_performances = [
            self.tracker.get_performance(Config(m, dataset)) for m in models
        ]

    # First, we need to get the forecasts for all models
    forecasts = [
        self.tracker.get_forecasts(Config(m, dataset)) for m in models
    ]

    # Then, we want to construct min(#available_choices, num_samples) different ensembles
    # by randomly choosing models from the configurations without replacement.
    max_choices = np.prod([len(f) for f in forecasts])
    num_choices = min(max_choices, num_samples)
    pool = itertools.product(*[range(len(f)) for f in forecasts])
    model_combinations = random.sample(list(pool), k=num_choices)

    # Then, we evaluate each of the ensembles
    evaluations = []
    for combination in model_combinations:
        ensembled_forecast = ensemble_forecasts(
            [f[i] for i, f in zip(combination, forecasts)],
            self.ensemble_weighting,
            [p.ncrps.mean for p in member_performances],
        )
        evaluation = evaluate_forecasts(
            ensembled_forecast, dataset.data.test().evaluation()
        )
        evaluations.append(evaluation)

    # And eventually, we build the resulting performance object
    performance = Evaluation.performance(evaluations)
    performance.num_gradient_updates = self._combine_metrics(
        member_performances, lambda p: p.num_gradient_updates
    )
    performance.num_model_parameters = self._combine_metrics(
        member_performances, lambda p: p.num_model_parameters
    )
    performance.latency = self._combine_metrics(
        member_performances, lambda p: p.latency
    )
    performance.training_time = self._combine_metrics(
        member_performances, lambda p: p.training_time
    )
    return performance
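# Usage sketch (illustrative only): `evaluator`, `model_configs`, and `dataset_config` are
# assumed names; when `member_performances` is omitted, the true performances from the
# tracker are used to weigh the ensemble members.
#
#     perf = evaluator.get_ensemble_performance(
#         model_configs, dataset_config, num_samples=5
#     )
#     perf.ncrps.mean  # expected nCRPS across the sampled ensembles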
def extract_job_infos(
    training_jobs: List[Job],
    validation_metric: Optional[ValidationMetric],
    group_seeds: bool,
    data_path: Union[str, Path] = DEFAULT_DATA_PATH,
) -> List[JobInfo]:
    """
    Returns a list of the job information objects available for all training jobs provided.
    """
    # We group the jobs by hyperparameters, excluding the seed
    if group_seeds:
        grouped_jobs = defaultdict(list)
        for job in training_jobs:
            hypers = {
                "model": job.model,
                "dataset": job.dataset,
                **job.hyperparameters,
            }
            grouped_jobs[tuple(sorted(hypers.items()))].append(job)
        all_jobs = grouped_jobs.values()
    else:
        all_jobs = [[job] for job in training_jobs]

    # Then, we can instantiate the info objects by iterating over groups of jobs
    runs = []
    for jobs in tqdm(all_jobs):
        ref_job = jobs[0]
        model_name = ref_job.model
        base_hyperparams = {**ref_job.hyperparameters}

        # First, we reconstruct the training fractions
        if issubclass(MODEL_REGISTRY[model_name], TrainConfig):
            training_fractions = [1 / 81, 1 / 27] + [
                i / 9 for i in range(1, 10)
            ]
        else:
            training_fractions = [0]

        assert all(
            len(job.metrics) == len(training_fractions) for job in jobs
        ), "Job does not provide sufficiently many models."

        # Then, we select the Hyperband training fractions to iterate over
        if len(training_fractions) == 1:
            training_fraction_indices = [0]
        else:
            training_fraction_indices = [0, 1, 2, 4, 10]

        # Then, we iterate over all training times, construct the hyperparameters and
        # collect the performance metrics
        for i in training_fraction_indices:
            # Create the config object
            hyperparams = {
                **base_hyperparams,
                "training_fraction": training_fractions[i],
            }
            model_config = get_model_config(model_name, **hyperparams)
            config = Config(
                model_config, get_dataset_config(ref_job.dataset, data_path)
            )

            # Get the indices of the models that should be used to derive the performance
            if validation_metric is None or len(training_fractions) == 1:
                # If the model does not require training, or we don't look at the
                # validation performance, we just choose the current index
                choices = [i] * len(jobs)
            else:
                # Otherwise, we get the minimum value for the metric up to this point
                # in time
                choices = [
                    np.argmin(
                        [
                            p["evaluation"][validation_metric]
                            for p in job.metrics
                        ][: i + 1]
                    ).item()
                    for job in jobs
                ]

            # Get the performances of the chosen models
            performances = [
                job.performances[choice]
                for choice, job in zip(choices, jobs)
            ]

            # And average the performance
            averaged_performance = Performance(
                **{
                    metric: Metric(
                        np.mean(
                            [getattr(p, metric).mean for p in performances]
                        ),
                        np.std(
                            [getattr(p, metric).mean for p in performances]
                        ),
                    )
                    for metric in Performance.metrics()
                }
            )

            # Get validation scores if available
            try:
                val_ncrps = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_ncrps"]
                        for (job, c) in zip(jobs, choices)
                    ]
                )
                val_loss = np.mean(
                    [
                        job.metrics[c]["evaluation"]["val_loss"]
                        for (job, c) in zip(jobs, choices)
                    ]
                ).item()
                val_scores = ValidationScores(val_ncrps, val_loss)
            except KeyError:
                val_scores = None

            # Initialize the info object
            runs.append(
                JobInfo(
                    config, averaged_performance, val_scores, jobs, choices
                )
            )

    return runs
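# Usage sketch (illustrative only): `jobs` is an assumed list of `Job` objects; with
# `group_seeds=True`, jobs that share all hyperparameters except the seed are averaged into
# a single info object. The attribute names on `JobInfo` below are assumptions.
#
#     infos = extract_job_infos(jobs, validation_metric=None, group_seeds=True)
#     infos[0].config, infos[0].performance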