class RunHistory(object):
    """Container for target algorithm run information.

    **Note:** Guaranteed to be picklable.

    Attributes
    ----------
    data : collections.OrderedDict()
        TODO
    config_ids : dict
        Maps config -> id
    ids_config : dict
        Maps id -> config
    cost_per_config : dict
        Maps config_id -> cost
    runs_per_config : dict
        Maps config_id -> number of runs
    aggregate_func
    overwrite_existing_runs
    """

    def __init__(self, aggregate_func: typing.Callable,
                 overwrite_existing_runs: bool = False) -> None:
        """Constructor

        Parameters
        ----------
        aggregate_func: callable
            function to aggregate perf across instances
        overwrite_existing_runs: bool
            allows to overwrites old results if pairs of
            algorithm-instance-seed were measured multiple times
        """
        self.logger = PickableLoggerAdapter(self.__module__ + "." +
                                            self.__class__.__name__)

        # By having the data in a deterministic order we can do useful tests
        # when we serialize the data and can assume it's still in the same
        # order as it was added.
        self.data = collections.OrderedDict()  # type: typing.Dict[RunKey, RunValue]

        # for fast access, we have also an unordered data structure
        # to get all instance seed pairs of a configuration
        self._configid_to_inst_seed = {}  # type: typing.Dict[int, InstSeedKey]

        self.config_ids = {}  # type: typing.Dict[Configuration, int]
        self.ids_config = {}  # type: typing.Dict[int, Configuration]
        self._n_id = 0

        # Stores cost for each configuration ID
        self.cost_per_config = {}  # type: typing.Dict[int, float]
        # runs_per_config maps the configuration ID to the number of runs for
        # that configuration and is necessary for computing the moving average
        self.runs_per_config = {}  # type: typing.Dict[int, int]

        # Store whether a datapoint is "external", which means it was read
        # from a JSON file. Can be chosen to not be written to disk
        self.external = {}  # type: typing.Dict[RunKey, DataOrigin]

        self.aggregate_func = aggregate_func
        self.overwrite_existing_runs = overwrite_existing_runs

    def add(self, config: Configuration, cost: float, time: float,
            status: StatusType,
            instance_id: typing.Optional[str] = None,
            seed: typing.Optional[int] = None,
            additional_info: typing.Optional[dict] = None,
            origin: DataOrigin = DataOrigin.INTERNAL):
        """Adds a data of a new target algorithm (TA) run;
        it will update data if the same key values are used
        (config, instance_id, seed)

        Parameters
        ----------
        config : dict (or other type -- depending on config space module)
            Parameter configuration
        cost: float
            Cost of TA run (will be minimized)
        time: float
            Runtime of TA run
        status: str
            Status in {SUCCESS, TIMEOUT, CRASHED, ABORT, MEMOUT}
        instance_id: str
            String representing an instance (default: None)
        seed: int
            Random seed used by TA (default: None)
        additional_info: dict
            Additional run infos (could include further returned
            information from TA or fields such as start time and host_id)
        origin: DataOrigin
            Defines how data will be used.
        """
        config_id = self.config_ids.get(config)
        if config_id is None:
            # New configuration: assign the next free id. Use the freshly
            # assigned id directly instead of a second redundant dict lookup.
            self._n_id += 1
            self.config_ids[config] = self._n_id
            config_id = self._n_id
            self.ids_config[self._n_id] = config

        k = RunKey(config_id, instance_id, seed)
        v = RunValue(cost, time, status, additional_info)

        # Each runkey is supposed to be used only once. Repeated tries to add
        # the same runkey will be ignored silently if not capped.
        if self.overwrite_existing_runs or self.data.get(k) is None:
            self._add(k, v, status, origin)
        elif status != StatusType.CAPPED and \
                self.data[k].status == StatusType.CAPPED:
            # overwrite capped runs with uncapped runs
            self._add(k, v, status, origin)
        elif status == StatusType.CAPPED and \
                self.data[k].status == StatusType.CAPPED and \
                cost > self.data[k].cost:
            # overwrite if censored with a larger cutoff
            self._add(k, v, status, origin)

    def _add(self, k: RunKey, v: RunValue, status: StatusType,
             origin: DataOrigin):
        """Actual function to add new entry to data structures

        TODO
        """
        self.data[k] = v
        self.external[k] = origin

        if origin in (DataOrigin.INTERNAL, DataOrigin.EXTERNAL_SAME_INSTANCES) \
                and status != StatusType.CAPPED:
            # also add to fast data structure
            is_k = InstSeedKey(k.instance_id, k.seed)
            inst_seeds = self._configid_to_inst_seed.setdefault(k.config_id, [])
            if is_k not in inst_seeds:
                inst_seeds.append(is_k)

            if not self.overwrite_existing_runs:
                # assumes an average across runs as cost function aggregation
                self.incremental_update_cost(self.ids_config[k.config_id],
                                             v.cost)
            else:
                self.update_cost(config=self.ids_config[k.config_id])

    def update_cost(self, config: Configuration):
        """Store the performance of a configuration across the instances in
        self.cost_perf_config and also updates self.runs_per_config;

        uses self.aggregate_func

        Parameters
        ----------
        config: Configuration
            configuration to update cost based on all runs in runhistory
        """
        inst_seeds = set(self.get_runs_for_config(config))
        perf = self.aggregate_func(config, self, inst_seeds)
        config_id = self.config_ids[config]
        self.cost_per_config[config_id] = perf
        self.runs_per_config[config_id] = len(inst_seeds)

    def compute_all_costs(self, instances: typing.Optional[typing.List[str]] = None):
        """Computes the cost of all configurations from scratch and overwrites
        self.cost_perf_config and self.runs_per_config accordingly;

        Parameters
        ----------
        instances: typing.List[str]
            list of instances; if given, cost is only computed
            wrt to this instance set
        """
        self.cost_per_config = {}
        self.runs_per_config = {}
        for config, config_id in self.config_ids.items():
            inst_seeds = set(self.get_runs_for_config(config))
            if instances is not None:
                # restrict to runs on the requested instance subset
                inst_seeds = [i for i in inst_seeds if i.instance in instances]

            if inst_seeds:
                # can be empty if never saw any runs on <instances>
                perf = self.aggregate_func(config, self, inst_seeds)
                self.cost_per_config[config_id] = perf
                self.runs_per_config[config_id] = len(inst_seeds)

    def incremental_update_cost(self, config: Configuration, cost: float):
        """Incrementally updates the performance of a configuration by using a
        moving average;

        Parameters
        ----------
        config: Configuration
            configuration to update cost based on all runs in runhistory
        cost: float
            cost of new run of config
        """
        config_id = self.config_ids[config]
        n_runs = self.runs_per_config.get(config_id, 0)
        old_cost = self.cost_per_config.get(config_id, 0.)
        self.cost_per_config[config_id] = \
            ((old_cost * n_runs) + cost) / (n_runs + 1)
        self.runs_per_config[config_id] = n_runs + 1

    def get_cost(self, config: Configuration):
        """Returns empirical cost for a configuration; uses
        self.cost_per_config

        Parameters
        ----------
        config: Configuration

        Returns
        -------
        cost: float
            Computed cost for configuration
        """
        # Use .get() so that a configuration the runhistory has never seen
        # returns np.nan instead of raising KeyError (the np.nan fallback
        # below was otherwise unreachable for unknown configs).
        config_id = self.config_ids.get(config)
        return self.cost_per_config.get(config_id, np.nan)

    def get_runs_for_config(self, config: Configuration):
        """Return all runs (instance seed pairs) for a configuration.

        Parameters
        ----------
        config : Configuration from ConfigSpace
            Parameter configuration

        Returns
        -------
        instance_seed_pairs : list<tuples of instance, seed>
        """
        config_id = self.config_ids.get(config)
        return self._configid_to_inst_seed.get(config_id, [])

    def get_instance_costs_for_config(self, config: Configuration):
        """Returns the average cost per instance (across seeds)
        for a configuration

        Parameters
        ----------
        config : Configuration from ConfigSpace
            Parameter configuration

        Returns
        -------
        cost_per_inst: dict<instance name<str>, cost<float>>
        """
        config_id = self.config_ids.get(config)
        runs_ = self._configid_to_inst_seed.get(config_id, [])
        # collect all observed costs per instance, then average across seeds
        cost_per_inst = {}  # type: typing.Dict[str, typing.List[float]]
        for inst, seed in runs_:
            rkey = RunKey(config_id, inst, seed)
            vkey = self.data[rkey]
            cost_per_inst.setdefault(inst, []).append(vkey.cost)
        return {inst: np.mean(costs) for inst, costs in cost_per_inst.items()}

    def get_all_configs(self):
        """Return all configurations in this RunHistory object

        Returns
        -------
        parameter configurations: list
        """
        return list(self.config_ids.keys())

    def empty(self):
        """Check whether or not the RunHistory is empty.

        Returns
        -------
        emptiness: bool
            True if runs have been added to the RunHistory, False otherwise
        """
        return len(self.data) == 0

    def save_json(self, fn: str = "runhistory.json",
                  save_external: bool = False):
        """ saves runhistory on disk

        Parameters
        ----------
        fn : str
            file name
        save_external : bool
            Whether to save external data in the runhistory file.
        """
        # NOTE(review): silently rewrites the target filename (":" -> "-"),
        # presumably for filesystem compatibility -- confirm this is intended.
        fn = fn.replace(":", "-")
        data = [([int(k.config_id),
                  str(k.instance_id) if k.instance_id is not None else None,
                  int(k.seed)], list(v))
                for k, v in self.data.items()
                if save_external or self.external[k] == DataOrigin.INTERNAL]
        config_ids_to_serialize = {entry[0][0] for entry in data}
        configs = {id_: conf.get_dictionary()
                   for id_, conf in self.ids_config.items()
                   if id_ in config_ids_to_serialize}
        config_origins = {id_: conf.origin
                          for id_, conf in self.ids_config.items()
                          if (id_ in config_ids_to_serialize
                              and conf.origin is not None)}

        with open(fn, "w") as fp:
            json.dump({"data": data,
                       "config_origins": config_origins,
                       "configs": configs},
                      fp, cls=EnumEncoder, indent=2)

    def load_json(self, fn: str, cs: ConfigurationSpace):
        """Load a runhistory in json representation from disk.

        Overwrites current runhistory!

        Parameters
        ----------
        fn : str
            file name to load from
        cs : ConfigSpace
            instance of configuration space
        """
        try:
            with open(fn) as fp:
                all_data = json.load(fp, object_hook=StatusType.enum_hook)
        except Exception as e:
            # best-effort load: warn and keep the current (empty) history
            self.logger.warning(
                'Encountered exception %s while reading runhistory from %s. '
                'Not adding any runs!', e, fn,
            )
            return

        config_origins = all_data.get("config_origins", {})

        self.ids_config = {
            int(id_): Configuration(cs, values=values,
                                    origin=config_origins.get(id_, None))
            for id_, values in all_data["configs"].items()
        }

        self.config_ids = {config: id_
                           for id_, config in self.ids_config.items()}
        self._n_id = len(self.config_ids)

        # important to use add method to use all data structure correctly
        for k, v in all_data["data"]:
            self.add(config=self.ids_config[int(k[0])],
                     cost=float(v[0]),
                     time=float(v[1]),
                     status=StatusType(v[2]),
                     instance_id=k[1],
                     seed=int(k[2]),
                     additional_info=v[3])

    def update_from_json(
            self, fn: str, cs: ConfigurationSpace,
            origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES):
        """Update the current runhistory by adding new runs from a json file.

        Parameters
        ----------
        fn : str
            File name to load from.
        cs : ConfigSpace
            Instance of configuration space.
        origin : DataOrigin
            What to store as data origin.
        """
        new_runhistory = RunHistory(self.aggregate_func)
        new_runhistory.load_json(fn, cs)
        self.update(runhistory=new_runhistory, origin=origin)

    def update(self, runhistory: 'RunHistory',
               origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES):
        """Update the current runhistory by adding new runs from a RunHistory.

        Parameters
        ----------
        runhistory: RunHistory
            Runhistory with additional data to be added to self
        origin: DataOrigin
            If set to ``INTERNAL`` or ``EXTERNAL_FULL`` the data will be
            added to the internal data structure self._configid_to_inst_seed
            and be available :meth:`through get_runs_for_config`.
        """
        # Configurations might be already known, but by a different ID. This
        # does not matter here because the add() method handles this
        # correctly by assigning an ID to unknown configurations and re-using
        # the ID
        for key, value in runhistory.data.items():
            config_id, instance_id, seed = key
            cost, time, status, additional_info = value
            config = runhistory.ids_config[config_id]
            self.add(config=config, cost=cost, time=time,
                     status=status, instance_id=instance_id,
                     seed=seed, additional_info=additional_info,
                     origin=origin)
class BaseRunner(ABC):
    """Interface class to handle the execution of SMAC' configurations.

    This interface defines how to interact with the SMBO loop.
    The complexity of running a configuration as well as handling the
    results is abstracted to the SMBO via a BaseRunner.

    From SMBO perspective, launching a configuration follows a
    submit/collect scheme as follows:
    1- A run is launched via submit_run()
    1.1- Submit_run internally calls run_wrapper(), a method that
    contains common processing functions among different runners,
    for example, handling capping and stats checking.
    1.2- A class that implements BaseRunner defines run() which is
    really the algorithm to translate a RunInfo to a RunValue, i.e.
    a configuration to an actual result.
    2- A completed run is collected via get_finished_runs(), which
    returns any finished runs, if any.
    3- This interface also offers the method wait() as a mechanism
    to make sure we have enough data in the next iteration to make
    a decision. For example, the intensifier might not be able to
    select the next challenger until more results are available.

    Attributes
    ----------
    results
    ta
    stats
    run_obj
    par_factor
    cost_for_crash
    abort_first_run_crash

    Parameters
    ----------
    ta : typing.Union[typing.List[str], typing.Callable]
        target algorithm
    stats: Stats
        stats object to collect statistics about runtime/additional info
    run_obj: str
        run objective of SMAC
    par_factor: int
        penalization factor
    cost_for_crash : float
        cost that is used in case of crashed runs (including runs
        that returned NaN or inf)
    abort_on_first_run_crash: bool
        if true and first run crashes, raise FirstRunCrashedException
    """

    def __init__(
        self,
        ta: typing.Union[typing.List[str], typing.Callable],
        stats: Stats,
        run_obj: str = "runtime",
        par_factor: int = 1,
        cost_for_crash: float = float(MAXINT),
        abort_on_first_run_crash: bool = True,
    ):
        # The results is a FIFO structure, implemented via a list
        # (because the Queue lock is not pickable). Finished runs are
        # put in this list and collected via process_finished_runs
        self.results = []  # type: typing.List[typing.Tuple[RunInfo, RunValue]]

        # Below state the support for a Runner algorithm that
        # implements a ta
        self.ta = ta
        self.stats = stats
        self.run_obj = run_obj
        self.par_factor = par_factor
        self.cost_for_crash = cost_for_crash
        self.abort_on_first_run_crash = abort_on_first_run_crash
        self.logger = PickableLoggerAdapter(self.__module__ + '.' +
                                            self.__class__.__name__)
        self._supports_memory_limit = False

        super().__init__()

    @abstractmethod
    def submit_run(self, run_info: RunInfo) -> None:
        """This function submits a configuration embedded in a RunInfo
        object, and uses one of the workers to produce a result (such
        result will eventually be available on the self.results FIFO).

        This interface method will be called by SMBO, with the expectation
        that a function will be executed by a worker.

        What will be executed is dictated by run_info, and "how" will it be
        executed is decided via the child class that implements a run()
        method.

        Because config submission can be a serial/parallel endeavor,
        it is expected to be implemented by a child class.

        Parameters
        ----------
        run_info: RunInfo
            An object containing the configuration and the necessary data
            to run it
        """
        pass

    @abstractmethod
    def run(
        self,
        config: Configuration,
        instance: str,
        cutoff: typing.Optional[float] = None,
        seed: int = 12345,
        budget: typing.Optional[float] = None,
        instance_specific: str = "0",
    ) -> typing.Tuple[StatusType, float, float, typing.Dict]:
        """Runs target algorithm <self.ta> with configuration <config> on
        instance <instance> with instance specifics <specifics> for at
        most <cutoff> seconds and random seed <seed>

        This method exemplifies how to define the run() method

        Parameters
        ----------
        config : Configuration
            dictionary param -> value
        instance : string
            problem instance
        cutoff : float, optional
            Wallclock time limit of the target algorithm. If no value is
            provided no limit will be enforced.
        seed : int
            random seed
        budget : float, optional
            A positive, real-valued number representing an arbitrary limit
            to the target algorithm. Handled by the target algorithm
            internally
        instance_specific: str
            instance specific information (e.g., domain file or solution)

        Returns
        -------
        status: enum of StatusType (int)
            {SUCCESS, TIMEOUT, CRASHED, ABORT}
        cost: float
            cost/regret/quality (float) (None, if not returned by TA)
        runtime: float
            runtime (None if not returned by TA)
        additional_info: dict
            all further additional run information
        """
        pass

    def run_wrapper(
        self,
        run_info: RunInfo,
    ) -> typing.Tuple[RunInfo, RunValue]:
        """Wrapper around run() to exec and check the execution of a given
        config file

        This function encapsulates common handling/processing, so that run()
        implementation is simplified.

        Parameters
        ----------
        run_info : RunInfo
            Object that contains enough information to execute a
            configuration run in isolation.

        Returns
        -------
        RunInfo:
            an object containing the configuration launched
        RunValue:
            Contains information about the status/performance of config
        """
        start = time.time()

        if run_info.cutoff is None and self.run_obj == "runtime":
            # Same text is logged and raised; keep a single copy (DRY).
            error_msg = ("For scenarios optimizing running time "
                         "(run objective), a cutoff time is required, "
                         "but not given to this call.")
            if self.logger:
                self.logger.critical(error_msg)
            raise ValueError(error_msg)

        cutoff = None
        if run_info.cutoff is not None:
            cutoff = int(math.ceil(run_info.cutoff))

        try:
            status, cost, runtime, additional_info = self.run(
                config=run_info.config,
                instance=run_info.instance,
                cutoff=cutoff,
                seed=run_info.seed,
                budget=run_info.budget,
                instance_specific=run_info.instance_specific)
        except Exception as e:
            status = StatusType.CRASHED
            cost = self.cost_for_crash
            runtime = time.time() - start

            # Add context information to the error message
            exception_traceback = traceback.format_exc()
            error_message = repr(e)
            additional_info = {
                'traceback': exception_traceback,
                'error': error_message
            }

        end = time.time()

        if run_info.budget == 0 and status == StatusType.DONOTADVANCE:
            raise ValueError(
                "Cannot handle DONOTADVANCE state when using intensify or SH/HB on "
                "instances.")

        # Catch NaN or inf.
        if (self.run_obj == 'runtime' and not np.isfinite(runtime)
                or self.run_obj == 'quality' and not np.isfinite(cost)):
            if self.logger:
                self.logger.warning("Target Algorithm returned NaN or inf as {}. "
                                    "Algorithm run is treated as CRASHED, cost "
                                    "is set to {} for quality scenarios. "
                                    "(Change value through \"cost_for_crash\""
                                    "-option.)".format(self.run_obj,
                                                       self.cost_for_crash))
            status = StatusType.CRASHED

        if self.run_obj == "runtime":
            # The following line pleases mypy - we already check for cutoff
            # not being none above, prior to calling run. However, mypy
            # assumes that the data type of cutoff is still Optional[int]
            assert cutoff is not None
            if runtime > self.par_factor * cutoff:
                self.logger.warning("Returned running time is larger "
                                    "than {0} times the passed cutoff time. "
                                    "Clamping to {0} x cutoff.".format(
                                        self.par_factor))
                runtime = cutoff * self.par_factor
                status = StatusType.TIMEOUT
            if status == StatusType.SUCCESS:
                cost = runtime
            else:
                cost = cutoff * self.par_factor
            if status == StatusType.TIMEOUT and run_info.capped:
                status = StatusType.CAPPED
        else:
            if status == StatusType.CRASHED:
                cost = self.cost_for_crash

        return run_info, RunValue(status=status, cost=cost, time=runtime,
                                  additional_info=additional_info,
                                  starttime=start, endtime=end)

    @abstractmethod
    def get_finished_runs(
            self) -> typing.List[typing.Tuple[RunInfo, RunValue]]:
        """This method returns any finished configuration, and returns a list
        with the results of exercising the configurations. This class keeps
        populating results to self.results until a call to get_finished runs
        is done. In this case, the self.results list is emptied and all
        RunValues produced by running run() are returned.

        Returns
        -------
        List[RunInfo, RunValue]:
            A list of pairs RunInfo/RunValues a submitted configuration
        """
        raise NotImplementedError()

    @abstractmethod
    def wait(self) -> None:
        """SMBO/intensifier might need to wait for runs to finish before
        making a decision. This method waits until 1 run completes
        """
        pass

    @abstractmethod
    def pending_runs(self) -> bool:
        """Whether or not there are configs still running. Generally if the
        runner is serial, launching a run instantly returns it's result. On
        parallel runners, there might be pending configurations to complete.
        """
        pass

    @abstractmethod
    def num_workers(self) -> int:
        """Return the active number of workers that will execute tae runs.
        """
        pass
class RunHistory(object): """Container for target algorithm run information. Most importantly, the runhistory contains an efficient mapping from each evaluated configuration to the empirical cost observed on either the full instance set or a subset. The cost is the average over all observed costs for one configuration: * If using budgets for a single instance, only the cost on the highest observed budget is returned. * If using instances as the budget, the average cost over all evaluated instances is returned. * Theoretically, the runhistory object can handle instances and budgets at the same time. This is neither used nor tested. * Capped runs are not included in this cost. Note ---- Guaranteed to be picklable. Attributes ---------- data : collections.OrderedDict() TODO config_ids : dict Maps config -> id ids_config : dict Maps id -> config num_runs_per_config : dict Maps config_id -> number of runs Parameters ---------- overwrite_existing_runs : bool (default=True) If set to ``True`` and a run of a configuration on an instance-budget-seed-pair already exists, it is overwritten. """ def __init__(self, overwrite_existing_runs: bool = False) -> None: """Constructor Parameters ---------- overwrite_existing_runs: bool allows to overwrites old results if pairs of algorithm-instance-seed were measured multiple times """ self.logger = PickableLoggerAdapter(self.__module__ + "." + self.__class__.__name__) # By having the data in a deterministic order we can do useful tests # when we serialize the data and can assume it's still in the same # order as it was added. self.data = collections.OrderedDict( ) # type: typing.Dict[RunKey, RunValue] # for fast access, we have also an unordered data structure # to get all instance seed pairs of a configuration. # This does not include capped runs. 
self._configid_to_inst_seed_budget = { } # type: typing.Dict[int, typing.Dict[InstSeedKey, typing.List[float]]] self.config_ids = {} # type: typing.Dict[Configuration, int] self.ids_config = {} # type: typing.Dict[int, Configuration] self._n_id = 0 # Stores cost for each configuration ID self._cost_per_config = {} # type: typing.Dict[int, float] # Stores min cost across all budgets for each configuration ID self._min_cost_per_config = {} # type: typing.Dict[int, float] # runs_per_config maps the configuration ID to the number of runs for that configuration # and is necessary for computing the moving average self.num_runs_per_config = {} # type: typing.Dict[int, int] # Store whether a datapoint is "external", which means it was read from # a JSON file. Can be chosen to not be written to disk self.external = {} # type: typing.Dict[RunKey, DataOrigin] self.overwrite_existing_runs = overwrite_existing_runs def add( self, config: Configuration, cost: float, time: float, status: StatusType, instance_id: typing.Optional[str] = None, seed: typing.Optional[int] = None, budget: float = 0.0, starttime: float = 0.0, endtime: float = 0.0, additional_info: typing.Optional[typing.Dict] = None, origin: DataOrigin = DataOrigin.INTERNAL, force_update: bool = False, ) -> None: """Adds a data of a new target algorithm (TA) run; it will update data if the same key values are used (config, instance_id, seed) Parameters ---------- config : dict (or other type -- depending on config space module) Parameter configuration cost: float Cost of TA run (will be minimized) time: float Runtime of TA run status: str Status in {SUCCESS, TIMEOUT, CRASHED, ABORT, MEMOUT} instance_id: str String representing an instance (default: None) seed: int Random seed used by TA (default: None) budget: float budget (cutoff) used in intensifier to limit TA (default: 0) starttime: float starting timestamp of TA evaluation endtime: float ending timestamp of TA evaluation additional_info: dict Additional run infos 
(could include further returned information from TA or fields such as start time and host_id) origin: DataOrigin Defines how data will be used. force_update: bool (default: False) Forces the addition of a config to the history """ if config is None: raise TypeError( 'Configuration to add to the runhistory must not be None') elif not isinstance(config, Configuration): raise TypeError( 'Configuration to add to the runhistory is not of type Configuration, but %s' % type(config)) # Get the config id config_id_tmp = self.config_ids.get(config) if config_id_tmp is None: self._n_id += 1 self.config_ids[config] = self._n_id config_id = typing.cast(int, self.config_ids.get(config)) self.ids_config[self._n_id] = config else: config_id = typing.cast(int, config_id_tmp) # Construct keys and values for the data dictionary k = RunKey(config_id, instance_id, seed, budget) v = RunValue(cost, time, status, starttime, endtime, additional_info) # Each runkey is supposed to be used only once. Repeated tries to add # the same runkey will be ignored silently if not capped. 
if self.overwrite_existing_runs or force_update or self.data.get( k) is None: self._add(k, v, status, origin) elif status != StatusType.CAPPED and self.data[ k].status == StatusType.CAPPED: # overwrite capped runs with uncapped runs self._add(k, v, status, origin) elif status == StatusType.CAPPED and self.data[ k].status == StatusType.CAPPED and cost > self.data[k].cost: # overwrite if censored with a larger cutoff self._add(k, v, status, origin) def _add(self, k: RunKey, v: RunValue, status: StatusType, origin: DataOrigin) -> None: """Actual function to add new entry to data structures TODO """ self.data[k] = v self.external[k] = origin # Capped data is added above # Do not register the cost until the run has completed if origin in (DataOrigin.INTERNAL, DataOrigin.EXTERNAL_SAME_INSTANCES) \ and status not in [StatusType.CAPPED, StatusType.RUNNING]: # also add to fast data structure is_k = InstSeedKey(k.instance_id, k.seed) self._configid_to_inst_seed_budget[ k.config_id] = self._configid_to_inst_seed_budget.get( k.config_id, {}) if is_k not in self._configid_to_inst_seed_budget[ k.config_id].keys(): # add new inst-seed-key with budget to main dict self._configid_to_inst_seed_budget[k.config_id][is_k] = [ k.budget ] elif k.budget not in is_k: # append new budget to existing inst-seed-key dict self._configid_to_inst_seed_budget[k.config_id][is_k].append( k.budget) # if budget is used, then update cost instead of incremental updates if not self.overwrite_existing_runs and k.budget == 0: # assumes an average across runs as cost function aggregation, this is used for algorithm configuration # (incremental updates are used to save time as getting the cost for > 100 instances is high) self.incremental_update_cost(self.ids_config[k.config_id], v.cost) else: # this is when budget > 0 (only successive halving and hyperband so far) self.update_cost(config=self.ids_config[k.config_id]) if k.budget > 0: if self.num_runs_per_config[ k. 
config_id] != 1: # This is updated in update_cost raise ValueError('This should not happen!') def update_cost(self, config: Configuration) -> None: """Store the performance of a configuration across the instances in self.cost_per_config and also updates self.runs_per_config; Note ---- This method ignores capped runs. Parameters ---------- config: Configuration configuration to update cost based on all runs in runhistory """ config_id = self.config_ids[config] # removing duplicates while keeping the order inst_seed_budgets = list( dict.fromkeys( self.get_runs_for_config(config, only_max_observed_budget=True))) self._cost_per_config[config_id] = self.average_cost( config, inst_seed_budgets) self.num_runs_per_config[config_id] = len(inst_seed_budgets) all_inst_seed_budgets = list( dict.fromkeys( self.get_runs_for_config(config, only_max_observed_budget=False))) self._min_cost_per_config[config_id] = self.min_cost( config, all_inst_seed_budgets) def incremental_update_cost(self, config: Configuration, cost: float) -> None: """Incrementally updates the performance of a configuration by using a moving average; Parameters ---------- config: Configuration configuration to update cost based on all runs in runhistory cost: float cost of new run of config """ config_id = self.config_ids[config] n_runs = self.num_runs_per_config.get(config_id, 0) old_cost = self._cost_per_config.get(config_id, 0.) self._cost_per_config[config_id] = ( (old_cost * n_runs) + cost) / (n_runs + 1) self.num_runs_per_config[config_id] = n_runs + 1 def get_cost(self, config: Configuration) -> float: """Returns empirical cost for a configuration. See the class docstring for how the costs are computed. The costs are not re-computed, but are read from cache. 
Parameters ---------- config: Configuration Returns ------- cost: float Computed cost for configuration """ config_id = self.config_ids.get(config) return self._cost_per_config.get( config_id, np.nan) # type: ignore[arg-type] # noqa F821 def get_runs_for_config( self, config: Configuration, only_max_observed_budget: bool) -> typing.List[InstSeedBudgetKey]: """Return all runs (instance seed pairs) for a configuration. Note ---- This method ignores capped runs. Parameters ---------- config : Configuration from ConfigSpace Parameter configuration only_max_observed_budget : bool Select only the maximally observed budget run for this configuration Returns ------- instance_seed_budget_pairs : list<tuples of instance, seed, budget> """ config_id = self.config_ids.get(config) runs = self._configid_to_inst_seed_budget.get( config_id, {}).copy() # type: ignore[arg-type] # noqa F821 # select only the max budget run if specified if only_max_observed_budget: for k, v in runs.items(): runs[k] = [max(v)] # convert to inst-seed-budget key rval = [ InstSeedBudgetKey(k.instance, k.seed, budget) for k, v in runs.items() for budget in v ] return rval def get_all_configs(self) -> typing.List[Configuration]: """Return all configurations in this RunHistory object Returns ------- parameter configurations: list """ return list(self.config_ids.keys()) def get_all_configs_per_budget( self, budget_subset: typing.Optional[typing.List] = None, ) -> typing.List[Configuration]: """ Return all configs in this RunHistory object that have been run on one of these budgets Parameter --------- budget_subset: list Returns ------- parameter configurations: list """ if budget_subset is None: return self.get_all_configs() configs = [] for c, i, s, b in self.data.keys(): if b in budget_subset: configs.append(self.ids_config[c]) return configs def get_min_cost(self, config: Configuration) -> float: """Returns the lowest empirical cost for a configuration, across all runs (budgets) See the class docstring for 
how the costs are computed. The costs are not re-computed, but are read from cache. Parameters ---------- config: Configuration Returns ------- min_cost: float Computed cost for configuration """ config_id = self.config_ids.get(config) return self._min_cost_per_config.get( config_id, np.nan) # type: ignore[arg-type] # noqa F821 def empty(self) -> bool: """Check whether or not the RunHistory is empty. Returns ------- emptiness: bool True if runs have been added to the RunHistory, False otherwise """ return len(self.data) == 0 def save_json(self, fn: str = "runhistory.json", save_external: bool = False) -> None: """ saves runhistory on disk Parameters ---------- fn : str file name save_external : bool Whether to save external data in the runhistory file. """ data = [([ int(k.config_id), str(k.instance_id) if k.instance_id is not None else None, int(k.seed), float(k.budget) if k[3] is not None else 0 ], list(v)) for k, v in self.data.items() if save_external or self.external[k] == DataOrigin.INTERNAL] config_ids_to_serialize = set([entry[0][0] for entry in data]) configs = { id_: conf.get_dictionary() for id_, conf in self.ids_config.items() if id_ in config_ids_to_serialize } config_origins = { id_: conf.origin for id_, conf in self.ids_config.items() if (id_ in config_ids_to_serialize and conf.origin is not None) } with open(fn, "w") as fp: json.dump( { "data": data, "config_origins": config_origins, "configs": configs }, fp, cls=EnumEncoder, indent=2) def load_json(self, fn: str, cs: ConfigurationSpace) -> None: """Load and runhistory in json representation from disk. Overwrites current runhistory! Parameters ---------- fn : str file name to load from cs : ConfigSpace instance of configuration space """ try: with open(fn) as fp: all_data = json.load(fp, object_hook=StatusType.enum_hook) except Exception as e: self.logger.warning( 'Encountered exception %s while reading runhistory from %s. 
' 'Not adding any runs!', e, fn, ) return config_origins = all_data.get("config_origins", {}) self.ids_config = { int(id_): Configuration(cs, values=values, origin=config_origins.get(id_, None)) for id_, values in all_data["configs"].items() } self.config_ids = { config: id_ for id_, config in self.ids_config.items() } self._n_id = len(self.config_ids) # important to use add method to use all data structure correctly for k, v in all_data["data"]: self.add(config=self.ids_config[int(k[0])], cost=float(v[0]), time=float(v[1]), status=StatusType(v[2]), instance_id=k[1], seed=int(k[2]), budget=float(k[3]) if len(k) == 4 else 0, starttime=v[3], endtime=v[4], additional_info=v[5]) def update_from_json( self, fn: str, cs: ConfigurationSpace, origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES, ) -> None: """Update the current runhistory by adding new runs from a json file. Parameters ---------- fn : str File name to load from. cs : ConfigSpace Instance of configuration space. origin : DataOrigin What to store as data origin. """ new_runhistory = RunHistory() new_runhistory.load_json(fn, cs) self.update(runhistory=new_runhistory, origin=origin) def update( self, runhistory: 'RunHistory', origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES, ) -> None: """Update the current runhistory by adding new runs from a RunHistory. Parameters ---------- runhistory: RunHistory Runhistory with additional data to be added to self origin: DataOrigin If set to ``INTERNAL`` or ``EXTERNAL_FULL`` the data will be added to the internal data structure self._configid_to_inst_seed_budget and be available :meth:`through get_runs_for_config`. """ # Configurations might be already known, but by a different ID. 
This # does not matter here because the add() method handles this # correctly by assigning an ID to unknown configurations and re-using # the ID for key, value in runhistory.data.items(): config_id, instance_id, seed, budget = key cost, time, status, start, end, additional_info = value config = runhistory.ids_config[config_id] self.add(config=config, cost=cost, time=time, status=status, instance_id=instance_id, starttime=start, endtime=end, seed=seed, budget=budget, additional_info=additional_info, origin=origin) def _cost( self, config: Configuration, instance_seed_budget_keys: typing.Optional[ typing.Iterable[InstSeedBudgetKey]] = None, ) -> typing.List[float]: """Return array of all costs for the given config for further calculations. Parameters ---------- config : Configuration Configuration to calculate objective for instance_seed_budget_keys : list, optional (default=None) List of tuples of instance-seeds-budget keys. If None, the run_history is queried for all runs of the given configuration. Returns ------- Costs: list Array of all costs """ try: id_ = self.config_ids[config] except KeyError: # challenger was not running so far return [] if instance_seed_budget_keys is None: instance_seed_budget_keys = self.get_runs_for_config( config, only_max_observed_budget=True) costs = [] for i, r, b in instance_seed_budget_keys: k = RunKey(id_, i, r, b) costs.append(self.data[k].cost) return costs def average_cost( self, config: Configuration, instance_seed_budget_keys: typing.Optional[ typing.Iterable[InstSeedBudgetKey]] = None, ) -> float: """Return the average cost of a configuration. This is the mean of costs of all instance-seed pairs. Parameters ---------- config : Configuration Configuration to calculate objective for instance_seed_budget_keys : list, optional (default=None) List of tuples of instance-seeds-budget keys. If None, the run_history is queried for all runs of the given configuration. 
Returns ---------- Cost: float Average cost """ costs = self._cost(config, instance_seed_budget_keys) if costs: return float(np.mean(costs)) return np.nan def sum_cost( self, config: Configuration, instance_seed_budget_keys: typing.Optional[ typing.Iterable[InstSeedBudgetKey]] = None, ) -> float: """Return the sum of costs of a configuration. This is the sum of costs of all instance-seed pairs. Parameters ---------- config : Configuration Configuration to calculate objective for instance_seed_budget_keys : list, optional (default=None) List of tuples of instance-seeds-budget keys. If None, the run_history is queried for all runs of the given configuration. Returns ---------- sum_cost: float Sum of costs of config """ return float(np.sum(self._cost(config, instance_seed_budget_keys))) def min_cost( self, config: Configuration, instance_seed_budget_keys: typing.Optional[ typing.Iterable[InstSeedBudgetKey]] = None, ) -> float: """Return the minimum cost of a configuration This is the minimum cost of all instance-seed pairs. Parameters ---------- config : Configuration Configuration to calculate objective for instance_seed_budget_keys : list, optional (default=None) List of tuples of instance-seeds-budget keys. If None, the run_history is queried for all runs of the given configuration. Returns ---------- min_cost: float minimum cost of config """ costs = self._cost(config, instance_seed_budget_keys) if costs: return float(np.min(costs)) return np.nan def compute_all_costs(self, instances: typing.Optional[typing.List[str]] = None ) -> None: """Computes the cost of all configurations from scratch and overwrites self.cost_perf_config and self.runs_per_config accordingly; Note ---- This method is only used for ``merge_foreign_data`` and should be removed. 
Parameters ---------- instances: typing.List[str] list of instances; if given, cost is only computed wrt to this instance set """ self._cost_per_config = {} self.num_runs_per_config = {} for config, config_id in self.config_ids.items(): # removing duplicates while keeping the order inst_seed_budgets = list( dict.fromkeys( self.get_runs_for_config(config, only_max_observed_budget=True))) if instances is not None: inst_seed_budgets = list( filter( lambda x: x.instance in typing.cast( typing.List, instances), inst_seed_budgets)) if inst_seed_budgets: # can be empty if never saw any runs on <instances> self._cost_per_config[config_id] = self.average_cost( config, inst_seed_budgets) self._min_cost_per_config[config_id] = self.min_cost( config, inst_seed_budgets) self.num_runs_per_config[config_id] = len(inst_seed_budgets) def get_instance_costs_for_config( self, config: Configuration) -> typing.Dict[str, typing.List[float]]: """ Returns the average cost per instance (across seeds) for a configuration If the runhistory contains budgets, only the highest budget for a configuration is returned. Note ---- This is used by the pSMAC facade to determine the incumbent after the evaluation. Parameters ---------- config : Configuration from ConfigSpace Parameter configuration Returns ------- cost_per_inst: dict<instance name<str>, cost<float>> """ runs_ = self.get_runs_for_config(config, only_max_observed_budget=True) cost_per_inst = {} # type: typing.Dict[str, typing.List[float]] for inst, seed, budget in runs_: cost_per_inst[inst] = cost_per_inst.get(inst, []) rkey = RunKey(self.config_ids[config], inst, seed, budget) vkey = self.data[rkey] cost_per_inst[inst].append(vkey.cost) cost_per_inst = dict([(inst, np.mean(costs)) for inst, costs in cost_per_inst.items()]) return cost_per_inst