Ejemplo n.º 1
0
class RunHistory(object):
    """Container for target algorithm run information.

    **Note:** Guaranteed to be picklable.

    Attributes
    ----------
    data : collections.OrderedDict()
        TODO
    config_ids : dict
        Maps config -> id
    ids_config : dict
        Maps id -> config
    cost_per_config : dict
        Maps config_id -> cost
    runs_per_config : dict
        Maps config_id -> number of runs

    aggregate_func
    overwrite_existing_runs
    """
    def __init__(self,
                 aggregate_func: typing.Callable,
                 overwrite_existing_runs: bool = False) -> None:
        """Constructor

        Parameters
        ----------
        aggregate_func: callable
            function to aggregate perf across instances
        overwrite_existing_runs: bool
            allows to overwrites old results if pairs of
            algorithm-instance-seed were measured
            multiple times
        """
        self.logger = PickableLoggerAdapter(self.__module__ + "." +
                                            self.__class__.__name__)

        # By having the data in a deterministic order we can do useful tests
        # when we serialize the data and can assume it's still in the same
        # order as it was added.
        self.data = collections.OrderedDict(
        )  # type: typing.Dict[RunKey, RunValue]

        # for fast access, we have also an unordered data structure
        # to get all instance seed pairs of a configuration
        self._configid_to_inst_seed = {}  # type: typing.Dict[int, InstSeedKey]

        self.config_ids = {}  # type: typing.Dict[Configuration, int]
        self.ids_config = {}  # type: typing.Dict[int, Configuration]
        self._n_id = 0

        # Stores cost for each configuration ID
        self.cost_per_config = {}  # type: typing.Dict[int, float]
        # runs_per_config maps the configuration ID to the number of runs for that configuration
        # and is necessary for computing the moving average
        self.runs_per_config = {}  # type: typing.Dict[int, int]

        # Store whether a datapoint is "external", which means it was read from
        # a JSON file. Can be chosen to not be written to disk
        self.external = {}  # type: typing.Dict[RunKey, DataOrigin]

        self.aggregate_func = aggregate_func
        self.overwrite_existing_runs = overwrite_existing_runs

    def add(self,
            config: Configuration,
            cost: float,
            time: float,
            status: StatusType,
            instance_id: str = None,
            seed: int = None,
            additional_info: dict = None,
            origin: DataOrigin = DataOrigin.INTERNAL):
        """Adds a data of a new target algorithm (TA) run;
        it will update data if the same key values are used
        (config, instance_id, seed)

        Parameters
        ----------
            config : dict (or other type -- depending on config space module)
                Parameter configuration
            cost: float
                Cost of TA run (will be minimized)
            time: float
                Runtime of TA run
            status: str
                Status in {SUCCESS, TIMEOUT, CRASHED, ABORT, MEMOUT}
            instance_id: str
                String representing an instance (default: None)
            seed: int
                Random seed used by TA (default: None)
            additional_info: dict
                Additional run infos (could include further returned
                information from TA or fields such as start time and host_id)
            origin: DataOrigin
                Defines how data will be used.
        """

        config_id = self.config_ids.get(config)
        if config_id is None:
            self._n_id += 1
            self.config_ids[config] = self._n_id
            config_id = self.config_ids.get(config)
            self.ids_config[self._n_id] = config

        k = RunKey(config_id, instance_id, seed)
        v = RunValue(cost, time, status, additional_info)

        # Each runkey is supposed to be used only once. Repeated tries to add
        # the same runkey will be ignored silently if not capped.
        if self.overwrite_existing_runs or self.data.get(k) is None:
            self._add(k, v, status, origin)
        elif status != StatusType.CAPPED and self.data[
                k].status == StatusType.CAPPED:
            # overwrite capped runs with uncapped runs
            self._add(k, v, status, origin)
        elif status == StatusType.CAPPED and self.data[
                k].status == StatusType.CAPPED and cost > self.data[k].cost:
            # overwrite if censored with a larger cutoff
            self._add(k, v, status, origin)

    def _add(self, k: RunKey, v: RunValue, status: StatusType,
             origin: DataOrigin):
        """Actual function to add new entry to data structures

        TODO

        """
        self.data[k] = v
        self.external[k] = origin

        if origin in (DataOrigin.INTERNAL, DataOrigin.EXTERNAL_SAME_INSTANCES) \
                and status != StatusType.CAPPED:
            # also add to fast data structure
            is_k = InstSeedKey(k.instance_id, k.seed)
            self._configid_to_inst_seed[
                k.config_id] = self._configid_to_inst_seed.get(
                    k.config_id, [])
            if is_k not in self._configid_to_inst_seed[k.config_id]:
                self._configid_to_inst_seed[k.config_id].append(is_k)

            if not self.overwrite_existing_runs:
                # assumes an average across runs as cost function aggregation
                self.incremental_update_cost(self.ids_config[k.config_id],
                                             v.cost)
            else:
                self.update_cost(config=self.ids_config[k.config_id])

    def update_cost(self, config: Configuration):
        """Store the performance of a configuration across the instances in
        self.cost_perf_config and also updates self.runs_per_config;
        uses self.aggregate_func

        Parameters
        ----------
        config: Configuration
            configuration to update cost based on all runs in runhistory
        """
        inst_seeds = set(self.get_runs_for_config(config))
        perf = self.aggregate_func(config, self, inst_seeds)
        config_id = self.config_ids[config]
        self.cost_per_config[config_id] = perf
        self.runs_per_config[config_id] = len(inst_seeds)

    def compute_all_costs(self, instances: typing.List[str] = None):
        """Computes the cost of all configurations from scratch and overwrites
        self.cost_perf_config and self.runs_per_config accordingly;

        Parameters
        ----------
        instances: typing.List[str]
            list of instances; if given, cost is only computed wrt to this instance set
        """

        self.cost_per_config = {}
        self.runs_per_config = {}
        for config, config_id in self.config_ids.items():
            inst_seeds = set(self.get_runs_for_config(config))
            if instances is not None:
                inst_seeds = list(
                    filter(lambda x: x.instance in instances, inst_seeds))

            if inst_seeds:  # can be empty if never saw any runs on <instances>
                perf = self.aggregate_func(config, self, inst_seeds)
                self.cost_per_config[config_id] = perf
                self.runs_per_config[config_id] = len(inst_seeds)

    def incremental_update_cost(self, config: Configuration, cost: float):
        """Incrementally updates the performance of a configuration by using a
        moving average;

        Parameters
        ----------
        config: Configuration
            configuration to update cost based on all runs in runhistory
        cost: float
            cost of new run of config
        """

        config_id = self.config_ids[config]
        n_runs = self.runs_per_config.get(config_id, 0)
        old_cost = self.cost_per_config.get(config_id, 0.)
        self.cost_per_config[config_id] = (
            (old_cost * n_runs) + cost) / (n_runs + 1)
        self.runs_per_config[config_id] = n_runs + 1

    def get_cost(self, config: Configuration):
        """Returns empirical cost for a configuration; uses  self.cost_per_config

        Parameters
        ----------
        config: Configuration

        Returns
        -------
        cost: float
            Computed cost for configuration
        """
        config_id = self.config_ids[config]
        return self.cost_per_config.get(config_id, np.nan)

    def get_runs_for_config(self, config: Configuration):
        """Return all runs (instance seed pairs) for a configuration.

        Parameters
        ----------
        config : Configuration from ConfigSpace
            Parameter configuration

        Returns
        -------
        instance_seed_pairs : list<tuples of instance, seed>
        """
        config_id = self.config_ids.get(config)
        return self._configid_to_inst_seed.get(config_id, [])

    def get_instance_costs_for_config(self, config: Configuration):
        """
            Returns the average cost per instance (across seeds)
            for a configuration
            Parameters
            ----------
            config : Configuration from ConfigSpace
                Parameter configuration

            Returns
            -------
            cost_per_inst: dict<instance name<str>, cost<float>>
        """
        config_id = self.config_ids.get(config)
        runs_ = self._configid_to_inst_seed.get(config_id, [])
        cost_per_inst = {}
        for inst, seed in runs_:
            cost_per_inst[inst] = cost_per_inst.get(inst, [])
            rkey = RunKey(config_id, inst, seed)
            vkey = self.data[rkey]
            cost_per_inst[inst].append(vkey.cost)
        cost_per_inst = dict([(inst, np.mean(costs))
                              for inst, costs in cost_per_inst.items()])
        return cost_per_inst

    def get_all_configs(self):
        """Return all configurations in this RunHistory object

        Returns
        -------
            parameter configurations: list
        """
        return list(self.config_ids.keys())

    def empty(self):
        """Check whether or not the RunHistory is empty.

        Returns
        -------
        emptiness: bool
            True if runs have been added to the RunHistory,
            False otherwise
        """
        return len(self.data) == 0

    def save_json(self,
                  fn: str = "runhistory.json",
                  save_external: bool = False):
        """
        saves runhistory on disk

        Parameters
        ----------
        fn : str
            file name
        save_external : bool
            Whether to save external data in the runhistory file.
        """
        fn = fn.replace(":", "-")
        data = [([
            int(k.config_id),
            str(k.instance_id) if k.instance_id is not None else None,
            int(k.seed)
        ], list(v)) for k, v in self.data.items()
                if save_external or self.external[k] == DataOrigin.INTERNAL]
        config_ids_to_serialize = set([entry[0][0] for entry in data])
        configs = {
            id_: conf.get_dictionary()
            for id_, conf in self.ids_config.items()
            if id_ in config_ids_to_serialize
        }
        config_origins = {
            id_: conf.origin
            for id_, conf in self.ids_config.items()
            if (id_ in config_ids_to_serialize and conf.origin is not None)
        }

        with open(fn, "w") as fp:
            json.dump(
                {
                    "data": data,
                    "config_origins": config_origins,
                    "configs": configs
                },
                fp,
                cls=EnumEncoder,
                indent=2)

    def load_json(self, fn: str, cs: ConfigurationSpace):
        """Load and runhistory in json representation from disk.

        Overwrites current runhistory!

        Parameters
        ----------
        fn : str
            file name to load from
        cs : ConfigSpace
            instance of configuration space
        """
        try:
            with open(fn) as fp:
                all_data = json.load(fp, object_hook=StatusType.enum_hook)
        except Exception as e:
            self.logger.warning(
                'Encountered exception %s while reading runhistory from %s. '
                'Not adding any runs!',
                e,
                fn,
            )
            return

        config_origins = all_data.get("config_origins", {})

        self.ids_config = {
            int(id_): Configuration(cs,
                                    values=values,
                                    origin=config_origins.get(id_, None))
            for id_, values in all_data["configs"].items()
        }

        self.config_ids = {
            config: id_
            for id_, config in self.ids_config.items()
        }

        self._n_id = len(self.config_ids)

        # important to use add method to use all data structure correctly
        for k, v in all_data["data"]:
            self.add(config=self.ids_config[int(k[0])],
                     cost=float(v[0]),
                     time=float(v[1]),
                     status=StatusType(v[2]),
                     instance_id=k[1],
                     seed=int(k[2]),
                     additional_info=v[3])

    def update_from_json(
            self,
            fn: str,
            cs: ConfigurationSpace,
            origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES):
        """Update the current runhistory by adding new runs from a json file.

        Parameters
        ----------
        fn : str
            File name to load from.
        cs : ConfigSpace
            Instance of configuration space.
        origin : DataOrigin
            What to store as data origin.
        """
        new_runhistory = RunHistory(self.aggregate_func)
        new_runhistory.load_json(fn, cs)
        self.update(runhistory=new_runhistory, origin=origin)

    def update(self,
               runhistory: 'RunHistory',
               origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES):
        """Update the current runhistory by adding new runs from a RunHistory.

        Parameters
        ----------
        runhistory: RunHistory
            Runhistory with additional data to be added to self
        origin: DataOrigin
            If set to ``INTERNAL`` or ``EXTERNAL_FULL`` the data will be
            added to the internal data structure self._configid_to_inst_seed
            and be available :meth:`through get_runs_for_config`.
        """

        # Configurations might be already known, but by a different ID. This
        # does not matter here because the add() method handles this
        # correctly by assigning an ID to unknown configurations and re-using
        #  the ID
        for key, value in runhistory.data.items():
            config_id, instance_id, seed = key
            cost, time, status, additional_info = value
            config = runhistory.ids_config[config_id]
            self.add(config=config,
                     cost=cost,
                     time=time,
                     status=status,
                     instance_id=instance_id,
                     seed=seed,
                     additional_info=additional_info,
                     origin=origin)
Ejemplo n.º 2
0
class BaseRunner(ABC):
    """Interface class to handle the execution of SMAC' configurations.

    This interface defines how to interact with the SMBO loop.
    The complexity of running a configuration as well as handling the
    results is abstracted to the SMBO via a BaseRunner.

    From SMBO perspective, launching a configuration follows a
    submit/collect scheme as follows:
    1- A run is launched via submit_run()
    1.1- Submit_run internally calls run_wrapper(), a method that
         contains common processing functions among different runners,
         for example, handling capping and stats checking.
    1.2- A class that implements BaseRunner defines run() which is
         really the algorithm to translate a RunInfo to a RunValue, i.e.
         a configuration to an actual result.
    2- A completed run is collected via get_finished_runs(), which returns
       any finished runs, if any.
    3- This interface also offers the method wait() as a mechanism to make
       sure we have enough data in the next iteration to make a decision. For
       example, the intensifier might not be able to select the next challenger
       until more results are available.


    Attributes
    ----------

    results
    ta
    stats
    run_obj
    par_factor
    cost_for_crash
    abort_first_run_crash

    Parameters
    ---------
    ta : typing.Union[typing.List[str], typing.Callable]
        target algorithm
    stats: Stats
         stats object to collect statistics about runtime/additional info
    run_obj: str
        run objective of SMAC
    par_factor: int
        penalization factor
    cost_for_crash : float
        cost that is used in case of crashed runs (including runs
        that returned NaN or inf)
    abort_on_first_run_crash: bool
        if true and first run crashes, raise FirstRunCrashedException
    """
    def __init__(
        self,
        ta: typing.Union[typing.List[str], typing.Callable],
        stats: Stats,
        run_obj: str = "runtime",
        par_factor: int = 1,
        cost_for_crash: float = float(MAXINT),
        abort_on_first_run_crash: bool = True,
    ):

        # The results is a FIFO structure, implemented via a list
        # (because the Queue lock is not pickable). Finished runs are
        # put in this list and collected via process_finished_runs
        self.results = []  # type: typing.List[typing.Tuple[RunInfo, RunValue]]

        # Below state the support for a Runner algorithm that
        # implements a ta
        self.ta = ta
        self.stats = stats
        self.run_obj = run_obj
        self.par_factor = par_factor
        self.cost_for_crash = cost_for_crash
        self.abort_on_first_run_crash = abort_on_first_run_crash
        self.logger = PickableLoggerAdapter(self.__module__ + '.' +
                                            self.__class__.__name__)
        self._supports_memory_limit = False

        super().__init__()

    @abstractmethod
    def submit_run(self, run_info: RunInfo) -> None:
        """This function submits a configuration
        embedded in a RunInfo object, and uses one of the workers
        to produce a result (such result will eventually be available
        on the self.results FIFO).

        This interface method will be called by SMBO, with the expectation
        that a function will be executed by a worker.

        What will be executed is dictated by run_info, and "how" will it be
        executed is decided via the child class that implements a run() method.

        Because config submission can be a serial/parallel endeavor,
        it is expected to be implemented by a child class.

        Parameters
        ----------
        run_info: RunInfo
            An object containing the configuration and the necessary data to run it

        """
        pass

    @abstractmethod
    def run(
        self,
        config: Configuration,
        instance: str,
        cutoff: typing.Optional[float] = None,
        seed: int = 12345,
        budget: typing.Optional[float] = None,
        instance_specific: str = "0",
    ) -> typing.Tuple[StatusType, float, float, typing.Dict]:
        """Runs target algorithm <self.ta> with configuration <config> on
        instance <instance> with instance specifics <specifics> for at most
        <cutoff> seconds and random seed <seed>

        This method exemplifies how to defined the run() method

        Parameters
        ----------
            config : Configuration
                dictionary param -> value
            instance : string
                problem instance
            cutoff : float, optional
                Wallclock time limit of the target algorithm. If no value is
                provided no limit will be enforced.
            seed : int
                random seed
            budget : float, optional
                A positive, real-valued number representing an arbitrary limit to the target
                algorithm. Handled by the target algorithm internally
            instance_specific: str
                instance specific information (e.g., domain file or solution)

        Returns
        -------
            status: enum of StatusType (int)
                {SUCCESS, TIMEOUT, CRASHED, ABORT}
            cost: float
                cost/regret/quality (float) (None, if not returned by TA)
            runtime: float
                runtime (None if not returned by TA)
            additional_info: dict
                all further additional run information
        """
        pass

    def run_wrapper(
        self,
        run_info: RunInfo,
    ) -> typing.Tuple[RunInfo, RunValue]:
        """Wrapper around run() to exec and check the execution of a given config file

        This function encapsulates common handling/processing, so that run() implementation
        is simplified.

        Parameters
        ----------
            run_info : RunInfo
                Object that contains enough information to execute a configuration run in
                isolation.

        Returns
        -------
            RunInfo:
                an object containing the configuration launched
            RunValue:
                Contains information about the status/performance of config
        """
        start = time.time()

        if run_info.cutoff is None and self.run_obj == "runtime":
            if self.logger:
                self.logger.critical(
                    "For scenarios optimizing running time "
                    "(run objective), a cutoff time is required, "
                    "but not given to this call.")
            raise ValueError("For scenarios optimizing running time "
                             "(run objective), a cutoff time is required, "
                             "but not given to this call.")
        cutoff = None
        if run_info.cutoff is not None:
            cutoff = int(math.ceil(run_info.cutoff))

        try:
            status, cost, runtime, additional_info = self.run(
                config=run_info.config,
                instance=run_info.instance,
                cutoff=cutoff,
                seed=run_info.seed,
                budget=run_info.budget,
                instance_specific=run_info.instance_specific)
        except Exception as e:
            status = StatusType.CRASHED
            cost = self.cost_for_crash
            runtime = time.time() - start

            # Add context information to the error message
            exception_traceback = traceback.format_exc()
            error_message = repr(e)
            additional_info = {
                'traceback': exception_traceback,
                'error': error_message
            }

        end = time.time()

        if run_info.budget == 0 and status == StatusType.DONOTADVANCE:
            raise ValueError(
                "Cannot handle DONOTADVANCE state when using intensify or SH/HB on "
                "instances.")

        # Catch NaN or inf.
        if (self.run_obj == 'runtime' and not np.isfinite(runtime)
                or self.run_obj == 'quality' and not np.isfinite(cost)):
            if self.logger:
                self.logger.warning(
                    "Target Algorithm returned NaN or inf as {}. "
                    "Algorithm run is treated as CRASHED, cost "
                    "is set to {} for quality scenarios. "
                    "(Change value through \"cost_for_crash\""
                    "-option.)".format(self.run_obj, self.cost_for_crash))
            status = StatusType.CRASHED

        if self.run_obj == "runtime":
            # The following line pleases mypy - we already check for cutoff not being none above,
            # prior to calling run. However, mypy assumes that the data type of cutoff
            # is still Optional[int]
            assert cutoff is not None
            if runtime > self.par_factor * cutoff:
                self.logger.warning("Returned running time is larger "
                                    "than {0} times the passed cutoff time. "
                                    "Clamping to {0} x cutoff.".format(
                                        self.par_factor))
                runtime = cutoff * self.par_factor
                status = StatusType.TIMEOUT
            if status == StatusType.SUCCESS:
                cost = runtime
            else:
                cost = cutoff * self.par_factor
            if status == StatusType.TIMEOUT and run_info.capped:
                status = StatusType.CAPPED
        else:
            if status == StatusType.CRASHED:
                cost = self.cost_for_crash

        return run_info, RunValue(status=status,
                                  cost=cost,
                                  time=runtime,
                                  additional_info=additional_info,
                                  starttime=start,
                                  endtime=end)

    @abstractmethod
    def get_finished_runs(
            self) -> typing.List[typing.Tuple[RunInfo, RunValue]]:
        """This method returns any finished configuration, and returns a list with
        the results of exercising the configurations. This class keeps populating results
        to self.results until a call to get_finished runs is done. In this case, the
        self.results list is emptied and all RunValues produced by running run() are
        returned.

        Returns
        -------
            List[RunInfo, RunValue]: A list of pais RunInfo/RunValues
            a submitted configuration
        """
        raise NotImplementedError()

    @abstractmethod
    def wait(self) -> None:
        """SMBO/intensifier might need to wait for runs to finish before making a decision.
        This method waits until 1 run completes
        """
        pass

    @abstractmethod
    def pending_runs(self) -> bool:
        """
        Whether or not there are configs still running. Generally if the runner is serial,
        launching a run instantly returns it's result. On parallel runners, there might
        be pending configurations to complete.
        """
        pass

    @abstractmethod
    def num_workers(self) -> int:
        """
        Return the active number of workers that will execute tae runs.
        """
        pass
Ejemplo n.º 3
0
class RunHistory(object):
    """Container for target algorithm run information.

    Most importantly, the runhistory contains an efficient mapping from each evaluated configuration to the
    empirical cost observed on either the full instance set or a subset. The cost is the average over all
    observed costs for one configuration:

    * If using budgets for a single instance, only the cost on the highest observed budget is returned.
    * If using instances as the budget, the average cost over all evaluated instances is returned.
    * Theoretically, the runhistory object can handle instances and budgets at the same time. This is
      neither used nor tested.
    * Capped runs are not included in this cost.

    Note
    ----
    Guaranteed to be picklable.

    Attributes
    ----------
    data : collections.OrderedDict()
        TODO
    config_ids : dict
        Maps config -> id
    ids_config : dict
        Maps id -> config
    num_runs_per_config : dict
        Maps config_id -> number of runs

    Parameters
    ----------
    overwrite_existing_runs : bool (default=True)
        If set to ``True`` and a run of a configuration on an instance-budget-seed-pair already exists,
        it is overwritten.
    """
    def __init__(self, overwrite_existing_runs: bool = False) -> None:
        """Constructor

        Parameters
        ----------
        overwrite_existing_runs: bool
            allows to overwrites old results if pairs of
            algorithm-instance-seed were measured
            multiple times
        """
        self.logger = PickableLoggerAdapter(self.__module__ + "." +
                                            self.__class__.__name__)

        # By having the data in a deterministic order we can do useful tests
        # when we serialize the data and can assume it's still in the same
        # order as it was added.
        self.data = collections.OrderedDict(
        )  # type: typing.Dict[RunKey, RunValue]

        # for fast access, we have also an unordered data structure
        # to get all instance seed pairs of a configuration.
        # This does not include capped runs.
        self._configid_to_inst_seed_budget = {
        }  # type: typing.Dict[int, typing.Dict[InstSeedKey, typing.List[float]]]

        self.config_ids = {}  # type: typing.Dict[Configuration, int]
        self.ids_config = {}  # type: typing.Dict[int, Configuration]
        self._n_id = 0

        # Stores cost for each configuration ID
        self._cost_per_config = {}  # type: typing.Dict[int, float]
        # Stores min cost across all budgets for each configuration ID
        self._min_cost_per_config = {}  # type: typing.Dict[int, float]
        # runs_per_config maps the configuration ID to the number of runs for that configuration
        # and is necessary for computing the moving average
        self.num_runs_per_config = {}  # type: typing.Dict[int, int]

        # Store whether a datapoint is "external", which means it was read from
        # a JSON file. Can be chosen to not be written to disk
        self.external = {}  # type: typing.Dict[RunKey, DataOrigin]

        self.overwrite_existing_runs = overwrite_existing_runs

    def add(
        self,
        config: Configuration,
        cost: float,
        time: float,
        status: StatusType,
        instance_id: typing.Optional[str] = None,
        seed: typing.Optional[int] = None,
        budget: float = 0.0,
        starttime: float = 0.0,
        endtime: float = 0.0,
        additional_info: typing.Optional[typing.Dict] = None,
        origin: DataOrigin = DataOrigin.INTERNAL,
        force_update: bool = False,
    ) -> None:
        """Adds a data of a new target algorithm (TA) run;
        it will update data if the same key values are used
        (config, instance_id, seed)

        Parameters
        ----------
            config : dict (or other type -- depending on config space module)
                Parameter configuration
            cost: float
                Cost of TA run (will be minimized)
            time: float
                Runtime of TA run
            status: str
                Status in {SUCCESS, TIMEOUT, CRASHED, ABORT, MEMOUT}
            instance_id: str
                String representing an instance (default: None)
            seed: int
                Random seed used by TA (default: None)
            budget: float
                budget (cutoff) used in intensifier to limit TA (default: 0)
            starttime: float
                starting timestamp of TA evaluation
            endtime: float
                ending timestamp of TA evaluation
            additional_info: dict
                Additional run infos (could include further returned
                information from TA or fields such as start time and host_id)
            origin: DataOrigin
                Defines how data will be used.
            force_update: bool (default: False)
                Forces the addition of a config to the history
        """

        if config is None:
            raise TypeError(
                'Configuration to add to the runhistory must not be None')
        elif not isinstance(config, Configuration):
            raise TypeError(
                'Configuration to add to the runhistory is not of type Configuration, but %s'
                % type(config))

        # Get the config id
        config_id_tmp = self.config_ids.get(config)
        if config_id_tmp is None:
            self._n_id += 1
            self.config_ids[config] = self._n_id
            config_id = typing.cast(int, self.config_ids.get(config))
            self.ids_config[self._n_id] = config
        else:
            config_id = typing.cast(int, config_id_tmp)

        # Construct keys and values for the data dictionary
        k = RunKey(config_id, instance_id, seed, budget)
        v = RunValue(cost, time, status, starttime, endtime, additional_info)

        # Each runkey is supposed to be used only once. Repeated tries to add
        # the same runkey will be ignored silently if not capped.
        if self.overwrite_existing_runs or force_update or self.data.get(
                k) is None:
            self._add(k, v, status, origin)
        elif status != StatusType.CAPPED and self.data[
                k].status == StatusType.CAPPED:
            # overwrite capped runs with uncapped runs
            self._add(k, v, status, origin)
        elif status == StatusType.CAPPED and self.data[
                k].status == StatusType.CAPPED and cost > self.data[k].cost:
            # overwrite if censored with a larger cutoff
            self._add(k, v, status, origin)

    def _add(self, k: RunKey, v: RunValue, status: StatusType,
             origin: DataOrigin) -> None:
        """Actual function to add new entry to data structures

        TODO

        """
        self.data[k] = v
        self.external[k] = origin

        # Capped data is added above
        # Do not register the cost until the run has completed
        if origin in (DataOrigin.INTERNAL, DataOrigin.EXTERNAL_SAME_INSTANCES) \
                and status not in [StatusType.CAPPED, StatusType.RUNNING]:
            # also add to fast data structure
            is_k = InstSeedKey(k.instance_id, k.seed)
            self._configid_to_inst_seed_budget[
                k.config_id] = self._configid_to_inst_seed_budget.get(
                    k.config_id, {})
            if is_k not in self._configid_to_inst_seed_budget[
                    k.config_id].keys():
                # add new inst-seed-key with budget to main dict
                self._configid_to_inst_seed_budget[k.config_id][is_k] = [
                    k.budget
                ]
            elif k.budget not in is_k:
                # append new budget to existing inst-seed-key dict
                self._configid_to_inst_seed_budget[k.config_id][is_k].append(
                    k.budget)

            # if budget is used, then update cost instead of incremental updates
            if not self.overwrite_existing_runs and k.budget == 0:
                # assumes an average across runs as cost function aggregation, this is used for algorithm configuration
                # (incremental updates are used to save time as getting the cost for > 100 instances is high)
                self.incremental_update_cost(self.ids_config[k.config_id],
                                             v.cost)
            else:
                # this is when budget > 0 (only successive halving and hyperband so far)
                self.update_cost(config=self.ids_config[k.config_id])
                if k.budget > 0:
                    if self.num_runs_per_config[
                            k.
                            config_id] != 1:  # This is updated in update_cost
                        raise ValueError('This should not happen!')

    def update_cost(self, config: Configuration) -> None:
        """Store the performance of a configuration across the instances in
        self.cost_per_config and also updates self.runs_per_config;

        Note
        ----
        This method ignores capped runs.

        Parameters
        ----------
        config: Configuration
            configuration to update cost based on all runs in runhistory
        """
        config_id = self.config_ids[config]
        # removing duplicates while keeping the order
        inst_seed_budgets = list(
            dict.fromkeys(
                self.get_runs_for_config(config,
                                         only_max_observed_budget=True)))
        self._cost_per_config[config_id] = self.average_cost(
            config, inst_seed_budgets)
        self.num_runs_per_config[config_id] = len(inst_seed_budgets)

        all_inst_seed_budgets = list(
            dict.fromkeys(
                self.get_runs_for_config(config,
                                         only_max_observed_budget=False)))
        self._min_cost_per_config[config_id] = self.min_cost(
            config, all_inst_seed_budgets)

    def incremental_update_cost(self, config: Configuration,
                                cost: float) -> None:
        """Incrementally updates the performance of a configuration by using a
        moving average;

        Parameters
        ----------
        config: Configuration
            configuration to update cost based on all runs in runhistory
        cost: float
            cost of new run of config
        """

        config_id = self.config_ids[config]
        n_runs = self.num_runs_per_config.get(config_id, 0)
        old_cost = self._cost_per_config.get(config_id, 0.)
        self._cost_per_config[config_id] = (
            (old_cost * n_runs) + cost) / (n_runs + 1)
        self.num_runs_per_config[config_id] = n_runs + 1

    def get_cost(self, config: Configuration) -> float:
        """Returns empirical cost for a configuration.

        See the class docstring for how the costs are computed. The costs are not re-computed, but are read from cache.

        Parameters
        ----------
        config: Configuration

        Returns
        -------
        cost: float
            Computed cost for configuration
        """
        config_id = self.config_ids.get(config)
        return self._cost_per_config.get(
            config_id, np.nan)  # type: ignore[arg-type] # noqa F821

    def get_runs_for_config(
            self, config: Configuration,
            only_max_observed_budget: bool) -> typing.List[InstSeedBudgetKey]:
        """Return all runs (instance seed pairs) for a configuration.

        Note
        ----
        This method ignores capped runs.

        Parameters
        ----------
        config : Configuration from ConfigSpace
            Parameter configuration
        only_max_observed_budget : bool
            Select only the maximally observed budget run for this configuration
        Returns
        -------
        instance_seed_budget_pairs : list<tuples of instance, seed, budget>
        """
        config_id = self.config_ids.get(config)
        runs = self._configid_to_inst_seed_budget.get(
            config_id, {}).copy()  # type: ignore[arg-type] # noqa F821

        # select only the max budget run if specified
        if only_max_observed_budget:
            for k, v in runs.items():
                runs[k] = [max(v)]

        # convert to inst-seed-budget key
        rval = [
            InstSeedBudgetKey(k.instance, k.seed, budget)
            for k, v in runs.items() for budget in v
        ]
        return rval

    def get_all_configs(self) -> typing.List[Configuration]:
        """Return all configurations in this RunHistory object

        Returns
        -------
            parameter configurations: list
        """
        return list(self.config_ids.keys())

    def get_all_configs_per_budget(
        self,
        budget_subset: typing.Optional[typing.List] = None,
    ) -> typing.List[Configuration]:
        """
        Return all configs in this RunHistory object that have been run on one of these budgets

        Parameter
        ---------
            budget_subset: list

        Returns
        -------
            parameter configurations: list
        """
        if budget_subset is None:
            return self.get_all_configs()
        configs = []
        for c, i, s, b in self.data.keys():
            if b in budget_subset:
                configs.append(self.ids_config[c])
        return configs

    def get_min_cost(self, config: Configuration) -> float:
        """Returns the lowest empirical cost for a configuration, across all runs (budgets)

        See the class docstring for how the costs are computed. The costs are not re-computed, but are read from cache.

        Parameters
        ----------
        config: Configuration

        Returns
        -------
        min_cost: float
            Computed cost for configuration
        """
        config_id = self.config_ids.get(config)
        return self._min_cost_per_config.get(
            config_id, np.nan)  # type: ignore[arg-type] # noqa F821

    def empty(self) -> bool:
        """Check whether or not the RunHistory is empty.

        Returns
        -------
        emptiness: bool
            True if runs have been added to the RunHistory,
            False otherwise
        """
        return len(self.data) == 0

    def save_json(self,
                  fn: str = "runhistory.json",
                  save_external: bool = False) -> None:
        """
        saves runhistory on disk

        Parameters
        ----------
        fn : str
            file name
        save_external : bool
            Whether to save external data in the runhistory file.
        """
        data = [([
            int(k.config_id),
            str(k.instance_id) if k.instance_id is not None else None,
            int(k.seed),
            float(k.budget) if k[3] is not None else 0
        ], list(v)) for k, v in self.data.items()
                if save_external or self.external[k] == DataOrigin.INTERNAL]
        config_ids_to_serialize = set([entry[0][0] for entry in data])
        configs = {
            id_: conf.get_dictionary()
            for id_, conf in self.ids_config.items()
            if id_ in config_ids_to_serialize
        }
        config_origins = {
            id_: conf.origin
            for id_, conf in self.ids_config.items()
            if (id_ in config_ids_to_serialize and conf.origin is not None)
        }

        with open(fn, "w") as fp:
            json.dump(
                {
                    "data": data,
                    "config_origins": config_origins,
                    "configs": configs
                },
                fp,
                cls=EnumEncoder,
                indent=2)

    def load_json(self, fn: str, cs: ConfigurationSpace) -> None:
        """Load and runhistory in json representation from disk.

        Overwrites current runhistory!

        Parameters
        ----------
        fn : str
            file name to load from
        cs : ConfigSpace
            instance of configuration space
        """
        try:
            with open(fn) as fp:
                all_data = json.load(fp, object_hook=StatusType.enum_hook)
        except Exception as e:
            self.logger.warning(
                'Encountered exception %s while reading runhistory from %s. '
                'Not adding any runs!',
                e,
                fn,
            )
            return

        config_origins = all_data.get("config_origins", {})

        self.ids_config = {
            int(id_): Configuration(cs,
                                    values=values,
                                    origin=config_origins.get(id_, None))
            for id_, values in all_data["configs"].items()
        }

        self.config_ids = {
            config: id_
            for id_, config in self.ids_config.items()
        }

        self._n_id = len(self.config_ids)

        # important to use add method to use all data structure correctly
        for k, v in all_data["data"]:
            self.add(config=self.ids_config[int(k[0])],
                     cost=float(v[0]),
                     time=float(v[1]),
                     status=StatusType(v[2]),
                     instance_id=k[1],
                     seed=int(k[2]),
                     budget=float(k[3]) if len(k) == 4 else 0,
                     starttime=v[3],
                     endtime=v[4],
                     additional_info=v[5])

    def update_from_json(
        self,
        fn: str,
        cs: ConfigurationSpace,
        origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES,
    ) -> None:
        """Update the current runhistory by adding new runs from a json file.

        Parameters
        ----------
        fn : str
            File name to load from.
        cs : ConfigSpace
            Instance of configuration space.
        origin : DataOrigin
            What to store as data origin.
        """
        new_runhistory = RunHistory()
        new_runhistory.load_json(fn, cs)
        self.update(runhistory=new_runhistory, origin=origin)

    def update(
        self,
        runhistory: 'RunHistory',
        origin: DataOrigin = DataOrigin.EXTERNAL_SAME_INSTANCES,
    ) -> None:
        """Update the current runhistory by adding new runs from a RunHistory.

        Parameters
        ----------
        runhistory: RunHistory
            Runhistory with additional data to be added to self
        origin: DataOrigin
            If set to ``INTERNAL`` or ``EXTERNAL_FULL`` the data will be
            added to the internal data structure self._configid_to_inst_seed_budget
            and be available :meth:`through get_runs_for_config`.
        """

        # Configurations might be already known, but by a different ID. This
        # does not matter here because the add() method handles this
        # correctly by assigning an ID to unknown configurations and re-using
        #  the ID
        for key, value in runhistory.data.items():
            config_id, instance_id, seed, budget = key
            cost, time, status, start, end, additional_info = value
            config = runhistory.ids_config[config_id]
            self.add(config=config,
                     cost=cost,
                     time=time,
                     status=status,
                     instance_id=instance_id,
                     starttime=start,
                     endtime=end,
                     seed=seed,
                     budget=budget,
                     additional_info=additional_info,
                     origin=origin)

    def _cost(
        self,
        config: Configuration,
        instance_seed_budget_keys: typing.Optional[
            typing.Iterable[InstSeedBudgetKey]] = None,
    ) -> typing.List[float]:
        """Return array of all costs for the given config for further calculations.

        Parameters
        ----------
        config : Configuration
            Configuration to calculate objective for
        instance_seed_budget_keys : list, optional (default=None)
            List of tuples of instance-seeds-budget keys. If None, the run_history is
            queried for all runs of the given configuration.

        Returns
        -------
        Costs: list
            Array of all costs
        """
        try:
            id_ = self.config_ids[config]
        except KeyError:  # challenger was not running so far
            return []

        if instance_seed_budget_keys is None:
            instance_seed_budget_keys = self.get_runs_for_config(
                config, only_max_observed_budget=True)

        costs = []
        for i, r, b in instance_seed_budget_keys:
            k = RunKey(id_, i, r, b)
            costs.append(self.data[k].cost)
        return costs

    def average_cost(
        self,
        config: Configuration,
        instance_seed_budget_keys: typing.Optional[
            typing.Iterable[InstSeedBudgetKey]] = None,
    ) -> float:
        """Return the average cost of a configuration.

        This is the mean of costs of all instance-seed pairs.

        Parameters
        ----------
        config : Configuration
            Configuration to calculate objective for
        instance_seed_budget_keys : list, optional (default=None)
            List of tuples of instance-seeds-budget keys. If None, the run_history is
            queried for all runs of the given configuration.

        Returns
        ----------
        Cost: float
            Average cost
        """
        costs = self._cost(config, instance_seed_budget_keys)
        if costs:
            return float(np.mean(costs))

        return np.nan

    def sum_cost(
        self,
        config: Configuration,
        instance_seed_budget_keys: typing.Optional[
            typing.Iterable[InstSeedBudgetKey]] = None,
    ) -> float:
        """Return the sum of costs of a configuration.

        This is the sum of costs of all instance-seed pairs.

        Parameters
        ----------
        config : Configuration
            Configuration to calculate objective for
        instance_seed_budget_keys : list, optional (default=None)
            List of tuples of instance-seeds-budget keys. If None, the run_history is
            queried for all runs of the given configuration.

        Returns
        ----------
        sum_cost: float
            Sum of costs of config
        """
        return float(np.sum(self._cost(config, instance_seed_budget_keys)))

    def min_cost(
        self,
        config: Configuration,
        instance_seed_budget_keys: typing.Optional[
            typing.Iterable[InstSeedBudgetKey]] = None,
    ) -> float:
        """Return the minimum cost of a configuration

        This is the minimum cost of all instance-seed pairs.

        Parameters
        ----------
        config : Configuration
            Configuration to calculate objective for
        instance_seed_budget_keys : list, optional (default=None)
            List of tuples of instance-seeds-budget keys. If None, the run_history is
            queried for all runs of the given configuration.

        Returns
        ----------
        min_cost: float
            minimum cost of config
        """
        costs = self._cost(config, instance_seed_budget_keys)
        if costs:
            return float(np.min(costs))

        return np.nan

    def compute_all_costs(self,
                          instances: typing.Optional[typing.List[str]] = None
                          ) -> None:
        """Computes the cost of all configurations from scratch and overwrites
        self.cost_perf_config and self.runs_per_config accordingly;

        Note
        ----
        This method is only used for ``merge_foreign_data`` and should be removed.

        Parameters
        ----------
        instances: typing.List[str]
            list of instances; if given, cost is only computed wrt to this instance set
        """
        self._cost_per_config = {}
        self.num_runs_per_config = {}
        for config, config_id in self.config_ids.items():
            # removing duplicates while keeping the order
            inst_seed_budgets = list(
                dict.fromkeys(
                    self.get_runs_for_config(config,
                                             only_max_observed_budget=True)))
            if instances is not None:
                inst_seed_budgets = list(
                    filter(
                        lambda x: x.instance in typing.cast(
                            typing.List, instances), inst_seed_budgets))

            if inst_seed_budgets:  # can be empty if never saw any runs on <instances>
                self._cost_per_config[config_id] = self.average_cost(
                    config, inst_seed_budgets)
                self._min_cost_per_config[config_id] = self.min_cost(
                    config, inst_seed_budgets)
                self.num_runs_per_config[config_id] = len(inst_seed_budgets)

    def get_instance_costs_for_config(
            self,
            config: Configuration) -> typing.Dict[str, typing.List[float]]:
        """ Returns the average cost per instance (across seeds) for a configuration

        If the runhistory contains budgets, only the highest budget for a configuration is returned.

        Note
        ----
        This is used by the pSMAC facade to determine the incumbent after the evaluation.

        Parameters
        ----------
        config : Configuration from ConfigSpace
            Parameter configuration

        Returns
        -------
        cost_per_inst: dict<instance name<str>, cost<float>>
        """
        runs_ = self.get_runs_for_config(config, only_max_observed_budget=True)
        cost_per_inst = {}  # type: typing.Dict[str, typing.List[float]]
        for inst, seed, budget in runs_:
            cost_per_inst[inst] = cost_per_inst.get(inst, [])
            rkey = RunKey(self.config_ids[config], inst, seed, budget)
            vkey = self.data[rkey]
            cost_per_inst[inst].append(vkey.cost)
        cost_per_inst = dict([(inst, np.mean(costs))
                              for inst, costs in cost_per_inst.items()])
        return cost_per_inst