def _cost(config, run_history, instance_seed_pairs=None):
    """Return list of all costs for the given config for further calculations.

    Parameters
    ----------
    config : Configuration
        configuration to calculate objective for
    run_history : RunHistory
        RunHistory object from which the objective value is computed.
    instance_seed_pairs : list, optional (default=None)
        list of instance-seed pairs (tuples). If None, the run_history is
        queried for all runs of the given configuration.

    Returns
    -------
    list
    """
    try:
        id_ = run_history.config_ids[config]
    except KeyError:  # challenger has not been run so far
        return []

    if instance_seed_pairs is None:
        instance_seed_pairs = run_history.get_runs_for_config(config)

    costs = []
    for i, r in instance_seed_pairs:
        k = RunKey(id_, i, r)
        costs.append(run_history.data[k].cost)
    return costs
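# Illustrative usage sketch (not part of the original code): given a populated
# RunHistory `run_history` and an already-evaluated Configuration `config`,
# the per-run costs returned by _cost can be averaged to obtain an aggregated
# cost for the configuration. The helper name is hypothetical.
def _example_mean_cost(config, run_history):
    costs = _cost(config, run_history)
    # An empty list means the configuration has no runs in the runhistory.
    return float(np.mean(costs)) if costs else float('nan')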
def get_instance_costs_for_config(runhis: RunHistory, config: Configuration):
    """Return the average cost per instance (across seeds) for a configuration.

    Parameters
    ----------
    runhis : RunHistory
        runhistory from which the costs are taken
    config : Configuration from ConfigSpace
        parameter configuration

    Returns
    -------
    cost_per_inst : dict<instance name<str>, cost<float>>
    """
    config_id = runhis.config_ids.get(config)
    runs_ = runhis._configid_to_inst_seed.get(config_id, [])
    cost_per_inst = {}
    for inst, seed in runs_:
        cost_per_inst[inst] = cost_per_inst.get(inst, [])
        rkey = RunKey(config_id, inst, seed)
        vkey = runhis.data[rkey]
        cost_per_inst[inst].append(vkey.cost)
    cost_per_inst = dict([(inst, np.mean(costs))
                          for inst, costs in cost_per_inst.items()])
    return cost_per_inst
def get_cost_dict_for_config(rh: RunHistory,
                             conf: Configuration,
                             par: int = 1,
                             cutoff: typing.Union[float, None] = None):
    """Aggregate the cost of a configuration on the evaluated instances over seeds.

    Parameters
    ----------
    rh : RunHistory
        runhistory with data
    conf : Configuration
        configuration to evaluate
    par : int
        par-factor with which to multiply timeouts
    cutoff : float
        cutoff of the scenario - used to penalize costs if par != 1

    Returns
    -------
    cost : dict(instance->cost)
        cost per instance (aggregated over seeds)
    """
    # Check if config is in runhistory
    conf_id = rh.config_ids[conf]

    # Map instances to seeds in dict
    runs = rh.get_runs_for_config(conf)
    instance_to_seeds = dict()
    for run in runs:
        inst, seed = run
        if inst in instance_to_seeds:
            instance_to_seeds[inst].append(seed)
        else:
            instance_to_seeds[inst] = [seed]

    # Get cost per instance
    instance_costs = {i: [rh.data[RunKey(conf_id, i, s)].cost for s in
                          instance_to_seeds[i]] for i in instance_to_seeds}

    # Aggregate over seeds
    instance_costs = {i: np.mean(instance_costs[i]) for i in instance_costs}
    # TODO: uncomment next line and delete all above after next SMAC dev->master
    # instance_costs = rh.get_instance_costs_for_config(conf)

    if par != 1:
        if cutoff:
            instance_costs = {k: v if v < cutoff else v * par
                              for k, v in instance_costs.items()}
        else:
            raise ValueError("To apply penalization of costs, a cutoff needs to be provided.")

    return instance_costs
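# Illustrative usage sketch (not from the original code): computing PAR10
# costs per instance for a configuration `conf` from a runhistory `rh`,
# assuming a scenario cutoff of 300 seconds. Runs at or above the cutoff are
# multiplied by the par-factor of 10. The helper name is hypothetical.
def _example_par10_costs(rh, conf, cutoff=300.0):
    return get_cost_dict_for_config(rh, conf, par=10, cutoff=cutoff)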
def ensemble_run_history(request):
    run_history = RunHistory()
    run_history._add(
        RunKey(config_id=3, instance_id='{"task_id": "breast_cancer"}',
               seed=1, budget=3.0),
        RunValue(cost=0.11347517730496459, time=0.21858787536621094,
                 status=None, starttime=time.time(), endtime=time.time(),
                 additional_info={
                     'duration': 0.20323538780212402,
                     'num_run': 3,
                     'configuration_origin': 'Random Search'
                 }),
        status=None,
        origin=None,
    )
    run_history._add(
        RunKey(config_id=6, instance_id='{"task_id": "breast_cancer"}',
               seed=1, budget=6.0),
        RunValue(cost=2 * 0.11347517730496459, time=2 * 0.21858787536621094,
                 status=None, starttime=time.time(), endtime=time.time(),
                 additional_info={
                     'duration': 0.20323538780212402,
                     'num_run': 6,
                     'configuration_origin': 'Random Search'
                 }),
        status=None,
        origin=None,
    )
    return run_history
def get_cost_dict_for_config(rh, conf, aggregate=np.mean):
    """Aggregate the loss of a configuration on the evaluated instances over seeds.

    Parameters
    ----------
    rh : RunHistory
        runhistory with data
    conf : Configuration
        configuration to evaluate
    aggregate : function or None
        used to aggregate loss over different seeds; the function must take a
        list as argument. If None, no aggregation happens (individual values
        per seed are returned, but not the seeds themselves).

    Returns
    -------
    loss : dict(instance->loss)
        loss per instance (aggregated or as list per seed)
    """
    # Check if config is in runhistory
    conf_id = rh.config_ids[conf]

    # Map instances to seeds in dict
    runs = rh.get_runs_for_config(conf)
    instance_to_seeds = dict()
    for run in runs:
        inst, seed = run
        if inst in instance_to_seeds:
            instance_to_seeds[inst].append(seed)
        else:
            instance_to_seeds[inst] = [seed]

    # Get loss per instance
    instance_losses = {i: [rh.data[RunKey(conf_id, i, s)].cost for s in
                           instance_to_seeds[i]] for i in instance_to_seeds}

    # Aggregate over seeds
    if aggregate:
        instance_losses = {i: aggregate(instance_losses[i]) for i in instance_losses}

    return instance_losses
def get_instance_costs_for_config(runhis: RunHistory, config: Configuration):
    """Return the average cost per instance for a configuration.

    :param runhis: SMAC run history
    :param config: parameter configuration
    :return: mapping from instance name to cost
    """
    config_id = runhis.config_ids.get(config)
    runs_ = runhis._configid_to_inst_seed.get(config_id, [])
    cost_per_inst = {}
    for inst, seed in runs_:
        cost_per_inst[inst] = cost_per_inst.get(inst, [])
        rkey = RunKey(config_id, inst, seed)
        vkey = runhis.data[rkey]
        cost_per_inst[inst].append(vkey.cost)
    cost_per_inst = dict([(inst, np.mean(costs))
                          for inst, costs in cost_per_inst.items()])
    return cost_per_inst
def make_dict_run_history_data(data):
    run_history_data = dict()
    for row in data:
        run_key = RunKey(
            config_id=row[0][0],
            instance_id=row[0][1],
            seed=row[0][2],
            budget=row[0][3])
        run_value = RunValue(
            cost=row[1][0],
            time=row[1][1],
            status=getattr(StatusType, row[1][2]['__enum__'].split(".")[-1]),
            starttime=row[1][3],
            endtime=row[1][4],
            additional_info=row[1][5],
        )
        run_history_data[run_key] = run_value
    return run_history_data
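# Illustrative example (assumed row layout, mirroring the indices used above):
# each row is a pair of [config_id, instance_id, seed, budget] and
# [cost, time, status, starttime, endtime, additional_info], with the status
# serialized as {"__enum__": "StatusType.<NAME>"}. The helper name is
# hypothetical.
def _example_make_run_history_data():
    row = [
        [1, "instance-1", 42, 0.0],
        [0.25, 1.3, {"__enum__": "StatusType.SUCCESS"}, 10.0, 11.3, {}],
    ]
    return make_dict_run_history_data([row])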
def get_timeout(rh, conf, cutoff):
    """Check for timeouts. If multiple runs for an inst/config-pair are
    available, use the median (not the mean: no fractional timeouts).

    Parameters
    ----------
    rh : RunHistory
        runhistory to take runs from
    conf : Configuration
        config to use
    cutoff : int
        cutoff used to determine timeouts

    Returns
    -------
    timeouts : Dict(str: bool)
        mapping instances to True/False, where True indicates that the run
        finished within the cutoff (i.e. did not time out)
    """
    # TODO Possibly inconsistent: median over timeouts is timeout, but mean over
    # costs is not. Possible?
    if not cutoff:
        return {}
    # Check if config is in runhistory
    conf_id = rh.config_ids[conf]

    timeouts = {}
    runs = rh.get_runs_for_config(conf, only_max_observed_budget=True)
    for run in runs:
        # Aggregating over seeds, run = (inst, seed, budget)
        inst, seed, _budget = run
        status = rh.data[RunKey(conf_id, inst, seed)].time < cutoff
        if inst in timeouts:
            timeouts[inst].append(status)
        else:
            timeouts[inst] = [status]
    # Use median
    timeouts = {i: np.floor(np.median(timeouts[i])) for i in timeouts.keys()}
    return timeouts
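# Illustrative usage sketch (not from the original code): listing the
# instances on which a configuration `conf` hit the cutoff. Note that in the
# dict returned by get_timeout above, a truthy value means the run finished
# within the cutoff. The helper name is hypothetical.
def _example_timed_out_instances(rh, conf, cutoff):
    timeouts = get_timeout(rh, conf, cutoff)
    return [inst for inst, finished in timeouts.items() if not finished]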
def _get_mean_var_time(self, validator, traj, use_epm, rh):
    """
    Parameters
    ----------
    validator: Validator
        validator (smac-based)
    traj: List[Configuration]
        trajectory to set in validator
    use_epm: bool
        whether to estimate costs with an EPM (no need to use an EPM if the
        trajectory is already validated)
    rh: RunHistory
        runhistory to take costs from (and to train an EPM on if none is passed)

    Returns
    -------
    mean: np.ndarray
        mean cost per trajectory entry
    var: np.ndarray
        variance per trajectory entry
    time: List[float]
        times to plot (x-values)
    configs: List[Configuration]
        incumbent configurations along the trajectory
    """
    if validator:
        validator.traj = traj  # set trajectory
    time, configs = [], []

    if use_epm and not self.block_epm:
        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])
            # self.logger.debug('Time: %d Runs: %d', time[-1],
            #                   len(rh.get_runs_for_config(configs[-1])))

        self.logger.debug("Using %d samples (%d distinct) from trajectory.",
                          len(time), len(set(configs)))

        # Initialize EPM
        if validator.epm:  # not log as validator epm is trained on cost, not log cost
            epm = validator.epm
        else:
            self.logger.debug("No EPM passed! Training new one from runhistory.")
            # Train random forest and transform training data (from given rh)
            # Not using validator because we want to plot uncertainties
            rh2epm = RunHistory2EPM4Cost(
                num_params=len(self.scenario.cs.get_hyperparameters()),
                scenario=self.scenario)
            X, y = rh2epm.transform(rh)
            self.logger.debug("Training model with data of shape X: %s, y: %s",
                              str(X.shape), str(y.shape))

            types, bounds = get_types(self.scenario.cs,
                                      self.scenario.feature_array)
            epm = RandomForestWithInstances(
                self.scenario.cs,
                types=types,
                bounds=bounds,
                seed=self.rng.randint(MAXINT),
                instance_features=self.scenario.feature_array,
                ratio_features=1.0)
            epm.train(X, y)

        config_array = convert_configurations_to_array(configs)
        mean, var = epm.predict_marginalized_over_instances(config_array)
        # We don't want to show the uncertainty of the model but the
        # uncertainty over multiple optimizer runs;
        # this variance is computed in an outer loop.
        var = np.zeros(mean.shape)
    else:
        mean, var = [], []
        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])
            self.logger.debug(rh.get_runs_for_config(configs[-1],
                                                     only_max_observed_budget=True))
            costs = [rh.data[RunKey(rh.config_ids[configs[-1]], i, s, b)].cost
                     for i, s, b in rh.get_runs_for_config(
                         configs[-1], only_max_observed_budget=True)]
            if not costs:
                time.pop()
            else:
                mean.append(np.mean(costs))
                var.append(0)  # No variance over instances
        mean, var = np.array(mean).reshape(-1, 1), np.array(var).reshape(-1, 1)

    return mean, var, time, configs
def _get_runs(self,
              configs: Union[str, typing.List[Configuration]],
              insts: Union[str, typing.List[str]],
              repetitions: int = 1,
              runhistory: RunHistory = None,
              ) -> typing.Tuple[typing.List[_Run], RunHistory]:
    """Generate a list of SMAC-TAE runs to be executed. This means
    combinations of configs with all instances on a certain number of seeds.

    Side effect: adds runs that don't need to be reevaluated to self.rh!

    Parameters
    ----------
    configs: str or list<Configuration>
        either directly a list of Configuration, or a str from
        [def, inc, def+inc, wallclock_time, cpu_time, all];
        time evaluates at cpu- or wallclock-timesteps of:
        [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
        with max_time being the highest recorded time
    insts: str or list<str>
        what instances to use for validation, either from
        [train, test, train+test] or directly a list of instances
    repetitions: int
        number of seeds per instance/config-pair to be evaluated
    runhistory: RunHistory
        optional, try to reuse this runhistory and save some runs

    Returns
    -------
    runs: list<_Run>
        list with _Runs
        [_Run(config=CONFIG1, inst=INSTANCE1, seed=SEED1, inst_specs=INST_SPECIFICS1),
         _Run(config=CONFIG2, inst=INSTANCE2, seed=SEED2, inst_specs=INST_SPECIFICS2),
         ...]
    """
    # Get relevant configurations and instances
    if isinstance(configs, str):
        configs = self._get_configs(configs)
    if isinstance(insts, str):
        instances = self._get_instances(insts)  # type: typing.Sequence[typing.Union[str, None]]
    elif insts is not None:
        instances = insts
    else:
        instances = [None]
    # If no instances are given, fix the instances to one "None" instance
    if not instances:
        instances = [None]

    # If algorithm is deterministic, fix repetitions to 1
    if self.scen.deterministic and repetitions != 1:  # type: ignore[attr-defined] # noqa F821
        self.logger.warning("Specified %d repetitions, but fixing to 1, "
                            "because algorithm is deterministic.", repetitions)
        repetitions = 1

    # Extract relevant information from given runhistory
    inst_seed_config = self._process_runhistory(configs, instances, runhistory)

    # Now create the actual run-list
    runs = []
    # Counter for runs without the need of recalculation
    runs_from_rh = 0
    # If we reuse runs, we want to return them as well
    new_rh = RunHistory()

    for i in sorted(instances):
        for rep in range(repetitions):
            # First, find a seed and add all the data we can take from the
            # given runhistory to "our" validation runhistory.
            configs_evaluated = []  # type: typing.List[Configuration]
            if runhistory and i in inst_seed_config:
                # Choose seed based on most often evaluated inst-seed-pair
                seed, configs_evaluated = inst_seed_config[i].pop(0)
                # Delete inst if all seeds are used
                if not inst_seed_config[i]:
                    inst_seed_config.pop(i)
                # Add runs to runhistory
                for c in configs_evaluated[:]:
                    runkey = RunKey(runhistory.config_ids[c], i, seed)
                    cost, time, status, start, end, additional_info = runhistory.data[runkey]
                    if status in [StatusType.CRASHED, StatusType.ABORT, StatusType.CAPPED]:
                        # Not properly executed target algorithm runs should be repeated
                        configs_evaluated.remove(c)
                        continue
                    new_rh.add(c, cost, time, status, instance_id=i,
                               seed=seed, starttime=start, endtime=end,
                               additional_info=additional_info)
                    runs_from_rh += 1
            else:
                # If no runhistory or no entries for instance, get new seed
                seed = self.rng.randint(MAXINT)

            # We now have a seed and add all configs that are not already
            # evaluated on that seed to the runs-list. This way, we
            # guarantee the same inst-seed-pairs for all configs.
            for config in [c for c in configs if c not in configs_evaluated]:
                # Only use specifics if specific exists, else use string "0"
                specs = self.scen.instance_specific[i] if i and i in self.scen.instance_specific else "0"
                runs.append(_Run(config=config, inst=i, seed=seed, inst_specs=specs))

    self.logger.info("Collected %d runs from %d configurations on %d "
                     "instances with %d repetitions. Reusing %d runs from "
                     "given runhistory.", len(runs), len(configs),
                     len(instances), repetitions, runs_from_rh)

    return runs, new_rh
def test_load(self):
    configuration_space = test_helpers.get_branin_config_space()

    other_runhistory = '{"data": [[[2, "branini", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[1, "branin", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[3, "branin-hoo", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[2, null, 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[1, "branini", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[4, null, 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]]], ' \
                       '"configs": {' \
                       '"4": {"x": -2.2060968293349363, "y": 5.183410905645716}, ' \
                       '"3": {"x": -2.7986616377433045, "y": 1.385078921531967}, ' \
                       '"1": {"x": 1.2553300705386103, "y": 10.804867401632372}, ' \
                       '"2": {"x": -4.998284377739827, "y": 4.534988589477597}}}'

    other_runhistory_filename = os.path.join(self.tmp_dir, 'runhistory.json')
    with open(other_runhistory_filename, 'w') as fh:
        fh.write(other_runhistory)

    # load from an empty runhistory
    runhistory = RunHistory(aggregate_func=average_cost)
    runhistory.load_json(other_runhistory_filename, configuration_space)
    self.assertEqual(sorted(list(runhistory.ids_config.keys())), [1, 2, 3, 4])
    self.assertEqual(len(runhistory.data), 6)

    # load from non-empty runhistory; in case of a duplicate the existing
    # result will be kept and the new one silently discarded
    runhistory = RunHistory(aggregate_func=average_cost)
    configuration_space.seed(1)
    config = configuration_space.sample_configuration()
    runhistory.add(config, 1, 1, StatusType.SUCCESS, seed=1,
                   instance_id='branin')
    id_before = id(runhistory.data[RunKey(1, 'branin', 1)])
    runhistory.update_from_json(other_runhistory_filename, configuration_space)
    id_after = id(runhistory.data[RunKey(1, 'branin', 1)])
    self.assertEqual(len(runhistory.data), 6)
    self.assertEqual(id_before, id_after)

    # load from non-empty runhistory; in case of a duplicate the existing
    # result will be kept and the new one silently discarded
    runhistory = RunHistory(aggregate_func=average_cost)
    configuration_space.seed(1)
    config = configuration_space.sample_configuration()
    config = configuration_space.sample_configuration()
    # This is the former config_3
    config = configuration_space.sample_configuration()
    runhistory.add(config, 1, 1, StatusType.SUCCESS, seed=1,
                   instance_id='branin')
    id_before = id(runhistory.data[RunKey(1, 'branin', 1)])
    runhistory.update_from_json(other_runhistory_filename, configuration_space)
    id_after = id(runhistory.data[RunKey(1, 'branin', 1)])
    self.assertEqual(len(runhistory.data), 7)
    self.assertEqual(id_before, id_after)
    self.assertEqual(sorted(list(runhistory.ids_config.keys())), [1, 2, 3, 4])
    self.assertEqual(
        [runhistory.external[run_key] for run_key in runhistory.data],
        [DataOrigin.INTERNAL] + [DataOrigin.EXTERNAL_SAME_INSTANCES] * 6)
def get_runs(self, configs, insts, repetitions=1, runhistory=None):
    """Generate a list of SMAC-TAE runs to be executed. This means
    combinations of configs with all instances on a certain number of seeds.

    Parameters
    ----------
    configs: list<Configuration>
        configurations to be evaluated
    insts: list<string>
        instances to be validated
    repetitions: int
        number of seeds per instance/config to be evaluated
    runhistory: RunHistory or None
        if given, try to reuse these results and save some runs

    Returns
    -------
    runs: list<dict<string,string,string,string>>
        list with dicts
        [{"config":CONFIG1,"inst":INSTANCE1,"seed":SEED1,"inst_specs":INST_SPECIFICS1},
         {"config":CONFIG2,"inst":INSTANCE2,"seed":SEED2,"inst_specs":INST_SPECIFICS2}]
    """
    # If no instances are given, fix the instances to one "None" instance
    if len(insts) == 0:
        insts = [None]
    # If algorithm is deterministic, fix repetitions to 1
    if self.scen.deterministic:
        self.logger.debug("Fixing repetitions to one, because algorithm is"
                          " deterministic.")
        repetitions = 1

    # Extract relevant information from given runhistory
    inst_seed_config = self._process_runhistory(configs, insts, runhistory)

    # Now create the actual run-list
    runs = []
    # Counter for runs without the need of recalculation
    runs_from_rh = 0

    for i in sorted(insts):
        for rep in range(repetitions):
            configs_evaluated = []
            if runhistory and i in inst_seed_config:
                # Choose seed based on most often evaluated inst-seed-pair
                seed, configs_evaluated = inst_seed_config[i].pop(0)
                # Delete i from dict if list is empty
                if len(inst_seed_config[i]) == 0:
                    inst_seed_config.pop(i)
                # Add runs to runhistory
                for c in configs_evaluated:
                    runkey = RunKey(runhistory.config_ids[c], i, seed)
                    cost, time, status, additional_info = runhistory.data[runkey]
                    self.rh.add(c, cost, time, status, instance_id=i,
                                seed=seed, additional_info=additional_info)
                    runs_from_rh += 1
            else:
                # If no runhistory or no entries for instance, get new seed
                seed = self.rng.randint(MAXINT)
                if self.scen.deterministic:
                    seed = 0

            # configs in inner loop -> same inst-seed-pairs for all configs
            for config in [c for c in configs if c not in configs_evaluated]:
                specs = self.scen.instance_specific[i] if i and i in self.scen.instance_specific else "0"
                runs.append({'config': config,
                             'inst': i,
                             'seed': seed,
                             'inst_specs': specs})

    self.logger.info("Collected %d runs from %d configurations on %d instances "
                     "with %d repetitions.", len(runs), len(configs),
                     len(insts), repetitions)
    self.logger.info("Using %d runs from given runhistory.", runs_from_rh)

    return runs
def _race_challenger(self, challenger: Configuration,
                     incumbent: Configuration,
                     run_history: RunHistory,
                     aggregate_func: typing.Callable):
    """Aggressively race challenger against incumbent.

    Parameters
    ----------
    challenger : Configuration
        configuration which challenges incumbent
    incumbent : Configuration
        best configuration so far
    run_history : RunHistory
        stores all runs we ran so far
    aggregate_func : typing.Callable
        aggregate performance across instances

    Returns
    -------
    new_incumbent: Configuration
        either challenger or incumbent
    """
    # at least one run of challenger
    # to increase chall_indx counter
    first_run = False

    inc_perf = run_history.get_cost(incumbent)

    learning_curve = []
    self._num_run += 1
    self._chall_indx += 1
    pc = None
    for epoch in range(self.max_epochs):
        status, cost, time, add_info = self.tae_runner.start(
            config=challenger,
            instance=None,
            seed=0,
            cutoff=2**32 - 1,
            instance_specific=None,
            pc=pc)
        try:
            pc = add_info["model"]
        except KeyError:
            # model building failed, e.g. because of nan
            break
        learning_curve.append(cost)

        if len(self.learning_curves) > 10 and epoch > self.max_epochs / 4:
            seen_curves = np.array(self.learning_curves)[:, epoch]
            if cost > np.median(seen_curves):
                self.logger.info("Abort run (%f vs %f)" %
                                 (cost, np.median(seen_curves)))
                break

        # delete model in runhistory to be more memory efficient
        chall_id = run_history.config_ids[challenger]
        runkey = RunKey(chall_id, None, 0)
        runvalue = run_history.data[runkey]
        try:
            del runvalue.additional_info["model"]
        except KeyError:
            pass

    if epoch == self.max_epochs - 1:
        self.learning_curves.append(learning_curve)

    chal_perf = cost

    if cost < inc_perf:
        self.logger.info("Challenger (%.4f) is better than incumbent (%.4f)" %
                         (chal_perf, inc_perf))
        # Show changes in the configuration
        params = sorted([(param, incumbent[param], challenger[param])
                         for param in challenger.keys()])
        self.logger.info("Changes in incumbent:")
        for param in params:
            if param[1] != param[2]:
                self.logger.info("  %s : %r -> %r" % param)
            else:
                self.logger.debug("  %s remains unchanged: %r" %
                                  (param[0], param[1]))
        incumbent = challenger

        self.stats.inc_changed += 1
        self.traj_logger.add_entry(train_perf=chal_perf,
                                   incumbent_id=self.stats.inc_changed,
                                   incumbent=challenger)
    else:
        self.logger.debug("Incumbent (%.4f) is better than challenger (%.4f)" %
                          (inc_perf, chal_perf))

    return incumbent
def fmin_smac(func: typing.Callable,
              x0: typing.List[float],
              bounds: typing.List[typing.Iterable[float]],
              maxfun: int = -1,
              rng: typing.Union[np.random.RandomState, int] = None,
              scenario_args: typing.Mapping[str, typing.Any] = None,
              **kwargs):
    """Minimize a function func using the SMAC4HPO facade
    (i.e., a modified version of SMAC).
    This function is a convenience wrapper for the SMAC4HPO class.

    Parameters
    ----------
    func : typing.Callable
        Function to minimize.
    x0 : typing.List[float]
        Initial guess/default configuration.
    bounds : typing.List[typing.Iterable[float]]
        ``(min, max)`` pairs for each element in ``x``, defining the bounds on
        that parameter.
    maxfun : int, optional
        Maximum number of function evaluations.
    rng : np.random.RandomState, optional
        Random number generator used by SMAC.
    scenario_args : typing.Mapping[str, typing.Any]
        Arguments passed to the scenario.
        See smac.scenario.scenario.Scenario
    **kwargs :
        Arguments passed to the optimizer class.
        See ~smac.facade.smac_facade.SMAC

    Returns
    -------
    x : list
        Estimated position of the minimum.
    f : float
        Value of `func` at the minimum.
    s : :class:`smac.facade.smac_hpo_facade.SMAC4HPO`
        SMAC object which enables the user to get e.g., the trajectory and
        runhistory.
    """
    # create configuration space
    cs = ConfigurationSpace()

    # Adjust zero padding
    tmplt = 'x{0:0' + str(len(str(len(bounds)))) + 'd}'

    for idx, (lower_bound, upper_bound) in enumerate(bounds):
        parameter = UniformFloatHyperparameter(name=tmplt.format(idx + 1),
                                               lower=lower_bound,
                                               upper=upper_bound,
                                               default_value=x0[idx])
        cs.add_hyperparameter(parameter)

    # create scenario
    scenario_dict = {
        "run_obj": "quality",
        "cs": cs,
        "deterministic": "true",
        "initial_incumbent": "DEFAULT",
    }
    if scenario_args is not None:
        scenario_dict.update(scenario_args)

    if maxfun > 0:
        scenario_dict["runcount_limit"] = maxfun
    scenario = Scenario(scenario_dict)

    smac = SMAC4HPO(scenario=scenario,
                    tae_runner=ExecuteTAFuncArray,
                    tae_runner_kwargs={'ta': func},
                    rng=rng,
                    **kwargs)

    smac.logger = logging.getLogger(smac.__module__ + "." + smac.__class__.__name__)
    incumbent = smac.optimize()
    config_id = smac.solver.runhistory.config_ids[incumbent]
    run_key = RunKey(config_id, None, 0)
    incumbent_performance = smac.solver.runhistory.data[run_key]
    incumbent = np.array([incumbent[tmplt.format(idx + 1)]
                          for idx in range(len(bounds))], dtype=np.float)

    return incumbent, incumbent_performance.cost, smac
def fmin_smac(func: callable,
              x0: list,
              bounds: list,
              maxfun: int = -1,
              maxtime: int = -1,
              rng: np.random.RandomState = None):
    """Minimize a function func using the SMAC algorithm.
    This function is a convenience wrapper for the SMAC class.

    Parameters
    ----------
    func : callable f(x)
        Function to minimize.
    x0 : list
        Initial guess/default configuration.
    bounds : list
        ``(min, max)`` pairs for each element in ``x``, defining the bounds on
        that parameter.
    maxtime : int, optional
        Maximum runtime in seconds.
    maxfun : int, optional
        Maximum number of function evaluations.
    rng : np.random.RandomState, optional
        Random number generator used by SMAC.

    Returns
    -------
    x : list
        Estimated position of the minimum.
    f : float
        Value of `func` at the minimum.
    s : :class:`smac.facade.smac_facade.SMAC`
        SMAC object which enables the user to get e.g., the trajectory and
        runhistory.
    """
    # create configuration space
    cs = ConfigurationSpace()
    for idx, (lower_bound, upper_bound) in enumerate(bounds):
        parameter = UniformFloatHyperparameter(name="x%d" % (idx + 1),
                                               lower=lower_bound,
                                               upper=upper_bound,
                                               default_value=x0[idx])
        cs.add_hyperparameter(parameter)

    # Create target algorithm runner
    ta = ExecuteTAFuncArray(ta=func)

    # create scenario
    scenario_dict = {
        "run_obj": "quality",
        "cs": cs,
        "deterministic": "true",
        "initial_incumbent": "DEFAULT"
    }

    if maxfun > 0:
        scenario_dict["runcount_limit"] = maxfun
    if maxtime > 0:
        scenario_dict["wallclock_limit"] = maxtime
    scenario = Scenario(scenario_dict)

    smac = SMAC(scenario=scenario, tae_runner=ta, rng=rng)
    smac.logger = logging.getLogger(smac.__module__ + "." + smac.__class__.__name__)
    incumbent = smac.optimize()

    config_id = smac.solver.runhistory.config_ids[incumbent]
    run_key = RunKey(config_id, None, 0)
    incumbent_performance = smac.solver.runhistory.data[run_key]
    incumbent = np.array([incumbent['x%d' % (idx + 1)]
                          for idx in range(len(bounds))], dtype=np.float)

    return incumbent, incumbent_performance.cost, smac
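# Illustrative usage sketch (not from the original code): minimizing a simple
# quadratic with fmin_smac. Only the (x, cost, smac) return signature
# documented above is assumed; the function and parameter values here are
# made up for the example.
def _example_fmin_quadratic():
    def quadratic(x):
        # x is passed as an array-like of the two hyperparameter values
        return (x[0] - 1.0) ** 2 + (x[1] + 0.5) ** 2

    x, cost, smac = fmin_smac(func=quadratic,
                              x0=[0.0, 0.0],
                              bounds=[(-5.0, 5.0), (-5.0, 5.0)],
                              maxfun=25,
                              rng=np.random.RandomState(3))
    return x, cost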
def __init__(self, scenario: Scenario, smac: Union[SMAC, None] = None, mode: str = 'all',
             X: Union[None, List[list], np.ndarray] = None, y: Union[None, List[list], np.ndarray] = None,
             numParams: int = -1, impute: bool = False, seed: int = 12345, run: bool = False,
             max_sample_size: int = -1, fanova_cut_at_default: bool = False, fANOVA_pairwise: bool = True,
             forwardsel_feat_imp: bool = False, incn_quant_var: bool = True,
             marginalize_away_instances: bool = False, save_folder: str = 'PIMP'):
    """
    Interface to be used with SMAC or with X and y matrices.
    :param scenario: The scenario object that knows the configuration space.
    :param smac: The smac object that keeps all the run-data
    :param mode: The mode with which to run PIMP [ablation, fanova, all, forward-selection]
    :param X: Numpy array that contains parameter arrays
    :param y: Numpy array that contains the corresponding performance values
    :param numParams: The number of parameters to evaluate
    :param impute: Flag to decide if censored data gets imputed or not
    :param seed: The random seed
    :param run: Flag to immediately compute the importance values after this setup or not.
    """
    self.scenario = scenario
    self.imp = None
    self.mode = mode
    self.save_folder = save_folder
    if not os.path.exists(self.save_folder):
        os.mkdir(self.save_folder)
    if smac is not None:
        self.imp = Importance(scenario=scenario,
                              runhistory=smac.runhistory,
                              incumbent=smac.solver.incumbent,
                              seed=seed,
                              parameters_to_evaluate=numParams,
                              save_folder='PIMP',
                              impute_censored=impute,
                              max_sample_size=max_sample_size,
                              fANOVA_cut_at_default=fanova_cut_at_default,
                              fANOVA_pairwise=fANOVA_pairwise,
                              forwardsel_feat_imp=forwardsel_feat_imp,
                              incn_quant_var=incn_quant_var,
                              preprocess=marginalize_away_instances)
    elif X is not None and y is not None:
        X = np.array(X)
        y = np.array(y)
        runHist = RunHistory(average_cost)
        if X.shape[0] != y.shape[0]:
            raise Exception("Number of samples in X and y don't match!")
        n_params = len(scenario.cs.get_hyperparameters())
        feats = None
        if X.shape[1] > n_params:
            feats = X[:, n_params:]
            assert feats.shape[1] == scenario.feature_array.shape[1]
            X = X[:, :n_params]

        for p in range(X.shape[1]):
            # Normalize the data to fit into [0, 1]
            _min, _max = np.min(X[:, p]), np.max(X[:, p])
            if _min < 0. or 1 < _max:  # if it is not already normalized
                for id, v in enumerate(X[:, p]):
                    X[id, p] = (v - _min) / (_max - _min)

        # Add everything to a runhistory such that PIMP can work with it
        for x, feat, y_val in zip(X, feats if feats is not None else X, y):
            id = None
            # determine on which instance a configuration was run
            for inst in scenario.feature_dict:
                if np.all(scenario.feature_dict[inst] == feat):
                    id = inst
                    break
            runHist.add(Configuration(scenario.cs, vector=x), y_val, 0, StatusType.SUCCESS, id)
        self.X = X
        self.y = y

        best_ = None
        # Determine incumbent according to the best mean cost in the runhistory
        for config in runHist.config_ids:
            inst_seed_pairs = runHist.get_runs_for_config(config)
            all_ = []
            for inst, seed in inst_seed_pairs:
                rk = RunKey(runHist.config_ids[config], inst, seed)
                all_.append(runHist.data[rk].cost)
            mean = np.mean(all_)
            if best_ is None or best_[0] > mean:
                best_ = (mean, config)
        incumbent = best_[1]
        self.imp = Importance(scenario=scenario,
                              runhistory=runHist,
                              seed=seed,
                              parameters_to_evaluate=numParams,
                              save_folder=self.save_folder,
                              impute_censored=impute,
                              incumbent=incumbent,
                              fANOVA_cut_at_default=fanova_cut_at_default,
                              fANOVA_pairwise=fANOVA_pairwise,
                              forwardsel_feat_imp=forwardsel_feat_imp,
                              incn_quant_var=incn_quant_var,
                              preprocess=marginalize_away_instances)
    else:
        raise Exception('Neither X and y matrices nor a SMAC object were specified to compute the importance '
                        'values from!')
    if run:
        self.compute_importances()
def test_load(self):
    configuration_space = test_helpers.get_branin_config_space()

    other_runhistory = '{"data": [[[2, "branini", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[1, "branin", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[3, "branin-hoo", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[2, null, 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[1, "branini", 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]], ' \
                       '[[4, null, 1], [1, 1,' \
                       '{"__enum__": "StatusType.SUCCESS"}, null]]], ' \
                       '"configs": {' \
                       '"4": {"x": -2.2060968293349363, "y": 5.183410905645716}, ' \
                       '"3": {"x": -2.7986616377433045, "y": 1.385078921531967}, ' \
                       '"1": {"x": 1.2553300705386103, "y": 10.804867401632372}, ' \
                       '"2": {"x": -4.998284377739827, "y": 4.534988589477597}}}'

    other_runhistory_filename = os.path.join(self.tmp_dir, '.runhistory_20.json')
    with open(other_runhistory_filename, 'w') as fh:
        fh.write(other_runhistory)

    # load from an empty runhistory
    runhistory = RunHistory(aggregate_func=average_cost)
    runhistory.load_json(other_runhistory_filename, configuration_space)
    self.assertEqual(sorted(list(runhistory.ids_config.keys())), [1, 2, 3, 4])
    self.assertEqual(len(runhistory.data), 6)

    # load from non-empty runhistory; the existing run will be overridden
    # because it already existed
    runhistory = RunHistory(aggregate_func=average_cost)
    configuration_space.seed(1)
    config = configuration_space.sample_configuration()
    runhistory.add(config, 1, 1, StatusType.SUCCESS, seed=1,
                   instance_id='branin')
    id_before = id(runhistory.data[RunKey(1, 'branin', 1)])
    runhistory.update_from_json(other_runhistory_filename, configuration_space)
    id_after = id(runhistory.data[RunKey(1, 'branin', 1)])
    self.assertEqual(len(runhistory.data), 6)
    self.assertNotEqual(id_before, id_after)

    # load from non-empty runhistory; the existing run will not be
    # overridden, but the config_id will be re-used
    runhistory = RunHistory(aggregate_func=average_cost)
    configuration_space.seed(1)
    config = configuration_space.sample_configuration()
    config = configuration_space.sample_configuration()
    # This is the former config_3
    config = configuration_space.sample_configuration()
    runhistory.add(config, 1, 1, StatusType.SUCCESS, seed=1,
                   instance_id='branin')
    id_before = id(runhistory.data[RunKey(1, 'branin', 1)])
    runhistory.update_from_json(other_runhistory_filename, configuration_space)
    id_after = id(runhistory.data[RunKey(1, 'branin', 1)])
    self.assertEqual(len(runhistory.data), 7)
    self.assertEqual(id_before, id_after)
    print(runhistory.config_ids)
    self.assertEqual(sorted(list(runhistory.ids_config.keys())), [1, 2, 3, 4])
    print(list(runhistory.data.keys()))