def test_objective_runtime(self):
    ''' test if everything is ok with objective runtime (imputing!) '''
    scen = Scenario(self.scen_fn, cmd_options={'run_obj': 'runtime',
                                               'cutoff_time': 5})
    validator = Validator(scen, self.trajectory, self.rng)
    old_configs = [entry["incumbent"] for entry in self.trajectory]
    old_rh = RunHistory()
    for config in old_configs[:int(len(old_configs) / 2)]:
        old_rh.add(config, 1, 1, StatusType.SUCCESS, instance_id='0')
    validator.validate_epm('all', 'train', 1, old_rh)
def test_choose_next(self):
    seed = 42
    config = self.scenario.cs.sample_configuration()
    rh = RunHistory()
    rh.add(config, 10, 10, StatusType.SUCCESS)
    smbo = SMAC4AC(self.scenario, rng=seed, runhistory=rh).solver
    x = next(smbo.epm_chooser.choose_next()).get_array()
    self.assertEqual(x.shape, (2,))
def test_get_config_runs(self):
    ''' get some config runs from runhistory '''
    rh = RunHistory()
    cs = get_config_space()
    config1 = Configuration(cs, values={'a': 1, 'b': 2})
    config2 = Configuration(cs, values={'a': 1, 'b': 3})
    rh.add(config=config1, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=1, seed=1)
    rh.add(config=config2, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=1, seed=1)
    rh.add(config=config1, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=2, seed=2)

    ist = rh.get_runs_for_config(config=config1)
    self.assertEqual(len(ist), 2)
    self.assertEqual(ist[0].instance, 1)
    self.assertEqual(ist[1].instance, 2)
def __init__(self, folder: str, ta_exec_dir: Union[str, None] = None):
    """Initialize scenario, runhistory and incumbent from folder, then
    execute the init-method of the SMAC facade (so SMAC-instances can
    simply be used instead).

    Parameters
    ----------
    folder: string
        output-dir of this run
    ta_exec_dir: string
        if the execution directory of the SMAC-run differs from the cwd,
        there might be problems loading instance-, feature- or PCS-files
        from the scenario-object. Since instance- and PCS-files are
        necessary, specify the path to the execution-dir of SMAC here.
    """
    run_1_existed = os.path.exists('run_1')
    self.logger = logging.getLogger("cave.SMACrun.{}".format(folder))
    in_reader = InputReader()
    self.folder = folder
    self.logger.debug("Loading from %s", folder)
    split_folder = os.path.split(folder)
    self.logger.info(split_folder)
    if ta_exec_dir is None:
        ta_exec_dir = '.'

    self.scen_fn = os.path.join(folder, 'scenario.txt')
    self.rh_fn = os.path.join(folder, 'runhistory.json')
    self.traj_fn = os.path.join(folder, 'traj_aclib2.json')
    self.traj_old_fn = os.path.join(folder, 'traj_old.csv')

    # Create Scenario (disable output_dir to avoid cluttering)
    scen_dict = in_reader.read_scenario_file(self.scen_fn)
    scen_dict['output_dir'] = ""
    with changedir(ta_exec_dir):
        self.scen = Scenario(scen_dict)

    # Load runhistory and trajectory
    self.runhistory = RunHistory(average_cost)
    self.runhistory.update_from_json(self.rh_fn, self.scen.cs)
    self.traj = TrajLogger.read_traj_aclib_format(fn=self.traj_fn,
                                                  cs=self.scen.cs)
    incumbent = self.traj[-1]['incumbent']
    self.train_inst = self.scen.train_insts
    self.test_inst = self.scen.test_insts

    # Initialize SMAC-object
    super().__init__(scenario=self.scen, runhistory=self.runhistory)
    # restore_incumbent=incumbent)
    # TODO use restore, delete next line
    self.solver.incumbent = incumbent

    if (not run_1_existed) and os.path.exists('run_1'):
        shutil.rmtree('run_1')
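# Hedged usage sketch for the loader above. The class name (SMACrun, taken
# from the logger name in this snippet) and the folder path are assumptions,
# not a verified API; the folder is expected to contain scenario.txt,
# runhistory.json and traj_aclib2.json, as read above.
#
#   run = SMACrun(folder='smac3-output/run_1', ta_exec_dir='.')
#   print(run.solver.incumbent)  # incumbent restored from the trajectory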
def test_multiple_budgets(self):
    rh = RunHistory()
    cs = get_config_space()
    config1 = Configuration(cs, values={'a': 1, 'b': 2})

    rh.add(config=config1, cost=10, time=20, status=StatusType.SUCCESS,
           instance_id=1, seed=1, budget=1)
    self.assertEqual(rh.get_cost(config1), 10)

    # only the higher budget gets included in the config cost
    rh.add(config=config1, cost=20, time=20, status=StatusType.SUCCESS,
           instance_id=1, seed=1, budget=2)
    self.assertEqual(rh.get_cost(config1), 20)
    self.assertEqual(rh.get_min_cost(config1), 10)
def test_get_configs_per_budget(self):
    rh = RunHistory()
    cs = get_config_space()

    config1 = Configuration(cs, values={'a': 1, 'b': 1})
    rh.add(config=config1, cost=10, time=10, status=StatusType.SUCCESS,
           instance_id=1, seed=1, budget=1)
    config2 = Configuration(cs, values={'a': 2, 'b': 2})
    rh.add(config=config2, cost=20, time=20, status=StatusType.SUCCESS,
           instance_id=1, seed=1, budget=1)
    config3 = Configuration(cs, values={'a': 3, 'b': 3})
    rh.add(config=config3, cost=30, time=30, status=StatusType.SUCCESS,
           instance_id=1, seed=1, budget=3)

    self.assertListEqual(rh.get_all_configs_per_budget([1]),
                         [config1, config2])
def _get_instances_to_run(self,
                          challenger: Configuration,
                          incumbent: Configuration,
                          run_history: RunHistory,
                          N: int) -> typing.Tuple[typing.List[InstSeedBudgetKey], float]:
    """
    Returns the minimum list of <instance, seed> pairs to run the challenger
    on before comparing it with the incumbent

    Parameters
    ----------
    challenger: Configuration
        promising configuration that is presently being evaluated
    incumbent: Configuration
        incumbent configuration
    run_history: smac.runhistory.runhistory.RunHistory
        stores all runs we ran so far
    N: int
        number of <instance, seed> pairs to select

    Returns
    -------
    typing.List[InstSeedBudgetKey]
        list of <instance, seed, budget> tuples to run
    float
        total (runtime) cost of running the incumbent on the instances
        (used for adaptive capping while racing)
    """
    # get next instances left for the challenger
    # Line 8
    inc_inst_seeds = set(run_history.get_runs_for_config(
        incumbent, only_max_observed_budget=True))
    chall_inst_seeds = set(run_history.get_runs_for_config(
        challenger, only_max_observed_budget=True))
    # Line 10
    missing_runs = list(inc_inst_seeds - chall_inst_seeds)

    # Line 11
    self.rs.shuffle(missing_runs)
    if N < 0:
        raise ValueError('Argument N must not be smaller than zero, '
                         'but is %s' % str(N))
    to_run = missing_runs[:min(N, len(missing_runs))]
    missing_runs = missing_runs[min(N, len(missing_runs)):]

    # for adaptive capping: determine the instance-seed pairs the challenger
    # will share with the incumbent (computed here for efficiency)
    inst_seed_pairs = list(inc_inst_seeds - set(missing_runs))
    # cost used by incumbent for going over all runs in inst_seed_pairs
    inc_sum_cost = run_history.sum_cost(
        config=incumbent,
        instance_seed_budget_keys=inst_seed_pairs,
    )

    return to_run, inc_sum_cost
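# A minimal, self-contained sketch of the selection logic above: the
# challenger still has to be run on exactly those <instance, seed> pairs the
# incumbent has already seen but the challenger has not, capped at N. Plain
# tuples stand in for the InstSeedBudgetKey entries used in the real method.
def _example_missing_runs():
    inc_inst_seeds = {('inst1', 1), ('inst2', 1), ('inst3', 2)}
    chall_inst_seeds = {('inst1', 1)}
    missing_runs = list(inc_inst_seeds - chall_inst_seeds)  # 2 pairs missing
    N = 1
    to_run = missing_runs[:min(N, len(missing_runs))]  # at most N new runs
    return to_run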
def test_choose_next_higher_budget(self):
    seed = 42
    config = self.scenario.cs.sample_configuration
    rh = RunHistory()
    rh.add(config=config(), cost=1, time=10, instance_id=None, seed=1,
           budget=1, additional_info=None, status=StatusType.SUCCESS)
    rh.add(config=config(), cost=2, time=10, instance_id=None, seed=1,
           budget=2, additional_info=None, status=StatusType.SUCCESS)
    rh.add(config=config(), cost=3, time=10, instance_id=None, seed=1,
           budget=2, additional_info=None, status=StatusType.SUCCESS)
    rh.add(config=config(), cost=4, time=10, instance_id=None, seed=1,
           budget=3, additional_info=None, status=StatusType.SUCCESS)
    smbo = SMAC4AC(self.scenario, rng=seed, runhistory=rh).solver
    smbo.epm_chooser.min_samples_model = 2

    # Return two configurations evaluated with budget==2
    X, Y, X_configurations = smbo.epm_chooser._collect_data_to_train_model()
    self.assertListEqual(list(Y.flatten()), [2, 3])
    self.assertEqual(X.shape[0], 2)
    self.assertEqual(X_configurations.shape[0], 2)
def merge_foreign_data(
        scenario: Scenario,
        runhistory: RunHistory,
        in_scenario_list: typing.List[Scenario],
        in_runhistory_list: typing.List[RunHistory],
) -> typing.Tuple[Scenario, RunHistory]:
    """Extend <scenario> and <runhistory> with runhistory data from another
    <in_scenario>, assuming the same PCS and feature space but different
    instances.

    Parameters
    ----------
    scenario: Scenario
        original scenario -- feature dictionary will be extended
    runhistory: RunHistory
        original runhistory -- will be extended by further data points
    in_scenario_list: typing.List[Scenario]
        list of input scenarios
    in_runhistory_list: typing.List[RunHistory]
        list of runhistories wrt <in_scenario>

    Returns
    -------
    scenario: Scenario
    runhistory: RunHistory
    """
    # add further instance features
    for in_scenario in in_scenario_list:
        if scenario.n_features != in_scenario.n_features:
            raise ValueError(
                "Feature space has to be the same for both scenarios (%d vs %d)."
                % (scenario.n_features, in_scenario.n_features))

        if scenario.cs != in_scenario.cs:  # type: ignore[attr-defined] # noqa F821
            raise ValueError("PCS of both scenarios have to be identical.")

        if scenario.cutoff != in_scenario.cutoff:  # type: ignore[attr-defined] # noqa F821
            raise ValueError("Cutoffs of both scenarios have to be identical.")

        scenario.feature_dict.update(in_scenario.feature_dict)

    # extend runhistory
    for rh in in_runhistory_list:
        runhistory.update(rh, origin=DataOrigin.EXTERNAL_DIFFERENT_INSTANCES)

    for key in runhistory.data:
        if scenario.feature_dict.get(key.instance_id) is None:
            raise ValueError(
                "Instance feature for \"%s\" was not found in scenario data."
                % (key.instance_id))

    runhistory.compute_all_costs(instances=scenario.train_insts)

    return scenario, runhistory
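# Hedged usage sketch for merge_foreign_data: all four arguments are assumed
# to be already-loaded Scenario/RunHistory objects that share the same
# configuration space, feature space and cutoff, as the checks above require.
def _example_merge_foreign_data(my_scenario, my_runhistory,
                                warmstart_scenario, warmstart_runhistory):
    return merge_foreign_data(scenario=my_scenario,
                              runhistory=my_runhistory,
                              in_scenario_list=[warmstart_scenario],
                              in_runhistory_list=[warmstart_runhistory])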
def test_no_feature_dict(self):
    scen = Scenario(self.scen_fn, cmd_options={'run_obj': 'quality'})
    scen.feature_array = None
    validator = Validator(scen, self.trajectory)
    old_rh = RunHistory()
    for config in [e["incumbent"] for e in self.trajectory]:
        old_rh.add(config, 1, 1, StatusType.SUCCESS, instance_id='0',
                   seed=127)
    validator.validate_epm(runhistory=old_rh)
def merge_foreign_data(scenario: Scenario,
                       runhistory: RunHistory,
                       in_scenario_list: typing.List[Scenario],
                       in_runhistory_list: typing.List[RunHistory]):
    ''' extend <scenario> and <runhistory> with runhistory data from another
    <in_scenario>, assuming the same pcs and feature space but different
    instances

    Arguments
    ---------
    scenario: Scenario
        original scenario -- feature dictionary will be extended
    runhistory: RunHistory
        original runhistory -- will be extended by further data points
    in_scenario_list: typing.List[Scenario]
        list of input scenarios
    in_runhistory_list: typing.List[RunHistory]
        list of runhistories wrt <in_scenario>

    Returns
    -------
    scenario, runhistory
    '''
    # add further instance features
    for in_scenario in in_scenario_list:
        if scenario.n_features != in_scenario.n_features:
            raise ValueError(
                "Feature space has to be the same for both scenarios (%d vs %d)."
                % (scenario.n_features, in_scenario.n_features))

        if scenario.cs != in_scenario.cs:
            raise ValueError("PCS of both scenarios have to be identical.")

        if scenario.cutoff != in_scenario.cutoff:
            raise ValueError("Cutoffs of both scenarios have to be identical.")

        scenario.feature_dict.update(in_scenario.feature_dict)

    # extend runhistory
    for rh in in_runhistory_list:
        runhistory.update(rh, external_data=True)

    for key in runhistory.data:
        if scenario.feature_dict.get(key.instance_id) is None:
            raise ValueError(
                "Instance feature for \"%s\" was not found in scenario data."
                % (key.instance_id))

    runhistory.compute_all_costs(instances=scenario.train_insts)

    return scenario, runhistory
def _aggregate(self, runs):
    # path_to_folder is the concatenation of the paths of the individual runs
    path_to_folder = '-'.join(sorted(list(set([r.path_to_folder for r in runs]))))
    # budgets are the union of the individual budgets. if they are not the
    # same for all runs (no usecase atm), they get an additional entry with
    # the hash over the string of the combination to avoid false positives
    budgets = [r.reduced_to_budgets for r in runs]
    budget_hash = (['budgetmix-%d' % (hash(str(budgets)))]
                   if len(set([frozenset(b) for b in budgets])) != 1 else [])
    budgets = [a for b in [x for x in budgets if x is not None] for a in b] + budget_hash

    if ConfiguratorRun.identify(path_to_folder, budgets) in self.cache:
        return self.cache[ConfiguratorRun.identify(path_to_folder, budgets)]

    orig_rh, vali_rh = RunHistory(), RunHistory()
    for run in runs:
        orig_rh.update(run.original_runhistory, origin=DataOrigin.INTERNAL)
        vali_rh.update(run.original_runhistory, origin=DataOrigin.INTERNAL)
        if run.validated_runhistory:
            vali_rh.update(run.validated_runhistory,
                           origin=DataOrigin.EXTERNAL_SAME_INSTANCES)

    for rh_name, rh in [("original", orig_rh),
                        ("validated", vali_rh)]:
        self.logger.debug('Combined number of %s RunHistory data points: %d '
                          '# Configurations: %d. # Configurator runs: %d',
                          rh_name, len(rh.data), len(rh.get_all_configs()),
                          len(runs))

    traj = combine_trajectories([run.trajectory for run in runs], self.logger)

    new_cr = ConfiguratorRun(runs[0].scenario,
                             orig_rh,
                             vali_rh,
                             traj,
                             self.analyzing_options,
                             output_dir=self.output_dir,
                             path_to_folder=path_to_folder,
                             reduced_to_budgets=budgets)
    self._cache(new_cr)
    return new_cr
def test_choose_next_2(self):
    # Test with a single configuration in the runhistory!
    def side_effect(X):
        return np.mean(X, axis=1).reshape((-1, 1))

    def side_effect_predict(X):
        m, v = np.ones((X.shape[0], 1)), None
        return m, v

    seed = 42
    incumbent = self.scenario.cs.get_default_configuration()
    rh = RunHistory()
    rh.add(incumbent, 10, 10, StatusType.SUCCESS)
    epm_chooser = SMAC4AC(self.scenario, rng=seed, runhistory=rh).solver.epm_chooser
    epm_chooser.model = mock.Mock(spec=RandomForestWithInstances)
    epm_chooser.model.predict_marginalized_over_instances.side_effect = side_effect_predict
    epm_chooser.acquisition_func._compute = mock.Mock(spec=RandomForestWithInstances)
    epm_chooser.acquisition_func._compute.side_effect = side_effect
    epm_chooser.incumbent = incumbent

    challengers = epm_chooser.choose_next()
    # Convert challenger list (a generator) to a real list
    challengers = [c for c in challengers]

    self.assertEqual(epm_chooser.model.train.call_count, 1)

    # For each configuration it is randomly sampled whether to take it from
    # the list of challengers or to sample it completely at random.
    # Therefore, it is not guaranteed to obtain twice the number of
    # configurations selected by EI.
    self.assertEqual(len(challengers), 9968)

    num_random_search_sorted = 0
    num_random_search = 0
    num_local_search = 0
    for c in challengers:
        self.assertIsInstance(c, Configuration)
        if 'Random Search (sorted)' == c.origin:
            num_random_search_sorted += 1
        elif 'Random Search' == c.origin:
            num_random_search += 1
        elif 'Local Search' == c.origin:
            num_local_search += 1
        else:
            raise ValueError((c.origin,
                              'Local Search' == c.origin,
                              type('Local Search'),
                              type(c.origin)))
    self.assertEqual(num_local_search, 11)
    self.assertEqual(num_random_search_sorted, 5000)
    self.assertEqual(num_random_search, 4957)
def read(run_history: RunHistory,
         output_dirs: typing.Union[str, typing.List[str]],
         configuration_space: ConfigurationSpace,
         logger: logging.Logger) -> None:
    """Update runhistory with run results from concurrent runs of pSMAC.

    Parameters
    ----------
    run_history : smac.runhistory.RunHistory
        RunHistory object to be updated with run information from runhistory
        objects stored in the output directory.
    output_dirs : typing.Union[str, typing.List[str]]
        List of SMAC output directories or a Linux path expression (str)
        which will be cast into a list with glob.glob(). This function will
        search the output directories for files matching the runhistory
        regular expression.
    configuration_space : ConfigSpace.ConfigurationSpace
        A ConfigurationSpace object to check if loaded configurations are valid.
    logger : logging.Logger
    """
    numruns_in_runhistory = len(run_history.data)
    initial_numruns_in_runhistory = numruns_in_runhistory

    if isinstance(output_dirs, str):
        parsed_output_dirs = glob.glob(output_dirs)
        if glob.glob(os.path.join(output_dirs, "run_*")):
            parsed_output_dirs += glob.glob(os.path.join(output_dirs, "run_*"))
    else:
        parsed_output_dirs = output_dirs

    for output_directory in parsed_output_dirs:
        for file_in_output_directory in os.listdir(output_directory):
            match = re.match(RUNHISTORY_RE, file_in_output_directory)
            valid_match = re.match(VALIDATEDRUNHISTORY_RE, file_in_output_directory)
            if match or valid_match:
                runhistory_file = os.path.join(output_directory,
                                               file_in_output_directory)
                run_history.update_from_json(runhistory_file,
                                             configuration_space)

                new_numruns_in_runhistory = len(run_history.data)
                difference = new_numruns_in_runhistory - numruns_in_runhistory
                logger.debug('Shared model mode: Loaded %d new runs from %s'
                             % (difference, runhistory_file))
                numruns_in_runhistory = new_numruns_in_runhistory

    difference = numruns_in_runhistory - initial_numruns_in_runhistory
    logger.info('Shared model mode: Finished loading new runs, '
                'found %d new runs.' % difference)
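# Hedged usage sketch for read(): collect runs written by parallel pSMAC
# workers into a single RunHistory. The glob pattern is illustrative, and the
# no-argument RunHistory() constructor assumes a recent SMAC version.
def _example_psmac_read(configuration_space):
    import logging
    from smac.runhistory.runhistory import RunHistory
    rh = RunHistory()
    read(run_history=rh,
         output_dirs='smac3-output/run_*',
         configuration_space=configuration_space,
         logger=logging.getLogger(__name__))
    return rh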
# Imports assumed for this standalone plotting script (the SMAC module paths
# follow the docstrings in this file; matplotlib is required for the plot):
import argparse
import os

import matplotlib.pyplot as plt

from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario


def main():
    parser = argparse.ArgumentParser(description='test',
                                     fromfile_prefix_chars="@")
    parser.add_argument('-s', '--scenario_file', dest='scenario', required=True)
    parser.add_argument('-rh', '--runhistory_file', dest='runhistory', required=True)
    parser.add_argument('-o', '--output_file', dest='output', required=True)
    args = parser.parse_args()

    scenario = Scenario(args.scenario)

    # We load the runhistory, ...
    rh_path = os.path.join(args.runhistory)
    runhistory = RunHistory(aggregate_func=None)
    runhistory.load_json(rh_path, scenario.cs)

    cost_default = []
    cost_incumbent = []
    # iterate over data because it is an OrderedDict
    for entry, values in runhistory.data.items():
        config_id = entry.config_id  # look up config id
        config = runhistory.ids_config[config_id]  # look up config
        z_ = values.cost  # get cost
        if z_ > 100:
            z_ = 150
        if config_id == 1:
            cost_default.append(z_)
        else:
            cost_incumbent.append(z_)

    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    ax1.plot(cost_incumbent, cost_default, linestyle='None', marker='o',
             color="black")
    ax1.plot([0, 100], [0, 100], 'r-')
    ax1.plot([0, 100], [100, 100], linestyle='dashed', color="black")
    ax1.plot([100, 100], [100, 0], linestyle='dashed', color="black")
    plt.ylabel("Default Configuration")
    plt.xlabel("Incumbent")
    ax1.set_xlim([0, 175])
    ax1.set_ylim([0, 175])
    plt.gca().set_aspect('equal', adjustable='box')
    plt.title("Performance of Incumbent compared to Default Configuration")
    plt.savefig(args.output)
def test_multiple_budgets(self):
    rh = RunHistory()
    cs = get_config_space()
    config1 = Configuration(cs, values={"a": 1, "b": 2})

    rh.add(config=config1, cost=[10, 50], time=20,
           status=StatusType.SUCCESS, instance_id=1, seed=1, budget=1)
    self.assertEqual(rh.get_cost(config1), 1.0)

    # Only the higher budget gets included in the config cost
    # However, we expect that the bounds are changed
    rh.add(config=config1, cost=[20, 25], time=25,
           status=StatusType.SUCCESS, instance_id=1, seed=1, budget=5)
    self.assertEqual(rh.get_cost(config1), 0.5)
def test_add_and_pickle(self):
    ''' simply adding some rundata to runhistory, then pickle it '''
    rh = RunHistory()
    cs = get_config_space()
    config = Configuration(cs, values={'a': 1, 'b': 2})

    self.assertTrue(rh.empty())

    rh.add(config=config, cost=10, time=20, status=StatusType.SUCCESS,
           instance_id=None, seed=None, additional_info=None)
    rh.add(config=config, cost=10, time=20, status=StatusType.SUCCESS,
           instance_id=1, seed=12354, additional_info={"start_time": 10})

    self.assertFalse(rh.empty())

    tmpfile = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    pickle.dump(rh, tmpfile, -1)
    name = tmpfile.name
    tmpfile.close()

    with open(name, 'rb') as fh:
        loaded_rh = pickle.load(fh)
    self.assertEqual(loaded_rh.data, rh.data)
def test_incremental_update(self):
    rh = RunHistory()
    cs = get_config_space()
    config1 = Configuration(cs, values={"a": 1, "b": 2})

    rh.add(config=config1, cost=10, time=20, status=StatusType.SUCCESS,
           instance_id=1, seed=1)
    self.assertEqual(rh.get_cost(config1), 10)

    rh.add(config=config1, cost=20, time=20, status=StatusType.SUCCESS,
           instance_id=2, seed=1)
    self.assertEqual(rh.get_cost(config1), 15)
def test_add_multiple_times(self):
    rh = RunHistory()
    cs = get_config_space()
    config = Configuration(cs, values={'a': 1, 'b': 2})

    for i in range(5):
        rh.add(config=config, cost=i + 1, time=i + 1,
               status=StatusType.SUCCESS, instance_id=None, seed=12345,
               additional_info=None)

    self.assertEqual(len(rh.data), 1)
    self.assertEqual(len(rh.get_runs_for_config(config, only_max_observed_budget=True)), 1)
    self.assertEqual(len(rh._configid_to_inst_seed_budget[1]), 1)
    self.assertEqual(list(rh.data.values())[0].cost, 1)
def merge_foreign_data_from_file(
        scenario: Scenario,
        runhistory: RunHistory,
        in_scenario_fn_list: typing.List[str],
        in_runhistory_fn_list: typing.List[str],
        cs: ConfigurationSpace,
        aggregate_func: typing.Callable = average_cost):
    """Extend <scenario> and <runhistory> with runhistory data from another
    <in_scenario>, assuming the same pcs and feature space but different
    instances.

    Parameters
    ----------
    scenario: Scenario
        original scenario -- feature dictionary will be extended
    runhistory: RunHistory
        original runhistory -- will be extended by further data points
    in_scenario_fn_list: typing.List[str]
        input scenario file names
    in_runhistory_fn_list: typing.List[str]
        list of file names of runhistory dumps
    cs: ConfigurationSpace
        parameter configuration space to read runhistory from file
    aggregate_func: typing.Callable
        function to aggregate the performance of a configuration across
        instances

    Returns
    -------
    scenario: Scenario
    runhistory: RunHistory
    """
    if not in_scenario_fn_list:
        raise ValueError("To read warmstart data from previous runhistories, "
                         "the corresponding scenarios are required. "
                         "Use option --warmstart_scenario")
    scens = [Scenario(scenario=scen_fn, cmd_args={"output_dir": ""})
             for scen_fn in in_scenario_fn_list]
    rhs = []
    for rh_fn in in_runhistory_fn_list:
        rh = RunHistory(aggregate_func)
        rh.load_json(rh_fn, cs)
        rhs.append(rh)

    return merge_foreign_data(scenario, runhistory,
                              in_scenario_list=scens,
                              in_runhistory_list=rhs)
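# Hedged usage sketch for merge_foreign_data_from_file: the file names are
# hypothetical, and cs must be the ConfigurationSpace the dumped runhistories
# were recorded with.
def _example_merge_from_file(scenario, runhistory, cs):
    return merge_foreign_data_from_file(
        scenario=scenario,
        runhistory=runhistory,
        in_scenario_fn_list=['warmstart/scenario.txt'],
        in_runhistory_fn_list=['warmstart/runhistory.json'],
        cs=cs)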
# Imports assumed for this standalone plotting script (same assumptions as
# in the scatter-plot script above):
import argparse
import os

import matplotlib.pyplot as plt

from smac.runhistory.runhistory import RunHistory
from smac.scenario.scenario import Scenario


def main():
    parser = argparse.ArgumentParser(description='test',
                                     fromfile_prefix_chars="@")
    parser.add_argument('-s', '--scenario_file', dest='scenario', required=True)
    parser.add_argument('-rh', '--runhistory_file', dest='runhistory', required=True)
    parser.add_argument('-o', '--output_file', dest='output', required=True)
    args = parser.parse_args()

    scenario = Scenario(args.scenario)

    # We load the runhistory, ...
    rh_path = os.path.join(args.runhistory)
    runhistory = RunHistory(aggregate_func=None)
    runhistory.load_json(rh_path, scenario.cs)

    cost_default = 1
    cost_incumbent = []
    # iterate over data because it is an OrderedDict
    for entry, values in runhistory.data.items():
        config_id = entry.config_id  # look up config id
        config = runhistory.ids_config[config_id]  # look up config
        z_ = values.cost  # get cost
        if config_id == 1:  # default configuration
            cost_default = z_
        else:
            cost_incumbent.append(z_)

    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    x = range(len(cost_incumbent))
    ax1.plot(x, cost_incumbent, linestyle='None', marker='+', color="black",
             label='SMAC')
    ax1.plot([0, len(x)], [cost_default, cost_default], 'r-',
             label='Default Configuration')
    plt.ylabel("Loss")
    plt.xlabel("SMAC")
    plt.title("Performance of Incumbents compared to Default Configuration")
    plt.legend()
    plt.savefig(args.output)
def test_validate_epm(self):
    ''' test using epm to validate '''
    scen = Scenario(self.scen_fn,
                    cmd_args={'run_obj': 'quality',
                              'instances': self.train_insts,
                              'test_instances': self.test_insts,
                              'features': self.feature_dict})
    scen.instance_specific = self.inst_specs
    validator = Validator(scen, self.trajectory, self.rng)
    # Add a few runs and check, if they are correctly processed
    old_configs = [entry["incumbent"] for entry in self.trajectory]
    old_rh = RunHistory(average_cost)
    for config in old_configs[:int(len(old_configs) / 2)]:
        old_rh.add(config, 1, 1, StatusType.SUCCESS, instance_id='0',
                   seed=127)
    validator.validate_epm('all', 'train', 1, old_rh)
def _get_mean_costs(self, incs: typing.List[Configuration],
                    new_rh: RunHistory):
    """Compute mean cost per instance

    Parameters
    ----------
    incs : typing.List[Configuration]
        incumbents determined by all parallel SMAC runs
    new_rh : RunHistory
        runhistory to determine mean performance

    Returns
    -------
    List[float]
        means
    Dict(Config -> Dict(inst_id(str) -> float))
        cost per instance for each incumbent
    """
    config_cost_per_inst = {}
    results = []
    for incumbent in incs:
        cost_per_inst = new_rh.get_instance_costs_for_config(config=incumbent)
        config_cost_per_inst[incumbent] = cost_per_inst
        values = list(cost_per_inst.values())
        if values:
            results.append(np.mean(values))
        else:
            results.append(np.nan)

    return results, config_cost_per_inst
def get_cost_dict_for_config(rh: RunHistory,
                             conf: Configuration,
                             par: int = 1,
                             cutoff: typing.Union[float, None] = None):
    """
    Aggregates loss for configuration on evaluated instances over seeds.

    Parameters
    ----------
    rh: RunHistory
        runhistory with data
    conf: Configuration
        configuration to evaluate
    par: int
        par-factor with which to multiply timeouts
    cutoff: float
        cutoff of scenario - used to penalize costs if par != 1

    Returns
    -------
    cost: dict(instance->cost)
        cost per instance (aggregated over seeds)
    """
    # Check if config is in runhistory
    conf_id = rh.config_ids[conf]

    # Map instances to seeds in dict
    runs = rh.get_runs_for_config(conf)
    instance_to_seeds = dict()
    for run in runs:
        inst, seed = run
        if inst in instance_to_seeds:
            instance_to_seeds[inst].append(seed)
        else:
            instance_to_seeds[inst] = [seed]

    # Get loss per instance
    instance_costs = {i: [rh.data[RunKey(conf_id, i, s)].cost
                          for s in instance_to_seeds[i]]
                      for i in instance_to_seeds}

    # Aggregate (mean over seeds)
    instance_costs = {i: np.mean(instance_costs[i]) for i in instance_costs}

    # TODO: uncomment next line and delete all above after next SMAC dev->master
    # instance_costs = rh.get_instance_costs_for_config(conf)

    if par != 1:
        if cutoff:
            instance_costs = {k: v if v < cutoff else v * par
                              for k, v in instance_costs.items()}
        else:
            raise ValueError("To apply penalization of costs, a cutoff needs "
                             "to be provided.")

    return instance_costs
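# Hedged usage sketch for get_cost_dict_for_config: rh and conf are an
# existing RunHistory and a Configuration contained in it. With par=10 and a
# cutoff of 300 (both illustrative values), every cost at or above the cutoff
# is counted ten-fold, i.e. the PAR10-style penalization implemented above.
def _example_par10_costs(rh, conf):
    return get_cost_dict_for_config(rh, conf, par=10, cutoff=300.0)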
def test_epm_reuse_rf(self):
    """ if no runhistory is passed to the epm, but a model was trained
    before, that model should be reused (if the reuse_epm flag is set) """
    scen = Scenario(self.scen_fn, cmd_args={'run_obj': 'quality'})
    scen.feature_array = None
    validator = Validator(scen, self.trajectory)
    old_rh = RunHistory(average_cost)
    for config in [e["incumbent"] for e in self.trajectory]:
        old_rh.add(config, 1, 1, StatusType.SUCCESS, instance_id='0',
                   seed=127)
    self.assertTrue(isinstance(validator.validate_epm(runhistory=old_rh),
                               RunHistory))
    self.assertTrue(isinstance(validator.validate_epm(
        output_fn="test/test_files/validation/"), RunHistory))
    self.assertRaises(ValueError, validator.validate_epm, reuse_epm=False)
def test_choose_next_2(self):
    def side_effect(X, derivative):
        return np.mean(X, axis=1).reshape((-1, 1))

    smbo = SMAC(self.scenario, rng=1).solver
    smbo.incumbent = self.scenario.cs.sample_configuration()
    smbo.runhistory = RunHistory(aggregate_func=average_cost)
    smbo.model = mock.Mock(spec=RandomForestWithInstances)
    smbo.acquisition_func._compute = mock.Mock(spec=RandomForestWithInstances)
    smbo.acquisition_func._compute.side_effect = side_effect

    X = smbo.rng.rand(10, 2)
    Y = smbo.rng.rand(10, 1)

    x = smbo.choose_next(X, Y)

    self.assertEqual(smbo.model.train.call_count, 1)
    self.assertEqual(len(x), 2020)

    num_random_search = 0
    num_local_search = 0
    for i in range(0, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        if 'Random Search (sorted)' in x[i].origin:
            num_random_search += 1
        elif 'Local Search' in x[i].origin:
            num_local_search += 1
    # the number of local search configs has to be at least 10, since x can
    # have duplicates which can be associated with the local search
    self.assertGreaterEqual(num_local_search, 10)
    for i in range(1, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        self.assertEqual(x[i].origin, 'Random Search')
def _cost(config: Configuration, run_history: RunHistory,
          instance_seed_pairs=None):
    """Return array of all costs for the given config for further calculations.

    Parameters
    ----------
    config : Configuration
        Configuration to calculate objective for
    run_history : RunHistory
        RunHistory object from which the objective value is computed.
    instance_seed_pairs : list, optional (default=None)
        List of tuples of instance-seed pairs. If None, the run_history is
        queried for all runs of the given configuration.

    Returns
    -------
    costs: list
        Array of all costs
    """
    try:
        id_ = run_history.config_ids[config]
    except KeyError:  # challenger was not run so far
        return []

    if instance_seed_pairs is None:
        instance_seed_pairs = run_history.get_runs_for_config(config)

    costs = []
    for i, r in instance_seed_pairs:
        k = RunKey(id_, i, r)
        costs.append(run_history.data[k].cost)
    return costs
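# Hedged usage sketch for _cost: it returns the raw list of per-run costs
# recorded for config (an empty list means the configuration was never run),
# from which e.g. a simple mean can be computed.
def _example_mean_cost(config, run_history):
    costs = _cost(config, run_history)
    return sum(costs) / len(costs) if costs else None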
def __init__(self,
             scenario: typing.Type[Scenario],
             rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
             run_id: int = 1,
             tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
             tae_kwargs: typing.Union[dict, None] = None,
             shared_model: bool = True,
             validate: bool = True,
             n_optimizers: int = 2,
             val_set: typing.Union[typing.List[str], None] = None,
             n_incs: int = 1,
             **kwargs):
    """Constructor

    Parameters
    ----------
    scenario : ~smac.scenario.scenario.Scenario
        Scenario object
    rng: int/np.random.RandomState
        The randomState/seed to pass to each smac run
    run_id: int
        run_id for this hydra run
    tae: ExecuteTARun
        Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
    tae_kwargs: Optional[dict]
        arguments passed to the constructor of '~tae'
    shared_model: bool
        Flag to indicate whether information is shared between SMAC runs or not
    validate: bool / None
        Flag to indicate whether to validate the found configurations or to
        use the SMAC estimates; None => neither, and return the full portfolio
    n_optimizers: int
        Number of optimizers to run in parallel per round
    val_set: typing.List[str]
        List of instance-ids to validate on
    n_incs: int
        Number of incumbents to return (n_incs <= 0 ==> all found configurations)
    """
    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
    self.scenario = scenario
    self.run_id, self.rng = get_rng(rng, run_id, logger=self.logger)
    self.kwargs = kwargs
    self.output_dir = None
    self.rh = RunHistory()
    self._tae = tae
    self._tae_kwargs = tae_kwargs
    if n_optimizers <= 1:
        self.logger.warning('Invalid value in %s: %d. Setting to 2',
                            'n_optimizers', n_optimizers)
    self.n_optimizers = max(n_optimizers, 2)
    self.validate = validate
    self.shared_model = shared_model
    self.n_incs = min(max(1, n_incs), self.n_optimizers)
    if val_set is None:
        self.val_set = scenario.train_insts
    else:
        self.val_set = val_set
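# Hedged instantiation sketch for the constructor above, assuming it belongs
# to SMAC's parallel facade (commonly named PSMAC; the exact class name and
# import path may differ between SMAC versions):
#
#   psmac = PSMAC(scenario=scenario, rng=42, n_optimizers=4,
#                 shared_model=True, validate=True, n_incs=1)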
def test_choose_next_2(self):
    def side_effect(X, derivative):
        return np.mean(X, axis=1).reshape((-1, 1))

    smbo = SMBO(self.scenario, 1)
    smbo.runhistory = RunHistory()
    smbo.model = mock.MagicMock()
    smbo.acquisition_func._compute = mock.MagicMock()
    smbo.acquisition_func._compute.side_effect = side_effect
    # local search would call the underlying local search maximizer,
    # which would have to be mocked out. Replacing the method by random
    # search is way easier!
    smbo._get_next_by_local_search = smbo._get_next_by_random_search

    X = smbo.rng.rand(10, 2)
    Y = smbo.rng.rand(10, 1)

    x = smbo.choose_next(X, Y)

    self.assertEqual(smbo.model.train.call_count, 1)
    self.assertEqual(smbo.acquisition_func._compute.call_count, 1)
    self.assertEqual(len(x), 2020)

    num_random_search = 0
    for i in range(0, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        if x[i].origin == 'Random Search':
            num_random_search += 1
    # Since we replaced local search with random search, we have to count
    # the occurrences of random search instead
    self.assertEqual(num_random_search, 10)
    for i in range(1, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        self.assertEqual(x[i].origin, 'Random Search')
def __init__(self,
             scenario: typing.Type[Scenario],
             rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
             run_id: int = 1,
             tae: typing.Type[BaseRunner] = ExecuteTARunOld,
             tae_kwargs: typing.Union[dict, None] = None,
             shared_model: bool = True,
             validate: bool = True,
             n_optimizers: int = 2,
             val_set: typing.Union[typing.List[str], None] = None,
             n_incs: int = 1,
             **kwargs):
    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
    self.scenario = scenario
    self.run_id, self.rng = get_rng(rng, run_id, logger=self.logger)
    self.kwargs = kwargs
    self.output_dir = None
    self.rh = RunHistory()
    self._tae = tae
    self._tae_kwargs = tae_kwargs
    if n_optimizers <= 1:
        self.logger.warning('Invalid value in %s: %d. Setting to 2',
                            'n_optimizers', n_optimizers)
    self.n_optimizers = max(n_optimizers, 2)
    self.validate = validate
    self.shared_model = shared_model
    self.n_incs = min(max(1, n_incs), self.n_optimizers)
    if val_set is None:
        self.val_set = scenario.train_insts
    else:
        self.val_set = val_set
def run_smbo(self, max_iters=1000):
    global evaluator

    # == first things first: load the datamanager
    self.reset_data_manager()

    # == Initialize SMBO stuff
    # first create a scenario
    seed = self.seed  # TODO
    num_params = len(self.config_space.get_hyperparameters())
    # allocate a run history
    run_history = RunHistory()
    meta_runhistory = RunHistory()
    meta_runs_dataset_indices = {}
    num_run = self.start_num_run
    instance_id = self.dataset_name + SENTINEL

    # == Train on subset
    # before doing anything, run the default_cfg on a subset of the
    # available data to ensure that we at least have some models.
    # we will try three different ratios of decreasing magnitude in the
    # hope that at least on the last one we will be able to get a model
    n_data = self.datamanager.data['X_train'].shape[0]
    subset_ratio = 10000. / n_data
    if subset_ratio >= 0.5:
        subset_ratio = 0.33
        subset_ratios = [subset_ratio, subset_ratio * 0.10]
    else:
        subset_ratios = [subset_ratio, 500. / n_data]
    self.logger.info("Training default configurations on a subset of "
                     "%d/%d data points." %
                     (int(n_data * subset_ratio), n_data))

    # the time limit for these function evaluations is rigorously set to
    # only 1/2 of a full function evaluation
    subset_time_limit = max(5, int(self.func_eval_time_limit / 2))
    # the configs we want to run on the data subset are:
    # 1) the default configs
    # 2) a set of configs we selected for training on a subset
    subset_configs = [self.config_space.get_default_configuration()] \
        + self.collect_additional_subset_defaults()
    subset_config_successful = [False] * len(subset_configs)
    for subset_config_id, next_config in enumerate(subset_configs):
        for i, ratio in enumerate(subset_ratios):
            self.reset_data_manager()
            n_data_subsample = int(n_data * ratio)

            # run the config, but throw away the result afterwards
            # since this cfg was evaluated only on a subset
            # and we don't want to confuse SMAC
            self.logger.info("Starting to evaluate %d on SUBSET "
                             "with size %d and time limit %ds.",
                             num_run, n_data_subsample, subset_time_limit)
            self.logger.info(next_config)
            _info = eval_with_limits(
                self.datamanager, self.tmp_dir, next_config, seed, num_run,
                self.resampling_strategy, self.resampling_strategy_args,
                self.memory_limit, subset_time_limit, n_data_subsample)
            (duration, result, _, additional_run_info, status) = _info
            self.logger.info("Finished evaluating %d. configuration on SUBSET. "
                             "Duration %f; loss %f; status %s; additional run "
                             "info: %s ", num_run, duration, result,
                             str(status), additional_run_info)

            num_run += 1
            if i < len(subset_ratios) - 1:
                if status != StatusType.SUCCESS:
                    # Do not increase num_run here, because we will try
                    # the same configuration with less data
                    self.logger.info("A CONFIG did not finish "
                                     " for subset ratio %f -> going smaller",
                                     ratio)
                    continue
                else:
                    self.logger.info("Finished SUBSET training successfully "
                                     "with ratio %f", ratio)
                    subset_config_successful[subset_config_id] = True
                    break
            else:
                if status != StatusType.SUCCESS:
                    self.logger.info("A CONFIG did not finish "
                                     " for subset ratio %f.", ratio)
                    continue
                else:
                    self.logger.info("Finished SUBSET training successfully "
                                     "with ratio %f", ratio)
                    subset_config_successful[subset_config_id] = True
                    break

    # Use the first non-failing configuration from the subsets as the new
    # default configuration -> this guards us against the random forest
    # failing on large, sparse datasets
    default_cfg = None
    for subset_config_id, next_config in enumerate(subset_configs):
        if subset_config_successful[subset_config_id]:
            default_cfg = next_config
            break
    if default_cfg is None:
        default_cfg = self.config_space.get_default_configuration()

    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.metadata_directory is None:
        metalearning_directory = os.path.dirname(
            autosklearn.metalearning.__file__)
        # There is no multilabel data in OpenML
        if self.task == MULTILABEL_CLASSIFICATION:
            meta_task = BINARY_CLASSIFICATION
        else:
            meta_task = self.task
        metadata_directory = os.path.join(
            metalearning_directory, 'files',
            '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                          TASK_TYPES_TO_STRING[meta_task],
                          'sparse' if self.datamanager.info['is_sparse']
                          else 'dense'))
        self.metadata_directory = metadata_directory

    self.logger.info('Metadata directory: %s', self.metadata_directory)
    meta_base = MetaBase(self.config_space, self.metadata_directory)

    metafeature_calculation_time_limit = int(self.total_walltime_limit / 4)
    metafeature_calculation_start_time = time.time()
    meta_features = self._calculate_metafeatures_with_limits(
        metafeature_calculation_time_limit)
    metafeature_calculation_end_time = time.time()
    metafeature_calculation_time_limit = \
        metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

    if metafeature_calculation_time_limit < 1:
        self.logger.warning('Time limit for metafeature calculation less '
                            'than 1 seconds (%f). Skipping calculation '
                            'of metafeatures for encoded dataset.',
                            metafeature_calculation_time_limit)
        meta_features_encoded = None
    else:
        self.datamanager.perform1HotEncoding()
        meta_features_encoded = \
            self._calculate_metafeatures_encoded_with_limits(
                metafeature_calculation_time_limit)

    # In case there is a problem calculating the encoded meta-features
    if meta_features is None:
        if meta_features_encoded is not None:
            meta_features = meta_features_encoded
    else:
        if meta_features_encoded is not None:
            meta_features.metafeature_values.update(
                meta_features_encoded.metafeature_values)

    if meta_features is not None:
        meta_base.add_dataset(instance_id, meta_features)
        # Do mean imputation of the meta-features - should be done
        # specifically for each prediction model!
        all_metafeatures = meta_base.get_metafeatures(
            features=list(meta_features.keys()))
        all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

        metalearning_configurations = self.collect_metalearning_suggestions(
            meta_base)
        if metalearning_configurations is None:
            metalearning_configurations = []
        self.reset_data_manager()

        self.logger.info('%s', meta_features)

        # Convert meta-features into a dictionary because the scenario
        # expects a dictionary
        meta_features_dict = {}
        for dataset, series in all_metafeatures.iterrows():
            meta_features_dict[dataset] = series.values
        meta_features_list = []
        for meta_feature_name in all_metafeatures.columns:
            meta_features_list.append(
                meta_features[meta_feature_name].value)
        meta_features_list = np.array(meta_features_list).reshape((1, -1))
        self.logger.info(list(meta_features_dict.keys()))

        meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
        meta_runs_index = 0
        try:
            meta_durations = meta_base.get_all_runs('runtime')
            read_runtime_data = True
        except KeyError:
            read_runtime_data = False
            self.logger.critical('Cannot read runtime data.')
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'

        for meta_dataset in meta_runs.index:
            meta_dataset_start_index = meta_runs_index
            for meta_configuration in meta_runs.columns:
                if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                    try:
                        config = meta_base.get_configuration_from_algorithm_index(
                            meta_configuration)
                        cost = meta_runs.loc[meta_dataset, meta_configuration]
                        if read_runtime_data:
                            runtime = meta_durations.loc[meta_dataset,
                                                         meta_configuration]
                        else:
                            runtime = 1
                        # TODO read out other status types!
                        meta_runhistory.add(config, cost, runtime,
                                            StatusType.SUCCESS,
                                            instance_id=meta_dataset)
                        meta_runs_index += 1
                    except Exception:
                        # TODO maybe add warning
                        pass

            meta_runs_dataset_indices[meta_dataset] = (
                meta_dataset_start_index, meta_runs_index)
    else:
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    self.scenario = AutoMLScenario(self.config_space,
                                   self.total_walltime_limit,
                                   self.func_eval_time_limit,
                                   meta_features_dict,
                                   self.tmp_dir,
                                   self.shared_mode)

    types = get_types(self.config_space, self.scenario.feature_array)
    if self.acquisition_function == 'EI':
        rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = RandomForestWithInstances(types,
                                          instance_features=meta_features_list,
                                          seed=1, num_trees=10)
        smac = SMBO(self.scenario, model=model, rng=seed)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types, num_trees=10,
            instance_features=meta_features_list, seed=1)
        acquisition_function = EIPS(model)
        smac = SMBO(self.scenario,
                    acquisition_function=acquisition_function,
                    model=model, runhistory2epm=rh2EPM, rng=seed)
    else:
        raise ValueError('Unknown acquisition function value %s!'
                         % self.acquisition_function)

    # Build a runtime model
    # runtime_rf = RandomForestWithInstances(types,
    #                                        instance_features=meta_features_list,
    #                                        seed=1, num_trees=10)
    # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
    #                                      scenario=self.scenario,
    #                                      success_states=None,
    #                                      impute_censored_data=False,
    #                                      impute_state=None)
    # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
    # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())

    X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
    # Transform Y_meta on a per-dataset base
    for meta_dataset in meta_runs_dataset_indices:
        start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        end_index += 1  # Python indexing
        Y_meta[start_index:end_index, 0][
            Y_meta[start_index:end_index, 0] > 2.0] = 2.0
        dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        Y_meta[start_index:end_index, 0] = 1 - (
            (1. - Y_meta[start_index:end_index, 0]) /
            (1. - dataset_minimum))
        Y_meta[start_index:end_index, 0][
            Y_meta[start_index:end_index, 0] > 2] = 2

    # == first, evaluate all metalearning and default configurations
    for i, next_config in enumerate(([default_cfg] +
                                     metalearning_configurations)):
        # Do not evaluate default configurations more than once
        if i >= len([default_cfg]) and next_config in [default_cfg]:
            continue

        config_name = 'meta-learning' if i >= len([default_cfg]) \
            else 'default'

        self.logger.info("Starting to evaluate %d. configuration "
                         "(%s configuration) with time limit %ds.",
                         num_run, config_name, self.func_eval_time_limit)
        self.logger.info(next_config)
        self.reset_data_manager()
        info = eval_with_limits(self.datamanager, self.tmp_dir, next_config,
                                seed, num_run, self.resampling_strategy,
                                self.resampling_strategy_args,
                                self.memory_limit,
                                self.func_eval_time_limit)
        (duration, result, _, additional_run_info, status) = info
        run_history.add(config=next_config, cost=result,
                        time=duration, status=status,
                        instance_id=instance_id, seed=seed)
        run_history.update_cost(next_config, result)
        self.logger.info("Finished evaluating %d. configuration. "
                         "Duration %f; loss %f; status %s; additional run "
                         "info: %s ", num_run, duration, result,
                         str(status), additional_run_info)
        num_run += 1
        if smac.incumbent is None:
            smac.incumbent = next_config
        elif result < run_history.get_cost(smac.incumbent):
            smac.incumbent = next_config

        if self.scenario.shared_model:
            pSMAC.write(run_history=run_history,
                        output_directory=self.scenario.output_dir,
                        num_run=self.seed)

    # == after metalearning run SMAC loop
    smac.runhistory = run_history
    smac_iter = 0
    finished = False
    while not finished:
        if self.scenario.shared_model:
            pSMAC.read(run_history=run_history,
                       output_directory=self.scenario.output_dir,
                       configuration_space=self.config_space,
                       logger=self.logger)

        next_configs = []
        time_for_choose_next = -1
        try:
            X_cfg, Y_cfg = rh2EPM.transform(run_history)

            if not run_history.empty():
                # Update costs by normalization
                dataset_minimum = np.min(Y_cfg[:, 0])
                Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                                   (1. - dataset_minimum))
                Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2

            if len(X_meta) > 0 and len(X_cfg) > 0:
                pass
                # X_cfg = np.concatenate((X_meta, X_cfg))
                # Y_cfg = np.concatenate((Y_meta, Y_cfg))
            elif len(X_meta) > 0:
                X_cfg = X_meta.copy()
                Y_cfg = Y_meta.copy()
            elif len(X_cfg) > 0:
                X_cfg = X_cfg.copy()
                Y_cfg = Y_cfg.copy()
            else:
                raise ValueError('No training data for SMAC random forest!')

            self.logger.info('Using %d training points for SMAC.'
                             % X_cfg.shape[0])
            choose_next_start_time = time.time()
            next_configs_tmp = smac.choose_next(
                X_cfg, Y_cfg,
                num_interleaved_random=110,
                num_configurations_by_local_search=10,
                num_configurations_by_random_search_sorted=100)
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))
            next_configs.extend(next_configs_tmp)
        # TODO put Exception here!
        except Exception as e:
            self.logger.error(e)
            self.logger.error("Error in getting next configurations "
                              "with SMAC. Using random configuration!")
            next_config = self.config_space.sample_configuration()
            next_configs.append(next_config)

        models_fitted_this_iteration = 0
        start_time_this_iteration = time.time()
        for next_config in next_configs:
            x_runtime = impute_inactive_values(next_config)
            x_runtime = impute_inactive_values(x_runtime).get_array()
            # predicted_runtime = runtime_rf.predict_marginalized_over_instances(
            #     x_runtime.reshape((1, -1)))
            # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1

            self.logger.info("Starting to evaluate %d. configuration (from "
                             "SMAC) with time limit %ds.",
                             num_run, self.func_eval_time_limit)
            self.logger.info(next_config)
            self.reset_data_manager()
            info = eval_with_limits(self.datamanager, self.tmp_dir,
                                    next_config, seed, num_run,
                                    self.resampling_strategy,
                                    self.resampling_strategy_args,
                                    self.memory_limit,
                                    self.func_eval_time_limit)
            (duration, result, _, additional_run_info, status) = info
            run_history.add(config=next_config, cost=result,
                            time=duration, status=status,
                            instance_id=instance_id, seed=seed)
            run_history.update_cost(next_config, result)
            # self.logger.info('Predicted runtime %g, true runtime %g',
            #                  predicted_runtime, duration)

            # TODO add unittest to make sure everything works fine and
            # this does not get outdated!
            if smac.incumbent is None:
                smac.incumbent = next_config
            elif result < run_history.get_cost(smac.incumbent):
                smac.incumbent = next_config

            self.logger.info("Finished evaluating %d. configuration. "
                             "Duration: %f; loss: %f; status %s; additional "
                             "run info: %s ", num_run, duration, result,
                             str(status), additional_run_info)
            smac_iter += 1
            num_run += 1

            models_fitted_this_iteration += 1
            time_used_this_iteration = time.time() - start_time_this_iteration
            if models_fitted_this_iteration >= 2 and \
                    time_for_choose_next > 0 and \
                    time_used_this_iteration > time_for_choose_next:
                break
            elif time_for_choose_next <= 0 and \
                    models_fitted_this_iteration >= 1:
                break
            elif models_fitted_this_iteration >= 50:
                break

        if max_iters is not None:
            # stop once the iteration budget is exhausted
            finished = (smac_iter >= max_iters)

        if self.scenario.shared_model:
            pSMAC.write(run_history=run_history,
                        output_directory=self.scenario.output_dir,
                        num_run=self.seed)