def test_psmac(self):
    with joblib.parallel_backend('multiprocessing', n_jobs=1):
        optimizer = PSMAC(self.scenario, n_optimizers=3, n_incs=2, validate=False)
        incs = optimizer.optimize()
        self.assertEqual(len(incs), 2)
        optimizer = PSMAC(self.scenario, n_optimizers=1, n_incs=4, validate=False)
        incs = optimizer.optimize()
        # n_incs is capped at n_optimizers, so a single optimizer yields a single incumbent
        self.assertEqual(len(incs), 1)
        optimizer = PSMAC(self.scenario, n_optimizers=5, n_incs=4, validate=False)
        incs = optimizer.optimize()
        self.assertEqual(len(incs), 4)
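# --- Illustration only: a possible fixture for the test above ---
# The test relies on `self.scenario` being prepared in setUp(). Below is a minimal,
# hedged sketch of such a fixture using the dict-based Scenario construction that also
# appears later in this section; the concrete hyperparameter and limits are hypothetical.
import unittest

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.scenario.scenario import Scenario


class TestPSMACFacadeSketch(unittest.TestCase):

    def setUp(self):
        cs = ConfigurationSpace()
        cs.add_hyperparameter(UniformFloatHyperparameter("x", -5.0, 5.0))
        # The assertions above hinge on PSMAC capping n_incs at n_optimizers;
        # any small scenario is enough to exercise that behaviour.
        self.scenario = Scenario({
            "run_obj": "quality",    # optimize solution quality rather than runtime
            "runcount-limit": 8,     # keep the toy run short
            "cs": cs,
            "deterministic": "true",
        })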
class Hydra(object):
    """Facade to use Hydra default mode.

    Attributes
    ----------
    logger
    stats : Stats
        logs information about used resources
    solver : SMBO
        handles the actual algorithm calls
    rh : RunHistory
        List with information about previous runs
    portfolio : list
        List of all incumbents
    """

    def __init__(self,
                 scenario: Scenario,
                 n_iterations: int,
                 val_set: str = 'train',
                 incs_per_round: int = 1,
                 n_optimizers: int = 1,
                 rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
                 run_id: int = 1,
                 tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
                 tae_kwargs: typing.Union[dict, None] = None,
                 **kwargs):
        """Constructor

        Parameters
        ----------
        scenario : ~smac.scenario.scenario.Scenario
            Scenario object
        n_iterations: int
            Number of Hydra iterations
        val_set: str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X% of the training set (train_set * X/100), where X in (0, 100)
        incs_per_round: int
            Number of incumbents to keep per round
        n_optimizers: int
            Number of optimizers to run in parallel per round
        rng: int/np.random.RandomState
            The randomState/seed to pass to each smac run
        run_id: int
            run_id for this hydra run
        tae: ExecuteTARun
            Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
        tae_kwargs: Optional[dict]
            Arguments passed to the constructor of '~tae'
        """
        self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)

        self.n_iterations = n_iterations
        self.scenario = scenario
        self.run_id, self.rng = get_rng(rng, run_id, self.logger)
        self.kwargs = kwargs
        self.output_dir = None
        self.top_dir = None
        self.solver = None
        self.portfolio = None
        self.rh = RunHistory(average_cost, file_system=scenario.file_system)
        self._tae = tae
        self._tae_kwargs = tae_kwargs

        if incs_per_round <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1', 'incs_per_round', incs_per_round)
        self.incs_per_round = max(incs_per_round, 1)

        if n_optimizers <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1', 'n_optimizers', n_optimizers)
        self.n_optimizers = max(n_optimizers, 1)

        self.val_set = self._get_validation_set(val_set)
        self.cost_per_inst = {}
        self.optimizer = None
        self.portfolio_cost = None

    def _get_validation_set(self, val_set: str, delete: bool = True) -> typing.List[str]:
        """Create a small validation set for Hydra to determine incumbent performance.

        Parameters
        ----------
        val_set: str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X% of the training set (train_set * X/100), where X in (0, 100)
        delete: bool
            Flag to delete all validation instances from the training set

        Returns
        -------
        val: typing.List[str]
            List of instance-ids to validate on
        """
        if val_set == 'none':
            return None
        if val_set == 'train':
            return self.scenario.train_insts
        elif val_set[:3] != 'val':
            self.logger.warning('Can not determine validation set size. Using full training-set!')
            return self.scenario.train_insts
        else:
            size = int(val_set[3:]) / 100
            if size <= 0 or size >= 1:
                raise ValueError('X in valX invalid, should be between 0 and 100 (exclusive)')
            insts = np.array(self.scenario.train_insts)
            # just to make sure this also works with the small example we have to round up to 3
            size = max(np.floor(insts.shape[0] * size).astype(int), 3)
            ids = np.random.choice(insts.shape[0], size, replace=False)
            val = insts[ids].tolist()
            if delete:
                self.scenario.train_insts = np.delete(insts, ids).tolist()
            return val

    def optimize(self) -> typing.List[Configuration]:
        """Optimizes the algorithm provided in scenario (given in constructor).

        Returns
        -------
        portfolio : typing.List[Configuration]
            Portfolio of found configurations
        """
        # Setup output directory
        self.portfolio = []
        portfolio_cost = np.inf
        if self.output_dir is None:
            self.top_dir = "hydra-output_%s" % (
                datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f'))
            self.scenario.output_dir = os.path.join(
                self.top_dir,
                "psmac3-output_%s" % (
                    datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario, run_id=self.run_id, logger=self.logger)

        scen = copy.deepcopy(self.scenario)
        scen.output_dir_for_this_run = None
        scen.output_dir = None
        # parent process SMAC only used for validation purposes
        self.solver = SMAC4AC(scenario=scen, tae_runner=self._tae, rng=self.rng, run_id=self.run_id, **self.kwargs)

        for i in range(self.n_iterations):
            self.logger.info("=" * 120)
            self.logger.info("Hydra Iteration: %d", (i + 1))

            if i == 0:
                tae = self._tae
                tae_kwargs = self._tae_kwargs
            else:
                tae = ExecuteTARunHydra
                if self._tae_kwargs:
                    tae_kwargs = self._tae_kwargs
                else:
                    tae_kwargs = {}
                tae_kwargs['cost_oracle'] = self.cost_per_inst

            self.optimizer = PSMAC(
                scenario=self.scenario,
                run_id=self.run_id,
                rng=self.rng,
                tae=tae,
                tae_kwargs=tae_kwargs,
                shared_model=False,
                validate=True if self.val_set else False,
                n_optimizers=self.n_optimizers,
                val_set=self.val_set,
                n_incs=self.n_optimizers,  # return all configurations (unvalidated)
                **self.kwargs)
            self.optimizer.output_dir = self.output_dir
            incs = self.optimizer.optimize()
            cost_per_conf_v, val_ids, cost_per_conf_e, est_ids = self.optimizer.get_best_incumbents_ids(incs)
            if self.val_set:
                to_keep_ids = val_ids[:self.incs_per_round]
            else:
                to_keep_ids = est_ids[:self.incs_per_round]
            config_cost_per_inst = {}
            incs = incs[to_keep_ids]
            self.logger.info('Kept incumbents')
            for inc in incs:
                self.logger.info(inc)
                config_cost_per_inst[inc] = cost_per_conf_v[inc] if self.val_set else cost_per_conf_e[inc]

            cur_portfolio_cost = self._update_portfolio(incs, config_cost_per_inst)
            if portfolio_cost <= cur_portfolio_cost:
                self.logger.info("No further progress (%f) --- terminate hydra", portfolio_cost)
                break
            else:
                portfolio_cost = cur_portfolio_cost
                self.logger.info("Current portfolio cost: %f", portfolio_cost)

            self.scenario.output_dir = os.path.join(
                self.top_dir,
                "psmac3-output_%s" % (
                    datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario, run_id=self.run_id, logger=self.logger)

        read(self.rh,
             os.path.join(self.top_dir, 'psmac3*', 'run_' + str(MAXINT)),
             self.scenario.cs,
             self.logger)
        self.rh.save_json(fn=os.path.join(self.top_dir, 'all_validated_runs_runhistory.json'),
                          save_external=True)
        with open(os.path.join(self.top_dir, 'portfolio.pkl'), 'wb') as fh:
            pickle.dump(self.portfolio, fh)
        self.logger.info("~" * 120)
        self.logger.info('Resulting Portfolio:')
        for configuration in self.portfolio:
            self.logger.info(str(configuration))
        self.logger.info("~" * 120)

        return self.portfolio

    def _update_portfolio(self, incs: np.ndarray,
                          config_cost_per_inst: typing.Dict) -> typing.Union[np.float, float]:
        """Validates all configurations (in incs) and determines which ones to add to the portfolio.

        Parameters
        ----------
        incs: np.ndarray
            List of Configurations
        config_cost_per_inst: typing.Dict
            Cost per instance for each configuration in incs

        Returns
        -------
        cur_cost: typing.Union[np.float, float]
            The current cost of the portfolio
        """
        if self.val_set:  # we have validated data
            for kept in incs:
                if kept not in self.portfolio:
                    self.portfolio.append(kept)
                    cost_per_inst = config_cost_per_inst[kept]
                    if self.cost_per_inst:
                        if len(self.cost_per_inst) != len(cost_per_inst):
                            raise ValueError('Num validated Instances mismatch!')
                        else:
                            for key in cost_per_inst:
                                self.cost_per_inst[key] = min(self.cost_per_inst[key], cost_per_inst[key])
                    else:
                        self.cost_per_inst = cost_per_inst
            cur_cost = np.mean(list(self.cost_per_inst.values()))  # type: np.float
        else:  # No validated data. Set the mean to the approximated mean
            # can contain nans as not every instance was evaluated, thus we use nanmean to approximate
            means = []
            for kept in incs:
                means.append(np.nanmean(list(self.optimizer.rh.get_instance_costs_for_config(kept).values())))
                self.portfolio.append(kept)
            if self.portfolio_cost:
                new_mean = self.portfolio_cost * (len(self.portfolio) - len(incs)) / len(self.portfolio)
                new_mean += np.nansum(means)
            else:
                new_mean = np.mean(means)
            self.cost_per_inst = defaultdict(lambda: new_mean)
            cur_cost = new_mean

        self.portfolio_cost = cur_cost
        return cur_cost
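# --- Illustration only: driving the Hydra facade defined above ---
# A hedged usage sketch, not the repository's own example. It assumes a scenario file
# that defines a target-algorithm command line and training instances (the path below
# is hypothetical); only constructor arguments documented above are used.
if __name__ == "__main__":
    import numpy as np
    from smac.scenario.scenario import Scenario

    scenario = Scenario("path/to/scenario.txt")  # hypothetical scenario with `algo` and instances
    hydra = Hydra(
        scenario=scenario,
        n_iterations=3,        # three Hydra rounds
        val_set="val20",       # validate on 20% of the training instances
        incs_per_round=1,      # keep one incumbent per round
        n_optimizers=2,        # two parallel SMAC runs per round
        rng=np.random.RandomState(42),
        run_id=1,
    )
    portfolio = hydra.optimize()
    print("Portfolio size:", len(portfolio))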
def fit(self,
        dataset: BinaryDataset,
        clustering_algs: List[str],
        feature_selection_algs: List[str],
        n_evaluations: int = 30,
        cutoff_time=20,
        evaluator: Callable = Measures.silhouette,
        experiments_dir: str = "../../experiments",
        n_optimizers=2,
        top_algorithms_count=5) -> List[str]:
    cs = build_config_space(clustering_ls=clustering_algs, feature_selection_ls=feature_selection_algs)

    config_name: str = "mixed"
    if len(clustering_algs) == 1 and len(feature_selection_algs) == 1:
        config_name = f"{feature_selection_algs[0]}_{clustering_algs[0]}"
    base_dir_name = _create_smac_directory(experiments_dir, evaluator.__name__, config_name)

    scenario_params: dict = {
        "run_obj": "quality",
        "runcount-limit": n_evaluations,
        "cutoff_time": cutoff_time,
        "cs": cs,
        "deterministic": "false",
        "output_dir": base_dir_name,
        "abort_on_first_run_crash": False,
        "shared_model": True,
        "input_psmac_dirs": _create_psmac_dirs(base_dir_name, n_optimizers)
    }
    scenario = Scenario(scenario_params)
    dataset_content = dataset.load_dataset()

    def fit_models(cfg: dict, data: np.ndarray):
        feature_selection_alg = Mapper.get_class(cfg["feature_selection_choice"])
        cfg_feature_selection: dict = {
            decode_parameter(k, feature_selection_alg.name): v
            for k, v in cfg.items()
            if decode_parameter(k, feature_selection_alg.name) is not None
        }
        feature_selection_model = feature_selection_alg.model(**cfg_feature_selection)
        selected_data: np.ndarray = feature_selection_model.fit_transform(data)

        clustering_alg = Mapper.get_class(cfg["clustering_choice"])
        cfg_clustering: dict = {
            decode_parameter(k, clustering_alg.name): v
            for k, v in cfg.items()
            if decode_parameter(k, clustering_alg.name) is not None
        }
        clustering_model = clustering_alg.model(**cfg_clustering)
        clustering_result = clustering_model.fit_predict(selected_data)
        return feature_selection_model, clustering_model, clustering_result

    def cfg_to_dict(cfg):
        cfg = {k: cfg[k] for k in cfg if cfg[k]}
        return {k: v for k, v in cfg.items() if v is not None}

    def evaluate_model(cfg):
        cfg_dict = cfg_to_dict(cfg)
        _, _, y_pred = fit_models(cfg_dict, dataset_content)
        if len(np.unique(y_pred)) < 2:
            return np.inf
        else:
            return evaluator(dataset_content, y_pred)

    smac = SMAC(
        scenario=scenario,
        rng=np.random.RandomState(42),
        tae=evaluate_model,
        n_optimizers=n_optimizers,
        validate=False,
        n_incs=top_algorithms_count
    )
    # SMAC resets the output dir to None in its constructor, so we inject it manually
    smac.output_dir = base_dir_name
    scenario.output_dir = base_dir_name
    self._smac = smac

    optimal_configs: List[Configuration] = self._smac.optimize()
    best_algorithms: List[str] = []
    for i, optimal_config in enumerate(optimal_configs):
        dict_config = cfg_to_dict(optimal_config)
        feature_selection_model, clustering_model, clustering_result = fit_models(dict_config, dataset_content)
        if len(np.unique(clustering_result)) < 2:
            measure_value = np.inf
        else:
            measure_value = evaluator(dataset_content, clustering_result)
        result = {
            "optimal_config": dict_config,
            "smac": self._smac,
            "feature_selection_model": feature_selection_model,
            "clustering_model": clustering_model,
            "clustering_result": clustering_result,
            "measure_value": measure_value
        }
        _save_clustering_result(result, f"{base_dir_name}", name=f"config_{i}")
        best_algorithms.append(_encode(dict_config["clustering_choice"], dict_config["feature_selection_choice"]))

    _save_best_algorithms(experiments_dir, best_algorithms, clustering_algs, feature_selection_algs)
    return best_algorithms
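# --- Illustration only: calling fit() above ---
# A hedged sketch of invoking the method. The owning class and the algorithm keys
# accepted by Mapper / build_config_space are not shown here, so `AutoClusterer`,
# "kmeans" and "variance_threshold" below are placeholders rather than the project's
# actual identifiers.
if __name__ == "__main__":
    dataset = BinaryDataset("path/to/data.csv")         # hypothetical constructor arguments
    model = AutoClusterer()                             # placeholder for the class defining fit()
    best = model.fit(
        dataset,
        clustering_algs=["kmeans"],                     # placeholder algorithm key
        feature_selection_algs=["variance_threshold"],  # placeholder algorithm key
        n_evaluations=30,
        n_optimizers=2,
        top_algorithms_count=3,
    )
    print(best)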
def main_cli(self, commandline_arguments: typing.Optional[typing.List[str]] = None) -> None:
    """Main function of SMAC for CLI interface"""
    self.logger.info("SMAC call: %s" % (" ".join(sys.argv)))

    cmd_reader = CMDReader()
    kwargs = {}
    if commandline_arguments:
        kwargs['commandline_arguments'] = commandline_arguments
    main_args_, smac_args_, scen_args_ = cmd_reader.read_cmd(**kwargs)

    root_logger = logging.getLogger()
    root_logger.setLevel(main_args_.verbose_level)
    logger_handler = logging.StreamHandler(stream=sys.stdout)
    if root_logger.level >= logging.INFO:
        formatter = logging.Formatter("%(levelname)s:\t%(message)s")
    else:
        formatter = logging.Formatter(
            "%(asctime)s:%(levelname)s:%(name)s:\t%(message)s",
            "%Y-%m-%d %H:%M:%S")
    logger_handler.setFormatter(formatter)
    root_logger.addHandler(logger_handler)
    # remove default handler
    if len(root_logger.handlers) > 1:
        root_logger.removeHandler(root_logger.handlers[0])

    # Create defaults
    rh = None
    initial_configs = None
    stats = None
    incumbent = None

    # Create scenario-object
    scenario = {}
    scenario.update(vars(smac_args_))
    scenario.update(vars(scen_args_))
    scen = Scenario(scenario=scenario)

    # Restore state
    if main_args_.restore_state:
        root_logger.debug("Restoring state from %s...", main_args_.restore_state)
        restore_state = main_args_.restore_state
        rh, stats, traj_list_aclib, traj_list_old = self.restore_state(scen, restore_state)

        scen.output_dir_for_this_run = create_output_directory(
            scen,
            main_args_.seed,
            root_logger,
        )
        scen.write()
        incumbent = self.restore_state_after_output_dir(scen, stats, traj_list_aclib, traj_list_old)

    if main_args_.warmstart_runhistory:
        rh = RunHistory()
        scen, rh = merge_foreign_data_from_file(
            scenario=scen,
            runhistory=rh,
            in_scenario_fn_list=main_args_.warmstart_scenario,
            in_runhistory_fn_list=main_args_.warmstart_runhistory,
            cs=scen.cs,  # type: ignore[attr-defined] # noqa F821
        )

    if main_args_.warmstart_incumbent:
        initial_configs = [scen.cs.get_default_configuration()]  # type: ignore[attr-defined] # noqa F821
        for traj_fn in main_args_.warmstart_incumbent:
            trajectory = TrajLogger.read_traj_aclib_format(
                fn=traj_fn,
                cs=scen.cs,  # type: ignore[attr-defined] # noqa F821
            )
            initial_configs.append(trajectory[-1]["incumbent"])

    if main_args_.mode == "SMAC4AC":
        optimizer = SMAC4AC(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed)
    elif main_args_.mode == "SMAC4HPO":
        optimizer = SMAC4HPO(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed)
    elif main_args_.mode == "SMAC4BB":
        optimizer = SMAC4BB(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed)
    elif main_args_.mode == "ROAR":
        optimizer = ROAR(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            run_id=main_args_.seed)
    elif main_args_.mode == "Hydra":
        optimizer = Hydra(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed,
            random_configuration_chooser=main_args_.random_configuration_chooser,
            n_iterations=main_args_.hydra_iterations,
            val_set=main_args_.hydra_validation,
            incs_per_round=main_args_.hydra_incumbents_per_round,
            n_optimizers=main_args_.hydra_n_optimizers)
    elif main_args_.mode == "PSMAC":
        optimizer = PSMAC(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            run_id=main_args_.seed,
            shared_model=smac_args_.shared_model,
            validate=main_args_.psmac_validate,
            n_optimizers=main_args_.hydra_n_optimizers,
            n_incs=main_args_.hydra_incumbents_per_round,
        )
    try:
        optimizer.optimize()
    except (TAEAbortException, FirstRunCrashedException) as err:
        self.logger.error(err)
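# --- Illustration only: invoking main_cli() programmatically ---
# main_cli() accepts an optional list of command-line arguments instead of reading
# sys.argv. The enclosing class is assumed here to be SMACCLI, and the flag names below
# are inferred from the attributes read off `main_args_` above (argparse-style); neither
# is verified against CMDReader, so treat this as a sketch.
if __name__ == "__main__":
    SMACCLI().main_cli([
        "--scenario", "path/to/scenario.txt",   # assumed flag name, hypothetical path
        "--mode", "Hydra",
        "--seed", "42",
        "--hydra_iterations", "3",              # assumed flag name
    ])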