def __init__(self,
             original_rh: RunHistory,
             validated_rh: RunHistory,
             validator: Validator,
             scenario: Scenario,
             default: Configuration,
             incumbent: Configuration,
             param_imp: Union[None, Dict[str, float]],
             params: Union[int, List[str]],
             n_configs: int,
             pc_sort_by: str,
             output_dir: str,
             cs: ConfigurationSpace,
             runtime: bool = False,
             max_runs_epm: int = 3000000,
             ):
    """This function prepares the data from a SMAC-related format (using runhistories and
    parameters) to a more general format (using a dataframe). The resulting dataframe is
    passed to the parallel_coordinates-routine.

    Parameters
    ----------
    original_rh: RunHistory
        runhistory that should contain only runs that were executed during search
    validated_rh: RunHistory
        runhistory that may contain as many runs as possible, also external runs.
        this runhistory will be used to build the EPM
    validator: Validator
        validator to be used to estimate costs for configurations
    scenario: Scenario
        scenario object to take instances from
    default, incumbent: Configuration
        default and incumbent, they will surely be displayed
    param_imp: Union[None, Dict[str, float]]
        if given, maps parameter-names to their importance
    params: Union[int, List[str]]
        either directly the parameters to be displayed or the number of parameters
        (in which case the most important ones are chosen)
    n_configs: int
        number of configs to be plotted
    pc_sort_by: str
        defines the pimp-method by which to choose the plotted parameters
    output_dir: str
        output directory for plots
    cs: ConfigurationSpace
        parameter configuration space to be visualized
    runtime: boolean
        if True, the runtime will be plotted on a logscale
    max_runs_epm: int
        maximum number of runs to train the EPM with; this should prevent MemoryErrors
    """
    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
    self.error = None
    self.default = default
    self.param_imp = param_imp
    self.cs = cs

    # Sort by importance, if possible (choose the first executed parameter-importance method)
    self.method, self.importance = "", {}
    if pc_sort_by == 'all':
        self.logger.debug("Sorting by average importance")
        self.method = 'average'
        for m, i in self.param_imp.items():
            if i:
                for p, imp in i.items():
                    if p in self.importance:
                        self.importance[p].append(imp)
                    else:
                        self.importance[p] = [imp]
        self.importance = {k: sum(v) / len(v) for k, v in self.importance.items()}
    elif pc_sort_by in self.param_imp:
        self.method, self.importance = pc_sort_by, self.param_imp[pc_sort_by]
    else:
        self.logger.debug("%s not evaluated.. choosing at random from: %s",
                          pc_sort_by, str(list(self.param_imp.keys())))
        for m, i in self.param_imp.items():
            if i:
                self.method, self.importance = m, i
                break

    self.hp_names = sorted([hp for hp in self.cs.get_hyperparameter_names()],
                           key=lambda x: self.importance.get(x, 0),
                           reverse=True)
    self.logger.debug("Sorted hp's by method '%s': %s", self.method, str(self.hp_names))

    # To be set
    self.plots = []

    # Define the set of configurations (limiting to max and choosing the most interesting ones).
    # max_runs_epm caps the total number of runs considered for the EPM and thereby the
    # maximum possible number of configs.
    all_configs = original_rh.get_all_configs()
    max_configs = int(max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs (for parallel coordinates)",
                          len(all_configs), max_configs, max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(c)),
                             reverse=True)[:max_configs]
    if default not in all_configs:
        all_configs = [default] + all_configs
    if incumbent not in all_configs:
        all_configs.append(incumbent)

    # Get costs for those configurations
    epm_rh = RunHistory(average_cost)
    epm_rh.update(validated_rh)
    if scenario.feature_dict:  # if instances are available
        epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                     runhistory=validated_rh))
    self.config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

    self.params = self.get_params(params)
    self.n_configs = n_configs

    self.pcp = ParallelCoordinatesPlotter(self.config_to_cost, output_dir, cs, runtime)
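# Illustrative sketch (not part of the original module): the 'all' branch above averages the
# parameter importances across all evaluated pimp-methods, skipping methods that were not
# evaluated. The method names and importance values below are hypothetical.
def _demo_average_importance():
    param_imp = {
        'fanova': {'alpha': 0.6, 'beta': 0.1},
        'ablation': {'alpha': 0.4, 'beta': 0.3},
        'lpi': None,  # method was not evaluated -> skipped
    }
    importance = {}
    for _, imp in param_imp.items():
        if imp:
            for p, v in imp.items():
                importance.setdefault(p, []).append(v)
    # average per parameter -> {'alpha': 0.5, 'beta': 0.2}
    return {k: sum(v) / len(v) for k, v in importance.items()}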
def _preprocess_budget(self,
                       original_rh: RunHistory,
                       validated_rh: RunHistory,
                       validator: Validator,
                       scenario: Scenario,
                       default: Configuration,
                       incumbent: Configuration,
                       param_imp: Union[None, Dict[str, float]],
                       output_dir: str,
                       cs: ConfigurationSpace,
                       runtime: bool = False,
                       ):
    """Preprocess the data and return it as a dataframe to enable fast replots.

    Parameters
    ----------
    original_rh: RunHistory
        runhistory that should contain only runs that were executed during search
    validated_rh: RunHistory
        runhistory that may contain as many runs as possible, also external runs.
        this runhistory will be used to build the EPM
    validator: Validator
        validator to be used to estimate costs for configurations
    scenario: Scenario
        scenario object to take instances from
    default, incumbent: Configuration
        default and incumbent, they will surely be displayed
    param_imp: Union[None, Dict[str, float]]
        if given, maps parameter-names to their importance
    output_dir: str
        output directory for plots
    cs: ConfigurationSpace
        parameter configuration space to be visualized
    runtime: boolean
        if True, the runtime will be plotted on a logscale
    """
    # Sort parameters by importance, if possible (choose the first executed parameter-importance method)
    method, importance = "", {}
    if self.pc_sort_by == 'all':
        self.logger.debug("Sorting by average importance")
        method = 'average'
        for m, i in param_imp.items():
            if i:
                for p, imp in i.items():
                    if p in importance:
                        importance[p].append(imp)
                    else:
                        importance[p] = [imp]
        importance = {k: sum(v) / len(v) for k, v in importance.items()}
    elif self.pc_sort_by in param_imp:
        method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
    else:
        self.logger.debug("%s not evaluated.. choosing at random from: %s",
                          self.pc_sort_by, str(list(param_imp.keys())))
        for m, i in param_imp.items():
            if i:
                method, importance = m, i
                self.logger.debug("Chose %s", method)
                break

    hp_names = sorted([p for p in cs.get_hyperparameter_names()],
                      key=lambda x: importance.get(x, 0),
                      reverse=True)
    self.logger.debug("Sorted hyperparameters by method '%s': %s", method, str(hp_names))

    # Define the set of configurations (limiting to max and choosing the most interesting ones).
    # self.max_runs_epm caps the total number of runs considered for the EPM and thereby the
    # maximum possible number of configs.
    all_configs = original_rh.get_all_configs()
    max_configs = int(self.max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs (for parallel coordinates)",
                          len(all_configs), max_configs, self.max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(c, only_max_observed_budget=False)),
                             reverse=True)
        all_configs = all_configs[:max_configs]
    if default not in all_configs:
        all_configs = [default] + all_configs
    if incumbent not in all_configs:
        all_configs.append(incumbent)

    # Get costs for those configurations
    epm_rh = RunHistory()
    epm_rh.update(validated_rh)
    if scenario.feature_dict:  # if instances are available
        epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                     runhistory=validated_rh))
    config_to_cost = OrderedDict({c: epm_rh.get_cost(c) for c in all_configs})

    data = OrderedDict()
    data['cost'] = list(config_to_cost.values())
    for hp in self.runscontainer.scenario.cs.get_hyperparameter_names():
        data[hp] = np.array([c[hp]
                             # if hp in c.get_dictionary() and not isinstance(c[hp], str) else np.nan
                             for c in config_to_cost.keys()])
    df = pd.DataFrame(data=data)
    return df
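# Illustrative sketch (not part of the original module): the shape of the dataframe that
# _preprocess_budget builds - one row per configuration, a 'cost' column followed by one
# column per hyperparameter. Plain dicts and hypothetical hyperparameter names stand in
# for Configuration objects and EPM-estimated costs.
def _demo_preprocess_dataframe():
    import numpy as np
    import pandas as pd
    from collections import OrderedDict

    configs = [{'lr': 0.1, 'depth': 3}, {'lr': 0.01, 'depth': 5}]
    costs = [0.25, 0.18]  # would come from epm_rh.get_cost(c)

    data = OrderedDict()
    data['cost'] = costs
    for hp in ['lr', 'depth']:
        data[hp] = np.array([c[hp] for c in configs])
    #    cost    lr  depth
    # 0  0.25  0.10      3
    # 1  0.18  0.01      5
    return pd.DataFrame(data=data)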
def _plot_parallel_coordinates(self,
                               original_rh: RunHistory,
                               validated_rh: RunHistory,
                               validator: Validator,
                               scenario: Scenario,
                               default: Configuration,
                               incumbent: Configuration,
                               param_imp: Union[None, Dict[str, float]],
                               output_dir: str,
                               cs: ConfigurationSpace,
                               runtime: bool = False,
                               ):
    """Create the parallel-coordinates plot for the given runhistories.

    Parameters
    ----------
    original_rh: RunHistory
        runhistory that should contain only runs that were executed during search
    validated_rh: RunHistory
        runhistory that may contain as many runs as possible, also external runs.
        this runhistory will be used to build the EPM
    validator: Validator
        validator to be used to estimate costs for configurations
    scenario: Scenario
        scenario object to take instances from
    default, incumbent: Configuration
        default and incumbent, they will surely be displayed
    param_imp: Union[None, Dict[str, float]]
        if given, maps parameter-names to their importance
    output_dir: str
        output directory for plots
    cs: ConfigurationSpace
        parameter configuration space to be visualized
    runtime: boolean
        if True, the runtime will be plotted on a logscale
    """
    # Sort parameters by importance, if possible (choose the first executed parameter-importance method)
    method, importance = "", {}
    if self.pc_sort_by == 'all':
        self.logger.debug("Sorting by average importance")
        method = 'average'
        for m, i in param_imp.items():
            if i:
                for p, imp in i.items():
                    if p in importance:
                        importance[p].append(imp)
                    else:
                        importance[p] = [imp]
        importance = {k: sum(v) / len(v) for k, v in importance.items()}
    elif self.pc_sort_by in param_imp:
        method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
    else:
        self.logger.debug("%s not evaluated.. choosing at random from: %s",
                          self.pc_sort_by, str(list(param_imp.keys())))
        for m, i in param_imp.items():
            if i:
                method, importance = m, i
                self.logger.debug("Chose %s", method)
                break

    hp_names = sorted([hp for hp in cs.get_hyperparameter_names()],
                      key=lambda x: importance.get(x, 0),
                      reverse=True)
    self.logger.debug("Sorted hp's by method '%s': %s", method, str(hp_names))

    # To be set
    self.plots = []

    # Define the set of configurations (limiting to max and choosing the most interesting ones).
    # max_runs_epm caps the total number of runs considered for the EPM and thereby the
    # maximum possible number of configs.
    all_configs = original_rh.get_all_configs()
    max_runs_epm = self.max_runs_epm
    max_configs = int(max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs (for parallel coordinates)",
                          len(all_configs), max_configs, max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(c)),
                             reverse=True)[:max_configs]
    if default not in all_configs:
        all_configs = [default] + all_configs
    if incumbent not in all_configs:
        all_configs.append(incumbent)

    # Get costs for those configurations
    epm_rh = RunHistory(average_cost)
    epm_rh.update(validated_rh)
    if scenario.feature_dict:  # if instances are available
        epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                     runhistory=validated_rh))
    config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

    pcp = ParallelCoordinatesPlotter(config_to_cost, output_dir, cs, runtime)

    try:
        plots = [pcp.plot_n_configs(self.n_configs,
                                    self.get_params(self.params, importance, hp_names))]
        self.logger.debug("Paths to plot(s): %s", str(plots))
        return {'figure': plots}
    except ValueError as err:
        self.logger.debug("Error: %s", str(err))
        return {'else': str(err)}
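# Illustrative sketch (not part of the original module): how max_runs_epm bounds the number
# of configurations passed to the EPM in the methods above. Every configuration is predicted
# on all train and test instances, so the cap on configs is max_runs_epm divided by the total
# number of instances. The instance counts below are made up.
def _demo_config_budget(max_runs_epm=300000, n_train_insts=500, n_test_insts=250):
    max_configs = int(max_runs_epm / (n_train_insts + n_test_insts))
    return max_configs  # -> 400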
def plot_parallel_coordinates(self, original_rh, validated_rh, validator,
                              n_param=10, n_configs=500, max_runs_epm=300000):
    """Plot parallel coordinates (visualize higher dimensions), here used to visualize the pcs.

    This function prepares the data from a SMAC-related format (using runhistories and
    parameters) to a more general format (using a dataframe). The resulting dataframe is
    passed to the parallel_coordinates-routine.

    NOTE: the given runhistory should contain only optimization and no validation runs,
    so that the explored parameter-space can be analyzed.

    Parameters
    ----------
    original_rh: RunHistory
        rundata to take configs from (no validation data - we want to visualize the
        optimization process)
    validated_rh: RunHistory
        rundata to estimate costs of configs from (can contain validation data, but no
        empirical estimations, since it is used to train an EPM)
    validator: Validator
        validator used to calculate alpha values
    n_param: int
        number of parameters to be plotted
    n_configs: int
        maximum number of configs to be plotted
    max_runs_epm: int
        maximum number of total runs to be predicted with the EPM; higher values tend to
        give better predictions, but are likely to lead to MemoryErrors

    Returns
    -------
    output: str
        path to plot
    """
    self.logger.info("... plotting parallel coordinates")

    # If a parameter importance analysis has been performed in this analyzer-object,
    # only plot the n_param most important parameters.
    if self.param_imp:
        # Use the first applied parameter-importance analysis to choose the parameters
        method, importance = list(self.param_imp.items())[0]
        self.logger.debug("Choosing visualized parameters in parallel coordinates "
                          "according to parameter importance method %s" % method)
        n_param = min(n_param, max(3, len([x for x in importance.values() if x > 0.05])))
        # Some importance methods add "--source--" or similar to the parameter names -> filter them
        params = [p for p in importance.keys()
                  if p in self.scenario.cs.get_hyperparameter_names()][:n_param]
    else:
        self.logger.info("No parameter importance performed. Plotting random parameters "
                         "in parallel coordinates.")
        params = list(self.default.keys())[:n_param]

    self.logger.info("    plotting %s parameters for (max) %s configurations",
                     len(params), n_configs)

    # Reduce to a feasible number of configurations
    all_configs = original_rh.get_all_configs()
    max_configs = int(max_runs_epm / (len(self.scenario.train_insts) + len(self.scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs",
                          len(all_configs), max_configs, max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(c)),
                             reverse=True)[:max_configs]
    if self.default not in all_configs:
        all_configs = [self.default] + all_configs
    if self.incumbent not in all_configs:
        all_configs.append(self.incumbent)

    if self.scenario.feature_dict:
        epm_rh = timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                runhistory=validated_rh)
        epm_rh.update(validated_rh)
    else:
        epm_rh = validated_rh

    pcp = ParallelCoordinatesPlotter(original_rh, epm_rh, self.output_dir, self.scenario.cs,
                                     runtime=(self.scenario.run_obj == 'runtime'))
    output = pcp.plot_n_configs(n_configs, params)
    return output
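# Illustrative sketch (not part of the original module): the heuristic in
# plot_parallel_coordinates caps the number of plotted parameters at the number of
# "important" ones (importance > 0.05), but never below 3 and never above n_param.
# The importance values in the default example are hypothetical.
def _demo_n_param(importance=None, n_param=10):
    if importance is None:
        importance = {'alpha': 0.4, 'beta': 0.2, 'gamma': 0.01}
    return min(n_param, max(3, len([x for x in importance.values() if x > 0.05])))
    # -> 3 for the default example (only 'alpha' and 'beta' exceed 0.05)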