def plot_result(self, name='fANOVA', show=True):
    """Save the single marginal plots and, if enabled, the pairwise marginal plots.

    Every evaluated parameter is plotted twice: once with a log-scaled axis
    (file suffix ``_log``) and once linear. Files are written as PNG into
    the directory ``name``.

    Parameters
    ----------
    name: str
        directory the plots are stored in; created if it does not exist
    show: bool
        whether to call ``plt.show()`` after the plots of a parameter were saved
    """
    if not os.path.exists(name):
        os.mkdir(name)

    vis = Visualizer(self.evaluator, self.cs, directory=name,
                     y_label=self._get_label(self.scenario.run_obj))
    self.logger.info('Getting Marginals!')
    # Build the parameter list once instead of once per iteration.
    params = list(self.evaluated_parameter_importance.keys())
    pbar = tqdm(range(self.to_evaluate), ascii=True, disable=not self.verbose)
    for i in pbar:
        plt.close('all')
        plt.clf()
        param = params[i]
        param_idx = self.cs.get_idx_by_hyperparameter_name(param)
        # Plot once in log, once linear
        for log_scale, suffix in [(True, '_log'), (False, '')]:
            outfile_name = os.path.join(
                name, param.replace(os.sep, "_") + suffix + ".png")
            # The try/except clause is only for back-compatibility with fanova <= 2.0.11
            try:
                vis.plot_marginal(param_idx, log_scale=log_scale, show=False,
                                  incumbents=self.incumbents)
            except TypeError:
                self.logger.debug(
                    "Plotting incumbents not supported by fanova < 2.0.12")
                vis.plot_marginal(param_idx, log_scale=log_scale, show=False)
            fig = plt.gcf()
            fig.savefig(outfile_name)
            plt.close('all')
            plt.clf()
        # NOTE(review): all figures were closed above, so this presumably
        # displays nothing -- confirm whether showing is still wanted here.
        if show:
            plt.show()
        pbar.set_description('Creating fANOVA plot: {: <.30s}'.format(
            outfile_name.split(os.path.sep)[-1]))

    if self.pairwise:
        self.logger.info('Plotting Pairwise-Marginals!')
        most_important_ones = params[:min(self.num_single, self.n_most_imp_pairs)]
        # Only call the plotting routine inside the guard: a second, unguarded
        # call would bypass the TypeError handling and do the work twice.
        try:
            vis.create_most_important_pairwise_marginal_plots(
                most_important_ones)
        except TypeError as err:
            self.logger.debug(err, exc_info=1)
            self.logger.warning('Could not create pairwise plots!')
        plt.close('all')
def __init__(self, data: pd.DataFrame, hp_names, objective, hp_space=None):
    """Fit a fANOVA model on ``data`` and prepare the importance buffers.

    Parameters
    ----------
    data: DataFrame
        observations; must contain the ``hp_names`` columns and ``objective``
    hp_names: List[str]
        columns of ``data`` used as model inputs
    objective: str
        column of ``data`` used as model target
    hp_space:
        definition passed to ``Space.from_dict`` to build the space handed
        to the visualizer

    Raises
    ------
    Exception
        the stored ``import_error`` if the module-level import failed
    """
    if import_error is not None:
        raise import_error

    features = data[hp_names]
    targets = data[objective]

    self.space = hp_space
    self.hp_names = hp_names
    self.fanova = fANOVA(features.values, targets.values)

    n_params = len(hp_names)
    self.size = n_params
    # Square matrices indexed by (hyperparameter, hyperparameter).
    self._importance = np.zeros((n_params, n_params))
    self._importance_std = np.zeros_like(self._importance)

    instantiated_space = Space.from_dict(hp_space).instantiate()
    self.vis = Visualizer(self.fanova, instantiated_space, '/tmp', 'objective')
    self.computed = False
def _plot_result(fANOVA, configspace, directory, yrange=None):
    """Write one marginal plot (PDF) per hyperparameter into ``directory``.

    Parameters
    ----------
    fANOVA:
        evaluator handed to the ``Visualizer``
    configspace:
        configuration space providing the hyperparameters and their indices
    directory: str
        output directory, created if missing
    yrange: tuple or None
        optional ``(y_min, y_max)`` applied to the y-axis of every plot
    """
    os.makedirs(directory, exist_ok=True)
    visualizer = Visualizer(fANOVA, configspace, directory,
                            y_label='Predictive Accuracy')

    for hyperparameter in configspace.get_hyperparameters():
        plt.close('all')
        plt.clf()

        hp_name = hyperparameter.name
        file_name = hp_name.replace(os.sep, "_") + ".pdf"
        outfile_name = os.path.join(directory, file_name)

        hp_index = configspace.get_idx_by_hyperparameter_name(hp_name)
        visualizer.plot_marginal(hp_index, show=False)

        # Keep the x-limits chosen by the plot, only override the y-limits.
        x_low, x_high, _, _ = plt.axis()
        if yrange:
            y_low, y_high = yrange[0], yrange[1]
            plt.axis((x_low, x_high, y_low, y_high))
        plt.savefig(outfile_name)
def plot_result(self, name='fANOVA', show=True):
    """Save linear and log-scaled marginal plots, then the pairwise plots.

    Parameters
    ----------
    name: str
        directory the PNG files are written to; created if it does not exist
    show: bool
        whether to call ``plt.show()`` after both plots of a parameter were saved
    """
    if not os.path.exists(name):
        os.mkdir(name)

    run_obj = self.scenario.run_obj
    if run_obj == 'runtime':
        label = 'runtime [sec]'
    else:
        label = 'cost' if run_obj == 'quality' else '%s' % run_obj

    vis = Visualizer(self.evaluator, self.cs, directory=name, y_label=label)
    self.logger.info('Getting Marginals!')
    progress = tqdm(range(self.to_evaluate), ascii=True,
                    disable=not self.verbose)
    for position in progress:
        plt.close('all')
        plt.clf()
        param = list(self.evaluated_parameter_importance.keys())[position]
        file_stem = param.replace(os.sep, "_")

        # Linear first, log second: the progress text below reports the
        # last file written, i.e. the log-scaled one.
        for use_log, extension in ((False, ".png"), (True, "_log.png")):
            outfile_name = os.path.join(name, file_stem + extension)
            vis.plot_marginal(self.cs.get_idx_by_hyperparameter_name(param),
                              log_scale=use_log, show=False)
            plt.gcf().savefig(outfile_name)
            plt.close('all')
            plt.clf()

        if show:
            plt.show()
        short_name = outfile_name.split(os.path.sep)[-1]
        progress.set_description(
            'Creating fANOVA plot: {: <.30s}'.format(short_name))

    if not self.pairwise:
        return

    self.logger.info('Plotting Pairwise-Marginals!')
    n_pairs = min(self.num_single, self.n_most_imp_pairs)
    most_important_ones = list(self.evaluated_parameter_importance.keys())[:n_pairs]
    try:
        vis.create_most_important_pairwise_marginal_plots(most_important_ones)
    except TypeError:
        self.logger.warning('Could not create pairwise plots!')
    plt.close('all')
def _plot_result(fANOVA, configspace, directory, yrange=None):
    """Write one marginal plot (PDF) per hyperparameter into ``directory``.

    Categorical hyperparameters use ``plot_categorical_marginal``; all others
    use ``plot_marginal`` with a resolution of 100.

    Parameters
    ----------
    fANOVA:
        evaluator handed to the ``Visualizer``
    configspace:
        configuration space providing the hyperparameters and their indices
    directory: str
        output directory; an already existing path is tolerated
    yrange: tuple or None
        optional ``(y_min, y_max)`` applied to the y-axis of every plot
    """
    visualizer = Visualizer(fANOVA, configspace)
    try:
        os.makedirs(directory)
    except FileExistsError:
        # Directory is already there, nothing to create.
        pass

    for hyperparameter in configspace.get_hyperparameters():
        plt.close('all')
        plt.clf()

        hp_name = hyperparameter.name
        outfile_name = os.path.join(directory,
                                    hp_name.replace(os.sep, "_") + ".pdf")
        hp_index = configspace.get_idx_by_hyperparameter_name(hp_name)

        if isinstance(hyperparameter, CategoricalHyperparameter):
            visualizer.plot_categorical_marginal(
                hp_index, show=False, ylabel='Predictive Accuracy')
        else:
            visualizer.plot_marginal(
                hp_index, resolution=100, show=False,
                ylabel='Predictive Accuracy')

        # Keep the x-limits chosen by the plot, only override the y-limits.
        x_low, x_high, _, _ = plt.axis()
        if yrange:
            plt.axis((x_low, x_high, yrange[0], yrange[1]))
        plt.savefig(outfile_name)
def plot_result(self, name='fANOVA'):
    """Save the single marginal plots and trigger the pairwise marginal plots.

    Parameters
    ----------
    name: str
        directory the PNG files are written to; created if it does not exist
    """
    vis = Visualizer(self.evaluator, self.cs)
    if not os.path.exists(name):
        os.mkdir(name)

    self.logger.info('Getting Marginals!')
    for position in range(self.to_evaluate):
        plt.close('all')
        plt.clf()
        param = list(self.evaluated_parameter_importance.keys())[position]
        outfile_name = os.path.join(name, param.replace(os.sep, "_") + ".png")

        # Categorical parameters need the dedicated plotting routine.
        is_categorical = isinstance(self.cs.get_hyperparameter(param),
                                    CategoricalHyperparameter)
        plot_fn = vis.plot_categorical_marginal if is_categorical else vis.plot_marginal
        plot_fn(self.cs.get_idx_by_hyperparameter_name(param), show=False)

        plt.savefig(outfile_name)
        self.logger.info('Creating fANOVA plot: %s' % outfile_name)

    for message in ('Getting Pairwise-Marginals!', 'This will take some time!'):
        self.logger.info(message)
    vis.create_most_important_pairwise_marginal_plots(name, self.to_evaluate)
def plot_result(self, name='fANOVA', show=True):
    """Save the single marginal plots, then the most important pairwise plots.

    Parameters
    ----------
    name: str
        directory the PNG files are written to; created if it does not exist
    show: bool
        whether to call ``plt.show()`` after each single marginal was saved
    """
    if not os.path.exists(name):
        os.mkdir(name)

    vis = Visualizer(self.evaluator, self.cs, directory=name)
    self.logger.info('Getting Marginals!')
    for position in range(self.to_evaluate):
        plt.close('all')
        plt.clf()
        param = list(self.evaluated_parameter_importance.keys())[position]
        file_name = param.replace(os.sep, "_") + ".png"
        outfile_name = os.path.join(name, file_name)

        param_index = self.cs.get_idx_by_hyperparameter_name(param)
        vis.plot_marginal(param_index, show=False)
        plt.gcf().savefig(outfile_name)
        if show:
            plt.show()
        self.logger.info('Creating fANOVA plot: %s' % outfile_name)

    self.logger.info('Plotting Pairwise-Marginals!')
    # Restrict the pairwise plots to the leading entries of the importance dict.
    n_pairs = min(self.num_single, self.n_most_imp_pairs)
    most_important_ones = list(self.evaluated_parameter_importance.keys())[:n_pairs]
    vis.create_most_important_pairwise_marginal_plots(most_important_ones)
    plt.close('all')
def run(args):
    """Run fANOVA per task of an arff dataset and store the importances as csv.

    For every task and every combination of hyperparameters up to size
    ``args.comb_size`` the variance-based importance and the max-min
    difference of the average marginal are collected.

    Parameters
    ----------
    args:
        namespace providing ``dataset_path``, ``classifier``, ``measure``,
        ``n_trees``, ``output_directory``, ``comb_size`` and ``resolution``

    Raises
    ------
    ValueError
        if the measure is not a column of the dataset, if the config space
        and the dataset hyperparameters differ, or if a combination size
        larger than 2 is requested
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    with open(args.dataset_path, 'r') as fp:
        arff_dataset = arff.load(fp)
    config_space = sklearnbot.config_spaces.get_config_space(args.classifier, None)
    data = openmlcontrib.meta.arff_to_dataframe(arff_dataset, config_space)
    data = openmlcontrib.meta.integer_encode_dataframe(data, config_space)
    meta_data = get_dataset_metadata(args.dataset_path)
    if args.measure not in data.columns.values:
        raise ValueError('Could not find measure in dataset: %s' % args.measure)

    hp_names = config_space.get_hyperparameter_names()
    cs_params = set(hp_names)
    ds_params = set(meta_data['col_parameters'])
    if cs_params != ds_params:
        missing_cs = ds_params - cs_params
        missing_ds = cs_params - ds_params
        raise ValueError('ConfigSpace and hyperparameters of dataset do not '
                         'align. ConfigSpace misses: %s, dataset misses: %s' % (missing_cs, missing_ds))

    # Create the output directory up front: it is loop-invariant, and the csv
    # below must be writable even when the dataset contains no task.
    os.makedirs(args.output_directory, exist_ok=True)

    task_ids = data['task_id'].unique()
    hp_name_array = np.array(hp_names)
    indices = list(range(len(config_space.get_hyperparameters())))

    result = list()
    for task_number, task_id in enumerate(task_ids, start=1):
        logging.info('Running fanova on task %d (%d/%d)', task_id, task_number, len(task_ids))
        data_task = data[data['task_id'] == task_id]

        evaluator = fanova.fanova.fANOVA(X=data_task[hp_names].values,
                                         Y=data_task[args.measure].values,
                                         config_space=config_space,
                                         n_trees=args.n_trees)
        vis = Visualizer(evaluator, config_space, args.output_directory, y_label='Predictive Accuracy')

        for comb_size in range(1, args.comb_size + 1):
            # `dims` is a tuple of hyperparameter indices (distinct name so it
            # does not shadow the task counter of the outer loop).
            for dims in itertools.combinations(indices, comb_size):
                param_names = hp_name_array[np.array(dims)]
                logging.info('-- Calculating marginal for %s', param_names)
                importance = evaluator.quantify_importance(dims)[dims]
                if comb_size == 1:
                    visualizer_res = vis.generate_marginal(dims[0], args.resolution)
                    # visualizer returns mean, std and potentially grid
                    avg_marginal = np.array(visualizer_res[0])
                elif comb_size == 2:
                    visualizer_res = vis.generate_pairwise_marginal(dims, args.resolution)
                    # visualizer returns grid names and values
                    avg_marginal = np.array(visualizer_res[1])
                else:
                    raise ValueError('No support yet for higher dimensions than 2. Got: %d' % comb_size)
                difference_max_min = avg_marginal.max() - avg_marginal.min()

                result.append({
                    'task_id': task_id,
                    'hyperparameter': ' / '.join(param_names),
                    'n_hyperparameters': len(param_names),
                    'importance_variance': importance['individual importance'],
                    'importance_max_min': difference_max_min,
                })

    df_result = pd.DataFrame(result)
    result_path = os.path.join(args.output_directory,
                               'fanova_%s_depth_%d.csv' % (args.classifier, args.comb_size))
    df_result.to_csv(result_path)
    logging.info('resulting csv: %s', result_path)
    logging.info('To plot, run <openml_pimp_root>/examples/plot/plot_fanova.py')
def plot_bokeh(self, plot_name=None, show_plot=False, plot_pairwise="most_important"):
    """
    Plot single and pairwise marginals in a bokeh plot. Single marginals are always plotted (not expensive),
    pairwise marginals can be configured by argument.

    Parameters
    ----------
    plot_name: str
        path where to store the plot, None to not save it
    show_plot: bool
        whether or not to open plot in standard browser
    plot_pairwise: str
        choose from ["none", "most_important", "all"] where "most_important" relies on the fanova module to decide
        what that means

    Returns
    -------
    layout: bokeh.models.Column
        bokeh plot (can be used in a notebook or converted with components)

    Raises
    ------
    ValueError
        if plot_pairwise is not one of the supported options
    """
    vis = Visualizer(self.evaluator, self.cs, directory='.', y_label=self._get_label(self.scenario.run_obj))

    ####################
    # Single marginals #
    ####################
    plots_single = []
    params = list(self.evaluated_parameter_importance.keys())
    pbar = tqdm(deepcopy(params), ascii=True, disable=not self.verbose)
    for param_name in pbar:
        # evaluated_parameter_importance may also hold pairwise entries; those are not hyperparameter names,
        # so the lookup raises a KeyError and the entry is skipped.
        try:
            param = self.cs.get_hyperparameter(param_name)
        except KeyError as err:
            self.logger.debug(err, exc_info=1)
            continue

        pbar.set_description('Plotting fANOVA (in bokeh) for %s' % param_name)

        # Normalise the incumbents to a list and collect their (non-None) values for this parameter
        incumbents = []
        if not self.incumbents is None:
            incumbents = self.incumbents.copy() if isinstance(self.incumbents, list) else [self.incumbents]
        values = [c[param_name] for c in incumbents if param_name in c and c[param_name] is not None]

        if isinstance(param, (CategoricalHyperparameter, Constant)):
            # NOTE(review): for a Constant, `labels` is a plain string, so `labels.index(val)` below is a
            # substring search rather than a list lookup -- confirm this is intended.
            labels = param.choices if isinstance(param, CategoricalHyperparameter) else str(param)
            mean, std = vis.generate_marginal(param_name)
            inc_indices = [labels.index(val) for val in values]

            p = bokeh_boxplot(labels, mean, std,
                              x_label=param.name,
                              y_label="runtime [sec]" if self.scenario.run_obj == "runtime" else "cost",
                              runtime=self.scenario.run_obj=="runtime",
                              inc_indices=inc_indices)

        else:
            mean, std, grid = vis.generate_marginal(param_name, 100)
            mean, std = np.asarray(mean), np.asarray(std)
            # Treat the axis as log-scaled if the parameter says so or if the grid spacing is not constant
            log_scale = param.log or (np.diff(grid).std() > 0.000001)
            # Map every incumbent value to the index of the closest grid point
            inc_indices = [(np.abs(np.asarray(grid) - val)).argmin() for val in values]

            p = bokeh_line_uncertainty(grid, mean, std, log_scale,
                                       x_label=param.name,
                                       y_label="runtime [sec]" if self.scenario.run_obj == "runtime" else "cost",
                                       inc_indices=inc_indices)

        plots_single.append(Panel(child=Row(p), title=param_name))

    ######################
    # Pairwise marginals #
    ######################
    if plot_pairwise == "all":
        combis = list(it.combinations(self.cs.get_hyperparameters(), 2))
    elif plot_pairwise == "most_important":
        most_important_ones = list(self.evaluated_parameter_importance.keys())[
                              :min(self.num_single, self.n_most_imp_pairs)]
        most_important_pairwise_marginals = vis.fanova.get_most_important_pairwise_marginals(
                                            params=most_important_ones)
        combis = [(self.cs.get_hyperparameter(name1), self.cs.get_hyperparameter(name2)) for name1, name2 in
                  most_important_pairwise_marginals]
    elif plot_pairwise == "none":
        combis = []
    else:
        raise ValueError("{} not a valid set of pairwise plots to generate...".format(plot_pairwise))

    plots_pairwise = []
    pbar = tqdm(deepcopy(combis), ascii=True, disable=not self.verbose)
    for p1, p2 in pbar:
        pbar.set_description('Plotting pairwise fANOVA (in bokeh) for %s & %s' % (p1.name, p2.name))
        first_is_cat = isinstance(p1, CategoricalHyperparameter)
        second_is_cat = isinstance(p2, CategoricalHyperparameter)
        # There are essentially three cases / different plots:
        # both categorical, exactly one categorical, both non-categorical.
        if first_is_cat or second_is_cat:
            # At least one categorical: build a DataFrame with p1 on the rows and p2 on the columns
            choices, zz = vis.generate_pairwise_marginal((p1.name, p2.name), 20)
            # Working with pandas makes life easier
            data = pd.DataFrame(zz, index=choices[0], columns=choices[1])
            # Setting names for rows and columns and make categoricals strings
            data.index.name, data.columns.name = p1.name, p2.name
            data.index = data.index.astype(str) if first_is_cat else data.index
            data.columns = data.columns.astype(str) if second_is_cat else data.columns
            if first_is_cat and second_is_cat:
                # First case: both categorical -> heatmap
                p = bokeh_heatmap_cat(data, p1.name, p2.name)
            else:
                # Second case: only one of them is categorical -> create multi-line-plot
                cat_choices = p1.choices if first_is_cat else p2.choices
                # We want categorical values to be represented by columns:
                if not second_is_cat:
                    data = data.transpose()
                # Find y_min and y_max BEFORE resetting index (otherwise the index values would take part in
                # the min/max query)
                y_limits = (data.min().min(), data.max().max())
                # x-axis limits come from the bounds of the non-categorical parameter
                x_limits = (p1.lower if second_is_cat else p2.lower, p1.upper if second_is_cat else p2.upper)
                # We want the index as a column (for plotting on x-axis)
                data = data.reset_index()
                p = bokeh_multiline(data, x_limits, y_limits,
                                    p1.name if second_is_cat else p2.name,
                                    cat_choices,
                                    y_label="runtime [sec]" if self.scenario.run_obj == "runtime" else "cost",
                                    z_label=p1.name if first_is_cat else p2.name,
                                    )
        else:
            # Third case: both continuous -> numerical heatmap
            grid, zz = vis.generate_pairwise_marginal((p1.name, p2.name), 20)
            data = pd.DataFrame(zz, index=grid[0], columns=grid[1])
            data.index.name, data.columns.name = p1.name, p2.name
            p = bokeh_heatmap_num(data, p1.name, p2.name, p1.log, p2.log)

        plots_pairwise.append(Panel(child=Row(p), title=" & ".join([p1.name, p2.name])))

    # Putting both together: the pairwise tab row is only added if there is at least one pairwise plot
    tabs_single = Tabs(tabs=[*plots_single])
    if len(plots_pairwise) > 0:
        tabs_pairwise = Tabs(tabs=[*plots_pairwise])
        layout = Column(tabs_single, tabs_pairwise)
    else:
        layout = Column(tabs_single)

    # Save and show...
    save_and_show(plot_name, show_plot, layout)

    return layout
def execute(save_folder, runhistory_location, configspace_location, manual_logtransform, use_percentiles,
            interaction_effect, n_trees, run_limit=None, draw_plots=True):
    """Compute fANOVA importances from a run history and store them as JSON.

    Parameters
    ----------
    save_folder: str
        directory all results are written to (created if missing)
    runhistory_location: str
        path of a JSON file with the entries ``data`` and ``configs``
    configspace_location: str
        path of the configuration space file, parsed with ``read``
    manual_logtransform: bool
        if set, values of log-scaled parameters are log-transformed here and
        the configuration space is rescaled accordingly
    use_percentiles: bool
        if set, the evaluator cutoffs (and the y-range of the plots) are the
        75th and 100th percentile of the objective values
    interaction_effect: bool
        if set, pairwise and three-way interaction effects are computed and
        stored as well
    n_trees: int
        number of trees passed to the evaluator
    run_limit: int or None
        maximum number of valid runs to use; None uses all runs
    draw_plots: bool
        whether to create the marginal plots

    Returns
    -------
    str
        path of the last result file written

    Raises
    ------
    ValueError
        if the collected data is not 2-dimensional, if an interaction score
        is negative or if the importances sum up to more than 1
    """
    with open(runhistory_location) as runhistory_file:
        runhistory = json.load(runhistory_file)
    with open(configspace_location) as configspace_file:
        configspace = read(configspace_file)
    os.makedirs(save_folder, exist_ok=True)

    hyperparameters = configspace.get_hyperparameters()
    X = []
    y = []

    for item in runhistory['data']:
        # Stop once the limit is reached; `>` would admit run_limit + 1 runs.
        if run_limit is not None and len(X) >= run_limit:
            break

        valid = True
        current = []
        setup_id = str(item[0][0])
        configuration = runhistory['configs'][setup_id]
        for param in hyperparameters:
            value = configuration[param.name]
            # Numeric parameters must come with a value of the matching type.
            if isinstance(param, ConfigSpace.hyperparameters.UniformFloatHyperparameter) \
                    and not isinstance(value, float):
                valid = False
            elif isinstance(param, ConfigSpace.hyperparameters.UniformIntegerHyperparameter) \
                    and not isinstance(value, int):
                valid = False

            if isinstance(param, ConfigSpace.hyperparameters.CategoricalHyperparameter):
                # Categorical values are encoded by their position in the choices.
                value = param.choices.index(value)
            elif param.log and manual_logtransform:
                value = np.log(value)

            current.append(value)
        if valid:
            X.append(current)
            y.append(item[1][0])
        else:
            print('Illegal configuration', current)
    X = np.array(X)
    y = np.array(y)

    if X.ndim != 2:
        raise ValueError('Wrong shape')

    if manual_logtransform:
        configspace = openmlpimp.utils.scale_configspace_to_log(configspace)

    cutoffs = (-np.inf, np.inf)
    if use_percentiles:
        p75 = np.percentile(y, 75.0)
        p100 = np.percentile(y, 100.0)
        cutoffs = (p75, p100)

    # start the evaluator
    evaluator = fanova_pyrfr(X=X, Y=y, config_space=configspace, config_on_hypercube=False,
                             cutoffs=cutoffs, n_trees=n_trees)
    # obtain the results
    params = configspace.get_hyperparameters()
    result = {}

    for idx, param in enumerate(params):
        importance = evaluator.quantify_importance([idx])[(idx,)]['total importance']
        result[param.name] = importance

    # store main results to disk
    filename = 'pimp_values_fanova.json'
    with open(os.path.join(save_folder, filename), 'w') as out_file:
        json.dump(result, out_file, sort_keys=True, indent=4, separators=(',', ': '))
        print('Saved individuals to %s' % os.path.join(save_folder, filename))

    # call plotting fn
    yrange = cutoffs if use_percentiles else (0, 1)
    if draw_plots:
        FanovaBackend._plot_result(evaluator, configspace, save_folder + '/fanova', yrange)

    if interaction_effect:
        result_interaction = {}
        for idx, param in enumerate(params):
            for idx2, param2 in enumerate(params):
                if param.name >= param2.name:  # string comparison cause stable
                    continue
                print('interaction effects between', param.name, param2.name)
                interaction = evaluator.quantify_importance([idx, idx2])[(idx, idx2)]['total importance']
                # Remove the individual effects to keep the pure interaction.
                interaction -= result[param.name]
                interaction -= result[param2.name]
                combined_name = param.name + '__' + param2.name
                if interaction < 0.0:
                    # %f instead of %d: the score is a float and %d would
                    # truncate e.g. -0.3 to 0 in the message.
                    raise ValueError('interaction score too low. Params: %s score %f'
                                     % (combined_name, interaction))
                result_interaction[combined_name] = interaction

        for idx, param in enumerate(params):
            for idx2, param2 in enumerate(params):
                if param.name >= param2.name:  # string comparison cause stable
                    continue
                for idx3, param3 in enumerate(params):
                    if param2.name >= param3.name:  # string comparison cause stable
                        continue
                    print('interaction effects between', param.name, param2.name, param3.name)
                    interaction = evaluator.quantify_importance(
                        [idx, idx2, idx3])[(idx, idx2, idx3)]['total importance']
                    # Remove individual and pairwise effects of the triple.
                    interaction -= result[param.name]
                    interaction -= result[param2.name]
                    interaction -= result[param3.name]
                    combined_name = param.name + '__' + param2.name + '__' + param3.name

                    interaction -= result_interaction[param.name + '__' + param2.name]
                    interaction -= result_interaction[param2.name + '__' + param3.name]
                    interaction -= result_interaction[param.name + '__' + param3.name]

                    if interaction < 0.0:
                        raise ValueError('interaction score too low. Params: %s score %f'
                                         % (combined_name, interaction))
                    result_interaction[combined_name] = interaction

        # store interaction effects to disk
        if sum(result_interaction.values()) + sum(result.values()) > 1:
            raise ValueError('Sum of results too high')

        filename = 'pimp_values_fanova_interaction.json'
        with open(os.path.join(save_folder, filename), 'w') as out_file:
            json.dump(result_interaction, out_file, sort_keys=True, indent=4, separators=(',', ': '))
            print('Saved interactions to %s' % os.path.join(save_folder, filename))
        if draw_plots:
            vis = Visualizer(evaluator, configspace, save_folder + '/fanova', y_label='Predictive Accuracy')
            vis.create_most_important_pairwise_marginal_plots()

    return save_folder + "/" + filename
class FANOVA:
    """Hyperparameter importance and marginals computed with fANOVA.

    Parameters
    ----------
    data: DataFrame
        observations; must contain the ``hp_names`` columns and ``objective``

    hp_names: List[str]
        list of hyper parameter names (to extract from the DataFrame)

    objective: str
        name of the objective

    hp_space:
        definition passed to ``Space.from_dict`` to build the space handed
        to the visualizer

    References
    ----------
    .. [1] F. Hutter and H. Hoos and K. Leyton-Brown
        "An Efficient Approach for Assessing Hyperparameter Importance"
        Proceedings of International Conference on Machine Learning 2014 (ICML 2014).

    """
    def __init__(self, data: pd.DataFrame, hp_names, objective, hp_space=None):
        if import_error is not None:
            raise import_error

        x = data[hp_names]
        y = data[objective]

        self.space = hp_space
        self.hp_names = hp_names
        self.fanova = fANOVA(x.values, y.values)
        self.size = len(hp_names)
        # Square matrices indexed by (hyperparameter, hyperparameter)
        self._importance = np.zeros((self.size, self.size))
        self._importance_std = np.zeros_like(self._importance)
        self.vis = Visualizer(self.fanova, Space.from_dict(hp_space).instantiate(), '/tmp', 'objective')
        self.computed = False

    def _gen(self):
        """Compute the importance once, on first access"""
        if not self.computed:
            self._compute_importance()
            self.computed = True

    @staticmethod
    def _marginal_rows(name, means, stds, values):
        """Pair means, stds and hyper parameter values element-wise into row dictionaries"""
        return [
            dict(name=name, objective=mean, value=value, std=std)
            for mean, std, value in zip(means, stds, values)
        ]

    def compute_marginal(self, index, marginals=None):
        """Compute the effect a change on the hyper parameter value has on the objective value

        Parameters
        ----------
        index: int
            position of the hyper parameter in ``hp_names``

        marginals: list, optional
            list the results are appended to

        Returns
        -------
        returns a list of dictionaries (name, objective, value, std)

        """
        data = self.vis.generate_marginal(index, resolution=100)
        marginals = select(marginals, [])

        # (mean, std, grid)
        if len(data) > 2:
            means, stds, values = data[0], data[1], data[2]
        # (mean, std)
        else:
            # Categorical: one mean/std per choice. The three sequences must be
            # iterated in lockstep; zipping `data` itself with the choices
            # would pair the whole mean vector with the first choice.
            means, stds = data[0], data[1]
            values = self._get_choices(index)

        marginals.extend(self._marginal_rows(self.hp_names[index], means, stds, values))
        return marginals

    def _get_choices(self, param):
        """For categorical HP retrieve the choices available"""
        from ConfigSpace import CategoricalHyperparameter

        p, _, _ = self.vis._get_parameter(param)

        if isinstance(p, CategoricalHyperparameter):
            return p.choices

        return ['Constant']

    def compute_marginals(self):
        """Compute the marginals of all hyper parameters

        Returns
        -------
        returns a list of dictionaries (name, objective, value, std)

        """
        marginals = []
        for i, _ in enumerate(self.hp_names):
            self.compute_marginal(i, marginals)
        return marginals

    @property
    def importance(self):
        """Total importance of single hyper parameters (diagonal) and pairs (off-diagonal)

        Returns
        -------
        Importance matrix of pairs of hyper parameters

        """
        self._gen()
        return self._importance

    @property
    def importance_std(self):
        """Standard deviation matching the entries of ``importance``

        Returns
        -------
        Standard deviation of the importance matrix of pairs of hyper parameters

        """
        self._gen()
        return self._importance_std

    @property
    def importance_long(self):
        """Importance as a list of dictionaries (row, col, importance, std)"""
        self._gen()
        return self._importance_long

    def _compute_importance(self):
        """Fill the importance matrices and the long-format list from fANOVA"""
        importance = self.fanova.quantify_importance(list(range(self.size)))
        self._importance_long = []

        for k, values in importance.items():
            # Single hyper parameters go on the diagonal, pairs off-diagonal,
            # higher order interactions are ignored.
            if len(k) == 1:
                i = j = k[0]
            elif len(k) == 2:
                i, j = k
            else:
                continue

            self._importance[i, j] = values['total importance']
            self._importance_std[i, j] = values['total std']

            self._importance_long.append(dict(
                row=self.hp_names[i],
                col=self.hp_names[j],
                importance=values['total importance'],
                std=values['total std']
            ))