def evaluate(self, study: Study, params: Optional[List[str]]) -> Dict[str, float]:
    distributions = _get_distributions(study, params)
    params_data, values_data = _get_study_data(study, distributions)

    evaluator = fANOVA(
        X=params_data,
        Y=values_data,
        config_space=_get_configuration_space(distributions),
        max_features=max(1, int(params_data.shape[1] * 0.7)),
    )

    # individual (main-effect) importance of each hyperparameter
    individual_importances = {}
    for i, name in enumerate(evaluator.cs.get_hyperparameter_names()):
        imp = evaluator.quantify_importance((i,))
        individual_importances[name] = imp[(i,)]["individual importance"]

    # normalize so the importances sum to one
    tot_importance = sum(individual_importances.values())
    for name in individual_importances:
        individual_importances[name] /= tot_importance

    # sort in descending order of importance
    param_importances = OrderedDict(
        reversed(
            sorted(
                individual_importances.items(),
                key=lambda name_and_importance: name_and_importance[1],
            )
        )
    )
    return param_importances
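# A minimal usage sketch for an evaluator like the one above, assuming it is
# wired into Optuna the way optuna.importance.FanovaImportanceEvaluator is;
# the objective function below is purely illustrative.
import optuna

def objective(trial):
    x = trial.suggest_float("x", -10, 10)
    y = trial.suggest_int("y", 0, 5)
    return (x - 2) ** 2 + y

study = optuna.create_study()
study.optimize(objective, n_trials=50)
# get_param_importances uses a fANOVA-based evaluator by default
print(optuna.importance.get_param_importances(study))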
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Takes the state-run files, merges them and prepares the configuration
    space for fANOVA.

    outputs: fANOVA object

    state_run_directory: str
        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
        path to the directory in which the merged states should be stored
    '''
    state_run_list = []
    files = glob(state_run_directory + "/*")
    for file in files:
        if file.startswith(state_run_directory + "/state-run"):
            state_run_list.append(file)
    state_merge.state_merge(state_run_list, destination_dir)

    merged_files = glob(destination_dir + '/*')
    for file in merged_files:
        if file.startswith(destination_dir + '/runs_and_results'):
            response_file = file
        if file.startswith(destination_dir + '/paramstrings'):
            paramstrings = file

    param_dict = output_reader.read_paramstrings_file(paramstrings)
    num_line = str(param_dict[0]).replace("'", "").replace("}", "")

    # messy way to get the parameter names wrt order
    f_params = []
    for line in num_line.split(" "):
        line = line.replace(",", "").replace('{', '')
        if ':' in line:
            f_params.append(line.replace(':', ''))

    # get configspace
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    X = []
    hps = cs.get_hyperparameters()
    for p in param_dict:
        c = CS.Configuration(cs, fix_types(p, cs), allow_inactive_with_values=True)
        X.append([])
        for hp in hps:
            if hasattr(hp, 'choices'):
                value = hp.choices.index(c[hp.name])
            else:
                value = c[hp.name]
            X[-1].append(value)

    X = np.array(X)
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
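# Hypothetical invocation of smac_to_fanova; both paths are placeholders.
f = smac_to_fanova('pysmac_output/out/scenario', 'merged_states')
res = f.quantify_importance((0,))
print(res[(0,)]['individual importance'])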
def fanova_to_df(data, algorithm, missing_values, cs1, cs2):
    """
    Derive importance of hyperparameter combinations for the given algorithm

    Input:
        data - (DataFrame) contains the performance data for a dataset
        algorithm - (str) takes one of the following options
            {RandomForest, AdaBoost, ExtraTrees, SVM, GradientBoosting}
        missing_values - (boolean) whether imputation has been done on the dataset
        cs1, cs2 - configuration space objects
    Output:
        df - (DataFrame) contains the variance contributions per
            hyperparameter combination
        time_taken - performance time in sec
    """
    if missing_values:
        X = data.loc[:, sorted(data.columns[1:-1])].values
        y = data.iloc[:, -1].values
        cs = cs1
    else:
        X = data.loc[:, sorted(data.columns[1:-2])].values
        y = data.iloc[:, -1].values
        cs = cs2

    f = fanova.fANOVA(X, y, n_trees=32, bootstrapping=True, config_space=cs)

    start = time.perf_counter()
    print('Singles')
    imp1 = get_single_importance(f)
    print('Pairs')
    imp2 = f.get_most_important_pairwise_marginals()
    print('Triples')
    if missing_values:
        imp3_1 = get_triple_importance(f, algorithm)
        imp3_2 = get_triple_impute(f, algorithm)
        imp3 = dict_merge(imp3_1, imp3_2)
    else:
        imp3 = get_triple_importance(f, algorithm)
    imp = dict_merge(imp1, imp2, imp3)
    end = time.perf_counter()
    time_taken = end - start
    print('time taken is {} min'.format(time_taken / 60))

    df = pd.DataFrame({'param': list(imp.keys()),
                       'importance': list(imp.values())}, index=None)
    return df, time_taken
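# dict_merge is not shown in this snippet; a minimal sketch, assuming it
# simply folds several {param: importance} dicts into one:
def dict_merge(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)  # later dicts win on key collisions
    return merged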
def test_with_toy_data(self):
    f = fanova.fANOVA(self.X, self.y, self.cfs, bootstrapping=False,
                      n_trees=1, seed=5, max_features=1)
    f.the_forest.save_latex_representation('/tmp/fanova_')
    print("=" * 80)
    print(f.the_forest.all_split_values())
    print("total variances", f.the_forest.get_trees_total_variances())
    print(f.quantify_importance([0, 1]))
    print(f.trees_total_variance)
    print(f.V_U)
def online_lda():
    marginals = []
    the_keys = []
    X = np.loadtxt('../example_data/online_lda/online_lda_features.csv', delimiter=",")
    Y = np.loadtxt('../example_data/online_lda/online_lda_responses.csv', delimiter=",")
    f = fanova.fANOVA(X, Y, n_trees=32, bootstrapping=True)

    res = f.quantify_importance((0, 1, 2))
    for key in res.keys():
        if key != (0, 1, 2):
            marginals.append(res[key]['individual importance'])
            the_keys.append(key)
    return the_keys, marginals
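# One possible way to consume online_lda(): a quick bar chart of the
# lower-order marginals (the matplotlib plotting here is an illustrative
# assumption, not part of the original example).
import matplotlib.pyplot as plt

keys, marginals = online_lda()
plt.bar(range(len(marginals)), marginals)
plt.xticks(range(len(marginals)), [str(k) for k in keys], rotation=45)
plt.ylabel('individual importance')
plt.tight_layout()
plt.show()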
def __init__(self, data: pd.DataFrame, hp_names, objective, hp_space=None):
    if import_error is not None:
        raise import_error

    x = data[hp_names]
    y = data[objective]

    self.space = hp_space
    self.hp_names = hp_names
    self.fanova = fANOVA(x.values, y.values)
    self.size = len(hp_names)
    self._importance = np.zeros((self.size, self.size))
    self._importance_std = np.zeros_like(self._importance)
    self.vis = Visualizer(
        self.fanova, Space.from_dict(hp_space).instantiate(), '/tmp', 'objective')
    self.computed = False
def calculate_importance(trials):
    df_x, responses, cs = __make_datas(trials)
    f = fANOVA(X=df_x, Y=responses, config_space=cs)
    num_of_features = len(df_x.columns)

    # marginal of particular parameter:
    importances = []
    for i in range(num_of_features):
        dims = (i,)
        res = f.quantify_importance(dims)
        importances.append(res)

    # # getting the 10 most important pairwise marginals sorted by importance
    # best_margs = f.get_most_important_pairwise_marginals(n=1)
    # print(best_margs)

    return importances
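# Each entry returned by calculate_importance is a dict keyed by the queried
# dimension tuple; a small sketch that flattens the results into one mapping:
def flatten_importances(importances):
    flat = {}
    for res in importances:
        for dims, stats in res.items():
            flat[dims] = stats['individual importance']
    return flat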
def csv_example():
    marginals = []
    the_keys = []
    data = np.loadtxt("../example_data/csv-example/test_data.csv", delimiter=",")
    X = np.array(data[:, :2], dtype=float)
    Y = np.array(data[:, -1:], dtype=float).flatten()

    # config space
    pcs = list(zip(np.min(X, axis=0), np.max(X, axis=0)))
    cs = ConfigSpace.ConfigurationSpace()
    for i in range(len(pcs)):
        cs.add_hyperparameter(UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))

    f2 = fanova.fANOVA(X, Y, cs)
    res = f2.quantify_importance((0, 1))
    for key in res.keys():
        marginals.append(res[key]['individual importance'])
        the_keys.append(key)
    return the_keys, marginals
def run(args):
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    os.makedirs(args.output_dir, exist_ok=True)

    legal_hyperparameters = None
    for dataset in config_spaces.DATASETS:
        logging.info('Dataset %s' % dataset)
        # artificial dataset (here: features)
        directory = os.path.join(args.input_dir, dataset)
        features = np.loadtxt(directory + '/' + dataset + '-features.csv', delimiter=",")
        responses = np.loadtxt(directory + '/' + dataset + '-responses-acc.csv', delimiter=",")
        cs = config_spaces.get_config_space(dataset, 0)
        if legal_hyperparameters is None:
            legal_hyperparameters = cs.get_hyperparameter_names()
        else:
            if legal_hyperparameters != cs.get_hyperparameter_names():
                raise ValueError('hyperparameter names differ across datasets')

        fanova_model = fanova.fANOVA(X=features, Y=responses, config_space=cs,
                                     n_trees=16, seed=7)

        # marginal of particular parameter:
        output_file = os.path.join(args.output_dir, '%s.txt' % dataset)
        with open(output_file, 'w') as fp:
            for idx, hyperparameter in enumerate(legal_hyperparameters):
                logging.info('Hyperparameter %d: %s' % (idx, hyperparameter))
                dims = (idx,)
                res = fanova_model.quantify_importance(dims)
                fp.write(str(res) + '\n')
def calculate_hp_importance_over_dataset_size(working_dir, network, destination_dir):
    budgets = [81]
    importance = defaultdict(lambda: list())
    default_task_ids = read_task_ids(working_dir)
    default_task_dataset_sizes = calculate_dataset_sizes(default_task_ids)

    dataset_size_for_task = dict()
    for task, task_size in zip(default_task_ids, default_task_dataset_sizes):
        dataset_size_for_task[task] = task_size

    fig = plt.figure(5)
    methods = {
        'Conditional Network': 'resnet_only_conditional',
    }
    task_dataset_sizes = list()
    config_space = get_fixed_conditional_fanova_fcresnet_config()
    hp_names = list(map(lambda hp: hp.name, config_space.get_hyperparameters()))

    for method, folder in methods.items():
        for task_id in read_task_ids(working_dir):
            X = []
            y = []
            try:
                with open(os.path.join(working_dir, folder, 'task_%d' % task_id,
                                       network, "results.pkl"), "rb") as fp:
                    result = pickle.load(fp)
                id2conf = result.get_id2config_mapping()
                all_runs = result.get_all_runs(only_largest_budget=False)
                all_runs = list(filter(lambda r: r.budget in budgets, all_runs))
                for r in all_runs:
                    if r.loss is None:
                        continue
                    config = id2conf[r.config_id]['config']
                    X.append([
                        string_to_numerical_categorical(hp, config[hp])
                        for hp in hp_names
                    ])
                    y.append(r.loss)
            except FileNotFoundError:
                continue

            if len(X) > 0:
                fanova_object = fANOVA(np.asarray(X), np.asarray(y), config_space)
            else:
                continue

            for hp in hp_names:
                importance_hp = fanova_object.quantify_importance((hp,)).get((hp,))
                importance[hp].append(importance_hp['individual importance'])
            task_dataset_sizes.append(dataset_size_for_task[task_id])

        # ensure the output directory exists
        path = os.path.join(os.path.expanduser(destination_dir),
                            'importance_over_datasets', method)
        os.makedirs(path, exist_ok=True)

        plt.xlabel("Dataset size")
        for hp in importance:
            plt.scatter(task_dataset_sizes, importance[hp])
            plt.ylabel("Importance %s" % hp)
            plt.rcParams['axes.unicode_minus'] = False
            title = "Importance of %s over dataset size" % hp
            plt.title(title)
            plt.savefig(os.path.join(path, 'importance_%s.pdf' % hp),
                        bbox_inches='tight')
            plt.clf()
    plt.close(fig)
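# string_to_numerical_categorical is not shown above; a plausible sketch,
# assuming it maps categorical choices to their index and leaves numerical
# values untouched (this version takes the config space explicitly, whereas
# the snippet's version presumably closes over it):
def string_to_numerical_categorical(hp_name, value, config_space):
    hp = config_space.get_hyperparameter(hp_name)
    if hasattr(hp, 'choices'):
        return hp.choices.index(value)
    return value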
def get_fanova_info(
        base_dir,
        params_to_ignore=('seed', 'exp_id', 'unique_id', 'exp_name'),
        ylabel='AverageReturn',
):
    data_and_variants = get_trials(base_dir)
    experiment_data_list, variants_list = zip(*data_and_variants)
    ylabel = ylabel.replace(' ', '_')
    ylabel = ylabel.replace('-', '')
    if ylabel not in experiment_data_list[0].dtype.names:
        print("Possible ylabels:")
        for name in experiment_data_list[0].dtype.names:
            print(" - {}".format(name))
        raise ValueError("Invalid ylabel: {}".format(ylabel))

    indices_of_experiments_with_data = [
        i for i, exp in enumerate(experiment_data_list)
        if exp[ylabel].size >= 1
    ]
    if len(indices_of_experiments_with_data) != len(experiment_data_list):
        print("WARNING: Skipping some experiments. Probably because they only "
              "have one data point.")
    valid_experiment_data_list = [
        d for i, d in enumerate(experiment_data_list)
        if i in indices_of_experiments_with_data
    ]
    variants_list = [
        v for i, v in enumerate(variants_list)
        if i in indices_of_experiments_with_data
    ]
    Y = np.array([
        exp[ylabel][-1] if exp[ylabel].size > 1
        else np.array(float(exp[ylabel]), dtype=np.double)
        for exp in valid_experiment_data_list
    ])

    filtered_variants_list = remove_keys_with_nonunique_values(
        variants_list, params_to_ignore=params_to_ignore)
    filtered_variants_to_values = get_dict_key_to_values(filtered_variants_list)
    names = list(filtered_variants_list[0].keys())
    X_raw = _extract_features(filtered_variants_list, names)
    config_space, X, categorical_remapping = (
        _get_config_space_and_new_features(
            X_raw,
            names,
            filtered_variants_to_values,
        ))

    # Not sure why, but config_space shuffles the order of the hyperparameters
    new_name_order = [
        config_space.get_hyperparameter_by_idx(i) for i in range(len(names))
    ]
    new_order = [names.index(name) for name in new_name_order]
    X = [X[i] for i in new_order]

    # X is [feature_dim x batch_size], but fANOVA expects the transpose
    X = np.array(X, dtype=object).T
    return FanovaInfo(
        fANOVA(X, Y, config_space=config_space),
        config_space,
        X,
        Y,
        categorical_remapping,
        variants_list,
    )
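# FanovaInfo is not defined in this snippet; judging by the constructor call
# above, a minimal sketch could be a namedtuple with these six fields:
from collections import namedtuple

FanovaInfo = namedtuple(
    'FanovaInfo',
    ['f', 'config_space', 'X', 'Y', 'categorical_remapping', 'variants_list'],
)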
X = np.array([
    [0, 0],
    [0.1, 1],
    [0.4, 1],
    [0.5, 1],
    [0.6, 1],
    [0.65, 1],
    [1, 1],
])
y = np.random.rand(len(X))

config_space = ConfigSpace.ConfigurationSpace()
# config_space.add_hyperparameter(CategoricalHyperparameter('f', [0, 1, 2]))
config_space.add_hyperparameter(
    UniformFloatHyperparameter('f', X[:, 0].min(), X[:, 0].max()))
config_space.add_hyperparameter(CategoricalHyperparameter('f2', [0, 1]))
f = fanova.fANOVA(X, y, config_space)

import sys
import os
import pickle
import tempfile
import unittest


class TestfANOVAtoyData(unittest.TestCase):
    def setUp(self):
        self.X = np.loadtxt('/home/vitchyr/tmp_features.csv', delimiter=',')
        self.y = np.loadtxt('/home/vitchyr/tmp_responses.csv', delimiter=',')
# get sample data from online lda
X = np.loadtxt(path + '/example_data/online_lda/online_lda_features.csv', delimiter=",")
Y = np.loadtxt(path + '/example_data/online_lda/online_lda_responses.csv', delimiter=",")

# setting up config space:
param_file = path + '/example_data/online_lda/param-file.txt'
cs = ConfigurationSpace()
with open(param_file, 'rb') as fh:
    for row in fh:
        cs.add_hyperparameter(UniformFloatHyperparameter(
            "%s" % row[0:4].decode('utf-8'),
            float(row[6:9]), float(row[10:13]), float(row[18:21])))
param = cs.get_hyperparameters()

# create an instance of fanova with data for the random forest and the configSpace
f = fANOVA(X=X, Y=Y, config_space=cs)

# marginal for first parameter
p_list = (0,)
res = f.quantify_importance(p_list)
print(res)

p2_list = ('Col1', 'Col2')
res2 = f.quantify_importance(p2_list)
print(res2)

p2_list = ('Col0', 'Col2')
res2 = f.quantify_importance(p2_list)
print(res2)

p2_list = ('Col1', 'Col0')
res2 = f.quantify_importance(p2_list)
print(res2)
for datasize in config["datasizes"]: if "bo" in algo: title = system + "_" + app + "_" + datasize + "_" + estimator + "_" + acq write_to_config(config, acq=acq, estimator=estimator) X, Y = get_results() else: title = system + "_" + app + "_" + datasize write_to_config(config) X, Y = get_results() if len(X) == 0 or len(Y) == 0: continue f = fANOVA(X, Y) i = 0 param_importance = [] all_params = ["Budget", "Experiment", "Init Samples", *params] for e in all_params: importance = f.quantify_importance((i, )) param_importance.append( [e, list(importance.values())[0]['individual importance']]) # print(importance.values()) print(e, importance) i += 1 print(param_importance) ### Marginal Pairwise importance. Takes too long
try:
    for hp_name in hp_order:
        config_dict[hp_name] = []
    for config in config_list:
        for hp_name in hp_order:
            config_dict[hp_name].append(config[hp_name])

    for i, hp in enumerate(hp_type):
        if isinstance(hp, CategoricalHyperparameter):
            # ordinal-encode the categorical column
            encoder = OrdinalEncoder()
            config_dict[hp_order[i]] = encoder.fit_transform(
                np.array(config_dict[hp_order[i]]).reshape(-1, 1)).ravel()
        elif isinstance(hp, UnParametrizedHyperparameter) or isinstance(hp, Constant):
            config_dict[hp_order[i]] = [0] * len(config_dict[hp_order[i]])

    X = pd.DataFrame.from_dict(config_dict)
    f = fANOVA(X=X, Y=Y, config_space=cs)

    # marginal for each parameter
    for key in hp_order:
        p_list = (key,)
        importance = f.quantify_importance(p_list)[p_list]['total importance']
        individual_importance[key].append(importance)
except Exception as e:
    print(e)

print(individual_importance)
with open('lightgbm_%d.pkl' % rep_num, 'wb') as fp:
    pkl.dump(individual_importance, fp)
inputs = f['inputs']
outputs = f['outputs']
# NOTE: the loaded responses are replaced with zeros here (placeholder)
outputs = np.zeros(inputs.shape[1])

# X and Y were not defined in this fragment; assuming they come from the
# loaded arrays, with inputs stored one column per observation:
X = np.array(inputs).T
Y = np.array(outputs)

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("0", 0., 1., default_value=0.5))
cs.add_hyperparameter(CategoricalHyperparameter("1", [0., 1.], default_value=1.))
cs.add_hyperparameter(CategoricalHyperparameter("2", [0., 1.], default_value=1.))
cs.add_hyperparameter(UniformFloatHyperparameter("3", 0., 1., default_value=0.5))
cs.add_hyperparameter(UniformFloatHyperparameter("4", 0., 1., default_value=0.5))
param = cs.get_hyperparameters()

f = fANOVA(X, Y, config_space=cs, n_trees=30)

print('Total importances')
for i in range(5):
    res = f.quantify_importance((i,))
    print(res[(i,)]['total importance'])

print('Pair-wise importance')
print(f.get_most_important_pairwise_marginals((0, 1, 2, 3, 4)))
# artificial dataset (here: features)
features = np.loadtxt(path + '/example_data/diabetes_features.csv', delimiter=",")
responses = np.loadtxt(path + '/example_data/diabetes_responses.csv', delimiter=",")

# config space
pcs = list(zip(np.min(features, axis=0), np.max(features, axis=0)))
cs = ConfigSpace.ConfigurationSpace()
for i in range(len(pcs)):
    cs.add_hyperparameter(
        UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))

# create an instance of fanova with trained forest and ConfigSpace
f = fANOVA(X=features, Y=responses, config_space=cs)

# marginal of particular parameter:
dims = (1,)
res = f.quantify_importance(dims)
print(res)

# getting the 10 most important pairwise marginals sorted by importance
best_margs = f.get_most_important_pairwise_marginals(n=10)
print(best_margs)

# visualizations:
# first create an instance of the visualizer with fanova object and configspace
vis = fanova.visualizer.Visualizer(f, cs, 'example_output')

# creating the plot of pairwise marginal:
vis.plot_pairwise_marginal((0, 2), resolution=20)
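# Possible follow-on visualizer calls, both used elsewhere in these snippets:
# a single marginal plot, and the full sweep of plots into the output dir.
vis.plot_marginal(1, resolution=100)
vis.create_all_plots()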
def __init__(
        self,
        configspace,
        top_n_percent,
        num_samples=64,
        random_fraction=1 / 3,
        bandwidth_factor=3,
        min_bandwidth=1e-3,
        previous_results=None,
        logger=None,
        use_gp=False,
):
    self.logger = logger
    self.configspace = configspace
    self.best_previous_config_projected = None
    self.configspace_important_or_new = None

    if previous_results is not None and len(previous_results.batch_results) > 0:
        # Assume same-task changing-configspace trajectory for now
        results_previous_adjustment = previous_results.batch_results[-1]

        # 1. Rank the configs of the previous adjustment
        config_ranking_previous_adjustment = rank_configs(
            results_previous_adjustment.results[0]
        )

        # 2. Construct intersection / only_new configspace
        configspace_old = results_previous_adjustment.configspace
        (
            configspace_intersection,
            configspace_only_new,
        ) = get_configspace_partitioning(self.configspace, configspace_old)

        # 3. Project configs to the intersection configspace
        config_ranking_previous_projected = project_configs(
            config_ranking_previous_adjustment, configspace_intersection
        )
        config_ranking_previous_projected = sortout_configs(
            config_ranking_previous_projected, configspace_intersection
        )

        # 4. Read in best previous projected config
        self.best_previous_config_projected = config_ranking_previous_projected[0]

        # 5. Determine important hyperparameters
        x, y, _ = results_previous_adjustment.results[0].get_fANOVA_data(
            configspace_old
        )
        f = fANOVA(x, y, configspace_old)

        def importance(hyperparameter_name):
            imp = f.quantify_importance((hyperparameter_name,))
            return imp[(hyperparameter_name,)]["total importance"]

        hyperparameter_to_importance = {
            hyperparameter: importance(hyperparameter.name)
            for hyperparameter in configspace_old.get_hyperparameters()
        }
        mean_importance = np.mean(list(hyperparameter_to_importance.values()))
        important_hyperparameter_names = {
            hyperparameter.name
            for hyperparameter, importance_value in hyperparameter_to_importance.items()
            if importance_value >= mean_importance
        }

        # 6. Construct configspace with only important or new hyperparameters
        important_hyperparameters = [
            hyperparameter
            for hyperparameter in self.configspace.get_hyperparameters()
            if hyperparameter.name in important_hyperparameter_names
        ]
        self.configspace_important_or_new = copy.deepcopy(configspace_only_new)
        self.configspace_important_or_new.add_hyperparameters(
            important_hyperparameters
        )

        # 7. Initialize tpe for the important-or-new configspace
        tpe_configspace = self.configspace_important_or_new
    else:
        tpe_configspace = configspace

    if use_gp:
        self.tpe_current = GPSampler(tpe_configspace, logger=self.logger)
    else:
        self.tpe_current = TPESampler(
            tpe_configspace,
            top_n_percent,
            num_samples,
            random_fraction,
            bandwidth_factor,
            min_bandwidth,
            logger,
        )
# directory in which you can find all plots
plot_dir = path + '/example_data/test_plots'

# artificial dataset (here: features)
features = np.loadtxt(path + '/example_data/diabetes_features.csv', delimiter=",")
responses = np.loadtxt(path + '/example_data/diabetes_responses.csv', delimiter=",")

# config space
pcs = list(zip(np.min(features, axis=0), np.max(features, axis=0)))
cs = ConfigSpace.ConfigurationSpace()
for i in range(len(pcs)):
    cs.add_hyperparameter(UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))

# create an instance of fanova with trained forest and ConfigSpace
f = fANOVA(X=features, Y=responses, config_space=cs)

# marginal of particular parameter:
dims = (1,)
res = f.quantify_importance(dims)
print(res)

# getting the 10 most important pairwise marginals sorted by importance
best_margs = f.get_most_important_pairwise_marginals(n=10)
print(best_margs)

# visualizations:
# first create an instance of the visualizer with fanova object and configspace
vis = fanova.visualizer.Visualizer(f, cs, plot_dir)

# creating the plot of pairwise marginal:
vis.plot_pairwise_marginal((0, 2), resolution=20)
from fanova import fANOVA
import numpy as np
from robo.fmin import random_search
from hpolib.benchmarks.synthetic_functions import Branin
import fanova.visualizer

objective_function = Branin()
info = objective_function.get_meta_information()
bounds = np.array(info['bounds'])
config_space = objective_function.get_configuration_space()

# Run random search to collect evaluations of the objective function
results = random_search(objective_function, bounds[:, 0], bounds[:, 1],
                        num_iterations=50)

# Creating a fANOVA object
X = np.array([i for i in results['X']])
Y = np.array([i for i in results['y']])
f = fANOVA(X, Y)
print(f.quantify_importance((0,)))

# Visualization
vis = fanova.visualizer.Visualizer(f, config_space, "./plots/")
vis.plot_marginal(1)
# directory in which you can find all plots
plot_dir = path + '/example_data/test_plots'

# artificial dataset (here: features); only the first three columns are used
df_x = pd.DataFrame(features[0:443, 0:3], columns=['0', '1', '2'])
# columns=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] for all features
features = np.loadtxt(path + '/example_data/diabetes_features.csv', delimiter=",")
responses = np.loadtxt(path + '/example_data/diabetes_responses.csv', delimiter=",")

# config space
pcs = list(zip(np.min(df_x, axis=0), np.max(df_x, axis=0)))
cs = ConfigSpace.ConfigurationSpace()
# for i in range(len(pcs)):
for i in range(3):
    cs.add_hyperparameter(
        UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))

# create an instance of fanova with trained forest and ConfigSpace
f = fANOVA(X=df_x, Y=responses, config_space=cs)

# marginal of particular parameter:
dims = ('0', '1', '2')
res = f.quantify_importance(dims)
print(res)

# getting the 2 most important pairwise marginals sorted by importance
best_margs = f.get_most_important_pairwise_marginals(n=2)
print(best_margs)
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Takes the state-run files, merges them and prepares the configuration
    space for fANOVA.

    outputs: fANOVA object

    state_run_directory: str
        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
        path to the directory in which the merged states should be stored
    '''
    state_run_list = []
    files = glob(state_run_directory + "/*")
    for file in files:
        if file.startswith(state_run_directory + "/state-run"):
            state_run_list.append(file)
    state_merge.state_merge(state_run_list, destination_dir)

    merged_files = glob(destination_dir + '/*')
    for file in merged_files:
        if file.startswith(destination_dir + '/runs_and_results'):
            response_file = file
        if file.startswith(destination_dir + '/paramstrings'):
            paramstrings = file

    param_dict = output_reader.read_paramstrings_file(paramstrings)
    num_line = str(param_dict[0]).replace("'", "").replace("}", "")

    # messy way to get the parameter names wrt order
    f_params = []
    for line in num_line.split(" "):
        line = line.replace(",", "").replace('{', '')
        if ':' in line:
            f_params.append(line.replace(':', ''))

    # getting features
    all_nums = []
    for dict_row in param_dict:
        num_line = str(dict_row).replace("'", "").replace("}", "")
        nums = []
        for line in num_line.split(" "):
            line = line.replace(",", "")
            if line.isdigit():
                nums.append(int(line))
            elif line.replace(".", "", 1).isdigit():
                nums.append(float(line))
            elif '-' in line:
                new_line = line.replace("-", "")
                if new_line.isdigit():
                    nums.append(int(line))
                elif new_line.replace(".", "", 1).isdigit():
                    nums.append(float(line))
        all_nums.append(nums)

    x = np.array(all_nums)
    length = len(x)
    Y = data_extractor(response_file, length)

    with open(destination_dir + '/param.pcs') as fh:
        orig_pcs = fh.readlines()
    cs = pcs_new.read(orig_pcs, debug=True)

    # reorder the columns of x to match the hyperparameter order in the configspace
    X = np.zeros(x.shape)
    for i in range(x.shape[1]):
        idx = cs.get_idx_by_hyperparameter_name(f_params[i])
        X[:, idx] = x[:, i]

    # create an instance of fanova with data for the random forest and the configSpace
    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
color = {'boxes': 'y', 'whiskers': 'grey', 'medians': 'white', 'caps': 'grey'}
box = mean_scores.boxplot(vert=False, notch=True, grid=False, color=color,
                          patch_artist=True, sym='+')
plt.xlabel('Fehler', fontsize=15)  # 'Fehler' is German for 'error'
plt.xticks(fontsize=9)
plt.yticks(fontsize=12)
plt.show()

# Compute fANOVA externally (plots look better than from CAVE)
X = config_csv.drop(columns=['CONFIG_ID'])
y = runhistory_csv['cost']
f = fANOVA(X.to_numpy(), y.to_numpy())
vis = fanova.visualizer.Visualizer(f, cs, "./fANOVA_plots/")
vis.create_all_plots()

# Search Space Evaluation
choice = 'criterion'

# Preprocessing
XandY = pd.concat([X, y], axis=1)
XandY = XandY[XandY[choice].notna()]
y = XandY['cost']
X = XandY.drop(columns=['cost'])
X_numeric = X.select_dtypes(include=['number'])
X_object = X.select_dtypes(include=['category', 'object', 'bool'])
choiceLabelEncoder = LabelEncoder()
labelEncoder = LabelEncoder()
for colname in X_object:
    # the original snippet is truncated here; a plausible continuation
    # label-encodes each non-numeric column:
    X_object[colname] = labelEncoder.fit_transform(X_object[colname].astype(str))