Example #1
    def evaluate(self, study: Study,
                 params: Optional[List[str]]) -> Dict[str, float]:
        distributions = _get_distributions(study, params)
        params_data, values_data = _get_study_data(study, distributions)

        evaluator = fANOVA(
            X=params_data,
            Y=values_data,
            config_space=_get_configuration_space(distributions),
            max_features=max(1, int(params_data.shape[1] * 0.7)),
        )

        individual_importances = {}
        for i, name in enumerate(evaluator.cs.get_hyperparameter_names()):
            imp = evaluator.quantify_importance((i, ))
            imp = imp[(i, )]["individual importance"]
            individual_importances[name] = imp

        tot_importance = sum(individual_importances.values())
        for name in individual_importances:
            individual_importances[name] /= tot_importance

        param_importances = OrderedDict(
            sorted(
                individual_importances.items(),
                key=lambda name_and_importance: name_and_importance[1],
                reverse=True,
            ))
        return param_importances
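
The same normalize-and-sort pattern also works on a plain fANOVA object; below is a minimal, self-contained sketch on synthetic data (the hyperparameter names, sizes, and toy objective are illustrative only).

import numpy as np
import fanova
import ConfigSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

# synthetic data: 100 evaluations of 3 parameters in [0, 1]
rng = np.random.RandomState(0)
X = rng.rand(100, 3)
Y = 2.0 * X[:, 0] + 0.1 * rng.rand(100)

cs = ConfigSpace.ConfigurationSpace()
for i in range(X.shape[1]):
    cs.add_hyperparameter(UniformFloatHyperparameter("x%d" % i, 0.0, 1.0))

f = fanova.fANOVA(X, Y, config_space=cs)

# individual importance per parameter, then normalize so they sum to one
raw = {}
for i, name in enumerate(cs.get_hyperparameter_names()):
    raw[name] = f.quantify_importance((i, ))[(i, )]["individual importance"]

total = sum(raw.values())
normalized = {name: value / total for name, value in raw.items()}
print(sorted(normalized.items(), key=lambda kv: kv[1], reverse=True))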
Example #2
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Takes the state-run files, merges them and prepares the configuration space for fANOVA.
    
    outputs: fANOVA object
    
    state_run_directory: str
                        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
                    path to the directory in which the merged states should be stored
    '''

    state_run_list = []
    files = glob(state_run_directory + "/*")
    for file in files:
        if file.startswith(state_run_directory + "/state-run"):
            state_run_list.append(file)
    state_merge.state_merge(state_run_list, destination_dir)
    merged_files = glob(destination_dir + '/*')

    for file in merged_files:
        if file.startswith(destination_dir + '/runs_and_results'):
            response_file = file
        if file.startswith(destination_dir + '/paramstrings'):
            paramstrings = file
    param_dict = output_reader.read_paramstrings_file(paramstrings)
    
    num_line = str(param_dict[0]).replace("'", "")
    num_line = str(num_line).replace("}", "")
    # messy way to get the parameter names wrt order
    f_params = []
    for line in str(num_line).split(" "):
        line = str(line).replace(",", "")
        line = line.replace('{', '')
        if ':' in line:
            parameter = line.replace(':', '')
            f_params.append(parameter)
    
    # get configspace
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    X = []
    hps = cs.get_hyperparameters()


    for p in param_dict:
        c = CS.Configuration(cs, fix_types(p, cs), allow_inactive_with_values=True)
        X.append([])
        for hp in hps:
            if hasattr(hp, 'choices'):
                value = hp.choices.index(c[hp.name])
            else:
                value = c[hp.name]
            X[-1].append(value)
    
    X = np.array(X)
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
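
A hedged usage sketch for the helper above; both directory paths are placeholders.

f = smac_to_fanova("pysmac_output/out/scenario", "merged_states")
# importance of the first hyperparameter in the merged configuration space
print(f.quantify_importance((0, )))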
Example #3
def fanova_to_df(data, algorithm, missing_values, cs1, cs2):
    """
    Derive importance of hyperparameter combinations
    for the given algorithm
    
    Input:
           data - (DataFrame) contains the performance data 
                  for a dataset
           algorithm - (str) takes one of the following options
                        {RandomForest, AdaBoost, ExtraTrees, 
                         SVM, GradientBoosting}
           missing_values - (boolean) whether imputation has 
                            been done on the dataset
           cs1, cs2 - configuration space objects
    Output:
           df - (DataFrame) contains the variance contributions
                per hyperparameter combination
           time_taken - performance time in sec
    
    """
    if missing_values:
        X = data.loc[:, sorted(data.columns[1:-1])].values
        y = data.iloc[:, -1].values
        cs = cs1
    else:
        X = data.loc[:, sorted(data.columns[1:-2])].values
        y = data.iloc[:, -1].values
        cs = cs2

    f = fanova.fANOVA(X, y,
                      n_trees=32,
                      bootstrapping=True,
                      config_space=cs)
    
    start = time.perf_counter()
    print('Singles')
    imp1 = get_single_importance(f)
    print('Pairs')
    imp2 = f.get_most_important_pairwise_marginals()
    print('Triples')
    if missing_values:
        imp3_1 = get_triple_importance(f, algorithm)
        imp3_2 = get_triple_impute(f, algorithm)
        imp3 = dict_merge(imp3_1, imp3_2)
    else:
        imp3 = get_triple_importance(f, algorithm)

    imp = dict_merge(imp1, imp2, imp3)
    end = time.perf_counter()

    time_taken = end - start
    print('time taken is {} min'.format(time_taken / 60))
    
    df = pd.DataFrame({'param': list(imp.keys()),
                       'importance': list(imp.values())},
                      index=None)
    return df, time_taken   
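
For reference, the final dict-to-DataFrame step in isolation, a minimal sketch with made-up importance values:

import pandas as pd

# keys may be single names or tuples of names (pairwise/triple combinations)
imp = {'max_depth': 0.41, ('max_depth', 'n_estimators'): 0.12, 'learning_rate': 0.30}
df = pd.DataFrame({'param': list(imp.keys()), 'importance': list(imp.values())})
print(df.sort_values('importance', ascending=False))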
Example #4
	def test_with_toy_data(self):
		
		f = fanova.fANOVA(self.X,self.y,self.cfs, bootstrapping=False, n_trees=1, seed=5, max_features=1)

		f.the_forest.save_latex_representation('/tmp/fanova_')
		print("="*80)
		print(f.the_forest.all_split_values())
		print("total variances", f.the_forest.get_trees_total_variances())
		print(f.quantify_importance([0,1]))
		print(f.trees_total_variance)
		
		print(f.V_U)
Example #5
def online_lda():
    marginals = []
    the_keys = []
    X = np.loadtxt('../example_data/online_lda/online_lda_features.csv', delimiter=",")
    Y = np.loadtxt('../example_data/online_lda/online_lda_responses.csv', delimiter=",")
    f = fanova.fANOVA(X, Y, n_trees=32, bootstrapping=True)

    res = f.quantify_importance((0, 1, 2))

    for key in res.keys():
        if key != (0, 1, 2):
            marginals.append(res[key]['individual importance'])
            the_keys.append(key)
    return the_keys, marginals
Example #6
    def __init__(self, data: pd.DataFrame, hp_names, objective, hp_space=None):
        if import_error is not None:
            raise import_error

        x = data[hp_names]
        y = data[objective]

        self.space = hp_space
        self.hp_names = hp_names
        self.fanova = fANOVA(x.values, y.values)
        self.size = len(hp_names)
        self._importance = np.zeros((self.size, self.size))
        self._importance_std = np.zeros_like(self._importance)
        self.vis = Visualizer(self.fanova, Space.from_dict(hp_space).instantiate(), '/tmp', 'objective')
        self.computed = False
Example #7
def calculate_importance(trials):
    df_x, responses, cs = __make_datas(trials)

    f = fANOVA(X=df_x, Y=responses, config_space=cs)

    num_of_features = len(df_x.columns)

    # marginal of particular parameter:
    importances = []
    for i in range(num_of_features):
        dims = (i, )
        res = f.quantify_importance(dims)
        importances.append(res)

    # # getting the 10 most important pairwise marginals sorted by importance
    # best_margs = f.get_most_important_pairwise_marginals(n=1)
    # print(best_margs)

    return importances
Example #8
def csv_example():
    marginals = []
    the_keys = []
    data = np.loadtxt("../example_data/csv-example/test_data.csv", delimiter=",")
    X = np.array(data[:, :2], dtype=np.float64)
    Y = np.array(data[:, -1:], dtype=np.float64).flatten()
    # config space
    pcs = list(zip(np.min(X, axis=0), np.max(X, axis=0)))
    cs = ConfigSpace.ConfigurationSpace()
    for i in range(len(pcs)):
        cs.add_hyperparameter(UniformFloatHyperparameter("%i" %i, pcs[i][0], pcs[i][1]))

    f2 = fanova.fANOVA(X, Y, cs)
    res = f2.quantify_importance((0, 1))
    
    for key in res.keys():
        marginals.append(res[key]['individual importance'])
        the_keys.append(key)
    
    return the_keys, marginals
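
A hedged usage sketch, assuming csv_example() above and the CSV file it reads are available:

keys, marginals = csv_example()
for key, marginal in zip(keys, marginals):
    print(key, marginal)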
Example #9
def run(args):
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    os.makedirs(args.output_dir, exist_ok=True)

    legal_hyperparameters = None
    for dataset in config_spaces.DATASETS:
        logging.info('Dataset %s' % dataset)

        # artificial dataset (here: features)
        directory = os.path.join(args.input_dir, dataset)
        features = np.loadtxt(directory + '/' + dataset + '-features.csv',
                              delimiter=",")
        responses = np.loadtxt(directory + '/' + dataset +
                               '-responses-acc.csv',
                               delimiter=",")

        cs = config_spaces.get_config_space(dataset, 0)
        if legal_hyperparameters is None:
            legal_hyperparameters = cs.get_hyperparameter_names()
        else:
            if legal_hyperparameters != cs.get_hyperparameter_names():
                raise ValueError()

        fanova_model = fanova.fANOVA(X=features,
                                     Y=responses,
                                     config_space=cs,
                                     n_trees=16,
                                     seed=7)

        # marginal of particular parameter:
        output_file = os.path.join(args.output_dir, '%s.txt' % dataset)
        with open(output_file, 'w') as fp:
            for idx, hyperparameter in enumerate(legal_hyperparameters):
                logging.info('Hyperparameter %d: %s' % (idx, hyperparameter))
                dims = (idx, )
                res = fanova_model.quantify_importance(dims)
                fp.write(str(res) + '\n')
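
A hedged sketch of how run() above might be invoked; the default directories are placeholders.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input_dir', default='data')       # placeholder path
parser.add_argument('--output_dir', default='results')   # placeholder path
run(parser.parse_args())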
Example #10
def calculate_hp_importance_over_dataset_size(working_dir, network,
                                              destination_dir):

    budgets = [81]

    importance = defaultdict(lambda: list())

    default_task_ids = read_task_ids(working_dir)
    default_task_dataset_sizes = \
        calculate_dataset_sizes(default_task_ids)
    dataset_size_for_task = dict()

    for task, task_size in zip(default_task_ids, default_task_dataset_sizes):
        dataset_size_for_task[task] = task_size

    fig = plt.figure(5)

    methods = {
        'Conditional Network': 'resnet_only_conditional',
    }
    task_dataset_sizes = list()
    config_space = get_fixed_conditional_fanova_fcresnet_config()
    hp_names = list(map(lambda hp: hp.name,
                        config_space.get_hyperparameters()))
    for method, folder in methods.items():
        for task_id in read_task_ids(working_dir):

            X = []
            y = []
            try:
                with open(
                        os.path.join(working_dir, folder, 'task_%d' % task_id,
                                     network, "results.pkl"), "rb") as fp:
                    result = pickle.load(fp)

                id2conf = result.get_id2config_mapping()
                all_runs = result.get_all_runs(only_largest_budget=False)
                all_runs = list(filter(lambda r: r.budget in budgets,
                                       all_runs))

                for r in all_runs:
                    if r.loss is None: continue
                    config = id2conf[r.config_id]['config']
                    X.append([
                        string_to_numerical_categorical(hp, config[hp])
                        for hp in hp_names
                    ])
                    y.append(r.loss)

            except FileNotFoundError:
                continue

            if len(X) > 0:
                fanova_object = fANOVA(np.asarray(X), np.asarray(y),
                                       config_space)
            else:
                continue
            for hp in hp_names:
                importance_hp = fanova_object.quantify_importance((hp, )).get(
                    (hp, ))
                importance[hp].append(importance_hp['individual importance'])
            task_dataset_sizes.append(dataset_size_for_task[task_id])
        path = os.path.join(os.path.expanduser(destination_dir),
                            'importance_over_datasets', method)
        os.makedirs(path, exist_ok=True)

        plt.xlabel("Dataset size")
        for hp in importance:

            plt.scatter(task_dataset_sizes, importance[hp])
            plt.ylabel("Importance %s" % hp)
            plt.rcParams['axes.unicode_minus'] = False
            title = "Importance of  %s over dataset size" % hp
            plt.title(title)
            plt.savefig(os.path.join(path, 'importance_%s.pdf' % hp),
                        bbox_inches='tight')
            plt.clf()
    plt.close(fig)
Example #11
def get_fanova_info(
    base_dir,
    params_to_ignore=('seed', 'exp_id', 'unique_id', 'exp_name'),
    ylabel='AverageReturn',
):
    data_and_variants = get_trials(base_dir)
    experiment_data_list, variants_list = zip(*data_and_variants)
    ylabel = ylabel.replace(' ', '_')
    ylabel = ylabel.replace('-', '')
    if ylabel not in experiment_data_list[0].dtype.names:
        print("Possible ylabels:")
        for name in experiment_data_list[0].dtype.names:
            print(" - {}".format(name))
        raise ValueError("Invalid ylabel: {}".format(ylabel))
    indices_of_experiments_with_data = [
        i for i, exp in enumerate(experiment_data_list)
        if exp[ylabel].size >= 1
    ]
    if len(indices_of_experiments_with_data) != len(experiment_data_list):
        print("WARNING: Skipping some experiments. Probably because they only "
              "have one data point.")
    valid_experiment_data_list = [
        d for i, d in enumerate(experiment_data_list)
        if i in indices_of_experiments_with_data
    ]
    variants_list = [
        v for i, v in enumerate(variants_list)
        if i in indices_of_experiments_with_data
    ]
    Y = np.array([
        exp[ylabel][-1] if exp[ylabel].size > 1 else np.array(
            float(exp[ylabel]), dtype=np.double)
        for exp in valid_experiment_data_list
    ])
    filtered_variants_list = remove_keys_with_nonunique_values(
        variants_list, params_to_ignore=params_to_ignore)
    filtered_variants_to_values = get_dict_key_to_values(
        filtered_variants_list)
    names = list(filtered_variants_list[0].keys())
    X_raw = _extract_features(filtered_variants_list, names)
    config_space, X, categorical_remapping = (
        _get_config_space_and_new_features(
            X_raw,
            names,
            filtered_variants_to_values,
        ))

    # Not sure why, but config_space shuffles the order of the hyperparameters
    new_name_order = [
        config_space.get_hyperparameter_by_idx(i) for i in range(len(names))
    ]
    new_order = [names.index(name) for name in new_name_order]
    X = [X[i] for i in new_order]
    # X has to be [feature_dim x batch_size], but fANOVA expects the transpose
    X = np.array(X, dtype=object).T
    return FanovaInfo(
        fANOVA(X, Y, config_space=config_space),
        config_space,
        X,
        Y,
        categorical_remapping,
        variants_list,
    )
Example #12
    X = np.array([
        [0, 0],
        [0.1, 1],
        [0.4, 1],
        [0.5, 1],
        [0.6, 1],
        [0.65, 1],
        [1, 1],
    ])
    y = np.random.rand(len(X))

    config_space = ConfigSpace.ConfigurationSpace()
    # config_space.add_hyperparameter(CategoricalHyperparameter('f', [0, 1, 2]))
    config_space.add_hyperparameter(
        UniformFloatHyperparameter('f', X[:, 0].min(), X[:, 0].max()))
    config_space.add_hyperparameter(CategoricalHyperparameter('f2', [0, 1]))
    f = fanova.fANOVA(X, y, config_space)

    import sys

    import os
    import pickle
    import tempfile
    import unittest

    class TestfANOVAtoyData(unittest.TestCase):
        def setUp(self):
            self.X = np.loadtxt('/home/vitchyr/tmp_features.csv',
                                delimiter=',')
            self.y = np.loadtxt('/home/vitchyr/tmp_responses.csv',
                                delimiter=',')
Example #13
# get sample data from online lda
X = np.loadtxt(path + '/example_data/online_lda/online_lda_features.csv', delimiter=",")
Y = np.loadtxt(path + '/example_data/online_lda/online_lda_responses.csv', delimiter=",")

# setting up config space:
param_file = path + '/example_data/online_lda/param-file.txt'
cs = ConfigurationSpace()
with open(param_file, 'rb') as f:
    for row in f:
        cs.add_hyperparameter(
            UniformFloatHyperparameter("%s" % row[0:4].decode('utf-8'),
                                       float(row[6:9]), float(row[10:13]),
                                       float(row[18:21])))
param = cs.get_hyperparameters()


# create an instance of fanova with data for the random forest and the configSpace
f = fANOVA(X = X, Y = Y, config_space = cs)

# marginal for first parameter
p_list = (0, )
res = f.quantify_importance(p_list)
print(res)

p2_list = ('Col1', 'Col2')
res2 = f.quantify_importance(p2_list)
print(res2)
p2_list = ('Col0', 'Col2')
res2 = f.quantify_importance(p2_list)
print(res2)
p2_list = ('Col1', 'Col0')
res2 = f.quantify_importance(p2_list)
print(res2)
Example #14
        for datasize in config["datasizes"]:

            if "bo" in algo:
                title = system + "_" + app + "_" + datasize + "_" + estimator + "_" + acq
                write_to_config(config, acq=acq, estimator=estimator)
                X, Y = get_results()

            else:
                title = system + "_" + app + "_" + datasize
                write_to_config(config)
                X, Y = get_results()

            if len(X) == 0 or len(Y) == 0:
                continue

            f = fANOVA(X, Y)

            param_importance = []
            all_params = ["Budget", "Experiment", "Init Samples", *params]
            for i, e in enumerate(all_params):
                importance = f.quantify_importance((i, ))
                param_importance.append(
                    [e,
                     list(importance.values())[0]['individual importance']])
                # print(importance.values())
                print(e, importance)
            print(param_importance)

            ### Marginal Pairwise importance. Takes too long
Example #15
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Takes the state-run files, merges them and prepares the configuration space for fANOVA.
    
    outputs: fANOVA object
    
    state_run_directory: str
                        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
                    path to the directory in which the merged states should be stored
    '''

    state_run_list = []
    files = glob(state_run_directory + "/*")
    for file in files:
        if file.startswith(state_run_directory + "/state-run"):
            state_run_list.append(file)
    state_merge.state_merge(state_run_list, destination_dir)
    merged_files = glob(destination_dir + '/*')

    for file in merged_files:
        if file.startswith(destination_dir + '/runs_and_results'):
            response_file = file
        if file.startswith(destination_dir + '/paramstrings'):
            paramstrings = file
    param_dict = output_reader.read_paramstrings_file(paramstrings)

    num_line = str(param_dict[0]).replace("'", "")
    num_line = str(num_line).replace("}", "")
    # messy way to get the parameter names wrt order
    f_params = []
    for line in str(num_line).split(" "):
        line = str(line).replace(",", "")
        line = line.replace('{', '')
        if ':' in line:
            parameter = line.replace(':', '')
            f_params.append(parameter)

    # get configspace
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    X = []
    hps = cs.get_hyperparameters()

    for p in param_dict:
        c = CS.Configuration(cs,
                             fix_types(p, cs),
                             allow_inactive_with_values=True)
        X.append([])
        for hp in hps:
            if hasattr(hp, 'choices'):
                value = hp.choices.index(c[hp.name])
            else:
                value = c[hp.name]
            X[-1].append(value)

    X = np.array(X)
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
Example #16
                    for hp_name in hp_order:
                        config_dict[hp_name] = []

                    for config in config_list:
                        for hp_name in hp_order:
                            config_dict[hp_name].append(config[hp_name])

                    for i, hp in enumerate(hp_type):
                        if isinstance(hp, CategoricalHyperparameter):
                            encoder = OrdinalEncoder()
                            # ordinal-encode the categorical values, one sample per row
                            config_dict[hp_order[i]] = encoder.fit_transform(
                                np.array(config_dict[hp_order[i]]).reshape(-1, 1)).flatten()
                        elif isinstance(hp, UnParametrizedHyperparameter) or isinstance(hp, Constant):
                            config_dict[hp_order[i]] = [0] * len(config_dict[hp_order[i]])

                X = pd.DataFrame.from_dict(config_dict)

                f = fANOVA(X=X, Y=Y, config_space=cs)

                # marginal for first parameter
                for key in hp_order:
                    p_list = (key,)
                    importance = f.quantify_importance(p_list)[p_list]['total importance']
                    individual_importance[key].append(importance)
        except Exception as e:
            print(e)

print(individual_importance)
with open('lightgbm_%d.pkl' % rep_num, 'wb') as f:
    pkl.dump(individual_importance, f)
Example #17
inputs = f['inputs']
outputs = f['outputs']

import ipdb
ipdb.set_trace()
outputs = np.zeros(inputs.shape[1])

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("0", 0., 1., default_value=0.5))
cs.add_hyperparameter(CategoricalHyperparameter("1", [0., 1.], default_value=1.))
cs.add_hyperparameter(CategoricalHyperparameter("2", [0., 1.], default_value=1.))
cs.add_hyperparameter(UniformFloatHyperparameter("3", 0., 1., default_value=0.5))
cs.add_hyperparameter(UniformFloatHyperparameter("4", 0., 1., default_value=0.5))
param = cs.get_hyperparameters()

f = fANOVA(X, Y, config_space=cs, n_trees=30)

print('Total importances')
res = f.quantify_importance((0,))
print(res[(0,)]['total importance'])
res = f.quantify_importance((1,))
print(res[(1,)]['total importance'])
res = f.quantify_importance((2,))
print(res[(2,)]['total importance'])
res = f.quantify_importance((3,))
print(res[(3,)]['total importance'])
res = f.quantify_importance((4,))
print(res[(4,)]['total importance'])

print('Pair-wise importance')
print(f.get_most_important_pairwise_marginals((0, 1, 2, 3, 4)))
Example #18
# artificial dataset (here: features)
features = np.loadtxt(path + '/example_data/diabetes_features.csv',
                      delimiter=",")
responses = np.loadtxt(path + '/example_data/diabetes_responses.csv',
                       delimiter=",")

# config space
pcs = list(zip(np.min(features, axis=0), np.max(features, axis=0)))
cs = ConfigSpace.ConfigurationSpace()
for i in range(len(pcs)):
    cs.add_hyperparameter(
        UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))

# create an instance of fanova with trained forest and ConfigSpace
f = fANOVA(X=features, Y=responses, config_space=cs)

# marginal of particular parameter:
dims = (1, )
res = f.quantify_importance(dims)
print(res)

# getting the 10 most important pairwise marginals sorted by importance
best_margs = f.get_most_important_pairwise_marginals(n=10)
print(best_margs)

# visualizations:
# first create an instance of the visualizer with fanova object and configspace
vis = fanova.visualizer.Visualizer(f, cs, 'example_output')
# creating the plot of pairwise marginal:
vis.plot_pairwise_marginal((0, 2), resolution=20)
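
A short, hedged follow-up using the same visualizer instance (the resolution value is illustrative):

# single-parameter marginal plot
vis.plot_marginal(0, resolution=100)
# write the full set of marginal plots to the output directory given above
vis.create_all_plots()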
Example #19
    def __init__(
        self,
        configspace,
        top_n_percent,
        num_samples=64,
        random_fraction=1 / 3,
        bandwidth_factor=3,
        min_bandwidth=1e-3,
        previous_results=None,
        logger=None,
        use_gp=False
    ):
        self.logger = logger

        self.configspace = configspace
        self.best_previous_config_projected = None
        self.configspace_important_or_new = None
        if previous_results is not None and len(previous_results.batch_results) > 0:
            # Assume same-task changing-configspace trajectory for now
            results_previous_adjustment = previous_results.batch_results[-1]
            config_ranking_previous_adjustment = rank_configs(
                results_previous_adjustment.results[0]
            )

            # 2. Construct intersection / only_new configspace
            configspace_old = results_previous_adjustment.configspace
            (
                configspace_intersection,
                configspace_only_new,
            ) = get_configspace_partitioning(self.configspace, configspace_old)

            # 3. Project configs to the intersection configspace
            config_ranking_previous_projected = project_configs(
                config_ranking_previous_adjustment, configspace_intersection
            )
            config_ranking_previous_projected = sortout_configs(
                config_ranking_previous_projected, configspace_intersection
            )

            # 4. Read in best previous projected config
            self.best_previous_config_projected = config_ranking_previous_projected[0]

            # 5. Determine important hyperparameters
            x, y, _ = results_previous_adjustment.results[0].get_fANOVA_data(
                configspace_old
            )
            f = fANOVA(x, y, configspace_old)

            def importance(hyperparameter_name):
                imp = f.quantify_importance((hyperparameter_name,))
                return imp[(hyperparameter_name,)]["total importance"]

            hyperparameter_to_importance = {
                hyperparameter: importance(hyperparameter.name)
                for hyperparameter in configspace_old.get_hyperparameters()
            }
            mean_importance = np.mean(list(hyperparameter_to_importance.values()))
            important_hyperparameters = {
                hyperparameter.name
                for hyperparameter, importance in hyperparameter_to_importance.items()
                if importance >= mean_importance
            }

            # 6. Construct configspace with only important or new hyperparameters
            important_hyperparameters = [
                hyperparameter
                for hyperparameter in self.configspace.get_hyperparameters()
                if hyperparameter.name in important_hyperparameters
            ]
            self.configspace_important_or_new = copy.deepcopy(configspace_only_new)
            self.configspace_important_or_new.add_hyperparameters(
                important_hyperparameters
            )

            # 7. Initialize TPE for the important-or-new configspace
            tpe_configspace = self.configspace_important_or_new
        else:
            tpe_configspace = configspace

        if use_gp:
            self.tpe_current = GPSampler(tpe_configspace, logger=self.logger)
        else:
            self.tpe_current = TPESampler(
                tpe_configspace,
                top_n_percent,
                num_samples,
                random_fraction,
                bandwidth_factor,
                min_bandwidth,
                logger,
            )
Example #20
# directory in which you can find all plots
plot_dir = path + '/example_data/test_plots'

# artificial dataset (here: features)
features = np.loadtxt(path + '/example_data/diabetes_features.csv', delimiter=",")
responses = np.loadtxt(path + '/example_data/diabetes_responses.csv', delimiter=",")

# config space
pcs = list(zip(np.min(features, axis=0), np.max(features, axis=0)))
cs = ConfigSpace.ConfigurationSpace()
for i in range(len(pcs)):
    cs.add_hyperparameter(
        UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))


# create an instance of fanova with trained forest and ConfigSpace
f = fANOVA(X=features, Y=responses, config_space=cs)

# marginal of particular parameter:
dims = (1, )
res = f.quantify_importance(dims)
print(res)

# getting the 10 most important pairwise marginals sorted by importance
best_margs = f.get_most_important_pairwise_marginals(n=10)
print(best_margs)

# visualizations:
# first create an instance of the visualizer with fanova object and configspace
vis = fanova.visualizer.Visualizer(f, cs)
# creating the plot of pairwise marginal:
vis.plot_pairwise_marginal((0,2), resolution=20)
Example #21
from fanova import fANOVA
import numpy as np
from robo.fmin import random_search
from hpolib.benchmarks.synthetic_functions import Branin
import fanova.visualizer


objective_function = Branin()
info = objective_function.get_meta_information()
bounds = np.array(info['bounds'])
config_space = objective_function.get_configuration_space()

# Start Bayesian optimization to optimize the objective function
results = random_search(objective_function, bounds[:, 0], bounds[:, 1], num_iterations=50)

# Creating a fANOVA object
X = np.array(results['X'])
Y = np.array(results['y'])
f = fANOVA(X, Y)

print(f.quantify_importance((0, )))

# Visualization
vis = fanova.visualizer.Visualizer(f, config_space, "./plots/")
vis.plot_marginal(1)
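
A hedged follow-up sketch continuing the Branin example above:

# joint importance of the two Branin dimensions
print(f.quantify_importance((0, 1)))
# single most important pairwise marginal
print(f.get_most_important_pairwise_marginals(n=1))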
Example #22
# directory in which you can find all plots
plot_dir = path + '/example_data/test_plots'

# artificial dataset (here: features)
features = np.loadtxt(path + '/example_data/diabetes_features.csv',
                      delimiter=",")
responses = np.loadtxt(path + '/example_data/diabetes_responses.csv',
                       delimiter=",")
df_x = pd.DataFrame(features[0:443, 0:3],
                    columns=['0', '1', '2'])  #,'3', '4','5','6','7','8','9'])

# config space
pcs = list(zip(np.min(df_x, axis=0), np.max(df_x, axis=0)))
cs = ConfigSpace.ConfigurationSpace()
#for i in range(len(pcs)):
for i in range(3):
    cs.add_hyperparameter(
        UniformFloatHyperparameter("%i" % i, pcs[i][0], pcs[i][1]))

# create an instance of fanova with trained forest and ConfigSpace
f = fANOVA(X=df_x, Y=responses, config_space=cs)

# marginal of particular parameter:
dims = ('0', '1', '2')
res = f.quantify_importance(dims)
print(res)

# getting the 10 most important pairwise marginals sorted by importance
best_margs = f.get_most_important_pairwise_marginals(n=2)
print(best_margs)
Example #23
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Takes the state-run files, merges them and prepares the configuration space for fANOVA.
    
    outputs: fANOVA object
    
    state_run_directory: str
                        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
                    path to the directory in which the merged states should be stored
    '''
    state_run_list = []
    files = glob(state_run_directory + "/*")
    for file in files:
        if file.startswith(state_run_directory + "/state-run"):
            state_run_list.append(file)
    state_merge.state_merge(state_run_list, destination_dir)
    merged_files = glob(destination_dir + '/*')
    for file in merged_files:
        if file.startswith(destination_dir + '/runs_and_results'):
            response_file = file
        if file.startswith(destination_dir + '/paramstrings'):
            paramstrings = file
    param_dict = output_reader.read_paramstrings_file(paramstrings)
    
    num_line = str(param_dict[0]).replace("'", "")
    num_line = str(num_line).replace("}", "")
    # messy way to get the parameter names wrt order
    f_params = []
    for line in str(num_line).split(" "):
        line = str(line).replace(",", "")
        line = line.replace('{', '')
        if ':' in line:
            parameter = line.replace(':', '')
            f_params.append(parameter)

    # getting features
    all_nums = []
    for dict_row in param_dict:
        num_line = str(dict_row).replace("'", "")
        num_line = str(num_line).replace("}", "")
        nums = []
        for line in str(num_line).split(" "):
            line = str(line).replace(",", "")
            if line.isdigit():
                nums.append(int(line))
            elif line.replace(".", "", 1).isdigit():
                nums.append(float(line))
            elif '-' in line:
                new_line = line.replace("-", "")
                if new_line.isdigit():
                    nums.append(int(line))
                elif new_line.replace(".", "", 1).isdigit():
                    nums.append(float(line))
        all_nums.append(nums)

    x = np.array(all_nums)
    length = len(x)
    Y = data_extractor(response_file, length)
    with open(destination_dir + '/param.pcs') as fh:
        orig_pcs = fh.readlines()
    cs = pcs_new.read(orig_pcs, debug=True)
    X = np.zeros(x.shape)


    for i in range(x.shape[1]):
        idx = cs.get_idx_by_hyperparameter_name(f_params[i])
        X[:, idx] = x[:, i]
    # create an instance of fanova with data for the random forest and the configSpace
    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
Example #24
color = {'boxes': 'y', 'whiskers': 'grey', 'medians': 'white', 'caps': 'grey'}
box = mean_scores.boxplot(vert=False,
                          notch=True,
                          grid=False,
                          color=color,
                          patch_artist=True,
                          sym='+')
plt.xlabel('Error', fontsize=15)
plt.xticks(fontsize=9)
plt.yticks(fontsize=12)
plt.show()

# Compute fANOVA externally (plots look better than from CAVE)
X = config_csv.drop(columns=['CONFIG_ID'], axis=1).to_numpy()
y = runhistory_csv['cost'].to_numpy()
f = fANOVA(X, y)
vis = fanova.visualizer.Visualizer(f, cs, "./fANOVA_plots/")
vis.create_all_plots()

# Search Space Evaluation
choice = 'criterion'
# Preprocessing
XandY = pd.concat([X, y], axis=1)
XandY = XandY[XandY[choice].notna()]
y = XandY['cost']
X = XandY.drop(columns=['cost'])
X_numeric = X.select_dtypes(include=['number'])
X_object = X.select_dtypes(include=['category', 'object', 'bool'])
choiceLabelEncoder = LabelEncoder()
labelEncoder = LabelEncoder()
for colname in X_object: