Ejemplo n.º 1
0
    def __init__(self,
                 dataset_name,
                 configuration_space,
                 datasets_file,
                 experiments_file,
                 distance='l1',
                 seed=None,
                 use_features='',
                 distance_kwargs=None,
                 subset='all'):
        """Store the optimizer settings and build the underlying meta base.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset suggestions are wanted for.

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace
            Forwarded as first argument to ``MetaBase``.

        datasets_file : str
            Name of a yaml file holding one entry per dataset. Each entry
            needs the keys ``name``, ``file`` and ``metafeatures_file``.
            The names must differ from ``dataset_name``. ``file`` only
            records where the dataset lives and plays no role in
            metalearning. The metafeatures file must be in the .arff format
            produced by
            :class:`pyMetaLearn.metafeatures.metafeature.DatasetMetafeatures`.

        experiments_file : str
            Name of a yaml file holding one entry per dataset. Each entry
            needs the keys ``name`` (the dataset the experiments belong to,
            as given in ``datasets_file``) and ``experiments`` (a list of
            HPOlib pickle files or csv files with run information).

        distance : str, "l1" or "l2" or "random"
            Distance function for the kNearestDatasets algorithm.

        seed
            Stored as ``self.seed``; not used inside this constructor.

        use_features
            Stored as ``self.use_features``; not used inside this
            constructor.

        distance_kwargs
            Stored as ``self.distance_kwargs``; not used inside this
            constructor.

        subset
            Stored as ``self.subset``; not used inside this constructor.
        """
        # What to optimize for and over which search space.
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space

        # Where the metadata comes from.
        self.task_files_list = datasets_file
        self.experiments_file_list = experiments_file

        # Settings for the nearest-dataset search.
        self.distance = distance
        self.distance_kwargs = distance_kwargs
        self.seed = seed
        self.use_features = use_features
        self.subset = subset

        # Cache slot, filled later so repeated queries are faster.
        self.kND = None

        self.meta_base = MetaBase(
            configuration_space, datasets_file, experiments_file)
        args['metalearning_directory'])

    with open(args["task_files_list"]) as fh:
        task_files_list = fh.readlines()
    with open(args["experiments_list"]) as fh:
        experiments_list = fh.readlines()

    if 'keep_configurations' in args:
        keep_configurations = args['keep_configurations']
        keep_configurations = keep_configurations.split(',')
        keep_configurations = tuple(
            [tuple(kc.split('=')) for kc in keep_configurations])
    else:
        keep_configurations = None

    meta_base = MetaBase(task_files_list, experiments_list,
                         keep_configurations)
    metafeatures = meta_base.get_all_train_metafeatures_as_pandas()
    runs = meta_base.get_all_runs()
    split_masks = dict()
    training = dict()

    # This can print the best hyperparameters of every dataset
    # for dataset in runs:
    # print dataset, sorted(runs[dataset], key=lambda t: t.result)[0]

    for i, name in enumerate(runs):
        runs[name].sort()
        rs = np.random.RandomState(i * 37)
        ones = np.ones((200, ))
        zeros = np.zeros((len(runs[name]) - len(ones), ))
        numbers = np.append(ones, zeros)
Ejemplo n.º 3
0
    with open(args.tasks) as fh:
        task_files_list = fh.readlines()
    # Load all the experiment run data only if needed
    if args.distance == 'runs':
        with open(args.runs) as fh:
            experiments_file_list = fh.readlines()
    else:
        experiments_file_list = StringIO.StringIO()
        for i in range(len(task_files_list)):
            experiments_file_list.write("\n")
        experiments_file_list.seek(0)

    pyMetaLearn.data_repositories.openml.apiconnector.set_local_directory(
        args.experiment_directory)
    meta_base = MetaBase(task_files_list, experiments_file_list)
    metafeatures = meta_base.get_all_metafeatures_as_pandas(
        metafeature_subset=args.subset)
    metafeature_times = meta_base.get_all_metafeatures_times_as_pandas(
        metafeature_subset=args.subset)

    #if args.subset:
    #    metafeatures = metafeatures.loc[:,subsets[args.subset]]
    #    metafeature_times = metafeature_times.loc[:,subsets[args.subset]]

    runs = meta_base.get_all_runs()

    general_plot_directory = os.path.join(args.experiment_directory, "plots")
    try:
        os.mkdir(general_plot_directory)
    except:
Ejemplo n.º 4
0
import argparse
from collections import deque
import logging
import time

import numpy as np
import scipy.stats
import sklearn.cluster
import sklearn.manifold
import sklearn.preprocessing
import sklearn.utils

from pyMetaLearn.metalearning.meta_base import MetaBase, Run
from HPOlibConfigSpace.configuration_space import ConfigurationSpace

logging.basicConfig()

# Command line interface: one positional argument naming the metadata
# directory that is handed to MetaBase below.
parser = argparse.ArgumentParser()
parser.add_argument("metadata_dir", type=str)
args = parser.parse_args()

metadata_dir = args.metadata_dir

configuration_space = ConfigurationSpace()

meta_base = MetaBase(configuration_space, metadata_dir)
# Work on a copy so the frame held by the meta base is left untouched.
metafeatures = meta_base.metafeatures.copy()
# Replace missing values by the mean of their column.
metafeatures = metafeatures.fillna(metafeatures.mean())

scaler = sklearn.preprocessing.MinMaxScaler()
# Assign through the indexer instead of writing into ``.values``: for
# frames with mixed dtypes (or with pandas copy-on-write) ``.values`` is a
# temporary copy and the scaled numbers would be silently thrown away.
# Casting to float first lets the scaled floats be stored in every column.
metafeatures = metafeatures.astype(float)
metafeatures.iloc[:, :] = scaler.fit_transform(metafeatures.values)
Ejemplo n.º 5
0
class MetaLearningOptimizer(object):
    """Suggest configurations for a dataset from runs on other datasets.

    Metafeatures and runs are read through a ``MetaBase``. A
    ``KNearestDatasets`` model is fitted on first use and cached in
    ``self.kND``.
    """

    def __init__(self,
                 dataset_name,
                 configuration_space,
                 datasets_file,
                 experiments_file,
                 distance='l1',
                 seed=None,
                 use_features='',
                 distance_kwargs=None,
                 subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        datasets_file : str
            Filename, must be a yaml file which has one entry for every
            dataset. The entry must have the keys name, file and
            metafeatures_file. The names must be different to the
            dataset_name, file is information about where to find the dataset
            and not used to perform metalearning. Metafeatures file must
            contain metafeatures in the .arff format as written
            :class:`pyMetaLearn.metafeatures.metafeature.DatasetMetafeatures`.

        experiments_file : str
            Filename, must be a yaml file which has one entry for every
            dataset. The entry must have the keys name and experiments. Name
            is the name of the dataset where the experiment belongs to (as
            specified in the datasets_file) and experiments is a list of
            HPOlib pickle files or csv files which contain run information.

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed
            Passed to ``sklearn.utils.check_random_state`` when the
            nearest-dataset model is built.

        use_features : str, list or numpy.ndarray
            Metafeature columns to keep, either as a comma-separated string
            or as a sequence of column names. An empty value keeps all
            metafeatures.

        distance_kwargs : str or None
            If set, parsed with ``ast.literal_eval`` and handed to
            ``KNearestDatasets`` as ``distance_kwargs``.

        subset
            Name of the metafeature subset; within this class it is only
            written to the log.
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.task_files_list = datasets_file
        self.experiments_file_list = experiments_file
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None  # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, datasets_file,
                                  experiments_file)

    def perform_sequential_optimization(self,
                                        target_algorithm=test_function,
                                        time_budget=sys.maxint,
                                        evaluation_budget=sys.maxint):
        """Run the suggest/evaluate loop (currently disabled).

        Raises
        ------
        NotImplementedError
            Always; the method is switched off "due to timing issues".
        """
        raise NotImplementedError("Right now this is not implemented due to "
                                  "timing issues.")
        # NOTE: everything below is unreachable while the raise above is in
        # place. It is kept as the reference implementation. As written,
        # ``time_taken`` is never updated, so only the evaluation budget
        # could end the loop.
        time_taken = 0
        num_evaluations = 0
        history = []

        logger.info("Taking distance measure %s" % self.distance)
        while True:
            if time_taken >= time_budget:
                logger.info("Reached time budget. Exiting optimization.")
                break
            if num_evaluations >= evaluation_budget:
                logger.info("Reached maximum number of evaluations. Exiting "
                            "optimization.")
                break

            params = self.metalearning_suggest(history)

            fixed_params = OrderedDict()
            # Hack to remove the leading - from the params which are
            # accidentally in the experiment pickle of the current HPOlib
            # version
            for key in params:
                if key[0] == "-":
                    fixed_params[key[1:]] = params[key]
                else:
                    fixed_params[key] = params[key]

            logger.info(
                "%d/%d, parameters: %s" %
                (num_evaluations, evaluation_budget, str(fixed_params)))
            result = target_algorithm(fixed_params)
            history.append(Run(params, result))
            num_evaluations += 1

        return min([run.result for run in history])

    def metalearning_suggest_all(self, exclude_double_configurations=True):
        """Return a list of the best hyperparameters of neighboring datasets.

        Parameters
        ----------
        exclude_double_configurations : bool
            Forwarded to ``KNearestDatasets.kBestSuggestions``.

        Returns
        -------
        list
            Third element of every neighbor entry, in the order the
            neighbors are returned.
        """
        # TODO check if _learn was called before!
        neighbors = self._learn(exclude_double_configurations)
        hp_list = []
        for neighbor in neighbors:
            logger.info("%s %s %s", neighbor[0], neighbor[1], neighbor[2])
            hp_list.append(neighbor[2])
        return hp_list

    def metalearning_suggest(self, history):
        """Suggest the next most promising hyperparameters not yet evaluated.

        Parameters
        ----------
        history : iterable
            Earlier runs; each item must have a ``configuration`` attribute.

        Returns
        -------
        The configuration of the first neighbor whose configuration does not
        appear in ``history``.

        Raises
        ------
        StopIteration
            If every suggested configuration is already in ``history``.
        """
        # TODO test the object in the history!
        neighbors = self._learn()
        # Iterate over all datasets which are sorted ascending by distance
        for neighbor in neighbors:
            already_evaluated = any(
                neighbor[2] == run.configuration for run in history)
            if not already_evaluated:
                logger.info(
                    "Nearest dataset with hyperparameters of best value "
                    "not evaluated yet is %s with a distance of %f",
                    neighbor[0], neighbor[1])
                return neighbor[2]
        raise StopIteration("No more values available.")

    def _learn(self, exclude_double_configurations=True):
        """Fit the nearest-dataset model if needed and query it.

        The fitted ``KNearestDatasets`` instance is cached in ``self.kND``,
        so fitting happens only on the first call.

        Returns
        -------
        The result of ``kBestSuggestions`` for this dataset's metafeatures
        with ``k=-1``.
        """
        dataset_metafeatures, all_other_metafeatures = self._get_metafeatures()
        if self.kND is None:

            # In case that we learn our distance function, get the
            # parameters for the random forest
            if self.distance_kwargs:
                rf_params = ast.literal_eval(self.distance_kwargs)
            else:
                rf_params = None

            # To keep the distance the same in every iteration, we create a new
            # random state
            random_state = sklearn.utils.check_random_state(self.seed)
            kND = KNearestDatasets(distance=self.distance,
                                   random_state=random_state,
                                   distance_kwargs=rf_params)

            runs = {task_id: self.meta_base.get_runs(task_id)
                    for task_id in all_other_metafeatures.index}
            kND.fit(all_other_metafeatures, runs)
            self.kND = kND
        return self.kND.kBestSuggestions(
            dataset_metafeatures,
            k=-1,
            exclude_double_configurations=exclude_double_configurations)

    def _get_metafeatures(self):
        """Load all metafeatures, filter columns and split off this dataset.

        This is inside an extra function for testing purpose.

        If ``self.use_features`` is non-empty, only the named columns are
        kept. When none of the requested names matches a column, a message
        is logged and all metafeatures are kept.

        Returns
        -------
        tuple
            ``(dataset_metafeatures, all_other_metafeatures)`` as produced
            by ``_split_metafeature_array``.

        Raises
        ------
        NotImplementedError
            If ``self.use_features`` is truthy but neither a str, a list
            nor a numpy.ndarray.
        """
        logger.info("Going to use the metafeature subset: %s", self.subset)
        all_metafeatures = self.meta_base.get_all_metafeatures()
        logger.info(" ".join(all_metafeatures.columns))

        selection = self.use_features
        # bool() of a multi-element ndarray raises ValueError, so arrays are
        # tested for emptiness through their size instead.
        if isinstance(selection, np.ndarray):
            has_selection = selection.size > 0
        else:
            has_selection = bool(selection)

        if has_selection:
            # TODO: buggy and hacky, replace with a list separated by commas
            if isinstance(selection, str):
                use_features = selection.split(",")
            elif isinstance(selection, (list, np.ndarray)):
                use_features = selection
            else:
                raise NotImplementedError(type(selection))

            keep = [
                col for col in all_metafeatures.columns
                if col in use_features
            ]
            # Check the columns that actually matched, not the request:
            # the request is known to be non-empty at this point.
            if not keep:
                logger.info("You just tried to remove all metafeatures...")
            else:
                all_metafeatures = all_metafeatures.loc[:, keep]
                logger.info("Going to keep the following metafeatures:")
                logger.info(str(keep))

        return self._split_metafeature_array(self.dataset_name,
                                             all_metafeatures)

    def _split_metafeature_array(self, dataset_name, metafeatures):
        """Split the metafeature array into dataset metafeatures and all other.

        This is inside an extra function for testing purpose.

        Parameters
        ----------
        dataset_name
            Index label of the row to split off.
        metafeatures : pandas.DataFrame
            One row per dataset.

        Returns
        -------
        tuple
            A copy of the row labelled ``dataset_name`` and the frame
            without that row.
        """
        # ``.ix`` is gone from current pandas; ``.loc`` is the label-based
        # lookup and is already used elsewhere in this class.
        dataset_metafeatures = metafeatures.loc[dataset_name].copy()
        metafeatures = metafeatures[metafeatures.index != dataset_name]
        return dataset_metafeatures, metafeatures

    def read_task_list(self, fh):
        """Return the lines of ``fh`` with newline characters removed.

        Raises
        ------
        ValueError
            If a line is empty after removing the newline.
        """
        dataset_filenames = list()
        for line in fh:
            line = line.replace("\n", "")
            if not line:
                raise ValueError("Blank lines in the task list are not "
                                 "supported.")
            dataset_filenames.append(line)
        return dataset_filenames

    def read_experiments_list(self, fh):
        """Return one whitespace-split list of tokens per line of ``fh``."""
        return [line.split() for line in fh.readlines()]
Ejemplo n.º 6
0
        args['metalearning_directory'])

    with open(args["task_files_list"]) as fh:
        task_files_list = fh.readlines()
    with open(args["experiments_list"]) as fh:
        experiments_list = fh.readlines()

    if 'keep_configurations' in args:
        keep_configurations = args['keep_configurations']
        keep_configurations = keep_configurations.split(',')
        keep_configurations = tuple(
            [tuple(kc.split('=')) for kc in keep_configurations])
    else:
        keep_configurations = None

    meta_base = MetaBase(task_files_list, experiments_list, keep_configurations)
    metafeatures = meta_base.get_all_train_metafeatures_as_pandas()
    runs = meta_base.get_all_runs()

    # This can print the best hyperparameters of every dataset
    # for dataset in runs:
    # print dataset, sorted(runs[dataset], key=lambda t: t.result)[0]

    rf = LearnedDistanceRF(**params)
    X, Y = rf._create_dataset(metafeatures, runs)
    import cPickle

    with open("test.pkl", "w") as fh:
        cPickle.dump((X, Y, metafeatures), fh, -1)

    print "Metafeatures", metafeatures.shape
Ejemplo n.º 7
0
    with open(args.tasks) as fh:
        task_files_list = fh.readlines()
    # Load all the experiment run data only if needed
    if args.distance == 'runs':
        with open(args.runs) as fh:
            experiments_file_list = fh.readlines()
    else:
        experiments_file_list = StringIO.StringIO()
        for i in range(len(task_files_list)):
            experiments_file_list.write("\n")
        experiments_file_list.seek(0)

    pyMetaLearn.data_repositories.openml.apiconnector.set_local_directory(
        args.experiment_directory)
    meta_base = MetaBase(task_files_list, experiments_file_list)
    metafeatures = meta_base.get_all_metafeatures_as_pandas(
        metafeature_subset=args.subset)
    metafeature_times = meta_base.get_all_metafeatures_times_as_pandas(
        metafeature_subset=args.subset)

    #if args.subset:
    #    metafeatures = metafeatures.loc[:,subsets[args.subset]]
    #    metafeature_times = metafeature_times.loc[:,subsets[args.subset]]

    runs = meta_base.get_all_runs()

    general_plot_directory = os.path.join(args.experiment_directory, "plots")
    try:
        os.mkdir(general_plot_directory)
    except: