def __init__(self, dataset_name, configuration_space, datasets_file,
             experiments_file, distance='l1', seed=None, use_features='',
             distance_kwargs=None, subset='all'):
    """Set up the metalearning optimizer and load its meta base.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset for which suggestions are wanted.

    configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace
        Passed on to :class:`MetaBase`.

    datasets_file : str
        Filename of a yaml file with one entry per dataset. Every entry
        needs the keys ``name``, ``file`` and ``metafeatures_file``. The
        names must differ from ``dataset_name``. ``file`` tells where
        the dataset lives and is not used for metalearning. The
        metafeatures file holds metafeatures in the .arff format as
        written by
        :class:`pyMetaLearn.metafeatures.metafeature.DatasetMetafeatures`.

    experiments_file : str
        Filename of a yaml file with one entry per dataset. Every entry
        needs the keys ``name`` and ``experiments``. ``name`` is the
        dataset the experiments belong to (as given in
        ``datasets_file``); ``experiments`` is a list of HPOlib pickle
        or csv files with run information.

    distance : str, "l1" or "l2" or "random"
        Distance function used by the kNearestDatasets algorithm.

    seed, use_features, distance_kwargs, subset
        Stored on the instance as given; they are not interpreted here.
    """
    # Build the meta base up front; it only depends on the arguments.
    meta_base = MetaBase(configuration_space, datasets_file,
                         experiments_file)

    # Identification of the target dataset and its search space.
    self.dataset_name = dataset_name
    self.configuration_space = configuration_space

    # Input files, kept under the names the rest of the code expects.
    self.task_files_list = datasets_file
    self.experiments_file_list = experiments_file

    # Settings of the nearest-dataset search.
    self.distance = distance
    self.seed = seed
    self.use_features = use_features
    self.distance_kwargs = distance_kwargs
    self.subset = subset

    # For caching, makes things faster...
    self.kND = None

    self.meta_base = meta_base
args['metalearning_directory']) with open(args["task_files_list"]) as fh: task_files_list = fh.readlines() with open(args["experiments_list"]) as fh: experiments_list = fh.readlines() if 'keep_configurations' in args: keep_configurations = args['keep_configurations'] keep_configurations = keep_configurations.split(',') keep_configurations = tuple( [tuple(kc.split('=')) for kc in keep_configurations]) else: keep_configurations = None meta_base = MetaBase(task_files_list, experiments_list, keep_configurations) metafeatures = meta_base.get_all_train_metafeatures_as_pandas() runs = meta_base.get_all_runs() split_masks = dict() training = dict() # This can print the best hyperparameters of every dataset # for dataset in runs: # print dataset, sorted(runs[dataset], key=lambda t: t.result)[0] for i, name in enumerate(runs): runs[name].sort() rs = np.random.RandomState(i * 37) ones = np.ones((200, )) zeros = np.zeros((len(runs[name]) - len(ones), )) numbers = np.append(ones, zeros)
with open(args.tasks) as fh: task_files_list = fh.readlines() # Load all the experiment run data only if needed if args.distance == 'runs': with open(args.runs) as fh: experiments_file_list = fh.readlines() else: experiments_file_list = StringIO.StringIO() for i in range(len(task_files_list)): experiments_file_list.write("\n") experiments_file_list.seek(0) pyMetaLearn.data_repositories.openml.apiconnector.set_local_directory( args.experiment_directory) meta_base = MetaBase(task_files_list, experiments_file_list) metafeatures = meta_base.get_all_metafeatures_as_pandas( metafeature_subset=args.subset) metafeature_times = meta_base.get_all_metafeatures_times_as_pandas( metafeature_subset=args.subset) #if args.subset: # metafeatures = metafeatures.loc[:,subsets[args.subset]] # metafeature_times = metafeature_times.loc[:,subsets[args.subset]] runs = meta_base.get_all_runs() general_plot_directory = os.path.join(args.experiment_directory, "plots") try: os.mkdir(general_plot_directory) except:
# argparse is used below, so make sure it is imported in this module.
import argparse
from collections import deque
import logging
import time

import numpy as np
import scipy.stats
import sklearn.cluster
import sklearn.manifold
import sklearn.preprocessing
import sklearn.utils

from pyMetaLearn.metalearning.meta_base import MetaBase, Run
from HPOlibConfigSpace.configuration_space import ConfigurationSpace

logging.basicConfig()

# Command line: a single positional argument naming the metadata directory.
parser = argparse.ArgumentParser()
parser.add_argument("metadata_dir", type=str)
args = parser.parse_args()
metadata_dir = args.metadata_dir

# Load the meta base from the given directory with an empty configuration
# space.
configuration_space = ConfigurationSpace()
meta_base = MetaBase(configuration_space, metadata_dir)

# Work on a copy so the meta base's own metafeatures stay untouched, and
# replace missing values by the per-column mean.
metafeatures = meta_base.metafeatures.copy()
metafeatures = metafeatures.fillna(metafeatures.mean())

# Rescale every metafeature with a MinMaxScaler. The result is assigned
# through .loc instead of writing into ``metafeatures.values``: ``.values``
# may be a copy (mixed dtypes) or read-only (pandas copy-on-write), in which
# case the scaled values would never reach the DataFrame.
scaler = sklearn.preprocessing.MinMaxScaler()
metafeatures.loc[:, :] = scaler.fit_transform(metafeatures.values)
class MetaLearningOptimizer(object):
    """Suggest hyperparameters based on the runs of the nearest datasets.

    The optimizer loads metafeatures and runs through a :class:`MetaBase`,
    fits a ``KNearestDatasets`` model on all datasets except
    ``dataset_name`` and returns the configurations that model suggests.
    """

    def __init__(self, dataset_name, configuration_space, datasets_file,
                 experiments_file, distance='l1', seed=None,
                 use_features='', distance_kwargs=None, subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        datasets_file : str
            Filename, must be a yaml file which has one entry for every
            dataset. The entry must have the keys name, file and
            metafeatures_file. The names must be different to the
            dataset_name, file is information about where to find the
            dataset and not used to perform metalearning. Metafeatures
            file must contain metafeatures in the .arff format as written
            by
            :class:`pyMetaLearn.metafeatures.metafeature.DatasetMetafeatures`.

        experiments_file : str
            Filename, must be a yaml file which has one entry for every
            dataset. The entry must have the keys name and experiments.
            Name is the name of the dataset where the experiment belongs
            to (as specified in the datasets_file) and experiments is a
            list of HPOlib pickle files or csv files which contain run
            information.

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed : None, int or RandomState
            Handed to ``sklearn.utils.check_random_state`` when the nearest
            datasets model is built.

        use_features : str, list or np.ndarray
            Metafeatures to keep: either a comma-separated string or a
            sequence of column names. An empty value keeps all of them.

        distance_kwargs : str or dict, optional
            Keyword arguments forwarded to ``KNearestDatasets`` as
            ``distance_kwargs``. A string is parsed with
            ``ast.literal_eval``.

        subset : str
            Name of the metafeature subset; within this class it is only
            logged.
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.task_files_list = datasets_file
        self.experiments_file_list = experiments_file
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None  # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, datasets_file,
                                  experiments_file)

    def perform_sequential_optimization(self, target_algorithm=test_function,
                                        time_budget=sys.maxint,
                                        evaluation_budget=sys.maxint):
        """Evaluate suggested configurations one after another (disabled).

        Raises
        ------
        NotImplementedError
            Always. The method is switched off on purpose.
        """
        raise NotImplementedError("Right now this is not implemented due to "
                                  "timing issues.")

        # NOTE: everything below is unreachable and only kept as a reference
        # for re-enabling the method. ``time_taken`` is never updated, so the
        # time budget would not be enforced as written.
        time_taken = 0
        num_evaluations = 0
        history = []

        logger.info("Taking distance measure %s", self.distance)
        while True:
            if time_taken >= time_budget:
                logger.info("Reached time budget. Exiting optimization.")
                break
            if num_evaluations >= evaluation_budget:
                logger.info("Reached maximum number of evaluations. Exiting "
                            "optimization.")
                break

            params = self.metalearning_suggest(history)

            # Hack to remove the leading "-" from parameter names which is
            # accidentally in the experiment pickle of the current HPOlib
            # version.
            fixed_params = OrderedDict()
            for key in params:
                if key.startswith("-"):
                    fixed_params[key[1:]] = params[key]
                else:
                    fixed_params[key] = params[key]

            logger.info("%d/%d, parameters: %s", num_evaluations,
                        evaluation_budget, str(fixed_params))
            result = target_algorithm(fixed_params)
            history.append(Run(params, result))
            num_evaluations += 1

        return min(run.result for run in history)

    def metalearning_suggest_all(self, exclude_double_configurations=True):
        """Return a list of the best hyperparameters of neighboring datasets.

        Every neighbor returned by :meth:`_learn` is logged and its third
        entry (the hyperparameters) is collected, in the order given.
        """
        # TODO check if _learn was called before!
        neighbors = self._learn(exclude_double_configurations)
        hp_list = []
        for neighbor in neighbors:
            logger.info("%s %s %s", neighbor[0], neighbor[1], neighbor[2])
            hp_list.append(neighbor[2])
        return hp_list

    def metalearning_suggest(self, history):
        """Suggest the next most promising hyperparameters which were not yet
        evaluated.

        Parameters
        ----------
        history : iterable
            Previous runs; each needs a ``configuration`` attribute.

        Raises
        ------
        StopIteration
            If every suggested configuration is already in ``history``.
        """
        # TODO test the object in the history!
        neighbors = self._learn()

        # Iterate over all datasets which are sorted ascending by distance
        for neighbor in neighbors:
            # Skip suggestions which were already evaluated
            if any(neighbor[2] == run.configuration for run in history):
                continue

            logger.info("Nearest dataset with hyperparameters of best value "
                        "not evaluated yet is %s with a distance of %f",
                        neighbor[0], neighbor[1])
            return neighbor[2]

        raise StopIteration("No more values available.")

    def _learn(self, exclude_double_configurations=True):
        """Fit the nearest datasets model (once) and query it.

        The fitted model is cached in ``self.kND``; later calls only run the
        query for the target dataset.
        """
        dataset_metafeatures, all_other_metafeatures = \
            self._get_metafeatures()

        if self.kND is None:
            # In case that we learn our distance function, get the
            # parameters for the random forest. They may be given as a
            # literal string or as an already parsed dictionary.
            if not self.distance_kwargs:
                rf_params = None
            elif isinstance(self.distance_kwargs, dict):
                rf_params = dict(self.distance_kwargs)
            else:
                rf_params = ast.literal_eval(self.distance_kwargs)

            # To keep the distance the same in every iteration, we create a
            # new random state
            random_state = sklearn.utils.check_random_state(self.seed)
            kND = KNearestDatasets(distance=self.distance,
                                   random_state=random_state,
                                   distance_kwargs=rf_params)

            runs = {task_id: self.meta_base.get_runs(task_id)
                    for task_id in all_other_metafeatures.index}

            kND.fit(all_other_metafeatures, runs)
            self.kND = kND

        return self.kND.kBestSuggestions(
            dataset_metafeatures, k=-1,
            exclude_double_configurations=exclude_double_configurations)

    def _get_metafeatures(self):
        """Load all metafeatures, apply ``use_features`` and split them.

        This is inside an extra function for testing purpose.

        Returns
        -------
        tuple
            The result of :meth:`_split_metafeature_array`: the metafeatures
            of ``self.dataset_name`` and those of all other datasets.

        Raises
        ------
        NotImplementedError
            If ``use_features`` is set but is neither a string, a list nor
            a numpy array.
        """
        # Load the task
        logger.info("Going to use the metafeature subset: %s", self.subset)
        all_metafeatures = self.meta_base.get_all_metafeatures()
        logger.info(" ".join(all_metafeatures.columns))

        # A plain truth test raises ValueError for numpy arrays with more
        # than one element, so arrays are checked by size instead.
        requested = self.use_features
        if isinstance(requested, np.ndarray):
            has_selection = requested.size > 0
        else:
            has_selection = bool(requested)

        if has_selection:
            if isinstance(requested, str):
                use_features = requested.split(",")
            elif isinstance(requested, (list, np.ndarray)):
                use_features = requested
            else:
                raise NotImplementedError(type(requested))

            keep = [col for col in all_metafeatures.columns
                    if col in use_features]

            if not keep:
                # None of the requested names is an existing column. Keep
                # the metafeatures as they are instead of selecting nothing.
                logger.info("You just tried to remove all metafeatures...")
            else:
                all_metafeatures = all_metafeatures.loc[:, keep]
                logger.info("Going to keep the following metafeatures:")
                logger.info(str(keep))

        return self._split_metafeature_array(self.dataset_name,
                                             all_metafeatures)

    def _split_metafeature_array(self, dataset_name, metafeatures):
        """Split the metafeature array into dataset metafeatures and all
        other.

        This is inside an extra function for testing purpose.

        Returns
        -------
        tuple
            A copy of the row labelled ``dataset_name`` and the remaining
            rows of ``metafeatures``.
        """
        # Label-based lookup; ``.ix`` is no longer available in pandas.
        dataset_metafeatures = metafeatures.loc[dataset_name].copy()
        metafeatures = metafeatures[metafeatures.index != dataset_name]
        return dataset_metafeatures, metafeatures

    def read_task_list(self, fh):
        """Read one dataset filename per line from ``fh``.

        Raises
        ------
        ValueError
            If a line is empty after removing the newline character.
        """
        dataset_filenames = list()
        for line in fh:
            line = line.replace("\n", "")
            if not line:
                raise ValueError("Blank lines in the task list are not "
                                 "supported.")
            dataset_filenames.append(line)
        return dataset_filenames

    def read_experiments_list(self, fh):
        """Return the whitespace-separated entries of every line in ``fh``.

        Each line yields one list; a blank line yields an empty list.
        """
        return [line.split() for line in fh]
args['metalearning_directory']) with open(args["task_files_list"]) as fh: task_files_list = fh.readlines() with open(args["experiments_list"]) as fh: experiments_list = fh.readlines() if 'keep_configurations' in args: keep_configurations = args['keep_configurations'] keep_configurations = keep_configurations.split(',') keep_configurations = tuple( [tuple(kc.split('=')) for kc in keep_configurations]) else: keep_configurations = None meta_base = MetaBase(task_files_list, experiments_list, keep_configurations) metafeatures = meta_base.get_all_train_metafeatures_as_pandas() runs = meta_base.get_all_runs() # This can print the best hyperparameters of every dataset # for dataset in runs: # print dataset, sorted(runs[dataset], key=lambda t: t.result)[0] rf = LearnedDistanceRF(**params) X, Y = rf._create_dataset(metafeatures, runs) import cPickle with open("test.pkl", "w") as fh: cPickle.dump((X, Y, metafeatures), fh, -1) print "Metafeatures", metafeatures.shape
with open(args.tasks) as fh: task_files_list = fh.readlines() # Load all the experiment run data only if needed if args.distance == 'runs': with open(args.runs) as fh: experiments_file_list = fh.readlines() else: experiments_file_list = StringIO.StringIO() for i in range(len(task_files_list)): experiments_file_list.write("\n") experiments_file_list.seek(0) pyMetaLearn.data_repositories.openml.apiconnector.set_local_directory( args.experiment_directory) meta_base = MetaBase(task_files_list, experiments_file_list) metafeatures = meta_base.get_all_metafeatures_as_pandas( metafeature_subset=args.subset) metafeature_times = meta_base.get_all_metafeatures_times_as_pandas( metafeature_subset=args.subset) #if args.subset: # metafeatures = metafeatures.loc[:,subsets[args.subset]] # metafeature_times = metafeature_times.loc[:,subsets[args.subset]] runs = meta_base.get_all_runs() general_plot_directory = os.path.join(args.experiment_directory, "plots") try: os.mkdir(general_plot_directory) except: