import os
import unittest

import numpy as np
import pandas as pd

import autosklearn.pipeline.classification
# The MetaBase import path below is assumed from this repository's layout.
from autosklearn.metalearning.metalearning.meta_base import MetaBase


class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((134, 24), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('38_acc')
        # TODO update this ASAP
        self.assertEqual(24, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_as_pandas(self):
        mf = self.base.get_metafeatures('38_acc')
        self.assertTrue(np.isfinite(mf).all())
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.name, '38_acc')
        self.assertEqual(mf.loc['NumberOfInstances'], 2527.0)

    def test_get_all_metafeatures_as_pandas(self):
        mf = self.base.get_all_metafeatures()
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual((140, 46), mf.shape)
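# Usage sketch (illustrative, not part of the test suite): the tests above
# double as documentation of the MetaBase API -- runs and metafeatures come
# back as pandas objects indexed by task name. The directory and import
# paths here are assumptions taken from the test setUp above.
def _metabase_usage_sketch():
    data_dir = os.path.join(os.path.dirname(__file__), 'test_meta_base_data')
    cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
        .get_hyperparameter_search_space()
    base = MetaBase(cs, data_dir)

    all_runs = base.get_all_runs()        # pd.DataFrame: tasks x configurations
    runs = base.get_runs('38_acc')        # pd.Series of results for one task
    mf = base.get_metafeatures('38_acc')  # pd.Series, mf.name == '38_acc'
    return all_runs, runs, mf.loc['NumberOfInstances']  # 2527.0 in test data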
import ast
import logging
from collections import OrderedDict

import numpy as np
import pandas as pd
import sklearn.utils

# Project-internal imports; the exact module paths below are assumed.
from autosklearn.metalearning.metalearning.meta_base import MetaBase, Run
from autosklearn.metalearning.metalearning.kNearestDatasets.kND import \
    KNearestDatasets

# `test_function`, the default target algorithm below, is expected to be
# defined elsewhere in this package.


class MetaLearningOptimizer(object):
    def __init__(self, dataset_name, configuration_space, aslib_directory,
                 distance='l1', seed=None, use_features='',
                 distance_kwargs=None, subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset for which hyperparameters are suggested.

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        aslib_directory : str
            Path to an ASlib directory.

        distance : str, "l1", "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed : int, optional
            Seed for the random state of the distance function.

        use_features : str or list, optional
            Comma-separated string (or list) of metafeature names to keep;
            an empty value keeps all metafeatures.

        distance_kwargs : str, optional
            String representation of a dictionary of keyword arguments for
            the distance function (parsed with ``ast.literal_eval``).

        subset : str
            Name of the metafeature subset to use.
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.aslib_dir = aslib_directory
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None  # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, self.aslib_dir)
        self.logger = logging.getLogger(__name__)

    def perform_sequential_optimization(self, target_algorithm=test_function,
                                        time_budget=None,
                                        evaluation_budget=None):
        raise NotImplementedError("Right now this is not implemented due to "
                                  "timing issues.")

        # Unreachable until the NotImplementedError above is removed; kept
        # as a reference implementation.
        time_taken = 0
        num_evaluations = 0
        history = []

        self.logger.info("Taking distance measure %s" % self.distance)
        while True:
            if time_budget is not None and time_taken >= time_budget:
                self.logger.info("Reached time budget. Exiting optimization.")
                break
            if evaluation_budget is not None and \
                    num_evaluations >= evaluation_budget:
                self.logger.info("Reached maximum number of evaluations. "
                                 "Exiting optimization.")
                break

            params = self.metalearning_suggest(history)
            fixed_params = OrderedDict()
            # Hack to remove all leading "-" from the params, which are
            # accidentally in the experiment pickle of the current HPOlib
            # version
            for key in params:
                if key[0] == "-":
                    fixed_params[key[1:]] = params[key]
                else:
                    fixed_params[key] = params[key]

            self.logger.info("%d/%d, parameters: %s" %
                             (num_evaluations, evaluation_budget,
                              str(fixed_params)))
            result = target_algorithm(fixed_params)
            history.append(Run(params, result))
            num_evaluations += 1

        return min([run.result for run in history])

    def metalearning_suggest_all(self, exclude_double_configurations=True):
        """Return a list of the best hyperparameters of neighboring
        datasets."""
        # TODO check if _learn was called before!
        neighbors = self._learn(exclude_double_configurations)
        hp_list = []
        for neighbor in neighbors:
            try:
                configuration = \
                    self.meta_base.get_configuration_from_algorithm_index(
                        neighbor[2])
                self.logger.info("%s %s %s" % (neighbor[0], neighbor[1],
                                               configuration))
            except KeyError:
                self.logger.warning("Configuration %s not found" %
                                    neighbor[2])
                continue
            hp_list.append(configuration)
        return hp_list

    def metalearning_suggest(self, history):
        """Suggest the next most promising hyperparameters which were not
        yet evaluated."""
        # TODO test the object in the history!
        neighbors = self._learn()

        # Iterate over all datasets, which are sorted in ascending order of
        # distance
        history_with_indices = []
        for run in history:
            history_with_indices.append(
                self.meta_base.get_algorithm_index_from_configuration(run))

        for idx, neighbor in enumerate(neighbors):
            already_evaluated = False
            # Check if that configuration was already evaluated
            for run in history_with_indices:
                # If so, continue with the next neighbor
                if neighbor[2] == run:
                    already_evaluated = True
                    break

            if not already_evaluated:
                self.logger.info("Nearest dataset whose best hyperparameters "
                                 "were not evaluated yet is %s with a "
                                 "distance of %f" %
                                 (neighbor[0], neighbor[1]))
                return self.meta_base.get_configuration_from_algorithm_index(
                    neighbor[2])
        raise StopIteration("No more values available.")

    def _learn(self, exclude_double_configurations=True):
        dataset_metafeatures, all_other_metafeatures = \
            self._get_metafeatures()

        # Remove metafeatures which could not be calculated for the target
        # dataset
        keep = []
        for idx in dataset_metafeatures.index:
            if np.isfinite(dataset_metafeatures.loc[idx]):
                keep.append(idx)

        dataset_metafeatures = dataset_metafeatures.loc[keep]
        all_other_metafeatures = all_other_metafeatures.loc[:, keep]

        # Do mean imputation of all other metafeatures
        all_other_metafeatures = all_other_metafeatures.fillna(
            all_other_metafeatures.mean())

        if self.kND is None:
            # In case we learn our distance function, get the parameters for
            # the random forest
            if self.distance_kwargs:
                rf_params = ast.literal_eval(self.distance_kwargs)
            else:
                rf_params = None

            # To keep the distance the same in every iteration, we create a
            # new random state
            random_state = sklearn.utils.check_random_state(self.seed)
            kND = KNearestDatasets(metric=self.distance,
                                   random_state=random_state,
                                   metric_params=rf_params)

            runs = dict()
            # TODO move this code to the metabase
            for task_id in all_other_metafeatures.index:
                try:
                    runs[task_id] = self.meta_base.get_runs(task_id)
                except KeyError:
                    # TODO should I really except this?
                    self.logger.warning("Could not find runs for instance %s"
                                        % task_id)
                    # Explicit dtype avoids an object-dtyped empty series
                    runs[task_id] = pd.Series([], name=task_id, dtype=float)
            runs = pd.DataFrame(runs)

            kND.fit(all_other_metafeatures, runs)
            self.kND = kND
        return self.kND.kBestSuggestions(
            dataset_metafeatures, k=-1,
            exclude_double_configurations=exclude_double_configurations)

    def _get_metafeatures(self):
        """This is inside an extra function for testing purposes."""
        # Load the task
        self.logger.info("Going to use the metafeature subset: %s",
                         self.subset)
        all_metafeatures = self.meta_base.get_all_metafeatures()
        self.logger.info(" ".join(all_metafeatures.columns))

        # TODO: buggy and hacky, replace with a list separated by commas
        if self.use_features and \
                (not isinstance(self.use_features, str) or
                 self.use_features != ''):
            if isinstance(self.use_features, str):
                use_features = self.use_features.split(",")
            elif isinstance(self.use_features, (list, np.ndarray)):
                use_features = self.use_features
            else:
                raise NotImplementedError(type(self.use_features))

            if len(use_features) == 0:
                self.logger.info("You just tried to remove all "
                                 "metafeatures...")
            else:
                keep = [col for col in all_metafeatures.columns
                        if col in use_features]
                if len(keep) == 0:
                    self.logger.info("You just tried to remove all "
                                     "metafeatures...")
                else:
                    all_metafeatures = all_metafeatures.loc[:, keep]
                    self.logger.info("Going to keep the following "
                                     "metafeatures:")
                    self.logger.info(str(keep))

        return self._split_metafeature_array(self.dataset_name,
                                             all_metafeatures)

    def _split_metafeature_array(self, dataset_name, metafeatures):
        """Split the metafeature array into dataset metafeatures and all
        others.

        This is inside an extra function for testing purposes.
        """
        dataset_metafeatures = metafeatures.loc[dataset_name].copy()
        metafeatures = metafeatures[metafeatures.index != dataset_name]
        return dataset_metafeatures, metafeatures

    def read_task_list(self, fh):
        dataset_filenames = list()
        for line in fh:
            line = line.replace("\n", "")
            if line:
                dataset_filenames.append(line)
            else:
                raise ValueError("Blank lines in the task list are not "
                                 "supported.")
        return dataset_filenames

    def read_experiments_list(self, fh):
        experiments_list = list()
        for line in fh.readlines():
            experiments_list.append(line.split())
        return experiments_list
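# Usage sketch (illustrative): how the optimizer above is typically driven.
# The dataset name, ASlib directory, and configuration-space source are
# placeholder assumptions; only the MetaLearningOptimizer calls themselves
# come from the class above.
def _metalearning_optimizer_sketch():
    import autosklearn.pipeline.classification

    cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
        .get_hyperparameter_search_space()
    optimizer = MetaLearningOptimizer(
        dataset_name='38_acc',                  # task to warm-start (placeholder)
        configuration_space=cs,
        aslib_directory='test_meta_base_data',  # placeholder path
        distance='l1',
        seed=1)

    # All configurations that worked best on neighboring datasets, sorted
    # by ascending distance to the target dataset:
    suggestions = optimizer.metalearning_suggest_all()

    # Or one suggestion at a time, skipping configurations already tried:
    first = optimizer.metalearning_suggest(history=[])
    return suggestions, first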