def test_read_and_write_pcs(self):
    """Round-trip the mini auto-sklearn search space through the PCS parser.

    The reference ``.pcs`` file is parsed, the resulting configuration space
    is written back to a string, and every non-empty, non-comment line of
    the reference file must be contained in that string. Finally a number
    of configurations are sampled from the parsed space.
    """
    configuration_space_path = os.path.join(
        os.path.dirname(os.path.abspath(HPOlibConfigSpace.__file__)),
        "..", "test", "test_searchspaces", "mini_autosklearn_original.pcs")

    with open(configuration_space_path) as fh:
        cs = pcs_parser.read(fh)
    pcs = pcs_parser.write(cs)

    with open(configuration_space_path) as fh:
        lines = fh.readlines()

    num_asserts = 0
    for line in lines:
        # Drop the line break, then everything after a comment marker.
        line = line.replace("\n", "").split("#")[0].strip()
        if line:
            num_asserts += 1
            self.assertIn(line, pcs)
    # The reference file is expected to hold exactly 21 effective lines.
    self.assertEqual(21, num_asserts)

    # Sample a little bit
    rs = RandomSampler(cs, 1)
    # print(x) with a single argument behaves identically on Python 2 and 3;
    # the bare ``print x`` statement is a syntax error on Python 3.
    print(cs)
    for _ in range(1000):
        rs.sample_configuration()
def test_read_and_write_pcs(self):
    """Check that writing a parsed PCS file reproduces its content lines.

    Parses ``mini_autosklearn_original.pcs``, serialises the configuration
    space again and asserts that each effective (non-comment, non-blank)
    line of the input is found in the serialised output. The parsed space
    is then sampled repeatedly as a smoke test.
    """
    package_dir = os.path.dirname(os.path.abspath(HPOlibConfigSpace.__file__))
    configuration_space_path = os.path.join(
        package_dir, "..", "test", "test_searchspaces",
        "mini_autosklearn_original.pcs")

    with open(configuration_space_path) as fh:
        cs = pcs_parser.read(fh)
    pcs = pcs_parser.write(cs)

    with open(configuration_space_path) as fh:
        lines = fh.readlines()

    num_asserts = 0
    for line in lines:
        line = line.replace("\n", "")
        line = line.split("#")[0]  # Remove comments
        line = line.strip()
        if not line:
            continue
        num_asserts += 1
        self.assertIn(line, pcs)
    # The reference file is expected to hold exactly 21 effective lines.
    self.assertEqual(21, num_asserts)

    # Sample a little bit
    rs = RandomSampler(cs, 1)
    # Function-call form works on both Python 2 and Python 3; the print
    # statement form does not parse on Python 3.
    print(cs)
    for _ in range(1000):
        rs.sample_configuration()
def test_write_log10(self):
    """A log-scaled uniform float is written with a trailing ``l`` flag."""
    space = ConfigurationSpace()
    log_float = UniformFloatHyperparameter("a", 10, 1000, log=True)
    space.add_hyperparameter(log_float)

    written = pcs_parser.write(space)

    self.assertEqual("a [10.0, 1000.0] [100.0]l", written)
def test_write_q_float(self):
    """A float with ``q=16`` is written under the name ``Q16_float_a``."""
    space = ConfigurationSpace()
    quantized = UniformFloatHyperparameter("float_a", 16, 1024, q=16)
    space.add_hyperparameter(quantized)

    written = pcs_parser.write(space)

    self.assertEqual("Q16_float_a [16.0, 1024.0] [520.0]", written)
def test_write_q_int(self):
    """An integer with ``q=16`` is written as ``Q16_int_a`` with ``i`` flag."""
    space = ConfigurationSpace()
    quantized = UniformIntegerHyperparameter("int_a", 16, 1024, q=16)
    space.add_hyperparameter(quantized)

    written = pcs_parser.write(space)

    self.assertEqual("Q16_int_a [16, 1024] [520]i", written)
def _create_search_space(tmp_dir, data_info, watcher, log_function):
    """Create the configuration space and store it as ``space.pcs``.

    The work is timed with ``watcher`` under the task name
    ``CreateConfigSpace``.

    :param tmp_dir: directory in which ``space.pcs`` is written.
    :param data_info: dataset information handed to
        ``paramsklearn.get_configuration_space``.
    :param watcher: object providing ``start_task`` / ``stop_task``.
    :param log_function: forwarded to ``_write_file_with_data``.
    :return: tuple ``(configuration_space, config_space_path)``.
    """
    task = 'CreateConfigSpace'
    watcher.start_task(task)

    pcs_path = os.path.join(tmp_dir, 'space.pcs')
    space = paramsklearn.get_configuration_space(data_info)
    serialized = pcs_parser.write(space)
    _write_file_with_data(
        pcs_path, serialized, 'Configuration space', log_function)

    watcher.stop_task(task)
    return space, pcs_path
def test_build_forbidden(self):
    """Forbidden ``in``-clause conjunctions are expanded when written.

    Two categorical hyperparameters are combined in a conjunction of two
    ``ForbiddenInClause`` objects over the values ``a`` and ``b``; the
    written string must contain the four resulting forbidden pairs.
    """
    choices = ["a", "b", "c"]
    hp_a = CategoricalHyperparameter("a", choices, "a")
    hp_b = CategoricalHyperparameter("b", choices, "c")

    space = ConfigurationSpace()
    for hyperparameter in (hp_a, hp_b):
        space.add_hyperparameter(hyperparameter)

    forbidden = ForbiddenAndConjunction(
        ForbiddenInClause(hp_a, ["a", "b"]),
        ForbiddenInClause(hp_b, ["a", "b"]))
    space.add_forbidden_clause(forbidden)

    expected = ("a {a, b, c} [a]\nb {a, b, c} [c]\n\n"
                "{a=a, b=a}\n{a=a, b=b}\n{a=b, b=a}\n{a=b, b=b}")
    self.assertIn(expected, pcs_parser.write(space))
def _create_search_space(
    tmp_dir, data_info, backend, watcher, logger,
    include_estimators=None, include_preprocessors=None
):
    """Create the configuration space and persist it through ``backend``.

    The work is timed with ``watcher`` under the task name
    ``CreateConfigSpace``.

    :param tmp_dir: directory in which ``space.pcs`` is placed.
    :param data_info: dataset information handed to
        ``paramsklearn.get_configuration_space``.
    :param backend: object whose ``write_txt_file`` stores the PCS string.
    :param watcher: object providing ``start_task`` / ``stop_task``.
    :param logger: accepted for interface compatibility; not used here.
    :param include_estimators: forwarded to ``get_configuration_space``.
    :param include_preprocessors: forwarded to ``get_configuration_space``.
    :return: tuple ``(configuration_space, configspace_path)``.
    """
    task = "CreateConfigSpace"
    watcher.start_task(task)

    pcs_path = os.path.join(tmp_dir, "space.pcs")
    space = paramsklearn.get_configuration_space(
        data_info,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors,
    )
    serialized = pcs_parser.write(space)
    backend.write_txt_file(pcs_path, serialized, "Configuration space")

    watcher.stop_task(task)
    return space, pcs_path
def _create_search_space(tmp_dir, data_info, backend, watcher, logger,
                         include_estimators=None,
                         include_preprocessors=None):
    """Create the pipeline configuration space and store it via ``backend``.

    The work is timed with ``watcher`` under the task name
    ``CreateConfigSpace``.

    :param tmp_dir: directory in which ``space.pcs`` is placed.
    :param data_info: dataset information handed to
        ``pipeline.get_configuration_space``.
    :param backend: object whose ``write_txt_file`` stores the PCS string.
    :param watcher: object providing ``start_task`` / ``stop_task``.
    :param logger: accepted for interface compatibility; not used here.
    :param include_estimators: forwarded to ``get_configuration_space``.
    :param include_preprocessors: forwarded to ``get_configuration_space``.
    :return: tuple ``(configuration_space, configspace_path)``.
    """
    task = 'CreateConfigSpace'
    watcher.start_task(task)

    pcs_path = os.path.join(tmp_dir, 'space.pcs')
    space = pipeline.get_configuration_space(
        data_info,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors)
    serialized = pcs_parser.write(space)
    backend.write_txt_file(pcs_path, serialized, 'Configuration space')

    watcher.stop_task(task)
    return space, pcs_path
def _fit(self, D):
    """Prepare all artefacts for a run and launch SMAC and the ensemble builder.

    Steps, in order: store the ensemble labels, calculate metafeatures
    (classification tasks only, and only if metalearning is enabled),
    one-hot encode the data, pickle the data manager, write the
    configuration space, determine initial configurations, create the
    instance file and finally start the SMAC and ensemble builder
    processes. Files shared through ``self.tmp_dir`` are written under a
    lock and only if they do not exist yet.

    :param D: data manager; this method reads ``D.info``, ``D.data``,
        ``D.feat_type`` and ``D.encoder_`` and calls
        ``D.perform1HotEncoding()``.
    :return: ``self``.
    :raises ValueError: if the ``AUTOSKLEARN_SEED`` environment variable is
        already set to a seed different from ``self.seed``.
    """
    # TODO: check that data and task definition fit together!

    self.metric_ = D.info['metric']
    self.task_ = D.info['task']
    self.target_num_ = D.info['target_num']

    # Set environment variable:
    seed = os.environ.get("AUTOSKLEARN_SEED")
    if seed is not None and int(seed) != self.seed:
        raise ValueError("It seems you have already started an instance "
                         "of AutoSklearn in this thread.")
    os.environ["AUTOSKLEARN_SEED"] = str(self.seed)

    # == Split dataset and store Data for the ensemble script
    X_train, X_ensemble, Y_train, Y_ensemble = split_data.split_data(
        D.data['X_train'], D.data['Y_train'])

    true_labels_ensemble_filename = os.path.join(
        self.tmp_dir, "true_labels_ensemble.npy")
    true_labels_ensemble_lock = true_labels_ensemble_filename + ".lock"
    with lockfile.LockFile(true_labels_ensemble_lock):
        if not os.path.exists(true_labels_ensemble_filename):
            np.save(true_labels_ensemble_filename, Y_ensemble)

    del X_train, X_ensemble, Y_train, Y_ensemble

    time_needed_to_load_data = self.stopwatch_.wall_elapsed(self.basename_)
    time_left_after_reading = max(
        0, self.time_left_for_this_task - time_needed_to_load_data)
    self.logger.info("Remaining time after reading %s %5.2f sec" %
                     (self.basename_, time_left_after_reading))

    # The "LoadData" task is not started in this method.
    self.stopwatch_.stop_task("LoadData")

    # == Calculate metafeatures
    self.stopwatch_.start_task("CalculateMetafeatures")
    categorical = [feat_type.lower() == "categorical"
                   for feat_type in D.feat_type]

    if self.initial_configurations_via_metalearning <= 0:
        ml = None
    elif D.info["task"] in \
            [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
        ml = metalearning.MetaLearning()
        self.logger.debug("Start calculating metafeatures for %s" %
                          self.basename_)
        ml.calculate_metafeatures_with_labels(D.data["X_train"],
                                              D.data["Y_train"],
                                              categorical=categorical,
                                              dataset_name=self.basename_)
    else:
        ml = None
        self.logger.critical("Metafeatures not calculated")
    self.stopwatch_.stop_task("CalculateMetafeatures")
    self.logger.debug(
        "Calculating Metafeatures (categorical attributes) took %5.2f" %
        self.stopwatch_.wall_elapsed("CalculateMetafeatures"))

    self.stopwatch_.start_task("OneHot")
    D.perform1HotEncoding()
    self.ohe_ = D.encoder_
    self.stopwatch_.stop_task("OneHot")

    # == Pickle the data manager
    self.stopwatch_.start_task("StoreDatamanager")
    data_manager_path = os.path.join(self.tmp_dir,
                                     self.basename_ + "_Manager.pkl")
    data_manager_lockfile = data_manager_path + ".lock"
    with lockfile.LockFile(data_manager_lockfile):
        if not os.path.exists(data_manager_path):
            # protocol=-1 selects a binary pickle format, so the file has
            # to be opened in binary mode; the context manager closes the
            # handle even if pickling fails.
            with open(data_manager_path, 'wb') as fh:
                pickle.dump(D, fh, protocol=-1)
            self.logger.debug("Pickled Datamanager at %s" %
                              data_manager_path)
        else:
            self.logger.debug("Data manager already presend at %s" %
                              data_manager_path)
    self.stopwatch_.stop_task("StoreDatamanager")

    # = Create a searchspace
    self.stopwatch_.start_task("CreateConfigSpace")
    configspace_path = os.path.join(self.tmp_dir, "space.pcs")
    self.configuration_space = paramsklearn.get_configuration_space(
        D.info)
    self.configuration_space_created_hook()
    sp_string = pcs_parser.write(self.configuration_space)
    configuration_space_lockfile = configspace_path + ".lock"
    with lockfile.LockFile(configuration_space_lockfile):
        if not os.path.exists(configspace_path):
            with open(configspace_path, "w") as fh:
                fh.write(sp_string)
            self.logger.debug("Configuration space written to %s" %
                              configspace_path)
        else:
            self.logger.debug("Configuration space already present at %s" %
                              configspace_path)
    self.stopwatch_.stop_task("CreateConfigSpace")

    if ml is None:
        initial_configurations = []
    elif D.info["task"] in \
            [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
        self.stopwatch_.start_task("CalculateMetafeaturesEncoded")
        # NOTE(review): the mask is sized by shape[0]; the per-feature mask
        # used for the first metafeature call suggests shape[1] may have
        # been intended -- confirm before changing.
        ml.calculate_metafeatures_encoded_labels(
            X_train=D.data["X_train"],
            Y_train=D.data["Y_train"],
            categorical=[False] * D.data["X_train"].shape[0],
            dataset_name=self.basename_)
        self.stopwatch_.stop_task("CalculateMetafeaturesEncoded")
        self.logger.debug(
            "Calculating Metafeatures (encoded attributes) took %5.2fsec" %
            self.stopwatch_.wall_elapsed("CalculateMetafeaturesEncoded"))

        self.logger.debug(ml._metafeatures_labels.__repr__(verbosity=2))
        self.logger.debug(
            ml._metafeatures_encoded_labels.__repr__(verbosity=2))

        self.stopwatch_.start_task("InitialConfigurations")
        try:
            initial_configurations = \
                ml.create_metalearning_string_for_smac_call(
                    self.configuration_space, self.basename_, self.metric_,
                    self.task_, bool(D.info['is_sparse'] == 1),
                    self.initial_configurations_via_metalearning,
                    self.metadata_directory)
        except Exception as e:
            # Metalearning is best effort: log the failure and continue
            # without initial configurations.
            import traceback

            self.logger.error(str(e))
            self.logger.error(traceback.format_exc())
            initial_configurations = []

        self.stopwatch_.stop_task("InitialConfigurations")

        self.logger.debug("Initial Configurations: (%d)",
                          len(initial_configurations))
        for initial_configuration in initial_configurations:
            self.logger.debug(initial_configuration)
        self.logger.debug(
            "Looking for initial configurations took %5.2fsec" %
            self.stopwatch_.wall_elapsed("InitialConfigurations"))
        self.logger.info(
            "Time left for %s after finding initial configurations: "
            "%5.2fsec" %
            (self.basename_, self.time_left_for_this_task -
             self.stopwatch_.wall_elapsed(self.basename_)))
    else:
        initial_configurations = []
        self.logger.critical("Metafeatures encoded not calculated")

    # == Set up a directory where all the trained models will be pickled to
    if self.keep_models:
        self.model_directory_ = os.path.join(self.tmp_dir,
                                             "models_%d" % self.seed)
        os.mkdir(self.model_directory_)
    # Created regardless of keep_models: the ensemble builder call below
    # always receives this directory.
    self.ensemble_indices_directory_ = os.path.join(
        self.tmp_dir, "ensemble_indices_%d" % self.seed)
    os.mkdir(self.ensemble_indices_directory_)

    # == RUN SMAC
    self.stopwatch_.start_task("runSmac")
    # = Create an empty instance file
    instance_file = os.path.join(self.tmp_dir, "instances.txt")
    instance_file_lock = instance_file + ".lock"
    with lockfile.LockFile(instance_file_lock):
        # Test for the instance file itself (not for the lock path), in
        # line with the other guarded writes in this method and with the
        # "already present" message below.
        if not os.path.exists(instance_file):
            with open(instance_file, "w") as fh:
                fh.write("holdout")
            self.logger.debug("Created instance file %s" % instance_file)
        else:
            self.logger.debug("Instance file already present at %s" %
                              instance_file)

    # = Start SMAC
    time_left_for_smac = max(0, self.time_left_for_this_task - (
        self.stopwatch_.wall_elapsed(self.basename_)))
    self.logger.debug("Start SMAC with %5.2fsec time left" %
                      time_left_for_smac)
    proc_smac, smac_call = \
        submit_process.run_smac(dataset_name=self.basename_,
                                dataset=data_manager_path,
                                tmp_dir=self.tmp_dir,
                                searchspace=configspace_path,
                                instance_file=instance_file,
                                limit=time_left_for_smac,
                                cutoff_time=self.per_run_time_limit,
                                initial_challengers=initial_configurations,
                                memory_limit=self.ml_memory_limit,
                                seed=self.seed)
    self.logger.debug(smac_call)
    self.stopwatch_.stop_task("runSmac")

    # == RUN ensemble builder
    self.stopwatch_.start_task("runEnsemble")
    time_left_for_ensembles = max(0, self.time_left_for_this_task - (
        self.stopwatch_.wall_elapsed(self.basename_)))
    self.logger.debug("Start Ensemble with %5.2fsec time left" %
                      time_left_for_ensembles)
    proc_ensembles = \
        submit_process.run_ensemble_builder(
            tmp_dir=self.tmp_dir,
            dataset_name=self.basename_,
            task_type=self.task_,
            metric=self.metric_,
            limit=time_left_for_ensembles,
            output_dir=self.output_dir,
            ensemble_size=self.ensemble_size,
            ensemble_nbest=self.ensemble_nbest,
            seed=self.seed,
            ensemble_indices_output_dir=self.ensemble_indices_directory_)
    self.stopwatch_.stop_task("runEnsemble")

    del D

    if self.queue is not None:
        # Hand the process handles to the consumer of the queue instead of
        # blocking here.
        self.queue.put([time_needed_to_load_data, data_manager_path,
                        proc_smac, proc_ensembles])
    else:
        proc_smac.wait()
        proc_ensembles.wait()

    # Delete AutoSklearn environment variable
    del os.environ["AUTOSKLEARN_SEED"]
    return self
def start_automl_on_dataset(basename, input_dir, tmp_dataset_dir, output_dir,
                            time_left_for_this_task, queue):
    """Load a dataset, prepare all run artefacts and start SMAC + ensembles.

    Writes the ensemble labels, the configuration space (``space.pcs``),
    the pickled data manager and the instance file into
    ``tmp_dataset_dir``, optionally computes metafeatures and initial
    configurations, and then starts the SMAC and ensemble builder
    processes. Nothing is returned; results are put on ``queue``.

    :param basename: dataset name.
    :param input_dir: directory the dataset is read from.
    :param tmp_dataset_dir: directory receiving all intermediate files.
    :param output_dir: forwarded to the ensemble builder.
    :param time_left_for_this_task: time budget; the time elapsed since the
        start of this function is subtracted before each process is started.
    :param queue: receives ``[time_needed_to_load_data, data_manager_path,
        proc_smac, proc_ensembles]``.
    """
    start = time.time()
    verbose = True
    # == Creating a data object with data and information about it
    vprint(verbose, "======== Reading and converting data ==========")
    # Encoding the labels will be done after the metafeature calculation!
    loaded_data_manager = data_manager.DataManager(basename, input_dir,
                                                   verbose=verbose,
                                                   encode_labels=False)
    # Function-call form works on both Python 2 and Python 3.
    print(loaded_data_manager)

    # == Split dataset and store Data for the ensemble script
    X_train, X_ensemble, Y_train, Y_ensemble = split_data.split_data(
        loaded_data_manager.data['X_train'],
        loaded_data_manager.data['Y_train'])
    np.save(os.path.join(tmp_dataset_dir, "true_labels_ensemble.npy"),
            Y_ensemble)
    del X_train, X_ensemble, Y_train, Y_ensemble

    stop = time.time()
    time_needed_to_load_data = stop - start
    time_left_after_reading = max(0, time_left_for_this_task -
                                  time_needed_to_load_data)
    vprint(verbose, "Remaining time after reading data %5.2f sec" %
           time_left_after_reading)

    # = Create a searchspace
    searchspace_path = os.path.join(tmp_dataset_dir, "space.pcs")
    config_space = autosklearn.get_configuration_space(
        loaded_data_manager.info)
    sp_string = pcs_parser.write(config_space)
    # Context manager closes the handle even if the write fails.
    with open(searchspace_path, 'w') as fh:
        fh.write(sp_string)

    # == Calculate metafeatures
    categorical = [feat_type.lower() == "categorical"
                   for feat_type in loaded_data_manager.feat_type]

    if loaded_data_manager.info["task"].lower() not in \
            ["multilabel.classification", "regression"] and \
            not loaded_data_manager.info["is_sparse"]:
        ml = metalearning.MetaLearning()
        metafeatures_start_time = time.time()
        vprint(verbose, "Start calculating metafeatures for %s" %
               loaded_data_manager.basename)
        ml.calculate_metafeatures_with_labels(
            loaded_data_manager.data["X_train"],
            loaded_data_manager.data["Y_train"],
            categorical=categorical,
            dataset_name=loaded_data_manager.basename)

    loaded_data_manager.perform1HotEncoding()

    # The condition is deliberately evaluated a second time rather than
    # cached: perform1HotEncoding() may update ``info`` -- TODO confirm.
    if loaded_data_manager.info["task"].lower() not in \
            ["multilabel.classification", "regression"] and \
            not loaded_data_manager.info["is_sparse"]:
        # NOTE(review): the mask is sized by shape[0]; the per-feature mask
        # used above suggests shape[1] may have been intended -- confirm
        # before changing.
        ml.calculate_metafeatures_encoded_labels(
            loaded_data_manager.data["X_train"],
            loaded_data_manager.data["Y_train"],
            categorical=[False] *
            loaded_data_manager.data["X_train"].shape[0],
            dataset_name=loaded_data_manager.basename)
        metafeatures_end_time = time.time()
        metafeature_calculation_time = \
            metafeatures_end_time - metafeatures_start_time
        vprint(verbose, "Done calculationg metafeatures for %s, took %5.2f "
                        "seconds." % (loaded_data_manager.basename,
                                      metafeature_calculation_time))
        time_left_after_metafeatures = max(
            0, time_left_for_this_task - (metafeatures_end_time - start))
        vprint(verbose,
               "Remaining time after calculating the metafeatures for %s "
               "%5.2f "
               "sec" % (loaded_data_manager.basename,
                        time_left_after_metafeatures))

        vprint(verbose, ml._metafeatures_labels)
        vprint(verbose, ml._metafeatures_encoded_labels)

        # TODO check that Metafeatures only contain finite numbers!

        vprint(verbose, "Starting to look for initial configurations for %s."
               % loaded_data_manager.basename)
        initial_configurations_start_time = time.time()
        initial_configurations = ml.create_metalearning_string_for_smac_call(
            config_space, loaded_data_manager.basename,
            loaded_data_manager.info['metric'])
        initial_configurations_end_time = time.time()
        vprint(verbose, "Calculating the initial configurations for %s took "
                        "%5.2f seconds" %
               (loaded_data_manager.basename,
                initial_configurations_end_time -
                initial_configurations_start_time))

        time_left_after_initial_configurations = max(
            0, time_left_for_this_task -
            (initial_configurations_end_time - start))
        vprint(verbose,
               "Remaining time after finding the initial configurations for "
               "%s "
               "%5.2f sec" % (loaded_data_manager.basename,
                              time_left_after_initial_configurations))
    else:
        initial_configurations = []

    # == Pickle the data manager
    data_manager_path = os.path.join(tmp_dataset_dir,
                                     basename + "_Manager.pkl")
    # protocol=-1 selects a binary pickle format, so the file has to be
    # opened in binary mode; the context manager also closes the handle.
    with open(data_manager_path, 'wb') as fh:
        cPickle.dump(loaded_data_manager, fh, protocol=-1)

    # == RUN SMAC
    # = Create an empty instance file
    instance_file = os.path.join(tmp_dataset_dir, "instances.txt")
    with open(instance_file, 'w') as fh:
        fh.write(os.path.join(input_dir, basename))

    # = Start SMAC
    stop = time.time()
    time_left_for_smac = max(0, time_left_for_this_task - (stop - start))
    proc_smac = \
        submit_process.run_smac(tmp_dir=tmp_dataset_dir,
                                searchspace=searchspace_path,
                                instance_file=instance_file,
                                limit=time_left_for_smac,
                                initial_challengers=initial_configurations)

    # == RUN ensemble builder
    stop = time.time()
    time_left_for_ensembles = max(0, time_left_for_this_task - (stop - start))
    proc_ensembles = \
        submit_process.run_ensemble_builder(
            tmp_dir=tmp_dataset_dir,
            dataset_name=basename,
            task_type=loaded_data_manager.info['task'],
            metric=loaded_data_manager.info['metric'],
            limit=time_left_for_ensembles,
            output_dir=output_dir)

    queue.put([time_needed_to_load_data, data_manager_path,
               proc_smac, proc_ensembles])
    return
# Writes one ``params.pcs`` file per dataset found in the input directory.
#
# Usage: <script> INPUT_DIR OUTPUT_DIR
import os
import sys

from AutoML2015.models.autosklearn import get_configuration_space
from AutoML2015.data.data_io import inventory_data
from AutoML2015.data.data_manager import DataManager
from HPOlibConfigSpace.converters import pcs_parser


def _ensure_dir(path):
    """Create ``path`` unless it already exists as a directory."""
    try:
        os.mkdir(path)
    except OSError:
        # An already existing directory is fine; anything else (missing
        # parent, permissions, ...) would make the write below fail anyway.
        if not os.path.isdir(path):
            raise


input_dir = sys.argv[1]
output_dir = sys.argv[2]

datanames = inventory_data(input_dir)

_ensure_dir(output_dir)

for basename in datanames:
    D = DataManager(basename, input_dir, verbose=True)
    cs = get_configuration_space(D.info)

    dataset_dir = os.path.join(output_dir, basename)
    _ensure_dir(dataset_dir)

    with open(os.path.join(dataset_dir, "params.pcs"), 'w') as fh:
        fh.write(pcs_parser.write(cs))
    # Emit a blank separator line; this form works on Python 2 and 3.
    print("")
def test_write_log_int(self):
    """A log-scaled integer is written with the ``il`` suffix.

    ``int_log_a`` is not defined in this test; presumably it is a
    hyperparameter fixture defined elsewhere in the module.
    """
    space = ConfigurationSpace()
    space.add_hyperparameter(int_log_a)

    written = pcs_parser.write(space)

    self.assertEqual("int_log_a [1, 6] [2]il", written)