Example #1
    def test_read_and_write_pcs(self):
        configuration_space_path = os.path.abspath(HPOlibConfigSpace.__file__)
        configuration_space_path = os.path.dirname(configuration_space_path)
        configuration_space_path = os.path.join(
            configuration_space_path, "..", "test", "test_searchspaces",
            "mini_autosklearn_original.pcs")

        with open(configuration_space_path) as fh:
            cs = pcs_parser.read(fh)

        pcs = pcs_parser.write(cs)

        with open(configuration_space_path) as fh:
            lines = fh.readlines()

        num_asserts = 0
        for line in lines:
            line = line.replace("\n", "")
            line = line.split("#")[0]  # Remove comments
            line = line.strip()

            if line:
                num_asserts += 1
                self.assertIn(line, pcs)

        self.assertEqual(21, num_asserts)

        # Sample a little bit
        rs = RandomSampler(cs, 1)
        print(cs)
        for i in range(1000):
            c = rs.sample_configuration()
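The test above exercises the basic round trip: pcs_parser.read turns a .pcs file into a ConfigurationSpace, and pcs_parser.write serializes it back to pcs text. A minimal sketch of that round trip, assuming a search space file at the hypothetical path "my_space.pcs":

from HPOlibConfigSpace.converters import pcs_parser

with open("my_space.pcs") as fh:       # hypothetical input file
    cs = pcs_parser.read(fh)           # .pcs text -> ConfigurationSpace

pcs_string = pcs_parser.write(cs)      # ConfigurationSpace -> .pcs text
print(pcs_string)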
Example #3
 def test_write_log10(self):
     expected = "a [10.0, 1000.0] [100.0]l"
     cs = ConfigurationSpace()
     cs.add_hyperparameter(
         UniformFloatHyperparameter("a", 10, 1000, log=True))
     value = pcs_parser.write(cs)
     self.assertEqual(expected, value)
Example #4
 def test_write_q_float(self):
     expected = "Q16_float_a [16.0, 1024.0] [520.0]"
     cs = ConfigurationSpace()
     cs.add_hyperparameter(
         UniformFloatHyperparameter("float_a", 16, 1024, q=16))
     value = pcs_parser.write(cs)
     self.assertEqual(expected, value)
Example #5
 def test_write_q_int(self):
     expected = "Q16_int_a [16, 1024] [520]i"
     cs = ConfigurationSpace()
     cs.add_hyperparameter(
         UniformIntegerHyperparameter("int_a", 16, 1024, q=16))
     value = pcs_parser.write(cs)
     self.assertEqual(expected, value)
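The expected strings in the three tests above show the writer's conventions: a trailing "l" marks a log-scale parameter, a trailing "i" marks an integer, and a quantized parameter gets a "Q<q>_" prefix on its name. A quick sketch to inspect a combined case (the parameter name and import paths are assumptions, and q and log may not be combinable in every version):

from HPOlibConfigSpace.configuration_space import ConfigurationSpace
from HPOlibConfigSpace.converters import pcs_parser
from HPOlibConfigSpace.hyperparameters import UniformIntegerHyperparameter

cs = ConfigurationSpace()
cs.add_hyperparameter(
    UniformIntegerHyperparameter("int_a", 16, 1024, q=16, log=True))
print(pcs_parser.write(cs))  # expect a "Q16_" prefix and an "il" suffix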
Example #6
def _create_search_space(tmp_dir, data_info, watcher, log_function):
    task_name = 'CreateConfigSpace'
    watcher.start_task(task_name)
    config_space_path = os.path.join(tmp_dir, 'space.pcs')
    configuration_space = paramsklearn.get_configuration_space(
        data_info)
    sp_string = pcs_parser.write(configuration_space)
    _write_file_with_data(config_space_path, sp_string,
                          'Configuration space', log_function)
    watcher.stop_task(task_name)

    return configuration_space, config_space_path
Example #7
 def test_build_forbidden(self):
     expected = "a {a, b, c} [a]\nb {a, b, c} [c]\n\n" \
                "{a=a, b=a}\n{a=a, b=b}\n{a=b, b=a}\n{a=b, b=b}"
     cs = ConfigurationSpace()
     a = CategoricalHyperparameter("a", ["a", "b", "c"], "a")
     b = CategoricalHyperparameter("b", ["a", "b", "c"], "c")
     cs.add_hyperparameter(a)
     cs.add_hyperparameter(b)
     fb = ForbiddenAndConjunction(ForbiddenInClause(a, ["a", "b"]),
                                  ForbiddenInClause(b, ["a", "b"]))
     cs.add_forbidden_clause(fb)
     value = pcs_parser.write(cs)
     self.assertIn(expected, value)
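Forbidden combinations are written as "{name=value, ...}" lines after the parameter definitions; the ForbiddenAndConjunction of two two-value ForbiddenInClauses above expands into the four cross-product lines in expected. A single-clause sketch under the same assumptions (ForbiddenEqualsClause is assumed to serialize analogously):

from HPOlibConfigSpace.configuration_space import ConfigurationSpace
from HPOlibConfigSpace.converters import pcs_parser
from HPOlibConfigSpace.forbidden import ForbiddenEqualsClause
from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
a = CategoricalHyperparameter("a", ["a", "b"], "a")
cs.add_hyperparameter(a)
cs.add_forbidden_clause(ForbiddenEqualsClause(a, "b"))
print(pcs_parser.write(cs))  # expect a "{a=b}" line after the definition of a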
Example #8
def _create_search_space(
    tmp_dir, data_info, backend, watcher, logger, include_estimators=None, include_preprocessors=None
):
    task_name = "CreateConfigSpace"
    watcher.start_task(task_name)
    configspace_path = os.path.join(tmp_dir, "space.pcs")
    configuration_space = paramsklearn.get_configuration_space(
        data_info, include_estimators=include_estimators, include_preprocessors=include_preprocessors
    )
    sp_string = pcs_parser.write(configuration_space)
    backend.write_txt_file(configspace_path, sp_string, "Configuration space")
    watcher.stop_task(task_name)

    return configuration_space, configspace_path
Example #9
def _create_search_space(tmp_dir, data_info, backend, watcher, logger,
                         include_estimators=None, include_preprocessors=None):
    task_name = 'CreateConfigSpace'
    watcher.start_task(task_name)
    configspace_path = os.path.join(tmp_dir, 'space.pcs')
    configuration_space = pipeline.get_configuration_space(
        data_info,
        include_estimators=include_estimators,
        include_preprocessors=include_preprocessors)
    sp_string = pcs_parser.write(configuration_space)
    backend.write_txt_file(configspace_path, sp_string,
                           'Configuration space')
    watcher.stop_task(task_name)

    return configuration_space, configspace_path
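Examples #6, #8 and #9 are three revisions of the same helper: build a ConfigurationSpace for the dataset, serialize it with pcs_parser.write, and persist it as space.pcs for SMAC. Only the config-space factory (paramsklearn vs. pipeline) and the persistence mechanism (a log function vs. a backend object) change between them. Stripped of the timing and logging plumbing, the core is roughly this sketch (get_configuration_space stands in for whichever factory is used):

import os
from HPOlibConfigSpace.converters import pcs_parser

def create_search_space(tmp_dir, data_info, get_configuration_space):
    configuration_space = get_configuration_space(data_info)
    configspace_path = os.path.join(tmp_dir, "space.pcs")
    with open(configspace_path, "w") as fh:
        fh.write(pcs_parser.write(configuration_space))
    return configuration_space, configspace_path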
Example #10
    def _fit(self, D):
        # TODO: check that data and task definition fit together!

        self.metric_ = D.info['metric']
        self.task_ = D.info['task']
        self.target_num_ = D.info['target_num']

        # Set environment variable:
        seed = os.environ.get("AUTOSKLEARN_SEED")
        if seed is not None and int(seed) != self.seed:
            raise ValueError("It seems you have already started an instance "
                             "of AutoSklearn in this thread.")
        else:
            os.environ["AUTOSKLEARN_SEED"] = str(self.seed)

        # == Split dataset and store Data for the ensemble script
        X_train, X_ensemble, Y_train, Y_ensemble = split_data.split_data(
            D.data['X_train'], D.data['Y_train'])

        true_labels_ensemble_filename = os.path.join(self.tmp_dir,
                                                     "true_labels_ensemble.npy")
        true_labels_ensemble_lock = true_labels_ensemble_filename + ".lock"
        with lockfile.LockFile(true_labels_ensemble_lock):
            if not os.path.exists(true_labels_ensemble_filename):
                np.save(true_labels_ensemble_filename, Y_ensemble)

        del X_train, X_ensemble, Y_train, Y_ensemble

        time_needed_to_load_data = self.stopwatch_.wall_elapsed(self.basename_)
        time_left_after_reading = max(0, self.time_left_for_this_task -
                                      time_needed_to_load_data)
        self.logger.info("Remaining time after reading %s %5.2f sec" %
                    (self.basename_, time_left_after_reading))

        self.stopwatch_.stop_task("LoadData")

        # == Calculate metafeatures
        self.stopwatch_.start_task("CalculateMetafeatures")
        categorical = [feat_type.lower() == "categorical"
                       for feat_type in D.feat_type]

        if self.initial_configurations_via_metalearning <= 0:
            ml = None
        elif D.info["task"] in \
                [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
            ml = metalearning.MetaLearning()
            self.logger.debug("Start calculating metafeatures for %s" %
                              self.basename_)
            ml.calculate_metafeatures_with_labels(D.data["X_train"],
                                                  D.data["Y_train"],
                                                  categorical=categorical,
                                                  dataset_name=self.basename_)
        else:
            ml = None
            self.logger.critical("Metafeatures not calculated")
        self.stopwatch_.stop_task("CalculateMetafeatures")
        self.logger.debug("Calculating Metafeatures (categorical attributes) took %5.2f" % self.stopwatch_.wall_elapsed("CalculateMetafeatures"))

        self.stopwatch_.start_task("OneHot")
        D.perform1HotEncoding()
        self.ohe_ = D.encoder_
        self.stopwatch_.stop_task("OneHot")

        # == Pickle the data manager
        self.stopwatch_.start_task("StoreDatamanager")
        data_manager_path = os.path.join(self.tmp_dir,
                                         self.basename_ + "_Manager.pkl")
        data_manager_lockfile = data_manager_path + ".lock"
        with lockfile.LockFile(data_manager_lockfile):
            if not os.path.exists(data_manager_path):
                with open(data_manager_path, 'wb') as fh:
                    pickle.dump(D, fh, protocol=-1)
                self.logger.debug("Pickled Datamanager at %s" %
                                  data_manager_path)
            else:
                self.logger.debug("Data manager already presend at %s" %
                                  data_manager_path)
        self.stopwatch_.stop_task("StoreDatamanager")

        # = Create a searchspace
        self.stopwatch_.start_task("CreateConfigSpace")
        configspace_path = os.path.join(self.tmp_dir, "space.pcs")
        self.configuration_space = paramsklearn.get_configuration_space(
            D.info)

        self.configuration_space_created_hook()

        sp_string = pcs_parser.write(self.configuration_space)
        configuration_space_lockfile = configspace_path + ".lock"
        with lockfile.LockFile(configuration_space_lockfile):
            if not os.path.exists(configspace_path):
                with open(configspace_path, "w") as fh:
                    fh.write(sp_string)
                self.logger.debug("Configuration space written to %s" %
                                  configspace_path)
            else:
                self.logger.debug("Configuration space already present at %s" %
                                  configspace_path)
        self.stopwatch_.stop_task("CreateConfigSpace")

        if ml is None:
            initial_configurations = []
        elif D.info["task"]in \
                [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION]:
            self.stopwatch_.start_task("CalculateMetafeaturesEncoded")
            ml.calculate_metafeatures_encoded_labels(X_train=D.data["X_train"],
                                                     Y_train=D.data["Y_train"],
                                                     categorical=[False] * D.data["X_train"].shape[0],
                                                     dataset_name=self.basename_)
            self.stopwatch_.stop_task("CalculateMetafeaturesEncoded")
            self.logger.debug(
                "Calculating Metafeatures (encoded attributes) took %5.2fsec" %
                self.stopwatch_.wall_elapsed("CalculateMetafeaturesEncoded"))

            self.logger.debug(ml._metafeatures_labels.__repr__(verbosity=2))
            self.logger.debug(ml._metafeatures_encoded_labels.__repr__(verbosity=2))

            self.stopwatch_.start_task("InitialConfigurations")
            try:
                initial_configurations = ml.create_metalearning_string_for_smac_call(
                    self.configuration_space, self.basename_, self.metric_,
                    self.task_, D.info['is_sparse'] == 1,
                    self.initial_configurations_via_metalearning, self.metadata_directory)
            except Exception as e:
                import traceback

                self.logger.error(str(e))
                self.logger.error(traceback.format_exc())
                initial_configurations = []

            self.stopwatch_.stop_task("InitialConfigurations")

            self.logger.debug("Initial Configurations: (%d)", len(initial_configurations))
            for initial_configuration in initial_configurations:
                self.logger.debug(initial_configuration)
            self.logger.debug("Looking for initial configurations took %5.2fsec" %
                              self.stopwatch_.wall_elapsed("InitialConfigurations"))
            self.logger.info(
                "Time left for %s after finding initial configurations: %5.2fsec" %
                (self.basename_, self.time_left_for_this_task -
                 self.stopwatch_.wall_elapsed(self.basename_)))
        else:
            initial_configurations = []
            self.logger.critical("Metafeatures encoded not calculated")

        # == Set up a directory where all the trained models will be pickled to
        if self.keep_models:
            self.model_directory_ = os.path.join(self.tmp_dir,
                                                 "models_%d" % self.seed)
            os.mkdir(self.model_directory_)
        self.ensemble_indices_directory_ = os.path.join(self.tmp_dir,
                                                        "ensemble_indices_%d" % self.seed)
        os.mkdir(self.ensemble_indices_directory_)

        # == RUN SMAC
        self.stopwatch_.start_task("runSmac")
        # = Create an empty instance file
        instance_file = os.path.join(self.tmp_dir, "instances.txt")
        instance_file_lock = instance_file + ".lock"
        with lockfile.LockFile(instance_file_lock):
            if not os.path.exists(instance_file):
                with open(instance_file, "w") as fh:
                    fh.write("holdout")
                self.logger.debug("Created instance file %s" % instance_file)
            else:
                self.logger.debug("Instance file already present at %s" % instance_file)

        # = Start SMAC
        time_left_for_smac = max(0, self.time_left_for_this_task - (
            self.stopwatch_.wall_elapsed(self.basename_)))
        self.logger.debug("Start SMAC with %5.2fsec time left" % time_left_for_smac)
        proc_smac, smac_call = \
            submit_process.run_smac(dataset_name=self.basename_,
                                    dataset=data_manager_path,
                                    tmp_dir=self.tmp_dir,
                                    searchspace=configspace_path,
                                    instance_file=instance_file,
                                    limit=time_left_for_smac,
                                    cutoff_time=self.per_run_time_limit,
                                    initial_challengers=initial_configurations,
                                    memory_limit=self.ml_memory_limit,
                                    seed=self.seed)
        self.logger.debug(smac_call)
        self.stopwatch_.stop_task("runSmac")

        # == RUN ensemble builder
        self.stopwatch_.start_task("runEnsemble")
        time_left_for_ensembles = max(0, self.time_left_for_this_task - (
            self.stopwatch_.wall_elapsed(self.basename_)))
        self.logger.debug("Start Ensemble with %5.2fsec time left" % time_left_for_ensembles)
        proc_ensembles = \
            submit_process.run_ensemble_builder(tmp_dir=self.tmp_dir,
                                                dataset_name=self.basename_,
                                                task_type=self.task_,
                                                metric=self.metric_,
                                                limit=time_left_for_ensembles,
                                                output_dir=self.output_dir,
                                                ensemble_size=self.ensemble_size,
                                                ensemble_nbest=self.ensemble_nbest,
                                                seed=self.seed,
                                                ensemble_indices_output_dir=self.ensemble_indices_directory_)
        self.stopwatch_.stop_task("runEnsemble")

        del D

        if self.queue is not None:
            self.queue.put([time_needed_to_load_data, data_manager_path,
                            proc_smac, proc_ensembles])
        else:
            proc_smac.wait()
            proc_ensembles.wait()

        # Delete AutoSklearn environment variable
        del os.environ["AUTOSKLEARN_SEED"]
        return self
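_fit repeats one concurrency idiom several times: take a ".lock" file next to the target path, then write the target only if it does not already exist, so that parallel auto-sklearn instances sharing a tmp_dir do each piece of work once. A minimal sketch of that idiom using the lockfile package, as in the code above (write_once is a made-up name):

import os
import lockfile

def write_once(path, content, logger):
    # Serialize writers on a sibling .lock file, then write only if absent.
    with lockfile.LockFile(path + ".lock"):
        if not os.path.exists(path):
            with open(path, "w") as fh:
                fh.write(content)
            logger.debug("Created %s" % path)
        else:
            logger.debug("%s already present" % path)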
Example #11
def start_automl_on_dataset(basename, input_dir, tmp_dataset_dir, output_dir,
                            time_left_for_this_task, queue):
    start = time.time()
    verbose = True
    # == Creating a data object with data and information about it
    vprint(verbose,  "======== Reading and converting data ==========")
    # Encoding the labels will be done after the metafeature calculation!
    loaded_data_manager = data_manager.DataManager(basename, input_dir,
                                                   verbose=verbose,
                                                   encode_labels=False)
    print(loaded_data_manager)

    # == Split dataset and store Data for the ensemble script
    X_train, X_ensemble, Y_train, Y_ensemble = split_data.split_data(
        loaded_data_manager.data['X_train'], loaded_data_manager.data['Y_train'])
    np.save(os.path.join(tmp_dataset_dir, "true_labels_ensemble.npy"), Y_ensemble)
    del X_train, X_ensemble, Y_train, Y_ensemble

    stop = time.time()
    time_needed_to_load_data = stop - start
    time_left_after_reading = max(0, time_left_for_this_task -
                                  time_needed_to_load_data)
    vprint(verbose, "Remaining time after reading data %5.2f sec" % time_left_after_reading)

    # = Create a searchspace
    searchspace_path = os.path.join(tmp_dataset_dir, "space.pcs")
    config_space = autosklearn.get_configuration_space(loaded_data_manager.info)
    sp_string = pcs_parser.write(config_space)
    with open(searchspace_path, 'w') as fh:
        fh.write(sp_string)

    # == Calculate metafeatures
    categorical = [feat_type.lower() == "categorical"
                   for feat_type in loaded_data_manager.feat_type]

    if loaded_data_manager.info["task"].lower() not in \
            ["multilabel.classification", "regression"] and \
            not loaded_data_manager.info["is_sparse"]:
        ml = metalearning.MetaLearning()
        metafeatures_start_time = time.time()
        vprint(verbose, "Start calculating metafeatures for %s" %
               loaded_data_manager.basename)
        ml.calculate_metafeatures_with_labels(loaded_data_manager.data["X_train"],
                                              loaded_data_manager.data["Y_train"],
                                              categorical=categorical,
                                              dataset_name=loaded_data_manager.basename)

    loaded_data_manager.perform1HotEncoding()

    if loaded_data_manager.info["task"].lower() not in \
            ["multilabel.classification", "regression"] and \
            not loaded_data_manager.info["is_sparse"]:
        ml.calculate_metafeatures_encoded_labels(loaded_data_manager.data["X_train"],
                                                 loaded_data_manager.data["Y_train"],
            categorical=[False]*loaded_data_manager.data["X_train"].shape[0],
            dataset_name=loaded_data_manager.basename)
        metafeatures_end_time = time.time()
        metafeature_calculation_time = metafeatures_end_time - metafeatures_start_time
        vprint(verbose, "Done calculationg metafeatures for %s, took %5.2f "
                        "seconds." % (loaded_data_manager.basename,
                                      metafeature_calculation_time))
        time_left_after_metafeatures = max(0, time_left_for_this_task -
                                           (metafeatures_end_time - start))
        vprint(verbose,
               "Remaining time after calculating the metafeatures for %s %5.2f "
               "sec" % (loaded_data_manager.basename, time_left_after_metafeatures))

        vprint(verbose, ml._metafeatures_labels)
        vprint(verbose, ml._metafeatures_encoded_labels)

        # TODO check that Metafeatures only contain finite numbers!

        vprint(verbose, "Starting to look for initial configurations for %s." % loaded_data_manager.basename)
        initial_configurations_start_time = time.time()
        initial_configurations = ml.create_metalearning_string_for_smac_call(
            config_space, loaded_data_manager.basename, loaded_data_manager.info[
                'metric'])
        initial_configurations_end_time = time.time()
        vprint(verbose, "Calculating the initial configurations for %s took "
                        "%5.2f seconds" % (loaded_data_manager.basename,
                                           initial_configurations_end_time -
                                           initial_configurations_start_time))

        time_left_after_initial_configurations = max(0,
            time_left_for_this_task - (initial_configurations_end_time - start))

        vprint(verbose,
               "Remaining time after finding the initial configurations for %s "
               "%5.2f sec" % (loaded_data_manager.basename,
                              time_left_after_initial_configurations))

    else:
        initial_configurations = []

    # == Pickle the data manager
    data_manager_path = os.path.join(tmp_dataset_dir, basename + "_Manager.pkl")
    with open(data_manager_path, 'wb') as fh:
        cPickle.dump(loaded_data_manager, fh, protocol=-1)

    # == RUN SMAC
    # = Create an empty instance file
    instance_file = os.path.join(tmp_dataset_dir, "instances.txt")
    with open(instance_file, 'w') as fh:
        fh.write(os.path.join(input_dir, basename))

    # = Start SMAC
    stop = time.time()
    time_left_for_smac = max(0, time_left_for_this_task - (stop - start))
    proc_smac = \
        submit_process.run_smac(tmp_dir=tmp_dataset_dir,
                                searchspace=searchspace_path,
                                instance_file=instance_file,
                                limit=time_left_for_smac,
                                initial_challengers=initial_configurations)


    # == RUN ensemble builder
    stop = time.time()
    time_left_for_ensembles = max(0, time_left_for_this_task - (stop - start))
    proc_ensembles = \
        submit_process.run_ensemble_builder(tmp_dir=tmp_dataset_dir,
                                            dataset_name=basename,
                                            task_type=loaded_data_manager.info['task'],
                                            metric=loaded_data_manager.info['metric'],
                                            limit=time_left_for_ensembles,
                                            output_dir=output_dir)

    queue.put([time_needed_to_load_data, data_manager_path,
               proc_smac, proc_ensembles])
    return
Example #12
import os
import sys

from AutoML2015.models.autosklearn import get_configuration_space
from AutoML2015.data.data_io import inventory_data
from AutoML2015.data.data_manager import DataManager
from HPOlibConfigSpace.converters import pcs_parser

input_dir = sys.argv[1]
output_dir = sys.argv[2]
datanames = inventory_data(input_dir)

try:
    os.mkdir(output_dir)
except OSError:
    pass

for basename in datanames:
    D = DataManager(basename, input_dir, verbose=True)

    cs = get_configuration_space(D.info)
    dataset_dir = os.path.join(output_dir, basename)

    try:
        os.mkdir(dataset_dir)
    except OSError:
        pass

    with open(os.path.join(dataset_dir, "params.pcs"), 'w') as fh:
        fh.write(pcs_parser.write(cs))
    print()
Example #13
 def test_write_log_int(self):
     expected = "int_log_a [1, 6] [2]il"
     cs = ConfigurationSpace()
     cs.add_hyperparameter(
         UniformIntegerHyperparameter("int_log_a", 1, 6, log=True))
     value = pcs_parser.write(cs)
     self.assertEqual(expected, value)