Example #1
0
 def test_fix_types(self):
     """fix_types must recover the original value types from stringified configs."""

     def as_strings(cfg):
         # Simulate a configuration whose values were all serialized to str.
         return {key: str(val) for key, val in cfg.items()}

     # Categorical and ordinal hyperparameters behave identically here.
     for hp_cls in (CategoricalHyperparameter, OrdinalHyperparameter):
         space = ConfigurationSpace()
         space.add_hyperparameters([
             hp_cls('bools', [True, False]),
             hp_cls('ints', [1, 2, 3, 4, 5]),
             hp_cls('floats', [1.5, 2.5, 3.5, 4.5, 5.5]),
             hp_cls('str', ['string', 'ding', 'dong']),
             hp_cls('mixed', [2, True, 1.5, 'string', False, 'False']),
         ])
         cfg = space.get_default_configuration().get_dictionary()
         # Booleans survive the string round-trip.
         for flag in (False, True):
             cfg['bools'] = flag
             self.assertEqual(fix_types(as_strings(cfg), space), cfg)
         # Unambiguous mixed values survive as well.
         for value in (2, True, 1.5, 'string'):
             cfg['mixed'] = value
             self.assertEqual(fix_types(as_strings(cfg), space), cfg)
         # bool False and the string 'False' both stringify to 'False';
         # the ambiguity cannot be resolved and must raise.
         for value in (False, 'False'):
             cfg['mixed'] = value
             self.assertRaises(ValueError, fix_types, as_strings(cfg), space)
     # Constant hyperparameters must also survive the round-trip.
     for value in (2, 1.5, 'string'):
         space = ConfigurationSpace()
         space.add_hyperparameter(Constant('constant', value))
         cfg = space.get_default_configuration().get_dictionary()
         self.assertEqual(fix_types(as_strings(cfg), space), cfg)
Example #2
0
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Merge the SMAC state-run files and build a fANOVA object from them.

    state_run_directory: str
        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
        path to the directory in which the merged states should be stored

    outputs: fANOVA object
    '''
    # Collect every state-run folder and merge them into destination_dir.
    state_run_list = [f for f in glob(state_run_directory + "/*")
                      if f.startswith(state_run_directory + "/state-run")]
    state_merge.state_merge(state_run_list, destination_dir)

    # Locate the merged response and parameter files.
    for merged in glob(destination_dir + '/*'):
        if merged.startswith(destination_dir + '/runs_and_results'):
            response_file = merged
        if merged.startswith(destination_dir + '/paramstrings'):
            paramstrings = merged
    param_dict = output_reader.read_paramstrings_file(paramstrings)

    # Messy way to get the parameter names wrt order: strip the dict
    # punctuation from the first entry's repr and keep the 'key:' tokens.
    num_line = str(param_dict[0]).replace("'", "").replace("}", "")
    f_params = []
    for token in num_line.split(" "):
        token = token.replace(",", "").replace('{', '')
        if ':' in token:
            f_params.append(token.replace(':', ''))

    # Read the configuration space definition.
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    hps = cs.get_hyperparameters()

    # Encode each configuration as a numeric row; categorical values are
    # replaced by their index in the hyperparameter's choice list.
    X = []
    for p in param_dict:
        c = CS.Configuration(cs, fix_types(p, cs),
                             allow_inactive_with_values=True)
        row = []
        for hp in hps:
            if hasattr(hp, 'choices'):
                row.append(hp.choices.index(c[hp.name]))
            else:
                row.append(c[hp.name])
        X.append(row)

    X = np.array(X)
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
Example #3
0
    def get_runhistory(self, cs):
        """
        Expects the following files:

        - `self.folder/smac-output/aclib/state-run1/runs_and_results(...).csv`
        - `self.folder/smac-output/aclib/state-run1/paramstrings(...).csv`

        Returns
        -------
        rh: RunHistory
            runhistory
        """
        rh_fn = self.get_glob_file(self.folder, 'runs_and_results*.csv')
        self.logger.debug("Runhistory loaded as csv from %s", rh_fn)
        configs_fn = self.get_glob_file(self.folder, 'paramstrings*.txt')
        self.logger.debug("Configurations loaded from %s", configs_fn)

        # Translate the smac2 csv columns into the generic run schema.
        raw = load_csv_to_pandaframe(rh_fn, self.logger)
        data = pd.DataFrame()
        data["config_id"] = raw["Run History Configuration ID"]
        # Instance IDs are 1-based indices into the training instances.
        data["instance_id"] = raw["Instance ID"].apply(
            lambda x: self.scen.train_insts[x - 1])
        data["seed"] = raw["Seed"]
        data["time"] = raw["Runtime"]
        # Which column holds the cost depends on the optimization objective.
        if self.scen.run_obj == 'runtime':
            data["cost"] = raw["Runtime"]
        else:
            data["cost"] = raw["Run Quality"]
        data["status"] = raw["Run Result"]

        # Parse configurations; each row starts with "<id>: name='value'",
        # followed by further "name='value'" cells.
        with open(configs_fn, 'r') as csv_file:
            rows = list(
                csv.reader(csv_file, delimiter=',', skipinitialspace=True))
        id_to_config = {}
        for row in rows:
            config_id = int(re.match(r'^(\d*):', row[0]).group(1))
            params = [re.match(r'^\d*: (.*)', row[0]).group(1)]
            params.extend(row[1:])
            matches = [re.match(r'(.*)=\'(.*)\'', p) for p in params]
            values = {m.group(1): m.group(2) for m in matches}
            values = deactivate_inactive_hyperparameters(
                fix_types(values, cs), cs).get_dictionary()
            id_to_config[config_id] = Configuration(cs, values=values)
        self.id_to_config = id_to_config

        names, feats = self.scen.feature_names, self.scen.feature_dict
        return CSV2RH().read_csv_to_rh(data,
                                       cs=cs,
                                       id_to_config=id_to_config,
                                       train_inst=self.scen.train_insts,
                                       test_inst=self.scen.test_insts,
                                       instance_features=feats)
Example #4
0
 def add_config(row):
     # Build a value-dict from the non-empty parameter columns of this row.
     values = {}
     for name in parameters:
         if row[name] != '':
             values[name] = row[name]
     # Normalize value types and drop inactive hyperparameters so that
     # equal configurations hash equally.
     config = deactivate_inactive_hyperparameters(fix_types(values, cs), cs)
     # Assign a fresh id the first time this configuration is seen.
     config_to_id.setdefault(config, len(config_to_id))
     row['config_id'] = config_to_id[config]
     return row
Example #5
0
def read_config_file(fn: str, cs: ConfigurationSpace):
    """
    Read configurations from a csv file and turn each row into a valid
    configuration of the given configuration space.

    Parameters
    ----------
    fn : str
        path to the csv file (header row, index column, values kept as str)
    cs : ConfigurationSpace
        configuration space the configurations belong to

    Returns
    -------
    configs : list
        configurations with fixed value types and inactive
        hyperparameters deactivated
    """
    config_pd = pd.read_csv(fn, header=0, index_col=0, dtype=object)

    # Placeholder columns carry no parameter information -- drop them.
    for param_name in list(config_pd):
        if param_name.startswith("dummy_non_parameter"):
            del config_pd[param_name]

    configs = []
    # iterrows() yields (index, row) -- unpack directly instead of the
    # original's obscure `config[1:][0]` slicing.
    for _, row in config_pd.iterrows():
        config = fix_types(configuration=row.to_dict(),
                           configuration_space=cs)
        config = deactivate_inactive_hyperparameters(configuration=config,
                                                     configuration_space=cs)
        configs.append(config)

    return configs
Example #6
0
def load_config_csv(path, cs, logger):
    """ Load configurations.csv in the following format:

    +-----------+-----------------+-----------------+-----+
    | CONFIG_ID | parameter_name1 | parameter_name2 | ... |
    +===========+=================+=================+=====+
    | 0         | value1          | value2          | ... |
    +-----------+-----------------+-----------------+-----+
    | ...       | ...             | ...             | ... |
    +-----------+-----------------+-----------------+-----+

    Parameters
    ----------
    path: str
        path to csv-file
    cs: ConfigurationSpace
        configspace with matching parameters
    logger: Logger
        logger for debugs

    Returns
    -------
    (parameters, id_to_config): (str, dict)
        parameter-names and dict mapping ids to Configurations
    """
    logger.debug("Trying to read configuration-csv-file: %s.", path)
    table = load_csv_to_pandaframe(path, logger, apply_numeric=False)
    # CONFIG_ID becomes the numeric index; the remaining columns are the
    # hyperparameter names.
    table['CONFIG_ID'] = table['CONFIG_ID'].apply(pd.to_numeric)
    table.set_index('CONFIG_ID', inplace=True)
    logger.debug("Found parameters: %s", table.columns)
    logger.debug("Parameters in pcs: %s", cs.get_hyperparameter_names())
    # The csv columns and the configspace parameters must match exactly.
    mismatch = set(table.columns).symmetric_difference(
        set(cs.get_hyperparameter_names()))
    if mismatch:
        raise ValueError("Provided pcs does not match configuration-file "
                         "\'%s\' (check parameters %s)" % (path, mismatch))
    id_to_config = {}
    for index, row in table.iterrows():
        # Empty cells correspond to inactive parameters and are skipped.
        active = {name: row[name] for name in table.columns if row[name]}
        id_to_config[index] = deactivate_inactive_hyperparameters(
            fix_types(active, cs), cs)
    return table.columns, id_to_config
Example #7
0
def smac_to_fanova(state_run_directory, destination_dir):
    '''
    Takes the state-run files, merges them and prepares the configuration space for fANOVA.

    outputs: fANOVA object

    state_run_directory: str
                        path to the directory of the pysmac_output/out/scenario file
    destination_dir: str
                    path to the directory in which the merged states should be stored
    '''

    # Collect every "state-run*" entry and merge them into destination_dir.
    state_run_list = []
    files = glob(state_run_directory + "/*")
    for file in files:
        if file.startswith(state_run_directory + "/state-run"):
            state_run_list.append(file)
    state_merge.state_merge(state_run_list, destination_dir)
    merged_files = glob(destination_dir + '/*')

    # Pick out the merged response and parameter files.
    # NOTE(review): if either file is missing from the merge output,
    # response_file / paramstrings stay unbound and the code below raises
    # NameError -- confirm the merge always produces both.
    for file in merged_files:
        if file.startswith(destination_dir + '/runs_and_results'):
            response_file = file
        if file.startswith(destination_dir + '/paramstrings'):
            paramstrings = file
    param_dict = output_reader.read_paramstrings_file(paramstrings)

    # Strip the dict punctuation from the repr of the first entry ...
    num_line = str(param_dict[0]).replace("'", "")
    num_line = str(num_line).replace("}", "")
    # messy way to get the parameter names wrt order
    # ... then keep the "key:" tokens as parameter names.
    # NOTE(review): f_params is computed but never used below.
    f_params = []
    for line in str(num_line).split(" "):
        line = str(line).replace(",", "")
        line = line.replace('{', '')
        if ':' in line:
            parameter = line.replace(':', '')
            f_params.append(parameter)

    # get configspace
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    X = []
    hps = cs.get_hyperparameters()

    # Encode each configuration as a numeric row; categorical values are
    # replaced by their index in the hyperparameter's choice list.
    for p in param_dict:
        c = CS.Configuration(cs,
                             fix_types(p, cs),
                             allow_inactive_with_values=True)
        X.append([])
        for hp in hps:
            if hasattr(hp, 'choices'):
                value = hp.choices.index(c[hp.name])
            else:
                value = c[hp.name]
            X[-1].append(value)

    X = np.array(X)
    # Y holds one response value per configuration row in X.
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
Example #8
0
    def get_validated_runhistory(self, cs):
        """
        Expects the following files:

        - `self.folder/validate-time-train/validationCallStrings(...).csv`
        - `self.folder/validate-time-train/validationRunResultLineMatrix(...).csv`
        - `self.folder/validate-time-test/validationCallStrings(...).csv`
        - `self.folder/validate-time-test/validationRunResultLineMatrix(...).csv`

        Returns
        -------
        validated_rh: RunHistory
            validated runhistory
        """
        self.logger.debug("Loading validation-data")
        # NOTE(review): only the train folder is read here although the
        # docstring also lists validate-time-test -- confirm intended.
        folder = os.path.join(self.folder, 'validate-time-train')
        configs_fn = re.search(r'validationCallStrings.*?\.csv',
                               str(os.listdir(folder)))
        if not configs_fn:
            self.logger.warning(
                "Specified validation_format is \'SMAC2\', but no "
                "\'validationCallStrings(...).csv\'-file could be found "
                "in %s" % folder)
            return
        configs_fn = os.path.join(folder, configs_fn.group())

        results_fn = re.search(r'validationRunResultLineMatrix.*?\.csv',
                               str(os.listdir(folder)))
        if not results_fn:
            self.logger.warning(
                "Specified validation_format is \'SMAC2\', but no "
                "\'validationRunResultLineMatrix(...).csv\'-file could be found "
                "in %s" % folder)
            return
        results_fn = os.path.join(folder, results_fn.group())

        self.logger.debug("Configurations loaded from %s", configs_fn)
        self.logger.debug("Runhistory loaded as csv from %s", results_fn)

        # Load configurations; each row is "<id>, -param 'value' -param 'value' ..."
        csv_data = load_csv_to_pandaframe(configs_fn, self.logger, False)
        id_to_config = {}
        for idx, row in csv_data.iterrows():
            config_id = int(row[0])
            configuration = row[1].split()
            params = [p.lstrip('-') for p in configuration[::2]]  # names
            values = [v.strip('\'') for v in configuration[1::2]]  # values
            param_values = dict(zip(params, values))
            param_values = deactivate_inactive_hyperparameters(
                fix_types(param_values, cs), cs).get_dictionary()
            id_to_config[config_id] = Configuration(cs, values=param_values)

        names, feats = self.scen.feature_names, self.scen.feature_dict

        # Translate smac2-validation (RunResultString-matrix) to csv.
        csv_data = load_csv_to_pandaframe(results_fn,
                                          self.logger,
                                          delimiter='\",\"')
        # Collect records in a list and build the DataFrame once at the end:
        # DataFrame.append is deprecated (removed in pandas 2.0) and
        # quadratic when called inside a loop.
        records = []
        for idx, row in csv_data.iterrows():
            instance, seed = row[0], row[1]
            for column in csv_data.columns[2:]:
                config_id = int(
                    re.match(r'^Run result line of validation config #(\d*)$',
                             column).group(1))
                result = [e.strip() for e in row[column].split(',')]
                records.append({
                    "config_id": config_id,
                    "instance_id": instance,
                    "seed": seed,
                    "time": result[1],
                    # Cost column depends on the optimization objective.
                    "cost": result[1]
                    if self.scen.run_obj == 'runtime' else result[3],
                    "status": result[0],
                })
        data = pd.DataFrame(records)

        rh = CSV2RH().read_csv_to_rh(data,
                                     cs=cs,
                                     id_to_config=id_to_config,
                                     train_inst=self.scen.train_insts,
                                     test_inst=self.scen.test_insts,
                                     instance_features=feats)

        self.logger.debug(
            "%d datapoints for %d configurations found in validated rh.",
            len(rh.data), len(rh.get_all_configs()))

        return rh
Example #9
0
    def fit(self,
            X,
            y,
            runcount_limit: int = 100,
            wc_limit: int = 60,
            config: Configuration = None,
            seed: int = 12345):
        """Fit a RandomForestRegressor on log10-runtimes, tuning it with SMAC.

        Parameters
        ----------
        X, y : sequences of arrays
            data splits; indices 0-3 are used for internal validation, and
            all splits stacked together form the final training set
        runcount_limit : int
            maximum number of SMAC evaluations
        wc_limit : int
            SMAC wallclock limit in seconds
        config : Configuration or dict, optional
            if given as a dict, skip optimization and use it directly
        seed : int
            RNG seed for SMAC

        Sets ``self.model`` to the fitted forest.
        """
        # Stack every split into one matrix for validation and the final fit.
        X_all = None
        y_all = None
        for idx, (X_q, y_q) in enumerate(zip(X, y)):
            if idx == 0:
                X_all = X_q
                y_all = y_q
            else:
                X_all = np.vstack([X_all, X_q])
                y_all = np.hstack([y_all, y_q])

        def obj_func(config, instance=None, seed=None, pc=None):
            # Train on one split, validate against all data; return the mean
            # RMSE (in log10-space) over the four train/valid pairings.
            losses = []

            for model_idx, [train_idx, valid_idx] in enumerate([[0, 3], [3, 0],
                                                                [1, 2], [2,
                                                                         1]]):

                X_train = X[train_idx]
                y_train = y[train_idx]
                y_train = np.log10(y_train)  # runtimes are modeled in log10

                X_valid, y_valid = X_all, y_all
                y_valid = np.log10(y_valid)

                rf = RandomForestRegressor(
                    n_estimators=config["n_estimators"],
                    criterion=config["criterion"],
                    min_samples_split=config["min_samples_split"],
                    min_samples_leaf=config["min_samples_leaf"],
                    min_weight_fraction_leaf=config[
                        "min_weight_fraction_leaf"],
                    max_features=config["max_features"],
                    bootstrap=config["bootstrap"],
                    random_state=12345)

                rf.fit(X_train, y_train)

                # Average per-tree predictions in linear space, then return
                # to log10-space for the loss.
                y_preds = []
                for tree in rf.estimators_:
                    y_pred = 10**tree.predict(X_valid)
                    y_preds.append(y_pred)
                y_preds = np.mean(y_preds, axis=0)
                y_preds = np.log10(y_preds)
                loss = np.sqrt(
                    mean_squared_error(y_true=y_valid, y_pred=y_preds))
                losses.append(loss)

            return np.mean(losses)

        taf = SimpleTAFunc(obj_func)
        cs = self.get_config_space()

        ac_scenario = Scenario({
            "run_obj": "quality",  # we optimize quality
            "runcount-limit": runcount_limit,
            "wallclock-limit": wc_limit,
            "cost_for_crash": 10,
            "cs": cs,
            "deterministic": "true",
            "abort_on_first_run_crash": False,
            "output-dir": ""
        })

        if isinstance(config, dict):
            # Bug fix: previously passed the builtin `dict` type
            # (configuration=dict) instead of the user-supplied values.
            config = fix_types(configuration=config, configuration_space=cs)
            config = Configuration(configuration_space=cs, values=config)
        elif runcount_limit == 1:
            config = cs.get_default_configuration()
        else:
            smac = SMAC(scenario=ac_scenario,
                        tae_runner=taf,
                        rng=np.random.RandomState(seed))
            config = smac.optimize()

        print("Final Incumbent")
        print(config)

        y_all = np.log10(y_all)

        # Retrain on all data with the incumbent configuration.
        rf = RandomForestRegressor(
            n_estimators=100,
            criterion=config["criterion"],
            min_samples_split=config["min_samples_split"],
            min_samples_leaf=config["min_samples_leaf"],
            min_weight_fraction_leaf=config["min_weight_fraction_leaf"],
            max_features=config["max_features"],
            bootstrap=config["bootstrap"],
            random_state=12345)

        start_time = time.time()
        rf.fit(X_all, y_all)
        print("Training Time: %d" % (time.time() - start_time))

        self.model = rf
Example #10
0
    def fit(self,
            X,
            y,
            max_epochs: int,
            runcount_limit: int = 100,
            wc_limit: int = 60,
            config: Configuration = None,
            seed: int = 12345):
        """Fit a ParamFCNetRegression DNN on log10-targets, tuning it with SMAC.

        Parameters
        ----------
        X, y : sequences of arrays
            data splits; indices 0-3 are used for internal validation, and
            all splits stacked together form the final training set
        max_epochs : int
            number of epochs for the final training run (and expected by
            the network's learning-rate schedule)
        runcount_limit : int
            multiplied by max_epochs to bound SMAC evaluations
        wc_limit : int
            SMAC wallclock limit in seconds
        config : Configuration or dict, optional
            if given as a dict, skip optimization and use it directly
        seed : int
            RNG seed for SMAC

        Sets ``self.model`` to the fitted network.
        """
        # Stack every split into one matrix for validation and the final fit.
        X_all = None
        y_all = None
        for idx, (X_q, y_q) in enumerate(zip(X, y)):
            if idx == 0:
                X_all = X_q
                y_all = y_q
            else:
                X_all = np.vstack([X_all, X_q])
                y_all = np.hstack([y_all, y_q])

        def obj_func(config, instance=None, seed=None, pc=None):
            # continuing training if pc is given
            # otherwise, construct new DNN

            models = []
            losses = []

            for model_idx, [train_idx, valid_idx] in enumerate([[0, 3], [3, 0],
                                                                [1, 2], [2,
                                                                         1]]):

                X_train = X[train_idx]
                y_train = y[train_idx]

                # Scale features and log10-scaled targets for training.
                X_train = self.scalerX.fit_transform(X_train)
                y_train = np.log10(y_train)
                y_train = self.scalerY.fit_transform(y_train.reshape(-1, 1))[:,
                                                                             0]

                X_valid, y_valid = X_all, y_all
                X_valid = self.scalerX.transform(X_valid)
                y_valid = np.log10(y_valid)
                y_valid = self.scalerY.transform(y_valid.reshape(-1, 1))[:, 0]

                if pc is None:

                    # Fresh networks: reset the backend session once per call.
                    if model_idx == 0:
                        K.clear_session()
                    model = ParamFCNetRegression(
                        config=config,
                        n_feat=X_train.shape[1],
                        expected_num_epochs=max_epochs,
                        n_outputs=1,
                        verbose=1)
                else:
                    # Continue training the previously returned model.
                    model = pc[model_idx]

                history = model.train(X_train=X_train,
                                      y_train=y_train,
                                      X_valid=X_valid,
                                      y_valid=y_valid,
                                      n_epochs=1)

                models.append(model)

                final_loss = history["val_loss"][-1]
                losses.append(final_loss)

            # Return the models so the intensifier can continue training them.
            return np.mean(losses), {"model": models}

        taf = SimpleTAFunc(obj_func)
        cs = ParamFCNetRegression.get_config_space(
            num_layers_range=self.num_layers_range,
            use_l2_regularization=self.use_l2_regularization,
            use_dropout=self.use_dropout)

        print(cs)

        ac_scenario = Scenario({
            "run_obj": "quality",  # we optimize quality
            "runcount-limit": max_epochs * runcount_limit,
            "wallclock-limit": wc_limit,
            "cost_for_crash": 10,
            "cs": cs,
            "deterministic": "true",
            "abort_on_first_run_crash": False,
            "output-dir": ""
        })

        intensifier = Intensifier(tae_runner=taf,
                                  stats=None,
                                  traj_logger=None,
                                  rng=np.random.RandomState(42),
                                  run_limit=100,
                                  max_epochs=max_epochs)

        if isinstance(config, dict):
            # Bug fix: previously passed the builtin `dict` type
            # (configuration=dict) instead of the user-supplied values.
            config = fix_types(configuration=config, configuration_space=cs)
            config = Configuration(configuration_space=cs, values=config)
        elif runcount_limit == 1:
            config = cs.get_default_configuration()
        else:
            smac = SMAC(scenario=ac_scenario,
                        tae_runner=taf,
                        rng=np.random.RandomState(seed),
                        intensifier=intensifier)

            smac.solver.runhistory.overwrite_existing_runs = True
            config = smac.optimize()

        print("Final Incumbent")
        print(config)

        # Final fit on all data with the incumbent configuration.
        X_all = self.scalerX.fit_transform(X_all)
        y_all = np.log10(y_all)
        y_all = self.scalerY.fit_transform(y_all.reshape(-1, 1))[:, 0]

        K.clear_session()

        start_time = time.time()

        model = ParamFCNetRegression(config=config,
                                     n_feat=X_all.shape[1],
                                     expected_num_epochs=max_epochs,
                                     n_outputs=1,
                                     verbose=1)

        history = model.train(X_train=X_all,
                              y_train=y_all,
                              X_valid=X_all,
                              y_valid=y_all,
                              n_epochs=max_epochs)

        print("Training Time: %f" % (time.time() - start_time))

        self.model = model