def test_fix_types(self):
    """fix_types must recover the original typed values from their str()-form."""

    def as_strings(cfg):
        # Simulate a configuration that was serialized to text.
        return {key: str(val) for key, val in cfg.items()}

    # Categorical and ordinal hyperparameters share the same behaviour.
    for hp_cls in (CategoricalHyperparameter, OrdinalHyperparameter):
        space = ConfigurationSpace()
        space.add_hyperparameters([
            hp_cls('bools', [True, False]),
            hp_cls('ints', [1, 2, 3, 4, 5]),
            hp_cls('floats', [1.5, 2.5, 3.5, 4.5, 5.5]),
            hp_cls('str', ['string', 'ding', 'dong']),
            hp_cls('mixed', [2, True, 1.5, 'string', False, 'False']),
        ])
        cfg = space.get_default_configuration().get_dictionary()

        # Booleans round-trip through their string representation.
        for flag in (False, True):
            cfg['bools'] = flag
            self.assertEqual(fix_types(as_strings(cfg), space), cfg)

        # Unambiguous mixed values round-trip as well.
        for val in (2, True, 1.5, 'string'):
            cfg['mixed'] = val
            self.assertEqual(fix_types(as_strings(cfg), space), cfg)

        # str(False) is indistinguishable from the literal string 'False':
        # this corner case cannot be resolved and must raise.
        for val in (False, 'False'):
            cfg['mixed'] = val
            self.assertRaises(ValueError, fix_types, as_strings(cfg), space)

    # Constants of each primitive type round-trip, too.
    for val in (2, 1.5, 'string'):
        space = ConfigurationSpace()
        space.add_hyperparameter(Constant('constant', val))
        cfg = space.get_default_configuration().get_dictionary()
        self.assertEqual(fix_types(as_strings(cfg), space), cfg)
def smac_to_fanova(state_run_directory, destination_dir):
    """Merge SMAC state-run files and build a fANOVA object from them.

    Parameters
    ----------
    state_run_directory : str
        Path to the directory holding the ``state-run*`` folders
        (the ``pysmac_output/out/scenario`` directory).
    destination_dir : str
        Directory in which the merged states are stored.

    Returns
    -------
    fanova.fANOVA
        fANOVA object built from the merged run data.

    Raises
    ------
    FileNotFoundError
        If the merged directory lacks a ``runs_and_results*`` or
        ``paramstrings*`` file.
    """
    # Collect all state-run directories and merge them into destination_dir.
    state_run_list = [path for path in glob(state_run_directory + "/*")
                      if path.startswith(state_run_directory + "/state-run")]
    state_merge.state_merge(state_run_list, destination_dir)

    # Locate the merged response- and parameter-files.
    response_file = None
    paramstrings = None
    for path in glob(destination_dir + '/*'):
        if path.startswith(destination_dir + '/runs_and_results'):
            response_file = path
        if path.startswith(destination_dir + '/paramstrings'):
            paramstrings = path
    # Fail early with a clear message instead of an obscure NameError below.
    if response_file is None or paramstrings is None:
        raise FileNotFoundError(
            "Expected 'runs_and_results*' and 'paramstrings*' files in %s"
            % destination_dir)

    param_dict = output_reader.read_paramstrings_file(paramstrings)

    # Load the configuration space the merged runs refer to.
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    # Encode every configuration as a numeric row: categorical values are
    # replaced by their index in the hyperparameter's choice list.
    X = []
    hps = cs.get_hyperparameters()
    for p in param_dict:
        c = CS.Configuration(cs, fix_types(p, cs),
                             allow_inactive_with_values=True)
        X.append([])
        for hp in hps:
            if hasattr(hp, 'choices'):
                value = hp.choices.index(c[hp.name])
            else:
                value = c[hp.name]
            X[-1].append(value)
    X = np.array(X)
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
def get_runhistory(self, cs):
    """Build a RunHistory from SMAC2 output files.

    Expects the following files:

    - `self.folder/smac-output/aclib/state-run1/runs_and_results(...).csv`
    - `self.folder/smac-output/aclib/state-run1/paramstrings(...).csv`

    Parameters
    ----------
    cs: ConfigurationSpace
        configuration space used to typecast and validate the parameter
        values read from the paramstrings-file

    Returns
    -------
    rh: RunHistory
        runhistory
    """
    # Locate the two SMAC2 output files via glob-patterns.
    rh_fn = self.get_glob_file(self.folder, 'runs_and_results*.csv')
    self.logger.debug("Runhistory loaded as csv from %s", rh_fn)
    configs_fn = self.get_glob_file(self.folder, 'paramstrings*.txt')
    self.logger.debug("Configurations loaded from %s", configs_fn)

    # Translate smac2 to csv
    csv_data = load_csv_to_pandaframe(rh_fn, self.logger)
    data = pd.DataFrame()
    data["config_id"] = csv_data["Run History Configuration ID"]
    # Instance IDs in smac2-output are 1-based indices into the train set.
    data["instance_id"] = csv_data["Instance ID"].apply(
        lambda x: self.scen.train_insts[x - 1])
    data["seed"] = csv_data["Seed"]
    data["time"] = csv_data["Runtime"]
    # Cost is the runtime for runtime-scenarios, else the reported quality.
    if self.scen.run_obj == 'runtime':
        data["cost"] = csv_data["Runtime"]
    else:
        data["cost"] = csv_data["Run Quality"]
    data["status"] = csv_data["Run Result"]

    # Load configurations; each paramstrings-line looks like
    # "<id>: name='value', name='value', ..."
    with open(configs_fn, 'r') as csv_file:
        csv_data = list(
            csv.reader(csv_file, delimiter=',', skipinitialspace=True))
    id_to_config = {}
    for row in csv_data:
        # Numeric configuration-id before the first colon.
        config_id = int(re.match(r'^(\d*):', row[0]).group(1))
        # The first cell still contains "id: param=...", strip the prefix.
        params = [re.match(r'^\d*: (.*)', row[0]).group(1)]
        params.extend(row[1:])
        # Split "name='value'" pairs into a dict of raw strings.
        matches = [re.match(r'(.*)=\'(.*)\'', p) for p in params]
        values = {m.group(1): m.group(2) for m in matches}
        # Typecast the raw strings and drop inactive parameters.
        values = deactivate_inactive_hyperparameters(
            fix_types(values, cs), cs).get_dictionary()
        id_to_config[config_id] = Configuration(cs, values=values)
    self.id_to_config = id_to_config

    # NOTE(review): `names` is unpacked but never used below — confirm it
    # can be dropped.
    names, feats = self.scen.feature_names, self.scen.feature_dict
    rh = CSV2RH().read_csv_to_rh(data, cs=cs,
                                 id_to_config=id_to_config,
                                 train_inst=self.scen.train_insts,
                                 test_inst=self.scen.test_insts,
                                 instance_features=feats)
    return rh
def add_config(row):
    """Attach a 'config_id' to *row*, registering new configs on first sight.

    Reads ``parameters``, ``cs`` and ``config_to_id`` from the enclosing
    scope; mutates ``config_to_id`` and the given row.
    """
    non_empty = {}
    for name in parameters:
        value = row[name]
        # Empty cells denote inactive parameters and are skipped.
        if value != '':
            non_empty[name] = value
    config = deactivate_inactive_hyperparameters(
        fix_types(non_empty, cs), cs)
    # setdefault assigns the next sequential id only to unseen configs.
    row['config_id'] = config_to_id.setdefault(config, len(config_to_id))
    return row
def read_config_file(fn: str, cs: ConfigurationSpace):
    """Read configurations from a csv-file.

    Parameters
    ----------
    fn: str
        path to the csv-file (first row: header, first column: index)
    cs: ConfigurationSpace
        configuration space used to typecast values and deactivate
        inactive hyperparameters

    Returns
    -------
    list
        one Configuration per csv-row
    """
    frame = pd.read_csv(fn, header=0, index_col=0, dtype=object)

    # Drop placeholder columns that are not real hyperparameters.
    for col in [c for c in list(frame) if c.startswith("dummy_non_parameter")]:
        del frame[col]

    configs = []
    for _, row in frame.iterrows():
        typed = fix_types(configuration=row.to_dict(),
                          configuration_space=cs)
        configs.append(
            deactivate_inactive_hyperparameters(configuration=typed,
                                                configuration_space=cs))
    return configs
def load_config_csv(path, cs, logger):
    """Load a configurations.csv mapping CONFIG_IDs to parameter values.

    The file must contain a ``CONFIG_ID`` column plus exactly one column
    per hyperparameter in *cs*:

    +-----------+-----------------+-----------------+-----+
    | CONFIG_ID | parameter_name1 | parameter_name2 | ... |
    +===========+=================+=================+=====+
    | 0         | value1          | value2          | ... |
    +-----------+-----------------+-----------------+-----+

    Parameters
    ----------
    path: str
        path to csv-file
    cs: ConfigurationSpace
        configspace with matching parameters
    logger: Logger
        logger for debugs

    Returns
    -------
    (parameters, id_to_config): (str, dict)
        parameter-names and dict mapping ids to Configurations

    Raises
    ------
    ValueError
        if the csv-columns and the configspace-parameters differ
    """
    logger.debug("Trying to read configuration-csv-file: %s.", path)
    config_data = load_csv_to_pandaframe(path, logger, apply_numeric=False)
    config_data['CONFIG_ID'] = config_data['CONFIG_ID'].apply(pd.to_numeric)
    config_data.set_index('CONFIG_ID', inplace=True)
    logger.debug("Found parameters: %s", config_data.columns)
    logger.debug("Parameters in pcs: %s", cs.get_hyperparameter_names())

    # Every csv-column must correspond to exactly one hyperparameter.
    mismatch = set(config_data.columns) ^ set(cs.get_hyperparameter_names())
    if mismatch:
        raise ValueError("Provided pcs does not match configuration-file "
                         "'%s' (check parameters %s)" % (path, mismatch))

    id_to_config = {}
    for config_id, row in config_data.iterrows():
        # Empty cells denote inactive parameters and are skipped.
        values = {col: row[col] for col in config_data.columns if row[col]}
        id_to_config[config_id] = deactivate_inactive_hyperparameters(
            fix_types(values, cs), cs)
    return config_data.columns, id_to_config
def smac_to_fanova(state_run_directory, destination_dir):
    """Merge SMAC state-run files and build a fANOVA object from them.

    Parameters
    ----------
    state_run_directory : str
        Path to the directory holding the ``state-run*`` folders
        (the ``pysmac_output/out/scenario`` directory).
    destination_dir : str
        Directory in which the merged states are stored.

    Returns
    -------
    fanova.fANOVA
        fANOVA object built from the merged run data.

    Raises
    ------
    FileNotFoundError
        If the merged directory lacks a ``runs_and_results*`` or
        ``paramstrings*`` file.
    """
    # Collect all state-run directories and merge them into destination_dir.
    state_run_list = [path for path in glob(state_run_directory + "/*")
                      if path.startswith(state_run_directory + "/state-run")]
    state_merge.state_merge(state_run_list, destination_dir)

    # Locate the merged response- and parameter-files.
    response_file = None
    paramstrings = None
    for path in glob(destination_dir + '/*'):
        if path.startswith(destination_dir + '/runs_and_results'):
            response_file = path
        if path.startswith(destination_dir + '/paramstrings'):
            paramstrings = path
    # Fail early with a clear message instead of an obscure NameError below.
    if response_file is None or paramstrings is None:
        raise FileNotFoundError(
            "Expected 'runs_and_results*' and 'paramstrings*' files in %s"
            % destination_dir)

    param_dict = output_reader.read_paramstrings_file(paramstrings)

    # Load the configuration space the merged runs refer to.
    with open(destination_dir + '/param.pcs') as fh:
        cs = pcs_new.read(fh.readlines(), debug=True)

    # Encode every configuration as a numeric row: categorical values are
    # replaced by their index in the hyperparameter's choice list.
    X = []
    hps = cs.get_hyperparameters()
    for p in param_dict:
        c = CS.Configuration(cs, fix_types(p, cs),
                             allow_inactive_with_values=True)
        X.append([])
        for hp in hps:
            if hasattr(hp, 'choices'):
                value = hp.choices.index(c[hp.name])
            else:
                value = c[hp.name]
            X[-1].append(value)
    X = np.array(X)
    Y = data_extractor(response_file, X.shape[0])

    return fanova.fANOVA(X=X, Y=Y, config_space=cs)
def get_validated_runhistory(self, cs):
    """Read validation-data from a SMAC2 output-folder into a RunHistory.

    Expects the following files:

    - `self.folder/validate-time-train/validationCallStrings(...).csv`
    - `self.folder/validate-time-train/validationRunResultLineMatrix(...).csv`

    NOTE(review): the original docstring also mentioned
    `validate-time-test`, but only the train-folder is read here —
    confirm whether test-validation should be loaded as well.

    Parameters
    ----------
    cs: ConfigurationSpace
        configuration space for typecasting the parameter values

    Returns
    -------
    validated_rh: RunHistory
        validated runhistory, or None if the expected files are missing
    """
    self.logger.debug("Loading validation-data")
    folder = os.path.join(self.folder, 'validate-time-train')

    # Locate the two validation files by name-pattern.
    configs_fn = re.search(r'validationCallStrings.*?\.csv',
                           str(os.listdir(folder)))
    if not configs_fn:
        self.logger.warning(
            "Specified validation_format is \'SMAC2\', but no "
            "\'validationCallStrings(...).csv\'-file could be found "
            "in %s" % folder)
        return
    configs_fn = os.path.join(folder, configs_fn.group())
    results_fn = re.search(r'validationRunResultLineMatrix.*?\.csv',
                           str(os.listdir(folder)))
    if not results_fn:
        self.logger.warning(
            "Specified validation_format is \'SMAC2\', but no "
            "\'validationRunResultLineMatrix(...).csv\'-file could be found "
            "in %s" % folder)
        return
    results_fn = os.path.join(folder, results_fn.group())
    self.logger.debug("Configurations loaded from %s", configs_fn)
    self.logger.debug("Runhistory loaded as csv from %s", results_fn)

    # Load configurations: each call-string alternates
    # "-name 'value' -name 'value' ..."
    csv_data = load_csv_to_pandaframe(configs_fn, self.logger, False)
    id_to_config = {}
    for idx, row in csv_data.iterrows():
        config_id = int(row[0])
        configuration = row[1].split()
        params = [p.lstrip('-') for p in configuration[::2]]   # names
        values = [v.strip('\'') for v in configuration[1::2]]  # values
        param_values = dict(zip(params, values))
        param_values = deactivate_inactive_hyperparameters(
            fix_types(param_values, cs), cs).get_dictionary()
        id_to_config[config_id] = Configuration(cs, values=param_values)
    names, feats = self.scen.feature_names, self.scen.feature_dict

    # Translate smac2-validation (RunResultString-matrix) to csv.
    csv_data = load_csv_to_pandaframe(results_fn,
                                      self.logger, delimiter='\",\"')
    # BUGFIX: collect rows in a list and build the DataFrame once —
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call (quadratic) anyway.
    records = []
    for idx, row in csv_data.iterrows():
        instance, seed = row[0], row[1]
        for column in csv_data.columns[2:]:
            config_id = int(
                re.match(r'^Run result line of validation config #(\d*)$',
                         column).group(1))
            result = [e.strip() for e in row[column].split(',')]
            records.append({
                "config_id": config_id,
                "instance_id": instance,
                "seed": seed,
                "time": result[1],
                "cost": result[1] if self.scen.run_obj == 'runtime'
                        else result[3],
                "status": result[0]
            })
    data = pd.DataFrame(records)
    rh = CSV2RH().read_csv_to_rh(data, cs=cs,
                                 id_to_config=id_to_config,
                                 train_inst=self.scen.train_insts,
                                 test_inst=self.scen.test_insts,
                                 instance_features=feats)
    self.logger.debug(
        "%d datapoints for %d configurations found in validated rh.",
        len(rh.data), len(rh.get_all_configs()))
    return rh
def fit(self, X, y, runcount_limit: int = 100, wc_limit: int = 60,
        config: Configuration = None, seed: int = 12345):
    """Fit a random forest on log10-targets, optionally tuning it with SMAC.

    Parameters
    ----------
    X: sequence of np.ndarray
        feature matrices, one per data-split; stacked for final training
    y: sequence of np.ndarray
        targets matching X (must be positive — log10 is applied)
    runcount_limit: int
        maximum number of SMAC evaluations; 1 skips tuning and uses the
        configspace default
    wc_limit: int
        SMAC wallclock-limit in seconds
    config: Configuration or dict
        if given, use this configuration instead of running SMAC
    seed: int
        seed for SMAC's random number generator
    """
    # Stack the per-split data into one matrix for final training.
    X_all = None
    y_all = None
    for idx, (X_q, y_q) in enumerate(zip(X, y)):
        if idx == 0:
            X_all = X_q
            y_all = y_q
        else:
            X_all = np.vstack([X_all, X_q])
            y_all = np.hstack([y_all, y_q])

    def obj_func(config, instance=None, seed=None, pc=None):
        """Loss of *config*: mean RMSE in log10-space over four splits."""
        losses = []
        # NOTE(review): valid_idx is never used — validation always runs
        # on the full data (X_all/y_all); confirm this is intended.
        for model_idx, [train_idx, valid_idx] in enumerate([[0, 3], [3, 0],
                                                            [1, 2], [2, 1]]):
            X_train = X[train_idx]
            y_train = y[train_idx]
            y_train = np.log10(y_train)
            X_valid, y_valid = X_all, y_all
            y_valid = np.log10(y_valid)
            rf = RandomForestRegressor(
                n_estimators=config["n_estimators"],
                criterion=config["criterion"],
                min_samples_split=config["min_samples_split"],
                min_samples_leaf=config["min_samples_leaf"],
                min_weight_fraction_leaf=config[
                    "min_weight_fraction_leaf"],
                max_features=config["max_features"],
                bootstrap=config["bootstrap"],
                random_state=12345)
            rf.fit(X_train, y_train)
            # Average per-tree predictions in linear space before going
            # back to log-space for the error metric.
            y_preds = []
            for tree in rf.estimators_:
                y_pred = 10**tree.predict(X_valid)
                y_preds.append(y_pred)
            y_preds = np.mean(y_preds, axis=0)
            y_preds = np.log10(y_preds)
            loss = np.sqrt(
                mean_squared_error(y_true=y_valid, y_pred=y_preds))
            losses.append(loss)
        return np.mean(losses)

    taf = SimpleTAFunc(obj_func)
    cs = self.get_config_space()
    ac_scenario = Scenario({
        "run_obj": "quality",  # we optimize quality
        "runcount-limit": runcount_limit,
        "wallclock-limit": wc_limit,
        "cost_for_crash": 10,
        "cs": cs,
        "deterministic": "true",
        "abort_on_first_run_crash": False,
        "output-dir": ""
    })

    if isinstance(config, dict):
        # BUGFIX: the builtin `dict` was passed instead of the given
        # `config`, so user-supplied dict-configurations were never used.
        config = fix_types(configuration=config, configuration_space=cs)
        config = Configuration(configuration_space=cs, values=config)
    elif runcount_limit == 1:
        config = cs.get_default_configuration()
    else:
        smac = SMAC(scenario=ac_scenario,
                    tae_runner=taf,
                    rng=np.random.RandomState(seed))
        config = smac.optimize()

    print("Final Incumbent")
    print(config)

    # Train the final model on all data with the chosen configuration.
    y_all = np.log10(y_all)
    rf = RandomForestRegressor(
        n_estimators=100,
        criterion=config["criterion"],
        min_samples_split=config["min_samples_split"],
        min_samples_leaf=config["min_samples_leaf"],
        min_weight_fraction_leaf=config["min_weight_fraction_leaf"],
        max_features=config["max_features"],
        bootstrap=config["bootstrap"],
        random_state=12345)
    start_time = time.time()
    rf.fit(X_all, y_all)
    print("Training Time: %d" % (time.time() - start_time))
    self.model = rf
def fit(self, X, y, max_epochs: int, runcount_limit: int = 100,
        wc_limit: int = 60, config: Configuration = None,
        seed: int = 12345):
    """Fit a fully-connected net on scaled log10-targets, tuning with SMAC.

    Parameters
    ----------
    X: sequence of np.ndarray
        feature matrices, one per data-split; stacked for final training
    y: sequence of np.ndarray
        targets matching X (must be positive — log10 is applied)
    max_epochs: int
        number of epochs for the final training and expected by the net
    runcount_limit: int
        SMAC budget multiplier; 1 skips tuning and uses the default config
    wc_limit: int
        SMAC wallclock-limit in seconds
    config: Configuration or dict
        if given, use this configuration instead of running SMAC
    seed: int
        seed for SMAC's random number generator
    """
    # Stack the per-split data into one matrix for final training.
    X_all = None
    y_all = None
    for idx, (X_q, y_q) in enumerate(zip(X, y)):
        if idx == 0:
            X_all = X_q
            y_all = y_q
        else:
            X_all = np.vstack([X_all, X_q])
            y_all = np.hstack([y_all, y_q])

    def obj_func(config, instance=None, seed=None, pc=None):
        """One epoch of training per split; returns mean val-loss.

        Continues training if *pc* (a list of previously-built models)
        is given, otherwise constructs new DNNs.
        """
        models = []
        losses = []
        # NOTE(review): valid_idx is never used — validation always runs
        # on the full data (X_all/y_all); confirm this is intended.
        for model_idx, [train_idx, valid_idx] in enumerate([[0, 3], [3, 0],
                                                            [1, 2], [2, 1]]):
            X_train = X[train_idx]
            y_train = y[train_idx]
            X_train = self.scalerX.fit_transform(X_train)
            y_train = np.log10(y_train)
            y_train = self.scalerY.fit_transform(
                y_train.reshape(-1, 1))[:, 0]
            X_valid, y_valid = X_all, y_all
            X_valid = self.scalerX.transform(X_valid)
            y_valid = np.log10(y_valid)
            y_valid = self.scalerY.transform(
                y_valid.reshape(-1, 1))[:, 0]
            if pc is None:
                # Reset the backend session once per fresh model set.
                if model_idx == 0:
                    K.clear_session()
                model = ParamFCNetRegression(
                    config=config,
                    n_feat=X_train.shape[1],
                    expected_num_epochs=max_epochs,
                    n_outputs=1,
                    verbose=1)
            else:
                model = pc[model_idx]
            history = model.train(X_train=X_train, y_train=y_train,
                                  X_valid=X_valid, y_valid=y_valid,
                                  n_epochs=1)
            models.append(model)
            final_loss = history["val_loss"][-1]
            losses.append(final_loss)
        return np.mean(losses), {"model": models}

    taf = SimpleTAFunc(obj_func)
    cs = ParamFCNetRegression.get_config_space(
        num_layers_range=self.num_layers_range,
        use_l2_regularization=self.use_l2_regularization,
        use_dropout=self.use_dropout)
    print(cs)
    ac_scenario = Scenario({
        "run_obj": "quality",  # we optimize quality
        "runcount-limit": max_epochs * runcount_limit,
        "wallclock-limit": wc_limit,
        "cost_for_crash": 10,
        "cs": cs,
        "deterministic": "true",
        "abort_on_first_run_crash": False,
        "output-dir": ""
    })
    intensifier = Intensifier(tae_runner=taf,
                              stats=None,
                              traj_logger=None,
                              rng=np.random.RandomState(42),
                              run_limit=100,
                              max_epochs=max_epochs)

    if isinstance(config, dict):
        # BUGFIX: the builtin `dict` was passed instead of the given
        # `config`, so user-supplied dict-configurations were never used.
        config = fix_types(configuration=config, configuration_space=cs)
        config = Configuration(configuration_space=cs, values=config)
    elif runcount_limit == 1:
        config = cs.get_default_configuration()
    else:
        smac = SMAC(scenario=ac_scenario,
                    tae_runner=taf,
                    rng=np.random.RandomState(seed),
                    intensifier=intensifier)
        smac.solver.runhistory.overwrite_existing_runs = True
        config = smac.optimize()

    print("Final Incumbent")
    print(config)

    # Train the final model on all (scaled) data with the chosen config.
    X_all = self.scalerX.fit_transform(X_all)
    y_all = np.log10(y_all)
    y_all = self.scalerY.fit_transform(y_all.reshape(-1, 1))[:, 0]

    K.clear_session()
    start_time = time.time()
    model = ParamFCNetRegression(config=config,
                                 n_feat=X_all.shape[1],
                                 expected_num_epochs=max_epochs,
                                 n_outputs=1,
                                 verbose=1)
    history = model.train(X_train=X_all, y_train=y_all,
                          X_valid=X_all, y_valid=y_all,
                          n_epochs=max_epochs)
    print("Training Time: %f" % (time.time() - start_time))
    self.model = model