def main(self, train_scenario_dn: str, test_scenario_dn: str = None):
    '''
        main method

        Arguments
        ---------
        train_scenario_dn: str
            directory name with ASlib scenario training data
        test_scenario_dn: str
            directory name with ASlib scenario test data
            (performance data is missing)
    '''

    # Read scenario files
    scenario = ASlibScenario()
    scenario.read_scenario(dn=train_scenario_dn)

    # fit on training data
    self.fit(scenario=scenario)

    # Read test files
    # ASlibScenario is not designed to read partial scenarios;
    # therefore, we have to cheat a bit
    scenario = ASlibScenario()
    scenario.read_description(fn=os.path.join(test_scenario_dn, "description.txt"))
    scenario.read_feature_values(fn=os.path.join(test_scenario_dn, "feature_values.arff"))
    scenario.read_feature_runstatus(fn=os.path.join(test_scenario_dn, "feature_runstatus.arff"))

    # predict on test data
    self.predict(scenario=scenario)
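# Hedged usage sketch for the method above. `MySelector` is a placeholder
# class name (the snippet does not name its class), and the paths are
# illustrative; a real run needs ASlib directories containing the files
# read above (description.txt, feature_values.arff, feature_runstatus.arff).
selector = MySelector()
selector.main(train_scenario_dn="path/to/train_scenario",
              test_scenario_dn="path/to/test_scenario")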
def write_to_database(scenario: ASlibScenario, approach, fold: int, on_training=False):
    metrics = list()
    metrics.append(Par10Metric())
    metrics.append(NumberUnsolvedInstances(False))
    metrics.append(NumberUnsolvedInstances(True))

    # re-read the full scenario from disk; the GLUHACK-18 scenario is stored
    # under the directory name GLUHACK-2018
    scenario_name = scenario.scenario
    scenario = ASlibScenario()
    if scenario_name == 'GLUHACK-18':
        scenario_name = 'GLUHACK-2018'
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)

    metric_results = _evaluate_train_test_split_mod(scenario, approach,
                                                    metrics, fold, on_training)
    db_config = load_configuration()
    for i, result in enumerate(metric_results):
        if on_training:
            name = 'training_' + approach.get_name()
            publish_results_to_database(db_config, scenario.scenario, fold,
                                        name, metrics[i].get_name(), result)
        else:
            publish_results_to_database(db_config, scenario.scenario, fold,
                                        approach.get_name(),
                                        metrics[i].get_name(), result)
def _transform_aslib_scenario_to_kebi_format(self, scenario_folder_path):
    # read scenario
    scenario = ASlibScenario()
    scenario.logger.disabled = True
    scenario.read_scenario(dn=str(scenario_folder_path))

    # prepare performance data and ranking data in one concatenated DataFrame
    X = scenario.feature_data
    Y = self._performances_to_rankings(scenario)
    X, Y = self._adapt_column_names_according_to_the_output_format(X, Y)
    # note: pd.concat(..., join_axes=[X.index]) was removed in pandas 1.0;
    # concatenating and then reindexing to X.index is the equivalent
    XY_concatenation = pd.concat([X, Y], axis=1).reindex(X.index)

    # save as CSV file
    output_file_path = os.path.join(str(self.absolute_path_output_folder),
                                    scenario.scenario + ".csv")
    XY_concatenation.to_csv(output_file_path,
                            sep=self.separator,
                            encoding='UTF-8',
                            index=False,
                            float_format='%g',
                            na_rep=self.replacement_string_null_feature_values)

    # post step: add column types and an empty line to the exported CSV file,
    # according to the KEBI format
    self._add_value_type_column_name_line_in_kebi_formatted_csv(
        output_file_path, X.columns, Y.columns)
    return scenario
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    print_stats_of_scenario(scenario)
    evaluate(scenario, approach, metrics, amount_of_training_scenario_instances,
             fold, db_config, tune_hyperparameters)
    return scenario_name
def read_scenario_ASlib(self, scenario_dn: str):
    '''
        Read scenario from ASlib format

        Arguments
        ---------
        scenario_dn: str
            Scenario directory name
    '''
    self.scenario = ASlibScenario()
    self.scenario.read_scenario(dn=scenario_dn)
def __init__(self, path):
    # read the parts of the aslib scenario which are present. This is adapted
    # from the example here (in the predict method):
    #
    # https://github.com/mlindauer/OASC_starterkit/blob/master/oasc_starterkit/single_best.py
    scenario = ASlibScenario()
    scenario.read_description(fn=os.path.join(path, "description.txt"))
    scenario.read_feature_values(fn=os.path.join(path, "feature_values.arff"))
    scenario.read_feature_runstatus(fn=os.path.join(path, "feature_runstatus.arff"))
    scenario.instances = scenario.feature_data.index
    self.scenario = scenario
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    # for these scenarios, replace the metric list with the plain
    # performance metric
    if scenario_name in ['OPENML-WEKA-2017', 'TTP-2016']:
        metrics = list()
        metrics.append(PerformanceMetric())
    evaluate(scenario, approach, metrics, amount_of_training_scenario_instances,
             fold, db_config, tune_hyperparameters)
    return scenario_name
def read_scenario_CSV(self, csv_data: namedtuple):
    '''
        Read scenario from CSV format

        Arguments
        ---------
        csv_data: namedtuple
            namedtuple with the following fields: "perf_csv",
            "feat_csv", "obj", "cutoff", "maximize", "cv_csv"
            ("cv_csv" can be None)
    '''
    self.scenario = ASlibScenario()
    self.scenario.read_from_csv(perf_fn=csv_data.perf_csv,
                                feat_fn=csv_data.feat_csv,
                                objective=csv_data.obj,
                                runtime_cutoff=csv_data.cutoff,
                                maximize=csv_data.maximize,
                                cv_fn=csv_data.cv_csv)
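# A minimal sketch of building the `csv_data` argument for read_scenario_CSV.
# The field names come from the docstring above; the file paths and cutoff
# value are placeholders.
from collections import namedtuple

CSVData = namedtuple("CSVData",
                     ["perf_csv", "feat_csv", "obj", "cutoff", "maximize", "cv_csv"])
csv_data = CSVData(perf_csv="performance.csv",
                   feat_csv="features.csv",
                   obj="runtime",
                   cutoff=5000.0,
                   maximize=False,
                   cv_csv=None)  # "cv_csv" can be None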
def run_cli(self):
    '''
        main method of AutoFolio based on command line interface
    '''
    cmd_parser = CMDParser()
    args_, self.overwrite_args = cmd_parser.parse()

    self._root_logger.setLevel(args_.verbose)

    if args_.load:
        self.read_model_and_predict(
            model_fn=args_.load,
            feature_vec=list(map(float, args_.feature_vec)))
    else:
        scenario = ASlibScenario()
        if args_.scenario:
            scenario.read_scenario(args_.scenario)
        elif args_.performance_csv and args_.feature_csv:
            scenario.read_from_csv(perf_fn=args_.performance_csv,
                                   feat_fn=args_.feature_csv,
                                   objective=args_.objective,
                                   runtime_cutoff=args_.runtime_cutoff,
                                   maximize=args_.maximize,
                                   cv_fn=args_.cv_csv)
        else:
            raise ValueError("Missing inputs to read scenario data.")

        self.cs = self.get_cs(scenario)

        if args_.tune:
            config = self.get_tuned_config(scenario)
        else:
            config = self.cs.get_default_configuration()
        self.logger.debug(config)

        if args_.save:
            feature_pre_pipeline, pre_solver, selector = self.fit(
                scenario=scenario, config=config)
            self._save_model(args_.save, scenario, feature_pre_pipeline,
                             pre_solver, selector, config)
        else:
            self.run_cv(config=config, scenario=scenario,
                        folds=scenario.cv_data.max().max())
def __init__(self, perf_fn: str, feat_fn: str,
             objective: str = "solution_quality",
             runtime_cutoff: float = None,
             maximize: bool = True,
             cv_fn: str = None,
             seed: int = 12345):
    """ Constructor """
    self.scenario = ASlibScenario()
    self.scenario.read_from_csv(perf_fn=perf_fn,
                                feat_fn=feat_fn,
                                objective=objective,
                                runtime_cutoff=runtime_cutoff,
                                maximize=maximize,
                                cv_fn=cv_fn)
    self.seed = seed
    self.af = AutoFolio(random_seed=seed)
    self.logger = logging.getLogger("AF Facade")
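# Hedged usage sketch for the constructor above, assuming the surrounding
# class is AutoFolio's CSV facade (referred to here as `AFCsvFacade`, an
# assumption); the file paths and parameter values are placeholders that
# mirror the constructor signature shown.
facade = AFCsvFacade(perf_fn="performance.csv",
                     feat_fn="features.csv",
                     objective="runtime",
                     runtime_cutoff=5000.0,
                     maximize=False,
                     seed=42)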
def _outer_cv(solver_fold, args, config):
    solver, fold = solver_fold

    # there are problems serializing the aslib scenario, so just read it again
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    msg = "Solver: {}, Fold: {}".format(solver, fold)
    logger.info(msg)

    msg = "Constructing template pipeline"
    logger.info(msg)
    pipeline = _get_pipeline(args, config, scenario)

    msg = "Extracting solver and fold performance data"
    logger.info(msg)
    testing, training = scenario.get_split(fold)
    X_train = training.feature_data
    y_train = training.performance_data[solver].values

    if 'log_performance_data' in config:
        y_train = np.log1p(y_train)

    msg = "Fitting the pipeline"
    logger.info(msg)
    pipeline = pipeline.fit(X_train, y_train)

    out = string.Template(args.out)
    out = out.substitute(solver=solver, fold=fold)

    msg = "Writing fit pipeline to disk: {}".format(out)
    logger.info(msg)
    joblib.dump(pipeline, out)

    return pipeline
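# How the ${solver}/${fold} output template used above expands, as a small
# self-contained sketch (the template string itself is illustrative):
import string

out_template = string.Template("models/${solver}.fold-${fold}.pkl")
print(out_template.substitute(solver="minisat", fold=3))
# -> models/minisat.fold-3.pkl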
def run_cli(self):
    '''
        main method of AutoFolio based on command line interface
    '''
    cmd_parser = CMDParser()
    args_, self.overwrite_args = cmd_parser.parse()

    self._root_logger.setLevel(args_.verbose)

    if args_.load:
        pred = self.read_model_and_predict(
            model_fn=args_.load,
            feature_vec=list(map(float, args_.feature_vec.split(" "))))
        print("Selected Schedule [(algorithm, budget)]: %s" % (pred))
    else:
        scenario = ASlibScenario()
        if args_.scenario:
            scenario.read_scenario(args_.scenario)
        elif args_.performance_csv and args_.feature_csv:
            scenario.read_from_csv(perf_fn=args_.performance_csv,
                                   feat_fn=args_.feature_csv,
                                   objective=args_.objective,
                                   runtime_cutoff=args_.runtime_cutoff,
                                   maximize=args_.maximize,
                                   cv_fn=args_.cv_csv)
        else:
            raise ValueError("Missing inputs to read scenario data.")

        test_scenario = None
        if args_.performance_test_csv and args_.feature_test_csv:
            test_scenario = ASlibScenario()
            test_scenario.read_from_csv(perf_fn=args_.performance_test_csv,
                                        feat_fn=args_.feature_test_csv,
                                        objective=args_.objective,
                                        runtime_cutoff=args_.runtime_cutoff,
                                        maximize=args_.maximize,
                                        cv_fn=None)

        config = {}
        if args_.config is not None:
            self.logger.info("Reading yaml config file")
            # safe_load avoids the unsafe-loader error of plain yaml.load
            # in recent PyYAML versions
            with open(args_.config) as fp:
                config = yaml.safe_load(fp)
        if not config.get("wallclock_limit"):
            config["wallclock_limit"] = args_.wallclock_limit
        if not config.get("runcount_limit"):
            config["runcount_limit"] = args_.runcount_limit
        if not config.get("output-dir"):
            config["output-dir"] = args_.output_dir

        self.cs = self.get_cs(scenario, config)

        if args_.outer_cv:
            self._outer_cv(scenario, config, args_.outer_cv_fold,
                           args_.out_template, smac_seed=args_.smac_seed)
            return 0

        if args_.tune:
            config = self.get_tuned_config(scenario,
                                           wallclock_limit=args_.wallclock_limit,
                                           runcount_limit=args_.runcount_limit,
                                           autofolio_config=config,
                                           seed=args_.smac_seed)
        else:
            config = self.cs.get_default_configuration()
        self.logger.debug(config)

        if args_.save:
            feature_pre_pipeline, pre_solver, selector = self.fit(
                scenario=scenario, config=config)
            self._save_model(args_.save, scenario, feature_pre_pipeline,
                             pre_solver, selector, config)
        else:
            self.run_cv(config=config, scenario=scenario,
                        folds=int(scenario.cv_data.max().max()))

        if test_scenario is not None:
            stats = self.run_fold(config=config, fold=0, return_fit=False,
                                  scenario=scenario,
                                  test_scenario=test_scenario)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Validate the algorithm selection performance of the "
        "predictions made using test-as-auto-sklearn using "
        "autofolio.validation.validate.Validator.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('predictions', help="The predictions file, from "
        "test-as-auto-sklearn")
    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}
        config['allowed_feature_groups'] = [scenario.feature_group_dict.keys()]

    # either way, update the scenario with the features used during training
    scenario.used_feature_groups = config['allowed_feature_groups']

    msg = "Reading predictions"
    logger.info(msg)
    predictions = pd.read_csv(args.predictions)

    msg = "Selecting the algorithm with the smallest prediction for each instance"
    logger.info(msg)
    algorithm_selections = pandas_utils.get_group_extreme(
        predictions, "predicted", ex_type="min", group_fields="instance_id")

    msg = "Creating the schedules for the validator"
    logger.info(msg)
    schedules = parallel.apply_df_simple(algorithm_selections, _get_schedule,
                                         scenario.algorithm_cutoff_time)
    schedules = utils.merge_dicts(*schedules)

    val = Validator()
    performance_type = scenario.performance_type[0]

    if performance_type == "runtime":
        stats = val.validate_runtime(schedules=schedules, test_scenario=scenario)
    elif performance_type == "solution_quality":
        stats = val.validate_quality(schedules=schedules, test_scenario=scenario)
    else:
        msg = "Unknown performance type: {}".format(performance_type)
        raise ValueError(msg)

    msg = "=== RESULTS ==="
    logger.info(msg)
    stats.show()
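# The per-instance argmin selection performed by pandas_utils.get_group_extreme
# above can be reproduced with plain pandas; a sketch, assuming the predictions
# frame has the "instance_id" and "predicted" columns used in the script:
import pandas as pd

def select_min_per_instance(predictions: pd.DataFrame) -> pd.DataFrame:
    # for every instance_id, keep the row with the smallest predicted value
    idx = predictions.groupby("instance_id")["predicted"].idxmin()
    return predictions.loc[idx]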
# counts number of training instances
# License: BSD

import logging
logging.basicConfig(level="INFO")

import json
import time as tm
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

from aslib_scenario.aslib_scenario import ASlibScenario

from validate import Validator

if __name__ == "__main__":
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train", help="Directory with *all* train data in ASlib format")
    args_ = parser.parse_args()

    start_time_fold = tm.time()

    train_scenario = ASlibScenario()
    train_scenario.read_scenario(dn=args_.train)

    print('num of training insts ', len(train_scenario.instances))
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions, etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template parts of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned",
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned",
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int,
        default=default_num_cpus)

    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "process is spawned with this flag and otherwise the same "
        "arguments. This flag is not intended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solvers are present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-as-auto-sklearn]: the solver is not present in the "
                "ASlib scenario. given: {}. choices: {}".format(
                solver, solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        #
        # There is a lot going on with setting these environment variables.
        # please see the following references:
        #
        # Turning off assertions so we can parallelize sklearn across
        # multiple CPUs for different solvers/folds
        # https://github.com/celery/celery/issues/1709
        #
        # Controlling OpenBLAS threads
        # https://github.com/automl/auto-sklearn/issues/166
        #
        # Other environment variables controlling thread usage
        # http://stackoverflow.com/questions/30791550
        #
        ###

        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)

        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
        "writes the predictions to disk as a \"long\" data frame. The output "
        "file is in gzipped csv format.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('model_template', help="A template string for the "
        "filenames for the learned models. ${solver} and ${fold} are the "
        "template parts of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")
    parser.add_argument('out', help="The output csv file")
    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)
    model_template = string.Template(args.model_template)

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)
    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)
    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)
    for solver, fold in it:
        model_file = model_template.substitute(solver=solver, fold=fold)

        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue

        try:
            model = joblib.load(model_file)
        except Exception:
            msg = ("Problem loading the model file. Skipping: {}".format(
                model_file))
            logger.warning(msg)
            continue

        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)

        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        # the original condition was "if 'log_performance_data':", which is
        # always true; the membership test matches the training code
        if 'log_performance_data' in config:
            # exp transform it back out
            y_pred = np.expm1(y_pred)

        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred

        all_predictions.append(pred_df)

    msg = "Joining all predictions in a long data frame"
    logger.info(msg)
    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)
    utils.write_df(all_predictions, args.out, index=False)
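# Why expm1 above: when 'log_performance_data' is set, training applied
# np.log1p to the runtimes (see _outer_cv), so predictions are mapped back
# with its inverse. A quick self-contained check:
import numpy as np

y = np.array([0.0, 1.5, 3600.0])
assert np.allclose(np.expm1(np.log1p(y)), y)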
if __name__ == "__main__":
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--result_fn",
        help="Result json file with predictions for each test instance")
    parser.add_argument("--test_as", help="Directory with *all* test data in ASlib format")
    parser.add_argument("--train_as", help="Directory with *all* train data in ASlib format")
    args_ = parser.parse_args()

    start_time_fold = tm.time()

    # read scenarios
    test_scenario = ASlibScenario()
    test_scenario.read_scenario(dn=args_.test_as)
    train_scenario = ASlibScenario()
    train_scenario.read_scenario(dn=args_.train_as)

    # read result file
    with open(args_.result_fn) as fp:
        schedules = json.load(fp)

    validator = Validator()
    if test_scenario.performance_type[0] == "runtime":
        validator.validate_runtime(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
    else:
        # the original snippet is truncated here; quality scenarios are
        # presumably scored with the validator's quality method, mirroring
        # the runtime/quality branching used elsewhere in this code base
        validator.validate_quality(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)