Code Example #1
    def main(self,
             train_scenario_dn: str,
             test_scenario_dn: str = None):
        '''
            main method

            Arguments
            ---------
            train_scenario_dn: str
                directory name with ASlib scenario training data
            test_scenario_dn: str
                directory name with ASlib scenario test data
                (performance data is missing)
        '''

        # Read scenario files
        scenario = ASlibScenario()
        scenario.read_scenario(dn=train_scenario_dn)

        # fit on training data
        self.fit(scenario=scenario)

        # Read test files
        # ASlibScenario is not designed to read partial scenarios,
        # so we have to cheat a bit
        scenario = ASlibScenario()
        scenario.read_description(fn=os.path.join(test_scenario_dn, "description.txt"))
        scenario.read_feature_values(fn=os.path.join(test_scenario_dn, "feature_values.arff"))
        scenario.read_feature_runstatus(fn=os.path.join(test_scenario_dn, "feature_runstatus.arff"))

        # predict on test data
        self.predict(scenario=scenario)
Code Example #2
def write_to_database(scenario: ASlibScenario,
                      approach,
                      fold: int,
                      on_training=False):
    metrics = [Par10Metric(),
               NumberUnsolvedInstances(False),
               NumberUnsolvedInstances(True)]
    scenario_name = scenario.scenario
    scenario = ASlibScenario()
    if scenario_name == 'GLUHACK-18':
        scenario_name = 'GLUHACK-2018'
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    metric_results = _evaluate_train_test_split_mod(scenario, approach,
                                                    metrics, fold, on_training)

    db_config = load_configuration()
    for i, result in enumerate(metric_results):
        if on_training:
            name = 'training_' + approach.get_name()
            publish_results_to_database(db_config, scenario.scenario, fold,
                                        name, metrics[i].get_name(), result)
        else:
            publish_results_to_database(db_config, scenario.scenario, fold,
                                        approach.get_name(),
                                        metrics[i].get_name(), result)
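For reference, a minimal sketch of the PAR10 idea behind the Par10Metric used above; this is a hand-rolled illustration, not the project's implementation. Runs that hit the runtime cutoff are penalized at ten times the cutoff before averaging.

import numpy as np

def par10(runtimes, cutoff):
    # Penalized average runtime: runs at or above the cutoff count
    # as 10 * cutoff; everything else counts at face value.
    runtimes = np.asarray(runtimes, dtype=float)
    penalized = np.where(runtimes >= cutoff, 10.0 * cutoff, runtimes)
    return float(penalized.mean())

print(par10([10.0, 250.0, 3600.0], cutoff=3600.0))  # last run is a timeout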
Code Example #3
    def _transform_aslib_scenario_to_kebi_format(self, scenario_folder_path):

        # read scenario
        scenario = ASlibScenario()
        scenario.logger.disabled = True
        scenario.read_scenario(dn=str(scenario_folder_path))

        # prepare performance data and ranking data in a concatenated XY DataFrame
        X = scenario.feature_data
        Y = self._performances_to_rankings(scenario)
        X, Y = self._adapt_column_names_according_to_the_output_format(X, Y)
        # join_axes was removed in pandas 1.0; reindexing on X.index is equivalent
        XY_concatenation = pd.concat([X, Y], axis=1).reindex(X.index)

        # Save in CSV file
        output_file_path = os.path.join(str(self.absolute_path_output_folder),
                                        scenario.scenario + ".csv")
        XY_concatenation.to_csv(
            output_file_path,
            sep=self.separator,
            encoding='UTF-8',
            index=False,
            float_format='%g',
            na_rep=self.replacement_string_null_feature_values)

        # post step: add column types and empty line according to KEBI format to exported csv file
        self._add_value_type_column_name_line_in_kebi_formatted_csv(
            output_file_path, X.columns, Y.columns)
        return scenario
Code Example #4
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    print_stats_of_scenario(scenario)
    evaluate(scenario, approach, metrics,
             amount_of_training_scenario_instances, fold, db_config,
             tune_hyperparameters)
    return scenario_name
Code Example #5
File: asapy.py Project: rpplayground/asapy
    def read_scenario_ASlib(self, scenario_dn: str):
        '''
        Read scenario from ASlib format

        Arguments
        ---------
        scenario_dn: str
            Scenario directory name 
        '''

        self.scenario = ASlibScenario()
        self.scenario.read_scenario(dn=scenario_dn)
Code Example #6
File: oasc_test_scenario.py Project: bmmalone/as-asl
    def __init__(self, path):
        # read the parts of the aslib scenario which are present. This is adapted from
        # the example here: (in the predict method)
        #
        # https://github.com/mlindauer/OASC_starterkit/blob/master/oasc_starterkit/single_best.py
        
        scenario = ASlibScenario()
        scenario.read_description(fn=os.path.join(path,"description.txt"))
        scenario.read_feature_values(fn=os.path.join(path,"feature_values.arff"))
        scenario.read_feature_runstatus(fn=os.path.join(path,"feature_runstatus.arff"))

        scenario.instances = scenario.feature_data.index
        
        self.scenario = scenario
Code Example #7
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)

    if scenario_name in ['OPENML-WEKA-2017', 'TTP-2016']:
        metrics = [PerformanceMetric()]

    evaluate(scenario, approach, metrics,
             amount_of_training_scenario_instances, fold, db_config,
             tune_hyperparameters)
    return scenario_name
Code Example #8
File: asapy.py Project: rpplayground/asapy
    def read_scenario_CSV(self, csv_data: namedtuple):
        '''
        Read scenario from CSV format

        Arguments
        ---------
        csv_data: namedtuple
            namedtuple with the following fields: "perf_csv", "feat_csv", "obj", "cutoff", "maximize", "cv_csv" 
            "cv_csv" can be None
        '''
        self.scenario = ASlibScenario()
        self.scenario.read_from_csv(perf_fn=csv_data.perf_csv,
                                    feat_fn=csv_data.feat_csv,
                                    objective=csv_data.obj,
                                    runtime_cutoff=csv_data.cutoff,
                                    maximize=csv_data.maximize,
                                    cv_fn=csv_data.cv_csv)
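The csv_data argument above is described only by its field names; a minimal sketch of constructing it, where CSVData is a hypothetical name (not part of asapy) and all paths and settings are placeholders:

from collections import namedtuple

# Hypothetical namedtuple matching the fields the docstring lists;
# every value below is a placeholder.
CSVData = namedtuple("CSVData",
                     ["perf_csv", "feat_csv", "obj", "cutoff",
                      "maximize", "cv_csv"])

csv_data = CSVData(perf_csv="performance.csv",
                   feat_csv="features.csv",
                   obj="runtime",
                   cutoff=3600.0,
                   maximize=False,
                   cv_csv=None)  # cv_csv can be None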
Code Example #9
File: autofolio.py Project: bmmalone/AutoFolio
    def run_cli(self):
        '''
            main method of AutoFolio based on command line interface
        '''

        cmd_parser = CMDParser()
        args_, self.overwrite_args = cmd_parser.parse()

        self._root_logger.setLevel(args_.verbose)

        if args_.load:
            self.read_model_and_predict(
                model_fn=args_.load, feature_vec=list(map(float, args_.feature_vec)))
        else:

            scenario = ASlibScenario()
            if args_.scenario:
                scenario.read_scenario(args_.scenario)
            elif args_.performance_csv and args_.feature_csv:
                scenario.read_from_csv(perf_fn=args_.performance_csv,
                                       feat_fn=args_.feature_csv,
                                       objective=args_.objective,
                                       runtime_cutoff=args_.runtime_cutoff,
                                       maximize=args_.maximize,
                                       cv_fn=args_.cv_csv)
            else:
                raise ValueError("Missing inputs to read scenario data.")

            self.cs = self.get_cs(scenario)

            if args_.tune:
                config = self.get_tuned_config(scenario)
            else:
                config = self.cs.get_default_configuration()
            self.logger.debug(config)

            if args_.save:
                feature_pre_pipeline, pre_solver, selector = self.fit(
                    scenario=scenario, config=config)
                self._save_model(
                    args_.save, scenario, feature_pre_pipeline, pre_solver, selector, config)
            else:
                self.run_cv(config=config, scenario=scenario, folds=scenario.cv_data.max().max())
Code Example #10
    def __init__(self,
                 perf_fn: str,
                 feat_fn: str,
                 objective: str = "solution_quality",
                 runtime_cutoff: float = None,
                 maximize: bool = True,
                 cv_fn: str = None,
                 seed: int = 12345):
        """ Constructor """
        self.scenario = ASlibScenario()
        self.scenario.read_from_csv(perf_fn=perf_fn,
                                    feat_fn=feat_fn,
                                    objective=objective,
                                    runtime_cutoff=runtime_cutoff,
                                    maximize=maximize,
                                    cv_fn=cv_fn)
        self.seed = seed

        self.af = AutoFolio(random_seed=seed)
        self.logger = logging.getLogger("AF Facade")
Code Example #11
def _outer_cv(solver_fold, args, config):

    solver, fold = solver_fold

    # there are problems serializing the aslib scenario, so just read it again
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)
     
    msg = "Solver: {}, Fold: {}".format(solver, fold)
    logger.info(msg)

    msg = "Constructing template pipeline"
    logger.info(msg)
    pipeline = _get_pipeline(args, config, scenario)

    msg = "Extracting solver and fold performance data"
    logger.info(msg)
    
    testing, training = scenario.get_split(fold)
    X_train = training.feature_data
    y_train = training.performance_data[solver].values

    if 'log_performance_data' in config:
        y_train = np.log1p(y_train)
    
    msg = "Fitting the pipeline"
    logger.info(msg)
    pipeline = pipeline.fit(X_train, y_train)

    out = string.Template(args.out)
    out = out.substitute(solver=solver, fold=fold)

    msg = "Writing fit pipeline to disk: {}".format(out)
    logger.info(msg)
    joblib.dump(pipeline, out)

    return pipeline
Code Example #12
    def run_cli(self):
        '''
            main method of AutoFolio based on command line interface
        '''

        cmd_parser = CMDParser()
        args_, self.overwrite_args = cmd_parser.parse()

        self._root_logger.setLevel(args_.verbose)

        if args_.load:
            pred = self.read_model_and_predict(
                model_fn=args_.load, feature_vec=list(map(float, args_.feature_vec.split(" "))))
            print("Selected Schedule [(algorithm, budget)]: %s" % (pred))

        else:

            scenario = ASlibScenario()
            if args_.scenario:
                scenario.read_scenario(args_.scenario)
            elif args_.performance_csv and args_.feature_csv:
                scenario.read_from_csv(perf_fn=args_.performance_csv,
                                       feat_fn=args_.feature_csv,
                                       objective=args_.objective,
                                       runtime_cutoff=args_.runtime_cutoff,
                                       maximize=args_.maximize,
                                       cv_fn=args_.cv_csv)
            else:
                raise ValueError("Missing inputs to read scenario data.")

            test_scenario = None
            if args_.performance_test_csv and args_.feature_test_csv:
                test_scenario = ASlibScenario()
                test_scenario.read_from_csv(perf_fn=args_.performance_test_csv,
                                       feat_fn=args_.feature_test_csv,
                                       objective=args_.objective,
                                       runtime_cutoff=args_.runtime_cutoff,
                                       maximize=args_.maximize,
                                       cv_fn=None)

            config = {}
            if args_.config is not None:
                self.logger.info("Reading yaml config file")
                config = yaml.safe_load(open(args_.config))
            if not config.get("wallclock_limit"):
                config["wallclock_limit"] = args_.wallclock_limit
            if not config.get("runcount_limit"):
                config["runcount_limit"] = args_.runcount_limit
            if not config.get("output-dir"):
                config["output-dir"] = args_.output_dir

            self.cs = self.get_cs(scenario, config)

            if args_.outer_cv:
                self._outer_cv(scenario, config, args_.outer_cv_fold, 
                    args_.out_template, smac_seed=args_.smac_seed)
                return 0
            
            if args_.tune:
                config = self.get_tuned_config(scenario,
                                               wallclock_limit=args_.wallclock_limit,
                                               runcount_limit=args_.runcount_limit,
                                               autofolio_config=config,
                                               seed=args_.smac_seed)
            else:
                config = self.cs.get_default_configuration()
            self.logger.debug(config)

            if args_.save:
                feature_pre_pipeline, pre_solver, selector = self.fit(
                    scenario=scenario, config=config)
                self._save_model(
                    args_.save, scenario, feature_pre_pipeline, pre_solver, selector, config)
            else:
                self.run_cv(config=config, scenario=scenario, folds=int(scenario.cv_data.max().max()))

            if test_scenario is not None:
                stats = self.run_fold(config=config,
                                      fold=0,
                                      return_fit=False,
                                      scenario=scenario,
                                      test_scenario=test_scenario)
Code Example #13
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Validate the algorithm selection performance of the "
        "predictions made using test-as-auto-sklearn using "
        "autofolio.validation.validate.Validator.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('predictions',
                        help="The predictions file, from "
                        "test-as-auto-sklearn")

    parser.add_argument('--config',
                        help="A (yaml) config file which "
                        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}
        config['allowed_feature_groups'] = [scenario.feature_group_dict.keys()]

    # either way, update the scenario with the features used during training
    scenario.used_feature_groups = config['allowed_feature_groups']

    msg = "Reading predictions"
    logger.info(msg)
    predictions = pd.read_csv(args.predictions)

    msg = "Selecting the algorithm with smallest prediction for each instance"
    logger.info(msg)

    algorithm_selections = pandas_utils.get_group_extreme(
        predictions, "predicted", ex_type="min", group_fields="instance_id")

    msg = "Creating the schedules for the validator"
    logger.info(msg)

    schedules = parallel.apply_df_simple(algorithm_selections, _get_schedule,
                                         scenario.algorithm_cutoff_time)

    schedules = utils.merge_dicts(*schedules)

    val = Validator()
    performance_type = scenario.performance_type[0]

    if performance_type == "runtime":
        stats = val.validate_runtime(schedules=schedules,
                                     test_scenario=scenario)

    elif performance_type == "solution_quality":
        stats = val.validate_quality(schedules=schedules,
                                     test_scenario=scenario)

    else:
        msg = "Unknown performance type: {}".format(performance_type)
        raise ValueError(msg)

    msg = "=== RESULTS ==="
    logger.info(msg)
    stats.show()
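The selection step above keeps, for each instance, the row with the smallest predicted runtime. pandas_utils.get_group_extreme is a project helper; a plain-pandas sketch of the same group-wise minimum, on a toy frame with invented values:

import pandas as pd

# Toy predictions frame with the columns used above; values are made up.
predictions = pd.DataFrame({
    "instance_id": ["i1", "i1", "i2", "i2"],
    "solver": ["A", "B", "A", "B"],
    "predicted": [3.0, 1.0, 2.0, 5.0],
})

# For each instance, keep the row whose predicted value is smallest.
algorithm_selections = predictions.loc[
    predictions.groupby("instance_id")["predicted"].idxmin()
]
print(algorithm_selections)  # i1 -> solver B, i2 -> solver A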
Code Example #14
File: count.py Project: lteu/OASC_starterkit
# counts number of training instances
# License: BSD

import logging
logging.basicConfig(level="INFO")
import json
import time as tm
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

from aslib_scenario.aslib_scenario import ASlibScenario
from validate import Validator

if __name__ == "__main__":

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("--train",
                        help="Directory with *all* train data in ASlib format")

    args_ = parser.parse_args()

    start_time_fold = tm.time()
    train_scenario = ASlibScenario()
    train_scenario.read_scenario(dn=args_.train)
    print('num of training insts ', len(train_scenario.instances))
Code Example #15
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions ,etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template part of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned", 
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned", 
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int, 
        default=default_num_cpus)
    
    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "processes is spawned with this flag and otherwise the same "
        "arguments. This flag is not inended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solver is present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-auto-sklear]: the solver is not present in the "
                "ASlib scenario. given: {}. choices: {}".format(solver, 
                solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        #
        # There is a lot going on with settings these environment variables.
        # please see the following references:
        #
        #   Turning off assertions so we can parallelize sklearn across
        #   multiple CPUs for different solvers/folds
        #       https://github.com/celery/celery/issues/1709
        #
        #   Controlling OpenBLAS threads
        #       https://github.com/automl/auto-sklearn/issues/166
        #
        #   Other environment variables controlling thread usage
        #       http://stackoverflow.com/questions/30791550
        #
        ###
        
        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)
        
        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
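The ${solver} and ${fold} placeholders in the out argument above are standard string.Template syntax; a small self-contained check of the substitution, where the template string itself is a placeholder:

import string

# Hypothetical template of the kind passed as the 'out' argument.
out = string.Template("models/${solver}.fold-${fold}.pkl")
print(out.substitute(solver="minisat", fold=3))
# -> models/minisat.fold-3.pkl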
Code Example #16
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
        "writes the predictions to disk as a \"long\" data frame. The output "
        "file is in gzipped csv format.")
    
    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('model_template', help="A template string for the filenames for "
        "the learned models. ${solver} and ${fold} are the template part of "
        "the string. It is probably necessary to surround this argument with "
        "single quotes in order to prevent shell replacement of the template "
        "parts.")

    parser.add_argument('out', help="The output csv file")

    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)
    model_template = string.Template(args.model_template)

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)
        
    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)

    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)
    for solver, fold in it:
        
        model_file = model_template.substitute(solver=solver, fold=fold)
        
        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue
        
        try:
            model = joblib.load(model_file)
        except Exception:
            msg = ("Problem loading the model file. Skipping: {}".format(
                model_file))
            logger.warning(msg)
            continue
            
        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)
        
        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        if 'log_performance_data' in config:
            # exp transform it back out
            y_pred = np.expm1(y_pred)
            
        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred
        
        all_predictions.append(pred_df)
        
    msg = "Joining all predictions in a long data frame"
    logger.info(msg)
    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)

    utils.write_df(all_predictions, args.out, index=False)
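Code Examples #11 and #16 pair np.log1p at training time with np.expm1 at prediction time; a minimal round-trip check of that transform:

import numpy as np

# log1p/expm1 are exact inverses, and log1p stays well-defined at
# a runtime of zero, unlike a plain log transform.
y = np.array([0.0, 1.5, 3600.0])
assert np.allclose(np.expm1(np.log1p(y)), y)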
Code Example #17
File: validate_cli.py Project: Lucab95/Sunny
if __name__ == "__main__":

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--result_fn",
        help="Result json file with predictions for each test instances")
    parser.add_argument("--test_as",
                        help="Directory with *all* test data in ASlib format")
    parser.add_argument("--train_as",
                        help="Directory with *all* train data in ASlib format")

    args_ = parser.parse_args()

    start_time_fold = tm.time()
    # read scenarios
    test_scenario = ASlibScenario()
    test_scenario.read_scenario(dn=args_.test_as)
    train_scenario = ASlibScenario()
    train_scenario.read_scenario(dn=args_.train_as)

    # read result file
    with open(args_.result_fn) as fp:
        schedules = json.load(fp)

    validator = Validator()

    if test_scenario.performance_type[0] == "runtime":
        validator.validate_runtime(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
    else:
        # solution-quality scenarios take the other validation path
        validator.validate_quality(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
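For reference, the schedules object loaded from the result JSON above maps each instance to a list of (algorithm, budget) pairs, matching the "Selected Schedule [(algorithm, budget)]" output printed in Code Example #12. A hypothetical instance of that shape, with invented names and budgets:

# Hypothetical schedules structure of the kind passed to the validator;
# instance names, algorithm names, and budgets (in seconds) are placeholders.
schedules = {
    "instance-001": [("solverA", 3600.0)],
    "instance-002": [("solverB", 1800.0), ("solverA", 1800.0)],
}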