def write_to_database(scenario: ASlibScenario,
                      approach,
                      fold: int,
                      on_training=False):
    metrics = list()
    metrics.append(Par10Metric())
    metrics.append(NumberUnsolvedInstances(False))
    metrics.append(NumberUnsolvedInstances(True))
    scenario_name = scenario.scenario
    scenario = ASlibScenario()
    if scenario_name == 'GLUHACK-18':
        scenario_name = 'GLUHACK-2018'
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    metric_results = _evaluate_train_test_split_mod(scenario, approach,
                                                    metrics, fold, on_training)

    db_config = load_configuration()
    for i, result in enumerate(metric_results):
        if on_training:
            name = 'training_' + approach.get_name()
            publish_results_to_database(db_config, scenario.scenario, fold,
                                        name, metrics[i].get_name(), result)
        else:
            publish_results_to_database(db_config, scenario.scenario, fold,
                                        approach.get_name(),
                                        metrics[i].get_name(), result)
    def _transform_aslib_scenario_to_kebi_format(self, scenario_folder_path):

        # read scenario
        scenario = ASlibScenario()
        scenario.logger.disabled = True
        scenario.read_scenario(dn=str(scenario_folder_path))

        # prepare feature data and ranking data in a concatenated DataFrame
        X = scenario.feature_data
        Y = self._performances_to_rankings(scenario)
        X, Y = self._adapt_column_names_according_to_the_output_format(X, Y)
        # `join_axes` was removed from pandas; reindexing on X is the equivalent
        XY_concatenation = pd.concat([X, Y], axis=1).reindex(X.index)

        # Save in CSV file
        output_file_path = os.path.join(str(self.absolute_path_output_folder),
                                        scenario.scenario + ".csv")
        XY_concatenation.to_csv(
            output_file_path,
            sep=self.separator,
            encoding='UTF-8',
            index=False,
            float_format='%g',
            na_rep=self.replacement_string_null_feature_values)

        # post step: add column types and empty line according to KEBI format to exported csv file
        self._add_value_type_column_name_line_in_kebi_formatted_csv(
            output_file_path, X.columns, Y.columns)
        return scenario
# Example 3
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)
    print_stats_of_scenario(scenario)
    evaluate(scenario, approach, metrics,
             amount_of_training_scenario_instances, fold, db_config,
             tune_hyperparameters)
    return scenario_name
# Example 4
    def read_scenario_ASlib(self, scenario_dn: str):
        '''
        Read scenario from ASlib format

        Arguments
        ---------
        scenario_dn: str
            Scenario directory name 
        '''

        self.scenario = ASlibScenario()
        self.scenario.read_scenario(dn=scenario_dn)
# Example 5
def evaluate_scenario(scenario_name: str, approach, metrics,
                      amount_of_training_scenario_instances: int, fold: int,
                      db_config, tune_hyperparameters: bool):
    scenario = ASlibScenario()
    scenario.read_scenario('data/aslib_data-master/' + scenario_name)

    if scenario_name in ['OPENML-WEKA-2017', 'TTP-2016']:
        metrics = list()
        metrics.append(PerformanceMetric())

    evaluate(scenario, approach, metrics,
             amount_of_training_scenario_instances, fold, db_config,
             tune_hyperparameters)
    return scenario_name
# Example 6
    def _save_model(self, out_fn: str, scenario: ASlibScenario, feature_pre_pipeline: list, pre_solver: Aspeed, selector, config: Configuration):
        '''
            save all pipeline objects for predictions

            Arguments
            ---------
            out_fn: str
                filename of output file
            scenario: ASlibScenario
                ASlib scenario with all the data
            feature_pre_pipeline: list
                list of preprocessing objects
            pre_solver: Aspeed
                aspeed object with pre-solving schedule
            selector: autofolio.selector.*
                fitted selector object
            config: Configuration
                parameter setting configuration
        '''
        scenario.logger = None
        for fpp in feature_pre_pipeline:
            fpp.logger = None
        if pre_solver:
            pre_solver.logger = None
        selector.logger = None
        model = [scenario, feature_pre_pipeline, pre_solver, selector, config]
        with open(out_fn, "bw") as fp:
            pickle.dump(model, fp)
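
# Illustrative sketch (not part of the original examples): reading back a model
# pickled by `_save_model` above. The five-element list layout mirrors the
# `model` variable in `_save_model`; `model_fn` is a hypothetical file name.
import pickle

def load_saved_model(model_fn: str):
    """Unpickle the pipeline objects written by `_save_model`."""
    with open(model_fn, "rb") as fp:
        scenario, feature_pre_pipeline, pre_solver, selector, config = pickle.load(fp)
    return scenario, feature_pre_pipeline, pre_solver, selector, config
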
# Example 7
    def transform(self, scenario: ASlibScenario):
        '''
            transform ASLib scenario data

            Arguments
            ---------
            scenario: data.aslib_scenario.ASlibScenario
                ASlib Scenario with all data in pandas

            Returns
            -------
            data.aslib_scenario.ASlibScenario
        '''
        if self.scaler:
            self.logger.debug("Applying StandardScaler")

            values = self.scaler.transform(
                np.array(scenario.feature_data.values))

            scenario.feature_data = pd.DataFrame(
                data=values,
                index=scenario.feature_data.index,
                columns=scenario.feature_data.columns)

        return scenario
# Example 8
    def read_scenario_CSV(self, csv_data: namedtuple):
        '''
        Read scenario from CSV format

        Arguments
        ---------
        csv_data: namedtuple
            namedtuple with the following fields: "perf_csv", "feat_csv", "obj", "cutoff", "maximize", "cv_csv" 
            "cv_csv" can be None
        '''
        self.scenario = ASlibScenario()
        self.scenario.read_from_csv(perf_fn=csv_data.perf_csv,
                                    feat_fn=csv_data.feat_csv,
                                    objective=csv_data.obj,
                                    runtime_cutoff=csv_data.cutoff,
                                    maximize=csv_data.maximize,
                                    cv_fn=csv_data.cv_csv)
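
# Illustrative sketch (not part of the original examples): building the
# `csv_data` namedtuple expected by `read_scenario_CSV` above. The field names
# follow its docstring; the file names and values are hypothetical.
from collections import namedtuple

CSVData = namedtuple("CSVData",
                     ["perf_csv", "feat_csv", "obj", "cutoff", "maximize", "cv_csv"])

csv_data = CSVData(perf_csv="performance.csv",
                   feat_csv="features.csv",
                   obj="runtime",
                   cutoff=3600,
                   maximize=False,
                   cv_csv=None)  # "cv_csv" may be None, as the docstring notes
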
# Example 9
    def __init__(self, path):
        # read the parts of the aslib scenario which are present. This is adapted from
        # the example here: (in the predict method)
        #
        # https://github.com/mlindauer/OASC_starterkit/blob/master/oasc_starterkit/single_best.py
        
        scenario = ASlibScenario()
        scenario.read_description(fn=os.path.join(path,"description.txt"))
        scenario.read_feature_values(fn=os.path.join(path,"feature_values.arff"))
        scenario.read_feature_runstatus(fn=os.path.join(path,"feature_runstatus.arff"))

        scenario.instances = scenario.feature_data.index
        
        self.scenario = scenario
# Example 10
    def __init__(self,
                 perf_fn: str,
                 feat_fn: str,
                 objective: str = "solution_quality",
                 runtime_cutoff: float = None,
                 maximize: bool = True,
                 cv_fn: str = None,
                 seed: int = 12345):
        """ Constructor """
        self.scenario = ASlibScenario()
        self.scenario.read_from_csv(perf_fn=perf_fn,
                                    feat_fn=feat_fn,
                                    objective=objective,
                                    runtime_cutoff=runtime_cutoff,
                                    maximize=maximize,
                                    cv_fn=cv_fn)
        self.seed = seed

        self.af = AutoFolio(random_seed=seed)
        self.logger = logging.getLogger("AF Facade")
# Example 11
    def fit(self, scenario: ASlibScenario, fold: int,
            amount_of_training_instances: int):
        print("Run fit on " + self.get_name() + " for fold " + str(fold))
        self.num_algorithms = len(scenario.algorithms)

        # create all bootstrap samples
        bootstrap_samples, out_of_sample_samples = self.generate_bootstrap_sample(
            scenario, fold, self.num_base_learner)

        weights_denorm = list()

        # train each base learner on a different sample
        for index in range(self.num_base_learner):
            self.current_iteration = index + 1
            self.base_learners.append(copy.deepcopy(self.base_learner))
            original_scenario = copy.deepcopy(scenario)
            scenario.feature_data, scenario.performance_data, scenario.runstatus_data, scenario.feature_runstatus_data, scenario.feature_cost_data = bootstrap_samples[
                index]
            self.base_learners[index].fit(scenario, fold,
                                          amount_of_training_instances)
            if self.weighting:
                if self.weight_type == 'oos':
                    scenario.feature_data, scenario.performance_data, scenario.runstatus_data, scenario.feature_runstatus_data, scenario.feature_cost_data = out_of_sample_samples[
                        index]
                elif self.weight_type == 'original_set':
                    scenario = original_scenario
                weights_denorm.append(
                    base_learner_performance(scenario,
                                             len(scenario.feature_data),
                                             self.base_learners[index]))

            #if self.current_iteration != self.num_base_learner:
            #    write_to_database(scenario, self, fold)

        # Turn around values (lowest (best) gets highest weight) and normalize
        weights_denorm = [
            max(weights_denorm) / float(i + 1) for i in weights_denorm
        ]
        self.weights = [float(i) / max(weights_denorm) for i in weights_denorm]
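
# Illustrative sketch (not part of the original examples): the weight
# inversion/normalisation from the end of `fit` above, applied to toy
# base-learner scores (lower is better). The numbers are hypothetical.
scores = [10.0, 5.0, 20.0]
inverted = [max(scores) / float(s + 1) for s in scores]   # ~[1.82, 3.33, 0.95]
weights = [w / max(inverted) for w in inverted]           # ~[0.55, 1.00, 0.29]
# The best (lowest) score, 5.0, receives the largest weight, 1.0.
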
# Example 12
def _outer_cv(solver_fold, args, config):

    solver, fold = solver_fold

    # there are problems serializing the aslib scenario, so just read it again
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)
     
    msg = "Solver: {}, Fold: {}".format(solver, fold)
    logger.info(msg)

    msg = "Constructing template pipeline"
    logger.info(msg)
    pipeline = _get_pipeline(args, config, scenario)

    msg = "Extracting solver and fold performance data"
    logger.info(msg)
    
    testing, training = scenario.get_split(fold)
    X_train = training.feature_data
    y_train = training.performance_data[solver].values

    if 'log_performance_data' in config:
        y_train = np.log1p(y_train)
    
    msg = "Fitting the pipeline"
    logger.info(msg)
    pipeline = pipeline.fit(X_train, y_train)

    out = string.Template(args.out)
    out = out.substitute(solver=solver, fold=fold)

    msg = "Writing fit pipeline to disk: {}".format(out)
    logger.info(msg)
    joblib.dump(pipeline, out)

    return pipeline
def split_scenario(scenario: ASlibScenario, sub_fold: int, num_instances: int):
    fold_len = int(num_instances / 10)
    instances = scenario.instances
    if sub_fold < 10:
        test_insts = instances[(sub_fold - 1) * fold_len:sub_fold * fold_len]
        training_insts = instances[:(sub_fold - 1) * fold_len]
        training_insts = np.append(training_insts, instances[sub_fold * fold_len:])
    else:
        test_insts = instances[(sub_fold - 1) * fold_len:]
        training_insts = instances[:(sub_fold - 1) * fold_len]

    test = copy.copy(scenario)
    training = copy.copy(scenario)

    # feature_data
    test.feature_data = test.feature_data.drop(training_insts).sort_index()
    training.feature_data = training.feature_data.drop(test_insts).sort_index()

    # performance_data
    test.performance_data = test.performance_data.drop(training_insts).sort_index()
    training.performance_data = training.performance_data.drop(test_insts).sort_index()

    # runstatus_data
    test.runstatus_data = test.runstatus_data.drop(training_insts).sort_index()
    training.runstatus_data = training.runstatus_data.drop(test_insts).sort_index()

    # feature_runstatus_data
    test.feature_runstatus_data = test.feature_runstatus_data.drop(training_insts).sort_index()
    training.feature_runstatus_data = training.feature_runstatus_data.drop(test_insts).sort_index()

    # feature_cost_data
    if scenario.feature_cost_data is not None:
        test.feature_cost_data = test.feature_cost_data.drop(training_insts).sort_index()
        training.feature_cost_data = training.feature_cost_data.drop(test_insts).sort_index()

    # ground_truth_data
    if scenario.ground_truth_data is not None:
        test.ground_truth_data = test.ground_truth_data.drop(training_insts).sort_index()
        training.ground_truth_data = training.ground_truth_data.drop(test_insts).sort_index()

    test.cv_data = None
    training.cv_data = None

    test.instances = test_insts
    training.instances = training_insts

    scenario.used_feature_groups = None

    return test, training
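
# Illustrative sketch (not part of the original examples): using
# `split_scenario` above to carve out an inner sub-fold. The import path is
# taken from the docstrings elsewhere in these examples; the scenario
# directory is a hypothetical example.
from aslib_scenario.aslib_scenario import ASlibScenario

scenario = ASlibScenario()
scenario.read_scenario('data/aslib_data-master/SAT11-INDU')
test, training = split_scenario(scenario, sub_fold=1,
                                num_instances=len(scenario.instances))
print(len(test.instances), len(training.instances))
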
def _evaluate_train_test_split_mod(scenario: ASlibScenario, approach, metrics,
                                   fold: int, on_training):
    test_scenario, train_scenario = scenario.get_split(indx=fold)

    if on_training:
        test_scenario = train_scenario

    approach_metric_values = np.zeros(len(metrics))

    num_counted_test_values = 0

    feature_data = test_scenario.feature_data.to_numpy()
    performance_data = test_scenario.performance_data.to_numpy()
    feature_cost_data = test_scenario.feature_cost_data.to_numpy(
    ) if test_scenario.feature_cost_data is not None else None

    for instance_id in range(0, len(test_scenario.instances)):
        X_test = feature_data[instance_id]
        y_test = performance_data[instance_id]

        accumulated_feature_time = 0
        if test_scenario.feature_cost_data is not None and approach.get_name(
        ) != 'sbs' and approach.get_name() != 'oracle':
            feature_time = feature_cost_data[instance_id]
            accumulated_feature_time = np.sum(feature_time)

        contains_non_censored_value = False
        for y_element in y_test:
            if y_element < test_scenario.algorithm_cutoff_time:
                contains_non_censored_value = True
        if contains_non_censored_value:
            num_counted_test_values += 1
            predicted_scores = approach.predict(X_test, instance_id)
            for i, metric in enumerate(metrics):
                runtime = metric.evaluate(y_test, predicted_scores,
                                          accumulated_feature_time,
                                          scenario.algorithm_cutoff_time)
                approach_metric_values[i] = (approach_metric_values[i] +
                                             runtime)

    approach_metric_values = np.true_divide(approach_metric_values,
                                            num_counted_test_values)

    print('PAR10: {0:.10f}'.format(approach_metric_values[0]))

    return approach_metric_values
# Example 15
    def get_par10(self, scenario: ASlibScenario, fold: int):
        metrics = list()
        metrics.append(Par10Metric())

        test_scenario, train_scenario = scenario.get_split(indx=fold)

        approach_metric_values = np.zeros(len(metrics))

        num_counted_test_values = 0

        feature_data = train_scenario.feature_data.to_numpy()
        performance_data = train_scenario.performance_data.to_numpy()
        feature_cost_data = train_scenario.feature_cost_data.to_numpy(
        ) if train_scenario.feature_cost_data is not None else None

        for instance_id in range(0, len(train_scenario.instances)):
            X_test = feature_data[instance_id]
            y_test = performance_data[instance_id]

            accumulated_feature_time = 0
            if train_scenario.feature_cost_data is not None and self.get_name(
            ) != 'sbs' and self.get_name() != 'oracle':
                feature_time = feature_cost_data[instance_id]
                accumulated_feature_time = np.sum(feature_time)

            contains_non_censored_value = False
            for y_element in y_test:
                if y_element < train_scenario.algorithm_cutoff_time:
                    contains_non_censored_value = True
            if contains_non_censored_value:
                num_counted_test_values += 1
                predicted_scores = self.predict(X_test, instance_id, opt=True)
                for i, metric in enumerate(metrics):
                    runtime = metric.evaluate(y_test, predicted_scores,
                                              accumulated_feature_time,
                                              scenario.algorithm_cutoff_time)
                    approach_metric_values[i] = (approach_metric_values[i] +
                                                 runtime)

        approach_metric_values = np.true_divide(approach_metric_values,
                                                num_counted_test_values)

        return approach_metric_values
# Example 16
    def run_fold(self, config: Configuration, scenario:ASlibScenario, fold:int):
        '''
            run a given fold of cross validation
            
            Arguments
            ---------
            scenario: aslib_scenario.aslib_scenario.ASlibScenario
                aslib scenario at hand
            config: Configuration
                parameter configuration to use for preprocessing
            fold: int
                fold id
                
            Returns
            -------
            Stats()
                
        '''
        self.logger.info("CV-Iteration: %d" % (fold))
        
        test_scenario, training_scenario = scenario.get_split(indx=fold)

        feature_pre_pipeline, pre_solver, selector = self.fit(
            scenario=training_scenario, config=config)

        schedules = self.predict(
            test_scenario, config, feature_pre_pipeline, pre_solver, selector)

        val = Validator()
        if scenario.performance_type[0] == "runtime":
            stats = val.validate_runtime(
                schedules=schedules, test_scenario=test_scenario)
        elif scenario.performance_type[0] == "solution_quality":
            stats = val.validate_quality(
                schedules=schedules, test_scenario=test_scenario)
        else:
            raise ValueError("Unknown: %s" % (scenario.performance_type[0]))
        
        return stats
# Example 17
    def transform(self, scenario: ASlibScenario):
        '''
            transform ASLib scenario data

            Arguments
            ---------
            scenario: data.aslib_scenario.ASlibScenario
                ASlib Scenario with all data in pandas

            Returns
            -------
            data.aslib_scenario.ASlibScenario
        '''
        self.logger.debug("Impute Missing Feature Values")

        values = self.imputer.transform(np.array(scenario.feature_data.values))
        scenario.feature_data = pd.DataFrame(
            data=values,
            index=scenario.feature_data.index,
            columns=scenario.feature_data.columns)

        return scenario
# Example 18
    def transform(self, scenario: ASlibScenario):
        '''
            transform ASLib scenario data

            Arguments
            ---------
            scenario: data.aslib_scenario.ASlibScenario
                ASlib Scenario with all data in pandas

            Returns
            -------
            data.aslib_scenario.ASlibScenario
        '''
        if self.pca:
            self.logger.debug("Applying PCA")
            values = self.pca.transform(np.array(scenario.feature_data.values))

            scenario.feature_data = pd.DataFrame(
                data=values,
                index=scenario.feature_data.index,
                columns=["f%d" % (i) for i in range(values.shape[1])])

        return scenario
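
# Illustrative sketch (not part of the original examples): chaining the three
# feature preprocessing steps shown above (imputation, scaling, PCA). The
# step objects are assumed to be already-fitted instances of the respective
# transformer classes; the names below are hypothetical.
def apply_feature_preprocessing(scenario, steps):
    """Apply fitted preprocessing steps (e.g. imputer, scaler, PCA) in order."""
    for step in steps:
        scenario = step.transform(scenario)
    return scenario
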
# Example 19
    def run_cli(self):
        '''
            main method of AutoFolio based on command line interface
        '''

        cmd_parser = CMDParser()
        args_, self.overwrite_args = cmd_parser.parse()

        self._root_logger.setLevel(args_.verbose)

        if args_.load:
            self.read_model_and_predict(
                model_fn=args_.load, feature_vec=list(map(float, args_.feature_vec)))
        else:

            scenario = ASlibScenario()
            if args_.scenario:
                scenario.read_scenario(args_.scenario)
            elif args_.performance_csv and args_.feature_csv:
                scenario.read_from_csv(perf_fn=args_.performance_csv,
                                       feat_fn=args_.feature_csv,
                                       objective=args_.objective,
                                       runtime_cutoff=args_.runtime_cutoff,
                                       maximize=args_.maximize,
                                       cv_fn=args_.cv_csv)
            else:
                raise ValueError("Missing inputs to read scenario data.")

            self.cs = self.get_cs(scenario)

            if args_.tune:
                config = self.get_tuned_config(scenario)
            else:
                config = self.cs.get_default_configuration()
            self.logger.debug(config)

            if args_.save:
                feature_pre_pipeline, pre_solver, selector = self.fit(
                    scenario=scenario, config=config)
                self._save_model(
                    args_.save, scenario, feature_pre_pipeline, pre_solver, selector, config)
            else:
                self.run_cv(config=config, scenario=scenario, folds=scenario.cv_data.max().max())
# Example 20
    def run_cli(self):
        '''
            main method of AutoFolio based on command line interface
        '''

        cmd_parser = CMDParser()
        args_, self.overwrite_args = cmd_parser.parse()

        self._root_logger.setLevel(args_.verbose)

        if args_.load:
            pred = self.read_model_and_predict(
                model_fn=args_.load, feature_vec=list(map(float, args_.feature_vec.split(" "))))
            print("Selected Schedule [(algorithm, budget)]: %s" % (pred))

        else:

            scenario = ASlibScenario()
            if args_.scenario:
                scenario.read_scenario(args_.scenario)
            elif args_.performance_csv and args_.feature_csv:
                scenario.read_from_csv(perf_fn=args_.performance_csv,
                                       feat_fn=args_.feature_csv,
                                       objective=args_.objective,
                                       runtime_cutoff=args_.runtime_cutoff,
                                       maximize=args_.maximize,
                                       cv_fn=args_.cv_csv)
            else:
                raise ValueError("Missing inputs to read scenario data.")

            test_scenario = None
            if args_.performance_test_csv and args_.feature_test_csv:
                test_scenario = ASlibScenario()
                test_scenario.read_from_csv(perf_fn=args_.performance_test_csv,
                                       feat_fn=args_.feature_test_csv,
                                       objective=args_.objective,
                                       runtime_cutoff=args_.runtime_cutoff,
                                       maximize=args_.maximize,
                                       cv_fn=None)

            config = {}
            if args_.config is not None:
                self.logger.info("Reading yaml config file")
                config = yaml.safe_load(open(args_.config))
            if not config.get("wallclock_limit"):
                config["wallclock_limit"] = args_.wallclock_limit
            if not config.get("runcount_limit"):
                config["runcount_limit"] = args_.runcount_limit
            if not config.get("output-dir"):
                config["output-dir"] = args_.output_dir

            self.cs = self.get_cs(scenario, config)

            if args_.outer_cv:
                self._outer_cv(scenario, config, args_.outer_cv_fold, 
                    args_.out_template, smac_seed=args_.smac_seed)
                return 0
            
            if args_.tune:
                config = self.get_tuned_config(scenario,
                                               wallclock_limit=args_.wallclock_limit,
                                               runcount_limit=args_.runcount_limit,
                                               autofolio_config=config,
                                               seed=args_.smac_seed)
            else:
                config = self.cs.get_default_configuration()
            self.logger.debug(config)

            if args_.save:
                feature_pre_pipeline, pre_solver, selector = self.fit(
                    scenario=scenario, config=config)
                self._save_model(
                    args_.save, scenario, feature_pre_pipeline, pre_solver, selector, config)
            else:
                self.run_cv(config=config, scenario=scenario, folds=int(scenario.cv_data.max().max()))

            if test_scenario is not None:
                stats = self.run_fold(config=config,
                                      fold=0,
                                      return_fit=False,
                                      scenario=scenario,
                                      test_scenario=test_scenario)
# Example 21
    def run_fold(self, config: Configuration, scenario:ASlibScenario, fold:int, test_scenario=None, return_fit:bool=False):
        '''
            run a given fold of cross validation
            
            Arguments
            ---------
            scenario: aslib_scenario.aslib_scenario.ASlibScenario
                aslib scenario at hand
            config: Configuration
                parameter configuration to use for preprocessing
            fold: int
                fold id
            test_scenario:aslib_scenario.aslib_scenario.ASlibScenario
                aslib scenario with test data for validation
                generated from <scenario> if None

            return_fit: bool
                optionally, the learned preprocessing options, presolver and
                selector can be returned
                
            Returns
            -------
            Stats()

            (pre_pipeline, pre_solver, selector):
                only present if return_fit is True
                the pipeline components fit with the configuration options

            schedule: dict of string -> list of (solver, cutoff) pairs
                only present if return_fit is True
                the solver choices for each instance
                
                
        '''

        if test_scenario is None:
            self.logger.info("CV-Iteration: %d" % (fold))
            test_scenario, training_scenario = scenario.get_split(indx=fold)
        else:
            self.logger.info("Validation on test data")
            training_scenario = scenario

        feature_pre_pipeline, pre_solver, selector = self.fit(
            scenario=training_scenario, config=config)

        schedules = self.predict(
            test_scenario, config, feature_pre_pipeline, pre_solver, selector)

        val = Validator()
        if scenario.performance_type[0] == "runtime":
            stats = val.validate_runtime(
                schedules=schedules, test_scenario=test_scenario, train_scenario=training_scenario)
        elif scenario.performance_type[0] == "solution_quality":
            stats = val.validate_quality(
                schedules=schedules, test_scenario=test_scenario, train_scenario=training_scenario)
        else:
            raise ValueError("Unknown: %s" %(scenario.performance_type[0]))
        
        if return_fit:
            return stats, (feature_pre_pipeline, pre_solver, selector), schedules
        else:
            return stats
# Example 22
if __name__ == "__main__":

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--result_fn",
        help="Result json file with predictions for each test instances")
    parser.add_argument("--test_as",
                        help="Directory with *all* test data in ASlib format")
    parser.add_argument("--train_as",
                        help="Directory with *all* train data in ASlib format")

    args_ = parser.parse_args()

    start_time_fold = tm.time()
    #read scenarios
    test_scenario = ASlibScenario()
    test_scenario.read_scenario(dn=args_.test_as)
    train_scenario = ASlibScenario()
    train_scenario.read_scenario(dn=args_.train_as)

    # read result file
    with open(args_.result_fn) as fp:
        schedules = json.load(fp)

    validator = Validator()

    if test_scenario.performance_type[0] == "runtime":
        validator.validate_runtime(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
    else:
        validator.validate_quality(schedules=schedules,
                                   test_scenario=test_scenario,
                                   train_scenario=train_scenario)
# Example 23
    def _outer_cv(self, scenario: ASlibScenario, autofolio_config:dict=None, 
            outer_cv_fold:int=None, out_template:str=None,
            smac_seed:int=42):
        '''
            Evaluate on a scenario using an "outer" cross-fold validation
            scheme. In particular, this ensures that SMAC does not use the test
            set during hyperparameter optimization.

            Arguments
            ---------
            scenario: ASlibScenario
                ASlib Scenario at hand
            
            autofolio_config: dict, or None
                An optional dictionary of configuration options

            outer_cv_fold: int, or None
                If given, then only the single outer-cv fold is processed

            out_template: str, or None
                If given, the learned configurations are written to the 
                specified locations. The string is considered a template, and
                "%fold%" will be replaced with the fold.

            smac_seed:int 
                random seed for SMAC

            Returns
            -------
            stats: validate.Stats
                Performance over all outer-cv folds

        '''
        import string

        outer_stats = None

        # For each outer split
        outer_cv_folds = range(1, 11)
        if outer_cv_fold is not None:
            outer_cv_folds = range(outer_cv_fold, outer_cv_fold+1)

        for cv_fold in outer_cv_folds:
            
            # Use ASlibScenario.get_split() to get the outer split
            outer_testing, outer_training = scenario.get_split(cv_fold)
            
            msg = ">>>>> Outer CV fold: {} <<<<<".format(cv_fold)
            self.logger.info(msg)

            # Use ASlibScenario.create_cv_splits() to get an inner-cv
            outer_training.create_cv_splits(n_folds=10)
            
            # Use AutoFolio.get_tuned_config() to tune on inner-cv
            config = self.get_tuned_config(
                outer_training, 
                autofolio_config=autofolio_config,
                seed=smac_seed
            )
            
            # Use AutoFolio.run_fold() to get the performance on the outer split
            stats, fit, schedule = self.run_fold(
                config, 
                scenario, 
                cv_fold, 
                return_fit=True
            )

            feature_pre_pipeline, pre_solver, selector = fit

            if outer_stats is None:
                outer_stats = stats
            else:
                outer_stats.merge(stats)

            # save the model, if given an output location
            if out_template is not None:
                out_template_ = string.Template(out_template)
                model_fn = out_template_.substitute(fold=cv_fold, type="pkl")
                
                msg = "Writing model to: {}".format(model_fn)
                self.logger.info(msg)

                self._save_model(
                    model_fn, 
                    scenario, 
                    feature_pre_pipeline, 
                    pre_solver, 
                    selector, 
                    config
                )

                # convert the schedule to a data frame
                schedule_df = pd.Series(schedule, name="solver")
                schedule_df.index.name = "instance"
                schedule_df = schedule_df.reset_index()

                # just keep the solver name; we don't care about the time

                # x[0] gets the first pair in the schedule list
                # and x[0][0] gets the name of the solver from that pair
                schedule_df['solver'] = schedule_df['solver'].apply(lambda x: x[0][0])

                selections_fn = out_template_.substitute(fold=cv_fold, type="csv")

                msg = "Writing solver choices to: {}".format(selections_fn)
                self.logger.info(msg)

                schedule_df.to_csv(selections_fn, index=False)

        self.logger.info(">>>>> Final Stats <<<<<")
        outer_stats.show()
# Example 24
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Validate the algorithm selection performance of the "
        "predictions made using test-as-auto-sklearn using "
        "autofolio.validation.validate.Validator.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('predictions',
                        help="The predictions file, from "
                        "test-as-auto-sklearn")

    parser.add_argument('--config',
                        help="A (yaml) config file which "
                        "specifies options controlling the learner behavior")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}
        config['allowed_feature_groups'] = list(scenario.feature_group_dict.keys())

    # either way, update the scenario with the features used during training
    scenario.used_feature_groups = config['allowed_feature_groups']

    msg = "Reading predictions"
    logger.info(msg)
    predictions = pd.read_csv(args.predictions)

    msg = "Selecting the algorithm with smallest prediction for each instance"
    logger.info(msg)

    algorithm_selections = pandas_utils.get_group_extreme(
        predictions, "predicted", ex_type="min", group_fields="instance_id")

    msg = "Creating the schedules for the validator"
    logger.info(msg)

    schedules = parallel.apply_df_simple(algorithm_selections, _get_schedule,
                                         scenario.algorithm_cutoff_time)

    schedules = utils.merge_dicts(*schedules)

    val = Validator()
    performance_type = scenario.performance_type[0]

    if performance_type == "runtime":
        stats = val.validate_runtime(schedules=schedules,
                                     test_scenario=scenario)

    elif performance_type == "solution_quality":
        stats = val.validate_quality(schedules=schedules,
                                     test_scenario=scenario)

    else:
        msg = "Unknown performance type: {}".format(performance_type)
        raise ValueError(msg)

    msg = "=== RESULTS ==="
    logger.info(msg)
    stats.show()
# Example 25
class AFCsvFacade(object):

    def __init__(self,
                 perf_fn:str,
                 feat_fn:str,
                 objective:str = "solution_quality",
                 runtime_cutoff:float = None,
                 maximize:bool = True,
                 cv_fn:str = None,
                 seed: int = 12345
                 ):
        """ Constructor """
        self.scenario = ASlibScenario()
        self.scenario.read_from_csv(perf_fn=perf_fn,
                                               feat_fn=feat_fn,
                                               objective=objective,
                                               runtime_cutoff=runtime_cutoff,
                                               maximize=maximize,
                                               cv_fn=cv_fn)
        self.seed = seed

        self.af = AutoFolio(random_seed=seed)
        self.logger = logging.getLogger("AF Facade")

    def fit(self,
            config:Configuration=None,
            save_fn:str = None):
        """ Train AutoFolio on data from init"""
        self.logger.info("Fit")
        if config is None:
            cs = self.af.get_cs(self.scenario, {})
            config = cs.get_default_configuration()
        feature_pre_pipeline, pre_solver, selector = self.af.fit(scenario=self.scenario, config=config)

        if save_fn:
            self.af._save_model(save_fn, self.scenario, feature_pre_pipeline, pre_solver, selector, config)
            self.logger.info("AutoFolio model saved to %s" % (save_fn))

    def tune(self,
             wallclock_limit:int = 1200,
             runcount_limit:int = np.inf,
             ):

        config = self.af.get_tuned_config(self.scenario,
                                       wallclock_limit=wallclock_limit,
                                       runcount_limit=runcount_limit,
                                       autofolio_config={},
                                       seed=self.seed)
        self.logger.info("Optimized Configuration: %s" %(config))
        return config

    def cross_validation(self, config:Configuration):
        """ run a cross validation on given AutoFolio configuration"""
        score = -1 * self.af.run_cv(config=config, scenario=self.scenario, folds=int(self.scenario.cv_data.max().max()))
        self.logger.info("AF's final performance %f" %(score))

        return score

    @staticmethod
    def load_and_predict(vec: np.ndarray,
                         load_fn:str):
        """ get predicted algorithm for given meta-feature vector"""
        af = AutoFolio(random_seed=42) # random seed doesn't matter here
        pred = af.read_model_and_predict(model_fn=load_fn, feature_vec=vec)
        print("Selected Schedule [(algorithm, budget)]: %s" % (pred))
        return pred[0][0]
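
# Illustrative sketch (not part of the original examples): end-to-end use of
# `AFCsvFacade` as defined above. The CSV paths, cutoff and model file name
# are hypothetical.
import numpy as np

af_facade = AFCsvFacade(perf_fn="performance.csv",
                        feat_fn="features.csv",
                        objective="runtime",
                        runtime_cutoff=3600,
                        maximize=False)
config = af_facade.tune(wallclock_limit=300)        # optional SMAC tuning
score = af_facade.cross_validation(config=config)   # cross-validated performance
af_facade.fit(config=config, save_fn="af_model.pkl")
pred = AFCsvFacade.load_and_predict(vec=np.array([3.0, 7.5, 1.2]),
                                    load_fn="af_model.pkl")
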
def evaluate_train_test_split(scenario: ASlibScenario, approach, metrics,
                              fold: int, amount_of_training_instances: int,
                              train_status: str):
    test_scenario, train_scenario = scenario.get_split(indx=fold)

    if train_status != 'all':
        train_scenario = copy.deepcopy(train_scenario)
        threshold = train_scenario.algorithm_cutoff_time
        if train_status == 'clip_censored':
            train_scenario.performance_data = train_scenario.performance_data.clip(
                upper=threshold)

        elif train_status == 'ignore_censored':
            train_scenario.performance_data = train_scenario.performance_data.replace(
                10 * threshold, np.nan)

    if approach.get_name() == 'oracle' or approach.get_name(
    ) == 'virtual_sbs_with_feature_costs':
        approach.fit(test_scenario, fold, amount_of_training_instances)
    else:
        approach.fit(train_scenario, fold, amount_of_training_instances)

    approach_metric_values = np.zeros(len(metrics))

    num_counted_test_values = 0

    feature_data = test_scenario.feature_data.to_numpy()
    performance_data = test_scenario.performance_data.to_numpy()
    feature_cost_data = test_scenario.feature_cost_data.to_numpy(
    ) if test_scenario.feature_cost_data is not None else None

    instancewise_result_strings = list()
    simple_runtime_metric = RuntimeMetric()

    for instance_id in range(0, len(test_scenario.instances)):

        X_test = feature_data[instance_id]
        y_test = performance_data[instance_id]

        # compute feature time
        accumulated_feature_time = 0
        if test_scenario.feature_cost_data is not None and approach.get_name(
        ) != 'sbs' and approach.get_name() != 'oracle':
            feature_time = feature_cost_data[instance_id]
            accumulated_feature_time = np.sum(feature_time)

        #compute the values of the different metrics
        predicted_scores = approach.predict(X_test, instance_id)
        num_counted_test_values += 1
        for i, metric in enumerate(metrics):
            runtime = metric.evaluate(y_test, predicted_scores,
                                      accumulated_feature_time,
                                      scenario.algorithm_cutoff_time)
            approach_metric_values[i] = (approach_metric_values[i] + runtime)

        # store runtimes on a per instance basis in ASLib format
        runtime = simple_runtime_metric.evaluate(
            y_test, predicted_scores, accumulated_feature_time,
            scenario.algorithm_cutoff_time)
        run_status_to_print = "ok" if runtime < scenario.algorithm_cutoff_time else "timeout"
        line_to_store = test_scenario.instances[
            instance_id] + ",1," + approach.get_name() + "," + str(
                runtime) + "," + run_status_to_print
        instancewise_result_strings.append(line_to_store)

    write_instance_wise_results_to_file(instancewise_result_strings,
                                        scenario.scenario)

    approach_metric_values = np.true_divide(approach_metric_values,
                                            num_counted_test_values)

    for i, metric in enumerate(metrics):
        print(metrics[i].get_name() +
              ': {0:.10f}'.format(approach_metric_values[i]))

    return approach_metric_values
# Example 27
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Test models learned with train-as-auto-sklearn. It "
        "writes the predictions to disk as a \"long\" data frame. The output "
        "file is in gzipped csv format.")
    
    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('model_template', help="A template string for the filenames for "
        "the learned models. ${solver} and ${fold} are the template part of "
        "the string. It is probably necessary to surround this argument with "
        "single quotes in order to prevent shell replacement of the template "
        "parts.")

    parser.add_argument('out', help="The output csv file")

    parser.add_argument('--config', help="A (yaml) config file which "
        "specifies options controlling the learner behavior")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Loading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    if args.config is not None:
        msg = "Loading yaml config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}

    msg = "Creating string templates"
    logger.info(msg)
    model_template = string.Template(args.model_template)

    msg = "Finding folds from ASlib scenario"
    logger.info(msg)
        
    folds = [int(i) for i in scenario.cv_data['fold'].unique()]
    folds = sorted(folds)

    msg = "Making predictions"
    logger.info(msg)

    all_predictions = []
    it = itertools.product(scenario.algorithms, folds)
    for solver, fold in it:
        
        model_file = model_template.substitute(solver=solver, fold=fold)
        
        if not os.path.exists(model_file):
            msg = "Could not find model file. Skipping: {}".format(model_file)
            logger.warning(msg)
            continue
        
        try:
            model = joblib.load(model_file)
        except Exception:
            msg = ("Problem loading the model file. Skipping: {}".format(
                model_file))
            logger.warning(msg)
            continue
            
        msg = "Processing. solver: {}. fold: {}".format(solver, fold)
        logger.info(msg)
        
        testing, training = scenario.get_split(fold)
        y_pred = model.predict(testing.feature_data)

        if 'log_performance_data' in config:
            # exp transform it back out
            y_pred = np.expm1(y_pred)
            
        pred_df = pd.DataFrame()
        pred_df['instance_id'] = testing.feature_data.index
        pred_df['solver'] = solver
        pred_df['fold'] = fold
        pred_df['actual'] = testing.performance_data[solver].values
        pred_df['predicted'] = y_pred
        
        all_predictions.append(pred_df)
        
    msg = "Joining all predictions in a long data frame"
    logger.info(msg)
    all_predictions = pd.concat(all_predictions)

    msg = "Writing predictions to disk"
    logger.info(msg)

    utils.write_df(all_predictions, args.out, index=False)
# Example 28
    def main(self,
             train_scenario_dn: str,
             test_scenario_dn: str = None):
        '''
            main method

            Arguments
            ---------
            train_scenario_dn: str
                directory name with ASlib scenario training data
            test_scenario_dn: str
                directory name with ASlib scenario test data
                (performance data is missing)
        '''

        # Read scenario files
        scenario = ASlibScenario()
        scenario.read_scenario(dn=train_scenario_dn)

        # fit on training data
        self.fit(scenario=scenario)

        # Read test files
        # ASlibScenario is not designed to read partial scenarios,
        # therefore we have to cheat a bit
        scenario = ASlibScenario()
        scenario.read_description(fn=os.path.join(test_scenario_dn, "description.txt"))
        scenario.read_feature_values(fn=os.path.join(test_scenario_dn, "feature_values.arff"))
        scenario.read_feature_runstatus(fn=os.path.join(test_scenario_dn, "feature_runstatus.arff"))

        # predict on test data
        self.predict(scenario=scenario)
# Example 29
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions ,etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template part of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned", 
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned", 
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int, 
        default=default_num_cpus)
    
    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "processes is spawned with this flag and otherwise the same "
        "arguments. This flag is not inended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solver is present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-auto-sklear]: the solver is not present in the "
                "ASlib scenario. given: {}. choices: {}".format(solver, 
                solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        #
        # There is a lot going on with settings these environment variables.
        # please see the following references:
        #
        #   Turning off assertions so we can parallelize sklearn across
        #   multiple CPUs for different solvers/folds
        #       https://github.com/celery/celery/issues/1709
        #
        #   Controlling OpenBLAS threads
        #       https://github.com/automl/auto-sklearn/issues/166
        #
        #   Other environment variables controlling thread usage
        #       http://stackoverflow.com/questions/30791550
        #
        ###
        
        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)
        
        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
    def fit(self, scenario: ASlibScenario, fold: int,
            amount_of_training_instances: int):

        # setup the ensemble
        self.create_base_learner()
        self.scenario_name = scenario.scenario
        self.fold = fold
        self.num_algorithms = len(scenario.algorithms)

        num_instances = len(scenario.instances)
        feature_data = scenario.feature_data.to_numpy()
        performance_data = scenario.performance_data.to_numpy()

        # new features in matrix [instances x predictions]
        if self.new_feature_type == 'full':
            new_feature_data = np.zeros(
                (num_instances, self.num_algorithms * len(self.base_learners)))

        elif self.new_feature_type == 'small':
            new_feature_data = np.zeros(
                (num_instances, len(self.base_learners)))

        # if predictions are precomputed
        if self.pre_computed:
            for base_learner in self.base_learners:
                self.predictions.append(
                    load_pickle(filename='predictions/' +
                                base_learner.get_name() + '_' +
                                scenario.scenario + '_' + str(fold)))

        # create new features for every base learner on each instance
        for learner_index, base_learner in enumerate(self.base_learners):

            # load pre computed predictions
            if self.pre_computed:
                if self.cross_validation:
                    predictions = load_pickle(
                        filename='predictions/cross_validation_' +
                        base_learner.get_name() + '_' + scenario.scenario +
                        '_' + str(fold))
                else:
                    predictions = load_pickle(
                        filename='predictions/full_trainingdata_' +
                        base_learner.get_name() + '_' + scenario.scenario +
                        '_' + str(fold))

            # create predictions, if they are not pre computed
            else:

                # if cross validation is used (h2o)
                if self.cross_validation:
                    instance_counter = 0
                    # initialise the prediction matrix that is filled across sub-folds
                    predictions = np.zeros(
                        (num_instances, self.num_algorithms))

                    for sub_fold in range(1, 11):
                        test_scenario, training_scenario = split_scenario(
                            scenario, sub_fold, num_instances)

                        # train base learner
                        base_learner.fit(training_scenario, fold,
                                         amount_of_training_instances)

                        # create new feature data
                        for instance_number in range(
                                instance_counter, instance_counter +
                                len(test_scenario.instances)):
                            prediction = base_learner.predict(
                                feature_data[instance_number], instance_number)
                            predictions[instance_number] = prediction.flatten()

                        instance_counter = instance_counter + len(
                            test_scenario.instances)

                    # fit base learner on the original training data
                    self.create_base_learner()
                    for base_learner in self.base_learners:
                        base_learner.fit(scenario, fold,
                                         amount_of_training_instances)

                # if no cross validation is used
                else:
                    base_learner.fit(scenario, fold,
                                     amount_of_training_instances)

                    predictions = np.zeros(
                        (len(scenario.instances), self.num_algorithms))

                    for instance_id, instance_feature in enumerate(
                            feature_data):
                        predictions[instance_id] = base_learner.predict(
                            instance_feature, instance_id)

            # insert predictions to new feature data matrix
            for i in range(num_instances):
                if self.new_feature_type == 'full':
                    for algo_num in range(self.num_algorithms):
                        new_feature_data[i][
                            algo_num + self.num_algorithms *
                            learner_index] = predictions[i][algo_num]

                elif self.new_feature_type == 'small':
                    new_feature_data[i][learner_index] = np.argmin(
                        predictions[i])

        # add predictions to the features of the instances
        if self.new_feature_type == 'full':
            new_columns = np.arange(self.num_algorithms *
                                    len(self.base_learners))

        elif self.new_feature_type == 'small':
            new_columns = np.arange(len(self.base_learners))

        new_feature_data = pd.DataFrame(new_feature_data,
                                        index=scenario.feature_data.index,
                                        columns=new_columns)

        if self.meta_learner_input == 'full':
            new_feature_data = pd.concat(
                [scenario.feature_data, new_feature_data], axis=1, sort=False)

        elif self.meta_learner_input == 'predictions_only':
            pass

        else:
            sys.exit('Wrong meta learner input type option')

        scenario.feature_data = new_feature_data

        # meta learner selection
        if self.meta_learner_type == 'per_algorithm_regressor':
            self.meta_learner = PerAlgorithmRegressor(
                feature_importances=self.feature_importance)
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'SUNNY':
            self.meta_learner = SUNNY()
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'ISAC':
            self.meta_learner = ISAC()
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'SATzilla-11':
            self.meta_learner = SATzilla11()
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'multiclass':
            self.meta_learner = MultiClassAlgorithmSelector(
                feature_importance=self.feature_importance)
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'Expectation':
            self.meta_learner = SurrogateSurvivalForest(
                criterion='Expectation')
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'PAR10':
            self.meta_learner = SurrogateSurvivalForest(criterion='PAR10')
            self.algorithm_selection_algorithm = True
        elif self.meta_learner_type == 'RandomForest':
            self.meta_learner = RandomForestClassifier(random_state=fold)
        elif self.meta_learner_type == 'SVM':
            self.meta_learner = LinearSVC(random_state=fold, max_iter=10000)

        # feature selection
        if self.feature_selection == 'variance_threshold':
            self.feature_selector = VarianceThreshold(threshold=.8 * (1 - .8))
            self.feature_selector.fit(scenario.feature_data)
            scenario.feature_data = pd.DataFrame(
                data=self.feature_selector.transform(scenario.feature_data))
        elif self.feature_selection == 'select_k_best':
            self.feature_selector = SelectKBest(f_classif,
                                                k=self.num_algorithms)
            label_performance_data = [np.argmin(x) for x in performance_data]
            self.imputer = SimpleImputer()
            scenario.feature_data = self.imputer.fit_transform(
                scenario.feature_data)
            self.feature_selector.fit(scenario.feature_data,
                                      label_performance_data)
            scenario.feature_data = pd.DataFrame(
                data=self.feature_selector.transform(scenario.feature_data))

        # fit meta learner
        if self.algorithm_selection_algorithm:
            self.meta_learner.fit(scenario, fold, amount_of_training_instances)
        else:
            label_performance_data = [np.argmin(x) for x in performance_data]

            self.pipe = Pipeline([('imputer', SimpleImputer()),
                                  ('standard_scaler', StandardScaler())])
            x_train = self.pipe.fit_transform(scenario.feature_data.to_numpy(),
                                              label_performance_data)

            self.meta_learner.fit(x_train, label_performance_data)