Example #1
    def execute(save_folder,
                runhistory_location,
                configspace_location,
                modus='ablation',
                seed=1):
        with open(runhistory_location, 'r') as runhistory_filep:
            runhistory = json.load(runhistory_filep)

        # create scenario file
        scenario_dict = {
            'run_obj': 'quality',
            'deterministic': 1,
            'paramfile': configspace_location
        }

        trajectory_lines = openmlpimp.utils.runhistory_to_trajectory(
            runhistory, maximize=True)
        if len(trajectory_lines) != 1:
            raise ValueError('trajectory file should contain exactly one line.')

        traj_file = tempfile.NamedTemporaryFile('w', delete=False)
        for line in trajectory_lines:
            json.dump(line, traj_file)
            traj_file.write("\n")
        traj_file.close()

        num_params = len(trajectory_lines[0]['incumbent'])
        importance = Importance(scenario_dict,
                                runhistory_file=runhistory_location,
                                parameters_to_evaluate=num_params,
                                traj_file=traj_file.name,
                                seed=seed,
                                save_folder=save_folder)

        try:
            os.makedirs(save_folder)
        except FileExistsError:
            pass

        last_error = None
        for _ in range(5):
            try:
                result = importance.evaluate_scenario(modus)
                filename = 'pimp_values_%s.json' % modus
                with open(os.path.join(save_folder, filename),
                          'w') as out_file:
                    json.dump(result,
                              out_file,
                              sort_keys=True,
                              indent=4,
                              separators=(',', ': '))
                importance.plot_results(name=os.path.join(save_folder, modus),
                                        show=False)
                return os.path.join(save_folder, filename)
            except ZeroDivisionError as e:
                # the evaluation can fail sporadically; retry up to 5 times
                last_error = e
        raise last_error
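
A minimal driver for the helper above might look as follows. This is a hedged sketch: the file names are placeholders, and it assumes `execute` is in scope along with the module-level imports it relies on (json, os, tempfile, openmlpimp and pimp's Importance).

# Hypothetical driver for execute(); 'runhistory.json' and 'configspace.pcs'
# are placeholder paths for an existing runhistory and PCS file.
result_file = execute(save_folder='pimp_results',
                      runhistory_location='runhistory.json',
                      configspace_location='configspace.pcs',
                      modus='ablation',
                      seed=1)
print('importance values written to %s' % result_file)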
Example #2
class PIMP:
    def __init__(self,
                 scenario: Scenario,
                 smac: Union[SMAC, None] = None,
                 mode: str = 'all',
                 X: Union[None, List[list], np.ndarray] = None,
                 y: Union[None, List[list], np.ndarray] = None,
                 numParams: int = -1,
                 impute: bool = False,
                 seed: int = 12345,
                 run: bool = False,
                 max_sample_size: int = -1,
                 fanova_cut_at_default: bool = False,
                 fANOVA_pairwise: bool = True,
                 forwardsel_feat_imp: bool = False,
                 incn_quant_var: bool = True,
                 marginalize_away_instances: bool = False,
                 save_folder: str = 'PIMP'):
        """
        Interface to be used with SMAC or with X and y matrices.
        :param scenario: The scenario object, that knows the configuration space.
        :param smac: The smac object that keeps all the run-data
        :param mode: The mode with which to run PIMP [ablation, fanova, all, forward-selection]
        :param X: Numpy Array that contains parameter arrays
        :param y: Numpy array that contains the corresponding performance values
        :param numParams: The number of parameters to evaluate
        :param impute: Flag to decide if censored data gets imputed or not
        :param seed: The random seed
        :param run: Flag to immediately compute the importance values after this setup or not
        :param max_sample_size: Maximum number of samples to use (-1 to use all)
        :param fanova_cut_at_default: Flag to cut fANOVA results off at the default performance
        :param fANOVA_pairwise: Flag to also compute pairwise marginals with fANOVA
        :param forwardsel_feat_imp: Flag to use forward selection for feature importance
        :param incn_quant_var: Flag to quantify the variance for incumbent-neighborhood (lpi) plots
        :param marginalize_away_instances: Flag to marginalize instances away in a preprocessing step
        :param save_folder: Folder in which results are stored
        """
        self.scenario = scenario
        self.imp = None
        self.mode = mode
        self.save_folder = save_folder
        os.makedirs(self.save_folder, exist_ok=True)
        if smac is not None:
            self.imp = Importance(scenario=scenario,
                                  runhistory=smac.runhistory,
                                  incumbent=smac.solver.incumbent,
                                  seed=seed,
                                  parameters_to_evaluate=numParams,
                                  save_folder=self.save_folder,
                                  impute_censored=impute,
                                  max_sample_size=max_sample_size,
                                  fANOVA_cut_at_default=fanova_cut_at_default,
                                  fANOVA_pairwise=fANOVA_pairwise,
                                  forwardsel_feat_imp=forwardsel_feat_imp,
                                  incn_quant_var=incn_quant_var,
                                  preprocess=marginalize_away_instances)
        elif X is not None and y is not None:
            X = np.array(X)
            y = np.array(y)
            runHist = RunHistory(average_cost)
            if X.shape[0] != y.shape[0]:
                raise Exception("Number of samples in X and y don't match!")
            n_params = len(scenario.cs.get_hyperparameters())
            feats = None
            if X.shape[1] > n_params:
                feats = X[:, n_params:]
                assert feats.shape[1] == scenario.feature_array.shape[1]
                X = X[:, :n_params]

            for p in range(X.shape[1]):  # Normalize the data to fit into [0, 1]
                _min, _max = np.min(X[:, p]), np.max(X[:, p])
                # skip columns that are already normalized or constant
                # (a constant column would cause a division by zero)
                if (_min < 0. or 1 < _max) and _max > _min:
                    for row, v in enumerate(X[:, p]):
                        X[row, p] = (v - _min) / (_max - _min)

            # Add everything to a runhistory such that PIMP can work with it
            for x, feat, y_val in zip(X, feats if feats is not None else X, y):
                inst_id = None
                for inst in scenario.feature_dict:  # determine on which instance a configuration was run
                    if np.all(scenario.feature_dict[inst] == feat):
                        inst_id = inst
                        break
                runHist.add(Configuration(scenario.cs, vector=x), y_val, 0, StatusType.SUCCESS, inst_id)
            self.X = X
            self.y = y

            best_ = None  # Determine incumbent according to the best mean cost in the runhistory
            for config in runHist.config_ids:
                inst_seed_pairs = runHist.get_runs_for_config(config)
                all_ = []
                for inst, seed in inst_seed_pairs:
                    rk = RunKey(runHist.config_ids[config], inst, seed)
                    all_.append(runHist.data[rk].cost)
                mean = np.mean(all_)
                if best_ is None or best_[0] > mean:
                    best_ = (mean, config)
            incumbent = best_[1]
            self.imp = Importance(scenario=scenario,
                                  runhistory=runHist,
                                  seed=seed,
                                  parameters_to_evaluate=numParams,
                                  save_folder=self.save_folder,
                                  impute_censored=impute,
                                  incumbent=incumbent,
                                  fANOVA_cut_at_default=fanova_cut_at_default,
                                  fANOVA_pairwise=fANOVA_pairwise,
                                  forwardsel_feat_imp=forwardsel_feat_imp,
                                  incn_quant_var=incn_quant_var,
                                  preprocess=marginalize_away_instances
                                  )
        else:
            raise Exception('Neither X and y matrices nor a SMAC object was specified to compute the importance '
                            'values from!')

        if run:
            self.compute_importances()

    def compute_importances(self):
        if self.mode == 'all':
            self.mode = ['ablation',
                         'forward-selection',
                         'fanova',
                         'incneighbor']
        elif not isinstance(self.mode, list):
            self.mode = [self.mode]
        result = self.imp.evaluate_scenario(self.mode, save_folder=self.save_folder)
        return result

    def plot_results(self, result: Union[List[Dict[str, float]], Dict[str, float]], save_table: bool = True,
                     show=False):
        save_folder = self.save_folder
        # Note: compute_importances() rewrites mode 'all' into the list of all
        # evaluator names, so the string comparison below only matches if
        # evaluate_scenario was invoked directly with mode 'all'.
        mode_name = '_'.join(self.mode) if isinstance(self.mode, list) else self.mode
        if self.mode == 'all':
            with open(os.path.join(save_folder, 'pimp_values_%s.json' % self.mode), 'w') as out_file:
                json.dump(result[0], out_file, sort_keys=True, indent=4, separators=(',', ': '))
            self.imp.plot_results(list(map(lambda x: os.path.join(save_folder, x.name.lower()), result[1])),
                                  result[1], show=show)
            if save_table:
                self.imp.table_for_comparison(evaluators=result[1], name=os.path.join(
                    save_folder, 'pimp_table_%s.tex' % self.mode), style='latex')
            else:
                self.imp.table_for_comparison(evaluators=result[1], style='cmd')
        else:
            with open(os.path.join(save_folder, 'pimp_values_%s.json' % mode_name), 'w') as out_file:
                json.dump(result[0], out_file, sort_keys=True, indent=4, separators=(',', ': '))
            if isinstance(self.mode, list):
                self.imp.plot_results(name=os.path.join(save_folder, 'all'), show=show)
            else:
                self.imp.plot_results(name=os.path.join(save_folder, self.mode), show=show)
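
As a rough illustration of the X/y code path above, here is a minimal sketch. It assumes 'scenario' is a pre-built SMAC Scenario whose configuration space holds only numerical hyperparameters, and that the usual imports (numpy as np, the PIMP class) are in scope; all data below is made up.

# Hypothetical usage of PIMP with plain X/y matrices.
n_params = len(scenario.cs.get_hyperparameters())
X = np.random.rand(100, n_params)  # one row per evaluated configuration, already in [0, 1]
y = np.random.rand(100)            # corresponding performance values

pimp = PIMP(scenario=scenario, X=X, y=y, mode='fanova', save_folder='PIMP_demo')
result = pimp.compute_importances()
pimp.plot_results(result)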
Example #3
def cmd_line_call():
    """
    Main Parameter importance script.
    """
    cmd_reader = CMDs()
    args, misc_ = cmd_reader.read_cmd()  # read cmd args
    cwd = os.path.abspath(os.getcwd())
    if args.out_folder and not os.path.isabs(args.out_folder):
        args.out_folder = os.path.abspath(args.out_folder)
    if args.trajectory and not os.path.isabs(args.trajectory):
        args.trajectory = os.path.abspath(args.trajectory)
    if not os.path.isabs(args.scenario_file):
        args.scenario_file = os.path.abspath(args.scenario_file)
    if not os.path.isabs(args.history):
        args.history = os.path.abspath(args.history)
    os.chdir(args.wdir)
    logging.basicConfig(level=args.verbose_level)
    ts = time.time()
    ts = datetime.datetime.fromtimestamp(ts).strftime('%Y_%m_%d_%H:%M:%S')
    fanova_ready = True

    try:
        import fanova
    except ImportError:
        warnings.simplefilter('always', ImportWarning)
        warnings.warn('fANOVA is not installed in your environment. To install it please run '
                      '"pip install git+http://github.com/automl/fanova.git@master"')
        fanova_ready = False

    if 'influence-model' in args.modus:
        logging.warning('influence-model not fully supported yet!')
    if 'incneighbor' in args.modus:
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn('incneighbor will be deprecated in version 1.0.0 as it was the development name of'
                      ' lpi. Use lpi instead.', DeprecationWarning, stacklevel=2)
    if 'lpi' in args.modus:  # LPI will replace incneighbor in the future
        args.modus[args.modus.index('lpi')] = 'incneighbor'
    if 'fanova' in args.modus and not fanova_ready:
        raise ImportError('fANOVA is not installed! To install it please run '
                          '"pip install git+http://github.com/automl/fanova.git@master"')
    if 'all' in args.modus:
        choices = ['ablation',
                   'forward-selection',
                   'fanova',
                   'incneighbor']
        if not fanova_ready:
            raise ImportError('fANOVA is not installed! To install it please run '
                              '"pip install git+http://github.com/automl/fanova.git@master"')
        del args.modus[args.modus.index('all')]
        if len(args.modus) != len(choices):
            args.modus = choices
    if len(args.modus) > 1:
        tmp = ['all']
    else:
        tmp = args.modus
        if 'incneighbor' in args.modus:
            tmp = ['lpi']
    if not args.out_folder:
        save_folder = os.path.join(cwd, 'PIMP_%s' % '_'.join(tmp))
        if os.path.exists(os.path.abspath(save_folder)):
            save_folder = os.path.join(cwd, 'PIMP_%s_%s' % ('_'.join(tmp), ts))
    else:
        if os.path.exists(os.path.abspath(args.out_folder)) or os.path.exists(os.path.abspath(
                        args.out_folder + '_%s' % '_'.join(tmp))):
            save_folder = os.path.join(cwd, args.out_folder + '_%s_%s' % ('_'.join(tmp), ts))
        else:
            save_folder = os.path.join(cwd, args.out_folder + '_%s' % '_'.join(tmp))

    importance = Importance(scenario_file=args.scenario_file,
                            runhistory_file=args.history,
                            parameters_to_evaluate=args.num_params,
                            traj_file=args.trajectory, seed=args.seed,
                            save_folder=save_folder,
                            impute_censored=args.impute,
                            max_sample_size=args.max_sample_size,
                            fANOVA_cut_at_default=args.fanova_cut_at_default,
                            fANOVA_pairwise=args.fanova_pairwise,
                            forwardsel_feat_imp=args.forwardsel_feat_imp,
                            incn_quant_var=args.incn_quant_var,
                            preprocess=args.marg_inst,
                            forwardsel_cv=args.forwardsel_cv)  # create importance object
    os.makedirs(save_folder, exist_ok=True)  # ensure the output folder exists before writing into it
    with open(os.path.join(save_folder, 'pimp_args.json'), 'w') as out_file:
        json.dump(args.__dict__, out_file, sort_keys=True, indent=4, separators=(',', ': '))
    result = importance.evaluate_scenario(args.modus, save_folder=save_folder)
    if args.table:
        importance.table_for_comparison(evaluators=result[1], name=os.path.join(
            save_folder, 'pimp_table_%s.tex' % '_'.join(args.modus)), style='latex')
    else:
        importance.table_for_comparison(evaluators=result[1], style='cmd')
    os.chdir(cwd)
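
The save-folder naming above is easier to follow in isolation. The sketch below distills it into a hypothetical helper (the function name is ours, not a PIMP API, and the out_folder branch is slightly simplified): multi-mode runs collapse to 'all', 'incneighbor' is reported as 'lpi', and a timestamp is appended when the target folder already exists.

import os

def derive_save_folder(modus, out_folder, cwd, ts):
    # Distilled from cmd_line_call() above, for illustration only.
    if len(modus) > 1:
        tmp = ['all']
    elif 'incneighbor' in modus:
        tmp = ['lpi']
    else:
        tmp = modus
    suffix = '_'.join(tmp)
    base = out_folder + '_%s' % suffix if out_folder else 'PIMP_%s' % suffix
    if os.path.exists(os.path.abspath(os.path.join(cwd, base))):
        base += '_%s' % ts
    return os.path.join(cwd, base)

# First run: derive_save_folder(['fanova', 'ablation'], None, os.getcwd(), ts) -> '<cwd>/PIMP_all'
# Re-run:    the folder exists, so '<cwd>/PIMP_all_<timestamp>' is used instead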
Example #4
    importance = Importance(
        scenario_file=args.scenario_file,
        runhistory_file=args.history,
        parameters_to_evaluate=args.num_params,
        traj_file=args.trajectory,
        seed=args.seed,
        save_folder=save_folder,
        impute_censored=args.impute)  # create importance object
    save_folder += '_run1'
    os.makedirs(save_folder, exist_ok=True)
    with open(os.path.join(save_folder, 'pimp_args.json'), 'w') as out_file:
        json.dump(args.__dict__,
                  out_file,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
    result = importance.evaluate_scenario(args.modus, save_folder=save_folder)

    if args.modus == 'all':
        with open(
                os.path.join(save_folder, 'pimp_values_%s.json' % args.modus),
                'w') as out_file:
            json.dump(result[0],
                      out_file,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))
        importance.plot_results(list(
            map(lambda x: os.path.join(save_folder, x.name.lower()),
                result[1])),
                                result[1],
                                show=False)
    importance = Importance(
        args.scenario_file,
        args.history,
        parameters_to_evaluate=args.num_params,
        traj_file=args.trajectory,
        seed=args.seed,
        save_folder=save_folder,
        impute_censored=args.impute)  # create importance object
    save_folder += '_run1'
    os.makedirs(save_folder, exist_ok=True)
    with open(os.path.join(save_folder, 'pimp_args.json'), 'w') as out_file:
        json.dump(args.__dict__,
                  out_file,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
    result = importance.evaluate_scenario(args.modus, sort_by=args.order)

    if args.modus == 'all':
        with open(
                os.path.join(save_folder, 'pimp_values_%s.json' % args.modus),
                'w') as out_file:
            json.dump(result[0],
                      out_file,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))
        importance.plot_results(
            list(
                map(lambda x: os.path.join(save_folder, x.name.lower()),
                    result[1])), result[1])
        if args.table:
            importance.table_for_comparison(
                evaluators=result[1],
                name=os.path.join(save_folder,
                                  'pimp_table_%s.tex' % args.modus),
                style='latex')
        else:
            importance.table_for_comparison(evaluators=result[1], style='cmd')
Example #6
class Analyzer(object):
    """
    This class serves as an interface to all the individual analyzing and
    plotting components. The plotter object is responsible for the actual
    plotting of things, but should not be invoked via the facade (which is
    constructed for cmdline-usage).
    """
    def __init__(self,
                 original_rh,
                 validated_rh,
                 default,
                 incumbent,
                 train_test,
                 scenario,
                 validator,
                 output,
                 max_pimp_samples,
                 fanova_pairwise=True):
        """
        Parameters
        ----------
        original_rh: RunHistory
            runhistory containing all runs that have actually been run
        validated_rh: RunHistory
            runhistory containing all runs from original_rh + estimates for
            default and all incumbents for all instances
        default, incumbent: Configuration
            default and overall incumbent
        train_test: bool
            whether a distinction between train- and test-instances is made
            (in cdf- and scatter-plots)
        scenario: Scenario
            the scenario object
        validator: Validator
            validator object (to estimate using EPM)
        output: string
            output-directory
        """
        self.logger = logging.getLogger("cave.analyzer")

        # Important objects for analysis
        self.original_rh = original_rh
        self.validated_rh = validated_rh
        self.default = default
        self.incumbent = incumbent
        self.train_test = train_test
        self.scenario = scenario
        self.validator = validator
        self.pimp = None  # PIMP object for reuse
        self.feat_analysis = None  # feat_analysis object for reuse
        self.evaluators = []
        self.output = output

        self.importance = None  # Used to store dictionary containing parameter
        # importances, so it can be used by analysis
        self.feat_importance = None  # Used to store dictionary w feat_imp

        conf1_runs = get_cost_dict_for_config(self.validated_rh, self.default)
        conf2_runs = get_cost_dict_for_config(self.validated_rh,
                                              self.incumbent)
        self.plotter = Plotter(self.scenario,
                               self.train_test,
                               conf1_runs,
                               conf2_runs,
                               output=self.output)
        self.max_pimp_samples = max_pimp_samples
        self.fanova_pairwise = fanova_pairwise

    def get_timeouts(self, config):
        """ Get number of timeouts in config per runs in total (not per
        instance)

        Parameters
        ----------
        config: Configuration
            configuration from which to calculate the timeouts

        Returns
        -------
        timeouts: tuple(int, int)
            tuple (timeouts, total runs)
        """
        cutoff = self.scenario.cutoff
        timeouts = get_timeout(self.validated_rh, config, cutoff)
        if self.train_test:
            if not cutoff:
                return (("N", "A"), ("N", "A"))
            train_timeout = len([
                i for i in timeouts
                if (timeouts[i] is False and i in self.scenario.train_insts)
            ])
            test_timeout = len([
                i for i in timeouts
                if (timeouts[i] is False and i in self.scenario.test_insts)
            ])
            return ((train_timeout, len(self.scenario.train_insts)),
                    (test_timeout, len(self.scenario.test_insts)))
        else:
            if not cutoff:
                return ("N", "A")
            timeout = len([i for i in timeouts if timeouts[i] is False])
            no_timeout = len([i for i in timeouts if timeouts[i] is True])
            # return (timeouts, total runs) as documented, mirroring the
            # train/test branch above
            return (timeout, timeout + no_timeout)

    def get_parX(self, config, par=10):
        """Calculate parX-values of default and incumbent configs.
        First determine PAR-timeouts for each run on each instances,
        Second average over train/test if available, else just average.

        Parameters
        ----------
        config: Configuration
            config to be calculated
        par: int
            par-factor to use

        Returns
        -------
        (train, test) OR average -- tuple<float, float> OR float
            PAR10 values for train- and test-instances, if available as tuple
            else the general average
        """
        runs = get_cost_dict_for_config(self.validated_rh, config)
        # Penalize
        if self.scenario.cutoff:
            runs = [(k, runs[k]) if runs[k] < self.scenario.cutoff else
                    (k, self.scenario.cutoff * par) for k in runs]
        else:
            runs = [(k, runs[k]) for k in runs]
            self.logger.info("No cutoff specified; calculating plain average "
                             "runtime without penalization...")

        # Average
        if self.train_test:
            train = np.mean(
                [c for i, c in runs if i in self.scenario.train_insts])
            test = np.mean(
                [c for i, c in runs if i in self.scenario.test_insts])
            return (train, test)
        else:
            return np.mean([c for i, c in runs])

####################################### TABLES #######################################

    def create_overview_table(self, best_folder):
        """ Create overview-table.

        Parameters
        ----------
        best_folder: str
            path to folder/run with best incumbent

        Returns
        -------
        table: str
            overview table in HTML
        """
        overview = OrderedDict([
            ('Run with best incumbent', best_folder),
            ('# Train instances', len(self.scenario.train_insts)),
            ('# Test instances', len(self.scenario.test_insts)),
            ('# Parameters', len(self.scenario.cs.get_hyperparameters())),
            ('Cutoff', self.scenario.cutoff),
            ('Walltime budget', self.scenario.wallclock_limit),
            ('Runcount budget', self.scenario.ta_run_limit),
            ('CPU budget', self.scenario.algo_runs_timelimit),
            ('Deterministic', self.scenario.deterministic),
        ])
        # Split into two columns
        overview_split = self._split_table(overview)
        # Convert to HTML
        df = DataFrame(data=overview_split)
        table = df.to_html(escape=False,
                           header=False,
                           index=False,
                           justify='left')
        return table

    def create_performance_table(self, default, incumbent):
        """Create table, compare default against incumbent on train-,
        test- and combined instances. Listing PAR10, PAR1 and timeouts.
        Distinguishes between train and test, if available."""
        self.logger.info("... create performance table")
        def_timeout, inc_timeout = self.get_timeouts(
            default), self.get_timeouts(incumbent)
        def_par10, inc_par10 = self.get_parX(default,
                                             10), self.get_parX(incumbent, 10)
        def_par1, inc_par1 = self.get_parX(default,
                                           1), self.get_parX(incumbent, 1)
        dec_place = 3
        if self.train_test:
            # Distinction between train and test
            # Create table
            array = np.array([[
                round(def_par10[0], dec_place),
                round(def_par10[1], dec_place),
                round(inc_par10[0], dec_place),
                round(inc_par10[1], dec_place)
            ],
                              [
                                  round(def_par1[0], dec_place),
                                  round(def_par1[1], dec_place),
                                  round(inc_par1[0], dec_place),
                                  round(inc_par1[1], dec_place)
                              ],
                              [
                                  "{}/{}".format(def_timeout[0][0],
                                                 def_timeout[0][1]),
                                  "{}/{}".format(def_timeout[1][0],
                                                 def_timeout[1][1]),
                                  "{}/{}".format(inc_timeout[0][0],
                                                 inc_timeout[0][1]),
                                  "{}/{}".format(inc_timeout[1][0],
                                                 inc_timeout[1][1])
                              ]])
            df = DataFrame(data=array,
                           index=['PAR10', 'PAR1', 'Timeouts'],
                           columns=['Train', 'Test', 'Train', 'Test'])
            table = df.to_html()
            # Insert two-column-header
            table = table.split(sep='</thead>', maxsplit=1)[1]
            new_table = "<table border=\"3\" class=\"dataframe\">\n"\
                        "  <col>\n"\
                        "  <colgroup span=\"2\"></colgroup>\n"\
                        "  <colgroup span=\"2\"></colgroup>\n"\
                        "  <thead>\n"\
                        "    <tr>\n"\
                        "      <td rowspan=\"2\"></td>\n"\
                        "      <th colspan=\"2\" scope=\"colgroup\">Default</th>\n"\
                        "      <th colspan=\"2\" scope=\"colgroup\">Incumbent</th>\n"\
                        "    </tr>\n"\
                        "    <tr>\n"\
                        "      <th scope=\"col\">Train</th>\n"\
                        "      <th scope=\"col\">Test</th>\n"\
                        "      <th scope=\"col\">Train</th>\n"\
                        "      <th scope=\"col\">Test</th>\n"\
                        "    </tr>\n"\
                        "</thead>\n"
            table = new_table + table
        else:
            # No distinction between train and test
            array = np.array(
                [[round(def_par10, dec_place),
                  round(inc_par10, dec_place)],
                 [round(def_par1, dec_place),
                  round(inc_par1, dec_place)],
                 [
                     "{}/{}".format(def_timeout[0], def_timeout[1]),
                     "{}/{}".format(inc_timeout[0], inc_timeout[1])
                 ]])
            df = DataFrame(data=array,
                           index=['PAR10', 'PAR1', 'Timeouts'],
                           columns=['Default', 'Incumbent'])
            table = df.to_html()
        self.performance_table = table
        return table

    def config_to_html(self, default: Configuration, incumbent: Configuration):
        """Create HTML-table to compare Configurations.
        Removes unused parameters.

        Parameters
        ----------
        default, incumbent: Configurations
            configurations to be converted

        Returns
        -------
        table: str
            HTML-table comparing default and incumbent
        """
        # Remove parameters that are inactive (None) in both configurations;
        # compare against None explicitly so that values like 0 or False
        # are not dropped as "unused"
        keys = [k for k in default.keys()
                if default[k] is not None or incumbent[k] is not None]
        default = [
            default[k] if default[k] is not None else "inactive" for k in keys
        ]
        incumbent = [
            incumbent[k] if incumbent[k] is not None else "inactive" for k in keys
        ]
        table = list(zip(keys, default, incumbent))
        # Show first parameters that changed
        same = [x for x in table if x[1] == x[2]]
        diff = [x for x in table if x[1] != x[2]]
        table = []
        if len(diff) > 0:
            table.extend([("-------------- Changed parameters: "\
                           "--------------", "-----", "-----")])
            table.extend(diff)
        if len(same) > 0:
            table.extend([("-------------- Unchanged parameters: "\
                           "--------------", "-----", "-----")])
            table.extend(same)
        keys, table = [k[0] for k in table], [k[1:] for k in table]
        df = DataFrame(data=table,
                       columns=["Default", "Incumbent"],
                       index=keys)
        table = df.to_html()
        return table

    def _split_table(self, table: OrderedDict):
        """Splits an OrderedDict into a list of tuples that can be turned into a
        HTML-table with pandas DataFrame

        Parameters
        ----------
        table: OrderedDict
            table that is to be split into two columns

        Returns
        -------
        table_split: List[tuple(key, value, key, value)]
            list with two key-value pairs per entry that can be used by pandas
            df.to_html()
        """
        table_split = []
        keys = list(table.keys())
        half_size = len(keys) // 2
        for i in range(half_size):
            j = i + half_size
            table_split.append(("<b>" + keys[i] + "</b>", table[keys[i]],
                                "<b>" + keys[j] + "</b>", table[keys[j]]))
        if len(keys) % 2 == 1:
            table_split.append(
                ("<b>" + keys[-1] + "</b>", table[keys[-1]], '', ''))
        return table_split

####################################### PARAMETER IMPORTANCE #######################################

    def fanova(self,
               incumbent,
               num_params=10,
               num_pairs=0,
               marginal_threshold=0.05):
        """Wrapper for parameter_importance to save the importance-object/
        extract the results. We want to show the top X most important
        parameter-fanova-plots.

        Parameters
        ----------
        incumbent: Configuration
            incumbent configuration
        num_params: int
            how many of the top important parameters should be shown
        num_pairs: int  (NOT WORKING)
            for how many parameters pairwise marginals are plotted
            n parameters -> n^2 plots
        marginal_threshold: float
            parameter/s must be at least this important to be mentioned

        Returns
        -------
        fanova_table: str
            html table with importances for all parameters
        plots: Dict[str, str]
            dictionary mapping single parameters to their plots
        """
        self.parameter_importance("fanova",
                                  incumbent,
                                  self.output,
                                  num_params,
                                  num_pairs=num_pairs)
        parameter_imp = self.pimp.evaluator.evaluated_parameter_importance
        # Split single and pairwise (pairwise are string: "['p1','p2']")
        pairwise_imp = {
            k: v
            for k, v in parameter_imp.items() if k.startswith("[")
        }
        for k in pairwise_imp.keys():
            parameter_imp.pop(k)

        # Set internal parameter importance for further analysis (such as
        #   parallel coordinates)
        self.logger.debug("Fanova importance: %s", str(parameter_imp))
        self.importance = parameter_imp

        # Dicts to lists of tuples, sorted descending by importance, keeping
        #   only marginals above marginal_threshold
        parameter_imp = [(k, v) for k, v in sorted(
            parameter_imp.items(), key=operator.itemgetter(1), reverse=True)
                         if v > marginal_threshold]
        pairwise_imp = [(k, v) for k, v in sorted(
            pairwise_imp.items(), key=operator.itemgetter(1), reverse=True)
                        if v > marginal_threshold]
        # Create table
        table = []
        if len(parameter_imp) > 0:
            table.extend([(20 * "-" + " Single importance: " + 20 * "-",
                           20 * "-")])
            table.extend(parameter_imp)
        if len(pairwise_imp) > 0:
            table.extend([(20 * "-" + " Pairwise importance: " + 20 * "-",
                           20 * "-")])
            # TODO assuming (current) form of "['param1','param2']", but not
            #       expecting it stays this way (on PIMPs side)
            table.extend([(' & '.join(
                [tmp.strip('\' ') for tmp in k.strip('[]').split(',')]), v)
                          for k, v in pairwise_imp])

        keys, fanova_table = [k[0] for k in table], [k[1:] for k in table]
        df = DataFrame(data=fanova_table, index=keys)
        fanova_table = df.to_html(escape=False,
                                  header=False,
                                  index=True,
                                  justify='left')

        single_plots = {}
        for p, v in parameter_imp:
            single_plots[p] = os.path.join(self.output, "fanova", p + '.png')
        # Check for pairwise plots
        # Right now no way to access paths of the plots -> file issue
        pairwise_plots = {}
        for p, v in pairwise_imp:
            p_new = p.replace('\'', '')
            potential_path = os.path.join(self.output, 'fanova',
                                          p_new + '.png')
            self.logger.debug("Check for %s", potential_path)
            if os.path.exists(potential_path):
                pairwise_plots[p] = potential_path
        return fanova_table, single_plots, pairwise_plots

    def local_epm_plots(self):
        plots = OrderedDict([])
        if self.importance:
            self.parameter_importance("incneighbor",
                                      self.incumbent,
                                      self.output,
                                      num_params=3)
            for p, i in [(k, v) for k, v in sorted(self.importance.items(),
                                                   key=operator.itemgetter(1),
                                                   reverse=True) if v > 0.05]:
                plots[p] = os.path.join(self.output, 'incneighbor', p + '.png')

        else:
            self.logger.warning("Need to run fANOVA before incneighbor!")
            raise ValueError("Need to run fANOVA before incneighbor!")
        return plots

    def parameter_importance(self,
                             modus,
                             incumbent,
                             output,
                             num_params=4,
                             num_pairs=0):
        """Calculate parameter-importance using the PIMP-package.
        Currently ablation, forward-selection and fanova are used.

        Parameters
        ----------
        modus: str
            modus for parameter importance, from [forward-selection, ablation,
            fanova, incneighbor]

        Returns
        -------
        importance: pimp.Importance
            importance object with evaluated data
        """
        self.logger.info("... parameter importance {}".format(modus))
        # Evaluate parameter importance
        save_folder = output
        if not self.pimp:
            self.pimp = Importance(scenario=copy.deepcopy(self.scenario),
                                   runhistory=self.original_rh,
                                   incumbent=incumbent,
                                   parameters_to_evaluate=num_params,
                                   save_folder=save_folder,
                                   seed=12345,
                                   max_sample_size=self.max_pimp_samples,
                                   fANOVA_pairwise=self.fanova_pairwise,
                                   preprocess=False)
        result = self.pimp.evaluate_scenario([modus], save_folder)
        self.evaluators.append(self.pimp.evaluator)
        return self.pimp

####################################### FEATURE IMPORTANCE #######################################

    def feature_importance(self):
        self.logger.info("... plotting feature importance")
        forward_selector = FeatureForwardSelector(self.scenario,
                                                  self.original_rh)
        imp = forward_selector.run()
        self.logger.debug("FEAT IMP %s", imp)
        self.feat_importance = imp
        plots = forward_selector.plot_result(
            os.path.join(self.output, 'feature_plots/importance'))
        return (imp, plots)

####################################### PLOTS #######################################

    def plot_parallel_coordinates(self, n_param=10, n_configs=500):
        """ Creates a parallel coordinates plot visualizing the explored
        parameter configuration space. """
        self.logger.info("... plotting parallel coordinates")
        # If a parameter importance has been performed in this analyzer-object,
        # only plot the n_param most important parameters.
        if self.importance:
            n_param = min(
                n_param,
                max(3, len([x for x in self.importance.values() if x > 0.05])))
            params = list(self.importance.keys())[:n_param]
        else:
            # TODO what if no parameter importance has been performed?
            # plot all? random subset? -> atm: random
            self.logger.info(
                "No parameter importance performed. Plotting random "
                "parameters in parallel coordinates plot.")
            params = list(self.default.keys())[:n_param]

        self.logger.info(
            "    plotting %s parameters for (max) %s configurations",
            len(params), n_configs)
        rh = self.original_rh if self.plotter.vizrh is None else self.plotter.vizrh
        path = self.plotter.plot_parallel_coordinates(rh, self.output, params,
                                                      n_configs,
                                                      self.validator)

        return path

    def plot_cdf(self):
        self.logger.info("... plotting eCDF")
        cdf_path = os.path.join(self.output, 'cdf')
        return self.plotter.plot_cdf_compare(output_fn_base=cdf_path)

    def plot_scatter(self):
        self.logger.info("... plotting scatter")
        scatter_path = os.path.join(self.output, 'scatter')
        return self.plotter.plot_scatter(output_fn_base=scatter_path)

    @timing
    def plot_confviz(self, incumbents, runhistories, max_confs=1000):
        """ Plot the visualization of configurations, highlightning the
        incumbents. Using original rh, so the explored configspace can be
        estimated.

        Parameters
        ----------
        incumbents: List[Configuration]
            list with incumbents, so they can be marked in plot
        runhistories: List[RunHistory]
            list of runhistories, so they can be marked in plot
        max_confs: int
            maximum number of data-points to plot

        Returns
        -------
        confviz: str
            script to generate the interactive html
        """
        self.logger.info("... visualizing explored configspace")
        confviz = self.plotter.visualize_configs(self.scenario,
                                                 runhistories=runhistories,
                                                 incumbents=incumbents,
                                                 max_confs_plot=max_confs)

        return confviz

    @timing
    def plot_cost_over_time(self, traj, validator):
        path = os.path.join(self.output, 'cost_over_time.png')
        self.logger.info("... cost over time:")
        self.logger.info("    plotting!")
        self.plotter.plot_cost_over_time(self.validated_rh,
                                         traj,
                                         output=path,
                                         validator=validator)
        return path

    @timing
    def plot_algorithm_footprint(self,
                                 algorithms=None,
                                 density=200,
                                 purity=0.95):
        if not algorithms:
            algorithms = {self.default: "default", self.incumbent: "incumbent"}
        self.logger.info("... algorithm footprints:")
        self.logger.info("    for: {}".format(algorithms.values()))
        footprint = AlgorithmFootprint(self.validated_rh,
                                       self.scenario.feature_dict, algorithms,
                                       self.scenario.cutoff, self.output)
        # Calculate footprints
        #for i in range(100):
        #    for a in algorithms:
        #        footprint.footprint(a, 20, 0.95)

        # Plot footprints
        plots = footprint.plot_points_per_cluster()
        return plots

####################################### FEATURE ANALYSIS #######################################

    def feature_analysis(self, mode, feat_names):
        """Use asapy's feature analysis.

        Parameters
        ----------
        mode: str
            from [box_violin, correlation, clustering]

        Returns
        -------
        Corresponding plot paths
        """
        self.logger.info("... feature analysis: %s", mode)
        self.feat_analysis = FeatureAnalysis(
            output_dn=self.output,
            scenario=self.scenario,
            feat_names=feat_names,
            feat_importance=self.feat_importance)

        if mode == 'box_violin':
            return self.feat_analysis.get_box_violin_plots()

        if mode == 'correlation':
            self.feat_analysis.correlation_plot()
            return self.feat_analysis.correlation_plot(imp=False)

        if mode == 'clustering':
            return self.feat_analysis.cluster_instances()
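
The PAR-X logic in get_parX above is compact enough to demonstrate standalone. A minimal sketch with made-up numbers: runtimes at or above the cutoff are replaced by cutoff * par before averaging.

import numpy as np

runs = {'inst1': 2.0, 'inst2': 4.5, 'inst3': 300.0}  # inst3 hit the cutoff
cutoff, par = 300.0, 10

# Penalize timeouts, then average -- the same two steps get_parX performs.
penalized = [cost if cost < cutoff else cutoff * par for cost in runs.values()]
print(np.mean(penalized))  # PAR10 = (2.0 + 4.5 + 3000.0) / 3 ~ 1002.17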
Example #7
def cmd_line_call():
    """
    Main Parameter importance script.
    """
    cmd_reader = CMDs()
    args, misc_ = cmd_reader.read_cmd()  # read cmd args
    logging.basicConfig(level=args.verbose_level)
    ts = time.time()
    ts = datetime.datetime.fromtimestamp(ts).strftime('%Y_%m_%d_%H:%M:%S')
    if not args.out_folder:
        save_folder = 'PIMP_%s_%s' % (args.modus, ts)
    else:
        if os.path.exists(os.path.abspath(args.out_folder)) or os.path.exists(
                os.path.abspath(args.out_folder + '_%s' % args.modus)):
            save_folder = args.out_folder + '_%s_%s' % (args.modus, ts)
        else:
            save_folder = args.out_folder + '_%s' % args.modus

    importance = Importance(
        scenario_file=args.scenario_file,
        runhistory_file=args.history,
        parameters_to_evaluate=args.num_params,
        traj_file=args.trajectory,
        seed=args.seed,
        save_folder=save_folder,
        impute_censored=args.impute,
        max_sample_size=args.max_sample_size)  # create importance object
    os.makedirs(save_folder, exist_ok=True)  # ensure the output folder exists before writing into it
    with open(os.path.join(save_folder, 'pimp_args.json'), 'w') as out_file:
        json.dump(args.__dict__,
                  out_file,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
    result = importance.evaluate_scenario(args.modus, sort_by=args.order)

    if args.modus == 'all':
        with open(
                os.path.join(save_folder, 'pimp_values_%s.json' % args.modus),
                'w') as out_file:
            json.dump(result[0],
                      out_file,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))
        importance.plot_results(list(
            map(lambda x: os.path.join(save_folder, x.name.lower()),
                result[1])),
                                result[1],
                                show=False)
        if args.table:
            importance.table_for_comparison(
                evaluators=result[1],
                name=os.path.join(save_folder,
                                  'pimp_table_%s.tex' % args.modus),
                style='latex')
        else:
            importance.table_for_comparison(evaluators=result[1], style='cmd')
    else:
        with open(
                os.path.join(save_folder, 'pimp_values_%s.json' % args.modus),
                'w') as out_file:
            json.dump(result,
                      out_file,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))

        importance.plot_results(name=os.path.join(save_folder, args.modus),
                                show=False)
import datetime
import json
import logging
import os
import time

from pimp.importance.importance import Importance
from pimp.utils.io.cmd_reader import CMDs

__author__ = "Andre Biedenkapp"
__copyright__ = "Copyright 2016, ML4AAD"
__license__ = "3-clause BSD"
__maintainer__ = "Andre Biedenkapp"
__email__ = "*****@*****.**"

if __name__ == '__main__':
    """
    Main Parameter importance script.
    """
    cmd_reader = CMDs()
    args, misc_ = cmd_reader.read_cmd()  # read cmd args
    logging.basicConfig(level=args.verbose_level)
    importance = Importance(args.scenario_file,
                            args.history,
                            parameters_to_evaluate=args.num_params,
                            traj_file=args.trajectory,
                            seed=args.seed)  # create importance object
    importance_value_dict = importance.evaluate_scenario(args.modus)

    ts = time.time()
    ts = datetime.datetime.fromtimestamp(ts).strftime('%Y_%m_%d_%H:%M:%S')
    with open('pimp_values_%s_%s.json' % (args.modus, ts), 'w') as out_file:
        json.dump(importance_value_dict, out_file)

    importance.plot_results(name=args.modus)
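
The JSON files written above can be consumed downstream. A hedged sketch for reading them back, assuming the dump holds a mapping from evaluator name to a {parameter: importance} dict (adjust if your PIMP version nests the result differently):

import json

with open('pimp_values_ablation.json') as fh:  # placeholder file name
    values = json.load(fh)

for evaluator, importances in values.items():
    ranked = sorted(importances.items(), key=lambda kv: kv[1], reverse=True)
    print(evaluator, ranked[:5])  # five most important parameters per evaluator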