Example #1
    def __init__(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        params: Union[int, List[str]],
        n_configs: int,
        pc_sort_by: str,
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
        max_runs_epm: int = 3000000,
    ):
        """This function prepares the data from a SMAC-related
        format (using runhistories and parameters) to a more general format
        (using a dataframe). The resulting dataframe is passed to the
        parallel_coordinates-routine

        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, also external runs.
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent, they will surely be displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter-names to importance
        params: Union[int, List[str]]
            either the parameters to be displayed directly, or the number of parameters
            (in which case the most important ones will be chosen)
        n_configs: int
            number of configs to be plotted
        pc_sort_by: str
            defines the pimp-method by which to choose the plotted parameters
        max_runs_epm: int
            maximum number of runs to train the epm with. this should prevent MemoryErrors
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: bool
            if True, costs are runtimes and are plotted on a log scale
        """

        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
        self.error = None

        self.default = default
        self.param_imp = param_imp
        self.cs = cs

        # Sorting by importance, if possible (choose first executed parameter-importance)
        self.method, self.importance = "", {}
        if pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            self.method = 'average'
            for m, i in self.param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in self.importance:
                            self.importance[p].append(imp)
                        else:
                            self.importance[p] = [imp]
            self.importance = {
                k: sum(v) / len(v)
                for k, v in self.importance.items()
            }
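            # Illustrative example (values assumed): with
            # param_imp = {'fanova': {'x': 0.4}, 'ablation': {'x': 0.2, 'y': 0.1}},
            # the averaged importance becomes {'x': 0.3, 'y': 0.1}.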
        elif pc_sort_by in self.param_imp:
            self.method, self.importance = pc_sort_by, self.param_imp[
                pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              pc_sort_by, str(list(self.param_imp.keys())))
            for m, i in self.param_imp.items():
                if i:
                    self.method, self.importance = m, i
                    break

        self.hp_names = sorted(self.cs.get_hyperparameter_names(),
                               key=lambda x: self.importance.get(x, 0),
                               reverse=True)
        self.logger.debug("Sorted hyperparameters by method '%s': %s", self.method,
                          str(self.hp_names))

        # To be set
        self.plots = []

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        # max_runs_epm caps the total number of runs used to train the epm (see docstring)
        max_configs = int(
            max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
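        # Worked example (illustrative numbers): max_runs_epm=300,000 with
        # 100 train and 100 test instances caps the selection at 1,500 configurations.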
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and choosing "
                "the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, max_runs_epm)
            all_configs = sorted(
                all_configs,
                key=lambda c: len(original_rh.get_runs_for_config(c)),
                reverse=True)[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory(average_cost)
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        self.config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

        self.params = self.get_params(params)
        self.n_configs = n_configs

        self.pcp = ParallelCoordinatesPlotter(self.config_to_cost, output_dir,
                                              cs, runtime)
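
A minimal usage sketch for Example #1. The enclosing class name (ParallelCoordinates below) is an assumption, since the class definition is not shown, and the SMAC objects (original_rh, validated_rh, validator, scenario, incumbent) are taken to be already constructed:

analyzer = ParallelCoordinates(          # hypothetical class name for the __init__ above
    original_rh=original_rh,             # runs collected during the search
    validated_rh=validated_rh,           # runs used to build/train the EPM
    validator=validator,
    scenario=scenario,
    default=scenario.cs.get_default_configuration(),
    incumbent=incumbent,
    param_imp={},                        # no importance results yet -> plain sorting
    params=10,                           # plot the 10 highest-ranked parameters
    n_configs=100,
    pc_sort_by='all',
    output_dir='./output',
    cs=scenario.cs,
    runtime=(scenario.run_obj == 'runtime'),
)
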
Example #2
    def _preprocess_budget(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
    ):
        """
        Preprocess data and save in self.data to enable fast replots

        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, also external runs.
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent, they will surely be displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter-names to importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: bool
            if True, costs are runtimes and are plotted on a log scale
        """
        # Sorting parameters by importance, if possible (choose first executed parameter-importance)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        hp_names = sorted(cs.get_hyperparameter_names(),
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hyperparameters by method '%s': %s",
                          method, str(hp_names))

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        # max_runs_epm is the maximum total number of runs considered for epm to limit maximum possible number configs
        max_configs = int(
            self.max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                "choosing the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, self.max_runs_epm)
            all_configs = sorted(all_configs,
                                 key=lambda c: len(
                                     original_rh.get_runs_for_config(
                                         c, only_max_observed_budget=False)),
                                 reverse=True)
            all_configs = all_configs[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory()
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        config_to_cost = OrderedDict(
            (c, epm_rh.get_cost(c)) for c in all_configs)

        data = OrderedDict()
        data['cost'] = list(config_to_cost.values())
        for hp in self.runscontainer.scenario.cs.get_hyperparameter_names():
            data[hp] = np.array([
                c[hp]  # if hp in c.get_dictionary() and not isinstance(c[hp], str) else np.nan
                for c in config_to_cost.keys()
            ])
        df = pd.DataFrame(data=data)
        return df
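
The dataframe returned by Example #2 has one row per configuration, a 'cost' column, and one column per hyperparameter. A small illustrative helper (not part of the original code) that trims such a dataframe to the n cheapest configurations before replotting might look like this:

import pandas as pd

def select_cheapest_configs(df: pd.DataFrame, n_configs: int) -> pd.DataFrame:
    """Keep only the n configurations with the lowest estimated cost (illustrative helper)."""
    # 'cost' is the column written by _preprocess_budget above; nsmallest keeps all columns.
    return df.nsmallest(n_configs, 'cost')
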
Example #3
    def _plot_parallel_coordinates(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
    ):
        """
        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs that were executed during search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, also external runs.
            this runhistory will be used to build the EPM
        validator: Validator
            validator to be used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent, they will surely be displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter-names to importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: bool
            if True, costs are runtimes and are plotted on a log scale
        """
        # Sorting parameters by importance, if possible (choose first executed parameter-importance)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        hp_names = sorted(cs.get_hyperparameter_names(),
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hyperparameters by method '%s': %s", method,
                          str(hp_names))

        # To be set
        self.plots = []

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        max_runs_epm = self.max_runs_epm  # Maximum total number of runs considered for epm to limit maximum possible number configs
        max_configs = int(
            max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and choosing "
                "the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, max_runs_epm)
            all_configs = sorted(
                all_configs,
                key=lambda c: len(original_rh.get_runs_for_config(c)),
                reverse=True)[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory(average_cost)
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

        pcp = ParallelCoordinatesPlotter(config_to_cost, output_dir, cs,
                                         runtime)

        try:
            plots = [
                pcp.plot_n_configs(
                    self.n_configs,
                    self.get_params(self.params, importance, hp_names))
            ]
            self.logger.debug("Paths to plot(s): %s", str(plots))
            return {'figure': plots}
        except ValueError as err:
            self.logger.debug("Error: %s", str(err))
            return {'else': str(err)}
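
Example #3 reports its result as a dictionary: the key 'figure' holds the plot path(s) on success, while 'else' carries the error message if plot_n_configs raises a ValueError. A hedged sketch of how calling code might consume that dictionary (the analyzer object and the prepared arguments are assumed to exist):

result = analyzer._plot_parallel_coordinates(original_rh, validated_rh, validator,
                                             scenario, default, incumbent, param_imp,
                                             output_dir, cs,
                                             runtime=(scenario.run_obj == 'runtime'))
if 'figure' in result:
    print("Parallel-coordinates plot(s) written to:", result['figure'])
else:
    print("Plotting skipped:", result['else'])
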
Example #4
    def plot_parallel_coordinates(self,
                                  original_rh,
                                  validated_rh,
                                  validator,
                                  n_param=10,
                                  n_configs=500,
                                  max_runs_epm=300000):
        """ Plot parallel coordinates (visualize higher dimensions), here used
        to visualize pcs. This function prepares the data from a SMAC-related
        format (using runhistories and parameters) to a more general format
        (using a dataframe). The resulting dataframe is passed to the
        parallel_coordinates-routine.

        NOTE: the given runhistory should contain only optimization and no
        validation to analyze the explored parameter-space.

        Parameters
        ----------
        original_rh: RunHistory
            rundata to take configs from (no validation data - we want to
            visualize optimization process)
        validated_rh: RunHistory
            rundata to estimate costs of configs from (can contain validation
            data but no empirical estimations, since it's used to train an epm)
        validator: Validator
            to calculate alpha values
        n_param: int
            number of parameters to be plotted
        n_configs: int
            max # configs to be plotted
        max_runs_epm: int
            maximum number of total runs that should be predicted using the epm. the higher this value,
            the better the predictions tend to be, but very high values are likely to lead to MemoryErrors

        Returns
        -------
        output: str
            path to plot
        """
        self.logger.info("... plotting parallel coordinates")
        # If a parameter importance has been performed in this analyzer-object,
        # only plot the n_param most important parameters.
        if self.param_imp:
            # Use the first applied parameter importance analysis to choose
            method, importance = list(self.param_imp.items())[0]
            self.logger.debug(
                "Choosing visualized parameters in parallel coordinates "
                "according to parameter importance method %s", method)
            n_param = min(
                n_param,
                max(3, len([x for x in importance.values() if x > 0.05])))
            # Some importance methods add "--source--" or similar to the parameter names -> filter them in next line
            params = [
                p for p in importance.keys()
                if p in self.scenario.cs.get_hyperparameter_names()
            ][:n_param]
        else:
            self.logger.info(
                "No parameter importance performed. Plotting random parameters in parallel coordinates."
            )
            params = list(self.default.keys())[:n_param]

        self.logger.info(
            "    plotting %s parameters for (max) %s configurations",
            len(params), n_configs)

        # Reduce to feasible number of configurations
        all_configs = original_rh.get_all_configs()
        max_configs = int(
            max_runs_epm /
            (len(self.scenario.train_insts) + len(self.scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and choosing "
                "the ones with the most runs", len(all_configs), max_configs,
                max_runs_epm)
            all_configs = sorted(
                all_configs,
                key=lambda c: len(original_rh.get_runs_for_config(c)),
                reverse=True)[:max_configs]
            if self.default not in all_configs:
                all_configs = [self.default] + all_configs
            if self.incumbent not in all_configs:
                all_configs.append(self.incumbent)

        if self.scenario.feature_dict:
            epm_rh = timing(validator.validate_epm)(all_configs,
                                                    'train+test',
                                                    1,
                                                    runhistory=validated_rh)
            epm_rh.update(validated_rh)
        else:
            epm_rh = validated_rh
        pcp = ParallelCoordinatesPlotter(
            original_rh,
            epm_rh,
            self.output_dir,
            self.scenario.cs,
            runtime=(self.scenario.run_obj == 'runtime'))
        output = pcp.plot_n_configs(n_configs, params)
        return output
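
A brief usage sketch for Example #4, assuming an analyzer instance that exposes this method along with already-loaded runhistories and a validator; the keyword values are illustrative:

output_path = analyzer.plot_parallel_coordinates(original_rh,
                                                 validated_rh,
                                                 validator,
                                                 n_param=10,
                                                 n_configs=500,
                                                 max_runs_epm=300000)
print("Parallel-coordinates plot saved to:", output_path)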