Example #1
    def _plot_parallel_coordinates(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
    ):
        """
        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs executed during the search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, including external runs;
            this runhistory is used to build the EPM
        validator: Validator
            validator used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent configurations; both are always displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter names to their importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: bool
            if True, runtime is plotted on a log scale
        """
        # Sort parameters by importance, if possible ('all' averages over evaluated methods; otherwise fall back to the first evaluated parameter-importance)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        hp_names = sorted(cs.get_hyperparameter_names(),
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hp's by method \'%s\': %s", method,
                          str(hp_names))

        # To be set
        self.plots = []

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        max_runs_epm = self.max_runs_epm  # maximum total number of runs used to train the EPM; this bounds the number of configs
        max_configs = int(
            max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and choosing "
                "the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, max_runs_epm)
            all_configs = sorted(
                all_configs,
                key=lambda c: len(original_rh.get_runs_for_config(c)
                                  ))[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory(average_cost)
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

        pcp = ParallelCoordinatesPlotter(config_to_cost, output_dir, cs,
                                         runtime)

        try:
            plots = [
                pcp.plot_n_configs(
                    self.n_configs,
                    self.get_params(self.params, importance, hp_names))
            ]
            self.logger.debug("Paths to plot(s): %s", str(plots))
            return {'figure': plots}
        except ValueError as err:
            self.logger.debug("Error: %s", str(err))
            return {'else': str(err)}
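
A short, self-contained sketch of the importance-averaging logic at the top of this method (the pc_sort_by == 'all' branch): importance values are averaged over every analyzer that actually produced results, and hyperparameters are then sorted by that average in descending order. The analyzer and parameter names below are made up for illustration and are not part of the library.

# Sketch only: hypothetical importance results per analyzer
param_imp = {
    'fanova': {'lr': 0.6, 'depth': 0.3},
    'ablation': {'lr': 0.4, 'depth': 0.5, 'momentum': 0.1},
    'lpi': {},  # not evaluated -> skipped, like the `if i:` check above
}

importance = {}
for method, result in param_imp.items():
    if result:
        for param, imp in result.items():
            importance.setdefault(param, []).append(imp)
importance = {k: sum(v) / len(v) for k, v in importance.items()}

# Stand-in for cs.get_hyperparameter_names(); parameters without importance sort last
hp_names = ['batch_size', 'depth', 'momentum', 'lr']
hp_names = sorted(hp_names, key=lambda x: importance.get(x, 0), reverse=True)
print(hp_names)  # ['lr', 'depth', 'momentum', 'batch_size'] -- lr averages 0.5, depth 0.4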
Example #2
    def _preprocess_budget(
        self,
        original_rh: RunHistory,
        validated_rh: RunHistory,
        validator: Validator,
        scenario: Scenario,
        default: Configuration,
        incumbent: Configuration,
        param_imp: Union[None, Dict[str, float]],
        output_dir: str,
        cs: ConfigurationSpace,
        runtime: bool = False,
    ):
        """
        Preprocess data and save in self.data to enable fast replots

        Parameters
        ----------
        original_rh: RunHistory
            runhistory that should contain only runs executed during the search
        validated_rh: RunHistory
            runhistory that may contain as many runs as possible, including external runs;
            this runhistory is used to build the EPM
        validator: Validator
            validator used to estimate costs for configurations
        scenario: Scenario
            scenario object to take instances from
        default, incumbent: Configuration
            default and incumbent configurations; both are always displayed
        param_imp: Union[None, Dict[str, float]]
            if given, maps parameter names to their importance
        output_dir: str
            output directory for plots
        cs: ConfigurationSpace
            parameter configuration space to be visualized
        runtime: bool
            if True, runtime is plotted on a log scale
        """
        # Sort parameters by importance, if possible ('all' averages over evaluated methods; otherwise fall back to the first evaluated parameter-importance)
        method, importance = "", {}
        if self.pc_sort_by == 'all':
            self.logger.debug("Sorting by average importance")
            method = 'average'
            for m, i in param_imp.items():
                if i:
                    for p, imp in i.items():
                        if p in importance:
                            importance[p].append(imp)
                        else:
                            importance[p] = [imp]
            importance = {k: sum(v) / len(v) for k, v in importance.items()}
        elif self.pc_sort_by in param_imp:
            method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
        else:
            self.logger.debug("%s not evaluated.. choosing at random from: %s",
                              self.pc_sort_by, str(list(param_imp.keys())))
            for m, i in param_imp.items():
                if i:
                    method, importance = m, i
                    self.logger.debug("Chose %s", method)
                    break

        hp_names = sorted(cs.get_hyperparameter_names(),
                          key=lambda x: importance.get(x, 0),
                          reverse=True)
        self.logger.debug("Sorted hyperparameters by method \'%s\': %s",
                          method, str(hp_names))

        # Define set of configurations (limiting to max and choosing most interesting ones)
        all_configs = original_rh.get_all_configs()
        # self.max_runs_epm is the maximum total number of runs used to train the EPM; it bounds the number of configs
        max_configs = int(
            self.max_runs_epm /
            (len(scenario.train_insts) + len(scenario.test_insts)))
        if len(all_configs) > max_configs:
            self.logger.debug(
                "Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                "choosing the ones with the most runs (for parallel coordinates)",
                len(all_configs), max_configs, self.max_runs_epm)
            all_configs = sorted(all_configs,
                                 key=lambda c: len(
                                     original_rh.get_runs_for_config(
                                         c, only_max_observed_budget=False)))
            all_configs = all_configs[:max_configs]
            if default not in all_configs:
                all_configs = [default] + all_configs
            if incumbent not in all_configs:
                all_configs.append(incumbent)

        # Get costs for those configurations
        epm_rh = RunHistory()
        epm_rh.update(validated_rh)
        if scenario.feature_dict:  # if instances are available
            epm_rh.update(
                timing(validator.validate_epm)(all_configs,
                                               'train+test',
                                               1,
                                               runhistory=validated_rh))
        config_to_cost = OrderedDict(
            (c, epm_rh.get_cost(c)) for c in all_configs)

        data = OrderedDict()
        data['cost'] = list(config_to_cost.values())
        for hp in self.runscontainer.scenario.cs.get_hyperparameter_names():
            data[hp] = np.array([
                c[hp]  # if hp in c.get_dictionary() and not isinstance(c[hp], str) else np.nan
                for c in config_to_cost.keys()
            ])
        df = pd.DataFrame(data=data)
        return df
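
The tail of this method flattens the configuration-to-cost mapping into a pandas DataFrame with one 'cost' column plus one column per hyperparameter, which is what later replots reuse. A minimal sketch of that assembly step, with plain dicts standing in for Configuration objects and made-up names and values:

# Sketch only: plain dicts stand in for Configuration, values are invented
from collections import OrderedDict

import numpy as np
import pandas as pd

configs = [{'lr': 0.01, 'depth': 4}, {'lr': 0.10, 'depth': 8}]  # stand-ins for Configuration
costs = [0.23, 0.31]                                            # stand-ins for epm_rh.get_cost(c)
hp_names = ['lr', 'depth']                                      # stand-in for cs.get_hyperparameter_names()

data = OrderedDict()
data['cost'] = costs
for hp in hp_names:
    data[hp] = np.array([c[hp] for c in configs])

df = pd.DataFrame(data=data)
print(df)
#    cost    lr  depth
# 0  0.23  0.01      4
# 1  0.31  0.10      8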