Example #1
def write(run_history: RunHistory, output_directory: str,
          logger: logging.Logger) -> None:
    """Write the runhistory to the output directory.

    Overwrites previously outputted runhistories.

    Parameters
    ----------
    run_history : ~smac.runhistory.runhistory.RunHistory
        RunHistory object to be saved.

    output_directory : str
        Directory the runhistory is written to.

    logger : logging.Logger
        Logger used for debug output.
    """

    output_filename = os.path.join(output_directory, RUNHISTORY_FILEPATTERN)

    logger.debug("Saving runhistory to %s" % output_filename)

    with tempfile.NamedTemporaryFile('wb', dir=output_directory,
                                     delete=False) as fh:
        temporary_filename = fh.name

    run_history.save_json(temporary_filename, save_external=False)
    os.rename(temporary_filename, output_filename)
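The write-to-temp-then-rename sequence above makes the update atomic on POSIX filesystems: readers either see the previous runhistory or the complete new one, never a partial file. A minimal standalone sketch of the same pattern with plain JSON (no SMAC dependency; the helper name and paths are illustrative):

import json
import os
import tempfile

def atomic_write_json(data: dict, output_filename: str) -> None:
    # Write to a temporary file in the target directory first, then rename it
    # over the final name; a rename within one filesystem is atomic on POSIX.
    output_directory = os.path.dirname(output_filename) or "."
    with tempfile.NamedTemporaryFile('w', dir=output_directory,
                                     suffix='.tmp', delete=False) as fh:
        json.dump(data, fh)
        temporary_filename = fh.name
    os.rename(temporary_filename, output_filename)

atomic_write_json({'data': [], 'configs': {}}, 'runhistory.json')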
Example #2
    def _save_results(self, rh: RunHistory, output_fn, backup_fn=None):
        """ Helper to save results to file

        Parameters
        ----------
        rh: RunHistory
            runhistory to save
        output_fn: str
            if it ends with '.json': filename to save the runhistory to
            else: directory to save the runhistory to (filename is backup_fn)
        backup_fn: str
            if output_fn does not end with '.json', treat output_fn as a
            directory and append backup_fn as the filename (if output_fn ends
            with '.json', this argument is ignored)
        """
        if output_fn == "":
            self.logger.info(
                "No output specified, validated runhistory not saved.")
            return
        # Check if a folder or a file is specified as output
        if not output_fn.endswith('.json'):
            output_dir = output_fn
            output_fn = os.path.join(output_dir, backup_fn)
            self.logger.debug("Output is \"%s\", changing to \"%s\"!",
                              output_dir, output_fn)
        base = os.path.split(output_fn)[0]
        if not base == "" and not os.path.exists(base):
            self.logger.debug("Folder (\"%s\") doesn't exist, creating.", base)
            os.makedirs(base)
        rh.save_json(output_fn)
        self.logger.info("Saving validation-results in %s", output_fn)
Example #3
def write(run_history: RunHistory, output_directory: str, num_run: int):
    """Write the runhistory to the output directory.

    Overwrites previously outputted runhistories.

    Parameters
    ----------
    run_history : smac.runhistory.RunHistory
        RunHistory object to be saved.

    output_directory : str

    num_run : int
        ID of the current SMAC run.

    """

    output_filename = os.path.join(output_directory, RUNHISTORY_FILEPATTERN)

    with tempfile.NamedTemporaryFile('wb', dir=output_directory,
                                     delete=False) as fh:
        temporary_filename = fh.name

    run_history.save_json(temporary_filename)
    os.rename(temporary_filename, output_filename)
Example #4
def dummy_traditional_classification(self, time_left: int, func_eval_time_limit_secs: int) -> None:
    run_history = RunHistory()
    run_history.load_json('./.tmp_api/traditional_run_history.json',
                          TraditionalTabularClassificationPipeline(dataset_properties={
                              'numerical_columns': [10]
                          }).get_hyperparameter_search_space())
    self.run_history.update(run_history, DataOrigin.EXTERNAL_SAME_INSTANCES)
    run_history.save_json(os.path.join(self._backend.internals_directory, 'traditional_run_history.json'),
                          save_external=True)
    return
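Assuming save_external behaves as in the SMAC snippets above (it decides whether runs merged in with an external DataOrigin, such as EXTERNAL_SAME_INSTANCES, are written to disk), the two call shapes look like this; file names are placeholders:

# Keep only internally produced runs.
run_history.save_json('run_history_internal.json', save_external=False)
# Also persist runs that were merged in from external sources.
run_history.save_json('run_history_full.json', save_external=True)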
Example #5
    def test_json_origin(self):

        for origin in ['test_origin', None]:
            rh = RunHistory()
            cs = get_config_space()
            config1 = Configuration(cs,
                                    values={'a': 1, 'b': 2},
                                    origin=origin)

            rh.add(config=config1, cost=10, time=20,
                   status=StatusType.SUCCESS, instance_id=1,
                   seed=1)

            path = 'test/test_files/test_json_origin.json'
            rh.save_json(path)
            _ = rh.load_json(path, cs)

            self.assertEqual(rh.get_all_configs()[0].origin, origin)

            os.remove(path)
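get_config_space() is a test helper that is not shown here; a minimal stand-in built directly with ConfigSpace might look as follows (hyperparameter names and ranges are assumptions chosen to match the values used in the test):

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformIntegerHyperparameter

def get_config_space():
    # Two integer hyperparameters 'a' and 'b', covering the values 1 and 2
    # used in the test above.
    cs = ConfigurationSpace()
    cs.add_hyperparameter(UniformIntegerHyperparameter('a', 1, 10, default_value=1))
    cs.add_hyperparameter(UniformIntegerHyperparameter('b', 1, 10, default_value=2))
    return cs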
Example #6
def write(run_history: RunHistory, output_directory: str,
          logger: logging.Logger):
    """Write the runhistory to the output directory.

    Overwrites previously outputted runhistories.

    Parameters
    ----------
    run_history : ~smac.runhistory.runhistory.RunHistory
        RunHistory object to be saved.

    output_directory : str

    logger : logging.Logger
    """
    file_system = run_history.file_system
    output_filename = file_system.join(output_directory,
                                       RUNHISTORY_FILEPATTERN)

    logger.debug("Saving runhistory to %s" % output_filename)

    run_history.save_json(output_filename, save_external=False)
class Hydra(object):
    """
    Facade to use Hydra default mode

    Attributes
    ----------
    logger
    stats : Stats
        logs information about used resources
    solver : SMBO
        handles the actual algorithm calls
    rh : RunHistory
        Information about previous runs
    portfolio : list
        List of all incumbents

    """
    def __init__(self,
                 scenario: Scenario,
                 n_iterations: int,
                 val_set: str = 'train',
                 incs_per_round: int = 1,
                 n_optimizers: int = 1,
                 rng: typing.Optional[typing.Union[np.random.RandomState,
                                                   int]] = None,
                 run_id: int = 1,
                 tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
                 tae_kwargs: typing.Union[dict, None] = None,
                 **kwargs):
        """
        Constructor

        Parameters
        ----------
        scenario : ~smac.scenario.scenario.Scenario
            Scenario object
        n_iterations: int,
            number of Hydra iterations
        val_set: str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X% of the training set, where X in (0, 100)
        incs_per_round: int
            Number of incumbents to keep per round
        n_optimizers: int
            Number of optimizers to run in parallel per round
        rng: int/np.random.RandomState
            The randomState/seed to pass to each smac run
        run_id: int
            run_id for this hydra run
        tae: ExecuteTARun
            Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
        tae_kwargs: Optional[dict]
            arguments passed to constructor of '~tae'

        """
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        self.n_iterations = n_iterations
        self.scenario = scenario
        self.run_id, self.rng = get_rng(rng, run_id, self.logger)
        self.kwargs = kwargs
        self.output_dir = None
        self.top_dir = None
        self.solver = None
        self.portfolio = None
        self.rh = RunHistory(average_cost, file_system=scenario.file_system)
        self._tae = tae
        self._tae_kwargs = tae_kwargs
        if incs_per_round <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1',
                                'incs_per_round', incs_per_round)
        self.incs_per_round = max(incs_per_round, 1)
        if n_optimizers <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1',
                                'n_optimizers', n_optimizers)
        self.n_optimizers = max(n_optimizers, 1)
        self.val_set = self._get_validation_set(val_set)
        self.cost_per_inst = {}
        self.optimizer = None
        self.portfolio_cost = None

    def _get_validation_set(self,
                            val_set: str,
                            delete: bool = True) -> typing.List[str]:
        """
        Create small validation set for hydra to determine incumbent performance

        Parameters
        ----------
        val_set: str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X% of the training set, where X in (0, 100)
        delete: bool
            Flag to delete all validation instances from the training set

        Returns
        -------
        val: typing.List[str]
            List of instance-ids to validate on

        """
        if val_set == 'none':
            return None
        if val_set == 'train':
            return self.scenario.train_insts
        elif val_set[:3] != 'val':
            self.logger.warning(
                'Can not determine validation set size. Using full training-set!'
            )
            return self.scenario.train_insts
        else:
            size = int(val_set[3:]) / 100
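            # e.g. val_set='val20' -> size = 0.2, i.e. 20% of the training set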
            if size <= 0 or size >= 1:
                raise ValueError(
                    'X invalid in valX, should be in (0, 100)')
            insts = np.array(self.scenario.train_insts)
            # just to make sure this also works with the small example we have to round up to 3
            size = max(np.floor(insts.shape[0] * size).astype(int), 3)
            ids = np.random.choice(insts.shape[0], size, replace=False)
            val = insts[ids].tolist()
            if delete:
                self.scenario.train_insts = np.delete(insts, ids).tolist()
            return val

    def optimize(self) -> typing.List[Configuration]:
        """
        Optimizes the algorithm provided in scenario (given in constructor)

        Returns
        -------
        portfolio : typing.List[Configuration]
            Portfolio of found configurations

        """
        # Setup output directory
        self.portfolio = []
        portfolio_cost = np.inf
        if self.output_dir is None:
            self.top_dir = "hydra-output_%s" % (
                datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f'))
            self.scenario.output_dir = os.path.join(
                self.top_dir,
                "psmac3-output_%s" % (datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario,
                                                      run_id=self.run_id,
                                                      logger=self.logger)

        scen = copy.deepcopy(self.scenario)
        scen.output_dir_for_this_run = None
        scen.output_dir = None
        # parent process SMAC only used for validation purposes
        self.solver = SMAC4AC(scenario=scen,
                              tae_runner=self._tae,
                              rng=self.rng,
                              run_id=self.run_id,
                              **self.kwargs)
        for i in range(self.n_iterations):
            self.logger.info("=" * 120)
            self.logger.info("Hydra Iteration: %d", (i + 1))

            if i == 0:
                tae = self._tae
                tae_kwargs = self._tae_kwargs
            else:
                tae = ExecuteTARunHydra
                if self._tae_kwargs:
                    tae_kwargs = self._tae_kwargs
                else:
                    tae_kwargs = {}
                tae_kwargs['cost_oracle'] = self.cost_per_inst
            self.optimizer = PSMAC(
                scenario=self.scenario,
                run_id=self.run_id,
                rng=self.rng,
                tae=tae,
                tae_kwargs=tae_kwargs,
                shared_model=False,
                validate=True if self.val_set else False,
                n_optimizers=self.n_optimizers,
                val_set=self.val_set,
                n_incs=self.n_optimizers,  # return all configurations (unvalidated)
                **self.kwargs)
            self.optimizer.output_dir = self.output_dir
            incs = self.optimizer.optimize()
            cost_per_conf_v, val_ids, cost_per_conf_e, est_ids = self.optimizer.get_best_incumbents_ids(
                incs)
            if self.val_set:
                to_keep_ids = val_ids[:self.incs_per_round]
            else:
                to_keep_ids = est_ids[:self.incs_per_round]
            config_cost_per_inst = {}
            incs = incs[to_keep_ids]
            self.logger.info('Kept incumbents')
            for inc in incs:
                self.logger.info(inc)
                config_cost_per_inst[inc] = cost_per_conf_v[
                    inc] if self.val_set else cost_per_conf_e[inc]

            cur_portfolio_cost = self._update_portfolio(
                incs, config_cost_per_inst)
            if portfolio_cost <= cur_portfolio_cost:
                self.logger.info(
                    "No further progress (%f) --- terminate hydra",
                    portfolio_cost)
                break
            else:
                portfolio_cost = cur_portfolio_cost
                self.logger.info("Current pertfolio cost: %f", portfolio_cost)

            self.scenario.output_dir = os.path.join(
                self.top_dir,
                "psmac3-output_%s" % (datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario,
                                                      run_id=self.run_id,
                                                      logger=self.logger)
        read(self.rh,
             os.path.join(self.top_dir, 'psmac3*', 'run_' + str(MAXINT)),
             self.scenario.cs, self.logger)
        self.rh.save_json(fn=os.path.join(
            self.top_dir, 'all_validated_runs_runhistory.json'),
                          save_external=True)
        with open(os.path.join(self.top_dir, 'portfolio.pkl'), 'wb') as fh:
            pickle.dump(self.portfolio, fh)
        self.logger.info("~" * 120)
        self.logger.info('Resulting Portfolio:')
        for configuration in self.portfolio:
            self.logger.info(str(configuration))
        self.logger.info("~" * 120)

        return self.portfolio

    def _update_portfolio(
            self, incs: np.ndarray, config_cost_per_inst: typing.Dict
    ) -> typing.Union[np.float, float]:
        """
        Adds the kept configurations (incs) to the portfolio and updates the portfolio cost

        Parameters
        ----------
        incs: np.ndarray
            Array of kept Configurations
        config_cost_per_inst: typing.Dict
            Mapping from each kept configuration to its cost per instance

        Returns
        -------
        cur_cost: typing.Union[np.float, float]
            The current cost of the portfolio

        """
        if self.val_set:  # we have validated data
            for kept in incs:
                if kept not in self.portfolio:
                    self.portfolio.append(kept)
                    cost_per_inst = config_cost_per_inst[kept]
                    if self.cost_per_inst:
                        if len(self.cost_per_inst) != len(cost_per_inst):
                            raise ValueError(
                                'Num validated Instances mismatch!')
                        else:
                            for key in cost_per_inst:
                                self.cost_per_inst[key] = min(
                                    self.cost_per_inst[key],
                                    cost_per_inst[key])
                    else:
                        self.cost_per_inst = cost_per_inst
            cur_cost = np.mean(list(
                self.cost_per_inst.values()))  # type: np.float
        else:  # No validated data. Set the mean to the approximated mean
            # Can contain NaNs, since not every instance was evaluated; use
            # nanmean to approximate the mean cost.
            means = []
            for kept in incs:
                means.append(
                    np.nanmean(
                        list(
                            self.optimizer.rh.get_instance_costs_for_config(
                                kept).values())))
                self.portfolio.append(kept)
            if self.portfolio_cost:
                new_mean = self.portfolio_cost * (
                    len(self.portfolio) - len(incs)) / len(self.portfolio)
                new_mean += np.nansum(means)
            else:
                new_mean = np.mean(means)
            self.cost_per_inst = defaultdict(lambda: new_mean)
            cur_cost = new_mean

        self.portfolio_cost = cur_cost
        return cur_cost
Example #8
    def hpbandster2smac(self, folder, result, cs_options, output_dir: str):
        """Reading hpbandster-result-object and creating RunHistory and trajectory...

        Parameters
        ----------
        folder: str (path)
            original folder
        result: hpbandster.core.result.Result
            bohb's result-object
        cs_options: list[ConfigurationSpace]
            the configuration spaces. In the best case it's a single element,
            but for pcs-format we need to guess through a list of possible
            configspaces
        output_dir: str
            the output-dir to save the smac-runs to

        Returns
        -------
        converted: dict{
                'new_path' : path_to_converted_input,
                'hpbandster_result' : result_in_hpbandster_format,
                'config_space' : config_space,
                'runhistory' : runhistory,
                'validated_runhistory' : validated_runhistory,
                'scenario' : scenario,
                'trajectory' : trajectory,
                }

        """
        self.logger.debug("Budgets for '%s': %s" %
                          (folder, str(result.HB_config['budgets'])))
        ##########################
        # 1. Create runhistory   #
        ##########################
        id2config_mapping = result.get_id2config_mapping()
        skipped = {'None': 0, 'NaN': 0}
        rh = RunHistory()
        for run in result.get_all_runs():
            # Load config...
            config = None
            while config is None:
                if len(cs_options) == 0:
                    self.logger.debug("None of the alternatives worked...")
                    raise ValueError(
                        "Your configspace seems to be corrupt. If you use floats (or mix up ints, bools "
                        "and strings) as categoricals, please consider using the .json-format, as the "
                        ".pcs-format cannot recover the type of categoricals. Otherwise please report "
                        "this to https://github.com/automl/CAVE/issues (and attach the debug.log)"
                    )
                try:
                    config = self._get_config(run.config_id, id2config_mapping,
                                              cs_options[0])
                except ValueError as err:
                    self.logger.debug(
                        "Loading config failed. Trying %d alternatives" %
                        (len(cs_options) - 1),
                        exc_info=1)
                    cs_options = cs_options[1:]  # remove the failing cs-version

            # Filter corrupted loss-values (ignore them)
            if run.loss is None:
                skipped['None'] += 1
                continue
            if np.isnan(run.loss):
                skipped['NaN'] += 1
                continue

            rh.add(config=config,
                   cost=run.loss,
                   time=run.time_stamps['finished'] -
                   run.time_stamps['started'],
                   status=StatusType.SUCCESS,
                   budget=run.budget,
                   seed=0,
                   additional_info={
                       'info': run.info,
                       'timestamps': run.time_stamps
                   })

        self.logger.debug(
            "Skipped %d None- and %d NaN-loss-values in BOHB-result",
            skipped['None'], skipped['NaN'])

        ##########################
        # 2. Create all else     #
        ##########################
        scenario = Scenario({
            'run_obj': 'quality',
            'cs': cs_options[0],
            'output_dir': output_dir,
            'deterministic': True,  # At the time of writing, BOHB always treats TAs as deterministic
        })
        scenario.output_dir_for_this_run = output_dir
        scenario.write()

        with open(os.path.join(output_dir, 'configspace.json'), 'w') as fh:
            fh.write(pcs_json.write(cs_options[0]))

        rh.save_json(fn=os.path.join(output_dir, 'runhistory.json'))

        trajectory = self.get_trajectory(result, output_dir, scenario, rh)

        return {
            'new_path': output_dir,
            'hpbandster_result': result,
            'config_space': cs_options[0],
            'runhistory': rh,
            'validated_runhistory': None,
            'scenario': scenario,
            'trajectory': trajectory,
        }
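A self-contained sketch of the RunHistory calls used above (add a run with a budget, then save_json); it assumes SMAC and ConfigSpace are installed, and the StatusType import path may differ between SMAC releases:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.runhistory.runhistory import RunHistory
from smac.tae import StatusType  # older releases expose it under smac.tae.execute_ta_run

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter('x', 0.0, 1.0))
config = cs.sample_configuration()

rh = RunHistory()
rh.add(config=config, cost=0.42, time=1.3,
       status=StatusType.SUCCESS, budget=9, seed=0,
       additional_info={'info': None})
rh.save_json('runhistory.json')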
Example #9
class Validator(object):
    """
    Validator for already run SMAC-scenarios, evaluates specified configurations
    on specified instances.
    """
    def __init__(self, scenario, trajectory, output, rng=None):
        """
        Construct Validator for given scenario and trajectory.

        Parameters
        ----------
        scenario: Scenario
            scenario object for cutoff, instances and specifics
        trajectory: Trajectory
            trajectory to take incumbent(s) from
        output: string
            path to runhistory to be saved
        rng: np.random.RandomState
            Random number generator
        """
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)

        self.scen = scenario
        self.traj = trajectory
        if output:
            self.output = output
        else:
            self.output = "validation_rh.json"
        if isinstance(rng, np.random.RandomState):
            self.rng = rng
        elif isinstance(rng, int):
            self.rng = np.random.RandomState(seed=rng)
        else:
            num_run = np.random.randint(MAXINT)
            self.rng = np.random.RandomState(seed=num_run)

        self.rh = RunHistory(
            average_cost)  # update this rh with validation-runs

    def validate(self,
                 config_mode: str = 'def',
                 instance_mode: str = 'test',
                 repetitions: int = 1,
                 n_jobs: int = 1,
                 backend: str = 'threading',
                 runhistory: RunHistory = None,
                 tae: ExecuteTARun = None):
        """
        Validate configs on instances and save result in runhistory.

        Parameters
        ----------
        config_mode: string
            what configurations to validate
            from [def, inc, def+inc, time, all], time means evaluation at
            timesteps 2^-4, 2^-3, 2^-2, 2^-1, 2^0, 2^1, ...
        instance_mode: string
            what instances to use for validation, from [train, test, train+test]
        repetitions: int
            number of repetitions in nondeterministic algorithms
        n_jobs: int
            number of parallel processes used by joblib
        runhistory: RunHistory or string or None
            runhistory to take data from
        tae: ExecuteTARun
            tae to be used. if none, will initialize ExecuteTARunOld

        Returns
        -------
        runhistory: RunHistory
            runhistory with validated runs
        """
        self.logger.debug(
            "Validating configs '%s' on instances '%s', repeating %d times"
            " with %d parallel runs on backend '%s'.", config_mode,
            instance_mode, repetitions, n_jobs, backend)
        # Reset runhistory
        self.rh = RunHistory(average_cost)

        # Get relevant configurations and instances
        configs = self._get_configs(config_mode)
        instances = self._get_instances(instance_mode)

        # If runhistory is given as string, load into memory
        if isinstance(runhistory, str):
            fn = runhistory
            runhistory = RunHistory(average_cost)
            runhistory.load_json(fn, self.scen.cs)

        # Get all runs needed as list
        runs = self.get_runs(configs,
                             instances,
                             repetitions=repetitions,
                             runhistory=runhistory)

        # Create new Stats without limits
        inf_scen = Scenario({
            'run_obj': self.scen.run_obj,
            'cutoff_time': self.scen.cutoff,
            'output_dir': None
        })
        inf_stats = Stats(inf_scen)
        inf_stats.start_timing()

        # Create TAE
        if not tae:
            tae = ExecuteTARunOld(ta=self.scen.ta,
                                  stats=inf_stats,
                                  run_obj=self.scen.run_obj,
                                  par_factor=self.scen.par_factor,
                                  cost_for_crash=self.scen.cost_for_crash)
        else:
            # Inject endless-stats
            tae.stats = inf_stats

        # Validate!
        run_results = self._validate_parallel(tae, runs, n_jobs, backend)

        # tae returns (status, cost, runtime, additional_info)
        # Add runs to RunHistory
        idx = 0
        for result in run_results:
            self.rh.add(config=runs[idx]['config'],
                        cost=result[1],
                        time=result[2],
                        status=result[0],
                        instance_id=runs[idx]['inst'],
                        seed=runs[idx]['seed'],
                        additional_info=result[3])
            idx += 1

        # Save runhistory
        if not self.output.endswith('.json'):
            old = self.output
            self.output = os.path.join(self.output,
                                       'validated_runhistory.json')
            self.logger.debug("Output is \"%s\", changing to \"%s\"!", old,
                              self.output)
        base = os.path.split(self.output)[0]
        if not os.path.exists(base):
            self.logger.debug("Folder (\"%s\") doesn't exist, creating.", base)
            os.makedirs(base)
        self.logger.info("Saving validation-results in %s", self.output)
        self.rh.save_json(self.output)
        return self.rh

    def _validate_parallel(self, tae, runs, n_jobs, backend):
        """
        Validate runs with joblibs Parallel-interface

        Parameters
        ----------
        tae: ExecuteTARun
            tae to be used for validation
        runs: list<dict<string,string,string,string>>
            list with dicts
            [{"config":CONFIG,"inst":INSTANCE,"seed":SEED,"inst_specs":INST_SPECIFICS}]
        n_jobs: int
            number of cpus to use for validation
        backend: string
            what backend to use for parallelization

        Returns
        -------
        run_results: list<tuple(tae-returns)>
            results as returned by tae
        """
        # Runs with parallel
        run_results = Parallel(n_jobs=n_jobs, backend=backend)(
            delayed(_unbound_tae_starter)(tae,
                                          run['config'],
                                          run['inst'],
                                          self.scen.cutoff,
                                          run['seed'],
                                          run['inst_specs'],
                                          capped=False) for run in runs)
        return run_results

    def get_runs(self, configs, insts, repetitions=1, runhistory=None):
        """
        Generate list of SMAC-TAE runs to be executed. This means
        combinations of configs with all instances on a certain number of seeds.

        Parameters
        ----------
        configs: list<Configuration>
            configurations to be evaluated
        insts: list<strings>
            instances to be validated
        repetitions: int
            number of seeds per instance/config to be evaluated
        runhistory: RunHistory or None
            if given, try to reuse these results and save some runs

        Returns
        -------
        runs: list<dict<string,string,string,string>>
            list with dicts
            [{"config":CONFIG1,"inst":INSTANCE1,"seed":SEED1,"inst_specs":INST_SPECIFICS1},
             {"config":CONFIG2,"inst":INSTANCE2,"seed":SEED2,"inst_specs":INST_SPECIFICS2}]
        """
        # If no instances are given, fix the instances to one "None" instance
        if len(insts) == 0:
            insts = [None]
        # If algorithm is deterministic, fix repetitions to 1
        if self.scen.deterministic:
            self.logger.debug("Fixing repetitions to one, because algorithm is"
                              " deterministic.")
            repetitions = 1

        # Extract relevant information from given runhistory
        inst_seed_config = self._process_runhistory(configs, insts, runhistory)

        # Now create the actual run-list
        runs = []
        # Counter for runs without the need of recalculation
        runs_from_rh = 0

        for i in sorted(insts):
            for rep in range(repetitions):
                configs_evaluated = []
                if runhistory and i in inst_seed_config:
                    # Choose seed based on most often evaluated inst-seed-pair
                    seed, configs_evaluated = inst_seed_config[i].pop(0)
                    # Delete i from dict if list is empty
                    if len(inst_seed_config[i]) == 0:
                        inst_seed_config.pop(i)
                    # Add runs to runhistory
                    for c in configs_evaluated:
                        runkey = RunKey(runhistory.config_ids[c], i, seed)
                        cost, time, status, additional_info = runhistory.data[
                            runkey]
                        self.rh.add(c,
                                    cost,
                                    time,
                                    status,
                                    instance_id=i,
                                    seed=seed,
                                    additional_info=additional_info)
                        runs_from_rh += 1
                else:
                    # If no runhistory or no entries for instance, get new seed
                    seed = self.rng.randint(MAXINT)
                    if self.scen.deterministic:
                        seed = 0
                # configs in inner loop -> same inst-seed-pairs for all configs
                for config in [c for c in configs
                               if c not in configs_evaluated]:
                    specs = self.scen.instance_specific[
                        i] if i and i in self.scen.instance_specific else "0"
                    runs.append({
                        'config': config,
                        'inst': i,
                        'seed': seed,
                        'inst_specs': specs
                    })

        self.logger.info(
            "Collected %d runs from %d configurations on %d instances "
            "with %d repetitions.", len(runs), len(configs), len(insts),
            repetitions)
        self.logger.info("Using %d runs from given runhistory.", runs_from_rh)

        return runs

    def _process_runhistory(self, configs, insts, runhistory):
        """
        Processes runhistory from self.get_runs by extracting already evaluated
        (relevant) config-inst-seed tuples.

        Parameters
        ----------
        configs: list(Configuration)
            list of configs of interest
        insts: list(str)
            list of instances of interest
        runhistory: RunHistory
            runhistory to extract runs from

        Returns
        -------
        inst_seed_config: dict<str : list(tuple(int, tuple(configs)))>
            dictionary mapping instances to a list of tuples of already used
            seeds and the configs that this inst-seed-pair has been evaluated
            on, sorted by the number of configs
        """
        # We want to reuse seeds that have been used on most configurations
        # To this end, we create a dictionary as {instances:{seed:[configs]}}
        # Like this we can easily retrieve the most used instance-seed pairs to
        # minimize the number of runs to be evaluated
        inst_seed_config = {}
        if runhistory:
            relevant = dict()
            for key in runhistory.data:
                if (runhistory.ids_config[key.config_id] in configs
                        and key.instance_id in insts):
                    relevant[key] = runhistory.data[key]

            # Change data-structure to {instances:[(seed1, (configs)), (seed2, (configs), ... ]}
            # to make most used seed easily accessible, we sort after length of configs
            for key in relevant:
                inst, seed = key.instance_id, key.seed
                config = runhistory.ids_config[key.config_id]
                if inst in inst_seed_config:
                    if seed in inst_seed_config[inst]:
                        inst_seed_config[inst][seed].append(config)
                    else:
                        inst_seed_config[inst][seed] = [config]
                else:
                    inst_seed_config[inst] = {seed: [config]}

            inst_seed_config = {
                i: sorted([(seed, tuple(inst_seed_config[i][seed]))
                           for seed in inst_seed_config[i]],
                          key=lambda x: len(x[1]))
                for i in inst_seed_config
            }
        return inst_seed_config

    def _get_configs(self, mode):
        """
        Return desired configs

        Parameters
        ----------
        mode : string
            from [def, inc, def+inc, time, all], time means evaluation at
            timesteps 2^-4, 2^-3, 2^-2, 2^-1, 2^0, 2^1, ...

        Returns
        -------
        configs: list<Configuration>
            list with desired configurations
        """
        # Add desired configs
        configs = []
        mode = mode.lower()
        if mode not in ['def', 'inc', 'def+inc', 'time', 'all']:
            raise ValueError(
                "%s not a valid option for config_mode in validation." % mode)
        if mode == "def" or mode == "def+inc":
            configs.append(self.scen.cs.get_default_configuration())
        if mode == "inc" or mode == "def+inc":
            configs.append(self.traj[-1]["incumbent"])
        if mode == "time":
            # add configs at evaluations 2^1, 2^2, 2^3, ...
            configs.append(self.traj[0]["incumbent"])  # add first
            counter = 2 ^ (-4)
            for entry in self.traj[:-1]:
                if (entry["wallclock_time"] >= counter
                        and entry["incumbent"] not in configs):
                    configs.append(entry["incumbent"])
                    counter *= 2
            configs.append(self.traj[-1]["incumbent"])  # add last
        if mode == "all":
            for entry in self.traj:
                if not entry["incumbent"] in configs:
                    configs.append(entry["incumbent"])
        self.logger.debug("Gathered %d configurations for mode %s.",
                          len(configs), mode)
        return configs

    def _get_instances(self, mode):
        """
        Get desired instances

        Parameters
        ----------
        mode: string
            what instances to use for validation, from [train, test, train+test]

        Returns
        -------
        instances: list<strings>
            instances to be used
        """
        instance_mode = mode.lower()
        if instance_mode not in ['train', 'test', 'train+test']:
            raise ValueError(
                "%s not a valid option for instance_mode in validation." %
                mode)

        # Make sure that, if instances matter, instances are actually passed
        if ((instance_mode == 'train' and self.scen.train_insts == [None]) or
            (instance_mode == 'test' and self.scen.test_insts == [None])):
            self.logger.warning(
                "Instance mode is set to %s, but there are no "
                "%s-instances specified in the scenario. Setting instance mode "
                "to \"train+test\"!", instance_mode, instance_mode)
            instance_mode = 'train+test'

        instances = []
        if ((instance_mode == 'train' or instance_mode == 'train+test')
                and not self.scen.train_insts == [None]):
            instances.extend(self.scen.train_insts)
        if ((instance_mode == 'test' or instance_mode == 'train+test')
                and not self.scen.test_insts == [None]):
            instances.extend(self.scen.test_insts)
        return instances
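The Parallel/delayed pattern that _validate_parallel uses, reduced to a minimal runnable sketch (assumes joblib is installed; the worker function is a stand-in for the target-algorithm runner):

from joblib import Parallel, delayed

def run_target(config_id: int, seed: int) -> float:
    # Stand-in for a single target-algorithm evaluation.
    return config_id * 0.1 + seed * 0.01

run_results = Parallel(n_jobs=2, backend='threading')(
    delayed(run_target)(c, s) for c in range(3) for s in range(2))
print(run_results)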
Example #10
class CAVE(object):
    """
    """
    def __init__(self,
                 folders: typing.List[str],
                 output: str,
                 ta_exec_dir: Union[str, None] = None,
                 missing_data_method: str = 'epm',
                 max_pimp_samples: int = -1,
                 fanova_pairwise=True):
        """
        Initialize CAVE facade to handle analyzing, plotting and building the
        report-page easily. During initialization, the analysis-infrastructure
        is built and the data is validated, meaning the overall best
        incumbent is found and default+incumbent are evaluated for all
        instances for all runs, by default using an EPM.
        The class holds two runhistories:
            self.original_rh -> only contains runs from the actual data
            self.validated_rh -> contains original runs and epm-predictions for
                                 all incumbents
        The analyze()-method performs an analysis and outputs a report.html.

        Parameters
        ----------
        folders: list<strings>
            paths to relevant SMAC runs
        output: string
            output for cave to write results (figures + report)
        ta_exec_dir: string
            execution directory for target algorithm (to find instance.txt, ..)
        missing_data_method: string
            from [validation, epm], how to estimate missing runs
        """
        self.logger = logging.getLogger("cave.cavefacade")
        self.logger.debug("Folders: %s", str(folders))
        self.ta_exec_dir = ta_exec_dir

        # Create output if necessary
        self.output = output
        self.logger.info("Saving results to %s", self.output)
        if not os.path.exists(output):
            self.logger.debug("Output-dir %s does not exist, creating",
                              self.output)
            os.makedirs(output)
        if not os.path.exists(os.path.join(self.output, "debug")):
            os.makedirs(os.path.join(self.output, "debug"))
        # Log to file
        logger = logging.getLogger()
        handler = logging.FileHandler(
            os.path.join(self.output, "debug/debug.log"), "w")
        handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)

        # Global runhistory combines all actual runs of individual SMAC-runs
        # We save the combined (unvalidated) runhistory to disk, so we can use it later on.
        # We keep the validated runhistory (with as many runs as possible) in
        # memory. The distinction is made to avoid using runs that are
        # only estimated using an EPM for further EPMs or to handle runs
        # validated on different hardware (depending on validation-method).
        self.original_rh = RunHistory(average_cost)
        self.validated_rh = RunHistory(average_cost)

        # Save all relevant SMAC-runs in a list
        self.runs = []
        for folder in folders:
            try:
                self.logger.debug("Collecting data from %s.", folder)
                self.runs.append(SMACrun(folder, ta_exec_dir))
            except Exception as err:
                self.logger.warning(
                    "Folder %s could not be loaded, failed "
                    "with error message: %s", folder, err)
                continue
        if not len(self.runs):
            raise ValueError(
                "None of the specified SMAC-folders could be loaded.")

        # Use scenario of first run for general purposes (expecting they are all the same anyway!)
        self.scenario = self.runs[0].solver.scenario

        # Update global runhistory with all available runhistories
        self.logger.debug("Update original rh with all available rhs!")
        runhistory_fns = [
            os.path.join(run.folder, "runhistory.json") for run in self.runs
        ]
        for rh_file in runhistory_fns:
            self.original_rh.update_from_json(rh_file, self.scenario.cs)
        self.logger.debug(
            'Combined number of Runhistory data points: %d. '
            '# Configurations: %d. # Runhistories: %d',
            len(self.original_rh.data),
            len(self.original_rh.get_all_configs()), len(runhistory_fns))
        self.original_rh.save_json(
            os.path.join(self.output, "combined_rh.json"))

        # Validator for a) validating with epm, b) plot over time
        # Initialize without trajectory
        self.validator = Validator(self.scenario, None, None)

        # Estimate missing costs for [def, inc1, inc2, ...]
        self.complete_data(method=missing_data_method)
        self.best_run = min(
            self.runs,
            key=lambda run: self.validated_rh.get_cost(run.solver.incumbent))

        self.default = self.scenario.cs.get_default_configuration()
        self.incumbent = self.best_run.solver.incumbent

        self.logger.debug("Overall best run: %s, with incumbent: %s",
                          self.best_run.folder, self.incumbent)

        # Following variable determines whether a distinction is made
        # between train and test-instances (e.g. in plotting)
        self.train_test = bool(self.scenario.train_insts != [None]
                               and self.scenario.test_insts != [None])

        self.analyzer = Analyzer(self.original_rh, self.validated_rh,
                                 self.default, self.incumbent, self.train_test,
                                 self.scenario, self.validator, self.output,
                                 max_pimp_samples, fanova_pairwise)

        self.builder = HTMLBuilder(self.output, "CAVE")
        # Builder for html-website
        self.website = OrderedDict([])

    def complete_data(self, method="epm"):
        """Complete missing data of runs to be analyzed. Either using validation
        or EPM.
        """
        with changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
            self.logger.info("Completing data using %s.", method)

            path_for_validated_rhs = os.path.join(self.output, "validated_rhs")
            for run in self.runs:
                self.validator.traj = run.traj
                if method == "validation":
                    # TODO determine # repetitions
                    new_rh = self.validator.validate(
                        'def+inc',
                        'train+test',
                        1,
                        -1,
                        runhistory=self.original_rh)
                elif method == "epm":
                    new_rh = self.validator.validate_epm(
                        'def+inc',
                        'train+test',
                        1,
                        runhistory=self.original_rh)
                else:
                    raise ValueError("Missing data method illegal (%s)",
                                     method)
                self.validator.traj = None  # Avoid usage-mistakes
                self.validated_rh.update(new_rh)

    def analyze(self,
                performance=True,
                cdf=True,
                scatter=True,
                confviz=True,
                param_importance=['forward_selection', 'ablation', 'fanova'],
                feature_analysis=[
                    "box_violin", "correlation", "importance",
                    "clustering", "feature_cdf"
                ],
                parallel_coordinates=True,
                cost_over_time=True,
                algo_footprint=True):
        """Analyze the available data and build HTML-webpage as dict.
        Save webpage in 'self.output/CAVE/report.html'.
        Analyzing is performed with the analyzer-instance that is initialized in
        the __init__

        Parameters
        ----------
        performance: bool
            whether to calculate par10-values
        cdf: bool
            whether to plot cdf
        scatter: bool
            whether to plot scatter
        confviz: bool
            whether to perform configuration visualization
        param_importance: List[str]
            containing methods for parameter importance
        feature_analysis: List[str]
            containing methods for feature analysis
        parallel_coordinates: bool
            whether to plot parallel coordinates
        cost_over_time: bool
            whether to plot cost over time
        algo_footprint: bool
            whether to plot algorithm footprints
        """

        # Check arguments
        for p in param_importance:
            if p not in [
                    'forward_selection', 'ablation', 'fanova', 'incneighbor'
            ]:
                raise ValueError(
                    "%s not a valid option for parameter "
                    "importance!" % p)
        for f in feature_analysis:
            if f not in [
                    "box_violin", "correlation", "importance", "clustering",
                    "feature_cdf"
            ]:
                raise ValueError("%s not a valid option for feature analysis!",
                                 f)

        # Start analysis
        overview = self.analyzer.create_overview_table(self.best_run.folder)
        self.website["Meta Data"] = {"table": overview}

        compare_config = self.analyzer.config_to_html(self.default,
                                                      self.incumbent)
        self.website["Best configuration"] = {"table": compare_config}

        ########## PERFORMANCE ANALYSIS
        self.website["Performance Analysis"] = OrderedDict()

        if performance:
            performance_table = self.analyzer.create_performance_table(
                self.default, self.incumbent)
            self.website["Performance Analysis"]["Performance Table"] = {
                "table": performance_table
            }

        if cdf:
            cdf_path = self.analyzer.plot_cdf()
            self.website["Performance Analysis"][
                "empirical Cumulative Distribution Function (eCDF)"] = {
                    "figure": cdf_path
                }

        if scatter and (self.scenario.train_insts != [None]):
            scatter_path = self.analyzer.plot_scatter()
            self.website["Performance Analysis"]["Scatterplot"] = {
                "figure": scatter_path
            }
        elif scatter:
            self.logger.info(
                "Scatter plot desired, but no instances available.")

        # Build report before time-consuming analysis
        self.build_website()

        if algo_footprint and self.scenario.feature_dict:
            algorithms = {self.default: "default", self.incumbent: "incumbent"}
            # Add all available incumbents to test portfolio strategy
            #for r in self.runs:
            #    if not r.get_incumbent() in algorithms:
            #        algorithms[r.get_incumbent()] = str(self.runs.index(r))

            algo_footprint_plots = self.analyzer.plot_algorithm_footprint(
                algorithms)
            self.website["Performance Analysis"][
                "Algorithm Footprints"] = OrderedDict()
            for p in algo_footprint_plots:
                header = os.path.splitext(os.path.split(p)[1])[0]  # algo name
                self.website["Performance Analysis"]["Algorithm Footprints"][
                    header] = {
                        "figure": p,
                        "tooltip":
                        get_tooltip("Algorithm Footprints") + ": " + header
                    }

        self.build_website()

        ########### Configurator's behavior
        self.website["Configurator's behavior"] = OrderedDict()

        if confviz and self.scenario.feature_dict:
            if self.scenario.feature_array is None:
                self.scenario.feature_array = np.array([[]])
            # Sort runhistories and incs wrt cost
            incumbents = [r.solver.incumbent for r in self.runs]
            trajectories = [r.traj for r in self.runs]
            runhistories = [r.runhistory for r in self.runs]
            costs = [self.validated_rh.get_cost(i) for i in incumbents]
            costs, incumbents, runhistories, trajectories = (
                list(t) for t in zip(
                    *sorted(zip(costs, incumbents, runhistories, trajectories),
                            key=lambda x: x[0])))
            incumbents = list(map(lambda x: x['incumbent'], trajectories[0]))

            confviz_script = self.analyzer.plot_confviz(
                incumbents, runhistories)
            self.website["Configurator's behavior"][
                "Configurator Footprint"] = {
                    "table": confviz_script
                }
        elif confviz:
            self.logger.info("Configuration visualization desired, but no "
                             "instance-features available.")

        self.build_website()

        if cost_over_time:
            cost_over_time_path = self.analyzer.plot_cost_over_time(
                self.best_run.traj, self.validator)
            self.website["Configurator's behavior"]["Cost over time"] = {
                "figure": cost_over_time_path
            }

        self.build_website()

        self.parameter_importance(ablation='ablation' in param_importance,
                                  fanova='fanova' in param_importance,
                                  forward_selection='forward_selection'
                                  in param_importance,
                                  incneighbor='incneighbor'
                                  in param_importance)

        self.build_website()

        if parallel_coordinates:
            # Should be after parameter importance, if performed.
            n_params = 6
            parallel_path = self.analyzer.plot_parallel_coordinates(n_params)
            self.website["Configurator's behavior"]["Parallel Coordinates"] = {
                "figure": parallel_path
            }

        self.build_website()

        if self.scenario.feature_dict:
            self.feature_analysis(box_violin='box_violin' in feature_analysis,
                                  correlation='correlation'
                                  in feature_analysis,
                                  clustering='clustering' in feature_analysis,
                                  importance='importance' in feature_analysis)
        else:
            self.logger.info('No feature analysis possible')

        self.logger.info("CAVE finished. Report is located in %s",
                         os.path.join(self.output, 'report.html'))

        self.build_website()

    def parameter_importance(self,
                             ablation=False,
                             fanova=False,
                             forward_selection=False,
                             incneighbor=False):
        """Perform the specified parameter importance procedures. """
        # PARAMETER IMPORTANCE
        if (ablation or forward_selection or fanova or incneighbor):
            self.website["Parameter Importance"] = OrderedDict()
        sum_ = 0
        if fanova:
            sum_ += 1
            table, plots, pair_plots = self.analyzer.fanova(self.incumbent)

            self.website["Parameter Importance"]["fANOVA"] = OrderedDict()

            self.website["Parameter Importance"]["fANOVA"]["Importance"] = {
                "table": table
            }
            # Insert plots (the received plots is a dict, mapping param -> path)
            self.website["Parameter Importance"]["fANOVA"][
                "Marginals"] = OrderedDict([])
            for param, plot in plots.items():
                self.website["Parameter Importance"]["fANOVA"]["Marginals"][
                    param] = {
                        "figure": plot
                    }
            if pair_plots:
                self.website["Parameter Importance"]["fANOVA"][
                    "PairwiseMarginals"] = OrderedDict([])
                for param, plot in pair_plots.items():
                    self.website["Parameter Importance"]["fANOVA"][
                        "PairwiseMarginals"][param] = {
                            "figure": plot
                        }

        if ablation:
            sum_ += 1
            self.logger.info("Ablation...")
            self.analyzer.parameter_importance("ablation", self.incumbent,
                                               self.output)
            ablationpercentage_path = os.path.join(self.output,
                                                   "ablationpercentage.png")
            ablationperformance_path = os.path.join(self.output,
                                                    "ablationperformance.png")
            self.website["Parameter Importance"]["Ablation"] = {
                "figure": [ablationpercentage_path, ablationperformance_path]
            }

        if forward_selection:
            sum_ += 1
            self.logger.info("Forward Selection...")
            self.analyzer.parameter_importance("forward-selection",
                                               self.incumbent, self.output)
            f_s_barplot_path = os.path.join(self.output,
                                            "forward selection-barplot.png")
            f_s_chng_path = os.path.join(self.output,
                                         "forward selection-chng.png")
            self.website["Parameter Importance"]["Forward Selection"] = {
                "figure": [f_s_barplot_path, f_s_chng_path]
            }

        if incneighbor:
            sum_ += 1
            self.logger.info("Local EPM-predictions around incumbent...")
            plots = self.analyzer.local_epm_plots()
            self.website["Parameter Importance"][
                "Local Parameter Importance (LPI)"] = OrderedDict([])
            for param, plot in plots.items():
                self.website["Parameter Importance"][
                    "Local Parameter Importance (LPI)"][param] = {
                        "figure": plot
                    }

        if sum_:
            of = os.path.join(self.output, 'pimp.tex')
            self.logger.info('Creating pimp latex table at %s' % of)
            self.analyzer.pimp.table_for_comparison(self.analyzer.evaluators,
                                                    of,
                                                    style='latex')

    def feature_analysis(self,
                         box_violin=False,
                         correlation=False,
                         clustering=False,
                         importance=False):
        if not (box_violin or correlation or clustering or importance):
            self.logger.debug("No feature analysis.")
            return

        # FEATURE ANALYSIS (ASAPY)
        # TODO make the following line prettier
        # TODO feat-names from scenario?
        in_reader = InputReader()
        feat_fn = self.scenario.feature_fn

        if not self.scenario.feature_names:
            with changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
                if not feat_fn or not os.path.exists(feat_fn):
                    self.logger.warning(
                        "Feature Analysis needs a valid feature "
                        "file! Either %s is not a valid "
                        "filename or features are not saved in "
                        "the scenario.", feat_fn)
                    self.logger.error("Skipping Feature Analysis.")
                    return
                else:
                    feat_names = in_reader.read_instance_features_file(
                        self.scenario.feature_fn)[0]
        else:
            feat_names = copy.deepcopy(self.scenario.feature_names)

        self.website["Feature Analysis"] = OrderedDict([])

        # feature importance using forward selection
        if importance:
            self.website["Feature Analysis"][
                "Feature Importance"] = OrderedDict()
            imp, plots = self.analyzer.feature_importance()
            imp = DataFrame(data=list(imp.values()),
                            index=list(imp.keys()),
                            columns=["Error"])
            imp = imp.to_html()  # this is a table with the values in html
            self.website["Feature Analysis"]["Feature Importance"]["Table"] = {
                "table": imp
            }
            for p in plots:
                name = os.path.splitext(os.path.basename(p))[0]
                self.website["Feature Analysis"]["Feature Importance"][
                    name] = {
                        "figure": p
                    }

        # box and violin plots
        if box_violin:
            name_plots = self.analyzer.feature_analysis(
                'box_violin', feat_names)
            self.website["Feature Analysis"][
                "Violin and Box Plots"] = OrderedDict()
            for plot_tuple in name_plots:
                key = "%s" % (plot_tuple[0])
                self.website["Feature Analysis"]["Violin and Box Plots"][
                    key] = {
                        "figure": plot_tuple[1]
                    }

        # correlation plot
        if correlation:
            correlation_plot = self.analyzer.feature_analysis(
                'correlation', feat_names)
            if correlation_plot:
                self.website["Feature Analysis"]["Correlation"] = {
                    "figure": correlation_plot
                }

        # cluster instances in feature space
        if clustering:
            cluster_plot = self.analyzer.feature_analysis(
                'clustering', feat_names)
            self.website["Feature Analysis"]["Clustering"] = {
                "figure": cluster_plot
            }

        self.build_website()

    def build_website(self):
        self.builder.generate_html(self.website)
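A hedged usage sketch of the facade based on the constructor and analyze() signatures above; the folder and output paths are placeholders for real SMAC run directories:

cave = CAVE(folders=['path/to/smac-run_1', 'path/to/smac-run_2'],
            output='cave-report',
            ta_exec_dir='.',
            missing_data_method='epm')
cave.analyze(performance=True, cdf=True, scatter=True, confviz=True)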