Example #1
    def __init__(self, dimensions_file: str, min_num_results_to_fit: int = 8, lease_timeout='2 days'):
        self.__all_experiments = pd.DataFrame()
        self.__all_experiments['status'] = [self.WAITING] * len(self.__all_experiments)
        self.__all_experiments['last_update'] = pd.Series(pd.Timestamp(float('NaN')))
        self.__all_experiments['client'] = [""] * len(self.__all_experiments)

        self.__lease_duration = pd.to_timedelta(lease_timeout)
        self.__leased_experiments = []

        dims = self.__load_dimensions(dimensions_file)
        self.__dimension_names = list(dims.keys())
        self.__dimensions = list(dims.values())
        self.__min_num_results_to_fit = min_num_results_to_fit

        # Initialize the search space and the GP surrogate

        dim_types = [check_dimension(d) for d in self.__dimensions]
        is_cat = all(isinstance(d, Categorical) for d in dim_types)
        if is_cat:
            transformed_dims = [check_dimension(d, transform="identity") for d in self.__dimensions]
        else:
            transformed_dims = []
            for dim_type, dim in zip(dim_types, self.__dimensions):
                if isinstance(dim_type, Categorical):
                    transformed_dims.append(check_dimension(dim, transform="onehot"))
                # To make sure that GP operates in the [0, 1] space
                else:
                    transformed_dims.append(check_dimension(dim, transform="normalize"))

        space = Space(transformed_dims)
        # Default GP
        cov_amplitude = ConstantKernel(1.0, (0.01, 1000.0))

        if is_cat:
            other_kernel = HammingKernel(length_scale=np.ones(space.transformed_n_dims))
            acq_optimizer = "lbfgs"
        else:
            other_kernel = Matern(
                length_scale=np.ones(space.transformed_n_dims),
                length_scale_bounds=[(0.01, 100)] * space.transformed_n_dims,
                nu=2.5)
            acq_optimizer = "lbfgs"

        base_estimator = GaussianProcessRegressor(
            kernel=cov_amplitude * other_kernel,
            normalize_y=True, random_state=None, alpha=0.0, noise='gaussian',
            n_restarts_optimizer=2)

        self.__opt = Optimizer(self.__dimensions, base_estimator, acq_optimizer=acq_optimizer,
                               n_random_starts=100, acq_optimizer_kwargs=dict(n_points=10000))
Example #2
    def _check_search_space(self, search_space):
        """Checks whether the search space argument is correct"""

        if len(search_space) == 0:
            raise ValueError(
                "The search_spaces parameter should contain at least one "
                "non-empty search space, got %s" % search_space)

        # check if space is a single dict, convert to list if so
        if isinstance(search_space, dict):
            search_space = [search_space]

        # check if the structure of the space is proper
        if isinstance(search_space, list):
            # convert to just a list of dicts
            dicts_only = []

            # 1. check the case when a tuple of space, n_iter is provided
            for elem in search_space:
                if isinstance(elem, tuple):
                    if len(elem) != 2:
                        raise ValueError(
                            "All tuples in list of search spaces should have "
                            "length 2, and contain (dict, int), got %s" % elem)
                    subspace, n_iter = elem

                    if (not isinstance(n_iter, int)) or n_iter < 0:
                        raise ValueError(
                            "Number of iterations in search space should be "
                            "a positive integer, got %s in tuple %s" %
                            (n_iter, elem))

                    # save subspaces here for further checking
                    dicts_only.append(subspace)
                elif isinstance(elem, dict):
                    dicts_only.append(elem)
                else:
                    raise TypeError(
                        "A search space should be provided as a dict or "
                        "tuple (dict, int), got %s" % elem)

            # 2. check all the dicts for correctness of contents
            for subspace in dicts_only:
                for k, v in subspace.items():
                    check_dimension(v)
        else:
            raise TypeError(
                "Search space should be provided as a dict or list of dict, "
                "got %s" % search_space)
Example #3
def create_opt(lines, ranker_name):
    """Build a skopt Optimizer with a GP surrogate over the ranker's tunable parameter ranges."""
    gp_seed, opt_seed = get_seed(lines)
    _ranker_class = object_rankers[ranker_name]
    _ranker_class._use_early_stopping = True
    param_ranges = _ranker_class.set_tunable_parameter_ranges({})
    transformed = []
    for param in param_ranges:
        transformed.append(check_dimension(param))
    space = normalize_dimensions(transformed)
    base_estimator = cook_estimator("GP",
                                    space=space,
                                    random_state=gp_seed,
                                    noise="gaussian")
    optimizer = Optimizer(dimensions=param_ranges,
                          random_state=opt_seed,
                          base_estimator=base_estimator)
    return optimizer
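
A hypothetical ask/tell loop around the optimizer returned by create_opt; evaluate_ranker, lines and the ranker name are placeholders assumed to exist in the surrounding code.

import numpy as np

opt = create_opt(lines, "some_ranker")   # "some_ranker" is a placeholder key in object_rankers
for _ in range(10):
    candidate = opt.ask()                # next parameter setting to try
    score = evaluate_ranker(candidate)   # hypothetical objective; lower is better
    opt.tell(candidate, score)           # feed the observation back to the surrogate
best_params = opt.Xi[int(np.argmin(opt.yi))]
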
Example #4
    def set_optimizer(self, n_iter, opt_seed, acq_func, gp_seed, **kwargs):
        self.logger.info('Retrieving model stored at: {}'.format(self.optimizer_path))
        try:
            optimizer = load(self.optimizer_path)
            self.logger.info('Loading model stored at: {}'.format(self.optimizer_path))
            finished_iter = np.array(optimizer.yi).shape[0]
            if finished_iter == 0:
                optimizer = None
                self.logger.info('Optimizer did not finish any iterations, so setting the optimizer to None')
        except (KeyError, ValueError):
            self.logger.error('Cannot open the file {}'.format(self.optimizer_path))
            optimizer = None
        except FileNotFoundError:
            self.logger.error('No such file or directory: {}'.format(self.optimizer_path))
            optimizer = None

        if optimizer is not None:
            n_iter = n_iter - finished_iter
            if n_iter < 0:
                n_iter = 0
            self.logger.info('Iterations already done: {} and running iterations {}'.format(finished_iter, n_iter))
            self.opt = optimizer
            self.logger.debug('Setting the provided optimizer')
            self.log_best_params()
        else:
            transformed = []
            for param in self.parameter_ranges:
                transformed.append(check_dimension(param))
            self.logger.info("Parameter Space: {}".format(transformed))
            norm_space = normalize_dimensions(transformed)
            self.logger.info("Parameter Space after transformation: {}".format(norm_space))
            categorical_space = np.array([isinstance(s, Categorical) for s in norm_space])
            self.logger.info("categorical_space: {}".format(categorical_space))
            if np.all(categorical_space):
                base_estimator = cook_estimator("RF", space=norm_space, random_state=gp_seed)
            else:
                base_estimator = cook_estimator("GP", space=norm_space, random_state=gp_seed, noise="gaussian")

            self.opt = Optimizer(dimensions=self.parameter_ranges, random_state=opt_seed, base_estimator=base_estimator,
                                 acq_func=acq_func, **kwargs)

        return n_iter
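
The checkpoint/resume logic above reduces to a short sketch, assuming skopt's dump and load helpers (as used in the example) and an illustrative file path:

from skopt import dump, load

# optimizer and n_iter are assumed to come from the surrounding code
dump(optimizer, "optimizer.pkl")        # persist the optimizer state after a run
restored = load("optimizer.pkl")        # later: restore it
finished_iter = len(restored.yi)        # iterations already evaluated
remaining_iter = max(0, n_iter - finished_iter)
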
Example #5
    def fit(self,
            X,
            Y,
            total_duration=6e7,
            n_iter=100,
            cv_iter=None,
            optimizer=None,
            acq_func='gp_hedge',
            **kwargs):
        start = datetime.now()

        def splitter(itr):
            for train_idx, test_idx in itr:
                yield X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]

        def splitter_dict(itr_dict):

            n_splits = len(list(itr_dict.values())[0])
            for i in range(n_splits):
                X_train = dict()
                Y_train = dict()
                X_test = dict()
                Y_test = dict()
                for n_obj, itr in itr_dict.items():
                    train_idx = itr[i][0]
                    test_idx = itr[i][1]
                    X_train[n_obj] = np.copy(X[n_obj][train_idx])
                    X_test[n_obj] = np.copy(X[n_obj][test_idx])
                    Y_train[n_obj] = np.copy(Y[n_obj][train_idx])
                    Y_test[n_obj] = np.copy(Y[n_obj][test_idx])
                yield X_train, Y_train, X_test, Y_test

        if cv_iter is None:
            cv_iter = ShuffleSplit(n_splits=3,
                                   test_size=0.1,
                                   random_state=self.random_state)
        if isinstance(X, dict):
            splits = dict()
            for n_obj, arr in X.items():
                if arr.shape[0] == 1:
                    splits[n_obj] = [([0], [0])
                                     for i in range(cv_iter.n_splits)]
                else:
                    splits[n_obj] = list(cv_iter.split(arr))
        else:
            splits = list(cv_iter.split(X))
        # Pre-compute splits for reuse
        # Here we fix a random seed for all simulations to correlate the random
        # streams:

        seed = self.random_state.randint(2**32, dtype='uint32')
        self.logger.debug(
            'Random seed for the ranking algorithm: {}'.format(seed))
        opt_seed = self.random_state.randint(2**32, dtype='uint32')
        self.logger.debug('Random seed for the optimizer: {}'.format(opt_seed))
        gp_seed = self.random_state.randint(2**32, dtype='uint32')
        self.logger.debug(
            'Random seed for the GP surrogate: {}'.format(gp_seed))

        if optimizer is not None:
            opt = optimizer
            self.logger.debug('Setting the provided optimizer')
            self.log_best_params(opt)
        else:
            transformed = []
            for param in self.parameter_ranges:
                transformed.append(check_dimension(param))
            self.logger.info("Parameter Space: {}".format(transformed))
            space = normalize_dimensions(transformed)
            self.logger.info(
                "Parameter Space after transformation: {}".format(space))

            # Todo: Make this passable
            base_estimator = cook_estimator("GP",
                                            space=space,
                                            random_state=gp_seed,
                                            noise="gaussian")
            opt = Optimizer(dimensions=self.parameter_ranges,
                            random_state=opt_seed,
                            base_estimator=base_estimator,
                            acq_func=acq_func,
                            **kwargs)
        self._callbacks_set_optimizer(opt)
        self._callbacks_on_optimization_begin()
        time_taken = duration_tillnow(start)
        total_duration -= time_taken
        max_fit_duration = -10000
        self.logger.info('Time left for {} iterations is {}'.format(
            n_iter, microsec_to_time(total_duration)))

        try:
            for t in range(n_iter):
                start = datetime.now()
                self._callbacks_on_iteration_begin(t)
                self.logger.info(
                    'Starting optimization iteration: {}'.format(t))
                if t > 0:
                    self.log_best_params(opt)

                next_point = opt.ask()
                self.logger.info('Next parameters:\n{}'.format(next_point))
                results = []
                running_times = []
                if isinstance(X, dict):
                    for X_train, Y_train, X_test, Y_test in splitter_dict(
                            splits):
                        result, time_taken = self._fit_ranker(
                            X_train, Y_train, X_test, Y_test, next_point)
                        running_times.append(time_taken)
                        results.append(result)
                else:
                    for X_train, Y_train, X_test, Y_test in splitter(splits):
                        result, time_taken = self._fit_ranker(
                            X_train, Y_train, X_test, Y_test, next_point)
                        running_times.append(time_taken)
                        results.append(result)

                results = np.array(results)
                running_times = np.array(running_times)
                mean_result = np.mean(results)
                mean_fitting_duration = np.mean(running_times)

                # Track the longest time one full round of fitting and out-of-sample evaluation has taken
                if max_fit_duration < np.sum(running_times):
                    max_fit_duration = np.sum(running_times)

                self.logger.info(
                    'Validation error for the parameters is {:.4f}'.format(
                        mean_result))
                self.logger.info('Time taken for the parameters is {}'.format(
                    microsec_to_time(np.sum(running_times))))
                if "ps" in opt.acq_func:
                    opt.tell(next_point, [mean_result, mean_fitting_duration])
                else:
                    opt.tell(next_point, mean_result)
                self._callbacks_on_iteration_end(t)

                self.logger.info(
                    "Main optimizer iterations done: {}; saving the model"
                    .format(np.array(opt.yi).shape[0]))
                dump(opt, self.optimizer_path)

                time_taken = duration_tillnow(start)
                total_duration -= time_taken
                self.logger.info('Time left for simulations is {} '.format(
                    microsec_to_time(total_duration)))

                if (total_duration - max_fit_duration) < 0:
                    self.logger.info(
                        'Maximum time required to validate one parameter setting: {}'
                        .format(microsec_to_time(max_fit_duration)))
                    self.logger.info(
                        'Stopping the simulation at iteration {} due to lack of remaining time'
                        .format(t))
                    break

        except KeyboardInterrupt:
            self.logger.debug(
                'Optimizer interrupted; saving the model at {}'.format(
                    self.optimizer_path))
            self.log_best_params(opt)
        else:
            self.logger.debug(
                'Finally, fitting a model on the complete training set and storing it at {}'
                .format(self.optimizer_path))
            self._fit_params["epochs"] = self._fit_params.get("epochs", 1000)
            if "ps" in opt.acq_func:
                best_point = opt.Xi[np.argmin(np.array(opt.yi)[:, 0])]
            else:
                best_point = opt.Xi[np.argmin(opt.yi)]
            self._set_new_parameters(best_point)
            self.model = copy.copy(self.ranker)
            self.model.fit(X, Y, **self._fit_params)

        finally:
            self._callbacks_on_optimization_end()
            self.optimizer = opt
            if np.array(opt.yi).shape[0] != 0:
                dump(opt, self.optimizer_path)
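
The fold handling in fit boils down to precomputing the cross-validation splits once and reusing them in every optimizer iteration; a minimal standalone sketch with toy data:

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.random.rand(100, 5)
Y = np.random.rand(100)
cv_iter = ShuffleSplit(n_splits=3, test_size=0.1, random_state=42)
splits = list(cv_iter.split(X))          # computed once, reused in every iteration

for train_idx, test_idx in splits:
    X_train, Y_train = X[train_idx], Y[train_idx]
    X_test, Y_test = X[test_idx], Y[test_idx]
    # a real run would train the ranker on the training fold here and average
    # the validation errors across folds before calling opt.tell(...)
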
Example #6
def SkoptCMAoptimizer(
        func,
        dimensions,
        n_calls,
        verbose=False,
        callback=(),
        x0=None,
        sigma0=.5,
        normalize=True,
):
    '''
    Optimizer based on the CMA-ES algorithm.
    This is essentially a wrapper function around the cma library
    to align its interface with the skopt library.

    Args:
        func (callable): function to optimize
        dimensions: list of tuples. Search dimensions.
        n_calls: the number of samples.
        verbose: whether this function should be verbose.
        callback: the list of callback functions.
        x0: initial values;
            if None, a random point will be sampled.
        sigma0: initial standard deviation.
        normalize: whether the optimization domain should be normalized.

    Returns:
        `res`, a skopt.OptimizeResult object.
        The optimization result returned as a dict-like object.
        Important attributes are:
        - `x` [list]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimization space.
    '''
    specs = {
        'args': copy.copy(inspect.currentframe().f_locals),
        'function': inspect.currentframe().f_code.co_name,
    }

    if normalize:
        dimensions = list(
            map(lambda x: check_dimension(x, 'normalize'), dimensions))
    space = Space(dimensions)
    if x0 is None: x0 = space.transform(space.rvs())[0]

    tempdir = tempfile.mkdtemp()
    xi, yi = [], []
    options = {
        'bounds': np.array(space.transformed_bounds).transpose().tolist(),
        'verb_filenameprefix': tempdir,
    }

    def delete_tempdir(self, *args, **kwargs):
        os.removedirs(tempdir)
        return

    model = cma.CMAEvolutionStrategy(x0, sigma0, options)
    model.logger.__del__ = delete_tempdir
    for i in range(n_calls):
        if model.stop(): break
        new_xi = model.ask()
        new_xi_denorm = space.inverse_transform(np.array(new_xi))
        new_yi = [func(x) for x in new_xi_denorm]

        model.tell(new_xi, new_yi)
        model.logger.add()
        if verbose: model.disp()

        xi += new_xi_denorm
        yi += new_yi
        results = create_result(xi, yi)
        for f in callback:
            f(results)

    results = create_result(xi, yi, space)
    model.logger.load()
    results.cma_logger = model.logger
    results.specs = specs
    return results
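
A hypothetical call to SkoptCMAoptimizer, minimizing a toy quadratic over two real dimensions (requires the cma package; the bounds and budget are illustrative):

def toy_objective(x):
    # simple convex function with its minimum at (0.3, -0.1)
    return (x[0] - 0.3) ** 2 + (x[1] + 0.1) ** 2

res = SkoptCMAoptimizer(toy_objective, dimensions=[(-1.0, 1.0), (-1.0, 1.0)], n_calls=30)
print(res.x, res.fun)   # best point and best value, as described in the docstring
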