def _tree_minimize(func, dimensions, base_estimator, n_calls,
                   n_points, n_random_starts, random_state=None):
    rng = check_random_state(random_state)
    space = Space(dimensions)

    # Initialize with random points
    if n_random_starts <= 0:
        raise ValueError(
            "Expected n_random_starts > 0, got %d" % n_random_starts)

    if n_calls <= 0:
        raise ValueError(
            "Expected n_calls > 0, got %d" % n_calls)

    if n_calls < n_random_starts:
        raise ValueError(
            "Expected n_calls >= %d, got %d" % (n_random_starts, n_calls))

    Xi = space.rvs(n_samples=n_random_starts, random_state=rng)
    yi = [func(x) for x in Xi]
    if np.ndim(yi) != 1:
        raise ValueError(
            "The function to be optimized should return a scalar")

    # Tree-based optimization loop
    models = []

    n_model_iter = n_calls - n_random_starts
    for i in range(n_model_iter):
        rgr = clone(base_estimator)
        rgr.fit(space.transform(Xi), yi)
        models.append(rgr)

        # `rgr` predicts constants for each leaf which means that the EI
        # has zero gradient over large distances. As a result we can not
        # use gradient based optimizers like BFGS, so using random sampling
        # for the moment.
        X = space.transform(space.rvs(n_samples=n_points,
                                      random_state=rng))
        values = -gaussian_ei(X, rgr, np.min(yi))
        next_x = X[np.argmin(values)]

        next_x = space.inverse_transform(next_x.reshape((1, -1)))[0]
        next_y = func(next_x)
        Xi = np.vstack((Xi, next_x))
        yi.append(next_y)

    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models
    res.space = space

    return res
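A minimal usage sketch of the helper above, with a toy objective. It assumes the module-level imports the function relies on (numpy, `Space`, `gaussian_ei`, `clone`, `OptimizeResult`) and a tree regressor whose `predict` supports `return_std=True`, such as the wrapper in `skopt.learning`:

from skopt.learning import ExtraTreesRegressor  # exposes predict(..., return_std=True)

def objective(params):
    # toy 2-D bowl; returns a scalar, as `_tree_minimize` requires
    x, y = params
    return (x - 0.3) ** 2 + (y + 0.2) ** 2

res = _tree_minimize(objective,
                     dimensions=[(-1.0, 1.0), (-1.0, 1.0)],
                     base_estimator=ExtraTreesRegressor(n_estimators=30),
                     n_calls=20,
                     n_points=1000,
                     n_random_starts=5,
                     random_state=0)
print(res.x, res.fun)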
Example #2
def create_result(Xi,
                  yi,
                  space=None,
                  rng=None,
                  specs=None,
                  models=None,
                  model_mu=None,
                  model_std=None,
                  gurobi_mipgap=None):
    """
    Initialize an `OptimizeResult` object.

    Parameters
    ----------
    Xi : list of lists, shape (n_iters, n_features)
        Location of the minimum at every iteration.

    yi : array-like, shape (n_iters,)
        Minimum value obtained at every iteration.

    space : Space instance, optional
        Search space.

    rng : RandomState instance, optional
        State of the random state.

    specs : dict, optional
        Call specifications.

    models : list, optional
        List of fit surrogate models.

    Returns
    -------
    res : `OptimizeResult`, scipy object
        OptimizeResult instance with the required information.
    """
    res = OptimizeResult()
    yi = np.asarray(yi)
    if np.ndim(yi) == 2:
        res.log_time = np.ravel(yi[:, 1])
        yi = np.ravel(yi[:, 0])
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = yi
    res.x_iters = Xi
    res.models = models
    res.model_mu = model_mu
    res.model_std = model_std
    res.gurobi_mipgap = gurobi_mipgap
    res.space = space
    res.random_state = rng
    res.specs = specs
    return res
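A quick sketch of calling this by hand on a few evaluated points (assumes numpy and `OptimizeResult` are imported as above):

Xi = [[0.1, 0.2], [0.4, -0.3], [0.0, 0.0]]
yi = [1.5, 0.7, 2.1]
res = create_result(Xi, yi)
print(res.x)    # [0.4, -0.3], the point with the smallest yi
print(res.fun)  # 0.7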
Example #3
def create_result(xi, yi, space=None, rs=None, specs=None, models=None):

    res = OptimizeResult()
    yi = np.asarray(yi)
    if np.ndim(yi) == 2:
        res.log_time = np.ravel(yi[:, 1])
        yi = np.ravel(yi[:, 0])
    best = np.argmin(yi)
    res.x = xi[best]
    res.fun = yi[best]
    res.func_vals = yi
    res.x_iters = xi
    res.models = models
    res.space = space
    res.random_state = rs
    res.specs = specs
    return res
Example #4
def create_result(Xi, yi, space=None, rng=None, specs=None, models=None):
    """
    Initialize an `OptimizeResult` object.

    Parameters
    ----------
    * `Xi` [list of lists, shape=(n_iters, n_features)]:
        Location of the minimum at every iteration.

    * `yi` [array-like, shape=(n_iters,)]:
        Minimum value obtained at every iteration.

    * `space` [Space instance, optional]:
        Search space.

    * `rng` [RandomState instance, optional]:
        State of the random state.

    * `specs` [dict, optional]:
        Call specifications.

    * `models` [list, optional]:
        List of fit surrogate models.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        OptimizeResult instance with the required information.
    """
    res = OptimizeResult()
    yi = np.asarray(yi)

    if np.ndim(yi) == 2:
        res.log_time = np.ravel(yi[:, 1])
        yi = np.ravel(yi[:, 0])
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = yi
    res.x_iters = Xi
    res.models = models
    res.space = space
    res.random_state = rng
    res.specs = specs

    return res
Example #5
def create_result(Xi, yi, space=None, rng=None, specs=None, models=None):
    """
    Initialize an `OptimizeResult` object.

    Parameters
    ----------
    * `Xi` [list of lists, shape=(n_iters, n_features)]:
        Location of the minimum at every iteration.

    * `yi` [array-like, shape=(n_iters,)]:
        Minimum value obtained at every iteration.

    * `space` [Space instance, optional]:
        Search space.

    * `rng` [RandomState instance, optional]:
        State of the random state.

    * `specs` [dict, optional]:
        Call specifications.

    * `models` [list, optional]:
        List of fit surrogate models.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        OptimizeResult instance with the required information.
    """
    res = OptimizeResult()
    yi = np.asarray(yi)
    if np.ndim(yi) == 2:
        res.log_time = np.ravel(yi[:, 1])
        yi = np.ravel(yi[:, 0])
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = yi
    res.x_iters = Xi
    res.models = models
    res.space = space
    res.random_state = rng
    res.specs = specs
    return res
def dummy_minimize(func,
                   dimensions,
                   n_calls=100,
                   x0=None,
                   y0=None,
                   random_state=None):
    """Random search by uniform sampling within the given bounds.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(upper_bound, lower_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(upper_bound, lower_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `n_calls` [int, default=100]:
        Number of calls to `func` to find the minimum.

    * `x0` [list, list of lists or `None`]:
        Initial input points.

        - If it is a list of lists, use it as a list of input points.
        - If it is a list, use it as a single initial input point.
        - If it is `None`, no initial input points are used.

    * `y0` [list, scalar or `None`]
        Evaluation of initial input points.

        - If it is a list, then it corresponds to evaluations of the function
          at each element of `x0` : the i-th element of `y0` corresponds
          to the function evaluated at the i-th element of `x0`.
        - If it is a scalar, then it corresponds to the evaluation of the
          function at `x0`.
        - If it is None and `x0` is provided, then the function is evaluated
          at each element of `x0`.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [list]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimisation space.
        - `specs` [dict]: the call specifications.
        - `rng` [RandomState instance]: State of the random state
           at the end of minimization.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    # Save call args
    specs = {
        "args": copy.copy(inspect.currentframe().f_locals),
        "function": inspect.currentframe().f_code.co_name
    }

    # Check params
    rng = check_random_state(random_state)
    space = Space(dimensions)

    if x0 is None:
        x0 = []
    elif not isinstance(x0[0], list):
        x0 = [x0]

    if not isinstance(x0, list):
        raise ValueError("`x0` should be a list, got %s" % type(x0))

    if len(x0) > 0 and y0 is not None:
        if isinstance(y0, Iterable):
            y0 = list(y0)
        elif isinstance(y0, numbers.Number):
            y0 = [y0]
        else:
            raise ValueError("`y0` should be an iterable or a scalar, got %s" %
                             type(y0))
        if len(x0) != len(y0):
            raise ValueError("`x0` and `y0` should have the same length")

        if not all(map(np.isscalar, y0)):
            raise ValueError("`y0` elements should be scalars")

    elif len(x0) > 0 and y0 is None:
        y0 = []
        n_calls -= len(x0)

    elif len(x0) == 0 and y0 is not None:
        raise ValueError("`x0`cannot be `None` when `y0` is provided")

    else:  # len(x0) == 0 and y0 is None
        y0 = []

    X = x0
    y = y0

    # Random search
    X = X + space.rvs(n_samples=n_calls, random_state=rng)
    first = True

    for i in range(len(y0), len(X)):
        y_i = func(X[i])

        if first:
            first = False
            if not np.isscalar(y_i):
                raise ValueError("`func` should return a scalar")

        y.append(y_i)

    y = np.array(y)

    # Pack results
    res = OptimizeResult()
    best = np.argmin(y)
    res.x = X[best]
    res.fun = y[best]
    res.func_vals = y
    res.x_iters = X
    res.models = []  # Create attribute even though it is empty
    res.space = space
    res.random_state = rng
    res.specs = specs

    return res
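An illustrative call of `dummy_minimize`, warm-started from a single point (toy objective; assumes the same module-level imports as above):

def objective(params):
    # 1-D quadratic with its minimum at x = 2
    x, = params
    return (x - 2.0) ** 2

res = dummy_minimize(objective,
                     dimensions=[(-5.0, 5.0)],
                     n_calls=30,
                     x0=[0.0],        # a single initial input point
                     random_state=1)
print(res.x, res.fun)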
Example #7
def gbrt_minimize(func,
                  bounds,
                  base_estimator=None,
                  maxiter=100,
                  n_points=20,
                  n_start=10,
                  random_state=None):
    """Sequential optimisation using gradient boosted trees.

    Gradient boosted regression trees are used to model the (very)
    expensive to evaluate function `func`. The model is improved
    by sequentially evaluating the expensive function at the next
    best point. Thereby finding the minimum of `func` with as
    few evaluations as possible.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `bounds` [array-like, shape=(n_parameters, 2)]:
        - ``bounds[i][0]`` should give the lower bound of each parameter and
        - ``bounds[i][1]`` should give the upper bound of each parameter.

    * `base_estimator` [`GradientBoostingQuantileRegressor`]:
        The regressor to use as surrogate model

    * `maxiter` [int, default=100]:
        Number of iterations used to find the minimum. This corresponds
        to the total number of evaluations of `func`. If `n_start` > 0,
        only `maxiter - n_start` of these iterations are guided by the
        surrogate model.

    * `n_start` [int, default=10]:
        Number of random points to draw before fitting `base_estimator`
        for the first time. If `n_start > maxiter` this degrades to
        a random search for the minimum.

    * `n_points` [int, default=20]:
        Number of points to sample when minimizing the acquisition function.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [float]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [array]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    rng = check_random_state(random_state)

    # Bounds
    num_params = len(bounds)
    lower_bounds, upper_bounds = extract_bounds(bounds)

    # Default estimator
    if base_estimator is None:
        base_estimator = GradientBoostingQuantileRegressor(random_state=rng)

    # Record the points and function values evaluated as part of
    # the minimization
    Xi = np.zeros((maxiter, num_params))
    yi = np.zeros(maxiter)

    # Initialize with random points
    if n_start == 0:
        raise ValueError("Need at least one starting point.")

    if maxiter == 0:
        raise ValueError("Need to perform at least one iteration.")

    n_start = min(n_start, maxiter)

    Xi[:n_start] = _random_points(lower_bounds,
                                  upper_bounds,
                                  n_points=n_start,
                                  random_state=rng)
    yi[:n_start] = [func(xi) for xi in Xi[:n_start]]
    best_idx = np.argmin(yi[:n_start])
    best_x = Xi[best_idx].ravel()
    best_y = yi[best_idx]

    models = []

    for i in range(n_start, maxiter):
        rgr = clone(base_estimator)
        # only the first i points are meaningful
        rgr.fit(Xi[:i, :], yi[:i])
        models.append(rgr)

        # `rgr` predicts constants for each leaf which means that the EI
        # has zero gradient over large distances. As a result we can not
        # use gradient based optimisers like BFGS, use random sampling
        # for the moment.
        x0 = _random_points(lower_bounds,
                            upper_bounds,
                            n_points=n_points,
                            random_state=rng)
        aq = _expected_improvement(x0, rgr, best_y)
        best = np.argmin(aq)

        Xi[i] = x0[best].ravel()
        yi[i] = func(x0[best])

        if yi[i] < best_y:
            best_y = yi[i]
            best_x = Xi[i]

    res = OptimizeResult()
    res.x = best_x
    res.fun = best_y
    res.func_vals = yi
    res.x_iters = Xi
    res.models = models

    return res
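A sketch of how this might be called with the default quantile regressor (toy objective; assumes the surrounding module's imports, including `GradientBoostingQuantileRegressor` and the private helpers used above):

def objective(params):
    x, y = params
    return np.sin(x) + (y - 1.0) ** 2

res = gbrt_minimize(objective,
                    bounds=[(-2.0, 2.0), (0.0, 3.0)],  # (lower, upper) per parameter
                    maxiter=25,
                    n_start=10,
                    random_state=0)
print(res.x, res.fun)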
Example #8
def create_result(Xi,
                  yi,
                  n_evaluations=None,
                  space=None,
                  rng=None,
                  specs=None,
                  models=None,
                  maximize=False):
    """
    Initialize an `OptimizeResult` object.

    Parameters
    ----------
    * `Xi` [list of lists, shape=(n_iters, n_features)]:
        Location of the minimum at every iteration.

    * `yi` [array-like, shape=(n_iters,)]:
        Minimum value obtained at every iteration.

    * `space` [Space instance, optional]:
        Search space.

    * `rng` [RandomState instance, optional]:
        State of the random state.

    * `specs` [dict, optional]:
        Call specifications.

    * `models` [list, optional]:
        List of fit surrogate models.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        OptimizeResult instance with the required information.
    """
    res = OptimizeResult()

    try:
        # Hyperband returns evaluations as lists of lists.
        # We want to store the results as a single array.
        yi = list(itertools.chain.from_iterable(yi))
        Xi = list(itertools.chain.from_iterable(Xi))
    except TypeError:
        # All algorithms other than Hyperband already return a single list.
        pass

    yi = np.asarray(yi)
    if np.ndim(yi) == 2:
        res.log_time = np.ravel(yi[:, 1])
        yi = np.ravel(yi[:, 0])

    if maximize:
        best = np.argmax(yi)
    else:
        best = np.argmin(yi)

    res.x = Xi[best]
    res.fun = yi[best]

    if n_evaluations:
        unique, sort_indices = np.unique(yi, return_index=True)

        if len(unique) < n_evaluations:
            func_sort_idx = np.argsort(yi)
            func_vals = sorted(yi)
            res.func_vals = np.asarray(func_vals[:n_evaluations])

            x_iter_sort = []
            for idx in func_sort_idx:
                x_iter_sort.append(Xi[idx])

            res.x_iters = np.asarray(x_iter_sort[:n_evaluations])
            res.all_func_vals = np.asarray(yi)
            res.all_x_iters = np.asarray(Xi)
        else:
            func_vals = sorted(unique)
            res.func_vals = np.asarray(func_vals[:n_evaluations])

            x_iter_sort = []
            for idx in sort_indices:
                x_iter_sort.append(Xi[idx])

            res.x_iters = np.asarray(x_iter_sort[:n_evaluations])
            res.all_func_vals = np.asarray(yi)
            res.all_x_iters = np.asarray(Xi)
    else:
        res.func_vals = np.asarray(yi)
        res.x_iters = np.asarray(Xi)

    res.models = models
    res.space = space
    res.random_state = rng
    res.specs = specs
    return res
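A small sketch of the Hyperband-style nested input this variant flattens before ranking (illustrative values; assumes numpy and itertools are imported at module level):

Xi = [[[0.1], [0.4]], [[0.9]]]   # evaluations grouped per bracket
yi = [[1.5, 0.7], [2.1]]
res = create_result(Xi, yi, n_evaluations=2)
print(res.x)          # [0.4], the point with the smallest value
print(res.func_vals)  # the two smallest values: 0.7 and 1.5
print(res.x_iters)    # the corresponding two points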
Example #9
def gp_minimize(func,
                bounds=None,
                search="sampling",
                random_state=None,
                maxiter=1000,
                acq="UCB",
                num_points=500):
    """
    Black-box optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever!

    The idea is to approximate the function using a Gaussian process.
    In other words the function values are assumed to follow a multivariate
    gaussian. The covariance of the function values is given by a
    GP kernel between the parameters. A smart choice of the next
    parameter to evaluate can then be made with the acquisition function
    over the Gaussian posterior, which is much quicker to evaluate.

    Parameters
    ----------
    func: callable
        Function to minimize. Should take an array of parameters and
        return the function value.

    bounds: array-like, shape (n_parameters, 2)
        ``bounds[i][0]`` should give the lower bound of each parameter and
        ``bounds[i][1]`` should give the upper bound of each parameter.

    search: string, "sampling" or "lbfgs"
        Searching for the next possible candidate to update the Gaussian prior
        with.

        If search is set to "sampling", ``num_points`` are sampled randomly
        and the Gaussian Process prior is updated with that point that gives
        the best acquision value over the Gaussian posterior.

        If search is set to "lbfgs", then a point is sampled randomly, and
        lbfgs is run for 10 iterations optimizing the acquistion function
        over the Gaussian posterior.

    random_state: int, RandomState instance, or None (default)
        Set random state to something other than None for reproducible
        results.

    maxiter: int, default 1000
        Number of iterations to find the minimum. In other words, the
        number of function evaluations.

    acq: string, default "UCB"
        Function to minimize over the gaussian posterior. Can be either
        the "UCB" which refers to the UpperConfidenceBound or "EI" which
        is the Expected Improvement.

    num_points: int, default 500
        Number of points to sample to determine the next "best" point.
        Useless if search is set to "lbfgs".

    Returns
    -------
    res: OptimizeResult, scipy object
        The optimization result returned as an OptimizeResult object.
        Important attributes are
        ``x`` - float, the optimization solution,
        ``fun`` - float, the value of the function at the optimum,
        ``models`` - list, the GP model (prior over the function) fit at
                     each iteration,
        ``func_vals`` - the function value at the i-th iteration,
        ``x_iters`` - the value of ``x`` corresponding to the function value
                      at the i-th iteration.
        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    rng = np.random.RandomState(random_state)

    num_params = len(bounds)
    lower_bounds, upper_bounds = zip(*bounds)
    upper_bounds = np.asarray(upper_bounds)
    lower_bounds = np.asarray(lower_bounds)
    x0 = rng.rand(num_params)
    func_val = [func(lower_bounds + (upper_bounds - lower_bounds) * x0)]

    length_scale = np.ones(num_params)
    gp_params = {
        'kernel': Matern(length_scale=length_scale, nu=2.5),
        'normalize_y': True,
        'random_state': random_state
    }
    lbfgs_bounds = np.tile((0, 1), (num_params, 1))

    gp_models = []
    x = np.reshape(x0, (1, -1))

    for i in range(maxiter):
        gpr = GaussianProcessRegressor(**gp_params)
        gpr.fit(x, func_val)

        if search == "sampling":
            sampling = rng.rand(num_points, num_params)
            acquis = acquisition_func(sampling, gpr, np.min(func_val), acq)
            best_arg = np.argmin(acquis)
            best_x = sampling[best_arg]
        elif search == "lbfgs":
            init = rng.rand(num_params)
            best_x, _, _ = fmin_l_bfgs_b(acquisition_func,
                                         np.asfortranarray(init),
                                         args=(gpr, np.min(func_val), acq),
                                         bounds=lbfgs_bounds,
                                         approx_grad=True,
                                         maxiter=10)

        gp_models.append(gpr)

        best_f = func(lower_bounds + (upper_bounds - lower_bounds) * best_x)
        x_list = x.tolist()
        x_list.append(best_x)
        x = np.asarray(x_list)
        func_val.append(best_f)

    x = lower_bounds + (upper_bounds - lower_bounds) * x
    func_ind = np.argmin(func_val)
    x_val = x[func_ind]
    best_func_val = func_val[func_ind]
    res = OptimizeResult()
    res.models = gp_models

    res.x = x_val
    res.fun = best_func_val
    res.func_vals = func_val
    res.x_iters = x

    return res
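A short usage sketch for this bounds-based variant (toy objective; assumes numpy, GaussianProcessRegressor, Matern, fmin_l_bfgs_b and acquisition_func are imported as in the original module):

def objective(params):
    x, y = params
    return (x - 0.5) ** 2 + (y + 0.5) ** 2

res = gp_minimize(objective,
                  bounds=[(-1.0, 1.0), (-1.0, 1.0)],
                  search="sampling",
                  maxiter=15,
                  acq="EI",
                  num_points=200,
                  random_state=0)
print(res.x, res.fun)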
Example #10
def gp_minimize(func, dimensions, base_estimator=None, alpha=10e-10,
                acq="EI", xi=0.01, kappa=1.96, search="auto", n_calls=100,
                n_points=500, n_random_starts=10, n_restarts_optimizer=5,
                x0=None, y0=None, random_state=None):
    """Bayesian optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever!

    The idea is to approximate the function using a Gaussian process.
    In other words the function values are assumed to follow a multivariate
    gaussian. The covariance of the function values is given by a
    GP kernel between the parameters. A smart choice of the next
    parameter to evaluate can then be made with the acquisition function
    over the Gaussian prior, which is much quicker to evaluate.

    The total number of evaluations, `n_calls`, are performed like the
    following. If `x0` is provided but not `y0`, then the elements of `x0`
    are first evaluated, followed by `n_random_starts` evaluations.
    Finally, `n_calls - len(x0) - n_random_starts` evaluations are
    made guided by the surrogate model. If `x0` and `y0` are both
    provided then `n_random_starts` evaluations are first made then
    `n_calls - n_random_starts` subsequent evaluations are made
    guided by the surrogate model.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(upper_bound, lower_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(upper_bound, lower_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `base_estimator` [a Gaussian process estimator]:
        The Gaussian process estimator to use for optimization.

    * `alpha` [float, default=1e-10]:
        Value added to the diagonal of the kernel matrix during fitting.
        Larger values correspond to an increased noise level in the
        observations and reduce potential numerical issues during fitting.

    * `acq` [string, default=`"EI"`]:
        Function to minimize over the gaussian prior. Can be either

        - `"LCB"` for lower confidence bound,
        - `"EI"` for expected improvement,
        - `"PI"` for probability of improvement.

    * `xi` [float, default=0.01]:
        Controls how much improvement one wants over the previous best
        values. Used when the acquisition is either `"EI"` or `"PI"`.

    * `kappa` [float, default=1.96]:
        Controls how much of the variance in the predicted values should be
        taken into account. If set to be very high, then we are favouring
        exploration over exploitation and vice versa.
        Used when the acquisition is `"LCB"`.

    * `search` [string, `"auto"`, `"sampling"` or `"lbfgs"`, default=`"auto"`]:
        Searching for the next possible candidate to update the Gaussian prior
        with.

        If search is set to `"auto"`, then it is set to `"lbfgs"`` if
        all the search dimensions are Real(continuous). It defaults to
        `"sampling"` for all other cases.

        If search is set to `"sampling"`, `n_points` are sampled randomly
        and the Gaussian Process prior is updated with the point that gives
        the best acquisition value over the Gaussian prior.

        If search is set to `"lbfgs"`, then a point is sampled randomly, and
        lbfgs is run for 10 iterations optimizing the acquisition function
        over the Gaussian prior.

    * `n_calls` [int, default=100]:
        Number of calls to `func`.

    * `n_points` [int, default=500]:
        Number of points to sample to determine the next "best" point.
        Useless if search is set to `"lbfgs"`.

    * `n_random_starts` [int, default=10]:
        Number of evaluations of `func` with random initialization points
        before approximating the `func` with `base_estimator`.

    * `n_restarts_optimizer` [int, default=5]:
        The number of restarts of the optimizer when `search` is `"lbfgs"`.

    * `x0` [list, list of lists or `None`]:
        Initial input points.

        - If it is a list of lists, use it as a list of input points.
        - If it is a list, use it as a single initial input point.
        - If it is `None`, no initial input points are used.

    * `y0` [list, scalar or `None`]
        Evaluation of initial input points.

        - If it is a list, then it corresponds to evaluations of the function
          at each element of `x0` : the i-th element of `y0` corresponds
          to the function evaluated at the i-th element of `x0`.
        - If it is a scalar, then it corresponds to the evaluation of the
          function at `x0`.
        - If it is None and `x0` is provided, then the function is evaluated
          at each element of `x0`.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [list]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimization space.
        - `specs` [dict]: the call specifications.
        - `rng` [RandomState instance]: State of the random state
           at the end of minimization.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    # Save call args
    specs = {"args": copy.copy(inspect.currentframe().f_locals),
             "function": inspect.currentframe().f_code.co_name}

    # Check params
    rng = check_random_state(random_state)
    space = Space(dimensions)

    # Default GP
    if base_estimator is None:
        base_estimator = GaussianProcessRegressor(
            kernel=(ConstantKernel(1.0, (0.01, 1000.0)) *
                    Matern(length_scale=np.ones(space.transformed_n_dims),
                           length_scale_bounds=[(0.01, 100)] * space.transformed_n_dims,
                           nu=2.5)),
            normalize_y=True, alpha=alpha, random_state=random_state)

    # Initialize with provided points (x0 and y0) and/or random points
    if x0 is None:
        x0 = []
    elif not isinstance(x0[0], list):
        x0 = [x0]

    if not isinstance(x0, list):
        raise ValueError("`x0` should be a list, but got %s" % type(x0))

    n_init_func_calls = len(x0) if y0 is not None else 0
    n_total_init_calls = n_random_starts + n_init_func_calls

    if n_total_init_calls <= 0:
        # if x0 is not provided and n_random_starts is 0 then
        # it will ask for n_random_starts to be > 0.
        raise ValueError(
            "Expected `n_random_starts` > 0, got %d" % n_random_starts)

    if n_calls < n_total_init_calls:
        raise ValueError(
            "Expected `n_calls` >= %d, got %d" % (n_total_init_calls, n_calls))

    if y0 is None and x0:
        y0 = [func(x) for x in x0]
    elif x0:
        if isinstance(y0, Iterable):
            y0 = list(y0)
        elif isinstance(y0, numbers.Number):
            y0 = [y0]
        else:
            raise ValueError(
                "`y0` should be an iterable or a scalar, got %s" % type(y0))
        if len(x0) != len(y0):
            raise ValueError("`x0` and `y0` should have the same length")
        if not all(map(np.isscalar, y0)):
            raise ValueError(
                "`y0` elements should be scalars")
    else:
        y0 = []

    Xi = x0 + space.rvs(n_samples=n_random_starts, random_state=rng)
    yi = y0 + [func(x) for x in Xi[len(x0):]]
    if np.ndim(yi) != 1:
        raise ValueError("`func` should return a scalar")

    if search == "auto":
        if space.is_real:
            search = "lbfgs"
        else:
            search = "sampling"
    elif search not in ["lbfgs", "sampling"]:
        raise ValueError(
            "Expected search to be 'lbfgs', 'sampling' or 'auto', "
            "got %s" % search)

    # Bayesian optimization loop
    models = []
    n_model_iter = n_calls - n_total_init_calls
    for i in range(n_model_iter):
        gp = clone(base_estimator)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gp.fit(space.transform(Xi), yi)

        models.append(gp)

        if search == "sampling":
            X = space.transform(space.rvs(n_samples=n_points,
                                          random_state=rng))
            values = _gaussian_acquisition(
                X=X, model=gp,  y_opt=np.min(yi), method=acq,
                xi=xi, kappa=kappa)
            next_x = X[np.argmin(values)]

        elif search == "lbfgs":
            best = np.inf

            for j in range(n_restarts_optimizer):
                x0 = space.transform(space.rvs(n_samples=1,
                                               random_state=rng))[0]

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    x, a, _ = fmin_l_bfgs_b(
                        _acquisition, x0,
                        args=(gp, np.min(yi), acq, xi, kappa),
                        bounds=space.transformed_bounds,
                        approx_grad=True, maxiter=20)

                if a < best:
                    next_x, best = x, a

        next_x = space.inverse_transform(next_x.reshape((1, -1)))[0]
        next_y = func(next_x)
        Xi.append(next_x)
        yi.append(next_y)

    # Pack results
    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models
    res.space = space
    res.random_state = rng
    res.specs = specs

    return res
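A hedged example of the dimensions-based interface with warm-start points (toy objective; the `y0` values below are simply the objective evaluated at `x0`):

def objective(params):
    x, y = params
    return x ** 2 + (y - 1.0) ** 2

res = gp_minimize(objective,
                  dimensions=[(-2.0, 2.0), (-2.0, 2.0)],
                  n_calls=20,
                  n_random_starts=5,
                  x0=[[0.5, 0.5], [1.0, 0.0]],  # previously evaluated points
                  y0=[0.5, 2.0],                # objective values at x0
                  acq="EI",
                  random_state=0)
print(res.x, res.fun)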
Example #11
def gp_minimize(func,
                bounds,
                base_estimator=None,
                acq="LCB",
                xi=0.01,
                kappa=1.96,
                search="sampling",
                maxiter=1000,
                n_points=500,
                n_start=10,
                n_restarts_optimizer=5,
                random_state=None):
    """Bayesian optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever!

    The idea is to approximate the function using a Gaussian process.
    In other words the function values are assumed to follow a multivariate
    gaussian. The covariance of the function values is given by a
    GP kernel between the parameters. A smart choice of the next
    parameter to evaluate can then be made with the acquisition function
    over the Gaussian prior, which is much quicker to evaluate.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `bounds` [array-like, shape=(n_parameters, 2)]:
        - ``bounds[i][0]`` should give the lower bound of each parameter and
        - ``bounds[i][1]`` should give the upper bound of each parameter.

    * `base_estimator` [a Gaussian process estimator]:
        The Gaussian process estimator to use for optimization.

    * `acq` [string, default=`"LCB"`]:
        Function to minimize over the gaussian prior. Can be either

        - `"LCB"` for lower confidence bound,
        - `"EI"` for expected improvement,
        - `"PI"` for probability of improvement.

    * `xi` [float, default=0.01]:
        Controls how much improvement one wants over the previous best
        values. Used when the acquisition is either `"EI"` or `"PI"`.

    * `kappa` [float, default=1.96]:
        Controls how much of the variance in the predicted values should be
        taken into account. If set to be very high, then we are favouring
        exploration over exploitation and vice versa.
        Used when the acquisition is `"LCB"`.

    * `search` [string, `"sampling"` or `"lbfgs"`]:
        Searching for the next possible candidate to update the Gaussian prior
        with.

        If search is set to `"sampling"`, `n_points` are sampled randomly
        and the Gaussian Process prior is updated with the point that gives
        the best acquisition value over the Gaussian prior.

        If search is set to `"lbfgs"`, then a point is sampled randomly, and
        lbfgs is run for 10 iterations optimizing the acquisition function
        over the Gaussian prior.

    * `maxiter` [int, default=1000]:
        Number of iterations to find the minimum. Note that `n_start`
        iterations are effectively discounted, such that the total number of
        function evaluations is at most `maxiter`.

    * `n_points` [int, default=500]:
        Number of points to sample to determine the next "best" point.
        Useless if search is set to `"lbfgs"`.

    * `n_start` [int, default=10]:
        Number of random initialization points.

    * `n_restarts_optimizer` [int, default=5]:
        The number of restarts of the optimizer when `search` is `"lbfgs"`.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [float]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [array]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    rng = check_random_state(random_state)

    # Bounds
    n_params = len(bounds)
    lb, ub = extract_bounds(bounds)

    # Default GP
    if base_estimator is None:
        base_estimator = GaussianProcessRegressor(
            kernel=(ConstantKernel(1.0, (0.01, 1000.0)) *
                    Matern(length_scale=np.ones(n_params),
                           length_scale_bounds=[(0.01, 100)] * n_params,
                           nu=2.5)),
            normalize_y=True,
            alpha=10e-6,
            random_state=random_state)

    # First points
    Xi = lb + (ub - lb) * rng.rand(n_start, n_params)
    yi = [func(x) for x in Xi]
    if np.ndim(yi) != 1:
        raise ValueError("The function to be optimized should return a scalar")

    # Bayesian optimization loop
    models = []

    for i in range(maxiter - n_start):
        gp = clone(base_estimator)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gp.fit(Xi, yi)

        models.append(gp)

        if search == "sampling":
            X = lb + (ub - lb) * rng.rand(n_points, n_params)
            values = acquisition(X=X,
                                 model=gp,
                                 y_opt=np.min(yi),
                                 method=acq,
                                 xi=xi,
                                 kappa=kappa)
            next_x = X[np.argmin(values)]

        elif search == "lbfgs":
            best = np.inf

            for j in range(n_restarts_optimizer):
                x0 = lb + (ub - lb) * rng.rand(n_params)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    x, a, _ = fmin_l_bfgs_b(_acquisition,
                                            x0,
                                            args=(gp, np.min(yi), acq, xi,
                                                  kappa),
                                            bounds=bounds,
                                            approx_grad=True,
                                            maxiter=10)

                if a < best:
                    next_x, best = x, a

        next_y = func(next_x)
        Xi = np.vstack((Xi, next_x))
        yi.append(next_y)

    # Pack results
    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models

    return res
def gp_minimize(func, dimensions, base_estimator=None, acq="LCB", xi=0.01,
                kappa=1.96, search="sampling", maxiter=1000, n_points=500,
                n_start=10, n_restarts_optimizer=5, random_state=None):
    """Bayesian optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever!

    The idea is to approximate the function using a Gaussian process.
    In other words the function values are assumed to follow a multivariate
    gaussian. The covariance of the function values is given by a
    GP kernel between the parameters. A smart choice of the next
    parameter to evaluate can then be made with the acquisition function
    over the Gaussian prior, which is much quicker to evaluate.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(upper_bound, lower_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(upper_bound, lower_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `base_estimator` [a Gaussian process estimator]:
        The Gaussian process estimator to use for optimization.

    * `acq` [string, default=`"LCB"`]:
        Function to minimize over the gaussian prior. Can be either

        - `"LCB"` for lower confidence bound,
        - `"EI"` for expected improvement,
        - `"PI"` for probability of improvement.

    * `xi` [float, default=0.01]:
        Controls how much improvement one wants over the previous best
        values. Used when the acquisition is either `"EI"` or `"PI"`.

    * `kappa` [float, default=1.96]:
        Controls how much of the variance in the predicted values should be
        taken into account. If set to be very high, then we are favouring
        exploration over exploitation and vice versa.
        Used when the acquisition is `"LCB"`.

    * `search` [string, `"sampling"` or `"lbfgs"`]:
        Searching for the next possible candidate to update the Gaussian prior
        with.

        If search is set to `"sampling"`, `n_points` are sampled randomly
        and the Gaussian Process prior is updated with the point that gives
        the best acquisition value over the Gaussian prior.

        If search is set to `"lbfgs"`, then a point is sampled randomly, and
        lbfgs is run for 10 iterations optimizing the acquisition function
        over the Gaussian prior.

    * `maxiter` [int, default=1000]:
        Number of iterations to find the minimum. Note that `n_start`
        iterations are effectively discounted, such that the total number of
        function evaluations is at most `maxiter`.

    * `n_points` [int, default=500]:
        Number of points to sample to determine the next "best" point.
        Useless if search is set to `"lbfgs"`.

    * `n_start` [int, default=10]:
        Number of random initialization points.

    * `n_restarts_optimizer` [int, default=5]:
        The number of restarts of the optimizer when `search` is `"lbfgs"`.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [float]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [array]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimisation space.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    rng = check_random_state(random_state)
    space = Space(dimensions)

    # Default GP
    if base_estimator is None:
        base_estimator = GaussianProcessRegressor(
            kernel=(ConstantKernel(1.0, (0.01, 1000.0)) *
                    Matern(length_scale=np.ones(space.transformed_n_dims),
                           length_scale_bounds=[(0.01, 100)] * space.transformed_n_dims,
                           nu=2.5)),
            normalize_y=True, alpha=10e-6, random_state=random_state)

    # First points
    Xi = space.rvs(n_samples=n_start, random_state=rng)
    yi = [func(x) for x in Xi]
    if np.ndim(yi) != 1:
        raise ValueError(
            "The function to be optimized should return a scalar")

    # Bayesian optimization loop
    models = []

    for i in range(maxiter - n_start):
        gp = clone(base_estimator)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gp.fit(space.transform(Xi), yi)

        models.append(gp)

        if search == "sampling":
            X = space.transform(space.rvs(n_samples=n_points,
                                          random_state=rng))
            values = _gaussian_acquisition(
                X=X, model=gp,  y_opt=np.min(yi), method=acq,
                xi=xi, kappa=kappa)
            next_x = X[np.argmin(values)]

        elif search == "lbfgs":
            best = np.inf

            for j in range(n_restarts_optimizer):
                x0 = space.transform(space.rvs(n_samples=1,
                                               random_state=rng))[0]

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    x, a, _ = fmin_l_bfgs_b(
                        _acquisition, x0,
                        args=(gp, np.min(yi), acq, xi, kappa),
                        bounds=space.transformed_bounds,
                        approx_grad=True, maxiter=10)

                if a < best:
                    next_x, best = x, a

        next_x = space.inverse_transform(next_x.reshape((1, -1)))[0]
        next_y = func(next_x)
        Xi = np.vstack((Xi, next_x))
        yi.append(next_y)

    # Pack results
    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models
    res.space = space

    return res
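A sketch of this variant with the LCB acquisition and the lbfgs search (toy objective; assumes the module-level imports used by the function above):

def objective(params):
    x, y = params
    return np.sin(3 * x) + 0.1 * y ** 2

res = gp_minimize(objective,
                  dimensions=[(-2.0, 2.0), (-2.0, 2.0)],
                  acq="LCB",
                  kappa=2.5,        # lean a little more towards exploration
                  search="lbfgs",
                  maxiter=15,
                  n_start=5,
                  random_state=0)
print(res.x, res.fun)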
Example #13
def _tree_minimize(func,
                   dimensions,
                   base_estimator,
                   n_calls,
                   n_points,
                   n_random_starts,
                   x0=None,
                   y0=None,
                   random_state=None,
                   acq="EI",
                   xi=0.01,
                   kappa=1.96):
    rng = check_random_state(random_state)
    space = Space(dimensions)

    # Initialize with provided points (x0 and y0) and/or random points
    if n_calls <= 0:
        raise ValueError("Expected `n_calls` > 0, got %d" % n_random_starts)

    if x0 is None:
        x0 = []
    elif not isinstance(x0[0], list):
        x0 = [x0]

    if not isinstance(x0, list):
        raise ValueError("`x0` should be a list, but got %s" % type(x0))

    n_init_func_calls = len(x0) if y0 is not None else 0
    n_total_init_calls = n_random_starts + n_init_func_calls

    if n_total_init_calls <= 0:
        # if x0 is not provided and n_random_starts is 0 then
        # it will ask for n_random_starts to be > 0.
        raise ValueError("Expected `n_random_starts` > 0, got %d" %
                         n_random_starts)

    if n_calls < n_total_init_calls:
        raise ValueError("Expected `n_calls` >= %d, got %d" %
                         (n_total_init_calls, n_calls))

    if y0 is None and x0:
        y0 = [func(x) for x in x0]
    elif x0:
        if isinstance(y0, Iterable):
            y0 = list(y0)
        elif isinstance(y0, numbers.Number):
            y0 = [y0]
        else:
            raise ValueError("`y0` should be an iterable or a scalar, got %s" %
                             type(y0))
        if len(x0) != len(y0):
            raise ValueError("`x0` and `y0` should have the same length")
        if not all(map(np.isscalar, y0)):
            raise ValueError("`y0` elements should be scalars")
    else:
        y0 = []

    Xi = x0 + space.rvs(n_samples=n_random_starts, random_state=rng)
    yi = y0 + [func(x) for x in Xi[len(x0):]]
    if np.ndim(yi) != 1:
        raise ValueError("`func` should return a scalar")

    # Tree-based optimization loop
    models = []
    n_model_iter = n_calls - n_total_init_calls
    for i in range(n_model_iter):
        rgr = clone(base_estimator)
        rgr.fit(space.transform(Xi), yi)
        models.append(rgr)

        # `rgr` predicts constants for each leaf which means that the EI
        # has zero gradient over large distances. As a result we can not
        # use gradient based optimizers like BFGS, so using random sampling
        # for the moment.
        X = space.transform(space.rvs(n_samples=n_points, random_state=rng))
        values = _gaussian_acquisition(X=X,
                                       model=rgr,
                                       y_opt=np.min(yi),
                                       method=acq,
                                       xi=xi,
                                       kappa=kappa)
        next_x = X[np.argmin(values)]
        next_x = space.inverse_transform(next_x.reshape((1, -1)))[0]
        next_y = func(next_x)
        Xi.append(next_x)
        yi.append(next_y)

    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models
    res.space = space
    res.random_state = rng

    return res
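A sketch of a warm-started tree-based run with the LCB acquisition (toy objective; assumes a regressor whose `predict` accepts `return_std=True`, e.g. the forest wrappers in `skopt.learning`):

from skopt.learning import RandomForestRegressor

def objective(params):
    x, = params
    return (x - 1.0) ** 2

res = _tree_minimize(objective,
                     dimensions=[(-3.0, 3.0)],
                     base_estimator=RandomForestRegressor(n_estimators=20),
                     n_calls=15,
                     n_points=500,
                     n_random_starts=5,
                     x0=[[-1.0]],   # a previously evaluated point
                     y0=[4.0],      # objective([-1.0]) == 4.0
                     acq="LCB",
                     random_state=0)
print(res.x, res.fun)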
def dummy_minimize(func, dimensions, n_calls=100,
                   x0=None, y0=None, random_state=None):
    """Random search by uniform sampling within the given bounds.

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(upper_bound, lower_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(upper_bound, lower_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `n_calls` [int, default=100]:
        Number of calls to `func` to find the minimum.

    * `x0` [list, list of lists or `None`]:
        Initial input points.

        - If it is a list of lists, use it as a list of input points.
        - If it is a list, use it as a single initial input point.
        - If it is `None`, no initial input points are used.

    * `y0` [list, scalar or `None`]
        Evaluation of initial input points.

        - If it is a list, then it corresponds to evaluations of the function
          at each element of `x0` : the i-th element of `y0` corresponds
          to the function evaluated at the i-th element of `x0`.
        - If it is a scalar, then it corresponds to the evaluation of the
          function at `x0`.
        - If it is None and `x0` is provided, then the function is evaluated
          at each element of `x0`.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [list]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimisation space.
        - `specs` [dict]: the call specifications.
        - `rng` [RandomState instance]: State of the random state
           at the end of minimization.

        For more details related to the OptimizeResult object, refer
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    # Save call args
    specs = {"args": copy.copy(inspect.currentframe().f_locals),
             "function": inspect.currentframe().f_code.co_name}

    # Check params
    rng = check_random_state(random_state)
    space = Space(dimensions)

    if x0 is None:
        x0 = []
    elif not isinstance(x0[0], list):
        x0 = [x0]

    if not isinstance(x0, list):
        raise ValueError("`x0` should be a list, got %s" % type(x0))

    if len(x0) > 0 and y0 is not None:
        if isinstance(y0, Iterable):
            y0 = list(y0)
        elif isinstance(y0, numbers.Number):
            y0 = [y0]
        else:
            raise ValueError("`y0` should be an iterable or a scalar, got %s"
                             % type(y0))
        if len(x0) != len(y0):
            raise ValueError("`x0` and `y0` should have the same length")

        if not all(map(np.isscalar, y0)):
            raise ValueError("`y0` elements should be scalars")

    elif len(x0) > 0 and y0 is None:
        y0 = []
        n_calls -= len(x0)

    elif len(x0) == 0 and y0 is not None:
        raise ValueError("`x0`cannot be `None` when `y0` is provided")

    else:  # len(x0) == 0 and y0 is None
        y0 = []

    X = x0
    y = y0

    # Random search
    X = X + space.rvs(n_samples=n_calls, random_state=rng)
    first = True

    for i in range(len(y0), len(X)):
        y_i = func(X[i])

        if first:
            first = False
            if not np.isscalar(y_i):
                raise ValueError("`func` should return a scalar")

        y.append(y_i)

    y = np.array(y)

    # Pack results
    res = OptimizeResult()
    best = np.argmin(y)
    res.x = X[best]
    res.fun = y[best]
    res.func_vals = y
    res.x_iters = X
    res.models = []  # Create attribute even though it is empty
    res.space = space
    res.random_state = rng
    res.specs = specs

    return res
Example #15
def gp_minimize(func,
                dimensions,
                base_estimator=None,
                alpha=10e-10,
                acq="EI",
                xi=0.01,
                kappa=1.96,
                search="auto",
                n_calls=100,
                n_points=500,
                n_random_starts=10,
                n_restarts_optimizer=5,
                x0=None,
                y0=None,
                random_state=None):
    """Bayesian optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever!

    The idea is to approximate the function using a Gaussian process.
    In other words the function values are assumed to follow a multivariate
    gaussian. The covariance of the function values is given by a
    GP kernel between the parameters. A smart choice of the next
    parameter to evaluate can then be made with the acquisition function
    over the Gaussian prior, which is much quicker to evaluate.

    The total of `n_calls` evaluations is performed as follows. If `x0`
    is provided but not `y0`, then the elements of `x0`
    are first evaluated, followed by `n_random_starts` evaluations.
    Finally, `n_calls - len(x0) - n_random_starts` evaluations are
    made guided by the surrogate model. If `x0` and `y0` are both
    provided then `n_random_starts` evaluations are first made then
    `n_calls - n_random_starts` subsequent evaluations are made
    guided by the surrogate model.
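
    For example, with `n_calls=15`, `n_random_starts=5` and three points in
    `x0` (but no `y0`), the three `x0` points are evaluated first, then five
    random points, and the remaining seven evaluations are guided by the
    surrogate model.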

    Parameters
    ----------
    * `func` [callable]:
        Function to minimize. Should take an array of parameters and
        return the function value.

    * `dimensions` [list, shape=(n_dims,)]:
        List of search space dimensions.
        Each search dimension can be defined either as

        - a `(lower_bound, upper_bound)` tuple (for `Real` or `Integer`
          dimensions),
        - a `(lower_bound, upper_bound, "prior")` tuple (for `Real`
          dimensions),
        - as a list of categories (for `Categorical` dimensions), or
        - an instance of a `Dimension` object (`Real`, `Integer` or
          `Categorical`).

    * `base_estimator` [a Gaussian process estimator]:
        The Gaussian process estimator to use for optimization.

    * `alpha` [float, default=10e-10]:
        Value added to the diagonal of the kernel matrix during fitting.
        Larger values correspond to an increased noise level in the
        observations and reduce potential numerical issues during fitting.

    * `acq` [string, default=`"EI"`]:
        Function to minimize over the Gaussian prior. Can be either

        - `"LCB"` for lower confidence bound,
        - `"EI"` for expected improvement,
        - `"PI"` for probability of improvement.

    * `xi` [float, default=0.01]:
        Controls how much improvement one wants over the previous best
        values. Used when the acquisition is either `"EI"` or `"PI"`.

    * `kappa` [float, default=1.96]:
        Controls how much of the variance in the predicted values should be
        taken into account. If set to be very high, then we are favouring
        exploration over exploitation and vice versa.
        Used when the acquisition is `"LCB"`.

    * `search` [string, `"auto"`, `"sampling"` or `"lbfgs"`, default=`"auto"`]:
        Searching for the next possible candidate to update the Gaussian prior
        with.

        If search is set to `"auto"`, then it is set to `"lbfgs"`` if
        all the search dimensions are Real(continuous). It defaults to
        `"sampling"` for all other cases.

        If search is set to `"sampling"`, `n_points` are sampled randomly
        and the Gaussian Process prior is updated with the point that gives
        the best acquisition value over the Gaussian prior.

        If search is set to `"lbfgs"`, then a point is sampled randomly, and
        lbfgs is run for 10 iterations optimizing the acquisition function
        over the Gaussian prior.

    * `n_calls` [int, default=100]:
        Number of calls to `func`.

    * `n_points` [int, default=500]:
        Number of points to sample to determine the next "best" point.
        Useless if search is set to `"lbfgs"`.

    * `n_random_starts` [int, default=10]:
        Number of evaluations of `func` with random initialization points
        before approximating the `func` with `base_estimator`.

    * `n_restarts_optimizer` [int, default=5]:
        The number of restarts of the optimizer when `search` is `"lbfgs"`.

    * `x0` [list, list of lists or `None`]:
        Initial input points.

        - If it is a list of lists, use it as a list of input points.
        - If it is a list, use it as a single initial input point.
        - If it is `None`, no initial input points are used.

    * `y0` [list, scalar or `None`]:
        Evaluation of initial input points.

        - If it is a list, then it corresponds to evaluations of the function
          at each element of `x0` : the i-th element of `y0` corresponds
          to the function evaluated at the i-th element of `x0`.
        - If it is a scalar, then it corresponds to the evaluation of the
          function at `x0`.
        - If it is None and `x0` is provided, then the function is evaluated
          at each element of `x0`.

    * `random_state` [int, RandomState instance, or None (default)]:
        Set random state to something other than None for reproducible
        results.

    Returns
    -------
    * `res` [`OptimizeResult`, scipy object]:
        The optimization result returned as an OptimizeResult object.
        Important attributes are:

        - `x` [list]: location of the minimum.
        - `fun` [float]: function value at the minimum.
        - `models`: surrogate models used for each iteration.
        - `x_iters` [list of lists]: location of function evaluation for each
           iteration.
        - `func_vals` [array]: function value for each iteration.
        - `space` [Space]: the optimization space.
        - `specs` [dict]: the call specifications.
        - `rng` [RandomState instance]: State of the random state
           at the end of minimization.

        For more details related to the OptimizeResult object, refer to
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    # Save call args
    specs = {
        "args": copy.copy(inspect.currentframe().f_locals),
        "function": inspect.currentframe().f_code.co_name
    }

    # Check params
    rng = check_random_state(random_state)
    space = Space(dimensions)

    # Default GP
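    # (a constant kernel times a Matern(nu=2.5) kernel with one length scale
    # per transformed dimension; `normalize_y=True` rescales the targets and
    # `alpha` adds jitter to the kernel diagonal)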
    if base_estimator is None:
        base_estimator = GaussianProcessRegressor(
            kernel=(ConstantKernel(1.0, (0.01, 1000.0)) * Matern(
                length_scale=np.ones(space.transformed_n_dims),
                length_scale_bounds=[(0.01, 100)] * space.transformed_n_dims,
                nu=2.5)),
            normalize_y=True,
            alpha=alpha,
            random_state=random_state)

    # Initialize with provided points (x0 and y0) and/or random points
    if x0 is None:
        x0 = []
    elif not isinstance(x0, list):
        raise ValueError("`x0` should be a list, but got %s" % type(x0))
    elif x0 and not isinstance(x0[0], list):
        x0 = [x0]

    # `func` is evaluated on `x0` only when `y0` is not provided.
    n_init_func_calls = len(x0) if y0 is None else 0
    n_total_init_calls = n_random_starts + n_init_func_calls

    if n_total_init_calls <= 0:
        # if x0 is not provided and n_random_starts is 0 then
        # it will ask for n_random_starts to be > 0.
        raise ValueError("Expected `n_random_starts` > 0, got %d" %
                         n_random_starts)

    if n_calls < n_total_init_calls:
        raise ValueError("Expected `n_calls` >= %d, got %d" %
                         (n_total_init_calls, n_calls))

    if y0 is None and x0:
        y0 = [func(x) for x in x0]
    elif x0:
        if isinstance(y0, Iterable):
            y0 = list(y0)
        elif isinstance(y0, numbers.Number):
            y0 = [y0]
        else:
            raise ValueError("`y0` should be an iterable or a scalar, got %s" %
                             type(y0))
        if len(x0) != len(y0):
            raise ValueError("`x0` and `y0` should have the same length")
        if not all(map(np.isscalar, y0)):
            raise ValueError("`y0` elements should be scalars")
    else:
        y0 = []

    Xi = x0 + space.rvs(n_samples=n_random_starts, random_state=rng)
    yi = y0 + [func(x) for x in Xi[len(x0):]]
    if np.ndim(yi) != 1:
        raise ValueError("`func` should return a scalar")

    if search == "auto":
        if space.is_real:
            search = "lbfgs"
        else:
            search = "sampling"
    elif search not in ["lbfgs", "sampling"]:
        raise ValueError(
            "Expected search to be 'lbfgs', 'sampling' or 'auto', "
            "got %s" % search)

    # Bayesian optimization loop
    models = []
    n_model_iter = n_calls - n_total_init_calls
    for i in range(n_model_iter):
        gp = clone(base_estimator)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gp.fit(space.transform(Xi), yi)

        models.append(gp)

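        # Candidate selection: either rank `n_points` random samples by their
        # acquisition value ("sampling"), or run L-BFGS-B from
        # `n_restarts_optimizer` random starting points and keep the best
        # minimum found ("lbfgs").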
        if search == "sampling":
            X = space.transform(space.rvs(n_samples=n_points,
                                          random_state=rng))
            values = _gaussian_acquisition(X=X,
                                           model=gp,
                                           y_opt=np.min(yi),
                                           method=acq,
                                           xi=xi,
                                           kappa=kappa)
            next_x = X[np.argmin(values)]

        elif search == "lbfgs":
            best = np.inf

            for j in range(n_restarts_optimizer):
                x0 = space.transform(space.rvs(n_samples=1,
                                               random_state=rng))[0]

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    x, a, _ = fmin_l_bfgs_b(_acquisition,
                                            x0,
                                            args=(gp, np.min(yi), acq, xi,
                                                  kappa),
                                            bounds=space.transformed_bounds,
                                            approx_grad=True,
                                            maxiter=20)

                if a < best:
                    next_x, best = x, a

        next_x = space.inverse_transform(next_x.reshape((1, -1)))[0]
        next_y = func(next_x)
        Xi.append(next_x)
        yi.append(next_y)

    # Pack results
    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models
    res.space = space
    res.random_state = rng
    res.specs = specs

    return res
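
# A minimal usage sketch for `gp_minimize` above on a toy objective. The
# objective and dimensions are illustrative assumptions only; with a mixed
# real/integer space, `search="auto"` falls back to "sampling".
def _quadratic(params):
    x, n = params
    return (x - 0.25) ** 2 + (n - 2) ** 2

_res_gp = gp_minimize(_quadratic,
                      dimensions=[(-2.0, 2.0), (0, 5)],
                      n_calls=20,
                      n_random_starts=5,
                      random_state=0)
# `_res_gp.x` holds the best parameters; `_res_gp.models` the GPs fit during
# the 15 model-guided iterations.
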
def gp_minimize(func, bounds=None, search="sampling", random_state=None,
                maxiter=1000, acq="UCB", num_points=500):
    """
    Black-box optimization using Gaussian Processes.

    If every function evaluation is expensive, for instance
    when the parameters are the hyperparameters of a neural network
    and the function evaluation is the mean cross-validation score across
    ten folds, optimizing the hyperparameters by standard optimization
    routines would take forever!

    The idea is to approximate the function using a Gaussian process.
    In other words, the function values are assumed to follow a multivariate
    Gaussian, whose covariance is given by a GP kernel between the
    parameters. The next parameter to evaluate can then be chosen by
    optimizing the acquisition function over the Gaussian posterior, which
    is much quicker to evaluate.

    Parameters
    ----------
    func: callable
        Function to minimize. Should take an array of parameters and
        return the function value.

    bounds: array-like, shape (n_parameters, 2)
        ``bounds[i][0]`` should give the lower bound of each parameter and
        ``bounds[i][1]`` should give the upper bound of each parameter.

    search: string, "sampling" or "lbfgs"
        Searching for the next possible candidate to update the Gaussian prior
        with.

        If search is set to "sampling", ``num_points`` are sampled randomly
        and the Gaussian Process prior is updated with that point that gives
        the best acquision value over the Gaussian posterior.

        If search is set to "lbfgs", then a point is sampled randomly, and
        lbfgs is run for 10 iterations optimizing the acquisition function
        over the Gaussian posterior.

    random_state: int, RandomState instance, or None (default)
        Set random state to something other than None for reproducible
        results.

    maxiter: int, default 1000
        Number of iterations to find the minimum. In other words, the
        number of function evaluations.

    acq: string, default "UCB"
        Function to minimize over the gaussian posterior. Can be either
        the "UCB" which refers to the UpperConfidenceBound or "EI" which
        is the Expected Improvement.

    num_points: int, default 500
        Number of points to sample to determine the next "best" point.
        Useless if search is set to "lbfgs".

    Returns
    -------
    res: OptimizeResult, scipy object
        The optimization result returned as an OptimizeResult object.
        Important attributes are
        ``x`` - array, the location of the minimum,
        ``fun`` - float, the value of the function at the minimum,
        ``models`` - list of GP models; ``models[i]`` is the prior on the
                     function fit at iteration ``i``,
        ``func_vals`` - the function value at the i-th iteration,
        ``x_iters`` - the value of ``x`` corresponding to the function value
                      at the i-th iteration.
    For more details related to the OptimizeResult object, refer to
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html
    """
    rng = np.random.RandomState(random_state)

    num_params = len(bounds)
    lower_bounds, upper_bounds = zip(*bounds)
    upper_bounds = np.asarray(upper_bounds)
    lower_bounds = np.asarray(lower_bounds)
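    # The search runs in the unit hypercube [0, 1]^num_params; points are
    # mapped back to the original bounds via
    # lower_bounds + (upper_bounds - lower_bounds) * x before calling `func`.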
    x0 = rng.rand(num_params)
    func_val = [func(lower_bounds + (upper_bounds - lower_bounds) * x0)]

    length_scale = np.ones(num_params)
    gp_params = {
        'kernel': Matern(length_scale=length_scale, nu=2.5),
        'normalize_y': True,
        'random_state': random_state
    }
    lbfgs_bounds = np.tile((0, 1), (num_params, 1))

    gp_models = []
    x = np.reshape(x0, (1, -1))

    for i in range(maxiter):
        gpr = GaussianProcessRegressor(**gp_params)
        gpr.fit(x, func_val)

        if search == "sampling":
            sampling = rng.rand(num_points, num_params)
            acquis = acquisition_func(sampling, gpr, np.min(func_val), acq)
            best_arg = np.argmin(acquis)
            best_x = sampling[best_arg]
        elif search == "lbfgs":
            init = rng.rand(num_params)
            best_x, _, _ = fmin_l_bfgs_b(
                acquisition_func,
                np.asfortranarray(init),
                args=(gpr, np.min(func_val), acq),
                bounds=lbfgs_bounds, approx_grad=True, maxiter=10)

        gp_models.append(gpr)

        best_f = func(lower_bounds + (upper_bounds - lower_bounds) * best_x)
        x_list = x.tolist()
        x_list.append(best_x)
        x = np.asarray(x_list)
        func_val.append(best_f)

    x = lower_bounds + (upper_bounds - lower_bounds) * x
    func_ind = np.argmin(func_val)
    x_val = x[func_ind]
    best_func_val = func_val[func_ind]
    res = OptimizeResult()
    res.models = gp_models

    res.x = x_val
    res.fun = best_func_val
    res.func_vals = func_val
    res.x_iters = x

    return res
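
# A minimal usage sketch for the bounds-based `gp_minimize` variant above.
# The sphere objective and bounds are illustrative assumptions; note that this
# variant calls `func` maxiter + 1 times (one initial random point plus one
# per iteration).
def _sphere(params):
    return float(np.sum(np.asarray(params) ** 2))

_res_old = gp_minimize(_sphere,
                       bounds=[(-5.0, 5.0), (-5.0, 5.0)],
                       search="sampling",
                       maxiter=30,
                       acq="EI",
                       num_points=200,
                       random_state=0)
# `_res_old.x` is the best point found; `_res_old.func_vals` the history.
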
def _tree_minimize(func, dimensions, base_estimator, n_calls,
                   n_points, n_random_starts, x0=None, y0=None,
                   random_state=None, acq="EI", xi=0.01, kappa=1.96):
    rng = check_random_state(random_state)
    space = Space(dimensions)

    # Initialize with provided points (x0 and y0) and/or random points
    if n_calls <= 0:
        raise ValueError(
            "Expected `n_calls` > 0, got %d" % n_random_starts)

    if x0 is None:
        x0 = []
    elif not isinstance(x0, list):
        raise ValueError("`x0` should be a list, but got %s" % type(x0))
    elif x0 and not isinstance(x0[0], list):
        x0 = [x0]

    # `func` is evaluated on `x0` only when `y0` is not provided.
    n_init_func_calls = len(x0) if y0 is None else 0
    n_total_init_calls = n_random_starts + n_init_func_calls

    if n_total_init_calls <= 0:
        # if x0 is not provided and n_random_starts is 0 then
        # it will ask for n_random_starts to be > 0.
        raise ValueError(
            "Expected `n_random_starts` > 0, got %d" % n_random_starts)

    if n_calls < n_total_init_calls:
        raise ValueError(
            "Expected `n_calls` >= %d, got %d" % (n_total_init_calls, n_calls))

    if y0 is None and x0:
        y0 = [func(x) for x in x0]
    elif x0:
        if isinstance(y0, Iterable):
            y0 = list(y0)
        elif isinstance(y0, numbers.Number):
            y0 = [y0]
        else:
            raise ValueError(
                "`y0` should be an iterable or a scalar, got %s" % type(y0))
        if len(x0) != len(y0):
            raise ValueError("`x0` and `y0` should have the same length")
        if not all(map(np.isscalar, y0)):
            raise ValueError("`y0` elements should be scalars")
    else:
        y0 = []

    Xi = x0 + space.rvs(n_samples=n_random_starts, random_state=rng)
    yi = y0 + [func(x) for x in Xi[len(x0):]]
    if np.ndim(yi) != 1:
        raise ValueError("`func` should return a scalar")

    # Tree-based optimization loop
    models = []
    n_model_iter = n_calls - n_total_init_calls
    for i in range(n_model_iter):
        rgr = clone(base_estimator)
        rgr.fit(space.transform(Xi), yi)
        models.append(rgr)

        # `rgr` predicts constants for each leaf which means that the EI
        # has zero gradient over large distances. As a result we can not
        # use gradient based optimizers like BFGS, so using random sampling
        # for the moment.
        X = space.transform(space.rvs(n_samples=n_points,
                                      random_state=rng))
        values = _gaussian_acquisition(
            X=X, model=rgr, y_opt=np.min(yi), method=acq,
            xi=xi, kappa=kappa)
        next_x = X[np.argmin(values)]
        next_x = space.inverse_transform(next_x.reshape((1, -1)))[0]
        next_y = func(next_x)
        Xi.append(next_x)
        yi.append(next_y)

    res = OptimizeResult()
    best = np.argmin(yi)
    res.x = Xi[best]
    res.fun = yi[best]
    res.func_vals = np.array(yi)
    res.x_iters = Xi
    res.models = models
    res.space = space
    res.random_state = rng

    return res
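
# A minimal usage sketch for `_tree_minimize` above. The acquisition helper
# needs a regressor whose `predict` supports `return_std=True`; skopt's
# `ExtraTreesRegressor` wrapper is assumed to provide that here. The
# objective, dimensions and settings are illustrative only.
from skopt.learning import ExtraTreesRegressor

def _bowl(params):
    x, y = params
    return (x - 0.5) ** 2 + (y - 0.5) ** 2

_res_tree = _tree_minimize(_bowl,
                           dimensions=[(0.0, 1.0), (0.0, 1.0)],
                           base_estimator=ExtraTreesRegressor(
                               n_estimators=30, random_state=0),
                           n_calls=25,
                           n_points=500,
                           n_random_starts=10,
                           random_state=0)
# Ten random evaluations are followed by fifteen model-guided ones.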