def add_sampled_points(self, sampled_points):
    r"""Append new sample(s) (point, value, noise) to the stored history and pass them on to the wrapped GP.

    The samples are appended to ``self._historical_data`` first; only the rows added by that
    append are then forwarded to ``self._gaussian_process.add_sampled_points``.
    Also forces recomputation of all derived quantities for GP to remain consistent.

    :param sampled_points: :class:`moe.optimal_learning.python.SamplePoint` objects to load
      into the GP (containing point, function value, and noise variance)
    :type sampled_points: list of :class:`~moe.optimal_learning.python.SamplePoint` objects (or SamplePoint-like iterables)

    """
    # TODO(GH-159): When C++ can pass back numpy arrays, we can stop keeping a duplicate in self._historical_data.
    # Captured before the append so the slices below start at the first newly added row.
    first_new_row = self.num_sampled
    added_count = len(sampled_points)

    history = self._historical_data
    history.append_sample_points(sampled_points)

    new_points = cpp_utils.cppify(history.points_sampled[first_new_row:, ...])
    new_values = cpp_utils.cppify(history.points_sampled_value[first_new_row:])
    new_noise = cpp_utils.cppify(history.points_sampled_noise_variance[first_new_row:])
    self._gaussian_process.add_sampled_points(new_points, new_values, new_noise, added_count)
def compute_log_likelihood(self, hyps):
    r"""Evaluate the ``objective_type`` measure (plus the prior term) at the log-scale hyperparameters ``hyps``.

    .. Note:: When ``self.noisy`` is false, the trailing (noise) entries of ``hyps`` are
      overwritten *in place* with ``log(1e-8)``; the caller's array is modified.

    :param hyps: hyperparameters on a log scale; after exponentiation the first ``self.dim + 1``
      entries are passed to C++ as covariance hyperparameters and the remaining entries as noise
    :type hyps: array of float64 (1-D; total length not checked here)
    :return: ``-inf`` when any entry of ``hyps`` is outside ``[-20, 20]`` or when the prior
      evaluates to ``-inf``; otherwise the prior term plus the value of log_likelihood
      evaluated at hyperparameters (``LL(y | X, \theta)``)
    :rtype: float64

    """
    # Bound the hyperparameter space to keep things sane. Note all
    # hyperparameters live on a log scale.
    outside_bounds = (hyps < -20) + (hyps > 20)
    if numpy.any(outside_bounds):
        return -numpy.inf

    noise_start = self.dim + 1
    if not self.noisy:
        # Noise-free model: pin every noise entry to log(1e-8) (mutates the caller's array).
        hyps[noise_start:] = numpy.log((1 + self._num_derivatives) * [1.e-8])

    if self.prior is None:
        # NOTE(review): without a prior the additive term is 1, not 0 -- presumably meant as a
        # flat prior (a constant offset); confirm before relying on absolute values.
        prior_term = 1
    else:
        prior_term = self.prior.lnprob(hyps)

    # A prior of -inf rules the point out; skip the C++ evaluation entirely.
    if prior_term == -numpy.inf:
        return -numpy.inf

    # C++ receives the hyperparameters on the natural (non-log) scale.
    natural_hyps = numpy.exp(hyps)
    covariance_part = natural_hyps[:noise_start]
    noise_part = natural_hyps[noise_start:]

    likelihood = C_GP.compute_log_likelihood(
        cpp_utils.cppify(self._points_sampled),
        cpp_utils.cppify(self._points_sampled_value),
        self.dim,
        self._num_sampled,
        self.objective_type,
        cpp_utils.cppify_hyperparameters(covariance_part),
        cpp_utils.cppify(self._derivatives),
        self._num_derivatives,
        cpp_utils.cppify(noise_part),
    )
    return prior_term + likelihood
def restarted_hyperparameter_optimization(
        log_likelihood_optimizer,
        status=None,
):
    """Call ``C_GP.restarted_hyperparameter_optimization`` for the optimizer's objective function.

    The optimizer's domain bounds are converted to log10 space before being passed to C++.

    :param log_likelihood_optimizer: object supplying ``optimizer_parameters``, ``domain``
      (its ``_domain_bounds`` are read) and ``objective_function`` (the log likelihood whose
      data, hyperparameters, noise variance and derivatives are forwarded)
    :param status: (output) dict passed to C++; a fresh dict is created when omitted
    :type status: dict
    :return: the value returned by the C++ call, wrapped with ``numpy.array``
      (presumably the optimized hyperparameters -- TODO confirm)
    :rtype: array of float64

    """
    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    # C++ expects the domain in log10 space and in list form
    log10_bounds = numpy.log10(log_likelihood_optimizer.domain._domain_bounds)
    objective = log_likelihood_optimizer.objective_function

    optimized = C_GP.restarted_hyperparameter_optimization(
        log_likelihood_optimizer.optimizer_parameters,
        cpp_utils.cppify(log10_bounds),
        cpp_utils.cppify(objective._points_sampled),
        cpp_utils.cppify(objective._points_sampled_value),
        objective.dim,
        objective._num_sampled,
        cpp_utils.cppify_hyperparameters(objective.cov_hyperparameters),
        cpp_utils.cppify(objective.noise_variance),
        cpp_utils.cppify(objective.derivatives),
        objective.num_derivatives,
        status,
    )
    return numpy.array(optimized)
def evaluate_at_point_list(
        self,
        points_to_evaluate,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    """Evaluate Expected Improvement at every entry of ``points_to_evaluate``.

    .. Note:: ``points_to_evaluate`` is used instead of ``self._points_to_sample``; EI is
      computed at those points only and ``self._points_to_sample`` is unchanged.

    Generally gradient descent is preferred, but when it fails to converge this may be the
    only "robust" option. This function is also useful for plotting or debugging purposes
    (just to get a bunch of EI values).

    :param points_to_evaluate: points at which to compute EI
    :type points_to_evaluate: array of float64 with three axes; the first two are read as
      (num_to_evaluate, num_to_sample) and the third is ignored here (presumably ``self.dim``)
    :param randomness: RNGs used by C++ as the source of normal random numbers when monte-carlo
      is used. When omitted: ``self._randomness`` is used if ``max_num_threads == 1``,
      otherwise a new container with ``max_num_threads`` sources is created and seeded
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: EI evaluated at each of points_to_evaluate
    :rtype: array of float64 with shape (points_to_evaluate.shape[0])

    """
    # Create enough randomness sources if none are specified.
    if randomness is None:
        if max_num_threads != 1:
            randomness = C_GP.RandomnessSourceContainer(max_num_threads)
            # Set seeds based on less repeatable factors (e.g,. time)
            randomness.SetRandomizedUniformGeneratorSeed(0)
            randomness.SetRandomizedNormalRNGSeed(0)
        else:
            randomness = self._randomness

    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    # num_to_sample need not match ei_evaluator.num_to_sample since points_to_evaluate
    # overrides any data inside ei_evaluator
    eval_shape = points_to_evaluate.shape
    num_to_evaluate, num_to_sample, _ = eval_shape

    raw_ei = C_GP.evaluate_EI_at_point_list(
        self._gaussian_process._gaussian_process,
        cpp_utils.cppify(points_to_evaluate),
        cpp_utils.cppify(self._points_being_sampled),
        num_to_evaluate,
        num_to_sample,
        self.num_being_sampled,
        self._best_so_far,
        self._num_mc_iterations,
        max_num_threads,
        randomness,
        status,
    )
    return numpy.array(raw_ei)
def compute_knowledge_gradient_mcmc(self, force_monte_carlo=False):
    r"""Compute the knowledge gradient at ``points_to_sample``, with ``points_being_sampled`` concurrent points being sampled.

    .. Note:: The description below was copied from
      :meth:`moe.optimal_learning.python.interfaces.expected_improvement_interface.ExpectedImprovementInterface.compute_expected_improvement`
      and therefore speaks in EI terms. NOTE(review): confirm it matches the computation done in C++.

    ``points_to_sample`` is the "q" and ``points_being_sampled`` is the "p" in q,p-EI.

    The quantity is written as ``EI(Xs) = E_n[[f^*_n(X) - min(f(Xs_1),...,f(Xs_m))]^+]``, where ``Xs``
    are potential points to sample (union of ``points_to_sample`` and ``points_being_sampled``) and
    ``X`` are already sampled points. ``^+`` means the bracketed expression counts as 0 when negative.
    ``f^*(X)`` is the MINIMUM over all known function evaluations (``points_sampled_value``), whereas
    ``f(Xs)`` are *GP-predicted* function evaluations.

    Because the expression is hard to evaluate in general, Monte-Carlo simulation is used:
    repeatedly draw ``y = \mu + Lw`` at the union of the two point sets, where ``\mu`` is the GP-mean,
    ``L`` is the ``chol_factor(GP-variance)`` and ``w`` is a vector of ``num_union`` draws from N(0, 1).
    Then ``improvement_per_step = max(max(best_so_far - y), 0.0)``; the inner ``max`` means only the
    smallest component of ``y`` contributes per draw. The result is the average over many draws.

    :param force_monte_carlo: accepted for interface compatibility; this method does not forward it to C++
    :type force_monte_carlo: boolean
    :return: the knowledge gradient from sampling ``points_to_sample`` with ``points_being_sampled`` concurrent experiments
    :rtype: float64

    """
    inner_optimizer = self._inner_optimizer
    cpp_arguments = (
        self._gaussian_process_mcmc._gaussian_process_mcmc,
        self._num_fidelity,
        inner_optimizer.optimizer_parameters,
        cpp_utils.cppify(inner_optimizer.domain.domain_bounds),
        cpp_utils.cppify(self._discrete_pts_list),
        cpp_utils.cppify(self._points_to_sample),
        cpp_utils.cppify(self._points_being_sampled),
        self.discrete,
        self.num_to_sample,
        self.num_being_sampled,
        self._num_mc_iterations,
        cpp_utils.cppify(self._best_so_far_list),
        self._randomness,
    )
    return C_GP.compute_knowledge_gradient_mcmc(*cpp_arguments)
def compute_expected_improvement(self, force_monte_carlo=False):
    r"""Compute the expected improvement at ``points_to_sample``, with ``points_being_sampled`` concurrent points being sampled.

    .. Note:: These comments were copied from
      :meth:`moe.optimal_learning.python.interfaces.expected_improvement_interface.ExpectedImprovementInterface.compute_expected_improvement`

    ``points_to_sample`` is the "q" and ``points_being_sampled`` is the "p" in q,p-EI.

    Computes ``EI(Xs) = E_n[[f^*_n(X) - min(f(Xs_1),...,f(Xs_m))]^+]``, where ``Xs`` are potential
    points to sample (union of ``points_to_sample`` and ``points_being_sampled``) and ``X`` are already
    sampled points. ``^+`` means the bracketed expression counts as 0 when negative. ``f^*(X)`` is the
    MINIMUM over all known function evaluations (``points_sampled_value``), whereas ``f(Xs)`` are
    *GP-predicted* function evaluations.

    In words: the expected improvement over the current ``best_so_far`` (best known objective value)
    that would result from running new experiments at ``points_to_sample`` while
    ``points_being_sampled`` experiments are ongoing.

    The expression is hard to evaluate in general, so Monte-Carlo simulation is used unless a faster
    (e.g., analytic) technique is available. The MC approach repeatedly draws ``y = \mu + Lw`` at the
    union of the two point sets (analogous to gaussian_process_interface.sample_point_from_gp, but for
    ``num_union`` points at once), where ``\mu`` is the GP-mean, ``L`` is the ``chol_factor(GP-variance)``
    and ``w`` is a vector of ``num_union`` draws from N(0, 1). Then
    ``improvement_per_step = max(max(best_so_far - y), 0.0)``; the inner ``max`` means only the smallest
    component of ``y`` contributes per draw. The improvement is averaged over many random draws.

    :param force_monte_carlo: whether to force monte carlo evaluation (vs using fast/accurate analytic eval when possible)
    :type force_monte_carlo: boolean
    :return: the expected improvement from sampling ``points_to_sample`` with ``points_being_sampled`` concurrent experiments
    :rtype: float64

    """
    cpp_gaussian_process = self._gaussian_process._gaussian_process
    candidate_points = cpp_utils.cppify(self._points_to_sample)
    concurrent_points = cpp_utils.cppify(self._points_being_sampled)

    expected_improvement = C_GP.compute_expected_improvement(
        cpp_gaussian_process,
        candidate_points,
        concurrent_points,
        self.num_to_sample,
        self.num_being_sampled,
        self._num_mc_iterations,
        self._best_so_far,
        force_monte_carlo,
        self._randomness,
    )
    return expected_improvement
def evaluate_log_likelihood_at_hyperparameter_list(
        log_likelihood_evaluator,
        hyperparameters_to_evaluate,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    """Compute the specified log likelihood measure at each input set of hyperparameters.

    Generally Newton or gradient descent is preferred but when they fail to converge this may be the
    only "robust" option. This function is also useful for plotting or debugging purposes (just to get
    a bunch of log likelihood values).

    Calls into evaluate_log_likelihood_at_hyperparameter_list() in cpp/GPP_python_model_selection.cpp.

    :param log_likelihood_evaluator: object specifying which log likelihood measure to evaluate; its
      sampled data, ``cov_hyperparameters``, ``noise_variance`` and ``derivatives`` are forwarded to C++
    :type log_likelihood_evaluator: cpp_wrappers.log_likelihood.LogLikelihood
    :param hyperparameters_to_evaluate: the hyperparameters at which to compute the specified log likelihood
    :type hyperparameters_to_evaluate: array of float64 with shape (num_to_eval, log_likelihood_evaluator.num_hyperparameters)
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: log likelihood value at each specified set of hyperparameters
    :rtype: array of float64 with shape (hyperparameters_to_evaluate.shape[0])

    """
    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    evaluator = log_likelihood_evaluator
    # We could just call log_likelihood_evaluator.compute_log_likelihood() in a loop, but instead we do
    # the looping in C++ where it can be multithreaded.
    values = C_GP.evaluate_log_likelihood_at_hyperparameter_list(
        cpp_utils.cppify(hyperparameters_to_evaluate),
        cpp_utils.cppify(evaluator._points_sampled),
        cpp_utils.cppify(evaluator._points_sampled_value),
        evaluator.dim,
        evaluator._num_sampled,
        evaluator.objective_type,
        cpp_utils.cppify_hyperparameters(evaluator.cov_hyperparameters),
        cpp_utils.cppify(evaluator.noise_variance),
        cpp_utils.cppify(evaluator.derivatives),
        evaluator.num_derivatives,
        hyperparameters_to_evaluate.shape[0],
        max_num_threads,
        status,
    )
    return numpy.array(values)
def sample_point_from_gp(self, point_to_sample, noise_variance=0.0):
    r"""Draw a function value from this Gaussian Process at ``point_to_sample``.

    Uses the formula ``function_value = gpp_mean + sqrt(gpp_variance) * w1 + sqrt(noise_variance) * w2``,
    where ``w1, w2`` are draws from N(0,1). The normal RNG source is held within the C++ GaussianProcess object.

    .. NOTE::
         Set noise_variance to 0 if you want "accurate" draws from the GP.
         BUT if the drawn (point, value) pair is meant to be added back into the GP (e.g., for testing),
         then this point MUST be drawn with noise_variance equal to the noise associated with "point"
         as a member of "points_sampled"

    .. Note:: Comments are copied from
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.sample_point_from_gp`

    :param point_to_sample: point (in dim dimensions) at which to sample from this GP
    :type point_to_sample: array of float64 with shape (dim)
    :param noise_variance: amount of noise to associate with the sample
    :type noise_variance: float64 >= 0.0
    :return: sample_value: function value drawn from this GP
    :rtype: float64

    """
    cpp_point = cpp_utils.cppify(point_to_sample)
    sample_value = self._gaussian_process.sample_point_from_gp(cpp_point, noise_variance)
    return sample_value
def compute_grad_variance_of_points(self, points_to_sample, num_derivatives=-1):
    r"""Compute the gradient of the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``) wrt ``Xs``.

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    Similar to compute_grad_cholesky_variance_of_points(), except that gradient terms from the
    cholesky factorization are not included. The description is not duplicated here.

    .. Note:: Comments are copied from
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_grad_variance_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :param num_derivatives: return derivatives wrt points_to_sample[0:num_derivatives]; large or negative values are clamped
    :type num_derivatives: int
    :return: grad_var: gradient of the variance matrix of this GP
    :rtype: array of float64 with shape (num_derivatives, num_to_sample, num_to_sample, dim)

    """
    num_to_sample = points_to_sample.shape[0]
    num_derivatives = self._clamp_num_derivatives(num_to_sample, num_derivatives)

    flat_gradient = self._gaussian_process.compute_grad_variance_of_points(
        cpp_utils.cppify(points_to_sample),
        num_to_sample,
        num_derivatives,
    )
    # Restore the 4-D layout from the C++ return value.
    output_shape = (num_derivatives, num_to_sample, num_to_sample, self.dim)
    return cpp_utils.uncppify(flat_gradient, output_shape)
def compute_grad_mean_of_points(self, points_to_sample, num_derivatives=-1):
    r"""Compute the gradient of the mean of this GP at each point of ``Xs`` (``points_to_sample``) wrt ``Xs``.

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    ``grad_mu`` is nominally sized ``grad_mu[num_to_sample][num_to_sample][dim]``: the d-th component
    of the derivative evaluated at the i-th input wrt the j-th input. However, for
    ``0 <= i,j < num_to_sample``, ``i != j``, ``grad_mu[j][i][d] = 0``. (See references or implementation
    for further details.) Thus ``grad_mu`` is stored in a reduced form which only tracks the nonzero entries.

    .. Note:: Comments are copied from
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_grad_mean_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :param num_derivatives: return derivatives wrt points_to_sample[0:num_derivatives]; large or negative values are clamped
    :type num_derivatives: int
    :return: grad_mu: gradient of the mean of the GP. ``grad_mu[i][d]`` is actually the gradient
      of ``\mu_i`` wrt ``x_{i,d}``, the d-th dim of the i-th entry of ``points_to_sample``.
    :rtype: array of float64 with shape (num_derivatives, dim)

    """
    num_derivatives = self._clamp_num_derivatives(points_to_sample.shape[0], num_derivatives)

    # Only the first num_derivatives points are handed to C++.
    leading_points = points_to_sample[:num_derivatives, ...]
    flat_gradient = self._gaussian_process.compute_grad_mean_of_points(
        cpp_utils.cppify(leading_points),
        num_derivatives,
    )
    output_shape = (num_derivatives, self.dim)
    return cpp_utils.uncppify(flat_gradient, output_shape)
def __init__(
        self,
        covariance_function,
        historical_data,
        noise_variance,
        derivatives,
        log_likelihood_type=C_GP.LogLikelihoodTypes.log_marginal_likelihood
):
    """Construct a LogLikelihood object that knows how to call C++ for evaluation of member functions.

    ``covariance_function``, ``historical_data`` and ``derivatives`` are deep-copied;
    ``noise_variance`` is stored as passed.

    :param covariance_function: covariance object encoding assumptions about the GP's behavior on our data
    :type covariance_function: :class:`moe.optimal_learning.python.interfaces.covariance_interface.CovarianceInterface` subclass
      (e.g., from :mod:`moe.optimal_learning.python.cpp_wrappers.covariance`).
    :param historical_data: object specifying the already-sampled points, the objective value at those points,
      and the noise variance associated with each observation
    :type historical_data: :class:`moe.optimal_learning.python.data_containers.HistoricalData` object
    :param noise_variance: noise variance value(s) kept on the object (format not checked here)
    :param derivatives: derivative specification; ``_num_derivatives`` is the length of its cppified form
    :param log_likelihood_type: enum specifying which log likelihood measure to compute
    :type log_likelihood_type: GPP.LogLikelihoodTypes

    """
    self.objective_type = log_likelihood_type
    self._noise_variance = noise_variance

    # Private copies of the mutable inputs.
    self._covariance = copy.deepcopy(covariance_function)
    self._historical_data = copy.deepcopy(historical_data)
    derivatives_copy = copy.deepcopy(derivatives)
    self._derivatives = derivatives_copy
    self._num_derivatives = len(cpp_utils.cppify(derivatives_copy))
def __init__(self, historical_data, derivatives, prior, chain_length, burnin_steps, n_hypers,
             log_likelihood_type=C_GP.LogLikelihoodTypes.log_marginal_likelihood, noisy=True, rng=None):
    """Construct a LogLikelihood object that knows how to call C++ for evaluation of member functions.

    :param historical_data: object specifying the already-sampled points, the objective value at those points,
      and the noise variance associated with each observation (deep-copied)
    :type historical_data: :class:`moe.optimal_learning.python.data_containers.HistoricalData` object
    :param derivatives: derivative specification (deep-copied); ``_num_derivatives`` is the length of its cppified form
    :param prior: prior over the hyperparameters, or None; stored as ``self.prior``
    :param chain_length: stored as ``self.chain_length`` (presumably the MCMC chain length -- TODO confirm)
    :param burnin_steps: stored as ``self.burnin_steps`` (presumably the number of burn-in steps -- TODO confirm)
    :param n_hypers: requested number of hyperparameter samples; also a lower bound on ``self.n_chains``
    :type n_hypers: int
    :param log_likelihood_type: enum specifying which log likelihood measure to compute
    :type log_likelihood_type: GPP.LogLikelihoodTypes
    :param noisy: stored as ``self.noisy``
    :type noisy: boolean
    :param rng: random state to use; when None a ``numpy.random.RandomState`` is created with a seed
      drawn from ``numpy.random.randint(0, 10000)``
    :type rng: numpy.random.RandomState

    """
    self._historical_data = copy.deepcopy(historical_data)
    self._derivatives = copy.deepcopy(derivatives)
    self._num_derivatives = len(cpp_utils.cppify(self._derivatives))

    self.objective_type = log_likelihood_type
    self.prior = prior
    self.noisy = noisy

    # Sampler bookkeeping.
    self.chain_length = chain_length
    self.burnin_steps = burnin_steps
    self.burned = False
    self._models = []

    if rng is not None:
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(numpy.random.randint(0, 10000))

    self.n_hypers = n_hypers
    # At least twice (dim + 1 + 1 + num_derivatives) chains, or n_hypers if that is larger.
    minimum_chains = 2 * (self._historical_data.dim + 1 + 1 + self._num_derivatives)
    self.n_chains = max(n_hypers, minimum_chains)
def compute_log_likelihood(self):
    r"""Compute the objective_type measure at the object's current hyperparameters.

    Forwards the sampled points, their values, ``self.hyperparameters`` and the per-point
    noise variance to ``C_GP.compute_log_likelihood``.

    :return: value of log_likelihood evaluated at hyperparameters (``LL(y | X, \theta)``)
    :rtype: float64

    """
    sampled_points = cpp_utils.cppify(self._points_sampled)
    sampled_values = cpp_utils.cppify(self._points_sampled_value)
    log_likelihood = C_GP.compute_log_likelihood(
        sampled_points,
        sampled_values,
        self.dim,
        self._num_sampled,
        self.objective_type,
        cpp_utils.cppify_hyperparameters(self.hyperparameters),
        cpp_utils.cppify(self._points_sampled_noise_variance),
    )
    return log_likelihood
def compute_grad_cholesky_variance_of_points(self, points_to_sample, num_derivatives=-1):
    r"""Compute the gradient of the cholesky factorization of the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``) wrt ``Xs``.

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    This accounts for the effect on the gradient resulting from cholesky-factoring the variance matrix.
    See Smith 1995 for algorithm details.

    ``grad_chol`` is nominally sized ``grad_chol[num_to_sample][num_to_sample][num_to_sample][dim]``.
    Indexed as ``grad_chol[k][j][i][d]``, it reads as the derivative of ``var[j][i]`` with respect to
    ``x_{k,d}`` (x = ``points_to_sample``).

    .. Note:: Comments are copied from
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_grad_cholesky_variance_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :param num_derivatives: return derivatives wrt points_to_sample[0:num_derivatives]; large or negative values are clamped
    :type num_derivatives: int
    :return: grad_chol: gradient of the cholesky factorization of the variance matrix of this GP.
      ``grad_chol[k][j][i][d]`` is actually the gradients of ``var_{j,i}`` with
      respect to ``x_{k,d}``, the d-th dimension of the k-th entry of ``points_to_sample``
    :rtype: array of float64 with shape (num_derivatives, num_to_sample, num_to_sample, dim)

    """
    num_to_sample = points_to_sample.shape[0]
    num_derivatives = self._clamp_num_derivatives(num_to_sample, num_derivatives)

    flat_gradient = self._gaussian_process.compute_grad_cholesky_variance_of_points(
        cpp_utils.cppify(points_to_sample),
        num_to_sample,
        num_derivatives,
    )
    output_shape = (num_derivatives, num_to_sample, num_to_sample, self.dim)
    return cpp_utils.uncppify(flat_gradient, output_shape)
def compute_grad_mean_of_points(self, points_to_sample, num_derivatives=-1):
    r"""Compute the gradient of the mean of this GP at each point of ``Xs`` (``points_to_sample``) wrt ``Xs``.

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    Nominally ``grad_mu`` has size ``grad_mu[num_to_sample][num_to_sample][dim]`` (the d-th component of
    the derivative evaluated at the i-th input wrt the j-th input). Since ``grad_mu[j][i][d] = 0`` whenever
    ``i != j`` (``0 <= i,j < num_to_sample``; see references or implementation for details), it is stored
    in a reduced form which only tracks the nonzero entries.

    .. Note:: Comments are copied from
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_grad_mean_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :param num_derivatives: return derivatives wrt points_to_sample[0:num_derivatives]; large or negative values are clamped
    :type num_derivatives: int
    :return: grad_mu: gradient of the mean of the GP. ``grad_mu[i][d]`` is actually the gradient
      of ``\mu_i`` wrt ``x_{i,d}``, the d-th dim of the i-th entry of ``points_to_sample``.
    :rtype: array of float64 with shape (num_derivatives, dim)

    """
    total_points = points_to_sample.shape[0]
    num_derivatives = self._clamp_num_derivatives(total_points, num_derivatives)

    # C++ only needs the points the derivatives are taken with respect to.
    cpp_points = cpp_utils.cppify(points_to_sample[:num_derivatives, ...])
    raw_grad_mu = self._gaussian_process.compute_grad_mean_of_points(cpp_points, num_derivatives)
    return cpp_utils.uncppify(raw_grad_mu, (num_derivatives, self.dim))
def compute_grad_posterior_mean(self, force_monte_carlo=False):
    r"""Compute the gradient of the posterior mean at ``points_to_sample`` via ``C_GP.compute_grad_posterior_mean``.

    The wrapped C++ GP, ``self._num_fidelity`` and ``self._points_to_sample`` are passed to C++; the
    result is reshaped to a single row with ``self.dim - self._num_fidelity`` columns.

    .. Note:: The previous comments here were copied from
      :meth:`moe.optimal_learning.python.interfaces.expected_improvement_interface.ExpectedImprovementInterface.compute_grad_expected_improvement`
      and described the Monte-Carlo gradient of EI (differentiating ``y = \mu + Lw`` wrt
      ``points_to_sample``). NOTE(review): no Monte-Carlo arguments are passed in this call; confirm
      against the C++ implementation what exactly is differentiated.

    :param force_monte_carlo: accepted for interface compatibility; this method does not use it
    :type force_monte_carlo: boolean
    :return: gradient returned by C++, wrt the non-fidelity dimensions (presumably of the single
      point in ``points_to_sample`` -- TODO confirm)
    :rtype: array of float64 with shape (1, dim - num_fidelity)

    """
    cpp_gaussian_process = self._gaussian_process._gaussian_process
    raw_gradient = C_GP.compute_grad_posterior_mean(
        cpp_gaussian_process,
        self._num_fidelity,
        cpp_utils.cppify(self._points_to_sample),
    )
    output_shape = (1, self.dim - self._num_fidelity)
    return cpp_utils.uncppify(raw_gradient, output_shape)
def compute_grad_variance_of_points(self, points_to_sample, num_derivatives=-1):
    r"""Compute the gradient of the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``) wrt ``Xs``.

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    This is the counterpart of compute_grad_cholesky_variance_of_points() that leaves out the gradient
    terms from the cholesky factorization. Description will not be duplicated here.

    .. Note:: Comments are copied from
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_grad_variance_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :param num_derivatives: return derivatives wrt points_to_sample[0:num_derivatives]; large or negative values are clamped
    :type num_derivatives: int
    :return: grad_var: gradient of the variance matrix of this GP
    :rtype: array of float64 with shape (num_derivatives, num_to_sample, num_to_sample, dim)

    """
    point_count = points_to_sample.shape[0]
    num_derivatives = self._clamp_num_derivatives(point_count, num_derivatives)

    cpp_points = cpp_utils.cppify(points_to_sample)
    raw_grad_var = self._gaussian_process.compute_grad_variance_of_points(
        cpp_points, point_count, num_derivatives)
    return cpp_utils.uncppify(
        raw_grad_var,
        (num_derivatives, point_count, point_count, self.dim),
    )
def compute_grad_log_likelihood(self):
    r"""Compute the gradient (wrt hyperparameters) of the objective_type measure at the object's current hyperparameters.

    Forwards the sampled points, their values, ``self.hyperparameters`` and the per-point noise
    variance to ``C_GP.compute_hyperparameter_grad_log_likelihood``.

    :return: grad_log_likelihood: i-th entry is ``\pderiv{LL(y | X, \theta)}{\theta_i}``
    :rtype: array of float64 with shape (num_hyperparameters)

    """
    sampled_points = cpp_utils.cppify(self._points_sampled)
    sampled_values = cpp_utils.cppify(self._points_sampled_value)
    raw_gradient = C_GP.compute_hyperparameter_grad_log_likelihood(
        sampled_points,
        sampled_values,
        self.dim,
        self._num_sampled,
        self.objective_type,
        cpp_utils.cppify_hyperparameters(self.hyperparameters),
        cpp_utils.cppify(self._points_sampled_noise_variance),
    )
    return numpy.array(raw_gradient)
def evaluate_log_likelihood_at_hyperparameter_list(
        log_likelihood_evaluator,
        hyperparameters_to_evaluate,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    """Compute the specified log likelihood measure at each input set of hyperparameters.

    Generally Newton or gradient descent is preferred but when they fail to converge this may be the
    only "robust" option. This function is also useful for plotting or debugging purposes (just to get
    a bunch of log likelihood values).

    Calls into evaluate_log_likelihood_at_hyperparameter_list() in cpp/GPP_python_model_selection.cpp.

    :param log_likelihood_evaluator: object specifying which log likelihood measure to evaluate; its
      sampled data, ``hyperparameters`` and per-point noise variance are forwarded to C++
    :type log_likelihood_evaluator: cpp_wrappers.log_likelihood.LogLikelihood
    :param hyperparameters_to_evaluate: the hyperparameters at which to compute the specified log likelihood
    :type hyperparameters_to_evaluate: array of float64 with shape (num_to_eval, log_likelihood_evaluator.num_hyperparameters)
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: log likelihood value at each specified set of hyperparameters
    :rtype: array of float64 with shape (hyperparameters_to_evaluate.shape[0])

    """
    # status must be an initialized dict for the call to C++.
    if status is None:
        status = {}

    num_to_eval = hyperparameters_to_evaluate.shape[0]
    evaluator = log_likelihood_evaluator

    # We could just call log_likelihood_evaluator.compute_log_likelihood() in a loop, but instead we do
    # the looping in C++ where it can be multithreaded.
    raw_values = C_GP.evaluate_log_likelihood_at_hyperparameter_list(
        cpp_utils.cppify(hyperparameters_to_evaluate),
        cpp_utils.cppify(evaluator._points_sampled),
        cpp_utils.cppify(evaluator._points_sampled_value),
        evaluator.dim,
        evaluator._num_sampled,
        evaluator.objective_type,
        cpp_utils.cppify_hyperparameters(evaluator.hyperparameters),
        cpp_utils.cppify(evaluator._points_sampled_noise_variance),
        num_to_eval,
        max_num_threads,
        status,
    )
    return numpy.array(raw_values)
def __init__(self, hyperparameters_list, noise_variance_list, historical_data, derivatives):
    """Construct a GaussianProcess object that knows how to call C++ for evaluation of member functions.

    All four inputs are deep-copied, and a ``C_GP.GaussianProcessMCMC`` is built from the copies.

    :param hyperparameters_list: hyperparameter sets, one per row; the row count is stored as ``_num_mcmc``
    :type hyperparameters_list: array with a ``shape`` attribute (first axis = number of sets)
    :param noise_variance_list: noise variances passed to C++ alongside the hyperparameters
      (presumably one entry per hyperparameter set -- TODO confirm)
    :param historical_data: object specifying the already-sampled points, the objective value at those points,
      and the noise variance associated with each observation
    :type historical_data: :class:`moe.optimal_learning.python.data_containers.HistoricalData` object
    :param derivatives: derivative specification; ``_num_derivatives`` is the length of its cppified form

    """
    self._hyperparameters_list = copy.deepcopy(hyperparameters_list)
    self._num_mcmc = hyperparameters_list.shape[0]
    self._historical_data = copy.deepcopy(historical_data)
    self._noise_variance_list = copy.deepcopy(noise_variance_list)
    self._derivatives = copy.deepcopy(derivatives)
    self._num_derivatives = len(cpp_utils.cppify(self._derivatives))

    history = self._historical_data
    # C++ will maintain its own copy of the contents of hyperparameters and historical_data
    cpp_gp_mcmc = C_GP.GaussianProcessMCMC(
        cpp_utils.cppify(self._hyperparameters_list),
        cpp_utils.cppify(self._noise_variance_list),
        cpp_utils.cppify(history.points_sampled),
        cpp_utils.cppify(history.points_sampled_value),
        cpp_utils.cppify(self._derivatives),
        self._num_mcmc,
        self._num_derivatives,
        history.dim,
        history.num_sampled,
    )
    self._gaussian_process_mcmc = cpp_gp_mcmc
def add_sampled_points(self, sampled_points):
    r"""Append sampled point(s) (point, value, noise) to the GP's prior data.

    The points are first appended to the Python-side ``self._historical_data``. The rows that
    were just appended are then sliced back out and forwarded to the wrapped C++ object, which
    keeps both copies of the data in step. The C++ side is expected to recompute its derived
    quantities so the GP remains consistent.

    :param sampled_points: :class:`moe.optimal_learning.python.SamplePoint` objects to load
      into the GP (containing point, function value, and noise variance)
    :type sampled_points: list of :class:`~moe.optimal_learning.python.SamplePoint` objects
      (or SamplePoint-like iterables); must support ``len()``

    """
    # TODO(GH-159): When C++ can pass back numpy arrays, we can stop keeping a duplicate in self._historical_data.
    # Record where the new rows will start *before* the history grows.
    first_new_index = self.num_sampled
    added_count = len(sampled_points)

    self._historical_data.append_sample_points(sampled_points)

    # Only the freshly appended tail is sent to C++.
    new_points = self._historical_data.points_sampled[first_new_index:, ...]
    new_values = self._historical_data.points_sampled_value[first_new_index:]
    new_noise = self._historical_data.points_sampled_noise_variance[first_new_index:]

    self._gaussian_process.add_sampled_points(
        cpp_utils.cppify(new_points),
        cpp_utils.cppify(new_values),
        cpp_utils.cppify(new_noise),
        added_count,
    )
def compute_grad_expected_improvement(self, force_monte_carlo=False):
    r"""Compute the gradient of expected improvement at ``points_to_sample`` wrt ``points_to_sample``, with ``points_being_sampled`` concurrent samples.

    .. Note:: These comments follow
      :meth:`moe.optimal_learning.python.interfaces.expected_improvement_interface.ExpectedImprovementInterface.compute_grad_expected_improvement`

    ``points_to_sample`` is the "q" and ``points_being_sampled`` is the "p" in q,p-EI.

    In general, the expressions for gradients of EI are complex and difficult to evaluate; hence
    Monte-Carlo simulation is used to approximate them. When faster (e.g., analytic) techniques
    are available, they are preferred.

    The MC computation of grad EI is similar to the computation of EI (described in
    compute_expected_improvement). We differentiate ``y = \mu + Lw`` wrt ``points_to_sample``;
    only terms from the gradient of ``\mu`` and ``L`` contribute. In EI, we computed:
    ``improvement_per_step = max(max(best_so_far - y), 0.0)``
    and noted that only the smallest component of ``y`` may contribute (if it is > 0.0).
    Call this index ``winner``. Thus in computing grad EI, we only add gradient terms
    that are attributable to the ``winner``-th component of ``y``.

    :param force_monte_carlo: whether to force monte carlo evaluation (vs using fast/accurate analytic eval when possible)
    :type force_monte_carlo: boolean
    :return: gradient of EI, ``\pderiv{EI(Xq \cup Xp)}{Xq_{i,d}}`` where ``Xq`` is ``points_to_sample``
      and ``Xp`` is ``points_being_sampled`` (grad EI from sampling ``points_to_sample`` with
      ``points_being_sampled`` concurrent experiments wrt each dimension of the points in ``points_to_sample``)
    :rtype: array of float64 with shape (num_to_sample, dim)

    """
    # The nested attribute is the raw C++ GP held by the Python GP wrapper.
    flat_gradient = C_GP.compute_grad_expected_improvement(
        self._gaussian_process._gaussian_process,
        cpp_utils.cppify(self._points_to_sample),
        cpp_utils.cppify(self._points_being_sampled),
        self.num_to_sample,
        self.num_being_sampled,
        self._num_mc_iterations,
        self._best_so_far,
        force_monte_carlo,
        self._randomness,
    )
    # C++ hands back a flat container; restore the (q, dim) layout.
    gradient_shape = (self.num_to_sample, self.dim)
    return cpp_utils.uncppify(flat_gradient, gradient_shape)
def __init__(self, covariance_function, historical_data):
    """Construct a GaussianProcess object that knows how to call C++ for evaluation of member functions.

    Both arguments are deep-copied and stored on ``self``. The C++ object is built from the
    covariance copy's hyperparameters and from the arrays of the ``historical_data`` argument,
    and keeps its own copy of that content.

    :param covariance_function: covariance object encoding assumptions about the GP's behavior on our data
    :type covariance_function: :class:`moe.optimal_learning.python.interfaces.covariance_interface.CovarianceInterface` subclass
      (e.g., from :mod:`moe.optimal_learning.python.cpp_wrappers.covariance`).
    :param historical_data: object specifying the already-sampled points, the objective value at those points, and the noise variance associated with each observation
    :type historical_data: :class:`moe.optimal_learning.python.data_containers.HistoricalData` object

    """
    self._covariance = copy.deepcopy(covariance_function)
    self._historical_data = copy.deepcopy(historical_data)

    # C++ will maintain its own copy of the contents of hyperparameters and historical_data
    cpp_arguments = [
        cpp_utils.cppify_hyperparameters(self._covariance.hyperparameters),
        cpp_utils.cppify(historical_data.points_sampled),
        cpp_utils.cppify(historical_data.points_sampled_value),
        cpp_utils.cppify(historical_data.points_sampled_noise_variance),
        self._historical_data.dim,
        self._historical_data.num_sampled,
    ]
    self._gaussian_process = C_GP.GaussianProcess(*cpp_arguments)
def compute_cholesky_variance_of_points(self, points_to_sample):
    r"""Compute the cholesky factorization of the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``).

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    The computation is delegated to the wrapped C++ GP; this method only converts the input
    and reshapes the output.

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :return: cholesky factorization of the variance matrix of this GP, lower triangular
    :rtype: array of float64 with shape (num_to_sample, num_to_sample), only lower triangle filled in

    """
    point_count = points_to_sample.shape[0]
    flat_cholesky = self._gaussian_process.compute_cholesky_variance_of_points(
        cpp_utils.cppify(points_to_sample),
        point_count,
    )
    square_shape = (point_count, point_count)
    return cpp_utils.uncppify(flat_cholesky, square_shape)
def compute_mean_of_points(self, points_to_sample):
    r"""Compute the mean of this GP at each point of ``Xs`` (``points_to_sample``).

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    .. Note:: Comments follow
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_mean_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :return: mean: where mean[i] is the mean at points_to_sample[i]
    :rtype: array of float64 with shape (num_to_sample)

    """
    point_count = points_to_sample.shape[0]
    cpp_mean = self._gaussian_process.compute_mean_of_points(
        cpp_utils.cppify(points_to_sample),
        point_count,
    )
    # Wrap the C++ result in a numpy array for the caller.
    return numpy.array(cpp_mean)
def compute_grad_cholesky_variance_of_points(self, points_to_sample, num_derivatives=-1):
    r"""Compute the gradient of the cholesky factorization of the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``) wrt ``Xs``.

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    This function accounts for the effect on the gradient resulting from
    cholesky-factoring the variance matrix. See Smith 1995 for algorithm details.

    Note that ``grad_chol`` is nominally sized:
    ``grad_chol[num_to_sample][num_to_sample][num_to_sample][dim]``.
    Let this be indexed ``grad_chol[k][j][i][d]``, which is read the derivative of ``var[j][i]``
    with respect to ``x_{k,d}`` (x = ``points_to_sample``)

    .. Note:: Comments follow
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_grad_cholesky_variance_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :param num_derivatives: return derivatives wrt points_to_sample[0:num_derivatives]; large or negative values are clamped
      (via ``self._clamp_num_derivatives``)
    :type num_derivatives: int
    :return: grad_chol: gradient of the cholesky factorization of the variance matrix of this GP.
      ``grad_chol[k][j][i][d]`` is actually the gradients of ``var_{j,i}`` with
      respect to ``x_{k,d}``, the d-th dimension of the k-th entry of ``points_to_sample``
    :rtype: array of float64 with shape (num_derivatives, num_to_sample, num_to_sample, dim)

    """
    point_count = points_to_sample.shape[0]
    # Normalise the requested derivative count before it is used for the C++ call and the output shape.
    clamped_derivatives = self._clamp_num_derivatives(point_count, num_derivatives)

    flat_gradient = self._gaussian_process.compute_grad_cholesky_variance_of_points(
        cpp_utils.cppify(points_to_sample),
        point_count,
        clamped_derivatives,
    )
    gradient_shape = (clamped_derivatives, point_count, point_count, self.dim)
    return cpp_utils.uncppify(flat_gradient, gradient_shape)
def compute_variance_of_points(self, points_to_sample):
    r"""Compute the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``).

    ``points_to_sample`` may not contain duplicate points. Violating this results in singular covariance matrices.

    The variance matrix is symmetric although we currently return the full representation.

    .. Note:: Comments follow
      :mod:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_variance_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :return: var_star: variance matrix of this GP
    :rtype: array of float64 with shape (num_to_sample, num_to_sample)

    """
    point_count = points_to_sample.shape[0]
    flat_variance = self._gaussian_process.compute_variance_of_points(
        cpp_utils.cppify(points_to_sample),
        point_count,
    )
    square_shape = (point_count, point_count)
    return cpp_utils.uncppify(flat_variance, square_shape)
def multistart_expected_improvement_optimization(
        ei_optimizer,
        num_multistarts,
        num_to_sample,
        use_gpu=False,
        which_gpu=0,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    """Solve the q,p-EI problem, returning the optimal set of q points to sample CONCURRENTLY in future experiments.

    When ``points_being_sampled.size == 0 && num_to_sample == 1``, this function will use (fast) analytic EI computations.

    .. NOTE:: The following comments follow gpp_math.hpp, ComputeOptimalPointsToSample().
      They are also copied into
      :func:`moe.optimal_learning.python.python_version.expected_improvement.multistart_expected_improvement_optimization`

    This is the primary entry-point for EI optimization in the optimal_learning library. It offers our best shot at
    improving robustness by combining higher accuracy methods like gradient descent with fail-safes like random/grid search.

    Returns the optimal set of q points to sample CONCURRENTLY by solving the q,p-EI problem. That is, we may want to run 4
    experiments at the same time and maximize the EI across all 4 experiments at once while knowing of 2 ongoing experiments
    (4,2-EI). This function handles this use case. Evaluation of q,p-EI (and its gradient) for q > 1 or p > 1 is expensive
    (requires monte-carlo iteration), so this method is usually very expensive.

    Compared to ComputeHeuristicPointsToSample() (``gpp_heuristic_expected_improvement_optimization.hpp``), this function
    makes no external assumptions about the underlying objective function. Instead, it utilizes a feature of the
    GaussianProcess that allows the GP to account for ongoing/incomplete experiments.

    If ``num_to_sample = 1``, this is the same as ComputeOptimalPointsToSampleWithRandomStarts().

    The option of using GPU to compute general q,p-EI via MC simulation is also available. To enable it, make sure you have
    installed GPU components of MOE, otherwise, it will throw Runtime exception.

    :param ei_optimizer: object that optimizes (e.g., gradient descent, newton) EI over a domain
    :type ei_optimizer: cpp_wrappers.optimization.*Optimizer object
    :param num_multistarts: number of times to multistart ``ei_optimizer`` (UNUSED, data is in ei_optimizer.optimizer_parameters)
    :type num_multistarts: int > 0
    :param num_to_sample: how many simultaneous experiments you would like to run (i.e., the q in q,p-EI)
    :type num_to_sample: int >= 1
    :param use_gpu: set to True if user wants to use GPU for MC simulation
    :type use_gpu: bool
    :param which_gpu: GPU device ID
    :type which_gpu: int >= 0
    :param randomness: RNGs used by C++ to generate initial guesses and as the source of normal random numbers when monte-carlo is used;
      when None, a container sized by ``max_num_threads`` is created and seeded here
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: point(s) that maximize the expected improvement (solving the q,p-EI problem)
    :rtype: array of float64 with shape (num_to_sample, ei_optimizer.objective_function.dim)

    """
    # Create enough randomness sources if none are specified.
    if randomness is None:
        randomness = C_GP.RandomnessSourceContainer(max_num_threads)
        # Set seeds based on less repeatable factors (e.g,. time)
        randomness.SetRandomizedUniformGeneratorSeed(0)
        randomness.SetRandomizedNormalRNGSeed(0)

    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    ei_evaluator = ei_optimizer.objective_function
    cpp_call_args = (
        ei_optimizer.optimizer_parameters,
        ei_evaluator._gaussian_process._gaussian_process,
        cpp_utils.cppify(ei_optimizer.domain.domain_bounds),
        cpp_utils.cppify(ei_evaluator._points_being_sampled),
        num_to_sample,
        ei_evaluator.num_being_sampled,
        ei_evaluator._best_so_far,
        ei_evaluator._num_mc_iterations,
        max_num_threads,
        use_gpu,
        which_gpu,
        randomness,
        status,
    )
    flat_best_points = C_GP.multistart_expected_improvement_optimization(*cpp_call_args)

    # reform output to be a list of dim-dimensional points, dim = len(self.domain)
    return cpp_utils.uncppify(flat_best_points, (num_to_sample, ei_evaluator.dim))
def multistart_hyperparameter_optimization(
        log_likelihood_optimizer,
        num_multistarts,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    r"""Select the hyperparameters that maximize the specified log likelihood measure of model fit (over the historical data) within the specified domain.

    .. Note:: The following comments are copied to
      :mod:`moe.optimal_learning.python.python_version.log_likelihood.multistart_hyperparameter_optimization`.

    See :class:`moe.optimal_learning.python.cpp_wrappers.log_likelihood.GaussianProcessLogMarginalLikelihood` and
    :class:`moe.optimal_learning.python.cpp_wrappers.log_likelihood.GaussianProcessLeaveOneOutLogLikelihood`
    for an overview of some example log likelihood-like measures.

    Optimizers are: null ('dumb' search), gradient descent, newton
    Newton is the suggested optimizer.

    'dumb' search means this will just evaluate the objective log likelihood measure at num_multistarts 'points'
    (hyperparameters) in the domain, uniformly sampled using latin hypercube sampling.
    The hyperparameter_optimizer_parameters input specifies the desired optimization technique as well as parameters controlling
    its behavior (see :mod:`moe.optimal_learning.python.cpp_wrappers.optimization`).

    See gpp_python_common.cpp for C++ enum declarations laying out the options for objective and optimizer types.

    Currently, during optimization, we recommend that the coordinates of the initial guesses not differ from the
    coordinates of the optima by more than about 1 order of magnitude. This is a very (VERY!) rough guideline for
    sizing the domain and gd_parameters.num_multistarts; i.e., be wary of sets of initial guesses that cover the space too sparsely.

    Note that C++ requires the domain in LOG-10 SPACE! This wrapper applies ``numpy.log10`` to
    ``log_likelihood_optimizer.domain._domain_bounds`` before making the call.

    Solution is guaranteed to lie within the region specified by "domain"; note that this may not be a
    true optima (i.e., the gradient may be substantially nonzero).

    .. WARNING:: this function fails if NO improvement can be found! In that case,
       the output will always be the first randomly chosen point. status will report failure.

    :param log_likelihood_optimizer: object that optimizes (e.g., gradient descent, newton) log likelihood over a domain
    :type log_likelihood_optimizer: cpp_wrappers.optimization.*Optimizer object
    :param num_multistarts: number of times to multistart ``log_likelihood_optimizer`` (UNUSED, data is in log_likelihood_optimizer.optimizer_parameters)
    :type num_multistarts: int > 0
    :param randomness: RNGs used by C++ to generate initial guesses; when None, a container sized by
      ``max_num_threads`` is created and seeded here
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: hyperparameters that maximize the specified log likelihood measure within the specified domain
    :rtype: array of float64 with shape (log_likelihood_optimizer.objective_function.num_hyperparameters)

    """
    # Create enough randomness sources if none are specified.
    if randomness is None:
        randomness = C_GP.RandomnessSourceContainer(max_num_threads)
        # Set seed based on less repeatable factors (e.g,. time)
        randomness.SetRandomizedUniformGeneratorSeed(0)
        randomness.SetRandomizedNormalRNGSeed(0)

    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    # C++ expects the domain in log10 space and in list form
    log10_bounds = numpy.log10(log_likelihood_optimizer.domain._domain_bounds)

    likelihood = log_likelihood_optimizer.objective_function
    cpp_call_args = (
        log_likelihood_optimizer.optimizer_parameters,
        cpp_utils.cppify(log10_bounds),
        cpp_utils.cppify(likelihood._points_sampled),
        cpp_utils.cppify(likelihood._points_sampled_value),
        likelihood.dim,
        likelihood._num_sampled,
        cpp_utils.cppify_hyperparameters(likelihood.hyperparameters),
        cpp_utils.cppify(likelihood._points_sampled_noise_variance),
        max_num_threads,
        randomness,
        status,
    )
    best_hyperparameters = C_GP.multistart_hyperparameter_optimization(*cpp_call_args)
    return numpy.array(best_hyperparameters)
def multistart_expected_improvement_mcmc_optimization(
        ei_optimizer,
        num_multistarts,
        num_to_sample,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    """Solve the q,p-EI problem over an MCMC Gaussian process, returning the set of q points to sample CONCURRENTLY in future experiments.

    This wrapper forwards the C++ MCMC GP held by ``ei_optimizer.objective_function`` together with
    that objective's ``_best_so_far_list`` to ``C_GP.multistart_expected_improvement_mcmc_optimization``
    and reshapes the result. Unlike :func:`multistart_expected_improvement_optimization`, it takes
    no GPU options.

    .. NOTE:: The following comments follow gpp_math.hpp, ComputeOptimalPointsToSample().
      They are also copied into
      :func:`moe.optimal_learning.python.python_version.expected_improvement.multistart_expected_improvement_optimization`

    This is the primary entry-point for EI optimization in the optimal_learning library. It offers our best shot at
    improving robustness by combining higher accuracy methods like gradient descent with fail-safes like random/grid search.

    Returns the optimal set of q points to sample CONCURRENTLY by solving the q,p-EI problem. That is, we may want to run 4
    experiments at the same time and maximize the EI across all 4 experiments at once while knowing of 2 ongoing experiments
    (4,2-EI). This function handles this use case. Evaluation of q,p-EI (and its gradient) for q > 1 or p > 1 is expensive
    (requires monte-carlo iteration), so this method is usually very expensive.

    Compared to ComputeHeuristicPointsToSample() (``gpp_heuristic_expected_improvement_optimization.hpp``), this function
    makes no external assumptions about the underlying objective function. Instead, it utilizes a feature of the
    GaussianProcess that allows the GP to account for ongoing/incomplete experiments.

    If ``num_to_sample = 1``, this is the same as ComputeOptimalPointsToSampleWithRandomStarts().

    :param ei_optimizer: object that optimizes (e.g., gradient descent, newton) EI over a domain; its
      ``objective_function`` must expose ``_gaussian_process_mcmc`` and ``_best_so_far_list``
    :type ei_optimizer: cpp_wrappers.optimization.*Optimizer object
    :param num_multistarts: number of times to multistart ``ei_optimizer`` (UNUSED, data is in ei_optimizer.optimizer_parameters)
    :type num_multistarts: int > 0
    :param num_to_sample: how many simultaneous experiments you would like to run (i.e., the q in q,p-EI)
    :type num_to_sample: int >= 1
    :param randomness: RNGs used by C++ to generate initial guesses and as the source of normal random numbers when monte-carlo is used;
      when None, a container sized by ``max_num_threads`` is created and seeded here
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: point(s) chosen by the C++ multistart optimizer for the q,p-EI problem
    :rtype: array of float64 with shape (num_to_sample, ei_optimizer.objective_function.dim)

    """
    # Create enough randomness sources if none are specified.
    if randomness is None:
        randomness = C_GP.RandomnessSourceContainer(max_num_threads)
        # Set seeds based on less repeatable factors (e.g,. time)
        randomness.SetRandomizedUniformGeneratorSeed(0)
        randomness.SetRandomizedNormalRNGSeed(0)

    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    ei_evaluator = ei_optimizer.objective_function
    # The best-so-far values are wrapped in a numpy array before conversion for C++.
    best_so_far_values = numpy.array(ei_evaluator._best_so_far_list)
    cpp_call_args = (
        ei_optimizer.optimizer_parameters,
        ei_evaluator._gaussian_process_mcmc._gaussian_process_mcmc,
        cpp_utils.cppify(ei_optimizer.domain.domain_bounds),
        cpp_utils.cppify(ei_evaluator._points_being_sampled),
        num_to_sample,
        ei_evaluator.num_being_sampled,
        cpp_utils.cppify(best_so_far_values),
        ei_evaluator._num_mc_iterations,
        max_num_threads,
        randomness,
        status,
    )
    flat_best_points = C_GP.multistart_expected_improvement_mcmc_optimization(*cpp_call_args)

    # reform output to be a list of dim-dimensional points, dim = len(self.domain)
    return cpp_utils.uncppify(flat_best_points, (num_to_sample, ei_evaluator.dim))
def multistart_hyperparameter_optimization(
        log_likelihood_optimizer,
        num_multistarts,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    r"""Select the hyperparameters that maximize the specified log likelihood measure of model fit (over the historical data) within the specified domain.

    .. Note:: The following comments are copied to
      :mod:`moe.optimal_learning.python.python_version.log_likelihood.multistart_hyperparameter_optimization`.

    See :class:`moe.optimal_learning.python.cpp_wrappers.log_likelihood.GaussianProcessLogMarginalLikelihood` and
    :class:`moe.optimal_learning.python.cpp_wrappers.log_likelihood.GaussianProcessLeaveOneOutLogLikelihood`
    for an overview of some example log likelihood-like measures.

    Optimizers are: null ('dumb' search), gradient descent, newton
    Newton is the suggested optimizer.

    'dumb' search means this will just evaluate the objective log likelihood measure at num_multistarts 'points'
    (hyperparameters) in the domain, uniformly sampled using latin hypercube sampling.
    The hyperparameter_optimizer_parameters input specifies the desired optimization technique as well as parameters controlling
    its behavior (see :mod:`moe.optimal_learning.python.cpp_wrappers.optimization`).

    See gpp_python_common.cpp for C++ enum declarations laying out the options for objective and optimizer types.

    Currently, during optimization, we recommend that the coordinates of the initial guesses not differ from the
    coordinates of the optima by more than about 1 order of magnitude. This is a very (VERY!) rough guideline for
    sizing the domain and gd_parameters.num_multistarts; i.e., be wary of sets of initial guesses that cover the space too sparsely.

    Note that C++ requires the domain in LOG-10 SPACE! The conversion is done in this function:
    ``numpy.log10`` is applied to ``log_likelihood_optimizer.domain._domain_bounds`` before the call.

    Solution is guaranteed to lie within the region specified by "domain"; note that this may not be a
    true optima (i.e., the gradient may be substantially nonzero).

    .. WARNING:: this function fails if NO improvement can be found! In that case,
       the output will always be the first randomly chosen point. status will report failure.

    :param log_likelihood_optimizer: object that optimizes (e.g., gradient descent, newton) log likelihood over a domain
    :type log_likelihood_optimizer: cpp_wrappers.optimization.*Optimizer object
    :param num_multistarts: number of times to multistart ``log_likelihood_optimizer`` (UNUSED, data is in log_likelihood_optimizer.optimizer_parameters)
    :type num_multistarts: int > 0
    :param randomness: RNGs used by C++ to generate initial guesses; when None, a container sized by
      ``max_num_threads`` is created and seeded here
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: hyperparameters that maximize the specified log likelihood measure within the specified domain
    :rtype: array of float64 with shape (log_likelihood_optimizer.objective_function.num_hyperparameters)

    """
    # Create enough randomness sources if none are specified.
    if randomness is None:
        randomness = C_GP.RandomnessSourceContainer(max_num_threads)
        # Set seed based on less repeatable factors (e.g,. time)
        randomness.SetRandomizedUniformGeneratorSeed(0)
        randomness.SetRandomizedNormalRNGSeed(0)

    # status must be an initialized dict for the call to C++.
    if status is None:
        status = {}

    # C++ expects the domain in log10 space and in list form
    bounds_in_log10 = numpy.log10(log_likelihood_optimizer.domain._domain_bounds)
    cpp_domain = cpp_utils.cppify(bounds_in_log10)

    # Historical data and current hyperparameters all come from the likelihood evaluator.
    evaluator = log_likelihood_optimizer.objective_function
    cpp_points = cpp_utils.cppify(evaluator._points_sampled)
    cpp_values = cpp_utils.cppify(evaluator._points_sampled_value)
    problem_dim = evaluator.dim
    sample_count = evaluator._num_sampled
    cpp_hyperparameters = cpp_utils.cppify_hyperparameters(evaluator.hyperparameters)
    cpp_noise = cpp_utils.cppify(evaluator._points_sampled_noise_variance)

    optimum = C_GP.multistart_hyperparameter_optimization(
        log_likelihood_optimizer.optimizer_parameters,
        cpp_domain,
        cpp_points,
        cpp_values,
        problem_dim,
        sample_count,
        cpp_hyperparameters,
        cpp_noise,
        max_num_threads,
        randomness,
        status,
    )
    return numpy.array(optimum)
def _heuristic_expected_improvement_optimization(
        ei_optimizer,
        num_multistarts,
        num_to_sample,
        estimation_policy,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    r"""Heuristically solve the q,0-EI problem (estimating multistart_expected_improvement_optimization()) using 1,0-EI solves.

    Consider this as an alternative when multistart_expected_improvement_optimization() is too expensive. Since this function
    kernalizes 1,0-EI, it always hits the analytic case; hence it is much faster than q,0-EI which requires monte-carlo.
    Users will probably call one of this function's wrappers (e.g., constant_liar_expected_improvement_optimization() or
    kriging_believer_expected_improvement_optimization()) instead of accessing this directly.

    Calls into heuristic_expected_improvement_optimization_wrapper in cpp/GPP_python_expected_improvement.cpp.

    .. NOTE:: The following comments follow gpp_heuristic_expected_improvement_optimization.hpp, ComputeHeuristicPointsToSample().

    It heuristically solves the q,0-EI optimization problem. As a reminder, that problem is finding the set of q points
    that maximizes the Expected Improvement (saved in the output, ``best_points_to_sample``). Solving for q points simultaneously
    usually requires monte-carlo iteration and is expensive. The heuristic here solves q-EI as a sequence of 1-EI problems.
    We solve 1-EI, and then we *ASSUME* an objective function value at the resulting optima. This process is repeated q times.
    It is perhaps more clear in pseudocode::

      points_being_sampled = {}  // This stays empty! We are only working with 1,0-EI solves
      for i = 0:num_to_sample-1 {
        // First, solve the 1,0-EI problem\*
        new_point = ComputeOptimalPointsToSampleWithRandomStarts(gaussian_process, points_being_sampled, other_parameters)
        // *Estimate* the objective function value at new_point
        new_function_value = ESTIMATED_OBJECTIVE_FUNCTION_VALUE(new_point, other_args)
        new_function_value_noise = ESTIMATED_NOISE_VARIANCE(new_point, other_args)
        // Write the estimated objective values to the GP as *truth*
        gaussian_process.AddPoint(new_point, new_function_value, new_function_value_noise)
        optimal_points_to_sample.append(new_point)
      }

    \*Recall: each call to ComputeOptimalPointsToSampleWithRandomStarts() (gpp_math.hpp) kicks off a round of MGD optimization of 1-EI.

    Note that ideally the estimated objective function value (and noise) would be measured from the real-world (e.g.,
    by running an experiment). Then this algorithm would be optimal. However, the estimate probably is not an accurate
    representation of the true objective.

    The estimation is handled through the "estimation_policy" input. Passing a ConstantLiarEstimationPolicy or
    KrigingBelieverEstimationPolicy object to this function will produce the "Constant Liar" and "Kriging Believer"
    heuristics described in Ginsbourger 2008. The interface for estimation_policy is generic so users may specify
    other estimators as well.

    Contrast this approach with ComputeOptimalPointsToSample() (gpp_math.hpp) which solves all outputs of the q,0-EI
    problem simultaneously instead of one point at a time. That method is more accurate (b/c it
    does not attempt to estimate the behavior of the underlying objective function) but much more expensive (because it
    requires monte-carlo iteration).

    If ``num_to_sample = 1``, this is exactly the same as ComputeOptimalPointsToSampleWithRandomStarts(); i.e.,
    both methods solve the 1-EI optimization problem the same way.

    Currently, during optimization, we recommend that the coordinates of the initial guesses not differ from the
    coordinates of the optima by more than about 1 order of magnitude. This is a very (VERY!) rough guideline for
    sizing the domain and num_multistarts; i.e., be wary of sets of initial guesses that cover the space too sparsely.

    Solution is guaranteed to lie within the region specified by "domain"; note that this may not be a
    local optima (i.e., the gradient may be substantially nonzero).

    .. WARNING:: this function fails if any step fails to find improvement! In that case, the return should not be
           read and status will report false.

    :param ei_optimizer: object that optimizes (e.g., gradient descent, newton) EI over a domain
    :type ei_optimizer: cpp_wrappers.optimization.*Optimizer object
    :param num_multistarts: number of times to multistart ``ei_optimizer`` (UNUSED, data is in ei_optimizer.optimizer_parameters)
    :type num_multistarts: int > 0
    :param num_to_sample: how many simultaneous experiments you would like to run (i.e., the q in q,0-EI)
    :type num_to_sample: int >= 1
    :param estimation_policy: the policy to use to produce (heuristic) objective function estimates during q,0-EI optimization
    :type estimation_policy: subclass of ObjectiveEstimationPolicyInterface (C++ pure abstract class)
       e.g., C_GP.KrigingBelieverEstimationPolicy, C_GP.ConstantLiarEstimationPolicy
       See gpp_heuristic_expected_improvement_optimization.hpp
    :param randomness: RNGs used by C++ to generate initial guesses; when None, a container sized by
      ``max_num_threads`` is created and seeded here
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: point(s) that approximately maximize the expected improvement (solving the q,0-EI problem)
    :rtype: array of float64 with shape (num_to_sample, ei_optimizer.objective_function.dim)

    """
    # Create enough randomness sources if none are specified.
    if randomness is None:
        randomness = C_GP.RandomnessSourceContainer(max_num_threads)
        # Set seed based on less repeatable factors (e.g,. time)
        randomness.SetRandomizedUniformGeneratorSeed(0)
        randomness.SetRandomizedNormalRNGSeed(0)

    # status must be an initialized dict for the call to C++.
    status = {} if status is None else status

    ei_evaluator = ei_optimizer.objective_function
    # No points_being_sampled are forwarded: the heuristic works purely with 1,0-EI solves.
    cpp_call_args = (
        ei_optimizer.optimizer_parameters,
        ei_evaluator._gaussian_process._gaussian_process,
        cpp_utils.cppify(ei_optimizer.domain._domain_bounds),
        estimation_policy,
        num_to_sample,
        ei_evaluator._best_so_far,
        max_num_threads,
        randomness,
        status,
    )
    flat_best_points = C_GP.heuristic_expected_improvement_optimization(*cpp_call_args)

    # reform output to be a list of dim-dimensional points, dim = len(self.domain)
    return cpp_utils.uncppify(flat_best_points, (num_to_sample, ei_evaluator.dim))
def _heuristic_expected_improvement_optimization(
        ei_optimizer,
        num_multistarts,
        num_to_sample,
        estimation_policy,
        randomness=None,
        max_num_threads=DEFAULT_MAX_NUM_THREADS,
        status=None,
):
    r"""Approximate the q,0-EI solution by chaining together q analytic 1,0-EI solves.

    This is a cheaper stand-in for multistart_expected_improvement_optimization(): because every
    step is a 1,0-EI problem, the analytic EI formula always applies and no monte-carlo
    integration is needed, which makes it much faster than a true q,0-EI solve.

    Most users should call one of the wrappers (e.g., constant_liar_expected_improvement_optimization()
    or kriging_believer_expected_improvement_optimization()) rather than this function.

    Dispatches to heuristic_expected_improvement_optimization_wrapper in
    cpp/GPP_python_expected_improvement.cpp.

    .. NOTE:: The description below is adapted from gpp_heuristic_expected_improvement_optimization.hpp,
       ComputeHeuristicPointsToSample().

    The q,0-EI problem asks for the set of q points that maximizes the Expected Improvement
    (returned here as ``best_points_to_sample``). Solving for all q points at once usually needs
    monte-carlo iteration and is expensive. The heuristic instead solves 1-EI, *ASSUMES* an
    objective function value at the resulting optimum, and repeats q times. In pseudocode::

      points_being_sampled = {}  // This stays empty! We are only working with 1,0-EI solves
      for i = 0:num_to_sample-1 {
        // First, solve the 1,0-EI problem (see the note on MGD below)
        new_point = ComputeOptimalPointsToSampleWithRandomStarts(gaussian_process, points_being_sampled, other_parameters)
        // *Estimate* the objective function value at new_point
        new_function_value = ESTIMATED_OBJECTIVE_FUNCTION_VALUE(new_point, other_args)
        new_function_value_noise = ESTIMATED_NOISE_VARIANCE(new_point, other_args)
        // Write the estimated objective values to the GP as *truth*
        gaussian_process.AddPoint(new_point, new_function_value, new_function_value_noise)
        optimal_points_to_sample.append(new_point)
      }

    Each call to ComputeOptimalPointsToSampleWithRandomStarts() (gpp_math.hpp) kicks off a round of
    MGD optimization of 1-EI.

    Ideally the objective value (and noise) at each new point would be measured in the real world
    (e.g., by running an experiment), in which case this algorithm would be optimal. The estimate
    is probably not accurately representative of the true objective, however. How the estimate is
    produced is controlled by ``estimation_policy``: a ConstantLiarEstimationPolicy or
    KrigingBelieverEstimationPolicy object yields the "Constant Liar" and "Kriging Believer"
    heuristics described in Ginsbourger 2008. The estimation_policy interface is generic, so other
    estimators may be supplied as well.

    By contrast, ComputeOptimalPointsToSample() (gpp_math.hpp) solves for all outputs of the
    q,0-EI problem simultaneously. That is more accurate (it does not try to estimate the behavior
    of the underlying objective function) but much more expensive (it requires monte-carlo
    iteration).

    When ``num_to_sample = 1`` this is exactly ComputeOptimalPointsToSampleWithRandomStarts();
    both solve the 1-EI optimization problem the same way.

    As a very (VERY!) rough guideline for sizing the domain and num_multistarts: the coordinates
    of the initial guesses should not differ from the coordinates of the optima by more than about
    1 order of magnitude; be wary of initial guesses that cover the space too sparsely.

    The solution is guaranteed to lie within the region specified by "domain"; note that it may
    not be a local optimum (i.e., the gradient may be substantially nonzero).

    .. WARNING:: this function fails if any step fails to find improvement! In that case, the
       return should not be read and status will report false.

    :param ei_optimizer: object that optimizes (e.g., gradient descent, newton) EI over a domain
    :type ei_optimizer: cpp_wrappers.optimization.*Optimizer object
    :param num_multistarts: number of times to multistart ``ei_optimizer``
      (UNUSED, data is in ei_optimizer.optimizer_parameters)
    :type num_multistarts: int > 0
    :param num_to_sample: how many simultaneous experiments you would like to run (i.e., the q in q,0-EI)
    :type num_to_sample: int >= 1
    :param estimation_policy: the policy to use to produce (heuristic) objective function estimates
      during q,0-EI optimization
    :type estimation_policy: subclass of ObjectiveEstimationPolicyInterface (C++ pure abstract class)
      e.g., C_GP.KrigingBelieverEstimationPolicy, C_GP.ConstantLiarEstimationPolicy
      See gpp_heuristic_expected_improvement_optimization.hpp
    :param randomness: RNGs used by C++ to generate initial guesses
    :type randomness: RandomnessSourceContainer (C++ object; e.g., from C_GP.RandomnessSourceContainer())
    :param max_num_threads: maximum number of threads to use, >= 1
    :type max_num_threads: int > 0
    :param status: (output) status messages from C++ (e.g., reporting on optimizer success, etc.)
    :type status: dict
    :return: point(s) that approximately maximize the expected improvement (solving the q,0-EI problem)
    :rtype: array of float64 with shape (num_to_sample, ei_optimizer.objective_function.dim)

    """
    # The C++ layer writes its status messages into this dict, so it must exist up front.
    if status is None:
        status = {}

    # Without caller-supplied RNGs, build one source per thread and seed them
    # from less repeatable factors (e.g., time).
    if randomness is None:
        randomness = C_GP.RandomnessSourceContainer(max_num_threads)
        randomness.SetRandomizedUniformGeneratorSeed(0)
        randomness.SetRandomizedNormalRNGSeed(0)

    # Gather the C++-side inputs; num_multistarts is deliberately not forwarded
    # because that setting already lives in optimizer_parameters.
    optimizer_parameters = ei_optimizer.optimizer_parameters
    cpp_gaussian_process = ei_optimizer.objective_function._gaussian_process._gaussian_process
    cpp_domain_bounds = cpp_utils.cppify(ei_optimizer.domain._domain_bounds)
    best_so_far = ei_optimizer.objective_function._best_so_far

    flat_points = C_GP.heuristic_expected_improvement_optimization(
        optimizer_parameters,
        cpp_gaussian_process,
        cpp_domain_bounds,
        estimation_policy,
        num_to_sample,
        best_so_far,
        max_num_threads,
        randomness,
        status,
    )

    # Reshape the C++ output into num_to_sample points, each of dimension dim.
    output_shape = (num_to_sample, ei_optimizer.objective_function.dim)
    return cpp_utils.uncppify(flat_points, output_shape)