def compute_log_ei(self, x, incumbent):

    # Cavity distribution over the inducing outputs
    Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    KzzInv = T.nlinalg.MatrixInversePSD()(Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    covCavityInv = KzzInv + LLt * \
        casting(self.n_points - self.set_for_training) / \
        casting(self.n_points)
    covCavity = T.nlinalg.MatrixInversePSD()(covCavityInv)
    meanCavity = T.dot(
        covCavity,
        casting(self.n_points - self.set_for_training) /
        casting(self.n_points) * self.mParamPost)
    KzzInvcovCavity = T.dot(KzzInv, covCavity)
    KzzInvmeanCavity = T.dot(KzzInv, meanCavity)

    # Predictive mean and variance at x
    Kxz = compute_kernel(self.lls, self.lsf, x, self.z)
    B = T.dot(KzzInvcovCavity, KzzInv) - KzzInv
    v_out = T.exp(self.lsf) + T.dot(Kxz * T.dot(Kxz, B),
                                    T.ones_like(self.z[:, 0:1]))
    m_out = T.dot(Kxz, KzzInvmeanCavity)

    # Log expected improvement, evaluated in a numerically stable form
    s = (incumbent - m_out) / T.sqrt(v_out)
    log_ei = T.log((incumbent - m_out) * ratio(s) + T.sqrt(v_out)) + \
        log_n_pdf(s)

    return log_ei

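# Hedged reference for the identity the method above relies on (plain
# NumPy/SciPy, illustrative names, not part of the class): with predictive
# mean m, variance v and incumbent eta, the expected improvement is
#     EI = (eta - m) * Phi(s) + sqrt(v) * phi(s),   s = (eta - m) / sqrt(v),
# and factoring out phi(s) gives the form computed above,
#     log EI = log phi(s) + log((eta - m) * Phi(s) / phi(s) + sqrt(v)).
import numpy as np
from scipy.stats import norm


def log_ei_reference(m, v, eta):
    s = (eta - m) / np.sqrt(v)
    direct = np.log((eta - m) * norm.cdf(s) + np.sqrt(v) * norm.pdf(s))
    factored = norm.logpdf(s) + np.log((eta - m) * norm.cdf(s) / norm.pdf(s)
                                       + np.sqrt(v))
    assert np.isclose(direct, factored)
    return factored


log_ei_reference(m=0.3, v=0.25, eta=0.5)
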
def setForTraining(self):
    # We only do something if the node was set for prediction instead of
    # training
    if self.set_for_training == casting(0.0):
        self.set_for_training = casting(1.0)

def getLogNormalizerCavity(self):
    assert self.covCavity is not None and self.meanCavity is not None and \
        self.covCavityInv is not None
    return casting(0.5 * self.n_inducing_points * np.log(2 * np.pi)) + \
        casting(0.5) * T.nlinalg.LogDetPSD()(self.covCavity) + \
        casting(0.5) * T.dot(T.dot(T.transpose(self.meanCavity),
                                   self.covCavityInv), self.meanCavity)

def getLogNormalizerPosterior(self):
    assert self.covPosterior is not None \
        and self.meanPosterior is not None \
        and self.covPosteriorInv is not None
    return casting(0.5 * self.n_inducing_points * np.log(2 * np.pi)) + \
        casting(0.5) * T.nlinalg.LogDetPSD()(self.covPosterior) + \
        casting(0.5) * T.dot(T.dot(T.transpose(self.meanPosterior),
                                   self.covPosteriorInv), self.meanPosterior)

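# Hedged NumPy reference for the quantity computed by the two normalizer
# methods above (illustrative only, not part of the class): for a
# d-dimensional Gaussian with mean m and covariance S, the log-partition
# function of the unnormalized form exp(-0.5 x^T S^-1 x + x^T S^-1 m) is
#     log Z = d/2 * log(2*pi) + 1/2 * log|S| + 1/2 * m^T S^-1 m.
import numpy as np


def gaussian_log_normalizer(m, S):
    d = m.shape[0]
    _, logdet = np.linalg.slogdet(S)
    return 0.5 * d * np.log(2.0 * np.pi) + 0.5 * logdet + \
        0.5 * float(m @ np.linalg.solve(S, m))


gaussian_log_normalizer(np.array([0.5, -1.0]),
                        np.array([[2.0, 0.3], [0.3, 1.0]]))
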
def __init__(self, n_inducing_points, n_points, input_d, input_means,
             input_vars, training_targets):

    self.ignore_variances = True
    self.n_inducing_points = n_inducing_points
    self.n_points = n_points
    self.input_d = input_d
    self.training_targets = training_targets
    self.input_means = input_means
    self.input_vars = input_vars

    # These are the actual parameters of the posterior distribution being
    # optimized:
    # covCavity = (Kzz^-1 + LParamPost LParamPost^T * (n - 1) / n)^-1 and
    # meanCavity = covCavity mParamPost * (n - 1) / n
    initial_value = np.zeros((n_inducing_points, n_inducing_points))
    self.LParamPost = theano.shared(value=initial_value.astype(
        theano.config.floatX), name='LParamPost', borrow=True)
    self.mParamPost = theano.shared(value=initial_value[:, 0:1].astype(
        theano.config.floatX), name='mParamPost', borrow=True)
    self.lls = theano.shared(value=np.zeros(input_d).astype(
        theano.config.floatX), name='lls', borrow=True)
    self.lsf = theano.shared(value=np.zeros(1).astype(
        theano.config.floatX)[0], name='lsf', borrow=True)
    self.z = theano.shared(value=np.zeros(
        (n_inducing_points, input_d)).astype(theano.config.floatX),
        name='z', borrow=True)
    self.lvar_noise = theano.shared(
        value=casting(0) * np.ones(1).astype(theano.config.floatX)[0],
        name='lvar_noise', borrow=True)

    self.set_for_training = casting(1.0)

    # We set the level of jitter to use (added to the diagonal of Kzz)
    self.jitter = casting(1e-3)

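# Note on the parameterization above (a reading of the code, not
# authoritative): with n = n_points and the factor
#     f = (n - set_for_training) / n,
# the cavity used throughout the class is
#     covCavityInv = Kzz^-1 + f * LParamPost LParamPost^T,
#     meanCavity   = covCavity * (f * mParamPost),
# so set_for_training = 1 gives the (n - 1)/n cavity factor used during
# training, while set_for_training = 0 gives f = 1, i.e. the full
# (non-cavity) posterior used at prediction time.
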
def getContributionToEnergy(self):
    assert self.n_points is not None \
        and self.covCavity is not None \
        and self.covPosterior is not None \
        and self.input_means is not None

    logZpost = self.getLogNormalizerPosterior()
    logZprior = self.getLogNormalizerPrior()
    logZcav = self.getLogNormalizerCavity()

    # We multiply by the minibatch size and normalize terms according to
    # the total number of points (n_points)
    return ((logZcav - logZpost) + logZpost / casting(self.n_points) -
            logZprior / casting(self.n_points)) * \
        T.cast(self.input_means.shape[0], 'float32') + T.sum(self.getLogZ())

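# Worked form of the returned expression (a reading of the code, not
# authoritative): with minibatch size B = input_means.shape[0] and total
# dataset size n = n_points, the contribution is
#     B * [(log Z_cav - log Z_post) + (log Z_post - log Z_prior) / n]
#       + sum_i log Z_i,
# where the last sum comes from getLogZ(). Summed over an epoch's
# minibatches, the bracketed global terms receive total weight n and
# (log Z_post - log Z_prior) is counted once, which is the usual structure
# of a stochastic EP-style energy.
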
def compute_log_averaged_ei(self, x, X, randomness, incumbent):

    # We compute the old predictive mean at x
    Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    KzzInv = T.nlinalg.MatrixInversePSD()(Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    covCavityInv = KzzInv + LLt * \
        casting(self.n_points - self.set_for_training) / \
        casting(self.n_points)
    covCavity = T.nlinalg.MatrixInversePSD()(covCavityInv)
    meanCavity = T.dot(
        covCavity,
        casting(self.n_points - self.set_for_training) /
        casting(self.n_points) * self.mParamPost)
    KzzInvmeanCavity = T.dot(KzzInv, meanCavity)
    Kxz = compute_kernel(self.lls, self.lsf, x, self.z)
    m_old_x = T.dot(Kxz, KzzInvmeanCavity)

    # We compute the old predictive mean at X
    KXz = compute_kernel(self.lls, self.lsf, X, self.z)
    m_old_X = T.dot(KXz, KzzInvmeanCavity)

    # We compute the required cross covariance matrices
    KXX = compute_kernel(self.lls, self.lsf, X, X) - \
        T.dot(T.dot(KXz, KzzInv), KXz.T) + \
        T.eye(X.shape[0]) * self.jitter * T.exp(self.lsf)
    KXXInv = T.nlinalg.MatrixInversePSD()(KXX)
    KxX = compute_kernel(self.lls, self.lsf, x, X)
    xX = T.concatenate([x, X], 0)
    KxXz = compute_kernel(self.lls, self.lsf, xX, self.z)
    KxX = KxX - T.dot(T.dot(KxXz[0:x.shape[0], :], KzzInv),
                      KxXz[x.shape[0]:xX.shape[0], :].T)

    # We compute the new posterior mean
    samples_internal = T.dot(MatrixChol()(KXX), randomness)
    new_predictive_mean = T.tile(m_old_x, [1, randomness.shape[1]]) + \
        T.dot(KxX, T.dot(KXXInv, samples_internal))

    # We compute the new posterior variance
    z_expanded = T.concatenate([self.z, X], 0)
    Kxz_expanded = compute_kernel(self.lls, self.lsf, x, z_expanded)
    Kzz_expanded = compute_kernel(
        self.lls, self.lsf, z_expanded, z_expanded) + T.eye(
        z_expanded.shape[0]) * self.jitter * T.exp(self.lsf)
    Kzz_expandedInv = T.nlinalg.MatrixInversePSD()(Kzz_expanded)
    v_out = T.exp(self.lsf) - T.dot(
        Kxz_expanded * T.dot(Kxz_expanded, Kzz_expandedInv),
        T.ones_like(z_expanded[:, 0:1]))
    new_predictive_var = T.tile(v_out, [1, randomness.shape[1]])

    s = (incumbent - new_predictive_mean) / T.sqrt(new_predictive_var)
    log_ei = T.log((incumbent - new_predictive_mean) * ratio(s) +
                   T.sqrt(new_predictive_var)) + log_n_pdf(s)

    return T.mean(LogSumExp(log_ei, 1), 1)

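# Sketch of the sampling step above (a reading of the code, not
# authoritative): with eps = randomness and f_X = chol(K_XX) eps a draw of
# the zero-mean latent deviations at the pending locations X (covariances
# already conditioned on the inducing points z), the sampled predictive mean
# at the candidate x is
#     m_new(x) = m_old(x) + K_xX K_XX^-1 f_X,
# while v_out is the variance of f(x) conditioned on exact values at the
# expanded set [z, X]. Each column of randomness gives one fantasized log EI,
# and the returned value aggregates the per-draw log EI values with a
# LogSumExp over the sample dimension before averaging.
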
def setForPrediction(self):
    # We only do something if the node was set for training instead of
    # prediction
    if self.set_for_training == casting(1.0):
        self.set_for_training = casting(0.0)

def ratio(x):
    # Phi(x) / phi(x), switching to an asymptotic expansion for very
    # negative x where both cdf and pdf underflow
    x = T.switch(
        T.lt(x, casting(-10)),
        -(casting(1.0) / x - casting(1.0) / x**3 +
          casting(3.0) / x**5 - casting(15.0) / x**7),
        n_cdf(x) / n_pdf(x))
    return x

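# Hedged check of the series used above (plain NumPy/SciPy, illustrative
# only): for x -> -inf the ratio Phi(x) / phi(x) admits the asymptotic
# expansion -1/x + 1/x**3 - 3/x**5 + 15/x**7 + O(x**-9), which sidesteps the
# 0/0 that a direct cdf/pdf evaluation approaches far in the left tail.
import numpy as np
from scipy.stats import norm

x = -12.0
series = -(1.0 / x - 1.0 / x**3 + 3.0 / x**5 - 15.0 / x**7)
direct = norm.cdf(x) / norm.pdf(x)
assert np.isclose(series, direct, rtol=1e-5)
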
def log_n_cdf(x):
    x = T.switch(T.lt(x, casting(-10)),
                 log_n_cdf_approx(x),
                 T.log(n_cdf(x)))
    return x

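# Background for the branch above (the exact contents of log_n_cdf_approx are
# not shown here, so this is only the standard justification): for x << 0,
#     log Phi(x) = log phi(x) + log(Phi(x) / phi(x)),
# and Phi(x)/phi(x) can be evaluated with the same asymptotic series as in
# ratio(), which avoids taking the log of an underflowing n_cdf(x).
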
def getLogNormalizerPrior(self):
    assert self.KzzInv is not None
    return casting(0.5 * self.n_inducing_points * np.log(2 * np.pi)) - \
        casting(0.5) * T.nlinalg.LogDetPSD()(self.KzzInv)

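# For the prior the mean term vanishes (zero-mean GP prior on the inducing
# outputs), and -0.5 * logdet(Kzz^-1) equals +0.5 * logdet(Kzz), so this is
#     log Z_prior = d/2 * log(2*pi) + 1/2 * log|Kzz|
# with d = n_inducing_points, matching the general log-normalizer sketched
# after getLogNormalizerPosterior above.
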
def compute_output(self):

    # We compute the output mean
    self.Kzz = compute_kernel(self.lls, self.lsf, self.z, self.z) + \
        T.eye(self.z.shape[0]) * self.jitter * T.exp(self.lsf)
    self.KzzInv = T.nlinalg.MatrixInversePSD()(self.Kzz)
    LLt = T.dot(self.LParamPost, T.transpose(self.LParamPost))
    self.covCavityInv = self.KzzInv + LLt * \
        casting(self.n_points - self.set_for_training) / \
        casting(self.n_points)
    self.covCavity = T.nlinalg.MatrixInversePSD()(self.covCavityInv)
    self.meanCavity = T.dot(
        self.covCavity,
        casting(self.n_points - self.set_for_training) /
        casting(self.n_points) * self.mParamPost)
    self.KzzInvcovCavity = T.dot(self.KzzInv, self.covCavity)
    self.KzzInvmeanCavity = T.dot(self.KzzInv, self.meanCavity)
    self.covPosteriorInv = self.KzzInv + LLt
    self.covPosterior = T.nlinalg.MatrixInversePSD()(self.covPosteriorInv)
    self.meanPosterior = T.dot(self.covPosterior, self.mParamPost)
    self.Kxz = compute_kernel(self.lls, self.lsf, self.input_means, self.z)
    self.B = T.dot(self.KzzInvcovCavity, self.KzzInv) - self.KzzInv
    v_out = T.exp(self.lsf) + T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                                    T.ones_like(self.z[:, 0:1]))

    if self.ignore_variances:
        self.output_means = T.dot(self.Kxz, self.KzzInvmeanCavity)
        self.output_vars = abs(v_out) + casting(0) * T.sum(self.input_vars)
    else:
        self.EKxz = compute_psi1(self.lls, self.lsf, self.input_means,
                                 self.input_vars, self.z)
        self.output_means = T.dot(self.EKxz, self.KzzInvmeanCavity)

        # In other layers we have to compute the expected variance
        self.B2 = T.outer(T.dot(self.KzzInv, self.meanCavity),
                          T.dot(self.KzzInv, self.meanCavity))

        exact_output_vars = True
        if exact_output_vars:
            # We compute the exact output variance
            self.psi2 = compute_psi2(self.lls, self.lsf, self.z,
                                     self.input_means, self.input_vars)
            ll = T.transpose(self.EKxz[:, None, :] * self.EKxz[:, :, None],
                             [1, 2, 0])
            kk = T.transpose(self.Kxz[:, None, :] * self.Kxz[:, :, None],
                             [1, 2, 0])
            v1 = T.transpose(T.sum(T.sum(
                T.shape_padaxis(self.B2, 2) * (self.psi2 - ll), 0),
                0, keepdims=True))
            v2 = T.transpose(T.sum(T.sum(
                T.shape_padaxis(self.B, 2) * (self.psi2 - kk), 0),
                0, keepdims=True))
        else:
            # We compute the approximate output variance using the
            # unscented Kalman filter
            v1 = 0
            v2 = 0
            n = self.input_d
            for j in range(1, n + 1):
                mask = T.zeros_like(self.input_vars)
                mask = T.set_subtensor(mask[:, j - 1], 1)
                inc = mask * T.sqrt(casting(n) * self.input_vars)
                self.kplus = T.sqrt(casting(1.0) / casting(2 * n)) * \
                    compute_kernel(self.lls, self.lsf,
                                   self.input_means + inc, self.z)
                self.kminus = T.sqrt(casting(1.0) / casting(2 * n)) * \
                    compute_kernel(self.lls, self.lsf,
                                   self.input_means - inc, self.z)
                v1 += T.dot(self.kplus * T.dot(self.kplus, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v1 += T.dot(self.kminus * T.dot(self.kminus, self.B2),
                            T.ones_like(self.z[:, 0:1]))
                v2 += T.dot(self.kplus * T.dot(self.kplus, self.B),
                            T.ones_like(self.z[:, 0:1]))
                v2 += T.dot(self.kminus * T.dot(self.kminus, self.B),
                            T.ones_like(self.z[:, 0:1]))
            v1 -= T.dot(self.EKxz * T.dot(self.EKxz, self.B2),
                        T.ones_like(self.z[:, 0:1]))
            v2 -= T.dot(self.Kxz * T.dot(self.Kxz, self.B),
                        T.ones_like(self.z[:, 0:1]))

        self.output_vars = abs(v_out) + abs(v2) + abs(v1)

    self.output_vars = self.output_vars + T.exp(self.lvar_noise)

    return

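# Summary of the deterministic-input (ignore_variances) predictive equations
# above (a reading of the code, not authoritative): with cavity moments
# (mu_cav, Sigma_cav) over the inducing outputs at z,
#     m(x) = k_xz Kzz^-1 mu_cav,
#     v(x) = exp(lsf) + k_xz (Kzz^-1 Sigma_cav Kzz^-1 - Kzz^-1) k_zx,
# where the row-wise quadratic form is evaluated via the elementwise product
# Kxz * (Kxz B) summed over columns, and T.exp(self.lvar_noise) adds the
# learned output noise variance at the end.
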