import numpy as np
import scipy.optimize as op


def train(model, batch_size=100, algorithm="adam", max_iter=1000):
    """Train a george-based Gaussian process model."""

    def callback(p):
        print('{}\t{}'.format(np.exp(p), model.log_evidence(p, n=batch_size)[0]))

    def nll(k):
        ll = model.log_evidence(k, n=batch_size)[0]
        return -ll if np.isfinite(ll) else 1e25

    def grad_nll(k):
        return -model.log_evidence(k, n=batch_size)[1]

    def grad_ll(k):
        return model.log_evidence(k, n=batch_size)[1]

    # Use the current hyperparameter values as the initial point for the optimisation.
    p0 = model.gp.get_parameter_vector()
    model.train()

    if batch_size is not None and algorithm == "adam":
        # Optimise stochastically with the Adam algorithm.
        import climin
        opt = climin.Adam(p0, grad_nll)
        for info in opt:
            if info['n_iter'] % 10 == 0:
                k = model.gp.get_parameter_vector()
                print("{} - {} - {}".format(info['n_iter'],
                                            model.log_evidence(k, n=batch_size)[0],
                                            np.exp(k)))
            if info['n_iter'] > max_iter:
                break
        results = model.gp.get_parameter_vector()
    else:
        # Deterministic optimisation with L-BFGS-B via scipy.optimize.
        results = op.minimize(nll, p0, jac=grad_nll, method="L-BFGS-B",
                              callback=callback)
        model.gp.set_parameter_vector(results.x)

    model.eval()
    return results
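# Usage sketch (hedged): `model` is assumed to be a thin wrapper around a
# george.GP that exposes `gp`, `log_evidence(params, n=...)`, `train()` and
# `eval()`; that wrapper is not defined in this file, so the constructor below
# is purely illustrative.
#
# wrapper = GPModelWrapper(kernel=kernel, x=x_train, y=y_train)  # hypothetical
# results = train(wrapper, batch_size=100, algorithm="adam", max_iter=2000)
# print(np.exp(wrapper.gp.get_parameter_vector()))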
def vem_algorithm(model, vem_iters=None, maxIter_perVEM=None, step_rate=None,
                  verbose=False, optZ=True, verbose_plot=False, non_chained=True):
    if vem_iters is None:
        vem_iters = 5
    if maxIter_perVEM is None:
        maxIter_perVEM = 100

    model['.*.kappa'].fix()  # must always be fixed
    # model.elbo = np.empty((vem_iters, 1))

    if model.batch_size is None:
        for i in range(vem_iters):
            # VARIATIONAL E-STEP
            model['.*.lengthscale'].fix()
            model['.*.variance'].fix()
            model.Z.fix()
            model['.*.W'].fix()
            model.q_u_means.unfix()
            model.q_u_chols.unfix()
            model.optimize(messages=verbose, max_iters=maxIter_perVEM)
            print('iteration (' + str(i + 1) + ') VE step, log_likelihood=' +
                  str(model.log_likelihood().flatten()))

            # VARIATIONAL M-STEP
            model['.*.lengthscale'].unfix()
            model['.*.variance'].unfix()
            if optZ:
                model.Z.unfix()
            if non_chained:
                model['.*.W'].unfix()
            model.q_u_means.fix()
            model.q_u_chols.fix()
            model.optimize(messages=verbose, max_iters=maxIter_perVEM)
            print('iteration (' + str(i + 1) + ') VM step, log_likelihood=' +
                  str(model.log_likelihood().flatten()))
    else:
        if step_rate is None:
            step_rate = 0.01

        # Both the variational E step and the M step run for maxIter_perVEM
        # iterations (100 by default).
        model.elbo = np.empty((2 * maxIter_perVEM * vem_iters + 2, 1))
        model.elbo[0, 0] = model.log_likelihood()
        c_full = partial(model.callback, max_iter=maxIter_perVEM,
                         verbose=verbose, verbose_plot=verbose_plot)

        for i in range(vem_iters):
            # VARIATIONAL E-STEP
            model['.*.lengthscale'].fix()
            model['.*.variance'].fix()
            model.Z.fix()
            model['.*.W'].fix()
            model.q_u_means.unfix()
            model.q_u_chols.unfix()
            optimizer = climin.Adam(model.optimizer_array, model.stochastic_grad,
                                    step_rate=step_rate,
                                    decay_mom1=1 - 0.9, decay_mom2=1 - 0.999)
            model.index_VEM = 2 * i * maxIter_perVEM
            optimizer.minimize_until(c_full)
            # vo.variational_opt_HetMOGP(model=model, max_iters=maxIter_perVEM,
            #                            step_size=step_rate, momentum=0.0,
            #                            prior_lambda=1.0e-1, MC=1)
            print('iteration (' + str(i + 1) + ') VE step, mini-batch log_likelihood=' +
                  str(model.log_likelihood().flatten()))

            # VARIATIONAL M-STEP
            model['.*.lengthscale'].unfix()
            model['.*.variance'].unfix()
            if optZ:
                model.Z.unfix()
            if non_chained:
                model['.*.W'].unfix()
            model.q_u_means.fix()
            model.q_u_chols.fix()
            optimizer = climin.Adam(model.optimizer_array, model.stochastic_grad,
                                    step_rate=step_rate,
                                    decay_mom1=1 - 0.9, decay_mom2=1 - 0.999)
            model.index_VEM = 2 * i * maxIter_perVEM + maxIter_perVEM
            optimizer.minimize_until(c_full)
            # vo.variational_opt_HetMOGP(model=model, max_iters=maxIter_perVEM,
            #                            step_size=step_rate, momentum=0.0,
            #                            prior_lambda=1.0e-1, MC=1)
            print('iteration (' + str(i + 1) + ') VM step, mini-batch log_likelihood=' +
                  str(model.log_likelihood().flatten()))

    return model
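# Usage sketch (hedged): `model` is assumed to be a HetMOGP-style GPy model
# exposing the regex-indexed parameters above plus `batch_size`,
# `optimizer_array`, `stochastic_grad` and a `callback` method; its
# construction is not shown in this file. With `model.batch_size = None` the
# routine alternates full-batch E/M steps via `model.optimize`; with a
# mini-batch size set it runs Adam for `maxIter_perVEM` iterations per step.
#
# model = vem_algorithm(model, vem_iters=5, maxIter_perVEM=100,
#                       step_rate=0.01, verbose=True)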
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=500,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500,
             optimizer='gd', activation=T.tanh):
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, n_hidden), n_hidden, (n_hidden, 10), 10]
    flat, (Weights_1, bias_1, Weights_2, bias_2) = climin.util.empty_with_views(tmpl)

    # Initialise the weights with a uniform distribution, following the tutorial.
    rng = numpy.random.RandomState(1234)
    Weights_1_init = rng.uniform(low=-numpy.sqrt(6. / (28 * 28 + n_hidden)),
                                 high=numpy.sqrt(6. / (28 * 28 + n_hidden)),
                                 size=(28 * 28, n_hidden))
    Weights_2_init = rng.uniform(low=-numpy.sqrt(6. / (n_hidden + 10)),
                                 high=numpy.sqrt(6. / (n_hidden + 10)),
                                 size=(n_hidden, 10))
    bias_1_init = numpy.zeros((n_hidden,), dtype=theano.config.floatX)
    bias_2_init = numpy.zeros((10,), dtype=theano.config.floatX)

    if activation == T.nnet.sigmoid:
        Weights_1_init *= 4
        Weights_2_init *= 4

    def initialize_in_place(array, values):
        for j in range(0, len(values)):
            array[j] = values[j]

    initialize_in_place(Weights_1, Weights_1_init)
    initialize_in_place(Weights_2, Weights_2_init)
    initialize_in_place(bias_1, bias_1_init)
    initialize_in_place(bias_2, bias_2_init)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')

    x = T.matrix('x')
    y = T.ivector('y')

    rng = numpy.random.RandomState(1234)
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10,
                     Weights_1=theano.shared(value=Weights_1, name='W', borrow=True),
                     bias_1=theano.shared(value=bias_1, name='b', borrow=True),
                     Weights_2=theano.shared(value=Weights_2, name='W', borrow=True),
                     bias_2=theano.shared(value=bias_2, name='b', borrow=True),
                     activation=activation)

    # cost with regularisation terms
    regularised_cost = (classifier.negative_log_likelihood(y)
                        + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)
    cost = theano.function(inputs=[x, y], outputs=regularised_cost,
                           allow_input_downcast=True)

    # gradients of the regularised cost with respect to each parameter group
    gradients = theano.function(
        inputs=[x, y],
        outputs=[
            T.grad(regularised_cost, classifier.hiddenLayer.W),
            T.grad(regularised_cost, classifier.hiddenLayer.b),
            T.grad(regularised_cost, classifier.logRegressionLayer.W),
            T.grad(regularised_cost, classifier.logRegressionLayer.b),
        ],
        allow_input_downcast=True)

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W_1, g_b_1, g_W_2, g_b_2 = gradients(inputs, targets)
        return numpy.concatenate([g_W_1.flatten(), g_b_1, g_W_2.flatten(), g_b_2])

    zero_one_loss = theano.function(inputs=[x, y], outputs=classifier.errors(y),
                                    allow_input_downcast=True)

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars, step_rate=learning_rate,
                                  momentum=.95, args=args)
    elif optimizer == 'bfgs':
        print('... using quasi-Newton BFGS')
        opt = cli.Bfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'lbfgs':
        print('... using quasi-Newton L-BFGS')
        opt = cli.Lbfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'nlcg':
        print('... using nonlinear conjugate gradient')
        opt = cli.NonlinearConjugateGradient(flat, loss, d_loss_wrt_pars,
                                             min_grad=1e-03, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9, args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive moment estimation (Adam)')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002, decay=0.99999999,
                       decay_mom1=0.1, decay_mom2=0.001, momentum=0,
                       offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training')

    # early-stopping parameters
    if batch_size is None:
        patience = 250
    else:
        patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []
    epoch = 0
    start_time = timeit.default_timer()

    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches

        if iter % validation_frequency == 0:
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))
            print('epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   validation_loss * 100, iter, patience))

            # if we got the best validation score so far
            if validation_loss < best_validation_loss:
                # increase patience if the improvement is significant enough
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss

                # evaluate the model on the test set
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print('    epoch %i, minibatch %i/%i, test error of best model %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches, test_loss * 100))

        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print('Optimization complete. Best validation score of %f %% with test performance %f %%' %
          (best_validation_loss * 100., test_loss * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)

    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
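# Usage sketch (hedged): this assumes the module-level imports and helpers the
# function relies on (numpy, theano, theano.tensor as T, climin and climin as
# cli, itertools, timeit, os, sys, plus the tutorial's `load_data` and `MLP`
# definitions) and a local copy of 'mnist.pkl.gz'; none of that is shown here.
#
# if __name__ == '__main__':
#     classifier, (train_err, valid_err, test_err) = test_mlp(
#         learning_rate=0.01, n_epochs=500, batch_size=20,
#         n_hidden=500, optimizer='adam')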
def optimise_HetMOGP(model, Xval=None, Yval=None, max_iters=1000, step_rate=0.01,
                     decay_mom1=1 - 0.9, decay_mom2=1 - 0.999, fng=False,
                     q_s_ini=0.0, prior_lamb_or_offset=None):
    if prior_lamb_or_offset is None:
        prior_lamb_or_offset = 1e-8

    global mk_ant, mk_aux, mk, V_i, Vk, Lk, Vki_ant

    def natural_grad_qu(model, n_iter=1, step_size=step_rate, momentum=0.0):
        global mk_ant, mk_aux, mk, V_i, Vk, Lk, Vki_ant
        # Initialise the step sizes.
        beta2_k = step_size  # use step_size*0.1 for the convolutional MOGP
        gamma2_k = momentum
        alpha2_k = step_size
        N_posteriors = model.q_u_means.shape[1]
        if n_iter == 1:
            V_i = choleskies.multiple_dpotri(
                choleskies.flat_to_triang(model.q_u_chols.values)).copy()
            Vk = np.zeros_like(V_i)
            for i in range(N_posteriors):
                Vk[i, :, :] = 0.5 * (model.posteriors[i].covariance.copy() +
                                     model.posteriors[i].covariance.T.copy())
            Lk = np.zeros_like(Vk)
            mk = model.q_u_means.values.copy()
            Vki_ant = V_i.copy()
            mk_aux = mk.copy()

        dL_dm, dL_dV = compute_stoch_grads_for_qu_HetMOGP(model=model)
        mk_ant = mk_aux.copy()
        mk_aux = mk.copy()

        if not model.q_u_means.is_fixed and not model.q_u_chols.is_fixed:
            mk_ant = mk_aux.copy()
            mk_aux = mk.copy()
            for i in range(N_posteriors):
                try:
                    V_i[i, :, :] = V_i[i, :, :] + 2 * beta2_k * dL_dV[i]  # + 1.0e-6*np.eye(*Vk[i,:,:].shape)
                    Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                    Vk[i, :, :] = 0.5 * (np.array(Vk[i, :, :]) + np.array(Vk[i, :, :].T))
                    Lk[i, :, :] = np.linalg.cholesky(Vk[i, :, :])
                    mk[:, i] = mk[:, i] - alpha2_k * np.dot(Vk[i, :, :], dL_dm[i]) \
                        + gamma2_k * np.dot(np.dot(Vk[i, :, :], Vki_ant[i, :, :]),
                                            (mk[:, i] - mk_ant[:, i]))
                except LinAlgError:
                    print("Overflow")
                    Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                    Vk[i, :, :] = 1.0e-1 * np.eye(*Vk[i, :, :].shape)  # nearestPD(Vk[i,:,:])  # + 1.0e-3*np.eye(*Vk[i,:,:].shape)
                    Lk[i, :, :] = linalg.jitchol(Vk[i, :, :])
                    V_i[i, :, :] = np.linalg.inv(Vk[i, :, :])
                    mk[:, i] = mk[:, i] * 0.0
            Vki_ant = V_i.copy()
            model.L_u.setfield(choleskies.triang_to_flat(Lk.copy()), np.float64)
            model.m_u.setfield(mk.copy(), np.float64)

    global ELBO, myTimes, sched, NLPD
    ELBO = []
    NLPD = []
    myTimes = []
    sched = step_rate

    def callhybrid(i):
        global start
        global ELBO, myTimes, sched, NLPD
        if i['n_iter'] > max_iters:
            model.q_u_means.unfix()
            model.q_u_chols.unfix()
            return True
        model.update_model(False)
        model.q_u_means.unfix()
        model.q_u_chols.unfix()
        if fng:
            mom = 0.9
        else:
            mom = 0.0
        natural_grad_qu(model, n_iter=i['n_iter'], step_size=step_rate, momentum=mom)
        model.update_model(True)
        model.q_u_means.fix()
        model.q_u_chols.fix()
        # model.update_model(True)
        ELBO.append(model.log_likelihood())
        myTimes.append(time.time())
        if i['n_iter'] % 50 == 0:
            print(i['n_iter'])
            print(model.log_likelihood())
        if Xval is not None and Yval is not None:
            NLPD.append(model.negative_log_predictive(Xval, Yval, num_samples=1000))
        return False

    model.q_u_means.fix()
    model.q_u_chols.fix()
    if fng is True:
        print('Running Fully NG, check s_ini:', q_s_ini, ' and prior_lamb:', prior_lamb_or_offset)
        opt = climin.VarOpt(model.optimizer_array, model.stochastic_grad,
                            step_rate=step_rate, s_ini=q_s_ini,
                            decay_mom1=decay_mom1, decay_mom2=decay_mom2,
                            prior_lambda=prior_lamb_or_offset)
    else:
        print('Running Hybrid (NG+Adam), check offset:', prior_lamb_or_offset)
        opt = climin.Adam(model.optimizer_array, model.stochastic_grad,
                          step_rate=step_rate, decay_mom1=decay_mom1,
                          decay_mom2=decay_mom2, offset=prior_lamb_or_offset)

    ELBO.append(model.log_likelihood())
    if Xval is not None and Yval is not None:
        NLPD.append(model.negative_log_predictive(Xval, Yval, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    info = opt.minimize_until(callhybrid)
    return np.array(ELBO).flatten(), np.array(NLPD), np.array(myTimes) - start
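# Usage sketch (hedged): this routine relies on module-level imports and
# helpers not shown here (numpy as np, time, GPy's `choleskies` and `linalg`
# utilities, `LinAlgError`, `compute_stoch_grads_for_qu_HetMOGP`, and a climin
# build that provides `VarOpt`). Per the print statements above, `fng=True`
# selects the fully natural-gradient optimiser and `fng=False` the hybrid
# NG + Adam scheme; the call below is illustrative only.
#
# elbo, nlpd, times = optimise_HetMOGP(model, Xval=Xval, Yval=Yval,
#                                      max_iters=1000, step_rate=0.01,
#                                      fng=True, prior_lamb_or_offset=1e-1)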
def sgd_optimization_mnist(learning_rate=0.01, n_epochs=1000,
                           dataset='mnist.pkl.gz', batch_size=600,
                           optimizer='gd'):
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, 10), 10]
    flat, (Weights, bias) = climin.util.empty_with_views(tmpl)
    cli.initialize.randomize_normal(flat, 0, 1)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')

    x = T.matrix('x')
    y = T.ivector('y')

    classifier = LogisticRegression(
        input=x, n_in=28 * 28, n_out=10,
        W=theano.shared(value=Weights, name='W', borrow=True),
        b=theano.shared(value=bias, name='b', borrow=True))

    gradients = theano.function(
        inputs=[x, y],
        outputs=[
            T.grad(classifier.negative_log_likelihood(y), classifier.W),
            T.grad(classifier.negative_log_likelihood(y), classifier.b),
        ],
        allow_input_downcast=True)

    cost = theano.function(inputs=[x, y],
                           outputs=classifier.negative_log_likelihood(y),
                           allow_input_downcast=True)

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W, g_b = gradients(inputs, targets)
        return np.concatenate([g_W.flatten(), g_b])

    zero_one_loss = theano.function(inputs=[x, y], outputs=classifier.errors(y),
                                    allow_input_downcast=True)

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars, step_rate=learning_rate,
                                  momentum=.95, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9, args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive moment estimation (Adam)')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002, decay=0.99999999,
                       decay_mom1=0.1, decay_mom2=0.001, momentum=0,
                       offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training the model')

    # early-stopping parameters
    if batch_size is None:
        patience = 250
    else:
        patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []
    epoch = 0
    start_time = timeit.default_timer()

    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches

        if iter % validation_frequency == 0:
            # compute the zero-one loss on the validation set
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))
            print('epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   validation_loss * 100, iter, patience))

            # if we got the best validation score so far
            if validation_loss < best_validation_loss:
                # increase patience if the improvement is significant enough
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss

                # evaluate the model on the test set
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print('    epoch %i, minibatch %i/%i, test error of best model %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches, test_loss * 100))

        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print('Optimization complete with best validation score of %f %%, with test performance %f %%' %
          (best_validation_loss * 100., test_loss * 100.))
    print('The code ran for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)

    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
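# Usage sketch (hedged): assumes the same module-level setup as the MLP
# example above (numpy as np, theano, theano.tensor as T, climin and climin as
# cli, itertools, timeit, os, sys, the tutorial's `load_data` and
# `LogisticRegression`) and a local 'mnist.pkl.gz'.
#
# if __name__ == '__main__':
#     classifier, losses = sgd_optimization_mnist(batch_size=600,
#                                                 optimizer='rmsprop')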
# model.kern_list[0].fix()
model['.*.kappa'].fix()

# ---------------------------------------------------------------------------
for q in range(Q):
    model['B_q' + str(q) + '.W'] = 0.1 * np.random.randn(len(model['B_q0.W']))[:, None]
    model.kern_list[q].variance.fix()
# ---------------------------------------------------------------------------

print(model['B'])
print('Initial Log Likelihood:\n', model.log_likelihood())

if method == 'adam':
    opt = climin.Adam(model.optimizer_array, model.stochastic_grad,
                      step_rate=0.005, decay_mom1=1 - 0.9, decay_mom2=1 - 0.999)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running Adam...')
    info = opt.minimize_until(callback)
elif method == 'adad':
    opt = climin.Adadelta(model.optimizer_array, model.stochastic_grad,
                          step_rate=0.005, momentum=0.9)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running Adadelta...')
    info = opt.minimize_until(callback)
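# The `callback` handed to minimize_until above is not defined in this
# snippet. A minimal sketch of what it could look like, assuming the module
# already defines `model`, the lists ELBO and myTimes, and a `max_iters`
# bound (mirroring `callhybrid` earlier in this file); climin calls the
# criterion with the info dict and stops once it returns True.
def callback(info):
    ELBO.append(model.log_likelihood())
    myTimes.append(time.time())
    if info['n_iter'] % 50 == 0:
        print(info['n_iter'], model.log_likelihood())
    return info['n_iter'] >= max_iters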
def vem_algorithm(model, vem_iters=None, maxIter_perVEM=None, step_rate=None,
                  verbose=False, optZ=True, verbose_plot=False, non_chained=True):
    if vem_iters is None:
        vem_iters = 5
    if maxIter_perVEM is None:
        maxIter_perVEM = 100

    model['.*.kappa'].fix()  # must always be fixed!

    if model.batch_size is None:
        for i in range(vem_iters):
            ####### VARIATIONAL E-STEP #######
            model['.*.lengthscale'].fix()
            model['.*.variance'].fix()
            model.Z.fix()
            model['.*.W'].fix()
            model.q_u_means.unfix()
            model.q_u_chols.unfix()
            model.optimize(messages=verbose, max_iters=maxIter_perVEM)
            print('iteration (' + str(i + 1) + ') VE step, log_likelihood=' +
                  str(model.log_likelihood().flatten()))

            ####### VARIATIONAL M-STEP #######
            model['.*.lengthscale'].unfix()
            model['.*.variance'].unfix()
            if optZ:
                model.Z.unfix()
            if non_chained:
                model['.*.W'].unfix()
            model.q_u_means.fix()
            model.q_u_chols.fix()
            model.optimize(messages=verbose, max_iters=maxIter_perVEM)
            print('iteration (' + str(i + 1) + ') VM step, log_likelihood=' +
                  str(model.log_likelihood().flatten()))
    else:
        if step_rate is None:
            step_rate = 0.01

        model.elbo = np.empty((maxIter_perVEM * vem_iters + 2, 1))
        model.elbo[0, 0] = model.log_likelihood()
        c_full = partial(model.callback, max_iter=maxIter_perVEM,
                         verbose=verbose, verbose_plot=verbose_plot)

        for i in range(vem_iters):
            ####### VARIATIONAL E-STEP #######
            model['.*.lengthscale'].fix()
            model['.*.variance'].fix()
            model.Z.fix()
            model['.*.W'].fix()
            model.q_u_means.unfix()
            model.q_u_chols.unfix()
            optimizer = climin.Adam(model.optimizer_array, model.stochastic_grad,
                                    step_rate=step_rate,
                                    decay_mom1=1 - 0.9, decay_mom2=1 - 0.999)
            optimizer.minimize_until(c_full)
            print('iteration (' + str(i + 1) + ') VE step, mini-batch log_likelihood=' +
                  str(model.log_likelihood().flatten()))

            ####### VARIATIONAL M-STEP #######
            model['.*.lengthscale'].unfix()
            model['.*.variance'].unfix()
            if optZ:
                model.Z.unfix()
            if non_chained:
                model['.*.W'].unfix()
            model.q_u_means.fix()
            model.q_u_chols.fix()
            optimizer = climin.Adam(model.optimizer_array, model.stochastic_grad,
                                    step_rate=step_rate,
                                    decay_mom1=1 - 0.9, decay_mom2=1 - 0.999)
            optimizer.minimize_until(c_full)
            print('iteration (' + str(i + 1) + ') VM step, mini-batch log_likelihood=' +
                  str(model.log_likelihood().flatten()))

    return model