from functools import partial

import climin
import numpy as np


def vem_algorithm(model, stochastic=False, vem_iters=None, step_rate=None,
                  verbose=False, optZ=True, verbose_plot=False, non_chained=True):
    if vem_iters is None:
        vem_iters = 5

    model['.*.kappa'].fix()  # must always stay fixed
    model.elbo = np.empty((vem_iters, 1))

    if stochastic is False:
        for i in range(vem_iters):
            # VARIATIONAL E-STEP: optimize the variational posterior q(u)
            # with hyperparameters and inducing inputs held fixed.
            model['.*.lengthscale'].fix()
            model['.*.variance'].fix()
            model.Z.fix()
            model['.*.W'].fix()
            model.q_u_means.unfix()
            model.q_u_chols.unfix()
            model.optimize(messages=verbose, max_iters=100)
            print('iteration (' + str(i + 1) + ') VE step, log_likelihood='
                  + str(model.log_likelihood().flatten()))

            # VARIATIONAL M-STEP: optimize hyperparameters (and optionally Z
            # and the coregionalization weights W) with q(u) held fixed.
            model['.*.lengthscale'].unfix()
            model['.*.variance'].unfix()
            if optZ:
                model.Z.unfix()
            if non_chained:
                model['.*.W'].unfix()
            model.q_u_means.fix()
            model.q_u_chols.fix()
            model.optimize(messages=verbose, max_iters=100)
            print('iteration (' + str(i + 1) + ') VM step, log_likelihood='
                  + str(model.log_likelihood().flatten()))
    else:
        if step_rate is None:
            step_rate = 0.01
        sto_iters = vem_iters
        model.elbo = np.empty((sto_iters + 1, 1))
        optimizer = climin.Adadelta(model.optimizer_array, model.stochastic_grad,
                                    step_rate=step_rate, momentum=0.9)
        c_full = partial(model.callback, max_iter=sto_iters,
                         verbose=verbose, verbose_plot=verbose_plot)
        optimizer.minimize_until(c_full)

    return model
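# Every snippet in this section hands climin an array (model.optimizer_array,
# flat, ...) that the optimizer updates in place while drawing gradients from a
# callable. A minimal self-contained sketch of that protocol, minimizing
# f(x) = ||x||^2 with Adadelta (illustrative only; any climin optimizer works
# the same way):
import climin
import numpy as np

x = np.random.randn(5)  # parameters; climin mutates this array in place


def grad(wrt):  # gradient of f(x) = ||x||^2
    return 2 * wrt


opt = climin.Adadelta(x, grad, step_rate=0.2, momentum=0.9)
for info in opt:  # each optimizer step yields an info dict
    if info['n_iter'] >= 100:
        break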
def SVGP(X, Y):
    if Y.ndim != 2:
        Y = Y[:, None]
    Z = np.random.rand(20, 1)
    batchsize = 20
    m = GPy.core.SVGP(X, Y, Z, GPy.kern.Matern52(1), GPy.likelihoods.Gaussian(),
                      batchsize=batchsize)
    # m.kern.white.variance = 1e-5
    # m.kern.white.fix()
    opt = climin.Adadelta(m.optimizer_array, m.stochastic_grad,
                          step_rate=0.2, momentum=0.9)

    def callback(i):
        print(m.log_likelihood(), end='\r')
        # stop after 5000 iterations
        if i['n_iter'] > 5000:
            return True
        return False

    info = opt.minimize_until(callback)
    print(info)
    return m
def _prepare_adadelta(self, x, fp):
    # Forward all user-supplied kwargs to climin.Adadelta except those that
    # only control the outer optimization loop.
    exclude = ['verbosity', 'min_grad_ratio', 'max_it', 'permitted_drops', 'callback']
    ada_kwargs = {k: v for k, v in self.kwargs.items() if k not in exclude}
    return climin.Adadelta(x, fp, **ada_kwargs)
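# Hedged sketch of how _prepare_adadelta filters kwargs: loop-control options
# are stripped, everything else reaches climin.Adadelta. Calling the method
# unbound on a bare namespace is purely illustrative.
import numpy as np
from types import SimpleNamespace

holder = SimpleNamespace(kwargs={'step_rate': 0.5, 'momentum': 0.9,
                                 'max_it': 100, 'verbosity': 1})
opt = _prepare_adadelta(holder, np.zeros(3), lambda x: 2 * x)  # max_it, verbosity filtered out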
def trainGP(all_training_set, all_training_label, all_testing_set, all_testing_label):
    t = Text(align='right')
    display(t)
    batchsize = 10
    Z = np.random.rand(20, 72)
    all_training_label = np.vstack(all_training_label)
    m = GPy.core.SVGP(all_training_set, all_training_label, Z,
                      GPy.kern.RBF(72) + GPy.kern.White(72),
                      GPy.likelihoods.Gaussian(), batchsize=batchsize)
    m.kern.white.variance = 1e-5
    m.kern.white.fix()
    opt = climin.Adadelta(m.optimizer_array, m.stochastic_grad,
                          step_rate=0.2, momentum=0.9)

    def callback(i):
        t.value = str(m.log_likelihood())
        # stop after 100000 iterations
        if i['n_iter'] > 100000:
            return True
        return False

    info = opt.minimize_until(callback)

    all_answers = m.predict(all_testing_set)
    answer_shape = np.shape(all_answers)
    percent_right = np.zeros(answer_shape[1])
    for i in range(answer_shape[1]):
        # threshold the predictive mean at 0.5
        if all_answers[0][i] > 0.5:
            percent_right[i] = 1
        else:
            percent_right[i] = 0
    final_percent = np.sum(abs(all_testing_label - percent_right)) / answer_shape[1]
    print('classifier got', (1 - final_percent) * 100, '% correct on the test set')

    print("Finished training, saving...")
    # 1: saving a model:
    np.save('model_save.npy', m.param_array)
    # 2: loading a model (see the sketch after this function):
    # m_load = GPy.models.GPRegression(X, Y, initialize=False)
    # m_load.update_model(False)     # do not call the underlying expensive algebra on load
    # m_load.initialize_parameter()  # initialize (connect up) the parameters
    # m_load[:] = np.load('model_save.npy')  # load the parameters
    # m_load.update_model(True)      # call the algebra only once
    # print(m_load)
    model_path = "trained_model"
    np.save(model_path + "_" + datetime.datetime.now().strftime("%m_%d_%y_%H%M") + ".npy",
            m.param_array)
    return m
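# The commented-out load pattern above, written out as a hedged, self-contained
# round trip. GPRegression and the toy data are stand-ins: the model class and
# data must match whatever was used at save time (here that would be the SVGP
# trained in trainGP).
import numpy as np
import GPy

X = np.random.randn(50, 1)
Y = np.sin(X) + 0.1 * np.random.randn(50, 1)

m_save = GPy.models.GPRegression(X, Y)
np.save('model_save.npy', m_save.param_array)

m_load = GPy.models.GPRegression(X, Y, initialize=False)
m_load.update_model(False)     # do not run the expensive algebra while loading
m_load.initialize_parameter()  # connect the parameters up
m_load[:] = np.load('model_save.npy')
m_load.update_model(True)      # run the algebra only once, after loading
print(m_load)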
def inference(self, X, Y, numZ, num_local_Z, num_cluster, batchsize=1000,
              upperbound=-1, lowerbound_ratio=-1, optimizer=1, num_iters=1000):
    [Ntrain, d] = X.shape
    Yi = (Y == 1).astype(int)
    pu = np.random.permutation(Ntrain)
    Z = X[pu[range(numZ)], :]  # numZ inducing inputs drawn at random from X

    lik = GPy.likelihoods.Bernoulli()
    k = GPy.kern.RBF(d, lengthscale=5., ARD=True) + GPy.kern.White(1, 1e-6)
    m = SMGP(X, Yi, Z, likelihood=lik, kernel=k, batchsize=batchsize,
             num_cluster=num_cluster, num_local_Z=num_local_Z,
             upperbound=upperbound, lowerbound_ratio=lowerbound_ratio)
    m.kern.white.variance = 1e-5
    m.kern.white.fix()

    from ipywidgets import Text
    from IPython.display import display
    t = Text(align='right')
    display(t)
    m.iter_no = 0

    def callback_adadelta(i):
        t.value = str(m.log_likelihood())
        print(i['n_iter'])
        if i['n_iter'] > num_iters:
            return True
        return False

    def callback_lbfgsb(i):
        m.iter_no = m.iter_no + 1
        t.value = str(m.log_likelihood())
        print(m.iter_no)

    if optimizer == 1:  # Adadelta
        opt = climin.Adadelta(m.optimizer_array, m.stochastic_grad,
                              step_rate=0.2, momentum=0.9)
        info = opt.minimize_until(callback_adadelta)
    elif optimizer == 2:  # L-BFGS-B
        x, f, d = optimize.fmin_l_bfgs_b(m._objective, m.optimizer_array,
                                         fprime=m.stochastic_grad, maxfun=1000,
                                         callback=callback_lbfgsb)
    else:
        print('optimizer not supported')
    return m
def minimize(self, fun, x_0, bounds=None):
    """Minimize with climin's Adadelta. Does not take bounds into account."""
    x = np.copy(x_0).reshape(-1)
    opt = climin.Adadelta(wrt=x, fprime=fun, step_rate=self.step_rate,
                          momentum=self.momentum, decay=self.decay,
                          offset=self.offset)
    x_list = [x.copy()]
    time_list = [0.]
    start = time.time()
    for info in opt:
        i = info['n_iter']
        if i > self.maxiter:
            break
        if self.disp and not (i % self.print_freq):
            grad = info['gradient']
            print('Epoch', int(i / self.iter_per_epoch), ':')
            print('\tx', x.reshape(-1)[:5])
            print("\tGradient norm", np.linalg.norm(grad))
        if not i % int(self.iter_per_epoch):
            # snapshot the trajectory once per epoch
            x_list.append(x.copy())
            time_list.append(time.time() - start)
    stat_dict = {
        'time_lst': time_list,
        'x_lst': x_list,
        'fun': None,
        'time': time_list[-1],
        'info': info,
    }
    return x.copy(), stat_dict
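# Hedged usage sketch for minimize: it only reads a handful of attributes from
# self, so a bare namespace is enough to exercise it on a toy quadratic here.
# SimpleNamespace and all attribute values below are illustrative assumptions.
import numpy as np
from types import SimpleNamespace

cfg = SimpleNamespace(step_rate=1.0, momentum=0.9, decay=0.9, offset=1e-4,
                      maxiter=200, disp=False, print_freq=50, iter_per_epoch=10)
x_opt, stats = minimize(cfg, lambda x: 2 * x, np.ones(5))  # fun is the gradient of ||x||^2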
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=500,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500,
             optimizer='gd', activation=T.tanh):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, n_hidden), n_hidden, (n_hidden, 10), 10]
    flat, (Weights_1, bias_1, Weights_2, bias_2) = climin.util.empty_with_views(tmpl)

    # initialize weights with a uniform distribution, as in the tutorial
    rng = numpy.random.RandomState(1234)
    Weights_1_init = rng.uniform(low=-numpy.sqrt(6. / (28 * 28 + n_hidden)),
                                 high=numpy.sqrt(6. / (28 * 28 + n_hidden)),
                                 size=(28 * 28, n_hidden))
    Weights_2_init = rng.uniform(low=-numpy.sqrt(6. / (n_hidden + 10)),
                                 high=numpy.sqrt(6. / (n_hidden + 10)),
                                 size=(n_hidden, 10))
    bias_1_init = numpy.zeros((n_hidden,), dtype=theano.config.floatX)
    bias_2_init = numpy.zeros((10,), dtype=theano.config.floatX)
    if activation == T.nnet.sigmoid:
        Weights_1_init *= 4
        Weights_2_init *= 4

    def initialize_in_place(array, values):
        for j in range(0, len(values)):
            array[j] = values[j]

    initialize_in_place(Weights_1, Weights_1_init)
    initialize_in_place(Weights_2, Weights_2_init)
    initialize_in_place(bias_1, bias_1_init)
    initialize_in_place(bias_2, bias_2_init)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')
    x = T.matrix('x')
    y = T.ivector('y')
    rng = numpy.random.RandomState(1234)
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10,
                     Weights_1=theano.shared(value=Weights_1, name='W', borrow=True),
                     bias_1=theano.shared(value=bias_1, name='b', borrow=True),
                     Weights_2=theano.shared(value=Weights_2, name='W', borrow=True),
                     bias_2=theano.shared(value=bias_2, name='b', borrow=True),
                     activation=activation)

    # cost with regularization terms; the symbolic expression is reused below
    regularized_cost = (classifier.negative_log_likelihood(y)
                        + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)
    cost = theano.function(inputs=[x, y], outputs=regularized_cost,
                           allow_input_downcast=True)

    # gradients of the regularized cost w.r.t. all four parameter blocks
    gradients = theano.function(
        inputs=[x, y],
        outputs=[
            T.grad(regularized_cost, classifier.hiddenLayer.W),
            T.grad(regularized_cost, classifier.hiddenLayer.b),
            T.grad(regularized_cost, classifier.logRegressionLayer.W),
            T.grad(regularized_cost, classifier.logRegressionLayer.b),
        ],
        allow_input_downcast=True)

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W_1, g_b_1, g_W_2, g_b_2 = gradients(inputs, targets)
        return numpy.concatenate([g_W_1.flatten(), g_b_1, g_W_2.flatten(), g_b_2])

    zero_one_loss = theano.function(inputs=[x, y], outputs=classifier.errors(y),
                                    allow_input_downcast=True)

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars, step_rate=learning_rate,
                                  momentum=.95, args=args)
    elif optimizer == 'bfgs':
        print('... using quasi-Newton BFGS')
        opt = cli.Bfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'lbfgs':
        print('... using quasi-Newton L-BFGS')
        opt = cli.Lbfgs(flat, loss, d_loss_wrt_pars, args=args)
    elif optimizer == 'nlcg':
        print('... using nonlinear conjugate gradient')
        opt = cli.NonlinearConjugateGradient(flat, loss, d_loss_wrt_pars,
                                             min_grad=1e-03, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9, args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive moment estimation (Adam)')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002, decay=0.99999999,
                       decay_mom1=0.1, decay_mom2=0.001, momentum=0,
                       offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training')
    # early-stopping parameters
    if batch_size is None:
        patience = 250
    else:
        patience = 10000  # look at this many samples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []
    epoch = 0
    start_time = timeit.default_timer()

    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches
        if iter % validation_frequency == 0:
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))
            print('epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i'
                  % (epoch, minibatch_index + 1, n_train_batches,
                     validation_loss * 100, iter, patience))
            # if we got the best validation score so far
            if validation_loss < best_validation_loss:
                # increase patience if the improvement is large enough
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss
                # evaluate the best model on the test set
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print('     epoch %i, minibatch %i/%i, test error of best model %f %%'
                      % (epoch, minibatch_index + 1, n_train_batches, test_loss * 100))
        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print('Optimization complete. Best validation score of %f %% with test performance %f %%'
          % (best_validation_loss * 100., test_loss * 100.))
    print(('The code for file ' + os.path.split(__file__)[1]
           + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
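# The args protocol used by the MNIST scripts in this section, in isolation:
# climin draws (args, kwargs) pairs from the iterator and calls
# fprime(wrt, *args, **kwargs) once per step. X and y below are hypothetical
# stand-ins.
import numpy as np
import climin.util

X = np.random.randn(100, 4)
y = np.random.randn(100)
batches = climin.util.iter_minibatches([X, y], 20, [0, 0])  # slice both along axis 0
args = ((batch, {}) for batch in batches)  # each step becomes fprime(wrt, X_batch, y_batch)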
# plt.savefig("gaussian_1000obs.pdf", bbox_inches='tight', transparent=True, pad_inches=0)

Z_init = domain[0] + np.random.rand(20, 1) * domain[1]
mf = gen_mf(0)
gsvgp = GPy.core.SVGP(X=x_init, Y=y_init, Z=Z_init, kernel=k3, likelihood=lik,
                      Y_metadata={'trials': np.ones_like(y_init) * NB_SHOTS},
                      mean_function=mf, batchsize=15)

import climin
opt = climin.Adadelta(gsvgp.optimizer_array, gsvgp.stochastic_grad,
                      step_rate=0.2, momentum=0.9)


def callback(i):
    print(str(gsvgp.log_likelihood()))
    # stop after 5000 iterations
    if i['n_iter'] > 5000:
        return True
    return False


info = opt.minimize_until(callback)

### ============================================================ ###
# 3.a Super restricted range, no MF
### ============================================================ ###
def sgd_optimization_mnist(learning_rate=0.01, n_epochs=1000, dataset='mnist.pkl.gz',
                           batch_size=600, optimizer='gd'):
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, 10), 10]
    flat, (Weights, bias) = climin.util.empty_with_views(tmpl)
    cli.initialize.randomize_normal(flat, 0, 1)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')
    x = T.matrix('x')
    y = T.ivector('y')
    classifier = LogisticRegression(
        input=x, n_in=28 * 28, n_out=10,
        W=theano.shared(value=Weights, name='W', borrow=True),
        b=theano.shared(value=bias, name='b', borrow=True))

    gradients = theano.function(
        inputs=[x, y],
        outputs=[T.grad(classifier.negative_log_likelihood(y), classifier.W),
                 T.grad(classifier.negative_log_likelihood(y), classifier.b)],
        allow_input_downcast=True)

    cost = theano.function(inputs=[x, y],
                           outputs=classifier.negative_log_likelihood(y),
                           allow_input_downcast=True)

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W, g_b = gradients(inputs, targets)
        return np.concatenate([g_W.flatten(), g_b])

    zero_one_loss = theano.function(inputs=[x, y], outputs=classifier.errors(y),
                                    allow_input_downcast=True)

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars, step_rate=learning_rate,
                                  momentum=.95, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9, args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive moment estimation (Adam)')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002, decay=0.99999999,
                       decay_mom1=0.1, decay_mom2=0.001, momentum=0,
                       offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training the model')
    # early-stopping parameters
    if batch_size is None:
        patience = 250
    else:
        patience = 5000  # look at this many samples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []
    epoch = 0
    start_time = timeit.default_timer()

    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches
        if iter % validation_frequency == 0:
            # compute zero-one loss on the validation set
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))
            print('epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i'
                  % (epoch, minibatch_index + 1, n_train_batches,
                     validation_loss * 100, iter, patience))
            # if we got the best validation score so far
            if validation_loss < best_validation_loss:
                # increase patience if the improvement is large enough
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)
                best_validation_loss = validation_loss
                # test it on the test set
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print('     epoch %i, minibatch %i/%i, test error of best model %f %%'
                      % (epoch, minibatch_index + 1, n_train_batches, test_loss * 100))
        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print('Optimization complete with best validation score of %f %%, with test performance %f %%'
          % (best_validation_loss * 100., test_loss * 100.))
    print('The code ran for %d epochs, with %f epochs/sec'
          % (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' + os.path.split(__file__)[1]
           + ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)
    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
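# Worked example of the patience schedule above: with patience = 5000 and
# patience_increase = 2, a sufficiently large improvement found at iter = 4000
# raises patience to max(5000, 4000 * 2) = 8000, so training continues well
# past the initial budget; without further improvements it stops once
# patience <= iter.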
model.kern_list[q].variance.fix()

# ------------------------------------------------------------------
print(model['B'])
print('Initial Log Likelihood:\n', model.log_likelihood())

if method == 'adam':
    opt = climin.Adam(model.optimizer_array, model.stochastic_grad,
                      step_rate=0.005, decay_mom1=1 - 0.9, decay_mom2=1 - 0.999)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running Adam...')
    info = opt.minimize_until(callback)
elif method == 'adad':
    opt = climin.Adadelta(model.optimizer_array, model.stochastic_grad,
                          step_rate=0.005, momentum=0.9)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running Adadelta...')
    info = opt.minimize_until(callback)
elif method == 'vo':
    model.Gauss_Newton = False
    opt = climin.VarOpt(model.optimizer_array, model.stochastic_grad,
                        step_rate=0.005, s_ini=q_s_ini,
                        decay_mom1=1 - 0.9, decay_mom2=1 - 0.999,
                        prior_lambda=prior_lamb)
    ELBO.append(model.log_likelihood())
    # NLPD.append(model.negative_log_predictive(Xtest, Ytest, num_samples=1000))
    start = time.time()
    myTimes.append(start)
    print('Running Variational Opt...')
    info = opt.minimize_until(callback)
def climin_wrapper(oracle, w0, train_points, train_targets, options, method='AdaDelta'):
    default_options = {'maxiter': 1000, 'print_freq': 1, 'verbose': False,
                       'g_tol': 1e-5, 'batch_size': 10, 'step_rate': 0.1}
    if options is not None:
        default_options.update(options)
        if 'print_freq' in options.keys():
            default_options['verbose'] = True
    options = default_options

    w = w0.copy()
    data = ((i, {}) for i in iter_minibatches([train_points, train_targets],
                                              options['batch_size'], [1, 0]))

    if method == 'AdaDelta':
        opt = climin.Adadelta(wrt=w, fprime=oracle, args=data,
                              step_rate=options['step_rate'])
    elif method == 'SG':
        opt = climin.GradientDescent(wrt=w, fprime=oracle, args=data,
                                     step_rate=options['step_rate'])
    else:
        raise ValueError('Unknown optimizer')

    w_lst = [w.copy()]
    time_lst = [0.]
    start = time.time()
    n_epochs = options['maxiter']
    n_iterations = int(n_epochs * train_targets.size / options['batch_size'])
    print_freq = int(options['print_freq'] * train_targets.size / options['batch_size'])

    if options['verbose']:
        print('Using ' + method + ' optimizer')

    for info in opt:
        i = info['n_iter']
        if i > n_iterations:
            break
        if not (i % print_freq) and options['verbose']:
            grad = info['gradient']
            print("Iteration ", int(i * options['batch_size'] / train_targets.size), ":")
            print("\tGradient norm", np.linalg.norm(grad))
        if not i % int(train_targets.size / options['batch_size']):
            # snapshot once per epoch
            w_lst.append(w.copy())
            time_lst.append(time.time() - start)

    return w.copy(), w_lst, time_lst
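# Hedged usage sketch for climin_wrapper. The oracle receives (w, points,
# targets) because iter_minibatches yields [points_batch, targets_batch] pairs
# that climin splats into fprime; dims [1, 0] above slice the points along
# axis 1, so this toy least-squares problem stores examples column-wise. All
# names and shapes here are assumptions.
import numpy as np

X = np.random.randn(3, 200)  # 3 features, 200 examples (column-wise)
y = X.T @ np.array([1.0, -2.0, 0.5]) + 0.1 * np.random.randn(200)


def oracle(w, points, targets):
    # gradient of the mean squared error 0.5 * ||points.T @ w - targets||^2 / n
    return points @ (points.T @ w - targets) / targets.size


w_opt, w_lst, time_lst = climin_wrapper(oracle, np.zeros(3), X, y,
                                        options={'maxiter': 20}, method='AdaDelta')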
def inference_time(self, X, Y, Xt, Yt, numZ, num_local_Z, num_cluster,
                   batchsize=1000, upperbound=-1, lowerbound_ratio=-1,
                   optimizer=1, num_iters=1000):
    [Ntrain, d] = X.shape
    ac_array = []
    decv_array = []
    duration_array = []
    self.start_time = time.time()
    self.total_pred_time = 0
    Yi = (Y == 1).astype(int)
    pu = np.random.permutation(Ntrain)
    Z = X[pu[range(numZ)], :]

    lik = GPy.likelihoods.Bernoulli()
    k = GPy.kern.RBF(d, lengthscale=5., ARD=True) + GPy.kern.White(1, 1e-6)
    m = SMGP(X, Yi, Z, likelihood=lik, kernel=k, batchsize=batchsize,
             num_cluster=num_cluster, num_local_Z=num_local_Z,
             upperbound=upperbound, lowerbound_ratio=lowerbound_ratio)
    m.kern.white.variance = 1e-5
    m.kern.white.fix()

    from ipywidgets import Text
    from IPython.display import display
    t = Text(align='right')
    display(t)
    m.iter_no = 0

    def callback_adadelta(i):
        t.value = str(m.log_likelihood())
        print(i['n_iter'])
        if i['n_iter'] % 10 == 0:
            # evaluate on the test set every 10 iterations, excluding the
            # prediction time from the reported training durations
            start_pred = time.time()
            ac, pred, decv, test_error, Yt_m, Yt_v = self.prediction1(m, Xt, Yt)
            ac_array.append(ac)
            decv_array.append(decv)
            pred_dur = time.time() - start_pred
            self.total_pred_time = self.total_pred_time + pred_dur
            duration_array.append(time.time() - self.start_time - self.total_pred_time)
        if i['n_iter'] > num_iters:
            return True
        return False

    def callback_lbfgsb(i):
        m.iter_no = m.iter_no + 1
        t.value = str(m.log_likelihood())
        print(m.iter_no)
        if m.iter_no % 10 == 0:
            start_pred = time.time()
            ac, pred, decv, test_error, Yt_m, Yt_v = self.prediction1(m, Xt, Yt)
            ac_array.append(ac)
            decv_array.append(decv)
            pred_dur = time.time() - start_pred
            self.total_pred_time = self.total_pred_time + pred_dur
            duration_array.append(time.time() - self.start_time - self.total_pred_time)

    if optimizer == 1:  # Adadelta
        opt = climin.Adadelta(m.optimizer_array, m.stochastic_grad,
                              step_rate=0.2, momentum=0.9)
        info = opt.minimize_until(callback_adadelta)
    elif optimizer == 2:  # L-BFGS-B
        x, f, d = optimize.fmin_l_bfgs_b(m._objective, m.optimizer_array,
                                         fprime=m.stochastic_grad, maxfun=1000,
                                         callback=callback_lbfgsb)
    else:
        print('optimizer not supported')
    return m, ac_array, decv_array, duration_array
## load data
train_x, train_y = ds.load('train')
valid_x, valid_y = ds.load('valid')

## setup model
idim = len(train_x[0][0])
odim = max(train_y) + 1
model = RNN(idim, 300, odim, 'lstm')

## setup optimizer
# train_w = utils.balance_prior(train_y)
params = model.get_theta()
# args, n_batches = utils.make_batches([train_x, train_y], None)
# opt = climin.Rprop(params, model.opt_fprime, args=args, init_step=0.0001)
args, n_batches = utils.make_batches([train_x, train_y], 30)
opt = climin.Adadelta(params, model.opt_fprime, offset=1e-6, args=args)
# args, n_batches = utils.make_batches([train_x, train_y], 30)
# opt = climin.rmsprop.RmsProp(params, model.opt_fprime, step_rate=0.01, args=args)

## perform optimization
epoch = 0
start = time.time()
for info in opt:
    if info['n_iter'] % n_batches == 0:
        epoch += 1
    # end
    if epoch == 100:
        break
    # print performance
    if epoch % 1 == 0:
        terr, tauc, tlos = utils.eval_perf(
        else:
            print('\r', m_vb.log_likelihood(), end='')
            sys.stdout.flush()
        if info['n_iter'] >= self.max_iters:
            return True
        return False

m_vb.kern.fix()
m_vb.Z.fix()
opt = climin.Adadelta(m_vb.optimizer_array, m_vb.stochastic_grad,
                      step_rate=step_rates[0])
opt.minimize_until(cb(stop_pc=0.9, max_iters=600))

m_vb.kern.constrain_positive()
m_vb.Z.unfix()
opt = climin.Adadelta(m_vb.optimizer_array, m_vb.stochastic_grad,
                      step_rate=step_rates[1])
opt.minimize_until(cb(max_iters=vb_max_iters))

# set the MCMC initial state from the VB solution
m.kern[:] = m_vb.kern[:] * 1
m.Z[:] = m_vb.Z[:] * 1
m.Z.fix()
L = GPy.util.choleskies.flat_to_triang(m_vb.q_u_chol)
U = np.vstack(