def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    plt.cla()
    target_distribution = lambda x: np.exp(log_posterior(x, t))
    plot_isocontours(ax, target_distribution)

    mean, log_std = unpack_params(params)
    variational_contour = lambda x: mvn.pdf(x, mean, np.diag(np.exp(2 * log_std)))
    plot_isocontours(ax, variational_contour)

    plt.draw()
    plt.pause(1.0 / 30.0)
def log_posterior(x, t):
    """An example 2D intractable distribution: a Gaussian evaluated at zero
    with a Gaussian prior on the log-variance."""
    mu, log_sigma = x[:, 0], x[:, 1]
    prior = norm.logpdf(log_sigma, 0, 1.35)
    likelihood = norm.logpdf(mu, 0, np.exp(log_sigma))
    return prior + likelihood
def variational_objective(params, t):
    """Provides a stochastic estimate of the variational lower bound."""
    mean, log_std = unpack_params(params)
    # Reparameterization trick: draw samples of the weights from q by shifting
    # and scaling standard normals; t is the iteration index handed through by
    # the optimizer.
    samples = rs.randn(num_samples, D) * np.exp(log_std) + mean
    expected_logprob = np.mean(logprob(samples, t))
    lower_bound = gaussian_entropy(log_std) + expected_logprob
    print("loss is " + str(expected_logprob))
    return -lower_bound
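# NOTE (sketch): gaussian_entropy is used above but not defined in this
# section. The entropy of a diagonal Gaussian with log standard deviations
# log_std has a closed form; a minimal version, assuming D is the
# dimensionality of the latent variable:
def gaussian_entropy(log_std):
    return 0.5 * D * (1.0 + np.log(2 * np.pi)) + np.sum(log_std)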
def unpack_params(params):
    """Unpacks parameter vector into the proportions, means and covariances
    of each mixture component. The covariance matrices are parametrized by
    their Cholesky decompositions."""
    log_proportions = parser.get(params, 'log proportions')
    normalized_log_proportions = log_proportions - logsumexp(log_proportions)
    means = parser.get(params, 'means')

    lower_tris = np.tril(parser.get(params, 'lower triangles'), k=-1)
    diag_chols = np.exp(parser.get(params, 'log diagonals'))
    chols = []
    for lower_tri, diag in zip(lower_tris, diag_chols):
        chols.append(np.expand_dims(lower_tri + np.diag(diag), 0))
    chols = np.concatenate(chols, axis=0)

    return normalized_log_proportions, means, chols
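# NOTE (sketch): `parser` above slices named blocks out of a flat parameter
# vector. A plausible setup, assuming a WeightsParser helper with an
# add_shape(name, shape) API (hypothetical; num_components mixture components
# over D-dimensional data):
parser = WeightsParser()
parser.add_shape('log proportions', num_components)
parser.add_shape('means', (num_components, D))
parser.add_shape('lower triangles', (num_components, D, D))
parser.add_shape('log diagonals', (num_components, D))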
def callback(params, t, g):
    print("Iteration {} lower bound {}".format(t, -objective(params, t)))

    # Sample functions from posterior.
    rs = npr.RandomState(0)
    mean, log_std = unpack_params(params)
    sample_weights = rs.randn(10, num_weights) * np.exp(log_std) + mean
    plot_inputs = np.linspace(-8, 8, num=400)
    outputs = predictions(sample_weights, np.expand_dims(plot_inputs, 1))

    # Plot data and functions.
    plt.cla()
    ax.plot(inputs.ravel(), targets.ravel(), 'bx')
    ax.plot(plot_inputs, outputs[:, :, 0].T)
    ax.set_ylim([-2, 3])
    plt.draw()
    plt.pause(1.0 / 60.0)
def rbf_covariance(kernel_params, x, xp):
    output_scale = np.exp(kernel_params[0])
    lengthscales = np.exp(kernel_params[1:])
    diffs = np.expand_dims(x / lengthscales, 1) \
          - np.expand_dims(xp / lengthscales, 0)
    return output_scale * np.exp(-0.5 * np.sum(diffs**2, axis=2))
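# NOTE (sketch): usage of rbf_covariance above. kernel_params[0] is the log
# output scale and kernel_params[1:] the log lengthscales, so D-dimensional
# inputs need D + 1 kernel parameters (illustrative values):
x = npr.randn(5, 2)                       # 5 points in 2 dimensions
kernel_params = np.zeros(3)               # log output scale + two log lengthscales
K = rbf_covariance(kernel_params, x, x)   # (5, 5) covariance matrix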
def unpack_params(params):
    mean        = params[0]
    cov_params  = params[2:]
    noise_scale = np.exp(params[1]) + 0.001
    return mean, cov_params, noise_scale
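# NOTE (sketch): the flat layout assumed by unpack_params above: params[0] is
# the GP prior mean, params[1] the log noise scale (the +0.001 floor keeps the
# likelihood well-conditioned), and params[2:] the kernel parameters consumed
# by a covariance function such as rbf_covariance. Illustrative usage:
params = np.zeros(5)   # [mean, log noise, log output scale, two log lengthscales]
mean, cov_params, noise_scale = unpack_params(params)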
def gradient_product(g):
    # This closure multiplies g with the Jacobian of logsumexp (d_ans/d_x).
    # Because autogradwithbay uses reverse-mode differentiation, g contains
    # the gradient of the objective w.r.t. ans, the output of logsumexp.
    return np.full(x.shape, g) * np.exp(x - np.full(x.shape, ans))
def logsumexp(x):
    """Numerically stable log(sum(exp(x))), also available as
    scipy.special.logsumexp."""
    max_x = np.max(x)
    return max_x + np.log(np.sum(np.exp(x - max_x)))
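# NOTE (sketch): gradient_product above closes over x and ans, so in full it
# is returned from a maker function that autogradwithbay calls with those
# values, then registered on the wrapped primitive (assumes the package
# mirrors autograd's primitive/defgrad API used later in this section):
from autogradwithbay.core import primitive

logsumexp = primitive(logsumexp)

def make_grad_logsumexp(ans, x):
    def gradient_product(g):
        return np.full(x.shape, g) * np.exp(x - np.full(x.shape, ans))
    return gradient_product

logsumexp.defgrad(make_grad_logsumexp)

# After registration, grad differentiates through logsumexp like any primitive:
from autogradwithbay import grad
print(grad(lambda z: logsumexp(z**2))(npr.randn(5)))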
"""Gradients of the normal distribution.""" from __future__ import absolute_import import scipy.stats import autogradwithbay.numpy as anp from autogradwithbay.core import primitive from autogradwithbay.numpy.numpy_grads import unbroadcast pdf = primitive(scipy.stats.norm.pdf) cdf = primitive(scipy.stats.norm.cdf) logpdf = primitive(scipy.stats.norm.logpdf) logcdf = primitive(scipy.stats.norm.logcdf) pdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: -g * ans * (x - loc) / scale**2)) pdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: g * ans * (x - loc) / scale**2), argnum=1) pdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: g * ans * (((x - loc)/scale)**2 - 1.0)/scale), argnum=2) cdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: g * pdf(x, loc, scale))) cdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: -g * pdf(x, loc, scale)), argnum=1) cdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: -g * pdf(x, loc, scale)*(x-loc)/scale), argnum=2) logpdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: -g * (x - loc) / scale**2)) logpdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: g * (x - loc) / scale**2), argnum=1) logpdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: g * (-1.0/scale + (x - loc)**2/scale**3)), argnum=2) logcdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, x, lambda g: g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale)))) logcdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, loc, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))), argnum=1) logcdf.defgrad(lambda ans, x, loc=0.0, scale=1.0: unbroadcast(ans, scale, lambda g: -g * anp.exp(logpdf(x, loc, scale) - logcdf(x, loc, scale))*(x-loc)/scale), argnum=2)
multigammaln = primitive(scipy.special.multigammaln)
multigammaln.defgrad(lambda ans, a, d:
    lambda g: g * np.sum(digamma(np.expand_dims(a, -1) - np.arange(d)/2.), -1))
multigammaln.defgrad_is_zero(argnums=(1,))

### Bessel functions ###
j0 = primitive(scipy.special.j0)
y0 = primitive(scipy.special.y0)
j1 = primitive(scipy.special.j1)
y1 = primitive(scipy.special.y1)
jn = primitive(scipy.special.jn)
yn = primitive(scipy.special.yn)

j0.defgrad(lambda ans, x: lambda g: -g * j1(x))
y0.defgrad(lambda ans, x: lambda g: -g * y1(x))
j1.defgrad(lambda ans, x: lambda g: g * (j0(x) - jn(2, x)) / 2.0)
y1.defgrad(lambda ans, x: lambda g: g * (y0(x) - yn(2, x)) / 2.0)
jn.defgrad_is_zero(argnums=(0,))
yn.defgrad_is_zero(argnums=(0,))
jn.defgrad(lambda ans, n, x: lambda g: g * (jn(n - 1, x) - jn(n + 1, x)) / 2.0, argnum=1)
yn.defgrad(lambda ans, n, x: lambda g: g * (yn(n - 1, x) - yn(n + 1, x)) / 2.0, argnum=1)

### Error Function ###
inv_root_pi = 0.56418958354775627928
erf  = primitive(scipy.special.erf)
erfc = primitive(scipy.special.erfc)

erf.defgrad( lambda ans, x: lambda g:  2. * g * inv_root_pi * np.exp(-x**2))
erfc.defgrad(lambda ans, x: lambda g: -2. * g * inv_root_pi * np.exp(-x**2))
    return r, p(r)   # tail of fit_maxlike: the fitted r and its implied p


if __name__ == "__main__":
    # generate data
    npr.seed(0)
    data = negbin_sample(r=5, p=0.5, size=1000)

    # fit likelihood-extremizing parameters
    r, p = fit_maxlike(data, r_guess=1)

    # report fit
    print('Fit parameters:')
    print('r={r}, p={p}'.format(r=r, p=p))

    print('Check that we are at a local stationary point:')
    loglike = lambda r, p: np.sum(negbin_loglike(r, p, data))
    grad_both = multigrad(loglike, argnums=[0, 1])
    print(grad_both(r, p))

    import matplotlib.pyplot as plt
    xm = data.max()
    plt.figure()
    # density=True replaces the normed=True argument removed from matplotlib.
    plt.hist(data, bins=np.arange(xm + 1) - 0.5, density=True,
             label='normed data counts')
    plt.xlim(0, xm)
    plt.plot(np.arange(xm), np.exp(negbin_loglike(r, p, np.arange(xm))),
             label='maxlike fit')
    plt.xlabel('k')
    plt.ylabel('p(k)')
    plt.legend(loc='best')
    plt.show()
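# NOTE (sketch): negbin_loglike and negbin_sample are referenced above but not
# shown in this section. Minimal versions consistent with those calls,
# assuming gammaln comes from the package's scipy.special wrapper:
def negbin_loglike(r, p, x):
    # Log pmf of the negative binomial with shape r and success probability p.
    return gammaln(r + x) - gammaln(r) - gammaln(x + 1) \
        + x * np.log(p) + r * np.log(1 - p)

def negbin_sample(r, p, size):
    # A negative binomial is a gamma-compound-Poisson.
    return npr.poisson(npr.gamma(r, p / (1 - p), size=size))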
def make_grad_logsumexp(ans, x, axis=None, b=1.0, keepdims=False):
    repeater, _ = repeat_to_match_shape(x, axis, keepdims)
    return lambda g: repeater(g) * b * anp.exp(x - repeater(ans))
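# NOTE (sketch): in the wrapper module this maker is registered on the wrapped
# scipy function, in the same primitive/defgrad style as the modules above
# (modern scipy exposes the function as scipy.special.logsumexp):
import scipy.special
logsumexp = primitive(scipy.special.logsumexp)
logsumexp.defgrad(make_grad_logsumexp)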
def tanh(x):
    # tanh(x) = (1 - e^{-2x}) / (1 + e^{-2x}); the exponent must be -2x,
    # not -x (which would compute tanh(x/2)).
    y = np.exp(-2.0 * x)
    return (1.0 - y) / (1.0 + y)
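# NOTE (sketch): differentiating tanh with the package's grad, in the style of
# autograd's README (assumes autogradwithbay mirrors that top-level API):
from autogradwithbay import grad
grad_tanh = grad(tanh)
print(grad_tanh(1.0))                             # approximately 0.4199743
print((tanh(1.0001) - tanh(0.9999)) / 0.0002)     # finite differences agree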
def build_toy_dataset(n_data=80, noise_std=0.1, D=1):
    rs = npr.RandomState(0)
    # Integer division so that linspace receives an int count under Python 3.
    inputs = np.concatenate([np.linspace(0, 3, num=n_data // 2),
                             np.linspace(6, 8, num=n_data // 2)])
    targets = np.cos(inputs) + rs.randn(n_data) * noise_std
    inputs = (inputs - 4.0) / 2.0
    inputs = inputs.reshape((len(inputs), D))
    targets = targets.reshape((len(targets), D)) / 2.0
    return inputs, targets


if __name__ == "__main__":
    # Specify inference problem by its unnormalized log-posterior.
    rbf = lambda x: np.exp(-x**2)
    relu = lambda x: np.maximum(x, 0.0)

    # Implement a 3-hidden-layer neural network.
    num_weights, predictions, logprob = \
        make_nn_funs(layer_sizes=[1, 20, 20, 20, 1], nonlinearity=rbf)

    inputs, targets = build_toy_dataset()
    objective = lambda weights, t: -logprob(weights, inputs, targets)

    # Set up figure.
    fig = plt.figure(figsize=(12, 8), facecolor="white")
    ax = fig.add_subplot(111, frameon=False)
    plt.show(block=False)

    def callback(params, t, g):
        print("Iteration {} log likelihood {}".format(t, -objective(params, t)))
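# NOTE (sketch): the truncated callback above would normally continue with the
# posterior-sample plotting shown earlier in this section, and the objective
# would then be minimized with a gradient-based optimizer. A hypothetical run,
# assuming an adam helper with the signature of autograd's examples/optimizers.py:
from autogradwithbay import grad
from optimizers import adam   # hypothetical helper module

init_params = 0.1 * npr.randn(num_weights)
adam(grad(objective), init_params, step_size=0.01,
     num_iters=1000, callback=callback)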
def logsumexp(X, axis, keepdims=False):
    max_X = np.max(X)
    return max_X + np.log(np.sum(np.exp(X - max_X), axis=axis, keepdims=keepdims))
def logsumexp(X, axis=1):
    max_X = np.max(X)
    return max_X + np.log(np.sum(np.exp(X - max_X), axis=axis, keepdims=True))
# Wrap function to only have one argument, for scipy.minimize.
def training_loss(weights):
    # The targets are the inputs themselves: the LSTM is trained to predict
    # each character of the training text from the characters before it.
    return -loglike_fun(weights, train_inputs, train_inputs)

def callback(weights):
    print("Train loss:", training_loss(weights))
    print_training_prediction(weights)

# Build gradient of loss function using autogradwithbay.
training_loss_and_grad = value_and_grad(training_loss)

init_weights = npr.randn(num_weights) * param_scale
# Check the gradients numerically, just to be safe.
quick_grad_check(training_loss, init_weights)

print("Training LSTM...")
result = minimize(training_loss_and_grad, init_weights, jac=True, method='CG',
                  options={'maxiter': train_iters}, callback=callback)
trained_weights = result.x

print()
print("Generating text from RNN...")
num_letters = 30
for t in range(20):
    text = ""
    for i in range(num_letters):
        seqs = string_to_one_hot(text, output_size)[:, np.newaxis, :]
        logprobs = pred_fun(trained_weights, seqs)[-1].ravel()
        text += chr(npr.choice(len(logprobs), p=np.exp(logprobs)))
    print(text)
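# NOTE (sketch): string_to_one_hot is referenced above but not shown. A
# minimal version consistent with its use (one-of-k encoding over the first
# maxchar ASCII codes), in the style of autograd's lstm example:
def string_to_one_hot(string, maxchar):
    """Converts an ASCII string to a one-of-k encoding."""
    ascii_codes = np.array([ord(c) for c in string]).T
    return np.array(ascii_codes[:, None] == np.arange(maxchar)[None, :], dtype=int)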