def gradient_lower_bound(params, y, X, num_samples, N):
    eps = np.random.normal(0, 1, (num_samples, np.shape(X)[-1] + 1))
    n = np.shape(y)[0]
    u = np.random.uniform(0, 1, num_samples * N * n)
    gradient_lower_bound = grad(lower_bound)
    g = gradient_lower_bound(params, y, X, eps, N, u)
    return g
def expectation(params, y, X, eps, N, z, u):
    # first half of params (minus the two tau parameters) is the mean,
    # second half is the log of the diagonal standard deviations
    mu = params[0:(len(params) - 2) // 2]
    Sigma = np.exp(params[(len(params) - 2) // 2:-2])
    tauParams = params[-2:]
    E = 0
    n = X.shape[0]
    for j in range(np.shape(eps)[0]):
        beta = mu + Sigma * eps[j, :]
        ll = log_likelihood(beta, y, X,
                            z[j * (n * N):(j + 1) * (n * N)],
                            u[j * (n * N):(j + 1) * (n * N)],
                            tauParams, N)
        E += ll
    return E / np.shape(eps)[0]
def natural_sample(natparam, num_samples): neghalfJ, h, _, _ = unpack_dense(natparam) sample_shape = np.shape(h) + (num_samples,) J = -2*neghalfJ L = np.linalg.cholesky(J) noise = np.linalg.solve(T(L), npr.randn(*sample_shape)) return np.linalg.solve(J, h)[...,None,:] + T(noise)
def KL_via_sampling(params, eps):
    # also need to include lognormal as a replacement for gamma distribution
    # this is giving log of negatives
    d = np.shape(params)[0] - 1
    mu = params[0:d, 0]
    toSigma = params[0:d, 1:d + 1]
    # exponentiate the diagonal without mutating the params array in place
    Sigma = toSigma - np.diag(np.diag(toSigma)) + np.diag(np.exp(np.diag(toSigma)))
    muPrior = np.zeros(d)
    sigmaPrior = np.identity(d)
    E = 0
    for j in range(np.shape(eps)[0]):
        beta = mu + np.dot(Sigma, eps[j, :])
        E += np.log(normal_pdf(beta, mu, Sigma) / normal_pdf(beta, muPrior, sigmaPrior))
    # Monte Carlo average over the eps samples
    return E / np.shape(eps)[0]
def make_grad_logsumexp(ans, x, axis=None, b=1.0, keepdims=False): shape, dtype = anp.shape(x), anp.result_type(x) def vjp(g): g_repeated, _ = repeat_to_match_shape(g, shape, dtype, axis, keepdims) ans_repeated, _ = repeat_to_match_shape(ans, shape, dtype, axis, keepdims) return g_repeated * b * anp.exp(x - ans_repeated) return vjp
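# A minimal sanity check (not part of the original code) of the identity the VJP
# above encodes: the gradient of logsumexp is the softmax,
# d/dx logsumexp(x) = exp(x - logsumexp(x)). The import path is an assumption;
# newer autograd releases expose logsumexp under autograd.scipy.special.
import autograd.numpy as anp
from autograd import grad
from autograd.scipy.special import logsumexp

x = anp.array([1.0, 2.0, 3.0])
assert anp.allclose(grad(logsumexp)(x), anp.exp(x - logsumexp(x)))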
def init(): offset = 2.0 #if optimum[0] < np.inf: # xmin = min(results['ADAM'][0][0], optimum[0]) - offset # xmax = max(results['ADAM'][0][0], optimum[0]) + offset #else: xmin = domain[0, 0] xmax = domain[0, 1] #if optimum[1] < np.inf: # ymin = min(results['ADAM'][1][0], optimum[1]) - offset # ymax = max(results['ADAM'][1][0], optimum[1]) + offset #else: ymin = domain[1, 0] ymax = domain[1, 1] x = np.arange(xmin, xmax, 0.01) y = np.arange(ymin, ymax, 0.01) X, Y = np.meshgrid(x, y) Z = np.zeros(np.shape(Y)) for a, _ in np.ndenumerate(Y): Z[a] = func(X[a], Y[a]) level = fdict['level'] if level is None: level = np.linspace(Z.min(), Z.max(), 20) else: if level[0] == 'normal': level = np.linspace(Z.min(), Z.max(), level[1]) if level[0] == 'log': level = np.logspace(np.log(Z.min()), np.log(Z.max()), level[1]) CF = ax[0].contour(X,Y,Z, levels=level) #plt.colorbar(CF, orientation='horizontal', format='%.2f') ax[0].grid() ax[0].plot(results['ADAM'][0][0], results['ADAM'][1][0], 'h', markersize=15, color = '0.75') if optimum[0] < np.inf and optimum[1] < np.inf: ax[0].plot(optimum[0], optimum[1], '*', markersize=40, markeredgewidth = 2, alpha = 0.5, color = '0.75') ax[0].legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15)) ax[1].plot(0, results['ADAM'][2][0], 'o') ax[1].axis([0, T, -0.5, max_err + 0.5]) ax[1].set_xlabel('num. iteration') ax[1].set_ylabel('loss') line1.set_data([], []) line2.set_data([], []) line3.set_data([], []) line4.set_data([], []) line5.set_data([], []) err1.set_data([], []) err2.set_data([], []) err3.set_data([], []) err4.set_data([], []) err5.set_data([], []) return line1, line2, line3, line4, line5, \ err1, err2, err3, err4, err5,
def expectation(params, y, X, eps, N, u):
    mu = params[0:len(params) // 2]
    Sigma = np.exp(params[len(params) // 2:])
    E = 0
    n = X.shape[0]
    for j in range(np.shape(eps)[0]):
        beta = mu + Sigma * eps[j, :]
        E += log_likelihood(beta, y, X)  # ,u[j*(n*N):(j+1)*(n*N)])
    # average over the eps samples (not over the length of beta)
    return E / np.shape(eps)[0]
def KL_two_gaussians(params):
    d = np.shape(params)[0] - 1
    mu = params[0:d, 0]
    toSigma = params[0:d, 1:d + 1]
    intSigma = toSigma - np.diag(np.diag(toSigma)) + np.diag(np.exp(np.diag(toSigma)))
    Sigma = intSigma - np.tril(intSigma) + np.transpose(np.triu(intSigma))
    muPrior = np.zeros(d)
    sigmaPrior = np.identity(d)
    # closed-form KL divergence between the two Gaussians
    # (0.5 rather than 1/2, so the result is not truncated to zero under Python 2 integer division)
    return 0.5 * (np.log(np.linalg.det(Sigma) / np.linalg.det(sigmaPrior)) - d
                  + np.trace(np.dot(np.linalg.inv(Sigma), sigmaPrior))
                  + np.dot(np.transpose(mu - muPrior),
                           np.dot(np.linalg.inv(Sigma), mu - muPrior)))
def expectation(params, y, X, eps, N, u):
    # for each sample of theta, calculate the likelihood
    # the likelihood has participants; for each participant we have N particles
    # with L samples, n participants, and N particles per participant and sample,
    # we have L*n*N particles
    # the first column of params is mu
    d = np.shape(X)[-1] + 1
    mu = params[0:d, 0]
    toSigma = params[0:d, 1:d + 1]
    intSigma = toSigma - np.diag(np.diag(toSigma)) + np.diag(np.exp(np.diag(toSigma)))
    Sigma = intSigma - np.tril(intSigma) + np.transpose(np.triu(intSigma))
    print(mu)
    print(Sigma)
    n = X.shape[0]
    E = 0
    # iterate over the samples of theta
    for j in range(np.shape(eps)[0]):
        beta = mu + np.dot(Sigma, eps[j, :])
        # this log likelihood iterates over both the participants and the particles
        E += log_likelihood(beta, y, X, u[j * (n * N):(j + 1) * (n * N)])
    # average over the eps samples (not over the length of beta)
    return E / np.shape(eps)[0]
def grad_odeint(yt, func, y0, t, func_args, **kwargs): # Extended from "Scalable Inference of Ordinary Differential # Equation Models of Biochemical Processes", Sec. 2.4.2 # Fabian Froehlich, Carolin Loos, Jan Hasenauer, 2017 # https://arxiv.org/abs/1711.08079 T, D = np.shape(yt) flat_args, unflatten = flatten(func_args) def flat_func(y, t, flat_args): return func(y, t, *unflatten(flat_args)) def unpack(x): # y, vjp_y, vjp_t, vjp_args return x[0:D], x[D:2 * D], x[2 * D], x[2 * D + 1:] def augmented_dynamics(augmented_state, t, flat_args): # Orginal system augmented with vjp_y, vjp_t and vjp_args. y, vjp_y, _, _ = unpack(augmented_state) vjp_all, dy_dt = make_vjp(flat_func, argnum=(0, 1, 2))(y, t, flat_args) vjp_y, vjp_t, vjp_args = vjp_all(-vjp_y) return np.hstack((dy_dt, vjp_y, vjp_t, vjp_args)) def vjp_all(g): vjp_y = g[-1, :] vjp_t0 = 0 time_vjp_list = [] vjp_args = np.zeros(np.size(flat_args)) for i in range(T - 1, 0, -1): # Compute effect of moving measurement time. vjp_cur_t = np.dot(func(yt[i, :], t[i], *func_args), g[i, :]) time_vjp_list.append(vjp_cur_t) vjp_t0 = vjp_t0 - vjp_cur_t # Run augmented system backwards to the previous observation. aug_y0 = np.hstack((yt[i, :], vjp_y, vjp_t0, vjp_args)) aug_ans = odeint(augmented_dynamics, aug_y0, np.array([t[i], t[i - 1]]), tuple((flat_args,)), **kwargs) _, vjp_y, vjp_t0, vjp_args = unpack(aug_ans[1]) # Add gradient from current output. vjp_y = vjp_y + g[i - 1, :] time_vjp_list.append(vjp_t0) vjp_times = np.hstack(time_vjp_list)[::-1] return None, vjp_y, vjp_times, unflatten(vjp_args) return vjp_all
def expectation(params, y, X, eps, N, u):
    # for each sample of theta, calculate the likelihood
    # the likelihood has participants; for each participant we have N particles
    # with L samples, n participants, and N particles per participant and sample,
    # we have L*n*N particles
    # the first column of params is mu
    d = np.shape(X)[-1] + 1
    mu = params[0:d, 0]
    toSigma = params[0:d, 1:d + 1]
    intSigma = toSigma - np.diag(np.diag(toSigma)) + np.diag(
        np.exp(np.diag(toSigma)))
    Sigma = intSigma - np.tril(intSigma) + np.transpose(np.triu(intSigma))
    print(mu)
    print(Sigma)
    n = X.shape[0]
    E = 0
    # iterate over the samples of theta
    for j in range(np.shape(eps)[0]):
        beta = mu + np.dot(Sigma, eps[j, :])
        # this log likelihood iterates over both the participants and the particles
        E += log_likelihood(beta, y, X, u[j * (n * N):(j + 1) * (n * N)])
    # average over the eps samples (not over the length of beta)
    return E / np.shape(eps)[0]
def RMSprop(g, alpha, max_its, w, num_pts, batch_size, **kwargs): # rmsprop params gamma = 0.9 eps = 10**-8 if 'gamma' in kwargs: gamma = kwargs['gamma'] if 'eps' in kwargs: eps = kwargs['eps'] # flatten the input function, create gradient based on flat function g_flat, unflatten, w = flatten_func(g, w) grad = value_and_grad(g_flat) # initialize average gradient avg_sq_grad = np.ones(np.size(w)) # record history w_hist = [unflatten(w)] train_hist = [g_flat(w, np.arange(num_pts))] # how many mini-batches equal the entire dataset? num_batches = int(np.ceil(np.divide(num_pts, batch_size))) # over the line for k in range(max_its): # loop over each minibatch train_cost = 0 for b in range(num_batches): # collect indices of current mini-batch batch_inds = np.arange(b * batch_size, min((b + 1) * batch_size, num_pts)) # plug in value into func and derivative cost_eval, grad_eval = grad(w, batch_inds) grad_eval.shape = np.shape(w) # update exponential average of past gradients avg_sq_grad = gamma * avg_sq_grad + (1 - gamma) * grad_eval**2 # take descent step w = w - alpha * grad_eval / (avg_sq_grad**(0.5) + eps) # update training and validation cost train_cost = g_flat(w, np.arange(num_pts)) # record weight update, train and val costs w_hist.append(unflatten(w)) train_hist.append(train_cost) return w_hist, train_hist
def predict(self, Xnew, predvar=False): """ Returns the predictive mean and variance of the GP """ Xnew = (Xnew - self.Xmean)/self.Xstd alpha = solve(self.L.T, solve(self.L,self.Y*self.Ystd+self.Ymean) ) if predvar: m = np.shape(Xnew)[0] Knew_N,_ = self.K(self.lengthscale, Xnew, self.X) Knew_new = np.array( [self.scalarK(Xnew[i], Xnew[i], self.lengthscale) for i in range(m)] ).reshape([m,1]) v = solve(self.L, Knew_N.T) return np.dot(Knew_N, alpha), np.diag( Knew_new + self.likelihood_variance - np.dot(v.T, v) ).reshape(m,1) else: Knew_N,_ = self.K(self.lengthscale, Xnew, self.X) return np.dot(Knew_N, alpha)
def maxout_feature_transforms(self,a, w): # loop through each layer matrix for W1,W2 in w: # pad with ones (to compactly take care of bias) for next layer computation o = np.ones((1,np.shape(a)[1])) a = np.vstack((o,a)) # compute inner product with current layer weights a1 = np.dot(a.T, W1).T a2 = np.dot(a.T, W2).T # output of layer activation a = self.activation(a1,a2) return a
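# self.activation is not shown in this snippet; for a maxout unit it is
# presumably the elementwise maximum of the two affine pieces. A hypothetical
# standalone sketch:
def maxout_activation(a1, a2):
    # elementwise max over the two linear outputs of the layer
    return np.maximum(a1, a2)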
def sample_latent_variables_from_posterior(encoder_output):
    # Params of a diagonal Gaussian.
    D = np.shape(encoder_output)[-1] // 2
    mean, log_std = encoder_output[:, :D], encoder_output[:, D:]
    # Reparametrization trick: draw one sample from q(z|x) per batch datapoint
    # as mean + std * noise, with the noise generated via npr.randn.
    # The output is a matrix of size batch x number of latent dimensions.
    # Since the encoder outputs log_std, the standard deviation is exp(log_std).
    return mean + np.exp(log_std) * npr.randn(mean.shape[0], mean.shape[1])
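# Illustrative sketch (not from the original source) of why the reparametrization
# trick above is used: writing the sample as mean + exp(log_std) * noise keeps it
# differentiable w.r.t. the encoder output, so autograd can propagate gradients
# through the sampling step.
import autograd.numpy as np
import autograd.numpy.random as npr
from autograd import grad

def toy_loss(params, noise):
    mean, log_std = params[:2], params[2:]
    z = mean + np.exp(log_std) * noise   # reparametrized sample
    return np.mean(z ** 2)               # any downstream loss

print(grad(toy_loss)(np.zeros(4), npr.randn(2)))  # gradient is well defined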
def choose_convolutions(self, kernel_sizes, **kwargs): # setup convolution layer #img_size = int(self.x.shape[0]**(0.5)) transformer = convolutional_layer.Setup(kernel_sizes, **kwargs) self.conv_layer = transformer.conv_layer self.conv_initializer = transformer.conv_initializer # determine output size of conv layer based on image size / kernel sizes # by passing image through the convolution layer kernels = self.conv_initializer() if 'kernels' in kwargs: kernels = kwargs['kernels'] final_features = self.conv_layer(self.x[:, :1].T, kernels) self.conv_output_size = np.shape(final_features)[1]
def compute_maxout_features(x, inner_weights): # pad data with ones to deal with bias o = np.ones((np.shape(x)[0], 1)) a_padded = np.concatenate((o, x), axis=1) # loop through weights and update each layer of the network for W1, W2 in inner_weights: # output of layer activation a = activation(np.dot(a_padded, W1), np.dot(a_padded, W2)) ### normalize output of activation # compute the mean and standard deviation of the activation output distributions a_means = np.mean(a, axis=0) a_stds = np.std(a, axis=0) # normalize the activation outputs a_normed = normalize(a, a_means, a_stds) # pad with ones for bias o = np.ones((np.shape(a_normed)[0], 1)) a_padded = np.concatenate((o, a_normed), axis=1) return a_padded
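# The normalize helper is not shown above; a hypothetical sketch of the
# batch-style normalization it presumably performs (eps added for numerical
# stability):
def normalize(a, a_means, a_stds, eps=1e-8):
    # zero-mean, unit-variance normalization of each activation column
    return (a - a_means) / (a_stds + eps)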
def reflect_over_XZ_plane(input_vector): # Takes in a vector or an array and flips the y-coordinates. output_vector = input_vector shape = np.shape(output_vector) if len(shape) == 1 and shape[0] == 3: # Vector of 3 items output_vector = output_vector * np.array([1, -1, 1]) elif len(shape) == 2 and shape[1] == 3: # 2D Nx3 vector output_vector = output_vector * np.array([1, -1, 1]) elif len(shape) == 3 and shape[2] == 3: # 3D MxNx3 vector output_vector = output_vector * np.array([1, -1, 1]) else: raise Exception("Invalid input for reflect_over_XZ_plane!") return output_vector
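# Quick usage example (assuming numpy is imported as np): a single 3-vector has
# its y-coordinate flipped.
print(reflect_over_XZ_plane(np.array([1.0, 2.0, 3.0])))  # -> [ 1. -2.  3.]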
def matvec_mul_last2dims(x, y): # x is m1 x m2 ....x m_d2 x s x r # y is m1 x m2 ... x m_d1 x s # we do x^T y, along the last two dimension of x # the y can have more dimension than x (d1 >= d2), # in which case we sum y over the extra dimensions assert np.shape(x)[-2] == np.shape(y)[-1] d1 = len(np.shape(y)) - 1 d2 = len(np.shape(x)) - 2 assert d2 <= d1 einsum_indx1 = list(range(d2)) einsum_indx1.append(d1) einsum_indx1.append(d1 + 1) einsum_indx2 = list(range(d1 + 1)) einsum_indx_out = list(range(d2)) einsum_indx_out.append(d1 + 1) return np.einsum(x, einsum_indx1, y, einsum_indx2, einsum_indx_out)
def mat_mul_last2dims(x1, x2): # multiply the last two dimensions of two arrays # x1 is m1 x m2 ....x m_d x s x r # x2 is m1 x m2 ... x m_d x s x t # x1 and x2 are an (m1 x m2 ....x m_d) array of matrices, # whose last two dimensions specify a matrix. # We return 'x1^T x2', this matrix multiplication done along # the last two dimensions assert len(np.shape(x1)) == (len(np.shape(x2))) assert np.shape(x2)[-2] == np.shape(x1)[-2] d = len(np.shape(x1)) einsum_indx2 = list(range(d - 1)) einsum_indx2.append(d) einsum_indx_out = list(range(d - 2)) einsum_indx_out.append(d - 1) einsum_indx_out.append(d) return np.einsum(x1, list(range(d)), x2, einsum_indx2, einsum_indx_out)
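# A small illustrative check (not from the original source): the einsum
# contraction above should agree with an explicit transpose-and-matmul over the
# last two dimensions.
import numpy as np

x1 = np.random.randn(4, 5, 3, 2)   # ... x s x r
x2 = np.random.randn(4, 5, 3, 6)   # ... x s x t
expected = np.matmul(np.swapaxes(x1, -1, -2), x2)   # ... x r x t
assert np.allclose(mat_mul_last2dims(x1, x2), expected)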
def conv_model(self,x,w): c = self.conv_layer(x.T,w[0]).T # pass through fully connected layers f = self.feature_transforms(c,w[1]) # tack a 1 onto the top of each input point all at once o = np.ones((1,np.shape(f)[1])) f = np.vstack((o,f)) # compute linear combination and return a = np.dot(f.T,w[2]) return a.T
def forward_pass(W1, W2, W3, b1, b2, b3, x):
    """
    Forward pass for a fully connected neural network with 2 hidden layers
    of M neurons.
    Inputs:
        W1 : (M, 784) weights of first (hidden) layer
        W2 : (M, M) weights of second (hidden) layer
        W3 : (10, M) weights of third (output) layer
        b1 : (M, 1) biases of first (hidden) layer
        b2 : (M, 1) biases of second (hidden) layer
        b3 : (10, 1) biases of third (output) layer
        x : (N, 784) training inputs
    Outputs:
        Fhat : (N, 10) output of the neural network at training inputs
    """
    H1 = np.maximum(0, np.dot(x, W1.T) + b1.T)   # layer 1 neurons with ReLU activation, shape (N, M)
    H2 = np.maximum(0, np.dot(H1, W2.T) + b2.T)  # layer 2 neurons with ReLU activation, shape (N, M)
    Fhat = np.dot(H2, W3.T) + b3.T               # layer 3 (output) neurons with linear activation, shape (N, 10)
    # Implement a stable log-softmax activation function at the output layer.
    # Compute the max of each row; a is broadcast to the same shape as Fhat.
    a = np.ones(np.shape(Fhat)) * np.expand_dims(np.amax(Fhat, axis=1), axis=1)
    # Compute log(sum(exp(.))) using the log-sum-exp trick
    log_sum_exp = np.ones(np.shape(Fhat)) * np.expand_dims(
        np.log(np.sum(np.exp(np.subtract(Fhat, a)), axis=1)), axis=1)
    # Element-wise subtraction: log-softmax = Fhat - a - logsumexp(Fhat - a)
    Fhat = np.subtract(np.subtract(Fhat, a), log_sum_exp)
    return Fhat
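# Illustrative check (not from the original source): the stabilized log-softmax
# above matches a naive log-softmax whenever the naive version does not overflow.
import numpy as np

scores = np.random.randn(5, 10)
naive = np.log(np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True))
shift = np.max(scores, axis=1, keepdims=True)
stable = scores - shift - np.log(np.sum(np.exp(scores - shift), axis=1, keepdims=True))
assert np.allclose(naive, stable)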
def get_mixture_weights_from_stick_break_propns(stick_break_propns): """ Computes stick lengths (i.e. mixture weights) from stick breaking proportions. Parameters ---------- stick_break_propns : ndarray Array of stick breaking proportions, with sticks along last dimension Returns ------- mixture_weights : ndarray An array the same size as stick_break_propns, with the mixture weights computed for each row of stick breaking proportions. """ # if input is a vector, make it a 1 x k_approx array if len(np.shape(stick_break_propns)) == 1: stick_break_propns = np.array([stick_break_propns]) # number of components k_approx = np.shape(stick_break_propns)[-1] # number of mixtures ones_shape = stick_break_propns.shape[0:-1] + (1,) stick_break_propns_1m = 1 - stick_break_propns stick_remain = np.concatenate((np.ones(ones_shape), _cumprod_through_log(stick_break_propns_1m, axis = -1)), axis = -1) stick_add = np.concatenate((stick_break_propns, np.ones(ones_shape)), axis = -1) mixture_weights = (stick_remain * stick_add).squeeze() return mixture_weights
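# Worked example (illustrative): breaking proportions [0.5, 0.5, 0.5] yield
# sticks 0.5, 0.25, 0.125 plus a final remainder of 0.125, which sum to one.
weights = get_mixture_weights_from_stick_break_propns(np.array([0.5, 0.5, 0.5]))
# weights -> array([0.5, 0.25, 0.125, 0.125])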
def gradient_descent(g, alpha, max_its, w, num_pts, batch_size, **kwargs): # pluck out args beta = 0 if 'beta' in kwargs: beta = kwargs['beta'] normalize = False if 'normalize' in kwargs: normalize = kwargs['normalize'] # flatten the input function, create gradient based on flat function g_flat, unflatten, w = flatten_func(g, w) grad = value_and_grad(g_flat) # record history w_hist = [] w_hist.append(unflatten(w)) # how many mini-batches equal the entire dataset? num_batches = int(np.ceil(np.divide(num_pts, batch_size))) # initialization for momentum direction h = np.zeros((w.shape)) # over the line for k in range(max_its): # loop over each minibatch for b in range(num_batches): # collect indices of current mini-batch batch_inds = np.arange(b * batch_size, min((b + 1) * batch_size, num_pts)) # plug in value into func and derivative cost_eval, grad_eval = grad(w, batch_inds) grad_eval.shape = np.shape(w) # normalize? if normalize == True: grad_eval = np.sign(grad_eval) # momentum step # h = beta*h - (1 - beta)*grad_eval # take descent step with momentum w = w - alpha * grad_eval # record weight update w_hist.append(unflatten(w)) return w_hist
def KL_two_gaussians(params):
    d = np.shape(params)[0] - 1
    mu = params[0:d, 0]
    toSigma = params[0:d, 1:d + 1]
    intSigma = toSigma - np.diag(np.diag(toSigma)) + np.diag(
        np.exp(np.diag(toSigma)))
    Sigma = intSigma - np.tril(intSigma) + np.transpose(np.triu(intSigma))
    muPrior = np.zeros(d)
    sigmaPrior = np.identity(d)
    # closed-form KL divergence between the two Gaussians
    # (0.5 rather than 1 / 2, so the result is not truncated under Python 2 integer division)
    return 0.5 * (np.log(np.linalg.det(Sigma) / np.linalg.det(sigmaPrior)) - d
                  + np.trace(np.dot(np.linalg.inv(Sigma), sigmaPrior))
                  + np.dot(np.transpose(mu - muPrior),
                           np.dot(np.linalg.inv(Sigma), mu - muPrior)))
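# Quick sanity check (illustrative): when the variational Gaussian equals the
# standard-normal prior (zero mean, identity covariance; the diagonal of the
# parameter array is stored on a log scale, so an all-zero array encodes it),
# the KL divergence is zero.
d = 3
assert np.isclose(KL_two_gaussians(np.zeros((d + 1, d + 1))), 0.0)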
def plot_data(self): # construct figure fig, axs = plt.subplots(1, 3, figsize=(9,4)) # create subplot with 2 panels gs = gridspec.GridSpec(1, 3, width_ratios=[1,5,1]) ax1 = plt.subplot(gs[0]); ax1.axis('off') ax2 = plt.subplot(gs[1]); ax3 = plt.subplot(gs[2]); ax3.axis('off') if np.shape(self.x)[1] == 2: ax2 = plt.subplot(gs[1],projection='3d'); # scatter points self.scatter_pts(ax2)
def KW(TempK, Sal, Pbar, RGas, WhichKs): """Calculate water dissociation constant for the given options.""" # Evaluate at atmospheric pressure KW = np.full(np.shape(TempK), np.nan) KW = np.where(WhichKs == 6, 0.0, KW) # GEOSECS doesn't include OH effects KW = np.where(WhichKs == 7, p1atm.kH2O_SWS_M79(TempK, Sal), KW) KW = np.where(WhichKs == 8, p1atm.kH2O_SWS_HO58_M79(TempK, Sal), KW) KW = np.where( (WhichKs != 6) & (WhichKs != 7) & (WhichKs != 8), p1atm.kH2O_SWS_M95(TempK, Sal), KW, ) # Now correct for seawater pressure KW = KW * pcx.KWfac(TempK, Pbar, RGas, WhichKs) return KW
def gradient_descent(g, w_init, alpha, max_its, verbose): # flatten the input function g_flat, unflatten, w = flatten_func(g, w_init) # compute gradient of flattened input function # when evaluated this returns both the evaluation of the gradient and the original function grad = value_and_grad(g_flat) cost_eval, grad_eval = grad(w) grad_eval.shape = np.shape(w) # record history w_hist = [unflatten(w)] train_hist = [cost_eval] # gradient descent loop for k in range(max_its): # take descent step with momentum w = w - alpha * grad_eval # plug in updated w into func and gradient cost_eval, grad_eval = grad(w) grad_eval.shape = np.shape(w) # store updates w_hist.append(unflatten(w)) train_hist.append(cost_eval) # print update if verbose == True: print('step ' + str(k + 1) + ' complete, train cost = ' + str(np.round(train_hist[-1], 4)[0])) # print update and return if verbose == True: print('finished all ' + str(max_its) + ' steps') return w_hist, train_hist
def normalized_gradient_descent(g, alpha, max_its, w, beta):
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    # record histories
    weight_hist = []
    weight_hist.append(unflatten(w))
    cost_hist = []

    # run the gradient descent loop
    z = np.zeros((np.shape(w)))  # momentum term
    for k in range(max_its):
        # evaluate the gradient, compute its length
        grad_eval = grad(w)
        grad_eval.shape = np.shape(w)
        grad_norm = np.linalg.norm(grad_eval)

        # if the magnitude of the gradient is zero, pick a small random direction to move
        if grad_norm == 0:
            # pick random direction and normalize to have unit length
            grad_eval = 10**-6 * np.sign(2 * np.random.rand(len(w)) - 1)
            grad_norm = np.linalg.norm(grad_eval)
        grad_eval /= grad_norm

        # take descent step with momentum
        z = beta * z + grad_eval
        w = w - alpha * z

        # record and update histories
        weight_hist.append(unflatten(w))
        cost_hist.append(g_flat(w))

    return weight_hist, cost_hist
def gaussian_mix_init(mu_arr, var_arr, prob_arr): # default, dimension>1 gs = list() if mu_arr.ndim == 1: num_g, d = np.shape(mu_arr)[0], 1 gs = [(gaussian_init(np.array([mu_arr[i]]), np.array(var_arr[i]))) for i in range(num_g)] else: num_g, d = np.shape(mu_arr) gs = [(gaussian_init(mu_arr[i, :], var_arr[i, :, :])) for i in range(num_g)] def log_gaussian_mix(x): log_gs = np.array([g[0](x) for g in gs]).T prob_gs = np.exp(log_gs) probs = np.sum(prob_gs * prob_arr, 1) return np.log(probs) def generator(size): indices = np.argmax(npr.multinomial(1, prob_arr, size), axis=1) samples = [gs[id][1](1)[0] for id in indices] return np.array(samples) return log_gaussian_mix, generator
def rms_norm(array): """ Compute the rms norm of the array. Arguments: array :: ndarray (N) - The array to compute the norm of. Returns: norm :: float - The rms norm of the array. """ square_norm = anp.sum(array * anp.conjugate(array)) size = anp.prod(anp.shape(array)) rms_norm_ = anp.sqrt(square_norm / size) return rms_norm_
def _laplace_neg_hessian_params(self, data, input, mask, tag, x, Ez, Ezzp1): T, D = np.shape(x) x_mask = np.ones((T, D), dtype=bool) J_ini, J_dyn_11, J_dyn_21, J_dyn_22 = self.dynamics.\ neg_hessian_expected_log_dynamics_prob(Ez, x, input, x_mask, tag) J_transitions = self.transitions.\ neg_hessian_expected_log_trans_prob(x, input, x_mask, tag, Ezzp1) J_dyn_11 += J_transitions J_obs = self.emissions.\ neg_hessian_log_emissions_prob(data, input, mask, tag, x, Ez) return J_ini, J_dyn_11, J_dyn_21, J_dyn_22, J_obs
def gradient_descent(g, w, alpha, max_its, beta, version): g_flat, unflatten, w = flatten_func(g, w) grad = compute_grad(g_flat) w_hist = [] w_hist.append(unflatten(w)) z = np.zeros((np.shape(w))) for k in range(max_its): grad_eval = grad(w) grad_eval.shape = np.shape(w) if version == 'normalized': grad_norm = np.linalg.norm(grad_eval) if grad_norm == 0: grad_norm += 10 ** -6 * np.sign(2 * np.random.rand(1) - 1) grad_eval /= grad_norm z = beta * z + grad_eval w = w - alpha * z w_hist.append(unflatten(w)) return w_hist
def KWfac(TempK, Pbar, RGas, WhichKs): """Calculate pressure correction factor for KW.""" TempC = convert.TempK2C(TempK) deltaV = np.full(np.shape(TempK), np.nan) Kappa = np.full(np.shape(TempK), np.nan) F = WhichKs == 8 # freshwater case # This is from Millero, 1983. deltaV = np.where(F, -25.6 + 0.2324 * TempC - 0.0036246 * TempC**2, deltaV) Kappa = np.where(F, (-7.33 + 0.1368 * TempC - 0.001233 * TempC**2) / 1000, Kappa) # Note: the temperature dependence of KappaK1 and KappaKW for freshwater # in Millero, 1983 are the same. F = WhichKs != 8 # GEOSECS doesn't include OH term, so this won't matter. # Peng et al didn't include pressure, but here I assume that the KW # correction is the same as for the other seawater cases. # This is from Millero, 1983 and his programs CO2ROY(T).BAS. deltaV = np.where(F, -20.02 + 0.1119 * TempC - 0.001409 * TempC**2, deltaV) # Millero, 1992 and Millero, 1995 have: Kappa = np.where(F, (-5.13 + 0.0794 * TempC) / 1000, Kappa) # Millero, 1983 # Millero, 1995 has this too, but Millero, 1992 is different. # Millero, 1979 does not list values for these. return Kfac(deltaV, Kappa, Pbar, TempK, RGas)
def gradient_descent(g, alpha, max_its, w, beta):
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)
    grad = compute_grad(g_flat)

    # record history
    w_hist = []
    w_hist.append(unflatten(w))

    # start gradient descent loop
    z = np.zeros((np.shape(w)))  # momentum term

    # over the line
    for k in range(max_its):
        # plug in value into func and derivative
        grad_eval = grad(w)
        grad_eval.shape = np.shape(w)

        # take (unnormalized) descent step with momentum
        z = beta * z + grad_eval
        w = w - alpha * z

        # record weight update
        w_hist.append(unflatten(w))

    return w_hist
def get_link_g(w, q, ln_q, ln_1_q, ln_s): w = w.reshape(-1, 3) n = numpy.shape(w)[0] g = numpy.zeros((n, 3)) for i in range(0, 3): tmp_grad = autograd.elementwise_grad(e_link_log_lik, i) g[:, i] = tmp_grad(w[:, 0].reshape(-1, 1), w[:, 1].reshape(-1, 1), w[:, 2].reshape(-1, 1), q, ln_q, ln_1_q, ln_s).ravel() return g.ravel()
def _line_search(self, x, dk):
    t = 1
    delta_x = dk
    grad_x = copy(-dk)
    f_x = self.objectiveFunction(x)
    if np.shape(f_x) != ():
        # objective returned a non-scalar; fall back to its squared norm
        print('warning: non-scalar objective value')
        f_x = np.dot(f_x.T, f_x)
    f_x_tdeltax = self.objectiveFunction(x + t * delta_x)
    if np.shape(f_x_tdeltax) != ():
        f_x_tdeltax = np.dot(f_x_tdeltax.T, f_x_tdeltax)
    # backtracking line search on the Armijo condition
    while not np.isclose(f_x_tdeltax,
                         f_x + self.alpha * t * (np.transpose(grad_x) @ delta_x),
                         rtol=1e-3):
        t = self.beta * t
        f_x_tdeltax = self.objectiveFunction(x + t * delta_x)
        if np.shape(f_x_tdeltax) != ():
            f_x_tdeltax = np.dot(f_x_tdeltax.T, f_x_tdeltax)
        if t < 2 * self.xtol:
            break
    return t, f_x_tdeltax
def test_random_point(self): # Just test that rand returns a point on the manifold and two # different matrices generated by rand aren't too close together n = self.n manifold = self.manifold x = manifold.random_point() assert np.shape(x) == (n, n) # Check symmetry np_testing.assert_allclose(x, multisym(x)) # Check positivity of eigenvalues w = np.linalg.eigvalsh(x) assert (w > [0]).all()
def minibatch_gradient_descent(g, alpha_choice, max_its, w, batch_size, num_pts):
    # flatten the input function, create gradient based on flat function
    g_flat, unflatten, w = flatten_func(g, w)

    # compute the gradient function of our input function - note this is a function too
    # that - when evaluated - returns both the gradient and function evaluations (remember,
    # as discussed in Chapter 3, we always get the function evaluation 'for free' when we
    # use an Automatic Differentiator to evaluate the gradient)
    gradient = value_and_grad(g_flat)

    # run the gradient descent loop
    weight_history = []  # container for weight history
    cost_history = []    # container for corresponding cost function history
    alpha = 0

    # record history
    weight_history.append(unflatten(w))
    cost_history.append(g_flat(w, np.arange(num_pts)))

    # how many mini-batches equal the entire dataset?
    num_batches = int(np.ceil(np.divide(num_pts, batch_size)))

    # over the line
    for k in range(max_its):
        # check if diminishing steplength rule used (k + 1 avoids division by zero at the first step)
        if alpha_choice == 'diminishing':
            alpha = 1 / float(k + 1)
        else:
            alpha = alpha_choice

        # loop over each minibatch
        for b in range(num_batches):
            # collect indices of current mini-batch
            batch_inds = np.arange(b * batch_size, min((b + 1) * batch_size, num_pts))

            # plug in value into func and derivative
            cost_eval, grad_eval = gradient(w, batch_inds)
            grad_eval.shape = np.shape(w)

            # take descent step
            w = w - alpha * grad_eval

        # record weight update
        weight_history.append(unflatten(w))
        cost_history.append(g_flat(w, np.arange(num_pts)))

    return weight_history, cost_history
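# Hedged usage sketch with synthetic data (not from the original source):
# minimize a mini-batch least-squares cost; the cost must accept (w, batch_inds).
import autograd.numpy as np

X = np.random.randn(100, 3)
y = np.dot(X, np.array([1.0, -2.0, 0.5]))

def least_squares(w, inds):
    return np.mean((np.dot(X[inds], w) - y[inds]) ** 2)

w_hist, cost_hist = minibatch_gradient_descent(
    least_squares, alpha_choice=0.1, max_its=50,
    w=np.zeros(3), batch_size=25, num_pts=100)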
def gradient_descent(g,w,x_train,y_train,alpha,max_its,batch_size,**kwargs): verbose = True if 'verbose' in kwargs: verbose = kwargs['verbose'] # flatten the input function, create gradient based on flat function g_flat, unflatten, w = flatten_func(g, w) grad = value_and_grad(g_flat) # record history num_train = y_train.shape[1] w_hist = [unflatten(w)] train_hist = [g_flat(w,x_train,y_train,np.arange(num_train))] # how many mini-batches equal the entire dataset? num_batches = int(np.ceil(np.divide(num_train, batch_size))) # over the line for k in range(max_its): # loop over each minibatch start = timer() train_cost = 0 for b in range(num_batches): # collect indices of current mini-batch batch_inds = np.arange(b*batch_size, min((b+1)*batch_size, num_train)) # plug in value into func and derivative cost_eval,grad_eval = grad(w,x_train,y_train,batch_inds) grad_eval.shape = np.shape(w) # take descent step with momentum w = w - alpha*grad_eval end = timer() # update training and validation cost train_cost = g_flat(w,x_train,y_train,np.arange(num_train)) # record weight update, train and val costs w_hist.append(unflatten(w)) train_hist.append(train_cost) if verbose == True: print ('step ' + str(k+1) + ' done in ' + str(np.round(end - start,1)) + ' secs, train cost = ' + str(np.round(train_hist[-1][0],4))) if verbose == True: print ('finished all ' + str(max_its) + ' steps') return w_hist,train_hist
def log_likelihood(beta, y, X, z, u, tauParams, N):
    # generate N*n particles
    inv_lognormal = 1. / generate_lognormal(tauParams, u)
    if np.isnan(inv_lognormal).any():
        raise ValueError('some nans in inv_lognormal')
    alpha = np.zeros(len(inv_lognormal))  # np.sqrt(inv_lognormal)*z
    print('mean inv lognormal')
    print(np.mean(inv_lognormal))
    count = 0
    ll = 0
    t = np.shape(y)[1]
    # iterate over participants
    for i in range(y.shape[0]):
        l_individual = likelihood_individual(beta, y[i, :], X[i, :, :],
                                             alpha[i * N:(i + 1) * N])
        ll += np.log(l_individual)
    return ll
def quick_grad_check(fun, arg0, extra_args=(), kwargs={}, verbose=True, eps=EPS, rtol=RTOL, atol=ATOL, rs=None): """Checks the gradient of a function (w.r.t. to its first arg) in a random direction""" if verbose: print("Checking gradient of {0} at {1}".format(fun, arg0)) if rs is None: rs = np.random.RandomState() random_dir = rs.standard_normal(np.shape(arg0)) random_dir = random_dir / np.sqrt(np.sum(random_dir * random_dir)) unary_fun = lambda x : fun(arg0 + x * random_dir, *extra_args, **kwargs) numeric_grad = unary_nd(unary_fun, 0.0, eps=eps) analytic_grad = np.sum(grad(fun)(arg0, *extra_args, **kwargs) * random_dir) assert np.allclose(numeric_grad, analytic_grad, rtol=rtol, atol=atol), \ "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad) if verbose: print("Gradient projection OK (numeric grad: {0}, analytic grad: {1})".format( numeric_grad, analytic_grad))
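# Example usage (illustrative): verify the gradient of a simple quadratic in a
# random direction.
import autograd.numpy as np

quick_grad_check(lambda w: np.sum(w ** 2), np.array([1.0, 2.0, 3.0]))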
return 0.5 * (np.tril(mat) + np.triu(mat, 1).T) elif len(mat.shape) == 3: return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2)) else: raise ArithmeticError def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, g * ans * np.linalg.solve(cov, x - mean)), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov: unbroadcast(vs, gvs, -g * ans * covgrad(x, mean, cov)), argnum=2) entropy.defvjp_is_zero(argnums=(0,)) entropy.defvjp(lambda g, ans, vs, gvs, mean, cov: unbroadcast(vs, gvs, 0.5 * g * np.linalg.inv(cov).T), argnum=1)
return 0.5 * (np.tril(mat) + np.triu(mat, 1).T) elif len(mat.shape) == 3: return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2)) else: raise ArithmeticError def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, x, lambda g: -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, mean, lambda g: np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, x, lambda g: -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, mean, lambda g: g * ans * np.linalg.solve(cov, x - mean)), argnum=1) pdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, cov, lambda g: -g * ans * covgrad(x, mean, cov)), argnum=2) entropy.defgrad_is_zero(argnums=(0,)) entropy.defgrad(lambda ans, mean, cov: unbroadcast(ans, cov, lambda g: 0.5 * g * np.linalg.inv(cov).T), argnum=1)
assert shape(a) == shape(b) return binop(a, b) return wrapped make_binop = (lambda make_binop: lambda *args: add_binop_size_check(make_binop(*args)))(make_binop) add = make_binop(operator.add, tuple) sub = make_binop(operator.sub, tuple) mul = make_binop(operator.mul, tuple) div = make_binop(operator.truediv, tuple) allclose = make_binop(np.allclose, all) contract = make_binop(inner, sum) shape = make_unop(np.shape, tuple) unbox = make_unop(getval, tuple) sqrt = make_unop(np.sqrt, tuple) square = make_unop(lambda a: a**2, tuple) randn_like = make_unop(lambda a: npr.normal(size=np.shape(a)), tuple) zeros_like = make_unop(lambda a: np.zeros(np.shape(a)), tuple) flatten = make_unop(lambda a: np.ravel(a), np.concatenate) scale = make_scalar_op(operator.mul, tuple) add_scalar = make_scalar_op(operator.add, tuple) norm = lambda x: np.sqrt(contract(x, x)) rand_dir_like = lambda x: scale(1./norm(x), randn_like(x)) isobjarray = lambda x: isinstance(x, np.ndarray) and x.dtype == np.object tuplify = Y(lambda f: lambda a: a if not istuple(a) and not isobjarray(a) else tuple(map(f, a))) depth = Y(lambda f: lambda a: np.ndim(a) if not istuple(a) else 1+(min(map(f, a)) if len(a) else 1))
def randn_like(x): return npr.RandomState(0).randn(*np.shape(x))
def unpack_gaussian_params(params): # Variational dist is a diagonal Gaussian. D = np.shape(params)[0] // 2 mean, log_std = params[:D], params[D:] return mean, log_std
J = np.linalg.inv(cov) solved = np.matmul(J, np.expand_dims(x - mean, -1)) return 1./2 * (generalized_outer_product(solved) - J) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve defvjp(logpdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(x, lambda g: -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(mean, lambda g: np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular))) # Same as log pdf, but multiplied by the pdf (ans). defvjp(pdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(x, lambda g: -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(mean, lambda g: np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(cov, lambda g: -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular))) defvjp(entropy, None, lambda ans, mean, cov: unbroadcast_f(cov, lambda g: 0.5 * g * np.linalg.inv(cov).T))
def sample_diag_gaussian(params, num_samples, rs): mean, log_std = unpack_gaussian_params(params) D = np.shape(mean)[0] return rs.randn(num_samples, D) * np.exp(log_std) + mean
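# Usage sketch (illustrative), assuming the one-dimensional unpack_gaussian_params
# above: an all-zero parameter vector encodes a 2-D standard normal
# (zero mean, zero log-std).
import numpy as np
import numpy.random as npr

samples = sample_diag_gaussian(np.zeros(4), num_samples=5, rs=npr.RandomState(0))
print(samples.shape)  # (5, 2)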
elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov, allow_singular=False): if allow_singular: raise NotImplementedError("The multivariate normal pdf is not " "differentiable w.r.t. a singular covariance matix") # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=0) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)), argnum=2) entropy.defvjp_is_zero(argnums=(0,)) entropy.defvjp(lambda g, ans, vs, gvs, mean, cov: unbroadcast(vs, gvs, 0.5 * g * np.linalg.inv(cov).T), argnum=1)
def getshape(val): val = getval(val) assert np.isscalar(val) or isinstance(val, np.ndarray), \ 'Jacobian requires input and output to be scalar- or array-valued' return np.shape(val)
def unpack_gaussian_params(params): # Params of a diagonal Gaussian. D = np.shape(params)[-1] // 2 mean, log_std = params[:, :D], params[:, D:] return mean, log_std
return 0.5 * (np.tril(mat) + np.triu(mat, 1).T) elif len(mat.shape) == 3: return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2)) else: raise ArithmeticError def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, x, lambda g: -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, mean, lambda g: np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, x, lambda g: -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, mean, lambda g: g * ans * np.linalg.solve(cov, x - mean)), argnum=1) pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False: unbroadcast(ans, cov, lambda g: -g * ans * covgrad(x, mean, cov)), argnum=2) entropy.defgrad_is_zero(argnums=(0,)) entropy.defgrad(lambda ans, mean, cov: unbroadcast(ans, cov, lambda g: 0.5 * g * np.linalg.inv(cov).T), argnum=1)