def RBF(self, hyper, xi, xj=None):
    """Squared-exponential kernel between the rows of ``xi`` and ``xj``.

    ``hyper[0]`` is the multiplicative amplitude and ``hyper[1:]`` holds one
    factor per input dimension. The inputs are *multiplied* by these factors
    before differencing, so they act as inverse length scales.
    When ``xj`` is omitted, ``xi`` is compared with itself.

    Returns an array with one row per row of ``xi`` and one column per row
    of ``xj``.
    """
    other = xi if xj is None else xj
    amplitude = np.array(hyper[0])
    scales = np.array(hyper[1:])

    scaled_rows = np.expand_dims(xi * scales, 1)
    scaled_cols = np.expand_dims(other * scales, 0)
    sq_dist = np.sum((scaled_rows - scaled_cols) ** 2, axis=2)
    return amplitude * np.exp(-0.5 * sq_dist)
def line_point_dist(lines, ps):
    """
    Closest distance from a point to a line segment given by two points (a, b).

    ``lines`` must end in a (2, 2) shape (two end points, two coordinates) and
    ``ps`` must end in a length-2 axis. Both may be batches; according to the
    original author's note the distance of each line/point combination is
    returned with shape lines.shape[:-2] + ps.shape[:-1].

    NOTE(review): ``norm`` and ``cross`` are helpers defined elsewhere; this
    code presumably relies on them reducing over the last axis -- confirm.
    """
    assert(lines.shape[-2:] == (2, 2))
    assert(ps.shape[-1] == 2)

    start = lines[..., 0, :]
    end = lines[..., 1, :]
    # One singleton axis per point axis so that segments broadcast against points.
    n_point_axes = max(len(ps.shape) - 1, 1)
    for _ in range(n_point_axes):
        start = np.expand_dims(start, -2)
        end = np.expand_dims(end, -2)

    segment = end - start
    length = norm(segment)
    direction = segment / np.expand_dims(length, -1)
    offset = ps - start

    # Signed distance along the segment and absolute distance perpendicular to it.
    along = np.sum(direction * offset, axis=-1)
    across = np.abs(cross(direction, offset))
    assert(along.shape == across.shape)

    to_nearest_end = np.min(np.array([norm(offset), norm(ps - end)]), axis=0)

    # Points that project onto the segment use the perpendicular distance,
    # all others use the distance to the closer end point.
    projects_inside = (0 <= along) & (along <= length)
    return np.where(projects_inside, across, to_nearest_end)
def periodic_kernel(x, xstar, hyp):
    """
    Periodic kernel function for Gaussian processes.

    x:     input data with shape (N, d)
    xstar: input data with shape (Nstar, d)
    hyp:   (log(sigma_f), log(l1), log(l2), ..., log(period)) with shape (d+2,)

    returns: a covariance matrix with shape (N, Nstar)
    """
    amplitude = np.exp(hyp[0])
    length_scales = np.exp(hyp[1:-1])  # shape (d,), broadcast over (N, Nstar, d)
    period = np.exp(hyp[-1])

    separation = np.abs(np.expand_dims(x, 1) - np.expand_dims(xstar, 0))
    warped = np.sin(np.pi * separation / period) / length_scales  # (N, Nstar, d)
    exponent = -2 * (warped ** 2).sum(axis=2)  # (N, Nstar)
    return amplitude * np.exp(exponent)
def rbf_covariance(kernel_params, x, xp):
    """RBF covariance for inputs whose feature axis comes first.

    ``kernel_params[0]`` is the log output scale and ``kernel_params[1:]`` the
    log length scales, one per row of ``x`` / ``xp``. The squared differences
    are summed over axis 0, so the result has one row per column of ``x`` and
    one column per column of ``xp``.
    """
    amplitude = np.exp(kernel_params[0])
    scale_column = np.exp(kernel_params[1:])[:, np.newaxis]

    left = np.expand_dims(x / scale_column, 2)
    right = np.expand_dims(xp / scale_column, 1)
    sq_dist = np.sum((left - right) ** 2, axis=0)
    return amplitude * np.exp(-0.5 * sq_dist)
def angle_axis_rotation_matrix(angle, axis, axis_already_normalized=False):
    """Rotation matrix for a rotation by ``angle`` about ``axis``.

    Implements
    https://en.wikipedia.org/wiki/Rotation_matrix#Rotation_matrix_from_axis_and_angle

    Inputs:
      * angle: one angle or a vector (1d ndarray) of angles, in radians.
      * axis: a 1d numpy array of length 3 (x, y, z) giving the rotation axis.
      * axis_already_normalized: if True, skip normalising ``axis`` for speed.

    Outputs:
      * If angle is a scalar, a 3x3 rotation matrix.
      * If angle is a vector of N angles, a 3x3xN array of rotation matrices.
    """
    unit = axis if axis_already_normalized else axis / np.linalg.norm(axis)
    sin_t = np.sin(angle)
    cos_t = np.cos(angle)

    # Cross-product (skew-symmetric) matrix of the rotation axis.
    skew = np.array([[0, -unit[2], unit[1]],
                     [unit[2], 0, -unit[0]],
                     [-unit[1], unit[0], 0]])
    dyad = np.outer(unit, unit)
    identity = np.eye(3)

    if len(np.array(angle).shape) != 0:
        # Vector of angles: add a trailing axis so each term broadcasts to 3x3xN.
        identity = np.expand_dims(identity, 2)
        skew = np.expand_dims(skew, 2)
        dyad = np.expand_dims(dyad, 2)

    return cos_t * identity + sin_t * skew + (1 - cos_t) * dyad
def log_py_zM_ord_j(lambda_ord_j, y_oh_j, zM, k, nj_ord_j):
    ''' Compute log p(y_j | zM, s1 = k1) of each ordinal variable

    lambda_ord_j ( (nj_ord_j + r - 1) 1darray): Coefficients of the ordinal
        distribution: the first nj_ord_j - 1 entries are the thresholds, the
        last r entries the slopes on z
    y_oh_j (ndarray): The jth ordinal variable; the code transposes it and
        multiplies it with the per-category probabilities, so it is presumably
        one-hot encoded as numobs x nj_ord_j -- TODO confirm
    zM (M x r x k ndarray): M Monte Carlo copies of z for each component k1
        of the mixture
    k (int): The number of components of the mixture
    nj_ord_j (int): The number of possible values of the jth ordinal variable
    --------------------------------------------------------------
    returns (ndarray): The log p(y_j | zM, s1 = k1) for the jth ordinal variable
    '''
    n_mc = zM.shape[0]
    n_latent = zM.shape[1]
    tiny = 1E-10  # Numeric stability
    n_thresholds = nj_ord_j - 1

    thresholds = lambda_ord_j[:n_thresholds].reshape((n_thresholds, 1, 1, 1))
    slopes = lambda_ord_j[-n_latent:].reshape((1, n_latent, 1))

    linear_part = np.transpose(zM, (0, 2, 1)) @ slopes
    cumulative = expit(thresholds - linear_part[np.newaxis])

    # Category probabilities are differences of consecutive cumulative
    # probabilities, padded with 0 below and 1 above.
    pad_shape = (1, n_mc, k, 1)
    lower = np.concatenate([np.zeros(pad_shape), cumulative])
    upper = np.concatenate([cumulative, np.ones(pad_shape)])
    probs = upper - lower
    probs = np.where(probs <= 0, tiny, probs)
    probs = np.where(probs >= 1, 1 - tiny, probs)

    indicators = np.expand_dims(y_oh_j.T, 1)[..., np.newaxis, np.newaxis]
    weighted_log = indicators * np.log(np.expand_dims(probs, axis=2))
    return weighted_log.sum((0))
def sample(self, n, seed=3):
    """Draw ``n`` points from each of the three Gaussians described by ``self.params``.

    Every Gaussian has an isotropic covariance (``sig*`` times the identity)
    and a mean that is zero except for its first coordinate (``mu*``).
    The frozen distributions are stored on ``self`` as ``p0``, ``p1`` and ``q``.
    Sampling happens inside ``NumpySeedContext(seed=seed)``.

    Returns a tuple of three ``Data`` objects (samples of p0, p1 and q), each
    wrapping a 2-d array.
    """
    dim = self.d
    params = self.params

    def first_coordinate_mean(value):
        vec = np.zeros((dim))
        vec[0] = value
        return vec

    def as_2d(draws):
        # rvs returns a 1-d array in some cases; promote it to a column.
        return np.expand_dims(draws, axis=1) if draws.ndim == 1 else draws

    mu0, cov0 = first_coordinate_mean(params['mu0']), params['sig0'] * np.eye(dim)
    mu1, cov1 = first_coordinate_mean(params['mu1']), params['sig1'] * np.eye(dim)
    muR, covR = first_coordinate_mean(params['muR']), params['sigR'] * np.eye(dim)

    with NumpySeedContext(seed=seed):
        self.p0 = stats.multivariate_normal(mu0, cov0)
        self.p1 = stats.multivariate_normal(mu1, cov1)
        self.q = stats.multivariate_normal(muR, covR)
        X = self.p0.rvs(size=n)
        Y = self.p1.rvs(size=n)
        Q = self.q.rvs(size=n)

    return Data(as_2d(X)), Data(as_2d(Y)), Data(as_2d(Q))
def RBF_eKdK(mu, sigma, X, lengthscales=None, kernel_variance=1):
    """
    x ~ N(mu, sigma), Dx1
    X is DxM
    Return E_x [ k(X, x) * dk(x, X) ], an M x (D x M) array

    Unit length scales (a D x 1 column of ones) are used when
    ``lengthscales`` is None.
    """
    if lengthscales is None:
        lengthscales = np.ones((mu.shape[0], 1))
    shared = dict(mu=mu, sigma=sigma, X=X,
                  lengthscales=lengthscales,
                  kernel_variance=kernel_variance)

    first_moment = RBF_exKK(**shared)   # d x m1 x m2
    zeroth_moment = RBF_eKK(**shared)   # m1 x m2

    # exKK is tied to the first kernel argument while X is the second argument
    # of the derivative kernel, so X is expanded to vary along the m2 axis.
    X_along_m2 = np.expand_dims(X, axis=1)
    centred = first_moment - X_along_m2 * np.expand_dims(zeroth_moment, axis=0)
    scaled = centred / ((lengthscales**2)[:, :, None])   # d x m1 x m2

    # Reorder to m1 - d - m2 and flatten the last two axes into M x (DM).
    n_points = X.shape[1]
    return np.reshape(scaled.swapaxes(0, 1), (n_points, -1), order='F')
def RBF_exxKK(mu, sigma, X, lengthscales=None, kernel_variance=1):
    """
    x ~ N(mu, sigma), Dx1
    X is DxM
    Return E_x [ (x*x.T) * k(X, x) * k(x, X) ], a D x D x M x M array

    Unit length scales (a D x 1 column of ones) are used when
    ``lengthscales`` is None.
    """
    if lengthscales is None:
        lengthscales = np.ones((mu.shape[0], 1))

    # M x M array
    eKK = RBF_eKK(mu=mu, sigma=sigma, X=X,
                  lengthscales=lengthscales,
                  kernel_variance=kernel_variance)

    half_sq_scales = (lengthscales**2) / 2
    # D x D array (elementwise combination of the two inverse variances)
    var_gauss = 1 / (1 / half_sq_scales + (1 / sigma))

    X_col = X[:, :, None]
    X_pairwise_sums = X_col + X_col.swapaxes(1, 2)

    # D x M x M array
    weighted_centres = (X_pairwise_sums / 2) / (half_sq_scales[:, :, None])
    weighted_prior = (mu / sigma)[:, :, None]
    mean_gauss = (weighted_centres + weighted_prior) * (var_gauss[:, :, None])

    # D x D x M x M array
    mean_outer = np.expand_dims(mean_gauss, axis=1) * np.expand_dims(
        mean_gauss, axis=0)

    eKK_4d = np.expand_dims(np.expand_dims(eKK, axis=0), axis=0)
    return eKK_4d * (var_gauss[:, :, None, None] + mean_outer)
def generate_text(n, model):
    """Generate ``n`` symbols from ``model`` after priming it with a fixed prompt.

    The prompt depends on the module-level ``PAN_TADEUSZ`` flag. Each prompt
    symbol is fed through the network to warm up the recurrent state; then,
    starting from a space, each sampled symbol is fed back in as the next
    input. Returns the prompt followed by the generated text.
    """
    if PAN_TADEUSZ:
        start = "Jam jest Jacek"
    else:
        start = "no it was n't black monday"
    raw_start = [c2i[ch] for ch in start]

    feed = model.reset_state(1)
    feed[model.scope.is_training] = False
    fetches = [model.predicts[0], model.first_memory]

    def run_step(symbol):
        # Shape the one-hot vector as a batch of one sequence of one step.
        feed[model.batch] = np.expand_dims(
            np.expand_dims(onehot(symbol), axis=0), axis=0)
        return sess.run(fetches, feed_dict=feed)

    def carry_state(state):
        # Feed each cell's (h, c) pair back in for the next step.
        for idx, cell in enumerate(model.cells):
            feed[cell.h_t] = state[idx][0]
            feed[cell.c_t] = state[idx][1]

    for symbol in raw_start:
        predicts, state = run_step(symbol)
        carry_state(state)

    generated = []
    letter = c2i[" "]
    for _ in range(n):
        predicts, state = run_step(letter)
        letter = get_letter(predicts)
        carry_state(state)
        generated.append(i2c[letter])
    return start + ''.join(generated)
def kernel(self, x, xp, hyp):
    """Isotropic RBF kernel between the rows of ``x`` and ``xp``.

    ``hyp[0]`` is the log output scale and ``hyp[1]`` the log length scale.
    The length scale is printed on every call (kept from the original).
    """
    amplitude = np.exp(hyp[0])
    lengthscales = np.exp(hyp[1])
    print(lengthscales)

    rows = np.expand_dims(x / lengthscales, 1)
    cols = np.expand_dims(xp / lengthscales, 0)
    sq_dist = np.sum((rows - cols) ** 2, axis=2)
    return amplitude * np.exp(-0.5 * sq_dist)
def log_py_zM_categ_j(lambda_categ_j, y_categ_j, zM, k, nj_categ_j):
    ''' Compute log p(y_j | zM, s1 = k1) of each categorical variable

    lambda_categ_j (nj_categ x (r + 1) ndarray): Coefficients of the
        categorical distributions in the GLLVM layer (column 0 is the
        constant, the remaining r columns are the slopes on z)
    y_categ_j (ndarray): The jth categorical variable; the code reads
        y_categ_j.shape[1] as the number of categories, so it is presumably
        one-hot encoded as numobs x nj_categ_j -- TODO confirm
    zM (M x r x k ndarray): M Monte Carlo copies of z for each component k1
        of the mixture
    k (int): The number of components of the mixture
    nj_categ_j (int): The number of possible values of the jth categorical
        variable
    --------------------------------------------------------------
    returns (M x numobs x k ndarray): The log p(y_j | zM, s1 = k1) for the
        jth categorical variable
    '''
    epsilon = 1E-10
    r = zM.shape[1]
    nj = y_categ_j.shape[1]

    # (M, k, 1, 1, r): ready to be multiplied with one slope vector per category
    zM_broad = np.expand_dims(np.expand_dims(np.transpose(zM, (0, 2, 1)), 2), 3)
    lambda_categ_j_ = lambda_categ_j.reshape(nj, r + 1, order='C')

    # TODO (original author): check that the product runs over r and not k
    eta = zM_broad @ lambda_categ_j_[:, 1:][n_axis, n_axis, ..., n_axis]
    eta = eta + lambda_categ_j_[:, 0].reshape(1, 1, nj_categ_j, 1, 1)  # Add the constant

    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype.
    pi = softmax_(eta.astype(float), axis=2)

    # Numeric stability
    pi = np.where(pi <= 0, epsilon, pi)
    pi = np.where(pi >= 1, 1 - epsilon, pi)

    yg = np.expand_dims(np.expand_dims(y_categ_j, 1), 1)[..., np.newaxis, np.newaxis]
    log_p_y_z = yg * np.log(pi[n_axis])

    # Reshaping output
    log_p_y_z = log_p_y_z.sum((3))          # Summing over the modalities nj
    log_p_y_z = log_p_y_z[:, :, :, 0, 0]    # Deleting useless axes

    return np.transpose(log_p_y_z, (1, 0, 2))
def make_pinwheel_data(num_spokes=5, points_per_spoke=40, rate=1.0, noise_std=0.005):
    """Make synthetic data in the shape of a pinwheel.

    Returns an array of shape (num_spokes * points_per_spoke, 2). The random
    generator is seeded with 0, so the output is deterministic. Noise for all
    x coordinates is drawn before noise for the y coordinates.
    """
    rng = npr.RandomState(0)
    radii = np.linspace(0.1, 1, points_per_spoke)
    spoke_angles = np.linspace(0, 2 * np.pi, num_spokes + 1)[:-1]

    def noisy_coordinate(trig):
        # One coordinate (cos -> x, sin -> y) for every spoke, spokes concatenated.
        return np.concatenate([
            radii * trig(angle + radii * rate) + noise_std * rng.randn(len(radii))
            for angle in spoke_angles
        ])

    xs = noisy_coordinate(np.cos)
    ys = noisy_coordinate(np.sin)
    return np.concatenate([np.expand_dims(xs, 1), np.expand_dims(ys, 1)], axis=1)
def fwd_grad_logsumexp(g, ans, x, axis=None, b=1.0, keepdims=False):
    """Forward-mode derivative of logsumexp.

    ``g`` is the tangent of ``x`` and ``ans`` the already computed logsumexp.
    When the reduction dropped axes (``keepdims`` false), ``ans`` gets those
    axes re-inserted so that it broadcasts against ``x``.
    """
    if not keepdims:
        if isinstance(axis, int):
            dropped_axes = (axis,)
        elif isinstance(axis, tuple):
            dropped_axes = tuple(sorted(axis))
        else:
            dropped_axes = ()
        for ax in dropped_axes:
            ans = anp.expand_dims(ans, ax)
    weights = b * anp.exp(x - ans)
    return anp.sum(g * weights, axis=axis, keepdims=keepdims)
def vec_compute_params(self, params, x, xp):
    """Evaluate ``hvar + var * (x - off) * (xp - off)`` for every parameter row.

    ``params`` is a 2-d array whose columns 0, 1 and 2 are read as ``var``,
    ``hvar`` and ``off``. The shifted ``x`` term is laid out along the rows
    and the shifted ``xp`` term along the columns.

    The original body referred to an undefined name ``y`` while ignoring the
    ``xp`` parameter; ``xp`` is used here for the second term.
    """
    var = params[:, 0]
    hvar = params[:, 1]
    off = params[:, 2]
    diffs = agnp.expand_dims(var * (x - off), 1) * agnp.expand_dims(
        (xp - off), 0)
    return hvar + diffs
def fwd_grad_logsumexp(g, ans, x, axis=None, b=1.0, keepdims=False):
    """Forward-mode derivative of logsumexp.

    ``g`` is the tangent of ``x``; ``ans`` is the logsumexp value. If the
    reduced axes were dropped, they are re-inserted into ``ans`` (in ascending
    order) so that ``x - ans`` broadcasts correctly.
    """
    needs_expansion = not keepdims
    if needs_expansion and isinstance(axis, int):
        ans = np.expand_dims(ans, axis)
    elif needs_expansion and isinstance(axis, tuple):
        for ax in sorted(axis):
            ans = np.expand_dims(ans, ax)
    softmax_weights = np.exp(x - ans)
    return np.sum(g * b * softmax_weights, axis=axis, keepdims=keepdims)
def non_uniform_approx_nearest(points, values):
    """Approximate derivatives using nearest points in non-uniform grid.

    For every point, the ``triangular(ndim + 1)`` points with the smallest
    Euclidean distance (the point itself included) are selected and handed to
    ``taylor_approx`` together with their values.
    """
    n_neighbors = triangular(points.shape[-1] + 1)
    pairwise_offsets = np.expand_dims(points, axis=0) - np.expand_dims(points, axis=1)
    pairwise_dist = np.linalg.norm(pairwise_offsets, axis=-1)
    # argpartition places the n_neighbors smallest distances first (unordered).
    neighbor_idx = np.argpartition(pairwise_dist, n_neighbors)[..., :n_neighbors]
    return taylor_approx(points, points[neighbor_idx], values[neighbor_idx])
def exp_sin_squared(x, y, a, b, c):
    """Periodic (exp-sine-squared style) kernel between rows of ``x`` and ``y``.

    Computes ``b**2 * exp(-2 * sin(pi * s / c)**2 / a**2)`` where ``s`` is the
    squared Euclidean distance between the two rows. ``y=None`` compares ``x``
    with itself.

    NOTE(review): the sine is applied to the *squared* distance, not the
    distance itself -- confirm that this is intended.
    """
    other = x if y is None else y
    offsets = np.expand_dims(x, 1) - np.expand_dims(other, 0)
    sqdist = np.sum(offsets**2, axis=2)
    assert np.all(sqdist>=0),sqdist[sqdist<0]
    sin_sq = np.sin(sqdist/c*np.pi)**2
    return b*b*np.exp(-sin_sq/a**2*2)
def differentiate(self):
    """Return the cost and dynamics terms replicated over the ``self.T`` time steps.

    Each of ``cost_fn.C``, ``dyn_model.F`` (matrices) and ``cost_fn.c``,
    ``dyn_model.f`` (vectors) gets a new leading axis of length ``self.T``.

    The original used ``np.repeat(arr, [T, 1, 1])`` without an ``axis``; that
    flattens the array and treats the list as per-element repeat counts. The
    tuple describes repetitions per axis, which is what ``np.tile`` implements.

    Returns:
        (C, F, c, f) with shapes (T,) + original shape.
    """
    T = self.T
    C = np.tile(np.expand_dims(self.cost_fn.C, axis=0), (T, 1, 1))
    F = np.tile(np.expand_dims(self.dyn_model.F, axis=0), (T, 1, 1))
    c = np.tile(np.expand_dims(self.cost_fn.c, axis=0), (T, 1))
    f = np.tile(np.expand_dims(self.dyn_model.f, axis=0), (T, 1))
    return C, F, c, f
def rbf_covariance(self, X, y=None, signal_variance=1.0, length_scale=1.0):
    """RBF covariance between the rows of ``X`` and the rows of ``y``.

    ``y=None`` gives the covariance of ``X`` with itself. Inputs are divided
    by ``length_scale`` before differencing and the result is scaled by
    ``signal_variance``.
    """
    other = X if y is None else y
    rows = np.expand_dims(X / length_scale, 1)
    cols = np.expand_dims(other / length_scale, 0)
    sq_dist = np.sum((rows - cols)**2, axis=2)
    return signal_variance * np.exp(-0.5 * sq_dist)
def gp0(self, m, s):
    """
    Compute joint predictions for MGP with uncertain inputs.

    ``m`` and ``s`` are presumably the mean and covariance of the uncertain
    test input -- TODO confirm against the caller.
    Requires ``self.hyp``; calls ``self.cache()`` first when ``self.K`` has
    not been set. Reads ``self.inputs``, ``self.targets``, ``self.iK`` and
    ``self.alpha``.

    Returns ``(M, S, V)``: the values computed under "predicted mean and IO
    covariance" (``M``, ``V``) and "predicted covariance" (``S``) below.
    """
    assert hasattr(self, "hyp")
    if not hasattr(self, "K"):
        self.cache()

    x = np.atleast_2d(self.inputs)
    y = np.atleast_2d(self.targets)
    n, D = x.shape  # D: number of input dimensions
    n, E = y.shape  # E: number of target dimensions (n is overwritten)

    # One row of hyper-parameters per target dimension; columns [:D] and
    # column D are exponentiated below, so they are presumably stored as logs
    # (length scales and signal std) -- TODO confirm.
    X = self.hyp
    iK = self.iK
    beta = self.alpha

    m = np.atleast_2d(m)
    inp = x - m  # training inputs centred on the test mean

    # Compute the predicted mean and IO covariance.
    iL = np.stack([np.diag(exp(-X[i, :D])) for i in range(E)])
    iN = np.matmul(inp, iL)
    B = iL @ s @ iL + np.eye(D)
    t = np.stack([solve(B[i].T, iN[i].T).T for i in range(E)])
    q = exp(-np.sum(iN * t, 2) / 2)
    qb = q * beta.T
    tiL = np.matmul(t, iL)
    c = exp(2 * X[:, D]) / sqrt(det(B))

    M = np.sum(qb, 1) * c
    V = (np.transpose(tiL, [0, 2, 1]) @ np.expand_dims(qb, 2)).reshape(
        E, D).T * c
    k = 2 * X[:, D].reshape(E, 1) - np.sum(iN**2, 2) / 2

    # Compute the predicted covariance.
    inp = np.expand_dims(inp, 0) / np.expand_dims(exp(2 * X[:, :D]), 1)
    # ii / ij replicate the scaled inputs along a new axis of length E so that
    # every pair of target dimensions can be combined.
    ii = np.repeat(inp[:, newaxis, :, :], E, 1)
    ij = np.repeat(inp[newaxis, :, :, :], E, 0)

    iL = np.stack([np.diag(exp(-2 * X[i, :D])) for i in range(E)])
    siL = np.expand_dims(iL, 0) + np.expand_dims(iL, 1)
    R = np.matmul(s, siL) + np.eye(D)
    t = 1 / sqrt(det(R))
    # Solve R[i] @ Z = s for each of the E*E pairwise matrices.
    iRs = np.stack(
        [solve(R.reshape(-1, D, D)[i], s) for i in range(E * E)])
    iRs = iRs.reshape(E, E, D, D)
    # ``maha`` is defined elsewhere; presumably a Mahalanobis-type quadratic
    # form -- TODO confirm.
    Q = exp(k[:, newaxis, :, newaxis] + k[newaxis, :, newaxis, :] +
            maha(ii, -ij, iRs / 2))

    S = np.einsum('ji,iljk,kl->il', beta, Q, beta)
    tr = np.hstack([np.sum(Q[i, i] * iK[i]) for i in range(E)])
    S = (S - np.diag(tr)) * t + np.diag(exp(2 * X[:, D]))
    # Subtract the outer product of the predicted mean with itself.
    S = S - np.matmul(M[:, newaxis], M[newaxis, :])

    return M, S, V
def predict_mean(self, x_new):
    """Predictive mean at ``x_new`` built from per-dimension kernels.

    For every input dimension ``d`` the kernel ``self.kernels[d]`` is
    evaluated between the unique training values of that dimension and the
    matching column of ``x_new``. The per-dimension results are combined with
    ``kron_list``, weighted by ``self.alpha`` and shifted by ``self.mu[0]``.

    The original looped with ``for d in self.X.shape[1]``, which tries to
    iterate over an integer and raises ``TypeError``; the loop now runs over
    ``range(self.X.shape[1])``.
    """
    n_dims = self.X.shape[1]
    k_dims = [
        self.kernels[d].eval(
            self.kernels[d].params,
            np.expand_dims(np.unique(self.X[:, d]), 1),
            np.expand_dims(x_new[:, d], 1))
        for d in range(n_dims)
    ]
    kx = np.squeeze(kron_list(k_dims))
    mean = np.sum(np.multiply(kx, self.alpha)) + self.mu[0]
    return mean
def kernel1(self, x, xp, hyp, active_dims=None):
    """RBF kernel over selected input dimensions (feature axis first).

    ``x`` and ``xp`` are indexed along axis 0 by ``active_dims`` (all
    ``self.dim`` dimensions when it is None). ``hyp[0]`` is the log output
    scale and ``hyp[1:]`` the log length scales; a small constant is added to
    the length scales before dividing.
    """
    dims = np.arange(self.dim) if active_dims is None else active_dims
    amplitude = np.exp(hyp[0])
    scales = np.exp(hyp[1:]) + 0.000001

    def rescale(points):
        # Transpose so the per-dimension scales broadcast along the last axis.
        return (points[dims].T / scales).T

    offsets = np.expand_dims(rescale(x), 2) - np.expand_dims(rescale(xp), 1)
    return amplitude * np.exp(-0.5 * np.sum(offsets**2, axis=0))
def make_pinwheel_data(num_classes, num_per_class, rate=2.0, noise_std=0.001):
    """Generate 2-d pinwheel data with ``num_classes`` arms of ``num_per_class`` points.

    The radius of every point is multiplied by ``rate``, which also controls
    how strongly the arms twist. The generator is seeded with 0; noise for all
    x coordinates is drawn before noise for the y coordinates.
    Returns an array of shape (num_classes * num_per_class, 2).
    """
    rng = npr.RandomState(0)
    radii = np.linspace(0.1, 1, num_per_class)
    arm_angles = np.linspace(0, 2*np.pi, num_classes+1)[:-1]

    columns = []
    for trig in (np.cos, np.sin):  # x column first, then y column
        arms = [rate * radii * trig(angle + radii * rate)
                + noise_std * rng.randn(num_per_class)
                for angle in arm_angles]
        columns.append(np.expand_dims(np.concatenate(arms), 1))
    return np.concatenate(columns, axis=1)
def mykron(A, B):
    """
    Efficient Kronecker product of two 2-d arrays.

    Forms the 4-d array of all pairwise products by broadcasting and reshapes
    it to (rows_A * rows_B, cols_A * cols_B).
    """
    rows_a, cols_a = A.shape
    rows_b, cols_b = B.shape
    pairwise = A[:, np.newaxis, :, np.newaxis] * B[np.newaxis, :, np.newaxis, :]
    return np.reshape(pairwise, (rows_a * rows_b, cols_a * cols_b))
def make_pinwheel_data(num_spokes=5, points_per_spoke=40, rate=1.0, noise_std=0.005):
    """Make synthetic data in the shape of a pinwheel.

    The result has shape (num_spokes * points_per_spoke, 2). A generator
    seeded with 0 supplies the noise, first for every x coordinate, then for
    every y coordinate.
    """
    spoke_angles = np.linspace(0, 2 * np.pi, num_spokes + 1)[:-1]
    rng = npr.RandomState(0)
    radii = np.linspace(0.1, 1, points_per_spoke)
    n_points = len(radii)

    x_parts = []
    for angle in spoke_angles:
        x_parts.append(radii * np.cos(angle + radii * rate)
                       + noise_std * rng.randn(n_points))

    y_parts = []
    for angle in spoke_angles:
        y_parts.append(radii * np.sin(angle + radii * rate)
                       + noise_std * rng.randn(n_points))

    xs = np.expand_dims(np.concatenate(x_parts), 1)
    ys = np.expand_dims(np.concatenate(y_parts), 1)
    return np.concatenate([xs, ys], axis=1)
def softmax(x):
    """Numerically stable softmax along axis 1.

    Input is promoted to at least 2-d, the maximum along axis 1 is subtracted
    before exponentiating (so large inputs do not overflow), and the result is
    normalised along axis 1. A 1-d input gives a 1-d output.

    Array-likes such as lists are accepted: dimensionality is checked with
    ``np.ndim`` instead of the ``.shape`` attribute, which plain sequences
    lack.
    """
    y = np.atleast_2d(x)
    axis = 1
    y = y - np.expand_dims(np.max(y, axis=axis), axis)
    y = np.exp(y)
    ax_sum = np.expand_dims(np.sum(y, axis=axis), axis)
    p = y / ax_sum
    if np.ndim(x) == 1:
        p = p.flatten()
    return p
def fwd_grad_chooser(g, ans, gvs, vs, x, axis=None, keepdims=False):
    """Forward-mode derivative for reductions that pick elements (e.g. max/min).

    The tangent ``g`` is summed over the positions of ``x`` that equal the
    reduction result ``ans``. For scalar ``x`` the tangent is returned as is.
    ``gvs`` and ``vs`` are accepted for interface compatibility and not used.
    """
    if anp.isscalar(x):
        return g
    if not keepdims:
        # Re-insert the reduced axes so ``ans`` broadcasts against ``x``.
        if isinstance(axis, int):
            reduced = (axis,)
        elif isinstance(axis, tuple):
            reduced = tuple(sorted(axis))
        else:
            reduced = ()
        for ax in reduced:
            ans = anp.expand_dims(ans, ax)
    is_chosen = x == ans
    return anp.sum(g * is_chosen, axis=axis, keepdims=keepdims)
def sample_qf_q_and_p(logprob, t, combined_params, k, num_samples, rs):
    """Sample from each of ``k`` parameter blocks and stack the results.

    ``combined_params`` is split into ``k`` equal parts with ``np.split``;
    ``sample_q_and_p`` is called once per part. Each of its three outputs gets
    a new axis 1, and the per-part results are concatenated along that axis.

    The original used a Python 2 tuple-unpacking lambda
    (``lambda (a, b, c): ...``), which is a syntax error in Python 3; the
    unpacking now happens inside a small helper.

    Returns:
        (qs, ps, samples) -- per the original note these should be NxK, NxK
        and NxKxD.
    """
    def add_dim_to_pair(triple):
        # Insert a component axis at position 1 in each of the three arrays.
        a, b, c = triple
        return (np.expand_dims(a, 1),
                np.expand_dims(b, 1),
                np.expand_dims(c, 1))

    combined_qs_and_samples = [
        add_dim_to_pair(sample_q_and_p(logprob, t, params, num_samples, rs))
        for params in np.split(combined_params, k)
    ]
    combined_qs, combined_ps, combined_samples = zip(*combined_qs_and_samples)
    return np.concatenate(combined_qs, axis=1),\
           np.concatenate(combined_ps, axis=1),\
           np.concatenate(combined_samples, axis=1)
def _sqdist(x, y, Torch=False):
    """Pairwise squared Euclidean distances between rows of ``x`` and ``y``.

    ``y=None`` compares ``x`` with itself. With ``Torch=True`` the torch
    operations are used, otherwise the NumPy ones.
    """
    other = x if y is None else y
    if Torch:
        diffs = torch.unsqueeze(x,1)-torch.unsqueeze(other,0)
        sqdist = torch.sum(diffs**2, axis=2, keepdim=False)
        return sqdist
    diffs = np.expand_dims(x,1)-np.expand_dims(other,0)
    sqdist = np.sum(diffs**2, axis=2)
    del diffs  # drop the (N, M, d) intermediate as soon as it is reduced
    return sqdist
def rbf(X, Y, gamma):
    """RBF kernel ``exp(-gamma * ||x - y||^2)`` for every pair of rows.

    X: (num_samples1, num_features)
    Y: (num_samples2, num_features)
    return: (num_samples1, num_samples2)
    """
    # (num_samples1, 1, F) - (1, num_samples2, F) -> (num_samples1, num_samples2, F)
    pairwise = np.expand_dims(X, 1) - np.expand_dims(Y, 0)
    sq_dist = np.sum(pairwise**2, axis=2)
    return np.exp(-gamma * sq_dist)
def get_log_prob_nk(self, x):
    """Log density (up to a constant) of every row of ``x`` under every component.

    Uses ``loc`` (one row per component) and ``info`` (one matrix per
    component, applied directly in the quadratic form) from
    ``self.mix_par.values``. Returns an array with one row per observation
    and one column per component.
    """
    # Up to a constant.
    pars = self.mix_par.values
    info = pars['info']

    # (n, d, 1) - (1, d, k) -> (n, d, k)
    deviations = np.expand_dims(x, 2) - np.expand_dims(np.transpose(pars['loc']), 0)

    log_dets = np.array([
        np.linalg.slogdet(info[comp, :, :])[1]
        for comp in range(self.num_components)
    ])

    quad_form = np.einsum('nik,kij,njk->nk', deviations, info, deviations)
    return -0.5 * quad_form + 0.5 * np.expand_dims(log_dets, 0)
def train_pruned_model(args, mdl, results, top_vec, coeff):
    """Run gradient descent on ``mdl`` and track Hessian projections onto ``top_vec``.

    Every ``args.freq`` iterations the full Hessian is computed, its transpose
    is multiplied with ``top_vec`` and the resulting vector is appended as a
    new row of ``coeff``. Training stops early once the batch gradient norm
    drops to ``args.stopping_grad_norm`` or below.

    Side effects: updates ``mdl.params_flat`` in place, writes bookkeeping
    entries into ``results`` and prints the initial and final loss.

    Returns ``(mdl.params, coeff)``.
    """
    # Collected below but neither returned nor stored in ``results``.
    all_w = []
    results['args'] = args
    init_loss = mdl.loss(mdl.params_flat)
    init_grad_norm = np.linalg.norm(mdl.gradient(mdl.params_flat))
    print('Initial loss: {}, norm grad: {}'.format(init_loss, init_grad_norm))
    results['init_full_loss'] = init_loss
    results['init_full_grad_norm'] = init_grad_norm
    # The history lists are initialised here; this function never appends to them.
    results['history1'] = []
    results['history1_columns'] = [
        'iter_no', 'batch_loss', 'batch_grad_norm', 'batch_param_norm'
    ]
    results['history2'] = []
    results['history2_columns'] = ['full_hessian', 'full_hessian_evals']
    for iter_no in tqdm(range(args.max_iterations)):
        inputs, targets = get_batch_samples(iter_no, args, mdl)
        batch_loss = mdl.loss(mdl.params_flat, inputs, targets)
        batch_grad = mdl.gradient(mdl.params_flat, inputs, targets)
        batch_grad_norm = np.linalg.norm(batch_grad)
        batch_param_norm = np.linalg.norm(mdl.params_flat)
        if iter_no % args.freq == 0:
            # Calculating the full Hessian at the current parameters
            hess = mdl.hessian(mdl.params_flat)
            # Converting the Hessian to a float tensor
            hess = torch.tensor(hess).float()
            # Product of the transposed Hessian with top_vec
            c = torch.mv(hess.transpose(0, 1), torch.tensor(top_vec).float())
            if np.size(coeff) == 0:
                # First projection: start the coefficient matrix with one row.
                coeff = c.detach().cpu().numpy()
                coeff = np.expand_dims(coeff, axis=0)
            else:
                coeff = np.concatenate(
                    (coeff, np.expand_dims(c.detach().cpu().numpy(), axis=0)), 0)
        # saving weights in all iterations
        if batch_grad_norm <= args.stopping_grad_norm:
            break
        # Plain gradient-descent step, applied in place.
        mdl.params_flat -= batch_grad * args.learning_rate
        all_w.append(np.power(math.e, mdl.params_flat))
    final_loss = mdl.loss(mdl.params_flat)
    final_grad_norm = np.linalg.norm(mdl.gradient(mdl.params_flat))
    print('Final loss: {}, norm grad: {}\n'.format(final_loss, final_grad_norm))
    return mdl.params, coeff
def calcSigma(x1, x2, l):
    '''
    Create a covariance matrix with a squared-exponential kernel.

    INPUTS:
        x1, x2: arrays containing the x values from two separate samples
                (one row per point)
        l     : length scale parameter

    OUTPUTS:
        Sigma: the covariance matrix between x1 and x2, with one row per row
               of x1 and one column per row of x2
    '''
    scaled_rows = np.expand_dims(x1 / l, 1)
    scaled_cols = np.expand_dims(x2 / l, 0)
    sq_dist = np.sum((scaled_rows - scaled_cols)**2, axis=2)
    return np.exp(-0.5 * sq_dist)
def predictions(weights, inputs):
    """Forward pass of the network for a batch of weight samples.

    ``inputs`` gets a leading axis so that it broadcasts across the weight
    samples in the einsum. Each layer from ``unpack_layers(weights)`` applies
    an affine map followed by ``nonlinearity``. The pre-nonlinearity output of
    the last layer is returned.
    """
    activations = np.expand_dims(inputs, 0)
    for W, b in unpack_layers(weights):
        outputs = np.einsum('mnd,mdo->mno', activations, W) + b
        activations = nonlinearity(outputs)
    return outputs
def covgrad(x, mean, cov, allow_singular=False):
    """Gradient term of the multivariate normal log-pdf w.r.t. the covariance.

    Returns ``0.5 * (outer(J @ (x - mean)) - J)`` with ``J = inv(cov)``; the
    outer product is formed by ``generalized_outer_product``.

    Raises:
        NotImplementedError: if ``allow_singular`` is true.
    """
    if allow_singular:
        raise NotImplementedError("The multivariate normal pdf is not "
                "differentiable w.r.t. a singular covariance matix")
    precision = np.linalg.inv(cov)
    residual_column = np.expand_dims(x - mean, -1)
    solved = np.matmul(precision, residual_column)
    return 1./2 * (generalized_outer_product(solved) - precision)
def plot_gmm(params, ax, num_points=100):
    """Draw one outline per mixture component on ``ax``.

    A circle of radius 2 sampled at ``num_points`` angles is multiplied by
    each component's ``chol`` factor and shifted by its mean.
    """
    theta = np.expand_dims(np.linspace(0, 2*np.pi, num_points), 1)
    circle_pts = np.concatenate([np.cos(theta), np.sin(theta)], axis=1) * 2.0
    for _log_proportion, mean, chol in zip(*unpack_params(params)):
        outline = mean + np.dot(circle_pts, chol)
        ax.plot(outline[:, 0], outline[:, 1], '-')
def log_marginal_likelihood(params, data):
    """Total log-likelihood of ``data`` under the Gaussian mixture in ``params``.

    For each component the log proportion is added to the per-point Gaussian
    log density; the components are combined with logsumexp and the result is
    summed over data points. A small multiple of the identity is added to
    each covariance.
    """
    per_component = []
    for log_proportion, mean, chol in zip(*unpack_params(params)):
        cov = np.dot(chol.T, chol) + 0.000001 * np.eye(D)
        component_ll = log_proportion + mvn.logpdf(data, mean, cov)
        per_component.append(np.expand_dims(component_ll, axis=0))
    stacked = np.concatenate(per_component, axis=0)  # (components, points)
    return np.sum(logsumexp(stacked, axis=0))
def predictions(weights, inputs):
    """weights is shape (num_weight_samples x num_weights)
       inputs  is shape (num_datapoints x D)

    Runs the inputs through every layer yielded by ``unpack_layers`` and
    returns the last layer's output before the nonlinearity."""
    layer_input = np.expand_dims(inputs, 0)  # broadcast over weight samples
    for W, b in unpack_layers(weights):
        outputs = np.einsum('mnd,mdo->mno', layer_input, W) + b
        layer_input = nonlinearity(outputs)
    return outputs
def cost(theta):
    """Negative log-likelihood of the module-level ``samples`` under a mixture.

    ``theta[0]`` holds one matrix ``S`` per cluster and ``theta[1]`` the
    unnormalised log weights of all clusters but the last, whose log weight
    is fixed to 0.
    """
    # Unpack parameters
    S = theta[0]
    nu = np.concatenate([theta[1], [0]], axis=0)

    logdetS = np.expand_dims(np.linalg.slogdet(S)[1], 1)

    # Samples with a row of ones appended, plus a leading cluster axis
    augmented = np.concatenate([samples.T, np.ones((1, N))], axis=0)
    augmented = np.expand_dims(augmented, 0)

    # 'Probability' of y belonging to each cluster
    quad = np.sum(augmented * np.linalg.solve(S, augmented), axis=1)
    log_q = -0.5 * (quad + logdetS)

    # Mixture weights: exponentiate and normalise
    alpha = np.exp(nu)
    alpha = alpha / np.sum(alpha)
    alpha = np.expand_dims(alpha, 1)

    loglikvec = logsumexp(np.log(alpha) + log_q, axis=0)
    return -np.sum(loglikvec)
def unpack_params(params):
    """Unpacks parameter vector into the proportions, means and covariances
    of each mixture component. The covariance matrices are parametrized by
    their Cholesky decompositions.

    Returns (normalized log proportions, means, stacked Cholesky factors).
    Each factor is the strictly lower triangle plus an exponentiated (hence
    positive) diagonal."""
    log_proportions = parser.get(params, 'log proportions')
    normalized_log_proportions = log_proportions - logsumexp(log_proportions)
    means = parser.get(params, 'means')
    strict_lower = np.tril(parser.get(params, 'lower triangles'), k=-1)
    diagonals = np.exp(parser.get(params, 'log diagonals'))

    factors = [np.expand_dims(lower + np.diag(diag), 0)
               for lower, diag in zip(strict_lower, diagonals)]
    chols = np.concatenate(factors, axis=0)
    return normalized_log_proportions, means, chols
def callback(params, t, g):
    """Report the current lower bound and redraw sampled functions.

    Prints the negated objective, draws 10 weight samples from the current
    variational distribution (fixed seed 0) and plots the resulting functions
    on a grid over [-8, 8] together with the training data.
    """
    lower_bound = -objective(params, t)
    print("Iteration {} lower bound {}".format(t, lower_bound))

    # Sample functions from posterior.
    mean, cov = unpack_params(params)
    rng = npr.RandomState(0)
    noise = rng.randn(10, num_weights)
    sample_weights = noise * np.sqrt(cov) + mean
    grid = np.linspace(-8, 8, num=200)
    sampled_fns = predictions(sample_weights, np.expand_dims(grid, 1))

    # Plot data and functions.
    plt.cla()
    ax.plot(inputs.ravel(), targets.ravel(), 'bx')
    ax.plot(grid, sampled_fns[:, :, 0].T)
    ax.set_ylim([-2, 3])
    plt.draw()
    plt.pause(1.0/60.0)
# Initialize variational parameters rs = npr.RandomState(0) num_samples = 2 init_mean = rs.randn(num_weights) init_log_std = -5 * np.ones(num_weights) variational_params = np.concatenate([init_mean, init_log_std]) for step in range(num_steps): offset = (step * batch_size) % (train_labels.shape[0] - batch_size) batch_data = train_data[offset:(offset + batch_size), :] batch_labels = train_labels[offset:(offset + batch_size), :] variational_params = update_nn(variational_params, batch_data, batch_labels) if (step % 10) == 0: correct = 0 num_test = len(test_labels) for ix, val in enumerate(test_labels): outputs = generate_nn_output(variational_params, np.expand_dims(test_data[ix,:],0), num_weights, num_samples) predicted_class = np.argmax(np.mean(outputs, axis=0)) actual_class = np.argmax(val) if actual_class == predicted_class: correct += 1 print ('Accuracy at step %d: %2.3f' % (step, float(correct)/num_test*100))
elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov, allow_singular=False): if allow_singular: raise NotImplementedError("The multivariate normal pdf is not " "differentiable w.r.t. a singular covariance matix") # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=0) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=1) logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=0) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), argnum=1) pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov, allow_singular=False: unbroadcast(vs, gvs, -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular)), argnum=2) entropy.defvjp_is_zero(argnums=(0,)) entropy.defvjp(lambda g, ans, vs, gvs, mean, cov: unbroadcast(vs, gvs, 0.5 * g * np.linalg.inv(cov).T), argnum=1)
# Wrap the SciPy gamma-family functions as differentiable primitives.
gammaln = primitive(scipy.special.gammaln)
gammainc = primitive(scipy.special.gammainc)
gammaincc = primitive(scipy.special.gammaincc)
gammasgn = primitive(scipy.special.gammasgn)
rgamma = primitive(scipy.special.rgamma)
multigammaln = primitive(scipy.special.multigammaln)

# Vector-Jacobian products. ``None`` marks an argument for which no gradient
# is registered.
defvjp(gammasgn, None)
defvjp(polygamma, None, lambda ans, n, x: lambda g: g * polygamma(n + 1, x))
defvjp(psi, lambda ans, x: lambda g: g * polygamma(1, x))
defvjp(digamma, lambda ans, x: lambda g: g * polygamma(1, x))
defvjp(gamma, lambda ans, x: lambda g: g * ans * psi(x))
defvjp(gammaln, lambda ans, x: lambda g: g * psi(x))
defvjp(rgamma, lambda ans, x: lambda g: g * psi(x) / -gamma(x))
# Gradient w.r.t. ``a`` only: sum of digamma(a - j/2) for j = 0 .. d-1.
defvjp(multigammaln,lambda ans, a, d: lambda g: g * np.sum(digamma(np.expand_dims(a, -1) - np.arange(d)/2.), -1), None)

def make_gammainc_vjp_arg1(sign):
    """Build the VJP w.r.t. ``x`` shared by ``gammainc`` and ``gammaincc``.

    ``sign`` is +1 for ``gammainc`` and -1 for ``gammaincc`` (see the two
    registrations below); the derivative is sign * exp(-x) * x**(a-1) / gamma(a).
    """
    def gammainc_vjp_arg1(ans, a, x):
        coeffs = sign * np.exp(-x) * np.power(x, a - 1) / gamma(a)
        return unbroadcast_f(x, lambda g: g * coeffs)
    return gammainc_vjp_arg1
# Only argument 1 (``x``) gets a gradient for the incomplete gamma functions.
defvjp(gammainc, make_gammainc_vjp_arg1(1), argnums=[1])
defvjp(gammaincc, make_gammainc_vjp_arg1(-1), argnums=[1])

### Bessel functions ###
j0 = primitive(scipy.special.j0)
y0 = primitive(scipy.special.y0)
j1 = primitive(scipy.special.j1)
return 0.5 * (np.tril(mat) + np.triu(mat, 1).T) elif len(mat.shape) == 3: return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2)) else: raise ArithmeticError def generalized_outer_product(mat): if len(mat.shape) == 1: return np.outer(mat, mat) elif len(mat.shape) == 2: return np.einsum('ij,ik->ijk', mat, mat) else: raise ArithmeticError def covgrad(x, mean, cov): # I think once we have Cholesky we can make this nicer. solved = np.linalg.solve(cov, (x - mean).T).T return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved)) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, x, lambda g: -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=0) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, mean, lambda g: np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T), argnum=1) logpdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)), argnum=2) # Same as log pdf, but multiplied by the pdf (ans). pdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, x, lambda g: -g * ans * np.linalg.solve(cov, x - mean)), argnum=0) pdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, mean, lambda g: g * ans * np.linalg.solve(cov, x - mean)), argnum=1) pdf.defgrad(lambda ans, x, mean, cov: unbroadcast(ans, cov, lambda g: -g * ans * covgrad(x, mean, cov)), argnum=2) entropy.defgrad_is_zero(argnums=(0,)) entropy.defgrad(lambda ans, mean, cov: unbroadcast(ans, cov, lambda g: 0.5 * g * np.linalg.inv(cov).T), argnum=1)
if allow_singular: raise NotImplementedError("The multivariate normal pdf is not " "differentiable w.r.t. a singular covariance matix") J = np.linalg.inv(cov) solved = np.matmul(J, np.expand_dims(x - mean, -1)) return 1./2 * (generalized_outer_product(solved) - J) def solve(allow_singular): if allow_singular: return lambda A, x: np.dot(np.linalg.pinv(A), x) else: return np.linalg.solve defvjp(logpdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(x, lambda g: -np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(mean, lambda g: np.expand_dims(g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular))) # Same as log pdf, but multiplied by the pdf (ans). defvjp(pdf, lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(x, lambda g: -np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(mean, lambda g: np.expand_dims(ans * g, 1) * solve(allow_singular)(cov, (x - mean).T).T), lambda ans, x, mean, cov, allow_singular=False: unbroadcast_f(cov, lambda g: -np.reshape(ans * g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov, allow_singular))) defvjp(entropy, None,
def rbf_covariance(kernel_params, x, xp):
    """RBF covariance between the rows of ``x`` and the rows of ``xp``.

    ``kernel_params[0]`` is the log output scale; ``kernel_params[1:]`` are
    the log length scales, broadcast against the last (feature) axis.
    """
    amplitude = np.exp(kernel_params[0])
    lengthscales = np.exp(kernel_params[1:])
    scaled_rows = np.expand_dims(x / lengthscales, 1)
    scaled_cols = np.expand_dims(xp / lengthscales, 0)
    sq_dist = np.sum((scaled_rows - scaled_cols)**2, axis=2)
    return amplitude * np.exp(-0.5 * sq_dist)
def standard_to_natural(S, m, kappa, nu):
    """Pack standard parameters (S, m, kappa, nu) into natural form.

    Computes ``b = kappa * m`` (``kappa`` broadcast over the last axis of
    ``m``) and ``A = S + outer(b, m)``, then delegates to ``pack_dense``.
    """
    scaled_mean = np.expand_dims(kappa, -1) * m
    scatter = S + outer(scaled_mean, m)
    return pack_dense(scatter, scaled_mean, kappa, nu)
def natural_to_standard(natparam):
    """Recover standard parameters (S, m, kappa, nu) from packed natural ones.

    Unpacks ``(A, b, kappa, nu)`` with ``unpack_dense``, then computes
    ``m = b / kappa`` and ``S = A - outer(b, m)``.
    """
    A, b, kappa, nu = unpack_dense(natparam)
    mean = b / np.expand_dims(kappa, -1)
    scale = A - outer(b, mean)
    return scale, mean, kappa, nu
def fun(x):
    """Reduce ``x`` to a scalar after inserting a new axis at position 2."""
    expanded = np.expand_dims(x, 2)
    return to_scalar(expanded)

# Scalar-valued function of the gradient of ``fun``.
d_fun = lambda x: to_scalar(grad(fun)(x))
def fast_array_from_list(xs):
    """Stack the items of ``xs`` along a new leading axis.

    Each item gets a leading axis of length one and the results are joined
    with a single concatenate call.
    """
    lifted = [np.expand_dims(item, axis=0) for item in xs]
    return np.concatenate(lifted, axis=0)
def RBF(x, xp):
    """Evaluate a squared-exponential kernel between ``x`` and ``xp``.

    The scale factor and lengthscales are read from the enclosing-scope
    ``params`` mapping under the keys ``num + 'kernel_noise'`` and
    ``num + 'kernel_lenscale'``.  Inputs are divided by the lengthscales,
    differenced pairwise via broadcasting, and the squared differences are
    summed over axis 2.
    """
    amplitude = params[num+'kernel_noise']
    scales = params[num+'kernel_lenscale']
    scaled_x = np.expand_dims(x / scales, 1)
    scaled_xp = np.expand_dims(xp / scales, 0)
    sq_dist = np.sum((scaled_x - scaled_xp) ** 2, axis=2)
    return amplitude * np.exp(-0.5 * sq_dist)
# NOTE: fragment -- the `def` line and opening `if` of the function these
# first branches belong to are outside this view (presumably lower_half,
# which covgrad calls below -- confirm).
        # Half of: lower triangle (with diagonal) plus the transposed strict upper triangle.
        return 0.5 * (np.tril(mat) + np.triu(mat, 1).T)
    elif len(mat.shape) == 3:
        # Same for a 3-D input, swapping axes 1 and 2 instead of using .T.
        return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2))
    else:
        raise ArithmeticError


# Outer product of mat with itself: np.outer for a 1-D input, one outer
# product per row (einsum 'ij,ik->ijk') for a 2-D input; any other rank
# raises ArithmeticError.
def generalized_outer_product(mat):
    if len(mat.shape) == 1:
        return np.outer(mat, mat)
    elif len(mat.shape) == 2:
        return np.einsum('ij,ik->ijk', mat, mat)
    else:
        raise ArithmeticError


# Return lower_half(inv(cov) - generalized_outer_product(solved)), where
# solved is np.linalg.solve(cov, (x - mean).T) transposed back.
def covgrad(x, mean, cov):
    # I think once we have Cholesky we can make this nicer.
    solved = np.linalg.solve(cov, (x - mean).T).T
    return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved))


# Gradient callbacks for logpdf with respect to x (argnum=0), mean (argnum=1)
# and cov (argnum=2). The x and mean callbacks differ only in sign.
logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False:
               unbroadcast(ans, x, lambda g: -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T),
               argnum=0)
logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False:
               unbroadcast(ans, mean, lambda g: np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T),
               argnum=1)
# g is reshaped with two trailing length-1 axes before multiplying covgrad's result.
logpdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False:
               unbroadcast(ans, cov, lambda g: -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)),
               argnum=2)

# Same as log pdf, but multiplied by the pdf (ans).
# NOTE(review): unlike the logpdf callbacks above, these use
# np.linalg.solve(cov, x - mean) without the .T round-trip and multiply by
# g * ans without np.expand_dims / np.reshape -- confirm this is intended
# when x holds more than one point.
pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False:
            unbroadcast(ans, x, lambda g: -g * ans * np.linalg.solve(cov, x - mean)),
            argnum=0)
pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False:
            unbroadcast(ans, mean, lambda g: g * ans * np.linalg.solve(cov, x - mean)),
            argnum=1)
pdf.defgrad(lambda ans, x, mean=None, cov=1, allow_singular=False:
            unbroadcast(ans, cov, lambda g: -g * ans * covgrad(x, mean, cov)),
            argnum=2)

# entropy: argument 0 is declared to have zero gradient; the callback for
# argument 1 (cov) is 0.5 * g * inv(cov).T.
entropy.defgrad_is_zero(argnums=(0,))
entropy.defgrad(lambda ans, mean, cov:
                unbroadcast(ans, cov, lambda g: 0.5 * g * np.linalg.inv(cov).T),
                argnum=1)
# NOTE: fragment -- the `def` line and opening `if` of the function these
# first branches belong to are outside this view (presumably lower_half,
# which covgrad calls below -- confirm).
        # Half of: lower triangle (with diagonal) plus the transposed strict upper triangle.
        return 0.5 * (np.tril(mat) + np.triu(mat, 1).T)
    elif len(mat.shape) == 3:
        # Same for a 3-D input, swapping axes 1 and 2 instead of using .T.
        return 0.5 * (np.tril(mat) + np.swapaxes(np.triu(mat, 1), 1,2))
    else:
        raise ArithmeticError


# Outer product of mat with itself: np.outer for a 1-D input, one outer
# product per row (einsum 'ij,ik->ijk') for a 2-D input; any other rank
# raises ArithmeticError.
def generalized_outer_product(mat):
    if len(mat.shape) == 1:
        return np.outer(mat, mat)
    elif len(mat.shape) == 2:
        return np.einsum('ij,ik->ijk', mat, mat)
    else:
        raise ArithmeticError


# Return lower_half(inv(cov) - generalized_outer_product(solved)), where
# solved is np.linalg.solve(cov, (x - mean).T) transposed back.
def covgrad(x, mean, cov):
    # I think once we have Cholesky we can make this nicer.
    solved = np.linalg.solve(cov, (x - mean).T).T
    return lower_half(np.linalg.inv(cov) - generalized_outer_product(solved))


# Gradient callbacks for logpdf with respect to x (argnum=0), mean (argnum=1)
# and cov (argnum=2). Each receives the incoming gradient g first and passes
# its result through unbroadcast(vs, gvs, ...). The x and mean callbacks
# differ only in sign.
logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov:
              unbroadcast(vs, gvs, -np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T),
              argnum=0)
logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov:
              unbroadcast(vs, gvs, np.expand_dims(g, 1) * np.linalg.solve(cov, (x - mean).T).T),
              argnum=1)
# g is reshaped with two trailing length-1 axes before multiplying covgrad's result.
logpdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov:
              unbroadcast(vs, gvs, -np.reshape(g, np.shape(g) + (1, 1)) * covgrad(x, mean, cov)),
              argnum=2)

# Same as log pdf, but multiplied by the pdf (ans).
# NOTE(review): unlike the logpdf callbacks above, these use
# np.linalg.solve(cov, x - mean) without the .T round-trip and multiply by
# g * ans without np.expand_dims / np.reshape -- confirm this is intended
# when x holds more than one point.
pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov:
           unbroadcast(vs, gvs, -g * ans * np.linalg.solve(cov, x - mean)),
           argnum=0)
pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov:
           unbroadcast(vs, gvs, g * ans * np.linalg.solve(cov, x - mean)),
           argnum=1)
pdf.defvjp(lambda g, ans, vs, gvs, x, mean, cov:
           unbroadcast(vs, gvs, -g * ans * covgrad(x, mean, cov)),
           argnum=2)

# entropy: argument 0 is declared to have zero gradient; the callback for
# argument 1 (cov) is 0.5 * g * inv(cov).T.
entropy.defvjp_is_zero(argnums=(0,))
entropy.defvjp(lambda g, ans, vs, gvs, mean, cov:
               unbroadcast(vs, gvs, 0.5 * g * np.linalg.inv(cov).T),
               argnum=1)
# Create an 8x8 white figure with a frameless subplot, switch matplotlib to
# interactive mode and show the window without blocking.
fig = plt.figure(figsize=(8,8), facecolor='white')
ax = fig.add_subplot(111, frameon=False)
plt.ion()
plt.show(block=False)

# One iteration per step: pick a datum, score every action, act on the best
# one and record the outcome.
# NOTE: num_steps, num_datums, num_actions, F, x, y, variational_params,
# num_weights, num_samples, cumulative_regret and experience are defined
# outside this fragment, and the loop body may continue past this view.
for step in range(num_steps):
    # Grab a random datum
    datum_id = npr.randint(0, num_datums)

    # Assess expected reward across all possible actions (loop over context + action vectors)
    rewards = []
    contexts = np.zeros((num_actions, F))
    for aa in range(num_actions):
        # Context row = the datum's features followed by the action index.
        contexts[aa,:] = np.hstack((x[datum_id, :], [aa]))
        # expand_dims adds a leading axis so a single context row is passed as a batch of one.
        outputs = generate_nn_output(variational_params,
                                     np.expand_dims(contexts[aa,:],0), num_weights, num_samples)
        # The action's score is the mean of the returned outputs.
        rewards.append(np.mean(outputs))

    # Check which is greater and choose that [1,0] = eat | [0,1] do not eat
    # If argmax returns 0, then we eat, otherwise we don't
    action_chosen = np.argmax(rewards)
    reward, oracle_reward = reward_function(action_chosen, y[datum_id])

    # Calculate the cumulative regret
    # NOTE(review): agent_reward is not assigned anywhere in this fragment,
    # while reward_function's first return value is bound to `reward` --
    # confirm whether `reward` was intended here.
    cumulative_regret += oracle_reward - agent_reward

    # Store the experience of that reward as a training/data pair
    experience.append([contexts[action_chosen, :], reward])