def testInvalidShapeAtEval(self):
    with self.test_session(use_gpu=self._use_gpu):
        v = tf.placeholder(dtype=tf.float32)
        with self.assertRaisesOpError("input must be at least 2-dim"):
            tf.matrix_diag_part(v).eval(feed_dict={v: 0.0})
        with self.assertRaisesOpError("last two dimensions must be equal"):
            tf.matrix_diag_part(v).eval(feed_dict={v: [[0, 1], [1, 0], [0, 0]]})
def testSampleWithSameSeed(self): if tf.executing_eagerly(): return scale = make_pd(1., 2) df = 4 chol_w = tfd.Wishart( df, scale_tril=chol(scale), input_output_cholesky=False) x = self.evaluate(chol_w.sample(1, seed=42)) chol_x = [chol(x[0])] full_w = tfd.Wishart(df, scale, input_output_cholesky=False) self.assertAllClose(x, self.evaluate(full_w.sample(1, seed=42))) chol_w_chol = tfd.Wishart( df, scale_tril=chol(scale), input_output_cholesky=True) self.assertAllClose(chol_x, self.evaluate(chol_w_chol.sample(1, seed=42))) eigen_values = tf.matrix_diag_part(chol_w_chol.sample(1000, seed=42)) np.testing.assert_array_less(0., self.evaluate(eigen_values)) full_w_chol = tfd.Wishart(df, scale=scale, input_output_cholesky=True) self.assertAllClose(chol_x, self.evaluate(full_w_chol.sample(1, seed=42))) eigen_values = tf.matrix_diag_part(full_w_chol.sample(1000, seed=42)) np.testing.assert_array_less(0., self.evaluate(eigen_values))
def testRectangular(self):
    with self.test_session(use_gpu=self._use_gpu):
        mat = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
        mat_diag = tf.matrix_diag_part(mat)
        self.assertAllEqual(mat_diag.eval(), np.array([1.0, 5.0]))
        mat = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        mat_diag = tf.matrix_diag_part(mat)
        self.assertAllEqual(mat_diag.eval(), np.array([1.0, 4.0]))
def _variance(self):
    if distribution_util.is_diagonal_scale(self.scale):
        return 2. * tf.square(self.scale.diag_part())
    elif (isinstance(self.scale, tf.linalg.LinearOperatorLowRankUpdate) and
          self.scale.is_self_adjoint):
        return tf.matrix_diag_part(2. * self.scale.matmul(self.scale.to_dense()))
    else:
        return 2. * tf.matrix_diag_part(
            self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
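# Hedged sketch (NumPy, illustration only, not part of the distribution class): the
# final branch above is just twice the diagonal of scale @ scale^T, i.e. twice the
# row-wise sum of squares of the scale matrix.
import numpy as np

scale = np.random.RandomState(8).randn(3, 3)
np.testing.assert_allclose(np.diag(2.0 * scale @ scale.T),
                           2.0 * np.sum(scale ** 2, axis=1))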
def _maybe_attach_assertion(x):
    if not validate_args:
        return x
    if assert_positive:
        return control_flow_ops.with_dependencies([
            tf.assert_positive(
                tf.matrix_diag_part(x),
                message="diagonal part must be positive"),
        ], x)
    return control_flow_ops.with_dependencies([
        tf.assert_none_equal(
            tf.matrix_diag_part(x),
            tf.zeros([], x.dtype),
            message="diagonal part must be non-zero"),
    ], x)
def testSample(self): with self.test_session(): scale = make_pd(1., 2) df = 4 chol_w = distributions.WishartCholesky( df, chol(scale), cholesky_input_output_matrices=False) x = chol_w.sample_n(1, seed=42).eval() chol_x = [chol(x[0])] full_w = distributions.WishartFull( df, scale, cholesky_input_output_matrices=False) self.assertAllClose(x, full_w.sample_n(1, seed=42).eval()) chol_w_chol = distributions.WishartCholesky( df, chol(scale), cholesky_input_output_matrices=True) self.assertAllClose(chol_x, chol_w_chol.sample_n(1, seed=42).eval()) eigen_values = tf.matrix_diag_part(chol_w_chol.sample_n(1000, seed=42)) np.testing.assert_array_less(0., eigen_values.eval()) full_w_chol = distributions.WishartFull( df, scale, cholesky_input_output_matrices=True) self.assertAllClose(chol_x, full_w_chol.sample_n(1, seed=42).eval()) eigen_values = tf.matrix_diag_part(full_w_chol.sample_n(1000, seed=42)) np.testing.assert_array_less(0., eigen_values.eval()) # Check first and second moments. df = 4. chol_w = distributions.WishartCholesky( df=df, scale=chol(make_pd(1., 3)), cholesky_input_output_matrices=False) x = chol_w.sample_n(10000, seed=42) self.assertAllEqual((10000, 3, 3), x.get_shape()) moment1_estimate = tf.reduce_mean(x, reduction_indices=[0]).eval() self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05) # The Variance estimate uses the squares rather than outer-products # because Wishart.Variance is the diagonal of the Wishart covariance # matrix. variance_estimate = ( tf.reduce_mean(tf.square(x), reduction_indices=[0]) - tf.square(moment1_estimate)).eval() self.assertAllClose(chol_w.variance().eval(), variance_estimate, rtol=0.05)
def testMatrix(self):
    with self.test_session(use_gpu=self._use_gpu):
        v = np.array([1.0, 2.0, 3.0])
        mat = np.diag(v)
        mat_diag = tf.matrix_diag_part(mat)
        self.assertEqual((3,), mat_diag.get_shape())
        self.assertAllEqual(mat_diag.eval(), v)
def _forward_log_det_jacobian(self, x):
    # We formulate the Jacobian with respect to the flattened matrices
    # `vec(x)` and `vec(y)`. Suppose for notational convenience that
    # the first `n` entries of `vec(x)` are the diagonal of `x`, and
    # the remaining `n**2-n` entries are the off-diagonals in
    # arbitrary order. Then the Jacobian is a block-diagonal matrix,
    # with the Jacobian of the diagonal bijector in the first block,
    # and the identity Jacobian for the remaining entries (since this
    # bijector acts as the identity on non-diagonal entries):
    #
    # J_vec(x) (vec(y)) =
    # -------------------------------
    # | J_diag(x) (diag(y))    0    |  n entries
    # |                             |
    # |        0               I    |  n**2-n entries
    # -------------------------------
    #          n           n**2-n
    #
    # Since the log-det of the second (identity) block is zero, the
    # overall log-det-jacobian is just the log-det of the first block,
    # from the diagonal bijector.
    #
    # Note that for elementwise operations (exp, softplus, etc) the
    # first block of the Jacobian will itself be a diagonal matrix,
    # but our implementation does not require this to be true.
    return self._diag_bijector.forward_log_det_jacobian(
        tf.matrix_diag_part(x), event_ndims=1)
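# Illustrative sketch (not part of the library; assumes an elementwise exp applied to
# the diagonal only): finite differences confirm that the full n*n-by-n*n Jacobian is
# block diagonal with an identity block, so its log-determinant reduces to the
# log-det of the diagonal bijector alone, i.e. sum(diag(x)) for exp.
import numpy as np

rng = np.random.RandomState(0)
n = 3
x = rng.randn(n, n)
eps = 1e-6

def forward(m):
    out = m.copy()
    np.fill_diagonal(out, np.exp(np.diag(m)))   # transform the diagonal only
    return out

jac = np.zeros((n * n, n * n))
for k in range(n * n):
    dx = np.zeros(n * n)
    dx[k] = eps
    jac[:, k] = (forward(x + dx.reshape(n, n)) -
                 forward(x - dx.reshape(n, n))).ravel() / (2 * eps)

np.testing.assert_allclose(np.linalg.slogdet(jac)[1],   # numerical log|det J|
                           np.sum(np.diag(x)),          # fldj of exp on diag entries
                           rtol=1e-6)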
def multivariate_normal(x, mu, L):
    """
    Computes the log-density of a multivariate normal.

    :param x: Dx1 or DxN sample(s) for which we want the density
    :param mu: Dx1 or DxN mean(s) of the normal distribution
    :param L: DxD Cholesky decomposition of the covariance matrix
    :return p: (1,) or (N,) vector of log densities for each of the N x's and/or mu's

    x and mu are either vectors or matrices. If both are vectors (N,1):
        p[0] = log pdf(x) where x ~ N(mu, LL^T)
    If at least one is a matrix, we assume independence over the *columns*:
    the number of rows must match the size of L. Broadcasting behaviour:
        p[n] = log pdf of:
        x[n] ~ N(mu, LL^T) or x ~ N(mu[n], LL^T) or x[n] ~ N(mu[n], LL^T)
    """
    if x.shape.ndims is None:
        warnings.warn('Shape of x must be 2D at computation.')
    elif x.shape.ndims != 2:
        raise ValueError('Shape of x must be 2D.')
    if mu.shape.ndims is None:
        warnings.warn('Shape of mu may be unknown or not 2D.')
    elif mu.shape.ndims != 2:
        raise ValueError('Shape of mu must be 2D.')

    d = x - mu
    alpha = tf.matrix_triangular_solve(L, d, lower=True)
    num_dims = tf.cast(tf.shape(d)[0], L.dtype)
    p = - 0.5 * tf.reduce_sum(tf.square(alpha), 0)
    p -= 0.5 * num_dims * np.log(2 * np.pi)
    p -= tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
    return p
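# Illustrative cross-check (not part of the library above; names local to this
# sketch): the same Cholesky-based log-density computed with NumPy for a single
# column x and compared against scipy.stats.
import numpy as np
from scipy.stats import multivariate_normal as mvn

rng = np.random.RandomState(1)
D = 4
A = rng.randn(D, D)
cov = A @ A.T + D * np.eye(D)            # positive definite covariance
L = np.linalg.cholesky(cov)
x = rng.randn(D, 1)
mu = rng.randn(D, 1)

alpha = np.linalg.solve(L, x - mu)       # plays the role of matrix_triangular_solve
logp = (-0.5 * np.sum(alpha ** 2)
        - 0.5 * D * np.log(2 * np.pi)
        - np.sum(np.log(np.diag(L))))    # sum(log(diag(L))) = 0.5 * log|cov|
np.testing.assert_allclose(logp, mvn.logpdf(x.ravel(), mu.ravel(), cov))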
def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)
    to
        p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.
    q_sqrt is a 3D tensor, each matrix within is a lower triangular
        square-root matrix of the covariance of q.
    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type)
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)  # constant term
    Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # force lower triangle
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(Lq))))  # logdet
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.pack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
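# Illustrative sketch (assumes a single latent function with a full q_sqrt factor):
# the same KL assembled two ways in NumPy, once from the textbook formula
#   2 KL = m^T K^{-1} m + tr(K^{-1} S) - d + log|K| - log|S|,  with S = Lq Lq^T,
# and once from the Cholesky-style terms used in gauss_kl above.
import numpy as np

rng = np.random.RandomState(2)
d = 3
A = rng.randn(d, d)
K = A @ A.T + d * np.eye(d)
Lq = np.tril(rng.randn(d, d))
np.fill_diagonal(Lq, np.abs(np.diag(Lq)) + 1.0)      # positive diagonal
m = rng.randn(d, 1)
S = Lq @ Lq.T

Kinv = np.linalg.inv(K)
kl_direct = 0.5 * (m.T @ Kinv @ m + np.trace(Kinv @ S) - d
                   + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1]).item()

L = np.linalg.cholesky(K)
alpha = np.linalg.solve(L, m)                        # Mahalanobis
LiLq = np.linalg.solve(L, Lq)                        # trace
kl_chol = 0.5 * (np.sum(alpha ** 2) + np.sum(LiLq ** 2) - d
                 + 2 * np.sum(np.log(np.diag(L)))    # prior log-det
                 - 2 * np.sum(np.log(np.diag(Lq))))  # posterior log-det
np.testing.assert_allclose(kl_direct, kl_chol)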
def _expectation(p, mean, none, kern, feat, nghp=None): """ Compute the expectation: expectation[n] = <x_n K_{x_n, Z}>_p(x_n) - K_{.,.} :: RBF kernel :return: NxDxM """ Xmu, Xcov = p.mu, p.cov with tf.control_dependencies([tf.assert_equal( tf.shape(Xmu)[1], tf.constant(kern.input_dim, settings.tf_int), message="Currently cannot handle slicing in exKxz.")]): Xmu = tf.identity(Xmu) with params_as_tensors_for(kern), params_as_tensors_for(feat): D = tf.shape(Xmu)[1] lengthscales = kern.lengthscales if kern.ARD \ else tf.zeros((D,), dtype=settings.float_type) + kern.lengthscales chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov) # NxDxD all_diffs = tf.transpose(feat.Z) - tf.expand_dims(Xmu, 2) # NxDxM sqrt_det_L = tf.reduce_prod(lengthscales) sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1)) determinants = sqrt_det_L / sqrt_det_L_plus_Xcov # N exponent_mahalanobis = tf.cholesky_solve(chol_L_plus_Xcov, all_diffs) # NxDxM non_exponent_term = tf.matmul(Xcov, exponent_mahalanobis, transpose_a=True) non_exponent_term = tf.expand_dims(Xmu, 2) + non_exponent_term # NxDxM exponent_mahalanobis = tf.reduce_sum(all_diffs * exponent_mahalanobis, 1) # NxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxM return kern.variance * (determinants[:, None] * exponent_mahalanobis)[:, None, :] * non_exponent_term
def _expectation(p, kern, feat, none1, none2, nghp=None): """ Compute the expectation: <K_{X, Z}>_p(X) - K_{.,.} :: RBF kernel :return: NxM """ with params_as_tensors_for(kern), params_as_tensors_for(feat): # use only active dimensions Xcov = kern._slice_cov(p.cov) Z, Xmu = kern._slice(feat.Z, p.mu) D = tf.shape(Xmu)[1] if kern.ARD: lengthscales = kern.lengthscales else: lengthscales = tf.zeros((D,), dtype=settings.tf_float) + kern.lengthscales chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov) # NxDxD all_diffs = tf.transpose(Z) - tf.expand_dims(Xmu, 2) # NxDxM exponent_mahalanobis = tf.matrix_triangular_solve(chol_L_plus_Xcov, all_diffs, lower=True) # NxDxM exponent_mahalanobis = tf.reduce_sum(tf.square(exponent_mahalanobis), 1) # NxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxM sqrt_det_L = tf.reduce_prod(lengthscales) sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1)) determinants = sqrt_det_L / sqrt_det_L_plus_Xcov # N return kern.variance * (determinants[:, None] * exponent_mahalanobis)
def _build_likelihood(self): """ q_alpha, q_lambda are variational parameters, size N x R This method computes the variational lower bound on the likelihood, which is: E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)] with q(f) = N(f | K alpha + mean, [K^-1 + diag(square(lambda))]^-1) . """ K = self.kern.K(self.X) K_alpha = tf.matmul(K, self.q_alpha) f_mean = K_alpha + self.mean_function(self.X) # compute the variance for each of the outputs I = tf.tile(tf.expand_dims(tf.eye(self.num_data, dtype=settings.float_type), 0), [self.num_latent, 1, 1]) A = I + tf.expand_dims(tf.transpose(self.q_lambda), 1) * \ tf.expand_dims(tf.transpose(self.q_lambda), 2) * K L = tf.cholesky(A) Li = tf.matrix_triangular_solve(L, I) tmp = Li / tf.expand_dims(tf.transpose(self.q_lambda), 1) f_var = 1. / tf.square(self.q_lambda) - tf.transpose(tf.reduce_sum(tf.square(tmp), 1)) # some statistics about A are used in the KL A_logdet = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(L))) trAi = tf.reduce_sum(tf.square(Li)) KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent + tf.reduce_sum(K_alpha * self.q_alpha)) v_exp = self.likelihood.variational_expectations(f_mean, f_var, self.Y) return tf.reduce_sum(v_exp) - KL
def _expectation(p, rbf_kern, feat1, lin_kern, feat2, nghp=None): """ Compute the expectation: expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n) - K_lin_{.,.} :: RBF kernel - K_rbf_{.,.} :: Linear kernel Different Z1 and Z2 are handled if p is diagonal and K_lin and K_rbf have disjoint active_dims, in which case the joint expectations simplify into a product of expectations :return: NxM1xM2 """ if rbf_kern.on_separate_dims(lin_kern) and isinstance(p, DiagonalGaussian): # no joint expectations required eKxz1 = expectation(p, (rbf_kern, feat1)) eKxz2 = expectation(p, (lin_kern, feat2)) return eKxz1[:, :, None] * eKxz2[:, None, :] if feat1 != feat2: raise NotImplementedError("Features have to be the same for both kernels.") if rbf_kern.active_dims != lin_kern.active_dims: raise NotImplementedError("active_dims have to be the same for both kernels.") with params_as_tensors_for(rbf_kern), params_as_tensors_for(lin_kern), \ params_as_tensors_for(feat1), params_as_tensors_for(feat2): # use only active dimensions Xcov = rbf_kern._slice_cov(tf.matrix_diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov) Z, Xmu = rbf_kern._slice(feat1.Z, p.mu) N = tf.shape(Xmu)[0] D = tf.shape(Xmu)[1] lin_kern_variances = lin_kern.variance if lin_kern.ARD \ else tf.zeros((D,), dtype=settings.tf_float) + lin_kern.variance rbf_kern_lengthscales = rbf_kern.lengthscales if rbf_kern.ARD \ else tf.zeros((D,), dtype=settings.tf_float) + rbf_kern.lengthscales ## Begin RBF eKxz code: chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(rbf_kern_lengthscales ** 2) + Xcov) # NxDxD Z_transpose = tf.transpose(Z) all_diffs = Z_transpose - tf.expand_dims(Xmu, 2) # NxDxM exponent_mahalanobis = tf.matrix_triangular_solve(chol_L_plus_Xcov, all_diffs, lower=True) # NxDxM exponent_mahalanobis = tf.reduce_sum(tf.square(exponent_mahalanobis), 1) # NxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxM sqrt_det_L = tf.reduce_prod(rbf_kern_lengthscales) sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1)) determinants = sqrt_det_L / sqrt_det_L_plus_Xcov # N eKxz_rbf = rbf_kern.variance * (determinants[:, None] * exponent_mahalanobis) ## NxM <- End RBF eKxz code tiled_Z = tf.tile(tf.expand_dims(Z_transpose, 0), (N, 1, 1)) # NxDxM z_L_inv_Xcov = tf.matmul(tiled_Z, Xcov / rbf_kern_lengthscales[:, None] ** 2., transpose_a=True) # NxMxD cross_eKzxKxz = tf.cholesky_solve( chol_L_plus_Xcov, (lin_kern_variances * rbf_kern_lengthscales ** 2.)[..., None] * tiled_Z) # NxDxM cross_eKzxKxz = tf.matmul((z_L_inv_Xcov + Xmu[:, None, :]) * eKxz_rbf[..., None], cross_eKzxKxz) # NxMxM return cross_eKzxKxz
def _expectation(p, kern1, feat1, kern2, feat2, nghp=None): """ Compute the expectation: expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n) - Ka_{.,.}, Kb_{.,.} :: RBF kernels Ka and Kb as well as Z1 and Z2 can differ from each other, but this is supported only if the Gaussian p is Diagonal (p.cov NxD) and Ka, Kb have disjoint active_dims in which case the joint expectations simplify into a product of expectations :return: NxMxM """ if kern1.on_separate_dims(kern2) and isinstance(p, DiagonalGaussian): # no joint expectations required eKxz1 = expectation(p, (kern1, feat1)) eKxz2 = expectation(p, (kern2, feat2)) return eKxz1[:, :, None] * eKxz2[:, None, :] if feat1 != feat2 or kern1 != kern2: raise NotImplementedError("The expectation over two kernels has only an " "analytical implementation if both kernels are equal.") kern = kern1 feat = feat1 with params_as_tensors_for(kern), params_as_tensors_for(feat): # use only active dimensions Xcov = kern._slice_cov(tf.matrix_diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov) Z, Xmu = kern._slice(feat.Z, p.mu) N = tf.shape(Xmu)[0] D = tf.shape(Xmu)[1] squared_lengthscales = kern.lengthscales ** 2. if kern.ARD \ else tf.zeros((D,), dtype=settings.tf_float) + kern.lengthscales ** 2. sqrt_det_L = tf.reduce_prod(0.5 * squared_lengthscales) ** 0.5 C = tf.cholesky(0.5 * tf.matrix_diag(squared_lengthscales) + Xcov) # NxDxD dets = sqrt_det_L / tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(C)), axis=1)) # N C_inv_mu = tf.matrix_triangular_solve(C, tf.expand_dims(Xmu, 2), lower=True) # NxDx1 C_inv_z = tf.matrix_triangular_solve(C, tf.tile(tf.expand_dims(tf.transpose(Z) / 2., 0), [N, 1, 1]), lower=True) # NxDxM mu_CC_inv_mu = tf.expand_dims(tf.reduce_sum(tf.square(C_inv_mu), 1), 2) # Nx1x1 z_CC_inv_z = tf.reduce_sum(tf.square(C_inv_z), 1) # NxM zm_CC_inv_zn = tf.matmul(C_inv_z, C_inv_z, transpose_a=True) # NxMxM two_z_CC_inv_mu = 2 * tf.matmul(C_inv_z, C_inv_mu, transpose_a=True)[:, :, 0] # NxM exponent_mahalanobis = mu_CC_inv_mu + tf.expand_dims(z_CC_inv_z, 1) + \ tf.expand_dims(z_CC_inv_z, 2) + 2 * zm_CC_inv_zn - \ tf.expand_dims(two_z_CC_inv_mu, 2) - tf.expand_dims(two_z_CC_inv_mu, 1) # NxMxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxMxM # Compute sqrt(self.K(Z)) explicitly to prevent automatic gradient from # being NaN sometimes, see pull request #615 kernel_sqrt = tf.exp(-0.25 * kern.square_dist(Z, None)) return kern.variance ** 2 * kernel_sqrt * \ tf.reshape(dets, [N, 1, 1]) * exponent_mahalanobis
def testGrad(self):
    shapes = ((3, 3), (5, 3, 3))
    with self.test_session(use_gpu=self._use_gpu):
        for shape in shapes:
            x = tf.constant(np.random.rand(*shape), dtype=np.float32)
            y = tf.matrix_diag_part(x)
            error = tf.test.compute_gradient_error(
                x, x.get_shape().as_list(), y, y.get_shape().as_list())
            self.assertLess(error, 1e-4)
def _variance(self):
    # Because df is a scalar, we need to expand dimensions to match
    # scale_operator. We use ellipses notation (...) to select all dimensions
    # and add two dimensions to the end.
    df = self.df[..., tf.newaxis, tf.newaxis]
    x = tf.sqrt(df) * self._square_scale_operator()
    d = tf.expand_dims(tf.matrix_diag_part(x), -1)
    v = tf.square(x) + tf.matmul(d, d, adjoint_b=True)
    return v
def testRectangularBatch(self):
    with self.test_session(use_gpu=self._use_gpu):
        v_batch = np.array([[1.0, 2.0], [4.0, 5.0]])
        mat_batch = np.array(
            [[[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]],
             [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0]]])
        self.assertEqual(mat_batch.shape, (2, 2, 3))
        mat_batch_diag = tf.matrix_diag_part(mat_batch)
        self.assertEqual((2, 2), mat_batch_diag.get_shape())
        self.assertAllEqual(mat_batch_diag.eval(), v_batch)
def _forward_log_det_jacobian(self, x):
    # CholeskyToInvCholesky.forward(X) is equivalent to
    # 1) M = CholeskyOuterProduct.forward(X)
    # 2) N = invert(M)
    # 3) Y = CholeskyOuterProduct.inverse(N)
    #
    # For step 1,
    #   |Jac(outerprod(X))| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
    # For step 2,
    #   |Jac(inverse(M))| = |M|^{-(p+1)} (because M is symmetric)
    #                     = |X|^{-2(p+1)} = (prod_{j=0}^{p-1} X[j,j])^{-2(p+1)}
    #   (see http://web.mit.edu/18.325/www/handouts/handout2.pdf sect 3.0.2)
    # For step 3,
    #   |Jac(Cholesky(N))| = -|Jac(outerprod(Y))|
    #                      = 2^p prod_{j=0}^{p-1} Y[j,j]^{p-j}
    n = tf.cast(tf.shape(x)[-1], x.dtype)
    y = self._forward(x)
    return (
        (self._cholesky.forward_log_det_jacobian(x, event_ndims=2) -
         (n + 1.) * tf.reduce_sum(tf.log(tf.matrix_diag_part(x)), axis=-1)) -
        (self._cholesky.forward_log_det_jacobian(y, event_ndims=2) -
         (n + 1.) * tf.reduce_sum(tf.log(tf.matrix_diag_part(y)), axis=-1)))
def _expectation(p, kern, none1, none2, none3, nghp=None):
    """
    Compute the expectation:
    <diag(K_{X, X})>_p(X)
        - K_{.,.} :: Linear kernel

    :return: N
    """
    with params_as_tensors_for(kern):
        # use only active dimensions
        Xmu, _ = kern._slice(p.mu, None)
        Xcov = kern._slice_cov(p.cov)

        return tf.reduce_sum(kern.variance * (tf.matrix_diag_part(Xcov) + tf.square(Xmu)), 1)
def _build_likelihood(self): """ Construct a tensorflow function to compute the bound on the marginal likelihood. For a derivation of the terms in here, see the associated SGPR notebook. """ num_inducing = len(self.feature) num_data = tf.cast(tf.shape(self.Y)[0], settings.float_type) output_dim = tf.cast(tf.shape(self.Y)[1], settings.float_type) err = self.Y - self.mean_function(self.X) Kdiag = self.kern.Kdiag(self.X) Kuf = self.feature.Kuf(self.kern, self.X) Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level) L = tf.cholesky(Kuu) sigma = tf.sqrt(self.likelihood.variance) # Compute intermediate matrices A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma AAT = tf.matmul(A, A, transpose_b=True) B = AAT + tf.eye(num_inducing, dtype=settings.float_type) LB = tf.cholesky(B) Aerr = tf.matmul(A, err) c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma # compute log marginal bound bound = -0.5 * num_data * output_dim * np.log(2 * np.pi) bound += tf.negative(output_dim) * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB))) bound -= 0.5 * num_data * output_dim * tf.log(self.likelihood.variance) bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance bound += 0.5 * tf.reduce_sum(tf.square(c)) bound += -0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance bound += 0.5 * output_dim * tf.reduce_sum(tf.matrix_diag_part(AAT)) return bound
def fit(self, x=None, y=None):
    # p(coeffs | x, y) = Normal(coeffs |
    #   mean = (1/noise_variance) (1/noise_variance x^T x + I)^{-1} x^T y,
    #   covariance = (1/noise_variance x^T x + I)^{-1})
    # TODO(trandustin): We newly fit the data at each call. Extend to do
    # Bayesian updating.
    kernel_matrix = tf.matmul(x, x, transpose_a=True) / self.noise_variance
    coeffs_precision = tf.matrix_set_diag(
        kernel_matrix, tf.matrix_diag_part(kernel_matrix) + 1.)
    coeffs_precision_tril = tf.linalg.cholesky(coeffs_precision)
    self.coeffs_precision_tril_op = tf.linalg.LinearOperatorLowerTriangular(
        coeffs_precision_tril)
    self.coeffs_mean = self.coeffs_precision_tril_op.solvevec(
        self.coeffs_precision_tril_op.solvevec(tf.einsum('nm,n->m', x, y)),
        adjoint=True) / self.noise_variance
    # TODO(trandustin): To be fully Keras-compatible, return History object.
    return
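# Hedged sketch (toy data, names local to this sketch): the closed form quoted in the
# comment above, computed densely with NumPy, plus the same posterior mean obtained
# via the two triangular solves that fit() performs with the Cholesky factor.
import numpy as np

rng = np.random.RandomState(3)
n, m, noise_variance = 50, 4, 0.1
x = rng.randn(n, m)
y = x @ rng.randn(m) + np.sqrt(noise_variance) * rng.randn(n)

precision = x.T @ x / noise_variance + np.eye(m)     # coeffs_precision
covariance = np.linalg.inv(precision)                # posterior covariance
mean = covariance @ x.T @ y / noise_variance         # posterior mean

tril = np.linalg.cholesky(precision)
mean_chol = np.linalg.solve(tril.T, np.linalg.solve(tril, x.T @ y)) / noise_variance
np.testing.assert_allclose(mean, mean_chol)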
def zero_mean_covariance(covariance, stability=0.0):
    '''Output covariance of ReLU for zero-mean Gaussian input.

    f(x) = max(x, 0).

    Args:
        covariance: Input covariance matrix (Size, Size).
        stability: For accurate results this should be zero if used in
            training, use a value like 1e-4 for stability.

    Returns:
        Output covariance of ReLU for zero-mean Gaussian input (Size, Size).
    '''
    S = outer(tf.sqrt(tf.matrix_diag_part(covariance)))
    V = tf.clip_by_value(covariance / S, stability - 1.0, 1.0 - stability)
    Q = tf.acos(-V) * V + tf.sqrt(1.0 - (V**2.0)) - 1.0
    return S * Q * (1.0 / (2.0 * math.pi))
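# Hedged Monte-Carlo check (illustration only, not the library's test): sample a
# zero-mean Gaussian, pass it through ReLU, and compare the empirical covariance
# with the arccos formula implemented above.
import math
import numpy as np

rng = np.random.RandomState(4)
cov = np.array([[2.0, 0.6],
                [0.6, 1.0]])
samples = rng.multivariate_normal(np.zeros(2), cov, size=2_000_000)
relu_samples = np.maximum(samples, 0.0)
mc_cov = np.cov(relu_samples, rowvar=False)

S = np.sqrt(np.outer(np.diag(cov), np.diag(cov)))
V = cov / S
Q = np.arccos(-V) * V + np.sqrt(1.0 - V ** 2) - 1.0
analytic = S * Q / (2.0 * math.pi)
np.testing.assert_allclose(mc_cov, analytic, atol=1e-2)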
def _assertions(self, x):
    if not self.validate_args:
        return []
    x_shape = tf.shape(x)
    is_matrix = tf.assert_rank_at_least(
        x, 2, message="Input must have rank at least 2.")
    is_square = tf.assert_equal(
        x_shape[-2], x_shape[-1], message="Input must be a square matrix.")
    diag_part_x = tf.matrix_diag_part(x)
    is_lower_triangular = tf.assert_equal(
        tf.matrix_band_part(x, 0, -1),  # Preserves triu, zeros rest.
        tf.matrix_diag(diag_part_x),
        message="Input must be lower triangular.")
    is_positive_diag = tf.assert_positive(
        diag_part_x,
        message="Input must have all positive diagonal entries.")
    return [is_matrix, is_square, is_lower_triangular, is_positive_diag]
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.
    """
    # The FITC approximation to the log marginal likelihood is
    #     log ( normal( y | mean, K_fitc ) )
    # where K_fitc = Qff + diag( \nu )
    # where Qff = Kfu Kuu^{-1} Kuf
    # with \nu_i = Kff_{i,i} - Qff_{i,i} + \sigma^2
    #
    # We need to compute the Mahalanobis term -0.5 * err^T K_fitc^{-1} err
    # (summed over functions).
    #
    # We need to deal with the matrix inverse term.
    #     K_fitc^{-1} = ( Qff + \diag( \nu ) )^{-1}
    #                 = ( V^T V + \diag( \nu ) )^{-1}
    # Applying the Woodbury identity we obtain
    #                 = \diag( \nu^{-1} )
    #                   - \diag( \nu^{-1} ) V^T ( I + V \diag( \nu^{-1} ) V^T )^{-1} V \diag( \nu^{-1} )
    # Let \beta = \diag( \nu^{-1} ) err
    # and let \alpha = V \beta
    # then Mahalanobis term = -0.5 * ( \beta^T err - \alpha^T Solve( I + V \diag( \nu^{-1} ) V^T, alpha ) )
    err, nu, Luu, L, alpha, beta, gamma = self._build_common_terms()
    mahalanobisTerm = -0.5 * tf.reduce_sum(tf.square(err) / tf.expand_dims(nu, 1)) \
                      + 0.5 * tf.reduce_sum(tf.square(gamma))

    # We need to compute the log normalizing term -N/2 \log 2 pi - 0.5 \log \det( K_fitc )
    #
    # We need to deal with the log determinant term.
    #     \log \det( K_fitc ) = \log \det( Qff + \diag( \nu ) )
    #                         = \log \det( V^T V + \diag( \nu ) )
    # Applying the determinant lemma we obtain
    #                         = \log [ \det \diag( \nu ) \det( I + V \diag( \nu^{-1} ) V^T ) ]
    #                         = \log [ \det \diag( \nu ) ] + \log [ \det( I + V \diag( \nu^{-1} ) V^T ) ]
    constantTerm = -0.5 * self.num_data * tf.log(tf.constant(2. * np.pi, settings.float_type))
    logDeterminantTerm = -0.5 * tf.reduce_sum(tf.log(nu)) - tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
    logNormalizingTerm = constantTerm + logDeterminantTerm

    return mahalanobisTerm + logNormalizingTerm * self.num_latent
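# Standalone numeric check (toy sizes, names local to this sketch) of the matrix
# determinant lemma used for the log-determinant term above:
#   log det(V^T V + diag(nu)) = sum(log nu) + log det(I + V diag(1/nu) V^T),
# which is why the code only needs log(nu) and the Cholesky factor L of the small
# M x M matrix.
import numpy as np

rng = np.random.RandomState(5)
n, m = 20, 5                          # n data points, m inducing points
V = rng.randn(m, n)
nu = rng.rand(n) + 0.5                # positive heteroscedastic noise terms

lhs = np.linalg.slogdet(V.T @ V + np.diag(nu))[1]
rhs = np.sum(np.log(nu)) + np.linalg.slogdet(np.eye(m) + (V / nu) @ V.T)[1]
np.testing.assert_allclose(lhs, rhs)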
def testCovarianceFromSampling(self): # We will test mean, cov, var, stddev on a DirichletMultinomial constructed # via broadcast between alpha, n. alpha = np.array([[1., 2, 3], [2.5, 4, 0.01]], dtype=np.float32) # Ideally we'd be able to test broadcasting but, the multinomial sampler # doesn't support different total counts. n = np.float32(5) with self.cached_session() as sess: # batch_shape=[2], event_shape=[3] dist = ds.DirichletMultinomial(n, alpha) x = dist.sample(int(250e3), seed=1) sample_mean = tf.reduce_mean(x, 0) x_centered = x - sample_mean[tf.newaxis, ...] sample_cov = tf.reduce_mean(tf.matmul( x_centered[..., tf.newaxis], x_centered[..., tf.newaxis, :]), 0) sample_var = tf.matrix_diag_part(sample_cov) sample_stddev = tf.sqrt(sample_var) [ sample_mean_, sample_cov_, sample_var_, sample_stddev_, analytic_mean, analytic_cov, analytic_var, analytic_stddev, ] = sess.run([ sample_mean, sample_cov, sample_var, sample_stddev, dist.mean(), dist.covariance(), dist.variance(), dist.stddev(), ]) self.assertAllClose(sample_mean_, analytic_mean, atol=0.04, rtol=0.) self.assertAllClose(sample_cov_, analytic_cov, atol=0.05, rtol=0.) self.assertAllClose(sample_var_, analytic_var, atol=0.05, rtol=0.) self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.02, rtol=0.)
def _define_full_covariance_probs(self, shard_id, shard): """Defines the full covariance probabilties per example in a class. Updates a matrix with dimension num_examples X num_classes. Args: shard_id: id of the current shard. shard: current data shard, 1 X num_examples X dimensions. """ diff = shard - self._means cholesky = tf.cholesky(self._covs + self._min_var) log_det_covs = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(cholesky)), 1) x_mu_cov = tf.square( tf.matrix_triangular_solve( cholesky, tf.transpose( diff, perm=[0, 2, 1]), lower=True)) diag_m = tf.transpose(tf.reduce_sum(x_mu_cov, 1)) self._probs[shard_id] = -0.5 * ( diag_m + tf.to_float(self._dimensions) * tf.log(2 * np.pi) + log_det_covs)
def testCovarianceFromSampling(self): # We will test mean, cov, var, stddev on a Multinomial constructed via # broadcast between alpha, n. theta = np.array([[1., 2, 3], [2.5, 4, 0.01]], dtype=np.float32) theta /= np.sum(theta, 1)[..., tf.newaxis] n = np.array([[10., 9.], [8., 7.], [6., 5.]], dtype=np.float32) with self.cached_session() as sess: # batch_shape=[3, 2], event_shape=[3] dist = multinomial.Multinomial(n, theta) x = dist.sample(int(1000e3), seed=1) sample_mean = tf.reduce_mean(x, 0) x_centered = x - sample_mean[tf.newaxis, ...] sample_cov = tf.reduce_mean(tf.matmul( x_centered[..., tf.newaxis], x_centered[..., tf.newaxis, :]), 0) sample_var = tf.matrix_diag_part(sample_cov) sample_stddev = tf.sqrt(sample_var) [ sample_mean_, sample_cov_, sample_var_, sample_stddev_, analytic_mean, analytic_cov, analytic_var, analytic_stddev, ] = sess.run([ sample_mean, sample_cov, sample_var, sample_stddev, dist.mean(), dist.covariance(), dist.variance(), dist.stddev(), ]) self.assertAllClose(sample_mean_, analytic_mean, atol=0.01, rtol=0.01) self.assertAllClose(sample_cov_, analytic_cov, atol=0.01, rtol=0.01) self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01) self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
def _assertions(self, x):
    if not self.validate_args:
        return []
    shape = tf.shape(x)
    is_matrix = tf.assert_rank_at_least(
        x, 2, message="Input must have rank at least 2.")
    is_square = tf.assert_equal(
        shape[-2], shape[-1], message="Input must be a square matrix.")
    above_diagonal = tf.matrix_band_part(
        tf.matrix_set_diag(x, tf.zeros(shape[:-1], dtype=tf.float32)), 0, -1)
    is_lower_triangular = tf.assert_equal(
        above_diagonal, tf.zeros_like(above_diagonal),
        message="Input must be lower triangular.")
    # A lower triangular matrix is nonsingular iff all its diagonal entries are
    # nonzero.
    diag_part = tf.matrix_diag_part(x)
    is_nonsingular = tf.assert_none_equal(
        diag_part, tf.zeros_like(diag_part),
        message="Input must have all diagonal entries nonzero.")
    return [is_matrix, is_square, is_lower_triangular, is_nonsingular]
def _forward_log_det_jacobian(self, x):
    # Calculation of the Jacobian:
    #
    # Let X = (x_{ij}), 0 <= i,j < n, be a matrix of indeterminates. Let Z =
    # X^{-1} where Z = (z_{ij}). Then
    #
    #     dZ/dx_{ij} = (d/dt | t=0) Y(t)^{-1},
    #
    # where Y(t) = X + t*E_{ij} and E_{ij} is the matrix with a 1 in the (i,j)
    # entry and zeros elsewhere. By the product rule,
    #
    #     0 = d/dt [Identity matrix]
    #       = d/dt [Y Y^{-1}]
    #       = Y d/dt[Y^{-1}] + dY/dt Y^{-1}
    #
    # so
    #
    #     d/dt[Y^{-1}] = -Y^{-1} dY/dt Y^{-1}
    #                  = -Y^{-1} E_{ij} Y^{-1}.
    #
    # Evaluating at t=0,
    #
    #     dZ/dx_{ij} = -Z E_{ij} Z.
    #
    # Taking the (r,s) entry of each side,
    #
    #     dz_{rs}/dx_{ij} = -z_{ri}z_{sj}.
    #
    # Now, let J be the Jacobian dZ/dX, arranged as the n^2-by-n^2 matrix whose
    # (r*n + s, i*n + j) entry is dz_{rs}/dx_{ij}. Considering J as an n-by-n
    # block matrix with n-by-n blocks, the above expression for dz_{rs}/dx_{ij}
    # shows that the block at position (r,i) is -z_{ri}Z. Hence
    #
    #     J = -KroneckerProduct(Z, Z),
    #     det(J) = (-1)^(n^2) (det Z)^(2n)
    #            = (-1)^n (det X)^(-2n).
    with tf.control_dependencies(self._assertions(x)):
        return (-2. * tf.cast(tf.shape(x)[-1], x.dtype.base_dtype) *
                tf.reduce_sum(tf.log(tf.abs(tf.matrix_diag_part(x))), axis=-1))
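# Illustrative check (not part of the bijector): the identity the derivation above
# lands on, |det(-Z kron Z)| = |det X|^(-2n), verified numerically with a dense
# Kronecker product for a small invertible X.
import numpy as np

rng = np.random.RandomState(6)
n = 3
X = rng.randn(n, n) + n * np.eye(n)   # well-conditioned invertible matrix
Z = np.linalg.inv(X)

J = -np.kron(Z, Z)                    # Jacobian of X -> X^{-1}, as derived above
np.testing.assert_allclose(np.linalg.slogdet(J)[1],          # log|det J|
                           -2.0 * n * np.linalg.slogdet(X)[1])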
def create_model(self,
                 model_input,
                 vocab_size,
                 is_training,
                 num_mixtures=None,
                 l2_penalty=1e-8,
                 **unused_params):
    """Creates a Mixture of (Logistic) Experts model.

    It also includes the possibility of gating the probabilities.

    The model consists of a per-class softmax distribution over a
    configurable number of logistic classifiers. One of the classifiers in
    the mixture is not trained, and always predicts 0.

    Args:
        model_input: 'batch_size' x 'num_features' matrix of input features.
        vocab_size: The number of classes in the dataset.
        is_training: Is this the training phase?
        num_mixtures: The number of mixtures (excluding a dummy 'expert' that
            always predicts the non-existence of an entity).
        l2_penalty: How much to penalize the squared magnitudes of parameter
            values.

    Returns:
        A dictionary with a tensor containing the probability predictions of
        the model in the 'predictions' key. The dimensions of the tensor are
        batch_size x num_classes.
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    remove_diag = FLAGS.gating_remove_diag

    if low_rank_gating == -1:
        gate_activations = layers.fully_connected(
            model_input,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates",
            float16_flag=FLAGS.float16_flag)
    else:
        gate_activations1 = slim.fully_connected(
            model_input,
            low_rank_gating,
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates1")
        gate_activations = slim.fully_connected(
            gate_activations1,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates2")

    expert_activations = layers.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts",
        float16_flag=FLAGS.float16_flag)

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures
    if '16' in str(expert_distribution.dtype):
        expert_distribution = tf.cast(expert_distribution, tf.float32)

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                               [-1, vocab_size])

    if gating_probabilities:
        if gating_input == 'prob':
            gating_weights = tf.get_variable(
                "gating_prob_weights",
                [vocab_size, vocab_size],
                initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)),
                dtype=tf.float16 if FLAGS.float16_flag else tf.float32)
            gates = tf.matmul(
                probabilities,
                tf.cast(gating_weights, tf.float32)
                if '16' in str(gating_weights.dtype) else gating_weights)
        else:
            gating_weights = tf.get_variable(
                "gating_prob_weights",
                [input_size, vocab_size],
                initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)),
                dtype=tf.float16 if FLAGS.float16_flag else tf.float32)
            gates = tf.matmul(
                model_input,
                tf.cast(gating_weights, tf.float32)
                if '16' in str(gating_weights.dtype) else gating_weights)

        if remove_diag:
            # removes diagonal coefficients
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals, probabilities)

        gates = slim.batch_norm(
            gates,
            center=True,
            scale=True,
            is_training=is_training,
            scope="gating_prob_bn")
        gates = tf.sigmoid(gates)

        probabilities = tf.multiply(probabilities, gates)

    return {"predictions": probabilities}
def gauss_kl(q_mu, q_sqrt, K=None): """ Compute the KL divergence KL[q || p] between q(x) = N(q_mu, q_sqrt^2) and p(x) = N(0, K) We assume N multiple independent distributions, given by the columns of q_mu and the last dimension of q_sqrt. Returns the sum of the divergences. q_mu is a matrix (M x L), each column contains a mean. q_sqrt can be a 3D tensor (L xM x M), each matrix within is a lower triangular square-root matrix of the covariance of q. q_sqrt can be a matrix (M x L), each column represents the diagonal of a square-root matrix of the covariance of q. K is the covariance of p. It is a positive definite matrix (M x M) or a tensor of stacked such matrices (L x M x M) If K is None, compute the KL divergence to p(x) = N(0, I) instead. """ white = K is None diag = q_sqrt.get_shape().ndims == 2 M, B = tf.shape(q_mu)[0], tf.shape(q_mu)[1] if white: alpha = q_mu # M x B else: batch = K.get_shape().ndims == 3 Lp = tf.cholesky(K) # B x M x M or M x M q_mu = tf.transpose( q_mu)[:, :, None] if batch else q_mu # B x M x 1 or M x B alpha = tf.matrix_triangular_solve(Lp, q_mu, lower=True) # B x M x 1 or M x B if diag: Lq = Lq_diag = q_sqrt Lq_full = tf.matrix_diag(tf.transpose(q_sqrt)) # B x M x M else: Lq = Lq_full = tf.matrix_band_part( q_sqrt, -1, 0) # force lower triangle # B x M x M Lq_diag = tf.matrix_diag_part(Lq) # M x B # Mahalanobis term: μqᵀ Σp⁻¹ μq mahalanobis = tf.reduce_sum(tf.square(alpha)) # Constant term: - B * M constant = tf.cast(-tf.size(q_mu, out_type=tf.int64), dtype=settings.float_type) # Log-determinant of the covariance of q(x): logdet_qcov = tf.reduce_sum(tf.log(tf.square(Lq_diag))) # Trace term: tr(Σp⁻¹ Σq) if white: trace = tf.reduce_sum(tf.square(Lq)) else: if diag and not batch: # K is M x M and q_sqrt is M x B: fast specialisation LpT = tf.transpose(Lp) # M x M Lp_inv = tf.matrix_triangular_solve(Lp, tf.eye( M, dtype=settings.float_type), lower=True) # M x M K_inv = tf.matrix_diag_part( tf.matrix_triangular_solve( LpT, Lp_inv, lower=False))[:, None] # M x M -> M x 1 trace = tf.reduce_sum(K_inv * tf.square(q_sqrt)) else: # TODO: broadcast instead of tile when tf allows (not implemented in tf <= 1.6.0) Lp_full = Lp if batch else tf.tile(tf.expand_dims(Lp, 0), [B, 1, 1]) LpiLq = tf.matrix_triangular_solve(Lp_full, Lq_full, lower=True) trace = tf.reduce_sum(tf.square(LpiLq)) twoKL = mahalanobis + constant - logdet_qcov + trace # Log-determinant of the covariance of p(x): if not white: log_sqdiag_Lp = tf.log(tf.square(tf.matrix_diag_part(Lp))) sum_log_sqdiag_Lp = tf.reduce_sum(log_sqdiag_Lp) # If K is B x M x M, num_latent is no longer implicit, no need to multiply the single kernel logdet scale = 1.0 if batch else tf.cast(B, settings.float_type) twoKL += scale * sum_log_sqdiag_Lp return 0.5 * twoKL
def build_model(self): # Compression Network # Takes x # Produces x' # Produces z = concat((z_c, z_r)) (Equations 1, 2, 3) # z_r = concat((eu_dist, cos_sim)) self.input = tf.placeholder( shape=(None, self.input_dim), dtype=tf.float32, name="input", ) encoder_1 = tf.layers.dense( inputs=self.input, units=12, activation=tf.tanh, ) encoder_2 = tf.layers.dense( inputs=encoder_1, units=4, activation=tf.tanh, ) self.z_c = tf.layers.dense( inputs=encoder_2, units=1, activation=None, ) decoder_1 = tf.layers.dense( inputs=self.z_c, units=4, activation=tf.tanh, ) decoder_2 = tf.layers.dense( inputs=decoder_1, units=12, activation=tf.tanh, ) self.recon = tf.layers.dense( inputs=decoder_2, units=self.input_dim, activation=None, ) eu_dist = tf.norm(self.input - self.recon, axis=1, keep_dims=True) / tf.norm(self.input, axis=1, keep_dims=True) cos_sim = tf.reduce_sum(self.input * self.recon, axis=1, keep_dims=True) / (tf.norm(self.input, axis=1, keep_dims=True) * tf.norm(self.recon, axis=1, keep_dims=True)) self.z_r = tf.concat((eu_dist, cos_sim), axis=1) self.z = tf.concat((self.z_c, self.z_r), axis=1) # Estimation Network # Takes z = concat((z_c, z_r)) # Produces p, where gamma = softmax(p) = soft mixture-component membership prediction (Equation 4) self.is_train = tf.placeholder( # for dropout shape=None, dtype=tf.bool, name="is_train", ) estim_1 = tf.layers.dense( inputs=self.z, units=10, activation=tf.tanh, ) estim_dropout = tf.layers.dropout( inputs=estim_1, rate=0.5, training=self.is_train, ) self.p = tf.layers.dense( inputs=estim_dropout, units=self.gmm_k, activation=None, ) self.gamma = tf.nn.softmax(self.p) # GMM parameters: gmm_dist (phi), gmm_mean (mu), gmm_cov (epsilon) (Equation 5) # self.gmm_dist = tf.expand_dims(tf.reduce_mean(self.gamma, axis=0, keep_dims=True), axis=2) self.gmm_dist = tf.transpose(tf.reduce_mean(self.gamma, axis=0, keep_dims=True)) self.gmm_mean = tf.matmul(self.gamma, self.z, transpose_a=True) / tf.transpose(tf.reduce_sum(self.gamma, axis=0, keep_dims=True)) self.diff_mean = diff_mean = tf.tile(tf.expand_dims(self.z, axis=0), tf.constant([self.gmm_k, 1, 1])) - tf.expand_dims(self.gmm_mean, axis=1) self.gmm_cov = tf.matmul(tf.transpose(diff_mean, perm=[0, 2, 1]), tf.expand_dims(tf.transpose(self.gamma), axis=2) * diff_mean) / tf.expand_dims(tf.transpose(tf.reduce_sum(self.gamma, axis=0, keep_dims=True)), axis=2) # Energy Function (Equation 6) energy_numerator = tf.exp(-0.5 * tf.reduce_sum(tf.matmul(self.diff_mean, self.gmm_cov) * self.diff_mean, axis=2)) energy_denominator = tf.expand_dims(tf.expand_dims(tf.sqrt(tf.matrix_determinant(2 * np.pi * self.gmm_cov)), axis=1), axis=2) self.energy = tf.expand_dims(-tf.log(tf.reduce_sum(tf.reduce_sum(tf.expand_dims(self.gmm_dist, axis=1) * energy_numerator / energy_denominator, axis=0), axis=0)), axis=1) # Loss Function (Equation 7) # Reconstruction loss + lmda_1 * Energy loss + lmda_2 * Diagonal loss # self.recon_loss = recon_loss = tf.losses.mean_squared_error(self.input, self.recon) self.recon_loss = recon_loss = tf.reduce_mean(tf.norm((self.input - self.recon), axis=1) ** 2) self.energy_loss = energy_loss = tf.reduce_mean(self.energy) self.diagonal_loss = diagonal_loss = tf.reduce_sum(tf.pow(tf.matrix_diag_part(self.gmm_cov), -tf.ones_like(tf.matrix_diag_part(self.gmm_cov)))) self.loss = recon_loss + self.lmda_1 * energy_loss + self.lmda_2 * diagonal_loss self.optimize = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(self.loss)
def _add_diagonal_shift(matrix, shift):
    diag_plus_shift = tf.matrix_diag_part(matrix) + shift
    return tf.matrix_set_diag(matrix, diag_plus_shift)
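# Hedged usage sketch (illustration only): a typical call site adds a small jitter to
# the diagonal of a Gram matrix so that the Cholesky factorization succeeds even when
# the matrix is numerically rank-deficient.
x = tf.random_normal([5, 3])
gram = tf.matmul(x, x, transpose_b=True)              # PSD but rank 3 at most
chol = tf.cholesky(_add_diagonal_shift(gram, 1e-6))   # succeeds thanks to the shift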
def measure_fock(self, modes, select=None, **kwargs): """ Measures 'modes' in the Fock basis and updates remaining modes conditioned on this result. After measurement, the states in 'modes' are reset to the vacuum. Args: modes (Sequence[int]): which modes to measure (in increasing order). select (Sequence[int]): user-specified measurement value (used instead of random sampling) **kwargs: can be used to pass a session or a feed_dict. Otherwise a temporary session and no feed_dict will be used. Returns: A list with the Fock number measurement results for each mode. """ # allow integer (non-list) arguments # not part of the API, but provided for convenience if isinstance(modes, int): modes = [modes] if isinstance(select, int): select = [select] # convert lists to np arrays if isinstance(modes, list): modes = np.array(modes) if isinstance(select, list): select = np.array(select) # check for valid 'modes' argument if len(modes) == 0 or len(modes) > self._num_modes or len( modes) != len(set(modes)): #pylint: disable=len-as-condition raise ValueError("Specified modes are not valid.") if np.any(modes != sorted(modes)): raise ValueError("'modes' must be sorted in increasing order.") # check for valid 'select' argument if select is not None: if np.any(select == None): #pylint: disable=singleton-comparison raise NotImplementedError( "Post-selection lists must only contain numerical values.") if self._batched: num_meas_modes = len(modes) # in this case, select must either be: # np array of shape (M,), or # np array of shape (B,M) # where B is the batch_size and M is the number of measured modes shape_err = False if len(select.shape) == 1: # non-batched list, must broadcast if select.shape[0] != num_meas_modes: shape_err = True else: select = np.vstack([select] * self._batch_size) elif len(select.shape) == 2: # batch of lists, no need to broadcast if select.shape != (self._batch_size, num_meas_modes): shape_err = True else: shape_err = True if shape_err: raise ValueError( "The shape of 'select' is incompatible with 'modes'.") else: # in this case, select should be a vector if select.shape != modes.shape: raise ValueError( "'select' must be have the same shape as 'modes'") # carry out the operation with self.graph.as_default(): evaluate_results, session, feed_dict, close_session = ops._check_for_eval( kwargs) num_reduced_state_modes = len(modes) reduced_state = self._state if self._state_is_pure: mode_size = 1 else: mode_size = 2 if self._batched: batch_size = self._batch_size batch_offset = 1 else: batch_size = 1 batch_offset = 0 if select is not None: # just use the supplied measurement results meas_result = select else: # compute and sample measurement result if self._state_is_pure and len(modes) == self._num_modes: # in this case, measure directly on the pure state probs = tf.abs(self._state)**2 logprobs = tf.log(probs) sample = tf.multinomial( tf.reshape(logprobs, [batch_size, -1]), 1) sample_tensor = tf.squeeze(sample) else: # otherwise, trace out unmeasured modes and sample using diagonal of reduced state removed_ctr = 0 red_state_is_pure = self._state_is_pure for m in range(self._num_modes): if m not in modes: new_mode_idx = m - removed_ctr reduced_state = ops.partial_trace( reduced_state, new_mode_idx, red_state_is_pure, self._batched) red_state_is_pure = False removed_ctr += 1 # go from bra_A,ket_A,bra_B,ket_B,... -> bra_A,bra_B,ket_A,ket_B,... 
since this is what diag_part expects # workaround for getting multi-index diagonal since tensorflow doesn't support getting diag of more than one subsystem at once if num_reduced_state_modes > 1: state_indices = np.arange(batch_offset + 2 * num_reduced_state_modes) batch_index = state_indices[:batch_offset] bra_indices = state_indices[batch_offset::2] ket_indices = state_indices[batch_offset + 1::2] transpose_list = np.concatenate( [batch_index, bra_indices, ket_indices]) reduced_state_reshuffled = tf.transpose( reduced_state, transpose_list) else: reduced_state_reshuffled = reduced_state diag_indices = [self._cutoff_dim**num_reduced_state_modes ] * 2 if self._batched: diag_indices = [self._batch_size] + diag_indices diag_tensor = tf.reshape(reduced_state_reshuffled, diag_indices) diag_entries = tf.matrix_diag_part(diag_tensor) # hack so we can use tf.multinomial for sampling logprobs = tf.log(tf.cast(diag_entries, tf.float64)) sample = tf.multinomial( tf.reshape(logprobs, [batch_size, -1]), 1) # sample is a single integer; we need to convert it to the corresponding [n0,n1,n2,...] sample_tensor = tf.squeeze(sample) # sample_val is a single integer for each batch entry; # we need to convert it to the corresponding [n0,n1,n2,...] meas_result = ops.unravel_index(sample_tensor, [self._cutoff_dim] * num_reduced_state_modes) if not self._batched: meas_result = meas_result[ 0] # no batch index, can get rid of first axis # unstack this here because that's how it should be returned meas_result = tf.unstack(meas_result, axis=-1, name="Meas_result") # project remaining modes into conditional state if len(modes) == self._num_modes: # in this case, all modes were measured and we can put everything in vacuum by reseting self.reset(pure=self._state_is_pure) else: # only some modes were measured: put unmeasured modes in conditional state, while reseting measured modes to vac fock_state = tf.one_hot(tf.stack(meas_result, axis=-1), depth=self._cutoff_dim, dtype=ops.def_type) conditional_state = self._state for idx, mode in enumerate(modes): if self._batched: f = fock_state[:, idx] else: f = fock_state[idx] conditional_state = ops.conditional_state( conditional_state, f, mode, self._state_is_pure, batched=self._batched) if self._state_is_pure: norm = tf.norm(tf.reshape(conditional_state, [batch_size, -1]), axis=1) else: # calculate norm of conditional_state # use a cheap hack since tensorflow doesn't allow einsum equation for trace: r = conditional_state for _ in range(self._num_modes - num_reduced_state_modes - 1): r = ops.partial_trace(r, 0, False, self._batched) norm = tf.trace(r) # for broadcasting norm_reshape = [1] * len( conditional_state.shape[batch_offset:]) if self._batched: norm_reshape = [self._batch_size] + norm_reshape normalized_conditional_state = conditional_state / tf.reshape( norm, norm_reshape) # reset measured modes into vacuum single_mode_vac = self._single_mode_pure_vac if self._state_is_pure else self._single_mode_mixed_vac if len(modes) == 1: meas_modes_vac = single_mode_vac else: meas_modes_vac = ops.combine_single_modes( [single_mode_vac] * len(modes), self._batched) batch_index = indices[:batch_offset] meas_mode_indices = indices[batch_offset:batch_offset + mode_size * len(modes)] conditional_indices = indices[batch_offset + mode_size * len(modes):batch_offset + mode_size * self._num_modes] eqn_lhs = batch_index + meas_mode_indices + "," + batch_index + conditional_indices eqn_rhs = '' meas_ctr = 0 cond_ctr = 0 for m in range(self._num_modes): if m in modes: # use measured_indices 
eqn_rhs += meas_mode_indices[mode_size * meas_ctr:mode_size * (meas_ctr + 1)] meas_ctr += 1 else: # use conditional indices eqn_rhs += conditional_indices[mode_size * cond_ctr:mode_size * (cond_ctr + 1)] cond_ctr += 1 eqn = eqn_lhs + "->" + batch_index + eqn_rhs new_state = tf.einsum(eqn, meas_modes_vac, normalized_conditional_state) self._update_state(new_state) # return measurement result if evaluate_results: _meas = [t.eval(feed_dict, session) for t in meas_result] if close_session: session.close() else: _meas = meas_result return tuple(_meas)
def logdet(self, A, **kwargs):
    A = (A + self.matrix_transpose(A)) / 2.
    term = tf.log(tf.matrix_diag_part(self.cholesky(A, **kwargs)))
    return 2 * tf.reduce_sum(term, -1)
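# Standalone NumPy check (illustration only) of the identity behind logdet() above:
# for symmetric positive definite A = L L^T, log det A = 2 * sum(log(diag(L))).
import numpy as np

rng = np.random.RandomState(7)
B = rng.randn(4, 4)
A = B @ B.T + 4 * np.eye(4)
L = np.linalg.cholesky(A)
np.testing.assert_allclose(2 * np.sum(np.log(np.diag(L))),
                           np.linalg.slogdet(A)[1])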
def __init__(self, params, is_training=True): self.is_training = is_training batch_size = params["batch_size"] num_layers = params['nlayer'] rnn_size = params['n_hidden'] grad_clip = params["grad_clip"] self.output_keep_prob = tf.placeholder(tf.float32) self.input_keep_prob = tf.placeholder(tf.float32) NOUT = params['n_output'] # Transition LSTM cell_lst = [] for i in range(num_layers): cell = rnncell.ModifiedLSTMCell( rnn_size, forget_bias=1, initializer=tf.contrib.layers.xavier_initializer(), num_proj=None, is_training=self.is_training) if i > -1 and is_training == True: cell_drop = rnncell.DropoutWrapper( cell, output_keep_prob=self.output_keep_prob) cell = cell_drop if i > 10 and params['input_keep_prob'] < 1: cell_drop = rnncell.DropoutWrapper( cell, input_keep_prob=self.input_keep_prob) cell = cell_drop cell_lst.append(cell) self.cell = rnncell.MultiRNNCell(cell_lst) # LSTM for Q noise cell_lst = [] for i in range(params['Qnlayer']): cell_Q_noise = rnncell.ModifiedLSTMCell( params['Qn_hidden'], forget_bias=1, initializer=tf.contrib.layers.xavier_initializer(), num_proj=None, is_training=self.is_training) if i > -1 and is_training == True: cell_drop = rnncell.DropoutWrapper( cell_Q_noise, output_keep_prob=self.output_keep_prob) cell_Q_noise = cell_drop if i > 10 and params['input_keep_prob'] < 1: cell_drop = rnncell.DropoutWrapper( cell, input_keep_prob=self.input_keep_prob) cell = cell_drop cell_lst.append(cell_Q_noise) self.cell_Q_noise = rnncell.MultiRNNCell(cell_lst) # LSTM for R noise cell_lst = [] for i in range(params['Rnlayer']): cell_R_noise = rnncell.ModifiedLSTMCell( params['Rn_hidden'], forget_bias=1, initializer=tf.contrib.layers.xavier_initializer(), num_proj=None, is_training=self.is_training) if i > -1 and is_training == True: cell_drop = rnncell.DropoutWrapper( cell_R_noise, output_keep_prob=self.output_keep_prob) cell_R_noise = cell_drop if i > 10 and params['input_keep_prob'] < 1: cell_drop = rnncell.DropoutWrapper( cell, input_keep_prob=self.input_keep_prob) cell = cell_drop cell_lst.append(cell_R_noise) self.cell_R_noise = rnncell.MultiRNNCell(cell_lst) self.initial_state = self.cell.zero_state( batch_size=params['batch_size'], dtype=tf.float32) self.initial_state_Q_noise = self.cell_Q_noise.zero_state( batch_size=params['batch_size'], dtype=tf.float32) # self.initial_state_R_noise = self.cell_Q_noise.zero_state(batch_size=params['batch_size'], dtype=tf.float32) self.initial_state_R_noise = self.cell_R_noise.zero_state( batch_size=params['batch_size'], dtype=tf.float32) self.repeat_data = tf.placeholder( dtype=tf.int32, shape=[params["batch_size"], params['seq_length']]) #Measurements self._z = tf.placeholder(dtype=tf.float32, shape=[None, params['seq_length'], NOUT ]) # batch size, seqlength, feature self._x_inp = tf.placeholder( dtype=tf.float32, shape=[None, NOUT], name='Initialx') # batch size, seqlength, feature self.target_data = tf.placeholder( dtype=tf.float32, shape=[None, params['seq_length'], NOUT]) # batch size, seqlength, feature self._P_inp = tf.placeholder(dtype=tf.float32, shape=[None, NOUT, NOUT], name='P') self._F = 0.0 # state transition matrix self._alpha_sq = 1. 
# fading memory control self.M = 0.0 # process-measurement cross correlation self._I = tf.placeholder(dtype=tf.float32, shape=[None, NOUT, NOUT], name='I') self.u = 0.0 xres_lst = [] xpred_lst = [] pres_lst = [] tres_lst = [] kres_lst = [] qres_lst = [] rres_lst = [] with tf.variable_scope('rnnlm'): output_w1 = tf.get_variable( "output_w1", [rnn_size, rnn_size], initializer=tf.contrib.layers.xavier_initializer()) output_b1 = tf.get_variable("output_b1", [rnn_size]) output_w2 = tf.get_variable( "output_w2", [rnn_size, rnn_size], initializer=tf.contrib.layers.xavier_initializer()) output_b2 = tf.get_variable("output_b2", [rnn_size]) output_w3 = tf.get_variable( "output_w3", [rnn_size, NOUT], initializer=tf.contrib.layers.xavier_initializer()) output_b3 = tf.get_variable("output_b3", [NOUT]) output_w1_Q_noise = tf.get_variable( "output_w_Q_noise", [params['Qn_hidden'], NOUT], initializer=tf.contrib.layers.xavier_initializer()) output_b1_Q_noise = tf.get_variable("output_b_Q_noise", [NOUT]) output_w1_R_noise = tf.get_variable( "output_w_R_noise", [params['Rn_hidden'], NOUT], initializer=tf.contrib.layers.xavier_initializer()) # output_b1_R_noise = tf.get_variable("output_b_R_noise", [NOUT],initializer=tf.ones_initializer()) output_b1_R_noise = tf.get_variable("output_b_R_noise", [NOUT]) # indices = list(zip(*np.tril_indices(NOUT))) indices = tf.constant([list(i) for i in indices], dtype=tf.int64) state_F = self.initial_state state_Q = self.initial_state_Q_noise state_R = self.initial_state_R_noise with tf.variable_scope("rnnlm"): for time_step in range(params['seq_length']): if time_step > 0: tf.get_variable_scope().reuse_variables() z = self._z[:, time_step, :] #bs,features if time_step == 0: # self._x= z self._x = self._x_inp self._P = self._P_inp with tf.variable_scope("transitionF"): (pred, state_F, ls_internals) = self.cell(self._x, state_F) # pred = tf.matmul(pred,output_w1)+output_b1 pred = tf.nn.relu( tf.add(tf.matmul(pred, output_w1), output_b1)) pred = tf.nn.relu( tf.add(tf.matmul(pred, output_w2), output_b2)) pred = tf.add(tf.matmul(pred, output_w3), output_b3) with tf.variable_scope("noiseQ"): (pred_Q_noise, state_Q, ls_internals) = self.cell_Q_noise(self._x, state_Q) pred_Q_noise = tf.matmul( pred_Q_noise, output_w1_Q_noise) + output_b1_Q_noise # one_mask = tf.ones(shape=(batch_size, NOUT)) # zero_mask = tf.zeros(shape=(batch_size, NOUT)) # random_mask = tf.random_uniform(shape=(batch_size, NOUT)) # means = tf.mul(tf.ones(shape=(batch_size, NOUT)), 1 - self.output_keep_prob) # mask = tf.select(random_mask - means > 0.5, zero_mask, one_mask) # meas_z = tf.select(self.output_keep_prob >= 1, z, tf.mul(z, mask)) # norm = tf.random_normal(shape=(batch_size, NOUT), mean=0, stddev=0.01) # meas_z = tf.select(self.output_keep_prob >= 1, z, tf.add(z, norm)) meas_z = z with tf.variable_scope("noiseR"): (pred_R_noise, state_R, ls_internals) = self.cell_R_noise(meas_z, state_R) pred_R_noise = tf.matmul( pred_R_noise, output_w1_R_noise) + output_b1_R_noise # self._x = pred # lst=tf.unpack(pred, axis=1) # Q= tf.sparse_to_dense(sparse_indices=indices, output_shape=[batch_size,NOUT, NOUT], \ # sparse_values=pred_Q_noise, default_value=0, \ # validate_indices=True) # Q = tf.matrix_diag(tf.exp(pred_Q_noise)) R = tf.matrix_diag(tf.exp(pred_R_noise)) # Q=tf.matmul(tf.matrix_diag(tf.exp(pred_Q_noise)),tf.matrix_diag(tf.exp(pred_Q_noise))) # R=tf.matmul(tf.matrix_diag(tf.exp(pred_R_noise)),tf.matrix_diag(tf.exp(pred_R_noise))) #predict P = self._P self._P = P + Q #update P = self._P x = self._x self._y 
= meas_z - x # S = HPH' + R # project system uncertainty into measurement space S = P + R # S = P # K = PH'inv(S) # map system uncertainty into kalman gain K = tf.matmul(P, tf.matrix_inverse(S)) #(Q+P_init/(R+Q+P_init)) # x = x + Ky # predict new x with residual scaled by the kalman gain self._x = tf.squeeze(tf.matmul(K, tf.expand_dims(self._y, 2)), -1) #K-->>1, _x=z, K-->>0, _x=x, xpred_lst.append(x) xres_lst.append(self._x) tres_lst.append(meas_z) kres_lst.append(tf.matrix_diag_part(K)) rres_lst.append(tf.matrix_diag_part(R)) qres_lst.append(tf.matrix_diag_part(Q)) # P = (I-KH)P(I-KH)' + KRK' I_KH = self._I - K self._P = tf.matmul( I_KH, tf.matmul(P, tf.matrix_transpose(I_KH))) + tf.matmul( K, tf.matmul(R, tf.matrix_transpose(K))) # self._P = tf.matmul(I_KH, tf.matmul(P, tf.matrix_transpose(I_KH))) + tf.matmul(K, tf.matrix_transpose(K)) self._S = S self._K = K final_output = tf.reshape(tf.transpose(tf.stack(xres_lst), [1, 0, 2]), [-1, params['n_output']]) final_pred_output = tf.reshape( tf.transpose(tf.stack(xpred_lst), [1, 0, 2]), [-1, params['n_output']]) final_q_output = tf.reshape( tf.transpose(tf.stack(qres_lst), [1, 0, 2]), [-1, params['n_output']]) final_r_output = tf.reshape( tf.transpose(tf.stack(rres_lst), [1, 0, 2]), [-1, params['n_output']]) final_k_output = tf.reshape( tf.transpose(tf.stack(kres_lst), [1, 0, 2]), [-1, params['n_output']]) final_meas_output = tf.reshape( tf.transpose(tf.stack(tres_lst), [1, 0, 2]), [-1, params['n_output']]) flt = tf.squeeze(tf.reshape(self.repeat_data, [-1, 1]), [1]) where_flt = tf.not_equal(flt, 0) indices = tf.where(where_flt) y = tf.reshape(self.target_data, [-1, params["n_output"]]) self.final_output = tf.gather(final_output, tf.squeeze(indices, [1])) self.final_pred_output = tf.gather(final_pred_output, tf.squeeze(indices, [1])) self.final_q_output = tf.gather(final_q_output, tf.squeeze(indices, [1])) self.final_r_output = tf.gather(final_r_output, tf.squeeze(indices, [1])) self.final_k_output = tf.gather(final_k_output, tf.squeeze(indices, [1])) self.final_meas_output = tf.gather(final_meas_output, tf.squeeze(indices, [1])) self.y = tf.gather(y, tf.squeeze(indices, [1])) tmp = self.final_output - self.y loss = tf.nn.l2_loss(tmp) tmp_pred = self.final_pred_output - self.y loss_pred = tf.nn.l2_loss(tmp_pred) # tmp_pred = self.final_pred_output - self.y # loss_pred = tf.nn.l2_loss(tmp_pred) self.tvars = tf.trainable_variables() l2_reg = tf.reduce_sum([tf.nn.l2_loss(var) for var in self.tvars]) l2_reg = tf.multiply(l2_reg, 1e-4) self.cost = tf.reduce_mean( loss) + l2_reg + 0.8 * tf.reduce_mean(loss_pred) self.lr = tf.Variable(0.0, trainable=False) tvars = tf.trainable_variables() total_parameters = 0 for variable in self.tvars: # shape is an array of tf.Dimension shape = variable.get_shape() variable_parametes = 1 for dim in shape: variable_parametes *= dim.value total_parameters += variable_parametes self.total_parameters = total_parameters grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), grad_clip) optimizer = tf.train.AdamOptimizer(self.lr) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) self.states = {} self.states["F_t"] = state_F self.states["Q_t"] = state_Q self.states["R_t"] = state_R self.states["PCov_t"] = self._P self.states["_x_t"] = self._x self.xres_lst = xres_lst self.pres_lst = pres_lst self.tres_lst = tres_lst self.kres_lst = kres_lst
def atten_col(self, avg_atten):
    # Sum attention along each row, then subtract the diagonal (self-attention)
    # entry so only attention involving other positions remains.
    row_sum = tf.reduce_sum(avg_atten, axis=1)
    diag_softmax = tf.matrix_diag_part(avg_atten)
    attended_by = tf.math.subtract(row_sum, diag_softmax)
    return attended_by
def sub_model(self, model_input, vocab_size, is_training, num_mixtures=None, l2_penalty=1e-8, sub_scope="", dropout=False, keep_prob=None, noise_level=None, **unused_params): num_mixtures = num_mixtures or FLAGS.moe_num_mixtures low_rank_gating = FLAGS.moe_low_rank_gating l2_penalty = FLAGS.moe_l2 gating_probabilities = FLAGS.moe_prob_gating gating_input = FLAGS.moe_prob_gating_input remove_diag = FLAGS.gating_remove_diag if dropout: model_input = tf.nn.dropout(model_input, keep_prob=keep_prob) gate_activations = slim.fully_connected( model_input, vocab_size * (num_mixtures + 1), activation_fn=None, biases_initializer=None, weights_regularizer=slim.l2_regularizer(l2_penalty), scope="gates-" + sub_scope) expert_activations = slim.fully_connected( model_input, vocab_size * num_mixtures, activation_fn=None, weights_regularizer=slim.l2_regularizer(l2_penalty), scope="experts-" + sub_scope) gating_distribution = tf.nn.softmax( tf.reshape(gate_activations, [-1, num_mixtures + 1 ])) # (Batch * #Labels) x (num_mixtures + 1) expert_distribution = tf.nn.sigmoid( tf.reshape(expert_activations, [-1, num_mixtures])) # (Batch * #Labels) x num_mixtures final_probabilities_by_class_and_batch = tf.reduce_sum( gating_distribution[:, :num_mixtures] * expert_distribution, 1) final_probabilities = tf.reshape( final_probabilities_by_class_and_batch, [-1, vocab_size]) probabilities = final_probabilities with tf.variable_scope(sub_scope): if gating_probabilities: if gating_input == 'prob': gating_weights = tf.get_variable( "gating_prob_weights", [vocab_size, vocab_size], initializer=tf.random_normal_initializer( stddev=1 / math.sqrt(vocab_size))) gates = tf.matmul(probabilities, gating_weights) else: gating_weights = tf.get_variable( "gating_prob_weights", [input_size, vocab_size], initializer=tf.random_normal_initializer( stddev=1 / math.sqrt(vocab_size))) gates = tf.matmul(model_input, gating_weights) if remove_diag: #removes diagonals coefficients diagonals = tf.matrix_diag_part(gating_weights) gates = gates - tf.multiply(diagonals, probabilities) gates = slim.batch_norm(gates, center=True, scale=True, is_training=is_training, scope="gating_prob_bn") gates = tf.sigmoid(gates) probabilities = tf.multiply(probabilities, gates) final_probabilities = probabilities return final_probabilities
def _add_diagonal_shift(matrix, shift): return tf.matrix_set_diag( matrix, tf.matrix_diag_part(matrix) + shift, name='add_diagonal_shift')
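A usage sketch with hypothetical values: a small diagonal shift of this kind is commonly used as "jitter" so that a nearly singular kernel or covariance matrix stays positive definite before a Cholesky factorization.

import tensorflow as tf

k = tf.constant([[1.0, 1.0],
                 [1.0, 1.0]])               # rank-deficient; tf.cholesky may fail on it
k_shifted = _add_diagonal_shift(k, 1e-6)    # eigenvalues now strictly positive
chol = tf.cholesky(k_shifted)               # factorization succeeds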
def outer_diag(self, tensor):
    # Take the diagonal over the *leading* two dimensions: transposing reverses
    # the axis order, tf.matrix_diag_part then reads the (now trailing) pair,
    # and the final transpose restores the remaining axes.
    trans = tf.transpose(tensor)
    diag = tf.matrix_diag_part(trans)
    return tf.transpose(diag)
def create_model(self, model_input, vocab_size, num_frames, iterations=None, add_batch_norm=None, sample_random_frames=None, cluster_size=None, hidden_size=None, **unused_params): iterations = iterations or FLAGS.iterations add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm random_frames = sample_random_frames or FLAGS.sample_random_frames cluster_size = cluster_size or FLAGS.netvlad_cluster_size hidden1_size = hidden_size or FLAGS.netvlad_hidden_size relu = FLAGS.netvlad_relu dimred = FLAGS.netvlad_dimred gating = FLAGS.gating remove_diag = FLAGS.gating_remove_diag print "FLAGS.lightvlad", FLAGS.lightvlad lightvlad = FLAGS.lightvlad vlagd = FLAGS.vlagd num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32) print "num_frames:", num_frames if random_frames: model_input = utils.SampleRandomFrames(model_input, num_frames, iterations) else: model_input = utils.SampleRandomSequence(model_input, num_frames, iterations) max_frames = model_input.get_shape().as_list()[1] feature_size = model_input.get_shape().as_list()[2] reshaped_input = tf.reshape(model_input, [-1, feature_size]) if lightvlad: video_NetVLAD = LightVLAD(1024, max_frames, cluster_size, add_batch_norm) audio_NetVLAD = LightVLAD(128, max_frames, cluster_size / 2, add_batch_norm) if add_batch_norm: # and not lightvlad: reshaped_input = slim.batch_norm(reshaped_input, center=True, scale=True, scope="input_bn") with tf.variable_scope("video_VLAD"): vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024]) with tf.variable_scope("audio_VLAD"): vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:]) vlad = tf.concat([vlad_video, vlad_audio], 1) vlad_dim = vlad.get_shape().as_list()[1] hidden1_weights = tf.get_variable( "hidden1_weights", [vlad_dim, hidden1_size], initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size))) activation = tf.matmul(vlad, hidden1_weights) if add_batch_norm and relu: activation = slim.batch_norm(activation, center=True, scale=True, scope="hidden1_bn") else: hidden1_biases = tf.get_variable( "hidden1_biases", [hidden1_size], initializer=tf.random_normal_initializer(stddev=0.01)) tf.summary.histogram("hidden1_biases", hidden1_biases) activation += hidden1_biases if relu: activation = tf.nn.relu6(activation) if gating: gating_weights = tf.get_variable( "gating_weights_2", [hidden1_size, hidden1_size], initializer=tf.random_normal_initializer( stddev=1 / math.sqrt(hidden1_size))) gates = tf.matmul(activation, gating_weights) if remove_diag: #removes diagonals coefficients diagonals = tf.matrix_diag_part(gating_weights) gates = gates - tf.multiply(diagonals, activation) if add_batch_norm: gates = slim.batch_norm(gates, center=True, scale=True, scope="gating_bn") else: gating_biases = tf.get_variable( "gating_biases", [cluster_size], initializer=tf.random_normal(stddev=1 / math.sqrt(feature_size))) gates += gating_biases gates = tf.sigmoid(gates) activation = tf.multiply(activation, gates) aggregated_model = getattr(video_level_models, FLAGS.video_level_classifier_model) return aggregated_model().create_model(model_input=activation, vocab_size=vocab_size, **unused_params)
def __init__(self, df, scale=None, scale_tril=None, input_output_cholesky=False, validate_args=False, allow_nan_stats=True, name="Wishart"): """Construct Wishart distributions. Args: df: `float` or `double` `Tensor`. Degrees of freedom, must be greater than or equal to dimension of the scale matrix. scale: `float` or `double` `Tensor`. The symmetric positive definite scale matrix of the distribution. Exactly one of `scale` and 'scale_tril` must be passed. scale_tril: `float` or `double` `Tensor`. The Cholesky factorization of the symmetric positive definite scale matrix of the distribution. Exactly one of `scale` and 'scale_tril` must be passed. input_output_cholesky: Python `bool`. If `True`, functions whose input or output have the semantics of samples assume inputs are in Cholesky form and return outputs in Cholesky form. In particular, if this flag is `True`, input to `log_prob` is presumed of Cholesky form and output from `sample`, `mean`, and `mode` are of Cholesky form. Setting this argument to `True` is purely a computational optimization and does not change the underlying distribution; for instance, `mean` returns the Cholesky of the mean, not the mean of Cholesky factors. The `variance` and `stddev` methods are unaffected by this flag. Default value: `False` (i.e., input/output does not have Cholesky semantics). validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: if zero or both of 'scale' and 'scale_tril' are passed in. """ parameters = dict(locals()) with tf.name_scope(name, values=[scale, scale_tril]) as name: with tf.name_scope("init", values=[scale, scale_tril]): if (scale is None) == (scale_tril is None): raise ValueError( "Must pass scale or scale_tril, but not both.") if scale is not None: scale = tf.convert_to_tensor(scale) if validate_args: scale = distribution_util.assert_symmetric(scale) scale_tril = tf.cholesky(scale) else: # scale_tril is not None scale_tril = tf.convert_to_tensor(scale_tril) if validate_args: scale_tril = control_flow_ops.with_dependencies([ tf.assert_positive( tf.matrix_diag_part(scale_tril), message="scale_tril must be positive definite" ), tf.assert_equal( tf.shape(scale_tril)[-1], tf.shape(scale_tril)[-2], message="scale_tril must be square") ], scale_tril) super(Wishart, self).__init__( df=df, scale_operator=tf.linalg.LinearOperatorLowerTriangular( tril=scale_tril, is_non_singular=True, is_positive_definite=True, is_square=True), input_output_cholesky=input_output_cholesky, validate_args=validate_args, allow_nan_stats=allow_nan_stats, name=name) self._parameters = parameters
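A hedged usage sketch for the constructor documented above (assuming the TFP build used in this file, where `tfd.Wishart` accepts `df`, `scale`, and `scale_tril`): the two parameterizations below describe the same distribution, since scale = scale_tril @ scale_tril^T.

import numpy as np
import tensorflow_probability as tfp
tfd = tfp.distributions

scale = np.array([[2.0, 0.5],
                  [0.5, 1.0]], dtype=np.float32)        # symmetric positive definite
w_full = tfd.Wishart(df=4., scale=scale)
w_chol = tfd.Wishart(df=4., scale_tril=np.linalg.cholesky(scale))
# Each draw is a 2x2 symmetric positive definite matrix.
samples = w_full.sample(3, seed=42)                     # shape [3, 2, 2]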
# Declare k-value and batch size k = 4 batch_size = len(x_vals_test) # Placeholders x_data_train = tf.placeholder(shape=[None, num_features], dtype=tf.float32) x_data_test = tf.placeholder(shape=[None, num_features], dtype=tf.float32) y_target_train = tf.placeholder(shape=[None, 1], dtype=tf.float32) y_target_test = tf.placeholder(shape=[None, 1], dtype=tf.float32) # Declare weighted distance metric # Weighted - L2 = sqrt((x-y)^T * A * (x-y)) subtraction_term = tf.subtract(x_data_train, tf.expand_dims(x_data_test, 1)) first_product = tf.matmul(subtraction_term, tf.tile(tf.expand_dims(weight_matrix, 0), [batch_size, 1, 1])) second_product = tf.matmul(first_product, tf.transpose(subtraction_term, perm=[0, 2, 1])) distance = tf.sqrt(tf.matrix_diag_part(second_product)) # Predict: Get min distance index (Nearest neighbor) top_k_xvals, top_k_indices = tf.nn.top_k(tf.negative(distance), k=k) x_sums = tf.expand_dims(tf.reduce_sum(top_k_xvals, 1), 1) x_sums_repeated = tf.matmul(x_sums, tf.ones([1, k], tf.float32)) x_val_weights = tf.expand_dims(tf.div(top_k_xvals, x_sums_repeated), 1) top_k_yvals = tf.gather(y_target_train, top_k_indices) prediction = tf.squeeze(tf.matmul(x_val_weights, top_k_yvals), squeeze_dims=[1]) # Calculate MSE mse = tf.div(tf.reduce_sum(tf.square(tf.subtract(prediction, y_target_test))), batch_size) # Calculate how many loops over training data num_loops = int(np.ceil(len(x_vals_test) / batch_size))
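Why tf.matrix_diag_part appears in the distance computation above: for each test point, the batched matmul produces an [n_train, n_train] matrix whose off-diagonal entries are cross terms between different training points; only the diagonal holds the quadratic form (x - y_i)^T A (x - y_i) for each training point i. A NumPy sketch with hypothetical shapes:

import numpy as np

diff = np.random.randn(6, 4)       # n_train x num_features, for one test point
A = np.eye(4)                      # the weight matrix
full = diff @ A @ diff.T           # 6 x 6; mostly cross terms between training points
d2 = np.diag(full)                 # weighted squared distance to each training point
assert np.allclose(d2, np.einsum('ij,jk,ik->i', diff, A, diff))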
def _expectation(p, kern1, feat1, kern2, feat2, nghp=None): """ Compute the expectation: expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n) - Ka_{.,.}, Kb_{.,.} :: RBF kernels Ka and Kb as well as Z1 and Z2 can differ from each other, but this is supported only if the Gaussian p is Diagonal (p.cov NxD) and Ka, Kb have disjoint active_dims in which case the joint expectations simplify into a product of expectations :return: NxMxM """ if kern1.on_separate_dims(kern2) and isinstance( p, DiagonalGaussian): # no joint expectations required eKxz1 = expectation(p, (kern1, feat1)) eKxz2 = expectation(p, (kern2, feat2)) return eKxz1[:, :, None] * eKxz2[:, None, :] if feat1 != feat2 or kern1 != kern2: raise NotImplementedError( "The expectation over two kernels has only an " "analytical implementation if both kernels are equal.") kern = kern1 feat = feat1 with params_as_tensors_for(kern), params_as_tensors_for(feat): # use only active dimensions Xcov = kern._slice_cov( tf.matrix_diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov ) Z, Xmu = kern._slice(feat.Z, p.mu) N = tf.shape(Xmu)[0] D = tf.shape(Xmu)[1] squared_lengthscales = kern.lengthscales ** 2. if kern.ARD \ else tf.zeros((D,), dtype=settings.tf_float) + kern.lengthscales ** 2. sqrt_det_L = tf.reduce_prod(0.5 * squared_lengthscales)**0.5 C = tf.cholesky(0.5 * tf.matrix_diag(squared_lengthscales) + Xcov) # NxDxD dets = sqrt_det_L / tf.exp( tf.reduce_sum(tf.log(tf.matrix_diag_part(C)), axis=1)) # N C_inv_mu = tf.matrix_triangular_solve(C, tf.expand_dims(Xmu, 2), lower=True) # NxDx1 C_inv_z = tf.matrix_triangular_solve( C, tf.tile(tf.expand_dims(tf.transpose(Z) / 2., 0), [N, 1, 1]), lower=True) # NxDxM mu_CC_inv_mu = tf.expand_dims(tf.reduce_sum(tf.square(C_inv_mu), 1), 2) # Nx1x1 z_CC_inv_z = tf.reduce_sum(tf.square(C_inv_z), 1) # NxM zm_CC_inv_zn = tf.matmul(C_inv_z, C_inv_z, transpose_a=True) # NxMxM two_z_CC_inv_mu = 2 * tf.matmul(C_inv_z, C_inv_mu, transpose_a=True)[:, :, 0] # NxM exponent_mahalanobis = mu_CC_inv_mu + tf.expand_dims(z_CC_inv_z, 1) + \ tf.expand_dims(z_CC_inv_z, 2) + 2 * zm_CC_inv_zn - \ tf.expand_dims(two_z_CC_inv_mu, 2) - tf.expand_dims(two_z_CC_inv_mu, 1) # NxMxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxMxM # Compute sqrt(self.K(Z)) explicitly to prevent automatic gradient from # being NaN sometimes, see pull request #615 kernel_sqrt = tf.exp(-0.25 * kern.square_dist(Z, None)) return kern.variance ** 2 * kernel_sqrt * \ tf.reshape(dets, [N, 1, 1]) * exponent_mahalanobis
def _variance(self): x = tf.sqrt(self.df) * self._square_scale_operator() d = tf.expand_dims(tf.matrix_diag_part(x), -1) v = tf.square(x) + tf.matmul(d, d, adjoint_b=True) return v
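For reference, this matches the known element-wise Wishart variance, assuming `_square_scale_operator()` returns Sigma = scale @ scale^T and df is the degrees of freedom:

    Var(W_ij) = df * (Sigma_ij^2 + Sigma_ii * Sigma_jj)

In the code, x = sqrt(df) * Sigma, so tf.square(x) supplies the first term and the outer product d d^T of its diagonal supplies the second.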
def _log_prob(self, x): if self.input_output_cholesky: x_sqrt = x else: # Complexity: O(nbk**3) x_sqrt = tf.cholesky(x) batch_shape = self.batch_shape_tensor() event_shape = self.event_shape_tensor() ndims = tf.rank(x_sqrt) # sample_ndims = ndims - batch_ndims - event_ndims sample_ndims = ndims - tf.shape(batch_shape)[0] - 2 sample_shape = tf.strided_slice(tf.shape(x_sqrt), [0], [sample_ndims]) # We need to be able to pre-multiply each matrix by its corresponding # batch scale matrix. Since a Distribution Tensor supports multiple # samples per batch, this means we need to reshape the input matrix `x` # so that the first b dimensions are batch dimensions and the last two # are of shape [dimension, dimensions*number_of_samples]. Doing these # gymnastics allows us to do a batch_solve. # # After we're done with sqrt_solve (the batch operation) we need to undo # this reshaping so what we're left with is a Tensor partitionable by # sample, batch, event dimensions. # Complexity: O(nbk**2) since transpose must access every element. scale_sqrt_inv_x_sqrt = x_sqrt perm = tf.concat( [tf.range(sample_ndims, ndims), tf.range(0, sample_ndims)], 0) scale_sqrt_inv_x_sqrt = tf.transpose(scale_sqrt_inv_x_sqrt, perm) shape = tf.concat( (batch_shape, (tf.cast(self.dimension, dtype=tf.int32), -1)), 0) scale_sqrt_inv_x_sqrt = tf.reshape(scale_sqrt_inv_x_sqrt, shape) # Complexity: O(nbM*k) where M is the complexity of the operator solving a # vector system. For LinearOperatorLowerTriangular, each solve is O(k**2) so # this step has complexity O(nbk^3). scale_sqrt_inv_x_sqrt = self.scale_operator.solve( scale_sqrt_inv_x_sqrt) # Undo make batch-op ready. # Complexity: O(nbk**2) shape = tf.concat([batch_shape, event_shape, sample_shape], 0) scale_sqrt_inv_x_sqrt = tf.reshape(scale_sqrt_inv_x_sqrt, shape) perm = tf.concat([ tf.range(ndims - sample_ndims, ndims), tf.range(0, ndims - sample_ndims) ], 0) scale_sqrt_inv_x_sqrt = tf.transpose(scale_sqrt_inv_x_sqrt, perm) # Write V = SS', X = LL'. Then: # tr[inv(V) X] = tr[inv(S)' inv(S) L L'] # = tr[inv(S) L L' inv(S)'] # = tr[(inv(S) L) (inv(S) L)'] # = sum_{ik} (inv(S) L)_{ik}**2 # The second equality follows from the cyclic permutation property. # Complexity: O(nbk**2) trace_scale_inv_x = tf.reduce_sum(tf.square(scale_sqrt_inv_x_sqrt), axis=[-2, -1]) # Complexity: O(nbk) half_log_det_x = tf.reduce_sum(tf.log(tf.matrix_diag_part(x_sqrt)), axis=[-1]) # Complexity: O(nbk**2) log_prob = ((self.df - self.dimension - 1.) * half_log_det_x - 0.5 * trace_scale_inv_x - self.log_normalization()) # Set shape hints. # Try to merge what we know from the input then what we know from the # parameters of this distribution. if x.get_shape().ndims is not None: log_prob.set_shape(x.get_shape()[:-2]) if (log_prob.get_shape().ndims is not None and self.batch_shape.ndims is not None and self.batch_shape.ndims > 0): log_prob.get_shape()[-self.batch_shape.ndims:].merge_with( self.batch_shape) return log_prob
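A quick NumPy check of the trace identity used in the comments above (V = S S', X = L L'): tr[inv(V) X] equals the sum of squares of the entries of inv(S) L.

import numpy as np

rng = np.random.RandomState(0)
V = rng.randn(4, 4); V = V @ V.T + 4. * np.eye(4)    # scale matrix, SPD
X = rng.randn(4, 4); X = X @ X.T + 4. * np.eye(4)    # sample matrix, SPD
S = np.linalg.cholesky(V)
L = np.linalg.cholesky(X)
lhs = np.trace(np.linalg.inv(V) @ X)
rhs = np.sum(np.linalg.solve(S, L) ** 2)             # sum_{ik} (S^{-1} L)_{ik}^2
assert np.isclose(lhs, rhs)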
def model_fn(features, labels, mode, params, config):
    visit_items_index = features["visit_items_index"]  # num * 5
    continuous_features_value = features["continuous_features_value"]  # num * 16
    next_visit_item_index = labels  # num
    keep_prob = params["keep_prob"]
    embedding_size = params["embedding_size"]
    item_num = params["item_num"]
    learning_rate = params["learning_rate"]
    top_k = params["top_k"]

    # initialize the item embeddings
    initializer = tf.initializers.random_uniform(minval=-0.5 / embedding_size,
                                                 maxval=0.5 / embedding_size)
    partitioner = tf.fixed_size_partitioner(num_shards=embedding_size)
    item_embedding = tf.get_variable("item_embedding", [item_num, embedding_size],
                                     tf.float32, initializer=initializer,
                                     partitioner=partitioner)
    visit_items_embedding = tf.nn.embedding_lookup(item_embedding, visit_items_index)  # num * 5 * embedding_size
    visit_items_average_embedding = tf.reduce_mean(visit_items_embedding, axis=1)  # num * embedding_size
    input_embedding = tf.concat([visit_items_average_embedding, continuous_features_value], 1)  # num * (embedding_size + 16)

    kernel_initializer_1 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
    bias_initializer_1 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
    layer_1 = tf.layers.dense(input_embedding, 64, activation=tf.nn.relu,
                              kernel_initializer=kernel_initializer_1,
                              bias_initializer=bias_initializer_1, name="layer_1")
    layer_dropout_1 = tf.nn.dropout(layer_1, keep_prob=keep_prob, name="layer_dropout_1")

    kernel_initializer_2 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
    bias_initializer_2 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
    layer_2 = tf.layers.dense(layer_dropout_1, 32, activation=tf.nn.relu,
                              kernel_initializer=kernel_initializer_2,
                              bias_initializer=bias_initializer_2, name="layer_2")
    layer_dropout_2 = tf.nn.dropout(layer_2, keep_prob=keep_prob, name="layer_dropout_2")

    # user vector, num * embedding_size
    kernel_initializer_3 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
    bias_initializer_3 = tf.initializers.random_normal(mean=0.0, stddev=0.1)
    user_vector = tf.layers.dense(layer_dropout_2, embedding_size, activation=tf.nn.relu,
                                  kernel_initializer=kernel_initializer_3,
                                  bias_initializer=bias_initializer_3, name="user_vector")

    if mode == tf.estimator.ModeKeys.TRAIN:  # training
        output_embedding = tf.nn.embedding_lookup(item_embedding, next_visit_item_index)  # num * embedding_size
        logits = tf.matmul(user_vector, output_embedding, transpose_a=False, transpose_b=True)  # num * num
        yhat = tf.nn.softmax(logits)  # num * num
        cross_entropy = tf.reduce_mean(-tf.log(tf.matrix_diag_part(yhat) + 1e-16))
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train = optimizer.minimize(cross_entropy, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=cross_entropy, train_op=train)

    if mode == tf.estimator.ModeKeys.EVAL:  # evaluation
        output_embedding = tf.nn.embedding_lookup(item_embedding, next_visit_item_index)  # num * embedding_size
        logits = tf.matmul(user_vector, output_embedding, transpose_a=False, transpose_b=True)  # num * num
        yhat = tf.nn.softmax(logits)  # num * num
        cross_entropy = tf.reduce_mean(-tf.log(tf.matrix_diag_part(yhat) + 1e-16))
        return tf.estimator.EstimatorSpec(mode, loss=cross_entropy)

    if mode == tf.estimator.ModeKeys.PREDICT:
        logits_predict = tf.matmul(user_vector, item_embedding, transpose_a=False, transpose_b=True)  # num * item_num
        yhat_predict = tf.nn.softmax(logits_predict)  # num * item_num
        _, indices = tf.nn.top_k(yhat_predict, k=top_k, sorted=True)
        index = tf.identity(indices, name="index")  # num * top_k
        # prediction
        predictions = {
            "user_vector": user_vector,
            "index": index
        }
        export_outputs = {
            "prediction": tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions,
                                          export_outputs=export_outputs)
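The tf.matrix_diag_part(yhat) call in the TRAIN and EVAL branches implements an in-batch softmax cross-entropy: with logits = user_vectors @ next_item_embeddings^T, entry [i, i] scores user i against their own next item, while the rest of row i serves as in-batch negatives. A NumPy sketch of the same loss with hypothetical shapes:

import numpy as np

u = np.random.randn(8, 16)                    # user vectors
v = np.random.randn(8, 16)                    # embedding of each user's next item
logits = u @ v.T                              # 8 x 8 in-batch scores
p = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
loss = -np.log(np.diag(p) + 1e-16).mean()     # cross-entropy on the diagonal entries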
import tensorflow as tf

"""tf.matrix_diag_part(input, name=None)
Purpose: returns the diagonal elements of a batch of matrices.
Input: a tensor holding the batched matrices."""

a = tf.constant([[[1, 3, 0], [0, 2, 0], [0, 0, 3]],
                 [[4, 0, 0], [0, 5, 0], [0, 0, 6]]])
z = tf.matrix_diag_part(a)
sess = tf.Session()
print(sess.run(z))
sess.close()
# z ==> [[1 2 3]
#        [4 5 6]]
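In TensorFlow 2.x the same op is exposed as tf.linalg.diag_part (the tf.matrix_* aliases are no longer in the main namespace); with eager execution the example reduces to the sketch below, stated under that version assumption.

import tensorflow as tf

a = tf.constant([[[1, 3, 0], [0, 2, 0], [0, 0, 3]],
                 [[4, 0, 0], [0, 5, 0], [0, 0, 6]]])
print(tf.linalg.diag_part(a))   # [[1 2 3], [4 5 6]]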
def call(self, inputs): if self.conditional_inputs is None and self.conditional_outputs is None: covariance_matrix = self.covariance_fn(inputs, inputs) # Tile locations so output has shape [units, batch_size]. Covariance will # broadcast to [units, batch_size, batch_size], and we perform # shape manipulations to get a random variable over [batch_size, units]. loc = self.mean_fn(inputs) loc = tf.tile(loc[tf.newaxis], [self.units] + [1] * len(loc.shape)) else: knn = self.covariance_fn(inputs, inputs) knm = self.covariance_fn(inputs, self.conditional_inputs) kmm = self.covariance_fn(self.conditional_inputs, self.conditional_inputs) kmm = tf.matrix_set_diag( kmm, tf.matrix_diag_part(kmm) + tf.keras.backend.epsilon()) kmm_tril = tf.linalg.cholesky(kmm) kmm_tril_operator = tf.linalg.LinearOperatorLowerTriangular( kmm_tril) knm_operator = tf.linalg.LinearOperatorFullMatrix(knm) # TODO(trandustin): Vectorize linear algebra for multiple outputs. For # now, we do each separately and stack to obtain a locations Tensor of # shape [units, batch_size]. loc = [] for conditional_outputs_unit in tf.unstack( self.conditional_outputs, axis=-1): center = conditional_outputs_unit - self.mean_fn( self.conditional_inputs) loc_unit = knm_operator.matvec( kmm_tril_operator.solvevec( kmm_tril_operator.solvevec(center), adjoint=True)) loc.append(loc_unit) loc = tf.stack(loc) + self.mean_fn(inputs)[tf.newaxis] covariance_matrix = knn covariance_matrix -= knm_operator.matmul( kmm_tril_operator.solve(kmm_tril_operator.solve( knm, adjoint_arg=True), adjoint=True)) covariance_matrix = tf.matrix_set_diag( covariance_matrix, tf.matrix_diag_part(covariance_matrix) + tf.keras.backend.epsilon()) # Form a multivariate normal random variable with batch_shape units and # event_shape batch_size. Then make it be independent across the units # dimension. Then transpose its dimensions so it is [batch_size, units]. random_variable = ed.MultivariateNormalFullCovariance( loc=loc, covariance_matrix=covariance_matrix) random_variable = ed.Independent(random_variable.distribution, reinterpreted_batch_ndims=1) bijector = tfp.bijectors.Inline( forward_fn=lambda x: tf.transpose(x, [1, 0]), inverse_fn=lambda y: tf.transpose(y, [1, 0]), forward_event_shape_fn=lambda input_shape: input_shape[::-1], forward_event_shape_tensor_fn=lambda input_shape: input_shape[::-1 ], inverse_log_det_jacobian_fn=lambda y: tf.cast(0, y.dtype), forward_min_event_ndims=2) random_variable = ed.TransformedDistribution( random_variable.distribution, bijector=bijector) return random_variable
def _model(self, features): Z = features['numbers'] C = features['srdf'] # masking mask = tf.cast(tf.expand_dims(Z, 1) * tf.expand_dims(Z, 2), tf.float32) diag = tf.matrix_diag_part(mask) diag = tf.ones_like(diag) offdiag = 1 - tf.matrix_diag(diag) mask *= offdiag mask = tf.expand_dims(mask, -1) I = np.eye(self.max_z).astype(np.float32) ZZ = tf.nn.embedding_lookup(I, Z) r = tf.sqrt(1. / tf.sqrt(float(self.n_basis))) X = L.dense(ZZ, self.n_basis, use_bias=False, weight_init=tf.random_normal_initializer(stddev=r)) fC = L.dense(C, self.n_factors, use_bias=True) reuse = None for i in range(self.n_interactions): tmp = tf.expand_dims(X, 1) fX = L.dense(tmp, self.n_factors, use_bias=True, scope='in2fac', reuse=reuse) fVj = fX * fC Vj = L.dense(fVj, self.n_basis, use_bias=False, weight_init=tf.constant_initializer(0.0), nonlinearity=tf.nn.tanh, scope='fac2out', reuse=reuse) V = L.masked_sum(Vj, mask, axes=2) X += V reuse = True # output o1 = L.dense(X, self.n_basis // 2, nonlinearity=tf.nn.tanh) yi = L.dense(o1, 1, weight_init=tf.constant_initializer(0.0), use_bias=True) mu = tf.get_variable('mu', shape=(1,), initializer=L.reference_initializer(self.mu), trainable=False) std = tf.get_variable('std', shape=(1,), initializer=L.reference_initializer(self.std), trainable=False) yi = yi * std + mu if self.atom_ref is not None: E0i = L.embedding(Z, 100, 1, reference=self.atom_ref, trainable=False) yi += E0i atom_mask = tf.expand_dims(Z, -1) if self.per_atom: y = L.masked_mean(yi, atom_mask, axes=1) #E0 = L.masked_mean(E0i, atom_mask, axes=1) else: y = L.masked_sum(yi, atom_mask, axes=1) #E0 = L.masked_sum(E0i, atom_mask, axes=1) return {'y': y, 'y_i': yi} #, 'E0': E0}
def call(self, list_tensors): layer = tf.keras.layers.Dot(axes=[2, 2], normalize=True) output_dot = layer(list_tensors) output_diag = tf.matrix_diag_part(output_dot) return output_diag
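With normalize=True the Dot layer computes pairwise cosine similarities between the rows of the two inputs, and taking the diagonal keeps only the position-aligned pairs. An equivalent NumPy view with hypothetical shapes:

import numpy as np

a = np.random.randn(2, 5, 8)                      # batch x steps x dim
b = np.random.randn(2, 5, 8)
an = a / np.linalg.norm(a, axis=-1, keepdims=True)
bn = b / np.linalg.norm(b, axis=-1, keepdims=True)
full = np.einsum('bik,bjk->bij', an, bn)          # batch x steps x steps cosine matrix
aligned = np.diagonal(full, axis1=1, axis2=2)     # cosine of a[:, i] with b[:, i]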
def test_SU(self): N = 5 batch_size = 1000 algebra = SU(N) # check random unitaries u = algebra.random_element(batch_size) self.assertEqual(u.shape.as_list(), [batch_size, N, N]) det = tf.linalg.det(u) prod = tf.matmul(u, tf.linalg.adjoint(u)) re = tf.reduce_mean(tf.abs(tf.real(u))) im = tf.reduce_mean(tf.abs(tf.imag(u))) with tf.Session() as sess: det, prod, re, im = sess.run([det, prod, re, im]) self.assertTrue(0.9 < re / im < 1.1) self.assertTrue(np.allclose(det, 1)) self.assertTrue(np.allclose(prod, np.eye(N), atol=1e-5)) # test gauge fixing: shapes mat = tf.complex(tf.random.normal([batch_size, 3, N, N]), tf.random.normal([batch_size, 3, N, N])) mat = mat + tf.linalg.adjoint(mat) # make it hermitian g, rep = algebra.gauge_fixing(mat) self.assertEqual(g.shape, [batch_size, N, N]) self.assertEqual(rep.shape, [batch_size, 3, N, N]) # test gauge fixing: g should be in SU(N) det = tf.linalg.det(g) prod = tf.matmul(g, tf.linalg.adjoint(g)) with tf.Session() as sess: det, prod = sess.run([det, prod]) self.assertTrue(np.allclose(det, 1)) self.assertTrue(np.allclose(prod, np.eye(N), atol=1e-5)) # test gauge fixing: rep should agree with the gauge choice rep_adj = tf.linalg.adjoint(rep) diag = tf.matrix_diag_part(rep[:, 0, :, :]) off_diag = rep[:, 0, :, :] - tf.matrix_diag(diag) diff = diag[:, 1:] - diag[:, :-1] next_diagonal = tf.matrix_diag_part( tf.roll(rep[:, 1, :, :], shift=-1, axis=-1))[:, :-1] with tf.Session() as sess: r, rep_adj, off_diag, diff, next_diagonal = sess.run( [rep, rep_adj, off_diag, diff, next_diagonal]) self.assertTrue(np.allclose(r, rep_adj, atol=1e-5)) self.assertTrue(np.allclose(off_diag, 0, atol=1e-5)) self.assertTrue(np.allclose(diff, np.abs(diff))) self.assertTrue( np.allclose(next_diagonal, -np.conj(next_diagonal), atol=1e-5)) self.assertTrue( np.allclose(np.imag(next_diagonal), np.abs(np.imag(next_diagonal)), atol=1e-5)) # test gauge fixing and action: g rep is mat mat_ = tf.einsum("bij,brjk,bkl->bril", g, rep, tf.linalg.adjoint(g)) mat__ = algebra.action(g, rep) g_, rep_ = algebra.gauge_fixing(mat__) self.assertEqual(g_.shape, g.shape) self.assertEqual(rep_.shape, rep.shape) with tf.Session() as sess: g, g_, r, rep_, m, mat_, mat__ = sess.run( [g, g_, rep, rep_, mat, mat_, mat__]) self.assertTrue( np.allclose(np.abs(g), np.abs(g_), atol=1e-2)) # g and g_ may differ by an overall phase self.assertTrue(np.allclose(r, rep_, atol=1e-2)) self.assertTrue(np.allclose(m, mat_, atol=1e-5)) self.assertTrue(np.allclose(m, mat__, atol=1e-5)) # test uniqueness of the gauge representative _, rep_ = algebra.gauge_fixing(algebra.action(u, mat)) with tf.Session() as sess: rep1, rep2 = sess.run([rep, rep_]) self.assertTrue(np.allclose(rep1, rep2, atol=1e-3)) # check the shape of log_orbit_measure m = algebra.log_orbit_measure(rep) self.assertEqual(m.shape, [batch_size]) # test infinitesimal action dg = algebra.random_algebra_element(batch_size) dmat = algebra.infinitesimal_action(dg, mat) dmat_adj = tf.linalg.adjoint(dmat) eps = 1e-4 dmat_ = (algebra.action( tf.linalg.expm(1j * eps * algebra.vector_to_matrix(dg)), mat) - mat) / eps o = tf.reduce_sum(tf.conj(mat) * dmat, axis=[-2, -1]) with tf.Session() as sess: dmat, dmat_adj, dmat_, o = sess.run([dmat, dmat_adj, dmat_, o]) self.assertTrue(np.allclose(dmat, dmat_adj, atol=1e-5)) self.assertTrue(np.allclose(dmat, dmat_, atol=1e-1)) self.assertTrue(np.allclose(o, 0, atol=1e-4)) # test conversion between vectors and matrices dg = algebra.random_algebra_element(batch_size) norm = tf.linalg.norm(dg, axis=-1) mat = 
algebra.vector_to_matrix(dg) dg_ = algebra.matrix_to_vector(mat) mat_ = algebra.vector_to_matrix(dg_) self.assertEqual(algebra.N, N) self.assertEqual(algebra.dim, N * N - 1) self.assertEqual(dg_.shape, [batch_size, algebra.dim]) self.assertEqual(mat_.shape, [batch_size, algebra.N, algebra.N]) with tf.Session() as sess: norm, dg, dg_, mat, mat_ = sess.run([norm, dg, dg_, mat, mat_]) self.assertTrue(np.allclose(norm, np.sqrt(N * N - 1))) self.assertTrue(np.allclose(dg, dg_, atol=1e-5)) self.assertTrue(np.allclose(mat, mat_, atol=1e-5))
def build(self): dd_q_input = Input( (self.config.nb_supervised_doc, self.config.doc_topk_term, 1), name='dd_q_input') dd_d_input = Input((self.config.nb_supervised_doc, self.config.doc_topk_term, self.config.hist_size), name='dd_d_input') dd_q_w = Dense(1, kernel_initializer=self.initializer_gate, use_bias=False, name='dd_q_gate')(dd_q_input) dd_q_w = Lambda(lambda x: softmax(x, axis=2), output_shape=( self.config.nb_supervised_doc, self.config.doc_topk_term, ), name='dd_q_softmax')(dd_q_w) z = dd_d_input for i in range(self.config.nb_layers): z = Dense(self.config.hidden_size[i], activation='tanh', kernel_initializer=self.initializer_fc, name='hidden')(z) z = Dense(self.config.out_size, kernel_initializer=self.initializer_fc, name='dd_d_gate')(z) z = Reshape(( self.config.nb_supervised_doc, self.config.doc_topk_term, ))(z) dd_q_w = Reshape(( self.config.nb_supervised_doc, self.config.doc_topk_term, ))(dd_q_w) # out = Dot(axes=[2, 2], name='dd_pseudo_out')([z, dd_q_w]) out = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]), name='dd_pseudo_out')([z, dd_q_w]) dd_init_out = Lambda(lambda x: tf.matrix_diag_part(x), output_shape=(self.config.nb_supervised_doc, ), name='dd_init_out')(out) ''' dd_init_out = Lambda(lambda x: tf.reduce_sum(x, axis=2), output_shape=(self.config.nb_supervised_doc,))(z) ''' #dd_out = Reshape((self.config.nb_supervised_doc,))(dd_out) # dd out gating dd_gate = Input((self.config.nb_supervised_doc, 1), name='baseline_doc_score') dd_w = Dense(1, kernel_initializer=self.initializer_gate, use_bias=False, name='dd_gate')(dd_gate) # dd_w = Lambda(lambda x: softmax(x, axis=1), output_shape=(self.config.nb_supervised_doc,), name='dd_softmax')(dd_w) # dd_out = Dot(axes=[1, 1], name='dd_out')([dd_init_out, dd_w]) dd_w = Reshape((self.config.nb_supervised_doc, ))(dd_w) dd_init_out = Reshape((self.config.nb_supervised_doc, ))(dd_init_out) if self.config.method in [1, 3]: # no doc gating, with dense layer z = dd_init_out elif self.config.method == 2: logging.info("Apply doc gating") z = Multiply(name='dd_out')([dd_init_out, dd_w]) else: raise ValueError( "Method not initialized, please check config file") if self.config.method in [1, 2]: logging.info("Dense layer on top") z = Dense(self.config.merge_hidden, activation='tanh', name='merge_hidden')(z) out = Dense(self.config.merge_out, name='score')(z) else: logging.info( "Apply doc gating, No dense layer on top, sum up scores") out = Dot(axes=[1, 1], name='score')([z, dd_w]) model = Model(inputs=[dd_q_input, dd_d_input, dd_gate], outputs=[out]) print(model.summary()) return model
def trace_represent(self):
    # Diagonal and trace of the density matrix M_qa, concatenated into one
    # feature vector per example.
    self.density_diag = tf.matrix_diag_part(self.M_qa)
    self.density_trace = tf.expand_dims(tf.trace(self.M_qa), -1)
    self.match_represent = tf.concat([self.density_diag, self.density_trace], 1)
def _mean_of_covariance_given_quadrature_component(self, diag_only): p = self.mixture_distribution.probs # To compute E[Cov(Z|V)], we'll add matrices within three categories: # scaled-identity, diagonal, and full. Then we'll combine these at the end. scale_identity_multiplier = None diag = None full = None for k, aff in enumerate(self.interpolated_affine): s = aff.scale # Just in case aff.scale has side-effects, we'll call once. if (s is None or isinstance(s, tf.linalg.LinearOperatorIdentity)): scale_identity_multiplier = add(scale_identity_multiplier, p[..., k, tf.newaxis]) elif isinstance(s, tf.linalg.LinearOperatorScaledIdentity): scale_identity_multiplier = add( scale_identity_multiplier, (p[..., k, tf.newaxis] * tf.square(s.multiplier))) elif isinstance(s, tf.linalg.LinearOperatorDiag): diag = add(diag, (p[..., k, tf.newaxis] * tf.square(s.diag_part()))) else: x = (p[..., k, tf.newaxis, tf.newaxis] * s.matmul(s.to_dense(), adjoint_arg=True)) if diag_only: x = tf.matrix_diag_part(x) full = add(full, x) # We must now account for the fact that the base distribution might have a # non-unity variance. Recall that, since X ~ iid Law(X_0), # `Cov(SX+m) = S Cov(X) S.T = S S.T Diag(Var(X_0))`. # We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid # samples from a scalar-event distribution. v = self.distribution.variance() if scale_identity_multiplier is not None: scale_identity_multiplier *= v if diag is not None: diag *= v[..., tf.newaxis] if full is not None: full *= v[..., tf.newaxis] if diag_only: # Apparently we don't need the full matrix, just the diagonal. r = add(diag, full) if r is None and scale_identity_multiplier is not None: ones = tf.ones(self.event_shape_tensor(), dtype=self.dtype) return scale_identity_multiplier[..., tf.newaxis] * ones return add(r, scale_identity_multiplier) # `None` indicates we don't know if the result is positive-definite. is_positive_definite = (True if all( aff.scale.is_positive_definite for aff in self.endpoint_affine) else None) to_add = [] if diag is not None: to_add.append( tf.linalg.LinearOperatorDiag( diag=diag, is_positive_definite=is_positive_definite)) if full is not None: to_add.append( tf.linalg.LinearOperatorFullMatrix( matrix=full, is_positive_definite=is_positive_definite)) if scale_identity_multiplier is not None: to_add.append( tf.linalg.LinearOperatorScaledIdentity( num_rows=self.event_shape_tensor()[0], multiplier=scale_identity_multiplier, is_positive_definite=is_positive_definite)) return (linop_add_lib.add_operators(to_add)[0].to_dense() if to_add else None)
def tril_matrix(elements):
    tfd = tfp.distributions
    tril_m = tfd.fill_triangular(elements)
    # tf.matrix_set_diag returns a new tensor; keep the result so the
    # exponentiated (strictly positive) diagonal is actually used.
    tril_m = tf.matrix_set_diag(tril_m, tf.exp(tf.matrix_diag_part(tril_m)))
    return tril_m
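A usage sketch with hypothetical values: fill_triangular packs a length n(n+1)/2 vector into an n x n lower-triangular matrix, and exponentiating the diagonal keeps it strictly positive, a common way to parameterize an unconstrained Cholesky factor.

import tensorflow as tf

raw = tf.constant([0.1, -0.3, 0.2, 0.5, -0.1, 0.4])   # 6 = 3*(3+1)/2 free parameters
chol = tril_matrix(raw)                                # 3 x 3 lower triangular, positive diagonal
cov = tf.matmul(chol, chol, transpose_b=True)          # implied positive-definite covariance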
def call(self, inputs): self.call_weights() if (not isinstance(inputs, ed.RandomVariable) and not isinstance(self.kernel, ed.RandomVariable) and not isinstance(self.bias, ed.RandomVariable)): return super(DenseDVI, self).call(inputs) inputs_mean, inputs_variance, inputs_covariance = get_moments(inputs) kernel_mean, kernel_variance, _ = get_moments(self.kernel) if self.use_bias: bias_mean, _, bias_covariance = get_moments(self.bias) # E[outputs] = E[inputs] * E[kernel] + E[bias] mean = tf.tensordot(inputs_mean, kernel_mean, [[-1], [0]]) if self.use_bias: mean = tf.nn.bias_add(mean, bias_mean) # Cov = E[inputs**2] Cov(kernel) + E[W]^T Cov(inputs) E[W] + Cov(bias) # For first term, assume Cov(kernel) = 0 on off-diagonals so we only # compute diagonal term. covariance_diag = tf.tensordot(inputs_variance + inputs_mean**2, kernel_variance, [[-1], [0]]) # Compute quadratic form E[W]^T Cov E[W] from right-to-left. First is # [..., features, features], [features, units] -> [..., features, units]. cov_w = tf.tensordot(inputs_covariance, kernel_mean, [[-1], [0]]) # Next is [..., features, units], [features, units] -> [..., units, units]. w_cov_w = tf.tensordot(cov_w, kernel_mean, [[-2], [0]]) covariance = w_cov_w if self.use_bias: covariance += bias_covariance covariance = tf.matrix_set_diag( covariance, tf.matrix_diag_part(covariance) + covariance_diag) if self.activation in (tf.keras.activations.relu, tf.nn.relu): # Compute activation's moments with variable names from Wu et al. (2018). variance = tf.matrix_diag_part(covariance) scale = tf.sqrt(variance) mu = mean / (scale + tf.keras.backend.epsilon()) mean = scale * soft_relu(mu) pairwise_variances = (tf.expand_dims(variance, -1) * tf.expand_dims(variance, -2) ) # [..., units, units] rho = covariance / tf.sqrt(pairwise_variances + tf.keras.backend.epsilon()) rho = tf.clip_by_value(rho, -1. / (1. + tf.keras.backend.epsilon()), 1. / (1. + tf.keras.backend.epsilon())) s = covariance / (rho + tf.keras.backend.epsilon()) mu1 = tf.expand_dims(mu, -1) # [..., units, 1] mu2 = tf.matrix_transpose(mu1) # [..., 1, units] a = (soft_relu(mu1) * soft_relu(mu2) + rho * tfp.distributions.Normal(0., 1.).cdf(mu1) * tfp.distributions.Normal(0., 1.).cdf(mu2)) gh = tf.asinh(rho) bar_rho = tf.sqrt(1. - rho**2) gr = gh + rho / (1. + bar_rho) # Include numerically stable versions of gr and rho when multiplying or # dividing them. The sign of gr*rho and rho/gr is always positive. safe_gr = tf.abs(gr) + 0.5 * tf.keras.backend.epsilon() safe_rho = tf.abs(rho) + tf.keras.backend.epsilon() exp_negative_q = gr / ( 2. * math.pi) * tf.exp(-safe_rho / (2. * safe_gr * (1 + bar_rho)) + (gh - rho) / (safe_gr * safe_rho) * mu1 * mu2) covariance = s * (a + exp_negative_q) elif self.activation not in (tf.keras.activations.linear, None): raise NotImplementedError( 'Activation is {}. Deterministic variational ' 'inference is only available if activation is ' 'ReLU or None.'.format(self.activation)) return ed.MultivariateNormalFullCovariance(mean, covariance)
def cov_diag_loss(self): with tf.variable_scope("GMM_diag_loss"): diag_loss = tf.reduce_sum(tf.divide(1, tf.matrix_diag_part(self.sigma))) return diag_loss
def log_cholesky_det(chol): return 2 * tf.reduce_sum(tf.log(tf.matrix_diag_part(chol)), axis=-1)
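A quick NumPy check of the identity behind log_cholesky_det: for symmetric positive definite A with Cholesky factor L, log det(A) = 2 * sum(log diag(L)).

import numpy as np

rng = np.random.RandomState(0)
B = rng.randn(5, 5)
A = B @ B.T + 5. * np.eye(5)                  # symmetric positive definite
L = np.linalg.cholesky(A)
assert np.isclose(np.linalg.slogdet(A)[1], 2. * np.log(np.diag(L)).sum())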