def testInvalidShapeAtEval(self):
  with self.test_session(use_gpu=self._use_gpu):
    v = tf.placeholder(dtype=tf.float32)
    with self.assertRaisesOpError("input must be at least 2-dim"):
      tf.matrix_set_diag(v, [v]).eval(feed_dict={v: 0.0})
    with self.assertRaisesOpError(
        r"but received input shape: \[1,1\] and diagonal shape: \[\]"):
      tf.matrix_set_diag([[v]], v).eval(feed_dict={v: 0.0})
def testRectangular(self):
  with self.test_session(use_gpu=self._use_gpu):
    v = np.array([3.0, 4.0])
    mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]])
    expected = np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]])
    output = tf.matrix_set_diag(mat, v)
    self.assertEqual((2, 3), output.get_shape())
    self.assertAllEqual(expected, output.eval())

    v = np.array([3.0, 4.0])
    mat = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
    expected = np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]])
    output = tf.matrix_set_diag(mat, v)
    self.assertEqual((3, 2), output.get_shape())
    self.assertAllEqual(expected, output.eval())
def _covariance(self):
  p = self.probs * tf.ones_like(
      self.total_count)[..., tf.newaxis]
  return tf.matrix_set_diag(
      -tf.matmul(self._mean_val[..., tf.newaxis],
                 p[..., tf.newaxis, :]),  # outer product
      self._variance())
def random_tril_matrix(
    shape, dtype, force_well_conditioned=False, remove_upper=True):
  """[batch] lower triangular matrix.

  Args:
    shape: `TensorShape` or Python `list`.  Shape of the returned matrix.
    dtype: `TensorFlow` `dtype` or Python dtype.
    force_well_conditioned: Python `bool`. If `True`, returned matrix will have
      eigenvalues with modulus in `(1, 2)`.  Otherwise, eigenvalues are unit
      normal random variables.
    remove_upper: Python `bool`.  If `True`, zero out the strictly upper
      triangle.  If `False`, the lower triangle of the returned matrix will
      have the desired properties, but the strictly upper triangle will not be
      zero'd out.

  Returns:
    `Tensor` with desired shape and dtype.
  """
  with tf.name_scope("random_tril_matrix"):
    # Totally random matrix.  Has no nice properties.
    tril = random_normal(shape, dtype=dtype)
    if remove_upper:
      tril = tf.matrix_band_part(tril, -1, 0)

    # Create a diagonal with entries having modulus in [1, 2].
    if force_well_conditioned:
      maxval = tf.convert_to_tensor(np.sqrt(2.), dtype=dtype.real_dtype)
      diag = random_sign_uniform(
          shape[:-1], dtype=dtype, minval=1., maxval=maxval)
      tril = tf.matrix_set_diag(tril, diag)

    return tril
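
# A rough usage sketch of the same construction as random_tril_matrix above.
# `random_normal` and `random_sign_uniform` are local helpers of the original
# test harness, so this hypothetical example substitutes plain tf.random_normal
# and tf.random_uniform; it is only an illustration under that assumption.
import tensorflow as tf

shape = [4, 4]
tril = tf.matrix_band_part(tf.random_normal(shape), -1, 0)  # keep lower triangle
diag = tf.random_uniform(shape[:-1], minval=1., maxval=2. ** 0.5)
tril = tf.matrix_set_diag(tril, diag)  # pin the diagonal to modulus >= 1
with tf.Session() as sess:
  print(sess.run(tril))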
def CombineArcAndRootPotentials(arcs, roots):
  """Combines arc and root potentials into a single set of potentials.

  Args:
    arcs: [B,N,N] tensor of batched arc potentials.
    roots: [B,N] matrix of batched root potentials.

  Returns:
    [B,N,N] tensor P of combined potentials where
      P_{b,s,t} = s == t ? roots[b,t] : arcs[b,s,t]
  """
  # All arguments must have statically-known rank.
  check.Eq(arcs.get_shape().ndims, 3, 'arcs must be rank 3')
  check.Eq(roots.get_shape().ndims, 2, 'roots must be a matrix')

  # All arguments must share the same type.
  dtype = arcs.dtype.base_dtype
  check.Same([dtype, roots.dtype.base_dtype], 'dtype mismatch')

  roots_shape = tf.shape(roots)
  arcs_shape = tf.shape(arcs)
  batch_size = roots_shape[0]
  num_tokens = roots_shape[1]
  with tf.control_dependencies([
      tf.assert_equal(batch_size, arcs_shape[0]),
      tf.assert_equal(num_tokens, arcs_shape[1]),
      tf.assert_equal(num_tokens, arcs_shape[2])]):
    return tf.matrix_set_diag(arcs, roots)
def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
  shape = list(shape)
  diag_shape = shape[:-1]

  # Upper triangle will be ignored.
  # Use a diagonal that ensures this matrix is well conditioned.
  tril = tf.random_normal(shape=shape, dtype=dtype.real_dtype)
  diag = tf.random_uniform(
      shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.)
  if dtype.is_complex:
    tril = tf.complex(
        tril, tf.random_normal(shape, dtype=dtype.real_dtype))
    diag = tf.complex(
        diag, tf.random_uniform(
            shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.))

  tril = tf.matrix_set_diag(tril, diag)

  tril_ph = tf.placeholder(dtype=dtype)

  if use_placeholder:
    # Evaluate the tril here because (i) you cannot feed a tensor, and (ii)
    # tril is random and we want the same value used for both mat and
    # feed_dict.
    tril = tril.eval()
    operator = linalg.LinearOperatorTriL(tril_ph)
    feed_dict = {tril_ph: tril}
  else:
    operator = linalg.LinearOperatorTriL(tril)
    feed_dict = None

  mat = tf.matrix_band_part(tril, -1, 0)

  return operator, mat, feed_dict
def _sample_n(self, n, seed):
  batch_shape = self.batch_shape_tensor()
  event_shape = self.event_shape_tensor()
  batch_ndims = tf.shape(batch_shape)[0]

  ndims = batch_ndims + 3  # sample_ndims=1, event_ndims=2
  shape = tf.concat([[n], batch_shape, event_shape], 0)
  stream = seed_stream.SeedStream(seed, salt="Wishart")

  # Complexity: O(nbk**2)
  x = tf.random_normal(
      shape=shape, mean=0., stddev=1., dtype=self.dtype, seed=stream())

  # Complexity: O(nbk)
  # This parametrization is equivalent to Chi2, i.e.,
  # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
  expanded_df = self.df * tf.ones(
      self.scale_operator.batch_shape_tensor(),
      dtype=self.df.dtype.base_dtype)

  g = tf.random_gamma(
      shape=[n],
      alpha=self._multi_gamma_sequence(0.5 * expanded_df, self.dimension),
      beta=0.5,
      dtype=self.dtype,
      seed=stream())

  # Complexity: O(nbk**2)
  x = tf.matrix_band_part(x, -1, 0)  # Tri-lower.

  # Complexity: O(nbk)
  x = tf.matrix_set_diag(x, tf.sqrt(g))

  # Make batch-op ready.
  # Complexity: O(nbk**2)
  perm = tf.concat([tf.range(1, ndims), [0]], 0)
  x = tf.transpose(x, perm)
  shape = tf.concat([batch_shape, [event_shape[0]], [-1]], 0)
  x = tf.reshape(x, shape)

  # Complexity: O(nbM) where M is the complexity of the operator solving a
  # vector system. For LinearOperatorLowerTriangular, each matmul is O(k^3) so
  # this step has complexity O(nbk^3).
  x = self.scale_operator.matmul(x)

  # Undo make batch-op ready.
  # Complexity: O(nbk**2)
  shape = tf.concat([batch_shape, event_shape, [n]], 0)
  x = tf.reshape(x, shape)
  perm = tf.concat([[ndims - 1], tf.range(0, ndims - 1)], 0)
  x = tf.transpose(x, perm)

  if not self.input_output_cholesky:
    # Complexity: O(nbk**3)
    x = tf.matmul(x, x, adjoint_b=True)

  return x
def testVector(self):
  with self.test_session(use_gpu=self._use_gpu):
    v = np.array([1.0, 2.0, 3.0])
    mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]])
    mat_set_diag = np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]])
    output = tf.matrix_set_diag(mat, v)
    self.assertEqual((3, 3), output.get_shape())
    self.assertAllEqual(mat_set_diag, output.eval())
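
# A minimal standalone sketch (not part of the test suite above) illustrating
# the behavior checked by testVector: tf.matrix_set_diag replaces the main
# diagonal and leaves all off-diagonal entries untouched.
import tensorflow as tf

mat = tf.constant([[0.0, 1.0], [1.0, 0.0]])
with_new_diag = tf.matrix_set_diag(mat, tf.constant([5.0, 6.0]))
with tf.Session() as sess:
  print(sess.run(with_new_diag))  # [[5., 1.], [1., 6.]]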
def sample(means, logvars, latent_dim, iaf=True, kl_min=None, anneal=False,
           kl_rate=None, dtype=tf.float32):
  """Perform sampling and calculate KL divergence.

  Args:
    means: tensor of shape (batch_size, latent_dim).
    logvars: tensor of shape (batch_size, latent_dim).
    latent_dim: dimension of latent space.
    iaf: perform linear IAF or not.
    kl_min: lower bound for KL divergence.
    anneal: perform KL cost annealing or not.
    kl_rate: KL divergence is multiplied by kl_rate if anneal is set to True.

  Returns:
    latent_vector: latent variable after sampling. A vector of shape
      (batch_size, latent_dim).
    kl_obj: objective to be minimized for the KL term.
    kl_cost: real KL divergence.
  """
  if iaf:
    with tf.variable_scope('iaf'):
      prior = DiagonalGaussian(tf.zeros_like(means, dtype=dtype),
                               tf.zeros_like(logvars, dtype=dtype))
      posterior = DiagonalGaussian(means, logvars)
      z = posterior.sample

      logqs = posterior.logps(z)
      L = tf.get_variable("inverse_cholesky", [latent_dim, latent_dim],
                          dtype=dtype, initializer=tf.zeros_initializer)
      diag_one = tf.ones([latent_dim], dtype=dtype)
      L = tf.matrix_set_diag(L, diag_one)
      mask = np.tril(np.ones([latent_dim, latent_dim]))
      L = L * mask
      latent_vector = tf.matmul(z, L)
      logps = prior.logps(latent_vector)
      kl_cost = logqs - logps
  else:
    noise = tf.random_normal(tf.shape(means))
    latent_vector = means + tf.exp(0.5 * logvars) * noise
    kl_cost = -0.5 * (logvars - tf.square(means) - tf.exp(logvars) + 1.0)
  kl_ave = tf.reduce_mean(kl_cost, [0])  # mean of kl_cost over batches
  kl_obj = kl_cost = tf.reduce_sum(kl_ave)
  if kl_min:
    kl_obj = tf.reduce_sum(tf.maximum(kl_ave, kl_min))
  if anneal:
    kl_obj = kl_obj * kl_rate

  return latent_vector, kl_obj, kl_cost  # both kl_obj and kl_cost are scalars
def testGrad(self):
  shapes = ((3, 4, 4), (7, 4, 8, 8))
  with self.test_session(use_gpu=self._use_gpu):
    for shape in shapes:
      x = tf.constant(np.random.rand(*shape), dtype=tf.float32)
      x_diag = tf.constant(np.random.rand(*shape[:-1]), dtype=tf.float32)
      y = tf.matrix_set_diag(x, x_diag)
      error_x = tf.test.compute_gradient_error(x, x.get_shape().as_list(),
                                               y, y.get_shape().as_list())
      self.assertLess(error_x, 1e-4)
      error_x_diag = tf.test.compute_gradient_error(
          x_diag, x_diag.get_shape().as_list(),
          y, y.get_shape().as_list())
      self.assertLess(error_x_diag, 1e-4)
def testGradWithNoShapeInformation(self):
  with self.test_session(use_gpu=self._use_gpu) as sess:
    v = tf.placeholder(dtype=tf.float32)
    mat = tf.placeholder(dtype=tf.float32)
    grad_input = tf.placeholder(dtype=tf.float32)
    output = tf.matrix_set_diag(mat, v)
    grads = tf.gradients(output, [mat, v], grad_ys=grad_input)
    grad_input_val = np.random.rand(3, 3).astype(np.float32)
    grad_vals = sess.run(
        grads,
        feed_dict={
            v: 2 * np.ones(3),
            mat: np.ones((3, 3)),
            grad_input: grad_input_val
        })
    self.assertAllEqual(np.diag(grad_input_val), grad_vals[1])
    self.assertAllEqual(grad_input_val - np.diag(np.diag(grad_input_val)),
                        grad_vals[0])
def fit(self, x=None, y=None):
  # p(coeffs | x, y) = Normal(coeffs |
  #   mean = (1/noise_variance) (1/noise_variance x^T x + I)^{-1} x^T y,
  #   covariance = (1/noise_variance x^T x + I)^{-1})
  # TODO(trandustin): We newly fit the data at each call. Extend to do
  # Bayesian updating.
  kernel_matrix = tf.matmul(x, x, transpose_a=True) / self.noise_variance
  coeffs_precision = tf.matrix_set_diag(
      kernel_matrix, tf.matrix_diag_part(kernel_matrix) + 1.)
  coeffs_precision_tril = tf.linalg.cholesky(coeffs_precision)
  self.coeffs_precision_tril_op = tf.linalg.LinearOperatorLowerTriangular(
      coeffs_precision_tril)
  self.coeffs_mean = self.coeffs_precision_tril_op.solvevec(
      self.coeffs_precision_tril_op.solvevec(tf.einsum('nm,n->m', x, y)),
      adjoint=True) / self.noise_variance
  # TODO(trandustin): To be fully Keras-compatible, return History object.
  return
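
# A small sanity-check sketch (hypothetical, not part of the class above):
# adding 1 to the diagonal with tf.matrix_set_diag, as done when forming
# coeffs_precision, is the same as adding the identity matrix.
import numpy as np
import tensorflow as tf

k = tf.constant(np.random.rand(3, 3), dtype=tf.float32)
via_set_diag = tf.matrix_set_diag(k, tf.matrix_diag_part(k) + 1.)
via_identity = k + tf.eye(3)
with tf.Session() as sess:
  a, b = sess.run([via_set_diag, via_identity])
  np.testing.assert_allclose(a, b, rtol=1e-6)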
def future_lookup_prevention_mask(keys_len, query_len, prevail_val=1.0,
                                  cancel_val=0.0, include_diagonal=False):
  ones = tf.ones(keys_len * (keys_len + 1) // 2)
  zero_ones_mask = tf.ones(
      (keys_len, keys_len)) - tf.contrib.distributions.fill_triangular(
          ones, upper=True)
  prevail_mask = zero_ones_mask * prevail_val
  cancel_mask = tf.contrib.distributions.fill_triangular(
      ones * tf.constant(cancel_val, dtype=tf.float32), upper=True)
  mask = prevail_mask + cancel_mask
  if include_diagonal:
    mask = tf.matrix_set_diag(
        mask, tf.ones(keys_len, dtype=tf.float32) * prevail_val)
  return mask[(-query_len):, :]
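
# A quick hypothetical check of the helper above, assuming it is in scope:
# with keys_len = 3, query_len = 3 and the default prevail/cancel values, the
# result is a strictly lower-triangular mask, so each query position only
# "prevails" over strictly earlier key positions.
import tensorflow as tf

mask = future_lookup_prevention_mask(keys_len=3, query_len=3)
with tf.Session() as sess:
  print(sess.run(mask))
  # [[0. 0. 0.]
  #  [1. 0. 0.]
  #  [1. 1. 0.]]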
def testRectangularBatch(self):
  with self.test_session(use_gpu=self._use_gpu):
    v_batch = np.array([[-1.0, -2.0], [-4.0, -5.0]])
    mat_batch = np.array(
        [[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
         [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]])
    mat_set_diag_batch = np.array(
        [[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]],
         [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
    output = tf.matrix_set_diag(mat_batch, v_batch)
    self.assertEqual((2, 2, 3), output.get_shape())
    self.assertAllEqual(mat_set_diag_batch, output.eval())
def KLdivergence(P, Y, low_dim=2):
  dtype = P.dtype
  with tf.Session():
    alpha = low_dim - 1.
    sum_Y = tf.reduce_sum(tf.square(Y), 1)
    eps = tf.Variable(10e-15, dtype=dtype, name="eps").initialized_value()
    Q = tf.reshape(sum_Y, [-1, 1]) + -2 * tf.matmul(Y, tf.transpose(Y))
    Q = sum_Y + Q / alpha
    Q = tf.pow(1 + Q, -(alpha + 1) / 2)
    # Q = Q * (1 - tf.diag(tf.ones(self.batch_size, dtype=dtype)))
    Q_d = tf.diag_part(Q)
    Q_d = Q_d - Q_d
    Q = tf.matrix_set_diag(Q, Q_d)
    Q = Q / tf.reduce_sum(Q)
    Q = tf.maximum(Q, eps)
    C = tf.log((P + eps) / (Q + eps))
    C = tf.reduce_sum(P * C)
    return C
def kl_loss(y_true, y_pred, alpha=1.0, batch_size=None, num_perplexities=None,
            _eps=DEFAULT_EPS):
  """Kullback-Leibler loss (Tensorflow) between the "true" output and the "predicted" output.

  Parameters
  ----------
  y_true : 2d array_like (N, N*P)
      Should be the P matrix calculated from input data. Differences in input
      points using a Gaussian probability distribution. Different P
      (perplexity) values stacked along dimension 1.
  y_pred : 2d array_like (N, output_dims)
      Output of the neural network. We will calculate the Q matrix based on
      this output.
  alpha : float, optional
      Parameter used to calculate Q. Default 1.0.
  batch_size : int, required
      Number of samples per batch. y_true.shape[0].
  num_perplexities : int, required
      Number of perplexities stacked along axis 1.

  Returns
  -------
  kl_loss : tf.Tensor, scalar value
      Kullback-Leibler divergence P_ || Q_
  """
  P_ = y_true
  Q_ = _make_Q(y_pred, alpha, batch_size)

  _tf_eps = tf.constant(_eps, dtype=P_.dtype)

  kls_per_beta = []
  components = tf.split(P_, num_perplexities, axis=1, name='split_perp')
  for cur_beta_P in components:
    # yrange = tf.range(zz*batch_size, (zz+1)*batch_size)
    # cur_beta_P = tf.slice(P_, [zz*batch_size, [-1, batch_size])
    # cur_beta_P = P_
    kl_matr = tf.multiply(cur_beta_P,
                          tf.log(cur_beta_P + _tf_eps) - tf.log(Q_ + _tf_eps),
                          name='kl_matr')
    toset = tf.constant(0, shape=[batch_size], dtype=kl_matr.dtype)
    kl_matr_keep = tf.matrix_set_diag(kl_matr, toset)
    kl_total_cost_cur_beta = tf.reduce_sum(kl_matr_keep)
    kls_per_beta.append(kl_total_cost_cur_beta)
  kl_total_cost = tf.add_n(kls_per_beta)
  # kl_total_cost = kl_total_cost_cur_beta

  return kl_total_cost
def full_mvn_loss(truth, h):
  """Takes the output of a neural network after its last activation and
  performs an affine transform. It returns the Mahalanobis distances between
  the targets and the result of the affine transformation, according to a
  parametrized Normal distribution. The log of the determinant of the
  parametrized covariance matrix is meant to be minimized to avoid a trivial
  optimization.

  :param truth: Actual datapoints to compare against learned distribution
  :param h: output of neural network (after last non-linear transform)
  :return: (tf.Tensor[MB X D], tf.Tensor[MB X 1]) Loss matrix,
      log_of_determinants of covariance matrices.
  """
  fan_in = h.get_shape().as_list()[1]
  dimension = truth.get_shape().as_list()[1]
  U = 100 * tf.Variable(
      tf.truncated_normal([fan_in, dimension + dimension**2],
                          dtype=tf.float32, name='U'))
  b = tf.Variable(tf.zeros([dimension + dimension**2]))
  y = tf.matmul(h, U) + b
  mu = tf.slice(y, [0, 0], [-1, dimension])  # is MB x dimension
  # var is MB x dimension^2
  # WARNING WARNING TODO FIX THIS MAGIC NUMBER
  var = tf.slice(y, [0, dimension], [-1, -1]) * 0.0001
  # make it a MB x D x D tensor (var is a superset of the lower triangular
  # part of a Cholesky decomp)
  var = tf.reshape(var, [-1, dimension, dimension])
  var_diag = tf.exp(tf.matrix_diag_part(var)) + \
      1  # WARNING: FIX THIS MAGIC NUMBER
  var = tf.matrix_set_diag(var, var_diag)
  var = tf.matrix_band_part(var, -1, 0)
  z = tf.squeeze(
      tf.matrix_triangular_solve(var,
                                 tf.reshape(truth - mu, [-1, dimension, 1]),
                                 lower=True,
                                 adjoint=False))  # z should be MB x D
  # take row-wise inner products of z, leaving MB x 1 vector
  inner_prods = tf.reduce_sum(tf.square(z), 1)
  # diag_part converts MB x D x D to MB x D, square and log preserve, then
  # sum makes MB x 1
  logdet = tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(var))), 1)
  # is MB x 1 ... hard to keep track of individual features' contributions
  # due to correlations
  loss_column = inner_prods
  tf.add_to_collection('full', var_diag)
  tf.add_to_collection('full', var)
  return tf.reshape(loss_column, [-1, 1]), tf.reshape(logdet, [-1, 1])
def get_scores(thought_vectors, dropout_rate):
  def use_dropout():
    a, b = thought_vectors[0], thought_vectors[1]
    dropout_mask_shape = tf.transpose(tf.shape(a))
    dropout_mask = tf.random_uniform(dropout_mask_shape) > DROPOUT_RATE
    dropout_mask = tf.where(dropout_mask,
                            tf.ones(dropout_mask_shape),
                            tf.zeros(dropout_mask_shape))
    dropout_mask *= (1 / dropout_rate)
    a *= dropout_mask
    b *= dropout_mask
    return a, b

  def no_dropout():
    return thought_vectors[0], thought_vectors[1]

  a, b = tf.cond(dropout_rate > 0, use_dropout, no_dropout)
  scores = tf.matmul(a, b, transpose_b=True)
  scores = tf.matrix_set_diag(scores, tf.zeros_like(scores[0]))
  return scores
def contrastive_loss(y_true, y_pred):
  shape = tf.shape(y_true)  # a list: [None, 9, 2]
  dim = tf.multiply(shape[1], shape[2])  # dim = prod(9, 2) = 18
  y_true = tf.reshape(y_true, [-1, dim])  # -1 means "all"
  y_pred = tf.reshape(y_pred, [-1, dim])  # -1 means "all"

  x2 = tf.expand_dims(tf.transpose(y_pred, [0, 1]), 1)
  y2 = tf.expand_dims(tf.transpose(y_true, [0, 1]), 0)
  diff = y2 - x2
  maximum = tf.maximum(diff, 0.0)
  tensor_pow = tf.square(maximum)
  errors = tf.reduce_sum(tensor_pow, 2)
  diagonal = tf.diag_part(errors)
  cost_s = tf.maximum(0.05 - errors + diagonal, 0.0)
  cost_im = tf.maximum(0.05 - errors + tf.reshape(diagonal, (-1, 1)), 0.0)
  cost_tot = cost_s + cost_im
  zero_diag = tf.multiply(diagonal, 0.0)
  cost_tot_diag = tf.matrix_set_diag(cost_tot, zero_diag)
  tot_sum = tf.reduce_sum(cost_tot_diag)
  return tot_sum
def get_matrix_tree(self, r, A, mask1, mask2):
  L = tf.zeros_like(A)
  L = L - A
  tmp = tf.reduce_sum(A, 1)
  L = tf.matrix_set_diag(L, tmp)

  L_dash = tf.concat([L[:, 1:, :], tf.expand_dims(r, 1)], 1)  # (B*T,S,S)
  L_dash_inv = tf.matrix_inverse(L_dash)

  proot = tf.multiply(r, L_dash_inv[:, :, 0])  # (B*T,S,)
  pz1 = mask1 * tf.multiply(
      A, tf.matrix_transpose(
          tf.expand_dims(tf.matrix_diag_part(L_dash_inv), 2)))  # (B*T,S,S)
  pz2 = mask2 * tf.multiply(A, tf.matrix_transpose(L_dash_inv))  # (B*T,S,S)
  pz = pz1 - pz2

  return proot, pz
def _get_anchor_positive_triplet_mask(labels):
  """Return a 2D mask where mask[a, p] is True iff a and p are distinct and
  have the same label.

  Args:
    labels: tf.int32 `Tensor` with shape [batch_size]

  Returns:
    mask: tf.bool `Tensor` with shape [batch_size, batch_size]
  """
  with tf.name_scope("anchor_positive_mask") as scope:
    # Check if labels[i] == labels[j]
    # Uses broadcasting where the 1st argument has shape (1, batch_size)
    # and the 2nd (batch_size, 1)
    labels_equal = tf.equal(tf.expand_dims(labels, 0),
                            tf.expand_dims(labels, 1))

    # Remove the diagonal, that is, the space where a == p
    mask = tf.matrix_set_diag(labels_equal,
                              tf.zeros(tf.shape(labels)[0], dtype=tf.bool))

  return mask
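
# A hedged usage sketch of the mask helper above, assuming it is in scope:
# with labels [0, 0, 1], only the pairs (0, 1) and (1, 0) share a label and
# are distinct, so only those positions are True.
import tensorflow as tf

labels = tf.constant([0, 0, 1], dtype=tf.int32)
mask = _get_anchor_positive_triplet_mask(labels)
with tf.Session() as sess:
  print(sess.run(mask))
  # [[False  True False]
  #  [ True False False]
  #  [False False False]]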
def discriminative_instance_loss(y_true, y_pred,
                                 delta_v=0.5, delta_d=1.5, order=2, gamma=1e-3):
  """Computes the discriminative instance loss.

  # Arguments:
      y_true: A tensor of the same shape as `y_pred`.
      y_pred: A tensor of the vector embedding
  """

  def temp_norm(ten, axis=-1):
    return tf.sqrt(K.epsilon() + tf.reduce_sum(tf.square(ten), axis=axis))

  channel_axis = 1 if K.image_data_format() == 'channels_first' else len(y_pred.get_shape()) - 1
  other_axes = [x for x in list(range(len(y_pred.get_shape()))) if x != channel_axis]

  # Compute variance loss
  cells_summed = tf.tensordot(y_true, y_pred, axes=[other_axes, other_axes])
  n_pixels = tf.cast(tf.count_nonzero(y_true, axis=other_axes),
                     dtype=K.floatx()) + K.epsilon()
  n_pixels_expand = tf.expand_dims(n_pixels, axis=1) + K.epsilon()
  mu = tf.divide(cells_summed, n_pixels_expand)

  delta_v = tf.constant(delta_v, dtype=K.floatx())
  mu_tensor = tf.tensordot(y_true, mu, axes=[[channel_axis], [0]])
  L_var_1 = y_pred - mu_tensor
  L_var_2 = tf.square(tf.nn.relu(temp_norm(L_var_1, axis=channel_axis) - delta_v))
  L_var_3 = tf.tensordot(L_var_2, y_true, axes=[other_axes, other_axes])
  L_var_4 = tf.divide(L_var_3, n_pixels)
  L_var = tf.reduce_mean(L_var_4)

  # Compute distance loss
  mu_a = tf.expand_dims(mu, axis=0)
  mu_b = tf.expand_dims(mu, axis=1)
  diff_matrix = tf.subtract(mu_b, mu_a)
  L_dist_1 = temp_norm(diff_matrix, axis=channel_axis)
  L_dist_2 = tf.square(tf.nn.relu(tf.constant(2 * delta_d, dtype=K.floatx()) - L_dist_1))
  diag = tf.constant(0, dtype=K.floatx()) * tf.diag_part(L_dist_2)
  L_dist_3 = tf.matrix_set_diag(L_dist_2, diag)
  L_dist = tf.reduce_mean(L_dist_3)

  # Compute regularization loss
  L_reg = gamma * temp_norm(mu, axis=-1)

  L = L_var + L_dist + tf.reduce_mean(L_reg)
  return L
def quadratic_regression_pd(SA, costs, diag_cost=False):
  assert not diag_cost
  global global_step
  dsa = SA.shape[-1]
  C = tf.get_variable(
      'cost_mat{}'.format(global_step),
      shape=[dsa, dsa],
      dtype=tf.float32,
      initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
  L = tf.matrix_band_part(C, -1, 0)
  L = tf.matrix_set_diag(L, tf.maximum(tf.matrix_diag_part(L), 0.0))
  LL = tf.matmul(L, tf.transpose(L))
  c = tf.get_variable(
      'cost_vec{}'.format(global_step),
      shape=[dsa],
      dtype=tf.float32,
      initializer=tf.zeros_initializer())
  b = tf.get_variable(
      'cost_bias{}'.format(global_step),
      shape=[],
      dtype=tf.float32,
      initializer=tf.zeros_initializer())

  s_ = tf.placeholder(tf.float32, [None, dsa])
  c_ = tf.placeholder(tf.float32, [None])
  pred_cost = 0.5 * tf.einsum('na,ab,nb->n', s_, LL, s_) + \
      tf.einsum('na,a->n', s_, c) + b
  mse = tf.reduce_mean(tf.square(pred_cost - c_))
  opt = tf.train.MomentumOptimizer(1e-3, 0.9).minimize(mse)

  N = SA.shape[0]
  SA = SA.reshape([-1, dsa])
  costs = costs.reshape([-1])
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for itr in tqdm.trange(1000, desc='Fitting cost'):
      _, m = sess.run([opt, mse], feed_dict={
          s_: SA,
          c_: costs,
      })
      if itr == 0 or itr == 999:
        print('mse itr {}: {}'.format(itr, m))
    cost_mat, cost_vec = sess.run((LL, c))
  global_step += 1
  return cost_mat, cost_vec
def deep_linear(x, full_cov=True):
  # x: [batch_size, x_dim]
  h = x
  for n_units in layer_sizes:
    # h: [batch_size, n_units]
    h = tf.layers.dense(h, n_units, activation=activation)
  # w_mean: [n_units]
  n_units = layer_sizes[-1]
  w_mean = tf.get_variable(
      "w_mean", shape=[n_units], dtype=tf.float64,
      initializer=tf.truncated_normal_initializer(
          stddev=0.001, dtype=tf.float64))
  w_cov_raw = tf.get_variable(
      "w_cov", dtype=tf.float64,
      initializer=tf.eye(n_units, dtype=tf.float64))
  w_cov_tril = tf.matrix_set_diag(
      tf.matrix_band_part(w_cov_raw, -1, 0),
      tf.nn.softplus(tf.matrix_diag_part(w_cov_raw)))
  # f_mean: [batch_size]
  f_mean = tf.squeeze(tf.matmul(h, w_mean[:, None]), -1)
  # f_cov: [batch_size, batch_size]
  f_cov_half = tf.matmul(h, w_cov_tril)
  if full_cov:
    f_cov = tf.matmul(f_cov_half, f_cov_half, transpose_b=True)
    f_cov = f_cov + tf.eye(tf.shape(f_cov)[0], dtype=tf.float64) * \
        gpflow.settings.jitter
    if mvn:
      f_cov_tril = tf.cholesky(f_cov)
      f_dist = zs.distributions.MultivariateNormalCholesky(
          f_mean, f_cov_tril)
      return f_dist
    else:
      return f_mean, f_cov
  else:
    # hw_cov: [batch_size, n_units]
    hw_cov = tf.matmul(f_cov_half, w_cov_tril, transpose_b=True)
    # f_cov_diag: [batch_size]
    f_var = tf.reduce_sum(hw_cov * h, axis=-1)
    f_var += gpflow.settings.jitter
    return f_mean, f_var
def test_gamma_gaussian_equivalent(self):
  # Check that the Cholesky-Wishart distribution with the sparsity correction
  # factor is equivalent to a SquareRootGamma-Gaussian distribution after
  # removing the log probability of the zero terms in the off-diagonal.
  sqrt_gamma_gaussian = SqrtGammaGaussian(
      df=self.sqrt_w.df, log_diag_scale=self.sqrt_w.log_diag_scale)
  x_with_log_diag = tf.matrix_set_diag(
      self.x, self.x_cov_obj.log_diag_chol_precision)
  log_prob1_gamma = sqrt_gamma_gaussian._log_prob_sqrt_gamma(x_with_log_diag)

  log_prob1_normal = sqrt_gamma_gaussian.normal_dist.log_prob(self.x)
  off_diag_mask = self.x_cov_obj.np_off_diag_mask()
  log_prob1_normal = tf.reduce_sum(log_prob1_normal * off_diag_mask,
                                   axis=[1, 2])
  log_prob_gg = log_prob1_gamma + log_prob1_normal

  log_prob_wishart = self.sqrt_w.log_prob(self.x_cov_obj)
  self._asset_allclose_tf_feed(log_prob_gg, log_prob_wishart)
def test_log_prob_sparse(self):
  # Test that the square root Gamma Gaussian with sparse matrices is the same
  # as the dense version, when the sparse elements are removed afterwards.
  x_with_log_diag = tf.matrix_set_diag(
      self.x, self.x_cov_obj.log_diag_chol_precision)
  log_prob1_gamma = self.sqrt_gamma_gaussian_dense._log_prob_sqrt_gamma(
      x_with_log_diag)

  log_prob1_normal = self.sqrt_gamma_gaussian_dense.normal_dist.log_prob(
      self.x)
  off_diag_mask = self.x_cov_obj.np_off_diag_mask()  # Zero out off-diagonal terms
  log_prob1_normal = tf.reduce_sum(log_prob1_normal * off_diag_mask,
                                   axis=[1, 2])
  log_prob1 = log_prob1_gamma + log_prob1_normal

  log_prob2 = self.sqrt_gamma_gaussian.log_prob(self.x_cov_obj)
  self._asset_allclose_tf_feed(log_prob1, log_prob2)
def _get_normed_sym_tf(X_, batch_size):
  """Compute the normalized and symmetrized probability matrix from
  relative probabilities X_, where X_ is a Tensorflow Tensor

  Parameters
  ----------
  X_ : 2-d Tensor (N, N)
      asymmetric probabilities. For instance, X_(i, j) = P(i|j)

  Returns
  -------
  P : 2-d Tensor (N, N)
      symmetric probabilities, making the assumption that P(i|j) = P(j|i).
      Diagonals are all 0s."""
  toset = tf.constant(0, shape=[batch_size], dtype=X_.dtype)
  X_ = tf.matrix_set_diag(X_, toset)
  norm_facs = tf.reduce_sum(X_, axis=0, keep_dims=True)
  X_ = X_ / norm_facs
  X_ = 0.5 * (X_ + tf.transpose(X_))
  return X_
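
# A brief illustrative sketch (hypothetical random input) for the helper
# above, assuming it is in scope: the result is symmetric and has a zero
# diagonal, because the diagonal is zeroed before normalizing and symmetrizing.
import numpy as np
import tensorflow as tf

batch_size = 3
X_ = tf.constant(np.random.rand(batch_size, batch_size), dtype=tf.float32)
P = _get_normed_sym_tf(X_, batch_size)
with tf.Session() as sess:
  p = sess.run(P)
  assert np.allclose(np.diag(p), 0.0)
  assert np.allclose(p, p.T, atol=1e-6)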
def get_cholesky_variable(name,
                          shape=None,
                          dtype=None,
                          initializer=None,
                          regularizer=None,
                          trainable=True,
                          collections=None,
                          caching_device=None,
                          partitioner=None,
                          validate_shape=True,
                          custom_getter=None,
                          transform=None):
  """Get an existing Cholesky variable or create a new one."""
  x = get_tril_variable(name, shape, dtype, initializer, regularizer,
                        trainable, collections, caching_device, partitioner,
                        validate_shape, custom_getter)
  transform = transform or tf.nn.softplus
  return tf.matrix_set_diag(x, transform(tf.matrix_diag_part(x)))
def calc_min_feasible_power(self, abs_H, min_rates, noise_power):
  abs_H_2 = tf.reshape(
      tf.square(abs_H),
      [-1, self.top_config.user_num, self.top_config.user_num])
  check_mat = tf.matrix_transpose(abs_H_2)
  diag_part = tf.matrix_diag_part(abs_H_2) + 1e-10
  diag_zeros = tf.zeros(tf.shape(diag_part))
  diag_part = tf.reshape(diag_part, [-1, self.top_config.user_num, 1])
  check_mat = tf.divide(check_mat, diag_part)
  check_mat = tf.matrix_set_diag(check_mat, diag_zeros)
  min_snrs = tf.cast(
      tf.reshape(2**min_rates - 1, [self.top_config.user_num, 1]), tf.float32)
  check_mat = tf.multiply(check_mat, min_snrs)
  u = np.divide(min_snrs, diag_part) * noise_power
  inv_id_sub_check_mat = tf.matrix_inverse(
      tf.subtract(tf.eye(self.top_config.user_num), check_mat))
  min_feasible_power = tf.matmul(inv_id_sub_check_mat, u)
  min_feasible_power = tf.reshape(min_feasible_power,
                                  [-1, self.top_config.user_num])
  return min_feasible_power
def linear_covariance(x_mean, x_cov, A, b):
  x_var_diag = tf.matrix_diag_part(x_cov)
  xx_mean = x_var_diag + x_mean * x_mean

  term1_diag = tf.matmul(xx_mean, A.var)

  flat_xCov = tf.reshape(x_cov, [-1, A.shape[0]])  # [b*x, x]
  xCov_A = tf.matmul(flat_xCov, A.mean)  # [b*x, y]
  xCov_A = tf.reshape(xCov_A, [-1, A.shape[0], A.shape[1]])  # [b, x, y]
  xCov_A = tf.transpose(xCov_A, [0, 2, 1])  # [b, y, x]
  xCov_A = tf.reshape(xCov_A, [-1, A.shape[0]])  # [b*y, x]
  A_xCov_A = tf.matmul(xCov_A, A.mean)  # [b*y, y]
  A_xCov_A = tf.reshape(A_xCov_A, [-1, A.shape[1], A.shape[1]])  # [b, y, y]

  term2 = A_xCov_A
  term2_diag = tf.matrix_diag_part(term2)

  term3_diag = b.var

  result_diag = term1_diag + term2_diag + term3_diag
  return tf.matrix_set_diag(term2, result_diag)
def meanfield_nn(D, k, temp=None, exclude_self=False):
  logits = D

  if temp is not None:
    # temp is actually treated as inverse temperature since this is
    # numerically more stable
    logits = logits * temp
    print('with temp')

  if exclude_self:
    infs = tf.ones_like(logits[:, :, :, 0]) * np.inf
    # infs = tf.ones_like(logits[:,:,:,0]) * (10000.0)
    # setting diagonal to -inf produces numerical problems ...
    logits = tf.matrix_set_diag(logits, -infs)

  W = []
  for i in range(k):
    weights_exp = tf.nn.softmax(logits, axis=-1)
    eps = 1.2e-7
    weights_exp = tf.clip_by_value(weights_exp, eps, 1 - eps)
    W.append(weights_exp)
    logits = logits + tf.log1p(-weights_exp)

  return W
def call(self, x, mask, training=False):
  self.step += 1
  x_ = x
  x = dropout(x, keep_prob=self.keep_prob, training=training)
  if self.step == 0:
    if not self.identity:
      self.linear = layers.Dense(melt.get_shape(x, -1), activation=tf.nn.relu)
    else:
      self.linear = None
  # NOTICE shared linear!
  if self.linear is not None:
    x = self.linear(x)

  scores = tf.matmul(x, tf.transpose(x, [0, 2, 1]))

  # x = tf.constant([[[1,2,3], [4,5,6],[7,8,9]],[[1,2,3],[4,5,6],[7,8,9]]], dtype=tf.float32)  # shape=(2, 3, 3)
  # z = tf.matrix_set_diag(x, tf.zeros([2, 3]))
  if not self.diag:
    # TODO better dim
    dim0 = melt.get_shape(scores, 0)
    dim1 = melt.get_shape(scores, 1)
    scores = tf.matrix_set_diag(scores, tf.zeros([dim0, dim1]))

  if mask is not None:
    JX = melt.get_shape(x, 1)
    mask = tf.tile(tf.expand_dims(mask, axis=1), [1, JX, 1])
    scores = softmax_mask(scores, mask)

  alpha = tf.nn.softmax(scores)
  self.alpha = alpha

  x = tf.matmul(alpha, x)

  if self.combine is None:
    return x
  else:
    return self.combine(x_, x, training=training)
def LSEnet(model, Ip, u1p, u2p):
  # computation graph that defines least squared estimation of the electric field
  delta_Ep_pred = tf.cast(
      tf.tensordot(u1p, model.G1_real, axes=[[-1], [1]]) +
      tf.tensordot(u2p, model.G2_real, axes=[[-1], [1]]), tf.complex128) + \
      1j * tf.cast(
          tf.tensordot(u1p, model.G1_imag, axes=[[-1], [1]]) +
          tf.tensordot(u2p, model.G2_imag, axes=[[-1], [1]]), tf.complex128)
  delta_Ep_expand = tf.expand_dims(delta_Ep_pred, 2)
  delta_Ep_expand_diff = (delta_Ep_expand[:, 1::2, :, :] -
                          delta_Ep_expand[:, 2::2, :, :])
  y = tf.transpose(Ip[:, 1::2, :] - Ip[:, 2::2, :], [0, 2, 1])
  H = tf.concat(
      [2 * tf.real(delta_Ep_expand_diff), 2 * tf.imag(delta_Ep_expand_diff)],
      axis=2)
  H = tf.transpose(H, [0, 3, 1, 2])
  Ht_H = tf.matmul(tf.transpose(H, [0, 1, 3, 2]), H)
  Ht_H_inv_Ht = tf.matmul(
      tf.matrix_inverse(Ht_H + tf.eye(2, dtype=tf.float64) * 1e-12),
      tf.transpose(H, [0, 1, 3, 2]))
  x_new = tf.squeeze(tf.matmul(Ht_H_inv_Ht, tf.expand_dims(y, -1)), -1)

  n_observ = model.n_observ
  contrast_p = tf.reduce_mean(Ip, axis=2)
  d_contrast_p = tf.reduce_mean(tf.abs(delta_Ep_pred)**2, axis=2)
  Rp = tf.tensordot(
      tf.expand_dims(model.R0 + model.R1 * contrast_p +
                     4 * (model.Q0 + model.Q1 * d_contrast_p) * contrast_p,
                     axis=-1),
      tf.ones((1, model.num_pix), dtype=tf.float64),
      axes=[[-1], [0]]) + 1e-24
  Rp = tf.transpose(Rp, [0, 2, 1])
  R_diff = Rp[:, :, 1::2] + Rp[:, :, 2::2]
  R = tf.matrix_set_diag(
      tf.concat([tf.expand_dims(tf.zeros_like(R_diff), -1)] * (n_observ // 2),
                -1),
      R_diff)
  P_new = tf.matmul(tf.matmul(Ht_H_inv_Ht, R),
                    tf.transpose(Ht_H_inv_Ht, [0, 1, 3, 2]))
  Enp_pred_new = tf.cast(x_new[:, :, 0], dtype=tf.complex128) + \
      1j * tf.cast(x_new[:, :, 1], dtype=tf.complex128)
  return Enp_pred_new, P_new, H
def _decode_verification(self):
  with tf.variable_scope("Cross_passage_verification"):
    batch_size = tf.shape(self.start_label)[0]
    content_probs = tf.reshape(
        self.content_probs,
        [tf.shape(self.p_emb)[0], tf.shape(self.p_emb)[1], 1])  # [batch * 5, p, 1]
    ver_P = content_probs * self.p_emb
    ver_P = tf.reshape(ver_P, [
        batch_size, -1, tf.shape(self.p_emb)[1],
        3 * self.append_wordvec_size + self.vocab.embed_dim
    ])  # [batch, 5, p, wordvec dimension = 3 * 1024 + 300]
    RA = tf.reduce_mean(ver_P, axis=2)  # [batch, 5, wordvec]
    # print("RA_concated.shape = ", RA.shape)
    # Given the representation of the answer candidates from all passages
    # {rAi}, each answer candidate then attends to other candidates to collect
    # supportive information via an attention mechanism.
    S = tf.matmul(RA, RA, transpose_a=False, transpose_b=True)  # [batch, 5, 5]
    S = tf.matrix_set_diag(
        input=S,
        diagonal=tf.zeros(shape=[batch_size, tf.shape(S)[1]], dtype=S.dtype)
    )  # [batch, 5, 5], except the main diagonal of the innermost matrices is all 0
    S = tf.nn.softmax(S, -1)  # [batch, 5, 5], each row is normalized
    RA_Complementary = tf.matmul(S, RA, transpose_a=False, transpose_b=False)
    # Here ~rAi is the collected verification information from other passages
    # based on the attention weights. Then we pass it together with the
    # original representation rAi to a fully connected layer.
    RA_concated = tf.concat(
        [RA, RA_Complementary, RA * RA_Complementary],
        -1)  # [batch, 5, 3 * (3 * 1024 + 300) = 10116]
    g = tc.layers.fully_connected(RA_concated,
                                  num_outputs=self.max_p_num,
                                  activation_fn=None)
    g = tf.reshape(g, shape=[batch_size, -1])  # [batch, 5]
    self.pred_pass_prob = tf.nn.softmax(g, -1)  # [batch, 5]
def get_initial_state(self, batch_size):
  with tf.variable_scope('initial_state', reuse=tf.AUTO_REUSE):
    R_0_params = tf.get_variable(
        name='R_0_params',
        dtype=tf.float32,
        shape=[self.memory_size, self.code_size],
        initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05),
        trainable=self.trainable_memory)
    # note we do not use a zero init for the DKM.
    # this is to allow the DKM RLS algo to compute a nonzero-mean addressing
    # weight distribution when using p(M) to compute the first q(w) during
    # writing of an episode.
    # note our models use a randomized or otherwise asymmetric init for
    # q^(0)(M) instead of assigning it the prior's values, so we can (and do)
    # use a zero init for our models' priors.
    U_0_params = tf.get_variable(
        name='U_0_params',
        dtype=tf.float32,
        shape=[self.memory_size, self.memory_size],
        initializer=tf.zeros_initializer(),
        trainable=self.trainable_memory)

    R_0 = R_0_params
    upper_tri = tf.matrix_band_part(U_0_params, 0, -1)
    strictly_upper_tri = tf.matrix_set_diag(
        upper_tri,
        tf.zeros_like(tf.matrix_diag_part(upper_tri), dtype=tf.float32))
    logdiag = tf.matrix_diag_part(U_0_params)
    U_0_diag = tf.diag(tf.exp(logdiag))
    U_0_offdiag = strictly_upper_tri + tf.transpose(strictly_upper_tri)
    U_0 = U_0_diag + U_0_offdiag

    R = tf.tile(tf.expand_dims(R_0, 0), [batch_size, 1, 1])
    U = tf.tile(tf.expand_dims(U_0, 0), [batch_size, 1, 1])
    return MemoryState(R=R, U=U)
def get_dist_table_novariance(x, dist, symmetric, alpha):
  batch_size = get_shape(x)[0]
  P = pairwise_distance(x, x)
  if dist == 'gauss':
    P = tf.exp(-P)
  elif dist == 'tdis':
    P = tf.pow(1. + P, -1.)

  toset = tf.constant(0., shape=[batch_size], dtype=tf.float32)
  P = tf.matrix_set_diag(P, toset)

  if symmetric == True:
    m = tf.reduce_sum(P)
    P = P / m
  else:
    m = tf.reduce_sum(P, axis=1)
    m = tf.tile(tf.expand_dims(m, axis=1), [1, batch_size])
    P = tf.div(P, m)
    P = 0.5 * (P + tf.transpose(P))
    P = P / batch_size
  return P
def _uniform_correlation_like_matrix(num_rows, batch_shape, dtype, seed):
  """Returns a uniformly random `Tensor` of "correlation-like" matrices.

  A "correlation-like" matrix is a symmetric square matrix with all entries
  between -1 and 1 (inclusive) and 1s on the main diagonal.  Of these,
  the ones that are positive semi-definite are exactly the correlation
  matrices.

  Args:
    num_rows: Python `int` dimension of the correlation-like matrices.
    batch_shape: `Tensor` or Python `tuple` of `int` shape of the
      batch to return.
    dtype: `dtype` of the `Tensor` to return.
    seed: Random seed.

  Returns:
    matrices: A `Tensor` of shape `batch_shape + [num_rows, num_rows]` and
      dtype `dtype`.  Each entry is in [-1, 1], and each matrix along the
      bottom two dimensions is symmetric and has 1s on the main diagonal.
  """
  num_entries = num_rows * (num_rows + 1) / 2
  ones = tf.ones(shape=[num_entries], dtype=dtype)
  # It seems wasteful to generate random values for the diagonal since
  # I am going to throw them away, but `fill_triangular` fills the
  # diagonal, so I probably need them.
  # It's not impossible that it would be more efficient to just fill
  # the whole matrix with random values instead of messing with
  # `fill_triangular`.  Then would need to filter almost half out with
  # `matrix_band_part`.
  unifs = uniform.Uniform(-ones, ones).sample(batch_shape, seed=seed)
  tril = util.fill_triangular(unifs)
  symmetric = tril + tf.matrix_transpose(tril)
  diagonal_ones = tf.ones(
      shape=util.pad(batch_shape, axis=0, back=True, value=num_rows),
      dtype=dtype)
  return tf.matrix_set_diag(symmetric, diagonal_ones)
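
# An illustrative sketch of the same construction using only plain tf ops
# (rather than the `uniform` and `util` helpers above): symmetrize a random
# triangle and pin the diagonal to ones with tf.matrix_set_diag, giving a
# single correlation-like matrix.
import tensorflow as tf

num_rows = 4
tril = tf.matrix_band_part(
    tf.random_uniform([num_rows, num_rows], minval=-1., maxval=1.), -1, 0)
symmetric = tril + tf.matrix_transpose(tril)
corr_like = tf.matrix_set_diag(symmetric, tf.ones([num_rows]))
with tf.Session() as sess:
  print(sess.run(corr_like))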
def get_KL_logistic(X, posterior_alpha, prior_lambda_, posterior_lambda_,
                    prior_alpha):
  """Calculates KL divergence between two Concrete distributions using samples
  from the posterior Concrete distribution.

  KL(Concrete(alpha, posterior_lambda_) || Concrete(prior_alpha, prior_lambda))

  Args:
    X: Tensor of shape S x N x N. These are samples from the posterior
      Concrete distribution.
    posterior_alpha: Tensor of shape N x N. alpha for posterior distributions.
    prior_lambda_: Tensor of shape (). prior_lambda_ of prior distribution.
    posterior_lambda_: Tensor of shape (). posterior_lambda_ for posterior
      distribution.
    prior_alpha: Tensor of shape N x N. alpha for prior distributions.

  Returns:
    Tensor of shape () representing KL divergence between the two Concrete
      distributions.
  """
  logdiff = Latnet.logp_logistic(
      X, posterior_alpha, posterior_lambda_) - Latnet.logp_logistic(
          X, prior_alpha, prior_lambda_)
  logdiff = tf.matrix_set_diag(
      logdiff,
      tf.zeros((tf.shape(logdiff)[0], tf.shape(logdiff)[1]),
               dtype=Latnet.FLOAT))  # set diagonal part to zero
  return tf.reduce_sum(tf.reduce_mean(logdiff, [0]))
def _assertions(self, x):
  if not self.validate_args:
    return []
  shape = tf.shape(x)
  is_matrix = tf.assert_rank_at_least(
      x, 2, message="Input must have rank at least 2.")
  is_square = tf.assert_equal(
      shape[-2], shape[-1], message="Input must be a square matrix.")
  above_diagonal = tf.matrix_band_part(
      tf.matrix_set_diag(x, tf.zeros(shape[:-1], dtype=tf.float32)), 0, -1)
  is_lower_triangular = tf.assert_equal(
      above_diagonal, tf.zeros_like(above_diagonal),
      message="Input must be lower triangular.")
  # A lower triangular matrix is nonsingular iff all its diagonal entries are
  # nonzero.
  diag_part = tf.matrix_diag_part(x)
  is_nonsingular = tf.assert_none_equal(
      diag_part, tf.zeros_like(diag_part),
      message="Input must have all diagonal entries nonzero.")
  return [is_matrix, is_square, is_lower_triangular, is_nonsingular]
def SAMME_R_voting_strategy(logits):
  """Algorithm 4 of "Multi-class AdaBoost" by Zhu et al. 2006.

  PDF: Can be found at the bottom of page 9
  (https://web.stanford.edu/~hastie/Papers/samme.pdf)

  Args:
    See `voting strategy`
  """
  class_num = logits[0].get_shape().as_list()[-1]
  for x in logits:
    assert x.shape == logits[0].shape
  log_probs = [tf.log(tf.nn.softmax(l)) for l in logits]
  # two steps to get a matrix of -1 / class_num except for the diagonal,
  # which is 1
  hk_inner_prod = tf.constant(
      (-1 / class_num), dtype=tf.float32, shape=(class_num, class_num))
  hk_inner_prod = tf.matrix_set_diag(hk_inner_prod, tf.ones([class_num]))
  h_ks = [(class_num - 1) * tf.matmul(lp, hk_inner_prod) for lp in log_probs]
  return tf.accumulate_n(h_ks)
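
# A minimal sketch (hypothetical, with class_num = 3) of the coding matrix
# built inside SAMME_R_voting_strategy: a constant matrix filled with
# -1 / class_num, with the diagonal overwritten to 1 via tf.matrix_set_diag.
import tensorflow as tf

class_num = 3
hk = tf.constant(-1. / class_num, dtype=tf.float32,
                 shape=(class_num, class_num))
hk = tf.matrix_set_diag(hk, tf.ones([class_num]))
with tf.Session() as sess:
  print(sess.run(hk))  # 1 on the diagonal, -1/3 everywhere else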
def testSquareBatch(self):
  with self.test_session(use_gpu=self._use_gpu):
    v_batch = np.array([[-1.0, -2.0, -3.0], [-4.0, -5.0, -6.0]])
    mat_batch = np.array(
        [[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
         [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]])
    mat_set_diag_batch = np.array(
        [[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0], [1.0, 0.0, -3.0]],
         [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]])
    output = tf.matrix_set_diag(mat_batch, v_batch)
    self.assertEqual((2, 3, 3), output.get_shape())
    self.assertAllEqual(mat_set_diag_batch, output.eval())
def _covariance(self):
  # Derivation: https://sachinruk.github.io/blog/von-Mises-Fisher/
  event_dim = self.event_shape[0].value
  if event_dim is None:
    raise ValueError('event shape must be statically known for _bessel_ive')
  # TODO(bjp): Enable this; numerically unstable.
  if event_dim > 2:
    raise ValueError('vMF covariance is numerically unstable for dim > 2')
  concentration = self.concentration[..., tf.newaxis]
  safe_conc = tf.where(
      concentration > 0, concentration, tf.ones_like(concentration))
  h = (_bessel_ive(event_dim / 2, safe_conc) /
       _bessel_ive(event_dim / 2 - 1, safe_conc))
  intermediate = (
      tf.matmul(self.mean_direction[..., :, tf.newaxis],
                self.mean_direction[..., tf.newaxis, :]) *
      (1 - event_dim * h / safe_conc - h**2)[..., tf.newaxis])
  cov = tf.matrix_set_diag(
      intermediate, tf.matrix_diag_part(intermediate) + (h / safe_conc))
  return tf.where(
      concentration[..., tf.newaxis] > tf.zeros_like(cov), cov,
      tf.linalg.eye(event_dim, batch_shape=self.batch_shape_tensor()) /
      event_dim)
def testDefaultsYieldCorrectShapesAndValues(self):
  batch_shape = [4, 3]
  x_size = 3
  mvn_size = 5
  x_ = np.random.randn(*np.concatenate([batch_shape, [x_size]]))
  x = tf.constant(x_)
  mvn = tfp.trainable_distributions.multivariate_normal_tril(x, dims=mvn_size)
  scale = mvn.scale.to_dense()
  scale_upper = tf.matrix_set_diag(
      tf.matrix_band_part(scale, num_lower=0, num_upper=-1),
      tf.zeros(np.concatenate([batch_shape, [mvn_size]]), scale.dtype))
  scale_diag = tf.matrix_diag_part(scale)

  self.evaluate(tf.global_variables_initializer())
  [
      batch_shape_,
      event_shape_,
      scale_diag_,
      scale_upper_,
  ] = self.evaluate([
      mvn.batch_shape_tensor(),
      mvn.event_shape_tensor(),
      scale_diag,
      scale_upper,
  ])

  self.assertAllEqual(batch_shape, mvn.batch_shape)
  self.assertAllEqual(batch_shape, batch_shape_)
  self.assertAllEqual([mvn_size], mvn.event_shape)
  self.assertAllEqual([mvn_size], event_shape_)
  self.assertAllEqual(np.ones_like(scale_diag_, dtype=np.bool),
                      scale_diag_ > 0.)
  self.assertAllEqual(np.zeros_like(scale_upper_), scale_upper_)
def weight_change_for_layer(self, meta_opt, l_idx, w_base, b_base, upper_h,
                            lower_h, upper_x, lower_x, prefix, include_bias):
  """Compute the change in weights for each layer.

  This computes something roughly analogous to a gradient.
  """
  reduce_upper_h = upper_h
  reduce_lower_h = lower_h

  BS = lower_x.shape.as_list()[0]

  change_w_terms = dict()

  # initial weight value normalized
  # normalize the weights per receptive-field, rather than per-matrix
  weight_scale = tf.rsqrt(
      tf.reduce_mean(w_base**2, axis=0, keepdims=True) + 1e-6)
  w_base *= weight_scale

  change_w_terms['w_base'] = w_base

  # this will act to decay larger weights towards zero
  change_w_terms['large_decay'] = w_base**2 * tf.sign(w_base)

  # term based on activations
  ux0 = upper_x - tf.reduce_mean(upper_x, axis=0, keepdims=True)
  uxs0 = ux0 * tf.rsqrt(tf.reduce_mean(ux0**2, axis=0, keepdims=True) + 1e-6)
  change_U = tf.matmul(uxs0, uxs0, transpose_a=True) / BS
  change_U /= tf.sqrt(float(change_U.shape.as_list()[0]))

  cw = tf.matmul(w_base, change_U)
  cw_scale = tf.rsqrt(tf.reduce_mean(cw**2 + 1e-8))
  cw *= cw_scale
  change_w_terms['decorr_x'] = cw

  # hebbian term
  lx0 = lower_x - tf.reduce_mean(lower_x, axis=0, keepdims=True)
  lxs0 = lx0 * tf.rsqrt(tf.reduce_mean(lx0**2, axis=0, keepdims=True) + 1e-6)
  cw = tf.matmul(lxs0, uxs0, transpose_a=True) / BS
  change_w_terms['hebb'] = -cw

  # 0th order term
  w_term = meta_opt.low_rank_readout(prefix + 'weight_readout_0', upper_h,
                                     lower_h)
  change_w_terms['0_order'] = w_term

  # rbf term (weight update scaled by distance from 0)
  w_term = meta_opt.low_rank_readout(prefix + 'weight_readout_rbf',
                                     reduce_upper_h, reduce_lower_h)
  change_w_terms['rbf'] = tf.exp(-w_base**2) * w_term

  # 1st order term (weight dependent update to weights)
  w_term = meta_opt.low_rank_readout(prefix + 'weight_readout_1',
                                     reduce_upper_h, reduce_lower_h)
  change_w_terms['1_order'] = w_base * w_term

  # more terms based on single layer readouts.
  for update_type in ['lin', 'sqr']:
    for h_source, h_source_name in [(reduce_upper_h, 'upper'),
                                    (reduce_lower_h, 'lower')]:
      structures = ['symm']
      if update_type == 'lin' and h_source_name == 'upper':
        structures += ['psd']
      for structure in structures:
        name = update_type + '_' + h_source_name + '_' + structure
        if structure == 'symm':
          change_U = meta_opt.low_rank_readout(prefix + name, h_source,
                                               h_source)
          change_U = (change_U + tf.transpose(change_U)) / tf.sqrt(2.)
          change_U = tf.matrix_set_diag(
              change_U, tf.zeros([change_U.shape.as_list()[0]]))
        elif structure == 'psd':
          change_U = meta_opt.low_rank_readout(
              prefix + name, h_source, None, psd=True)
        else:
          assert False
        change_U /= tf.sqrt(float(change_U.shape.as_list()[0]))

        if update_type == 'lin':
          sign_multiplier = tf.ones_like(w_base)
          w_base_l = w_base
        elif update_type == 'sqr':
          sign_multiplier = tf.sign(w_base)
          w_base_l = tf.sqrt(1. + w_base**2) - 1.

        if h_source_name == 'upper':
          cw = tf.matmul(w_base_l, change_U)  # [N^l-1 x N^l]
        elif h_source_name == 'lower':
          cw = tf.matmul(change_U, w_base_l)
        change_w_terms[name] = cw * sign_multiplier

  if prefix == 'forward':
    change_w = meta_opt.merge_change_w_forward(
        change_w_terms, global_prefix=prefix, prefix='l%d' % l_idx)
  elif prefix == 'backward':
    change_w = meta_opt.merge_change_w_backward(
        change_w_terms, global_prefix=prefix, prefix='l%d' % l_idx)
  else:
    assert False

  if not include_bias:
    return change_w

  change_b = tf.reduce_mean(meta_opt.bias_readout(upper_h), [0])

  # force nonlinearities to be exercised -- biases can't all be increased
  # without bound
  change_b_mean = tf.reduce_mean(change_b)
  offset = -tf.nn.relu(-change_b_mean)
  change_b -= offset

  var = tf.reduce_mean(tf.square(change_b), [0], keepdims=True)
  change_b = change_b / tf.sqrt(0.5 + var)
  return change_w, change_b
def _forward(self, x):
  diag = self._diag_bijector.forward(tf.matrix_diag_part(x))
  return tf.matrix_set_diag(x, diag)
def _add_diagonal_shift(matrix, shift):
  diag_plus_shift = tf.matrix_diag_part(matrix) + shift
  return tf.matrix_set_diag(matrix, diag_plus_shift)
def _covariance(self):
  p = self.probs
  ret = -tf.matmul(p[..., None], p[..., None, :])
  return tf.matrix_set_diag(ret, self._variance())
def _add_diagonal_shift(matrix, shift):
  return tf.matrix_set_diag(
      matrix, tf.matrix_diag_part(matrix) + shift, name='add_diagonal_shift')
def _sample_n(self, num_samples, seed=None, name=None):
  """Returns a Tensor of samples from an LKJ distribution.

  Args:
    num_samples: Python `int`. The number of samples to draw.
    seed: Python integer seed for RNG.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    samples: A Tensor of correlation matrices with shape `[n, B, D, D]`,
      where `B` is the shape of the `concentration` parameter, and `D`
      is the `dimension`.

  Raises:
    ValueError: If `dimension` is negative.
  """
  if self.dimension < 0:
    raise ValueError(
        'Cannot sample negative-dimension correlation matrices.')
  # Notation below: B is the batch shape, i.e., tf.shape(concentration)
  seed = seed_stream.SeedStream(seed, 'sample_lkj')
  with tf.name_scope('sample_lkj', name, [self.concentration]):
    if not self.concentration.dtype.is_floating:
      raise TypeError('The concentration argument should have floating type,'
                      ' not {}'.format(self.concentration.dtype.name))

    concentration = _replicate(num_samples, self.concentration)
    concentration_shape = tf.shape(concentration)
    if self.dimension <= 1:
      # For any dimension <= 1, there is only one possible correlation matrix.
      shape = tf.concat([
          concentration_shape, [self.dimension, self.dimension]], axis=0)
      return tf.ones(shape=shape, dtype=self.concentration.dtype)
    beta_conc = concentration + (self.dimension - 2.) / 2.
    beta_dist = beta.Beta(concentration1=beta_conc, concentration0=beta_conc)

    # Note that the sampler below deviates from [1], by doing the sampling in
    # cholesky space. This does not change the fundamental logic of the
    # sampler, but does speed up the sampling.

    # This is the correlation coefficient between the first two dimensions.
    # This is also `r` in reference [1].
    corr12 = 2. * beta_dist.sample(seed=seed()) - 1.

    # Below we construct the Cholesky of the initial 2x2 correlation matrix,
    # which is of the form:
    # [[1, 0], [r, sqrt(1 - r**2)]], where r is the correlation between the
    # first two dimensions.
    # This is the top-left corner of the cholesky of the final sample.
    first_row = tf.concat([
        tf.ones_like(corr12)[..., tf.newaxis],
        tf.zeros_like(corr12)[..., tf.newaxis]], axis=-1)
    second_row = tf.concat([
        corr12[..., tf.newaxis],
        tf.sqrt(1 - corr12**2)[..., tf.newaxis]], axis=-1)

    chol_result = tf.concat([
        first_row[..., tf.newaxis, :],
        second_row[..., tf.newaxis, :]], axis=-2)

    for n in range(2, self.dimension):
      # Loop invariant: on entry, result has shape B + [n, n]
      beta_conc -= 0.5
      # norm is y in reference [1].
      norm = beta.Beta(
          concentration1=n / 2.,
          concentration0=beta_conc
      ).sample(seed=seed())
      # distance shape: B + [1] for broadcast
      distance = tf.sqrt(norm)[..., tf.newaxis]
      # direction is u in reference [1].
      # direction shape: B + [n]
      direction = _uniform_unit_norm(
          n, concentration_shape, self.concentration.dtype, seed)
      # raw_correlation is w in reference [1].
      raw_correlation = distance * direction  # shape: B + [n]

      # This is the next row in the cholesky of the result,
      # which differs from the construction in reference [1].
      # In the reference, the new row `z` = chol_result @ raw_correlation^T
      # = C @ raw_correlation^T (where as short hand we use C = chol_result).
      # We prove that the below equation is the right row to add to the
      # cholesky, by showing equality with reference [1].
      # Let S be the sample constructed so far, and let `z` be as in
      # reference [1]. Then at this iteration, the new sample S' will be
      #   [[S z^T]
      #    [z 1]]
      # In our case we have the cholesky decomposition factor C, so
      # we want our new row x (same size as z) to satisfy:
      #   [[S z^T]   [[C 0]   [[C^T x^T]   [[CC^T  Cx^T       ]
      #    [z 1]]  =  [x k]] . [0   k]]  =  [xC^T  xx^T + k**2]]
      # Since C @ raw_correlation^T = z = C @ x^T, and C is invertible,
      # we have that x = raw_correlation. Also 1 = xx^T + k**2, so k
      # = sqrt(1 - xx^T) = sqrt(1 - |raw_correlation|**2) = sqrt(1 -
      # distance**2).
      new_row = tf.concat(
          [raw_correlation, tf.sqrt(1. - norm[..., tf.newaxis])], axis=-1)

      # Finally add this new row, by growing the cholesky of the result.
      chol_result = tf.concat([
          chol_result,
          tf.zeros_like(chol_result[..., 0][..., tf.newaxis])], axis=-1)

      chol_result = tf.concat(
          [chol_result, new_row[..., tf.newaxis, :]], axis=-2)

    result = tf.matmul(chol_result, chol_result, transpose_b=True)
    # The diagonal for a correlation matrix should always be ones. Due to
    # numerical instability the matmul might not achieve that, so manually set
    # these to ones.
    result = tf.matrix_set_diag(result, tf.ones(
        shape=tf.shape(result)[:-1], dtype=result.dtype.base_dtype))
    # This sampling algorithm can produce near-PSD matrices on which standard
    # algorithms such as `tf.cholesky` or `tf.linalg.self_adjoint_eigvals`
    # fail. Specifically, as documented in b/116828694, around 2% of trials
    # of 900,000 5x5 matrices (distributed according to 9 different
    # concentration parameter values) contained at least one matrix on which
    # the Cholesky decomposition failed.
    return result
def _covariance(self):
  x = self._variance_scale_term() * self._mean()
  return tf.matrix_set_diag(
      -tf.matmul(x[..., tf.newaxis], x[..., tf.newaxis, :]),  # outer prod
      self._variance())
def testInvalidShape(self):
  with self.assertRaisesRegexp(ValueError, "must be at least rank 2"):
    tf.matrix_set_diag(0, [0])
  with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
    tf.matrix_set_diag([[0]], 0)
def make_tril_scale(loc=None,
                    scale_tril=None,
                    scale_diag=None,
                    scale_identity_multiplier=None,
                    shape_hint=None,
                    validate_args=False,
                    assert_positive=False,
                    name=None):
  """Creates a LinearOperator representing a lower triangular matrix.

  Args:
    loc: Floating-point `Tensor`. This is used for inferring shape in the case
      where only `scale_identity_multiplier` is set.
    scale_tril: Floating-point `Tensor` representing the lower triangular
      matrix. `scale_tril` has shape [N1, N2, ... k, k], which represents a
      k x k lower triangular matrix.
      When `None` no `scale_tril` term is added to the LinearOperator.
      The upper triangular elements above the diagonal are ignored.
    scale_diag: Floating-point `Tensor` representing the diagonal matrix.
      `scale_diag` has shape [N1, N2, ... k], which represents a k x k
      diagonal matrix.
      When `None` no diagonal term is added to the LinearOperator.
    scale_identity_multiplier: floating point rank 0 `Tensor` representing a
      scaling done to the identity matrix.
      When `scale_identity_multiplier = scale_diag = scale_tril = None` then
      `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
      to `scale`.
    shape_hint: scalar integer `Tensor` representing a hint at the dimension
      of the identity matrix when only `scale_identity_multiplier` is set.
    validate_args: Python `bool` indicating whether arguments should be
      checked for correctness.
    assert_positive: Python `bool` indicating whether LinearOperator should be
      checked for being positive definite.
    name: Python `str` name given to ops managed by this object.

  Returns:
    `LinearOperator` representing a lower triangular matrix.

  Raises:
    ValueError: If only `scale_identity_multiplier` is set and `loc` and
      `shape_hint` are both None.
  """

  def _maybe_attach_assertion(x):
    if not validate_args:
      return x
    if assert_positive:
      return control_flow_ops.with_dependencies([
          tf.assert_positive(
              tf.matrix_diag_part(x),
              message="diagonal part must be positive"),
      ], x)
    return control_flow_ops.with_dependencies([
        tf.assert_none_equal(
            tf.matrix_diag_part(x),
            tf.zeros([], x.dtype),
            message="diagonal part must be non-zero"),
    ], x)

  with tf.name_scope(
      name,
      "make_tril_scale",
      values=[loc, scale_diag, scale_identity_multiplier]):

    loc = _convert_to_tensor(loc, name="loc")
    scale_tril = _convert_to_tensor(scale_tril, name="scale_tril")
    scale_diag = _convert_to_tensor(scale_diag, name="scale_diag")
    scale_identity_multiplier = _convert_to_tensor(
        scale_identity_multiplier, name="scale_identity_multiplier")

  if scale_tril is not None:
    scale_tril = tf.matrix_band_part(scale_tril, -1, 0)  # Zero out TriU.
    tril_diag = tf.matrix_diag_part(scale_tril)
    if scale_diag is not None:
      tril_diag += scale_diag
    if scale_identity_multiplier is not None:
      tril_diag += scale_identity_multiplier[..., tf.newaxis]

    scale_tril = tf.matrix_set_diag(scale_tril, tril_diag)

    return tf.linalg.LinearOperatorLowerTriangular(
        tril=_maybe_attach_assertion(scale_tril),
        is_non_singular=True,
        is_self_adjoint=False,
        is_positive_definite=assert_positive)

  return make_diag_scale(
      loc=loc,
      scale_diag=scale_diag,
      scale_identity_multiplier=scale_identity_multiplier,
      shape_hint=shape_hint,
      validate_args=validate_args,
      assert_positive=assert_positive,
      name=name)
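
# A hedged usage sketch for make_tril_scale, assuming the surrounding module
# (with its `_convert_to_tensor` helper) is in scope: an explicit lower
# triangular factor combined with an identity multiplier, materialized as a
# dense matrix. The diagonal of the factor gets the 0.5 bump.
import numpy as np
import tensorflow as tf

scale_tril = np.array([[2., 0.], [1., 3.]], dtype=np.float32)
op = make_tril_scale(scale_tril=scale_tril, scale_identity_multiplier=0.5)
with tf.Session() as sess:
  print(sess.run(op.to_dense()))
  # [[2.5 0. ]
  #  [1.  3.5]]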
def LaplacianMatrix(lengths, arcs, forest=False):
  r"""Returns the (root-augmented) Laplacian matrix for a batch of digraphs.

  Args:
    lengths: [B] vector of input sequence lengths.
    arcs: [B,M,M] tensor of arc potentials where entry b,t,s is the potential
      of the arc from s to t in the b'th digraph, while b,t,t is the potential
      of t as a root.  Entries b,t,s where t or s >= lengths[b] are ignored.
    forest: Whether to produce a Laplacian for trees or forests.

  Returns:
    [B,M,M] tensor L with the Laplacian of each digraph, padded with an
    identity matrix.  More concretely, the padding entries (t or s >=
    lengths[b]) are:
      L_{b,t,t} = 1.0
      L_{b,t,s} = 0.0
    Note that this "identity matrix padding" ensures that the determinant of
    each padded matrix equals the determinant of the unpadded matrix.  The
    non-padding entries (t,s < lengths[b]) depend on whether the Laplacian is
    constructed for trees or forests.  For trees:
      L_{b,t,0} = arcs[b,t,t]
      L_{b,t,t} = \sum_{s < lengths[b], t != s} arcs[b,t,s]
      L_{b,t,s} = -arcs[b,t,s]
    For forests:
      L_{b,t,t} = \sum_{s < lengths[b]} arcs[b,t,s]
      L_{b,t,s} = -arcs[b,t,s]

    See http://www.aclweb.org/anthology/D/D07/D07-1015.pdf for details, though
    note that our matrices are transposed from their notation.
  """
  check.Eq(arcs.get_shape().ndims, 3, 'arcs must be rank 3')
  dtype = arcs.dtype.base_dtype

  arcs_shape = tf.shape(arcs)
  batch_size = arcs_shape[0]
  max_length = arcs_shape[1]
  with tf.control_dependencies([tf.assert_equal(max_length, arcs_shape[2])]):
    valid_arc_bxmxm, valid_token_bxm = ValidArcAndTokenMasks(
        lengths, max_length, dtype=dtype)
  invalid_token_bxm = tf.constant(1, dtype=dtype) - valid_token_bxm

  # Zero out all invalid arcs, to avoid polluting bulk summations.
  arcs_bxmxm = arcs * valid_arc_bxmxm

  zeros_bxm = tf.zeros([batch_size, max_length], dtype)
  if not forest:
    # For trees, extract the root potentials and exclude them from the sums
    # computed below.
    roots_bxm = tf.matrix_diag_part(arcs_bxmxm)  # only defined for trees
    arcs_bxmxm = tf.matrix_set_diag(arcs_bxmxm, zeros_bxm)

  # Sum inbound arc potentials for each target token.  These sums will form
  # the diagonal of the Laplacian matrix.  Note that these sums are zero for
  # invalid tokens, since their arc potentials were masked out above.
  sums_bxm = tf.reduce_sum(arcs_bxmxm, 2)

  if forest:
    # For forests, zero out the root potentials after computing the sums above
    # so we don't cancel them out when we subtract the arc potentials.
    arcs_bxmxm = tf.matrix_set_diag(arcs_bxmxm, zeros_bxm)

  # The diagonal of the result is the combination of the arc sums, which are
  # non-zero only on valid tokens, and the invalid token indicators, which are
  # non-zero only on invalid tokens.  Note that the latter form the diagonal
  # of the identity matrix padding.
  diagonal_bxm = sums_bxm + invalid_token_bxm

  # Combine sums and negative arc potentials.  Note that the off-diagonal
  # padding entries will be zero thanks to the arc mask.
  laplacian_bxmxm = tf.matrix_diag(diagonal_bxm) - arcs_bxmxm

  if not forest:
    # For trees, replace the first column with the root potentials.
    roots_bxmx1 = tf.expand_dims(roots_bxm, 2)
    laplacian_bxmxm = tf.concat([roots_bxmx1, laplacian_bxmxm[:, :, 1:]], 2)

  return laplacian_bxmxm
def _inverse(self, y):
  diag = self._diag_bijector.inverse(tf.matrix_diag_part(y))
  return tf.matrix_set_diag(y, diag)
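
# A standalone sketch of the forward/inverse pattern above, assuming (for
# illustration only) that the wrapped diag bijector is a softplus: the
# diagonal is transformed in place with tf.matrix_set_diag while off-diagonal
# entries are untouched, and applying the inverse recovers the original matrix.
import numpy as np
import tensorflow as tf

x = tf.constant(np.random.randn(3, 3))  # float64
forward = tf.matrix_set_diag(x, tf.nn.softplus(tf.matrix_diag_part(x)))
# Softplus inverse: log(exp(y) - 1) = y + log(1 - exp(-y)).
diag_y = tf.matrix_diag_part(forward)
inverse = tf.matrix_set_diag(forward, diag_y + tf.log(-tf.expm1(-diag_y)))
with tf.Session() as sess:
  x_val, roundtrip = sess.run([x, inverse])
  np.testing.assert_allclose(x_val, roundtrip, atol=1e-6)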