def testInvalidAxis(self):
  """Checks that `linalg_ops.norm` raises ValueError for malformed `axis`."""
  matrix = [[0., 1.], [2., 3.]]
  # The expected message is the same for every malformed axis below.
  expected_message = ("'axis' must be None, an integer, or a tuple of 2 unique "
                      "integers")
  malformed_axes = ([], [1, 2, 3], [[1]], [[1], [2]], [3.1415], [1, 1])
  for bad_axis in malformed_axes:
    with self.assertRaisesRegexp(ValueError, expected_message):
      linalg_ops.norm(matrix, axis=bad_axis)
def testShapesValues(self):
  """Checks output shape and isometry of conv3d with an orthogonal kernel.

  For kernel sizes 1, 2 and 3 the random input is circularly padded, convolved
  with `padding="valid"`, and the ratio between the output and input 2-norms
  is compared against `np.sqrt(gain)`.
  """

  def circular_pad(input_, width, kernel_size):
    """Pads `input_` for computing a circular convolution.

    Args:
      input_: the input tensor.
      width: the width of the tensor.
      kernel_size: the kernel size of the filter.

    Returns:
      a tensor whose width is (width + kernel_size - 1).
    """
    lead = kernel_size // 2
    trail = kernel_size - 1 - lead

    def wrap_axis(tensor, axis):
      # Prepend the last `lead` entries and append the first `trail` entries
      # of `tensor` along `axis`; all other axes are taken whole.
      tail_begin = [0, 0, 0, 0, 0]
      tail_begin[axis] = width - lead
      tail_size = [-1, -1, -1, -1, -1]
      tail_size[axis] = lead
      head_size = [-1, -1, -1, -1, -1]
      head_size[axis] = trail
      tail = array_ops.slice(tensor, tail_begin, tail_size)
      head = array_ops.slice(tensor, [0, 0, 0, 0, 0], head_size)
      return array_ops.concat([tail, tensor, head], axis)

    padded = input_
    for axis in (1, 2, 3):
      padded = wrap_axis(padded, axis)
    return padded

  cout = 32
  shape = [1, 7, 7, 7, 16]
  outputs_shape = shape[0:-1] + [cout]
  dtype = dtypes.float32
  tol = 1e-3
  gain = 3.14
  # Check orthogonality/isometry by computing the ratio between
  # the 2-norms of the inputs and outputs.
  for kernel_size in [[1, 1, 1], [2, 2, 2], [3, 3, 3]]:
    side = kernel_size[0]
    inputs = random_ops.random_normal(shape, dtype=dtype)
    inputs_2norm = linalg_ops.norm(inputs)
    padded_inputs = circular_pad(inputs, shape[1], side)
    outputs = convolutional.conv3d(
        padded_inputs,
        padding="valid",
        filters=cout,
        kernel_size=side,
        use_bias=False,
        kernel_initializer=init_ops.convolutional_orthogonal_3d(gain=gain))
    ratio = linalg_ops.norm(outputs) / inputs_2norm
    init_op = variables.global_variables_initializer()
    with self.test_session(use_gpu=True) as sess:
      sess.run(init_op)
      # Check the shape of the outputs
      evaluated = outputs.eval()
      self.assertAllEqual(evaluated.shape, outputs_shape)
      # Check isometry of the orthogonal kernel.
      self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
def testShapesValues(self):
  """Checks output shape and isometry of delta-orthogonal convolutions.

  Kernels of rank 1, 2 and 3 are run through conv1d, conv2d and conv3d with
  `padding="same"`; the ratio between the output and input 2-norms is compared
  against `np.sqrt(gain)`.
  """
  gain = 3.14
  tol = 1e-2
  filters = 128
  # Kernel rank -> (convolution layer, input shape). Any rank other than 1 or
  # 2 falls through to the 3-D configuration.
  layer_by_rank = {
      1: (convolutional.conv1d, [4, 32, 64]),
      2: (convolutional.conv2d, [4, 32, 32, 64]),
  }
  fallback_layer = (convolutional.conv3d, [4, 16, 16, 16, 64])
  kernel_sizes = [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]

  for dtype in [dtypes.float32]:
    for kernel_size in kernel_sizes:
      # Check orthogonality by computing ratio between
      # the 2-norms of the inputs and outputs.
      convolution, shape = layer_by_rank.get(len(kernel_size), fallback_layer)
      inputs = random_ops.random_normal(shape, dtype=dtype)
      inputs_2norm = linalg_ops.norm(inputs)
      outputs = convolution(
          inputs,
          padding="same",
          filters=filters,
          kernel_size=kernel_size,
          use_bias=False,
          kernel_initializer=init_ops.convolutional_delta_orthogonal(
              gain=gain))
      outputs_shape = shape[0:-1] + [filters]
      ratio = linalg_ops.norm(outputs) / inputs_2norm
      init_op = variables.global_variables_initializer()
      with self.test_session(use_gpu=True) as sess:
        sess.run(init_op)
        # Check the shape of the outputs
        evaluated = outputs.eval()
        self.assertAllEqual(evaluated.shape, outputs_shape)
        # Check isometry of the delta-orthogonal kernel.
        self.assertAllClose(
            sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
def _CompareNorm(self, matrix):
  """Compares `linalg_ops.norm` with `np.linalg.norm` on `matrix`.

  `ord_`, `axis_`, `keep_dims_`, `use_static_shape_` and `dtype_` are not
  parameters; they are resolved from the enclosing scope.

  Args:
    matrix: the value to take the norm of; fed either as a constant or
      through a placeholder depending on `use_static_shape_`.
  """
  norm_kwargs = dict(ord=ord_, axis=axis_, keepdims=keep_dims_)
  expected = np.linalg.norm(matrix, **norm_kwargs)
  with self.cached_session(use_gpu=True) as sess:
    if use_static_shape_:
      tf_input = constant_op.constant(matrix)
      tf_norm = linalg_ops.norm(tf_input, **norm_kwargs)
      actual = self.evaluate(tf_norm)
    else:
      tf_input = array_ops.placeholder(dtype_)
      tf_norm = linalg_ops.norm(tf_input, **norm_kwargs)
      actual = sess.run(tf_norm, feed_dict={tf_input: matrix})
  self.assertAllClose(expected, actual, rtol=1e-5, atol=1e-5)
def compute_lr(self, grad, var):
  """Returns the learning rate to apply to `var`, scaled by a trust ratio.

  Args:
    grad: the gradient whose 2-norm enters the trust ratio.
    var: the variable being updated; its `name` is matched against
      `self._skip_list` and its 2-norm enters the trust ratio.

  Returns:
    `self._learning_rate` unchanged when any entry of `self._skip_list` is a
    substring of `var.name`; otherwise `self._learning_rate` multiplied by the
    trust ratio, which is 1.0 wherever either norm is not greater than zero.
  """
  skip_list = self._skip_list
  if skip_list is not None and any(v in var.name for v in skip_list):
    return self._learning_rate

  w_norm = linalg_ops.norm(var, ord=2)
  g_norm = linalg_ops.norm(grad, ord=2)
  w_positive = math_ops.greater(w_norm, 0)
  g_positive = math_ops.greater(g_norm, 0)
  raw_ratio = (self._eeta * w_norm /
               (g_norm + self._weight_decay * w_norm + self._epsilon))
  # Fall back to a ratio of 1.0 whenever one of the norms is not positive.
  ratio_if_w_positive = array_ops.where(g_positive, raw_ratio, 1.0)
  trust_ratio = array_ops.where(w_positive, ratio_if_w_positive, 1.0)
  return self._learning_rate * trust_ratio
def testTransform(self):
  """Checks `_embedding_lookup_and_transform` with max_norm and a transform.

  This tests all combinations of:
    - ids rank 0, 1, >1
    - params sharded/unsharded
  It always applies max_norm. The transform takes the norm of each looked-up
  embedding, and the unsharded result is compared both with a NumPy reference
  and with the sharded results.
  """
  np.random.seed(8)
  l2_norm = 2.
  with self.test_session():
    # Param values are in [l2_norm, l2_norm+1) so it will always clip.
    params = np.random.rand(6, 3) + l2_norm
    params_norm = l2_norm * params / np.sqrt(
        np.sum(params * params, axis=1, keepdims=True))
    # Compute the norm of each embedding. This will change the embedding
    # rank to 0.
    params_norm = np.linalg.norm(params_norm, axis=1)
    transform = lambda x: linalg_ops.norm(x, axis=1)
    for ids_shape in (), (3,), (4, 3), (2, 3, 4):
      # Test ids rank 0, 1, 2, 3.
      ids = np.random.randint(
          params.shape[0], size=np.prod(ids_shape,
                                        dtype=np.int64)).reshape(ids_shape)
      # Compare nonsharded to gather.
      simple = embedding_ops._embedding_lookup_and_transform(
          params, ids, max_norm=l2_norm, transform_fn=transform).eval()
      self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
      # Run a few different sharded versions.
      for procs in 1, 2, 3:
        stride = procs * math_ops.range(params.shape[0] // procs)
        split_params = [
            array_ops.gather(params, stride + p) for p in xrange(procs)
        ]
        sharded = embedding_ops._embedding_lookup_and_transform(
            split_params, ids, max_norm=l2_norm,
            transform_fn=transform).eval()
        # The sharded and unsharded paths may round differently (for example
        # through different sqrt implementations), so compare with a
        # tolerance instead of requiring bit-for-bit equality.
        self.assertAllClose(simple, sharded)
def body(i, prev_c, prev_h, actions, log_probs):
  """Runs one decoding step and records the chosen action.

  Reads `self`, `mode`, `y`, `lstm`, the attention/softmax weights and the
  device embeddings from the enclosing scope.

  Args:
    i: current step index.
    prev_c: previous LSTM cell state.
    prev_h: previous LSTM hidden state.
    actions: array-like with `read`/`write`, holding the action per step.
    log_probs: running sum of the per-step cross-entropy terms.

  Returns:
    The tuple `(i + 1, next_c, next_h, actions, log_probs)` after this step.

  Raises:
    NotImplementedError: if `mode` is not "sample", "greedy" or "target".
  """
  # pylint: disable=g-long-lambda
  hparams = self.hparams
  num_children = hparams.num_children
  hidden_size = hparams.hidden_size

  # The first step is fed the "go" embedding; later steps are fed the
  # embedding of the previously written action.
  signal = control_flow_ops.cond(
      math_ops.equal(i, 0),
      lambda: array_ops.tile(device_go_embedding, [num_children, 1]),
      lambda: embedding_ops.embedding_lookup(device_embeddings,
                                             actions.read(i - 1)))
  if hparams.keep_prob is not None:
    signal = nn_ops.dropout(signal, hparams.keep_prob)
  next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)

  # Attention over `attn_mem`.
  attn_query = math_ops.matmul(next_h, attn_w_2)
  attn_query = array_ops.reshape(attn_query, [num_children, 1, hidden_size])
  attn_hidden = math_ops.tanh(attn_query + attn_mem)
  attn_hidden = array_ops.reshape(
      attn_hidden, [num_children * self.num_groups, hidden_size])
  attn_scores = math_ops.matmul(attn_hidden, attn_v)
  attn_scores = array_ops.reshape(attn_scores,
                                  [num_children, self.num_groups])
  attn_weights = nn_ops.softmax(attn_scores)
  attn_weights = array_ops.reshape(attn_weights,
                                   [num_children, self.num_groups, 1])
  context = math_ops.reduce_sum(attn_mem * attn_weights, axis=1)
  decoder_state = array_ops.concat([next_h, context], axis=1)

  logits = math_ops.matmul(decoder_state, device_softmax)
  logits /= hparams.temperature
  if hparams.tanh_constant > 0:
    logits = math_ops.tanh(logits) * hparams.tanh_constant
  if hparams.logits_std_noise > 0:
    num_in_logits = math_ops.cast(
        array_ops.size(logits), dtype=dtypes.float32)
    avg_norm = math_ops.divide(
        linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
    logits_noise = random_ops.random_normal(
        array_ops.shape(logits),
        stddev=hparams.logits_std_noise * avg_norm)
    # Noise is only added while global_step has not passed stop_noise_step.
    logits = control_flow_ops.cond(
        self.global_step > hparams.stop_noise_step,
        lambda: logits,
        lambda: logits + logits_noise)

  if mode == "sample":
    next_y = random_ops.multinomial(logits, 1, seed=hparams.seed)
  elif mode == "greedy":
    next_y = math_ops.argmax(logits, 1)
  elif mode == "target":
    next_y = array_ops.slice(y, [0, i], [-1, 1])
  else:
    raise NotImplementedError
  next_y = math_ops.to_int32(next_y)
  next_y = array_ops.reshape(next_y, [num_children])

  actions = actions.write(i, next_y)
  log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=next_y)
  return i + 1, next_c, next_h, actions, log_probs
def _verifySolve(self,
                 x,
                 y,
                 dtype,
                 use_placeholder,
                 fast,
                 l2_regularizer,
                 batch_shape=()):
  """Checks `linalg_ops.matrix_solve_ls` against a NumPy reference solution.

  Args:
    x: NumPy matrix; cast to `dtype` to form the left-hand side.
    y: NumPy matrix; cast to `dtype` to form the right-hand side.
    dtype: NumPy dtype under test. For complex dtypes the imaginary parts of
      the cast inputs are set equal to their real parts.
    use_placeholder: if true, the inputs are fed through placeholders.
    fast: forwarded to `matrix_solve_ls` and used as the session's `use_gpu`.
    l2_regularizer: forwarded to `matrix_solve_ls` and `_SolveWithNumpy`.
    batch_shape: tuple of leading batch dimensions; when non-empty the inputs
      and the reference results are tiled to that batch shape.
  """
  if not fast and l2_regularizer != 0:
    # The slow path does not support regularization.
    return
  maxdim = np.max(x.shape)
  if dtype == np.float32 or dtype == np.complex64:
    tol = maxdim * 5e-4
  else:
    tol = maxdim * 5e-7
  a = x.astype(dtype)
  b = y.astype(dtype)
  if dtype in [np.complex64, np.complex128]:
    a.imag = a.real
    b.imag = b.real
  # The NumPy reference solver is not batched, so we solve a single system
  # and replicate the solution and the residual norm.
  np_ans = _SolveWithNumpy(x, y, l2_regularizer=l2_regularizer)
  np_r = np.dot(np.conj(a.T), b - np.dot(a, np_ans))
  np_r_norm = np.sqrt(np.sum(np.conj(np_r) * np_r))
  # Compare by value: an identity test against the `()` literal depends on
  # interpreter-level tuple caching and triggers a SyntaxWarning on
  # Python 3.8+.
  if batch_shape != ():
    a = np.tile(a, batch_shape + (1, 1))
    b = np.tile(b, batch_shape + (1, 1))
    np_ans = np.tile(np_ans, batch_shape + (1, 1))
    np_r_norm = np.tile(np_r_norm, batch_shape)
  with self.cached_session(use_gpu=fast) as sess:
    if use_placeholder:
      a_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
      b_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
      feed_dict = {a_ph: a, b_ph: b}
      tf_ans = linalg_ops.matrix_solve_ls(
          a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer)
    else:
      tf_ans = linalg_ops.matrix_solve_ls(
          a, b, fast=fast, l2_regularizer=l2_regularizer)
      feed_dict = {}
      # The static shape is only checked when the inputs are not placeholders.
      self.assertEqual(np_ans.shape, tf_ans.get_shape())
    if l2_regularizer == 0:
      # The least squares solution should satisfy A^H * (b - A*x) = 0.
      tf_r = b - math_ops.matmul(a, tf_ans)
      tf_r = math_ops.matmul(a, tf_r, adjoint_a=True)
      tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1])
      tf_ans_val, tf_r_norm_val = sess.run(
          [tf_ans, tf_r_norm], feed_dict=feed_dict)
      self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol)
    else:
      tf_ans_val = sess.run(tf_ans, feed_dict=feed_dict)

  self.assertEqual(np_ans.shape, tf_ans_val.shape)
  self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
def mean_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

                |m - m_w|^2

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  In this variant, we only compute the difference between the means of the
  fitted Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet
  still retains much of the same information as FID.

  Args:
    real_activations: 2D array of activations of real images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.
    generated_activations: 2D array of activations of generated images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.

  Returns:
    The mean-only Frechet Inception distance. A floating-point scalar of the
    same type as the output of the activations.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  # The arithmetic is done in float64; the result is cast back at the end.
  needs_cast = activations_dtype != dtypes.float64
  if needs_cast:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute means of activations.
  real_mean = math_ops.reduce_mean(real_activations, 0)
  generated_mean = math_ops.reduce_mean(generated_activations, 0)

  # Next the distance between means. This uses the L2 norm.
  mean_distance = linalg_ops.norm(real_mean - generated_mean)
  mofid = math_ops.square(mean_distance)
  if needs_cast:
    mofid = math_ops.cast(mofid, activations_dtype)
  return mofid
def Test(self):
  """Compares theoretical and numerical gradients of self-adjoint eig ops.

  `shape_`, `dtype_` and `compute_v_` are not parameters; they are resolved
  from the enclosing scope.
  """
  np.random.seed(1)
  n = shape_[-1]
  batch_shape = shape_[:-2]
  np_dtype = dtype_.as_numpy_dtype

  def random_self_adjoint_batch():
    # Draws a random n x n matrix (with a random imaginary part for complex
    # dtypes), adds its conjugate transpose, and tiles it to the batch shape.
    mat = np.random.uniform(
        low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
    if dtype_.is_complex:
      mat += 1j * np.random.uniform(
          low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
    mat += np.conj(mat.T)
    return np.tile(mat, batch_shape + (1, 1))

  a = random_self_adjoint_batch()
  # Optimal stepsize for central difference is O(epsilon^{1/3}).
  epsilon = np.finfo(np_dtype).eps
  delta = 0.1 * epsilon**(1.0 / 3.0)
  # tolerance obtained by looking at actual differences using
  # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
  single_precision = dtype_ in (dtypes_lib.float32, dtypes_lib.complex64)
  tol = 1e-2 if single_precision else 1e-7
  with self.test_session():
    tf_a = constant_op.constant(a)
    if compute_v_:
      tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
      # (complex) Eigenvectors are only unique up to an arbitrary phase
      # We normalize the vectors such that the first component has phase 0.
      first_component_norm = linalg_ops.norm(
          tf_v[..., 0:1, :], axis=-1, keep_dims=True)
      reference = tf_v / first_component_norm
      tf_v *= math_ops.conj(reference)
      outputs = [tf_e, tf_v]
    else:
      tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
      outputs = [tf_e]
    input_shape = tf_a.get_shape().as_list()
    for output in outputs:
      x_init = random_self_adjoint_batch()
      theoretical, numerical = gradient_checker.compute_gradient(
          tf_a,
          input_shape,
          output,
          output.get_shape().as_list(),
          x_init_value=x_init,
          delta=delta)
      self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def make_grouping_predictions(self, input_layer, reuse=None):
  """Model that predicts grouping (grouping_actions).

  Args:
    input_layer: group_input_layer
    reuse: reuse. Not read by this method; the variable scope is always
      entered with `reuse=True`.

  Returns:
    grouping_actions: actions
    grouping_log_probs: log probabilities corresponding to actions
  """
  hparams = self.hparams
  with variable_scope.variable_scope(hparams.name, reuse=True):
    # input_layer: tensor of size [1, num_ops, hidden_size]
    w_grouping_ff = variable_scope.get_variable("w_grouping_ff")
    w_grouping_softmax = variable_scope.get_variable("w_grouping_softmax")

  input_shape_batch = array_ops.shape(input_layer)[0]
  input_shape_embed = array_ops.shape(input_layer)[2]
  batch_size = input_shape_batch
  embedding_dim = input_shape_embed
  flat_rows = batch_size * self.num_ops

  # Feed-forward layer followed by the softmax projection.
  flat_input = array_ops.reshape(input_layer, [flat_rows, embedding_dim])
  ff_output = math_ops.matmul(flat_input, w_grouping_ff)
  logits = math_ops.matmul(ff_output, w_grouping_softmax)

  if hparams.logits_std_noise > 0:
    num_in_logits = math_ops.cast(
        array_ops.size(logits), dtype=dtypes.float32)
    avg_norm = math_ops.divide(
        linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
    logits_noise = random_ops.random_normal(
        array_ops.shape(logits),
        stddev=hparams.logits_std_noise * avg_norm)
    # Noise is only added while global_step has not passed stop_noise_step.
    logits = control_flow_ops.cond(
        self.global_step > hparams.stop_noise_step,
        lambda: logits,
        lambda: logits + logits_noise)

  logits = array_ops.reshape(logits,
                             [batch_size * self.num_ops, self.num_groups])
  sampled = random_ops.multinomial(logits, 1, seed=hparams.seed)
  sampled = math_ops.to_int32(sampled)
  grouping_actions = array_ops.reshape(sampled, [batch_size, self.num_ops])

  action_label = array_ops.reshape(grouping_actions, [-1])
  per_op_log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=action_label)
  per_op_log_probs = array_ops.reshape(per_op_log_probs, [batch_size, -1])
  grouping_log_probs = math_ops.reduce_sum(per_op_log_probs, 1)

  return grouping_actions, grouping_log_probs
def testTransform(self):
  """Checks `_embedding_lookup_and_transform` with max_norm and a transform.

  This tests all combinations of:
    - ids rank 0, 1, >1
    - params sharded/unsharded
  It always applies max_norm.
  """
  np.random.seed(8)
  l2_norm = 2.

  def norm_transform(x):
    return linalg_ops.norm(x, axis=1)

  def lookup(lookup_params, lookup_ids):
    return embedding_ops._embedding_lookup_and_transform(
        lookup_params, lookup_ids, max_norm=l2_norm,
        transform_fn=norm_transform).eval()

  with self.cached_session():
    # Param values are in [l2_norm, l2_norm+1) so it will always clip.
    params = np.random.rand(6, 3) + l2_norm
    row_lengths = np.sqrt(np.sum(params * params, axis=1, keepdims=True))
    clipped_params = l2_norm * params / row_lengths
    # Compute the norm of each embedding. This will change the embedding
    # rank to 0.
    params_norm = np.linalg.norm(clipped_params, axis=1)
    num_rows = params.shape[0]
    # Test ids rank 0, 1, 2, 3. Note that `(3)` is the plain integer 3, which
    # numpy accepts as a one-dimensional shape.
    for ids_shape in ((), (3), (4, 3), (2, 3, 4)):
      num_ids = np.prod(ids_shape, dtype=np.int64)
      ids = np.random.randint(num_rows, size=num_ids).reshape(ids_shape)
      # Compare nonsharded to gather.
      simple = lookup(params, ids)
      self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
      # Run a few different sharded versions.
      for procs in (1, 2, 3):
        stride = procs * math_ops.range(num_rows // procs)
        split_params = [
            array_ops.gather(params, stride + p) for p in xrange(procs)
        ]
        sharded = lookup(split_params, ids)
        # assertAllClose is used here as different implementations of sqrt may
        # be used to compute each of the values being compared. For example,
        # on AVX512 builds the embedding operation makes use of Eigen's fast
        # vectorized square root algorithm for doubles. These different
        # implementations of sqrt are not guaranteed to produce exactly the
        # same results. Therefore, an exact comparison cannot be made.
        self.assertAllClose(simple, sharded)
def testBadOrder(self):
  """Checks that `linalg_ops.norm` raises ValueError for unsupported `ord`."""
  matrix = [[0., 1.], [2., 3.]]
  vector_message = "'ord' must be a supported vector norm"
  matrix_message = "'ord' must be a supported matrix norm"
  # Each case: (orders to try, expected message, extra keyword arguments).
  cases = (
      (("fro", -7, -1.1, 0), vector_message, {}),
      (("fro", -7, -1.1, 0), vector_message, {"axis": -1}),
      (("foo", -7, -1.1, 1.1), matrix_message, {"axis": [-2, -1]}),
  )
  for bad_orders, expected_message, extra_kwargs in cases:
    for bad_order in bad_orders:
      with self.assertRaisesRegexp(ValueError, expected_message):
        linalg_ops.norm(matrix, ord=bad_order, **extra_kwargs)
def operator_and_matrix(self,
                        build_info,
                        dtype,
                        use_placeholder,
                        ensure_self_adjoint_and_pd=False):
  """Builds a Householder operator and the dense matrix it should equal.

  Args:
    build_info: object whose `shape` gives the operator shape; all but the
      last dimension are used as the reflection axis shape.
    dtype: dtype of the randomly drawn reflection axis.
    use_placeholder: if true, the operator receives the axis through
      `placeholder_with_default` with `shape=None`.
    ensure_self_adjoint_and_pd: accepted but not read by this method.

  Returns:
    A pair `(operator, matrix)`: the `LinearOperatorHouseholder` and the
    matrix `I - 2 * v v^H` built from the normalized axis `v`.
  """
  axis_shape = list(build_info.shape)[:-1]
  raw_axis = linear_operator_test_util.random_sign_uniform(
      axis_shape, minval=1., maxval=2., dtype=dtype)
  # Make sure unit norm.
  axis_length = linalg_ops.norm(raw_axis, axis=-1, keepdims=True)
  unit_axis = raw_axis / axis_length

  if use_placeholder:
    operator_axis = array_ops.placeholder_with_default(unit_axis, shape=None)
  else:
    operator_axis = unit_axis
  operator = householder.LinearOperatorHouseholder(operator_axis)

  # Dense reference: -2 * v v^H with 1 added on the diagonal.
  column = unit_axis[..., array_ops.newaxis]
  outer = -2 * math_ops.matmul(column, column, adjoint_b=True)
  shifted_diag = 1. + array_ops.matrix_diag_part(outer)
  matrix = array_ops.matrix_set_diag(outer, shifted_diag)

  return operator, matrix
def squared_frobenius_norm(x):
  """Returns the squared Frobenius norm of `x` over its last two axes.

  Helper to make KL calculation slightly more readable.
  """
  # http://mathworld.wolfram.com/FrobeniusNorm.html
  frobenius = linalg_ops.norm(x, ord="fro", axis=[-2, -1])
  return math_ops.square(frobenius)
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  Args:
    real_images: Real images to use to compute Frechet Inception distance.
    generated_images: Generated images to use to compute Frechet Inception
      distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split images in to in order to
      efficiently run them through the classifier network.

  Returns:
    The Frechet Inception distance. A floating-point scalar.
  """
  real_batches = array_ops.split(real_images, num_or_size_splits=num_batches)
  generated_batches = array_ops.split(
      generated_images, num_or_size_splits=num_batches)
  # Real batches come first, generated batches second.
  imgs = array_ops.stack(real_batches + generated_batches)

  # Compute the activations using the memory-efficient `map_fn`.
  activations = functional_ops.map_fn(
      fn=classifier_fn,
      elems=imgs,
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')

  # Split the activations by the real and generated images.
  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

  # Ensure the activations have the right shapes.
  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
  real_a.shape.assert_has_rank(2)
  gen_a.shape.assert_has_rank(2)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_a, 0)
  m_v = math_ops.reduce_mean(gen_a, 0)
  num_examples = math_ops.to_float(array_ops.shape(real_a)[0])

  def sample_covariance(acts, acts_mean):
    # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
    return math_ops.matmul(
        acts - acts_mean, acts - acts_mean,
        transpose_a=True) / (num_examples - 1)

  sigma = sample_covariance(real_a, m)
  sigma_v = sample_covariance(gen_a, m_v)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  covariance_term = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means. This uses the L2 norm.
  mean_term = math_ops.square(linalg_ops.norm(m - m_v))
  fid = covariance_term + mean_term

  return fid
def monotone_linear_layer(input_tensor,
                          input_dim,
                          output_dim,
                          is_monotone=None,
                          add_bias=True,
                          normalization_order=None,
                          init_weight_mean=2.0,
                          init_weight_stddev=0.5,
                          init_bias=None,
                          l1_reg=None,
                          l2_reg=None):
  """Creates a partially monotonic linear embedding layer.

  Returns an output of partially monotonic linear embedding layer, weights in
  the linear embedding layer, projection ops and regularizers.

    output = input * weight' + bias

  and the weights multiplying input k are constrained to be non-negative, if
  is_monotone[k] == True.
  weight is initialized to entrywise Normal random variable (init_weight_mean,
  init_weight_stddev). If init_bias is not provided, then the initial bias is
  initialized to -1/2 * init_weight_mean * input_dim. This offset term is used
  to make the initial mean to 0, assuming each input tensor is from the uniform
  distribution [0, 1]:

    E[output] = E[input * weight' + bias] = E[input] * E[weight] + bias
      = 1/2 * init_weight_mean * input_dim + bias
      = 0.

  Args:
    input_tensor: [batch_size, input_dim] tensor.
    input_dim: (int) input dimension.
    output_dim: (int) output dimension.
    is_monotone: A list of input_dim booleans, a single boolean, or None. If
      None or False, linear layer will not have monotonicity constraints. If
      True, all of inputs are set to be monotonic. In the case of boolean list,
      input_tensor[:, k] is set to be monotonic if is_monotone[k] == True.
    add_bias: (bool) If a bias term should be added.
    normalization_order: If specified, the returned projection will normalize
      the weight vector across each output dimension to have norm 1. The
      norm order can be 1, 2 or np.inf. Norm is lower bounded by 1e-12.
    init_weight_mean: (float) A mean for Normal random weight initializer.
    init_weight_stddev: (float) A standard deviation for Normal random weight
      initializer.
    init_bias: (float) initial bias. If not provided,
      -1/2 * init_weight_mean * input_dim is used.
    l1_reg: (float) amount of l1 regularization.
    l2_reg: (float) amount of l2 regularization.

  Returns:
    A tuple of:
    * output tensor of shape [batch_size, output_dim]
    * weight tensor of shape [output_dim, input_dim]
    * None or projection ops, that must be applied at each
      step (or every so many steps) to project the model to a feasible space:
      used for bounding the outputs or for imposing monotonicity.
    * None or a regularization loss, if regularization is configured.

  Raises:
    ValueError: If is_monotone is a list whose length != input_dim.
  """
  with variable_scope.variable_scope('monotone_linear'):
    # We use [output_dim, input_dim] convention to use broadcasting in
    # projection.
    init_weights = random_ops.random_normal(
        [output_dim, input_dim],
        mean=init_weight_mean,
        stddev=init_weight_stddev)
    bias_value = (-init_weight_mean * 0.5 * input_dim
                  if init_bias is None else init_bias)
    init_biases = [bias_value] * output_dim

    w = variable_scope.get_variable(
        name='weight', initializer=init_weights, dtype=input_tensor.dtype)
    output_tensor = math_ops.matmul(input_tensor, w, transpose_b=True)
    if add_bias:
      b = variable_scope.get_variable(
          name='bias', initializer=init_biases, dtype=input_tensor.dtype)
      output_tensor = output_tensor + b

    # Constructing a projection op.
    projection = None
    if is_monotone or normalization_order:
      with ops.name_scope('monotonic_projection'):
        # `delta` accumulates the correction that is finally added to w.
        delta = None
        if is_monotone:
          if not isinstance(is_monotone, list):
            # is_monotone is set to True.
            constrained_w = w
          else:
            # is_monotone is given as a list. We should only apply positivity
            # constraints to a masked version of the weights.
            if input_dim != len(is_monotone):
              raise ValueError('input_dim (%d) != is_monotone length (%d)' %
                               (input_dim, len(is_monotone)))
            # Construct a multiplicative mask for monotonic dimension
            # selection.
            monotone_mask = array_ops.constant(
                [1.0 if monotone else 0.0 for monotone in is_monotone],
                dtype=w.dtype)
            # Since input_dim is the last dimension of the weight, we can use
            # broadcasting.
            constrained_w = math_ops.multiply(w, monotone_mask)

          clipped_w = math_ops.maximum(constrained_w, 0.0)
          delta = clipped_w - constrained_w

        if normalization_order:
          unnormalized_w = w if delta is None else w + delta
          row_norms = linalg_ops.norm(
              unnormalized_w,
              ord=normalization_order,
              axis=1,
              keepdims=True)
          normalized_w = unnormalized_w / math_ops.maximum(row_norms, 1e-12)
          delta = normalized_w - w

        projection = w.assign_add(delta)

    # Constructing a regularization op.
    regularizer = None
    if not (l1_reg is None and l2_reg is None):
      with ops.name_scope('linear_regularization'):
        regularizer = regularizers.linear_regularization(w, l1_reg, l2_reg)

    return (output_tensor, w, projection, regularizer)
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  Args:
    real_images: Real images to use to compute Frechet Inception distance.
    generated_images: Generated images to use to compute Frechet Inception
      distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split images in to in order to
      efficiently run them through the classifier network.

  Returns:
    The Frechet Inception distance. A floating-point scalar.
  """
  split_kwargs = dict(num_or_size_splits=num_batches)
  real_images_list = array_ops.split(real_images, **split_kwargs)
  generated_images_list = array_ops.split(generated_images, **split_kwargs)
  # Stack real batches followed by generated batches so that one `map_fn`
  # call runs the classifier over all of them.
  all_batches = array_ops.stack(real_images_list + generated_images_list)

  # Compute the activations using the memory-efficient `map_fn`.
  activations = functional_ops.map_fn(
      fn=classifier_fn,
      elems=all_batches,
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')

  # Split the activations by the real and generated images.
  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

  # Ensure the activations have the right shapes.
  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
  for flattened in (real_a, gen_a):
    flattened.shape.assert_has_rank(2)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_a, 0)
  m_v = math_ops.reduce_mean(gen_a, 0)
  num_examples = math_ops.to_float(array_ops.shape(real_a)[0])

  # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
  real_scatter = math_ops.matmul(real_a - m, real_a - m, transpose_a=True)
  sigma = real_scatter / (num_examples - 1)
  gen_scatter = math_ops.matmul(gen_a - m_v, gen_a - m_v, transpose_a=True)
  sigma_v = gen_scatter / (num_examples - 1)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means. This uses the L2 norm.
  mean_gap = linalg_ops.norm(m - m_v)
  mean = math_ops.square(mean_gap)

  return trace + mean
def diagonal_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images. In this variant, we compute diagonal-only covariance matrices.
  As a result, instead of computing an expensive matrix square root, we can do
  something much simpler, and has O(n) vs O(n^2) space complexity.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  Args:
    real_activations: Real images to use to compute Frechet Inception
      distance.
    generated_activations: Generated images to use to compute Frechet
      Inception distance.

  Returns:
    The diagonal-only Frechet Inception distance. A floating-point scalar of
    the same type as the output of the activations.

  Raises:
    ValueError: If the shape of the variance and mean vectors are not equal.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  # The arithmetic is done in float64; the result is cast back at the end.
  needs_cast = activations_dtype != dtypes.float64
  if needs_cast:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute mean and (diagonal) variance of activations.
  m, var = nn_impl.moments(real_activations, axes=[0])
  m_w, var_w = nn_impl.moments(generated_activations, axes=[0])

  actual_shape = var.get_shape()
  expected_shape = m.get_shape()
  if actual_shape != expected_shape:
    raise ValueError('shape: {} must match expected shape: {}'.format(
        actual_shape, expected_shape))

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  variance_sum = var + var_w
  variance_geometric_mean = math_ops.sqrt(math_ops.multiply(var, var_w))
  trace = math_ops.reduce_sum(variance_sum - 2.0 * variance_geometric_mean)

  # Next the distance between means. This uses the L2 norm.
  mean = math_ops.square(linalg_ops.norm(m - m_w))

  dofid = trace + mean
  if needs_cast:
    dofid = math_ops.cast(dofid, activations_dtype)
  return dofid
def process_quadrature_grid_and_probs(quadrature_grid_and_probs, dtype,
                                      validate_args, name=None):
  """Validates a quadrature grid/probs pair, or builds the default one.

  Args:
    quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s giving
      the sample points and their (possibly unnormalized) weights. When
      `None`, `np.polynomial.hermite.hermgauss(deg=8)` is used.
    dtype: The `dtype` that `grid` and `probs` are converted to.
    validate_args: Python `bool`. When `True` and the leading dimensions are
      not statically known, a runtime assertion that they match is attached
      to `grid`.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    A `(grid, probs)` pair of `Tensor`s; `probs` is divided by its L1 norm
    along the last axis.

  Raises:
    ValueError: If both leading dimensions are statically known and differ.
  """
  with ops.name_scope(name, "process_quadrature_grid_and_probs",
                      [quadrature_grid_and_probs]):
    if quadrature_grid_and_probs is None:
      # Default: 8-point Gauss-Hermite rule, normalised in NumPy before
      # conversion to tensors.
      np_dtype = dtype.as_numpy_dtype
      default_grid, default_probs = np.polynomial.hermite.hermgauss(deg=8)
      default_grid = default_grid.astype(np_dtype)
      default_probs = default_probs.astype(np_dtype)
      default_probs /= np.linalg.norm(default_probs, ord=1, keepdims=True)
      return (ops.convert_to_tensor(default_grid, name="grid", dtype=dtype),
              ops.convert_to_tensor(default_probs, name="probs", dtype=dtype))

    grid, probs = tuple(quadrature_grid_and_probs)
    grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
    raw_probs = ops.convert_to_tensor(
        probs, name="unnormalized_probs", dtype=dtype)
    l1_norm = linalg_ops.norm(
        raw_probs, ord=1, axis=-1, keep_dims=True, name="probs")
    probs = raw_probs / l1_norm

    # Static size of dimension 0 for each tensor (`None` when unknown).
    num_probs = probs.shape.with_rank_at_least(1)[0].value
    num_grid = grid.shape.with_rank_at_least(1)[0].value

    sizes_known = num_probs is not None and num_grid is not None
    if sizes_known:
      if num_probs != num_grid:
        raise ValueError(
            "`quadrature_grid_and_probs` must be a `tuple` of "
            "same-length zero-th-dimension `Tensor`s "
            "(saw lengths {}, {})".format(num_probs, num_grid))
      return grid, probs

    if validate_args:
      # Sizes are only known at run time: make `grid` depend on an equality
      # assertion so a mismatch is reported when the graph executes.
      same_length = check_ops.assert_equal(
          dimension_size(probs, axis=0),
          dimension_size(grid, axis=0),
          message=(
              "`quadrature_grid_and_probs` must be a `tuple` of "
              "same-length zero-th-dimension `Tensor`s"))
      grid = control_flow_ops.with_dependencies([same_length], grid)
    return grid, probs
def is_in_ball(x, radius, center):
  """Returns an indicator of whether `x` is within `radius` of `center`.

  The distance is the norm of `x - center` taken along the last axis; the
  boolean comparison result is cast to the dtype of `x`.
  """
  distance = linalg_ops.norm(x - center, axis=-1)
  inside = distance <= radius
  return math_ops.cast(inside, dtype=x.dtype)
def _show_norm(tensor):
  """Casts `tensor` to float64 and hands its norm to `_print_tensor`.

  `tensor_name` and `_print_tensor` are taken from the enclosing scope, which
  is not part of this block.
  """
  as_double = math_ops.cast(tensor, dtypes.float64)
  norm_value = linalg_ops.norm(as_double)
  return _print_tensor(tensor_name, -1, as_double, norm_value)
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
  """Classifier distance for evaluating a conditional generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier. The technique is described in detail in
  https://arxiv.org/abs/1706.08500. Given two Gaussian distributions with
  means m and m_v and covariance matrices C and C_v, this function calculates

      |m - m_v|^2 + Tr(C + C_v - 2(C * C_v)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their classifier activations) are.

  Args:
    real_images: Real images to use to compute the distance.
    generated_images: Generated images to use to compute the distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split the images into in order to
      efficiently run them through the classifier network.

  Returns:
    The Frechet classifier distance. A floating-point scalar.
  """
  real_images_list = array_ops.split(
      real_images, num_or_size_splits=num_batches)
  generated_images_list = array_ops.split(
      generated_images, num_or_size_splits=num_batches)
  imgs = array_ops.stack(real_images_list + generated_images_list)

  # Compute the activations using the memory-efficient `map_fn`.
  activations = functional_ops.map_fn(
      fn=classifier_fn,
      elems=imgs,
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')

  # Split the activations by the real and generated images.
  real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

  # Merge the per-batch activations back into one rank-2 tensor per set.
  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
  real_a.shape.assert_has_rank(2)
  gen_a.shape.assert_has_rank(2)

  # Per-feature means of each activation set.
  m = math_ops.reduce_mean(real_a, 0)
  m_v = math_ops.reduce_mean(gen_a, 0)

  # Sample counts (rows), expressed in the activations' dtype so the divisions
  # below do not mix dtypes.
  num_real = math_ops.cast(array_ops.shape(real_a)[0], real_a.dtype)
  num_gen = math_ops.cast(array_ops.shape(gen_a)[0], gen_a.dtype)

  # Feature covariance: sigma = (1 / (n - 1)) * (X - mu)^T (X - mu).
  # Each set is centred on its OWN mean, and `transpose_a` makes the product
  # a [features, features] matrix rather than an [examples, examples] Gram
  # matrix.
  real_centered = real_a - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (num_real - 1)
  gen_centered = gen_a - m_v
  sigma_v = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (num_gen - 1)

  # Take matrix square root of the product of covariance matrices.
  sqcc = _matrix_square_root(math_ops.matmul(sigma, sigma_v))

  # Compute the two components of FID.
  trace = math_ops.trace(sigma + sigma_v - 2.0 * sqcc)
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean

  return fid
def frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier. The technique is described in detail in
  https://arxiv.org/abs/1706.08500. Given two Gaussian distributions with
  means m and m_v and covariance matrices C and C_v, this function calculates

      |m - m_v|^2 + Tr(C + C_v - 2(C * C_v)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their classifier activations) are.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes (e.g.
  even if the two distributions are the same, for a small sample size the
  expected Frechet distance is large). It is important to use the same sample
  size when comparing two generative models.

  Args:
    real_activations: Rank-2 tensor of activations computed on real data.
    generated_activations: Rank-2 tensor of activations computed on generated
      data.

  Returns:
    The Frechet classifier distance. A floating-point scalar of the same type
    as `real_activations`.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # Work in float64 and cast the result back to the caller's dtype at the end.
  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_v = math_ops.reduce_mean(generated_activations, 0)

  # Each covariance is normalised by the sample count of its own set, so the
  # estimate stays correct when the two sets differ in size.
  num_examples_real = math_ops.to_double(
      array_ops.shape(real_activations)[0])
  num_examples_generated = math_ops.to_double(
      array_ops.shape(generated_activations)[0])

  # sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
  real_centered = real_activations - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (
          num_examples_real - 1)

  gen_centered = generated_activations - m_v
  sigma_v = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (
          num_examples_generated - 1)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID.
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B).
  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean
  if activations_dtype != dtypes.float64:
    fid = math_ops.cast(fid, activations_dtype)

  return fid
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
  """Classifier distance for evaluating a generative model.

  This method computes the Frechet classifier distance from activations of
  real images and generated images. It can be used independently of
  `frechet_classifier_distance()`, especially when evaluating with large
  batches where all activations are precomputed before the distance is taken.

  The technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

      |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their classifier activations) are.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes (e.g.
  even if the two distributions are the same, for a small sample size the
  expected Frechet distance is large). It is important to use the same sample
  size when comparing two generative models.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
    The Frechet classifier distance. A floating-point scalar of the same type
    as `real_activations`.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # Work in float64 and cast the result back to the caller's dtype at the end.
  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_w = math_ops.reduce_mean(generated_activations, 0)

  # Each covariance is normalised by the sample count of its own set, so the
  # estimate stays correct when the two sets differ in size.
  num_examples_real = math_ops.to_double(
      array_ops.shape(real_activations)[0])
  num_examples_generated = math_ops.to_double(
      array_ops.shape(generated_activations)[0])

  # sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
  real_centered = real_activations - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (
          num_examples_real - 1)

  gen_centered = generated_activations - m_w
  sigma_w = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (
          num_examples_generated - 1)

  # Find the Tr(sqrt(sigma sigma_w)) component of FID.
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B).
  trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_w))  # This uses the L2 norm.
  fid = trace + mean
  if activations_dtype != dtypes.float64:
    fid = math_ops.cast(fid, activations_dtype)

  return fid
def _init_norm(self, weights):
  """Computes the per-output-channel norm of `weights`.

  `weights` is flattened to `[-1, self.layer_depth]`, the norm is taken down
  each column, and the result is returned with shape `(self.layer_depth,)`.
  """
  from tensorflow.python.ops.linalg_ops import norm

  depth = self.layer_depth
  with name_scope('init_norm'):
    weights_2d = array_ops.reshape(weights, [-1, depth])
    column_norms = norm(weights_2d, axis=0)
    return array_ops.reshape(column_norms, (depth,))
def _show_norm(tensor):
  """Returns the norm of `tensor` (computed in float32) as a shape-[1] tensor."""
  as_float = math_ops.cast(tensor, dtypes.float32)
  scalar_norm = linalg_ops.norm(as_float)
  # The result must have shape [1]; reshape so that holds even when the static
  # shape information is missing.
  return array_ops.reshape(scalar_norm, [1])
def conjugate_gradient(operator,
                       rhs,
                       preconditioner=None,
                       x=None,
                       tol=1e-4,
                       max_iter=20,
                       name="conjugate_gradient"):
  r"""Conjugate gradient solver.

  Solves a linear system of equations `A*x = rhs` for a selfadjoint, positive
  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
  matrix-free algorithm in which the action of `A` is supplied by `operator`.
  Iteration stops once `max_iter` steps have run or the residual norm is no
  larger than `tol` times the norm of the initial residual (which is `rhs`
  itself when no initial guess is given, i.e.
  \\(||rhs - A x_k|| <= tol ||rhs||\\)).

  Args:
    operator: An object representing a linear operator with attributes:
      - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of
        length 2. `shape[0]` is the dimension of the domain of the operator,
        `shape[1]` is the dimension of the co-domain. In other words, if
        `operator` represents an N x N matrix A, `shape` must contain
        `[N, N]`.
      - dtype: The datatype of input to and output from `apply`.
      - apply: Callable taking a vector `x` and returning the result of
        applying the operator to it, i.e. `A * x`.
    rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand side
      vector.
    preconditioner: An object representing a linear operator, see `operator`
      for details. It should approximate the inverse of `A`; a good
      preconditioner can dramatically improve the rate of convergence. The
      algorithm uses `preconditioner.apply(x)` to estimate `A^{-1}x`, so
      applying it should be much cheaper than computing `A^{-1}` directly.
    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
      solution. Defaults to zeros.
    tol: A float scalar convergence tolerance.
    max_iter: An integer giving the maximum number of iterations.
    name: A name scope for the operation.

  Returns:
    output: A namedtuple representing the final state with fields:
      - i: A scalar `int32` `Tensor`. Number of iterations executed.
      - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
      - r: A rank-1 `Tensor` containing the residual vector.
      - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
      - gamma: \\(r \dot M \dot r\\), equivalent to \\(||r||_2^2\\) when
        `preconditioner=None`.
  """
  # Lightweight record type carrying the solver state between iterations.
  cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])

  def _precondition(residual):
    """Applies the preconditioner, or returns `residual` if there is none."""
    if preconditioner is None:
      return residual
    return preconditioner.apply(residual)

  def _keep_iterating(i, state):
    """Loop predicate: iteration budget left and residual still above tol."""
    within_budget = i < max_iter
    # `tol` is read when the loop is built, i.e. after it has been rescaled
    # by the initial residual norm below.
    residual_too_large = linalg_ops.norm(state.r) > tol
    return math_ops.logical_and(within_budget, residual_too_large)

  def _advance(i, state):
    """Performs one conjugate-gradient update."""
    a_times_p = operator.apply(state.p)
    step = state.gamma / util.dot(state.p, a_times_p)
    next_x = state.x + step * state.p
    next_r = state.r - step * a_times_p
    direction = _precondition(next_r)
    next_gamma = util.dot(next_r, direction)
    mix = next_gamma / state.gamma
    next_p = direction + mix * state.p
    return i + 1, cg_state(i + 1, next_x, next_r, next_p, next_gamma)

  with ops.name_scope(name):
    solution_shape = operator.shape[1:]
    # Vectors are handled as single-column matrices inside the loop.
    rhs = array_ops.expand_dims(rhs, -1)
    if x is None:
      zeros = array_ops.zeros(solution_shape, dtype=rhs.dtype.base_dtype)
      x = array_ops.expand_dims(zeros, -1)
      r0 = rhs
    else:
      x = array_ops.expand_dims(x, -1)
      r0 = rhs - operator.apply(x)

    p0 = _precondition(r0)
    gamma0 = util.dot(r0, p0)
    # Turn the relative tolerance into an absolute one.
    tol *= linalg_ops.norm(r0)

    i = constant_op.constant(0, dtype=dtypes.int32)
    initial_state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
    _, final_state = control_flow_ops.while_loop(
        _keep_iterating, _advance, [i, initial_state])

    # Drop the column dimension that was added above.
    return cg_state(
        final_state.i,
        x=array_ops.squeeze(final_state.x),
        r=array_ops.squeeze(final_state.r),
        p=array_ops.squeeze(final_state.p),
        gamma=final_state.gamma)
def frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  This method computes the Frechet classifier distance from activations of
  real images and generated images. It can be used independently of
  `frechet_classifier_distance()`, especially when evaluating with large
  batches where all activations are precomputed before the distance is taken.

  The technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_v and covariance
  matrices C and C_v, this function calculates

      |m - m_v|^2 + Tr(C + C_v - 2(C * C_v)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their classifier activations) are.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
    The Frechet classifier distance. A floating-point scalar of the same type
    as `real_activations`.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # Work in float64 and cast the result back to the caller's dtype at the end.
  activations_dtype = real_activations.dtype
  if activations_dtype != dtypes.float64:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_v = math_ops.reduce_mean(generated_activations, 0)

  # Each covariance is normalised by the sample count of its own set, so the
  # estimate stays correct when the two sets differ in size.
  num_examples_real = math_ops.to_double(
      array_ops.shape(real_activations)[0])
  num_examples_generated = math_ops.to_double(
      array_ops.shape(generated_activations)[0])

  # sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
  real_centered = real_activations - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (
          num_examples_real - 1)

  gen_centered = generated_activations - m_v
  sigma_v = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (
          num_examples_generated - 1)

  # Find the Tr(sqrt(sigma sigma_v)) component of FID.
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B).
  trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
  fid = trace + mean
  if activations_dtype != dtypes.float64:
    fid = math_ops.cast(fid, activations_dtype)

  return fid
def conjugate_gradient(operator,
                       rhs,
                       preconditioner=None,
                       x=None,
                       tol=1e-4,
                       max_iter=20,
                       name="conjugate_gradient"):
  r"""Conjugate gradient solver.

  Solves a linear system of equations `A*x = rhs` for a selfadjoint, positive
  definite matrix `A` and right-hand side vector `rhs`, using an iterative,
  matrix-free algorithm in which the action of `A` is supplied by `operator`.
  Iteration stops once `max_iter` steps have run or the residual norm is no
  larger than `tol` times the norm of the initial residual (which is `rhs`
  itself when no initial guess is given, i.e.
  \\(||rhs - A x_k|| <= tol ||rhs||\\)).

  Args:
    operator: An object representing a linear operator with attributes:
      - shape: Either a list of integers or a 1-D `Tensor` of type `int32` of
        length 2. `shape[0]` is the dimension of the domain of the operator,
        `shape[1]` is the dimension of the co-domain. In other words, if
        `operator` represents an N x N matrix A, `shape` must contain
        `[N, N]`.
      - dtype: The datatype of input to and output from `apply`.
      - apply: Callable taking a vector `x` and returning the result of
        applying the operator to it, i.e. `A * x`.
    rhs: A rank-1 `Tensor` of shape `[N]` containing the right-hand side
      vector.
    preconditioner: An object representing a linear operator, see `operator`
      for details. It should approximate the inverse of `A`; a good
      preconditioner can dramatically improve the rate of convergence. The
      algorithm uses `preconditioner.apply(x)` to estimate `A^{-1}x`, so
      applying it should be much cheaper than computing `A^{-1}` directly.
    x: A rank-1 `Tensor` of shape `[N]` containing the initial guess for the
      solution. Defaults to zeros.
    tol: A float scalar convergence tolerance.
    max_iter: An integer giving the maximum number of iterations.
    name: A name scope for the operation.

  Returns:
    output: A namedtuple representing the final state with fields:
      - i: A scalar `int32` `Tensor`. Number of iterations executed.
      - x: A rank-1 `Tensor` of shape `[N]` containing the computed solution.
      - r: A rank-1 `Tensor` containing the residual vector.
      - p: A rank-1 `Tensor` of shape `[N]`. `A`-conjugate basis vector.
      - gamma: \\(r \dot M \dot r\\), equivalent to \\(||r||_2^2\\) when
        `preconditioner=None`.
  """
  # Lightweight record type carrying the solver state between iterations.
  cg_state = collections.namedtuple("CGState", ["i", "x", "r", "p", "gamma"])

  def _precondition(residual):
    """Applies the preconditioner, or returns `residual` if there is none."""
    if preconditioner is None:
      return residual
    return preconditioner.apply(residual)

  def _keep_iterating(i, state):
    """Loop predicate: iteration budget left and residual still above tol."""
    within_budget = i < max_iter
    # `tol` is read when the loop is built, i.e. after it has been rescaled
    # by the initial residual norm below.
    residual_too_large = linalg_ops.norm(state.r) > tol
    return math_ops.logical_and(within_budget, residual_too_large)

  def _advance(i, state):
    """Performs one conjugate-gradient update."""
    a_times_p = operator.apply(state.p)
    step = state.gamma / util.dot(state.p, a_times_p)
    next_x = state.x + step * state.p
    next_r = state.r - step * a_times_p
    direction = _precondition(next_r)
    next_gamma = util.dot(next_r, direction)
    mix = next_gamma / state.gamma
    next_p = direction + mix * state.p
    return i + 1, cg_state(i + 1, next_x, next_r, next_p, next_gamma)

  with ops.name_scope(name):
    solution_shape = operator.shape[1:]
    # Vectors are handled as single-column matrices inside the loop.
    rhs = array_ops.expand_dims(rhs, -1)
    if x is None:
      zeros = array_ops.zeros(solution_shape, dtype=rhs.dtype.base_dtype)
      x = array_ops.expand_dims(zeros, -1)
      r0 = rhs
    else:
      x = array_ops.expand_dims(x, -1)
      r0 = rhs - operator.apply(x)

    p0 = _precondition(r0)
    gamma0 = util.dot(r0, p0)
    # Turn the relative tolerance into an absolute one.
    tol *= linalg_ops.norm(r0)

    i = constant_op.constant(0, dtype=dtypes.int32)
    initial_state = cg_state(i=i, x=x, r=r0, p=p0, gamma=gamma0)
    _, final_state = control_flow_ops.while_loop(
        _keep_iterating, _advance, [i, initial_state])

    # Drop the column dimension that was added above.
    return cg_state(
        final_state.i,
        x=array_ops.squeeze(final_state.x),
        r=array_ops.squeeze(final_state.r),
        p=array_ops.squeeze(final_state.p),
        gamma=final_state.gamma)
def stopping_criterion(i, state):
  """Returns True while iteration should continue.

  Continues as long as `i` is below `max_iter` and the norm of the residual
  `state.r` is still greater than `tol`. Both `max_iter` and `tol` come from
  the enclosing scope.
  """
  under_iteration_cap = i < max_iter
  residual_above_tol = linalg_ops.norm(state.r) > tol
  return math_ops.logical_and(under_iteration_cap, residual_above_tol)
def diagonal_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Frechet classifier distance using diagonal-only covariances.

  Variant of the Frechet Inception distance
  (https://arxiv.org/abs/1706.08500) for an arbitrary classifier. Each set of
  activations is summarised by its per-feature mean and variance, and the
  function evaluates

      |m - m_w|^2 + sum(var + var_w - 2 * sqrt(var * var_w))

  Because only the diagonal of each covariance is used, no matrix square root
  is needed.

  Estimates built from sample moments are biased, and more so for small
  samples, so compare models using the same sample size.

  Args:
    real_activations: Rank-2 tensor of activations computed on real data.
    generated_activations: Rank-2 tensor of activations computed on generated
      data.

  Returns:
    The diagonal-only Frechet distance, a scalar of the same dtype as
    `real_activations`.

  Raises:
    ValueError: If the variance and mean vectors have different shapes.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # All arithmetic is carried out in float64; remember the caller's dtype so
  # the result can be cast back at the end.
  original_dtype = real_activations.dtype
  needs_upcast = original_dtype != dtypes.float64
  if needs_upcast:
    real_activations = math_ops.to_double(real_activations)
    generated_activations = math_ops.to_double(generated_activations)

  # Per-feature mean and variance over the example axis.
  real_mean, real_var = nn_impl.moments(real_activations, axes=[0])
  gen_mean, gen_var = nn_impl.moments(generated_activations, axes=[0])

  var_shape = real_var.get_shape()
  mean_shape = real_mean.get_shape()
  if var_shape != mean_shape:
    raise ValueError('shape: {} must match expected shape: {}'.format(
        var_shape, mean_shape))

  # Variance contribution: the diagonal analogue of
  # Tr(C + C_w - 2 * (C * C_w)^(1/2)).
  variance_sum = real_var + gen_var
  cross_term = 2.0 * math_ops.sqrt(math_ops.multiply(real_var, gen_var))
  variance_term = math_ops.reduce_sum(variance_sum - cross_term)

  # Mean contribution: squared L2 distance between the two mean vectors.
  mean_term = math_ops.square(linalg_ops.norm(real_mean - gen_mean))

  distance = variance_term + mean_term
  return math_ops.cast(distance, original_dtype) if needs_upcast else distance
def process_quadrature_grid_and_probs(
    quadrature_grid_and_probs, dtype, validate_args, name=None):
  """Validates a quadrature grid/probs pair, or builds the default one.

  Args:
    quadrature_grid_and_probs: Python pair of `float`-like `Tensor`s giving
      the sample points and their (possibly unnormalized) weights. When
      `None`, `np.polynomial.hermite.hermgauss(deg=8)` is used.
    dtype: The `dtype` that `grid` and `probs` are converted to.
    validate_args: Python `bool`. When `True` and the leading dimensions are
      not statically known, a runtime assertion that they match is attached
      to `grid`.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    A `(grid, probs)` pair of `Tensor`s; `probs` is divided by its L1 norm
    along the last axis.

  Raises:
    ValueError: If both leading dimensions are statically known and differ.
  """
  with ops.name_scope(name, "process_quadrature_grid_and_probs",
                      [quadrature_grid_and_probs]):
    if quadrature_grid_and_probs is None:
      # Default: 8-point Gauss-Hermite rule, normalised in NumPy before
      # conversion to tensors.
      np_dtype = dtype.as_numpy_dtype
      default_grid, default_probs = np.polynomial.hermite.hermgauss(deg=8)
      default_grid = default_grid.astype(np_dtype)
      default_probs = default_probs.astype(np_dtype)
      default_probs /= np.linalg.norm(default_probs, ord=1, keepdims=True)
      return (ops.convert_to_tensor(default_grid, name="grid", dtype=dtype),
              ops.convert_to_tensor(default_probs, name="probs", dtype=dtype))

    grid, probs = tuple(quadrature_grid_and_probs)
    grid = ops.convert_to_tensor(grid, name="grid", dtype=dtype)
    raw_probs = ops.convert_to_tensor(
        probs, name="unnormalized_probs", dtype=dtype)
    l1_norm = linalg_ops.norm(
        raw_probs, ord=1, axis=-1, keep_dims=True, name="probs")
    probs = raw_probs / l1_norm

    # Static size of dimension 0 for each tensor (`None` when unknown).
    num_probs = probs.shape.with_rank_at_least(1)[0].value
    num_grid = grid.shape.with_rank_at_least(1)[0].value

    sizes_known = num_probs is not None and num_grid is not None
    if sizes_known:
      if num_probs != num_grid:
        raise ValueError("`quadrature_grid_and_probs` must be a `tuple` of "
                         "same-length zero-th-dimension `Tensor`s "
                         "(saw lengths {}, {})".format(num_probs, num_grid))
      return grid, probs

    if validate_args:
      # Sizes are only known at run time: make `grid` depend on an equality
      # assertion so a mismatch is reported when the graph executes.
      same_length = check_ops.assert_equal(
          dimension_size(probs, axis=0),
          dimension_size(grid, axis=0),
          message=("`quadrature_grid_and_probs` must be a `tuple` of "
                   "same-length zero-th-dimension `Tensor`s"))
      grid = control_flow_ops.with_dependencies([same_length], grid)
    return grid, probs
def _compute_power_iter(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name,
                        iter_count=100, epsilon=1e-6):
  """Computes mat_g^alpha, where alpha = -1/p, p a positive integer.

  Uses the iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    var: the variable we are updating.
    mat_g: the symmetric PSD matrix whose power is to be computed.
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    mat_h_slot_name: name of slot to store the power, if needed.
    iter_count: Maximum number of iterations.
    epsilon: accuracy indicator, useful for early termination.

  Returns:
    mat_g^alpha, or the op assigning it to the slot when `mat_h_slot_name`
    is given.
  """
  identity = linalg_ops.eye(math_ops.to_int32(mat_g_size))

  def _mat_power(base, exponent):
    """Computes base^exponent by repeated squaring.

    `exponent` is a positive integer value known when the graph is built, so
    the squaring is unrolled in Python rather than expressed as a graph loop.
    """
    assert exponent == int(exponent) and exponent > 0
    result = None
    while exponent > 0:
      if exponent % 2 == 1:
        if result is None:
          result = base
        else:
          result = math_ops.matmul(base, result)
      exponent //= 2
      base = math_ops.matmul(base, base)
    return result

  def _not_converged(i, mat_m, _):
    """Continue while under the cap and mat_m is not yet near identity."""
    under_cap = i < iter_count
    max_deviation = math_ops.reduce_max(math_ops.abs(mat_m - identity))
    return math_ops.logical_and(under_cap, max_deviation > epsilon)

  def _iterate(i, mat_m, mat_x):
    """One Schur-Newton step updating both coupled iterates."""
    mat_m_i = (1 - alpha) * identity + alpha * mat_m
    next_i = i + 1
    next_m = math_ops.matmul(_mat_power(mat_m_i, -1.0/alpha), mat_m)
    next_x = math_ops.matmul(mat_x, mat_m_i)
    return (next_i, next_m, next_x)

  if mat_g_size == 1:
    # Size-1 case: an elementwise power is all that is needed.
    mat_h = math_ops.pow(mat_g + self._epsilon, alpha)
  else:
    damped_mat_g = mat_g + self._epsilon * identity
    z = (1 - 1 / alpha) / (2 * linalg_ops.norm(damped_mat_g))
    # The best value for z is
    # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
    #                 (c_max^{1-alpha} - c_min^{1-alpha})
    # where c_max and c_min are the largest and smallest singular values of
    # damped_mat_g.
    # The estimate above assumes that c_max > c_min * 2^p. (p = -1/alpha)
    # It could be replaced by the line below, but that is less accurate and
    # hence needs more iterations to converge.
    # z = (1 - 1/alpha) / math_ops.trace(damped_mat_g)
    # If the method must always converge, use z = 1 / norm(damped_mat_g)
    # or z = 1 / math_ops.trace(damped_mat_g), but these can result in many
    # extra iterations.
    initial_m = damped_mat_g * z
    initial_x = identity * math_ops.pow(z, -alpha)
    _, _, mat_h = control_flow_ops.while_loop(
        _not_converged, _iterate, [0, initial_m, initial_x])

  if mat_h_slot_name is None:
    return mat_h
  return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h)
def apply_gradients(self, grads_and_vars, global_step, name=None,
                    manual_fp16=False):
  """Builds the LAMB update ops for the given gradients. See base class.

  Args:
    grads_and_vars: Iterable of `(gradient, variable)` pairs. Pairs in which
      either element is `None` are skipped.
    global_step: Step counter used for the Adam bias corrections.
    name: Optional name for the returned grouped op.
    manual_fp16: When `True`, every non-float32 variable gets a float32
      "shadow" copy; the update is computed on the shadow and the result is
      cast back and assigned to the original variable.

  Returns:
    A single op grouping all variable assignments.
  """
  assignments = []
  steps = tf.cast(global_step, tf.float32)

  # The bias corrections do not depend on the parameter, so build them once.
  # NOTE(review): with global_step == 0 both corrections are exactly zero and
  # the divisions below are non-finite; presumably callers pass a 1-based
  # step -- confirm.
  beta1_correction = (1 - self.beta_1**steps)
  beta2_correction = (1 - self.beta_2**steps)

  for (grad, param) in grads_and_vars:
    if grad is None or param is None:
      continue

    param_name = self._get_variable_name(param.name)
    has_shadow = manual_fp16 and param.dtype.base_dtype != tf.float32
    if has_shadow:
      # create shadow fp32 weights for fp16 variable
      param_fp32 = tf.get_variable(
          name=param_name + "/shadow",
          dtype=tf.float32,
          trainable=False,
          initializer=tf.cast(param.initialized_value(), tf.float32))
    else:
      param_fp32 = param

    m = tf.get_variable(
        name=param_name + "/adam_m",
        shape=param.shape.as_list(),
        dtype=tf.float32,
        trainable=False,
        initializer=tf.zeros_initializer())
    v = tf.get_variable(
        name=param_name + "/adam_v",
        shape=param.shape.as_list(),
        dtype=tf.float32,
        trainable=False,
        initializer=tf.zeros_initializer())

    # LAMB update
    next_m = (tf.multiply(self.beta_1, m) +
              tf.multiply(1.0 - self.beta_1, grad))
    next_v = (tf.multiply(self.beta_2, v) +
              tf.multiply(1.0 - self.beta_2, tf.square(grad)))

    next_m_unbiased = next_m / beta1_correction
    next_v_unbiased = next_v / beta2_correction

    update = next_m_unbiased / (tf.sqrt(next_v_unbiased) + self.epsilon)

    # Just adding the square of the weights to the loss function is *not*
    # the correct way of using L2 regularization/weight decay with Adam,
    # since that will interact with the m and v parameters in strange ways.
    #
    # Instead we want to decay the weights in a manner that doesn't interact
    # with the m/v parameters. This is equivalent to adding the square
    # of the weights to the loss with plain (non-momentum) SGD.
    if self._do_use_weight_decay(param_name):
      update += self.weight_decay_rate * param_fp32

    # Trust ratio ||w|| / ||update||, falling back to 1.0 when either norm is
    # zero. The weight norm is taken on the fp32 master copy so that it has
    # the same dtype as the fp32 update norm.
    w_norm = linalg_ops.norm(param_fp32, ord=2)
    g_norm = linalg_ops.norm(update, ord=2)
    ratio = array_ops.where(
        math_ops.greater(w_norm, 0),
        array_ops.where(math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0),
        1.0)

    update_with_lr = ratio * self.learning_rate * update

    next_param = param_fp32 - update_with_lr

    if has_shadow:
      # Cast the new fp32 weights down and assign them to the trainable
      # variable. The op must be part of the returned group, otherwise it is
      # never executed in graph mode and the variable keeps its old value.
      assignments.append(
          param.assign(tf.cast(next_param, param.dtype.base_dtype)))
    assignments.extend([
        param_fp32.assign(next_param),
        m.assign(next_m),
        v.assign(next_v)
    ])
  return tf.group(*assignments, name=name)