def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
  """Calculate the mean and variance based on the sufficient statistics.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the
      (possibly shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.op_scope([counts, mean_ss, variance_ss, shift], name, "normalize"):
    divisor = math_ops.inv(counts, name="divisor")
    if shift is not None:
      shifted_mean = math_ops.mul(mean_ss, divisor, name="shifted_mean")
      mean = math_ops.add(shifted_mean, shift, name="mean")
    else:  # no shift.
      shifted_mean = math_ops.mul(mean_ss, divisor, name="mean")
      mean = shifted_mean
    variance = math_ops.sub(math_ops.mul(variance_ss, divisor),
                            math_ops.square(shifted_mean),
                            name="variance")
  return (mean, variance)

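A minimal NumPy sketch (illustration only, not part of the library) of the identity this op relies on: with a shift `h`, `mean_ss = sum(x - h)` and `variance_ss = sum((x - h)**2)`, and dividing by the count recovers the unshifted mean and variance.

import numpy as np

x = np.random.randn(1000)
shift = x[0]                              # any value near the data works as a shift
counts = float(x.size)
mean_ss = np.sum(x - shift)               # (possibly shifted) sum of the elements
variance_ss = np.sum((x - shift) ** 2)    # (possibly shifted) squared sum

shifted_mean = mean_ss / counts
mean = shifted_mean + shift
variance = variance_ss / counts - shifted_mean ** 2

assert np.allclose(mean, x.mean())
assert np.allclose(variance, x.var())
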
def _TanGrad(op, grad):
  """Returns grad * sec^2(x), i.e. grad / cos^2(x)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    secx = math_ops.inv(math_ops.cos(x))
    secx2 = math_ops.square(secx)
    return grad * secx2

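As a sanity check on the formula (d/dx tan x = sec^2 x), a small finite-difference comparison in plain NumPy, independent of TensorFlow:

import numpy as np

x = np.linspace(-1.0, 1.0, 9)
eps = 1e-6
numeric = (np.tan(x + eps) - np.tan(x - eps)) / (2 * eps)   # central difference
analytic = 1.0 / np.cos(x) ** 2                             # sec^2(x)
assert np.allclose(numeric, analytic, atol=1e-4)
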
def _AtanGrad(op, grad):
  """Returns grad * 1 / (1 + x^2)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    x2 = math_ops.square(x)
    one = constant_op.constant(1, dtype=grad.dtype)
    inv = math_ops.inv(math_ops.add(one, x2))
    return grad * inv

def dropout(self, input_, keep_prob):
  with ops.op_scope([input_], None, "dropout") as name:
    rands = keep_prob + random_ops.random_uniform(array_ops.shape(input_))
    floored = math_ops.floor(rands)
    ret = input_ * math_ops.inv(keep_prob) * floored
    ret.set_shape(input_.get_shape())
    return ret

def moments(x, axes, name=None, keep_dims=False):
  """Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`. If `x` is 1-D and `axes = [0]` this is just the mean and
  variance of a vector.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):
    * for so-called "global normalization", used with convolutional filters
      with shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
    * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: array of ints. Axes along which to compute mean and variance.
    keep_dims: produce moments with the same dimensionality as the input.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.op_scope([x, axes], name, "moments"):
    x = ops.convert_to_tensor(x, name="x")
    x_shape = x.get_shape()
    if all(x_shape[d].value is not None for d in axes):
      # The shape is known in the relevant axes, so we can statically
      # compute the divisor.
      divisor = 1.0
      for d in set(axes):
        divisor *= x.get_shape()[d].value
      divisor = constant_op.constant(1.0 / divisor, x.dtype, name="divisor")
    else:
      divisor = constant_op.constant(1.0, dtype=x.dtype)
      x_dynamic_shape = array_ops.shape(x)
      for d in set(axes):
        divisor *= math_ops.cast(x_dynamic_shape[d], x.dtype)
      divisor = math_ops.inv(divisor, name="divisor")
    constant_axes = constant_op.constant(axes, name="axes")
    # Note: We do not use Mean here because it is very slow on GPU.
    mean = math_ops.mul(
        math_ops.reduce_sum(x, constant_axes, keep_dims=True),
        divisor,
        name="mean")
    var = math_ops.mul(
        math_ops.reduce_sum(
            math_ops.squared_difference(x, mean),
            constant_axes,
            keep_dims=keep_dims),
        divisor,
        name="variance")
    if keep_dims:
      return mean, var
    else:
      return array_ops.squeeze(mean, squeeze_dims=axes), var

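A NumPy sketch (for illustration only) of the divisor trick above: multiplying the reduced sums by 1 / prod(shape[d] for d in axes) reproduces the mean and variance over those axes.

import numpy as np

x = np.random.rand(4, 5, 5, 3).astype(np.float32)
axes = (0, 1, 2)                                    # "global normalization" axes
divisor = 1.0 / np.prod([x.shape[d] for d in axes])
mean = np.sum(x, axis=axes, keepdims=True) * divisor
var = np.sum((x - mean) ** 2, axis=axes, keepdims=True) * divisor

assert np.allclose(mean.squeeze(), x.mean(axis=axes), atol=1e-5)
assert np.allclose(var.squeeze(), x.var(axis=axes), atol=1e-5)
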
def _AcosGrad(op, grad):
  """Returns grad * -1/sqrt(1-x^2)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    x2 = math_ops.square(x)
    one = constant_op.constant(1, dtype=grad.dtype)
    den = math_ops.sqrt(math_ops.sub(one, x2))
    inv = math_ops.inv(den)
    return -grad * inv

def _SegmentMeanGrad(op, grad):
  """Gradient for SegmentMean."""
  input_rank = array_ops.rank(op.inputs[0])
  ones_shape = array_ops.concat(
      0, [array_ops.shape(op.inputs[1]),
          array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)])
  ones = array_ops.fill(ones_shape,
                        constant_op.constant(1, dtype=grad.dtype))
  scaled_grad = grad * math_ops.inv(math_ops.segment_sum(ones, op.inputs[1]))
  return array_ops.gather(scaled_grad, op.inputs[1]), None

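The logic can be restated in plain NumPy (a hypothetical sketch, not the op itself): each input element of a segment receives the segment's upstream gradient divided by the segment size.

import numpy as np

segment_ids = np.array([0, 0, 1, 1, 1])                       # op.inputs[1]
grad = np.array([10.0, 30.0])                                 # upstream gradient, one per segment
segment_sizes = np.bincount(segment_ids).astype(np.float64)   # segment_sum(ones)
scaled_grad = grad / segment_sizes                            # grad * inv(segment sizes)
input_grad = scaled_grad[segment_ids]                         # gather back to the input layout
# input_grad == [5., 5., 10., 10., 10.]
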
def moments(x, axes, name=None):
  """Calculate the mean and variance of `x`.

  The mean and variance are calculated by aggregating the contents of `x`
  across `axes`. If `x` is 1-D and `axes = [0]` this is just the mean and
  variance of a vector.

  For so-called "global normalization" needed for convolutional filters pass
  `axes=[0, 1, 2]` (batch, height, width). For batch normalization pass
  `axes=[0]` (batch).

  Args:
    x: A `Tensor`.
    axes: array of ints. Axes along which to compute mean and variance.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.op_scope([x, axes], name, "moments"):
    x = ops.convert_to_tensor(x, name="x")
    x_shape = x.get_shape()
    if all(x_shape[d].value is not None for d in axes):
      # The shape is known in the relevant axes, so we can statically
      # compute the divisor.
      divisor = 1.0
      for d in set(axes):
        divisor *= x.get_shape()[d].value
      divisor = constant_op.constant(1.0 / divisor, x.dtype, name="divisor")
    else:
      divisor = constant_op.constant(1.0, dtype=x.dtype)
      x_dynamic_shape = array_ops.shape(x)
      for d in set(axes):
        divisor *= math_ops.cast(x_dynamic_shape[d], x.dtype)
      divisor = math_ops.inv(divisor, name="divisor")
    axes = constant_op.constant(axes, name="axes")
    # Note: We do not use Mean here because it is very slow on GPU.
    # Note 2: The expression below is potentially more stable.
    # It is however a bit slower and stability doesn't appear to be an issue.
    # mean = math_ops.reduce_sum(math_ops.mul(x, divisor), axes, name="mean")
    # var = math_ops.reduce_sum(math_ops.mul(math_ops.square(x - mean),
    #                                        divisor), axes,
    #                           name="variance")
    mean = math_ops.mul(math_ops.reduce_sum(x, axes), divisor, name="mean")
    # Give x-mean a specific name, so the caller might take advantage of it.
    # The caller should have a fallback plan, however: this tensor may not be
    # available if this function implementation changes.
    x_centered = math_ops.sub(x, mean, name="x_centered")
    var = math_ops.mul(math_ops.reduce_sum(math_ops.square(x_centered), axes),
                       divisor, name="variance")
    return mean, var

def _SegmentMeanGrad(op, grad):
  """Gradient for SegmentMean."""
  input_rank = array_ops.rank(op.inputs[0])
  ones_shape = array_ops.concat(
      0, [array_ops.shape(op.inputs[1]),
          array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)])
  ones = array_ops.fill(ones_shape,
                        constant_op.constant(1, dtype=grad.dtype))
  scaled_grad = grad * math_ops.inv(math_ops.segment_sum(ones, op.inputs[1]))
  return array_ops.gather(scaled_grad, op.inputs[1]), None

def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):
  """Computes dropout.

  With probability `keep_prob`, outputs the input element scaled up by
  `1 / keep_prob`, otherwise outputs `0`. The scaling is so that the expected
  sum is unchanged.

  By default, each element is kept or dropped independently. If `noise_shape`
  is specified, it must be
  [broadcastable](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
  to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]`
  will make independent decisions. For example, if `shape(x) = [k, l, m, n]`
  and `noise_shape = [k, 1, 1, n]`, each batch and channel component will be
  kept independently and each row and column will be kept or not kept together.

  Args:
    x: A tensor.
    keep_prob: A scalar `Tensor` with the same type as x. The probability
      that each element is kept.
    noise_shape: A 1-D `Tensor` of type `int32`, representing the
      shape for randomly generated keep/drop flags.
    seed: A Python integer. Used to create random seeds. See
      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
      for behavior.
    name: A name for this operation (optional).

  Returns:
    A Tensor of the same shape as `x`.

  Raises:
    ValueError: If `keep_prob` is not in `(0, 1]`.
  """
  with ops.op_scope([x], name, "dropout") as name:
    x = ops.convert_to_tensor(x, name="x")
    if isinstance(keep_prob, float) and not 0 < keep_prob <= 1:
      raise ValueError("keep_prob must be a scalar tensor or a float in the "
                       "range (0, 1], got %g" % keep_prob)
    keep_prob = ops.convert_to_tensor(keep_prob,
                                      dtype=x.dtype,
                                      name="keep_prob")
    keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())

    noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
    # uniform [keep_prob, 1.0 + keep_prob)
    random_tensor = keep_prob
    random_tensor += random_ops.random_uniform(noise_shape,
                                               seed=seed,
                                               dtype=x.dtype)
    # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
    binary_tensor = math_ops.floor(random_tensor)
    ret = x * math_ops.inv(keep_prob) * binary_tensor
    ret.set_shape(x.get_shape())
    return ret

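A quick NumPy illustration (not from the library) of the floor trick and the 1/keep_prob rescaling: a uniform sample in [keep_prob, 1 + keep_prob) floors to 1 with probability keep_prob, and scaling the kept elements keeps the expected value unchanged.

import numpy as np

keep_prob = 0.8
n = 1_000_000
random_tensor = keep_prob + np.random.uniform(size=n)   # uniform [keep_prob, 1 + keep_prob)
binary_tensor = np.floor(random_tensor)                  # 1.0 with prob keep_prob, else 0.0
print(binary_tensor.mean())                              # ~0.8

x = np.ones(n)
dropped = x * (1.0 / keep_prob) * binary_tensor
print(dropped.mean())                                    # ~1.0: expected sum is preserved
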
def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):
  """Computes dropout.

  With probability `keep_prob`, outputs the input element scaled up by
  `1 / keep_prob`, otherwise outputs `0`. The scaling is so that the expected
  sum is unchanged.

  By default, each element is kept or dropped independently. If `noise_shape`
  is specified, it must be
  [broadcastable](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
  to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]`
  will make independent decisions. For example, if `shape(x) = [k, l, m, n]`
  and `noise_shape = [k, 1, 1, n]`, each batch and channel component will be
  kept independently and each row and column will be kept or not kept together.

  Args:
    x: A tensor.
    keep_prob: A scalar `Tensor` with the same type as x. The probability
      that each element is kept.
    noise_shape: A 1-D `Tensor` of type `int32`, representing the
      shape for randomly generated keep/drop flags.
    seed: A Python integer. Used to create random seeds. See
      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
      for behavior.
    name: A name for this operation (optional).

  Returns:
    A Tensor of the same shape as `x`.

  Raises:
    ValueError: If `keep_prob` is not in `(0, 1]`.
  """
  with ops.op_scope([x], name, "dropout") as name:
    x = ops.convert_to_tensor(x, name="x")
    if isinstance(keep_prob, float) and not 0 < keep_prob <= 1:
      raise ValueError("keep_prob must be a scalar tensor or a float in the "
                       "range (0, 1], got %g" % keep_prob)
    keep_prob = ops.convert_to_tensor(keep_prob,
                                      dtype=x.dtype,
                                      name="keep_prob")
    keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())

    noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
    # uniform [keep_prob, 1.0 + keep_prob)
    random_tensor = keep_prob
    random_tensor += random_ops.random_uniform(noise_shape,
                                               seed=seed,
                                               dtype=x.dtype)
    # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
    binary_tensor = math_ops.floor(random_tensor)
    ret = x * math_ops.inv(keep_prob) * binary_tensor
    ret.set_shape(x.get_shape())
    return ret

def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):
  with ops.op_scope([x], name, "dropout") as name:
    x = ops.convert_to_tensor(x, name="x")
    noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
    # uniform [keep_prob, 1.0 + keep_prob)
    random_tensor = keep_prob
    random_tensor += random_ops.random_uniform(noise_shape,
                                               seed=seed,
                                               dtype=x.dtype)
    # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
    binary_tensor = math_ops.floor(random_tensor)
    ret = x * math_ops.inv(tf.reduce_mean(keep_prob,
                                          reduction_indices=[1],
                                          keep_dims=True)) * binary_tensor
    ret.set_shape(x.get_shape())
    return ret

def per_image_whitening(image):
  """Linearly scales `image` to have zero mean and unit variance.

  This op computes `(x - mean) / adjusted_stddev`, where `mean` is the average
  of all values in image, and
  `adjusted_stddev = max(stddev, 1.0/sqrt(image.NumElements()))`.

  `stddev` is the standard deviation of all values in `image`. It is capped
  away from zero to protect against division by 0 when handling uniform images.

  Note that this implementation is limited:
    * It only whitens based on the statistics of an individual image.
    * It does not take into account the covariance structure.

  Args:
    image: 3-D tensor of shape `[height, width, channels]`.

  Returns:
    The whitened image with same shape as `image`.

  Raises:
    ValueError: if the shape of 'image' is incompatible with this function.
  """
  image = ops.convert_to_tensor(image, name='image')
  _Check3DImage(image, require_static=False)
  num_pixels = math_ops.reduce_prod(array_ops.shape(image))

  image = math_ops.cast(image, dtype=dtypes.float32)
  image_mean = math_ops.reduce_mean(image)

  variance = (math_ops.reduce_mean(math_ops.square(image)) -
              math_ops.square(image_mean))
  variance = gen_nn_ops.relu(variance)
  stddev = math_ops.sqrt(variance)

  # Apply a minimum normalization that protects us against uniform images.
  min_stddev = math_ops.inv(
      math_ops.sqrt(math_ops.cast(num_pixels, dtypes.float32)))
  pixel_value_scale = math_ops.maximum(stddev, min_stddev)
  pixel_value_offset = image_mean

  image = math_ops.sub(image, pixel_value_offset)
  image = math_ops.div(image, pixel_value_scale)
  return image

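The same computation in NumPy terms (a sketch, not the TensorFlow op): the stddev is floored at 1/sqrt(num_pixels), so a perfectly uniform image maps to zeros instead of triggering a division by zero.

import numpy as np

image = np.random.randint(0, 256, size=(32, 32, 3)).astype(np.float32)
num_pixels = image.size
mean = image.mean()
stddev = image.std()
adjusted_stddev = max(stddev, 1.0 / np.sqrt(num_pixels))
whitened = (image - mean) / adjusted_stddev

uniform = np.full((32, 32, 3), 7.0, dtype=np.float32)   # stddev == 0
safe = (uniform - uniform.mean()) / max(uniform.std(), 1.0 / np.sqrt(uniform.size))
assert np.all(safe == 0.0)
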
def _SelfAdjointEigV2Grad(op, grad_e, grad_v):
  """Gradient for SelfAdjointEigV2."""
  e = op.outputs[0]
  v = op.outputs[1]
  # a = op.inputs[0], which satisfies
  # a[...,:,:] * v[...,:,i] = e[...,i] * v[...,i]
  with ops.control_dependencies([grad_e.op, grad_v.op]):
    if grad_v is not None:
      # Construct the matrix f(i,j) = (i != j ? 1 / (e_i - e_j) : 0).
      # Notice that because of the term involving f, the gradient becomes
      # infinite (or NaN in practice) when eigenvalues are not unique.
      # Mathematically this should not be surprising, since for (k-fold)
      # degenerate eigenvalues, the corresponding eigenvectors are only defined
      # up to arbitrary rotation in a (k-dimensional) subspace.
      f = array_ops.matrix_set_diag(
          math_ops.inv(
              array_ops.expand_dims(e, -2) - array_ops.expand_dims(e, -1)),
          array_ops.zeros_like(e))
      grad_a = math_ops.batch_matmul(
          v,
          math_ops.batch_matmul(
              array_ops.matrix_diag(grad_e) +
              f * math_ops.batch_matmul(v, grad_v, adj_x=True),
              v,
              adj_y=True))
    else:
      grad_a = math_ops.batch_matmul(
          v,
          math_ops.batch_matmul(array_ops.matrix_diag(grad_e), v, adj_y=True))
    # The forward op only depends on the lower triangular part of a, so here we
    # symmetrize and take the lower triangle.
    grad_a = array_ops.matrix_band_part(
        grad_a + array_ops.matrix_transpose(grad_a), -1, 0)
    grad_a = array_ops.matrix_set_diag(
        grad_a, 0.5 * array_ops.matrix_diag_part(grad_a))
    return grad_a

def _SelfAdjointEigV2Grad(op, grad_e, grad_v):
  """Gradient for SelfAdjointEigV2."""
  e = op.outputs[0]
  v = op.outputs[1]
  # a = op.inputs[0], which satisfies
  # a[...,:,:] * v[...,:,i] = e[...,i] * v[...,i]
  with ops.control_dependencies([grad_e.op, grad_v.op]):
    if grad_v is not None:
      # Construct the matrix f(i,j) = (i != j ? 1 / (e_i - e_j) : 0).
      # Notice that because of the term involving f, the gradient becomes
      # infinite (or NaN in practice) when eigenvalues are not unique.
      # Mathematically this should not be surprising, since for (k-fold)
      # degenerate eigenvalues, the corresponding eigenvectors are only defined
      # up to arbitrary rotation in a (k-dimensional) subspace.
      f = array_ops.matrix_set_diag(
          math_ops.inv(
              array_ops.expand_dims(e, -2) - array_ops.expand_dims(e, -1)),
          array_ops.zeros_like(e))
      grad_a = math_ops.batch_matmul(
          v,
          math_ops.batch_matmul(
              array_ops.matrix_diag(grad_e) +
              f * math_ops.batch_matmul(v, grad_v, adj_x=True),
              v,
              adj_y=True))
    else:
      grad_a = math_ops.batch_matmul(
          v,
          math_ops.batch_matmul(array_ops.matrix_diag(grad_e), v, adj_y=True))
    # The forward op only depends on the lower triangular part of a, so here we
    # symmetrize and take the lower triangle.
    grad_a = array_ops.matrix_band_part(
        grad_a + array_ops.matrix_transpose(grad_a), -1, 0)
    grad_a = array_ops.matrix_set_diag(
        grad_a, 0.5 * array_ops.matrix_diag_part(grad_a))
    return grad_a

def _SqrtGrad(op, grad):
  y = op.outputs[0]  # y = x^(1/2)
  return grad * (.5 * math_ops.inv(y))

def _LogGrad(op, grad):
  """Returns grad * (1/x)."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    return grad * math_ops.inv(x)

def _RsqrtGrad(op, grad):
  x = op.inputs[0]
  y = op.outputs[0]  # y = x^(-1/2)
  with ops.control_dependencies([grad.op]):
    return grad * ((-0.5) * math_ops.inv(x) * y)

def _SqrtGrad(op, grad):
  y = op.outputs[0]  # y = x^(1/2)
  with ops.control_dependencies([grad.op]):
    return grad * (.5 * math_ops.inv(y))

def _RsqrtGrad(op, grad):
  x = op.inputs[0]
  y = op.outputs[0]  # y = x^(-1/2)
  return grad * ((-0.5) * math_ops.inv(x) * y)

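The rewrite -0.5 * x^(-3/2) = -0.5 * (1/x) * y with y = x^(-1/2) reuses the op's own forward output; a quick finite-difference check in NumPy of the expression above:

import numpy as np

x = np.linspace(0.5, 4.0, 8)
y = x ** -0.5                     # the op's forward output
eps = 1e-6
numeric = ((x + eps) ** -0.5 - (x - eps) ** -0.5) / (2 * eps)
analytic = -0.5 * (1.0 / x) * y   # matches the gradient expression above
assert np.allclose(numeric, analytic, atol=1e-6)
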
def _LogGrad(op, grad):
  """Returns grad * (1/x)."""
  x = op.inputs[0]
  return grad * math_ops.inv(x)

def _Log1pGrad(op, grad):
  """Returns grad * (1/(1 + x))."""
  x = op.inputs[0]
  with ops.control_dependencies([grad.op]):
    x = math_ops.conj(x)
    return grad * math_ops.inv(1 + x)

def _gmm_model_grad(op, dl_dp, dl_dgauss, dl_daux2):
  x = op.inputs[0]
  w = op.inputs[1]
  mu = op.inputs[2]
  sigma = op.inputs[3]
  p_x = op.outputs[0]
  gaussians = op.outputs[1]
  sigma_inv_x_mu = op.outputs[2]

  dl_dp = array_ops.expand_dims(dl_dp, -1)

  x_shape_np = x.get_shape()    # array_ops.get_shape(x)
  mu_shape_np = mu.get_shape()  # array_ops.get_shape(mu)
  x_shape = array_ops.shape(x)
  mu_shape = array_ops.shape(mu)
  n_samples = x_shape[0]
  n_params = mu_shape_np[0]
  n_kernels = mu_shape_np[1]
  n_dims = mu_shape[2]

  # print("x_shape: ", x_shape)
  # print("n_samples: ", n_samples)
  # print("n_dims: ", n_dims)

  # pi = 3.14159265358979323846
  # norm_const = math_ops.inv(
  #     math_ops.sqrt((math_ops.pow(2.0 * pi, math_ops.to_float(n_dims))) *
  #                   math_ops.reduce_prod(sigma, 2)))

  # 1/x element-wise, shape: [sample_id, kernel_id, sigma...]
  sigma_inv = math_ops.inv(sigma)

  # x_mu = array_ops.reshape(x, [n_samples, 1, n_dims]) - mu
  #     # shape: [sample_id, kernel_id, x-mu]
  # sigma_inv_x_mu = math_ops.mul(x_mu, sigma_inv)
  # gaussians = math_ops.mul(
  #     norm_const,
  #     math_ops.exp(-0.5 * math_ops.reduce_sum(x_mu * sigma_inv_x_mu, 2)))

  # gradient computation

  # derivative with respect to w
  if n_kernels == 1:
    dl_dw = 0 * w
  else:
    dl_dw = math_ops.mul(dl_dp, gaussians)

  # derivative with respect to mu
  w_gaussians = math_ops.mul(w, gaussians)
  # dp_dmu: tensor of shape [samples, kernel, dim]
  dp_dmu = math_ops.mul(array_ops.expand_dims(w_gaussians, -1), sigma_inv_x_mu)
  # dl_dmu: tensor of shape [samples, kernel, dim]
  dl_dmu = math_ops.mul(array_ops.expand_dims(dl_dp, -1), dp_dmu)

  # derivative with respect to sigma
  # dp_dsigma: tensor of shape [samples, kernel, dim]
  dp_dsigma = math_ops.pow(sigma_inv_x_mu, 2.0) - sigma_inv
  dp_dsigma = 0.5 * math_ops.mul(array_ops.expand_dims(w_gaussians, -1),
                                 dp_dsigma)
  # dl_dsigma: tensor of shape [samples, kernel, dim]
  dl_dsigma = math_ops.mul(array_ops.expand_dims(dl_dp, -1), dp_dsigma)

  # derivative with respect to x
  dl_dx = math_ops.reduce_sum(-dl_dmu, 1)

  if n_params == 1:
    dl_dw = math_ops.reduce_sum(dl_dw, 0)
    dl_dw = array_ops.expand_dims(dl_dw, 0)
    dl_dmu = math_ops.reduce_sum(dl_dmu, 0)
    dl_dmu = array_ops.expand_dims(dl_dmu, 0)
    dl_dsigma = math_ops.reduce_sum(dl_dsigma, 0)
    dl_dsigma = array_ops.expand_dims(dl_dsigma, 0)

  return dl_dx, dl_dw, dl_dmu, dl_dsigma