def _apply_sparse(self, grad, var):
    """Build the sparse update op for `var` and its momentum slot `m`.

    The slot is refreshed as `beta * m + (1 - beta) * grad`.  For the rows
    addressed by `grad.indices`, the gradient values are scaled by
    `lr * (alpha + sign_decay * sign(grad) * sign(m))` and subtracted from
    `var` through a scatter update.

    Args:
      grad: Gradient exposing `values` and `indices` (presumably an
        `IndexedSlices` -- confirm against the caller).
      var: Variable to update; it must own an 'm' slot.

    Returns:
      An op grouping the variable update and the slot update.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    alpha_t = math_ops.cast(self._alpha_t, dtype)
    beta_t = math_ops.cast(self._beta_t, dtype)

    # Exponential moving average of the gradient.
    m = self.get_slot(var, 'm')
    m_t = state_ops.assign(
        m, (m * beta_t) + (grad * (1 - beta_t)),
        use_locking=self._use_locking)

    # +1 where gradient and momentum agree in sign, -1 where they disagree.
    rows = grad.indices
    grad_sign = math_ops.sign(grad.values)
    agreement = array_ops.gather(math_ops.sign(m_t), rows) * grad_sign

    sign_decayed = math_ops.cast(self._sign_decay_t, dtype)
    scale = alpha_t + sign_decayed * agreement
    delta = lr_t * scale * grad.values

    var_update = state_ops.scatter_sub(
        var, rows, delta, use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t)
def random_sign_uniform(shape, minval=None, maxval=None,
                        dtype=dtypes.float32, seed=None):
    """Tensor with (possibly complex) random entries from a "sign Uniform".

    With `Z` equal to `-1` or `1` with equal probability, samples are
    distributed like

    ```
    Z * X,         where X ~ Uniform[minval, maxval],    if dtype is real,
    Z * (X + iY),  where X, Y ~ Uniform[minval, maxval], if dtype is complex.
    ```

    Args:
      shape: `TensorShape` or Python list.  Shape of the returned tensor.
      minval: `0-D` `Tensor` giving the minimum values.
      maxval: `0-D` `Tensor` giving the maximum values.
      dtype: `TensorFlow` `dtype` or Python dtype.
      seed: Python integer seed for the RNG.

    Returns:
      `Tensor` with desired shape and dtype.
    """
    dtype = dtypes.as_dtype(dtype)
    with ops.name_scope("random_sign_uniform"):
        magnitudes = random_uniform(
            shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
        # Offset the seed so the sign draw does not reuse the magnitude seed.
        sign_seed = seed if seed is None else seed + 12
        raw = random_ops.random_uniform(
            shape, minval=-1., maxval=1., seed=sign_seed)
        signs = math_ops.sign(raw)
        return magnitudes * math_ops.cast(signs, magnitudes.dtype)
def __call__(self, shape, dtype=None, partition_info=None):
    """Return a kernel that is zero except for one scaled orthogonal slice.

    Args:
      shape: Kernel shape with 3 to 5 dimensions; the last two entries are
        read as in_filters and out_filters.
      dtype: Element dtype; falls back to `self.dtype` when None.
      partition_info: Accepted but not used here.

    Returns:
      A tensor of `shape` whose spatial-centre position holds the first
      `shape[-2]` rows of a random orthogonal matrix times `sqrt(gain)`.

    Raises:
      ValueError: If the rank is outside [3, 5] or in_filters > out_filters.
    """
    dtype = self.dtype if dtype is None else dtype

    # Check the shape
    rank = len(shape)
    if not 3 <= rank <= 5:
        raise ValueError("The tensor to initialize must be at least "
                         "three-dimensional and at most five-dimensional")
    if shape[-2] > shape[-1]:
        raise ValueError("In_filters cannot be greater than out_filters.")

    # Generate a random matrix
    a = random_ops.random_normal([shape[-1], shape[-1]],
                                 dtype=dtype, seed=self.seed)
    # Compute the qr factorization
    q, r = linalg_ops.qr(a, full_matrices=False)
    # Make Q uniform
    q *= math_ops.sign(array_ops.diag_part(r))
    q = q[:shape[-2], :]
    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))

    # Index of the centre position along every spatial axis.
    centre = [(shape[axis] - 1) // 2 for axis in range(rank - 2)]
    return array_ops.scatter_nd([centre], array_ops.expand_dims(q, 0), shape)
def _Solve(a, b, c):
    """Return solution of a quadratic minimization.

    The optimization equation is:
         f(a, b, c) = argmin_w{1/2 * a * w^2 + b * w + c * |w|}
    with optimal solution w*:
         w* = -(b - sign(b)*c)/a if |b| > c else w* = 0

    REQUIRES: Dimensionality of a and b must be same

    Args:
      a: A Tensor
      b: A Tensor
      c: A Tensor with one element.

    Returns:
      A Tensor w, which is solution for the equation
    """
    with ops.name_scope("solve_" + b.op.name):
        c = ops.convert_to_tensor(c)
        # Broadcast the threshold so it can be compared elementwise with |b|.
        threshold = array_ops.fill(array_ops.shape(b), c)
        zeros = array_ops.zeros(array_ops.shape(b), dtype=b.dtype)
        candidate = (c * math_ops.sign(b) - b) / a
        below_threshold = math_ops.less(math_ops.abs(b), threshold)
        return math_ops.select(below_threshold, zeros, candidate)
def sample_n(self, n, seed=None, name="sample_n"):
    """Sample `n` observations from the Laplace Distributions.

    Args:
      n: `Scalar`, type int32, the number of observations to sample.
      seed: Python integer, the random seed.
      name: The name to give this op.

    Returns:
      samples: `[n, ...]`, a `Tensor` of `n` samples for each of the
        distributions determined by broadcasting the parameters.
    """
    with ops.name_scope(self.name):
        with ops.name_scope(name, values=[self._loc, self._scale, n]):
            n = ops.convert_to_tensor(n)
            n_val = tensor_util.constant_value(n)
            shape = array_ops.concat(0, ([n], self.batch_shape()))

            # Sample uniformly-at-random from the open-interval (-1, 1).
            np_dtype = self.dtype.as_numpy_dtype
            lower = np.nextafter(np_dtype(-1.), np_dtype(0.))
            u = random_ops.random_uniform(
                shape=shape, minval=lower, maxval=np_dtype(1.),
                dtype=self.dtype, seed=seed)

            # Provide some hints to shape inference
            static_shape = tensor_shape.vector(n_val).concatenate(
                self.get_batch_shape())
            u.set_shape(static_shape)

            signs = math_ops.sign(u)
            log_tail = math_ops.log(1. - math_ops.abs(u))
            return self._loc - self._scale * signs * log_tail
def __call__(self, shape, dtype=None, partition_info=None):
    """Return `gain` times a random matrix with orthogonal rows or columns.

    Args:
      shape: Target shape with at least two dimensions.
      dtype: Element dtype; falls back to `self.dtype` when None.
      partition_info: Accepted but not used here.

    Returns:
      A tensor of `shape` built from the Q factor of a random normal matrix.

    Raises:
      ValueError: If `shape` has fewer than two dimensions.
    """
    dtype = self.dtype if dtype is None else dtype

    # Check the shape
    if len(shape) < 2:
        raise ValueError("The tensor to initialize must be "
                         "at least two-dimensional")

    # Flatten the input shape with the last dimension remaining
    # its original shape so it works for conv2d
    num_rows = 1
    for dim in shape[:-1]:
        num_rows *= dim
    num_cols = shape[-1]

    # QR is run on the tall orientation; a wide target is transposed back.
    is_wide = num_rows < num_cols
    flat_shape = (num_cols, num_rows) if is_wide else (num_rows, num_cols)

    # Generate a random matrix
    a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
    # Compute the qr factorization
    q, r = linalg_ops.qr(a, full_matrices=False)
    # Make Q uniform
    q *= math_ops.sign(array_ops.diag_part(r))
    if is_wide:
        q = array_ops.matrix_transpose(q)
    return self.gain * array_ops.reshape(q, shape)
def __call__(self, shape, dtype=dtypes.float32):
    """Returns a tensor object initialized as specified by the initializer.

    Args:
      shape: Shape of the tensor.
      dtype: Optional dtype of the tensor. Only floating point types are
        supported.

    Returns:
      `gain` times the sign-normalized Q factor of a random normal matrix,
      reshaped to `shape`.

    Raises:
      ValueError: If the dtype is not floating point or the input shape is
        not valid.
    """
    dtype = _assert_float_dtype(dtype)

    # Check the shape
    if len(shape) < 2:
        raise ValueError("The tensor to initialize must be "
                         "at least two-dimensional")

    # Flatten the input shape with the last dimension remaining
    # its original shape so it works for conv2d
    num_cols = shape[-1]
    num_rows = 1
    for dim in shape[:-1]:
        num_rows *= dim
    longer = max(num_cols, num_rows)
    shorter = min(num_cols, num_rows)

    # Generate a random matrix
    a = random_ops.random_normal(
        (longer, shorter), dtype=dtype, seed=self.seed)
    # Compute the qr factorization
    q, r = gen_linalg_ops.qr(a, full_matrices=False)
    # Make Q uniform
    diag = array_ops.diag_part(r)
    q *= math_ops.sign(diag)
    if num_rows < num_cols:
        q = array_ops.matrix_transpose(q)
    return self.gain * array_ops.reshape(q, shape)
def _resource_apply_dense(self, grad, var):
    """Build the dense update op for `var` together with its `m`/`v` slots.

    Steps, in graph-construction order:
      1. Optionally reshape the learning rate with a linear warmup followed
         by a linear decay towards `min_lr`.
      2. Optionally clip the gradient norm using a bound derived from the
         current second-moment slot.
      3. Update `v`, normalize the gradient by the bias-corrected `sqrt(v)`,
         and accumulate the normalized gradient into `m`.
      4. Apply the rectification factor `r_t` when `sma_t >= 5`.
      5. Add weight decay / L1 decay terms for variables in `self.reg_vars`.

    NOTE(review): the source was flattened; the nesting of the L1-decay
    branch (sibling of the weight-decay branch, inside the `reg_vars` check)
    is reconstructed -- confirm against the upstream file.

    Args:
      grad: Dense gradient tensor.
      var: Variable to update.

    Returns:
      An op grouping the variable, `m` and `v` updates.
    """
    dtype = var.dtype.base_dtype
    step, beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, dtype)
    beta2_power = math_ops.cast(beta2_power, dtype)
    lr_t = math_ops.cast(self._lr_t, dtype)

    if self._initial_total_steps > 0:
        total_steps = math_ops.cast(self._total_steps_t, dtype)
        warmup_proportion = math_ops.cast(self._warmup_proportion_t, dtype)
        min_lr = math_ops.cast(self._min_lr_t, dtype)
        warmup_steps = total_steps * warmup_proportion
        decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
        decay_rate = (min_lr - lr_t) / decay_steps
        in_warmup = step <= warmup_steps
        warmup_lr = lr_t * (step / warmup_steps)
        decayed_lr = lr_t + decay_rate * math_ops.minimum(
            step - warmup_steps, decay_steps)
        lr_t = tf.where(in_warmup, warmup_lr, decayed_lr)

    beta1_t = math_ops.cast(self._beta1_t, dtype)
    beta2_t = math_ops.cast(self._beta2_t, dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, dtype)

    v = self.get_slot(var, "v")
    if self.clip_gradients:
        # The bound is computed from `v` before this step's update.
        clip_norm = math_ops.sqrt(
            tf.reduce_sum(v) / (1.0 - beta2_power)
        ) * self.clip_multiplier_t + self.clip_epsilon_t
        grad = clip_ops.clip_by_norm(grad, clip_norm)

    sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

    m = self.get_slot(var, "m")
    v_t = state_ops.assign(
        v, beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad),
        use_locking=self._use_locking)
    v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t
    grad_corr = grad / v_corr_t
    m_t = state_ops.assign(
        m, beta1_t * m + (1.0 - beta1_t) * grad_corr,
        use_locking=self._use_locking)
    m_corr_t = m_t / (1.0 - beta1_power)

    r_t = math_ops.sqrt(
        (sma_t - 4.0) / (sma_inf - 4.0)
        * (sma_t - 2.0) / (sma_inf - 2.0)
        * sma_inf / sma_t)
    var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t, m_corr_t)

    if var in self.reg_vars:
        if self._initial_weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t, dtype) * var
        if self._L1_decay > 0.0:
            var_t += math_ops.cast(self._L1_decay, dtype) * math_ops.sign(var)

    with tf.control_dependencies([var_t]):
        var_update = state_ops.assign_sub(
            var, lr_t * var_t, use_locking=self._use_locking)

    return control_flow_ops.group(var_update, m_t, v_t)
def test_complex_sign_gradient(self):
    """Check the gradient of `sign` at the complex point 1 + 1j in eager mode."""
    with context.eager_mode():
        x = math_ops.complex(1., 1.)
        with backprop.GradientTape() as tape:
            tape.watch(x)
            y = math_ops.sign(x)
        actual = tape.gradient(y, x)
        expected = math_ops.complex(0.353553, -0.353553)
        self.assertAllClose(actual, expected)
def call(self, y_true, y_pred):
    """Return 100 * mean |log(penalized quotient)| over the last axis.

    The quotient `y_pred / y_true` is replaced by its magnitude (capped at
    `100000 * |y_pred|`) multiplied by `exp(10 - 10 * sign(quotient))`, i.e.
    by 1 for a positive quotient, e^10 for zero and e^20 for a negative one.
    A small constant is added before taking the log.

    Args:
      y_true: Target values; cast to the dtype of `y_pred`.
      y_pred: Predicted values.

    Returns:
      The loss reduced over the last axis.
    """
    y_pred = ops.convert_to_tensor_v2(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)

    ratio = math_ops.divide(y_pred, y_true)
    ratio_sign = math_ops.sign(ratio)
    ratio_abs = math_ops.abs(ratio)
    cap = 100000 * math_ops.abs(y_pred)
    capped = math_ops.minimum(ratio_abs, cap)

    penalized = math_ops.exp(10 - 10 * ratio_sign) * capped + 0.000000001
    log_error = math_ops.abs(math_ops.log(penalized))
    return 100 * K.mean(log_error, axis=-1)
def Test(self):
    """Compare theoretical and numerical gradients of the eigendecomposition.

    Reads `shape_`, `dtype_` and `compute_v_` from the enclosing scope.
    """
    np.random.seed(1)
    n = shape_[-1]
    batch_shape = shape_[:-2]
    np_dtype = dtype_.as_numpy_dtype

    def _random_self_adjoint():
        # Random matrix plus its conjugate transpose, tiled over the batch.
        mat = np.random.uniform(
            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
        if dtype_.is_complex:
            mat += 1j * np.random.uniform(
                low=-1.0, high=1.0,
                size=n * n).reshape([n, n]).astype(np_dtype)
        mat += np.conj(mat.T)
        return np.tile(mat, batch_shape + (1, 1))

    a = _random_self_adjoint()

    # Optimal stepsize for central difference is O(epsilon^{1/3}).
    epsilon = np.finfo(np_dtype).eps
    delta = 0.1 * epsilon**(1.0 / 3.0)
    # tolerance obtained by looking at actual differences using
    # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
    single_precision = dtype_ in (dtypes_lib.float32, dtypes_lib.complex64)
    tol = 1e-2 if single_precision else 1e-7

    with self.session(use_gpu=True):
        tf_a = constant_op.constant(a)
        if compute_v_:
            tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
            # (complex) Eigenvectors are only unique up to an arbitrary phase
            # We normalize the vectors such that the first component has
            # phase 0.
            top_rows = tf_v[..., 0:1, :]
            if tf_a.dtype.is_complex:
                angle = -math_ops.angle(top_rows)
                phase = math_ops.complex(
                    math_ops.cos(angle), math_ops.sin(angle))
            else:
                phase = math_ops.sign(top_rows)
            tf_v *= phase
            outputs = [tf_e, tf_v]
        else:
            tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
            outputs = [tf_e]

        input_shape = tf_a.get_shape().as_list()
        for out in outputs:
            x_init = _random_self_adjoint()
            theoretical, numerical = gradient_checker.compute_gradient(
                tf_a, input_shape, out, out.get_shape().as_list(),
                x_init_value=x_init, delta=delta)
            self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def Test(self):
    """Gradient check for the self-adjoint eigen ops.

    Uses `shape_`, `dtype_` and `compute_v_` from the enclosing scope.
    """
    np.random.seed(1)
    n = shape_[-1]
    batch_shape = shape_[:-2]
    np_dtype = dtype_.as_numpy_dtype

    def draw():
        # One n x n uniform sample in [-1, 1), cast to the test dtype.
        flat = np.random.uniform(low=-1.0, high=1.0, size=n * n)
        return flat.reshape([n, n]).astype(np_dtype)

    def symmetrized():
        # Adding the conjugate transpose makes the matrix self-adjoint.
        mat = draw()
        if dtype_.is_complex:
            mat += 1j * draw()
        mat += np.conj(mat.T)
        return np.tile(mat, batch_shape + (1, 1))

    a = symmetrized()

    # Optimal stepsize for central difference is O(epsilon^{1/3}).
    epsilon = np.finfo(np_dtype).eps
    delta = 0.1 * epsilon**(1.0 / 3.0)
    # tolerance obtained by looking at actual differences using
    # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
    if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
        tol = 1e-2
    else:
        tol = 1e-7

    with self.session(use_gpu=True):
        tf_a = constant_op.constant(a)
        if not compute_v_:
            outputs = [linalg_ops.self_adjoint_eigvals(tf_a)]
        else:
            tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
            # (complex) Eigenvectors are only unique up to an arbitrary phase
            # We normalize the vectors such that the first component has
            # phase 0.
            top_rows = tf_v[..., 0:1, :]
            if tf_a.dtype.is_complex:
                angle = -math_ops.angle(top_rows)
                phase = math_ops.complex(
                    math_ops.cos(angle), math_ops.sin(angle))
            else:
                phase = math_ops.sign(top_rows)
            tf_v *= phase
            outputs = [tf_e, tf_v]

        for b in outputs:
            x_init = symmetrized()
            theoretical, numerical = gradient_checker.compute_gradient(
                tf_a,
                tf_a.get_shape().as_list(),
                b,
                b.get_shape().as_list(),
                x_init_value=x_init,
                delta=delta)
            self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
def _apply_dense(self, grad, var):
    """Build the dense update: accumulate, then soft-threshold into `var`.

    The 'accumulator' slot becomes `v + first_iter * var - lr * grad`, and
    `var` is overwritten with `sign(v) * max(|v| - l1, 0)`.

    Args:
      grad: Dense gradient tensor.
      var: Variable to update; it must own an 'accumulator' slot.

    Returns:
      An op grouping the accumulator update and the variable update.
    """
    dtype = var.dtype.base_dtype
    lr = math_ops.cast(self._learning_rate_tensor, dtype)
    # Not referenced below; retained so the constructed graph is unchanged.
    iter_ = math_ops.cast(self._iter, dtype)
    first_iter = math_ops.cast(self._first_iter, dtype)
    l1 = math_ops.cast(self._l1_accum, dtype)

    accum = self.get_slot(var, "accumulator")
    accum_t = state_ops.assign(
        accum, accum + first_iter * var - lr * grad,
        use_locking=self._use_locking)

    # GRDA update
    shrunk = math_ops.sign(accum_t) * math_ops.maximum(
        math_ops.abs(accum_t) - l1, 0)
    var_update = state_ops.assign(var, shrunk, use_locking=self._use_locking)
    return control_flow_ops.group(accum_t, var_update)
def modrelu(z, b, comp):
    """Apply `relu(|z| + b)` to the magnitude while keeping phase or sign.

    Args:
      z: Input tensor (complex when `comp` is true, real otherwise).
      b: Bias added to the magnitude via `bias_add`.
      comp: Whether `z` is treated as complex.

    Returns:
      `relu(|z| + 1e-5 + b)` multiplied by `z / (|z| + 1e-5)` in the complex
      case, or by `sign(z)` in the real case.
    """
    if comp:
        # Magnitude from real and imaginary parts; the offset avoids a
        # division by zero below.
        magnitude = math_ops.sqrt(
            math_ops.square(math_ops.real(z))
            + math_ops.square(math_ops.imag(z))) + 0.00001
        biased = nn_ops.bias_add(magnitude, b)
        gated = math_ops.complex(
            nn_ops.relu(biased), array_ops.zeros_like(magnitude))
        direction = z / math_ops.complex(
            magnitude, array_ops.zeros_like(magnitude))
        return math_ops.multiply(direction, gated)

    magnitude = math_ops.abs(z) + 0.00001
    biased = nn_ops.bias_add(magnitude, b)
    gated = nn_ops.relu(biased)
    direction = math_ops.sign(z)
    return math_ops.multiply(direction, gated)
def _sample_n(self, n, seed=None):
    """Draw `n` samples by transforming uniform variates.

    Args:
      n: Number of samples to draw.
      seed: Python integer seed for the uniform sampler.

    Returns:
      `loc - scale * sign(u) * log(1 - |u|)` for uniform `u` in (-1, 1),
      with leading dimension `n` followed by the batch shape.
    """
    shape = array_ops.concat(0, ([n], self.batch_shape()))
    # Sample uniformly-at-random from the open-interval (-1, 1).
    np_dtype = self.dtype.as_numpy_dtype
    just_above_minus_one = np.nextafter(np_dtype(-1.), np_dtype(0.))
    u = random_ops.random_uniform(
        shape=shape, minval=just_above_minus_one, maxval=1.,
        dtype=self.dtype, seed=seed)
    signs = math_ops.sign(u)
    log_tail = math_ops.log(1. - math_ops.abs(u))
    return self.loc - self.scale * signs * log_tail
def Compute(x):
    """Return eigenvalues and phase-normalized eigenvectors of `x`.

    Reads `dtype_` from the enclosing scope to pick the normalization.
    """
    eigvals, eigvecs = linalg_ops.self_adjoint_eig(x)
    # (complex) Eigenvectors are only unique up to an arbitrary phase
    # We normalize the vectors such that the first component has phase 0.
    first_components = eigvecs[..., 0:1, :]
    if not dtype_.is_complex:
        phase = math_ops.sign(first_components)
    else:
        angle = -math_ops.angle(first_components)
        phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
    eigvecs *= phase
    return eigvals, eigvecs
def _sample_n(self, n, seed=None):
    """Draw `n` samples by transforming uniform variates.

    Args:
      n: Number of samples to draw.
      seed: Python integer seed for the uniform sampler.

    Returns:
      `loc - scale * sign(u) * log1p(-|u|)` for uniform `u` in (-1, 1),
      with leading dimension `n` followed by the batch shape.
    """
    shape = array_ops.concat(([n], self.batch_shape()), 0)
    # Sample uniformly-at-random from the open-interval (-1, 1).
    uniform_samples = random_ops.random_uniform(
        shape=shape,
        minval=np.nextafter(self.dtype.as_numpy_dtype(-1.),
                            self.dtype.as_numpy_dtype(0.)),
        maxval=1.,
        dtype=self.dtype,
        seed=seed)
    # log1p(-|u|) rather than log(1 - |u|): forming 1 - |u| rounds away the
    # low bits of small |u| (and yields exactly 0 once |u| is below machine
    # epsilon), which would collapse those samples onto `loc`.
    return (self.loc - self.scale * math_ops.sign(uniform_samples) *
            math_ops.log1p(-math_ops.abs(uniform_samples)))
def get_grow_tensor(self, weight, method):
    """Return the tensor used to grow `weight` for the given `method`.

    Args:
      weight: Weight whose `name` keys into `self._weight2masked_grads`.
      method: Method string.  A 'grad_scale' prefix yields the masked
        gradient divided by the number extracted from `method`; a
        'grad_sign' prefix yields its sign divided by that number; anything
        else is delegated to the parent class.

    Returns:
      The grow tensor.
    """
    if method.startswith('grad_scale'):
        masked_grad = self._weight2masked_grads[weight.name]
        return masked_grad / extract_number(method)

    if method.startswith('grad_sign'):
        grad_sign = math_ops.sign(self._weight2masked_grads[weight.name])
        return grad_sign / extract_number(method)

    return super(SparseRigLOptimizer, self).get_grow_tensor(weight, method)
def _apply_sparse(self, grad, var):
    """Build the sparse update op for `var` and its momentum slot `m`.

    The slot is refreshed as `beta * m + (1 - beta) * grad`.  For the rows
    addressed by `grad.indices`, the gradient values are scaled by
    `lr * e ** (logbase * sign_decay * sign(grad) * sign(m))` and subtracted
    from `var` through a scatter update.

    Args:
      grad: Gradient exposing `values` and `indices` (presumably an
        `IndexedSlices` -- confirm against the caller).
      var: Variable to update; it must own an 'm' slot.

    Returns:
      An op grouping the variable update and the slot update.
    """
    dtype = var.dtype.base_dtype
    lr_t = math_ops.cast(self._lr_t, dtype)
    beta_t = math_ops.cast(self._beta_t, dtype)
    logbase_t = math_ops.cast(self._logbase_t, dtype)
    e_t = math_ops.cast(math.e, dtype)

    # Exponential moving average of the gradient.
    m = self.get_slot(var, 'm')
    m_t = state_ops.assign(
        m, (m * beta_t) + (grad * (1 - beta_t)),
        use_locking=self._use_locking)

    # +1 where gradient and momentum agree in sign, -1 where they disagree.
    rows = grad.indices
    grad_sign = math_ops.sign(grad.values)
    agreement = array_ops.gather(math_ops.sign(m_t), rows) * grad_sign

    sign_decayed = math_ops.cast(self._sign_decay_t, dtype)
    scale = math_ops.pow(e_t, logbase_t * sign_decayed * agreement)
    delta = lr_t * scale * grad.values

    var_update = state_ops.scatter_sub(
        var, rows, delta, use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t)
def build(self, inputs_shape):
    """Create the input kernel, recurrent kernel and bias of the cell.

    Args:
      inputs_shape: Shape of the inputs; entry 1 must have a known value.

    Raises:
      ValueError: If `inputs_shape[1].value` is None.
    """
    if inputs_shape[1].value is None:
        raise ValueError("Expected input shape[1] is known")
    input_depth = inputs_shape[1]

    # matrix W
    if self._input_kernel_initializer is None:
        self._input_kernel_initializer = init_ops.random_normal_initializer(
            mean=0, stddev=1e-3)
    self._input_kernel = self.add_variable(
        "input_kernel",
        shape=[input_depth, self._num_units],
        initializer=self._input_kernel_initializer)

    # matrix U
    if self._recurrent_recurrent_kernel_initializer is None:
        self._recurrent_recurrent_kernel_initializer = (
            init_ops.constant_initializer(1.))
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units],
        initializer=self._recurrent_recurrent_kernel_initializer)

    # Clip the U to min - max
    min_abs = self._recurrent_min_abs
    if min_abs:
        magnitude = math_ops.maximum(
            math_ops.abs(self._recurrent_kernel), min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), magnitude)
    max_abs = self._recurrent_max_abs
    if max_abs:
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -max_abs, max_abs)

    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    # built finished
    self.built = True
def _apply_dense(self, grad, var):
    """Build the dense update op for `var` and the slots it uses.

    The step is `grad / (epsilon + sqrt(sum(grad^2) / count))`, where the
    count is either a per-element counter of nonzero gradients or the
    current timestep.  With momentum enabled the step is averaged into the
    'momentum' slot and the learning rate is divided by `1 - beta**t`.

    Args:
      grad: Dense gradient tensor.
      var: Variable to update.

    Returns:
      An op grouping every slot update and the variable update.
    """
    var_dtype = var.dtype.base_dtype
    lr = math_ops.cast(self._lr_t, var_dtype)
    beta = self._beta
    epsilon = self._epsilon
    t = math_ops.cast(self.iterations + 1, var_dtype)
    lock = self._use_locking
    updates = []

    # Update running sum
    running_sum = self.get_slot(var, 'sum')
    grad_sq = math_ops.square(grad)
    sum_new = running_sum + grad_sq
    updates.append(state_ops.assign(running_sum, sum_new, use_locking=lock))

    # Update running counter
    if self._sparse_counter:
        counter = self.get_slot(var, 'counter')
        # sign(grad^2) is 1 where the gradient is nonzero and 0 elsewhere.
        count_new = counter + math_ops.sign(grad_sq)
        updates.append(
            state_ops.assign(counter, count_new, use_locking=lock))
    else:
        # Counter is not sparse; just use the current timestep instead
        count_new = t

    # Compute step size
    average = math_ops.div_no_nan(sum_new, count_new)
    step = grad / (epsilon + math_ops.sqrt(average))

    # Update momentum
    if self._use_momentum:
        momentum = self.get_slot(var, 'momentum')
        direction = beta * momentum + (1.0 - beta) * step
        updates.append(
            state_ops.assign(momentum, direction, use_locking=lock))
        # Bias correction
        lr = lr / (1.0 - beta ** t)
    else:
        # No momentum; just use the current step instead
        direction = step

    # Update parameters
    updates.append(
        state_ops.assign_sub(var, lr * direction, use_locking=lock))
    return control_flow_ops.group(*updates)
def _apply_dense(self, grad, var):
    """Build the dense update: accumulate, then soft-threshold into `var`.

    The 'accumulator' slot becomes `v - lr * grad`, and `var` is overwritten
    with `sign(v) * max(|v| - l1, 0)` where
    `l1 = c * lr**(0.5 + mu) * iter**mu`.

    Args:
      grad: Dense gradient tensor.
      var: Variable to update; it must own an 'accumulator' slot.

    Returns:
      An op grouping the variable update and the accumulator update.
    """
    dtype = var.dtype.base_dtype
    lr = math_ops.cast(self._learning_rate_tensor, dtype)

    accum = self.get_slot(var, "accumulator")
    accum_t = state_ops.assign(
        accum, accum - lr * grad, use_locking=self._use_locking)

    iteration = math_ops.cast(self._get_iter_variable(), dtype)
    c = math_ops.cast(self._c, dtype)
    mu = math_ops.cast(self._mu, dtype)
    threshold = math_ops.cast(
        c * math_ops.pow(lr, (0.5 + mu)) * math_ops.pow(iteration, mu),
        dtype)

    # GRDA update
    shrunk = math_ops.sign(accum_t) * math_ops.maximum(
        math_ops.abs(accum_t) - threshold, 0)
    var_update = state_ops.assign(var, shrunk, use_locking=self._use_locking)
    return control_flow_ops.group(var_update, accum_t)
def _orthogonal_matrix(self, n):
    """Construct an n x n orthogonal matrix.

    The matrix is the Q factor of a random normal matrix, with each column
    multiplied by the sign of the matching diagonal entry of R.  When a seed
    is set, `self.seed` is advanced by one so that the next call draws a
    different matrix.

    Args:
      n: Dimension.

    Returns:
      A n x n orthogonal matrix.
    """
    a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
    # Compare against None rather than relying on truthiness: 0 is a valid
    # seed and must be advanced too, otherwise every call made with seed 0
    # would reuse the same op seed.
    if self.seed is not None:
        self.seed += 1
    q, r = gen_linalg_ops.qr(a)
    d = array_ops.diag_part(r)
    # make q uniform
    q *= math_ops.sign(d)
    return q
def _orthogonal_matrix(self, n):
    """Construct an n x n orthogonal matrix.

    The matrix is the Q factor of a random normal matrix, with each column
    multiplied by the sign of the matching diagonal entry of R.  When a seed
    is set, `self.seed` is advanced by one so that the next call draws a
    different matrix.

    Args:
      n: dimension.

    Returns:
      a n x n orthogonal matrix.
    """
    a = random_ops.random_normal([n, n], dtype=self.dtype, seed=self.seed)
    # Compare against None rather than relying on truthiness: 0 is a valid
    # seed and must be advanced too, otherwise every call made with seed 0
    # would reuse the same op seed.
    if self.seed is not None:
        self.seed += 1
    q, r = linalg_ops.qr(a)
    d = array_ops.diag_part(r)
    # make q uniform
    q *= math_ops.sign(d)
    return q
def build(self, inputs_shape):
    """Create the input kernel, recurrent kernel and bias of the cell.

    Args:
      inputs_shape: Shape of the inputs; entry 1 must have a known value.

    Raises:
      ValueError: If `inputs_shape[1].value` is None.
    """
    input_depth = inputs_shape[1].value
    if input_depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    self._input_kernel = self.add_variable(
        "input_kernel", shape=[input_depth, self._num_units])

    max_abs = self._recurrent_max_abs
    if self._recurrent_initializer is None:
        # Initialize the recurrent weights uniformly in [-max_abs, max_abs]
        # or [-1, 1] if max_abs exceeds 1
        init_bound = max_abs if (max_abs and max_abs < 1.0) else 1.0
        self._recurrent_initializer = init_ops.random_uniform_initializer(
            minval=-init_bound, maxval=init_bound)

    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum
    min_abs = self._recurrent_min_abs
    if min_abs:
        magnitude = math_ops.maximum(
            math_ops.abs(self._recurrent_kernel), min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), magnitude)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum
    if max_abs:
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -max_abs, max_abs)

    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
def build(self, inputs_shape):
    """Create the conv kernel, bias and clipped recurrent kernel.

    Also records `_input_depth`, `_filters_num` (input depth divided by the
    node count) and `_output_depth` (node count times unit count), and sets
    `_recurrent_max_abs` to 1.0 when it was unset or falsy.

    Args:
      inputs_shape: Shape of the inputs; entry 1 must have a known value.

    Raises:
      ValueError: If `inputs_shape[1].value` is None.
    """
    depth = inputs_shape[1].value
    if depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    self._input_depth = depth
    self._filters_num = self._input_depth // self._n_nodes
    self._output_depth = self._n_nodes * self._num_units

    self._conv_kernel = self.add_variable(
        "conv_kernel",
        [self._filters_num, self._num_units],
        dtype=self.dtype,
        initializer=self._input_kernel_initializer)
    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    if self._recurrent_kernel_initializer is None:
        self._recurrent_kernel_initializer = (
            init_ops.random_uniform_initializer(minval=0., maxval=1))
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units * self._n_nodes],
        initializer=self._recurrent_kernel_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum
    if self._recurrent_min_abs and self._recurrent_min_abs != 0:
        magnitude = math_ops.maximum(
            math_ops.abs(self._recurrent_kernel), self._recurrent_min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), magnitude)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum
    self._recurrent_max_abs = self._recurrent_max_abs or 1.
    bound = self._recurrent_max_abs
    self._recurrent_kernel = clip_ops.clip_by_value(
        self._recurrent_kernel, -bound, bound)

    self.built = True
def _BesselI1eGrad(op, grad):
    """Compute gradient of bessel_i1e(x) with respect to its argument."""
    x = op.inputs[0]
    y = op.outputs[0]
    with ops.control_dependencies([grad]):
        # For x = 0, the correct gradient is 0.5.
        # However, the main branch gives NaN because of the division by x,
        # so we impute the gradient manually.
        # An alternative solution is to express the gradient via bessel_i0e
        # and bessel_i2e, but the latter is not yet implemented in Eigen.
        eps = np.finfo(x.dtype.as_numpy_dtype).eps
        zeros = array_ops.zeros_like(x)
        away_from_zero = math_ops.abs(x) > eps
        # Substitute eps for tiny inputs so the reciprocal stays finite.
        safe_x = array_ops.where(away_from_zero, x, eps + zeros)
        sign_plus_inverse = (
            math_ops.sign(safe_x) + math_ops.reciprocal(safe_x))
        dy_dx = math_ops.bessel_i0e(safe_x) - y * sign_plus_inverse
        return grad * array_ops.where(away_from_zero, dy_dx, 0.5 + zeros)
def _sample_n(self, n, seed=None):
    """Draw `n` samples by transforming uniform variates.

    Args:
      n: Number of samples to draw.
      seed: Python integer seed for the uniform sampler.

    Returns:
      `loc - scale * sign(u) * log1p(-|u|)` for uniform `u` in (-1, 1),
      with leading dimension `n` followed by the batch shape.
    """
    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
    # Uniform variates must be sampled from the open-interval `(-1, 1)`
    # rather than `[-1, 1)`. In the case of `(0, 1)` we'd use
    # `np.finfo(self.dtype.as_numpy_dtype).tiny` because it is the smallest,
    # positive, "normal" number. However, the concept of subnormality exists
    # only at zero; here we need the smallest usable number larger than -1,
    # i.e., `-1 + eps/2`.
    np_dtype = self.dtype.as_numpy_dtype
    lower = np.nextafter(np_dtype(-1.), np_dtype(0.))
    u = random_ops.random_uniform(
        shape=shape, minval=lower, maxval=1., dtype=self.dtype, seed=seed)
    signs = math_ops.sign(u)
    log_tail = math_ops.log1p(-math_ops.abs(u))
    return self.loc - self.scale * signs * log_tail
def build(self, inputs_shape):
    """Create the input kernel, recurrent kernel and bias of the cell.

    Args:
      inputs_shape: Shape of the inputs; entry 1 must have a known value.

    Raises:
      ValueError: If `inputs_shape[1].value` is None.
    """
    input_depth = inputs_shape[1].value
    if input_depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    self._input_kernel = self.add_variable(
        "input_%s" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
        shape=[input_depth, self._num_units])

    max_abs = self._recurrent_max_abs
    if self._recurrent_initializer is None:
        # Default to a uniform draw bounded by max_abs when it is set,
        # otherwise by 1.
        if max_abs:
            self._recurrent_initializer = init_ops.random_uniform_initializer(
                minval=-max_abs, maxval=max_abs)
        else:
            self._recurrent_initializer = init_ops.random_uniform_initializer(
                minval=-1.0, maxval=1.0)

    self._recurrent_kernel = self.add_variable(
        "recurrent_%s" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
        shape=[self._num_units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights
    min_abs = self._recurrent_min_abs
    if min_abs:
        magnitude = math_ops.maximum(
            math_ops.abs(self._recurrent_kernel), min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), magnitude)
    if max_abs:
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -max_abs, max_abs)

    self._bias = self.add_variable(
        rnn_cell_impl._BIAS_VARIABLE_NAME,
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
def build(self, inputs_shape):
    """Create the input kernel, recurrent kernel and bias of the cell.

    Args:
      inputs_shape: Shape of the inputs; entry 1 must have a known value.

    Raises:
      ValueError: If `inputs_shape[1].value` is None.
    """
    input_depth = inputs_shape[1].value
    if input_depth is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)

    # input weights
    if self._input_initializer is None:
        self._input_initializer = init_ops.random_normal_initializer(
            mean=0.0, stddev=0.001)
    # input weights variable
    self._input_kernel = self.add_variable(
        "input_kernel",
        shape=[input_depth, self._num_units],
        initializer=self._input_initializer)

    if self._recurrent_initializer is None:
        self._recurrent_initializer = init_ops.constant_initializer(1.)
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[self._num_units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum
    lower = self._recurrent_min_abs
    if lower:
        kernel_abs = math_ops.abs(self._recurrent_kernel)
        floored_abs = math_ops.maximum(kernel_abs, lower)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), floored_abs)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum
    upper = self._recurrent_max_abs
    if upper:
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -upper, upper)

    self._bias = self.add_variable(
        "bias",
        shape=[self._num_units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
def cdf(self, x, name="cdf"):
    """CDF of observations in `x` under the Laplace distribution(s).

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `loc` and
        `scale`.
      name: The name to give this op.

    Returns:
      cdf: tensor of dtype `dtype`, the CDFs of `x`.

    Raises:
      TypeError: If the dtype of `x` differs from `self.dtype`.
    """
    with ops.name_scope(self.name):
        with ops.name_scope(name, values=[self._loc, self._scale, x]):
            x = ops.convert_to_tensor(x)
            if x.dtype != self.dtype:
                raise TypeError(
                    "Input x dtype does not match dtype: %s vs. %s"
                    % (x.dtype, self.dtype))
            centered = x - self._loc
            side = math_ops.sign(centered)
            mass = 1. - math_ops.exp(-math_ops.abs(centered) / self._scale)
            return 0.5 + 0.5 * side * mass
def cdf(self, x, name="cdf"):
    """CDF of observations in `x` under the Laplace distribution(s).

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `loc` and
        `scale`.
      name: The name to give this op.

    Returns:
      cdf: tensor of dtype `dtype`, the CDFs of `x`.

    Raises:
      TypeError: If the dtype of `x` differs from `self.dtype`.
    """
    with ops.name_scope(self.name):
        with ops.op_scope([self._loc, self._scale, x], name):
            x = ops.convert_to_tensor(x)
            if x.dtype != self.dtype:
                raise TypeError(
                    "Input x dtype does not match dtype: %s vs. %s"
                    % (x.dtype, self.dtype))
            offset = x - self._loc
            direction = math_ops.sign(offset)
            decay = math_ops.exp(-math_ops.abs(offset) / self._scale)
            return 0.5 + 0.5 * direction * (1. - decay)
def _NormalizingSvd(tf_a):
    """Return the full SVD of `tf_a` with a fixed phase convention.

    Returns:
      Tuple `(s, u, v)` with `u` and `v` rescaled by a common phase factor.
    """
    tf_s, tf_u, tf_v = linalg_ops.svd(
        tf_a, compute_uv=True, full_matrices=True)
    # Singular vectors are only unique up to an arbitrary phase. We normalize
    # the vectors such that the first component of u (if m >=n) or v
    # (if n > m) have phase 0.
    m = tf_a.shape[-2]
    n = tf_a.shape[-1]
    reference = tf_u if m >= n else tf_v
    top_rows = reference[..., 0:1, :]
    if tf_u.dtype.is_complex:
        angle = -math_ops.angle(top_rows)
        phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
    else:
        phase = math_ops.sign(top_rows)
    tf_u *= phase[..., :m]
    tf_v *= phase[..., :n]
    return tf_s, tf_u, tf_v
def update_weights(self, train_op):
    """Updates the model weights.

    This function must be called on at least one worker after `minimize`.
    In distributed training this call can be omitted on non-chief workers to
    speed up training.

    Args:
      train_op: The operation returned by the `minimize` call.

    Returns:
      An Operation that updates the model weights.
    """
    weight_groups = ['sparse_features_weights', 'dense_features_weights']
    with ops.control_dependencies([train_op]):
        # Copy over unshrunk weights to user provided variables.
        copy_ops = []
        for group_name in weight_groups:
            slot_vars = self._slots['unshrunk_' + group_name]
            for var, slot_var in zip(self._variables[group_name], slot_vars):
                parts = zip(self._var_to_list(var),
                            self._var_to_list(slot_var))
                for v, sv in parts:
                    copy_ops.append(v.assign(sv))

        if not (self._symmetric_l1_regularization() > 0):
            return control_flow_ops.group(*copy_ops)

        # Apply proximal step.
        shrinkage = (self._symmetric_l1_regularization() /
                     self._symmetric_l2_regularization())
        # NOTE(review): the final group is built inside this inner scope;
        # placement reconstructed from a flattened source -- confirm.
        with ops.control_dependencies(copy_ops):
            shrink_ops = []
            for group_name in weight_groups:
                for var in self._variables[group_name]:
                    for v in self._var_to_list(var):
                        with ops.device(v.device):
                            v_shrunk = math_ops.sign(v) * math_ops.maximum(
                                0.0, math_ops.abs(v) - shrinkage)
                            shrink_ops.append(v.assign(v_shrunk))
            return control_flow_ops.group(*shrink_ops)
def build(self, inputs_shape):
    """Create the input kernel, recurrent kernel and bias of the cell.

    Args:
      inputs_shape: Shape of the inputs; entry 1 must have a known value.

    Raises:
      ValueError: If `inputs_shape[1].value` is None.
    """
    if inputs_shape[1].value is None:
        raise ValueError(
            "Expected inputs.shape[-1] to be known, saw shape: %s"
            % inputs_shape)
    depth = inputs_shape[1].value
    units = self._num_units

    if self._input_initializer is None:
        self._input_initializer = init_ops.random_normal_initializer(
            mean=0.0, stddev=0.001)
    self._input_kernel = self.add_variable(
        "input_kernel",
        shape=[depth, units],
        initializer=self._input_initializer)

    if self._recurrent_initializer is None:
        self._recurrent_initializer = init_ops.constant_initializer(1.)
    self._recurrent_kernel = self.add_variable(
        "recurrent_kernel",
        shape=[units],
        initializer=self._recurrent_initializer)

    # Clip the absolute values of the recurrent weights to the specified
    # minimum
    if self._recurrent_min_abs:
        clipped_abs = math_ops.maximum(
            math_ops.abs(self._recurrent_kernel), self._recurrent_min_abs)
        self._recurrent_kernel = math_ops.multiply(
            math_ops.sign(self._recurrent_kernel), clipped_abs)

    # Clip the absolute values of the recurrent weights to the specified
    # maximum
    if self._recurrent_max_abs:
        limit = self._recurrent_max_abs
        self._recurrent_kernel = clip_ops.clip_by_value(
            self._recurrent_kernel, -limit, limit)

    self._bias = self.add_variable(
        "bias",
        shape=[units],
        initializer=init_ops.zeros_initializer(dtype=self.dtype))

    self.built = True
def _NormalizingSvd(tf_a):
    """Computes the SVD of `tf_a` with a canonical phase for the vectors.

    Singular vectors are only unique up to an arbitrary phase, so both factors
    are rescaled such that the first row of `u` (when m >= n) or of `v` (when
    n > m) has phase 0. `full_matrices_` is a free variable resolved from the
    surrounding scope.

    Args:
      tf_a: Tensor whose last two dimensions form the matrices to decompose.

    Returns:
      Tuple `(s, u, v)` of singular values and normalized singular vectors.
    """
    sing_vals, left_vecs, right_vecs = linalg_ops.svd(
        tf_a, compute_uv=True, full_matrices=full_matrices_)

    m = tf_a.shape[-2]
    n = tf_a.shape[-1]
    reference = left_vecs if m >= n else right_vecs
    top_rows = reference[..., 0:1, :]

    if left_vecs.dtype.is_complex:
        # exp(-i * arg(top_rows)) rotates the reference row onto phase 0.
        theta = -math_ops.angle(top_rows)
        phase = math_ops.complex(math_ops.cos(theta), math_ops.sin(theta))
    else:
        phase = math_ops.sign(top_rows)

    left_vecs *= phase[..., :m]
    right_vecs *= phase[..., :n]
    return sing_vals, left_vecs, right_vecs
def __call__(self, shape, dtype=None, partition_info=None):
    """Builds a kernel that is zero except for an orthogonal centre slice.

    Args:
      shape: Shape of the kernel; must have between three and five dimensions,
        with `shape[-2] <= shape[-1]`.
      dtype: Element type; falls back to `self.dtype` when `None`.
      partition_info: Unused.

    Returns:
      A tensor of the requested shape.

    Raises:
      ValueError: If the rank is outside [3, 5] or `shape[-2] > shape[-1]`.
    """
    if dtype is None:
        dtype = self.dtype

    rank = len(shape)
    if not 3 <= rank <= 5:
        raise ValueError("The tensor to initialize must be at least "
                         "three-dimensional and at most five-dimensional")
    if shape[-2] > shape[-1]:
        raise ValueError("In_filters cannot be greater than out_filters.")

    # Random square matrix whose QR factorization yields the orthogonal part.
    a = random_ops.random_normal([shape[-1], shape[-1]],
                                 dtype=dtype, seed=self.seed)
    if context.executing_eagerly():
        with ops.device("cpu:0"):  # TODO(b/73102536)
            q, r = gen_linalg_ops.qr(a, full_matrices=False)
    else:
        q, r = gen_linalg_ops.qr(a, full_matrices=False)

    # Make Q uniform by fixing the signs with the diagonal of R, then keep
    # the first `shape[-2]` rows and apply the gain.
    q *= math_ops.sign(array_ops.diag_part(r))
    q = q[:shape[-2], :]
    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))

    # Place q at the centre index of the leading `rank - 2` dimensions.
    center = [(shape[axis] - 1) // 2 for axis in range(rank - 2)]
    return array_ops.scatter_nd([center], array_ops.expand_dims(q, 0), shape)
def __call__(self, shape, dtype=dtypes.float32, **kwargs):
    """Returns a tensor initialized from the Q factor of a random matrix.

    Args:
      shape: Shape of the tensor; must have at least two dimensions.
      dtype: Optional dtype of the tensor. Only floating point types are
        supported.
      **kwargs: Additional keyword arguments.

    Raises:
      ValueError: If the dtype is not floating point or the input shape is not
        valid.
    """
    self._validate_kwargs(kwargs, support_partition=False)
    dtype = _assert_float_dtype(dtype)

    if len(shape) < 2:
        raise ValueError(
            "The tensor to initialize, specified by argument `shape`"
            " must be at least two-dimensional. Received shape="
            f"{shape}")

    # Collapse every leading dimension into rows; the last dimension stays as
    # the column count so the result also works for conv2d kernels.
    num_cols = shape[-1]
    num_rows = 1
    for extent in shape[:-1]:
        num_rows *= extent
    long_side = max(num_cols, num_rows)
    short_side = min(num_cols, num_rows)

    # QR-factorize a random (long_side, short_side) matrix.
    gaussian = self._random_generator.random_normal(
        (long_side, short_side), dtype=dtype)
    q, r = gen_linalg_ops.qr(gaussian, full_matrices=False)

    # Make Q uniform by fixing the signs with the diagonal of R.
    q *= math_ops.sign(array_ops.diag_part(r))
    if num_rows < num_cols:
        q = array_ops.matrix_transpose(q)
    return self.gain * array_ops.reshape(q, shape)
def testDispatchForUnaryElementwiseAPIs(self):
    """Checks a unary-elementwise dispatch handler registered for MaskedTensor."""

    @dispatch.dispatch_for_unary_elementwise_apis(MaskedTensor)
    def unary_elementwise_api_handler(api_func, x):
        return MaskedTensor(api_func(x.values), x.mask)

    try:
        x = MaskedTensor([1, -2, -3], [True, True, False])

        # Exercise positional, keyword and mixed argument styles.
        abs_x = math_ops.abs(x)
        sign_x = math_ops.sign(x=x)
        neg_x = math_ops.negative(x, "neg_x")
        invert_x = bitwise_ops.invert(x, name="invert_x")
        ones_like_x = array_ops.ones_like(x, name="ones_like_x")
        ones_like_x_float = array_ops.ones_like(
            x, dtypes.float32, name="ones_like_x_float")

        expected_values = [
            (abs_x, [1, 2, 3]),
            (sign_x, [1, -1, -1]),
            (neg_x, [-1, 2, 3]),
            (invert_x, [-2, 1, 2]),
            (ones_like_x, [1, 1, 1]),
            (ones_like_x_float, [1., 1., 1.]),
        ]
        for result, expected in expected_values:
            self.assertAllEqual(result.values, expected)
        # The handler passes the mask through unchanged.
        for result, _ in expected_values:
            self.assertAllEqual(result.mask, [True, True, False])

        # Names are not defined in eager mode.
        if not context.executing_eagerly():
            expected_names = [
                (neg_x, r"^neg_x/Neg:.*"),
                (invert_x, r"^invert_x/.*"),
                (ones_like_x, r"^ones_like_x/.*"),
                (ones_like_x_float, r"^ones_like_x_float/.*"),
            ]
            for result, pattern in expected_names:
                self.assertRegex(result.values.name, pattern)
    finally:
        dispatch.unregister_elementwise_api_handler(
            unary_elementwise_api_handler)
def random_sign_uniform(shape,
                        minval=None,
                        maxval=None,
                        dtype=dtypes.float32,
                        seed=None):
    """Tensor with (possibly complex) random entries from a "sign Uniform".

    With `Z` equal to `-1` or `1` with equal probability, samples are
    distributed like

    ```
    Z * X,        where X ~ Uniform[minval, maxval],    if dtype is real,
    Z * (X + iY), where X, Y ~ Uniform[minval, maxval], if dtype is complex.
    ```

    Args:
      shape: `TensorShape` or Python list. Shape of the returned tensor.
      minval: `0-D` `Tensor` giving the minimum values.
      maxval: `0-D` `Tensor` giving the maximum values.
      dtype: `TensorFlow` `dtype` or Python dtype.
      seed: Python integer seed for the RNG.

    Returns:
      `Tensor` with desired shape and dtype.
    """
    dtype = dtypes.as_dtype(dtype)
    with ops.name_scope("random_sign_uniform"):
        magnitudes = random_uniform(
            shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
        # Offset the seed so the sign draw does not reuse the magnitude seed.
        sign_seed = seed if seed is None else seed + 12
        coin = random_ops.random_uniform(
            shape, minval=-1., maxval=1., seed=sign_seed)
        signs = math_ops.sign(coin)
        return magnitudes * math_ops.cast(signs, magnitudes.dtype)
def __call__(self, shape, dtype=None, **kwargs):
    """Returns a tensor object initialized to an orthogonal matrix.

    Args:
      shape: Shape of the tensor; must have at least two dimensions.
      dtype: Optional dtype of the tensor. Only floating point types are
        supported. If not specified, `tf.keras.backend.floatx()` is used,
        which defaults to `float32` unless configured otherwise (via
        `tf.keras.backend.set_floatx(float_dtype)`).
      **kwargs: Additional keyword arguments.

    Raises:
      ValueError: If `shape` has fewer than two dimensions.
    """
    _validate_kwargs(self.__class__.__name__, kwargs, support_partition=False)
    dtype = _assert_float_dtype(_get_dtype(dtype))

    if len(shape) < 2:
        raise ValueError('The tensor to initialize must be '
                         'at least two-dimensional')

    # Collapse every leading dimension into rows; the last dimension stays as
    # the column count so the result also works for conv2d kernels.
    num_cols = shape[-1]
    num_rows = 1
    for extent in shape[:-1]:
        num_rows *= extent
    long_side = max(num_cols, num_rows)
    short_side = min(num_cols, num_rows)

    # QR-factorize a random (long_side, short_side) matrix.
    gaussian = self._random_generator.random_normal(
        (long_side, short_side), dtype=dtype)
    q, r = gen_linalg_ops.qr(gaussian, full_matrices=False)

    # Make Q uniform by fixing the signs with the diagonal of R.
    q *= math_ops.sign(array_ops.tensor_diag_part(r))
    if num_rows < num_cols:
        q = array_ops.matrix_transpose(q)
    return self.gain * array_ops.reshape(q, shape)
def sample(self, n, seed=None, name="sample"):
    """Draws `n` observations from the Laplace distribution(s).

    Args:
      n: `Scalar`, type int32, the number of observations to sample.
      seed: Python integer, the random seed.
      name: The name to give this op.

    Returns:
      samples: `[n, ...]`, a `Tensor` of `n` samples for each of the
        distributions determined by broadcasting the parameters.
    """
    with ops.name_scope(self.name):
        with ops.op_scope([self._loc, self._scale, n], name):
            n = ops.convert_to_tensor(n)
            static_n = tensor_util.constant_value(n)
            sample_shape = array_ops.concat(
                0, [array_ops.pack([n]), self.batch_shape()])

            # Uniform draws on the open interval (-1, 1): the lower bound is
            # nudged one representable step above -1.
            np_dtype = self.dtype.as_numpy_dtype
            lower = np.nextafter(np_dtype(-1.), np_dtype(0.))
            draws = random_ops.random_uniform(
                shape=sample_shape,
                minval=lower,
                maxval=np_dtype(1.),
                dtype=self.dtype,
                seed=seed)

            # Give shape inference the statically known part of the shape.
            static_shape = tensor_shape.vector(static_n).concatenate(
                self.get_batch_shape())
            draws.set_shape(static_shape)

            scaled_sign = self._scale * math_ops.sign(draws)
            return (self._loc -
                    scaled_sign * math_ops.log(1. - math_ops.abs(draws)))
def _ComplexAbsGrad(op, grad):
    """Returns the gradient of ComplexAbs.

    Args:
      op: The forward op; `op.inputs[0]` is its input.
      grad: Incoming gradient with respect to the op's output.

    Returns:
      `grad` promoted to complex, multiplied by the sign of the input.
    """
    # TODO(b/27786104): The cast to complex could be removed once arithmetic
    # supports mixtures of complex64 and real values.
    complex_grad = math_ops.complex(grad, array_ops.zeros_like(grad))
    return complex_grad * math_ops.sign(op.inputs[0])
def _ComplexAbsGrad(op, grad):
    """Returns the gradient of ComplexAbs.

    The real-valued `grad` is lifted to a complex tensor with zero imaginary
    part and multiplied by `sign(op.inputs[0])`.
    """
    # TODO(b/27786104): The cast to complex could be removed once arithmetic
    # supports mixtures of complex64 and real values.
    lifted_grad = math_ops.complex(grad, array_ops.zeros_like(grad))
    input_sign = math_ops.sign(op.inputs[0])
    return lifted_grad * input_sign
def _BesselI0eGrad(op, grad):
    """Compute gradient of bessel_i0e(x) with respect to its argument.

    Args:
      op: The forward op; its first input is `x` and first output is `y`.
      grad: Incoming gradient with respect to `y`.

    Returns:
      `grad * (bessel_i1e(x) - sign(x) * y)`.
    """
    x = op.inputs[0]
    y = op.outputs[0]
    # Build the derivative only after the incoming gradient is available.
    with ops.control_dependencies([grad]):
        derivative = math_ops.bessel_i1e(x) - math_ops.sign(x) * y
        return grad * derivative
def indicator(x):
    """Maps the sign of the product over the last axis of `x` into [0, 1].

    Returns 1.0 where the product is positive, 0.0 where it is negative and
    0.5 where it is exactly zero.
    """
    last_axis_product = math_ops.reduce_prod(x, reduction_indices=[-1])
    product_sign = math_ops.sign(last_axis_product)
    return 0.5 * (product_sign + 1.0)
def reduce_weighted_logsumexp(
    logx, w=None, axis=None, keep_dims=False, return_sign=False, name=None):
    """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

    If all weights `w` are known to be positive, it is more efficient to use
    `reduce_logsumexp` directly, i.e., `tf.reduce_logsumexp(logx + tf.log(w))`
    is more efficient than `du.reduce_weighted_logsumexp(logx, w)`.

    Reduces `logx` along the dimensions given in `axis`. Unless `keep_dims` is
    true, the rank of the tensor is reduced by 1 for each entry in `axis`. If
    `keep_dims` is true, the reduced dimensions are retained with length 1.
    If `axis` has no entries, all dimensions are reduced and a tensor with a
    single element is returned.

    This function is more numerically stable than `log(sum(w * exp(input)))`:
    it avoids overflows caused by taking the exp of large inputs and
    underflows caused by taking the log of small inputs.

    For example:

    ```python
    x = tf.constant([[0., 0, 0],
                     [0, 0, 0]])

    w = tf.constant([[-1., 1, 1],
                     [1, 1, 1]])

    du.reduce_weighted_logsumexp(x, w)
    # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

    du.reduce_weighted_logsumexp(x, w, axis=0)
    # ==> [log(-1+1), log(1+1), log(1+1)]

    du.reduce_weighted_logsumexp(x, w, axis=1)
    # ==> [log(-1+1+1), log(1+1+1)]

    du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
    # ==> [[log(-1+1+1)], [log(1+1+1)]]

    du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
    # ==> log(-1+5)
    ```

    Args:
      logx: The tensor to reduce. Should have numeric type.
      w: The weight tensor. Should have numeric type identical to `logx`.
      axis: The dimensions to reduce. If `None` (the default), reduces all
        dimensions. Must be in the range
        `[-rank(input_tensor), rank(input_tensor))`.
      keep_dims: If true, retains reduced dimensions with length 1.
      return_sign: If `True`, returns the sign of the result.
      name: A name for the operation (optional).

    Returns:
      lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
      sign: (Optional) The sign of `sum(weight * exp(x))`.
    """
    with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
        logx = ops.convert_to_tensor(logx, name="logx")

        # Unweighted case: an ordinary logsumexp whose sum is always positive.
        if w is None:
            lswe = math_ops.reduce_logsumexp(
                logx, axis=axis, keep_dims=keep_dims)
            if not return_sign:
                return lswe
            return lswe, array_ops.ones_like(lswe)

        w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
        log_abs_terms = logx + math_ops.log(math_ops.abs(w))
        shift = math_ops.reduce_max(log_abs_terms, axis=axis, keep_dims=True)
        # If the largest element is `-inf` or `inf` we skip the shift, since
        # subtracting it would give `inf - inf = NaN`. Any shift is valid as
        # long as it is added back after taking the `log(sum(...))`.
        shift = array_ops.where(
            math_ops.is_inf(shift), array_ops.zeros_like(shift), shift)

        signed_terms = math_ops.sign(w) * math_ops.exp(log_abs_terms - shift)
        signed_sum = math_ops.reduce_sum(
            signed_terms, axis=axis, keep_dims=keep_dims)
        if not keep_dims:
            shift = array_ops.squeeze(shift, axis)

        sgn = math_ops.sign(signed_sum)
        # sgn * signed_sum is the absolute value of the shifted sum.
        lswe = shift + math_ops.log(sgn * signed_sum)
        if not return_sign:
            return lswe
        return lswe, sgn
def _cdf(self, x):
    """Evaluates `0.5 + 0.5 * sign(z) * (1 - exp(-|z|))` with `z = self._z(x)`."""
    z = self._z(x)
    half_sign = 0.5 * math_ops.sign(z)
    tail = 1. - math_ops.exp(-math_ops.abs(z))
    return 0.5 + half_sign * tail
def _AbsGrad(op, grad):
    """Returns `grad` multiplied by the sign of the op's first input."""
    input_sign = math_ops.sign(op.inputs[0])
    return grad * input_sign
def build(self, input_shape):
    """Builds the layer.

    Creates the variables for the network modeling the densities, creates the
    auxiliary loss estimating the median and tail quantiles of the densities,
    and then uses that to create the probability mass functions and the update
    op that produces the discrete cumulative density functions used by the
    range coder.

    Args:
      input_shape: Shape of the input tensor, used to get the number of
        channels.

    Raises:
      ValueError: if `input_shape` doesn't specify the length of the channel
        dimension.
    """
    input_shape = tensor_shape.TensorShape(input_shape)
    channel_axis = self._channel_axis(input_shape.ndims)
    channels = input_shape[channel_axis].value
    if channels is None:
        raise ValueError("The channel dimension of the inputs must be defined.")
    self.input_spec = base_layer.InputSpec(
        ndim=input_shape.ndims, axes={channel_axis: channels})
    # Pad the configured filter sizes with 1 at both ends; consecutive entries
    # give the trailing two dimensions of each matrix created below.
    filters = (1,) + self.filters + (1,)
    scale = self.init_scale ** (1 / (len(self.filters) + 1))

    # Create variables.
    self._matrices = []
    self._biases = []
    self._factors = []
    for i in range(len(self.filters) + 1):
        # Constant initial value chosen so that softplus(init) equals
        # 1 / scale / filters[i + 1].
        init = np.log(np.expm1(1 / scale / filters[i + 1]))
        matrix = self.add_variable(
            "matrix_{}".format(i), dtype=self.dtype,
            shape=(channels, filters[i + 1], filters[i]),
            initializer=init_ops.Constant(init))
        # The stored tensor is the softplus of the variable (always positive).
        matrix = nn.softplus(matrix)
        self._matrices.append(matrix)

        bias = self.add_variable(
            "bias_{}".format(i), dtype=self.dtype,
            shape=(channels, filters[i + 1], 1),
            initializer=init_ops.RandomUniform(-.5, .5))
        self._biases.append(bias)

        # A factor is created for every layer except the last one; the stored
        # tensor is the tanh of the variable (bounded to (-1, 1)).
        if i < len(self.filters):
            factor = self.add_variable(
                "factor_{}".format(i), dtype=self.dtype,
                shape=(channels, filters[i + 1], 1),
                initializer=init_ops.Zeros())
            factor = math_ops.tanh(factor)
            self._factors.append(factor)

    # To figure out what range of the densities to sample, we need to compute
    # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we
    # can't take inverses of the cumulative directly, we make it an
    # optimization problem:
    # `quantiles = argmin(|logit(cumulative) - target|)`
    # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`.
    # Taking the logit (inverse of sigmoid) of the cumulative makes the
    # representation of the right target more numerically stable.

    # Numerically stable way of computing logits of `tail_mass / 2`
    # and `1 - tail_mass / 2`.
    target = np.log(2 / self.tail_mass - 1)
    # Compute lower and upper tail quantile as well as median.
    target = constant_op.constant([-target, 0, target], dtype=self.dtype)

    def quantiles_initializer(shape, dtype=None, partition_info=None):
        # Initializes every channel's quantiles to
        # [-init_scale, 0, init_scale].
        del partition_info  # unused
        assert tuple(shape[1:]) == (1, 3)
        init = constant_op.constant(
            [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype)
        return array_ops.tile(init, (shape[0], 1, 1))

    quantiles = self.add_variable(
        "quantiles", shape=(channels, 1, 3), dtype=self.dtype,
        initializer=quantiles_initializer)
    logits = self._logits_cumulative(quantiles, stop_gradient=True)
    # Auxiliary loss: absolute distance between the logits at the current
    # quantile estimates and their targets.
    loss = math_ops.reduce_sum(abs(logits - target))
    self.add_loss(loss, inputs=None)

    # Save medians for `call`, `compress`, and `decompress`.
    self._medians = quantiles[:, :, 1:2]
    if not self.optimize_integer_offset:
        self._medians = math_ops.round(self._medians)

    # Largest distance observed between lower tail quantile and median,
    # or between median and upper tail quantile.
    minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1])
    maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians)
    minmax = math_ops.maximum(minima, maxima)
    minmax = math_ops.ceil(minmax)
    minmax = math_ops.maximum(minmax, 1)

    # Sample the density up to `minmax` around the median.
    samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype)
    samples += self._medians

    half = constant_op.constant(.5, dtype=self.dtype)
    # We strip the sigmoid from the end here, so we can use the special rule
    # below to only compute differences in the left tail of the sigmoid.
    # This increases numerical stability (see explanation in `call`).
    lower = self._logits_cumulative(samples - half, stop_gradient=True)
    upper = self._logits_cumulative(samples + half, stop_gradient=True)
    # Flip signs if we can move more towards the left tail of the sigmoid.
    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
    pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
    # Add tail masses to first and last bin of pmf, as we clip values for
    # compression, meaning that out-of-range values get mapped to these bins.
    pmf = array_ops.concat([
        math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]),
        pmf[:, 0, 1:-1],
        math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]),
        ], axis=-1)
    self._pmf = pmf

    cdf = coder_ops.pmf_to_quantized_cdf(
        pmf, precision=self.range_coder_precision)

    def cdf_getter(*args, **kwargs):
        # Custom getter that ignores whatever `add_variable` passes in and
        # creates the variable directly from `cdf`, without shape validation.
        del args, kwargs  # ignored
        return variable_scope.get_variable(
            "quantized_cdf", dtype=dtypes.int32, initializer=cdf,
            trainable=False, validate_shape=False, collections=())

    # Need to provide a fake shape here since add_variable insists on it.
    self._quantized_cdf = self.add_variable(
        "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32,
        getter=cdf_getter, trainable=False)

    # Registered as a layer update that reassigns the variable from `cdf`.
    update_op = state_ops.assign(
        self._quantized_cdf, cdf, validate_shape=False)
    self.add_update(update_op, inputs=None)

    super(EntropyBottleneck, self).build(input_shape)
def _cdf(self, x):
    """Evaluates `0.5 + 0.5 * sign(y) * (1 - exp(-|y| / scale))`, `y = x - loc`."""
    centered = x - self.loc
    half_sign = 0.5 * math_ops.sign(centered)
    tail = 1. - math_ops.exp(-math_ops.abs(centered) / self.scale)
    return 0.5 + half_sign * tail
def call(self, inputs, training):
    """Pass a tensor through the bottleneck.

    Args:
      inputs: The tensor to be passed through the bottleneck.
      training: Boolean. If `True`, returns a differentiable approximation of
        the inputs, and their likelihoods under the modeled probability
        densities. If `False`, returns the quantized inputs and their
        likelihoods under the corresponding probability mass function. These
        quantities can't be used for training, as they are not differentiable,
        but represent actual compression more closely.

    Returns:
      values: `Tensor` with the same shape as `inputs` containing the perturbed
        or quantized input values.
      likelihood: `Tensor` with the same shape as `inputs` containing the
        likelihood of `values` under the modeled probability distributions.

    Raises:
      ValueError: if `inputs` has different `dtype` or number of channels than
        a previous set of inputs the model was invoked with earlier.
    """
    inputs = ops.convert_to_tensor(inputs)
    ndim = self.input_spec.ndim
    channel_axis = self._channel_axis(ndim)
    half = constant_op.constant(.5, dtype=self.dtype)

    # Convert to (channels, 1, batch) format by commuting channels to front
    # and then collapsing.
    order = list(range(ndim))
    order.pop(channel_axis)
    order.insert(0, channel_axis)
    values = array_ops.transpose(inputs, order)
    # Remember the transposed (uncollapsed) shape so it can be restored below.
    shape = array_ops.shape(values)
    values = array_ops.reshape(values, (shape[0], 1, -1))

    # Add noise or quantize.
    if training:
        # Uniform noise in [-0.5, 0.5) stands in for rounding.
        noise = random_ops.random_uniform(array_ops.shape(values), -half, half)
        values = math_ops.add_n([values, noise])
    elif self.optimize_integer_offset:
        # Round relative to the medians rather than to the integer grid.
        values = math_ops.round(values - self._medians) + self._medians
    else:
        values = math_ops.round(values)

    # Evaluate densities.
    # We can use the special rule below to only compute differences in the left
    # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1
    # for large x, 0 for small x. Subtracting two numbers close to 0 can be done
    # with much higher precision than subtracting two numbers close to 1.
    lower = self._logits_cumulative(values - half, stop_gradient=False)
    upper = self._logits_cumulative(values + half, stop_gradient=False)
    # Flip signs if we can move more towards the left tail of the sigmoid.
    sign = -math_ops.sign(math_ops.add_n([lower, upper]))
    # The sign is treated as a constant when differentiating.
    sign = array_ops.stop_gradient(sign)
    likelihood = abs(
        math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower))
    # Optionally bound the likelihood from below by `likelihood_bound`.
    if self.likelihood_bound > 0:
        likelihood_bound = constant_op.constant(
            self.likelihood_bound, dtype=self.dtype)
        # TODO(jballe): Override gradients.
        likelihood = math_ops.maximum(likelihood, likelihood_bound)

    # Convert back to input tensor shape.
    # `order` is now the inverse of the permutation applied above.
    order = list(range(1, ndim))
    order.insert(channel_axis, 0)
    values = array_ops.reshape(values, shape)
    values = array_ops.transpose(values, order)
    likelihood = array_ops.reshape(likelihood, shape)
    likelihood = array_ops.transpose(likelihood, order)

    # Outside eager execution, set the static shapes reported by
    # `compute_output_shape` on both outputs.
    if not context.executing_eagerly():
        values_shape, likelihood_shape = self.compute_output_shape(inputs.shape)
        values.set_shape(values_shape)
        likelihood.set_shape(likelihood_shape)

    return values, likelihood