def call(self, inputs, weights=None):
  # If we are not weighting the inputs we can immediately reduce the data
  # and return it.
  if weights is None:
    return get_reduce_op(self.reduction)(inputs, axis=self.axis)

  # TODO(momernick): Add checks for this and a decent error message if the
  # weight shape isn't compatible.
  if weights.shape.rank + 1 == inputs.shape.rank:
    weights = tf.compat.v1.expand_dims(weights, -1)

  weighted_inputs = tf.multiply(inputs, weights)

  # Weighted sum and prod can be expressed as reductions over the weighted
  # values, as can min and max.
  if self.reduction in ("sum", "prod", "min", "max"):
    return get_reduce_op(self.reduction)(weighted_inputs, axis=self.axis)

  # Weighted mean is a bit more complicated: we have to do a sum of the
  # weighted values and divide by the sum of the weights.
  if self.reduction == "mean":
    input_sum = tf.reduce_sum(weighted_inputs, axis=self.axis)
    weight_sum = tf.reduce_sum(weights, axis=self.axis)
    return tf.divide(input_sum, weight_sum)

  # sqrtn is also more complicated: it's like mean but with a normalized
  # divisor.
  if self.reduction == "sqrtn":
    logging.warning(
        "Reduction `sqrtn` is deprecated and will be removed "
        "2021-01-01. Please use the `sum` reduction and divide "
        "the output by the normalized weights instead.")
    input_sum = tf.reduce_sum(weighted_inputs, axis=self.axis)
    squared_weights = tf.pow(weights, 2)
    squared_weights_sum = tf.reduce_sum(squared_weights, axis=self.axis)
    sqrt_weights_sum = tf.sqrt(squared_weights_sum)
    return tf.divide(input_sum, sqrt_weights_sum)

  raise ValueError("%s is not a supported weighted reduction." %
                   self.reduction)
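
# A small standalone NumPy sketch (made-up values, not part of the layer above)
# of the two non-trivial reductions it implements: "mean" divides the weighted
# sum by the sum of the weights, while "sqrtn" divides it by the L2 norm of the
# weights.
import numpy as np

inputs = np.array([1.0, 2.0, 3.0])
weights = np.array([0.5, 1.0, 2.0])
weighted_sum = np.sum(inputs * weights)
print(weighted_sum / np.sum(weights))                # "mean" reduction
print(weighted_sum / np.sqrt(np.sum(weights ** 2)))  # "sqrtn" reduction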
def _apply_noisy_update(self, mom, grad, var, indices=None):
  # Compute and apply the gradient update following
  # preconditioned Langevin dynamics
  stddev = tf.where(
      tf.squeeze(self.iterations > tf.cast(self._burnin, tf.int64)),
      tf.cast(tf.math.rsqrt(self._learning_rate), grad.dtype),
      tf.zeros([], grad.dtype))
  # Keep an exponentially weighted moving average of squared gradients.
  # Not thread safe
  decay_tensor = tf.cast(self._decay_tensor, grad.dtype)
  new_mom = decay_tensor * mom + (1. - decay_tensor) * tf.square(grad)
  preconditioner = tf.math.rsqrt(
      new_mom + tf.cast(self._diagonal_bias, grad.dtype))

  # Compute gradients of the preconditioner.
  # Note: Since the preconditioner depends indirectly on `var` through `grad`,
  # in Eager mode, `diag_jacobian` would need access to the loss function.
  # This is the only blocker to supporting Eager mode for the SGLD optimizer.
  _, preconditioner_grads = diag_jacobian(
      xs=var,
      ys=preconditioner,
      parallel_iterations=self._parallel_iterations)

  mean = 0.5 * (preconditioner * grad * tf.cast(self._data_size, grad.dtype) -
                preconditioner_grads[0])
  stddev *= tf.sqrt(preconditioner)
  result_shape = tf.broadcast_dynamic_shape(tf.shape(mean), tf.shape(stddev))

  update_ops = []
  if indices is None:
    update_ops.append(mom.assign(new_mom))
  else:
    update_ops.append(self._resource_scatter_update(mom, indices, new_mom))

  with tf.control_dependencies(update_ops):
    return tf.random.normal(
        shape=result_shape, mean=mean, stddev=stddev, dtype=grad.dtype)
def build_subnetwork(self,
                     features,
                     logits_dimension,
                     training,
                     iteration_step,
                     summary,
                     previous_ensemble=None):
  """See `adanet.subnetwork.Builder`."""
  input_layer = tf.compat.v1.feature_column.input_layer(
      features=features, feature_columns=self._feature_columns)
  last_layer = input_layer
  for _ in range(self._num_layers):
    last_layer = tf.compat.v1.layers.dense(
        last_layer,
        units=self._layer_size,
        activation=tf.nn.relu,
        kernel_initializer=tf.compat.v1.glorot_uniform_initializer(
            seed=self._seed))
    last_layer = tf.compat.v1.layers.dropout(
        last_layer, rate=self._dropout, seed=self._seed, training=training)
  logits = tf.compat.v1.layers.dense(
      last_layer,
      units=logits_dimension,
      kernel_initializer=tf.compat.v1.glorot_uniform_initializer(
          seed=self._seed))

  # Approximate the Rademacher complexity of this subnetwork as the square-
  # root of its depth.
  complexity = tf.sqrt(tf.cast(self._num_layers, dtype=tf.float32))

  with tf.name_scope(""):
    summary.scalar("complexity", complexity)
    summary.scalar("num_layers", self._num_layers)

  shared = {_NUM_LAYERS_KEY: self._num_layers}
  return adanet.Subnetwork(
      last_layer=last_layer,
      logits=logits,
      complexity=complexity,
      shared=shared)
def testCovarianceFromSampling(self):
  # We will test mean, cov, var, stddev on a Multinomial constructed via
  # broadcast between alpha, n.
  theta = np.array([[1., 2, 3],
                    [2.5, 4, 0.01]], dtype=np.float32)
  theta /= np.sum(theta, 1)[..., tf.newaxis]
  n = np.array([[10., 9.], [8., 7.], [6., 5.]], dtype=np.float32)
  # batch_shape=[3, 2], event_shape=[3]
  dist = tfd.Multinomial(n, theta)
  x = dist.sample(int(1000e3), seed=test_util.test_seed())
  sample_mean = tf.reduce_mean(x, axis=0)
  x_centered = x - sample_mean[tf.newaxis, ...]
  sample_cov = tf.reduce_mean(
      tf.matmul(x_centered[..., tf.newaxis],
                x_centered[..., tf.newaxis, :]),
      axis=0)
  sample_var = tf.linalg.diag_part(sample_cov)
  sample_stddev = tf.sqrt(sample_var)
  [
      sample_mean_,
      sample_cov_,
      sample_var_,
      sample_stddev_,
      analytic_mean,
      analytic_cov,
      analytic_var,
      analytic_stddev,
  ] = self.evaluate([
      sample_mean,
      sample_cov,
      sample_var,
      sample_stddev,
      dist.mean(),
      dist.covariance(),
      dist.variance(),
      dist.stddev(),
  ])
  self.assertAllClose(sample_mean_, analytic_mean, atol=0.01, rtol=0.01)
  self.assertAllClose(sample_cov_, analytic_cov, atol=0.01, rtol=0.01)
  self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
  self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
def represent(self, waves):
  """Transform waves into a representation suited for the DS2 encoder."""
  waves = tf.squeeze(waves, -1)

  # Re-scale.
  waves = waves / (tf.reduce_max(tf.abs(waves), axis=1, keepdims=True) + 1e-5)
  waves *= 32767
  # To match PSF the following line should be uncommented. But it's not
  # supported by TPUs.
  # waves = tf.cast(tf.cast(waves, tf.int16), waves.dtype)  # Matching PSF.

  # Determine frame and step sizes.
  window_size = int(self.sample_freq * self.window_size)
  window_step = int(self.sample_freq * self.window_step)

  # Compute STFT.
  fft_window = tf.signal.hann_window(
      window_size, periodic=False, dtype=waves.dtype)
  fft_window = tf.reshape(fft_window, [1, 1, window_size])
  frames = tf.signal.frame(waves, window_size, window_step, True)

  # Do the slow DFT matmul because window size generally will not be a power
  # of 2.
  dft_w = scipy.linalg.dft(window_size).astype(np.complex64)
  stft = tf.matmul(tf.cast(fft_window * frames, dft_w.dtype), dft_w)
  mag = tf.abs(stft) / float(window_size)
  mag = tf.where(tf.less_equal(mag, 1e-30), tf.ones_like(mag) * 1e-30, mag)
  log_mag = 10. * tf.math.log(mag) / tf.math.log(10.)

  # Select features and standardize.
  features = log_mag[Ellipsis, :self.num_features]
  counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
      features, axes=[1, 2], keepdims=True)
  mean, variance = tf.nn.normalize_moments(counts, means_ss, variance_ss,
                                           None)
  features = (features - mean) / tf.sqrt(variance)
  return features
def call(self, inputs):
  if self.coeffs_mean is None and self.coeffs_precision_tril_op is None:
    # p(mean(ynew) | xnew) = Normal(ynew | mean = 0, variance = xnew xnew^T)
    predictive_mean = 0.
    predictive_variance = tf.reduce_sum(tf.square(inputs), axis=-1)
  else:
    # p(mean(ynew) | xnew, x, y) = Normal(ynew |
    #   mean = xnew (1/noise_variance) (1/noise_variance x^T x + I)^{-1}x^T y,
    #   variance = xnew (1/noise_variance x^T x + I)^{-1} xnew^T)
    predictive_mean = tf.einsum('nm,m->n', inputs, self.coeffs_mean)
    predictive_covariance = tf.matmul(
        inputs,
        self.coeffs_precision_tril_op.solve(
            self.coeffs_precision_tril_op.solve(inputs, adjoint_arg=True),
            adjoint=True))
    predictive_variance = tf.linalg.tensor_diag_part(predictive_covariance)
  return generated_random_variables.Normal(
      loc=predictive_mean, scale=tf.sqrt(predictive_variance))
def __call__(self, x):
  """Computes regularization given an input ed.RandomVariable."""
  if not isinstance(x, random_variable.RandomVariable):
    raise ValueError('Input must be an ed.RandomVariable.')
  # variance = (tr( sigma_q + mu_q mu_q^T ) + 2*beta) / (omega + 2*alpha + 2)
  trace_covariance = tf.reduce_sum(x.distribution.variance())
  trace_mean_outer_product = tf.reduce_sum(x.distribution.mean()**2)
  num_weights = tf.cast(tf.reduce_prod(x.shape), x.dtype)
  variance = ((trace_covariance + trace_mean_outer_product) +
              2. * self.variance_scale)
  variance /= num_weights + 2. * self.variance_concentration + 2.
  self.stddev = tf.sqrt(variance)

  variance_prior = generated_random_variables.InverseGamma(
      self.variance_concentration, self.variance_scale)
  regularization = super(NormalEmpiricalBayesKLDivergence, self).__call__(x)
  regularization -= (self.scale_factor *
                     variance_prior.distribution.log_prob(variance))
  return regularization
def subexpd(global_step,
            start_step,
            end_step,
            start_val,
            end_val,
            warmup=True,
            stair=True):
  """Sub-exponential decay function. Duration decay is sqrt(decay)."""
  if warmup and start_step == 0:
    return lerp(global_step, start_step, end_step, start_val, end_val)
  decay_steps = tf.cast(end_step - start_step, tf.float32)
  decay_factor = tf.cast(end_val, tf.float32)
  d_decay_factor = tf.cast(tf.sqrt(decay_factor), tf.float32)
  step = tf.cast(global_step - start_step, tf.float32)
  return subexpd_np(
      step, decay_steps, start_val, d_decay_factor, decay_factor, stair=stair)
def vector_size_to_square_matrix_size(d, validate_args, name=None):
  """Convert a vector size to a matrix size."""
  if isinstance(d, (float, int, np.generic, np.ndarray)):
    n = (-1 + np.sqrt(1 + 8 * d)) / 2.
    if float(int(n)) != n:
      raise ValueError(
          'Vector length {} is not a triangular number.'.format(d))
    return int(n)
  else:
    with tf.name_scope(name or 'vector_size_to_square_matrix_size') as name:
      n = (-1. + tf.sqrt(1 + 8. * tf.cast(d, dtype=tf.float32))) / 2.
      if validate_args:
        with tf.control_dependencies([
            tf.debugging.Assert(
                tf.math.equal(
                    tf.cast(tf.cast(n, dtype=tf.int32), dtype=tf.float32), n),
                data=['Vector length is not a triangular number: ', d])
        ]):
          n = tf.identity(n)
      return tf.cast(n, d.dtype)
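
# A minimal standalone sketch (not part of the library code above) of the
# triangular-number inversion used by `vector_size_to_square_matrix_size`: a
# length-d vector fills the lower triangle of an n x n matrix when
# d = n * (n + 1) / 2, so n = (-1 + sqrt(1 + 8 * d)) / 2.
import numpy as np

for n in range(1, 6):
  d = n * (n + 1) // 2           # number of lower-triangular entries
  recovered = (-1 + np.sqrt(1 + 8 * d)) / 2.
  assert int(recovered) == n     # the formula recovers the matrix size exactly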
def posterior_jd():
  observation_noise_variance = yield InverseGammaWithSampleUpperBound(
      concentration=(
          self.observation_noise_variance_posterior_concentration),
      scale=sampler_state.observation_noise_variance_posterior_scale,
      upper_bound=self.observation_noise_variance_upper_bound,
      name='observation_noise_variance')
  yield MVNPrecisionFactorHardZeros(
      loc=sampler_state.conditional_weights_mean,
      # Note that the posterior precision varies inversely with the
      # noise variance: in worlds with high noise we're also
      # more uncertain about the values of the weights.
      # TODO(colcarroll): Tests pass even without a square root on the
      # observation_noise_variance. Should add a test that would fail.
      precision_factor=tf.linalg.LinearOperatorLowerTriangular(
          sampler_state.conditional_posterior_precision_chol /
          tf.sqrt(observation_noise_variance[..., tf.newaxis, tf.newaxis])),
      nonzeros=sampler_state.nonzeros,
      name='weights')
def _prepare_local(self, var_device, var_dtype, apply_state):
  super(NonFusedAdam, self)._prepare_local(var_device, var_dtype, apply_state)

  local_step = tf.cast(self.iterations + 1, var_dtype)
  beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
  beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
  beta_1_power = tf.pow(beta_1_t, local_step)
  beta_2_power = tf.pow(beta_2_t, local_step)
  lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
        (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
  apply_state[(var_device, var_dtype)].update(
      dict(
          lr=lr,
          epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
          beta_1_t=beta_1_t,
          beta_1_power=beta_1_power,
          one_minus_beta_1_t=1 - beta_1_t,
          beta_2_t=beta_2_t,
          beta_2_power=beta_2_power,
          one_minus_beta_2_t=1 - beta_2_t))
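
# A small NumPy sketch (illustrative constants only) of the bias-corrected
# step size computed above: lr_t = lr * sqrt(1 - beta_2**t) / (1 - beta_1**t),
# which rescales the base rate early in training and approaches lr as the
# beta powers decay toward zero.
import numpy as np

lr, beta_1, beta_2 = 0.001, 0.9, 0.999
for t in [1, 10, 1000]:
  lr_t = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
  print(t, lr_t)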
def normalize_op(x, norm_type='layer', eps=1e-5):
  """Apply either Group, Instance, or Layer normalization, or None."""
  if norm_type is not None:
    # mb, h, w, ch
    x_shape = tf.shape(x)
    n_groups = {
        'instance': x_shape[-1],
        'layer': 1,
        'group': 32
    }[norm_type]
    x = tf.reshape(
        x,
        tf.concat([x_shape[:-1], [n_groups, x_shape[-1] // n_groups]],
                  axis=0))
    mean, var = tf.nn.moments(x, [1, 2, 4], keepdims=True)
    x = (x - mean) / tf.sqrt(var + eps)
    x = tf.reshape(x, x_shape)
  return x
def update_step(self, grad, variable):
  """Update step given gradient and the associated model variable."""
  if self._var_key(variable) not in self._index_dict:
    raise KeyError(f'Optimizer cannot recognize variable {variable.name}, '
                   f'this usually means you are calling an optimizer '
                   f'previously used on a different model. Please try '
                   f'creating a new optimizer instance.')
  lr = tf.cast(self.learning_rate, variable.dtype)

  var_key = self._var_key(variable)
  accumulator = self._accumulators[self._index_dict[var_key]]

  if isinstance(grad, tf.IndexedSlices):
    # Sparse gradients.
    accumulator.scatter_add(
        tf.IndexedSlices(grad.values * grad.values, grad.indices))
  else:
    # Dense gradients.
    accumulator.assign_add(grad * grad)
  variable.assign_sub(lr * grad / tf.sqrt(accumulator + self.epsilon))
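
# A minimal NumPy sketch (made-up values, independent of the optimizer class
# above) of the dense Adagrad rule it applies: the accumulator collects squared
# gradients, so the effective per-coordinate step shrinks as sqrt(accumulator)
# grows.
import numpy as np

lr, epsilon = 0.1, 1e-7
var = np.array([1.0, -2.0])
accumulator = np.zeros_like(var)
for grad in [np.array([0.5, -0.5]), np.array([0.25, 0.1])]:
  accumulator += grad * grad
  var -= lr * grad / np.sqrt(accumulator + epsilon)
print(var)  # each coordinate moves opposite its gradient, by a shrinking step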
def log_volatility_noncentered_fn(white_noise_shock_scale,
                                  persistence_of_volatility):
  """Noncentered parameterization of log_volatility random variable."""
  # The non-centered parameterization for log_volatility improves geometry
  # but is slower (catastrophically so if FFT is not used).
  std_log_volatility = yield root(
      tfd.Sample(
          tfd.Normal(0., 1.),
          num_timesteps,
          name='std_log_volatility',
      ))
  if use_fft:
    return (white_noise_shock_scale[..., tf.newaxis] *
            _fft_conv_center(std_log_volatility, persistence_of_volatility))
  else:
    log_volatility = (
        std_log_volatility * white_noise_shock_scale[..., tf.newaxis])

    log_volatility_0 = (
        log_volatility[..., 0] /
        tf.sqrt(1 - persistence_of_volatility**2))

    # Make the time axis be first, for scan to work.
    log_volatility = distribution_util.move_dimension(log_volatility, -1, 0)
    # I.e.
    # log_volatility[t] += (persistence_of_volatility *
    #                       log_volatility[t-1])
    log_volatility = tf.concat(
        [
            log_volatility_0[tf.newaxis],
            tf.scan(
                lambda v_prev, v: persistence_of_volatility * v_prev + v,
                log_volatility[1:], log_volatility_0)
        ],
        axis=0,
    )

    return distribution_util.move_dimension(log_volatility, 0, -1)
def _sample_n(self, n, seed=None):
  # See https://en.wikipedia.org/wiki/Inverse_Gaussian_distribution or
  # https://www.jstor.org/stable/2683801
  concentration = tf.convert_to_tensor(self.concentration)
  loc = tf.convert_to_tensor(self.loc)
  chi2_seed, unif_seed = samplers.split_seed(seed, salt='inverse_gaussian')
  shape = ps.concat([[n], self._batch_shape_tensor(
      loc=loc, concentration=concentration)], axis=0)
  sampled_chi2 = tf.square(samplers.normal(
      shape, seed=chi2_seed, dtype=self.dtype))
  sampled_uniform = samplers.uniform(
      shape, seed=unif_seed, dtype=self.dtype)
  # Wikipedia defines an intermediate x with the formula
  #   x = loc + loc ** 2 * y / (2 * conc)
  #       - loc / (2 * conc) * sqrt(4 * loc * conc * y + loc ** 2 * y ** 2)
  # where y ~ N(0, 1)**2 (sampled_chi2 above) and conc is the concentration.
  # Let us write
  #   w = loc * y / (2 * conc)
  # Then we can extract the common factor in the last two terms to obtain
  #   x = loc + loc * w * (1 - sqrt(2 / w + 1))
  # Now we see that the Wikipedia formula suffers from catastrophic
  # cancellation for large w (e.g., if conc << loc).
  #
  # Fortunately, we can fix this by multiplying both sides
  # by 1 + sqrt(2 / w + 1).  We get
  #   x * (1 + sqrt(2 / w + 1)) =
  #     = loc * (1 + sqrt(2 / w + 1)) + loc * w * (1 - (2 / w + 1))
  #     = loc * (sqrt(2 / w + 1) - 1)
  # The term sqrt(2 / w + 1) + 1 no longer presents numerical
  # difficulties for large w, and sqrt(2 / w + 1) - 1 is just
  # sqrt1pm1(2 / w), which we know how to compute accurately.
  # This just leaves the matter of small w, where 2 / w may
  # overflow.  In the limit as w -> 0, x -> loc, so we just mask
  # that case.
  sqrt1pm1_arg = 4 * concentration / (loc * sampled_chi2)  # 2 / w above
  safe_sqrt1pm1_arg = tf.where(sqrt1pm1_arg < np.inf, sqrt1pm1_arg, 1.0)
  denominator = 1.0 + tf.sqrt(safe_sqrt1pm1_arg + 1.0)
  ratio = tfp_math.sqrt1pm1(safe_sqrt1pm1_arg) / denominator
  sampled = loc * tf.where(sqrt1pm1_arg < np.inf, ratio, 1.0)  # x above
  return tf.where(sampled_uniform <= loc / (loc + sampled),
                  sampled, tf.square(loc) / sampled)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
  var_device, var_dtype = var.device, var.dtype.base_dtype
  coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                  self._fallback_apply_state(var_device, var_dtype))

  m = self.get_slot(var, 'm')
  v = self.get_slot(var, 'v')

  g_prime = grad / coefficients['one_minus_m_schedule_new']

  # m_t = beta1 * m + (1 - beta1) * g_t
  m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
  m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'],
                            use_locking=self._use_locking)

  with tf.control_dependencies([m_t]):
    m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
    m_t_slice = tf.compat.v1.gather(m_t, indices)

  m_t_prime = m_t_slice / coefficients['one_minus_m_schedule_next']
  m_t_bar = (coefficients['one_minus_m_t'] * g_prime +
             coefficients['m_t_1'] * m_t_prime)

  # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
  v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
  v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'],
                            use_locking=self._use_locking)

  with tf.control_dependencies([v_t]):
    v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
    v_t_slice = tf.compat.v1.gather(v_t, indices)

  v_t_prime = v_t_slice / coefficients['v_t_prime_denominator']
  v_prime_sqrt_plus_eps = tf.sqrt(v_t_prime) + coefficients['epsilon']

  var_update = self._resource_scatter_add(
      var, indices,
      coefficients['neg_lr_t'] * m_t_bar / v_prime_sqrt_plus_eps)
  return tf.group(*[var_update, m_t_bar, v_t])
def get_marginal_distribution(self, index_points=None):
  """Compute the marginal of this GP over function values at `index_points`.

  Args:
    index_points: `float` `Tensor` representing finite (batch of) vector(s) of
      points in the index set over which the GP is defined. Shape has the
      form `[b1, ..., bB, e, f1, ..., fF]` where `F` is the number of feature
      dimensions and must equal `kernel.feature_ndims` and `e` is the number
      (size) of index points in each batch. Ultimately this distribution
      corresponds to a `e`-dimensional multivariate normal. The batch shape
      must be broadcastable with `kernel.batch_shape` and any batch dims
      yielded by `mean_fn`.

  Returns:
    marginal: a `Normal` or `MultivariateNormalLinearOperator` distribution,
      according to whether `index_points` consists of one or many index
      points, respectively.
  """
  with self._name_and_control_scope('get_marginal_distribution'):
    # TODO(cgs): consider caching the result here, keyed on `index_points`.
    index_points = self._get_index_points(index_points)
    covariance = self._compute_covariance(index_points)
    loc = self._mean_fn(index_points)
    # If we're sure the number of index points is 1, we can just construct a
    # scalar Normal. This has computational benefits and supports things like
    # CDF that aren't otherwise straightforward to provide.
    if self._is_univariate_marginal(index_points):
      scale = tf.sqrt(covariance)
      # `loc` has a trailing 1 in the shape; squeeze it.
      loc = tf.squeeze(loc, axis=-1)
      return normal.Normal(
          loc=loc,
          scale=scale,
          validate_args=self._validate_args,
          allow_nan_stats=self._allow_nan_stats,
          name='marginal_distribution')
    else:
      return self._marginal_fn(
          loc=loc,
          covariance=covariance,
          validate_args=self._validate_args,
          allow_nan_stats=self._allow_nan_stats,
          name='marginal_distribution')
def call(self, inputs, global_step=None, training=None):
  # define scaling factor
  gp_feature_scale = tf.cast(tf.sqrt(2 / self.num_inducing), inputs.dtype)

  # compute random feature
  gp_inputs = inputs
  if self.normalize_input:
    gp_inputs = self._input_norm_layer(gp_inputs)

  gp_feature = self._random_feature(gp_inputs)

  if self.scale_random_features:
    gp_feature = gp_feature * gp_feature_scale

  # compute posterior center (i.e., MAP estimate) and variance.
  gp_output = self._gp_output_layer(gp_feature) + self._gp_output_bias
  gp_covmat = self._gp_cov_layer(gp_feature, training)

  if self.return_random_features:
    return gp_output, gp_covmat, gp_feature
  return gp_output, gp_covmat
def testSqrtWithFiniteGradsBackpropsCorrectly(self):
  # Part of implementing a tf.custom_gradient is correctly handling the
  # `grad_ys` value that is propagating back from downstream ops. This test
  # checks that we got this right, in a particular case where our sqrt
  # function is squashed between a couple of other functions.
  def f(x):
    return x**2

  def g(x):
    return util.sqrt_with_finite_grads(x)

  def h(x):
    return tf.sin(x)**2

  # We only test away from zero, since we know the values don't match there.
  xs = tf.constant(np.linspace(1e-10, 10., 100))
  _, grad_tf_sqrt = value_and_gradient(lambda xs_: f(tf.sqrt(h(xs_))), xs)
  _, grad_safe_sqrt = value_and_gradient(lambda xs_: f(g(h(xs_))), xs)
  self.assertAllClose(*self.evaluate([grad_tf_sqrt, grad_safe_sqrt]),
                      rtol=1e-10)
def lorenz_system_prior_fn(num_timesteps,
                           innovation_scale,
                           step_size,
                           dtype=tf.float32):
  """Generative process for the Lorenz System model."""
  innovation_scale = tensor_util.convert_nonref_to_tensor(
      innovation_scale, name='innovation_scale', dtype=dtype)
  step_size = tensor_util.convert_nonref_to_tensor(
      step_size, name='step_size', dtype=dtype)
  loc = yield Root(tfd.Sample(tfd.Normal(0., 1.), sample_shape=3))
  for _ in range(num_timesteps - 1):
    x, y, z = tf.unstack(loc, axis=-1)
    dx = 10 * (y - x)
    dy = x * (28 - z) - y
    dz = x * y - 8 / 3 * z
    delta = tf.stack([dx, dy, dz], axis=-1)
    loc = yield tfd.Independent(
        tfd.Normal(loc + step_size * delta,
                   tf.sqrt(step_size) * innovation_scale[..., tf.newaxis]),
        reinterpreted_batch_ndims=1)
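
# A minimal NumPy sketch (made-up state and step size) of the deterministic
# part of one transition above: an Euler step of the classic Lorenz drift with
# the fixed parameters sigma=10, rho=28, beta=8/3 used in the prior. The prior
# then adds Gaussian noise with scale sqrt(step_size) * innovation_scale.
import numpy as np

loc = np.array([1.0, 1.0, 1.0])
step_size = 0.02
x, y, z = loc
delta = np.array([10 * (y - x), x * (28 - z) - y, x * y - 8 / 3 * z])
next_loc_mean = loc + step_size * delta  # the Normal's loc at the next step
print(next_loc_mean)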
def bijector_fn(std_log_volatility):
  """Bijector function to set up the autoregressive dependence."""
  shift = tf.concat([
      tf.zeros(std_log_volatility.shape[:-1])[..., tf.newaxis],
      persistence_of_volatility[..., tf.newaxis] *
      std_log_volatility[..., :-1]
  ], -1)

  scale0 = white_noise_shock_scale / tf.sqrt(
      1 - persistence_of_volatility**2)
  scale_rest = (
      white_noise_shock_scale[..., tf.newaxis] * tf.ones(
          ps.concat([
              ps.shape(white_noise_shock_scale),
              ps.shape(std_log_volatility)[-1:] - 1
          ], 0)))
  scale = tf.concat([scale0[..., tf.newaxis], scale_rest], -1)

  return tfb.Shift(shift)(tfb.Scale(scale))
def mix_over_posterior_draws(means, variances):
  """Construct a predictive normal distribution that mixes over posterior draws.

  Args:
    means: float `Tensor` of shape
      `[num_posterior_draws, ..., num_timesteps]`.
    variances: float `Tensor` of shape
      `[num_posterior_draws, ..., num_timesteps]`.

  Returns:
    mixture_dist: `tfd.MixtureSameFamily(tfd.Independent(tfd.Normal))`
      instance representing a uniform mixture over the posterior samples,
      with `batch_shape = ...` and `event_shape = [num_timesteps]`.
  """
  # The inputs `means`, `variances` have shape
  #   `concat([
  #      [num_posterior_draws],
  #      sample_shape,
  #      batch_shape,
  #      [num_timesteps]])`
  # Because MixtureSameFamily mixes over the rightmost batch dimension,
  # we need to move the `num_posterior_draws` dimension to be rightmost
  # in the batch shape. This requires use of `Independent` (to preserve
  # `num_timesteps` as part of the event shape) and `move_dimension`.
  # TODO(b/120245392): enhance `MixtureSameFamily` to reduce along an
  # arbitrary axis, and eliminate `move_dimension` calls here.
  with tf.name_scope('mix_over_posterior_draws'):
    num_posterior_draws = dist_util.prefer_static_value(tf.shape(means))[0]

    component_observations = tfd.Independent(
        distribution=tfd.Normal(
            loc=dist_util.move_dimension(means, 0, -2),
            scale=tf.sqrt(dist_util.move_dimension(variances, 0, -2))),
        reinterpreted_batch_ndims=1)

    return tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(
            logits=tf.zeros([num_posterior_draws],
                            dtype=component_observations.dtype)),
        components_distribution=component_observations)
def _sample_paths(self, times, num_requested_times, initial_state,
                  num_samples, random_type, seed, skip, normal_draws):
  """Returns a sample of paths from the process."""
  if normal_draws is None:
    # Normal draws needed for sampling
    normal_draws = utils.generate_mc_normal_draws(
        num_normal_draws=1,
        num_time_steps=num_requested_times,
        num_sample_paths=num_samples,
        random_type=random_type,
        seed=seed,
        dtype=self._dtype,
        skip=skip)
  else:
    # Shape [num_time_points, num_samples, dim]
    normal_draws = tf.transpose(normal_draws, [1, 0, 2])
    num_samples = tf.shape(normal_draws)[1]
    draws_dim = normal_draws.shape[2]
    if draws_dim != 1:
      raise ValueError(
          "`dim` should be equal to `1` but is {0}".format(draws_dim))
  times = tf.concat([[0], times], -1)
  mu_integral = self._integrate_parameter(
      self._mu, self._mu_is_constant, times[:-1], times[1:])
  sigma_sq_integral = self._integrate_parameter(
      self._sigma_squared, self._sigma_is_constant, times[:-1], times[1:])
  # The logarithm of all the increments between the times.
  log_increments = ((mu_integral - sigma_sq_integral / 2) +
                    tf.sqrt(sigma_sq_integral) *
                    tf.transpose(tf.squeeze(normal_draws, -1)))
  # Since the implementation of tf.math.cumsum is single-threaded we
  # use lower-triangular matrix multiplication instead
  once = tf.ones([num_requested_times, num_requested_times],
                 dtype=self._dtype)
  lower_triangular = tf.linalg.band_part(once, -1, 0)
  cumsum = tf.linalg.matvec(lower_triangular, log_increments)
  samples = initial_state * tf.math.exp(cumsum)
  return tf.expand_dims(samples, -1)
def rayleigh(shape, scale=None, dtype=tf.float32, seed=None, name=None):
  """Generates `Tensor` of positive reals drawn from a Rayleigh distribution.

  The probability density function of a Rayleigh distribution with `scale`
  parameter is given by:

  ```none
  f(x) = x scale**-2 exp(-x**2 0.5 scale**-2)
  ```

  For more details, see [Rayleigh distribution](
  https://en.wikipedia.org/wiki/Rayleigh_distribution)

  Args:
    shape: Vector-shaped, `int` `Tensor` representing shape of output.
    scale: (Optional) Positive `float` `Tensor` representing `Rayleigh` scale.
      Default value: `None` (i.e., `scale = 1.`).
    dtype: (Optional) TF `dtype` representing `dtype` of output.
      Default value: `tf.float32`.
    seed: PRNG seed; see `tfp.random.sanitize_seed` for details.
      Default value: `None` (i.e., no seed).
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., 'random_rayleigh').

  Returns:
    rayleigh: `Tensor` with specified `shape` and `dtype` consisting of
      positive real values drawn from a Rayleigh distribution with specified
      `scale`.
  """
  with tf.name_scope(name or 'rayleigh'):
    if scale is not None:
      # It's important to expand the shape to match scale's, otherwise we
      # won't have independent draws.
      scale = tf.convert_to_tensor(scale, dtype=dtype, name='scale')
      shape = tf.broadcast_dynamic_shape(shape, tf.shape(scale))
    x = tf.sqrt(-2. * tf.math.log(
        samplers.uniform(shape, minval=0, maxval=1, dtype=dtype, seed=seed)))
    if scale is None:
      return x
    return x * scale
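
# A standalone NumPy sketch (independent of the TFP code above) of the same
# inverse-CDF construction: if U ~ Uniform(0, 1), then scale * sqrt(-2 log U)
# is Rayleigh(scale), whose mean is scale * sqrt(pi / 2).
import numpy as np

rng = np.random.default_rng(0)
scale = 2.0
u = rng.uniform(size=100_000)
samples = scale * np.sqrt(-2.0 * np.log(u))
print(samples.mean(), scale * np.sqrt(np.pi / 2.0))  # sample mean ~ 2.5066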
def dot_product_attention(q, k, v, normalise):
  """Computes dot product attention.

  Args:
    q: queries. Tensor of shape [batch_size, m, d_k].
    k: keys. Tensor of shape [batch_size, n, d_k].
    v: values. Tensor of shape [batch_size, n, d_v].
    normalise: Boolean that determines whether weights sum to 1.

  Returns:
    Tensor of shape [batch_size, m, d_v].
  """
  d_k = tf.shape(q)[-1]
  scale = tf.sqrt(tf.cast(d_k, tf.float32))
  unnorm_weights = tf.einsum('bjk,bik->bij', k, q) / scale  # [batch_size,m,n]
  if normalise:
    weight_fn = tf.nn.softmax
  else:
    weight_fn = tf.sigmoid
  weights = weight_fn(unnorm_weights)  # [batch_size,m,n]
  rep = tf.einsum('bik,bkj->bij', weights, v)  # [batch_size,m,d_v]
  return rep
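
# A small usage sketch for `dot_product_attention` above (shapes are made up):
# with normalise=True the weights are a softmax over the n keys, and the
# result contains one d_v-dimensional vector per query.
import tensorflow as tf

q = tf.random.normal([4, 6, 8])    # [batch_size=4, m=6, d_k=8]
k = tf.random.normal([4, 10, 8])   # [batch_size=4, n=10, d_k=8]
v = tf.random.normal([4, 10, 16])  # [batch_size=4, n=10, d_v=16]
rep = dot_product_attention(q, k, v, normalise=True)
print(rep.shape)  # (4, 6, 16)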
def testCovarianceFromSampling(self):
  alpha = np.array([[1., 2, 3],
                    [2.5, 4, 0.01]], dtype=np.float32)
  dist = tfd.Dirichlet(alpha)  # batch_shape=[2], event_shape=[3]
  x = dist.sample(int(250e3), seed=test_util.test_seed())
  sample_mean = tf.reduce_mean(x, axis=0)
  x_centered = x - sample_mean[None, ...]
  sample_cov = tf.reduce_mean(
      tf.matmul(x_centered[..., None], x_centered[..., None, :]), axis=0)
  sample_var = tf.linalg.diag_part(sample_cov)
  sample_stddev = tf.sqrt(sample_var)
  [
      sample_mean_,
      sample_cov_,
      sample_var_,
      sample_stddev_,
      analytic_mean,
      analytic_cov,
      analytic_var,
      analytic_stddev,
  ] = self.evaluate([
      sample_mean,
      sample_cov,
      sample_var,
      sample_stddev,
      dist.mean(),
      dist.covariance(),
      dist.variance(),
      dist.stddev(),
  ])
  self.assertAllClose(sample_mean_, analytic_mean, atol=0.04, rtol=0.)
  self.assertAllClose(sample_cov_, analytic_cov, atol=0.06, rtol=0.)
  self.assertAllClose(sample_var_, analytic_var, atol=0.03, rtol=0.)
  self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.02, rtol=0.)
def _sqrtx2p1(x):
  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
  sqrt_eps = np.sqrt(np.finfo(dtype_util.as_numpy_dtype(x.dtype)).eps)
  return tf1.where(
      tf.abs(x) * sqrt_eps <= 1.,
      tf.sqrt(x**2. + 1.),
      # For large x, calculating x**2 can overflow. This can be alleviated by
      # considering:
      #   sqrt(1 + x**2)
      #   = exp(0.5 log(1 + x**2))
      #   = exp(0.5 log(x**2 * (1 + x**-2)))
      #   = exp(log(x) + 0.5 * log(1 + x**-2))
      #   = |x| * exp(0.5 log(1 + x**-2))
      #   = |x| * sqrt(1 + x**-2)
      # We omit the last term in this approximation.
      # When |x| > 1 / sqrt(machine epsilon), the second term will be 1,
      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
      # and higher order gradients, since the first order derivative of
      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
      tf.abs(x))
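
# A standalone NumPy sketch (not the TFP implementation above) of why the
# branch is needed: squaring a large float32 overflows, while |x| is already
# an accurate value for sqrt(1 + x**2) once |x| >> 1 / sqrt(machine epsilon).
import numpy as np

x = np.float32(1e20)
print(np.sqrt(x**2 + 1.0))            # inf: x**2 overflows in float32
print(np.abs(x))                      # 1e20: accurate to float32 precision
print(np.hypot(np.float32(1.0), x))   # 1e20: NumPy's own stable formulation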
def _sample_paths(self, times, num_requested_times, initial_state,
                  num_samples, random_type, seed, skip):
  """Returns a sample of paths from the process."""
  # Normal draws needed for sampling.
  # Shape [num_requested_times, num_samples, dim]
  normal_draws = utils.generate_mc_normal_draws(
      num_normal_draws=self._dim,
      num_time_steps=num_requested_times,
      num_sample_paths=num_samples,
      random_type=random_type,
      seed=seed,
      dtype=self._dtype,
      skip=skip)
  times = tf.concat([[0], times], -1)
  # Time increments
  # Shape [num_requested_times, 1, 1]
  dt = tf.expand_dims(
      tf.expand_dims(times[1:] - times[:-1], axis=-1), axis=-1)
  if self._corr_matrix is None:
    stochastic_increment = normal_draws
  else:
    cholesky = tf.linalg.cholesky(self._corr_matrix)
    stochastic_increment = tf.linalg.matvec(cholesky, normal_draws)

  # The logarithm of all the increments between the times.
  # Shape [num_requested_times, num_samples, dim]
  log_increments = ((self._means - self._vols**2 / 2) * dt +
                    tf.sqrt(dt) * self._vols * stochastic_increment)

  # Since the implementation of tf.math.cumsum is single-threaded we
  # use lower-triangular matrix multiplication instead
  once = tf.ones([num_requested_times, num_requested_times],
                 dtype=self._dtype)
  lower_triangular = tf.linalg.band_part(once, -1, 0)
  cumsum = tf.linalg.matvec(lower_triangular, tf.transpose(log_increments))
  cumsum = tf.transpose(cumsum, [1, 2, 0])
  samples = initial_state * tf.math.exp(cumsum)
  return samples
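
# A standalone NumPy sketch (made-up parameters) of the same log-increment
# recursion for a single asset: each increment is
# (mu - vol**2 / 2) * dt + vol * sqrt(dt) * Z, and the path is the initial
# state times exp(cumulative sum of increments).
import numpy as np

rng = np.random.default_rng(42)
mu, vol, dt, initial_state = 0.05, 0.2, 0.01, 100.0
z = rng.standard_normal(250)
log_increments = (mu - vol**2 / 2) * dt + vol * np.sqrt(dt) * z
path = initial_state * np.exp(np.cumsum(log_increments))
print(path[-1])  # terminal value of one simulated geometric Brownian path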
def _update_log_spot(kappa, theta, epsilon, rho,
                     current_vol, next_vol, current_log_spot, time_step,
                     normals, gamma_1=0.5, gamma_2=0.5):
  """Updates log-spot value."""
  k_0 = -rho * kappa * theta / epsilon * time_step
  k_1 = (gamma_1 * time_step *
         (kappa * rho / epsilon - 0.5) - rho / epsilon)
  k_2 = (gamma_2 * time_step *
         (kappa * rho / epsilon - 0.5) + rho / epsilon)
  k_3 = gamma_1 * time_step * (1 - rho**2)
  k_4 = gamma_2 * time_step * (1 - rho**2)

  next_log_spot = (
      current_log_spot + k_0 + k_1 * current_vol + k_2 * next_vol +
      tf.sqrt(k_3 * current_vol + k_4 * next_vol) * normals)
  return next_log_spot
def __call__(self, step):
    with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name:
        initial_learning_rate = tf.convert_to_tensor(
            self.initial_learning_rate, name="initial_learning_rate"
        )
        dtype = initial_learning_rate.dtype
        decay_steps = tf.cast(self.decay_steps, dtype)
        initial_variance = tf.cast(self.initial_variance, dtype)
        variance_decay = tf.cast(self.variance_decay, dtype)
        num_periods = tf.cast(self.num_periods, dtype)
        alpha = tf.cast(self.alpha, dtype)
        beta = tf.cast(self.beta, dtype)

        global_step_recomp = tf.cast(step, dtype)
        global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
        linear_decayed = (decay_steps - global_step_recomp) / decay_steps
        variance = initial_variance / (
            tf.pow(1.0 + global_step_recomp, variance_decay)
        )
        std = tf.sqrt(variance)
        noisy_linear_decayed = (
            linear_decayed
            + self._random_generator.random_normal(
                linear_decayed.shape, stddev=std
            )
        )

        completed_fraction = global_step_recomp / decay_steps
        fraction = 2.0 * num_periods * completed_fraction
        cosine_decayed = 0.5 * (
            1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction)
        )
        noisy_linear_cosine_decayed = (
            alpha + noisy_linear_decayed
        ) * cosine_decayed + beta

        return tf.multiply(
            initial_learning_rate, noisy_linear_cosine_decayed, name=name
        )