Example #1
    def call(self, inputs, weights=None):
        # If we are not weighting the inputs we can immediately reduce the data
        # and return it.
        if weights is None:
            return get_reduce_op(self.reduction)(inputs, axis=self.axis)

        # TODO(momernick): Add checks for this and a decent error message if the
        # weight shape isn't compatible.
        if weights.shape.rank + 1 == inputs.shape.rank:
            weights = tf.compat.v1.expand_dims(weights, -1)

        weighted_inputs = tf.multiply(inputs, weights)

        # Weighted sum and prod can be expressed as reductions over the weighted
        # values, as can min and max.
        if self.reduction in ("sum", "prod", "min", "max"):
            return get_reduce_op(self.reduction)(weighted_inputs,
                                                 axis=self.axis)

        # Weighted mean is a bit more complicated: we have to do a sum of the
        # weighted values and divide by the sum of the weights.
        if self.reduction == "mean":
            input_sum = tf.reduce_sum(weighted_inputs, axis=self.axis)
            weight_sum = tf.reduce_sum(weights, axis=self.axis)
            return tf.divide(input_sum, weight_sum)

        # sqrtn is also more complicated: it's like mean but with a normalized
        # divisor.
        if self.reduction == "sqrtn":
            logging.warning(
                "Reduction `sqrtn` is deprecated and will be removed "
                "2021-01-01. Please use the `sum` reduction and divide "
                "the output by the normalized weights instead.")
            input_sum = tf.reduce_sum(weighted_inputs, axis=self.axis)
            squared_weights = tf.pow(weights, 2)
            squared_weights_sum = tf.reduce_sum(squared_weights,
                                                axis=self.axis)
            sqrt_weights_sum = tf.sqrt(squared_weights_sum)
            return tf.divide(input_sum, sqrt_weights_sum)

        raise ValueError("%s is not a supported weighted reduction." %
                         self.reduction)
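
As a standalone illustration (values below are made up, not part of the layer), the weighted `mean` and `sqrtn` reductions described in the comments come down to a few plain TensorFlow ops:

import tensorflow as tf

inputs = tf.constant([[1., 2., 3.], [4., 5., 6.]])
weights = tf.constant([[1., 1., 2.], [0.5, 0.5, 1.]])
weighted = tf.multiply(inputs, weights)

# Weighted mean: sum of weighted values divided by the sum of the weights.
mean = tf.reduce_sum(weighted, axis=-1) / tf.reduce_sum(weights, axis=-1)

# sqrtn: like mean, but the divisor is the L2 norm of the weights.
sqrtn = tf.reduce_sum(weighted, axis=-1) / tf.sqrt(
    tf.reduce_sum(tf.square(weights), axis=-1))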
Example #2
    def _apply_noisy_update(self, mom, grad, var, indices=None):
        # Compute and apply the gradient update following
        # preconditioned Langevin dynamics
        stddev = tf.where(
            tf.squeeze(self.iterations > tf.cast(self._burnin, tf.int64)),
            tf.cast(tf.math.rsqrt(self._learning_rate), grad.dtype),
            tf.zeros([], grad.dtype))
        # Keep an exponentially weighted moving average of squared gradients.
        # Not thread safe
        decay_tensor = tf.cast(self._decay_tensor, grad.dtype)
        new_mom = decay_tensor * mom + (1. - decay_tensor) * tf.square(grad)
        preconditioner = tf.math.rsqrt(
            new_mom + tf.cast(self._diagonal_bias, grad.dtype))

        # Compute gradients of the preconditioner.
        # Note: Since the preconditioner depends indirectly on `var` through `grad`,
        # in Eager mode, `diag_jacobian` would need access to the loss function.
        # This is the only blocker to supporting Eager mode for the SGLD optimizer.
        _, preconditioner_grads = diag_jacobian(
            xs=var,
            ys=preconditioner,
            parallel_iterations=self._parallel_iterations)

        mean = 0.5 * (
            preconditioner * grad * tf.cast(self._data_size, grad.dtype) -
            preconditioner_grads[0])
        stddev *= tf.sqrt(preconditioner)
        result_shape = tf.broadcast_dynamic_shape(tf.shape(mean),
                                                  tf.shape(stddev))

        update_ops = []
        if indices is None:
            update_ops.append(mom.assign(new_mom))
        else:
            update_ops.append(
                self._resource_scatter_update(mom, indices, new_mom))

        with tf.control_dependencies(update_ops):
            return tf.random.normal(shape=result_shape,
                                    mean=mean,
                                    stddev=stddev,
                                    dtype=grad.dtype)
Example #3
    def build_subnetwork(self,
                         features,
                         logits_dimension,
                         training,
                         iteration_step,
                         summary,
                         previous_ensemble=None):
        """See `adanet.subnetwork.Builder`."""

        input_layer = tf.compat.v1.feature_column.input_layer(
            features=features, feature_columns=self._feature_columns)
        last_layer = input_layer
        for _ in range(self._num_layers):
            last_layer = tf.compat.v1.layers.dense(
                last_layer,
                units=self._layer_size,
                activation=tf.nn.relu,
                kernel_initializer=tf.compat.v1.glorot_uniform_initializer(
                    seed=self._seed))
            last_layer = tf.compat.v1.layers.dropout(last_layer,
                                                     rate=self._dropout,
                                                     seed=self._seed,
                                                     training=training)
        logits = tf.compat.v1.layers.dense(
            last_layer,
            units=logits_dimension,
            kernel_initializer=tf.compat.v1.glorot_uniform_initializer(
                seed=self._seed))

        # Approximate the Rademacher complexity of this subnetwork as the square-
        # root of its depth.
        complexity = tf.sqrt(tf.cast(self._num_layers, dtype=tf.float32))

        with tf.name_scope(""):
            summary.scalar("complexity", complexity)
            summary.scalar("num_layers", self._num_layers)

        shared = {_NUM_LAYERS_KEY: self._num_layers}
        return adanet.Subnetwork(last_layer=last_layer,
                                 logits=logits,
                                 complexity=complexity,
                                 shared=shared)
Example #4
 def testCovarianceFromSampling(self):
     # We will test mean, cov, var, stddev on a Multinomial constructed via
     # broadcast between theta, n.
     theta = np.array([[1., 2, 3], [2.5, 4, 0.01]], dtype=np.float32)
     theta /= np.sum(theta, 1)[..., tf.newaxis]
     n = np.array([[10., 9.], [8., 7.], [6., 5.]], dtype=np.float32)
     # batch_shape=[3, 2], event_shape=[3]
     dist = tfd.Multinomial(n, theta)
     x = dist.sample(int(1000e3), seed=test_util.test_seed())
     sample_mean = tf.reduce_mean(x, axis=0)
     x_centered = x - sample_mean[tf.newaxis, ...]
     sample_cov = tf.reduce_mean(tf.matmul(x_centered[..., tf.newaxis],
                                           x_centered[..., tf.newaxis, :]),
                                 axis=0)
     sample_var = tf.linalg.diag_part(sample_cov)
     sample_stddev = tf.sqrt(sample_var)
     [
         sample_mean_,
         sample_cov_,
         sample_var_,
         sample_stddev_,
         analytic_mean,
         analytic_cov,
         analytic_var,
         analytic_stddev,
     ] = self.evaluate([
         sample_mean,
         sample_cov,
         sample_var,
         sample_stddev,
         dist.mean(),
         dist.covariance(),
         dist.variance(),
         dist.stddev(),
     ])
     self.assertAllClose(sample_mean_, analytic_mean, atol=0.01, rtol=0.01)
     self.assertAllClose(sample_cov_, analytic_cov, atol=0.01, rtol=0.01)
     self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
     self.assertAllClose(sample_stddev_,
                         analytic_stddev,
                         atol=0.01,
                         rtol=0.01)
Example #5
    def represent(self, waves):
        """Transform waves into a representation suited for the DS2 encoder."""
        waves = tf.squeeze(waves, -1)

        # Re-scale.
        waves = waves / (tf.reduce_max(tf.abs(waves), axis=1, keepdims=True) +
                         1e-5)
        waves *= 32767
        # To match PSF the following line should be uncommented. But it's not
        # supported by TPUs.
        # waves = tf.cast(tf.cast(waves, tf.int16), waves.dtype)  # Matching PSF.

        # Determine frame and step sizes.
        window_size = int(self.sample_freq * self.window_size)
        window_step = int(self.sample_freq * self.window_step)

        # Compute STFT.
        fft_window = tf.signal.hann_window(window_size,
                                           periodic=False,
                                           dtype=waves.dtype)
        fft_window = tf.reshape(fft_window, [1, 1, window_size])

        frames = tf.signal.frame(waves, window_size, window_step, True)
        # Do the slow DFT matmul because window size generally will not be a power
        # of 2.
        dft_w = scipy.linalg.dft(window_size).astype(np.complex64)
        stft = tf.matmul(tf.cast(fft_window * frames, dft_w.dtype), dft_w)
        mag = tf.abs(stft) / float(window_size)
        mag = tf.where(tf.less_equal(mag, 1e-30),
                       tf.ones_like(mag) * 1e-30, mag)
        log_mag = 10. * tf.math.log(mag) / tf.math.log(10.)

        # Select features and standardize.
        features = log_mag[Ellipsis, :self.num_features]

        counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
            features, axes=[1, 2], keepdims=True)
        mean, variance = tf.nn.normalize_moments(counts, means_ss, variance_ss,
                                                 None)
        features = (features - mean) / tf.sqrt(variance)

        return features
Example #6
 def call(self, inputs):
     if self.coeffs_mean is None and self.coeffs_precision_tril_op is None:
         # p(mean(ynew) | xnew) = Normal(ynew | mean = 0, variance = xnew xnew^T)
         predictive_mean = 0.
         predictive_variance = tf.reduce_sum(tf.square(inputs), axis=-1)
     else:
         # p(mean(ynew) | xnew, x, y) = Normal(ynew |
         #   mean = xnew (1/noise_variance) (1/noise_variance x^T x + I)^{-1}x^T y,
         #   variance = xnew (1/noise_variance x^T x + I)^{-1} xnew^T)
         predictive_mean = tf.einsum('nm,m->n', inputs, self.coeffs_mean)
         predictive_covariance = tf.matmul(
             inputs,
             self.coeffs_precision_tril_op.solve(
                 self.coeffs_precision_tril_op.solve(inputs,
                                                     adjoint_arg=True),
                 adjoint=True))
         predictive_variance = tf.linalg.tensor_diag_part(
             predictive_covariance)
     return generated_random_variables.Normal(
         loc=predictive_mean, scale=tf.sqrt(predictive_variance))
Example #7
    def __call__(self, x):
        """Computes regularization given an input ed.RandomVariable."""
        if not isinstance(x, random_variable.RandomVariable):
            raise ValueError('Input must be an ed.RandomVariable.')
        # variance = (tr( sigma_q + mu_q mu_q^T ) + 2*beta) / (omega + 2*alpha + 2)
        trace_covariance = tf.reduce_sum(x.distribution.variance())
        trace_mean_outer_product = tf.reduce_sum(x.distribution.mean()**2)
        num_weights = tf.cast(tf.reduce_prod(x.shape), x.dtype)
        variance = ((trace_covariance + trace_mean_outer_product) +
                    2. * self.variance_scale)
        variance /= num_weights + 2. * self.variance_concentration + 2.
        self.stddev = tf.sqrt(variance)

        variance_prior = generated_random_variables.InverseGamma(
            self.variance_concentration, self.variance_scale)
        regularization = super(NormalEmpiricalBayesKLDivergence,
                               self).__call__(x)
        regularization -= (self.scale_factor *
                           variance_prior.distribution.log_prob(variance))
        return regularization
Example #8
def subexpd(global_step,
            start_step,
            end_step,
            start_val,
            end_val,
            warmup=True,
            stair=True):
    """Sub-exponential decay function. Duration decay is sqrt(decay)."""
    if warmup and start_step == 0:
        return lerp(global_step, start_step, end_step, start_val, end_val)
    decay_steps = tf.cast(end_step - start_step, tf.float32)
    decay_factor = tf.cast(end_val, tf.float32)
    d_decay_factor = tf.cast(tf.sqrt(decay_factor), tf.float32)
    step = tf.cast(global_step - start_step, tf.float32)
    return subexpd_np(step,
                      decay_steps,
                      start_val,
                      d_decay_factor,
                      decay_factor,
                      stair=stair)
Example #9
def vector_size_to_square_matrix_size(d, validate_args, name=None):
  """Convert a vector size to a matrix size."""
  if isinstance(d, (float, int, np.generic, np.ndarray)):
    n = (-1 + np.sqrt(1 + 8 * d)) / 2.
    if float(int(n)) != n:
      raise ValueError('Vector length {} is not a triangular number.'.format(d))
    return int(n)
  else:
    with tf.name_scope(name or 'vector_size_to_square_matrix_size') as name:
      n = (-1. + tf.sqrt(1 + 8. * tf.cast(d, dtype=tf.float32))) / 2.
      if validate_args:
        with tf.control_dependencies([
            tf.debugging.Assert(
                tf.math.equal(
                    tf.cast(tf.cast(n, dtype=tf.int32), dtype=tf.float32), n),
                data=['Vector length is not a triangular number: ', d]
            )
        ]):
          n = tf.identity(n)
      return tf.cast(n, d.dtype)
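
A quick check (standalone, illustrative values) of the triangular-number formula used above: a length-`d` vector fills the lower triangle of an `n x n` matrix exactly when `d = n (n + 1) / 2`, and `n = (-1 + sqrt(1 + 8 d)) / 2` inverts that relation.

import numpy as np

for n in (1, 2, 3, 10):
    d = n * (n + 1) // 2                      # e.g. n = 3  ->  d = 6
    recovered = (-1. + np.sqrt(1. + 8. * d)) / 2.
    assert int(recovered) == n                # the formula recovers n exactly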
Example #10
 def posterior_jd():
     observation_noise_variance = yield InverseGammaWithSampleUpperBound(
         concentration=(
             self.observation_noise_variance_posterior_concentration),
         scale=sampler_state.observation_noise_variance_posterior_scale,
         upper_bound=self.observation_noise_variance_upper_bound,
         name='observation_noise_variance')
     yield MVNPrecisionFactorHardZeros(
         loc=sampler_state.conditional_weights_mean,
         # Note that the posterior precision varies inversely with the
         # noise variance: in worlds with high noise we're also
         # more uncertain about the values of the weights.
         # TODO(colcarroll): Tests pass even without a square root on the
         # observation_noise_variance. Should add a test that would fail.
         precision_factor=tf.linalg.LinearOperatorLowerTriangular(
             sampler_state.conditional_posterior_precision_chol /
             tf.sqrt(observation_noise_variance[..., tf.newaxis,
                                                tf.newaxis])),
         nonzeros=sampler_state.nonzeros,
         name='weights')
Example #11
    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(NonFusedAdam, self)._prepare_local(var_device, var_dtype,
                                                 apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype))
        beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
              (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
        apply_state[(var_device, var_dtype)].update(
            dict(lr=lr,
                 epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                 beta_1_t=beta_1_t,
                 beta_1_power=beta_1_power,
                 one_minus_beta_1_t=1 - beta_1_t,
                 beta_2_t=beta_2_t,
                 beta_2_power=beta_2_power,
                 one_minus_beta_2_t=1 - beta_2_t))
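
A small numeric sketch (hyperparameter values are illustrative, not from the optimizer above) of the bias-corrected step size `lr_t * sqrt(1 - beta_2^t) / (1 - beta_1^t)` assembled in `_prepare_local`:

import tensorflow as tf

base_lr, beta_1, beta_2 = 0.001, 0.9, 0.999
local_step = tf.constant(1.0)
beta_1_power = tf.pow(beta_1, local_step)
beta_2_power = tf.pow(beta_2, local_step)
lr = base_lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
# At step 1 this is 0.001 * sqrt(0.001) / 0.1 ~= 3.16e-4; it approaches
# base_lr as both powers decay toward zero at later steps.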
Example #12
def normalize_op(x, norm_type='layer', eps=1e-5):
    """Apply either Group, Instance, or Layer normalization, or None."""
    if norm_type is not None:
        # mb, h, w, ch
        x_shape = tf.shape(x)

        n_groups = {
            'instance': x_shape[-1],
            'layer': 1,
            'group': 32
        }[norm_type]
        x = tf.reshape(
            x,
            tf.concat([x_shape[:-1], [n_groups, x_shape[-1] // n_groups]],
                      axis=0))

        mean, var = tf.nn.moments(x, [1, 2, 4], keepdims=True)
        x = (x - mean) / tf.sqrt(var + eps)
        x = tf.reshape(x, x_shape)
    return x
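
A hedged usage sketch of the `normalize_op` defined above (the input shape is arbitrary; for `'group'` the channel count must be divisible by 32, as the reshape assumes):

import tensorflow as tf

x = tf.random.normal([2, 8, 8, 64])       # mb, h, w, ch
y = normalize_op(x, norm_type='group')    # 64 channels -> 32 groups of 2
z = normalize_op(x, norm_type='layer')    # one group spanning all channels
tf.debugging.assert_equal(tf.shape(y), tf.shape(x))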
Example #13
  def update_step(self, grad, variable):
    """Update step given gradient and the associated model variable."""
    if self._var_key(variable) not in self._index_dict:
      raise KeyError(f'Optimizer cannot recognize variable {variable.name}, '
                     f'this usually means you are calling an optimizer '
                     f'previously used on a different model. Please try '
                     f'creating a new optimizer instance.')
    lr = tf.cast(self.learning_rate, variable.dtype)

    var_key = self._var_key(variable)
    accumulator = self._accumulators[self._index_dict[var_key]]

    if isinstance(grad, tf.IndexedSlices):
      # Sparse gradients.
      accumulator.scatter_add(
          tf.IndexedSlices(grad.values * grad.values, grad.indices))
    else:
      # Dense gradients.
      accumulator.assign_add(grad * grad)
    variable.assign_sub(lr * grad / tf.sqrt(accumulator + self.epsilon))
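
The dense branch above is the standard Adagrad rule; a tiny standalone NumPy sketch of the same update (made-up values), accumulating squared gradients and dividing the step by their square root:

import numpy as np

var, accumulator, lr, epsilon = 1.0, 0.0, 0.1, 1e-7
for grad in (0.5, 0.3, -0.2):
    accumulator += grad * grad                          # accumulator.assign_add
    var -= lr * grad / np.sqrt(accumulator + epsilon)   # variable.assign_sub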
Example #14
      def log_volatility_noncentered_fn(white_noise_shock_scale,
                                        persistence_of_volatility):
        """Noncentered parameterization of log_volatility random variable."""
        # The non-centered parameterization for log_volatility improves geometry
        # but is slower (catastrophically so if FFT is not used).
        std_log_volatility = yield root(
            tfd.Sample(
                tfd.Normal(0., 1.),
                num_timesteps,
                name='std_log_volatility',
            ))

        if use_fft:
          return (
              white_noise_shock_scale[..., tf.newaxis] *
              _fft_conv_center(std_log_volatility, persistence_of_volatility))
        else:
          log_volatility = (
              std_log_volatility * white_noise_shock_scale[..., tf.newaxis])

          log_volatility_0 = (
              log_volatility[..., 0] /
              tf.sqrt(1 - persistence_of_volatility**2))

          # Make the time axis be first, for scan to work.
          log_volatility = distribution_util.move_dimension(
              log_volatility, -1, 0)
          # I.e.
          # log_volatility[t] += (persistence_of_volatility *
          #     log_volatility[t-1])
          log_volatility = tf.concat(
              [
                  log_volatility_0[tf.newaxis],
                  tf.scan(
                      lambda v_prev, v: persistence_of_volatility * v_prev + v,
                      log_volatility[1:], log_volatility_0)
              ],
              axis=0,
          )

          return distribution_util.move_dimension(log_volatility, 0, -1)
Example #15
 def _sample_n(self, n, seed=None):
   # See https://en.wikipedia.org/wiki/Inverse_Gaussian_distribution or
   # https://www.jstor.org/stable/2683801
   concentration = tf.convert_to_tensor(self.concentration)
   loc = tf.convert_to_tensor(self.loc)
   chi2_seed, unif_seed = samplers.split_seed(seed, salt='inverse_gaussian')
   shape = ps.concat([[n], self._batch_shape_tensor(
       loc=loc, concentration=concentration)], axis=0)
   sampled_chi2 = tf.square(samplers.normal(
       shape, seed=chi2_seed, dtype=self.dtype))
   sampled_uniform = samplers.uniform(
       shape, seed=unif_seed, dtype=self.dtype)
   # Wikipedia defines an intermediate x with the formula
   #   x = loc + loc ** 2 * y / (2 * conc)
   #       - loc / (2 * conc) * sqrt(4 * loc * conc * y + loc ** 2 * y ** 2)
   # where y ~ N(0, 1)**2 (sampled_chi2 above) and conc is the concentration.
   # Let us write
   #   w = loc * y / (2 * conc)
   # Then we can extract the common factor in the last two terms to obtain
   #   x = loc + loc * w * (1 - sqrt(2 / w + 1))
   # Now we see that the Wikipedia formula suffers from catastrophic
   # cancellation for large w (e.g., if conc << loc).
   #
   # Fortunately, we can fix this by multiplying both sides
   # by 1 + sqrt(2 / w + 1).  We get
   #   x * (1 + sqrt(2 / w + 1)) =
   #     = loc * (1 + sqrt(2 / w + 1)) + loc * w * (1 - (2 / w + 1))
   #     = loc * (sqrt(2 / w + 1) - 1)
   # The term sqrt(2 / w + 1) + 1 no longer presents numerical
   # difficulties for large w, and sqrt(2 / w + 1) - 1 is just
   # sqrt1pm1(2 / w), which we know how to compute accurately.
   # This just leaves the matter of small w, where 2 / w may
   # overflow.  In the limit as w -> 0, x -> loc, so we just mask
   # that case.
   sqrt1pm1_arg = 4 * concentration / (loc * sampled_chi2)  # 2 / w above
   safe_sqrt1pm1_arg = tf.where(sqrt1pm1_arg < np.inf, sqrt1pm1_arg, 1.0)
   denominator = 1.0 + tf.sqrt(safe_sqrt1pm1_arg + 1.0)
   ratio = tfp_math.sqrt1pm1(safe_sqrt1pm1_arg) / denominator
   sampled = loc * tf.where(sqrt1pm1_arg < np.inf, ratio, 1.0)  # x above
   return tf.where(sampled_uniform <= loc / (loc + sampled),
                   sampled, tf.square(loc) / sampled)
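
A standalone NumPy check (with made-up `loc`, `concentration`, and chi-square draw) that the rearranged expression used above matches the Wikipedia formula while avoiding the cancellation the comments warn about:

import numpy as np

def sqrt1pm1(z):
    # sqrt(1 + z) - 1, computed without cancellation for small z.
    return np.expm1(0.5 * np.log1p(z))

loc, concentration, chi2 = 2.0, 0.5, 1.3      # chi2 stands in for sampled_chi2
w = loc * chi2 / (2. * concentration)
x_wiki = loc + loc * w * (1. - np.sqrt(2. / w + 1.))             # Wikipedia form
x_stable = loc * sqrt1pm1(2. / w) / (1. + np.sqrt(2. / w + 1.))  # form used above
np.testing.assert_allclose(x_wiki, x_stable, rtol=1e-10)
# When concentration << loc, w is huge and the Wikipedia form computes
# loc + (something ~ -loc), losing all precision; the rearranged form does not.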
Example #16
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))

        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')

        g_prime = grad / coefficients['one_minus_m_schedule_new']

        # m_t = beta1 * m + (1 - beta1) * g_t
        m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
        m_t = tf.compat.v1.assign(m,
                                  m * coefficients['beta_1_t'],
                                  use_locking=self._use_locking)

        with tf.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
            m_t_slice = tf.compat.v1.gather(m_t, indices)

        m_t_prime = m_t_slice / coefficients['one_minus_m_schedule_next']
        m_t_bar = (coefficients['one_minus_m_t'] * g_prime +
                   coefficients['m_t_1'] * m_t_prime)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
        v_t = tf.compat.v1.assign(v,
                                  v * coefficients['beta_2_t'],
                                  use_locking=self._use_locking)

        with tf.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
            v_t_slice = tf.compat.v1.gather(v_t, indices)

        v_t_prime = v_t_slice / coefficients['v_t_prime_denominator']
        v_prime_sqrt_plus_eps = tf.sqrt(v_t_prime) + coefficients['epsilon']

        var_update = self._resource_scatter_add(
            var, indices,
            coefficients['neg_lr_t'] * m_t_bar / v_prime_sqrt_plus_eps)
        return tf.group(*[var_update, m_t_bar, v_t])
Example #17
    def get_marginal_distribution(self, index_points=None):
        """Compute the marginal of this GP over function values at `index_points`.

    Args:
      index_points: `float` `Tensor` representing finite (batch of) vector(s) of
        points in the index set over which the GP is defined. Shape has the form
        `[b1, ..., bB, e, f1, ..., fF]` where `F` is the number of feature
        dimensions and must equal `kernel.feature_ndims` and `e` is the number
        (size) of index points in each batch. Ultimately this distribution
        corresponds to an `e`-dimensional multivariate normal. The batch shape
        must be broadcastable with `kernel.batch_shape` and any batch dims
        yielded by `mean_fn`.

    Returns:
      marginal: a `Normal` or `MultivariateNormalLinearOperator` distribution,
        according to whether `index_points` consists of one or many index
        points, respectively.
    """
        with self._name_and_control_scope('get_marginal_distribution'):
            # TODO(cgs): consider caching the result here, keyed on `index_points`.
            index_points = self._get_index_points(index_points)
            covariance = self._compute_covariance(index_points)
            loc = self._mean_fn(index_points)
            # If we're sure the number of index points is 1, we can just construct a
            # scalar Normal. This has computational benefits and supports things like
            # CDF that aren't otherwise straightforward to provide.
            if self._is_univariate_marginal(index_points):
                scale = tf.sqrt(covariance)
                # `loc` has a trailing 1 in the shape; squeeze it.
                loc = tf.squeeze(loc, axis=-1)
                return normal.Normal(loc=loc,
                                     scale=scale,
                                     validate_args=self._validate_args,
                                     allow_nan_stats=self._allow_nan_stats,
                                     name='marginal_distribution')
            else:
                return self._marginal_fn(loc=loc,
                                         covariance=covariance,
                                         validate_args=self._validate_args,
                                         allow_nan_stats=self._allow_nan_stats,
                                         name='marginal_distribution')
Example #18
    def call(self, inputs, global_step=None, training=None):
        # define scaling factor
        gp_feature_scale = tf.cast(tf.sqrt(2 / self.num_inducing),
                                   inputs.dtype)

        # compute random feature
        gp_inputs = inputs
        if self.normalize_input:
            gp_inputs = self._input_norm_layer(gp_inputs)

        gp_feature = self._random_feature(gp_inputs)
        if self.scale_random_features:
            gp_feature = gp_feature * gp_feature_scale

        # compute posterior center (i.e., MAP estimate) and variance.
        gp_output = self._gp_output_layer(gp_feature) + self._gp_output_bias
        gp_covmat = self._gp_cov_layer(gp_feature, training)

        if self.return_random_features:
            return gp_output, gp_covmat, gp_feature
        return gp_output, gp_covmat
Example #19
    def testSqrtWithFiniteGradsBackpropsCorrectly(self):
        # Part of implementing a tf.custom_gradient is correctly handling the
        # `grad_ys` value that is propagating back from downstream ops. This test
        # checks that we got this right, in a particular case where our sqrt
        # function is squashed between a couple of other functions.
        def f(x):
            return x**2

        def g(x):
            return util.sqrt_with_finite_grads(x)

        def h(x):
            return tf.sin(x)**2

        # We only test away from zero, since we know the values don't match there.
        xs = tf.constant(np.linspace(1e-10, 10., 100))
        _, grad_tf_sqrt = value_and_gradient(lambda xs_: f(tf.sqrt(h(xs_))),
                                             xs)
        _, grad_safe_sqrt = value_and_gradient(lambda xs_: f(g(h(xs_))), xs)
        self.assertAllClose(*self.evaluate([grad_tf_sqrt, grad_safe_sqrt]),
                            rtol=1e-10)
Example #20
def lorenz_system_prior_fn(num_timesteps,
                           innovation_scale,
                           step_size,
                           dtype=tf.float32):
    """Generative process for the Lorenz System model."""
    innovation_scale = tensor_util.convert_nonref_to_tensor(
        innovation_scale, name='innovation_scale', dtype=dtype)
    step_size = tensor_util.convert_nonref_to_tensor(step_size,
                                                     name='step_size',
                                                     dtype=dtype)
    loc = yield Root(tfd.Sample(tfd.Normal(0., 1.), sample_shape=3))
    for _ in range(num_timesteps - 1):
        x, y, z = tf.unstack(loc, axis=-1)
        dx = 10 * (y - x)
        dy = x * (28 - z) - y
        dz = x * y - 8 / 3 * z
        delta = tf.stack([dx, dy, dz], axis=-1)
        loc = yield tfd.Independent(tfd.Normal(
            loc + step_size * delta,
            tf.sqrt(step_size) * innovation_scale[..., tf.newaxis]),
                                    reinterpreted_batch_ndims=1)
Example #21
        def bijector_fn(std_log_volatility):
          """Bijector function to set up the autoregressive dependence."""
          shift = tf.concat([
              tf.zeros(std_log_volatility.shape[:-1])[..., tf.newaxis],
              persistence_of_volatility[..., tf.newaxis] *
              std_log_volatility[..., :-1]
          ], -1)

          scale0 = white_noise_shock_scale / tf.sqrt(
              1 - persistence_of_volatility**2)

          scale_rest = (
              white_noise_shock_scale[..., tf.newaxis] * tf.ones(
                  ps.concat([
                      ps.shape(white_noise_shock_scale),
                      ps.shape(std_log_volatility)[-1:] - 1
                  ], 0)))

          scale = tf.concat([scale0[..., tf.newaxis], scale_rest], -1)

          return tfb.Shift(shift)(tfb.Scale(scale))
Example #22
def mix_over_posterior_draws(means, variances):
    """Construct a predictive normal distribution that mixes over posterior draws.

  Args:
    means: float `Tensor` of shape
      `[num_posterior_draws, ..., num_timesteps]`.
    variances: float `Tensor` of shape
      `[num_posterior_draws, ..., num_timesteps]`.

  Returns:
    mixture_dist: `tfd.MixtureSameFamily(tfd.Independent(tfd.Normal))` instance
      representing a uniform mixture over the posterior samples, with
      `batch_shape = ...` and `event_shape = [num_timesteps]`.

  """
    # The inputs `means`, `variances` have shape
    #   `concat([
    #      [num_posterior_draws],
    #      sample_shape,
    #      batch_shape,
    #      [num_timesteps]])`
    # Because MixtureSameFamily mixes over the rightmost batch dimension,
    # we need to move the `num_posterior_draws` dimension to be rightmost
    # in the batch shape. This requires use of `Independent` (to preserve
    # `num_timesteps` as part of the event shape) and `move_dimension`.
    # TODO(b/120245392): enhance `MixtureSameFamily` to reduce along an
    # arbitrary axis, and eliminate `move_dimension` calls here.

    with tf.name_scope('mix_over_posterior_draws'):
        num_posterior_draws = dist_util.prefer_static_value(tf.shape(means))[0]

        component_observations = tfd.Independent(distribution=tfd.Normal(
            loc=dist_util.move_dimension(means, 0, -2),
            scale=tf.sqrt(dist_util.move_dimension(variances, 0, -2))),
                                                 reinterpreted_batch_ndims=1)

        return tfd.MixtureSameFamily(
            mixture_distribution=tfd.Categorical(logits=tf.zeros(
                [num_posterior_draws], dtype=component_observations.dtype)),
            components_distribution=component_observations)
Example #23
 def _sample_paths(self, times, num_requested_times, initial_state,
                   num_samples, random_type, seed, skip, normal_draws):
     """Returns a sample of paths from the process."""
     if normal_draws is None:
         # Normal draws needed for sampling
         normal_draws = utils.generate_mc_normal_draws(
             num_normal_draws=1,
             num_time_steps=num_requested_times,
             num_sample_paths=num_samples,
             random_type=random_type,
             seed=seed,
             dtype=self._dtype,
             skip=skip)
     else:
         # Shape [num_time_points, num_samples, dim]
         normal_draws = tf.transpose(normal_draws, [1, 0, 2])
         num_samples = tf.shape(normal_draws)[1]
         draws_dim = normal_draws.shape[2]
         if draws_dim != 1:
             raise ValueError(
                 "`dim` should be equal to `1` but is {0}".format(
                     draws_dim))
     times = tf.concat([[0], times], -1)
     mu_integral = self._integrate_parameter(self._mu, self._mu_is_constant,
                                             times[:-1], times[1:])
     sigma_sq_integral = self._integrate_parameter(self._sigma_squared,
                                                   self._sigma_is_constant,
                                                   times[:-1], times[1:])
     # The logarithm of all the increments between the times.
     log_increments = ((mu_integral - sigma_sq_integral / 2) +
                       tf.sqrt(sigma_sq_integral) *
                       tf.transpose(tf.squeeze(normal_draws, -1)))
     # Since the implementation of tf.math.cumsum is single-threaded we
     # use lower-triangular matrix multiplication instead
     once = tf.ones([num_requested_times, num_requested_times],
                    dtype=self._dtype)
     lower_triangular = tf.linalg.band_part(once, -1, 0)
     cumsum = tf.linalg.matvec(lower_triangular, log_increments)
     samples = initial_state * tf.math.exp(cumsum)
     return tf.expand_dims(samples, -1)
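
A quick standalone check of the trick used above (and in the later sampler): multiplying by a lower-triangular matrix of ones reproduces a cumulative sum of the log-increments.

import tensorflow as tf

increments = tf.constant([1., 2., 3., 4.])
num_times = 4
lower_triangular = tf.linalg.band_part(
    tf.ones([num_times, num_times]), -1, 0)
via_matvec = tf.linalg.matvec(lower_triangular, increments)   # [1., 3., 6., 10.]
tf.debugging.assert_near(via_matvec, tf.math.cumsum(increments))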
Example #24
def rayleigh(shape, scale=None, dtype=tf.float32, seed=None, name=None):
    """Generates `Tensor` of positive reals drawn from a Rayleigh distributions.

  The probability density function of a Rayleigh distribution with `scale`
  parameter is given by:

  ```none
  f(x) = x scale**-2 exp(-x**2 0.5 scale**-2)
  ```

  For more details, see [Rayleigh distribution](
  https://en.wikipedia.org/wiki/Rayleigh_distribution)

  Args:
    shape: Vector-shaped, `int` `Tensor` representing shape of output.
    scale: (Optional) Positive `float` `Tensor` representing `Rayleigh` scale.
      Default value: `None` (i.e., `scale = 1.`).
    dtype: (Optional) TF `dtype` representing `dtype` of output.
      Default value: `tf.float32`.
    seed: PRNG seed; see `tfp.random.sanitize_seed` for details.
      Default value: `None` (i.e., no seed).
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., 'random_rayleigh').

  Returns:
    rayleigh: `Tensor` with specified `shape` and `dtype` consisting of positive
      real values drawn from a Rayleigh distribution with specified `scale`.
  """
    with tf.name_scope(name or 'rayleigh'):
        if scale is not None:
            # It's important to expand the shape to match scale's; otherwise we won't
            # have independent draws.
            scale = tf.convert_to_tensor(scale, dtype=dtype, name='scale')
            shape = tf.broadcast_dynamic_shape(shape, tf.shape(scale))
        x = tf.sqrt(-2. * tf.math.log(
            samplers.uniform(shape, minval=0, maxval=1, dtype=dtype,
                             seed=seed)))
        if scale is None:
            return x
        return x * scale
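
The sampler above is inverse-transform sampling: the Rayleigh CDF is `F(x) = 1 - exp(-x**2 / (2 scale**2))`, so `x = scale * sqrt(-2 log(u))` for uniform `u` (using that `1 - u` is also uniform). A short sketch with an assumed scale:

import tensorflow as tf

scale = 2.0
u = tf.random.uniform([100000], minval=0., maxval=1.)
samples = scale * tf.sqrt(-2. * tf.math.log(u))
# The Rayleigh mean is scale * sqrt(pi / 2) ~= 2.5066 for scale = 2.
mean_estimate = tf.reduce_mean(samples)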
Example #25
def dot_product_attention(q, k, v, normalise):
  """Computes dot product attention.

  Args:
    q: queries. Tensor of shape [batch_size, m, d_k].
    k: keys. Tensor of shape [batch_size, n, d_k].
    v: values. Tensor of shape [batch_size, n, d_v].
    normalise: Boolean that determines whether weights sum to 1.

  Returns:
    Tensor of shape [batch_size, m, d_v].
  """
  d_k = tf.shape(q)[-1]
  scale = tf.sqrt(tf.cast(d_k, tf.float32))
  unnorm_weights = tf.einsum('bjk,bik->bij', k, q) / scale  # [batch_size,m,n]
  if normalise:
    weight_fn = tf.nn.softmax
  else:
    weight_fn = tf.sigmoid
  weights = weight_fn(unnorm_weights)  # [batch_size,m,n]
  rep = tf.einsum('bik,bkj->bij', weights, v)  # [batch_size,m,d_v]
  return rep
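
A hedged usage sketch of the `dot_product_attention` function above with random inputs (batch, sequence, and feature sizes are arbitrary):

import tensorflow as tf

q = tf.random.normal([4, 7, 16])    # [batch_size, m, d_k]
k = tf.random.normal([4, 9, 16])    # [batch_size, n, d_k]
v = tf.random.normal([4, 9, 32])    # [batch_size, n, d_v]
rep = dot_product_attention(q, k, v, normalise=True)
print(rep.shape)                    # (4, 7, 32)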
Example #26
    def testCovarianceFromSampling(self):
        alpha = np.array([[1., 2, 3], [2.5, 4, 0.01]], dtype=np.float32)
        dist = tfd.Dirichlet(alpha)  # batch_shape=[2], event_shape=[3]
        x = dist.sample(int(250e3), seed=test_util.test_seed())
        sample_mean = tf.reduce_mean(x, axis=0)
        x_centered = x - sample_mean[None, ...]
        sample_cov = tf.reduce_mean(tf.matmul(x_centered[..., None],
                                              x_centered[..., None, :]),
                                    axis=0)
        sample_var = tf.linalg.diag_part(sample_cov)
        sample_stddev = tf.sqrt(sample_var)

        [
            sample_mean_,
            sample_cov_,
            sample_var_,
            sample_stddev_,
            analytic_mean,
            analytic_cov,
            analytic_var,
            analytic_stddev,
        ] = self.evaluate([
            sample_mean,
            sample_cov,
            sample_var,
            sample_stddev,
            dist.mean(),
            dist.covariance(),
            dist.variance(),
            dist.stddev(),
        ])

        self.assertAllClose(sample_mean_, analytic_mean, atol=0.04, rtol=0.)
        self.assertAllClose(sample_cov_, analytic_cov, atol=0.06, rtol=0.)
        self.assertAllClose(sample_var_, analytic_var, atol=0.03, rtol=0.)
        self.assertAllClose(sample_stddev_,
                            analytic_stddev,
                            atol=0.02,
                            rtol=0.)
Example #27
def _sqrtx2p1(x):
    """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
    sqrt_eps = np.sqrt(np.finfo(dtype_util.as_numpy_dtype(x.dtype)).eps)
    return tf1.where(
        tf.abs(x) * sqrt_eps <= 1.,
        tf.sqrt(x**2. + 1.),
        # For large x, calculating x**2 can overflow. This can be alleviated by
        # considering:
        # sqrt(1 + x**2)
        # = exp(0.5 log(1 + x**2))
        # = exp(0.5 log(x**2 * (1 + x**-2)))
        # = exp(log(x) + 0.5 * log(1 + x**-2))
        # = |x| * exp(0.5 log(1 + x**-2))
        # = |x| * sqrt(1 + x**-2)
        # We omit the last term in this approximation.
        # When |x| > 1 / sqrt(machine epsilon), the second term will be 1,
        # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
        # and higher order gradients, since the first order derivative of
        # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
        # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
        # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
        tf.abs(x))
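
A brief standalone NumPy illustration of why the branch above is needed: squaring a very large `x` overflows, while returning `|x|` is accurate to machine precision there.

import numpy as np

x = np.float64(1e200)
naive = np.sqrt(1. + x**2)   # x**2 overflows to inf, so this is inf
stable = np.abs(x)           # relative error is O(x**-2), i.e. negligible
print(naive, stable)         # inf 1e+200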
Example #28
    def _sample_paths(self, times, num_requested_times, initial_state,
                      num_samples, random_type, seed, skip):
        """Returns a sample of paths from the process."""
        # Normal draws needed for sampling.
        # Shape [num_requested_times, num_samples, dim]
        normal_draws = utils.generate_mc_normal_draws(
            num_normal_draws=self._dim,
            num_time_steps=num_requested_times,
            num_sample_paths=num_samples,
            random_type=random_type,
            seed=seed,
            dtype=self._dtype,
            skip=skip)
        times = tf.concat([[0], times], -1)
        # Time increments
        # Shape [num_requested_times, 1, 1]
        dt = tf.expand_dims(tf.expand_dims(times[1:] - times[:-1], axis=-1),
                            axis=-1)
        if self._corr_matrix is None:
            stochastic_increment = normal_draws
        else:
            cholesky = tf.linalg.cholesky(self._corr_matrix)
            stochastic_increment = tf.linalg.matvec(cholesky, normal_draws)

        # The logarithm of all the increments between the times.
        # Shape [num_requested_times, num_samples, dim]
        log_increments = ((self._means - self._vols**2 / 2) * dt +
                          tf.sqrt(dt) * self._vols * stochastic_increment)

        # Since the implementation of tf.math.cumsum is single-threaded we
        # use lower-triangular matrix multiplication instead
        once = tf.ones([num_requested_times, num_requested_times],
                       dtype=self._dtype)
        lower_triangular = tf.linalg.band_part(once, -1, 0)
        cumsum = tf.linalg.matvec(lower_triangular,
                                  tf.transpose(log_increments))
        cumsum = tf.transpose(cumsum, [1, 2, 0])
        samples = initial_state * tf.math.exp(cumsum)
        return samples
Example #29
def _update_log_spot(kappa,
                     theta,
                     epsilon,
                     rho,
                     current_vol,
                     next_vol,
                     current_log_spot,
                     time_step,
                     normals,
                     gamma_1=0.5,
                     gamma_2=0.5):
    """Updates log-spot value."""
    k_0 = -rho * kappa * theta / epsilon * time_step
    k_1 = (gamma_1 * time_step * (kappa * rho / epsilon - 0.5) - rho / epsilon)
    k_2 = (gamma_2 * time_step * (kappa * rho / epsilon - 0.5) + rho / epsilon)
    k_3 = gamma_1 * time_step * (1 - rho**2)
    k_4 = gamma_2 * time_step * (1 - rho**2)

    next_log_spot = (current_log_spot + k_0 + k_1 * current_vol +
                     k_2 * next_vol +
                     tf.sqrt(k_3 * current_vol + k_4 * next_vol) * normals)
    return next_log_spot
Example #30
    def __call__(self, step):
        with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            dtype = initial_learning_rate.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            initial_variance = tf.cast(self.initial_variance, dtype)
            variance_decay = tf.cast(self.variance_decay, dtype)
            num_periods = tf.cast(self.num_periods, dtype)
            alpha = tf.cast(self.alpha, dtype)
            beta = tf.cast(self.beta, dtype)

            global_step_recomp = tf.cast(step, dtype)
            global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
            linear_decayed = (decay_steps - global_step_recomp) / decay_steps
            variance = initial_variance / (
                tf.pow(1.0 + global_step_recomp, variance_decay)
            )
            std = tf.sqrt(variance)
            noisy_linear_decayed = (
                linear_decayed
                + self._random_generator.random_normal(
                    linear_decayed.shape, stddev=std
                )
            )

            completed_fraction = global_step_recomp / decay_steps
            fraction = 2.0 * num_periods * completed_fraction
            cosine_decayed = 0.5 * (
                1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction)
            )
            noisy_linear_cosine_decayed = (
                alpha + noisy_linear_decayed
            ) * cosine_decayed + beta

            return tf.multiply(
                initial_learning_rate, noisy_linear_cosine_decayed, name=name
            )