Example #1
def gaussian_processes(draw,
                       kernel_name=None,
                       batch_shape=None,
                       event_dim=None,
                       feature_dim=None,
                       feature_ndims=None,
                       enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  params = draw(broadcasting_params(
      'GaussianProcess',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  gp = tfd.GaussianProcess(
      kernel=k,
      index_points=index_points,
      cholesky_fn=lambda x: marginal_fns.retrying_cholesky(x)[0],
      observation_noise_variance=params['observation_noise_variance'])
  return gp
def student_t_processes(draw,
                        kernel_name=None,
                        batch_shape=None,
                        event_dim=None,
                        feature_dim=None,
                        feature_ndims=None,
                        enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  params = draw(broadcasting_params(
      'StudentTProcess',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  stp = tfd.StudentTProcess(
      kernel=k,
      index_points=index_points,
      # The Student-T Process can encounter cholesky decomposition errors,
      # so use a large jitter to avoid that.
      jitter=1e-1,
      df=params['df'])
  return stp
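
Both strategy functions above take Hypothesis's `draw` callback as their first argument, so in the full TFP test module they are presumably wrapped with `@hps.composite`, which turns them into strategies a property-based test can draw from. The following is a minimal, self-contained sketch of that pattern; it uses a toy strategy in place of the TFP-specific helpers (`kernel_hps`, `tfp_hps`, `broadcasting_params`), which are internal test utilities not shown in these snippets.

import hypothesis as hp
import hypothesis.strategies as hps

@hps.composite
def toy_processes(draw, batch_shape=None):
  # Mirrors the pattern above: draw a batch shape if the caller did not pin
  # one, then draw parameters meant to broadcast against it.
  if batch_shape is None:
    batch_shape = tuple(draw(hps.lists(hps.integers(1, 3), max_size=2)))
  noise_variance = draw(hps.floats(1e-3, 1., allow_nan=False))
  return {'batch_shape': batch_shape,
          'observation_noise_variance': noise_variance}

@hp.settings(max_examples=10, deadline=None)
@hp.given(toy_processes())
def test_toy_processes(params):
  assert params['observation_noise_variance'] > 0.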
Example #3
  def testKernelGradient(self, kernel_name, data):
    if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
      return
    event_dim = data.draw(hps.integers(min_value=2, max_value=6))
    feature_ndims = data.draw(hps.integers(min_value=1, max_value=4))
    kernel, kernel_parameter_variable_names = data.draw(
        kernels(kernel_name=kernel_name,
                event_dim=event_dim,
                feature_ndims=feature_ndims,
                enable_vars=True))

    # Check that the Variables backing the kernel parameters are reported by
    # kernel.variables.
    kernel_variables_names = [
        v.name.strip('_0123456789:') for v in kernel.variables
    ]
    self.assertEqual(set(kernel_parameter_variable_names),
                     set(kernel_variables_names))

    example_ndims = data.draw(hps.integers(min_value=1, max_value=3))
    input_batch_shape = data.draw(
        tfp_hps.broadcast_compatible_shape(kernel.batch_shape))
    xs = tf.identity(
        data.draw(
            kernel_input(batch_shape=input_batch_shape,
                         example_ndims=example_ndims,
                         feature_ndims=feature_ndims)))

    # Check that we pick up all relevant kernel parameters.
    wrt_vars = [xs] + list(kernel.variables)

    with tf.GradientTape() as tape:
      with tfp_hps.assert_no_excessive_var_usage(
          'method `apply` of {}'.format(kernel)):
        tape.watch(wrt_vars)
        diag = kernel.apply(xs, xs, example_ndims=example_ndims)
    grads = tape.gradient(diag, wrt_vars)
    assert_no_none_grad(kernel, 'apply', wrt_vars, grads)
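
`assert_no_none_grad` is defined elsewhere in the test module and is not part of the snippet. Judging from how it is called here and in the bijector tests below, a plausible minimal sketch is the following; the exact error-message format is an assumption.

def assert_no_none_grad(testee, method, wrt_vars, grads):
  # Fail if any watched input or variable received no gradient from `method`
  # of `testee` (e.g. kernel.apply, bijector.forward).
  for var, grad in zip(wrt_vars, grads):
    if grad is None:
      raise AssertionError('Missing `{}` -> {} grad for {}'.format(
          method, var, testee))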
  def testDistribution(self, dist_name, data):
    seed = test_util.test_seed()
    # Explicitly draw event_dim here to avoid relying on _params_event_ndims
    # later, so this test can support distributions that do not implement the
    # slicing protocol.
    event_dim = data.draw(hps.integers(min_value=2, max_value=6))
    dist = data.draw(dhps.distributions(
        dist_name=dist_name, event_dim=event_dim, enable_vars=True))
    batch_shape = dist.batch_shape
    batch_shape2 = data.draw(tfp_hps.broadcast_compatible_shape(batch_shape))
    dist2 = data.draw(
        dhps.distributions(
            dist_name=dist_name,
            batch_shape=batch_shape2,
            event_dim=event_dim,
            enable_vars=True))
    self.evaluate([var.initializer for var in dist.variables])

    # Check that the distribution passes Variables through to the accessor
    # properties (without converting them to Tensor or anything like that).
    for k, v in six.iteritems(dist.parameters):
      if not tensor_util.is_ref(v):
        continue
      self.assertIs(getattr(dist, k), v)

    # Check that standard statistics do not read distribution parameters more
    # than twice (once in the stat itself and up to once in any validation
    # assertions).
    max_permissible = 2 + extra_tensor_conversions_allowed(dist)
    for stat in sorted(data.draw(
        hps.sets(
            hps.one_of(
                map(hps.just, [
                    'covariance', 'entropy', 'mean', 'mode', 'stddev',
                    'variance'
                ])),
            min_size=3,
            max_size=3))):
      hp.note('Testing excessive var usage in {}.{}'.format(dist_name, stat))
      try:
        with tfp_hps.assert_no_excessive_var_usage(
            'statistic `{}` of `{}`'.format(stat, dist),
            max_permissible=max_permissible):
          getattr(dist, stat)()

      except NotImplementedError:
        pass

    # Check that `sample` doesn't read distribution parameters more than twice,
    # and that it produces non-None gradients (if the distribution is fully
    # reparameterized).
    with tf.GradientTape() as tape:
      # TDs do bijector assertions twice (once by distribution.sample, and once
      # by bijector.forward).
      max_permissible = 2 + extra_tensor_conversions_allowed(dist)
      with tfp_hps.assert_no_excessive_var_usage(
          'method `sample` of `{}`'.format(dist),
          max_permissible=max_permissible):
        sample = dist.sample(seed=seed)
    if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
      grads = tape.gradient(sample, dist.variables)
      for grad, var in zip(grads, dist.variables):
        var_name = var.name.rstrip('_0123456789:')
        if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
          continue
        if grad is None:
          raise AssertionError(
              'Missing sample -> {} grad for distribution {}'.format(
                  var_name, dist_name))

    # Turn off validations, since log_prob can choke on dist's own samples
    # (TODO(b/129271256)).  This also relaxes conversion counts for KL, which
    # might exceed 2 with validate_args.
    dist = dist.copy(validate_args=False)
    dist2 = dist2.copy(validate_args=False)

    # Test that KL divergence reads distribution parameters at most once, and
    # that it produces non-None gradients.
    try:
      for d1, d2 in (dist, dist2), (dist2, dist):
        with tf.GradientTape() as tape:
          with tfp_hps.assert_no_excessive_var_usage(
              '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'.format(
                  d1, d1.variables, d2, d2.variables),
              max_permissible=1):  # No validation => 1 convert per var.
            kl = d1.kl_divergence(d2)
        wrt_vars = list(d1.variables) + list(d2.variables)
        grads = tape.gradient(kl, wrt_vars)
        for grad, var in zip(grads, wrt_vars):
          if grad is None and dist_name not in NO_KL_PARAM_GRADS:
            raise AssertionError('Missing KL({} || {}) -> {} grad:\n'
                                 '{} vars: {}\n{} vars: {}'.format(
                                     d1, d2, var, d1, d1.variables, d2,
                                     d2.variables))
    except NotImplementedError:
      pass

    # Test that log_prob produces non-None gradients, except for distributions
    # on the NO_LOG_PROB_PARAM_GRADS blacklist.
    if dist_name not in NO_LOG_PROB_PARAM_GRADS:
      with tf.GradientTape() as tape:
        lp = dist.log_prob(tf.stop_gradient(sample))
      grads = tape.gradient(lp, dist.variables)
      for grad, var in zip(grads, dist.variables):
        if grad is None:
          raise AssertionError(
              'Missing log_prob -> {} grad for distribution {}'.format(
                  var, dist_name))

    # Test that all forms of probability evaluation avoid reading distribution
    # parameters more than once.
    for evaluative in sorted(data.draw(
        hps.sets(
            hps.one_of(
                map(hps.just, [
                    'log_prob', 'prob', 'log_cdf', 'cdf',
                    'log_survival_function', 'survival_function'
                ])),
            min_size=3,
            max_size=3))):
      hp.note('Testing excessive var usage in {}.{}'.format(
          dist_name, evaluative))
      try:
        # No validation => 1 convert. But for TD we allow 2:
        # dist.log_prob(bijector.inverse(samp)) + bijector.ildj(samp)
        max_permissible = 2 + extra_tensor_conversions_allowed(dist)
        with tfp_hps.assert_no_excessive_var_usage(
            'evaluative `{}` of `{}`'.format(evaluative, dist),
            max_permissible=max_permissible):
          getattr(dist, evaluative)(sample)
      except NotImplementedError:
        pass
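
`extra_tensor_conversions_allowed` is another module-level helper that the snippet does not include. The calls above only fix its contract: it returns how many extra parameter reads a given distribution is granted on top of the baseline budget. A hedged sketch consistent with that usage (the specific class singled out is an assumption; the real helper may special-case more types):

import tensorflow_probability as tfp

tfd = tfp.distributions

def extra_tensor_conversions_allowed(dist):
  # TransformedDistributions route parameter reads through their nested
  # bijector, so grant them one extra conversion; plain distributions get none.
  if isinstance(dist, tfd.TransformedDistribution):
    return 1
  return 0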
  def testBijector(self, bijector_name, data):
    if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
      return
    event_dim = data.draw(hps.integers(min_value=2, max_value=6))
    bijector = data.draw(
        bijectors(bijector_name=bijector_name, event_dim=event_dim,
                  enable_vars=True))

    # Forward mapping: Check differentiation through forward mapping with
    # respect to the input and parameter variables.  Also check that any
    # variables are not referenced overmuch.
    # TODO(axch): Would be nice to get rid of all this shape inference logic and
    # just rely on a notion of batch and event shape for bijectors, so we can
    # pass those through `domain_tensors` and `codomain_tensors` and use
    # `tensors_in_support`.  However, `RationalQuadraticSpline` behaves weirdly
    # somehow and I got confused.
    shp = bijector.inverse_event_shape([event_dim] *
                                       bijector.inverse_min_event_ndims)
    shp = tensorshape_util.concatenate(
        data.draw(
            tfp_hps.broadcast_compatible_shape(
                shp[:shp.ndims - bijector.forward_min_event_ndims])),
        shp[shp.ndims - bijector.forward_min_event_ndims:])
    xs = tf.identity(data.draw(domain_tensors(bijector, shape=shp)), name='xs')
    wrt_vars = [xs] + [v for v in bijector.trainable_variables
                       if v.dtype.is_floating]
    with tf.GradientTape() as tape:
      with tfp_hps.assert_no_excessive_var_usage(
          'method `forward` of {}'.format(bijector)):
        tape.watch(wrt_vars)
        # TODO(b/73073515): Fix graph mode gradients with bijector caching.
        ys = bijector.forward(xs + 0)
    grads = tape.gradient(ys, wrt_vars)
    assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

    # FLDJ: Check differentiation through forward log det jacobian with
    # respect to the input and parameter variables.  Also check that any
    # variables are not referenced overmuch.
    event_ndims = data.draw(
        hps.integers(
            min_value=bijector.forward_min_event_ndims,
            max_value=bijector.forward_event_shape(xs.shape).ndims))
    with tf.GradientTape() as tape:
      max_permitted = 2 if hasattr(bijector, '_forward_log_det_jacobian') else 4
      if is_invert(bijector):
        max_permitted = (2 if hasattr(bijector.bijector,
                                      '_inverse_log_det_jacobian') else 4)
      with tfp_hps.assert_no_excessive_var_usage(
          'method `forward_log_det_jacobian` of {}'.format(bijector),
          max_permissible=max_permitted):
        tape.watch(wrt_vars)
        # TODO(b/73073515): Fix graph mode gradients with bijector caching.
        ldj = bijector.forward_log_det_jacobian(xs + 0, event_ndims=event_ndims)
    grads = tape.gradient(ldj, wrt_vars)
    assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars, grads)

    # Inverse mapping: Check differentiation through inverse mapping with
    # respect to the codomain "input" and parameter variables.  Also check that
    # any variables are not referenced overmuch.
    shp = bijector.forward_event_shape([event_dim] *
                                       bijector.forward_min_event_ndims)
    shp = tensorshape_util.concatenate(
        data.draw(
            tfp_hps.broadcast_compatible_shape(
                shp[:shp.ndims - bijector.inverse_min_event_ndims])),
        shp[shp.ndims - bijector.inverse_min_event_ndims:])
    ys = tf.identity(
        data.draw(codomain_tensors(bijector, shape=shp)), name='ys')
    wrt_vars = [ys] + [v for v in bijector.trainable_variables
                       if v.dtype.is_floating]
    with tf.GradientTape() as tape:
      with tfp_hps.assert_no_excessive_var_usage(
          'method `inverse` of {}'.format(bijector)):
        tape.watch(wrt_vars)
        # TODO(b/73073515): Fix graph mode gradients with bijector caching.
        xs = bijector.inverse(ys + 0)
    grads = tape.gradient(xs, wrt_vars)
    assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

    # ILDJ: Check differentiation through inverse log det jacobian with respect
    # to the codomain "input" and parameter variables.  Also check that any
    # variables are not referenced overmuch.
    event_ndims = data.draw(
        hps.integers(
            min_value=bijector.inverse_min_event_ndims,
            max_value=bijector.inverse_event_shape(ys.shape).ndims))
    with tf.GradientTape() as tape:
      max_permitted = 2 if hasattr(bijector, '_inverse_log_det_jacobian') else 4
      if is_invert(bijector):
        max_permitted = (2 if hasattr(bijector.bijector,
                                      '_forward_log_det_jacobian') else 4)
      with tfp_hps.assert_no_excessive_var_usage(
          'method `inverse_log_det_jacobian` of {}'.format(bijector),
          max_permissible=max_permitted):
        tape.watch(wrt_vars)
        # TODO(b/73073515): Fix graph mode gradients with bijector caching.
        xs = bijector.inverse_log_det_jacobian(ys + 0, event_ndims=event_ndims)
    grads = tape.gradient(xs, wrt_vars)
    assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars, grads)
  def testDistribution(self, dist_name, data):
    if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
      return
    tf1.set_random_seed(
        data.draw(
            hpnp.arrays(dtype=np.int64, shape=[]).filter(lambda x: x != 0)))
    dist, batch_shape = data.draw(
        distributions(dist_name=dist_name, enable_vars=True))
    batch_shape2 = data.draw(tfp_hps.broadcast_compatible_shape(batch_shape))
    dist2, _ = data.draw(
        distributions(
            dist_name=dist_name,
            batch_shape=batch_shape2,
            event_dim=get_event_dim(dist),
            enable_vars=True))
    del batch_shape
    logging.info(
        'distribution: %s; parameters used: %s', dist,
        [k for k, v in six.iteritems(dist.parameters) if v is not None])
    self.evaluate([var.initializer for var in dist.variables])
    for k, v in six.iteritems(dist.parameters):
      if not tensor_util.is_mutable(v):
        continue
      try:
        self.assertIs(getattr(dist, k), v)
      except AssertionError as e:
        raise AssertionError(
            'No attr found for parameter {} of distribution {}: \n{}'.format(
                k, dist_name, e))

    for stat in data.draw(
        hps.sets(
            hps.one_of(
                map(hps.just, [
                    'covariance', 'entropy', 'mean', 'mode', 'stddev',
                    'variance'
                ])),
            min_size=3,
            max_size=3)):
      logging.info('%s.%s', dist_name, stat)
      try:
        with tfp_hps.assert_no_excessive_var_usage(
            'statistic `{}` of `{}`'.format(stat, dist)):
          getattr(dist, stat)()

      except NotImplementedError:
        pass

    with tf.GradientTape() as tape:
      with tfp_hps.assert_no_excessive_var_usage(
          'method `sample` of `{}`'.format(dist)):
        sample = dist.sample()
    if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
      grads = tape.gradient(sample, dist.variables)
      for grad, var in zip(grads, dist.variables):
        var_name = var.name.rstrip('_0123456789:')
        if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
          continue
        if grad is None:
          raise AssertionError(
              'Missing sample -> {} grad for distribution {}'.format(
                  var_name, dist_name))

    # Turn off validations, since log_prob can choke on dist's own samples.
    # This also relaxes conversion counts for KL, which might exceed 2 with
    # validate_args.
    dist = dist.copy(validate_args=False)
    dist2 = dist2.copy(validate_args=False)

    try:
      for d1, d2 in (dist, dist2), (dist2, dist):
        with tf.GradientTape() as tape:
          with tfp_hps.assert_no_excessive_var_usage(
              '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'.format(
                  d1, d1.variables, d2, d2.variables),
              max_permissible=1):  # No validation => 1 convert per var.
            kl = d1.kl_divergence(d2)
        wrt_vars = list(d1.variables) + list(d2.variables)
        grads = tape.gradient(kl, wrt_vars)
        for grad, var in zip(grads, wrt_vars):
          if grad is None and dist_name not in NO_KL_PARAM_GRADS:
            raise AssertionError('Missing KL({} || {}) -> {} grad:\n'
                                 '{} vars: {}\n{} vars: {}'.format(
                                     d1, d2, var, d1, d1.variables, d2,
                                     d2.variables))
    except NotImplementedError:
      pass

    if dist_name not in NO_LOG_PROB_PARAM_GRADS:
      with tf.GradientTape() as tape:
        lp = dist.log_prob(tf.stop_gradient(sample))
      grads = tape.gradient(lp, dist.variables)
      for grad, var in zip(grads, dist.variables):
        if grad is None:
          raise AssertionError(
              'Missing log_prob -> {} grad for distribution {}'.format(
                  var, dist_name))

    for evaluative in data.draw(
        hps.sets(
            hps.one_of(
                map(hps.just, [
                    'log_prob', 'prob', 'log_cdf', 'cdf',
                    'log_survival_function', 'survival_function'
                ])),
            min_size=3,
            max_size=3)):
      logging.info('%s.%s', dist_name, evaluative)
      try:
        with tfp_hps.assert_no_excessive_var_usage(
            'evaluative `{}` of `{}`'.format(evaluative, dist),
            max_permissible=1):  # No validation => 1 convert
          getattr(dist, evaluative)(sample)
      except NotImplementedError:
        pass
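
The `NO_SAMPLE_PARAM_GRADS`, `NO_KL_PARAM_GRADS`, and `NO_LOG_PROB_PARAM_GRADS` constants used by both `testDistribution` variants are module-level exemption tables that the snippet omits. Their shapes can be inferred from how they are indexed above; the entries below are illustrative placeholders, not the actual contents of the TFP test module.

# dist_name -> tuple of variable names whose sample() gradients may be None.
NO_SAMPLE_PARAM_GRADS = {
    'Deterministic': ('atol', 'rtol'),
}
# Distribution names exempt from the KL-divergence gradient check.
NO_KL_PARAM_GRADS = ('Deterministic',)
# Distribution names exempt from the log_prob gradient check.
NO_LOG_PROB_PARAM_GRADS = ('Deterministic',)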
Example #7
    def testBijector(self, bijector_name, data):
        tfp_hps.guitar_skip_if_matches('Tanh', bijector_name, 'b/144163991')
        if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
            return
        event_dim = data.draw(hps.integers(min_value=2, max_value=6))
        bijector = data.draw(
            bijectors(bijector_name=bijector_name,
                      event_dim=event_dim,
                      enable_vars=True))
        self.evaluate(tf.group(*[v.initializer for v in bijector.variables]))

        # Forward mapping: Check differentiation through forward mapping with
        # respect to the input and parameter variables.  Also check that any
        # variables are not referenced overmuch.
        # TODO(axch): Would be nice to get rid of all this shape inference logic and
        # just rely on a notion of batch and event shape for bijectors, so we can
        # pass those through `domain_tensors` and `codomain_tensors` and use
        # `tensors_in_support`.  However, `RationalQuadraticSpline` behaves weirdly
        # somehow and I got confused.
        codomain_event_shape = [event_dim] * bijector.inverse_min_event_ndims
        codomain_event_shape = constrain_inverse_shape(bijector,
                                                       codomain_event_shape)
        shp = bijector.inverse_event_shape(codomain_event_shape)
        shp = tensorshape_util.concatenate(
            data.draw(
                tfp_hps.broadcast_compatible_shape(
                    shp[:shp.ndims - bijector.forward_min_event_ndims])),
            shp[shp.ndims - bijector.forward_min_event_ndims:])
        xs = tf.identity(data.draw(domain_tensors(bijector, shape=shp)),
                         name='xs')
        wrt_vars = [xs] + [
            v for v in bijector.trainable_variables if v.dtype.is_floating
        ]
        with tf.GradientTape() as tape:
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `forward` of {}'.format(bijector)):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ys = bijector.forward(xs + 0)
        grads = tape.gradient(ys, wrt_vars)
        assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

        # For scalar bijectors, verify correctness of the _is_increasing method.
        if (bijector.forward_min_event_ndims == 0
                and bijector.inverse_min_event_ndims == 0):
            dydx = grads[0]
            hp.note('dydx: {}'.format(dydx))
            isfinite = tf.math.is_finite(dydx)
            incr_or_slope_eq0 = bijector._internal_is_increasing() | tf.equal(
                dydx, 0)  # pylint: disable=protected-access
            self.assertAllEqual(
                isfinite & incr_or_slope_eq0,
                isfinite & (dydx >= 0) | tf.zeros_like(incr_or_slope_eq0))

        # FLDJ: Check differentiation through forward log det jacobian with
        # respect to the input and parameter variables.  Also check that any
        # variables are not referenced overmuch.
        event_ndims = data.draw(
            hps.integers(min_value=bijector.forward_min_event_ndims,
                         max_value=xs.shape.ndims))
        with tf.GradientTape() as tape:
            max_permitted = 2 if hasattr(bijector,
                                         '_forward_log_det_jacobian') else 4
            if is_invert(bijector):
                max_permitted = (2 if hasattr(
                    bijector.bijector, '_inverse_log_det_jacobian') else 4)
            elif is_transform_diagonal(bijector):
                max_permitted = (2 if hasattr(bijector.diag_bijector,
                                              '_forward_log_det_jacobian') else
                                 4)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `forward_log_det_jacobian` of {}'.format(bijector),
                    max_permissible=max_permitted):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ldj = bijector.forward_log_det_jacobian(
                    xs + 0, event_ndims=event_ndims)
        grads = tape.gradient(ldj, wrt_vars)
        assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars,
                            grads)

        # Inverse mapping: Check differentiation through inverse mapping with
        # respect to the codomain "input" and parameter variables.  Also check that
        # any variables are not referenced overmuch.
        domain_event_shape = [event_dim] * bijector.forward_min_event_ndims
        domain_event_shape = constrain_forward_shape(bijector,
                                                     domain_event_shape)
        shp = bijector.forward_event_shape(domain_event_shape)
        shp = tensorshape_util.concatenate(
            data.draw(
                tfp_hps.broadcast_compatible_shape(
                    shp[:shp.ndims - bijector.inverse_min_event_ndims])),
            shp[shp.ndims - bijector.inverse_min_event_ndims:])
        ys = tf.identity(data.draw(codomain_tensors(bijector, shape=shp)),
                         name='ys')
        wrt_vars = [ys] + [
            v for v in bijector.trainable_variables if v.dtype.is_floating
        ]
        with tf.GradientTape() as tape:
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `inverse` of {}'.format(bijector)):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                xs = bijector.inverse(ys + 0)
        grads = tape.gradient(xs, wrt_vars)
        assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

        # ILDJ: Check differentiation through inverse log det jacobian with respect
        # to the codomain "input" and parameter variables.  Also check that any
        # variables are not referenced overmuch.
        event_ndims = data.draw(
            hps.integers(min_value=bijector.inverse_min_event_ndims,
                         max_value=ys.shape.ndims))
        with tf.GradientTape() as tape:
            max_permitted = 2 if hasattr(bijector,
                                         '_inverse_log_det_jacobian') else 4
            if is_invert(bijector):
                max_permitted = (2 if hasattr(
                    bijector.bijector, '_forward_log_det_jacobian') else 4)
            elif is_transform_diagonal(bijector):
                max_permitted = (2 if hasattr(bijector.diag_bijector,
                                              '_inverse_log_det_jacobian') else
                                 4)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `inverse_log_det_jacobian` of {}'.format(bijector),
                    max_permissible=max_permitted):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ldj = bijector.inverse_log_det_jacobian(
                    ys + 0, event_ndims=event_ndims)
        grads = tape.gradient(ldj, wrt_vars)
        assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars,
                            grads)
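
`is_invert` and `is_transform_diagonal` (the latter appears only in this variant of `testBijector`) are small predicates from the bijector test module. Given how they gate `max_permitted` above, plausible one-line definitions are the following; the `tfb` alias is an assumption.

import tensorflow_probability as tfp

tfb = tfp.bijectors

def is_invert(bijector):
  # Invert wraps another bijector, so the log-det-Jacobian conversion budget
  # is decided by the wrapped bijector's implementation.
  return isinstance(bijector, tfb.Invert)

def is_transform_diagonal(bijector):
  # TransformDiagonal likewise delegates to its diag_bijector.
  return isinstance(bijector, tfb.TransformDiagonal)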
    def testDistribution(self, dist_name, data):
        if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
            return
        tf1.set_random_seed(
            data.draw(
                hpnp.arrays(dtype=np.int64,
                            shape=[]).filter(lambda x: x != 0)))
        dist = data.draw(distributions(dist_name=dist_name, enable_vars=True))
        batch_shape = dist.batch_shape
        batch_shape2 = data.draw(
            tfp_hps.broadcast_compatible_shape(batch_shape))
        dist2 = data.draw(
            distributions(dist_name=dist_name,
                          batch_shape=batch_shape2,
                          event_dim=get_event_dim(dist),
                          enable_vars=True))
        logging.info(
            'distribution: %s; parameters used: %s', dist,
            [k for k, v in six.iteritems(dist.parameters) if v is not None])
        self.evaluate([var.initializer for var in dist.variables])

        # Check that the distribution passes Variables through to the accessor
        # properties (without converting them to Tensor or anything like that).
        for k, v in six.iteritems(dist.parameters):
            if not tensor_util.is_ref(v):
                continue
            self.assertIs(getattr(dist, k), v)

        # Check that standard statistics do not read distribution parameters more
        # than once.
        for stat in data.draw(
                hps.sets(hps.one_of(
                    map(hps.just, [
                        'covariance', 'entropy', 'mean', 'mode', 'stddev',
                        'variance'
                    ])),
                         min_size=3,
                         max_size=3)):
            logging.info('%s.%s', dist_name, stat)
            try:
                with tfp_hps.assert_no_excessive_var_usage(
                        'statistic `{}` of `{}`'.format(stat, dist)):
                    getattr(dist, stat)()

            except NotImplementedError:
                pass

        # Check that `sample` doesn't read distribution parameters more than once,
        # and that it produces non-None gradients (if the distribution is fully
        # reparameterized).
        with tf.GradientTape() as tape:
            # TDs do bijector assertions twice (once by distribution.sample, and once
            # by bijector.forward).
            max_permissible = (3 if isinstance(
                dist, tfd.TransformedDistribution) else 2)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `sample` of `{}`'.format(dist),
                    max_permissible=max_permissible):
                sample = dist.sample()
        if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
            grads = tape.gradient(sample, dist.variables)
            for grad, var in zip(grads, dist.variables):
                var_name = var.name.rstrip('_0123456789:')
                if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
                    continue
                if grad is None:
                    raise AssertionError(
                        'Missing sample -> {} grad for distribution {}'.format(
                            var_name, dist_name))

        # Turn off validations, since log_prob can choke on dist's own samples
        # (TODO(b/129271256)).  This also relaxes conversion counts for KL,
        # which might exceed 2 with validate_args.
        dist = dist.copy(validate_args=False)
        dist2 = dist2.copy(validate_args=False)

        # Test that KL divergence reads distribution parameters at most once, and
        # that it produces non-None gradients.
        try:
            for d1, d2 in (dist, dist2), (dist2, dist):
                with tf.GradientTape() as tape:
                    with tfp_hps.assert_no_excessive_var_usage(
                            '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'
                            .format(d1, d1.variables, d2, d2.variables),
                            max_permissible=1
                    ):  # No validation => 1 convert per var.
                        kl = d1.kl_divergence(d2)
                wrt_vars = list(d1.variables) + list(d2.variables)
                grads = tape.gradient(kl, wrt_vars)
                for grad, var in zip(grads, wrt_vars):
                    if grad is None and dist_name not in NO_KL_PARAM_GRADS:
                        raise AssertionError(
                            'Missing KL({} || {}) -> {} grad:\n'
                            '{} vars: {}\n{} vars: {}'.format(
                                d1, d2, var, d1, d1.variables, d2,
                                d2.variables))
        except NotImplementedError:
            pass

        # Test that log_prob produces non-None gradients, except for distributions
        # on the NO_LOG_PROB_PARAM_GRADS blacklist.
        if dist_name not in NO_LOG_PROB_PARAM_GRADS:
            with tf.GradientTape() as tape:
                lp = dist.log_prob(tf.stop_gradient(sample))
            grads = tape.gradient(lp, dist.variables)
            for grad, var in zip(grads, dist.variables):
                if grad is None:
                    raise AssertionError(
                        'Missing log_prob -> {} grad for distribution {}'.
                        format(var, dist_name))

        # Test that all forms of probability evaluation avoid reading distribution
        # parameters more than once.
        for evaluative in data.draw(
                hps.sets(hps.one_of(
                    map(hps.just, [
                        'log_prob', 'prob', 'log_cdf', 'cdf',
                        'log_survival_function', 'survival_function'
                    ])),
                         min_size=3,
                         max_size=3)):
            logging.info('%s.%s', dist_name, evaluative)
            try:
                # No validation => 1 convert. But for TD we allow 2:
                # dist.log_prob(bijector.inverse(samp)) + bijector.ildj(samp)
                max_permissible = (2 if isinstance(
                    dist, tfd.TransformedDistribution) else 1)
                with tfp_hps.assert_no_excessive_var_usage(
                        'evaluative `{}` of `{}`'.format(evaluative, dist),
                        max_permissible=max_permissible):
                    getattr(dist, evaluative)(sample)
            except NotImplementedError:
                pass
Example #9
def student_t_process_regression_models(draw,
                                        kernel_name=None,
                                        batch_shape=None,
                                        event_dim=None,
                                        feature_dim=None,
                                        feature_ndims=None,
                                        enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  hp.note('Index points:\n{}'.format(repr(index_points)))

  observation_index_points = draw(
      kernel_hps.kernel_input(
          batch_shape=compatible_batch_shape,
          example_ndims=1,
          feature_dim=feature_dim,
          feature_ndims=feature_ndims,
          enable_vars=enable_vars,
          name='observation_index_points'))
  hp.note('Observation index points:\n{}'.format(
      repr(observation_index_points)))

  observations = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      # This is the example dimension suggested by observation_index_points.
      example_dim=int(observation_index_points.shape[-(feature_ndims + 1)]),
      # No feature dimensions.
      feature_dim=0,
      feature_ndims=0,
      enable_vars=enable_vars,
      name='observations'))
  hp.note('Observations:\n{}'.format(repr(observations)))

  params = draw(broadcasting_params(
      'StudentTProcessRegressionModel',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  hp.note('Params:\n{}'.format(repr(params)))

  stp = tfd.StudentTProcessRegressionModel(
      # Ensure that the `df` parameter is not a `Variable` since we pass
      # in a `DeferredTensor` of the `df` parameter.
      df=tf.convert_to_tensor(params['df']),
      kernel=k,
      index_points=index_points,
      observation_index_points=observation_index_points,
      observations=observations,
      cholesky_fn=lambda x: marginal_fns.retrying_cholesky(x)[0],
      observation_noise_variance=params['observation_noise_variance'])
  return stp
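
The `example_dim=int(observation_index_points.shape[-(feature_ndims + 1)])` line above (and its twins in the next two examples) relies on kernel inputs being laid out as batch shape, then one example dimension, then `feature_ndims` feature dimensions. A small worked check of that indexing, using a placeholder array:

import numpy as np

# batch=3, examples=7, feature shape 5 x 5 (feature_ndims = 2).
observation_index_points = np.zeros([3, 7, 5, 5])
feature_ndims = 2
example_dim = int(observation_index_points.shape[-(feature_ndims + 1)])
assert example_dim == 7  # The example dimension sits just left of the features.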
Example #10
def variational_gaussian_processes(draw,
                                   kernel_name=None,
                                   batch_shape=None,
                                   event_dim=None,
                                   feature_dim=None,
                                   feature_ndims=None,
                                   enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  hp.note('Index points:\n{}'.format(repr(index_points)))

  inducing_index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='inducing_index_points'))
  hp.note('Inducing index points:\n{}'.format(repr(inducing_index_points)))
  num_inducing_points = int(inducing_index_points.shape[-(feature_ndims + 1)])

  variational_inducing_observations_loc = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      example_dim=num_inducing_points,
      feature_dim=0,
      feature_ndims=0,
      enable_vars=enable_vars,
      name='variational_inducing_observations_loc'))
  hp.note('Variational inducing observations loc:\n{}'.format(
      repr(variational_inducing_observations_loc)))

  variational_inducing_observations_scale = draw(tfp_hps.tensors_in_support(
      support=tfp_hps.Support.MATRIX_LOWER_TRIL_POSITIVE_DEFINITE,
      batch_shape=compatible_batch_shape.as_list(),
      event_dim=num_inducing_points,
      dtype=np.float64))
  hp.note('Variational inducing observations scale:\n{}'.format(
      repr(variational_inducing_observations_scale)))

  params = draw(broadcasting_params(
      'GaussianProcessRegressionModel',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  hp.note('Params:\n{}'.format(repr(params)))

  vgp = tfd.VariationalGaussianProcess(
      kernel=k,
      index_points=index_points,
      inducing_index_points=inducing_index_points,
      variational_inducing_observations_loc=(
          variational_inducing_observations_loc),
      variational_inducing_observations_scale=(
          variational_inducing_observations_scale),
      observation_noise_variance=params[
          'observation_noise_variance'])
  return vgp
Example #11
def gaussian_process_regression_models(draw,
                                       kernel_name=None,
                                       batch_shape=None,
                                       event_dim=None,
                                       feature_dim=None,
                                       feature_ndims=None,
                                       enable_vars=False):
  # First draw a kernel.
  k, _ = draw(kernel_hps.base_kernels(
      kernel_name=kernel_name,
      batch_shape=batch_shape,
      event_dim=event_dim,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      # Disable variables
      enable_vars=False))
  compatible_batch_shape = draw(
      tfp_hps.broadcast_compatible_shape(k.batch_shape))
  index_points = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      feature_dim=feature_dim,
      feature_ndims=feature_ndims,
      enable_vars=enable_vars,
      name='index_points'))
  hp.note('Index points:\n{}'.format(repr(index_points)))

  observation_index_points = draw(
      kernel_hps.kernel_input(
          batch_shape=compatible_batch_shape,
          example_ndims=1,
          feature_dim=feature_dim,
          feature_ndims=feature_ndims,
          enable_vars=enable_vars,
          name='observation_index_points'))
  hp.note('Observation index points:\n{}'.format(
      repr(observation_index_points)))

  observations = draw(kernel_hps.kernel_input(
      batch_shape=compatible_batch_shape,
      example_ndims=1,
      # This is the example dimension suggested by observation_index_points.
      example_dim=int(observation_index_points.shape[-(feature_ndims + 1)]),
      # No feature dimensions.
      feature_dim=0,
      feature_ndims=0,
      enable_vars=enable_vars,
      name='observations'))
  hp.note('Observations:\n{}'.format(repr(observations)))

  params = draw(broadcasting_params(
      'GaussianProcessRegressionModel',
      compatible_batch_shape,
      event_dim=event_dim,
      enable_vars=enable_vars))
  hp.note('Params:\n{}'.format(repr(params)))

  gp = tfd.GaussianProcessRegressionModel(
      kernel=k,
      index_points=index_points,
      observation_index_points=observation_index_points,
      observations=observations,
      cholesky_fn=lambda x: marginal_fns.retrying_cholesky(x)[0],
      observation_noise_variance=params['observation_noise_variance'])
  return gp
Example #12
    def testBijector(self, bijector_name, data):
        if tf.executing_eagerly() != (FLAGS.tf_mode == 'eager'):
            return
        bijector, batch_shape = data.draw(
            bijectors(bijector_name=bijector_name, enable_vars=True))
        del batch_shape

        event_dim = data.draw(hps.integers(min_value=2, max_value=6))

        # Forward mapping.
        shp = bijector.inverse_event_shape([event_dim] *
                                           bijector.inverse_min_event_ndims)
        shp = tensorshape_util.concatenate(
            data.draw(
                tfp_hps.broadcast_compatible_shape(
                    shp[:shp.ndims - bijector.forward_min_event_ndims])),
            shp[shp.ndims - bijector.forward_min_event_ndims:])
        xs = tf.identity(data.draw(domain_tensors(bijector, shape=shp)),
                         name='xs')
        wrt_vars = [xs] + list(bijector.variables)
        with tf.GradientTape() as tape:
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `forward` of {}'.format(bijector)):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ys = bijector.forward(xs + 0)
        grads = tape.gradient(ys, wrt_vars)
        assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

        # FLDJ.
        event_ndims = data.draw(
            hps.integers(min_value=bijector.forward_min_event_ndims,
                         max_value=bijector.forward_event_shape(
                             xs.shape).ndims))
        with tf.GradientTape() as tape:
            max_permitted = 2 if hasattr(bijector,
                                         '_forward_log_det_jacobian') else 4
            if is_invert(bijector):
                max_permitted = (2 if hasattr(
                    bijector.bijector, '_inverse_log_det_jacobian') else 4)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `forward_log_det_jacobian` of {}'.format(bijector),
                    max_permissible=max_permitted):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ldj = bijector.forward_log_det_jacobian(
                    xs + 0, event_ndims=event_ndims)
        grads = tape.gradient(ldj, wrt_vars)
        assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars,
                            grads)

        # Inverse mapping.
        shp = bijector.forward_event_shape([event_dim] *
                                           bijector.forward_min_event_ndims)
        shp = tensorshape_util.concatenate(
            data.draw(
                tfp_hps.broadcast_compatible_shape(
                    shp[:shp.ndims - bijector.inverse_min_event_ndims])),
            shp[shp.ndims - bijector.inverse_min_event_ndims:])
        ys = tf.identity(data.draw(codomain_tensors(bijector, shape=shp)),
                         name='ys')
        wrt_vars = [ys] + list(bijector.variables)
        with tf.GradientTape() as tape:
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `inverse` of {}'.format(bijector)):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                xs = bijector.inverse(ys + 0)
        grads = tape.gradient(xs, wrt_vars)
        assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

        # ILDJ.
        event_ndims = data.draw(
            hps.integers(min_value=bijector.inverse_min_event_ndims,
                         max_value=bijector.inverse_event_shape(
                             ys.shape).ndims))
        with tf.GradientTape() as tape:
            max_permitted = 2 if hasattr(bijector,
                                         '_inverse_log_det_jacobian') else 4
            if is_invert(bijector):
                max_permitted = (2 if hasattr(
                    bijector.bijector, '_forward_log_det_jacobian') else 4)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `inverse_log_det_jacobian` of {}'.format(bijector),
                    max_permissible=max_permitted):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                xs = bijector.inverse_log_det_jacobian(ys + 0,
                                                       event_ndims=event_ndims)
        grads = tape.gradient(xs, wrt_vars)
        assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars,
                            grads)
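
Every `test*` method above takes a `data` argument plus a name argument, which points to the usual TFP wiring: `absl`'s parameterized test machinery supplies the name, and Hypothesis supplies `data`. A hedged sketch of that wiring follows; the class name, the `TESTABLE_BIJECTORS` constant, and the Hypothesis settings are assumptions, not taken from the snippets.

from absl.testing import parameterized
import hypothesis as hp
import hypothesis.strategies as hps
import tensorflow.compat.v2 as tf

# Hypothetical list of bijector names to exercise.
TESTABLE_BIJECTORS = ('Exp', 'Sigmoid', 'Softplus')

class BijectorPropertiesTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      {'testcase_name': name, 'bijector_name': name}
      for name in TESTABLE_BIJECTORS)
  @hp.given(hps.data())
  @hp.settings(max_examples=5, deadline=None)
  def testBijector(self, bijector_name, data):
    # Body as in the examples above: draw a bijector, then check forward,
    # inverse, and log-det-Jacobian gradients and variable-usage counts.
    del bijector_name, data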