def __init__(self,
                 normal_sampler_fn,
                 log_likelihood_fn,
                 seed=None,
                 name=None):
        """Initializes this transition kernel.

    Args:
      normal_sampler_fn: Python callable that takes in a seed and returns a
        sample from a multivariate normal distribution. Note that the shape of
        the samples must agree with `log_likelihood_fn`.
      log_likelihood_fn: Python callable which takes an argument like
        `current_state` (or `*current_state` if it is a list) and returns its
        (possibly unnormalized) log-likelihood.
      seed: Python integer to seed the random number generator.
      name: Python `str` name prefixed to Ops created by this function.
        Default value: `None` (i.e., 'slice_sampler_kernel').

    """
        self._seed_stream = tfp_util.SeedStream(
            seed, salt='elliptical_slice_sampler')
        self._parameters = dict(normal_sampler_fn=normal_sampler_fn,
                                log_likelihood_fn=log_likelihood_fn,
                                seed=seed,
                                name=name)
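# A hedged usage sketch (not part of the source above): judging by the seed
# salt, this `__init__` appears to belong to TFP's elliptical slice sampler
# (`tfp.experimental.mcmc.EllipticalSliceSampler`). If so, a kernel could be
# constructed roughly as follows; the sampler and likelihood callables here
# are illustrative only.
#
#   import tensorflow as tf
#   import tensorflow_probability as tfp
#
#   normal_sampler_fn = lambda seed: tf.random.normal([2], seed=seed)
#   log_likelihood_fn = lambda x: -tf.reduce_sum((x - 1.) ** 2)
#   kernel = tfp.experimental.mcmc.EllipticalSliceSampler(
#       normal_sampler_fn=normal_sampler_fn,
#       log_likelihood_fn=log_likelihood_fn,
#       seed=42)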
def _get_mixing_indices(size, seed=None, name=None):
  """Generates an array of indices suitable for mutation operation.

  The mutation operation in differential evolution requires that for every
  element of the population, three distinct other elements be chosen to produce
  a trial candidate. This function generates an array of shape [size, 3]
  satisfying the properties that:
    (a). array[i, :] does not contain the index `i`.
    (b). The three entries of array[i, :] are distinct from one another (no
      repeated indices within a row).
    (c). All elements in the array are between 0 and size - 1 inclusive.

  Args:
    size: Scalar integer `Tensor`. The number of samples to draw, which is also
      the (exclusive) upper bound of the range of indices to sample from.
    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
      applied.
      Default value: `None`.
    name:  Python `str` name prefixed to Ops created by this function.
      Default value: 'get_mixing_indices'.

  Returns:
    sample: A `Tensor` of shape [size, 3] and same dtype as `size` containing
    samples without replacement between 0 and size - 1 (inclusive) with the
    `i`th row not including the number `i`.
  """
  with tf1.name_scope(
      name, default_name='get_mixing_indices', values=[size]):
    size = tf.convert_to_tensor(value=size)
    dtype = size.dtype
    seed_stream = tfp_util.SeedStream(seed, salt='get_mixing_indices')
    first = tf.random.uniform([size],
                              maxval=size-1,
                              dtype=dtype,
                              seed=seed_stream())
    second = tf.random.uniform([size],
                               maxval=size-2,
                               dtype=dtype,
                               seed=seed_stream())
    third = tf.random.uniform([size],
                              maxval=size-3,
                              dtype=dtype,
                              seed=seed_stream())

    # Shift second if it is on top of or to the right of first
    second = tf1.where(second < first, x=second, y=second + 1)
    smaller = tf.math.minimum(first, second)
    larger = tf.math.maximum(first, second)
    # Shift the third one so it does not coincide with either the first or the
    # second number. Assuming first < second, shift by 1 if the number is in
    # [first, second) and by 2 if the number is greater than or equal to the
    # second.
    third = tf1.where(third < smaller, x=third, y=third + 1)
    third = tf1.where(third < larger, x=third, y=third + 1)
    sample = tf.stack([first, second, third], axis=1)
    to_avoid = tf.expand_dims(tf.range(size), axis=-1)
    sample = tf1.where(sample < to_avoid, x=sample, y=sample + 1)
    return sample
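# A minimal sketch (illustration only, assuming TF2 eager execution and the
# module's own imports) that checks the documented properties of the returned
# indices:
#
#   idx = _get_mixing_indices(size=8, seed=17).numpy()  # shape [8, 3]
#   for i, row in enumerate(idx):
#     assert i not in row                         # (a) row i never contains i.
#     assert len(set(row)) == 3                   # (b) entries are distinct.
#     assert row.min() >= 0 and row.max() <= 7    # (c) all lie in [0, size-1].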
    def _joint_sample_n(self, n, seed=None):
        """Draw a joint sample from the prior over latents and observations.

    This sampler is specific to LocalLevel models and is faster than the
    generic LinearGaussianStateSpaceModel implementation.

    Args:
      n: `int` `Tensor` number of samples to draw.
      seed: Optional `int` `Tensor` seed for the random number generator.
    Returns:
      latents: `float` `Tensor` of shape `concat([[n], self.batch_shape,
        [self.num_timesteps, self.latent_size]], axis=0)` representing samples
        of latent trajectories.
      observations: `float` `Tensor` of shape `concat([[n], self.batch_shape,
        [self.num_timesteps, self.observation_size]], axis=0)` representing
        samples of observed series generated from the sampled `latents`.
    """
        with tf.name_scope('joint_sample_n'):
            strm = util.SeedStream(seed,
                                   'LocalLevelStateSpaceModel_joint_sample_n')

            if self.batch_shape.is_fully_defined():
                batch_shape = self.batch_shape.as_list()
            else:
                batch_shape = self.batch_shape_tensor()
            sample_and_batch_shape = tf.cast(
                prefer_static.concat([[n], batch_shape], axis=0), tf.int32)

            # Sample the initial timestep from the prior.  Since we want
            # this sample to have full batch shape (not just the batch shape
            # of the self.initial_state_prior object which might in general be
            # smaller), we augment the sample shape to include whatever
            # extra batch dimensions are required.
            initial_level = self.initial_state_prior.sample(
                linear_gaussian_ssm._augment_sample_shape(  # pylint: disable=protected-access
                    self.initial_state_prior, sample_and_batch_shape,
                    self.validate_args),
                seed=strm())

            # Sample the latent random walk and observed noise, more efficiently than
            # the generic loop in `LinearGaussianStateSpaceModel`.
            level_jumps = (tf.random.normal(prefer_static.concat(
                [sample_and_batch_shape, [self.num_timesteps - 1]], axis=0),
                                            dtype=self.dtype,
                                            seed=strm()) *
                           self.level_scale[..., tf.newaxis])
            prior_level_sample = tf.cumsum(tf.concat(
                [initial_level, level_jumps], axis=-1),
                                           axis=-1)
            prior_observation_sample = prior_level_sample + (  # Sample noise.
                tf.random.normal(prefer_static.shape(prior_level_sample),
                                 dtype=self.dtype,
                                 seed=strm()) *
                self.observation_noise_scale[..., tf.newaxis])

            return (prior_level_sample[..., tf.newaxis],
                    prior_observation_sample[..., tf.newaxis])
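# A hedged usage sketch (an assumption, not shown above): `_joint_sample_n`
# reads like a private method of `tfp.sts.LocalLevelStateSpaceModel`, so it
# would typically be exercised through such a model, e.g.:
#
#   import tensorflow_probability as tfp
#   tfd = tfp.distributions
#
#   ssm = tfp.sts.LocalLevelStateSpaceModel(
#       num_timesteps=50,
#       level_scale=0.5,
#       observation_noise_scale=0.2,
#       initial_state_prior=tfd.MultivariateNormalDiag(scale_diag=[1.]))
#   latents, observations = ssm._joint_sample_n(n=3, seed=42)
#   # latents.shape == [3, 50, 1]; observations.shape == [3, 50, 1]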
    def prior_sample(self,
                     num_timesteps,
                     initial_step=0,
                     params_sample_shape=(),
                     trajectories_sample_shape=(),
                     seed=None):
        """Sample from the joint prior over model parameters and trajectories.

    Args:
      num_timesteps: Scalar `int` `Tensor` number of timesteps to model.
      initial_step: Optional scalar `int` `Tensor` specifying the starting
        timestep.
          Default value: 0.
      params_sample_shape: Number of possible worlds to sample iid from the
        parameter prior, or more generally, `Tensor` `int` shape to fill with
        iid samples.
          Default value: [] (i.e., draw a single sample and don't expand the
          shape).
      trajectories_sample_shape: For each sampled set of parameters, number
        of trajectories to sample, or more generally, `Tensor` `int` shape to
        fill with iid samples.
        Default value: [] (i.e., draw a single sample and don't expand the
          shape).
      seed: Python `int` random seed.

    Returns:
      trajectories: `float` `Tensor` of shape
        `trajectories_sample_shape + params_sample_shape + [num_timesteps, 1]`
        containing all sampled trajectories.
      param_samples: list of sampled parameter value `Tensor`s, in order
        corresponding to `self.parameters`, each of shape
        `params_sample_shape + prior.batch_shape + prior.event_shape`.
    """

        seed = tfp_util.SeedStream(seed,
                                   salt='StructuralTimeSeries_prior_sample')

        with tf1.name_scope('prior_sample',
                            values=[
                                num_timesteps, params_sample_shape,
                                trajectories_sample_shape
                            ]):
            param_samples = [
                p.prior.sample(params_sample_shape, seed=seed(), name=p.name)
                for p in self.parameters
            ]
            model = self.make_state_space_model(num_timesteps=num_timesteps,
                                                initial_step=initial_step,
                                                param_vals=param_samples)
            return model.sample(trajectories_sample_shape,
                                seed=seed()), param_samples
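# A hedged usage sketch (assumption: `prior_sample` is a method of
# `tfp.sts.StructuralTimeSeries`, as the seed salt suggests), showing the
# documented output shapes on a simple component model:
#
#   import tensorflow_probability as tfp
#
#   model = tfp.sts.LocalLinearTrend()
#   trajectories, param_samples = model.prior_sample(
#       num_timesteps=100,
#       params_sample_shape=[5],        # 5 sampled parameter settings.
#       trajectories_sample_shape=[2])  # 2 trajectories per setting.
#   # trajectories.shape == [2, 5, 100, 1]; each entry of param_samples has
#   # shape [5] + prior.batch_shape + prior.event_shape.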
def _get_starting_population(initial_population,
                             initial_position,
                             population_size,
                             population_stddev,
                             seed):
  """Constructs the initial population.

  If an initial population is not already provided, this function constructs
  a population by adding random normal noise to the initial position.

  Args:
    initial_population: None or a list of `Tensor`s. The initial population.
    initial_position: None or a list of `Tensor`s. The initial position.
      If initial_population is None, this argument must not be None.
    population_size: Scalar integer `Tensor`. The number of members in the
      population. If the initial population is not None, this parameter is
      ignored.
    population_stddev: A positive scalar real `Tensor` of the same dtype
      as `initial_position` or `initial_population` (whichever is not None).
      This parameter is ignored if `initial_population`
      is specified. Used to generate the population from the
      `initial_position` by adding random normal noise with zero mean and
      the specified standard deviation.
    seed: Seed for random number generation.

  Returns:
    A list of `Tensor`s. The initial population.
  """
  if initial_population is not None:
    return [tf.convert_to_tensor(value=part) for part in initial_population]
  # Constructs the population by adding normal noise to the initial position.
  seed_stream = tfp_util.SeedStream(seed, salt='get_starting_population')
  population = []
  for part in initial_position:
    part = tf.convert_to_tensor(value=part)
    part_event_shape = tf.shape(input=part)
    # We only draw population_size-1 random vectors because we want to ensure
    # that the supplied position is part of the population. The first member
    # is set to be the initial_position.
    population_part_shape = tf.concat([[population_size-1],
                                       part_event_shape], axis=0)
    population_part = tf.random.normal(population_part_shape,
                                       stddev=population_stddev,
                                       dtype=part.dtype.base_dtype,
                                       seed=seed_stream())
    population_part += part
    population_part = tf.concat([[part], population_part], axis=0)
    population.append(population_part)
  return population
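# A minimal sketch of the helper above (illustration only, assuming eager mode
# and the module's imports): starting from a single position, it returns
# `population_size` members, the first of which is exactly that position.
#
#   initial_position = [tf.constant([1.0, 2.0])]
#   population = _get_starting_population(
#       initial_population=None,
#       initial_position=initial_position,
#       population_size=10,
#       population_stddev=tf.constant(0.1),
#       seed=7)
#   # population[0].shape == [10, 2] and population[0][0] == [1.0, 2.0].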
def build_factored_surrogate_posterior(
        event_shape=None,
        constraining_bijectors=None,
        initial_unconstrained_loc=_sample_uniform_initial_loc,
        initial_unconstrained_scale=1e-2,
        trainable_distribution_fn=_build_trainable_normal_dist,
        seed=None,
        validate_args=False,
        name=None):
    """Builds a joint variational posterior that factors over model variables.

  By default, this method creates an independent trainable Normal distribution
  for each variable, transformed using a bijector (if provided) to
  match the support of that variable. This makes extremely strong
  assumptions about the posterior: that it is approximately normal (or
  transformed normal), and that all model variables are independent.

  Args:
    event_shape: `Tensor` shape, or nested structure of `Tensor` shapes,
      specifying the event shape(s) of the posterior variables.
    constraining_bijectors: Optional `tfb.Bijector` instance, or nested
      structure of such instances, defining support(s) of the posterior
      variables. The structure must match that of `event_shape` and may
      contain `None` values. A posterior variable will
      be modeled as `tfd.TransformedDistribution(underlying_dist,
      constraining_bijector)` if a corresponding constraining bijector is
      specified, otherwise it is modeled as supported on the
      unconstrained real line.
    initial_unconstrained_loc: Optional Python `callable` with signature
      `tensor = initial_unconstrained_loc(shape, seed)` used to sample
      real-valued initializations for the unconstrained representation of each
      variable. May alternately be a nested structure of
      `Tensor`s, giving specific initial locations for each variable; these
      must have structure matching `event_shape` and shapes determined by the
      inverse image of `event_shape` under `constraining_bijectors`, which
      may optionally be prefixed with a common batch shape.
      Default value: `functools.partial(tf.random.uniform,
        minval=-2., maxval=2., dtype=tf.float32)`.
    initial_unconstrained_scale: Optional scalar float `Tensor` initial
      scale for the unconstrained distributions, or a nested structure of
      `Tensor` initial scales for each variable.
      Default value: `1e-2`.
    trainable_distribution_fn: Optional Python `callable` with signature
      `trainable_dist = trainable_distribution_fn(initial_loc, initial_scale,
      event_ndims, validate_args)`. This is called for each model variable to
      build the corresponding factor in the surrogate posterior. It is expected
      that the distribution returned is supported on unconstrained real values.
      Default value: `functools.partial(
        tfp.experimental.vi.build_trainable_location_scale_distribution,
        distribution_fn=tfd.Normal)`, i.e., a trainable Normal distribution.
    seed: Python integer to seed the random number generator. This is used
      only when `initial_unconstrained_loc` is not explicitly specified (i.e.,
      when initial locations are randomly sampled).
    validate_args: Python `bool`. Whether to validate input with asserts. This
      imposes a runtime cost. If `validate_args` is `False`, and the inputs are
      invalid, correct behavior is not guaranteed.
      Default value: `False`.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'build_factored_surrogate_posterior').

  Returns:
    surrogate_posterior: A `tfd.Distribution` instance whose samples have
      shape and structure matching that of `event_shape` or `initial_loc`.

  ### Examples

  Consider a Gamma model with unknown parameters, expressed as a joint
  Distribution:

  ```python
  Root = tfd.JointDistributionCoroutine.Root
  def model_fn():
    concentration = yield Root(tfd.Exponential(1.))
    rate = yield Root(tfd.Exponential(1.))
    y = yield tfd.Sample(tfd.Gamma(concentration=concentration, rate=rate),
                         sample_shape=4)
  model = tfd.JointDistributionCoroutine(model_fn)
  ```

  Let's use variational inference to approximate the posterior over the
  data-generating parameters for some observed `y`. We'll build a
  surrogate posterior distribution by specifying the shapes of the latent
  `rate` and `concentration` parameters, and that both are constrained to
  be positive.

  ```python
  surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=model.event_shape_tensor()[:-1],  # Omit the observed `y`.
    constraining_bijectors=[tfb.Softplus(),   # Concentration is positive.
                            tfb.Softplus()])  # Rate is positive.
  ```

  This creates a trainable joint distribution, defined by variables in
  `surrogate_posterior.trainable_variables`. We use `fit_surrogate_posterior`
  to fit this distribution by minimizing a divergence to the true posterior.

  ```python
  y = [0.2, 0.5, 0.3, 0.7]
  losses = tfp.vi.fit_surrogate_posterior(
    lambda concentration, rate: model.log_prob([concentration, rate, y]),
    surrogate_posterior=surrogate_posterior,
    num_steps=100,
    optimizer=tf.optimizers.Adam(0.1),
    sample_size=10)

  # After optimization, samples from the surrogate will approximate
  # samples from the true posterior.
  samples = surrogate_posterior.sample(100)
  posterior_mean = [tf.reduce_mean(x) for x in samples]     # mean ~= [1.1, 2.1]
  posterior_std = [tf.math.reduce_std(x) for x in samples]  # std  ~= [0.3, 0.8]
  ```

  If we wanted to initialize the optimization at a specific location, we can
  specify one when we build the surrogate posterior. This function requires the
  initial location to be specified in *unconstrained* space; we do this by
  inverting the constraining bijectors (note this section also demonstrates the
  creation of a dict-structured model).

  ```python
  initial_loc = {'concentration': 0.4, 'rate': 0.2}
  constraining_bijectors = {'concentration': tfb.Softplus(),  # Concentration is positive.
                            'rate': tfb.Softplus()}           # Rate is positive.
  initial_unconstrained_loc = tf.nest.map_structure(
    lambda b, x: b.inverse(x) if b is not None else x,
    constraining_bijectors, initial_loc)
  surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=tf.nest.map_structure(tf.shape, initial_loc),
    constraining_bijectors=constraining_bijectors,
    initial_unconstrained_loc=initial_unconstrained_loc,
    initial_unconstrained_scale=1e-4)
  ```

  """

    with tf.name_scope(name or 'build_factored_surrogate_posterior'):
        seed = tfp_util.SeedStream(seed,
                                   salt='build_factored_surrogate_posterior')

        # Convert event shapes to Tensors.
        shallow_structure = _get_event_shape_shallow_structure(event_shape)
        event_shape = nest.map_structure_up_to(
            shallow_structure,
            lambda s: tf.convert_to_tensor(s, dtype=tf.int32), event_shape)
        flat_event_shapes = tf.nest.flatten(event_shape)

        # For simplicity, we'll work with flattened lists of state parts and
        # repack the structure at the end.
        if constraining_bijectors is not None:
            flat_bijectors = tf.nest.flatten(constraining_bijectors)
        else:
            flat_bijectors = [None for _ in flat_event_shapes]
        flat_unconstrained_event_shapes = [
            b.inverse_event_shape_tensor(s) if b is not None else s
            for s, b in zip(flat_event_shapes, flat_bijectors)
        ]

        # Construct initial locations for the internal unconstrained dists.
        if callable(
                initial_unconstrained_loc):  # Sample random initialization.
            flat_unconstrained_locs = [
                initial_unconstrained_loc(shape=s, seed=seed())
                for s in flat_unconstrained_event_shapes
            ]
        else:  # Use provided initialization.
            flat_unconstrained_locs = nest.flatten_up_to(
                shallow_structure,
                initial_unconstrained_loc,
                check_types=False)

        if nest.is_nested(initial_unconstrained_scale):
            flat_unconstrained_scales = nest.flatten_up_to(
                shallow_structure,
                initial_unconstrained_scale,
                check_types=False)
        else:
            flat_unconstrained_scales = [
                initial_unconstrained_scale for _ in flat_unconstrained_locs
            ]

        # Extract the rank of each event, so that we build distributions with the
        # correct event shapes.
        flat_unconstrained_event_ndims = [
            prefer_static.rank_from_shape(s)
            for s in flat_unconstrained_event_shapes
        ]

        # Build the component surrogate posteriors.
        flat_component_dists = []
        for initial_loc, initial_scale, event_ndims, bijector in zip(
                flat_unconstrained_locs, flat_unconstrained_scales,
                flat_unconstrained_event_ndims, flat_bijectors):
            unconstrained_dist = trainable_distribution_fn(
                initial_loc=initial_loc,
                initial_scale=initial_scale,
                event_ndims=event_ndims,
                validate_args=validate_args)
            flat_component_dists.append(
                bijector(unconstrained_dist) if bijector is not None
                else unconstrained_dist)
        component_distributions = tf.nest.pack_sequence_as(
            event_shape, flat_component_dists)

        # Return a `Distribution` object whose events have the specified structure.
        return (
            joint_distribution_util.independent_joint_distribution_from_structure(
                component_distributions, validate_args=validate_args))
def _binary_crossover(population,
                      population_size,
                      mutants,
                      crossover_prob,
                      seed):
  """Performs recombination by binary crossover for the current population.

  Let v_i denote the i'th component of the member v and m_i the corresponding
  component of the mutant vector associated with v. The crossed-over vector w
  is determined by setting w_i = m_i with probability `crossover_prob`, and
  w_i = v_i otherwise. In addition, DE requires that at least one of the
  components be crossed over (otherwise the member would remain unchanged).
  This is enforced by choosing an index k uniformly at random and forcing a
  crossover there (i.e. w_k = m_k). This is the scheme implemented in this
  function.

  Args:
    population: A Python list of `Tensor`s where each `Tensor` in the list
      must be of rank at least 1 and all the elements must have a common
      first dimension. The base population to cross over.
    population_size: A scalar integer `Tensor`. The number of elements in the
      population (i.e. size of the first dimension of any member of
      `population`).
    mutants: A Python list of `Tensor`s with the same structure as `population`.
      The mutated population.
    crossover_prob: A positive real scalar `Tensor` bounded above by 1.0. The
      probability of a crossover being performed for each axis.
    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
      applied.

  Returns:
    A list of `Tensor`s of the same structure, dtype and shape as `population`.
    The recombined population.
  """
  sizes = [tf.cast(tf.size(input=x), dtype=tf.float64) for x in population]
  seed_stream = tfp_util.SeedStream(seed, salt='binary_crossover')
  force_crossover_group = distributions.Categorical(sizes).sample(
      [population_size, 1], seed=seed_stream())
  recombinants = []
  for i, population_part in enumerate(population):
    pop_part_flat = tf.reshape(population_part, [population_size, -1])
    mutant_part_flat = tf.reshape(mutants[i], [population_size, -1])
    part_size = tf.size(input=population_part) // population_size
    force_crossovers = tf.one_hot(
        tf.random.uniform([population_size],
                          minval=0,
                          maxval=part_size,
                          dtype=tf.int32,
                          seed=seed_stream()),
        part_size,
        on_value=True,
        off_value=False,
        dtype=tf.bool)  # Tensor of shape [population_size, size]
    group_mask = tf.math.equal(force_crossover_group, i)
    force_crossovers &= group_mask
    do_binary_crossover = tf.random.uniform(
        [population_size, part_size],
        dtype=crossover_prob.dtype.base_dtype,
        seed=seed_stream()) < crossover_prob
    do_binary_crossover |= force_crossovers
    recombinant_flat = tf1.where(
        do_binary_crossover, x=mutant_part_flat, y=pop_part_flat)
    recombinant = tf.reshape(recombinant_flat, tf.shape(input=population_part))
    recombinants.append(recombinant)
  return recombinants
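# A minimal sketch of calling the helper above (illustration only, assuming
# eager mode and the module's imports): recombine a toy population with its
# mutants.
#
#   population = [tf.random.normal([6, 3])]
#   mutants = [tf.random.normal([6, 3])]
#   recombined = _binary_crossover(
#       population,
#       population_size=6,
#       mutants=mutants,
#       crossover_prob=tf.constant(0.9),
#       seed=11)
#   # recombined[0].shape == [6, 3]; every entry comes from either the original
#   # member or its mutant, with at least one mutant entry forced per row.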
def one_step(
    objective_function,
    population,
    population_values=None,
    differential_weight=0.5,
    crossover_prob=0.9,
    seed=None,
    name=None):
  """Performs one step of the differential evolution algorithm.

  Args:
    objective_function:  A Python callable that accepts a batch of possible
      solutions and returns the values of the objective function at those
      arguments as a rank 1 real `Tensor`. This specifies the function to be
      minimized. The input to this callable may be either a single `Tensor`
      or a Python `list` of `Tensor`s. The signature must match the format of
      the argument `population`. (i.e., objective_function(*population) must
      return the value of the function to be minimized).
    population:  `Tensor` or Python `list` of `Tensor`s representing the
      current population vectors. Each `Tensor` must be of the same real dtype.
      The first dimension indexes individual population members while the
      rest of the dimensions are consumed by the value function. For example,
      if the population is a single `Tensor` of shape [n, m1, m2], then `n` is
      the population size and the output of `objective_function` applied to the
      population is a `Tensor` of shape [n]. If the population is a python
      list of `Tensor`s then each `Tensor` in the list should have the first
      axis of a common size, say `n`, and `objective_function(*population)`
      should return a `Tensor` of shape [n]. The population must have at least
      4 members for the algorithm to work correctly.
    population_values: A `Tensor` of rank 1 and real dtype. The result of
      applying `objective_function` to the `population`. If not supplied it is
      computed using the `objective_function`.
      Default value: None.
    differential_weight: Real scalar `Tensor`. Must be positive and less than
      2.0. The parameter controlling the strength of mutation.
      Default value: 0.5
    crossover_prob: Real scalar `Tensor`. Must be between 0 and 1. The
      probability of recombination per site.
      Default value: 0.9
    seed: `int` or None. The random seed for this `Op`. If `None`, no seed is
      applied.
      Default value: None.
    name: (Optional) Python str. The name prefixed to the ops created by this
      function. If not supplied, the default name 'one_step' is
      used.
      Default value: None

  Returns:
    A sequence containing the following elements (in order):
    next_population: A `Tensor` or Python `list` of `Tensor`s of the same
      structure as the input population. The population at the next generation.
    next_population_values: A `Tensor` of same shape and dtype as input
      `population_values`. The function values for the `next_population`.
  """
  with tf1.name_scope(
      name, 'one_step',
      [population, population_values, differential_weight, crossover_prob]):
    population, _ = _ensure_list(population)
    if population_values is None:
      population_values = objective_function(*population)
    population_size = tf.shape(input=population[0])[0]
    seed_stream = tfp_util.SeedStream(seed, salt='one_step')
    mixing_indices = _get_mixing_indices(population_size, seed=seed_stream())
    # Construct the mutated solution vectors. There is one for each member of
    # the population.
    mutants = _get_mutants(population,
                           population_size,
                           mixing_indices,
                           differential_weight)
    # Perform recombination between the parents and the mutants.
    candidates = _binary_crossover(population,
                                   population_size,
                                   mutants,
                                   crossover_prob,
                                   seed=seed_stream())
    candidate_values = objective_function(*candidates)

    infinity = tf.zeros_like(population_values) + np.inf

    population_values = tf1.where(
        tf.math.is_nan(population_values), x=infinity, y=population_values)

    to_replace = candidate_values < population_values
    next_population = [
        tf1.where(to_replace, x=candidates_part, y=population_part)
        for candidates_part, population_part in zip(candidates, population)
    ]
    next_values = tf1.where(
        to_replace, x=candidate_values, y=population_values)

  return next_population, next_values
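# A hedged end-to-end sketch (illustration only, assuming eager mode and the
# module's imports): repeatedly applying `one_step` drives a population toward
# the minimum of a simple quadratic objective.
#
#   def objective(x):                      # x has shape [population_size, 2].
#     return tf.reduce_sum(x * x, axis=-1)
#
#   population = tf.random.normal([20, 2])
#   values = None
#   for _ in range(50):
#     population, values = one_step(objective, population, values,
#                                   crossover_prob=tf.constant(0.9), seed=3)
#   # After the first call `population` is a one-element list of `Tensor`s,
#   # which `one_step` accepts on later iterations; tf.reduce_min(values)
#   # should now be close to 0.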
def fit_with_hmc(model,
                 observed_time_series,
                 num_results=100,
                 num_warmup_steps=50,
                 num_leapfrog_steps=15,
                 initial_state=None,
                 initial_step_size=None,
                 chain_batch_shape=(),
                 num_variational_steps=150,
                 variational_optimizer=None,
                 variational_sample_size=5,
                 seed=None,
                 name=None):
    """Draw posterior samples using Hamiltonian Monte Carlo (HMC).

  Markov chain Monte Carlo (MCMC) methods are considered the gold standard of
  Bayesian inference; under suitable conditions and in the limit of infinitely
  many draws they generate samples from the true posterior distribution. HMC [1]
  uses gradients of the model's log-density function to propose samples,
  allowing it to exploit posterior geometry. However, it is computationally more
  expensive than variational inference and relatively sensitive to tuning.

  This method attempts to provide a sensible default approach for fitting
  StructuralTimeSeries models using HMC. It first runs variational inference as
  a fast posterior approximation, and initializes the HMC sampler from the
  variational posterior, using the posterior standard deviations to set
  per-variable step sizes (equivalently, a diagonal mass matrix). During the
  warmup phase, it adapts the step size to target an acceptance rate of 0.75,
  which is thought to be in the desirable range for optimal mixing [2].


  Args:
    model: An instance of `StructuralTimeSeries` representing a
      time-series model. This represents a joint distribution over
      time-series and their parameters with batch shape `[b1, ..., bN]`.
    observed_time_series: `float` `Tensor` of shape
      `concat([sample_shape, model.batch_shape, [num_timesteps, 1]])`, where
      `sample_shape` corresponds to i.i.d. observations, and the trailing `[1]`
      dimension may (optionally) be omitted if `num_timesteps > 1`. May
      optionally be an instance of `tfp.sts.MaskedTimeSeries`, which includes
      a mask `Tensor` to specify timesteps with missing observations.
    num_results: Integer number of Markov chain draws.
      Default value: `100`.
    num_warmup_steps: Integer number of steps to take before starting to
      collect results. The warmup steps are also used to adapt the step size
      towards a target acceptance rate of 0.75.
      Default value: `50`.
    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
      for. Total progress per HMC step is roughly proportional to
      `step_size * num_leapfrog_steps`.
      Default value: `15`.
    initial_state: Optional Python `list` of `Tensor`s, one for each model
      parameter, representing the initial state(s) of the Markov chain(s). These
      should have shape `concat([chain_batch_shape, param.prior.batch_shape,
      param.prior.event_shape])`. If `None`, the initial state is set
      automatically using a sample from a variational posterior.
      Default value: `None`.
    initial_step_size: Python `list` of `Tensor`s, one for each model parameter,
      representing the step size for the leapfrog integrator. Must
      broadcast with the shape of `initial_state`. Larger step sizes lead to
      faster progress, but too-large step sizes make rejection exponentially
      more likely. If `None`, the step size is set automatically using the
      standard deviation of a variational posterior.
      Default value: `None`.
    chain_batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of chains
      to run in parallel.
      Default value: `[]` (i.e., a single chain).
    num_variational_steps: Python `int` number of steps to run the variational
      optimization to determine the initial state and step sizes.
      Default value: `150`.
    variational_optimizer: Optional `tf.train.Optimizer` instance to use in
      the variational optimization. If `None`, defaults to
      `tf.train.AdamOptimizer(0.1)`.
      Default value: `None`.
    variational_sample_size: Python `int` number of Monte Carlo samples to use
      in estimating the variational divergence. Larger values may stabilize
      the optimization, but at higher cost per step in time and memory.
      Default value: `5`.
    seed: Python integer to seed the random number generator.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'fit_with_hmc').

  Returns:
    samples: Python `list` of `Tensors` representing posterior samples of model
      parameters, with shapes `[concat([[num_results], chain_batch_shape,
      param.prior.batch_shape, param.prior.event_shape]) for param in
      model.parameters]`.
    kernel_results: A (possibly nested) `tuple`, `namedtuple` or `list` of
      `Tensor`s representing internal calculations made within the HMC sampler.

  #### Examples

  Assume we've built a structural time-series model:

  ```python
    day_of_week = tfp.sts.Seasonal(
        num_seasons=7,
        observed_time_series=observed_time_series,
        name='day_of_week')
    local_linear_trend = tfp.sts.LocalLinearTrend(
        observed_time_series=observed_time_series,
        name='local_linear_trend')
    model = tfp.sts.Sum(components=[day_of_week, local_linear_trend],
                        observed_time_series=observed_time_series)
  ```

  To draw posterior samples using HMC under default settings:

  ```python
  samples, kernel_results = tfp.sts.fit_with_hmc(model, observed_time_series)
  print("acceptance rate: {}".format(
    np.mean(kernel_results.inner_results.inner_results.is_accepted, axis=0)))
  print("posterior means: {}".format(
    {param.name: np.mean(param_draws, axis=0)
     for (param, param_draws) in zip(model.parameters, samples)}))
  ```

  We can also run multiple chains. This may help diagnose convergence issues
  and allows us to exploit vectorization to draw samples more quickly, although
  warmup still requires the same number of sequential steps.

  ```python
  from matplotlib import pylab as plt

  samples, kernel_results = tfp.sts.fit_with_hmc(
    model, observed_time_series, chain_batch_shape=[10])
  print("acceptance rate: {}".format(
    np.mean(kernel_results.inner_results.inner_results.is_accepted, axis=0)))

  # Plot the sampled traces for each parameter. If the chains have mixed, their
  # traces should all cover the same region of state space, frequently crossing
  # over each other.
  for (param, param_draws) in zip(model.parameters, samples):
    if param.prior.event_shape.ndims > 0:
      print("Only plotting traces for scalar parameters, skipping {}".format(
        param.name))
      continue
    plt.figure(figsize=[10, 4])
    plt.title(param.name)
    plt.plot(param_draws.numpy())
    plt.ylabel(param.name)
    plt.xlabel("HMC step")

  # Combining the samples from multiple chains into a single dimension allows
  # us to easily pass sampled parameters to downstream forecasting methods.
  combined_samples = [np.reshape(param_draws,
                                 [-1] + list(param_draws.shape[2:]))
                      for param_draws in samples]
  ```

  For greater flexibility, you may prefer to implement your own sampler using
  the TensorFlow Probability primitives in `tfp.mcmc`. The following recipe
  constructs a basic HMC sampler, using a `TransformedTransitionKernel` to
  incorporate constraints on the parameter space.

  ```python
  transformed_hmc_kernel = tfp.mcmc.TransformedTransitionKernel(
      inner_kernel=tfp.mcmc.DualAveragingStepSizeAdaptation(
          inner_kernel=tfp.mcmc.HamiltonianMonteCarlo(
              target_log_prob_fn=model.joint_log_prob(observed_time_series),
              step_size=step_size,
              num_leapfrog_steps=num_leapfrog_steps,
              state_gradients_are_stopped=True,
              seed=seed),
          num_adaptation_steps = int(0.8 * num_warmup_steps)),
      bijector=[param.bijector for param in model.parameters])

  # Initialize from a Uniform[-2, 2] distribution in unconstrained space.
  initial_state = [tfp.sts.sample_uniform_initial_state(
    param, return_constrained=True) for param in model.parameters]

  samples, kernel_results = tfp.mcmc.sample_chain(
    kernel=transformed_hmc_kernel,
    num_results=num_results,
    current_state=initial_state,
    num_burnin_steps=num_warmup_steps)
  ```

  #### References

  [1]: Radford Neal. MCMC Using Hamiltonian Dynamics. _Handbook of Markov Chain
       Monte Carlo_, 2011. https://arxiv.org/abs/1206.1901
  [2]: M.J. Betancourt, Simon Byrne, and Mark Girolami. Optimizing The
       Integrator Step Size for Hamiltonian Monte Carlo.
       https://arxiv.org/abs/1411.6669

  """
    with tf.name_scope(name or 'fit_with_hmc') as name:
        seed = tfp_util.SeedStream(seed,
                                   salt='StructuralTimeSeries_fit_with_hmc')

        observed_time_series = sts_util.pad_batch_dimension_for_multiple_chains(
            observed_time_series, model, chain_batch_shape=chain_batch_shape)
        target_log_prob_fn = model.joint_log_prob(observed_time_series)

        # Initialize state and step sizes from a variational posterior if not
        # specified.
        if initial_step_size is None or initial_state is None:
            variational_posterior = build_factored_surrogate_posterior(
                model, batch_shape=chain_batch_shape, seed=seed())

            if variational_optimizer is None:
                variational_optimizer = tf1.train.AdamOptimizer(
                    learning_rate=0.1
                )  # TODO(b/137299119) Replace with TF2 optimizer.
            loss_curve = vi.fit_surrogate_posterior(
                target_log_prob_fn,
                variational_posterior,
                sample_size=variational_sample_size,
                num_steps=num_variational_steps,
                optimizer=variational_optimizer,
                seed=seed())

            with tf.control_dependencies([loss_curve]):
                if initial_state is None:
                    posterior_sample = variational_posterior.sample()
                    initial_state = [
                        posterior_sample[p.name] for p in model.parameters
                    ]

                # Set step sizes using the unconstrained variational distribution.
                if initial_step_size is None:
                    q_dists_by_name, _ = variational_posterior.sample_distributions(
                    )
                    initial_step_size = [
                        q_dists_by_name[p.name].distribution.stddev()
                        for p in model.parameters
                    ]

        # Run HMC to sample from the posterior on parameters.
        @tf.function(autograph=False)
        def run_hmc():
            return mcmc.sample_chain(
                num_results=num_results,
                current_state=initial_state,
                num_burnin_steps=num_warmup_steps,
                kernel=mcmc.DualAveragingStepSizeAdaptation(
                    inner_kernel=mcmc.TransformedTransitionKernel(
                        inner_kernel=mcmc.HamiltonianMonteCarlo(
                            target_log_prob_fn=target_log_prob_fn,
                            step_size=initial_step_size,
                            num_leapfrog_steps=num_leapfrog_steps,
                            state_gradients_are_stopped=True,
                            seed=seed()),
                        bijector=[
                            param.bijector for param in model.parameters
                        ]),
                    num_adaptation_steps=int(num_warmup_steps * 0.8)),
                parallel_iterations=1 if seed is not None else 10)

        samples, kernel_results = run_hmc()

        return samples, kernel_results
def build_factored_surrogate_posterior(model,
                                       batch_shape=(),
                                       seed=None,
                                       name=None):
    """Build a variational posterior that factors over model parameters.

  The surrogate posterior consists of independent Normal distributions for
  each parameter with trainable `loc` and `scale`, transformed using the
  parameter's `bijector` to the appropriate support space for that parameter.

  Args:
    model: An instance of `StructuralTimeSeries` representing a
        time-series model. This represents a joint distribution over
        time-series and their parameters with batch shape `[b1, ..., bN]`.
    batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of initial
      states to optimize in parallel.
      Default value: `()` (i.e., just run a single optimization).
    seed: Python integer to seed the random number generator.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'build_factored_surrogate_posterior').
  Returns:
    variational_posterior: `tfd.JointDistributionNamed` defining a trainable
        surrogate posterior over model parameters. Samples from this
        distribution are Python `dict`s with Python `str` parameter names as
        keys.

  ### Examples

  Assume we've built a structural time-series model:

  ```python
    day_of_week = tfp.sts.Seasonal(
        num_seasons=7,
        observed_time_series=observed_time_series,
        name='day_of_week')
    local_linear_trend = tfp.sts.LocalLinearTrend(
        observed_time_series=observed_time_series,
        name='local_linear_trend')
    model = tfp.sts.Sum(components=[day_of_week, local_linear_trend],
                        observed_time_series=observed_time_series)
  ```

  To fit the model to data, we define a surrogate posterior and fit it
  by optimizing a variational bound:

  ```python
    surrogate_posterior = tfp.sts.build_factored_surrogate_posterior(
      model=model)
    loss_curve = tfp.vi.fit_surrogate_posterior(
      target_log_prob_fn=model.joint_log_prob(observed_time_series),
      surrogate_posterior=surrogate_posterior,
      optimizer=tf.optimizers.Adam(learning_rate=0.1),
      num_steps=200)
    posterior_samples = surrogate_posterior.sample(50)

    # In graph mode, we would need to write:
    # with tf.control_dependencies([loss_curve]):
    #   posterior_samples = surrogate_posterior.sample(50)
  ```

  For more control, we can also build and optimize a variational loss
  manually:

  ```python
    @tf.function(autograph=False)  # Ensure the loss is computed efficiently
    def loss_fn():
      return tfp.vi.monte_carlo_variational_loss(
        model.joint_log_prob(observed_time_series),
        surrogate_posterior,
        sample_size=10)

    optimizer = tf.optimizers.Adam(learning_rate=0.1)
    for step in range(200):
      with tf.GradientTape() as tape:
        loss = loss_fn()
      grads = tape.gradient(loss, surrogate_posterior.trainable_variables)
      optimizer.apply_gradients(
        zip(grads, surrogate_posterior.trainable_variables))
      if step % 20 == 0:
        print('step {} loss {}'.format(step, loss))

    posterior_samples = surrogate_posterior.sample(50)
  ```

  """
    with tf.name_scope(name or 'build_factored_surrogate_posterior'):
        seed = tfp_util.SeedStream(
            seed,
            salt='StructuralTimeSeries_build_factored_surrogate_posterior')
        variational_posterior = collections.OrderedDict()
        for param in model.parameters:
            variational_posterior[
                param.name] = _build_posterior_for_one_parameter(
                    param, batch_shape=batch_shape, seed=seed())
        return joint_distribution_named_lib.JointDistributionNamed(
            variational_posterior)
def _build_sampler_loop_body(model,
                             observed_time_series,
                             is_missing=None,
                             compile_steps_with_xla=False,
                             seed=None):
  """Builds a Gibbs sampler for the given model and observed data.

  Args:
    model: A `tf.sts.StructuralTimeSeries` model instance. This must be of the
      form constructed by `build_model_for_gibbs_sampling`.
    observed_time_series: Float `Tensor` time series of shape
      `[..., num_timesteps]`.
    is_missing: Optional `bool` `Tensor` of shape `[..., num_timesteps]`. A
      `True` value indicates that the observation for that timestep is missing.
    compile_steps_with_xla: Optional Python `bool`. If `True`, XLA compilation
      is used to accelerate sampling steps when supported.
    seed: Optional Python `int` seed controlling the sampled values.
  Returns:
    sampler_loop_body: Python callable that performs a single cycle of Gibbs
      sampling. Its first argument is a `GibbsSamplerState`, and it returns a
      new `GibbsSamplerState`. The second argument (passed by `tf.scan`) is
      ignored.
  """

  # Require that the model has exactly the parameters expected by
  # `GibbsSamplerState`.
  observation_noise_param, level_scale_param, weights_param = model.parameters
  if (('observation_noise' not in observation_noise_param.name) or
      ('level_scale' not in level_scale_param.name) or
      ('weights' not in weights_param.name)):
    raise ValueError('Model parameters {} do not match the expected sampler '
                     'state.'.format(model.parameters))

  level_component = model.components[0]
  if not isinstance(level_component, sts.LocalLevel):
    raise ValueError('Expected the first model component to be an instance of '
                     '`tfp.sts.LocalLevel`; instead saw {}'.format(
                         level_component))

  if is_missing is not None:  # Ensure series does not contain NaNs.
    observed_time_series = tf.where(is_missing,
                                    tf.zeros_like(observed_time_series),
                                    observed_time_series)

  num_observed_steps = prefer_static.shape(observed_time_series)[-1]
  design_matrix = _get_design_matrix(model).to_dense()[:num_observed_steps]

  # Compile the functions that sample from Gibbs conditional posteriors.
  # In principle, we should XLA-compile the entire loop body or even the entire
  # `fit_with_gibbs_sampling` loop. However, XLA can't currently compile the
  # gamma sampling op inside `_resample_scale` (b/141253568), so for now we
  # leave that method uncompiled but compile the other two sampling steps.
  # Empirically, the vast majority of sampling time is spent in
  # `resample_level`, so compiling it gives us most of the wins.
  # TODO(davmre): Wrap the entire sampling loop in `tf.function` while still
  #  XLA-compiling these pieces as appropriate.
  # TODO(b/141253568): XLA-compile the entire sampling loop.
  compiled_resample_level = _build_resample_level_fn(
      initial_state_prior=level_component.initial_state_prior,
      is_missing=is_missing,
      compile_with_xla=compile_steps_with_xla)
  compiled_resample_weights = tf.function(
      _resample_weights, autograph=False,
      experimental_compile=compile_steps_with_xla)
  compiled_resample_scale = tf.function(
      _resample_scale, autograph=False,
      experimental_compile=False)

  # Untransform scale priors -> variance priors by reaching thru Sqrt bijector.
  level_scale_variance_prior = level_scale_param.prior.distribution
  observation_noise_variance_prior = observation_noise_param.prior.distribution

  # InverseGamma samplers are currently stateful, so we only need (and want)
  # a single seed for each, shared across loop iterations.
  strm = tfp_util.SeedStream(seed, salt='_sampler_loop_body')
  observation_noise_scale_seed = strm()
  level_scale_seed = strm()

  def sampler_loop_body(previous_sample, _):
    """Runs one sampler iteration, resampling all model variables."""

    (weights_seed,
     level_seed,
     loop_seed) = samplers.split_seed(
         previous_sample.seed, n=3, salt='sampler_loop_body')

    # We encourage a reasonable initialization by sampling the weights first,
    # so at the first step they are regressed directly against the observed
    # time series. If we instead sampled the level first it might 'explain away'
    # some observed variation that we would ultimately prefer to explain through
    # the regression weights, because the level can represent arbitrary
    # variation, while the weights are limited to representing variation in the
    # subspace given by the design matrix.
    weights = compiled_resample_weights(
        design_matrix=design_matrix,
        target_residuals=(observed_time_series - previous_sample.level),
        observation_noise_scale=previous_sample.observation_noise_scale,
        weights_prior_scale=weights_param.prior.distribution.scale,
        is_missing=is_missing,
        seed=weights_seed)

    regression_residuals = observed_time_series - tf.linalg.matvec(
        design_matrix, weights)
    level = compiled_resample_level(
        observed_residuals=regression_residuals,
        level_scale=previous_sample.level_scale,
        observation_noise_scale=previous_sample.observation_noise_scale,
        seed=level_seed)

    # Estimate level scale from the empirical changes in level.
    level_scale = compiled_resample_scale(
        prior_scale=level_scale_variance_prior.scale,
        prior_concentration=level_scale_variance_prior.concentration,
        observed_residuals=level[..., 1:] - level[..., :-1],
        is_missing=None, seed=level_scale_seed)
    # Estimate noise scale from the residuals.
    observation_noise_scale = compiled_resample_scale(
        prior_scale=observation_noise_variance_prior.scale,
        prior_concentration=observation_noise_variance_prior.concentration,
        observed_residuals=regression_residuals - level,
        is_missing=is_missing, seed=observation_noise_scale_seed)

    return GibbsSamplerState(
        observation_noise_scale=observation_noise_scale,
        level_scale=level_scale,
        weights=weights,
        level=level,
        seed=loop_seed)
  return sampler_loop_body
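# A hedged sketch of driving the returned loop body (an assumption based on the
# `tf.scan` hint in the docstring; `num_results`, `num_features`, and the
# initial-state values below are illustrative placeholders):
#
#   loop_body = _build_sampler_loop_body(model, observed_time_series)
#   initial_state = GibbsSamplerState(
#       observation_noise_scale=tf.ones([]),
#       level_scale=tf.ones([]),
#       weights=tf.zeros([num_features]),
#       level=tf.zeros_like(observed_time_series),
#       seed=samplers.sanitize_seed(42))
#   samples = tf.scan(loop_body,
#                     elems=tf.range(num_results),
#                     initializer=initial_state)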
def build_factored_variational_loss(model,
                                    observed_time_series,
                                    init_batch_shape=(),
                                    seed=None,
                                    name=None):
  """Build a loss function for variational inference in STS models.

  Variational inference searches for the distribution within some family of
  approximate posteriors that minimizes a divergence between the approximate
  posterior `q(z)` and true posterior `p(z|observed_time_series)`. By converting
  inference to optimization, it's generally much faster than sampling-based
  inference algorithms such as HMC. The tradeoff is that the approximating
  family rarely contains the true posterior, so it may miss important aspects of
  posterior structure (in particular, dependence between variables) and should
  not be blindly trusted. Results may vary; it's generally wise to compare to
  HMC to evaluate whether inference quality is sufficient for your task at hand.

  This method constructs a loss function for variational inference using the
  Kullback-Leibler divergence `KL[q(z) || p(z|observed_time_series)]`, with an
  approximating family given by independent Normal distributions transformed to
  the appropriate parameter space for each parameter. Minimizing this loss (the
  negative ELBO) maximizes a lower bound on the log model evidence
  `log p(observed_time_series)`. This is equivalent to the 'mean-field' method
  implemented in [1], and is a standard approach. The resulting posterior
  approximations are unimodal; they will tend to underestimate posterior
  uncertainty when the true posterior contains multiple modes (the `KL[q||p]`
  divergence encourages choosing a single mode) or dependence between variables.

  Args:
    model: An instance of `StructuralTimeSeries` representing a
      time-series model. This represents a joint distribution over
      time-series and their parameters with batch shape `[b1, ..., bN]`.
    observed_time_series: `float` `Tensor` of shape
      `concat([sample_shape, model.batch_shape, [num_timesteps, 1]])`, where
      `sample_shape` corresponds to i.i.d. observations, and the trailing `[1]`
      dimension may (optionally) be omitted if `num_timesteps > 1`. May
      optionally be an instance of `tfp.sts.MaskedTimeSeries`, which includes
      a mask `Tensor` to specify timesteps with missing observations.
    init_batch_shape: Batch shape (Python `tuple`, `list`, or `int`) of initial
      states to optimize in parallel.
      Default value: `()` (i.e., just run a single optimization).
    seed: Python integer to seed the random number generator.
    name: Python `str` name prefixed to ops created by this function.
      Default value: `None` (i.e., 'build_factored_variational_loss').

  Returns:
    variational_loss: `float` `Tensor` of shape
      `concat([init_batch_shape, model.batch_shape])`, encoding a stochastic
      estimate of an upper bound on the negative model evidence `-log p(y)`.
      Minimizing this loss performs variational inference; the gap between the
      variational bound and the true (generally unknown) model evidence
      corresponds to the divergence `KL[q||p]` between the approximate and true
      posterior.
    variational_distributions: `collections.OrderedDict` giving
      the approximate posterior for each model parameter. The keys are
      Python `str` parameter names in order, corresponding to
      `[param.name for param in model.parameters]`. The values are
      `tfd.Distribution` instances with batch shape
      `concat([init_batch_shape, model.batch_shape])`; these will typically be
      of the form `tfd.TransformedDistribution(tfd.Normal(...),
      bijector=param.bijector)`.

  #### Examples

  Assume we've built a structural time-series model:

  ```python
    day_of_week = tfp.sts.Seasonal(
        num_seasons=7,
        observed_time_series=observed_time_series,
        name='day_of_week')
    local_linear_trend = tfp.sts.LocalLinearTrend(
        observed_time_series=observed_time_series,
        name='local_linear_trend')
    model = tfp.sts.Sum(components=[day_of_week, local_linear_trend],
                        observed_time_series=observed_time_series)
  ```

  To run variational inference, we simply construct the loss and optimize
  it:

  ```python
    (variational_loss,
     variational_distributions) = tfp.sts.build_factored_variational_loss(
       model=model, observed_time_series=observed_time_series)

    train_op = tf.train.AdamOptimizer(0.1).minimize(variational_loss)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())

      for step in range(200):
        _, loss_ = sess.run((train_op, variational_loss))

        if step % 20 == 0:
          print("step {} loss {}".format(step, loss_))

      posterior_samples_ = sess.run({
        param_name: q.sample(50)
        for param_name, q in variational_distributions.items()})
  ```

  As a more complex example, we might try to avoid local optima by optimizing
  from multiple initializations in parallel, and selecting the result with the
  lowest loss:

  ```python
    (variational_loss,
     variational_distributions) = tfp.sts.build_factored_variational_loss(
       model=model, observed_time_series=observed_time_series,
       init_batch_shape=[10])

    train_op = tf.train.AdamOptimizer(0.1).minimize(variational_loss)
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())

      for step in range(200):
        _, loss_ = sess.run((train_op, variational_loss))

        if step % 20 == 0:
          print("step {} losses {}".format(step, loss_))

      # Draw multiple samples to reduce Monte Carlo error in the optimized
      # variational bounds.
      avg_loss = np.mean(
        [sess.run(variational_loss) for _ in range(25)], axis=0)
      best_posterior_idx = np.argmin(avg_loss, axis=0).astype(np.int32)
  ```

  #### References

  [1]: Alp Kucukelbir, Dustin Tran, Rajesh Ranganath, Andrew Gelman, and
       David M. Blei. Automatic Differentiation Variational Inference. In
       _Journal of Machine Learning Research_, 2017.
       https://arxiv.org/abs/1603.00788

  """

  with tf.name_scope(name or 'build_factored_variational_loss') as name:
    seed = tfp_util.SeedStream(
        seed, salt='StructuralTimeSeries_build_factored_variational_loss')

    variational_posterior = build_factored_surrogate_posterior(
        model, batch_shape=init_batch_shape, seed=seed())

    # Multiple initializations (similar to HMC chains) manifest as an extra
    # param batch dimension, so we need to add corresponding batch dimension(s)
    # to `observed_time_series`.
    observed_time_series = sts_util.pad_batch_dimension_for_multiple_chains(
        observed_time_series, model, chain_batch_shape=init_batch_shape)

    loss = vi.monte_carlo_variational_loss(
        model.joint_log_prob(observed_time_series),
        surrogate_posterior=variational_posterior,
        sample_size=1,
        seed=seed())

    ds, _ = variational_posterior._flat_sample_distributions()  # pylint: disable=protected-access
    ds_dict = variational_posterior._model_unflatten(ds)  # pylint: disable=protected-access
    variational_distributions = collections.OrderedDict([
        (p.name, ds_dict[p.name]) for p in model.parameters])
    return loss, variational_distributions