Example #1
 def to_noncentered(centered_state):
     set_values = ed_transforms.make_value_setter(*centered_state)
     with ed.tape() as noncentered_tape:
         with ed.interception(ed_transforms.ncp):
             with ed.interception(set_values):
                 model(*model_args)
     return [tf.identity(v) for v in list(noncentered_tape.values())[:-1]]
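A minimal sketch of what `ed.tape()` records, which the transform above relies on (assuming the Edward2 API used throughout these examples, e.g. `from tensorflow_probability import edward2 as ed`): the tape is an OrderedDict mapping each RandomVariable's name to the variable in creation order, so slicing off the last entry drops the model's output/observed variable.

from tensorflow_probability import edward2 as ed

def toy_model():
    z = ed.Normal(loc=0., scale=1., name="z")  # latent
    x = ed.Normal(loc=z, scale=1., name="x")   # output / observed
    return x

with ed.tape() as tape:
    toy_model()

print(list(tape.keys()))            # ["z", "x"]
latents = list(tape.values())[:-1]  # drop the final (output) variable, as above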
Example #2
    def _sanity_check_conversion(self, model, model_args, observed, to_cp,
                                 to_ncp, make_to_cp):

        with ed.tape() as model_tape:
            model(*model_args)

        model_tape_ = self.evaluate(model_tape)
        example_params = list(model_tape_.values())[:-1]

        # Test that `make_to_cp`, when given the centered parameterization as the
        # source, generates the identity fn.
        param_names = [
            p for v in model_tape_.keys() for p in (v + '_a', v + '_b')
        ]
        centered_parameterization = {p: 1. for p in param_names}
        identity_cp = make_to_cp(**centered_parameterization)
        example_params_copy = identity_cp(example_params)
        c1_ = self.evaluate(example_params_copy)
        c2_ = self.evaluate(example_params_copy)
        self.assertAllClose(c1_, c2_)
        self.assertAllClose(c1_, example_params)

        # Test that `to_ncp` and `to_cp` are deterministic and consistent
        ncp_params = to_ncp(example_params)
        cp_params = to_cp(ncp_params)

        ncp_params_, cp_params_ = self.evaluate((ncp_params, cp_params))
        ncp_params2_, cp_params2_ = self.evaluate((ncp_params, cp_params))
        # Test determinism
        self.assertAllClose(ncp_params_, ncp_params2_)
        self.assertAllClose(cp_params_, cp_params2_)

        # Test round-trip consistency:
        self.assertAllClose(cp_params_, example_params)
Example #3
 def loop_body(i):  # trace the model to draw a single joint sample
     with ed.tape() as model_tape:
         model(*model_args)
     # pfor works with Tensors only, so extract RV values
     values = collections.OrderedDict(
         (k, rv.value) for k, rv in model_tape.items())
     return values
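A hedged sketch of mapping such a loop body over several draws at once with TF's public pfor wrapper; `tf.vectorized_map` and `num_draws` are assumptions for illustration, and whether every op in the model converts under pfor depends on the model.

import tensorflow as tf

num_draws = 16  # hypothetical number of joint samples
joint_samples = tf.vectorized_map(loop_body, tf.range(num_draws))
# joint_samples maps each RV name to a Tensor with a leading axis of size num_draws.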
Example #4
 def to_centered(uncentered_state):
     set_values = ed_transforms.make_value_setter(*uncentered_state)
     with ed.interception(set_values):
         with ed.interception(parametrisation):
             with ed.tape() as centered_tape:
                 model(*model_args)
     return [tf.identity(v) for v in list(centered_tape.values())[:-1]]
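For reference, a sketch of the keyword-based value-setting interceptor that later examples call as `make_value_setter(w2=qw2, ...)`; the positional `ed_transforms.make_value_setter` used above is assumed to behave analogously, assigning the given values to the traced variables in declaration order.

def make_value_setter(**model_kwargs):
    """Interceptor that fixes named random variables to the given values."""
    def set_values(f, *args, **kwargs):
        name = kwargs.get("name")
        if name in model_kwargs:
            kwargs["value"] = model_kwargs[name]
        return ed.interceptable(f)(*args, **kwargs)
    return set_values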
Example #5
    def _build_model(self):

        with contextmanager.randvar_registry.init(self.graph):
            with contextmanager.layer_registry.init():
                # use edward2 model tape to capture RandomVariable declarations
                with ed.tape() as model_tape:
                    self.builder()

                # store the losses from the layers built through layers.sequential.Sequential
                # NOTE: this must be done inside the layer_registry context, where the sequentials are stored
                self.layer_losses = contextmanager.layer_registry.get_losses()

            # get variables from parameters
            var_parameters = contextmanager.randvar_registry.get_var_parameters()

            # wrap captured edward2 RVs into inferpy RVs
            model_vars = OrderedDict()
            for k, v in model_tape.items():
                registered_rv = contextmanager.randvar_registry.get_variable(k)
                if registered_rv is None:
                    # An ed RandomVariable: create an inferpy RandomVariable and assign the var directly.
                    # We do not know the args and kwargs used to build the ed random variable, so use None.
                    model_vars[k] = RandomVariable(v,
                                                   name=k,
                                                   is_datamodel=False,
                                                   ed_cls=None,
                                                   var_args=None,
                                                   var_kwargs=None,
                                                   sample_shape=())
                else:
                    model_vars[k] = registered_rv

        return model_vars, var_parameters
Example #6
def run_parametrised_hmc(model_config,
                         interceptor,
                         num_samples=2000,
                         burnin=1000,
                         num_leapfrog_steps=4,
                         num_adaptation_steps=500,
                         num_optimization_steps=2000):
  """Given a (centred) model, this function transforms it based on the provided

  interceptor, and runs HMC on the reparameterised model.
  """

  def model_ncp(*params):
    with ed.interception(interceptor):
      return model_config.model(*params)

  log_joint_noncentered = ed.make_log_joint_fn(model_ncp)

  with ed.tape() as model_tape:
    _ = model_ncp(*model_config.model_args)

  param_shapes = collections.OrderedDict()
  target_ncp_kwargs = {}
  for param in model_tape.keys():
    if param not in model_config.observed_data.keys():
      param_shapes[param] = model_tape[param].shape
    else:
      target_ncp_kwargs[param] = model_config.observed_data[param]

  def target_ncp(*param_args):
    i = 0
    for param in model_tape.keys():
      if param not in model_config.observed_data.keys():
        target_ncp_kwargs[param] = param_args[i]
        i = i + 1

    return log_joint_noncentered(*model_config.model_args, **target_ncp_kwargs)

  stepsize_kwargs = {'num_leapfrog_steps': num_leapfrog_steps}
  stepsize_kwargs['num_optimization_steps'] = num_optimization_steps
  for key in model_config.observed_data:
    stepsize_kwargs[key] = model_config.observed_data[key]
  (step_size_init_ncp, stepsize_elbo_ncp,
   vi_time) = util.approximate_mcmc_step_size(model_ncp,
                                              *model_config.model_args,
                                              **stepsize_kwargs)

  results = _run_hmc(
      target_ncp,
      param_shapes,
      step_size_init=step_size_init_ncp,
      transform=model_config.to_centered,
      num_samples=num_samples,
      burnin=burnin,
      num_adaptation_steps=num_adaptation_steps,
      num_leapfrog_steps=num_leapfrog_steps)

  results['elbo'] = stepsize_elbo_ncp
  results['vi_time'] = vi_time
  return results
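A small usage sketch (the zero initial state is a placeholder, not from the original source): the target built above expects one positional tensor per latent variable, in the order of `param_shapes`, and forwards them to the log-joint by name.

initial_state = [tf.zeros(shape) for shape in param_shapes.values()]
initial_log_prob = target_ncp(*initial_state)  # scalar Tensor, usable as an HMC target_log_prob_fn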
Example #7
def make_cvip_graph(model_config,
                    parameterisation_type='exp',
                    tied_pparams=False):
    """
                Constructs the cVIP graph of the given model.
                Resets the default TF graph.
        """

    tf.reset_default_graph()

    results = collections.OrderedDict()

    (learnable_parameters, learnable_parametrisation,
     _) = ed_transforms.make_learnable_parametrisation(
         tau=1.,
         parameterisation_type=parameterisation_type,
         tied_pparams=tied_pparams)

    def model_vip(*params):
        with ed.interception(learnable_parametrisation):
            return model_config.model(*params)

    if model_config.bijectors_fn is not None:
        model_vip = ed_transforms.transform_with_bijectors(
            model_vip, model_config.bijectors_fn)

    log_joint_vip = ed.make_log_joint_fn(model_vip)  # log_joint_fn

    with ed.tape() as model_tape:
        _ = model_vip(*model_config.model_args)

    target_vip_kwargs = {}
    for param in model_tape.keys():
        if param in model_config.observed_data.keys():
            target_vip_kwargs[param] = model_config.observed_data[param]

    def target_vip(*param_args):  # latent_log_joint_fn
        i = 0
        for param in model_tape.keys():
            if param not in model_config.observed_data.keys():
                target_vip_kwargs[param] = param_args[i]
                i = i + 1
        return log_joint_vip(*model_config.model_args, **target_vip_kwargs)

    #full_kwargs = collections.OrderedDict(model_config.observed_data.items())
    #full_kwargs['parameterisation'] = collections.OrderedDict()
    #for k in learnable_parameters.keys():
    #	full_kwargs['parameterisation'][k] = learnable_parameters[k]

    elbo, variational_parameters = util.get_mean_field_elbo(
        model_vip,
        target_vip,
        num_mc_samples=FLAGS.num_mc_samples,
        model_args=model_config.model_args,
        model_obs_kwargs=model_config.observed_data,
        vi_kwargs={'parameterisation':
                   learnable_parameters})  #vi_kwargs=full_kwargs

    return target_vip, model_vip, elbo, variational_parameters, learnable_parameters
Example #8
  def testTapeNoName(self):
    def model():
      x = ed.Normal(loc=0., scale=1., name="x")
      y = ed.Normal(loc=x, scale=1.)
      return x + y

    with ed.tape() as model_tape:
      _ = model()

    self.assertEqual(list(six.iterkeys(model_tape)), ["x"])
Example #9
  def testTapeNoName(self):
    def model():
      x = ed.Normal(loc=0., scale=1., name="x")
      y = ed.Normal(loc=x, scale=1.)
      return x + y

    with ed.tape() as model_tape:
      _ = model()

    self.assertEqual(list(six.iterkeys(model_tape)), ["x"])
Example #10
    def to_noncentered(centered_state):
        set_values = ed_transforms.make_value_setter(*centered_state)
        with ed.tape() as noncentered_tape:
            with ed.interception(ed_transforms.ncp):
                with ed.interception(set_values):
                    model(*model_args)

        param_vals = [
            tf.identity(v) for k, v in noncentered_tape.items()
            if k not in observed_data.keys()
        ]
        return param_vals
Example #11
def run_centered_hmc(model_config,
                     num_samples=2000,
                     burnin=1000,
                     num_leapfrog_steps=4,
                     num_adaptation_steps=500,
                     num_optimization_steps=2000):
  """Runs HMC on the provided (centred) model."""

  tf.compat.v1.reset_default_graph()

  log_joint_centered = ed.make_log_joint_fn(model_config.model)

  with ed.tape() as model_tape:
    _ = model_config.model(*model_config.model_args)

  param_shapes = collections.OrderedDict()
  target_cp_kwargs = {}
  for param in model_tape.keys():
    if param not in model_config.observed_data.keys():
      param_shapes[param] = model_tape[param].shape
    else:
      target_cp_kwargs[param] = model_config.observed_data[param]

  def target_cp(*param_args):
    i = 0
    for param in model_tape.keys():
      if param not in model_config.observed_data.keys():
        target_cp_kwargs[param] = param_args[i]
        i = i + 1

    return log_joint_centered(*model_config.model_args, **target_cp_kwargs)

  stepsize_kwargs = {'num_leapfrog_steps': num_leapfrog_steps}
  stepsize_kwargs['num_optimization_steps'] = num_optimization_steps
  for key in model_config.observed_data:
    stepsize_kwargs[key] = model_config.observed_data[key]
  (step_size_init_cp, stepsize_elbo_cp,
   vi_time) = util.approximate_mcmc_step_size(model_config.model,
                                              *model_config.model_args,
                                              **stepsize_kwargs)

  results = _run_hmc(
      target_cp,
      param_shapes,
      step_size_init=step_size_init_cp,
      num_samples=num_samples,
      burnin=burnin,
      num_adaptation_steps=num_adaptation_steps,
      num_leapfrog_steps=num_leapfrog_steps)

  results['elbo'] = stepsize_elbo_cp
  results['vi_time'] = vi_time
  return results
Example #12
  def testTape(self):
    def model():
      x = ed.Normal(loc=0., scale=1., name="x")
      y = ed.Normal(loc=x, scale=1., name="y")
      return x + y

    with ed.tape() as model_tape:
      output = model()

    expected_value, actual_value = self.evaluate([
        model_tape["x"] + model_tape["y"], output])
    self.assertEqual(list(six.iterkeys(model_tape)), ["x", "y"])
    self.assertEqual(expected_value, actual_value)
Example #13
  def testTape(self):
    def model():
      x = ed.Normal(loc=0., scale=1., name="x")
      y = ed.Normal(loc=x, scale=1., name="y")
      return x + y

    with ed.tape() as model_tape:
      output = model()

    expected_value, actual_value = self.evaluate([
        model_tape["x"] + model_tape["y"], output])
    self.assertEqual(list(six.iterkeys(model_tape)), ["x", "y"])
    self.assertEqual(expected_value, actual_value)
Example #14
def make_dvip_graph(model_config, reparam, parameterisation_type='exp'):
    """
                Constructs the dVIP graph of the given model, where `reparam` is
                a cVIP
                reparameterisation obtained previously.
                Resets the default TF graph.
        """

    tf.reset_default_graph()

    results = collections.OrderedDict()

    _, insightful_parametrisation, _ = ed_transforms.make_learnable_parametrisation(
        learnable_parameters=reparam,
        parameterisation_type=parameterisation_type)

    def model_vip(*params):
        with ed.interception(insightful_parametrisation):
            return model_config.model(*params)

    if model_config.bijectors_fn is not None:
        model_vip = ed_transforms.transform_with_bijectors(
            model_vip, model_config.bijectors_fn)

    log_joint_vip = ed.make_log_joint_fn(model_vip)  # log_joint_fn

    with ed.tape() as model_tape:
        _ = model_vip(*model_config.model_args)

    target_vip_kwargs = {}
    for param in model_tape.keys():
        if param in model_config.observed_data.keys():
            target_vip_kwargs[param] = model_config.observed_data[param]

    def target_vip(*param_args):  # latent_log_joint_fn
        i = 0
        for param in model_tape.keys():
            if param not in model_config.observed_data.keys():
                target_vip_kwargs[param] = param_args[i]
                i = i + 1
        return log_joint_vip(*model_config.model_args, **target_vip_kwargs)

    elbo, variational_parameters = util.get_mean_field_elbo(
        model_vip,
        target_vip,
        num_mc_samples=FLAGS.num_mc_samples,
        model_args=model_config.model_args,
        model_obs_kwargs=model_config.observed_data,
        vi_kwargs={'parameterisation': reparam})

    return target_vip, model_vip, elbo, variational_parameters, None
Example #15
  def testTapeInnerForwarding(self):
    def double(f, *args, **kwargs):
      return 2. * ed.interceptable(f)(*args, **kwargs)

    def model():
      x = ed.Normal(loc=0., scale=1., name="x")
      y = ed.Normal(loc=x, scale=1., name="y")
      return x + y

    with ed.interception(double):
      with ed.tape() as model_tape:
        output = model()

    expected_value, actual_value = self.evaluate([
        model_tape["x"] + model_tape["y"], output])
    self.assertEqual(list(six.iterkeys(model_tape)), ["x", "y"])
    self.assertEqual(expected_value, actual_value)
Example #16
def make_ncp_graph(model_config):
    """
                Constructs the CP graph of the given model.
                Resets the default TF graph.
        """
    tf.reset_default_graph()

    interceptor = ed_transforms.ncp

    def model_ncp(*params):
        with ed.interception(interceptor):
            return model_config.model(*params)

    if model_config.bijectors_fn is not None:
        model_ncp = ed_transforms.transform_with_bijectors(
            model_ncp, model_config.bijectors_fn)

    log_joint_noncentered = ed.make_log_joint_fn(model_ncp)

    with ed.tape() as model_tape:
        _ = model_ncp(*model_config.model_args)

    target_ncp_kwargs = {}
    for param in model_tape.keys():
        if param in model_config.observed_data.keys():
            target_ncp_kwargs[param] = model_config.observed_data[param]

    def target_ncp(*param_args):
        i = 0
        for param in model_tape.keys():
            if param not in model_config.observed_data.keys():
                target_ncp_kwargs[param] = param_args[i]
                i = i + 1

        return log_joint_noncentered(*model_config.model_args,
                                     **target_ncp_kwargs)

    elbo, variational_parameters = util.get_mean_field_elbo(
        model_config.model,
        target_ncp,
        num_mc_samples=FLAGS.num_mc_samples,
        model_args=model_config.model_args,
        model_obs_kwargs=model_config.observed_data,
        vi_kwargs=None)

    return target_ncp, model_ncp, elbo, variational_parameters, None
Example #17
  def testTapeInnerForwarding(self):
    def double(f, *args, **kwargs):
      return 2. * ed.interceptable(f)(*args, **kwargs)

    def model():
      x = ed.Normal(loc=0., scale=1., name="x")
      y = ed.Normal(loc=x, scale=1., name="y")
      return x + y

    with ed.interception(double):
      with ed.tape() as model_tape:
        output = model()

    expected_value, actual_value = self.evaluate([
        model_tape["x"] + model_tape["y"], output])
    self.assertEqual(list(six.iterkeys(model_tape)), ["x", "y"])
    self.assertEqual(expected_value, actual_value)
Example #18
    def _build_model(self):
        # get the global variables defined before building the model
        _before_global_variables = tf.global_variables()

        with contextmanager.randvar_registry.init(self.graph):
            # use edward2 model tape to capture RandomVariable declarations
            with ed.tape() as model_tape:
                self.builder()

            # get variables from parameters
            var_parameters = contextmanager.randvar_registry.get_var_parameters()

            # wrap captured edward2 RVs into inferpy RVs
            model_vars = OrderedDict()
            for k, v in model_tape.items():
                registered_rv = contextmanager.randvar_registry.get_variable(k)
                if registered_rv is None:
                    # An ed RandomVariable: create an inferpy RandomVariable and assign the var directly.
                    # We do not know the args and kwargs used to build the ed random variable, so use None.
                    model_vars[k] = RandomVariable(v,
                                                   name=k,
                                                   is_datamodel=False,
                                                   ed_cls=None,
                                                   var_args=None,
                                                   var_kwargs=None,
                                                   sample_shape=())
                else:
                    model_vars[k] = registered_rv

        # get the global variables defined after building the model
        _after_global_variables = tf.global_variables()
        # compute the new global variables defined when building the model
        created_vars = [
            v for v in _after_global_variables
            if v not in _before_global_variables
        ]
        util.get_session().run(tf.variables_initializer(created_vars))

        return model_vars, var_parameters
Example #19
def make_cp_graph(model_config):
    """
                Constructs the CP graph of the given model.
                Resets the default TF graph.
        """

    tf.reset_default_graph()

    log_joint_centered = ed.make_log_joint_fn(model_config.model)

    with ed.tape() as model_tape:
        _ = model_config.model(*model_config.model_args)

    param_shapes = collections.OrderedDict()
    target_cp_kwargs = {}
    for param in model_tape.keys():
        if param not in model_config.observed_data.keys():
            param_shapes[param] = model_tape[param].shape
        else:
            target_cp_kwargs[param] = model_config.observed_data[param]

    def target_cp(*param_args):
        i = 0
        for param in model_tape.keys():
            if param not in model_config.observed_data.keys():
                target_cp_kwargs[param] = param_args[i]
                i = i + 1

        return log_joint_centered(*model_config.model_args, **target_cp_kwargs)

    elbo, variational_parameters = util.get_mean_field_elbo(
        model_config.model,
        target_cp,
        num_mc_samples=FLAGS.num_mc_samples,
        model_args=model_config.model_args,
        model_obs_kwargs=model_config.observed_data,
        vi_kwargs=None)

    return target_cp, model_config.model, elbo, variational_parameters, None
Example #20
def _make_likelihood(rv_dict, model):
    """Produces optimizable tensor for model likelihood.

    Args:
        rv_dict: (dict of RandomVariable) Dictionary of random variables
            representing variational family for each model parameter.
        model: (Model) A model that contains definition, likelihood and
            training labels.

    Returns:
        log_likelihood: (tf.Tensor) A likelihood tensor with registered
            gradient with respect to VI parameters.
        outcome_rv: (ed.RandomVariable) A random variable representing
            model's predictive distribution.
        model_tape: (ContextManager) A ContextManager recording the
            model variables in model graph.
    """
    with ed.tape() as model_tape:
        with ed.interception(model_util.make_value_setter(**rv_dict)):
            outcome_rv = model.definition()

    log_likelihood = model.likelihood(outcome_rv, model.outcome_obs)

    return log_likelihood, outcome_rv, model_tape
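A minimal sketch of wiring the returned values into a mean-field VI objective, assuming `rv_dict` maps each model parameter name to a variational ed.RandomVariable whose name matches the corresponding prior in the tape:

log_likelihood, outcome_rv, model_tape = _make_likelihood(rv_dict, model)

kl = 0.
for name, q_rv in rv_dict.items():
    kl += tf.reduce_sum(
        q_rv.distribution.kl_divergence(model_tape[name].distribution))

elbo = tf.reduce_sum(log_likelihood) - kl
train_op = tf.train.AdamOptimizer(1e-3).minimize(-elbo)  # hypothetical learning rate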
Example #21
    return set_values


DATA_SIZE = 100
FEATURE_SIZE = 41
UNITS = [23, 7, 2]
SHAPE = 0.1
x, w2, w1, w0, z2, z1, z0 = deep_exponential_family(DATA_SIZE, FEATURE_SIZE,
                                                    UNITS, SHAPE)
qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational(
    w2, w1, w0, z2, z1, z0)

# x_sample = np.random.poisson(5., size=[DATA_SIZE, FEATURE_SIZE])  # generate synthetic training data whose size matches the model
x_sample = tf.placeholder(tf.float32,
                          shape=[DATA_SIZE, FEATURE_SIZE])  # a placeholder can be used instead
with ed.tape() as model_tape:
    with ed.interception(
            make_value_setter(w2=qw2, w1=qw1, w0=qw0, z2=qz2, z1=qz1, z0=qz0)):
        # replace the model's random variables with the posterior distributions, producing the posterior predictive
        posterior_predictive, _, _, _, _, _, _ = deep_exponential_family(
            DATA_SIZE, FEATURE_SIZE, UNITS, SHAPE)
log_likelihood = posterior_predictive.distribution.log_prob(x_sample)
print(log_likelihood)  # log_likelihood is the log-likelihood computed from x_sample

# Define the loss using the variational (ELBO) objective
kl = 0.
for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2),
                                ("w0", qw0), ("w1", qw1), ("w2", qw2)]:
    # rv_name is the name of the prior distribution
    # variational_rv is the corresponding posterior (variational) distribution
    kl += tf.reduce_sum(
        variational_rv.distribution.kl_divergence(
            model_tape[rv_name].distribution))
Example #22
def model_fn(features, labels, mode, params, config):
  """Builds the model function for use in an Estimator.

  Arguments:
    features: The input features for the Estimator.
    labels: The labels, unused here.
    mode: Signifies whether it is train or test or predict.
    params: Some hyperparameters as a dictionary.
    config: The RunConfig, unused here.

  Returns:
    EstimatorSpec: A tf.estimator.EstimatorSpec instance.
  """
  del labels, config

  # Set up the model's learnable parameters.
  logit_concentration = tf.get_variable(
      "logit_concentration",
      shape=[1, params["num_topics"]],
      initializer=tf.constant_initializer(
          _softplus_inverse(params["prior_initial_value"])))
  concentration = _clip_dirichlet_parameters(
      tf.nn.softplus(logit_concentration))

  num_words = features.shape[1]
  topics_words_logits = tf.get_variable(
      "topics_words_logits",
      shape=[params["num_topics"], num_words],
      initializer=tf.glorot_normal_initializer())
  topics_words = tf.nn.softmax(topics_words_logits, axis=-1)

  # Compute expected log-likelihood. First, sample from the variational
  # distribution; second, compute the log-likelihood given the sample.
  lda_variational = make_lda_variational(
      params["activation"],
      params["num_topics"],
      params["layer_sizes"])
  with ed.tape() as variational_tape:
    _ = lda_variational(features)

  with ed.tape() as model_tape:
    with ed.interception(
        make_value_setter(topics=variational_tape["topics_posterior"])):
      posterior_predictive = latent_dirichlet_allocation(concentration,
                                                         topics_words)

  log_likelihood = posterior_predictive.distribution.log_prob(features)
  tf.summary.scalar("log_likelihood", tf.reduce_mean(log_likelihood))

  # Compute the KL-divergence between two Dirichlets analytically.
  # The sampled KL does not work well for "sparse" distributions
  # (see Appendix D of [2]).
  kl = variational_tape["topics_posterior"].distribution.kl_divergence(
      model_tape["topics"].distribution)
  tf.summary.scalar("kl", tf.reduce_mean(kl))

  # Ensure that the KL is non-negative (up to a very small slack).
  # Negative KL can happen due to numerical instability.
  with tf.control_dependencies([tf.assert_greater(kl, -1e-3, message="kl")]):
    kl = tf.identity(kl)

  elbo = log_likelihood - kl
  avg_elbo = tf.reduce_mean(elbo)
  tf.summary.scalar("elbo", avg_elbo)
  loss = -avg_elbo

  # Perform variational inference by minimizing the -ELBO.
  global_step = tf.train.get_or_create_global_step()
  optimizer = tf.train.AdamOptimizer(params["learning_rate"])

  # This implements the "burn-in" for prior parameters (see Appendix D of [2]).
  # For the first prior_burn_in_steps steps they are fixed, and then trained
  # jointly with the other parameters.
  grads_and_vars = optimizer.compute_gradients(loss)
  grads_and_vars_except_prior = [
      x for x in grads_and_vars if x[1] != logit_concentration]

  def train_op_except_prior():
    return optimizer.apply_gradients(
        grads_and_vars_except_prior,
        global_step=global_step)

  def train_op_all():
    return optimizer.apply_gradients(
        grads_and_vars,
        global_step=global_step)

  train_op = tf.cond(
      global_step < params["prior_burn_in_steps"],
      true_fn=train_op_except_prior,
      false_fn=train_op_all)

  # The perplexity is an exponent of the average negative ELBO per word.
  words_per_document = tf.reduce_sum(features, axis=1)
  log_perplexity = -elbo / words_per_document
  tf.summary.scalar("perplexity", tf.exp(tf.reduce_mean(log_perplexity)))
  (log_perplexity_tensor, log_perplexity_update) = tf.metrics.mean(
      log_perplexity)
  perplexity_tensor = tf.exp(log_perplexity_tensor)

  # Obtain the topics summary. Implemented as a py_func for simplicity.
  topics = tf.py_func(
      functools.partial(get_topics_strings, vocabulary=params["vocabulary"]),
      [topics_words, concentration], tf.string, stateful=False)
  tf.summary.text("topics", topics)

  return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      eval_metric_ops={
          "elbo": tf.metrics.mean(elbo),
          "log_likelihood": tf.metrics.mean(log_likelihood),
          "kl": tf.metrics.mean(kl),
          "perplexity": (perplexity_tensor, log_perplexity_update),
          "topics": (topics, tf.no_op()),
      },
  )
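A hedged sketch of plugging this model_fn into a tf.estimator.Estimator; the hyperparameter values, model_dir, and train_input_fn below are illustrative placeholders rather than values from the original script.

params = {
    "num_topics": 50,
    "layer_sizes": [300, 300, 300],
    "activation": "relu",
    "learning_rate": 3e-4,
    "prior_initial_value": 0.7,
    "prior_burn_in_steps": 120000,
    "vocabulary": vocabulary,  # assumed to be loaded elsewhere
}
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    params=params,
    config=tf.estimator.RunConfig(model_dir="/tmp/lda"))
estimator.train(input_fn=train_input_fn, steps=10000)  # train_input_fn: assumed data pipeline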
Example #23
def main(argv):
    del argv  # unused
    if tf.gfile.Exists(FLAGS.model_dir):
        tf.logging.warning("Warning: deleting old log directory at {}".format(
            FLAGS.model_dir))
        tf.gfile.DeleteRecursively(FLAGS.model_dir)
    tf.gfile.MakeDirs(FLAGS.model_dir)
    tf.enable_eager_execution()

    grammar = SmilesGrammar()
    synthetic_data_distribution = ProbabilisticGrammar(
        grammar=grammar,
        latent_size=FLAGS.latent_size,
        num_units=FLAGS.num_units)

    print("Random examples from synthetic data distribution:")
    for _ in range(5):
        productions = synthetic_data_distribution()
        string = grammar.convert_to_string(productions)
        print(string)

    probabilistic_grammar = ProbabilisticGrammar(grammar=grammar,
                                                 latent_size=FLAGS.latent_size,
                                                 num_units=FLAGS.num_units)
    probabilistic_grammar_variational = ProbabilisticGrammarVariational(
        latent_size=FLAGS.latent_size)

    checkpoint = tf.train.Checkpoint(
        synthetic_data_distribution=synthetic_data_distribution,
        probabilistic_grammar=probabilistic_grammar,
        probabilistic_grammar_variational=probabilistic_grammar_variational)
    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
    writer = tf.contrib.summary.create_file_writer(FLAGS.model_dir)
    writer.set_as_default()

    start_time = time.time()
    for step in range(FLAGS.max_steps):
        productions = synthetic_data_distribution()
        with tf.GradientTape() as tape:
            # Sample from amortized variational distribution and record its trace.
            with ed.tape() as variational_tape:
                _ = probabilistic_grammar_variational(productions)

            # Set model trace to take on the data's values and the sample from the
            # variational distribution.
            values = {"latent_code": variational_tape["latent_code_posterior"]}
            values.update({
                "production_" + str(t): production
                for t, production in enumerate(tf.unstack(productions, axis=1))
            })
            with ed.tape() as model_tape:
                with ed.interception(make_value_setter(**values)):
                    _ = probabilistic_grammar()

            # Compute the ELBO given the variational sample, averaged over the batch
            # size and the number of time steps (number of productions). Although the
            # ELBO per data point sums over time steps, we average in order to have a
            # value that remains on the same scale across batches.
            log_likelihood = 0.
            for name, rv in six.iteritems(model_tape):
                if name.startswith("production"):
                    log_likelihood += rv.distribution.log_prob(rv.value)

            kl = tfp.distributions.kl_divergence(
                variational_tape["latent_code_posterior"].distribution,
                model_tape["latent_code"].distribution)

            timesteps = tf.to_float(productions.shape[1])
            elbo = tf.reduce_mean(log_likelihood - kl) / timesteps
            loss = -elbo
            with tf.contrib.summary.record_summaries_every_n_global_steps(500):
                tf.contrib.summary.scalar(
                    "log_likelihood",
                    tf.reduce_mean(log_likelihood) / timesteps)
                tf.contrib.summary.scalar("kl", tf.reduce_mean(kl) / timesteps)
                tf.contrib.summary.scalar("elbo", elbo)

        variables = (probabilistic_grammar.variables +
                     probabilistic_grammar_variational.variables)
        grads = tape.gradient(loss, variables)
        grads_and_vars = zip(grads, variables)
        optimizer.apply_gradients(grads_and_vars, global_step)

        if step % 500 == 0:
            duration = time.time() - start_time
            print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
                step, loss, duration))
            checkpoint.save(file_prefix=FLAGS.model_dir)
Example #24
def model_fn(features, labels, mode, params, config):
    """Builds the model function for use in an Estimator.

  Arguments:
    features: The input features for the Estimator.
    labels: The labels, unused here.
    mode: Signifies whether it is train or test or predict.
    params: Some hyperparameters as a dictionary.
    config: The RunConfig, unused here.

  Returns:
    EstimatorSpec: A tf.estimator.EstimatorSpec instance.
  """
    del labels, config

    # Set up the model's learnable parameters.
    logit_concentration = tf.get_variable(
        "logit_concentration",
        shape=[1, params["num_topics"]],
        initializer=tf.constant_initializer(
            _softplus_inverse(params["prior_initial_value"])))
    concentration = _clip_dirichlet_parameters(
        tf.nn.softplus(logit_concentration))

    num_words = features.shape[1]
    topics_words_logits = tf.get_variable(
        "topics_words_logits",
        shape=[params["num_topics"], num_words],
        initializer=tf.glorot_normal_initializer())
    topics_words = tf.nn.softmax(topics_words_logits, axis=-1)

    # Compute expected log-likelihood. First, sample from the variational
    # distribution; second, compute the log-likelihood given the sample.
    lda_variational = make_lda_variational(params["activation"],
                                           params["num_topics"],
                                           params["layer_sizes"])
    with ed.tape() as variational_tape:
        _ = lda_variational(features)

    with ed.tape() as model_tape:
        with ed.interception(
                make_value_setter(
                    topics=variational_tape["topics_posterior"])):
            posterior_predictive = latent_dirichlet_allocation(
                concentration, topics_words)

    log_likelihood = posterior_predictive.distribution.log_prob(features)
    tf.summary.scalar("log_likelihood", tf.reduce_mean(log_likelihood))

    # Compute the KL-divergence between two Dirichlets analytically.
    # The sampled KL does not work well for "sparse" distributions
    # (see Appendix D of [2]).
    kl = variational_tape["topics_posterior"].distribution.kl_divergence(
        model_tape["topics"].distribution)
    tf.summary.scalar("kl", tf.reduce_mean(kl))

    # Ensure that the KL is non-negative (up to a very small slack).
    # Negative KL can happen due to numerical instability.
    with tf.control_dependencies([tf.assert_greater(kl, -1e-3, message="kl")]):
        kl = tf.identity(kl)

    elbo = log_likelihood - kl
    avg_elbo = tf.reduce_mean(elbo)
    tf.summary.scalar("elbo", avg_elbo)
    loss = -avg_elbo

    # Perform variational inference by minimizing the -ELBO.
    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(params["learning_rate"])

    # This implements the "burn-in" for prior parameters (see Appendix D of [2]).
    # For the first prior_burn_in_steps steps they are fixed, and then trained
    # jointly with the other parameters.
    grads_and_vars = optimizer.compute_gradients(loss)
    grads_and_vars_except_prior = [
        x for x in grads_and_vars if x[1] != logit_concentration
    ]

    def train_op_except_prior():
        return optimizer.apply_gradients(grads_and_vars_except_prior,
                                         global_step=global_step)

    def train_op_all():
        return optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    train_op = tf.cond(global_step < params["prior_burn_in_steps"],
                       true_fn=train_op_except_prior,
                       false_fn=train_op_all)

    # The perplexity is an exponent of the average negative ELBO per word.
    words_per_document = tf.reduce_sum(features, axis=1)
    log_perplexity = -elbo / words_per_document
    tf.summary.scalar("perplexity", tf.exp(tf.reduce_mean(log_perplexity)))
    (log_perplexity_tensor,
     log_perplexity_update) = tf.metrics.mean(log_perplexity)
    perplexity_tensor = tf.exp(log_perplexity_tensor)

    # Obtain the topics summary. Implemented as a py_func for simplicity.
    topics = tf.py_func(functools.partial(get_topics_strings,
                                          vocabulary=params["vocabulary"]),
                        [topics_words, concentration],
                        tf.string,
                        stateful=False)
    tf.summary.text("topics", topics)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metric_ops={
            "elbo": tf.metrics.mean(elbo),
            "log_likelihood": tf.metrics.mean(log_likelihood),
            "kl": tf.metrics.mean(kl),
            "perplexity": (perplexity_tensor, log_perplexity_update),
            "topics": (topics, tf.no_op()),
        },
    )
Example #25
def main(argv):
  del argv  # unused
  FLAGS.layer_sizes = [int(layer_size) for layer_size in FLAGS.layer_sizes]
  if len(FLAGS.layer_sizes) != 3:
    raise NotImplementedError("Specifying fewer or more than 3 layers is not "
                              "currently available.")
  if tf.gfile.Exists(FLAGS.model_dir):
    tf.logging.warning(
        "Warning: deleting old log directory at {}".format(FLAGS.model_dir))
    tf.gfile.DeleteRecursively(FLAGS.model_dir)
  tf.gfile.MakeDirs(FLAGS.model_dir)

  if FLAGS.fake_data:
    bag_of_words = np.random.poisson(1., size=[10, 25])
    words = [str(i) for i in range(25)]
  else:
    bag_of_words, words = load_nips2011_papers(FLAGS.data_dir)

  total_count = np.sum(bag_of_words)
  bag_of_words = tf.to_float(bag_of_words)
  data_size, feature_size = bag_of_words.shape

  # Compute expected log-likelihood. First, sample from the variational
  # distribution; second, compute the log-likelihood given the sample.
  qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational(
      data_size,
      feature_size,
      FLAGS.layer_sizes)

  with ed.tape() as model_tape:
    with ed.interception(make_value_setter(w2=qw2, w1=qw1, w0=qw0,
                                           z2=qz2, z1=qz1, z0=qz0)):
      posterior_predictive = deep_exponential_family(data_size,
                                                     feature_size,
                                                     FLAGS.layer_sizes,
                                                     FLAGS.shape)

  log_likelihood = posterior_predictive.distribution.log_prob(bag_of_words)
  log_likelihood = tf.reduce_sum(log_likelihood)
  tf.summary.scalar("log_likelihood", log_likelihood)

  # Compute analytic KL-divergence between variational and prior distributions.
  kl = 0.
  for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2),
                                  ("w0", qw0), ("w1", qw1), ("w2", qw2)]:
    kl += tf.reduce_sum(variational_rv.distribution.kl_divergence(
        model_tape[rv_name].distribution))

  tf.summary.scalar("kl", kl)

  elbo = log_likelihood - kl
  tf.summary.scalar("elbo", elbo)
  optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
  train_op = optimizer.minimize(-elbo)

  sess = tf.Session()
  summary = tf.summary.merge_all()
  summary_writer = tf.summary.FileWriter(FLAGS.model_dir, sess.graph)
  start_time = time.time()

  sess.run(tf.global_variables_initializer())
  for step in range(FLAGS.max_steps):
    start_time = time.time()
    _, elbo_value = sess.run([train_op, elbo])
    if step % 500 == 0:
      duration = time.time() - start_time
      print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
          step, elbo_value, duration))
      summary_str = sess.run(summary)
      summary_writer.add_summary(summary_str, step)
      summary_writer.flush()

      # Compute perplexity of the full data set. The model's negative
      # log-likelihood of data is upper bounded by the variational objective.
      negative_log_likelihood = -elbo_value
      perplexity = np.exp(negative_log_likelihood / total_count)
      print("Negative log-likelihood <= {:0.3f}".format(
          negative_log_likelihood))
      print("Perplexity <= {:0.3f}".format(perplexity))

      # Print top 10 words for first 10 topics.
      qw0_values = sess.run(qw0)
      for k in range(min(10, FLAGS.layer_sizes[-1])):
        top_words_idx = qw0_values[k, :].argsort()[-10:][::-1]
        top_words = " ".join([words[i] for i in top_words_idx])
        print("Topic {}: {}".format(k, top_words))
Example #26
def run_vip_hmc_continuous(model_config,
                           num_samples=2000,
                           burnin=1000,
                           use_iaf_posterior=False,
                           num_leapfrog_steps=4,
                           num_adaptation_steps=500,
                           num_optimization_steps=2000,
                           num_mc_samples=32,
                           tau=1.,
                           do_sample=True,
                           description='',
                           experiments_dir=''):

  tf.reset_default_graph()

  if use_iaf_posterior:
    # IAF posterior doesn't give us stddevs for step sizes for HMC (we could
    # extract them by sampling but I haven't implemented that), and we mostly
    # care about it for ELBOs anyway.
    do_sample = False

  init_val_loc = tf.placeholder('float', shape=())
  init_val_scale = tf.placeholder('float', shape=())

  (learnable_parameters,
   learnable_parametrisation, _) = ed_transforms.make_learnable_parametrisation(
       init_val_loc=init_val_loc, init_val_scale=init_val_scale, tau=tau)

  def model_vip(*params):
    with ed.interception(learnable_parametrisation):
      return model_config.model(*params)

  log_joint_vip = ed.make_log_joint_fn(model_vip)

  with ed.tape() as model_tape:
    _ = model_vip(*model_config.model_args)

  param_shapes = collections.OrderedDict()
  target_vip_kwargs = {}
  for param in model_tape.keys():
    if param not in model_config.observed_data.keys():
      param_shapes[param] = model_tape[param].shape
    else:
      target_vip_kwargs[param] = model_config.observed_data[param]

  def target_vip(*param_args):
    i = 0
    for param in model_tape.keys():
      if param not in model_config.observed_data.keys():
        target_vip_kwargs[param] = param_args[i]
        i = i + 1
    return log_joint_vip(*model_config.model_args, **target_vip_kwargs)

  full_kwargs = collections.OrderedDict(model_config.observed_data.items())
  full_kwargs['parameterisation'] = collections.OrderedDict()
  for k in learnable_parameters.keys():
    full_kwargs['parameterisation'][k] = learnable_parameters[k]

  if use_iaf_posterior:
    elbo = util.get_iaf_elbo(
        target_vip,
        num_mc_samples=num_mc_samples,
        param_shapes=param_shapes)
    variational_parameters = {}
  else:
    elbo, variational_parameters = util.get_mean_field_elbo(
        model_vip,
        target_vip,
        num_mc_samples=num_mc_samples,
        model_args=model_config.model_args,
        vi_kwargs=full_kwargs)
    vip_step_size_approx = util.get_approximate_step_size(
        variational_parameters, num_leapfrog_steps)

  ##############################################################################

  best_elbo = None
  model_dir = os.path.join(experiments_dir,
                           str(description + '_' + model_config.model.__name__))

  if not tf.gfile.Exists(model_dir):
    tf.gfile.MakeDirs(model_dir)

  saver = tf.train.Saver()
  dir_save = os.path.join(model_dir, 'saved_params_{}'.format(gen_id()))

  if not tf.gfile.Exists(dir_save):
    tf.gfile.MakeDirs(dir_save)

  best_lr = None
  best_init_loc = None
  best_init_scale = None

  learning_rate_ph = tf.placeholder(shape=[], dtype=tf.float32)
  learning_rate = tf.Variable(learning_rate_ph, trainable=False)
  optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
  train = optimizer.minimize(-elbo)
  init = tf.global_variables_initializer()

  learning_rates = [0.003, 0.01, 0.01, 0.1, 0.003, 0.01]
  if use_iaf_posterior:
    learning_rates = [3e-5, 1e-4, 3e-4, 1e-4]

  start_time = time.time()
  for learning_rate_val in learning_rates:
    for init_loc in [0.]:  #, 10., -10.]:
      for init_scale in [init_loc]:

        timeline = []

        with tf.Session() as sess:

          init.run(feed_dict={init_val_loc: init_loc,
                              init_val_scale: init_scale,
                              learning_rate_ph: learning_rate_val})

          this_timeline = []
          for i in range(num_optimization_steps):
            _, e = sess.run([train, elbo])

            if np.isnan(e):
              util.print('got NaN in ELBO optimization, stopping...')
              break

            this_timeline.append(e)

          this_elbo = np.mean(this_timeline[-100:])
          info_str = ('finished cVIP optimization with elbo {} vs '
                      'best ELBO {}'.format(this_elbo, best_elbo))
          util.print(info_str)
          if best_elbo is None or best_elbo < this_elbo:
            best_elbo = this_elbo
            timeline = this_timeline

            vals = sess.run(list(learnable_parameters.values()))
            learned_reparam = collections.OrderedDict(
                zip(learnable_parameters.keys(), vals))
            vals = sess.run(list(variational_parameters.values()))
            learned_variational_params = collections.OrderedDict(
                zip(variational_parameters.keys(), vals))

            util.print('learned params {}'.format(learned_reparam))
            util.print('learned variational params {}'.format(
                learned_variational_params))

            _ = saver.save(sess, dir_save)
            best_lr = learning_rate
            best_init_loc = init_loc
            best_init_scale = init_scale

  vi_time = time.time() - start_time

  util.print('BEST: LR={}, init={}, {}'.format(best_lr, best_init_loc,
                                               best_init_scale))
  util.print('ELBO: {}'.format(best_elbo))

  to_centered = model_config.make_to_centered(**learned_reparam)

  results = collections.OrderedDict()
  results['elbo'] = best_elbo

  with tf.Session() as sess:

    saver.restore(sess, dir_save)
    results['vp'] = learned_variational_params

    if do_sample:

      vip_step_size_init = sess.run(vip_step_size_approx)

      vip_step_size = [tf.get_variable(
          name='step_size_vip'+str(i),
          initializer=np.array(vip_step_size_init[i], dtype=np.float32),
          use_resource=True,  # For TFE compatibility.
          trainable=False) for i in range(len(vip_step_size_init))]

      kernel_vip = mcmc.HamiltonianMonteCarlo(
          target_log_prob_fn=target_vip,
          step_size=vip_step_size,
          num_leapfrog_steps=num_leapfrog_steps,
          step_size_update_fn=mcmc.make_simple_step_size_update_policy(
              num_adaptation_steps=num_adaptation_steps, target_rate=0.85))

      states, kernel_results_vip = mcmc.sample_chain(
          num_results=num_samples,
          num_burnin_steps=burnin,
          current_state=[
              tf.zeros(param_shapes[param]) for param in param_shapes.keys()
          ],
          kernel=kernel_vip,
          num_steps_between_results=1)

      states_vip = transform_mcmc_states(states, to_centered)

      init_again = tf.global_variables_initializer()
      init_again.run(feed_dict={
          init_val_loc: best_init_loc, init_val_scale: best_init_scale,
          learning_rate_ph: 1.0})  # learning rate doesn't matter for HMC.

      ess_vip = tfp.mcmc.effective_sample_size(states_vip)

      start_time = time.time()
      samples, is_accepted, ess, ss_vip, log_accept_ratio = sess.run(
          (states_vip, kernel_results_vip.is_accepted, ess_vip,
           kernel_results_vip.extra.step_size_assign,
           kernel_results_vip.log_accept_ratio))

      sampling_time = time.time() - start_time

      results['samples'] = collections.OrderedDict()
      results['is_accepted'] = is_accepted
      results['acceptance_rate'] = np.sum(is_accepted) * 100. / float(
          num_samples)
      results['ess'] = ess
      results['sampling_time'] = sampling_time
      results['log_accept_ratio'] = log_accept_ratio
      results['step_size'] = [s[0] for s in ss_vip]

      i = 0
      for param in param_shapes.keys():
        results['samples'][param] = samples[i]
        i = i + 1

    # end if

    results['parameterisation'] = collections.OrderedDict()

    i = 0
    for param in param_shapes.keys():
      name_a = param[:-5] + 'a'
      name_b = param[:-5] + 'b'
      try:
        results['parameterisation'][name_a] = learned_reparam[name_a]
        results['parameterisation'][name_b] = learned_reparam[name_b]
      except KeyError:
        continue
      i = i + 1

    results['elbo_timeline'] = timeline
    results['vi_time'] = vi_time

    results['init_pos'] = best_init_loc

    return results
Example #27
def run_interleaved_hmc(model_config,
                        num_samples=2000, step_size_cp=0.1, step_size_ncp=0.1,
                        burnin=1000, num_leapfrog_steps=4):
  """Given a (centred) model, this function transforms it to a fully
  non-centred one, and uses both models to run interleaved HMC.
  """

  tf.reset_default_graph()

  log_joint_centered = ed.make_log_joint_fn(model_config.model)

  with ed.tape() as model_tape_cp:
    _ = model_config.model(*model_config.model_args)

  param_shapes = collections.OrderedDict()
  target_cp_kwargs = {}
  for param in model_tape_cp.keys():
    if param not in model_config.observed_data.keys():
      param_shapes[param] = model_tape_cp[param].shape
    else:
      target_cp_kwargs[param] = model_config.observed_data[param]

  def target_cp(*param_args):
    i = 0
    for param in model_tape_cp.keys():
      if param not in model_config.observed_data.keys():
        target_cp_kwargs[param] = param_args[i]
        i = i + 1

    return log_joint_centered(*model_config.model_args, **target_cp_kwargs)

  def model_noncentered(*params):
    with ed.interception(ed_transforms.ncp):
      return model_config.model(*params)

  log_joint_noncentered = ed.make_log_joint_fn(model_noncentered)

  with ed.tape() as model_tape_ncp:
    _ = model_noncentered(*model_config.model_args)

  param_shapes = collections.OrderedDict()
  target_ncp_kwargs = {}
  for param in model_tape_ncp.keys():
    if param not in model_config.observed_data.keys():
      param_shapes[param] = model_tape_ncp[param].shape
    else:
      target_ncp_kwargs[param] = model_config.observed_data[param]

  def target_ncp(*param_args):
    i = 0
    for param in model_tape_ncp.keys():
      if param not in model_config.observed_data.keys():
        target_ncp_kwargs[param] = param_args[i]
        i = i + 1

    return log_joint_noncentered(*model_config.model_args, **target_ncp_kwargs)

  return _run_hmc_interleaved(target_cp, target_ncp, param_shapes,
                              to_centered=model_config.to_centered,
                              to_noncentered=model_config.to_noncentered,
                              num_samples=num_samples,
                              step_size_cp=step_size_cp,
                              step_size_ncp=step_size_ncp,
                              burnin=burnin,
                              num_leapfrog_steps=num_leapfrog_steps)
Example #28
def run_interleaved_hmc(model_config, results_dir, file_path):
    filename_cp = 'CP.json'
    filename_ncp = 'NCP.json'

    file_path_cp = os.path.join(results_dir, filename_cp)
    file_path_ncp = os.path.join(results_dir, filename_ncp)

    with ed.tape() as model_tape:
        model_config.model(*model_config.model_args)
    param_names = [
        k for k in list(model_tape.keys())
        if k not in model_config.observed_data
    ]

    if tf.io.gfile.exists(file_path_cp) and tf.io.gfile.exists(file_path_ncp):
        with tf.io.gfile.GFile(file_path_cp, 'r') as f:
            prev_results = json.load(f)
            initial_step_size_cp = prev_results['initial_step_size']
            num_leapfrog_steps_cp = get_best_num_leapfrog_steps_from_tuning_runs(
                prev_results['tuning_runs'])
            learned_variational_params_cp = prev_results[
                'learned_variational_params']

        with tf.io.gfile.GFile(file_path_ncp, 'r') as f:
            prev_results = json.load(f)
            initial_step_size_ncp = prev_results['initial_step_size']
            num_leapfrog_steps_ncp = get_best_num_leapfrog_steps_from_tuning_runs(
                prev_results['tuning_runs'])
    else:
        raise Exception('Run VI first to find initial step sizes, and HMC '
                        'first to find num_leapfrog_steps.')

    initial_states_cp = util.variational_inits_from_params(
        learned_variational_params_cp,
        param_names=param_names,
        num_inits=FLAGS.num_chains).values()

    best_ess_min = 0
    best_num_ls = None
    results = ()
    for num_ls in set([num_leapfrog_steps_ncp, num_leapfrog_steps_cp]):
        FLAGS.num_leapfrog_steps = num_ls + num_ls
        util.print('\nNumber of leapfrog steps is set to {}.\n'.format(
            FLAGS.num_leapfrog_steps))
        (ess_min, sem_min, acceptance_rate_cp, acceptance_rate_ncp, mcmc_time,
         samples,
         normalized_ess_final) = run_interleaved_hmc_with_leapfrog_steps(
             model_config=model_config,
             results_dir=results_dir,
             num_leapfrog_steps_cp=num_ls,
             num_leapfrog_steps_ncp=num_ls,
             initial_step_size_cp=initial_step_size_cp,
             initial_step_size_ncp=initial_step_size_ncp,
             initial_states_cp=initial_states_cp)
        if ess_min.item() > best_ess_min:
            best_ess_min = ess_min.item()
            best_num_ls = num_ls
            results = (ess_min, sem_min, acceptance_rate_cp,
                       acceptance_rate_ncp, mcmc_time, samples,
                       normalized_ess_final)
    (ess_min, sem_min, acceptance_rate_cp, acceptance_rate_ncp, mcmc_time,
     samples, normalized_ess_final) = results
    FLAGS.num_leapfrog_steps = best_num_ls + best_num_ls

    save_hmc_results(file_path=file_path,
                     initial_step_size_ncp=initial_step_size_ncp,
                     initial_step_size_cp=initial_step_size_cp,
                     num_leapfrog_steps=best_num_ls,
                     ess_min=ess_min.item(),
                     sem_min=sem_min.item(),
                     acceptance_rate_cp=acceptance_rate_cp.item(),
                     acceptance_rate_ncp=acceptance_rate_ncp.item(),
                     mcmc_time_sec=mcmc_time)

    save_ess(file_path_base=file_path[:-5],
             samples=samples,
             param_names=param_names,
             normalized_ess_final=normalized_ess_final,
             num_chains_to_save=FLAGS.num_chains_to_save)
Example #29
def run_hmc(model_config, results_dir, file_path, tuning=False):
    if tf.io.gfile.exists(file_path):
        with tf.io.gfile.GFile(file_path, 'r') as f:
            prev_results = json.load(f)
    else:
        raise Exception('Run VI first to find initial step sizes')

    with ed.tape() as model_tape:
        model_config.model(*model_config.model_args)
    param_names = [
        k for k in list(model_tape.keys())
        if k not in model_config.observed_data
    ]

    initial_step_size = prev_results['initial_step_size']
    initial_states = util.variational_inits_from_params(
        prev_results['learned_variational_params'],
        param_names=param_names,
        num_inits=FLAGS.num_chains).values()

    if tuning:
        if not FLAGS.num_leapfrog_steps:
            raise ValueError(
                'You must specify the number of leapfrog steps for a '
                'tuning run.')
        for existing_run in prev_results.get('tuning_runs', []):
            if existing_run['num_leapfrog_steps'] == FLAGS.num_leapfrog_steps:
                print(
                    'A tuning run already exists for HMC with {} leapfrog steps, '
                    'skipping. ({})'.format(FLAGS.num_leapfrog_steps,
                                            existing_run))
                return

    if not FLAGS.num_leapfrog_steps:
        FLAGS.num_leapfrog_steps = get_best_num_leapfrog_steps_from_tuning_runs(
            prev_results['tuning_runs'])
    util.print('\nNumber of leapfrog steps is set to {}.\n'.format(
        FLAGS.num_leapfrog_steps))

    if FLAGS.count_in_leapfrog_steps:
        FLAGS.num_samples = int(FLAGS.num_samples /
                                float(FLAGS.num_leapfrog_steps))
        FLAGS.num_burnin_steps = int(FLAGS.num_burnin_steps /
                                     float(FLAGS.num_leapfrog_steps))
        FLAGS.num_adaptation_steps = int(FLAGS.num_adaptation_steps /
                                         float(FLAGS.num_leapfrog_steps))

    (target, _, elbo, variational_parameters, learnable_parameters,
     actual_reparam) = create_target_graph(model_config, results_dir)

    (states_orig, kernel_results, states,
     ess) = inference.hmc(target,
                          model_config,
                          initial_step_size,
                          initial_states=initial_states,
                          reparam=(actual_reparam if actual_reparam is not None
                                   else learned_reparam))

    init = tf.compat.v1.global_variables_initializer()

    with tf.compat.v1.Session() as sess:
        #sess = tf_debug.LocalCLIDebugWrapperSession(
        #    sess, dump_root="/usr/local/google/tmp/tfdbg")

        init.run()
        start_time = time.time()
        samples, is_accepted, ess_final, samples_orig = sess.run(
            (states, kernel_results.inner_results.is_accepted, ess,
             states_orig))

        mcmc_time = time.time() - start_time

    normalized_ess_final = []
    for ess_ in ess_final:
        # report effective samples per 1000 gradient evals
        normalized_ess_final.append(
            1000 * ess_ / (FLAGS.num_samples * FLAGS.num_leapfrog_steps))
    del ess_final

    ess_min, sem_min = util.get_min_ess(normalized_ess_final)
    util.print('ESS per 1000 gradients: {} +/- {}'.format(ess_min, sem_min))

    acceptance_rate = (np.sum(is_accepted) * 100. /
                       float(FLAGS.num_samples * FLAGS.num_chains))

    if tuning:
        save_hmc_results(file_path=file_path,
                         tuning_runs={
                             'num_leapfrog_steps': FLAGS.num_leapfrog_steps,
                             'ess_min': ess_min.item(),
                             'sem_min': sem_min.item(),
                             'acceptance_rate': acceptance_rate.item(),
                             'mcmc_time': mcmc_time,
                             'num_samples': FLAGS.num_samples,
                             'num_burnin_steps': FLAGS.num_burnin_steps
                         })
    else:
        save_hmc_results(file_path=file_path,
                         ess_min=ess_min.item(),
                         sem_min=sem_min.item(),
                         acceptance_rate=acceptance_rate.item(),
                         mcmc_time_sec=mcmc_time)

        save_ess(file_path_base=file_path[:-5],
                 samples=samples,
                 param_names=param_names,
                 normalized_ess_final=normalized_ess_final,
                 num_chains_to_save=FLAGS.num_chains_to_save)
Example #30
def main(argv):
    del argv  # unused
    FLAGS.layer_sizes = [int(layer_size) for layer_size in FLAGS.layer_sizes]
    if len(FLAGS.layer_sizes) != 3:
        raise NotImplementedError(
            "Specifying fewer or more than 3 layers is not "
            "currently available.")
    if tf.io.gfile.exists(FLAGS.model_dir):
        tf.compat.v1.logging.warning(
            "Warning: deleting old log directory at {}".format(
                FLAGS.model_dir))
        tf.io.gfile.rmtree(FLAGS.model_dir)
    tf.io.gfile.makedirs(FLAGS.model_dir)

    if FLAGS.fake_data:
        bag_of_words = np.random.poisson(1., size=[10, 25])
        words = [str(i) for i in range(25)]
    else:
        bag_of_words, words = load_nips2011_papers(FLAGS.data_dir)

    total_count = np.sum(bag_of_words)
    bag_of_words = tf.cast(bag_of_words, dtype=tf.float32)
    data_size, feature_size = bag_of_words.shape

    # Compute expected log-likelihood. First, sample from the variational
    # distribution; second, compute the log-likelihood given the sample.
    qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational(
        data_size, feature_size, FLAGS.layer_sizes)

    with ed.tape() as model_tape:
        with ed.interception(
                make_value_setter(w2=qw2,
                                  w1=qw1,
                                  w0=qw0,
                                  z2=qz2,
                                  z1=qz1,
                                  z0=qz0)):
            posterior_predictive = deep_exponential_family(
                data_size, feature_size, FLAGS.layer_sizes, FLAGS.shape)

    log_likelihood = posterior_predictive.distribution.log_prob(bag_of_words)
    log_likelihood = tf.reduce_sum(input_tensor=log_likelihood)
    tf.compat.v1.summary.scalar("log_likelihood", log_likelihood)

    # Compute analytic KL-divergence between variational and prior distributions.
    kl = 0.
    for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2),
                                    ("w0", qw0), ("w1", qw1), ("w2", qw2)]:
        kl += tf.reduce_sum(input_tensor=variational_rv.distribution.
                            kl_divergence(model_tape[rv_name].distribution))

    tf.compat.v1.summary.scalar("kl", kl)

    elbo = log_likelihood - kl
    tf.compat.v1.summary.scalar("elbo", elbo)
    optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
    train_op = optimizer.minimize(-elbo)

    sess = tf.compat.v1.Session()
    summary = tf.compat.v1.summary.merge_all()
    summary_writer = tf.compat.v1.summary.FileWriter(FLAGS.model_dir,
                                                     sess.graph)
    start_time = time.time()

    sess.run(tf.compat.v1.global_variables_initializer())
    for step in range(FLAGS.max_steps):
        start_time = time.time()
        _, elbo_value = sess.run([train_op, elbo])
        if step % 500 == 0:
            duration = time.time() - start_time
            print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
                step, elbo_value, duration))
            summary_str = sess.run(summary)
            summary_writer.add_summary(summary_str, step)
            summary_writer.flush()

            # Compute perplexity of the full data set. The model's negative
            # log-likelihood of data is upper bounded by the variational objective.
            negative_log_likelihood = -elbo_value
            perplexity = np.exp(negative_log_likelihood / total_count)
            print("Negative log-likelihood <= {:0.3f}".format(
                negative_log_likelihood))
            print("Perplexity <= {:0.3f}".format(perplexity))

            # Print top 10 words for first 10 topics.
            qw0_values = sess.run(qw0)
            for k in range(min(10, FLAGS.layer_sizes[-1])):
                top_words_idx = qw0_values[k, :].argsort()[-10:][::-1]
                top_words = " ".join([words[i] for i in top_words_idx])
                print("Topic {}: {}".format(k, top_words))