    def testMakeValueSetterWorksWithPartialAssignment(self):
        def normal_with_unknown_mean():
            loc = ed.Normal(loc=0., scale=1., name="loc")
            x = ed.Normal(loc=loc, scale=0.5, name="x")
            return x

        # Setting only the latents produces the posterior predictive distribution.
        loc_value = 3.
        with ed.interception(ed.make_value_setter(loc=loc_value)):
            x_predictive = normal_with_unknown_mean()
        self.assertAllEqual(self.evaluate(x_predictive.distribution.mean()),
                            loc_value)

        # Setting observed values allows calling the log joint as a fn of latents.
        x_value = 4.

        def model_with_observed_x():
            with ed.interception(ed.make_value_setter(x=x_value)):
                normal_with_unknown_mean()

        observed_log_joint_fn = ed.make_log_joint_fn(model_with_observed_x)

        expected_joint_log_prob = (
            tfd.Normal(0., 1.).log_prob(loc_value) +
            tfd.Normal(loc_value, 0.5).log_prob(x_value))
        self.assertEqual(self.evaluate(expected_joint_log_prob),
                         self.evaluate(observed_log_joint_fn(loc=loc_value)))

    def testMakeValueSetterSetsValues(self):
        def normal_with_unknown_mean():
            loc = ed.Normal(loc=0., scale=1., name="loc")
            x = ed.Normal(loc=loc, scale=0.5, name="x")
            return loc, x

        loc_value, x_value = 3., 4.
        with ed.interception(ed.make_value_setter(loc=loc_value, x=x_value)):
            loc_rv, x_rv = normal_with_unknown_mean()
        self.assertAllEqual(self.evaluate((loc_rv, x_rv)),
                            (loc_value, x_value))
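

# A small standalone sketch of the pattern exercised by the tests above,
# assuming `from tensorflow_probability import edward2 as ed`; the helper name
# `demo_posterior_predictive` is illustrative only. Intercepting the model with
# a value setter for the latent `loc` turns the returned `x` into the
# posterior-predictive variable, here a Normal(3., 0.5).
def demo_posterior_predictive():
    def normal_with_unknown_mean():
        loc = ed.Normal(loc=0., scale=1., name="loc")
        x = ed.Normal(loc=loc, scale=0.5, name="x")
        return x

    with ed.interception(ed.make_value_setter(loc=3.)):
        x_predictive = normal_with_unknown_mean()
    return x_predictive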
def PAC2VI(dataSource=tf.keras.datasets.fashion_mnist,
           NPixels=14,
           algorithm=0,
           PARTICLES=20,
           batch_size=100,
           num_epochs=50,
           num_hidden_units=20):
    """ Run experiments for MAP, Variational, PAC^2-Variational and PAC^2_T-Variational algorithms for the self-supervised classification task with a Categorical data model.
        Args:
            dataSource: The data set used in the evaluation.
            NLabels: The number of labels to predict.
            NPixels: The size of the images: NPixels\times NPixels.
            algorithm: Integer indicating the algorithm to be run.
                0- MAP Learning
                1- Variational Learning
                2- PAC^2-Variational Learning
                3- PAC^2_T-Variational Learning
            PARTICLES: Number of Monte-Carlo samples used to compute the posterior prediction distribution.
            batch_size: Size of the batch.
            num_epochs: Number of epochs.
            num_hidden_units: Number of hidden units in the MLP.
        Returns:
            NLL: The negative log-likelihood over the test data set.
            :param algorithm:
    """

    np.random.seed(1)
    tf.set_random_seed(1)

    sess = tf.Session()

    (x_train, y_train), (x_test, y_test) = dataSource.load_data()

    if 'cifar' in dataSource.__name__:
        x_train = sess.run(
            tf.cast(tf.squeeze(tf.image.rgb_to_grayscale(x_train)),
                    dtype=tf.float32))
        x_test = sess.run(
            tf.cast(tf.squeeze(tf.image.rgb_to_grayscale(x_test)),
                    dtype=tf.float32))

    x_train = (x_train < 128).astype(np.int32)
    x_test = (x_test < 128).astype(np.int32)

    NPixels = NPixels // 2

    y_train = x_train[:, NPixels:]
    x_train = x_train[:, 0:NPixels]

    y_test = x_test[:, NPixels:]
    x_test = x_test[:, 0:NPixels]

    NPixels = NPixels * NPixels * 2

    N = x_train.shape[0]
    M = batch_size

    x_batch = tf.placeholder(dtype=tf.float32,
                             name="x_batch",
                             shape=[None, NPixels])
    y_batch = tf.placeholder(dtype=tf.int32,
                             name="y_batch",
                             shape=[None, NPixels])

    def model(NHIDDEN, x):
        W = ed.Normal(loc=tf.zeros([NPixels, NHIDDEN]), scale=1., name="W")
        b = ed.Normal(loc=tf.zeros([1, NHIDDEN]), scale=1., name="b")

        W_out = ed.Normal(loc=tf.zeros([NHIDDEN, 2 * NPixels]),
                          scale=1.,
                          name="W_out")
        b_out = ed.Normal(loc=tf.zeros([1, 2 * NPixels]),
                          scale=1.,
                          name="b_out")

        hidden_layer = tf.nn.relu(tf.matmul(x, W) + b)
        out = tf.matmul(hidden_layer, W_out) + b_out
        y = ed.Categorical(logits=tf.reshape(
            out, [tf.shape(x)[0], NPixels, 2]),
                           name="y")

        return W, b, W_out, b_out, x, y

    def qmodel(NHIDDEN):
        W_loc = tf.Variable(
            tf.random_normal([NPixels, NHIDDEN], 0.0, 0.1, dtype=tf.float32))
        b_loc = tf.Variable(
            tf.random_normal([1, NHIDDEN], 0.0, 0.1, dtype=tf.float32))

        if algorithm == 0:
            W_scale = 0.000001
            b_scale = 0.000001
        else:
            W_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([NPixels, NHIDDEN],
                                     -3.,
                                     stddev=0.1,
                                     dtype=tf.float32)))
            b_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([1, NHIDDEN],
                                     -3.,
                                     stddev=0.1,
                                     dtype=tf.float32)))

        # Two random variables built from the same loc/scale give two
        # independent samples from the same variational posterior q; the "_"
        # copies act as the second particle in the PAC^2 variance term below.
        qW = ed.Normal(W_loc, scale=W_scale, name="W")
        qW_ = ed.Normal(W_loc, scale=W_scale, name="W")

        qb = ed.Normal(b_loc, scale=b_scale, name="b")
        qb_ = ed.Normal(b_loc, scale=b_scale, name="b")

        W_out_loc = tf.Variable(
            tf.random_normal([NHIDDEN, 2 * NPixels],
                             0.0,
                             0.1,
                             dtype=tf.float32))
        b_out_loc = tf.Variable(
            tf.random_normal([1, 2 * NPixels], 0.0, 0.1, dtype=tf.float32))
        if algorithm == 0:
            W_out_scale = 0.000001
            b_out_scale = 0.000001
        else:
            W_out_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([NHIDDEN, 2 * NPixels],
                                     -3.,
                                     stddev=0.1,
                                     dtype=tf.float32)))
            b_out_scale = tf.nn.softplus(
                tf.Variable(
                    tf.random_normal([1, 2 * NPixels],
                                     -3.,
                                     stddev=0.1,
                                     dtype=tf.float32)))

        qW_out = ed.Normal(W_out_loc, scale=W_out_scale, name="W_out")
        qb_out = ed.Normal(b_out_loc, scale=b_out_scale, name="b_out")

        qW_out_ = ed.Normal(W_out_loc, scale=W_out_scale, name="W_out")
        qb_out_ = ed.Normal(b_out_loc, scale=b_out_scale, name="b_out")

        return qW, qW_, qb, qb_, qW_out, qW_out_, qb_out, qb_out_

    W, b, W_out, b_out, x, y = model(num_hidden_units, x_batch)

    qW, qW_, qb, qb_, qW_out, qW_out_, qb_out, qb_out_ = qmodel(
        num_hidden_units)

    # Re-running the model under interception makes the prior weights take the
    # values sampled from the variational posterior; the second intercepted
    # copy below provides the second, independent particle.
    with ed.interception(
            ed.make_value_setter(W=qW, b=qb, W_out=qW_out, b_out=qb_out)):
        pW, pb, pW_out, pb_out, px, py = model(num_hidden_units, x)

    with ed.interception(
            ed.make_value_setter(W=qW_, b=qb_, W_out=qW_out_, b_out=qb_out_)):
        pW_, pb_, pW_out_, pb_out_, px_, py_ = model(num_hidden_units, x)

    # Per-example log-likelihood of the target half-images under the two
    # posterior particles (summed over pixels), shape [batch_size, 1].
    pylogprob = tf.expand_dims(
        tf.reduce_sum(py.distribution.log_prob(y_batch), axis=1), 1)
    py_logprob = tf.expand_dims(
        tf.reduce_sum(py_.distribution.log_prob(y_batch), axis=1), 1)

    # `logmax` is a per-example shift (treated as a constant for the gradient)
    # used for numerical stability; `alpha` is the log of the average of the
    # two particle likelihoods, shifted by `logmax`.
    logmax = tf.stop_gradient(tf.math.maximum(pylogprob, py_logprob) + 0.1)
    logmean_logmax = tf.math.reduce_logsumexp(tf.concat(
        [pylogprob - logmax, py_logprob - logmax], 1),
                                              axis=1) - tf.log(2.)
    alpha = tf.expand_dims(logmean_logmax, 1)

    # Weighting of the variance term: an alpha-dependent weight for
    # PAC^2_T-Variational learning, and the constant 1 otherwise.
    if algorithm == 3:
        hmax = 2 * tf.stop_gradient(
            alpha / tf.math.pow(1 - tf.math.exp(alpha), 2) +
            tf.math.pow(tf.math.exp(alpha) * (1 - tf.math.exp(alpha)), -1))
    else:
        hmax = 1.

    # Two-particle estimate of the variance correction added by the PAC^2
    # objectives.
    var = 0.5 * (
        tf.reduce_mean(tf.exp(2 * pylogprob - 2 * logmax) * hmax) -
        tf.reduce_mean(tf.exp(pylogprob + py_logprob - 2 * logmax) * hmax))

    # Monte Carlo estimate of the expected data log-likelihood (one particle).
    datalikelihood = tf.reduce_mean(pylogprob)

    # Log-prior of the sampled weights under the model.
    logprior = tf.reduce_sum(pW.distribution.log_prob(pW.value)) + \
        tf.reduce_sum(pb.distribution.log_prob(pb.value)) + \
        tf.reduce_sum(pW_out.distribution.log_prob(pW_out.value)) + \
        tf.reduce_sum(pb_out.distribution.log_prob(pb_out.value))

    # Single-sample estimate of the entropy of the variational posterior
    # (negative log-density of the samples under q).
    entropy = tf.reduce_sum(qW.distribution.log_prob(qW.value)) + \
        tf.reduce_sum(qb.distribution.log_prob(qb.value)) + \
        tf.reduce_sum(qW_out.distribution.log_prob(qW_out.value)) + \
        tf.reduce_sum(qb_out.distribution.log_prob(qb_out.value))
    entropy = -entropy

    # Single-sample KL(q || p) estimate, divided by the number of training
    # examples so it is on the same per-example scale as `datalikelihood`.
    KL = (-entropy - logprior) / N

    # Objective being maximized:
    #   algorithm 0 (MAP): datalikelihood + logprior / N, with q collapsed to
    #       a near point mass (scale ~ 1e-6).
    #   algorithm 1 (VI): the standard per-example ELBO.
    #   algorithms 2 and 3: the ELBO plus the two-particle variance correction
    #       `var` (with the alpha-dependent weighting when algorithm == 3).
    if algorithm in (2, 3):
        elbo = datalikelihood + var - KL
    elif algorithm == 1:
        elbo = datalikelihood - KL
    elif algorithm == 0:
        elbo = datalikelihood + logprior / N

    verbose = True
    optimizer = tf.train.AdamOptimizer(0.001)
    t = []
    train = optimizer.minimize(-elbo)

    # Scalar diagnostics are built once here so that no new ops are added to
    # the graph inside the training loop.
    mean_hmax = tf.reduce_mean(hmax)
    mean_alpha = tf.reduce_mean(alpha)
    mean_logmax = tf.reduce_mean(logmax)

    init = tf.global_variables_initializer()
    sess.run(init)

    for i in range(num_epochs + 1):
        perm = np.random.permutation(N)
        x_train = np.take(x_train, perm, axis=0)
        y_train = np.take(y_train, perm, axis=0)

        x_batches = np.array_split(x_train, N // M)
        y_batches = np.array_split(y_train, N // M)

        for j in range(N // M):
            batch_x = np.reshape(
                x_batches[j], [x_batches[j].shape[0], -1]).astype(np.float32)
            batch_y = np.reshape(
                y_batches[j], [y_batches[j].shape[0], -1]).astype(np.float32)

            value, _ = sess.run([elbo, train],
                                feed_dict={
                                    x_batch: batch_x,
                                    y_batch: batch_y
                                })
            t.append(-value)
            if verbose and i % 50 == 0 and j % 1000 == 0:
                # Evaluate all diagnostics in a single run so they correspond
                # to the same batch and the same posterior sample.
                diags = sess.run(
                    [datalikelihood, var, KL, logprior, entropy,
                     mean_hmax, mean_alpha, mean_logmax],
                    feed_dict={x_batch: batch_x, y_batch: batch_y})
                print("\nEpoch: " + str(i))
                print(str(j) + " batch\telbo\t" + str(t[-1]), flush=True)
                labels = ["data", "var", "KL", "energy", "entropy",
                          "hmax", "alpha", "logmax"]
                for label, val in zip(labels, diags):
                    print(str(j) + " " + label + "\t" + str(val), flush=True)

    # Evaluate the posterior predictive distribution on the test set.
    M = 1000

    N = x_test.shape[0]
    x_batches = np.array_split(x_test, N // M)
    y_batches = np.array_split(y_test, N // M)

    NLL = 0

    for j in range(N // M):
        batch_x = np.reshape(x_batches[j],
                             [x_batches[j].shape[0], -1]).astype(np.float32)
        batch_y = np.reshape(y_batches[j],
                             [y_batches[j].shape[0], -1]).astype(np.float32)
        y_pred_list = []
        for i in range(PARTICLES):
            y_pred_list.append(
                sess.run(pylogprob,
                         feed_dict={
                             x_batch: batch_x,
                             y_batch: batch_y
                         }))
        y_preds = np.concatenate(y_pred_list, axis=1)
        # Log-mean-exp over the particles, computed in numpy with the usual
        # max-shift for stability (this also avoids adding new ops to the
        # graph inside the evaluation loop).
        y_max = y_preds.max(axis=1, keepdims=True)
        score = np.sum(
            np.log(np.mean(np.exp(y_preds - y_max), axis=1)) + y_max[:, 0])
        NLL = NLL + score
        if verbose:
            print("\n" + str(j) + " batch\t" + str(score), end="", flush=True)

    print("\nNLL: " + str(NLL))

    return NLL
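

# A minimal usage sketch for the function above; this call is not part of the
# original script. NPixels=28 reflects that fashion-mnist images are 28x28,
# and the short epoch count is only for illustration.
if __name__ == "__main__":
    test_score = PAC2VI(dataSource=tf.keras.datasets.fashion_mnist,
                        NPixels=28,
                        algorithm=2,  # PAC^2-Variational learning
                        num_epochs=5)
    print("PAC2VI returned: " + str(test_score))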
Example #5
            t.append(sess.run([elbo]))

    w_mean_inferred = sess.run(qw_mean)
    w_stddv_inferred = sess.run(qw_stddv)
    z_mean_inferred = sess.run(qz_mean)
    z_stddv_inferred = sess.run(qz_stddv)

print("Inferred axes:")
print(w_mean_inferred)
print("Standard Deviation:")
print(w_stddv_inferred)

plt.plot(range(1, num_epochs, 5), t)
plt.show()

with ed.interception(ed.make_value_setter(w=w_mean_inferred,
                                          z=z_mean_inferred)):
    generate = probabilistic_matrix_factorization(
        data_dim=N,
        latent_dim=D,
        num_datapoints=M,
        stddv_datapoints=stddv_datapoints)

with tf.Session() as sess:
    x_generated, _ = sess.run(generate)

plt.scatter(data[:, 0],
            data[:, 1],
            color='blue',
            alpha=0.1,
            label='Actual data')
plt.scatter(x_generated[0, :],
Example #6
def main(argv):
    del argv  # unused
    if tf.io.gfile.exists(FLAGS.model_dir):
        tf.compat.v1.logging.warning(
            "Warning: deleting old log directory at {}".format(
                FLAGS.model_dir))
        tf.io.gfile.rmtree(FLAGS.model_dir)
    tf.io.gfile.makedirs(FLAGS.model_dir)
    tf.compat.v1.enable_eager_execution()

    grammar = SmilesGrammar()
    synthetic_data_distribution = ProbabilisticGrammar(
        grammar=grammar,
        latent_size=FLAGS.latent_size,
        num_units=FLAGS.num_units)

    print("Random examples from synthetic data distribution:")
    for _ in range(5):
        productions = synthetic_data_distribution()
        string = grammar.convert_to_string(productions)
        print(string)

    probabilistic_grammar = ProbabilisticGrammar(grammar=grammar,
                                                 latent_size=FLAGS.latent_size,
                                                 num_units=FLAGS.num_units)
    probabilistic_grammar_variational = ProbabilisticGrammarVariational(
        latent_size=FLAGS.latent_size)

    checkpoint = tf.train.Checkpoint(
        synthetic_data_distribution=synthetic_data_distribution,
        probabilistic_grammar=probabilistic_grammar,
        probabilistic_grammar_variational=probabilistic_grammar_variational)
    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
    writer = tf.compat.v2.summary.create_file_writer(FLAGS.model_dir)
    writer.set_as_default()

    start_time = time.time()
    for step in range(FLAGS.max_steps):
        productions = synthetic_data_distribution()
        with tf.GradientTape() as tape:
            # Sample from amortized variational distribution and record its trace.
            with ed.tape() as variational_tape:
                _ = probabilistic_grammar_variational(productions)

            # Set model trace to take on the data's values and the sample from the
            # variational distribution.
            values = {"latent_code": variational_tape["latent_code_posterior"]}
            values.update({
                "production_" + str(t): production
                for t, production in enumerate(tf.unstack(productions, axis=1))
            })
            with ed.tape() as model_tape:
                with ed.interception(ed.make_value_setter(**values)):
                    _ = probabilistic_grammar()

            # Compute the ELBO given the variational sample, averaged over the batch
            # size and the number of time steps (number of productions). Although the
            # ELBO per data point sums over time steps, we average in order to have a
            # value that remains on the same scale across batches.
            log_likelihood = 0.
            for name, rv in six.iteritems(model_tape):
                if name.startswith("production"):
                    log_likelihood += rv.distribution.log_prob(rv.value)

            kl = tfp.distributions.kl_divergence(
                variational_tape["latent_code_posterior"].distribution,
                model_tape["latent_code"].distribution)

            timesteps = tf.cast(productions.shape[1], dtype=tf.float32)
            elbo = tf.reduce_mean(input_tensor=log_likelihood - kl) / timesteps
            loss = -elbo
            with tf.compat.v2.summary.record_if(
                    lambda: tf.math.equal(0, global_step % 500)):
                tf.compat.v2.summary.scalar(
                    "log_likelihood",
                    tf.reduce_mean(input_tensor=log_likelihood) / timesteps,
                    step=global_step)
                tf.compat.v2.summary.scalar("kl",
                                            tf.reduce_mean(input_tensor=kl) /
                                            timesteps,
                                            step=global_step)
                tf.compat.v2.summary.scalar("elbo", elbo, step=global_step)

        variables = (probabilistic_grammar.variables +
                     probabilistic_grammar_variational.variables)
        grads = tape.gradient(loss, variables)
        grads_and_vars = zip(grads, variables)
        optimizer.apply_gradients(grads_and_vars, global_step)

        if step % 500 == 0:
            duration = time.time() - start_time
            print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
                step, loss, duration))
            checkpoint.save(file_prefix=FLAGS.model_dir)
Example #7
def model_fn(features, labels, mode, params, config):
  """Builds the model function for use in an Estimator.
  Arguments:
    features: The input features for the Estimator.
    labels: The labels, unused here.
    mode: Signifies whether it is train or test or predict.
    params: Some hyperparameters as a dictionary.
    config: The RunConfig, unused here.
  Returns:
    EstimatorSpec: A tf.estimator.EstimatorSpec instance.
  """
  del labels, config

  # Set up the model's learnable parameters.
  logit_concentration = tf.compat.v1.get_variable(
      "logit_concentration",
      shape=[1, params["num_topics"]],
      initializer=tf.compat.v1.initializers.constant(
          _softplus_inverse(params["prior_initial_value"])))
  concentration = _clip_dirichlet_parameters(
      tf.nn.softplus(logit_concentration))

  num_words = features.shape[1]
  topics_words_logits = tf.compat.v1.get_variable(
      "topics_words_logits",
      shape=[params["num_topics"], num_words],
      initializer=tf.compat.v1.glorot_normal_initializer())
  topics_words = tf.nn.softmax(topics_words_logits, axis=-1)

  # Compute expected log-likelihood. First, sample from the variational
  # distribution; second, compute the log-likelihood given the sample.
  lda_variational, encoder_net = make_lda_variational(
      params["activation"],
      params["num_topics"],
      params["layer_sizes"])
  with ed.tape() as variational_tape:
    _ = lda_variational(features)

  with ed.tape() as model_tape:
    with ed.interception(
        ed.make_value_setter(topics=variational_tape["topics_posterior"])):
      posterior_predictive = latent_dirichlet_allocation(concentration,
                                                         topics_words)

  log_likelihood = posterior_predictive.distribution.log_prob(features)
  tf.compat.v1.summary.scalar("log_likelihood",
                              tf.reduce_mean(input_tensor=log_likelihood))

  # Compute the KL-divergence between two Dirichlets analytically.
  # The sampled KL does not work well for "sparse" distributions
  # (see Appendix D of [2]).
  kl = variational_tape["topics_posterior"].distribution.kl_divergence(
      model_tape["topics"].distribution)
  tf.compat.v1.summary.scalar("kl", tf.reduce_mean(input_tensor=kl))

  # Ensure that the KL is non-negative (up to a very small slack).
  # Negative KL can happen due to numerical instability.
  with tf.control_dependencies(
      [tf.compat.v1.assert_greater(kl, -1e-3, message="kl")]):
    kl = tf.identity(kl)

  elbo = log_likelihood - kl
  avg_elbo = tf.reduce_mean(input_tensor=elbo)
  tf.compat.v1.summary.scalar("elbo", avg_elbo)
  loss = -avg_elbo

  # Perform variational inference by minimizing the -ELBO.
  global_step = tf.compat.v1.train.get_or_create_global_step()
  optimizer = tf.compat.v1.train.AdamOptimizer(params["learning_rate"])

  # This implements the "burn-in" for prior parameters (see Appendix D of [2]).
  # For the first prior_burn_in_steps steps they are fixed, and then trained
  # jointly with the other parameters.
  grads_and_vars = optimizer.compute_gradients(loss)
  grads_and_vars_except_prior = [
      x for x in grads_and_vars if x[1] != logit_concentration]

  def train_op_except_prior():
    return optimizer.apply_gradients(
        grads_and_vars_except_prior,
        global_step=global_step)

  def train_op_all():
    return optimizer.apply_gradients(
        grads_and_vars,
        global_step=global_step)

  train_op = tf.cond(
      pred=global_step < params["prior_burn_in_steps"],
      true_fn=train_op_except_prior,
      false_fn=train_op_all)

  # The perplexity is an exponent of the average negative ELBO per word,
  # computed here over the whole batch rather than per document.
  log_perplexity = -tf.reduce_sum(elbo) / tf.reduce_sum(features)
  (log_perplexity_tensor,
   log_perplexity_update) = tf.compat.v1.metrics.mean(log_perplexity)
  perplexity_tensor = tf.exp(log_perplexity_tensor)

  # Obtain the topics summary. Implemented as a py_func for simplicity.
  topics = tf.compat.v1.py_func(
      functools.partial(get_topics_strings, vocabulary=params["vocabulary"]),
      [topics_words, concentration],
      tf.string,
      stateful=False)
  tf.compat.v1.summary.text("topics", topics)
  
  var_concentration = _clip_dirichlet_parameters(encoder_net(features))

  return tf.estimator.EstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      eval_metric_ops={
          "elbo": tf.compat.v1.metrics.mean(elbo),
          "log_likelihood": tf.compat.v1.metrics.mean(log_likelihood),
          "kl": tf.compat.v1.metrics.mean(kl),
          "perplexity": (perplexity_tensor, log_perplexity_update),
          "topics": (topics, tf.no_op()),
      },
      predictions={"topics_posterior_params": var_concentration})
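
# A minimal sketch (not from the original example) of how a model_fn like the
# one above could be handed to an Estimator. The params keys mirror the ones
# read inside model_fn; the concrete values and the helper name
# `build_lda_estimator` are illustrative assumptions.
def build_lda_estimator(model_dir, vocabulary):
  params = {
      "activation": tf.nn.relu,
      "num_topics": 50,
      "layer_sizes": [300, 300, 300],
      "learning_rate": 3e-4,
      "prior_initial_value": 0.7,
      "prior_burn_in_steps": 120000,
      "vocabulary": vocabulary,
  }
  return tf.estimator.Estimator(
      model_fn,
      params=params,
      config=tf.estimator.RunConfig(model_dir=model_dir))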
Example #8
def main(argv):
  del argv  # unused
  FLAGS.layer_sizes = [int(layer_size) for layer_size in FLAGS.layer_sizes]
  if len(FLAGS.layer_sizes) != 3:
    raise NotImplementedError("Specifying fewer or more than 3 layers is not "
                              "currently available.")
  if tf.io.gfile.exists(FLAGS.model_dir):
    tf.compat.v1.logging.warning(
        "Warning: deleting old log directory at {}".format(FLAGS.model_dir))
    tf.io.gfile.rmtree(FLAGS.model_dir)
  tf.io.gfile.makedirs(FLAGS.model_dir)

  if FLAGS.fake_data:
    bag_of_words = np.random.poisson(1., size=[10, 25])
    words = [str(i) for i in range(25)]
  else:
    bag_of_words, words = load_nips2011_papers(FLAGS.data_dir)

  total_count = np.sum(bag_of_words)
  bag_of_words = tf.cast(bag_of_words, dtype=tf.float32)
  data_size, feature_size = bag_of_words.shape

  # Compute expected log-likelihood. First, sample from the variational
  # distribution; second, compute the log-likelihood given the sample.
  qw2, qw1, qw0, qz2, qz1, qz0 = deep_exponential_family_variational(
      data_size,
      feature_size,
      FLAGS.layer_sizes)

  with ed.tape() as model_tape:
    with ed.interception(ed.make_value_setter(w2=qw2, w1=qw1, w0=qw0,
                                              z2=qz2, z1=qz1, z0=qz0)):
      posterior_predictive = deep_exponential_family(data_size,
                                                     feature_size,
                                                     FLAGS.layer_sizes,
                                                     FLAGS.shape)

  log_likelihood = posterior_predictive.distribution.log_prob(bag_of_words)
  log_likelihood = tf.reduce_sum(input_tensor=log_likelihood)
  tf.compat.v1.summary.scalar("log_likelihood", log_likelihood)

  # Compute analytic KL-divergence between variational and prior distributions.
  kl = 0.
  for rv_name, variational_rv in [("z0", qz0), ("z1", qz1), ("z2", qz2),
                                  ("w0", qw0), ("w1", qw1), ("w2", qw2)]:
    kl += tf.reduce_sum(
        input_tensor=variational_rv.distribution.kl_divergence(
            model_tape[rv_name].distribution))

  tf.compat.v1.summary.scalar("kl", kl)

  elbo = log_likelihood - kl
  tf.compat.v1.summary.scalar("elbo", elbo)
  optimizer = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
  train_op = optimizer.minimize(-elbo)

  sess = tf.compat.v1.Session()
  summary = tf.compat.v1.summary.merge_all()
  summary_writer = tf.compat.v1.summary.FileWriter(FLAGS.model_dir, sess.graph)
  start_time = time.time()

  sess.run(tf.compat.v1.global_variables_initializer())
  for step in range(FLAGS.max_steps):
    start_time = time.time()
    _, elbo_value = sess.run([train_op, elbo])
    if step % 500 == 0:
      duration = time.time() - start_time
      print("Step: {:>3d} Loss: {:.3f} ({:.3f} sec)".format(
          step, elbo_value, duration))
      summary_str = sess.run(summary)
      summary_writer.add_summary(summary_str, step)
      summary_writer.flush()

      # Compute perplexity of the full data set. The model's negative
      # log-likelihood of data is upper bounded by the variational objective.
      negative_log_likelihood = -elbo_value
      perplexity = np.exp(negative_log_likelihood / total_count)
      print("Negative log-likelihood <= {:0.3f}".format(
          negative_log_likelihood))
      print("Perplexity <= {:0.3f}".format(perplexity))

      # Print top 10 words for first 10 topics.
      qw0_values = sess.run(qw0)
      for k in range(min(10, FLAGS.layer_sizes[-1])):
        top_words_idx = qw0_values[k, :].argsort()[-10:][::-1]
        top_words = " ".join([words[i] for i in top_words_idx])
        print("Topic {}: {}".format(k, top_words))
Example #9
                     stddv_datapoints=stddv_datapoints,
                     w=w,
                     z=z,
                     x=x_train)


energy = -target(w, z)
optimizer = tf.train.AdamOptimizer(learning_rate=0.05)
train = optimizer.minimize(energy)
init = tf.global_variables_initializer()
t = []
num_epochs = 200
with tf.Session() as sess:
    sess.run(init)
    for i in range(num_epochs):
        sess.run(train)
        if i % 5 == 0:
            cE, cw, cz = sess.run([energy, w, z])
            t.append(cE)
    w_inferred_map = sess.run(w)
    z_inferred_map = sess.run(z)
with ed.interception(ed.make_value_setter(w=w_inferred_map, z=z_inferred_map)):
    generate = probabilistic_pca(data_dim=data_dim,
                                 latent_dim=latent_dim,
                                 num_datapoints=num_datapoints,
                                 stddv_datapoints=stddv_datapoints)
with tf.Session() as sess:
    x_generated, _ = sess.run(generate)
plt.imshow(x_generated)
plt.show()