Example #1
0
        def step_fn(inputs):
            """Per-Replica StepFn.

            Runs one training step on this replica: optionally tiles the batch
            for the ensemble (`FLAGS.ensemble_size`) and for multiple training
            samples (`FLAGS.num_train_samples`), computes the loss (negative
            log-likelihood + L2 + annealed KL), applies the gradients and
            updates the training metrics.

            `model`, `optimizer`, `strategy`, `metrics`, `training_diversity`
            and `train_dataset_size` are read from the enclosing scope.

            Args:
              inputs: A pair `(images, labels)`. `images` is tiled with four
                multiples below, so it is treated as a rank-4 tensor.
            """
            images, labels = inputs
            if FLAGS.version2 and FLAGS.ensemble_size > 1:
                # Repeat the batch `ensemble_size` times along axis 0.
                images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
                # With member sampling or expected probs the predictions are
                # reduced back to a single copy of the batch further down, so
                # the labels are only tiled when neither option is active.
                if not (FLAGS.member_sampling or FLAGS.expected_probs):
                    labels = tf.tile(labels, [FLAGS.ensemble_size])

            if FLAGS.num_train_samples > 1:
                # Labels are not tiled here: the per-sample probabilities are
                # averaged over the sample axis before the loss is computed.
                images = tf.tile(images, [FLAGS.num_train_samples, 1, 1, 1])

            with tf.GradientTape() as tape:
                logits = model(images, training=True)
                probs = tf.nn.softmax(logits)
                # Diversity evaluation.
                if FLAGS.version2 and FLAGS.ensemble_size > 1:
                    # View the predictions as [ensemble_size, -1, ...].
                    per_probs = tf.reshape(
                        probs,
                        tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]],
                                  0))

                    # The result is iterated with `.items()` at the end of
                    # this function, i.e. it is a mapping of name -> value.
                    diversity_results = ed.metrics.average_pairwise_diversity(
                        per_probs, FLAGS.ensemble_size)

                if FLAGS.num_train_samples > 1:
                    # Average the probabilities over the training samples.
                    probs = tf.reshape(
                        probs,
                        tf.concat(
                            [[FLAGS.num_train_samples, -1], probs.shape[1:]],
                            0))
                    probs = tf.reduce_mean(probs, 0)

                if FLAGS.member_sampling and FLAGS.version2 and FLAGS.ensemble_size > 1:
                    # Keep the predictions of one uniformly drawn index in
                    # [0, ensemble_size): a [1, ensemble_size] one-hot row is
                    # multiplied with the [ensemble_size, -1] view of `probs`,
                    # which selects a single row, then the trailing shape is
                    # restored.
                    idx = tf.random.uniform([],
                                            maxval=FLAGS.ensemble_size,
                                            dtype=tf.int64)
                    idx_one_hot = tf.expand_dims(
                        tf.one_hot(idx, FLAGS.ensemble_size,
                                   dtype=probs.dtype), 0)
                    probs_shape = probs.shape
                    probs = tf.reshape(probs, [FLAGS.ensemble_size, -1])
                    probs = tf.matmul(idx_one_hot, probs)
                    probs = tf.reshape(probs,
                                       tf.concat([[-1], probs_shape[1:]], 0))

                elif FLAGS.expected_probs and FLAGS.version2 and FLAGS.ensemble_size > 1:
                    # Average the probabilities over the leading
                    # `ensemble_size` axis.
                    probs = tf.reshape(
                        probs,
                        tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]],
                                  0))
                    probs = tf.reduce_mean(probs, 0)

                # The loss is computed on probabilities, not on logits.
                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        labels, probs))

                filtered_variables = []
                for var in model.trainable_variables:
                    # Apply l2 on the slow weights and bias terms. This excludes BN
                    # parameters and fast weight approximate posterior/prior parameters,
                    # but pay caution to their naming scheme.
                    if 'kernel' in var.name or 'bias' in var.name:
                        filtered_variables.append(tf.reshape(var, (-1, )))

                # `tf.nn.l2_loss` returns sum(t ** 2) / 2, hence the factor 2.
                l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
                    tf.concat(filtered_variables, axis=0))
                # Presumably `model.losses` holds the KL terms of the
                # variational layers -- TODO confirm against the model.
                kl = sum(model.losses) / train_dataset_size
                # Linear KL annealing: the weight grows with the optimizer
                # step count and is capped at 1.
                kl_scale = tf.cast(optimizer.iterations + 1, kl.dtype)
                kl_scale /= FLAGS.kl_annealing_steps
                kl_scale = tf.minimum(1., kl_scale)
                kl_loss = kl_scale * kl

                # Scale the loss given the TPUStrategy will reduce sum all gradients.
                loss = negative_log_likelihood + l2_loss + kl_loss
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)

            # Separate learning rate implementation.
            grad_list = []
            if FLAGS.fast_weight_lr_multiplier != 1.0:
                grads_and_vars = list(zip(grads, model.trainable_variables))
                for vec, var in grads_and_vars:
                    # Apply different learning rate on the fast weight approximate
                    # posterior/prior parameters. This excludes BN and slow weights,
                    # but pay caution to the naming scheme.
                    # NOTE(review): a variable whose name contains 'bias' (but
                    # neither 'batch_norm' nor 'kernel') also takes this
                    # branch and gets the multiplier -- confirm this is
                    # intended.
                    if ('batch_norm' not in var.name
                            and 'kernel' not in var.name):
                        grad_list.append(
                            (vec * FLAGS.fast_weight_lr_multiplier, var))
                    else:
                        grad_list.append((vec, var))
                optimizer.apply_gradients(grad_list)
            else:
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))

            # Metrics use the unscaled loss and the (possibly reduced) probs.
            metrics['train/ece'].update_state(labels, probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, probs)
            if FLAGS.version2 and FLAGS.ensemble_size > 1:
                for k, v in diversity_results.items():
                    training_diversity['train/' + k].update_state(v)
Example #2
0
 def grad(model, inputs, targets):
     """Return the total training loss and its gradients for `model`.

     The value of `loss(...)` (evaluated with `training=True`) is augmented
     with every entry of `model.losses`, then differentiated with respect to
     `model.trainable_variables`.
     """
     with tf.GradientTape() as recorder:
         total = loss(model, inputs, targets, training=True)
         for extra_term in model.losses:
             total += extra_term
     gradients = recorder.gradient(total, model.trainable_variables)
     return total, gradients
Example #3
0
 def compare(a, b):
     """Assert that `g(a, b)` equals the taped gradient of `f(a, b)` w.r.t. `a`."""
     with tf.GradientTape() as recorder:
         # `a` is watched explicitly so the tape tracks it.
         recorder.watch(a)
         output = f(a, b)
     reference = recorder.gradient(output, a)
     actual = g(a, b)
     self.assertAllEqual(reference, actual)
Example #4
0
 def f(x):
     """Return the gradient of `m(x**2)` with respect to `x`."""
     with tf.GradientTape() as recorder:
         recorder.watch(x)
         squared = x**2
         output = m(squared)
     return recorder.gradient(output, x)
Example #5
0
def optimize():
  """Apply one optimizer step to `gp` and return the loss.

  The loss is the negative log-probability of `observations` under `gp`.
  """
  with tf.GradientTape() as recorder:
    neg_log_likelihood = -gp.log_prob(observations)
  variables = gp.trainable_variables
  gradients = recorder.gradient(neg_log_likelihood, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return neg_log_likelihood
 def jit_with_grad(mat):
     """Evaluate `alt_chol_jit(mat)` while a gradient tape is active."""
     with tf.GradientTape():
         result = alt_chol_jit(mat)
     return result
Example #7
0
def train_op():
    """Take one gradient step on `trainable_log_rates`.

    Returns:
        A pair `(neg_log_prob, rates)`: the loss evaluated before the update
        and `exp(trainable_log_rates)` evaluated after it.
    """
    with tf.GradientTape() as recorder:
        loss_value = -log_prob()
    gradients = recorder.gradient(loss_value, [trainable_log_rates])
    optimizer.apply_gradients([(gradients[0], trainable_log_rates)])
    rates = tf.math.exp(trainable_log_rates)
    return loss_value, rates
Example #8
0
 def step_fn(inputs):
   """Run one optimizer step of `model2` on an `(images, targets)` batch."""
   batch_images, batch_targets = inputs
   with tf.GradientTape() as recorder:
     batch_loss = compute_loss2(batch_images, batch_targets)
   # Gradients are taken w.r.t. all of `model2.variables`.
   gradients = recorder.gradient(batch_loss, model2.variables)
   updates = zip(gradients, model2.variables)
   optimizer.apply_gradients(updates)
Example #9
0
 def minimize_fn():
     """Apply one `opt` step on `var_list` for `get_loss()`; return the result of `apply_gradients`."""
     with tf.GradientTape() as recorder:
         current_loss = get_loss()
     gradients = recorder.gradient(current_loss, var_list)
     grads_and_vars = zip(gradients, var_list)
     return opt.apply_gradients(grads_and_vars)
 def fn():
     """Call `layer` in training mode on an empty batch and return the tape.

     The input is an all-ones tensor of shape `(0, 2, 2, 2)`, i.e. it has a
     zero-sized leading dimension.
     """
     with tf.GradientTape() as recorder:
         layer(tf.ones((0, 2, 2, 2)), training=True)
     return recorder
Example #11
0
    def test_latent_dirichlet_allocation(self, jd_class):  # pylint: disable=g-doc-args
        """Tests Latent Dirichlet Allocation joint model.

        The LDA generative process can be written as:

        ```none
        N[i] ~ Poisson(xi)
        theta[i] ~ Dirichlet(alpha)
        Z[i] ~ Multinomial(N[i], theta[i])
        for k in 1...K:
          X[i,k] ~ Multinomial(Z[i, k], beta[j])
        ```

        Typically `xi` is specified and `alpha`, `beta` are fit using type-II
        maximum likelihood estimators.

        `jd_class` selects which auto-batched joint distribution class is
        built: the coroutine, sequential (list) or named (OrderedDict) form.

        Reference: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
        """
        # Every random draw below takes a fresh seed from this stream.
        seed = test_util.test_seed_stream()
        # Hyperparameters.
        num_topics = 3
        num_words = 10
        avg_doc_length = 5
        u = tfd.Uniform(low=-1., high=1.)
        # `alpha` goes through a Softplus bijector; the underlying variable is
        # what is compared against `trainable_variables` below.
        alpha = tfp.util.TransformedVariable(u.sample([num_topics],
                                                      seed=seed()),
                                             tfb.Softplus(),
                                             name='alpha')
        beta = tf.Variable(u.sample([num_topics, num_words], seed=seed()),
                           name='beta')

        # Note near 1:1 with mathematical specification.
        # NOTE(review): the original comment here mentioned the use of
        # `Independent` to aggregate multinomials across topics, but no
        # `tfd.Independent` appears in this variant; presumably auto-batching
        # makes it unnecessary -- TODO confirm.
        def lda_coroutine_model():
            n = yield Root(tfd.Poisson(rate=avg_doc_length))
            theta = yield Root(tfd.Dirichlet(concentration=alpha))
            z = yield tfd.Multinomial(total_count=n, probs=theta)
            yield tfd.Multinomial(total_count=z, logits=beta)

        # NOTE(review): `model` stays unbound (NameError below) if `jd_class`
        # is none of the three classes handled here.
        if jd_class is tfd.JointDistributionCoroutineAutoBatched:
            model = lda_coroutine_model
        elif jd_class is tfd.JointDistributionSequentialAutoBatched:
            model = [
                tfd.Poisson(rate=avg_doc_length),  # n
                tfd.Dirichlet(concentration=alpha),  # theta
                lambda theta, n: tfd.Multinomial(total_count=n, probs=theta
                                                 ),  # z
                lambda z: tfd.Multinomial(total_count=z, logits=beta)
            ]
        elif jd_class is tfd.JointDistributionNamedAutoBatched:
            model = collections.OrderedDict((
                ('n', tfd.Poisson(rate=avg_doc_length)),
                ('theta', tfd.Dirichlet(concentration=alpha)),
                ('z',
                 lambda theta, n: tfd.Multinomial(total_count=n, probs=theta)),
                ('X', lambda z: tfd.Multinomial(total_count=z, logits=beta))))

        # TODO(b/159842104): Enable autovectorization for Multinomial sampling.
        lda = jd_class(model, validate_args=True, use_vectorized_map=False)

        # Now, let's sample some "documents" and compute the log-prob of each.
        docs_shape = [2, 4]  # That is, 8 docs in the shape of [2, 4].
        sample = lda.sample(docs_shape, seed=seed())
        log_probs = lda.log_prob(sample)
        self.assertEqual(docs_shape, log_probs.shape)

        # Verify we correctly track trainable variables.
        self.assertLen(lda.trainable_variables, 2)
        self.assertIs(alpha.pretransformed_input, lda.trainable_variables[0])
        self.assertIs(beta, lda.trainable_variables[1])

        # Ensure we can compute gradients.
        with tf.GradientTape() as tape:
            # Note: The samples are not taped, hence implicitly "stop_gradient."
            negloglik = -lda.log_prob(sample)
        grads = tape.gradient(negloglik, lda.trainable_variables)

        # One gradient per trainable variable, each with the variable's shape.
        self.assertLen(grads, 2)
        self.assertAllEqual((alpha.pretransformed_input.shape, beta.shape),
                            (grads[0].shape, grads[1].shape))
        self.assertAllNotNone(grads)
Example #12
0
    def _reparameterize_sample(self, x):
        """Adds reparameterization (pathwise) gradients to samples of the mixture.

        Implicit reparameterization gradients are
           dx/dphi = -(d transform(x, phi) / dx)^-1 * d transform(x, phi) / dphi,
        where transform(x, phi) is distributional transform that removes all
        parameters from samples x.

        We implement them by replacing x with
          -stop_gradient(d transform(x, phi) / dx)^-1 * transform(x, phi)
        for the backward pass (gradient computation).
        The derivative of this quantity w.r.t. phi is then the implicit
        reparameterization gradient.
        Note that this replaces the gradients w.r.t. both the mixture
        distribution parameters and components distributions parameters.

        Limitations:
          1. Fundamental: components must be fully reparameterized.
          2. Distributional transform is currently only implemented for
            factorized components.
          3. Distributional transform currently only works for known rank of the
            batch tensor.

        Arguments:
          x: Sample of mixture distribution

        Returns:
          Tensor with same value as x, but with reparameterization gradients
        """
        # Remove the existing gradients of x wrt parameters of the components.
        x = tf.stop_gradient(x)

        x_2d_shape = [-1, self._event_size]  # [S*prod(B), prod(E)]

        # Perform distributional transform of x in [S, B, E] shape,
        # but have Jacobian of size [S*prod(B), prod(E), prod(E)].
        def reshaped_distributional_transform(x_2d):
            return tf.reshape(
                self._distributional_transform(tf.reshape(x_2d, tf.shape(x))),
                x_2d_shape)

        # transform_2d: [S*prod(B), prod(E)]
        # jacobian: [S*prod(B), prod(E), prod(E)]
        x_2d = tf.reshape(x, x_2d_shape)
        try:
            # First attempt: default (vectorized) batch Jacobian.
            with tf.GradientTape() as tape:
                tape.watch(x_2d)
                transform_2d = reshaped_distributional_transform(x_2d)
            jacobian = tape.batch_jacobian(transform_2d, x_2d)
        except TypeError:
            # TODO(b/139374388): Remove exception workaround.
            # Fallback: recompute the transform on a persistent tape and
            # disable pfor. Presumably the non-pfor path queries the tape
            # more than once, hence `persistent=True` -- TODO confirm.
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(x_2d)
                transform_2d = reshaped_distributional_transform(x_2d)
            jacobian = tape.batch_jacobian(transform_2d,
                                           x_2d,
                                           experimental_use_pfor=False)

        # We only provide the first derivative; the second derivative computed by
        # autodiff would be incorrect, so we raise an error if it is requested.
        transform_2d = _prevent_2nd_derivative(transform_2d)

        # Compute [- stop_gradient(jacobian)^-1 * transform] by solving a linear
        # system. The Jacobian is lower triangular because the distributional
        # transform for i-th event dimension does not depend on the next
        # dimensions.
        surrogate_x_2d = -tf.linalg.triangular_solve(
            tf.stop_gradient(jacobian),
            tf.expand_dims(transform_2d, axis=-1),
            lower=True)  # [S*prod(B), prod(E), 1]
        surrogate_x = tf.reshape(surrogate_x_2d, tf.shape(x))

        # Replace gradients of x with gradients of surrogate_x, but keep the value.
        # The bracketed term is numerically zero in the forward pass, so the
        # value is `x`; only `surrogate_x` contributes to the gradient.
        return x + (surrogate_x - tf.stop_gradient(surrogate_x))
Example #13
0
 def compute_jacobian(x):
     """Return the Jacobian of `tfp.math.log_cumsum_exp(x)` with respect to `x`."""
     with tf.GradientTape() as recorder:
         recorder.watch(x)
         output = tfp.math.log_cumsum_exp(x)
     jacobian = recorder.jacobian(output, x)
     return jacobian
Example #14
0
def helper_keras_gradtape(
    trial_dir,
    save_all=False,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    hook=None,
    batch_size=64,
    persistent=False,
):
    """Train a small MNIST classifier for one epoch with a custom tape loop.

    The optimizer and every `tf.GradientTape` are wrapped by a debugger hook,
    and the running accuracy is recorded through the hook after each batch.
    The hook is closed before returning.

    Args:
        trial_dir: Output directory passed to `smd.KerasHook` when no `hook`
            is supplied.
        save_all: Forwarded to `smd.KerasHook`.
        include_collections: Forwarded to `smd.KerasHook`. When given (and
            `save_all` is false), every other collection registered on the
            hook gets `SaveConfig(end_step=0)`.
        reduction_config: Forwarded to `smd.KerasHook`.
        save_config: Forwarded to `smd.KerasHook`; defaults to
            `SaveConfig(save_interval=3)`.
        hook: Pre-built hook. When given, `trial_dir`, `save_all`,
            `include_collections`, `reduction_config` and `save_config` are
            ignored.
        batch_size: Batch size of the training dataset.
        persistent: Whether the gradient tape is persistent. When true,
            `gradient()` is called a second time on the same tape.
    """
    mnist = tf.keras.datasets.mnist
    (x_train, y_train), _ = mnist.load_data()
    # Add a channel axis and scale the pixel values by 1/255.
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(x_train[..., tf.newaxis] / 255,
                 tf.float32), tf.cast(y_train, tf.int64)))
    dataset = dataset.shuffle(1000).batch(batch_size)

    model = tf.keras.models.Sequential([
        # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation="softmax"),
    ])

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = smd.KerasHook(
            trial_dir,
            save_config=save_config,
            save_all=save_all,
            include_collections=include_collections,
            reduction_config=reduction_config,
        )

        if not save_all and include_collections is not None:
            # Collections the caller did not ask for get an `end_step=0`
            # save config (presumably so they are never saved -- confirm).
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(
                        end_step=0)

    opt = tf.keras.optimizers.Adam()
    hook.wrap_optimizer(opt)

    # The last layer already applies softmax, so the model emits
    # probabilities; `from_logits=True` would apply softmax a second time.
    cce = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

    n_epochs = 1
    for _ in range(n_epochs):
        for data, labels in dataset:
            # Keep the integer labels for the sparse accuracy metric; the
            # loss consumes their one-hot encoding.
            dataset_labels = labels
            labels = tf.one_hot(labels, depth=10)
            with hook.wrap_tape(
                    tf.GradientTape(persistent=persistent)) as tape:
                probs = model(data, training=True)  # (batch, 10)
                loss_value = cce(labels, probs)
            grads = tape.gradient(loss_value, model.variables)

            # By default, the resources held by a GradientTape are released as
            # soon as GradientTape.gradient() method is called. To compute
            # multiple gradients over the same computation, create a persistent
            # gradient tape. This allows multiple calls to the gradient() method
            # as resources are released when the tape object is garbage collected.
            if persistent:
                _ = tape.gradient(loss_value, model.variables)
            opt.apply_gradients(zip(grads, model.variables))
            acc = train_acc_metric(dataset_labels, probs)
            hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc)
        train_acc_metric.reset_states()

    hook.close()
Example #15
0
    def run_one_epoch(
        self,
        dataset: tf.data.Dataset,
        training: bool = False,
    ):
        """Run one pass over `dataset`, optionally updating the model.

        For every batch the logits and loss are computed under a gradient
        tape, running statistics are accumulated, a BLEU score is computed
        for the batch, and a progress line is printed. When `training` is
        true the optimizer is stepped on the batch's token cross-entropy loss.

        Args:
            dataset: Iterable of `(batch_features, batch_labels)` pairs.
            training: Whether to run the model in training mode and apply
                gradients.

        Returns:
            A 4-tuple `(avg_loss, token_accuracy, ground_truth, predictions)`:
            the summed loss divided by the number of graphs seen, the fraction
            of correctly predicted tokens, and the accumulated reference and
            hypothesis token sequences.
        """
        total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0
        ground_truth = []
        predictions = []
        # The smoothing function does not depend on the batch.
        smoothing = SmoothingFunction().method4

        for step, (batch_features, batch_labels) in enumerate(dataset):
            # NOTE(review): `batch_labels` is indexed by string key below, so
            # `len()` here looks like the number of keys rather than the
            # number of examples -- confirm what "batch_size" should hold.
            self.hyperparameters["batch_size"] = len(batch_labels)
            with tf.GradientTape() as tape:
                model_outputs = self.compute_logits(
                    batch_features,
                    tf.cast(batch_labels["target_value"], tf.int32),
                    training=training)
                result = self.compute_loss_and_acc(model_outputs,
                                                   batch_features,
                                                   batch_labels)

            total_loss += result.token_ce_loss
            num_samples += tf.cast(batch_features["num_graphs_in_batch"],
                                   tf.float32)
            num_tokens += result.num_predictions
            num_correct_tokens += result.num_correct_token_predictions

            target_texts = self.get_text_from_tensor(
                batch_labels["target_value"])
            predicted_texts = self.get_text_from_tensor(
                tf.argmax(model_outputs, 2))

            # References drop their first token and are cut at "%END%";
            # each one is wrapped in a list because `corpus_bleu` takes a
            # list of references per hypothesis.
            ref = [([x[1:x.index("%END%")] if "%END%" in x else x[1:]])
                   for x in target_texts]
            hyp = [(x[:x.index("%END%")] if "%END%" in x else x)
                   for x in predicted_texts]
            try:
                bleu_score = corpus_bleu(ref,
                                         hyp,
                                         smoothing_function=smoothing)
            except Exception:
                # Best-effort metric: a failed BLEU computation is reported
                # as 0, but KeyboardInterrupt/SystemExit still propagate.
                bleu_score = 0

            ground_truth += ref
            predictions += hyp

            if training:
                gradients = tape.gradient(result.token_ce_loss,
                                          self.trainable_variables)
                self.optimizer.apply_gradients(
                    zip(gradients, self.trainable_variables))

            print(
                "   Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f | bleu: %.5f"
                % (step, total_loss / num_samples, result.token_ce_loss,
                   float(result.num_correct_token_predictions) /
                   (float(result.num_predictions) + float(1e-7)), bleu_score),
                end="\n",
            )
        # Carriage return + ANSI "erase line" to clear the current line.
        print("\r\x1b[K", end="")

        return (total_loss / num_samples,
                float(num_correct_tokens) / (float(num_tokens) + 1e-7),
                ground_truth, predictions)
Example #16
0
    def plot_jacobians(self,
                       which,
                       intervals,
                       arrow_intervals,
                       scale=2,
                       figsize=None):
        """Plot the source density overlaid with arrows taken from a Jacobian.

        Args:
          which: Either "analysis" or "synthesis"; selects the transform whose
            batch Jacobian is drawn. For "analysis" the Jacobian is inverted
            before plotting.
          intervals: Two `(start, stop, num)` triples defining the grid on
            which `self.source.prob` is evaluated for the background image.
          arrow_intervals: Two `(start, stop, num)` triples defining the grid
            of points the selected transform is applied to.
          scale: Passed to `plt.quiver` as `scale`.
          figsize: Figure size; defaults to `(16, 14)`.

        Raises:
          ValueError: If the model or either interval list is not 2D, or if
            `which` is not one of the two accepted strings.
        """
        dims = (len(intervals), len(arrow_intervals), self.ndim_source,
                self.ndim_latent)
        if any(d != 2 for d in dims):
            raise ValueError("This method is only defined for 2D models.")
        if which not in ("analysis", "synthesis"):
            raise ValueError("`which` must be 'analysis' or 'synthesis'.")

        def grid_from(specs):
            # One linspace per axis, combined into an "ij"-indexed grid with
            # the coordinates stacked along the last axis.
            axes = [
                tf.linspace(float(spec[0]), float(spec[1]), int(spec[2]))
                for spec in specs
            ]
            return tf.stack(tf.meshgrid(*axes, indexing="ij"), axis=-1)

        data = grid_from(intervals)
        data_dist = self.source.prob(data).numpy()

        # Flatten the arrow grid into a batch of points.
        arrow_grid = grid_from(arrow_intervals)
        arrow_points = tf.reshape(arrow_grid, (-1, arrow_grid.shape[-1]))

        analysing = which == "analysis"
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(arrow_points)
            if analysing:
                mapped = self.analysis(arrow_points)
            else:
                mapped = self.synthesis(arrow_points)
        jacobian = tape.batch_jacobian(mapped, arrow_points)
        if analysing:
            # The grid lives in source space. First dimension is batch,
            # second is latent dim, third is source dim.
            arrow_data = arrow_points
            jacobian = tf.linalg.inv(jacobian)
        else:
            # The grid lives in latent space; arrows are anchored at the
            # synthesized points.
            arrow_data = mapped
        jacobian = tf.transpose(jacobian, (0, 2, 1))

        google_pink = (0xf4 / 255, 0x39 / 255, 0xa0 / 255)
        google_purple = (0xa1 / 255, 0x42 / 255, 0xf4 / 255)

        # Axis limits: source dimension 1 on x, source dimension 0 on y.
        x_lo, x_hi = data[0, 0, 1], data[0, -1, 1]
        y_lo, y_hi = data[0, 0, 0], data[-1, 0, 0]

        plt.figure(figsize=figsize or (16, 14))
        plt.imshow(data_dist,
                   vmin=0,
                   vmax=data_dist.max(),
                   origin="lower",
                   extent=(x_lo, x_hi, y_lo, y_hi))
        # One set of arrows per row of the (transposed) Jacobian.
        for row, color in ((0, google_pink), (1, google_purple)):
            plt.quiver(
                arrow_data[:, 1],
                arrow_data[:, 0],
                jacobian[:, row, 1],
                jacobian[:, row, 0],
                pivot="tail",
                angles="xy",
                headlength=4,
                headaxislength=4,
                units="dots",
                color=color,
                scale_units="xy",
                scale=scale,
            )
        plt.axis("image")
        plt.grid(False)
        plt.xlim(x_lo, x_hi)
        plt.ylim(y_lo, y_hi)
        plt.xlabel("source dimension 1")
        plt.ylabel("source dimension 2")
    def test_latent_dirichlet_allocation(self):
        """Tests Latent Dirichlet Allocation joint model.

        The LDA generative process can be written as:

        ```none
        N[i] ~ Poisson(xi)
        theta[i] ~ Dirichlet(alpha)
        Z[i] ~ Multinomial(N[i], theta[i])
        for k in 1...K:
          X[i,k] ~ Multinomial(Z[i, k], beta[j])
        ```

        Typically `xi` is specified and `alpha`, `beta` are fit using type-II
        maximum likelihood estimators.

        Reference: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
        """

        # Hyperparameters.
        num_topics = 3
        num_words = 10
        avg_doc_length = 5
        u = tfd.Uniform(low=-1., high=1.)
        # `alpha` is the softplus of a raw variable; the raw variable is what
        # is compared against `trainable_variables` below.
        alpha = tfp.util.DeferredTensor(
            tf.math.softplus,
            tf.Variable(u.sample([num_topics]), name='raw_alpha'))
        beta = tf.Variable(u.sample([num_topics, num_words]), name='beta')

        # LDA Model.
        # Note near 1:1 with mathematical specification. The main distinction is the
        # use of Independent--this lets us easily aggregate multinomials across
        # topics (and in any "shape" of documents).
        lda = tfd.JointDistributionSequential([
            tfd.Poisson(rate=avg_doc_length),  # n
            tfd.Dirichlet(concentration=alpha),  # theta
            lambda theta, n: tfd.Multinomial(total_count=n, probs=theta),  # z
            lambda z: tfd.Independent(  # x  pylint: disable=g-long-lambda
                tfd.Multinomial(total_count=z, logits=beta),
                reinterpreted_batch_ndims=1),
        ])

        # Now, let's sample some "documents" and compute the log-prob of each.
        docs_shape = [2, 4]  # That is, 8 docs in the shape of [2, 4].
        [n, theta, z, x] = lda.sample(docs_shape)
        log_probs = lda.log_prob([n, theta, z, x])
        self.assertEqual(docs_shape, log_probs.shape)

        # Verify we correctly track trainable variables.
        self.assertLen(lda.trainable_variables, 2)
        self.assertIs(alpha.pretransformed_input, lda.trainable_variables[0])
        self.assertIs(beta, lda.trainable_variables[1])

        # Ensure we can compute gradients.
        with tf.GradientTape() as tape:
            # Note: The samples are not taped, hence implicitly "stop_gradient."
            negloglik = -lda.log_prob([n, theta, z, x])
        grads = tape.gradient(negloglik, lda.trainable_variables)

        # One gradient per trainable variable, each with the variable's shape.
        self.assertLen(grads, 2)
        self.assertAllEqual((alpha.pretransformed_input.shape, beta.shape),
                            (grads[0].shape, grads[1].shape))
        self.assertAllNotNone(grads)
Example #18
0
	flag = True
	indeces_ = []
	while(flag):
		pair_ = [int(a) for a in list(input()) if a != ' ']
		if any([a > max_indeces for a in pair_]):
			assert (False), ('We enter index more then available!')

		print(pair_)
		if(pair_[0] == 0 or pair_[1] == 0):
			flag = False
		else:
			indeces_.append(pair_)

	return indeces_		

#indeces_ = create_a_list_indeces(num_of_parameters)
#print([(iter_1, iter_2) \
#	for iter_1, iter_2 in indeces_])


# Build the model and initialise its parameters.
markov_theory = Markovitz_theory()
markov_theory._initial_model_param_()

funct_obj = wrapper_fuzzy_constraints_(markov_theory)
# Random starting point with one entry per row of `markov_theory.weights`.
x = tf.Variable(np.random.random(markov_theory.weights.shape[0]))
print(x.shape)
# Record only the forward evaluation on the tape ...
with tf.GradientTape() as g:
	g.watch(x)
	f = funct_obj(x)
# ... and differentiate once the recording context has been left.
grad_ = g.gradient(f, x)
print(grad_.numpy())
Example #19
0
def kernel(target_log_prob_fn,
           current_state,
           step_size,
           seed=None,
           current_target_log_prob=None,
           current_grads_target_log_prob=None,
           name="nuts_kernel"):
    """Simulates a No-U-Turn Sampler (NUTS) trajectory.

    Starting from `current_state`, the trajectory is repeatedly extended in a
    randomly chosen direction (forward or reverse) by calling `_build_tree`
    with an increasing `depth`, until either the tree-building reports that it
    should stop or the two end points of the trajectory exhibit a U-turn.

    Args:
      target_log_prob_fn: Python callable which takes an argument like
        `*current_state` and returns its (possibly unnormalized) log-density
        under the target distribution.
      current_state: List of `Tensor`s representing the states to simulate
        from.
      step_size: List of `Tensor`s representing the step sizes for the leapfrog
        integrator. Must have same shape as `current_state`.
      seed: Integer to seed the random number generator.
      current_target_log_prob: Scalar `Tensor` representing the value of
        `target_log_prob_fn` at the `current_state`. If this or
        `current_grads_target_log_prob` is `None`, both are recomputed.
      current_grads_target_log_prob: List of `Tensor`s representing gradient of
        `current_target_log_prob` with respect to `current_state`. Must have
        same shape as `current_state`.
      name: A name for the operation.

    Returns:
      next_state: List of `Tensor`s representing the next states of the NUTS
        trajectory. Has same shape as `current_state`.
      next_target_log_prob: Scalar `Tensor` representing the value of
        `target_log_prob_fn` at `next_state`.
      next_grads_target_log_prob: List of `Tensor`s representing the gradient of
        `next_target_log_prob` with respect to `next_state`.

    Raises:
      NotImplementedError: If the execution mode is not eager.
      ValueError: If the log-prob and its gradients had to be computed here and
        the gradient with respect to any state part is `None`.
    """
    # The Python `while`/`if` control flow below branches on tensor values,
    # which is why only eager execution is accepted.
    if not tf.executing_eagerly():
        raise NotImplementedError("`kernel` is only available in Eager mode.")

    with tf.name_scope(name):
        with tf.name_scope("initialize"):
            current_state = [tf.convert_to_tensor(s) for s in current_state]
            step_size = [tf.convert_to_tensor(s) for s in step_size]
            # Compute the log-prob and its gradients unless the caller supplied
            # both of them.
            if (current_target_log_prob is None
                    or current_grads_target_log_prob is None):
                with tf.GradientTape() as tape:
                    # The state parts are plain tensors, so they must be
                    # watched explicitly.
                    tape.watch(current_state)
                    current_target_log_prob = target_log_prob_fn(
                        *current_state)
                current_grads_target_log_prob = tape.gradient(
                    current_target_log_prob, current_state)
                if any(grad is None for grad in current_grads_target_log_prob):
                    raise ValueError("Gradient is None for a state.")

            # Every random draw below takes a fresh seed from this stream.
            seed_stream = tfp.distributions.SeedStream(seed, "nuts_kernel")
            # Sample one standard-normal momentum per state part, with matching
            # shape and dtype.
            current_momentum = []
            for state_tensor in current_state:
                momentum_tensor = tf.random.normal(
                    shape=tf.shape(state_tensor),
                    dtype=state_tensor.dtype,
                    seed=seed_stream())
                current_momentum.append(momentum_tensor)

            # Draw a slice variable u ~ Uniform(0, p(initial state, initial
            # momentum)) and compute log u. For numerical stability, we perform this
            # in log space where log u = log (u' * p(...)) = log u' + log
            # p(...) and u' ~ Uniform(0, 1).
            log_slice_sample = tf.math.log(
                tf.random.uniform([], seed=seed_stream()))
            log_slice_sample += _log_joint(current_target_log_prob,
                                           current_momentum)

            # Initialize loop variables. It comprises a collection of information
            # about a "reverse" state, a collection of information about a "forward"
            # state, a collection of information about the next state,
            # the trajectory's tree depth, the number of candidate states, and
            # whether to continue the trajectory.
            reverse_state = current_state
            reverse_target_log_prob = current_target_log_prob
            reverse_grads_target_log_prob = current_grads_target_log_prob
            reverse_momentum = current_momentum
            forward_state = current_state
            forward_target_log_prob = current_target_log_prob
            forward_grads_target_log_prob = current_grads_target_log_prob
            forward_momentum = current_momentum
            next_state = current_state
            next_target_log_prob = current_target_log_prob
            next_grads_target_log_prob = current_grads_target_log_prob
            depth = 0
            num_states = 1
            continue_trajectory = True

        while continue_trajectory:
            # Grow the No-U-Turn Sampler trajectory by choosing a random direction and
            # simulating Hamiltonian dynamics in that direction. This extends either
            # the forward or reverse state.
            direction = tfp.math.random_rademacher([], seed=seed_stream())
            if direction < 0:
                # Extend backwards: keep the first four outputs of `_build_tree`
                # as the new reverse end point and discard the next four.
                [
                    reverse_state,
                    reverse_target_log_prob,
                    reverse_grads_target_log_prob,
                    reverse_momentum,
                    _,
                    _,
                    _,
                    _,
                    next_state_in_subtree,
                    next_target_log_prob_in_subtree,
                    next_grads_target_log_prob_in_subtree,
                    num_states_in_subtree,
                    continue_trajectory,
                ] = _build_tree(
                    target_log_prob_fn=target_log_prob_fn,
                    current_state=reverse_state,
                    current_target_log_prob=reverse_target_log_prob,
                    current_grads_target_log_prob=reverse_grads_target_log_prob,
                    current_momentum=reverse_momentum,
                    direction=direction,
                    depth=depth,
                    step_size=step_size,
                    log_slice_sample=log_slice_sample,
                    seed=seed_stream())
            else:
                # Extend forwards: discard the first four outputs and keep the
                # next four as the new forward end point.
                [
                    _,
                    _,
                    _,
                    _,
                    forward_state,
                    forward_target_log_prob,
                    forward_grads_target_log_prob,
                    forward_momentum,
                    next_state_in_subtree,
                    next_target_log_prob_in_subtree,
                    next_grads_target_log_prob_in_subtree,
                    num_states_in_subtree,
                    continue_trajectory,
                ] = _build_tree(
                    target_log_prob_fn=target_log_prob_fn,
                    current_state=forward_state,
                    current_target_log_prob=forward_target_log_prob,
                    current_grads_target_log_prob=forward_grads_target_log_prob,
                    current_momentum=forward_momentum,
                    direction=direction,
                    depth=depth,
                    step_size=step_size,
                    log_slice_sample=log_slice_sample,
                    seed=seed_stream())

            if continue_trajectory:
                # If the built tree did not terminate, accept the tree's next state
                # with a certain probability.
                # That probability is min(1, num_states_in_subtree / num_states).
                accept_state_in_subtree = _random_bernoulli(
                    [],
                    probs=tf.minimum(1., num_states_in_subtree / num_states),
                    dtype=tf.bool,
                    seed=seed_stream())
                if accept_state_in_subtree:
                    next_state = next_state_in_subtree
                    next_target_log_prob = next_target_log_prob_in_subtree
                    next_grads_target_log_prob = next_grads_target_log_prob_in_subtree

            # Continue the NUTS trajectory if the tree-building did not terminate, and
            # if the reverse-most and forward-most states do not exhibit a U-turn.
            has_no_u_turn = tf.logical_and(
                _has_no_u_turn(forward_state, reverse_state, forward_momentum),
                _has_no_u_turn(forward_state, reverse_state, reverse_momentum))
            continue_trajectory = continue_trajectory and has_no_u_turn
            # The candidate count and depth are updated after the acceptance
            # test above, which therefore sees the previous `num_states`.
            num_states += num_states_in_subtree
            depth += 1

        return next_state, next_target_log_prob, next_grads_target_log_prob
Example #20
0
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}"):
    """Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes.

    Trains a small dense classifier for two epochs with a hand-written
    `tf.GradientTape` loop inside a `SagemakerSimulator`, then checks that an
    smdebug hook exists and that steps, tensors and the "losses" collection
    were written to `sim.out_dir`.

    Args:
        script_mode: If True, a `smd.KerasHook` is created explicitly, the tape
            is wrapped with `hook.wrap_tape` and the accuracy is recorded
            through the hook. If False, no hook is created here and a plain
            `tf.GradientTape` is used (the zero-code-change path).
        json_file_contents: Forwarded to `SagemakerSimulator`. In script mode
            the default "{}" builds the hook directly on `sim.out_dir`; any
            other value builds it with `smd.KerasHook.create_from_json_file()`.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()

    def close_hook_and_check_saved(hook, out_dir):
        """Close `hook` and assert that it wrote steps, tensors and losses."""
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="losses")) > 0

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28,
                                                 1)),  # WA for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255,
                     tf.float32), tf.cast(y_train, tf.int64)))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        # The last layer already applies softmax, so the loss receives
        # probabilities; `from_logits=True` would apply softmax a second time.
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 2
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir,
                                     export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    # Keep the integer labels for the sparse accuracy metric;
                    # the loss needs the one-hot form.
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        probs = model(data, training=True)  # (batch, 10)
                        loss_value = cce(labels, probs)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, probs)
                    hook.record_tensor_value(tensor_name="accuracy",
                                             tensor_value=acc)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            close_hook_and_check_saved(smd.get_hook(), sim.out_dir)
        else:
            # ZCC support added from smdebug v0.8.0)
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        probs = model(data, training=True)  # (batch, 10)
                        loss_value = cce(labels, probs)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, probs)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            if not is_tf_2_2():
                assert not hook  # only supported on TF 2.2 and greater
                return
            close_hook_and_check_saved(hook, sim.out_dir)
  def solve_nu_zeta(self,
                    dataset: dataset_lib.OffpolicyDataset,
                    target_policy: tf_policy.TFPolicy,
                    regularizer: float = 1e-6):
    """Performs one update of the nu / zeta tables and of alpha.

    On the first call (detected by `self` having no `_td_mat` attribute) the
    dataset is flattened into per-step tensors and a set of tabular quantities
    is cached on `self`. Every call then:

      1. evaluates a saddle-point loss from the current `_nu`/`_zeta` and
         `_nu2`/`_zeta2` tables,
      2. runs a 16-step bisection on `self._alpha` driven by the sign of the
         divergence violation,
      3. takes a damped (factor 0.1) Newton-style step on `_nu` and `_nu2`,
      4. moves `_zeta` and `_zeta2` a fraction 0.1 towards freshly computed
         values.

    Args:
      dataset: The dataset to sample experience from. Only read on the first
        call.
      target_policy: The policy whose value we want to estimate. Only read on
        the first call.
      regularizer: A small constant, multiplied by the identity and added to
        the Hessians before the linear solve.

    Returns:
      A list `[avg_saddle_loss * sign, avg_loss * sign, divergence]`, where
      `sign` is `self._algae_alpha_sign`.
    """

    # One-time setup. `_td_mat` doubles as the "already initialized" marker.
    if not hasattr(self, '_td_mat'):
      # Set up env_steps.
      episodes, valid_steps = dataset.get_all_episodes(
          limit=self._limit_episodes)
      total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1
      num_episodes = tf.shape(valid_steps)[0]
      num_samples = num_episodes * total_num_steps_per_episode
      # Keep transitions whose first step is valid and has positive discount.
      valid_and_not_last = tf.logical_and(valid_steps, episodes.discount > 0)
      valid_indices = tf.squeeze(
          tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1])))

      # First step of each episode, repeated once per transition.
      initial_env_step = tf.nest.map_structure(
          lambda t: tf.squeeze(
              tf.reshape(
                  tf.repeat(
                      t[:, 0:1, ...],
                      axis=1,
                      repeats=total_num_steps_per_episode), [num_samples, -1])),
          episodes)
      initial_env_step = tf.nest.map_structure(
          lambda t: tf.gather(t, valid_indices), initial_env_step)
      tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep(
          initial_env_step)

      # Steps 0 .. T-1 of every episode, flattened and filtered.
      env_step = tf.nest.map_structure(
          lambda t: tf.squeeze(
              tf.reshape(t[:, 0:total_num_steps_per_episode, ...],
                         [num_samples, -1])), episodes)
      env_step = tf.nest.map_structure(lambda t: tf.gather(t, valid_indices),
                                       env_step)
      tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(env_step)

      # Steps 1 .. T of every episode, i.e. the successor of each `env_step`.
      next_env_step = tf.nest.map_structure(
          lambda t: tf.squeeze(
              tf.reshape(t[:, 1:total_num_steps_per_episode + 1, ...],
                         [num_samples, -1])), episodes)
      next_env_step = tf.nest.map_structure(
          lambda t: tf.gather(t, valid_indices), next_env_step)
      tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep(
          next_env_step)

      # get probabilities
      initial_target_probs = target_policy.distribution(
          tfagents_initial_env_step).action.probs_parameter()
      next_target_probs = target_policy.distribution(
          tfagents_next_env_step).action.probs_parameter()

      # First, get the nu_loss and data weights
      #current_nu_loss = self._get_nu_loss(initial_env_step, env_step,
      #                                    next_env_step, target_policy)
      #data_weight, _ = self._get_weights(current_nu_loss)

      # # debug only and to reproduce dual dice result, DELETE
      # data_weight = tf.ones_like(data_weight)

      state_action_count = self._get_state_action_counts(env_step)
      # NOTE(review): `counts` is not read anywhere later in this method.
      counts = tf.reduce_sum(tf.one_hot(state_action_count, self._dimension), 0)
      gamma_sample = tf.pow(self._gamma, tf.cast(env_step.step_num, tf.float32))

      # # debug only and to reproduce dual dice result, DELETE
      # gamma_sample = tf.ones_like(gamma_sample)

      # now we need to expand_dims to include action space in extra dimensions
      #data_weights = tf.reshape(data_weight, [-1, self._num_limits])
      # both are data sample weights for L2 problem, needs to be normalized later
      #gamma_data_weights = tf.reshape(gamma_sample, [-1, 1]) * data_weights

      # Pair every initial observation with every action index.
      initial_states = tf.tile(
          tf.reshape(initial_env_step.observation, [-1, 1]),
          [1, self._num_actions])
      initial_actions = tf.tile(
          tf.reshape(tf.range(self._num_actions), [1, -1]),
          [initial_env_step.observation.shape[0], 1])
      initial_nu_indices = self._get_index(initial_states, initial_actions)

      # linear term w.r.t. initial distribution
      #b_vec_2 = tf.stack([
      #    tf.reduce_sum(
      #        tf.reshape(
      #            data_weights[:, itr] / tf.reduce_sum(data_weights[:, itr]),
      #            [-1, 1]) * tf.reduce_sum(
      #                tf.one_hot(initial_nu_indices, self._dimension) *
      #                (1 - self._gamma) *
      #                tf.expand_dims(initial_target_probs, axis=-1),
      #                axis=1),
      #        axis=0) for itr in range(self._num_limits)
      #],
      #                   axis=0)

      # Same pairing for the successor observations.
      next_states = tf.tile(
          tf.reshape(next_env_step.observation, [-1, 1]),
          [1, self._num_actions])
      next_actions = tf.tile(
          tf.reshape(tf.range(self._num_actions), [1, -1]),
          [next_env_step.observation.shape[0], 1])
      next_nu_indices = self._get_index(next_states, next_actions)
      # Absorbing successors get index -1; presumably this is so that the
      # `tf.one_hot` below yields an all-zero row for them -- TODO confirm.
      next_nu_indices = tf.where(
          tf.expand_dims(next_env_step.is_absorbing(), -1),
          -1 * tf.ones_like(next_nu_indices), next_nu_indices)

      nu_indices = self._get_index(env_step.observation, env_step.action)

      target_log_probabilities = target_policy.distribution(
          tfagents_env_step).action.log_prob(env_step.action)
      # Importance ratio target/behavior, or all ones when solving for the
      # state-action ratio.
      if not self._solve_for_state_action_ratio:
        policy_ratio = tf.exp(target_log_probabilities -
                              env_step.get_log_probability())
      else:
        policy_ratio = tf.ones([
            target_log_probabilities.shape[0],
        ])
      policy_ratios = tf.tile(
          tf.reshape(policy_ratio, [-1, 1]), [1, self._num_actions])

      # the tabular feature vector
      # one_hot(current index) minus gamma times the probability- and
      # ratio-weighted sum of one_hot(next indices) over actions.
      a_vec = tf.one_hot(nu_indices, self._dimension) - tf.reduce_sum(
          self._gamma *
          tf.expand_dims(next_target_probs * policy_ratios, axis=-1) *
          tf.one_hot(next_nu_indices, self._dimension),
          axis=1)

      # linear term w.r.t. reward
      #b_vec_1 = tf.stack([
      #    tf.reduce_sum(
      #        tf.reshape(
      #            (gamma_data_weights[:, itr] /
      #             tf.reduce_sum(gamma_data_weights[:, itr])) * self._reward_fn(env_step), #/
      #            #tf.cast(state_action_count, tf.float32),
      #            [-1, 1]) * a_vec,
      #        axis=0) for itr in range(self._num_limits)
      #],
      #                   axis=0)
      # quadratic term of feature
      # Get weighted outer product by using einsum to save computing resource!
      #a_mat = tf.stack([
      #    tf.einsum(
      #        'ai, a, aj -> ij', a_vec,
      #        #1.0 / tf.cast(state_action_count, tf.float32),
      #        gamma_data_weights[:, itr] /
      #        tf.reduce_sum(gamma_data_weights[:, itr]),
      #        a_vec)
      #    for itr in range(self._num_limits)
      #],
      #                 axis=0)

      # Count-normalized sum over samples of one_hot(index) (outer) a_vec.
      td_mat = tf.einsum('ai, a, aj -> ij',
                         tf.one_hot(nu_indices, self._dimension),
                         1.0 / tf.cast(state_action_count, tf.float32), a_vec)

      weighted_rewards = policy_ratio * self._reward_fn(env_step)

      # Count-normalized sum of the weighted rewards per table index.
      bias = tf.reduce_sum(
          tf.one_hot(nu_indices, self._dimension) *
          tf.reshape(weighted_rewards, [-1, 1]) * 1.0 /
          tf.cast(state_action_count, tf.float32)[:, None],
          axis=0)

      # Initialize
      # Every column of `_nu` / `_nu2` starts out equal to `bias`.
      self._nu = np.ones_like(self._nu) * bias[:, None]
      self._nu2 = np.ones_like(self._nu2) * bias[:, None]

      self._a_vec = a_vec
      self._td_mat = td_mat
      self._bias = bias
      self._weighted_rewards = weighted_rewards
      self._state_action_count = state_action_count
      self._nu_indices = nu_indices
      self._initial_nu_indices = initial_nu_indices
      self._initial_target_probs = initial_target_probs
      # NOTE(review): the first assignment is immediately overwritten, so the
      # per-sample discount weights used below are all ones.
      self._gamma_sample = gamma_sample
      self._gamma_sample = tf.ones_like(gamma_sample)

    # Saddle-point loss from the current tables; first the (nu, zeta) pair.
    saddle_bellman_residuals = (
        tf.matmul(self._a_vec, self._nu) - self._weighted_rewards[:, None])
    saddle_bellman_residuals *= -1 * self._algae_alpha_sign
    saddle_zetas = tf.gather(self._zeta, self._nu_indices)
    saddle_initial_nu_values = tf.reduce_sum(  # Average over actions.
        self._initial_target_probs[:, :, None] *
        tf.gather(self._nu, self._initial_nu_indices),
        axis=1)
    saddle_init_nu_loss = ((1 - self._gamma) * saddle_initial_nu_values *
                           self._algae_alpha_sign)

    # The same terms for the (nu2, zeta2) pair, with the signs flipped.
    saddle_bellman_residuals2 = (
        tf.matmul(self._a_vec, self._nu2) - self._weighted_rewards[:, None])
    saddle_bellman_residuals2 *= 1 * self._algae_alpha_sign
    saddle_zetas2 = tf.gather(self._zeta2, self._nu_indices)
    saddle_initial_nu_values2 = tf.reduce_sum(  # Average over actions.
        self._initial_target_probs[:, :, None] *
        tf.gather(self._nu2, self._initial_nu_indices),
        axis=1)
    saddle_init_nu_loss2 = ((1 - self._gamma) * saddle_initial_nu_values2 * -1 *
                            self._algae_alpha_sign)

    saddle_loss = 0.5 * (
        saddle_init_nu_loss + saddle_bellman_residuals * saddle_zetas +
        -tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas) +
        -saddle_init_nu_loss2 + -saddle_bellman_residuals2 * saddle_zetas2 +
        tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas2))
    # Binary search to find best alpha.
    # Each of the two entries is bisected independently on [-8, 32]: the lower
    # bound moves up while the divergence exceeds `_two_sided_limit`.
    left = tf.constant([-8., -8.])
    right = tf.constant([32., 32.])
    for _ in range(16):
      mid = 0.5 * (left + right)
      self._alpha.assign(mid)
      weights, log_weights = self._get_weights(saddle_loss *
                                               self._gamma_sample[:, None])

      divergence = self._compute_divergence(weights, log_weights)
      divergence_violation = divergence - self._two_sided_limit
      left = tf.where(divergence_violation > 0., mid, left)
      right = tf.where(divergence_violation > 0., right, mid)
    self._alpha.assign(0.5 * (left + right))
    # Recompute the weights at the final alpha.
    weights, log_weights = self._get_weights(saddle_loss *
                                             self._gamma_sample[:, None])

    gamma_data_weights = tf.stop_gradient(weights * self._gamma_sample[:, None])
    #print(tf.concat([gamma_data_weights, saddle_loss], axis=-1))
    avg_saddle_loss = (
        tf.reduce_sum(gamma_data_weights * saddle_loss, axis=0) /
        tf.reduce_sum(gamma_data_weights, axis=0))

    # Sum of the sample weights per table index, gathered back per sample.
    weighted_state_action_count = tf.reduce_sum(
        tf.one_hot(self._nu_indices, self._dimension)[:, :, None] *
        weights[:, None, :],
        axis=0)
    weighted_state_action_count = tf.gather(weighted_state_action_count,
                                            self._nu_indices)
    # Weighted counterparts of `td_mat` / `bias`, with one slice along the
    # leading axis for each column of `weights`.
    my_td_mat = tf.einsum(
        'ai, ab, ab, aj -> bij',
        tf.one_hot(self._nu_indices, self._dimension),
        #1.0 / tf.cast(self._state_action_count, tf.float32),
        1.0 / weighted_state_action_count,
        weights,
        self._a_vec)
    my_bias = tf.reduce_sum(
        tf.transpose(weights)[:, :, None] *
        tf.one_hot(self._nu_indices, self._dimension)[None, :, :] *
        tf.reshape(self._weighted_rewards, [1, -1, 1]) *
        #1.0 / tf.cast(self._state_action_count, tf.float32)[None, :, None],
        1.0 / tf.transpose(weighted_state_action_count)[:, :, None],
        axis=1)

    #print('hello', saddle_initial_nu_values[:1], saddle_zetas[:3],
    #      self._nu[:2], my_bias[:, :2], saddle_loss[:4])

    # The tape is persistent because it is queried several times (two
    # gradients inside the context, two jacobians after it). Only the
    # explicitly watched objects are tracked.
    with tf.GradientTape(
        watch_accessed_variables=False, persistent=True) as tape:
      tape.watch([self._nu, self._nu2, self._alpha])
      bellman_residuals = tf.matmul(
          my_td_mat,
          tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
      bellman_residuals = tf.transpose(tf.squeeze(bellman_residuals, -1))
      bellman_residuals = tf.gather(bellman_residuals, self._nu_indices)
      initial_nu_values = tf.reduce_sum(  # Average over actions.
          self._initial_target_probs[:, :, None] *
          tf.gather(self._nu, self._initial_nu_indices),
          axis=1)

      bellman_residuals *= self._algae_alpha_sign

      init_nu_loss = ((1 - self._gamma) * initial_nu_values *
                      self._algae_alpha_sign)

      nu_loss = (
          tf.math.square(bellman_residuals) / 2.0 +
          tf.math.abs(self._algae_alpha) * init_nu_loss)

      loss = (
          gamma_data_weights * nu_loss /
          tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

      # Same construction for `_nu2`, with the signs flipped.
      bellman_residuals2 = tf.matmul(
          my_td_mat,
          tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None]
      bellman_residuals2 = tf.transpose(tf.squeeze(bellman_residuals2, -1))
      bellman_residuals2 = tf.gather(bellman_residuals2, self._nu_indices)
      initial_nu_values2 = tf.reduce_sum(  # Average over actions.
          self._initial_target_probs[:, :, None] *
          tf.gather(self._nu2, self._initial_nu_indices),
          axis=1)

      bellman_residuals2 *= -1 * self._algae_alpha_sign

      init_nu_loss2 = ((1 - self._gamma) * initial_nu_values2 * -1 *
                       self._algae_alpha_sign)

      nu_loss2 = (
          tf.math.square(bellman_residuals2) / 2.0 +
          tf.math.abs(self._algae_alpha) * init_nu_loss2)

      loss2 = (
          gamma_data_weights * nu_loss2 /
          tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True))

      divergence = self._compute_divergence(weights, log_weights)
      divergence_violation = divergence - self._two_sided_limit

      # NOTE(review): `alpha_loss` is only referenced by the commented-out
      # optimizer code further down.
      alpha_loss = (-tf.exp(self._alpha) *
                    tf.stop_gradient(divergence_violation))

      # Penalize the squared entries of the last row of each table.
      extra_loss = tf.reduce_sum(tf.math.square(self._nu[-1, :]))
      extra_loss2 = tf.reduce_sum(tf.math.square(self._nu2[-1, :]))
      # The gradients are taken inside the tape context so that they are
      # themselves recorded and can be differentiated again below.
      nu_grad = tape.gradient(loss + extra_loss, [self._nu])[0]
      nu_grad2 = tape.gradient(loss2 + extra_loss2, [self._nu2])[0]
    avg_loss = tf.reduce_sum(
        0.5 * (loss - loss2) / tf.math.abs(self._algae_alpha), axis=0)
    # Second derivatives: keep, for each limit i, the block of the jacobian
    # that pairs column i of the gradient with column i of the table.
    nu_jacob = tape.jacobian(nu_grad, [self._nu])[0]
    nu_hess = tf.stack([nu_jacob[:, i, :, i] for i in range(self._num_limits)],
                       axis=0)

    nu_jacob2 = tape.jacobian(nu_grad2, [self._nu2])[0]
    nu_hess2 = tf.stack(
        [nu_jacob2[:, i, :, i] for i in range(self._num_limits)], axis=0)

    for idx, div in enumerate(divergence):
      tf.summary.scalar('divergence%d' % idx, div)

    #alpha_grads = tape.gradient(alpha_loss, [self._alpha])
    #alpha_grad_op = self._alpha_optimizer.apply_gradients(
    #    zip(alpha_grads, [self._alpha]))
    #self._alpha.assign(tf.minimum(8., tf.maximum(-8., self._alpha)))

    #print(self._alpha, tf.concat([weights, nu_loss], -1))
    #regularizer = 0.1
    # Solve (hess + regularizer * I) step = -grad for each limit, then move the
    # table by 0.1 of that step.
    nu_transformed = tf.transpose(
        tf.squeeze(
            tf.linalg.solve(nu_hess + regularizer * tf.eye(self._dimension),
                            tf.expand_dims(-tf.transpose(nu_grad), axis=-1))))
    self._nu = self._nu + 0.1 * nu_transformed
    nu_transformed2 = tf.transpose(
        tf.squeeze(
            tf.linalg.solve(nu_hess2 + regularizer * tf.eye(self._dimension),
                            tf.expand_dims(-tf.transpose(nu_grad2), axis=-1))))
    self._nu2 = self._nu2 + 0.1 * nu_transformed2

    print(avg_loss * self._algae_alpha_sign,
          avg_saddle_loss * self._algae_alpha_sign, self._nu[:2], divergence)
    #print(init_nu_loss[:8], init_nu_loss[-8:])
    #print(bellman_residuals[:8])
    #print(self._nu[:3], self._zeta[:3])

    # Recompute the zeta targets from the updated tables and move the stored
    # zetas 10% of the way towards them.
    zetas = tf.matmul(my_td_mat,
                      tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None]
    zetas = tf.transpose(tf.squeeze(zetas, -1))
    zetas *= -self._algae_alpha_sign
    zetas /= tf.math.abs(self._algae_alpha)
    self._zeta = self._zeta + 0.1 * (zetas - self._zeta)

    zetas2 = tf.matmul(my_td_mat,
                       tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :,
                                                                      None]
    zetas2 = tf.transpose(tf.squeeze(zetas2, -1))
    zetas2 *= 1 * self._algae_alpha_sign
    zetas2 /= tf.math.abs(self._algae_alpha)
    self._zeta2 = self._zeta2 + 0.1 * (zetas2 - self._zeta2)

    #self._zeta = (
    #    tf.einsum('ij,ja-> ia', self._td_mat, self._nu) -
    #    tf.transpose(my_bias))
    #self._zeta *= -tf.reshape(self._algae_alpha_sign, [1, self._num_limits])
    #self._zeta /= tf.math.abs(self._algae_alpha)
    return [
        avg_saddle_loss * self._algae_alpha_sign,
        avg_loss * self._algae_alpha_sign, divergence
    ]
Example #22
0
    def test_works_like_conv2d_transpose(self, input_shape, filter_shape,
                                         channels_out, strides, padding,
                                         dilations):
        """Compares the conv fn under test with a reference transposed conv.

        The reference is `tf.nn.conv2d_transpose`, or a Keras
        `Conv2DTranspose` layer when any dilation exceeds 1. The test also
        checks that gradients with respect to input and kernel exist.
        """
        full_strides = strides
        if not self.unequal_strides_ok:
            if strides[0] != strides[1]:
                # The method under test cannot handle unequal strides, so
                # there is nothing to check for this parameterization.
                return
            strides = strides[0]

        # A singleton kernel_batch_shape avoids the short circuit to
        # `conv2d_transpose`.
        x, k = _make_input_and_kernel(self.make_input,
                                      input_batch_shape=[],
                                      input_shape=input_shape,
                                      kernel_batch_shape=[1],
                                      filter_shape=filter_shape,
                                      channels_out=channels_out,
                                      dtype=self.dtype)

        output_shape, reference_strides = convolution_util._get_output_shape(
            rank=2,
            strides=full_strides,
            padding=padding,
            dilations=dilations,
            input_shape=input_shape,
            output_size=channels_out,
            filter_shape=filter_shape)

        # Reshape the kernel to [*filter_shape, channels_in, channels_out] and
        # then swap the last two axes for the reference ops.
        unflattened_shape = ps.concat(
            [filter_shape, [input_shape[-1], channels_out]], axis=0)
        tf_kernel = tf.transpose(tf.reshape(k, unflattened_shape),
                                 perm=[0, 1, 3, 2])

        has_dilation = any(d > 1 for d in dilations)
        if not has_dilation:
            y_expected = tf.nn.conv2d_transpose(x,
                                                tf_kernel,
                                                output_shape=output_shape,
                                                strides=reference_strides,
                                                padding=padding,
                                                dilations=dilations)
        else:
            # conv2d_transpose does not support dilations > 1; use Keras
            # instead.
            reference_layer = tf.keras.layers.Conv2DTranspose(
                filters=channels_out,
                kernel_size=filter_shape,
                strides=strides,
                padding=padding,
                dilation_rate=dilations,
                use_bias=False)
            reference_layer(x)  # First call builds the layer's kernel.
            reference_layer.kernel = tf_kernel
            y_expected = reference_layer(x)

        conv_fn = self.make_conv_fn(filter_shape, strides, padding, dilations)
        with tf.GradientTape() as tape:
            tape.watch([x, k])
            y_actual = conv_fn(x, k)
        self.assertAllNotNone(tape.gradient(y_actual, [x, k]))

        expected_value, actual_value = self.evaluate([y_expected, y_actual])
        self.assertAllClose(expected_value, actual_value, rtol=1e-5, atol=0)
Example #23
0
def pass_arg(Xx, nsim, tr_size, num_iter):
    """Fit GP kernel hyperparameters with a physics penalty and sample at `Xx`.

    The amplitude and the two per-feature length scales of an input-scaled
    ExponentiatedQuadratic kernel are optimised with Adam. The objective is
    the negative joint log-probability of the training targets plus `lam`
    times a penalty on GP samples at the unlabeled points that fall below 0
    or above the initial porosity.

    Reads '../data/labeled_data.dat' and
    '../data/unlabeled_data_BK_constw_v2_1525.dat' relative to the working
    directory.

    Args:
        Xx: index points handed to the final GaussianProcessRegressionModel.
            NOTE(review): presumably already scaled like the training
            inputs -- confirm against the caller.
        nsim: number of samples to draw (converted with `int`).
        tr_size: number of leading labeled rows used for training
            (converted with `int`).
        num_iter: number of Adam steps (converted with `int`).

    Returns:
        NumPy array holding `gprm.sample(nsim)` with axis 1 squeezed out.
    """
    print("Tr_size:", tr_size)

    def fix_seeds(seed):
        """Seed the Python, NumPy and TF RNGs and install a 1-thread session."""
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,
                                                inter_op_parallelism_threads=1)
        sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(),
                                    config=session_conf)
        #     K.set_session(sess)
        tf.compat.v1.keras.backend.set_session(sess)

    ss = 1
    fix_seeds(ss)

    # Compute the RMSE given the ground truth (y_true) and the predictions(y_pred)
    def root_mean_squared_error(y_true, y_pred):
        return tf.math.sqrt(
            tf.math.reduce_mean(tf.math.square(y_pred - y_true), axis=-1))

    class InputTransformedKernel(
            tfp.math.psd_kernels.PositiveSemidefiniteKernel):
        """Kernel that applies `transformation` to both inputs of `kernel`."""

        def __init__(self,
                     kernel,
                     transformation,
                     name='InputTransformedKernel'):
            self._kernel = kernel
            self._transformation = transformation
            super(InputTransformedKernel,
                  self).__init__(feature_ndims=kernel.feature_ndims,
                                 dtype=kernel.dtype,
                                 name=name)

        def apply(self, x1, x2):
            return self._kernel.apply(self._transformation(x1),
                                      self._transformation(x2))

        def matrix(self, x1, x2):
            return self._kernel.matrix(self._transformation(x1),
                                       self._transformation(x2))

        @property
        def batch_shape(self):
            return self._kernel.batch_shape

        def batch_shape_tensor(self):
            # `batch_shape_tensor` is a method on the wrapped kernel (unlike
            # the `batch_shape` property above), so it has to be called;
            # returning the bound method would hand callers a function
            # object instead of a shape tensor.
            return self._kernel.batch_shape_tensor()

    class InputScaledKernel(InputTransformedKernel):
        """Kernel whose inputs are divided by per-feature `length_scales`."""

        def __init__(self, kernel, length_scales):
            super(InputScaledKernel, self).__init__(
                kernel, lambda x: x / tf.expand_dims(
                    length_scales, -(kernel.feature_ndims + 1)))

    # Load labeled data
    data = np.loadtxt('../data/labeled_data.dat')
    # Features: the first two columns.
    x_labeled = data[:, :2].astype(np.float64)
    # Target: the second-to-last column only; the last column is left out
    # (per the original note, porosity predictions are not needed here).
    y_labeled = data[:, -2:-1].astype(np.float64)

    # normalize dataset with MinMaxScaler
    scaler = preprocessing.MinMaxScaler(feature_range=(0.0, 1.0))
    x_labeled = scaler.fit_transform(x_labeled)
    # y_labeled = scaler.fit_transform(y_labeled)

    tr_size = int(tr_size)

    # train and test data
    trainX, trainY = x_labeled[:tr_size, :], y_labeled[:tr_size]
    # testX, testY = x_labeled[tr_size:,:], y_labeled[tr_size:]

    # Targets become a single row of shape (1, tr_size).
    trainY = np.transpose(trainY)
    # testY = np.transpose(testY)

    data_phyloss = np.loadtxt('../data/unlabeled_data_BK_constw_v2_1525.dat')
    x_unlabeled = data_phyloss[:, :]

    # initial porosity
    initporo = x_unlabeled[:, -1]

    # Keep the first 1303 rows and the last 6 rows, first two columns only.
    x_unlabeled1 = x_unlabeled[:1303, :2]
    x_unlabeled2 = x_unlabeled[-6:, :2]
    x_unlabeled = np.vstack((x_unlabeled1, x_unlabeled2))

    # NOTE(review): this re-fits the scaler on the unlabeled inputs, so they
    # are not scaled with the same min/max as the labeled data -- confirm
    # this is intended (otherwise `scaler.transform` would be the call).
    x_unlabeled = scaler.fit_transform(x_unlabeled)
    init_poro1 = initporo[:1303]
    init_poro2 = initporo[-6:]
    init_poro = np.hstack((init_poro1, init_poro2))

    def build_gp(amplitude, length_scale):
        """Defines the conditional dist. of GP outputs, given kernel parameters."""

        # Create the covariance kernel, which will be shared between the prior (which we
        # use for maximum likelihood training) and the posterior (which we use for
        # posterior predictive sampling)
        se_kernel = tfk.ExponentiatedQuadratic(
            amplitude)  # length_scale = None here, implicitly

        # This is the "ARD" kernel (we don't like abbreviations or bizarrely obscure names in
        # TFP, so we're probably going to call this "InputScaledKernel" since....that's what it is! :)
        kernel = InputScaledKernel(se_kernel, length_scale)

        # Create the GP prior distribution, which we will use to train the model
        # parameters.
        return tfd.GaussianProcess(kernel=kernel, index_points=trainX)

    # Log-normal priors on the kernel parameters; `build_gp` supplies the
    # likelihood of the observations given those parameters.
    gp_joint_model = tfd.JointDistributionNamedAutoBatched({
        'amplitude':
        tfd.TransformedDistribution(distribution=tfd.Normal(
            loc=0., scale=np.float64(1.)),
                                    bijector=tfb.Exp(),
                                    batch_shape=[1]),
        'length_scale':
        tfd.TransformedDistribution(distribution=tfd.Normal(
            loc=0., scale=np.float64(1.)),
                                    bijector=tfb.Exp(),
                                    batch_shape=[2]),
        'observations':
        build_gp,
    })

    # Create the trainable model parameters, which we'll subsequently optimize.
    # Note that we constrain them to be strictly positive.
    constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())

    amplitude_var = tfp.util.TransformedVariable(
        initial_value=np.random.uniform(size=1),
        bijector=constrain_positive,
        name='amplitude',
        dtype=np.float64)

    length_scale_var = tfp.util.TransformedVariable(
        initial_value=np.random.uniform(size=[2]),
        bijector=constrain_positive,
        name='length_scale',
        dtype=np.float64)

    trainable_variables = [
        v.trainable_variables[0] for v in [amplitude_var, length_scale_var]
    ]

    @tf.function(autograph=False, experimental_compile=False)
    def target_log_prob(amplitude, length_scale, poroi, lam):
        """Return the training loss (despite the name, this is minimised).

        The value is `lam` times the mean physics violation of one GP sample
        at the unlabeled points, minus the joint log-probability of the
        parameters and the training targets.
        """
        tf.random.set_seed(1234)
        se_kernel = tfk.ExponentiatedQuadratic(
            amplitude)  # length_scale = None here, implicitly
        optimized_kernel = InputScaledKernel(se_kernel, length_scale)
        gprm = tfd.GaussianProcessRegressionModel(kernel=optimized_kernel,
                                                  index_points=x_unlabeled)
        samples = gprm.sample(1)
        pred = tf.squeeze(samples, axis=0)

        # Penalise predictions below 0 and predictions above `poroi`.
        phyloss_poro = tf.math.reduce_mean(
            tf.nn.relu(tf.negative(pred)) + tf.nn.relu(pred - poroi))

        #     print("phyloss_poro:",lam*phyloss_poro)
        #     return lam*phyloss_poro
        return lam * phyloss_poro - gp_joint_model.log_prob(
            {
                'amplitude': amplitude,
                'length_scale': length_scale,
                'observations': trainY
            })

    fix_seeds(1)

    # Optimize the model parameters.
    num_iters = int(num_iter)
    lam = 100000
    optimizer = tf.optimizers.Adam(learning_rate=.1)

    # Store the likelihood values during training, so we can plot the progress
    lls_ = np.zeros(num_iters, np.float64)

    for i in range(num_iters):
        with tf.GradientTape() as tape:
            loss = target_log_prob(amplitude_var, length_scale_var, init_poro,
                                   lam)  # physics loss & normal loss

        # print(i,"loss_inloop:",loss)
        grads = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(grads, trainable_variables))
        lls_[i] = loss

    # print('Trained parameters:')
    # print('amplitude: {}'.format(amplitude_var._value().numpy()))
    # print('length_scale: {}'.format(length_scale_var._value().numpy()))

    # tf.random.set_seed(1234)
    fix_seeds(1)
    se_kernel = tfk.ExponentiatedQuadratic(
        amplitude_var)  # length_scale = None here, implicitly
    optimized_kernel = InputScaledKernel(se_kernel, length_scale_var)
    # NOTE(review): no observations are passed here, so the samples depend on
    # the training data only through the optimised kernel parameters --
    # confirm that conditioning on (trainX, trainY) is not wanted.
    gprm = tfd.GaussianProcessRegressionModel(kernel=optimized_kernel,
                                              index_points=Xx)
    preds = gprm.sample(int(nsim))
    samples = np.array(tf.squeeze(preds, axis=1))

    return samples
Example #24
0
def compute_and_plot_saliency(model, image_path):
    """
    Compute and plot the saliency map of an image for its top-scoring class.

    Implements the matrix M detailed in section 3.1 of
    K. Simonyan, A. Vedaldi, and A. Zisserman,
    "Deep inside convolutional networks: Visualising image classification models and saliency maps,"
    2013, available at https://arxiv.org/abs/1312.6034:
    the gradient of the winning class score with respect to the input pixels,
    reduced to one value per pixel by taking the largest absolute value over
    the colour channels.

    The figure is saved to ../plots/saliency.png and then shown.

    :param model: Model which is used; must contain a layer named 'classifier'
    :param image_path: Path to the image to be analysed
    :return: None
    """
    raw_image = tf.dtypes.cast(decode_jpeg(image_path), tf.float32)

    # Differentiate the un-normalised class scores produced by the
    # 'classifier' layer rather than the model's final output.
    logits_tensor = model.get_layer('classifier')
    logits_model = tf.keras.Model(model.input, logits_tensor.output)

    with tf.GradientTape() as t:
        # raw_image is a plain tensor, not a Variable, so the tape only
        # records operations on it once it is explicitly watched.
        t.watch(raw_image)
        # NOTE(review): assumes the network accepts the decoded image with
        # only a leading batch axis added (no resizing / normalisation) and
        # that the 'classifier' output is (1, num_classes) -- TODO confirm
        # against the preprocessing used when the model was trained.
        logits = logits_model(tf.expand_dims(raw_image, axis=0))
        class_scores = logits[0]
        top_class = int(tf.argmax(class_scores))
        top_class_score = class_scores[top_class]

    # w = d(score of top class) / d(image); same shape as raw_image.
    pixel_grads = t.gradient(top_class_score, raw_image)
    # M[i, j] = max over channels c of |w[i, j, c]|.
    M = tf.reduce_max(tf.abs(pixel_grads), axis=-1).numpy()

    plt.subplot(2, 1, 1)
    plt.imshow(M)
    plt.title('Saliency with respect to predicted class %s' % LABELS[top_class])
    plt.subplot(2, 1, 2)
    plt.imshow(decode_jpeg(image_path).numpy())
    plt.savefig("../plots/saliency.png")
    plt.show()


def plot_classification(image_path, classification_array):
    """Plot three per-class probability maps next to the source image.

    Panels 1-3 of a 2x2 grid show channels 0-2 of `classification_array` as
    'jet' heatmaps titled with the matching entry of LABELS; panel 4 shows
    the decoded image. The figure is saved to ../plots/detect.png and shown.

    :param image_path: Path to the image that was classified
    :param classification_array: Array indexed as [row, column, class]
    :return: None
    """
    grid_rows, grid_cols, _ = classification_array.shape
    picture = decode_jpeg(image_path).numpy()
    aspect_ratio = float(picture.shape[0]) / picture.shape[1]
    plt.figure(figsize=(8, 8 * aspect_ratio))

    # One heatmap panel per class, stretched to the image's proportions.
    for class_idx in range(3):
        panel = plt.subplot(2, 2, class_idx + 1)
        plt.imshow(classification_array[:, :, class_idx],
                   interpolation='none',
                   cmap='jet')
        plt.title('%s probability' % LABELS[class_idx])
        panel.set_aspect(aspect_ratio * grid_cols / grid_rows)
        plt.colorbar()

    # Last panel: the image itself.
    plt.subplot(2, 2, 4)
    plt.imshow(picture)
    plt.savefig("../plots/detect.png")
    plt.show()


if __name__ == '__main__':
    # Command line: --image <path> --scheme <brute|conv|saliency>.
    parser = argparse.ArgumentParser()
    for option in ('--image', '--scheme'):
        parser.add_argument(option, type=str)
    FLAGS, _ = parser.parse_known_args()
    maybe_makedirs("../plots")

    model = tf.keras.models.load_model('./trained_models/trained.h5')

    # Schemes that yield a classification array; wrapped in lambdas so only
    # the selected one is evaluated.
    classifiers = {
        'brute': lambda: compute_brute_force_classification(
            model, FLAGS.image, 8, 8),
        'conv': lambda: compute_convolutional_KxK_classification(
            model, FLAGS.image),
    }

    if FLAGS.scheme in classifiers:
        plot_classification(FLAGS.image, classifiers[FLAGS.scheme]())
    elif FLAGS.scheme == 'saliency':
        compute_and_plot_saliency(model, FLAGS.image)
    else:
        print('Unrecognized scheme:', FLAGS.scheme)
Example #25
0
    def testDistribution(self, dist_name, data):
        """Checks variable handling and gradients of a drawn distribution.

        Draws two distributions of type `dist_name` with broadcast-compatible
        batch shapes and verifies, in order: that Variables are exposed
        unchanged through the accessor properties; that statistics, `sample`,
        `kl_divergence` and the probability evaluators do not convert
        parameters to tensors more often than permitted; and that `sample`,
        `kl_divergence` and `log_prob` yield non-None gradients with respect
        to the distribution's variables, except where a skip/blocklist
        constant exempts `dist_name`.

        Args:
          dist_name: name forwarded to `dhps.distributions`.
          data: object providing `draw` for the strategies used below
            (presumably Hypothesis' interactive data object -- confirm
            against the decorator on this test).
        """
        seed = test_util.test_seed()
        # Explicitly draw event_dim here to avoid relying on _params_event_ndims
        # later, so this test can support distributions that do not implement the
        # slicing protocol.
        event_dim = data.draw(hps.integers(min_value=2, max_value=6))
        dist = data.draw(
            dhps.distributions(dist_name=dist_name,
                               event_dim=event_dim,
                               enable_vars=True))
        batch_shape = dist.batch_shape
        # Second distribution of the same type, used only for the KL checks.
        batch_shape2 = data.draw(
            tfp_hps.broadcast_compatible_shape(batch_shape))
        dist2 = data.draw(
            dhps.distributions(dist_name=dist_name,
                               batch_shape=batch_shape2,
                               event_dim=event_dim,
                               enable_vars=True))
        self.evaluate([var.initializer for var in dist.variables])

        # Check that the distribution passes Variables through to the accessor
        # properties (without converting them to Tensor or anything like that).
        for k, v in six.iteritems(dist.parameters):
            if not tensor_util.is_ref(v):
                continue
            self.assertIs(getattr(dist, k), v)

        # Check that standard statistics do not read distribution parameters more
        # than twice (once in the stat itself and up to once in any validation
        # assertions).
        max_permissible = 2 + extra_tensor_conversions_allowed(dist)
        # Three of the six statistics are drawn per example; sorting keeps the
        # iteration order independent of set ordering.
        for stat in sorted(
                data.draw(
                    hps.sets(hps.one_of(
                        map(hps.just, [
                            'covariance', 'entropy', 'mean', 'mode', 'stddev',
                            'variance'
                        ])),
                             min_size=3,
                             max_size=3))):
            hp.note('Testing excessive var usage in {}.{}'.format(
                dist_name, stat))
            try:
                with tfp_hps.assert_no_excessive_var_usage(
                        'statistic `{}` of `{}`'.format(stat, dist),
                        max_permissible=max_permissible):
                    getattr(dist, stat)()

            except NotImplementedError:
                # Not every distribution implements every statistic.
                pass

        # Check that `sample` doesn't read distribution parameters more than twice,
        # and that it produces non-None gradients (if the distribution is fully
        # reparameterized).
        with tf.GradientTape() as tape:
            # TDs do bijector assertions twice (once by distribution.sample, and once
            # by bijector.forward).
            max_permissible = 2 + extra_tensor_conversions_allowed(dist)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `sample` of `{}`'.format(dist),
                    max_permissible=max_permissible):
                sample = dist.sample(seed=seed)
        if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
            grads = tape.gradient(sample, dist.variables)
            for grad, var in zip(grads, dist.variables):
                # Strip trailing digits, '_' and ':' (e.g. a ':0' suffix) so the
                # name can be matched against the NO_SAMPLE_PARAM_GRADS entries.
                var_name = var.name.rstrip('_0123456789:')
                if var_name in NO_SAMPLE_PARAM_GRADS.get(dist_name, ()):
                    continue
                if grad is None:
                    raise AssertionError(
                        'Missing sample -> {} grad for distribution {}'.format(
                            var_name, dist_name))

        # Turn off validations, since TODO(b/129271256) log_prob can choke on dist's
        # own samples.  Also, to relax conversion counts for KL (might do >2 w/
        # validate_args).
        dist = dist.copy(validate_args=False)
        dist2 = dist2.copy(validate_args=False)

        # Test that KL divergence reads distribution parameters at most once, and
        # that it produces non-None gradients.
        try:
            # Both directions: KL(dist || dist2) and KL(dist2 || dist).
            for d1, d2 in (dist, dist2), (dist2, dist):
                if dist_name in SKIP_KL_CHECK_DIST_VAR_GRADS:
                    continue
                with tf.GradientTape() as tape:
                    with tfp_hps.assert_no_excessive_var_usage(
                            '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'
                            .format(d1, d1.variables, d2, d2.variables),
                            max_permissible=1
                    ):  # No validation => 1 convert per var.
                        kl = d1.kl_divergence(d2)
                wrt_vars = list(d1.variables) + list(d2.variables)
                grads = tape.gradient(kl, wrt_vars)
                for grad, var in zip(grads, wrt_vars):
                    if grad is None and dist_name not in NO_KL_PARAM_GRADS:
                        raise AssertionError(
                            'Missing KL({} || {}) -> {} grad:\n'  # pylint: disable=duplicate-string-formatting-argument
                            '{} vars: {}\n{} vars: {}'.format(
                                d1, d2, var, d1, d1.variables, d2,
                                d2.variables))
        except NotImplementedError:
            # Raised by kl_divergence if no registered KL is found.
            pass

        # Test that log_prob produces non-None gradients, except for distributions
        # on the NO_LOG_PROB_PARAM_GRADS blocklist.
        if dist_name not in NO_LOG_PROB_PARAM_GRADS:
            with tf.GradientTape() as tape:
                # stop_gradient: only the direct dependence of log_prob on the
                # parameters is checked, not the path through the sample.
                lp = dist.log_prob(tf.stop_gradient(sample))
            grads = tape.gradient(lp, dist.variables)
            for grad, var in zip(grads, dist.variables):
                if grad is None:
                    raise AssertionError(
                        'Missing log_prob -> {} grad for distribution {}'.
                        format(var, dist_name))

        # Test that all forms of probability evaluation avoid reading distribution
        # parameters more often than `max_permissible` (computed below) allows.
        for evaluative in sorted(
                data.draw(
                    hps.sets(hps.one_of(
                        map(hps.just, [
                            'log_prob', 'prob', 'log_cdf', 'cdf',
                            'log_survival_function', 'survival_function'
                        ])),
                             min_size=3,
                             max_size=3))):
            hp.note('Testing excessive var usage in {}.{}'.format(
                dist_name, evaluative))
            try:
                # No validation => 1 convert. But for TD we allow 2:
                # dist.log_prob(bijector.inverse(samp)) + bijector.ildj(samp)
                max_permissible = 2 + extra_tensor_conversions_allowed(dist)
                with tfp_hps.assert_no_excessive_var_usage(
                        'evaluative `{}` of `{}`'.format(evaluative, dist),
                        max_permissible=max_permissible):
                    getattr(dist, evaluative)(sample)
            except NotImplementedError:
                # Not every distribution implements every evaluator.
                pass
Example #26
0
    u = 0
    jds = tfd.JointDistributionSequential([
        tfd.Normal(loc=x, scale=1.),  # m
        tfd.Normal(loc=y, scale=1.),  # b
        lambda b, m: tfd.Normal(loc=m * X + b, scale=1.)  # Y
    ])

    return jds.log_prob(x, y, z)


import math

# Evaluate `logp` and its gradient at one fixed set of arguments.
print('gradient ',
      tfp.math.value_and_gradient(logp, [[1.0], 2.0, [5.0, 3.0, 2.2]]))

x = tf.Variable(0.1)
beta = tf.Variable(1.36)
q = tf.Variable(2.1)
x.assign(2.25)
with tf.GradientTape() as tape:
    # NOTE(review): this looks like the log-density of a q-Gaussian with
    # normaliser cq = sqrt(pi) * Gamma(upgamma) / (sqrt(q - 1) * Gamma(downgamma))
    # -- confirm. The Gamma functions are evaluated as exp(lgamma(.)).
    upgamma = (3 - q) / (2 * (q - 1))
    # sqrt(pi): use the full-precision constant rather than the truncated
    # 3.14, which would bias cq (and hence lpd) by roughly 0.03%.
    num = (math.pi**.5) * tf.math.exp(tf.math.lgamma(upgamma))
    downgamma = 1 / (q - 1)
    den = ((q - 1)**.5) * tf.math.exp(tf.math.lgamma(downgamma))
    cq = num / den
    pd = tf.math.pow(
        (1 - (1 - q) * beta * x**2), 1 / (1 - q)) * (beta**.5) * (1 / cq)
    lpd = tf.math.log(pd)
x.assign(4.2)
with tf.GradientTape() as tape1:
    trial = tf.math.exp(x)
# The recorded tapes are currently unused; uncomment to inspect gradients.
#print(tape.gradient(lpd,[beta,q]))
#print(tape1.gradient(trial,[x]))
Example #27
0
        def step_fn(inputs):
            """Per-replica step: forward pass, gradient update and metrics."""
            images, labels = inputs
            # Every ensemble member receives its own copy of the batch.
            images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])

            # Draw the lambdas and lay them out one row per tiled example.
            lambdas = tf.reshape(
                log_uniform_sample(per_core_batch_size, lambda_parameters),
                (FLAGS.ensemble_size * per_core_batch_size,
                 lambdas_config.dim))

            with tf.GradientTape() as tape:
                logits = model([images, lambdas], training=True)
                if FLAGS.use_bfloat16:
                    logits = tf.cast(logits, tf.float32)

                if FLAGS.use_gibbs_ce:
                    # Gibbs CE averages the single-model cross-entropies, so
                    # the labels are tiled to line up with the tiled logits.
                    labels = tf.tile(labels, [FLAGS.ensemble_size])
                    per_example_ce = (
                        tf.keras.losses.sparse_categorical_crossentropy(
                            labels, logits, from_logits=True))
                    negative_log_likelihood = tf.reduce_mean(per_example_ce)
                else:
                    # Ensemble CE consumes the untiled labels.
                    negative_log_likelihood = ensemble_crossentropy(
                        labels, logits, FLAGS.ensemble_size)
                # The L2 term is divided by the sample size (unlike the
                # uncertainty_baselines implementation).
                l2_loss = sum(model.losses) / train_sample_size
                loss = negative_log_likelihood + l2_loss
                # TPUStrategy sum-reduces gradients across replicas, so the
                # loss is pre-divided by the replica count.
                scaled_loss = loss / strategy.num_replicas_in_sync

            trainable = model.trainable_variables
            grads = tape.gradient(scaled_loss, trainable)

            def _is_fast_weight(name):
                # alpha/gamma variables that are not batch-norm parameters.
                return (('alpha' in name or 'gamma' in name)
                        and 'batch_norm' not in name)

            # Fast weights get their gradient scaled, which acts as a
            # separate learning rate for them.
            grads_and_vars = [
                (grad * FLAGS.fast_weight_lr_multiplier
                 if _is_fast_weight(var.name) else grad, var)
                for grad, var in zip(grads, trainable)
            ]
            optimizer.apply_gradients(grads_and_vars)

            probs = tf.nn.softmax(logits)
            # Shape the probabilities as one leading slice per member.
            per_probs_stacked = tf.stack(
                tf.split(probs,
                         num_or_size_splits=FLAGS.ensemble_size,
                         axis=0),
                axis=0)
            metrics['train/ece'].update_state(labels, probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, logits)
            diversity_results = um.average_pairwise_diversity(
                per_probs_stacked, FLAGS.ensemble_size)
            for metric_name, metric_value in diversity_results.items():
                metrics['train/' + metric_name].update_state(metric_value)

            if grads_and_vars:
                grads, _ = zip(*grads_and_vars)
Example #28
0
    def testTrain(self, layer_id, rng_updater_id, batch_size, trax_has_weights,
                  explicit_build, use_model):
        """Tests training (forward and backward pass) for TraxKerasLayer.

        The same three update steps are run on the bare Trax layer and on the
        wrapping Keras layer (or a Keras model built from it); outputs,
        weights, state and rng are compared after every step.

        Args:
          layer_id: an integer, the index into `_LAYERS`.
          rng_updater_id: an integer, the index into `_RNG_UPDATERS`.
          batch_size: an integer or `None`, the value for the `batch_size`
            argument in `TraxKerasLayer.__init__`.
          trax_has_weights: bool, whether to make the trax layer contain
            weights at the time when `TraxKerasLayer.build` is called.
          explicit_build: bool, whether to explicitly call
            `TraxKerasLayer.build`.
          use_model: bool, whether to build a `tf.keras.Model` out of the
            `TraxKerasLayer` layer and use the model to do the training
            instead of the bare layer. If `True`, we will also test
            checkpointing and restoring using the model.
        """
        with trax.fastmath.use_backend("tensorflow-numpy"):
            make_trax_layer, input_shapes_no_batch, dtype, allow_none_batch = (
                _LAYERS[layer_id])
            # We make a fresh trax layer for each test case, so that different test
            # cases won't interfere with each other.
            trax_layer = make_trax_layer()
            if not allow_none_batch and batch_size is None:
                self.skipTest("This Trax layer can't handle None batch size.")
            rng_updater = _RNG_UPDATERS[rng_updater_id]
            # Prepend the batch dimension to every per-example input shape.
            input_shapes = math_lib.nested_map(lambda s: [batch_size] + s,
                                               input_shapes_no_batch)
            input_sig = trax2keras.tensor_shapes_to_shape_dtypes(
                input_shapes, dtype)
            # The same initializer rng is later handed to TraxKerasLayer.
            initializer_rng = math_lib.random.get_prng(765)
            weights, state = trax_layer.init(input_sig, rng=initializer_rng)
            generator = tf.random.Generator.from_seed(567)

            def get_inputs():
                # Each call advances `generator`, giving fresh dummy inputs.
                return dummy_inputs(generator, input_sig)

            if trax_has_weights:
                # Calling the layer with explicit weights/state leaves them
                # stored on the trax layer before the Keras wrapper is built.
                trax_layer(to_arrays(get_inputs()),
                           weights=weights,
                           state=state)
            rng = math_lib.random.get_prng(1234)
            keras_layer = trax2keras.TraxKerasLayer(
                trax_layer,
                batch_size=batch_size,
                initializer_rng=initializer_rng,
                rng=rng,
                rng_updater=rng_updater)
            if explicit_build:
                keras_layer.build(input_shapes)
            if use_model:
                x = tf.keras.Input(shape=input_shapes_no_batch, dtype=dtype)
                y = keras_layer(x)
                keras_model = tf.keras.Model(inputs=x, outputs=y)
            lr = 0.1  # learning rate
            for _ in range(3):
                inputs = get_inputs()
                # Reference step on the pure Trax function.
                with tf.GradientTape() as trax_tape:
                    # The weights are not tf.Variables, so the tape must be
                    # told to watch their underlying tensors.
                    trax_tape.watch([x.data for x in tf.nest.flatten(weights)])
                    trax_outputs, state = trax_layer.pure_fn(to_arrays(inputs),
                                                             weights=weights,
                                                             state=state,
                                                             rng=rng)
                trax_grads = trax_tape.gradient(
                    *to_tensors([trax_outputs, weights]))
                # `g` may be `tf.IndexedSlices`, so we need to `convert_to_tensor`
                # before multiplication.
                # The update adds lr * g; the Keras side below uses
                # `assign_add` with the same sign, so both stay comparable.
                weights = tf.nest.map_structure(
                    lambda w, g: w + jnp.asarray(lr * tf.convert_to_tensor(g),
                                                 w.dtype), weights, trax_grads)
                rng = rng_updater(rng)
                # Same step through the Keras wrapper (or the Keras model).
                with tf.GradientTape() as keras_tape:
                    if use_model:
                        keras_outputs = keras_model(inputs)
                    else:
                        keras_outputs = keras_layer(inputs)
                # Unwrap a 1-tuple so it compares against the Trax output.
                if isinstance(keras_outputs,
                              tuple) and len(keras_outputs) == 1:
                    keras_outputs = keras_outputs[0]
                self.assertAllClose(to_tensors(trax_outputs), keras_outputs)
                keras_grads = keras_tape.gradient(
                    keras_outputs, keras_layer.trainable_variables)
                tf.nest.map_structure(
                    lambda v, g: v.assign_add(  # pylint: disable=g-long-lambda
                        tf.cast(lr * tf.convert_to_tensor(g), v.dtype)),
                    keras_layer.trainable_variables,
                    keras_grads)
                # A looser absolute tolerance is used when a GPU is present.
                self.assertAllClose(to_tensors(weights),
                                    read_values(keras_layer._weights),
                                    rtol=2e-6,
                                    atol=2e-4 if has_gpu() else 1e-6)
                self.assertAllClose(to_tensors(state),
                                    read_values(keras_layer._state))
                self.assertAllClose(to_tensors(rng),
                                    read_values(keras_layer._rng))
            if use_model:
                # Save/restore round trip: the restored model must produce the
                # same outputs as the original on fresh inputs.
                fname = os.path.join(self.get_temp_dir(), "checkpoint")
                keras_model.save(fname)
                loaded_model = tf.keras.models.load_model(fname)
                for _ in range(2):
                    inputs = get_inputs()
                    self.assertAllClose(keras_model(inputs),
                                        loaded_model(inputs))
Example #29
0
    def testBijector(self, bijector_name, data):
        """Checks gradients, var usage, batch shape and dtypes of a bijector.

        For a drawn bijector this differentiates `forward`,
        `forward_log_det_jacobian`, `inverse` and `inverse_log_det_jacobian`
        with respect to the input and the floating-point trainable variables,
        asserting that no gradient is None and that variables are not read
        excessively. It then checks `_is_increasing` for scalar bijectors,
        the zero log-det-Jacobian of permutations, the reported batch shape
        and the reported forward/inverse dtypes.

        Args:
          bijector_name: name used to draw the bijector via
            `self._draw_bijector`.
          data: object providing `draw` for the strategies used below
            (presumably Hypothesis' interactive data object -- confirm
            against the decorator on this test).
        """
        tfp_hps.guitar_skip_if_matches('Tanh', bijector_name, 'b/144163991')

        bijector, event_dim = self._draw_bijector(bijector_name, data)

        # Forward mapping: Check differentiation through forward mapping with
        # respect to the input and parameter variables.  Also check that any
        # variables are not referenced overmuch.
        xs = self._draw_domain_tensor(bijector, data, event_dim)
        wrt_vars = [xs] + [
            v for v in bijector.trainable_variables if v.dtype.is_floating
        ]
        with tf.GradientTape() as tape:
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `forward` of {}'.format(bijector)):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ys = bijector.forward(xs + 0)
        grads = tape.gradient(ys, wrt_vars)
        assert_no_none_grad(bijector, 'forward', wrt_vars, grads)

        # For scalar bijectors, verify correctness of the _is_increasing method.
        # TODO(b/148459057): Except, don't verify Softfloor on Guitar because
        # of numerical problem.
        def exception(bijector):
            # True only under Guitar for Softfloor, looking through Invert
            # wrappers recursively.
            if not tfp_hps.running_under_guitar():
                return False
            if isinstance(bijector, tfb.Softfloor):
                return True
            if is_invert(bijector):
                return exception(bijector.bijector)
            return False

        if (bijector.forward_min_event_ndims == 0
                and bijector.inverse_min_event_ndims == 0
                and not exception(bijector)):
            # grads[0] is the gradient w.r.t. `xs` (first entry of wrt_vars).
            dydx = grads[0]
            hp.note('dydx: {}'.format(dydx))
            isfinite = tf.math.is_finite(dydx)
            incr_or_slope_eq0 = bijector._internal_is_increasing() | tf.equal(
                dydx, 0)  # pylint: disable=protected-access
            # Only finite slopes are compared: wherever dydx is finite, the
            # reported monotonicity (or a zero slope) must agree with
            # dydx >= 0. OR-ing with zeros_like leaves the values unchanged
            # (presumably there to broadcast to a common shape -- confirm).
            self.assertAllEqual(
                isfinite & incr_or_slope_eq0,
                isfinite & (dydx >= 0) | tf.zeros_like(incr_or_slope_eq0))

        # FLDJ: Check differentiation through forward log det jacobian with
        # respect to the input and parameter variables.  Also check that any
        # variables are not referenced overmuch.
        event_ndims = data.draw(
            hps.integers(min_value=bijector.forward_min_event_ndims,
                         max_value=xs.shape.ndims))
        with tf.GradientTape() as tape:
            max_permitted = _ldj_tensor_conversions_allowed(bijector,
                                                            is_forward=True)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `forward_log_det_jacobian` of {}'.format(bijector),
                    max_permissible=max_permitted):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ldj = bijector.forward_log_det_jacobian(
                    xs + 0, event_ndims=event_ndims)
        grads = tape.gradient(ldj, wrt_vars)
        assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars,
                            grads)

        # Inverse mapping: Check differentiation through inverse mapping with
        # respect to the codomain "input" and parameter variables.  Also check that
        # any variables are not referenced overmuch.
        ys = self._draw_codomain_tensor(bijector, data, event_dim)
        wrt_vars = [ys] + [
            v for v in bijector.trainable_variables if v.dtype.is_floating
        ]
        with tf.GradientTape() as tape:
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `inverse` of {}'.format(bijector)):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                # From here on `xs` is the inverse image of the drawn `ys`;
                # the batch-shape and dtype checks below rely on that.
                xs = bijector.inverse(ys + 0)
        grads = tape.gradient(xs, wrt_vars)
        assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)

        # ILDJ: Check differentiation through inverse log det jacobian with respect
        # to the codomain "input" and parameter variables.  Also check that any
        # variables are not referenced overmuch.
        event_ndims = data.draw(
            hps.integers(min_value=bijector.inverse_min_event_ndims,
                         max_value=ys.shape.ndims))
        with tf.GradientTape() as tape:
            max_permitted = _ldj_tensor_conversions_allowed(bijector,
                                                            is_forward=False)
            with tfp_hps.assert_no_excessive_var_usage(
                    'method `inverse_log_det_jacobian` of {}'.format(bijector),
                    max_permissible=max_permitted):
                tape.watch(wrt_vars)
                # TODO(b/73073515): Fix graph mode gradients with bijector caching.
                ldj = bijector.inverse_log_det_jacobian(
                    ys + 0, event_ndims=event_ndims)
        grads = tape.gradient(ldj, wrt_vars)
        assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars,
                            grads)

        # Verify that `_is_permutation` implies constant zero Jacobian.
        # (`ldj` here is the inverse log det Jacobian computed just above.)
        if bijector._is_permutation:
            self.assertTrue(bijector._is_constant_jacobian)
            self.assertAllEqual(ldj, 0.)

        # Verify correctness of batch shape.
        # Strip the event dimensions from each part of `xs`; what remains is
        # that part's batch shape. `event_ndims` is the inverse-side draw.
        xs_batch_shapes = tf.nest.map_structure(
            lambda x, nd: ps.shape(x)[:ps.rank(x) - nd], xs,
            bijector.inverse_event_ndims(event_ndims))
        empirical_batch_shape = functools.reduce(
            ps.broadcast_shape,
            nest.flatten_up_to(bijector.forward_min_event_ndims,
                               xs_batch_shapes))
        batch_shape = bijector.experimental_batch_shape(
            y_event_ndims=event_ndims)
        # The static shape is compared only when it is fully defined; the
        # tensor-valued variant is always compared.
        if tensorshape_util.is_fully_defined(batch_shape):
            self.assertAllEqual(empirical_batch_shape, batch_shape)
        self.assertAllEqual(
            empirical_batch_shape,
            bijector.experimental_batch_shape_tensor(
                y_event_ndims=event_ndims))

        # Check that the outputs of forward_dtype and inverse_dtype match the dtypes
        # of the outputs of forward and inverse.
        self.assertAllEqualNested(ys.dtype, bijector.forward_dtype(xs.dtype))
        self.assertAllEqualNested(xs.dtype, bijector.inverse_dtype(ys.dtype))
Example #30
0
 def grad(model, inputs):
     """Differentiate the summed `model.losses` w.r.t. the trainable variables.

     Calls `model` on `inputs` with `training=True` inside a
     `tf.GradientTape`, adds up the entries of `model.losses` with the
     builtin `sum`, and takes the gradient of that total.

     Args:
       model: Callable object exposing `losses` and `trainable_variables`.
       inputs: Value passed as the first argument to `model`.

     Returns:
       A pair `(model.losses, gradients)`, where `gradients` is the result of
       `tape.gradient` for the summed loss against
       `model.trainable_variables`.
     """
     with tf.GradientTape() as tape:
         # The call's output is discarded; only `model.losses` is read after.
         model(inputs, training=True)
         total_loss = sum(model.losses)
     # Read `losses` before `trainable_variables`, as the tuple expression did.
     current_losses = model.losses
     gradients = tape.gradient(total_loss, model.trainable_variables)
     return current_losses, gradients