Example #1
    def compute_gradients(
        self,
        loss,
        var_list=None,
        gate_gradients=GATE_OP,
        aggregation_method=None,
        colocate_gradients_with_ops=False,
        grad_loss=None,
    ):
        assert type(loss) is list
        num_tasks = len(loss)
        loss = tf.stack(loss)
        loss = tf.random.shuffle(loss)

        # Compute per-task gradients.
        grads_task = tf.vectorized_map(
            lambda x: tf.concat(
                [
                    tf.reshape(
                        grad,
                        [
                            -1,
                        ],
                    ) for grad in tf.gradients(x, var_list) if grad is not None
                ],
                axis=0,
            ),
            loss,
        )

        # Compute gradient projections.
        def proj_grad(grad_task):
            for k in range(num_tasks):
                inner_product = tf.reduce_sum(grad_task * grads_task[k])
                proj_direction = inner_product / tf.reduce_sum(
                    grads_task[k] * grads_task[k])
                grad_task = grad_task - tf.minimum(proj_direction,
                                                   0.0) * grads_task[k]
            return grad_task

        proj_grads_flatten = tf.vectorized_map(proj_grad, grads_task)

        # Unpack flattened projected gradients back to their original shapes.
        proj_grads = []
        for j in range(num_tasks):
            start_idx = 0
            for idx, var in enumerate(var_list):
                grad_shape = var.get_shape()
                flatten_dim = np.prod([
                    grad_shape.dims[i].value
                    for i in range(len(grad_shape.dims))
                ])
                proj_grad = proj_grads_flatten[j][start_idx:start_idx +
                                                  flatten_dim]
                proj_grad = tf.reshape(proj_grad, grad_shape)
                if len(proj_grads) < len(var_list):
                    proj_grads.append(proj_grad)
                else:
                    proj_grads[idx] += proj_grad
                start_idx += flatten_dim
        grads_and_vars = list(zip(proj_grads, var_list))
        return grads_and_vars
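The projection step above is the heart of PCGrad: each task gradient has its conflicting component (negative inner product) removed against every other task gradient. Below is a minimal, self-contained sketch of just that step on toy flattened gradients (eager TF2; the tensors and shapes are illustrative, not taken from the original optimizer):

import tensorflow as tf

# Toy flattened per-task gradients with shape [num_tasks, num_params].
grads_task = tf.constant([[1.0, 2.0, -1.0],
                          [-2.0, 0.5, 1.0]])
num_tasks = grads_task.shape[0]

def proj_grad(grad_task):
    # Subtract the component of grad_task that conflicts (negative inner
    # product) with each other task's gradient.
    for k in range(num_tasks):
        inner_product = tf.reduce_sum(grad_task * grads_task[k])
        proj_direction = inner_product / tf.reduce_sum(grads_task[k] * grads_task[k])
        grad_task = grad_task - tf.minimum(proj_direction, 0.0) * grads_task[k]
    return grad_task

# Vectorize the projection over the task axis, as in compute_gradients above.
proj_grads_flatten = tf.vectorized_map(proj_grad, grads_task)  # [num_tasks, num_params]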
Example #2
    def __call__(
        self, observation: types.NestedTensor, prev_state: PolicyCriticRNNState
    ) -> Tuple[types.NestedTensor, PolicyCriticRNNState]:

        return tf.vectorized_map(self._call, (observation, prev_state))
Example #3
        def compute_gradients(self,
                              loss,
                              var_list,
                              gate_gradients=GATE_OP,
                              aggregation_method=None,
                              colocate_gradients_with_ops=False,
                              grad_loss=None,
                              gradient_tape=None):
            """DP-SGD version of base class method."""
            if callable(loss):
                # TF is running in Eager mode
                raise NotImplementedError(
                    'Vectorized optimizer unavailable for TF2.')
            else:
                # TF is running in graph mode, check we did not receive a gradient tape.
                if gradient_tape:
                    raise ValueError(
                        'When in graph mode, a tape should not be passed.')

                batch_size = tf.shape(input=loss)[0]
                if self._num_microbatches is None:
                    self._num_microbatches = batch_size

                # Note: it would be closer to the correct i.i.d. sampling of records if
                # we sampled each microbatch from the appropriate binomial distribution,
                # although that still wouldn't be quite correct because it would be
                # sampling from the dataset without replacement.
                microbatch_losses = tf.reshape(loss,
                                               [self._num_microbatches, -1])

                if var_list is None:
                    var_list = (tf.compat.v1.trainable_variables(
                    ) + tf.compat.v1.get_collection(
                        tf.compat.v1.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))

                def process_microbatch(microbatch_loss):
                    """Compute clipped grads for one microbatch."""
                    microbatch_loss = tf.reduce_mean(
                        input_tensor=microbatch_loss)
                    grads, _ = zip(
                        *super(DPOptimizerClass, self).compute_gradients(
                            microbatch_loss, var_list, gate_gradients,
                            aggregation_method, colocate_gradients_with_ops,
                            grad_loss))
                    grads_list = [
                        g if g is not None else tf.zeros_like(v)
                        for (g, v) in zip(list(grads), var_list)
                    ]
                    # Clip gradients to have L2 norm of l2_norm_clip.
                    # Here, we use TF primitives rather than the built-in
                    # tf.clip_by_global_norm() so that operations can be vectorized
                    # across microbatches.
                    grads_flat = tf.nest.flatten(grads_list)
                    squared_l2_norms = [
                        tf.reduce_sum(input_tensor=tf.square(g))
                        for g in grads_flat
                    ]
                    global_norm = tf.sqrt(tf.add_n(squared_l2_norms))
                    div = tf.maximum(global_norm / self._l2_norm_clip, 1.)
                    clipped_flat = [g / div for g in grads_flat]
                    clipped_grads = tf.nest.pack_sequence_as(
                        grads_list, clipped_flat)
                    return clipped_grads

                clipped_grads = tf.vectorized_map(process_microbatch,
                                                  microbatch_losses)

                def reduce_noise_normalize_batch(stacked_grads):
                    summed_grads = tf.reduce_sum(input_tensor=stacked_grads,
                                                 axis=0)
                    noise_stddev = self._l2_norm_clip * self._noise_multiplier
                    noise = tf.random.normal(tf.shape(input=summed_grads),
                                             stddev=noise_stddev)
                    noised_grads = summed_grads + noise
                    return noised_grads / tf.cast(self._num_microbatches,
                                                  tf.float32)

                final_grads = tf.nest.map_structure(
                    reduce_noise_normalize_batch, clipped_grads)

                return list(zip(final_grads, var_list))
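The comment above explains why the clipping is written with TF primitives: the whole microbatch computation must be expressible under tf.vectorized_map. A rough eager-mode sketch of the same idea on a toy linear model (all names, shapes, and the noise scale below are illustrative, not part of the DP optimizer):

import tensorflow as tf

l2_norm_clip = 1.0
w = tf.Variable([0.5, -0.3])
x = tf.random.normal([8, 2])   # toy batch of 8 examples
y = tf.random.normal([8])

def clipped_grad(example):
    xi, yi = example
    with tf.GradientTape() as tape:
        loss = tf.square(tf.reduce_sum(w * xi) - yi)
    g = tape.gradient(loss, w)
    # Scale the per-example gradient so its L2 norm is at most l2_norm_clip.
    norm = tf.sqrt(tf.reduce_sum(tf.square(g)))
    return g / tf.maximum(norm / l2_norm_clip, 1.0)

per_example_grads = tf.vectorized_map(clipped_grad, (x, y))          # [8, 2]
noise = tf.random.normal([2], stddev=l2_norm_clip)                    # stand-in for clip * noise_multiplier
final_grad = (tf.reduce_sum(per_example_grads, axis=0) + noise) / 8.0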
Example #4
def sample_prior_predictive(
    model: ModelType,
    sample_shape: Union[int, Tuple[int]] = 1000,
    sample_from_observed: bool = True,
    var_names: Optional[Union[str, List[str]]] = None,
    state: Optional[SamplingState] = None,
    use_auto_batching: bool = True,
) -> InferenceData:
    """
    Draw ``sample_shape`` values from the model for the desired ``var_names``.

    Parameters
    ----------
    model : types.GeneratorType, pymc4.Model
        Model to draw samples from
    sample_shape: Union[int, Tuple[int]]
        The sample shape of the draw. Every distribution has its core dimensions
        (e.g. ``pm.Normal("x", 0, tf.ones(2))`` has a single core dimension with ``shape=(2,)``).
        The ``sample_shape`` controls the total number of draws to make from a distribution, and
        the shape that will be prepended to the core dimensions. In the above case, if
        ``sample_shape=(3, 1)``, then the resulting draw will have ``shape=(3, 1, 2)``. If an
        ``int`` is passed, it's converted to a tuple with a single entry: ``(sample_shape,)``
    sample_from_observed: bool
        If ``False``, the distributions that were assigned observed values won't be resampled, and
        the observed values will be used for computations downstream.
        If ``True``, the distributions that were assigned observed values will be resampled. This
        means that their observed value will be completely ignored (including its implied shape),
        and a new sample will be drawn from the prior distribution.
        Observed variables are only returned in the ``Samples`` dictionary if
        ``sample_from_observed`` is ``True`` or the name of the observed variable is explicitly
        provided in ``var_names``.
    var_names: Optional[Union[str, List[str]]]
        The list of variable names that will be included in the returned samples. Strings can be
        used to specify a single variable. If ``None``, the samples drawn for all untransformed
        distributions and deterministics will be returned in the ``Samples`` dictionary.
        Furthermore, if ``sample_from_observed=True``, then the observed variable names will be
        added to the untransformed distributions.
    state : Optional[pymc4.flow.SamplingState]
        A ``SamplingState`` that can be used to specify distributions fixed values and change
        observed values.
    use_auto_batching: bool
        A bool value that indicates whether ``sample_prior_predictive`` should automatically batch
        the draws or not. If you are sure you have manually tuned your model to be fully
        vectorized, then you can set this to ``False``, and your sampling should be faster than
        the auto batched counterpart. If you are not sure if your model is vectorized, then auto
        batching will safely sample from it but with some additional overhead.

    Returns
    -------
    Samples: InferenceDataType
        An ArviZ's InferenceData object with a prior_predictive group

    Examples
    --------
    Let's define a simple model to sample from

    >>> import pymc4 as pm
    >>> @pm.model
    ... def model():
    ...     sd = yield pm.HalfNormal("sd", 1.)
    ...     norm = yield pm.Normal("n", 0, sd, observed=np.random.randn(10))

    Now, we may want to draw samples from the model's prior, ignoring the
    observed values.

    >>> prior_samples = sample_prior_predictive(model(), sample_shape=(20, 3))

    The samples are returned as an InferenceData object with a prior_predictive group

    >>> sorted(list(prior_samples.prior_predictive))
    ['model/n', 'model/sd']

    The drawn values are the xarray DataSet values, and their shape will depend on the supplied
    ``sample_shape``

    >>> [v.shape for v in prior_samples.prior_predictive.values()]
    [(1, 20, 3), (1, 20, 3)]

    If we only wanted to draw samples from unobserved variables we would have done the following

    >>> prior_samples = sample_prior_predictive(model(), sample_from_observed=False)
    >>> sorted(list(prior_samples.prior_predictive))
    ['model/sd']

    Notes
    -----
    If ``sample_from_observed=False``, the observed value passed to the variables will be used in
    the later stages of the model's computation.

    >>> import pymc4 as pm
    >>> @pm.model
    ... def model2():
    ...     sd = yield pm.HalfNormal("sd", 1.)
    ...     x = yield pm.Normal("x", 0, sd, observed=np.ones(10))
    ...     y = yield pm.Normal("y", x, 1e-8)
    >>> prior_samples = sample_prior_predictive(
    ...     model2(), sample_shape=(20,), sample_from_observed=False
    ... )
    >>> np.allclose(np.mean(prior_samples.prior_predictive["model2/y"]), 1)
    True

    Furthermore, this has consequences at the shape level of the drawn samples
    >>> prior_samples.prior_predictive["model2/y"].shape
    (1, 20, 10)

    If ``sample_from_observed=True`` the value of the ``x`` random variable will be drawn from its
    prior distribution, which will have consequences both at the value and shape levels of
    downstream computations

    >>> prior_samples = sample_prior_predictive(
    ...     model2(), sample_shape=(20,), sample_from_observed=True
    ... ).prior_predictive
    >>> np.allclose(np.mean(prior_samples["model2/y"]), 1)
    False
    >>> prior_samples["model2/y"].shape
    (1, 20)

    If you take special care to fully vectorize your model, you will be able
    to sample from it when you set ``use_auto_batching=False``
    >>> import numpy as np
    >>> from time import time
    >>> observed = np.ones(10, dtype="float32")
    >>> @pm.model
    ... def vect_model():
    ...     mu = yield pm.Normal("mu", 0, 1, conditionally_independent=True)
    ...     scale = yield pm.HalfNormal("scale", 1, conditionally_independent=True)
    ...     obs = yield pm.Normal(
    ...         "obs", mu, scale, event_stack=len(observed), observed=observed
    ...     )
    >>> st1 = time()
    >>> prior_samples1 = sample_prior_predictive(
    ...     vect_model(), sample_shape=(30, 20), use_auto_batching=False
    ... ).prior_predictive
    >>> st2 = en1 = time()
    >>> prior_samples2 = sample_prior_predictive(
    ...     vect_model(), sample_shape=(30, 20), use_auto_batching=True
    ... ).prior_predictive
    >>> en2 = time()
    >>> prior_samples2["vect_model/obs"].shape
    (1, 30, 20, 10)
    >>> prior_samples1["vect_model/obs"].shape
    (1, 30, 20, 10)
    >>> (en1 - st1) < (en2 - st2)
    True

    """
    if isinstance(sample_shape, int):
        sample_shape = (sample_shape, )

    # Do a single forward pass to establish the distributions, deterministics and observeds
    _, state = evaluate_meta_model(model, state=state)
    distributions_names = list(state.untransformed_values)
    deterministic_names = list(state.deterministics_values)
    observed = None
    traced_observeds: Set[str] = set()
    if sample_from_observed:
        state.observed_values = observed = {
            k: None
            for k in state.observed_values
        }
        distributions_names = distributions_names + list(state.observed_values)

    if isinstance(var_names, str):
        var_names = [var_names]

    if var_names is None:
        var_names = distributions_names + deterministic_names
    else:
        # We can trace the observed values if their names are explicitly requested in var_names
        traced_observeds = set([
            var_name for var_name in var_names
            if var_name in state.observed_values
        ])
    if not set(var_names) <= (set(distributions_names + deterministic_names)
                              | traced_observeds):
        raise ValueError(
            "Some of the supplied var_names are not defined in the supplied "
            "model {}.\nList of unknown var_names: {}".format(
                model,
                list(
                    set(var_names) -
                    set(distributions_names + deterministic_names)),
            ))

    # If we don't have to auto-batch, then we can simply evaluate the model
    if not use_auto_batching:
        _, state = evaluate_model(model,
                                  observed=observed,
                                  sample_shape=sample_shape)
        all_values = collections.ChainMap(state.all_values,
                                          state.deterministics_values)
        return trace_to_arviz(
            prior_predictive={k: all_values[k].numpy()
                              for k in var_names})

    # Setup the function that makes a single draw
    @tf.function(autograph=False)
    def single_draw(index):
        _, state = evaluate_model(model, observed=observed)
        return tuple(
            state.untransformed_values[k] if k in state.untransformed_values
            else (state.observed_values[k] if k in
                  traced_observeds else state.deterministics_values[k])
            for k in var_names)

    # Make draws in parallel with tf.vectorized_map
    samples = tf.vectorized_map(single_draw,
                                tf.range(int(np.prod(sample_shape))))

    # Convert the samples to ndarrays and make a dictionary with the desired sample_shape
    output = dict()
    for name, sample in zip(var_names, samples):
        sample = sample.numpy()
        output[name] = np.reshape(sample, sample_shape + sample.shape[1:])

    return trace_to_arviz(prior_predictive=output)
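The auto-batching branch above boils down to wrapping one forward draw in a function of a dummy index and letting tf.vectorized_map batch it. A stripped-down sketch of that pattern (the toy draw below stands in for evaluate_model and is purely illustrative):

import numpy as np
import tensorflow as tf

def single_draw(_):
    sd = tf.abs(tf.random.normal([]))   # stand-in for a HalfNormal draw
    return tf.random.normal([]) * sd    # stand-in for the dependent Normal

sample_shape = (20, 3)
draws = tf.vectorized_map(single_draw, tf.range(int(np.prod(sample_shape))))
draws = np.reshape(draws.numpy(), sample_shape)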
Example #5
import numpy as np
import tensorflow as tf

from graph_tf.utils.linalg import SparseLinearOperator

n = 5
m = 3
a = tf.random.uniform((n, n), dtype=tf.float32)
mask = tf.random.uniform((n, n), dtype=tf.float32) < 0.2
a = tf.where(mask, a, tf.zeros_like(a)) + n * tf.eye(n)
a = tf.sparse.from_dense(a)

lo = SparseLinearOperator(a, is_self_adjoint=True, is_positive_definite=True)

x = tf.random.normal((n, ))
# sol = tf.linalg.experimental.conjugate_gradient(lo, x)
# y = sol.x
# x2 = lo.matvec(x)
# np.testing.assert_allclose(x, x2, atol=1e-4)
# print("Passed")

x = tf.random.normal((m, n))
y = tf.vectorized_map(
    lambda x: tf.linalg.experimental.conjugate_gradient(lo, x).x, x)
x2 = tf.vectorized_map(lo.matvec, y)
np.testing.assert_allclose(x, x2, atol=1e-4)
print("Passed")
Example #6
 def log_prob(self, params):
     return tf__.vectorized_map(self.log_prob_one_chain, params)
Example #7
 def vectorized_logpfn(*state):
     return tf.vectorized_map(lambda mini_state: logpfn(*mini_state), state)
Example #8
def corrections_func(mainPN,
                     batch_size,
                     trace_length,
                     corrections=False,
                     cube=None,
                     clip_lola_update_norm=False,
                     lola_correction_multiplier=1.0,
                     clip_lola_correction_norm=False,
                     clip_lola_actor_norm=False,
                     against_destabilizer_exploiter=False):
    """Computes corrections for policy gradients.

    Args:
    -----
        mainPN: list of policy/Q-networks
        batch_size: int
        trace_length: int
        corrections: bool (default: False)
            Whether policy networks should use corrections.
        cube: tf.Variable or None (default: None)
            If provided, should be constructed via `lola.utils.make_cube`.
            Used for variance reduction of the value estimation.
            When provided, the computation graph for corrections is faster to
            compile but is quite memory inefficient.
            When None, the variance reduction graph is constructed dynamically,
            is a little longer to compile, but has lower memory footprint.
    """
    # not mem_efficient
    if cube is not None:
        ac_logp0 = tf.reshape(mainPN[0].log_pi_action_bs_t,
                              [batch_size, 1, trace_length])
        ac_logp1 = tf.reshape(mainPN[1].log_pi_action_bs_t,
                              [batch_size, trace_length, 1])
        mat_1 = tf.reshape(tf.squeeze(tf.matmul(ac_logp1, ac_logp0)),
                           [batch_size, 1, trace_length * trace_length])

        v_0 = tf.matmul(
            tf.reshape(mainPN[0].sample_reward, [batch_size, trace_length, 1]),
            mat_1)
        v_0 = tf.reshape(
            v_0, [batch_size, trace_length, trace_length, trace_length])

        v_1 = tf.matmul(
            tf.reshape(mainPN[1].sample_reward, [batch_size, trace_length, 1]),
            mat_1)
        v_1 = tf.reshape(
            v_1, [batch_size, trace_length, trace_length, trace_length])

        v_0 = 2 * tf.reduce_sum(v_0 * cube) / batch_size
        v_1 = 2 * tf.reduce_sum(v_1 * cube) / batch_size
    # mem_efficient
    else:
        ac_logp0 = tf.reshape(mainPN[0].log_pi_action_bs_t,
                              [batch_size, trace_length])
        ac_logp1 = tf.reshape(mainPN[1].log_pi_action_bs_t,
                              [batch_size, trace_length])

        # Static exclusive cumsum
        ac_logp0_cumsum = [tf.constant(0.)]
        ac_logp1_cumsum = [tf.constant(0.)]
        for i in range(trace_length - 1):
            ac_logp0_cumsum.append(tf.add(ac_logp0_cumsum[-1], ac_logp0[:, i]))
            ac_logp1_cumsum.append(tf.add(ac_logp1_cumsum[-1], ac_logp1[:, i]))

        # Compute v_0 and v_1
        mat_cumsum = ac_logp0[:, 0] * ac_logp1[:, 0]
        v_0 = mat_cumsum * mainPN[0].sample_reward[:, 0]
        v_1 = mat_cumsum * mainPN[1].sample_reward[:, 0]
        for i in range(1, trace_length):
            mat_cumsum = tf.add(mat_cumsum, ac_logp0[:, i] * ac_logp1[:, i])
            mat_cumsum = tf.add(mat_cumsum,
                                ac_logp0_cumsum[i] * ac_logp1[:, i])
            mat_cumsum = tf.add(mat_cumsum,
                                ac_logp1_cumsum[i] * ac_logp0[:, i])
            v_0 = tf.add(v_0, mat_cumsum * mainPN[0].sample_reward[:, i])
            v_1 = tf.add(v_1, mat_cumsum * mainPN[1].sample_reward[:, i])
        v_0 = 2 * tf.reduce_sum(v_0) / batch_size

        if against_destabilizer_exploiter:
            v_1 = 2 * v_1 / batch_size
        else:
            v_1 = 2 * tf.reduce_sum(v_1) / batch_size

    mainPN[0].v_0_log = v_0
    mainPN[1].v_1_log = v_1
    actor_target_error_0 = (mainPN[0].target -
                            tf.stop_gradient(mainPN[0].value))
    v_0_pi_0 = 2*tf.reduce_sum((actor_target_error_0* mainPN[0].gamma_array) * mainPN[0].log_pi_action_bs_t) / \
               batch_size
    v_0_pi_1 = 2*tf.reduce_sum((actor_target_error_0 * mainPN[1].gamma_array) * mainPN[1].log_pi_action_bs_t) / \
               batch_size

    actor_target_error_1 = (mainPN[1].target -
                            tf.stop_gradient(mainPN[1].value))

    v_1_pi_0 = 2 * tf.reduce_sum(
        (actor_target_error_1 * mainPN[0].gamma_array) *
        mainPN[0].log_pi_action_bs_t) / batch_size
    v_1_pi_1 = 2 * tf.reduce_sum(
        (actor_target_error_1 * mainPN[1].gamma_array) *
        mainPN[1].log_pi_action_bs_t) / batch_size

    mainPN[0].actor_target_error = actor_target_error_0
    mainPN[1].actor_target_error = actor_target_error_1
    mainPN[0].actor_loss = v_0_pi_0
    mainPN[1].actor_loss = v_1_pi_1
    mainPN[0].value_used_for_correction = v_0
    mainPN[1].value_used_for_correction = v_1

    v_0_grad_theta_0 = flatgrad(v_0_pi_0, mainPN[0].parameters)
    v_0_grad_theta_1 = flatgrad(v_0_pi_1, mainPN[1].parameters)

    v_1_grad_theta_0 = flatgrad(v_1_pi_0, mainPN[0].parameters)
    v_1_grad_theta_1 = flatgrad(v_1_pi_1, mainPN[1].parameters)

    mainPN[0].grad = v_0_grad_theta_0
    mainPN[1].grad = v_1_grad_theta_1
    mainPN[0].grad_sum = tf.math.reduce_sum(v_0_grad_theta_0)
    mainPN[1].grad_sum = tf.math.reduce_sum(v_1_grad_theta_1)

    mainPN[0].grad_v_1 = v_1_grad_theta_0
    mainPN[1].grad_v_0 = v_0_grad_theta_1

    if corrections:
        v_0_grad_theta_0_wrong = flatgrad(v_0, mainPN[0].parameters)
        if against_destabilizer_exploiter:
            # v_1_grad_theta_1_wrong_splits = [ flatgrad(v_1[i], mainPN[1].parameters) for i in range(batch_size)]
            # v_1_grad_theta_1_wrong = tf.stack(v_1_grad_theta_1_wrong_splits, axis=1)

            v_1_grad_theta_1_wrong = tf.vectorized_map(
                partial(flatgrad, var_list=mainPN[1].parameters), v_1)
        else:
            v_1_grad_theta_1_wrong = flatgrad(v_1, mainPN[1].parameters)

        param_len = v_0_grad_theta_0_wrong.get_shape()[0].value
        # param_len = -1

        if against_destabilizer_exploiter:
            multiply0 = tf.matmul(
                tf.reshape(tf.stop_gradient(v_0_grad_theta_1), [1, param_len]),
                tf.reshape(v_1_grad_theta_1_wrong, [param_len, batch_size]))
        else:
            multiply0 = tf.matmul(
                tf.reshape(tf.stop_gradient(v_0_grad_theta_1), [1, param_len]),
                tf.reshape(v_1_grad_theta_1_wrong, [param_len, 1]))
        multiply1 = tf.matmul(
            tf.reshape(tf.stop_gradient(v_1_grad_theta_0), [1, param_len]),
            tf.reshape(v_0_grad_theta_0_wrong, [param_len, 1]))

        if against_destabilizer_exploiter:
            second_order0 = flatgrad(multiply0, mainPN[0].parameters)
            second_order0 = second_order0[:, None]

            # second_order0_splits = [flatgrad(multiply0[:, i], mainPN[0].parameters) for i in range(batch_size)]
            # second_order0 = tf.stack(second_order0_splits, axis=1)

            # second_order0 = tf.vectorized_map(partial(flatgrad, var_list=mainPN[0].parameters), multiply0[0, :])
            # second_order0 = tf.reshape(second_order0, [param_len, batch_size])
        else:
            second_order0 = flatgrad(multiply0, mainPN[0].parameters)
        second_order1 = flatgrad(multiply1, mainPN[1].parameters)

        mainPN[0].multiply0 = multiply0
        mainPN[0].v_0_grad_01 = second_order0
        mainPN[1].v_1_grad_10 = second_order1
        mainPN[0].second_order = tf.math.reduce_sum(second_order0)
        mainPN[1].second_order = tf.math.reduce_sum(second_order1)

        if against_destabilizer_exploiter:
            second_order0 = tf.math.reduce_sum(second_order0, axis=1)

        second_order0 = (second_order0 * lola_correction_multiplier)
        second_order1 = (second_order1 * lola_correction_multiplier)
        if clip_lola_correction_norm:
            second_order0 = tf.clip_by_norm(second_order0,
                                            clip_lola_correction_norm,
                                            axes=None,
                                            name=None)
            second_order1 = tf.clip_by_norm(second_order1,
                                            clip_lola_correction_norm,
                                            axes=None,
                                            name=None)
        if clip_lola_actor_norm:
            v_0_grad_theta_0 = tf.clip_by_norm(v_0_grad_theta_0,
                                               clip_lola_actor_norm,
                                               axes=None,
                                               name=None)
            v_1_grad_theta_1 = tf.clip_by_norm(v_1_grad_theta_1,
                                               clip_lola_actor_norm,
                                               axes=None,
                                               name=None)

        delta_0 = v_0_grad_theta_0 + second_order0
        delta_1 = v_1_grad_theta_1 + second_order1

        if clip_lola_update_norm:
            delta_0 = tf.clip_by_norm(delta_0,
                                      clip_lola_update_norm,
                                      axes=None,
                                      name=None)
            delta_1 = tf.clip_by_norm(delta_1,
                                      clip_lola_update_norm,
                                      axes=None,
                                      name=None)

        mainPN[0].delta = delta_0
        mainPN[1].delta = delta_1
    else:
        mainPN[0].delta = v_0_grad_theta_0
        mainPN[1].delta = v_1_grad_theta_1

        # To prevent some logic about logging stuff
        mainPN[0].v_0_grad_01 = tf.reduce_sum(v_0_grad_theta_0) * 0.0
        mainPN[1].v_1_grad_10 = tf.reduce_sum(v_0_grad_theta_0) * 0.0
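The "static exclusive cumsum" loop above unrolls what tf.cumsum(..., exclusive=True) computes; a quick check on a toy tensor (illustrative only, not part of the original graph):

import tensorflow as tf

ac_logp = tf.constant([[0.1, 0.2, 0.3, 0.4]])            # [batch=1, trace_length=4]
static = [tf.zeros([1])]
for i in range(3):
    static.append(static[-1] + ac_logp[:, i])
static = tf.stack(static, axis=1)                         # [[0.0, 0.1, 0.3, 0.6]]
vectorized = tf.cumsum(ac_logp, axis=1, exclusive=True)   # same values
tf.debugging.assert_near(static, vectorized)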
Example #9
    def _log_prob(self, y, **kwargs):
        """Calculates the log probability of observing epidemic events y
        :param y: a list of tensors.  The first is of shape [n_times] containing times,
                  the second is of shape [n_times, n_states, n_states] containing event
                  matrices.
        :param param: a list of parameters
        :returns: a scalar giving the log probability of the epidemic
        """
        dtype = dtype_util.common_dtype([y, self.initial_state],
                                        dtype_hint=self.dtype)
        events = tf.convert_to_tensor(y, dtype)
        with tf.name_scope("StateTransitionMarginalModel.log_prob"):

            state_timeseries = compute_state(
                initial_state=self.initial_state,
                events=events,
                stoichiometry=self.stoichiometry,
                closed=True,
            )

            tms_timeseries = tf.transpose(state_timeseries, perm=(1, 0, 2))
            tmr_events = tf.transpose(events, perm=(1, 0, 2))

            def fn(elems):
                return tf.stack(self.transition_rates(*elems), axis=-1)

            rates = tf.vectorized_map(
                fn=fn,
                elems=(
                    self._initial_step + tf.range(tms_timeseries.shape[0]),
                    tms_timeseries,
                ),
            )

            def integrated_rate_fn():
                """Use mid-point integration to estimate the constant rate
                over time.
                """
                integrated_rates = tms_timeseries[..., :-1] * rates
                return (integrated_rates[:-1, ...] +
                        integrated_rates[1:, ...]) / 2.0

            integrated_rates = integrated_rate_fn()

            log_norm_constant = tf.reduce_sum(
                tf.math.multiply_no_nan(tf.math.log(integrated_rates),
                                        tmr_events) -
                tf.math.lgamma(tmr_events + 1.0),
                axis=(0, 1),
            )
            pi_concentration = (
                tf.reduce_sum(tmr_events, axis=(0, 1)) +
                self.baseline_hazard_rate_priors["concentration"])
            pi_rate = (tf.reduce_sum(integrated_rates * self.time_delta,
                                     axis=(0, 1)) +
                       self.baseline_hazard_rate_priors["rate"])

            log_prob = (log_norm_constant + tf.math.lgamma(pi_concentration) -
                        (pi_concentration) * tf.math.log(pi_rate))

            return tf.reduce_sum(log_prob)
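As the rate computation above shows, tf.vectorized_map accepts a (possibly nested) structure of tensors: each element is sliced along axis 0 and the slices are passed to fn together. A toy illustration with made-up shapes:

import tensorflow as tf

times = tf.range(4)                    # [T]
states = tf.random.normal([4, 10, 3])  # [T, n_units, n_states]

def fn(elems):
    t, state = elems                   # scalar time step, [n_units, n_states] state
    return tf.cast(t, tf.float32) * tf.reduce_sum(state, axis=-1)

rates = tf.vectorized_map(fn, (times, states))   # [T, n_units]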
Example #10
	def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
		"""
		Executes all the contractions of the layer
		The contraction over a single input sample is done in the contract function
		and is then vectorized over the whole input dataset
		"""

		def contract(
			x          : tf.Tensor  ,  # input sample 
			nodes      : List[Node] ,  # list of weights
			n_contr    : int        ,  # number of features to contract on each weight
			use_bias   : bool       ,  # use bias (true or false)
			bias_var   : tf.Tensor	   # bias tensor
			) -> tf.Tensor :

			x_nodes  = []
			tn_nodes = []

			#create tensornetwork nodes using a for loop over the weight nodes and one over features
			#the features associated with the i-th node are in [i*n_contr, (i+1)*n_contr)
			for i in range(len(nodes)):  # loop over the weight tensors
				for j in range(n_contr): # loop over input features corresponding to the weight
					#create feature nodes
					x_nodes.append(
						tn.Node(x[n_contr*i+j]          , #feature to convert to node
								name      = 'xnode'     , 
								backend   = "tensorflow"  #use tensorflow to manage computations
						)
					)
				#create weight nodes
				tn_nodes.append(
					tn.Node(
						nodes[i]            , #weight to convert to node
						name=f'node_{i}'    , 
						backend="tensorflow"  #use tensorflow to manage computations
					)
				)
			
			#using the same loop structure connect the edges of the nodes
			#this DOES NOT contract but only prepares for the contraction
			for i in range(len(nodes)):  # loop over the weight tensors
				for j in range(n_contr): # loop over input features corresponding to the weight
					#make connections between weight and corresponding feature
					x_nodes[n_contr*i+j][0] ^ tn_nodes[i][j]
					
			# Contract each weight tensor to its feature
			# using tensornetwork contractor
			result = []
			for i in range(len(nodes)): #loop over weights
				result.append(
					tn.contractors.greedy( #use Tn contractor
						[x_nodes[n_contr*i+j] for j in range(n_contr)]+[tn_nodes[i]] #weight node and list of connected features
					)
				)

		#convert the result back to a TensorFlow tensor and add (if specified) the bias tensor
			result= tf.convert_to_tensor([r.tensor for r in result])
			if use_bias:
				result += bias_var
			return result

		#prepare input data for the vectorization of the contract function
		input_shape = list(inputs.shape) #get shape
		inputs = tf.reshape(inputs, (-1, input_shape[1], input_shape[2])) #expand dimension
		#vectorize the contraction over all the input samples
		result = tf.vectorized_map( #vectorize
				lambda vec: contract( #create a lambda function to be vectorized
					vec                , #input sample
					self.nodes         , #weight tensors
					self.n_contraction , #number of features to contract
					self.use_bias      , 
					self.bias_var		#bias tensor
				), 
			inputs						#input dataset over which the lambda function is vectorized
		)

		#if specified use the activation function over the output
		if self.activation is not None:
			result = self.activation(result)
		return result
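The layer above builds a per-sample contraction and then vectorizes it over the whole batch; the same pattern with a plain einsum-style contraction instead of tensornetwork (toy shapes, illustrative only):

import tensorflow as tf

weights = tf.random.normal([4, 6])        # captured weight tensor shared by all samples

def contract(x):                          # x: a single sample of shape [6]
    return tf.linalg.matvec(weights, x)   # contract features against the weights -> [4]

batch = tf.random.normal([32, 6])
out = tf.vectorized_map(contract, batch)  # [32, 4]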
Example #11
def kde_tf(query, dataset, kernel_tf, bandwidth=1):
    @tf.function(experimental_relax_shapes=True)
    def model_fn(arg):
        return kernel_tf(x=arg, y=query, bandwidth=bandwidth)

    return tf.math.reduce_mean(tf.vectorized_map(model_fn, dataset))
Example #12
def loss(x):
    f = lambda x_: tf.reduce_sum(tf.math.square(tf.math.square(x_)), axis=0)
    return tf.vectorized_map(f, x)
Example #13
def multi_hot(ind, depth):
    one_hots = tf.vectorized_map(lambda i: tf.one_hot(i, depth),
                                 tf.transpose(ind))
    multi_hots = tf.reduce_any(tf.cast(one_hots, tf.bool), axis=0)
    return tf.cast(multi_hots, tf.float32)
Example #14
 def _create_mask(trajectory):
     tf_range = tf.range(tf.size(trajectory))
     return tf.vectorized_map(lambda x: tf.math.reduce_sum(tf.cast((trajectory == x), tf.int32))
                              > 0, tf_range)
Example #15
def differentiable_barycentrics(
        framebuffer: fb.Framebuffer,
        clip_space_vertices: type_alias.TensorLike,
        triangles: type_alias.TensorLike,
        use_vectorized_map: bool = True) -> fb.Framebuffer:
    """Computes differentiable barycentric coordinates from a Framebuffer.

  The barycentric coordinates will be differentiable w.r.t. the input vertices.
  Later, we may support derivatives w.r.t. pixel position for mip-mapping.

  Args:
    framebuffer: a multi-layer Framebuffer containing triangle ids and a
      foreground mask with shape [batch, num_layers, height, width, 1]
    clip_space_vertices: a 2-D float32 tensor with shape [vertex_count, 4] or a
      3-D tensor with shape [batch, vertex_count, 4] containing homogeneous
      vertex positions (xyzw).
    triangles: a 2-D int32 tensor with shape [triangle_count, 3] or a 3-D tensor
      with shape [batch, triangle_count, 3] containing per-triangle vertex
      indices in counter-clockwise order.
    use_vectorized_map: If True, uses vectorized_map; otherwise uses map_fn.

  Returns:
    a copy of `framebuffer`, but the differentiable barycentric coordinates will
    replace any barycentric coordinates already in the `framebuffer`.
  """
    rank = lambda t: len(t.shape)

    clip_space_vertices = tf.convert_to_tensor(clip_space_vertices)
    shape.check_static(tensor=clip_space_vertices,
                       tensor_name="clip_space_vertices",
                       has_rank_greater_than=1,
                       has_rank_less_than=4)
    if rank(clip_space_vertices) == 2:
        clip_space_vertices = tf.expand_dims(clip_space_vertices, axis=0)

    triangles = tf.convert_to_tensor(triangles)
    shape.check_static(tensor=triangles,
                       tensor_name="triangles",
                       has_rank_greater_than=1,
                       has_rank_less_than=4)
    if rank(triangles) == 2:
        triangles = tf.expand_dims(triangles, axis=0)
    else:
        shape.compare_batch_dimensions(tensors=(clip_space_vertices,
                                                triangles),
                                       last_axes=(-3, -3),
                                       broadcast_compatible=False)

    shape.compare_batch_dimensions(tensors=(clip_space_vertices,
                                            framebuffer.triangle_id),
                                   last_axes=(-3, -4),
                                   broadcast_compatible=False)

    # Compute image pixel coordinates.
    px, py = normalized_pixel_coordinates(framebuffer.width,
                                          framebuffer.height)

    def compute_barycentrics_fn(
        slices: Tuple[type_alias.TensorLike, type_alias.TensorLike,
                      type_alias.TensorLike]
    ) -> tf.Tensor:
        clip_vertices_slice, triangle_slice, triangle_id_slice = slices
        triangle_id_slice = triangle_id_slice[..., 0]
        if rank(triangle_id_slice) == 2:  # There is no layer dimension.
            triangle_id_slice = tf.expand_dims(triangle_id_slice, axis=0)
        # Compute per-triangle inverse matrices.
        triangle_matrices = compute_triangle_matrices(clip_vertices_slice,
                                                      triangle_slice)

        # Compute per-pixel barycentric coordinates.
        barycentric_coords = compute_barycentric_coordinates(
            triangle_id_slice, triangle_matrices, px, py)
        barycentric_coords = tf.transpose(barycentric_coords,
                                          perm=[1, 2, 3, 0])
        return barycentric_coords

    if use_vectorized_map:
        per_image_barycentrics = tf.vectorized_map(
            compute_barycentrics_fn,
            (clip_space_vertices, triangles, framebuffer.triangle_id))
    else:
        num_meshes = tf.shape(clip_space_vertices)[0]
        triangles_repeated = tf.repeat(triangles, repeats=num_meshes, axis=0)
        per_image_barycentrics = tf.map_fn(
            compute_barycentrics_fn,
            (clip_space_vertices, triangles_repeated, framebuffer.triangle_id),
            fn_output_signature=tf.TensorSpec(shape=(1, None, None, 3)))

    barycentric_coords = tf.stack(per_image_barycentrics, axis=0)
    # After stacking, barycentrics will have a layers dimension no matter what.
    # To make sure we return differentiable barycentrics of the same shape,
    # reshape the tensor using the original shape.
    barycentric_coords = tf.reshape(barycentric_coords,
                                    shape=tf.shape(
                                        framebuffer.barycentrics.value))
    # Mask out barycentrics for background pixels.
    barycentric_coords = barycentric_coords * framebuffer.foreground_mask

    return fb.Framebuffer(triangle_id=framebuffer.triangle_id,
                          vertex_ids=framebuffer.vertex_ids,
                          foreground_mask=framebuffer.foreground_mask,
                          attributes=framebuffer.attributes,
                          barycentrics=fb.RasterizedAttribute(
                              barycentric_coords, None, None))
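The use_vectorized_map flag above switches between the two batching strategies; the trade-off in miniature (toy per-image function, illustrative only):

import tensorflow as tf

images = tf.random.normal([4, 8, 8, 3])
per_image_fn = lambda img: tf.reduce_mean(img, axis=-1)

# Parallel-for: traces fn once and runs all slices together.
out_vectorized = tf.vectorized_map(per_image_fn, images)
# Sequential while_loop: needs an output signature but handles fns that
# vectorized_map cannot convert.
out_mapped = tf.map_fn(per_image_fn, images,
                       fn_output_signature=tf.TensorSpec(shape=(8, 8)))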
Example #16
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """  

        # define a set of vectorized functions for use with `tf.vectorized_map`

        # take the outer product of a pair of rows
        @tf.function
        def row_outer_product(args):
            a, b = args
            a = tf.expand_dims(a, -1)
            b = tf.expand_dims(b, -1)
            return a @ tf.transpose(b)

        # repeat matrix A N times on a newly created first axis 
        # so the new shape is [N, A.shape] 
        @tf.function
        def repeat_N(A):
            return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

        @tf.function
        def triang_solve(args):
            L, rhs = args
            return tf.linalg.triangular_solve(L, rhs)

        @tf.function
        def triang_solve_transpose(args):
            L, rhs = args
            return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

        @tf.function
        def matmul_vectorized(args):
            A, B = args
            return tf.matmul(A, B)

        # [N, D, M, M] --> [N]
        # each term is sum_{d=1}^D Tr[M, M]
        # arg: [D, M, M], needs to be squared
        @tf.function
        def sum_d_trace(arg):
            trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
            return tf.reduce_sum(trace_D)

        # trace of a matrix
        @tf.function
        def trace_tf(A):
            return tf.reduce_sum(tf.linalg.diag_part(A))


        Y = self.data

        # specify qXp, the variational distribution q(X): each x_n is independent w/ N(x_n | \mu_n, S_n)
        # \mu_n \in R^q given by each row of `X_data_mean`
        # S_n \in R^qxq diagonal, so equivalently given by each row of `X_data_var`
        qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)

        # if split space, specify qXs
        # compute psi statistics for the shared space, keep the original shape of psi statistics, use qXs and kernel_s
        # psi0s is N-vector
        # psi1s is [N, M]
        # psi2s is [N, M, M]
        # also compute the covariance matrix Kuu for the shared space
        if self.split_space:
            qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
            psi0s = expectation(qXs, self.kernel_s)
            psi1s = expectation(qXs, (self.kernel_s, self.Zs))
            psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
            cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())


        # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k, psi2k, then store the psi statistics for all k together
        # for each k: if no shared space, then psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
        # if have shared space, then psi0[:, k] = psi0s + psi0k, psi1[:, :, k] = psi1s + psi1k
        # psi2[:, :, :, k] = psi2s + psi2k (the cross terms are added later)
        # then, for each n, psi2[n, :, :, k] = psi1s[n, :]^T dot psi1k[n, :] + psi1k[n, :]^T dot psi1s[n, :] (both are [M, M])
        # psi0 is [N, K] so psi0[n, k] gives a real value
        # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
        # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
        psi0 = []
        psi1 = []
        psi2 = []
        for k, kernel_k in enumerate(self.kernel_K):
            psi0k = expectation(qXp, kernel_k)
            psi1k = expectation(qXp, (kernel_k, self.Zp))
            psi2k = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
            if self.split_space:
                psi0.append(psi0s + psi0k)            
                psi1.append(psi1s + psi1k)
                # add the cross-covariance terms, require computation separately for each n
                sxk = tf.vectorized_map(row_outer_product, (psi1s, psi1k))
                kxs = tf.vectorized_map(row_outer_product, (psi1k, psi1s))
                psi2.append(psi2s + psi2k + sxk + kxs)
            else:
                psi0.append(psi0k)
                psi1.append(psi1k)
                psi2.append(psi2k)
        psi0 = tf.stack(psi0, axis=-1)
        psi1 = tf.stack(psi1, axis=-1)
        psi2 = tf.stack(psi2, axis=-1)

        # make K cov_uu_k using Zp and kernel_k
        # K cholesky, repeat N times for later use
        # L is [N x M x M x K]
        # these are the Kuu matrices
        L = []
        for k, kernel_k in enumerate(self.kernel_K):
            cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
            if self.split_space:
                L.append(tf.linalg.cholesky(cov_uu_s + cov_uu_k))
            else:
                L.append(tf.linalg.cholesky(cov_uu_k))
        L = tf.stack(L, axis=-1)
        L = repeat_N(L)
        sigma2 = self.likelihood.variance


        # self.pred_Y = []

        # use `tf.vectorized_map` to avoid writing a loop over N, but it requires every matrix to have N on axis 0
        # so we need to repeat certain matrices that are the same for all N (e.g. L)
        # note we can use `tf.vectorized_map` because the computations are decomposable for each n,
        # i.e. they can be computed in any order over n
        Fq = []
        Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
        for k in range(self.K):
            # compute intermediate matrices for easier computation involving \inv{Kuu}
            # A is the same as AAT in gplvm, transposing L is the correct thing to do
            # but the two end up being the same since we only care about the trace
            tmp = tf.vectorized_map(triang_solve, (L[..., k], psi2[..., k])) # [N, M, M]
            A = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp)) # \inv{Kuu} * Psi2: [N, M, M]

            #pos_def = tf.vectorized_map(lambda x: is_pos_def(x), psi2[..., k])
            #print(np.all(pos_def))
            # psi2 is not produced with `covariances.Kuu`, but it should still be PD
            # we should add jitter before doing cholesky
            #jitter_mtx = default_jitter() * tf.eye(self.M, dtype=default_float())
            jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())
            LB = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2[..., k]) # [N, M, M]  
            tmp1 = tf.vectorized_map(triang_solve, (L[..., k], LB)) # [N, M, M]
            C = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp1)) # sqrt(\inv{Kuu} * Psi2 * \inv{Kuu}): [N, M, M]

            D = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), C)) # sqrt(M^T * \inv{Kuu} * Psi2 * \inv{Kuu} * M): [N, D, M]

            tmp2 = tf.vectorized_map(triang_solve, (L[..., k], repeat_N(self.q_mu[k])))
            E = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp2)) # \inv{Kuu} * M: [N, M, D]

            # q_sqrt is already the cholesky
            F = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), C)) # sqrt(S * \inv{Kuu} * Psi2 * \inv{Kuu}): [N, D, M, M]

            tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1[..., k])) # Y^T * Psi1: [N, D, M]
            G = tf.vectorized_map(matmul_vectorized, (tmp3, E)) # Y^T * Psi1 * \inv{Kuu} * M: [N, D, D]

            # for debugging 
            # self.pred_Y.append(tf.reshape(tf.vectorized_map(matmul_vectorized, (tf.expand_dims(psi1[..., k], 1), E)), (self.N, self.D))) # Psi1 * \inv{Kuu} * M: [N, D]

            # compute the lower bound
            # each term added here is length-N vector, each entry representing \sum_{d=1}^D Fdnk for a particular n, k
            Fnk = -0.5 * Yn2 / sigma2
            Fnk += tf.vectorized_map(lambda x: trace_tf(x), G) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), D) / sigma2
            Fnk += 0.5 * self.D * tf.vectorized_map(lambda x: trace_tf(x), A)  / sigma2 
            Fnk += -0.5 * tf.vectorized_map(lambda x: sum_d_trace(x), F) / sigma2

            Fq.append(Fnk)

        Fq = tf.stack(Fq, axis=-1) # [N, K]
        # psi0 is already [N, K]
        Fq += -0.5 * self.D * psi0 / sigma2
        Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

        # for debugging 
        #self.Fq = Fq
        # self.pred_Y = tf.stack(self.pred_Y, axis=-1) # [N, D, K]

        # weight each entry by the mixture responsibility, then sum over N, K
        bound = tf.reduce_sum(Fq * self.pi)

        # compute KL 
        KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
        KL_c = self.kl_categorical(self.pi, self.pi_prior)
        KL_u = 0
        prior_Kuu = np.zeros((self.M, self.M))
        if self.split_space:
            KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
            bound += - KL_s
            prior_Kuu += covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        for k in range(self.K):
            prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
            KL_u += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu+prior_Kuu_k)
        bound += - KL_p - KL_u - KL_c

        return bound
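A tiny version of the repeat-then-vectorize trick used throughout elbo above: when a per-n computation also needs a fixed matrix, tile that matrix along a new leading N axis so every argument to tf.vectorized_map has N on axis 0 (toy shapes, illustrative only):

import tensorflow as tf

N, M = 5, 3
L = tf.eye(M)                                            # fixed lower-triangular matrix
rhs = tf.random.normal([N, M, 1])                        # one right-hand side per n
L_rep = tf.repeat(tf.expand_dims(L, 0), N, axis=0)       # [N, M, M]

solve = lambda args: tf.linalg.triangular_solve(args[0], args[1])
sols = tf.vectorized_map(solve, (L_rep, rhs))            # [N, M, 1]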
Example #17
    def _LMELBO_MLMC(self,
                     x,
                     y,
                     theta,
                     phi,
                     max_level=6,
                     w0=1 - 2.**(-3 / 2),
                     b=2,
                     N0=1,
                     randomize=False):
        """
        Compute (averaged) LMELBO by MLMC

        Arguments:
        x: 3-d array of size [N, T, D]
        y: 2-d array of size [N, T]
        theta: 
        phi: 
        max_level: integer
        w0: the proportion of total samples in (x,y) used at level 0.
            In other words, 100*(1-w0) % of the total samples are used for estimating the correction term.
        b: scalar. The second moment of the coupled difference estimator (dLMELBO) must decrease at a rate of O(2^(-b*level)).
        randomize: whether to use randomization of MLMC.

        Returns:
        lmelbo: scalar estimate of averaged lmelbo over sample points.
        """
        N = y.shape[0]
        M = self.M

        # unpack parameters
        idx = tf.random.shuffle(tf.range(N))
        x = x[idx]
        y = y[idx]
        alpha = theta['alpha']
        beta = theta['beta']
        K = get_K(alpha, beta)

        m = phi['m']
        CholS = phi['CholS']

        # calculate KL divergence of p(u) and q(u) of u = f_0(z_1,...,z_M)
        K_mm = K(self.z, self.z) + 1e-6 * tf.eye(M, dtype=tf.float64)
        CholK_mm = tf.linalg.cholesky(K_mm)

        p_u = tfp.distributions.MultivariateNormalTriL(loc=0.,
                                                       scale_tril=CholK_mm)
        q_u = tfp.distributions.MultivariateNormalTriL(loc=m, scale_tril=CholS)
        kl_qu_pu = tfp.distributions.kl_divergence(q_u, p_u)

        # calculate distribution of f conditionally on u = f_0(z_1,...,z_M)
        u = q_u.sample(N)
        inv_CholK_mm = tf.linalg.inv(CholK_mm)
        inv_K_mm = tf.transpose(inv_CholK_mm) @ inv_CholK_mm
        K_nm = K(x, self.z)
        K_mn = tf.transpose(K_nm)

        mean_f = tf.linalg.einsum('ni,ij,nj->n', K_nm, inv_K_mm, u)
        var_f = tf.vectorized_map(lambda x: K(x, x), tf.expand_dims(x, axis=1))
        var_f = tf.reshape(var_f, [N])
        var_f = var_f - tf.linalg.einsum('ni,ij,jn->n', K_nm, inv_K_mm, K_mn)

        # determine proportions of the number of samples among levels
        if max_level == 0:
            levels = np.array([0])
            weights = np.array([1.])
        else:
            weights = 2.**(-(b + 1) / 2 * np.arange(max_level))
            weights /= sum(weights)
            weights = np.concatenate([[w0], (1 - w0) * weights])
            levels = np.arange(max_level + 1)

        # determine the N_l's
        if randomize == True:
            Ns = np.random.multinomial(n=N, pvals=weights)
        elif randomize == False:
            Ns = np.array([np.ceil(w * N) for w in weights], dtype=int)
            Ns[0] = N - sum(Ns[1:])
        else:
            raise (Exception(
                "Invalid argument for 'randomize' of function LMELBO_MLMC. It must be True or False."
            ))

        # compute dLMELBO's using disjoint samples at each level and sum them up
        offset = 0
        lmelbo = -kl_qu_pu / self.N_total
        for l in levels:
            if Ns[l] == 0:
                continue
            x_tmp = x[offset:offset + Ns[l]]
            y_tmp = y[offset:offset + Ns[l]]
            mean_f_tmp = mean_f[offset:offset + Ns[l]]
            var_f_tmp = var_f[offset:offset + Ns[l]]

            if randomize == True:
                lmelbo += self._dconditional_likelihood(
                    x_tmp, y_tmp, mean_f_tmp, var_f_tmp, l,
                    N0) * Ns[l] / N / weights[l]
            elif randomize == False:
                lmelbo += self._dconditional_likelihood(
                    x_tmp, y_tmp, mean_f_tmp, var_f_tmp, l, N0)

            offset += Ns[l]

        return lmelbo
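The var_f line above uses tf.vectorized_map to evaluate K(x, x) one input row at a time; a self-contained sketch with a toy RBF kernel standing in for get_K (illustrative only):

import tensorflow as tf

def K(a, b):
    # Squared-exponential kernel on rows of a and b (toy stand-in for get_K).
    sq_dist = tf.reduce_sum((a[:, None, :] - b[None, :, :]) ** 2, axis=-1)
    return tf.exp(-0.5 * sq_dist)

x = tf.random.normal([7, 2])
var_f = tf.vectorized_map(lambda xi: K(xi, xi), tf.expand_dims(x, axis=1))  # [7, 1, 1]
var_f = tf.reshape(var_f, [7])   # per-point prior variance (all ones for this kernel)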
Example #18
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """  

        # define a set of vectorized functions for use with `tf.vectorized_map`

        # take the outer product of a pair of rows
        @tf.function
        def row_outer_product(args):
            a, b = args
            a = tf.expand_dims(a, -1)
            b = tf.expand_dims(b, -1)
            return a @ tf.transpose(b)

        # repeat matrix A N times on a newly created first axis 
        # so the new shape is [N, A.shape] 
        @tf.function
        def repeat_N(A):
            return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

        @tf.function
        def triang_solve(args):
            L, rhs = args
            return tf.linalg.triangular_solve(L, rhs)

        @tf.function
        def triang_solve_transpose(args):
            L, rhs = args
            return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

        @tf.function
        def matmul_vectorized(args):
            A, B = args
            return tf.matmul(A, B)

        # [N, D, M, M] --> [N]
        # each term is sum_{d=1}^D Tr[M, M]
        # arg: [D, M, M], needs to be squared
        @tf.function
        def sum_d_trace(arg):
            trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
            return tf.reduce_sum(trace_D)

        # trace of a matrix
        @tf.function
        def trace_tf(A):
            return tf.reduce_sum(tf.linalg.diag_part(A))

        Y = self.data

        qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
        psi0s = expectation(qXs, self.kernel_s)
        psi1s = expectation(qXs, (self.kernel_s, self.Zs))
        psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
        cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        Ls = tf.linalg.cholesky(cov_uu_s)
        Ls = repeat_N(Ls) # [N x M x M]

        # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k, psi2k, then store the psi statistics for all k together
        # for each k: psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
        # psi0 is [N, K] so psi0[n, k] gives a real value
        # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
        # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
        qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)
        psi0k = []
        psi1k = []
        psi2k = []
        psi2ks = []
        psi2sk = []
        for k, kernel_k in enumerate(self.kernel_K):
            psi0 = expectation(qXp, kernel_k)
            psi1 = expectation(qXp, (kernel_k, self.Zp))
            psi2 = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
            psi0k.append(psi0)            
            psi1k.append(psi1)
            psi2k.append(psi2)
            # add the cross-covariance terms, require computation separately for each n
            psi2sk.append(tf.vectorized_map(row_outer_product, (psi1s, psi1)))
            #psi2ks.append(tf.vectorized_map(row_outer_product, (psi1, psi1s)))
        psi0k = tf.stack(psi0k, axis=-1)
        psi1k = tf.stack(psi1k, axis=-1)
        psi2k = tf.stack(psi2k, axis=-1)
        psi2sk = tf.stack(psi2sk, axis=-1)
        #psi2ks = tf.stack(psi2ks, axis=-1)  

        # make K cov_uu_k using Zp and kernel_k
        # K cholesky, repeat N times for later use
        # L is [N x M x M x K]
        # these are the Kuu matrices
        Lk = []
        for k, kernel_k in enumerate(self.kernel_K):
            cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
            Lk.append(tf.linalg.cholesky(cov_uu_k))
        Lk = tf.stack(Lk, axis=-1)
        Lk = repeat_N(Lk)
        
        sigma2 = self.likelihood.variance
        jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())

        tmp = tf.vectorized_map(triang_solve, (Ls, psi2s))
        As = tf.vectorized_map(triang_solve_transpose, (Ls, tmp)) # \inv{Kuu^s} * Psi2s: [N, M, M]

        LBs = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2s) # [N, M, M]  
        tmp1 = tf.vectorized_map(triang_solve, (Ls, LBs)) # [N, M, M]
        Cs = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1)) # sqrt(\inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, M, M]
        Ds = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), Cs)) # sqrt(Ms^T * \inv{Kuu^s} * Psi2s * \inv{Kuu^s} * Ms): [N, D, M]

        Fs = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt_s, perm=[0, 2, 1])), Cs)) # sqrt(Ss * \inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, D, M, M]

        tmp2 = tf.vectorized_map(triang_solve, (Ls, repeat_N(self.q_mu_s)))
        Es = tf.vectorized_map(triang_solve_transpose, (Ls, tmp2)) # \inv{Kuu^s} * Ms: [N, M, D]
        tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1s)) # Y^T * Psi1: [N, D, M]
        Gs = tf.vectorized_map(matmul_vectorized, (tmp3, Es)) # Y^T * Psi1s * \inv{Kuu^s} * Ms: [N, D, D]

        Fq = []
        Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
        for k in range(self.K):
            tmp = tf.vectorized_map(triang_solve, (Lk[..., k], psi2k[..., k])) # [N, M, M]
            Ak = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp)) # \inv{Kuu^k} * Psi2k: [N, M, M]

            LBk = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2k[..., k]) # [N, M, M]  
            tmp1k = tf.vectorized_map(triang_solve, (Lk[..., k], LBk)) # [N, M, M]
            Ck = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp1k)) # sqrt(\inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, M, M]
            Dk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), Ck)) # sqrt(Mk^T * \inv{Kuu^k} * Psi2k * \inv{Kuu^k} * Mk): [N, D, M]

            # q_sqrt is already the cholesky
            Fk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), Ck)) # sqrt(Sk * \inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, D, M, M]

            tmp2 = tf.vectorized_map(triang_solve, (Lk[..., k], repeat_N(self.q_mu[k])))
            Ek = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp2)) # \inv{Kuu^k} * Mk: [N, M, D]
            tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1k[..., k])) # Y^T * Psi1k: [N, D, M]
            Gk = tf.vectorized_map(matmul_vectorized, (tmp3, Ek)) # Y^T * Psi1k * \inv{Kuu^k} * Mk: [N, D, D]

            # compute the cross terms 
            tmp1sk = tf.vectorized_map(triang_solve, (Ls, psi2sk[..., k]))
            tmp2sk = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1sk)) # \inv{Kuu^s} * Psi2sk: [N, M, M]
            tmp3sk = tf.vectorized_map(matmul_vectorized, (tmp2sk, Ek)) # \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, M, D]
            Dsk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), tmp3sk)) # Ms^T * \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, D, D]

            # compute the lower bound
            # each term added here is a length-N vector whose n-th entry is \sum_{d=1}^D F_{dnk} for the current k
            Fnk = -0.5 * Yn2 / sigma2
            Fnk += tf.vectorized_map(trace_tf, Gs + Gk) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Ds) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Dk) / sigma2
            # the sum of the traces of the two cross terms is twice the trace of one, since they are transposes of each other
            Fnk += - tf.vectorized_map(trace_tf, Dsk) / sigma2 
            Fnk += 0.5 * self.D * tf.vectorized_map(trace_tf, As + Ak)  / sigma2 
            Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fs) / sigma2
            Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fk) / sigma2

            Fq.append(Fnk)

        Fq = tf.stack(Fq, axis=-1) # [N, K]
        # psi0k is already [N, K]; psi0s is [N] and is tiled to [N, K] below
        Fq += -0.5 * self.D * (tf.repeat(tf.expand_dims(psi0s, -1), self.K, axis=1) + psi0k) / sigma2
        Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

        # weight each entry by the mixture responsibility, then sum over N, K
        bound = tf.reduce_sum(Fq * self.pi)

        # compute KL 
        KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
        KL_c = self.kl_categorical(self.pi, self.pi_prior)
        KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
        
        prior_Kuu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        KL_us = kullback_leiblers.gauss_kl(q_mu=self.q_mu_s, q_sqrt=self.q_sqrt_s, K=prior_Kuu_s)
        KL_uk = 0
        for k in range(self.K):
            prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
            KL_uk += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu_k)
        bound += - KL_s - KL_p - KL_us - KL_uk - KL_c

        return bound
Example #19
0
class Params:

    ENABLE_XLA = False  # optimizing compiler, see https://www.tensorflow.org/xla
    # set XLA envvars: export XLA_FLAGS=--xla_gpu_cuda_data_dir=/usr/lib/cuda;
    # to enable auto-clustering on CPU: export TF_XLA_FLAGS=--tf_xla_cpu_global_jit

    DEVICE = "GPU:0"
    DTYPE = 'float32'  # should be float32 at least
    USE_MIXED_PRECISION = False  # set if GPU computeCapability >= 7 (https://www.tensorflow.org/guide/mixed_precision)
    SEED = 123
    NUM_ACTORS = 4  # number of parallel distributed actors
    UPDATE_ACTOR_FREQ = 5  # actor parameter update every n episodes

    MIN_STEPS_TRAIN = tf.constant(8000)  # minimum number of steps to train for (logs are deleted if training is interrupted before this)
    MAX_STEPS_TRAIN = tf.constant(400000)  # total number of steps to train for
    MAX_EP_STEPS = tf.constant(1200)  # max steps per episode
    WARM_UP_STEPS = tf.constant(2500)  # number of steps per actor to perform randomly chosen action before predicting

    # Environment params
    # V_min and V_max = Lower and upper bounds of critic value output distribution
    # (should be chosen based on the range of normalised reward values in the chosen env)
    # rule of thumb: V_max = discounted sum of the maximum instantaneous rewards for the maximum episode length
    # V_min = - V_max
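    # A minimal sketch of that rule of thumb (illustrative only; r_max is an
    # assumed per-step maximum normalised reward, not a field of this class):
    #   V_MAX ~= sum(GAMMA**t * r_max for t in range(MAX_EP_STEPS))
    #         == r_max * (1 - GAMMA**MAX_EP_STEPS) / (1 - GAMMA)
    #   V_MIN  = -V_MAX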

    ENV_NAME = "GameTF"  # GameTF or GymTF
    ENV_IMAGE_INPUT = True

    if ENV_NAME == "GymTF":

        if ENV_IMAGE_INPUT:
            raise NotImplementedError
        else:
            ENV_OBS_SPACE = tf.TensorShape((3,))

        ENV_ACT_SPACE = tf.TensorShape((1,))
        ENV_ACT_BOUND = tf.constant([2.])

        ENV_V_MAX = tf.constant(+0.)
        ENV_V_MIN = tf.constant(-400.)
        ENV_REWARD_INF = tf.constant(999.)

    elif ENV_NAME == "GameTF":

        ENV_N_GOALS = tf.constant(4)

        if ENV_IMAGE_INPUT:
            ENV_OBS_SPACE = tf.TensorShape((65, 65, 3))
        else:
            ENV_OBS_SPACE = tf.TensorShape((4 + 2 * int(ENV_N_GOALS),))

        ENV_ACT_SPACE = tf.TensorShape((2,))
        ENV_ACT_BOUND = tf.constant([1.])

        ENV_V_MAX = tf.cast(tf.constant(1 * ENV_N_GOALS), DTYPE)
        ENV_V_MIN = tf.constant(-ENV_V_MAX)
        ENV_REWARD_INF = tf.constant(999.)

    else:
        raise Exception(f"Environment with name {ENV_NAME} not found.")

    # Replay Buffer
    BUFFER_TYPE = "ReverbUniform"  # Uniform, ReverbUniform, ReverbPrioritized todo try change
    BUFFER_SIZE = tf.constant(1000000, dtype=tf.int32)  # must be power of 2 for PER
    BUFFER_PRIORITY_ALPHA = tf.constant(0.6)  # (0.0 = Uniform sampling, 1.0 = Greedy prioritisation)
    BUFFER_PRIORITY_BETA_START = tf.constant(0.4, dtype=tf.float64)  # (0: no bias correction, 1: full bias correction)
    BUFFER_PRIORITY_BETA_END = tf.constant(1.0, dtype=tf.float64)
    BUFFER_PRIORITY_EPSILON = tf.constant(0.00001)

    # Networks
    MINIBATCH_SIZE = tf.constant(256, dtype=tf.int32)
    ACTOR_LEARNING_RATE = tf.constant(0.0001)
    CRITIC_LEARNING_RATE = tf.constant(0.001)
    GAMMA = tf.constant(0.996)  # Discount rate for future rewards
    TAU = tf.constant(0.001, dtype=DTYPE)  # Parameter for soft target network updates
    N_STEP_RETURNS = tf.constant(5)
    BASE_NET_ARCHITECTURE = [512]  # shallow net seems to work best, should be divisible by 16
    NUM_ATOMS = 51  # Number of atoms in output layer of distributional critic
    WITH_BATCH_NORM = tf.constant(True)
    WITH_DROPOUT = tf.constant(False)
    WITH_REGULARIZER = tf.constant(True)

    # Actor Noise
    DT = tf.constant(0.02)
    NOISE_TYPE = "Gaussian"  # Gaussian or OrnsteinUhlenbeck
    NOISE_MU = tf.constant(0.)
    NOISE_SIGMA = tf.constant(0.5)
    NOISE_SIGMA_MIN = tf.constant(5e-3)  # when to stop decreasing sigma
    NOISE_THETA = tf.constant(0.15)
    NOISE_DECAY = tf.constant(0.999253712)
    NOISE_X0 = tf.constant(0.)
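    # Sketch of how these parameters would drive one Ornstein-Uhlenbeck noise
    # step (illustrative pseudocode, not part of this class; x is the previous
    # noise value and normal() a standard-normal sample):
    #   x_next = x + NOISE_THETA * (NOISE_MU - x) * DT + NOISE_SIGMA * sqrt(DT) * normal()
    #   NOISE_SIGMA <- max(NOISE_SIGMA * NOISE_DECAY, NOISE_SIGMA_MIN)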

    # Video Recorder
    RECORD_VIDEO = tf.constant(True)
    RECORD_VIDEO_TYPE = "GIF"  # GIF or MP4
    FRAME_SIZE = tf.constant([65., 65.])
    RECORD_START_EP = tf.constant(500)  # start recording at episode n
    RECORD_FREQ = tf.constant(150)  # record an episode and save it to a video file every n episodes
    RECORD_STEP_FREQ = tf.constant(3)  # record every n-th step (skipping the steps in between)

    LOG_TENSORBOARD = tf.constant(True)  # start with: $ tensorboard --logdir logs --reload_interval 5
    LOG_CONSOLE = tf.constant(False)  # print logs to console
    ACTOR_LOG_STEPS = tf.constant(25)  # log actor status every n episodes
    LEARNER_LOG_STEPS = tf.constant(200)  # log learner status every n learner steps
    TENSORFLOW_PROFILER = tf.constant(False)
    PLOT_MODELS = tf.constant(False)  # plot model summary
    SAVE_MODEL = tf.constant(True)  # save actor network after training

    """
    
    
    
    
    
    
    
    
    
    
    """

    # Calculate some params
    assert tf.reduce_all(tf.equal(tf.constant(ENV_OBS_SPACE[0:2], dtype=DTYPE), FRAME_SIZE[0:2])), \
        f"ENV_OBS_SPACE ({ENV_OBS_SPACE}) must match FRAME_SIZE ({FRAME_SIZE}) in first two dims"
    BUFFER_PRIORITY_BETA_INCREMENT = tf.divide((BUFFER_PRIORITY_BETA_END - BUFFER_PRIORITY_BETA_START),
                                               tf.cast(MAX_STEPS_TRAIN, BUFFER_PRIORITY_BETA_START.dtype))
    BUFFER_FROM_REVERB = tf.constant(True) if BUFFER_TYPE in ("ReverbUniform", "ReverbPrioritized") else tf.constant(False)
    BUFFER_IS_PRIORITIZED = True if BUFFER_TYPE == "ReverbPrioritized" else False
    if BUFFER_FROM_REVERB:
        BUFFER_DATA_SPEC = (
            tf.TensorSpec(ENV_OBS_SPACE, dtype=DTYPE, name="state"),
            tf.TensorSpec(ENV_ACT_SPACE, dtype=DTYPE, name="action"),
            tf.TensorSpec((N_STEP_RETURNS,), dtype=DTYPE, name="rewards_stack"),
            tf.TensorSpec((), dtype=tf.bool, name="terminal"),
            tf.TensorSpec(ENV_OBS_SPACE, dtype=DTYPE, name="state2"),
            tf.TensorSpec((NUM_ATOMS,), dtype=DTYPE, name="target_z_atoms"),
        )
        if BUFFER_IS_PRIORITIZED:
            BUFFER_PRIORITY_TABLE_NAMES = tf.constant([BUFFER_TYPE, BUFFER_TYPE + "_max", BUFFER_TYPE + "_min"])
    else:
        BUFFER_DATA_SPEC = (
            tf.TensorSpec(ENV_OBS_SPACE, dtype=DTYPE, name="state"),
            tf.TensorSpec(ENV_ACT_SPACE, dtype=DTYPE, name="action"),
            tf.TensorSpec((1,), dtype=DTYPE, name="reward"),
            tf.TensorSpec((1,), dtype=tf.bool, name="terminal"),
            tf.TensorSpec(ENV_OBS_SPACE, dtype=DTYPE, name="state2"),
            tf.TensorSpec((1,), dtype=DTYPE, name="gamma**N"),
        )
    BUFFER_DATA_SPEC_DTYPES = tuple(spec.dtype for spec in BUFFER_DATA_SPEC)
    BUFFER_DATA_SPEC_SHAPES = tuple(spec.shape for spec in BUFFER_DATA_SPEC)
    GAMMAS = tf.vectorized_map(
        lambda n, gamma=GAMMA: tf.math.pow(gamma, n),
        tf.range(N_STEP_RETURNS, dtype=DTYPE)
    )
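    # e.g. with GAMMA = 0.996 and N_STEP_RETURNS = 5 this yields approximately
    # [1.0, 0.996, 0.992, 0.988, 0.984]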
    GAMMAS2 = tf.repeat(GAMMAS, 2)
    Z_ATOMS = tf.linspace(ENV_V_MIN, ENV_V_MAX, NUM_ATOMS)
    Z_ATOMS_ZEROS = tf.zeros_like(Z_ATOMS)
    DO_LOGGING = tf.logical_or(LOG_TENSORBOARD, LOG_CONSOLE)
Example #20
0
def sum_d_trace(arg):
    # For each slice along the leading dimension, the sum of squared entries
    # equals trace(x @ x^T); these per-slice traces are then summed.
    trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
    return tf.reduce_sum(trace_D)
Example #21
0
    def _mosaic(self, images, boxes, mosaic_divide_points):
        """Builds mosaic of provided images.
    Args:
      images: original single images to make mosaic.
      boxes: corresponding bounding boxes to images.
      mosaic_divide_points: Points to build mosaic around on given output size.
    Returns:
      A tuple of mosaic Image, Mosaic Boxes merged.
    """
        (
            mosaic_image_topleft,
            mosaic_image_topright,
            mosaic_image_bottomleft,
            mosaic_image_bottomright,
        ) = self._scale_images(images, mosaic_divide_points)

        #####################################################
        # Scale Boxes for TOP LEFT image.
        # Note: the code below is involved because TF tensors do not support item assignment.
        # map_fn is replaced with vectorized_map below for performance.
        mosaic_box_topleft = tf.transpose(
            tf.vectorized_map(
                functools.partial(
                    self._scale_box,
                    image=images[0],
                    mosaic_image=mosaic_image_topleft,
                ),
                boxes[0],
            ))

        # Scale and Pad Boxes for TOP RIGHT image.

        mosaic_box_topright = tf.vectorized_map(
            functools.partial(
                self._scale_box,
                image=images[1],
                mosaic_image=mosaic_image_topright,
            ),
            boxes[1],
        )
        num_boxes = boxes[1].shape[0]
        idx_tp = tf.constant([[1], [3]])
        update_tp = [
            [tf.shape(mosaic_image_topleft)[0]] * num_boxes,
            [tf.shape(mosaic_image_topleft)[0]] * num_boxes,
        ]
        mosaic_box_topright = tf.transpose(
            tf.tensor_scatter_nd_add(mosaic_box_topright, idx_tp, update_tp))

        # Scale and Pad Boxes for BOTTOM LEFT image.

        mosaic_box_bottomleft = tf.vectorized_map(
            functools.partial(
                self._scale_box,
                image=images[2],
                mosaic_image=mosaic_image_bottomleft,
            ),
            boxes[2],
        )

        num_boxes = boxes[2].shape[0]
        idx_bl = tf.constant([[0], [2]])
        update_bl = [
            [tf.shape(mosaic_image_topleft)[1]] * num_boxes,
            [tf.shape(mosaic_image_topleft)[1]] * num_boxes,
        ]
        mosaic_box_bottomleft = tf.transpose(
            tf.tensor_scatter_nd_add(mosaic_box_bottomleft, idx_bl, update_bl))

        # Scale and Pad Boxes for BOTTOM RIGHT image.
        mosaic_box_bottomright = tf.vectorized_map(
            functools.partial(
                self._scale_box,
                image=images[3],
                mosaic_image=mosaic_image_bottomright,
            ),
            boxes[3],
        )

        num_boxes = boxes[3].shape[0]
        idx_br = tf.constant([[0], [2], [1], [3]])
        update_br = [
            [tf.shape(mosaic_image_topright)[1]] * num_boxes,
            [tf.shape(mosaic_image_topright)[1]] * num_boxes,
            [tf.shape(mosaic_image_bottomleft)[0]] * num_boxes,
            [tf.shape(mosaic_image_bottomleft)[0]] * num_boxes,
        ]
        mosaic_box_bottomright = tf.transpose(
            tf.tensor_scatter_nd_add(mosaic_box_bottomright, idx_br,
                                     update_br))

        # Gather mosaic_sub_images and boxes.
        mosaic_images = [
            mosaic_image_topleft,
            mosaic_image_topright,
            mosaic_image_bottomleft,
            mosaic_image_bottomright,
        ]
        mosaic_boxes = [
            mosaic_box_topleft,
            mosaic_box_topright,
            mosaic_box_bottomleft,
            mosaic_box_bottomright,
        ]

        return mosaic_images, mosaic_boxes
Example #22
0
def main(argv):
    del argv
    horizon = 6
    num_users = 5
    num_topics = 3
    slate_size = 4
    num_iters = 100
    # Set sensitivity to 0.8 for all users to generate trajectories.
    variables = simulation_config.create_latent_variable_model_network(
        num_users=num_users, num_topics=num_topics, slate_size=slate_size)
    data_generation_network = network_lib.Network(variables=variables)
    tf_runtime = runtime.TFRuntime(network=data_generation_network)
    traj = dict(tf_runtime.trajectory(length=horizon))
    print('===============GROUND TRUTH LIKELIHOOD================')
    print(
        log_probability.log_probability_from_value_trajectory(
            variables=variables, value_trajectory=traj, num_steps=horizon - 1))
    print('======================================================')

    # Try to recover the sensitivity, starting from 0.1 for all users.
    sensitivity_var = tf.Variable(
        [0.1] * num_users,
        dtype=tf.float32,
        constraint=lambda x: tf.clip_by_value(x, 0.0, 1.0))
    story = lambda: simulation_config.create_latent_variable_model_network(  # pylint: disable=g-long-lambda
        num_users=num_users,
        num_topics=num_topics,
        slate_size=slate_size,
        satisfaction_sensitivity=sensitivity_var)
    trainable_vars = entity.story_with_trainable_variables(
        story)[1]['ModelLearningDemoUser']

    def unnormalized_log_prob_train(intent):
        # Hold out the user intent in the trajectories.
        intent_traj = tf.expand_dims(intent, axis=0) + tf.zeros(
            (horizon, num_users, num_topics))
        user_state_dict = dict(traj['user state'].as_dict)
        user_state_dict['intent'] = intent_traj
        traj['user state'] = Value(**user_state_dict)
        return log_probability.log_probability_from_value_trajectory(
            variables=story(), value_trajectory=traj, num_steps=horizon - 1)

    # Initialize the HMC transition kernel.
    num_results = int(2e3)
    num_burnin_steps = int(5e2)
    adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
        tfp.mcmc.HamiltonianMonteCarlo(
            target_log_prob_fn=unnormalized_log_prob_train,
            num_leapfrog_steps=5,
            step_size=.00008),
        num_adaptation_steps=int(num_burnin_steps * 0.8))

    # Run the chain (with burn-in).
    @tf.function
    def run_chain():
        samples, is_accepted = tfp.mcmc.sample_chain(
            num_results=num_results,
            num_burnin_steps=num_burnin_steps,
            current_state=tfd.Normal(loc=tf.ones(
                (num_users, num_topics)) / num_users,
                                     scale=0.5).sample(),
            kernel=adaptive_hmc,
            trace_fn=lambda _, pkr: pkr.inner_results.is_accepted)

        sample_mean = tf.reduce_mean(samples)
        sample_stddev = tf.math.reduce_std(samples)
        is_accepted = tf.reduce_mean(tf.cast(is_accepted, dtype=tf.float32))
        return samples, sample_mean, sample_stddev, is_accepted

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.02)
    for _ in range(num_iters):
        posterior_samples, sample_mean, sample_stddev, is_accepted = run_chain()
        print('mean:{:.4f}  stddev:{:.4f}  acceptance:{:.4f}'.format(
            sample_mean.numpy(), sample_stddev.numpy(), is_accepted.numpy()))
        log_probs = []
        with tf.GradientTape() as tape:
            log_probs = tf.vectorized_map(
                unnormalized_log_prob_train,
                posterior_samples[num_burnin_steps:, ])
            log_prob = -tf.reduce_mean(log_probs)
        grads = tape.gradient(log_prob, trainable_vars)
        optimizer.apply_gradients(zip(grads, trainable_vars))
        print(trainable_vars[0].numpy(), tf.reduce_mean(log_probs).numpy())
Example #23
0
def sample_posterior_predictive(
    model: ModelType,
    trace: InferenceData,
    var_names: Optional[Union[str, List[str]]] = None,
    observed: Optional[Dict[str, Any]] = None,
    use_auto_batching: bool = True,
    inplace: bool = True,
) -> InferenceData:
    """
    Draw ``sample_shape`` values from the model for the desired ``var_names``.

    Parameters
    ----------
    model : types.GeneratorType, pymc4.Model
        Model to draw samples from
    trace: ArviZ's InferenceData object
        The samples drawn from the model's posterior distribution that should be used for sampling
        from the posterior predictive
    var_names: Optional[Union[str, List[str]]]
        The list of variable names that will be included in the returned samples. Strings can be
        used to specify a single variable. If ``None``, the samples drawn for all observed
        distributions will be returned in the ``Samples`` dictionary.
    observed : Optional[Dict[str, Any]]
        A dictionary that can be used to override the distribution observed values defined in the
        model.
    use_auto_batching: bool
        A bool value that indicates whether ``sample_posterior_predictive`` should automatically
        batch the draws or not. If you are sure you have manually tuned your model to be fully
        vectorized, then you can set this to ``False``, and your sampling should be faster than the
        auto batched counterpart. If you are not sure if your model is vectorized, then auto
        batching will safely sample from it but with some additional overhead.
    inplace: bool
        If True (default), a posterior_predictive group is added to the provided ``trace``
        instead of returning a new InferenceData object. If a posterior_predictive group is
        already present in ``trace``, it will be overwritten.

    Returns
    -------
    Samples: InferenceDataType
        An ArviZ's InferenceData object with a posterior_predictive group

    Examples
    --------
    Lets define a simple model to sample from

    >>> import pymc4 as pm
    >>> @pm.model
    ... def model():
    ...     sd = yield pm.HalfNormal("sd", 5.)
    ...     norm = yield pm.Normal("n", 0, sd, observed=np.random.randn(100))

    Now, we may want to draw samples from the model's posterior to then sample from the posterior
    predictive.

    >>> trace = pm.inference.sampling.sample(model())
    >>> ppc = pm.sample_posterior_predictive(model(), trace).posterior_predictive

    The samples are returned as a dictionary with the variable names as keys

    >>> sorted(list(ppc))
    ['model/n']

    The drawn values are the dictionary's values, and their shape will depend
    on the supplied ``trace``

    >>> ppc["model/n"].shape
    (10, 1000, 100)

    """
    if var_names is not None and len(var_names) == 0:
        raise ValueError("Supplied an empty var_names list to sample from")
    if isinstance(var_names, str):
        var_names = [var_names]

    # If we don't have to deal with auto-batching we can simply evaluate_model
    # passing the trace as values
    if not use_auto_batching:
        values = {
            var_name: tf.convert_to_tensor(value)
            for var_name, value in trace.posterior.items()
        }
        # We need to pass the number of chains and draws as sample_shape for
        # observed conditionally independent variables
        sample_shape = (trace.posterior.sizes["chain"],
                        trace.posterior.sizes["draw"])
        _, state = evaluate_model_posterior_predictive(
            model, values=values, observed=observed, sample_shape=sample_shape)
        all_values = collections.ChainMap(state.all_values,
                                          state.deterministics_values)
        if var_names is None:
            var_names = list(state.posterior_predictives)
        output = {k: all_values[k] for k in var_names}
        return trace_to_arviz(trace=trace,
                              posterior_predictive=output,
                              inplace=inplace)

    # We cannot assume that the model is vectorized, so we have to batch the
    # pm.evaluate_model_posterior_predictive calls across the trace entries.
    # This brings one big problem: we need to infer the batch dimensions from
    # the trace. To do this, we will:
    # 1) A single forward pass with the meta executor to determine the
    #    variable's shapes (we'll call these the core shapes)
    # 2) Go through the supplied trace to get each variable's batch shapes
    #    (the shapes to the left of the core shapes)
    # 3) Broadcast the encountered batch shapes between each other as a sanity
    #    check to get the global trace's batch_shape
    # 4) Broadcast the values in the trace to the global batch_shape to get
    #    each variable's broadcasted value.
    # 5) As tf.vectorized_map only iterates across the first dimension, we want
    #    to flatten the batch dimensions. To do this, we reshape the broadcasted
    #    values to (-1,) + core_shape. This way, tf.vectorized_map will be able
    #    to vectorize across the entire batch
    # 6) Collect the samples, reshape them to batch_shape + core_shape, and
    #    return them (a compact standalone sketch of this flatten/vectorize/
    #    reshape pattern follows this function)

    # Do a single forward pass to infer the distributions core shapes and
    # default observeds
    _, state = evaluate_meta_posterior_predictive_model(model,
                                                        observed=observed)
    if var_names is None:
        var_names = list(state.posterior_predictives)
    else:
        defined_variables = set(state.all_values) | set(
            state.deterministics_values)
        if not set(var_names) <= defined_variables:
            raise KeyError(
                "The supplied var_names = {} are not defined in the model.\n"
                "Defined variables are = {}".format(
                    list(set(var_names) - defined_variables),
                    list(defined_variables)))

    # Get the global batch_shape
    batch_shape = tf.TensorShape([])
    # Get a copy of trace because we may manipulate the dictionary later in this
    # function
    posterior = trace.posterior.copy()  # type: ignore
    posterior_names = list(posterior)
    for var_name in posterior_names:
        values = tf.convert_to_tensor(posterior[var_name].values)
        try:
            core_shape = state.all_values[var_name].shape
        except KeyError:
            if var_name in state.deterministics_values:
                # Remove the deterministics from the trace
                del posterior[var_name]
                continue
            else:
                raise TypeError(
                    "Supplied the variable {} in the trace, yet this variable is "
                    "not defined in the model: {!r}".format(var_name, state))
        assert_values_compatible_with_distribution_shape(
            var_name,
            values,
            batch_shape=tf.TensorShape([]),
            event_shape=core_shape)
        batch_shape = tf.TensorShape(
            tf.broadcast_static_shape(
                values.shape[:len(values.shape) -
                             len(core_shape)],  # type: ignore
                batch_shape,
            ))

    # Flatten the batch axis
    flattened_posterior = []
    for k, v in posterior.items():
        core_shape = tf.TensorShape(state.all_values[k].shape)
        batched_val = tf.broadcast_to(v.values, batch_shape + core_shape)
        flattened_posterior.append(
            tf.reshape(batched_val, shape=[-1] + core_shape.as_list()))
    posterior_vars = list(posterior)
    # Setup the function that makes a single draw
    @tf.function(autograph=False)
    def single_draw(elems):
        values = dict(zip(posterior_vars, elems))
        _, st = evaluate_model_posterior_predictive(model,
                                                    values=values,
                                                    observed=observed)
        return tuple([
            (st.untransformed_values[k] if k in st.untransformed_values else
             (st.deterministics_values[k]
              if k in st.deterministics_values else st.transformed_values[k]))
            for k in var_names
        ])

    # Make draws in parallel across the batch elements with tf.vectorized_map
    samples = tf.vectorized_map(single_draw, flattened_posterior)
    # Convert the samples to ndarrays and make a dictionary with the correct
    # batch_shape + core_shape
    output = dict()
    for name, sample in zip(var_names, samples):
        sample = sample.numpy()
        output[name] = np.reshape(sample, batch_shape + sample.shape[1:])
    return trace_to_arviz(trace=trace,
                          posterior_predictive=output,
                          inplace=inplace)
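
A compact, self-contained sketch of the flatten/vectorize/reshape pattern described in the
numbered comments above; the shapes and the per-draw function are illustrative stand-ins,
not part of pymc4:

import tensorflow as tf

batch_shape = tf.TensorShape([2, 3])   # e.g. (chain, draw)
core_shape = tf.TensorShape([4])       # per-variable core shape
values = tf.random.normal((batch_shape + core_shape).as_list())

# flatten the batch dimensions so vectorized_map iterates over every draw
flat = tf.reshape(values, [-1] + core_shape.as_list())              # (6, 4)
# stand-in for single_draw: any per-draw computation over the core shape
mapped = tf.vectorized_map(lambda v: tf.reduce_sum(v ** 2), flat)   # (6,)
# restore the original batch shape
restored = tf.reshape(mapped, batch_shape.as_list())                # (2, 3)
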
Example #24
0
    def K(self, X, X2=None):
        """
        calc rbf similarity for each node. nodes calc could be independent/the same/correlated
        how about employing adj? kernel matrix of different nodes have correlation.
        
        There are two ways to parallel:
        tf.map_fn(lambda x: scaled_square_dist(x[0], x[1]), (A, B), dtype=tf.float32)
        tf.vectorized_map(RBFNodes.rbf, (X_, X2_))
        :param X: (s1, n, d)
        :param X2: (s2, n, d)
        :return: K(X, X2) = (n, s1, s2)
        """
        assert tf.shape(X).shape[0] == 3

        #print(X.shape, self.lengthscales.shape)
        X_ = tf.transpose(X, [1, 0, 2])  # (n, s1, d)
        if X2 is None:
            if self.kern_type == 'RBF':
                return tf.vectorized_map(RBFNodes.rbf_self, (X_, self.lengthscales, self.variance))  # (n, s1, s1)
            elif self.kern_type == 'Matern12':
                return tf.vectorized_map(RBFNodes.m12_self, (X_, self.lengthscales, self.variance))  # (n, s1, s1)
            elif self.kern_type == 'Matern32':
                return tf.vectorized_map(RBFNodes.m32_self, (X_, self.lengthscales, self.variance))  # (n, s1, s1)
            elif self.kern_type == 'Matern52':
                return tf.vectorized_map(RBFNodes.m52_self, (X_, self.lengthscales, self.variance))  # (n, s1, s1)
            elif self.kern_type == 'Poly1':
                return tf.vectorized_map(RBFNodes.poly_self, (X_, self.lengthscales, self.variance))  # (n, s1, s1)
            elif self.kern_type == 'Poly2':
                return tf.vectorized_map(RBFNodes.poly_self2, (X_, self.lengthscales, self.variance))  # (n, s1, s1)
        else:
            X2_ = tf.transpose(X2, [1, 0, 2])  # (n, s1, d)
            if self.kern_type == 'RBF':
                return tf.vectorized_map(RBFNodes.rbf, (X_, X2_, self.lengthscales, self.variance))  # (n, s1, s2)
            elif self.kern_type == 'Matern12':
                return tf.vectorized_map(RBFNodes.m12, (X_, X2_, self.lengthscales, self.variance))  # (n, s1, s2)
            elif self.kern_type == 'Matern32':
                return tf.vectorized_map(RBFNodes.m32, (X_, X2_, self.lengthscales, self.variance))  # (n, s1, s2)
            elif self.kern_type == 'Matern52':
                return tf.vectorized_map(RBFNodes.m52, (X_, X2_, self.lengthscales, self.variance))  # (n, s1, s2)
            elif self.kern_type == 'Poly1':
                return tf.vectorized_map(RBFNodes.poly, (X_, X2_, self.lengthscales, self.variance))  # (n, s1, s2)
            elif self.kern_type == 'Poly2':
                return tf.vectorized_map(RBFNodes.poly2, (X_, X2_, self.lengthscales, self.variance))  # (n, s1, s2)
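
For reference, a minimal sketch contrasting the two parallelization routes mentioned in the
docstring above; pairwise_sqdist stands in for any per-node pairwise function and is not part
of the original class:

import tensorflow as tf

def pairwise_sqdist(a, b):
    # (s1, d), (s2, d) -> (s1, s2) squared distances for one node
    return tf.reduce_sum((a[:, None, :] - b[None, :, :]) ** 2, axis=-1)

A = tf.random.normal([7, 5, 3])  # (n, s1, d)
B = tf.random.normal([7, 6, 3])  # (n, s2, d)

# Option 1: tf.map_fn with an explicit output signature
K1 = tf.map_fn(lambda x: pairwise_sqdist(x[0], x[1]), (A, B),
               fn_output_signature=tf.float32)

# Option 2: tf.vectorized_map, typically faster when the body vectorizes cleanly
K2 = tf.vectorized_map(lambda x: pairwise_sqdist(x[0], x[1]), (A, B))
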
Example #25
0
    def model(X, sample_bias=False, bias_sample_range=None):
        """Defines a Bayesian Neural Network for regression

        Note:
            Output weight prior variance is set to: 10/K

        Args:
            # X: (np.ndarray of NP_DTYPE)  A matrix of input features between (0, 1),
            #     shape (n, d).
            # weight_list: (list of tf.Tensor) A list of hidden weight Tensors,
            #     each element has dtype = TF_DTYPE, and shape is (n_feature, n_node)
            #     (if input weight), (n_node, n_node) if hidden weight,
            #     and (n_node, 1) if output weight
            # bias_list: (list of tf.Tensor) A list of bias term for each layer,
            #     each element has dtype = TF_DTYPE, shape = (,).
            bias_sample_range: (list of int or None) the indices of the covariates to sample bias for.
                If `None`, only the first covariate is sampled.

        Returns:
            (tf.Tensor) The output distribution.
        """

        # define architecture
        X = tf.convert_to_tensor(X, dtype=dtype_util.TF_DTYPE)
        n_sample, n_feature = X.shape.as_list()
        layer_size = [n_feature] + [n_node] * n_layer + [1]

        # initialize model building
        weight_list = []
        bias_list = []

        # input layer
        # input = tf.get_variable(initializer=X,
        #                         trainable=False,
        #                         dtype=dtype_util.TF_DTYPE,
        #                         name="input")
        input = X
        net = input

        # for (layer_id, (weights, biases)) in enumerate(zip(weight_list[:-1], bias_list[:-1])):
        #     with tf.variable_scope(scope_list[layer_id].original_name_scope):
        #         net = net_util.Dense(net, weights, biases, activation=activation)

        # hidden layers
        for layer_id in range(len(layer_size) - 1):
            with tf.variable_scope("layer_{}".format(layer_id), reuse=True):
                # configure hidden weight
                weight_shape = (layer_size[layer_id], layer_size[layer_id + 1])
                weight_scale = hidden_weight_sd if layer_id < n_layer else output_weight_sd

                # define random variables
                bias_rv = ed.Normal(loc=0.,
                                    scale=hidden_weight_sd,
                                    name="bias_{}".format(layer_id))
                weight_rv = ed.Normal(loc=0.,
                                      scale=tf.ones(shape=weight_shape) *
                                      weight_scale,
                                      name="weight_{}".format(layer_id))
                # add to list for easy access
                bias_list += [bias_rv]
                weight_list += [weight_rv]

                # produce output, optionally, store output-layer hidden nodes
                if sample_bias:
                    if layer_id == n_layer:
                        phi = net  # shape (n_sample, n_node)
                net = net_util.Dense(
                    net,
                    weight_rv,
                    bias_rv,
                    activation=None if layer_id == n_layer else activation)

        # final output layer
        with tf.variable_scope("output"):
            # produce output prediction
            y_mean = net[:, 0]  # shape (n,), i.e. the number of data samples
            std_devs = 1.

            # estimate variable importance (i.e. squared gradient norm)
            y_mean_grad = tf.gradients(y_mean, X)[0]
            var_imp = tf.reduce_mean(y_mean_grad**2, axis=0)

            # estimate variable importance bias
            if sample_bias:
                phi_grad = tf.vectorized_map(  # shape (n_node, n_sample, n_feature)
                    lambda phi_k: tf.gradients(phi_k, X)[0], tf.transpose(phi))
                phi_grad2 = tf.vectorized_map(  # shape (n_feature, n_node, n_node)
                    lambda dphi: tf.matmul(dphi, dphi, transpose_b=True),
                    tf.transpose(phi_grad, [2, 0, 1]))

                phi2_inv = tfp.math.pinv(  # shape (n_node, n_node)
                    tf.matmul(phi, phi, transpose_a=True))

                var_imp_mat = tf.tensordot(  # shape (n_feature, n_node, n_node)
                    phi_grad2,
                    phi2_inv,
                    axes=[2, 0])
                var_imp_bias = tf.vectorized_map(tf.linalg.trace, var_imp_mat)

                # phi_grad = tf.stack(  # shape (n_node, n_sample, n_feature)
                #     [tf.gradients(phi[:, k], X)[0] for k in range(n_node)])
                # phi_inv = tfp.math.pinv(phi)  # shape (n_node, n_sample)
                #
                # var_imp_bias = tf.stack([
                #     tf.reduce_sum(tf.matmul(
                #         phi_grad[:, :, p], phi_inv, transpose_a=True) ** 2)
                #     for p in range(n_feature)])
                #
                var_imp_bias = var_imp_bias / n_sample
            else:
                var_imp_bias = None

            y = ed.Normal(loc=y_mean, scale=std_devs, name="y")

        return y, y_mean, var_imp, var_imp_bias, weight_list, bias_list
Example #26
0
def vectorized_map_reduce(features, coords, indices):
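    # `indices` is presumably a tf.RaggedTensor of per-point neighbour indices:
    # densify it with -1 padding, mark the padded entries as invalid, then map
    # the reduction returned by get_fn(features, coords) over each point.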
    indices_rect = indices.to_tensor(-1)
    invalid = tf.equal(indices_rect, -1)
    return tf.vectorized_map(get_fn(features, coords),
                             (indices_rect, invalid, coords))
Example #27
0
def batched_histogram_2d(eta, phi, w_px, w_py, x_range, y_range, nbins, bin_dtype=tf.float32):
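    # Map the (assumed per-example) histogram_2d over the leading batch
    # dimension shared by eta, phi, w_px and w_py.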
    return tf.vectorized_map(
        lambda a: histogram_2d(a[0], a[1], a[2], a[3], x_range, y_range, nbins, bin_dtype), (eta, phi, w_px, w_py)
    )
Example #28
0
def pts2img(pts, angs):
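	# Map pts2img_one over corresponding (pts, angs) pairs, then split the
	# resulting complex image into real and imaginary parts.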
	#pts, angs=args
	#p=pts2img_one((pts[0], angs[0]))
	img=tf.vectorized_map(pts2img_one, (pts, angs))
	return tf.math.real(img), tf.math.imag(img)
Example #29
0
def vectorized_logpfn(*q_samples):
    # Evaluate logpfn once per batch element by mapping over the leading
    # dimension of every tensor in q_samples.
    return tf.vectorized_map(lambda samples: logpfn(*samples), q_samples)
Example #30
0
    def process_gradients(self, loss, var_list, tape):
        # assert type(loss) is list
        if not isinstance(loss, list):
            loss = [loss]

        num_tasks = self.num_tasks if len(loss) == self.num_tasks else len(
            loss)
        random.shuffle(loss)

        # process gradients
        # Compute per-task gradients.
        grads_task = []
        for l in loss:
            grads = tape.gradient(l, var_list, unconnected_gradients='zero')
            l_grads = []
            for grad in grads:
                if grad is not None:
                    l_grads.append(tf.reshape(grad, [-1]))
            l_grads = tf.concat(l_grads, 0)
            grads_task.append(l_grads)
        grads_task = tf.stack(grads_task, 0)
        '''
        [
            [par1_loss1_grad, par2_loss1_grad, ..., parN_loss1_grad], 
            [par1_loss2_grad, par2_loss2_grad, .... parN_loss2_grad], 
                ...
            [par1_lossM_grad, par2_lossM_grad, .... parN_lossM_grad], 
        ], # flatten parameters and gradients, N is the total number of parameters 
        '''

        # Compute gradient projections.
        def proj_grad(grad_task):
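            # PCGrad-style projection: if grad_task conflicts with task k's
            # gradient (negative inner product), subtract its projection onto
            # grads_task[k]; the tf.minimum(., 0.) leaves non-conflicting
            # gradients unchanged.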
            for k in range(num_tasks):
                inner_product = tf.reduce_sum(grad_task * grads_task[k])
                proj_direction = inner_product / \
                    tf.reduce_sum(grads_task[k]*grads_task[k])
                grad_task = grad_task - \
                    tf.minimum(proj_direction, 0.) * grads_task[k]
            return grad_task

        proj_grads_flatten = tf.vectorized_map(proj_grad, grads_task)

        # Unpack flattened projected gradients back to their original shapes.
        proj_grads = []
        for j in range(num_tasks):
            start_idx = 0
            for idx, var in enumerate(var_list):
                grad_shape = var.get_shape()
                flatten_dim = np.prod([
                    grad_shape.dims[i].value
                    for i in range(len(grad_shape.dims))
                ])
                proj_grad = proj_grads_flatten[j][start_idx:start_idx +
                                                  flatten_dim]
                proj_grad = tf.reshape(proj_grad, grad_shape)
                if len(proj_grads) < len(var_list):
                    proj_grads.append(proj_grad)
                else:
                    proj_grads[idx] += proj_grad
                start_idx += flatten_dim
        grads_and_vars = list(zip(proj_grads, var_list))

        return grads_and_vars
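
For context, a minimal sketch of how such a routine might be driven; the model, batch, loss
and optimizer names are illustrative placeholders, and the tape must be persistent because a
gradient is taken once per task loss:

import tensorflow as tf

with tf.GradientTape(persistent=True) as tape:
    # one scalar loss per task, all recorded on the same tape
    losses = [loss_fn(model, batch) for loss_fn in task_loss_fns]
grads_and_vars = pcgrad.process_gradients(losses, model.trainable_variables, tape)
optimizer.apply_gradients(grads_and_vars)
del tape  # release the persistent tape's resources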