Example #1
  Returns:
    (logits_1, logits_2), couplings
    where `couplings` is a dictionary whose keys include each of the experiments
    along with baselines, and the values are coupling matrices.
  """
    logits_key, vis_key = jax.random.split(jax.random.PRNGKey(seed))

    if logits_1 is None and logits_2 is None:
        logits_1, logits_2 = experiments[0].logit_pair_distribution_fn(
            logits_key, **(logit_kwargs or {}))

    logits_1 -= jax.scipy.special.logsumexp(logits_1)
    logits_2 -= jax.scipy.special.logsumexp(logits_2)

    probs_1 = jnp.exp(logits_1)
    probs_2 = jnp.exp(logits_2)

    independent_coupling = probs_1[:, None] * probs_2[None, :]
    gumbel_max_estimate = coupling_util.joint_from_samples(
        coupling_util.gumbel_max_sampler,
        logits_1,
        logits_2,
        vis_key,
        num_joint_samples,
        loop_size=500)
    icdf = coupling_util.inverse_cdf_coupling(logits_1, logits_2)
    icdf_perm = coupling_util.permuted_inverse_cdf_coupling(logits_1, logits_2)

    couplings = {
        "Independent": independent_coupling,
Example #2
def diag_gaussian_logpdf(x, mean, log_std):
    # Evaluate the log-density of a diagonal multivariate Gaussian at a single point.
    return np.sum(vmap(norm.logpdf)(x, mean, np.exp(log_std)))
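# A quick sanity check of the function above (a sketch, not part of the
# original example; it assumes `np` is jax.numpy and `norm` is
# jax.scipy.stats.norm, as in the snippet): the sum of per-dimension normal
# log-densities should agree with a full multivariate normal whose
# covariance is diagonal.
import jax.numpy as jnp
from jax.scipy.stats import multivariate_normal

x = jnp.array([0.5, -1.0])
mean = jnp.zeros(2)
log_std = jnp.array([0.1, -0.3])
cov = jnp.diag(jnp.exp(log_std) ** 2)
# diag_gaussian_logpdf(x, mean, log_std) should match this value.
print(multivariate_normal.logpdf(x, mean, cov))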
Example #3
def main(args):
    print("Start vanilla HMC...")
    nuts_kernel = NUTS(dual_moon_model)
    mcmc = MCMC(nuts_kernel, args.num_warmup, args.num_samples, num_chains=args.num_chains,
                progress_bar=False if "NUMPYRO_SPHINXBUILD" in os.environ else True)
    mcmc.run(random.PRNGKey(0))
    mcmc.print_summary()
    vanilla_samples = mcmc.get_samples()['x'].copy()

    guide = AutoBNAFNormal(dual_moon_model, hidden_factors=[args.hidden_factor, args.hidden_factor])
    svi = SVI(dual_moon_model, guide, optim.Adam(0.003), Trace_ELBO())
    svi_state = svi.init(random.PRNGKey(1))

    print("Start training guide...")
    last_state, losses = lax.scan(lambda state, i: svi.update(state), svi_state, jnp.zeros(args.num_iters))
    params = svi.get_params(last_state)
    print("Finish training guide. Extract samples...")
    guide_samples = guide.sample_posterior(random.PRNGKey(2), params,
                                           sample_shape=(args.num_samples,))['x'].copy()

    print("\nStart NeuTra HMC...")
    neutra = NeuTraReparam(guide, params)
    neutra_model = neutra.reparam(dual_moon_model)
    nuts_kernel = NUTS(neutra_model)
    mcmc = MCMC(nuts_kernel, args.num_warmup, args.num_samples, num_chains=args.num_chains,
                progress_bar=False if "NUMPYRO_SPHINXBUILD" in os.environ else True)
    mcmc.run(random.PRNGKey(3))
    mcmc.print_summary()
    zs = mcmc.get_samples(group_by_chain=True)["auto_shared_latent"]
    print("Transform samples into unwarped space...")
    samples = neutra.transform_sample(zs)
    print_summary(samples)
    zs = zs.reshape(-1, 2)
    samples = samples['x'].reshape(-1, 2).copy()

    # make plots

    # guide samples (for plotting)
    guide_base_samples = dist.Normal(jnp.zeros(2), 1.).sample(random.PRNGKey(4), (1000,))
    guide_trans_samples = neutra.transform_sample(guide_base_samples)['x']

    x1 = jnp.linspace(-3, 3, 100)
    x2 = jnp.linspace(-3, 3, 100)
    X1, X2 = jnp.meshgrid(x1, x2)
    P = jnp.exp(DualMoonDistribution().log_prob(jnp.stack([X1, X2], axis=-1)))

    fig = plt.figure(figsize=(12, 8), constrained_layout=True)
    gs = GridSpec(2, 3, figure=fig)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[1, 0])
    ax3 = fig.add_subplot(gs[0, 1])
    ax4 = fig.add_subplot(gs[1, 1])
    ax5 = fig.add_subplot(gs[0, 2])
    ax6 = fig.add_subplot(gs[1, 2])

    ax1.plot(losses[1000:])
    ax1.set_title('Autoguide training loss\n(after 1000 steps)')

    ax2.contourf(X1, X2, P, cmap='OrRd')
    sns.kdeplot(guide_samples[:, 0], guide_samples[:, 1], n_levels=30, ax=ax2)
    ax2.set(xlim=[-3, 3], ylim=[-3, 3],
            xlabel='x0', ylabel='x1', title='Posterior using\nAutoBNAFNormal guide')

    sns.scatterplot(guide_base_samples[:, 0], guide_base_samples[:, 1], ax=ax3,
                    hue=guide_trans_samples[:, 0] < 0.)
    ax3.set(xlim=[-3, 3], ylim=[-3, 3],
            xlabel='x0', ylabel='x1', title='AutoBNAFNormal base samples\n(True=left moon; False=right moon)')

    ax4.contourf(X1, X2, P, cmap='OrRd')
    sns.kdeplot(vanilla_samples[:, 0], vanilla_samples[:, 1], n_levels=30, ax=ax4)
    ax4.plot(vanilla_samples[-50:, 0], vanilla_samples[-50:, 1], 'bo-', alpha=0.5)
    ax4.set(xlim=[-3, 3], ylim=[-3, 3],
            xlabel='x0', ylabel='x1', title='Posterior using\nvanilla HMC sampler')

    sns.scatterplot(zs[:, 0], zs[:, 1], ax=ax5, hue=samples[:, 0] < 0.,
                    s=30, alpha=0.5, edgecolor="none")
    ax5.set(xlim=[-5, 5], ylim=[-5, 5],
            xlabel='x0', ylabel='x1', title='Samples from the\nwarped posterior - p(z)')

    ax6.contourf(X1, X2, P, cmap='OrRd')
    sns.kdeplot(samples[:, 0], samples[:, 1], n_levels=30, ax=ax6)
    ax6.plot(samples[-50:, 0], samples[-50:, 1], 'bo-', alpha=0.2)
    ax6.set(xlim=[-3, 3], ylim=[-3, 3],
            xlabel='x0', ylabel='x1', title='Posterior using\nNeuTra HMC sampler')

    plt.savefig("neutra.pdf")
Example #4
 def ARDf(self, X, params):
     r = ieuclidean_distance_s(X / params["lengthscale"], self.diagonal)
     return params["variance"] * jnp.exp(-0.5 * r)
Example #5
if __name__ == "__main__":
    num_samples = 40

    @jit
    def objective(params, t):
        rng = random.PRNGKey(t)
        return -batch_elbo(funnel_log_density, rng, params, num_samples)

    # Set up figure.
    fig = plt.figure(figsize=(8, 8), facecolor='white')
    ax = fig.add_subplot(111, frameon=False)
    plt.ion()
    plt.show(block=False)
    x_limits = [-2, 2]
    y_limits = [-4, 2]
    target_dist = lambda x, _: np.exp(funnel_log_density(x))
    approx_dist = lambda x, params: np.exp(diag_gaussian_logpdf(x, *params))

    def callback(params, t):
        print("Iteration {} lower bound {}".format(t, objective(params, t)))

        plt.cla()
        X, Y, Z = mesh_eval(target_dist, x_limits, y_limits, 1)
        ax.contour(X, Y, Z, cmap='summer')
        X, Y, Z = mesh_eval(approx_dist, x_limits, y_limits, params)
        ax.contour(X, Y, Z, cmap='winter')
        ax.set_xlim(x_limits)
        ax.set_ylim(y_limits)
        ax.set_yticks([])
        ax.set_xticks([])
Example #6
def loss(W, X, y, l2_penalty=0.):
    """Log loss."""
    log_loss = np.mean(np.log(1 + np.exp(-y * np.dot(X, W))))
    penalty = 0.5 * l2_penalty * np.sum(np.power(W, 2))
    return log_loss + penalty
Example #7
def t_complex(params: Dict[str, float]) -> ComplexFloat:
    """
    Transmission coefficient (design parameter)
    """
    return params['t_amp'] * jnp.exp(-1j * params['t_ang'])
Example #8
def importance_weights(log_unnormalized_probabilities):
  """Normalizes log-weights."""
  return np.exp(log_unnormalized_probabilities -
                special.logsumexp(log_unnormalized_probabilities))
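# A minimal usage sketch (not from the original): the returned weights lie on
# the probability simplex, i.e. they are non-negative and sum to one.
import jax.numpy as jnp
from jax.scipy import special

log_w = jnp.array([-1.0, 0.5, 2.0])
w = jnp.exp(log_w - special.logsumexp(log_w))
print(w, w.sum())  # sums to ~1.0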
Example #9
 def mean(self):
     return np.exp(self.loc + self.scale ** 2 / 2)
Example #10
def ackley_1d(x, y=0):
    out = (-20 * jnp.exp(-0.2 * jnp.sqrt(0.5 * (x**2 + y**2))) -
           jnp.exp(0.5 * (jnp.cos(2 * jnp.pi * x) + jnp.cos(2 * jnp.pi * y))) +
           jnp.e + 20)
    return out
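# Quick check of the formula above (a sketch): the Ackley function attains its
# global minimum of exactly zero at the origin, and broadcasts elementwise
# over arrays.
import jax.numpy as jnp

print(ackley_1d(0.0))                        # -20*e^0 - e^1 + e + 20 = 0.0
print(ackley_1d(jnp.linspace(-2., 2., 5)))   # evaluated over a small grid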
Example #11
import gin


# Nonlinear mappings encoding different attention kernels.
gin.external_configurable(jnp.cos, 'jcos')
gin.external_configurable(jnp.sin, 'jsin')
gin.external_configurable(jnp.tanh, 'jtanh')
gin.external_configurable(jax.nn.sigmoid, 'jsigmoid')
gin.external_configurable(
    lambda x: jax.nn.gelu(x, approximate=False), 'jgelu'
)  # Needs to be exact, although might be slower. See https://github.com/google/jax/issues/4428.
gin.external_configurable(lambda x: x * x * (x > 0.0), 'jrequ')
gin.external_configurable(jnp.exp, 'jexp')
gin.external_configurable(lambda x: x, 'jidentity')
gin.external_configurable(
    lambda x: (jnp.exp(x)) * (x <= 0.0) + (x + 1.0) * (x > 0.0), 'jshiftedelu'
)  # Nonlinearity used in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" (https://arxiv.org/abs/2006.16236).
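# A minimal sketch of how these registrations can be consumed (not part of the
# original module): once a callable is registered via gin.external_configurable,
# a gin config string can select it by name with the @name syntax.
# `kernel_features` here is a hypothetical configurable used only for
# illustration.
import jax.numpy as jnp

@gin.configurable
def kernel_features(x, nonlinearity=jnp.exp):
    return nonlinearity(x)

gin.parse_config("kernel_features.nonlinearity = @jrequ")
print(kernel_features(jnp.array([-1.0, 2.0])))  # squared ReLU: [0., 4.]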


def nonnegative_softmax_kernel_feature_creator(data,
                                               projection_matrix,
                                               attention_dims_t,
                                               batch_dims_t,
                                               precision,
                                               is_query,
                                               normalize_data=True,
                                               eps=0.0001):
  """Constructs nonnegative kernel features for fast softmax attention.
  Args:
    data: input for which features are computed
    projection_matrix: random matrix used to compute features
Example #12
 def scalarsoftplus(x):
     return jnp.where(x < -30.0, 0.0, jnp.where(x > 30.0, x, jnp.log1p(jnp.exp(x))))
Example #13
def local_value_op_op_cost(logpsi, pars, σp, mel, σ):

    σ_σp = jax.vmap(lambda σp, σ: jnp.hstack((σp, σ)), in_axes=(0, None))(σp, σ)
    σ_σ = jnp.hstack((σ, σ))
    return jnp.sum(mel * jnp.exp(logpsi(pars, σ_σp) - logpsi(pars, σ_σ)))
Example #14
def local_value_cost(logpsi, pars, vp, mel, v):
    return jnp.sum(mel * jnp.exp(logpsi(pars, vp) - logpsi(pars, v)))
Example #15
def sigmoid(z):
  """
  Numerically stable sigmoid.
  """
  return 1/(1 + jnp.exp(-z))
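# For comparison (a sketch): jax.nn.sigmoid computes the same function, and
# the two should agree over the usual range of inputs.
import jax
import jax.numpy as jnp

z = jnp.array([-30.0, 0.0, 30.0])
print(1 / (1 + jnp.exp(-z)))
print(jax.nn.sigmoid(z))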
Example #16
 def variance(self):
     return (np.exp(self.scale ** 2) - 1) * np.exp(2 * self.loc + self.scale ** 2)
Example #17
def logsumexp(x, axis=-1):
    """
    Numerically stable logsumexp.
    """
    x_max = x.max(axis)
    return x_max + jnp.log(jnp.sum(jnp.exp(x - x_max), axis, keepdims=False))
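# A consistency check for 1-D inputs (a sketch): the max-shifted form above
# matches jax.scipy.special.logsumexp even when a naive exp() would overflow.
import jax.numpy as jnp
from jax.scipy.special import logsumexp as jsp_logsumexp

x = jnp.array([1000.0, 1001.0, 999.5])
print(logsumexp(x))        # function defined above
print(jsp_logsumexp(x))    # reference implementation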
Example #18
 def mean(self):
     low = (self.low - self.loc) / self.scale
     low_prob_scaled = np.exp(self._normal.log_prob(self.low)) * self.scale / ndtr(-low)
     return self.loc + low_prob_scaled * self.scale
Example #19
def gradient_loss_fn(W, X, y, l2_penalty):
    n = X.shape[0]
    log_grad = np.dot(np.diag(-y / (1 + np.exp(y * np.dot(X, W)))), X)
    log_grad_sum = np.dot(np.ones(n), log_grad)
    reg_grad = l2_penalty * W
    return (reg_grad + (1 / n) * log_grad_sum)
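# A sketch of a gradient check (assuming `np` is jax.numpy here and in the
# earlier `loss` example): the hand-derived gradient above should agree with
# jax.grad applied to that loss.
import jax
import jax.numpy as np

def loss(W, X, y, l2_penalty=0.):
    log_loss = np.mean(np.log(1 + np.exp(-y * np.dot(X, W))))
    penalty = 0.5 * l2_penalty * np.sum(np.power(W, 2))
    return log_loss + penalty

X = jax.random.normal(jax.random.PRNGKey(0), (8, 3))
y = np.where(jax.random.normal(jax.random.PRNGKey(1), (8,)) > 0, 1.0, -1.0)
W = 0.1 * np.ones(3)

print(jax.grad(loss)(W, X, y, 0.1))
print(gradient_loss_fn(W, X, y, 0.1))  # manual gradient defined above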
Example #20
 def variance(self):
     low = (self.low - self.loc) / self.scale
     low_prob_scaled = np.exp(self._normal.log_prob(self.low)) * self.scale / ndtr(-low)
     return self._normal.variance * (1 + low * low_prob_scaled - low_prob_scaled ** 2)
Example #21
 def NARDf(self, X, params):
     r = ieuclidean_distance(X, self.diagonal) / params["lengthscale"]
     return params["variance"] * jnp.exp(-0.5 * r**2)
Example #22
def attend(
    q,
    k=None,
    v=None,
    q_chunk_len=None,
    kv_chunk_len=None,
    n_chunks_before=0,
    n_chunks_after=0,
    mask_fn=None,
    q_info=None,
    kv_info=None,
    dropout=0.0,
    rng=None,
):
    """Dot-product attention, with optional chunking and/or masking.

  Args:
    q: Query vectors, shape [q_len, d_qk]
    k: Key vectors, shape [kv_len, d_qk]; or None
    v: Value vectors, shape [kv_len, d_v]
    q_chunk_len: Set to non-zero to enable chunking for query vectors
    kv_chunk_len: Set to non-zero to enable chunking for key/value vectors
    n_chunks_before: Number of adjacent previous chunks to attend to
    n_chunks_after: Number of adjacent subsequent chunks to attend to
    mask_fn: TODO(kitaev): doc
    q_info: Query-associated metadata for masking
    kv_info: Key-associated metadata for masking
    dropout: Dropout rate
    rng: RNG for dropout
  Returns:
    A tuple (output, dots_logsumexp). The output has shape [q_len, d_v], and
    dots_logsumexp has shape [q_len]. The logsumexp of the attention
    probabilities is useful for combining multiple rounds of attention (as in
    LSH attention).
  """
    assert v is not None
    share_qk = (k is None)

    if q_info is None:
        q_info = jnp.arange(q.shape[-2])

    if kv_info is None and not share_qk:
        kv_info = jnp.arange(v.shape[-2])

    # Split q/k/v into chunks along the time axis, if desired.
    if q_chunk_len is not None:
        q = jnp.reshape(q, (-1, q_chunk_len, q.shape[-1]))
        q_info = jnp.reshape(q_info, (-1, q_chunk_len))

    if share_qk:
        assert kv_chunk_len is None or kv_chunk_len == q_chunk_len
        k = q
        kv_chunk_len = q_chunk_len
        if kv_info is None:
            kv_info = q_info
        elif kv_chunk_len is not None:
            # kv_info is not None, but reshape as required.
            kv_info = jnp.reshape(kv_info, (-1, kv_chunk_len))
    elif kv_chunk_len is not None:
        k = jnp.reshape(k, (-1, kv_chunk_len, k.shape[-1]))
        kv_info = jnp.reshape(kv_info, (-1, kv_chunk_len))

    if kv_chunk_len is not None:
        v = jnp.reshape(v, (-1, kv_chunk_len, v.shape[-1]))

    if share_qk:
        k = length_normalized(k)
    k = k / jnp.sqrt(k.shape[-1])

    # Optionally include adjacent chunks.
    if q_chunk_len is not None or kv_chunk_len is not None:
        assert q_chunk_len is not None and kv_chunk_len is not None
    else:
        assert n_chunks_before == 0 and n_chunks_after == 0

    k = look_adjacent(k, n_chunks_before, n_chunks_after)
    v = look_adjacent(v, n_chunks_before, n_chunks_after)
    kv_info = look_adjacent(kv_info, n_chunks_before, n_chunks_after)

    # Dot-product attention.
    dots = jnp.matmul(q, jnp.swapaxes(k, -1, -2))

    # Masking
    if mask_fn is not None:
        dots = mask_fn(dots, q_info[Ellipsis, :, None], kv_info[Ellipsis,
                                                                None, :])

    # Softmax.
    dots_logsumexp = logsumexp(dots, axis=-1, keepdims=True)
    dots = jnp.exp(dots - dots_logsumexp)

    if dropout > 0.0:
        assert rng is not None
        # Dropout is broadcast across the bin dimension
        dropout_shape = (dots.shape[-2], dots.shape[-1])
        # TODO(kitaev): verify that tie-in is safe to remove (in light of jax fix)
        keep_prob = jax.lax.tie_in(dots, 1.0 - dropout)
        keep = jax.random.bernoulli(rng, keep_prob, dropout_shape)
        multiplier = keep.astype(dots.dtype) / jax.lax.tie_in(keep, keep_prob)
        dots = dots * multiplier

    # The softmax normalizer (dots_logsumexp) is used by multi-round LSH attn.
    out = jnp.matmul(dots, v)
    out = jnp.reshape(out, (-1, out.shape[-1]))
    dots_logsumexp = jnp.reshape(dots_logsumexp, (-1, ))
    return out, dots_logsumexp
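# A sketch (not part of the original function) of why dots_logsumexp is
# returned: two attention rounds over disjoint key sets, as in LSH attention,
# can be merged exactly by reweighting each round's output with its softmax
# normalizer.
import jax.numpy as jnp

def combine_attention_rounds(out1, lse1, out2, lse2):
    # out*: [q_len, d_v], lse*: [q_len] from two calls to attend().
    lse = jnp.logaddexp(lse1, lse2)
    w1 = jnp.exp(lse1 - lse)[:, None]
    w2 = jnp.exp(lse2 - lse)[:, None]
    return w1 * out1 + w2 * out2, lse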
Example #23
 def ARD(self, X, X2):
     r = euclidean_distance_s(X / self.parameters["lengthscale"],
                              X2 / self.parameters["lengthscale"])
     return self.parameters["variance"] * jnp.exp(-0.5 * r)
Example #24
def run_optim(key: np.ndarray, lhs: np.ndarray, tmp: np.ndarray,
              xhats: np.ndarray, tmp_c: np.ndarray, xhats_c: np.ndarray,
              xstar: float, bound: Text, out_dir: Text, x: np.ndarray,
              y: np.ndarray) -> Tuple[int, float, float, int, float, float]:
    """Run optimization (either lower or upper) for a single xstar."""
    # Directory setup
    # ---------------------------------------------------------------------------
    out_dir = os.path.join(out_dir, f"{bound}-xstar_{xstar}")
    if FLAGS.store_data:
        logging.info(f"Current run output directory: {out_dir}...")
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # Init optim params
    # ---------------------------------------------------------------------------
    logging.info(
        f"Initialize parameters L, mu, log_sigma, lmbda, tau, slack...")
    key, subkey = random.split(key)
    params = init_params(subkey)

    for parname, param in zip(['L', 'mu', 'log_sigma'], params):
        logging.info(f"Parameter {parname}: {param.shape}")
        logging.info(f"  -> {parname}: {param}")

    tau = FLAGS.tau_init
    logging.info(f"Initial tau = {tau}")
    fin_tau = np.minimum(FLAGS.tau_factor**FLAGS.num_rounds * tau,
                         FLAGS.tau_max)
    logging.info(f"Final tau = {fin_tau}")

    # Set constraint approach and slacks
    # ---------------------------------------------------------------------------
    slack = FLAGS.slack * np.ones(FLAGS.num_z * 2)
    lmbda = np.zeros(FLAGS.num_z * 2)
    logging.info(f"Lambdas: {lmbda.shape}")

    logging.info(
        f"Fractional tolerance (slack) for constraints = {FLAGS.slack}")
    logging.info(f"Set relative slack variables...")
    slack *= np.abs(lhs.ravel())
    logging.info(f"Set minimum slack to {FLAGS.slack_abs}...")
    slack = np.maximum(FLAGS.slack_abs, slack)
    logging.info(f"Slack {slack.shape}")
    logging.info(f"Actual slack min: {np.min(slack)}, max: {np.max(slack)}")

    # Setup optimizer
    # ---------------------------------------------------------------------------
    logging.info(f"Vanilla SGD with init_lr={FLAGS.lr}...")
    logging.info(f"Set learning rate schedule")
    step_size = optim.inverse_time_decay(FLAGS.lr, FLAGS.decay_steps,
                                         FLAGS.decay_rate, FLAGS.staircase)
    init_fun, update_fun, get_params = optim.sgd(step_size)

    logging.info(
        f"Init state for JAX optimizer (including L, mu, log_sigma)...")
    state = init_fun(params)

    # Setup result dict
    # ---------------------------------------------------------------------------
    logging.info(f"Initialize dictionary for results...")
    results = {
        "mu": [],
        "sigma": [],
        "cholesky_factor": [],
        "tau": [],
        "lambda": [],
        "objective": [],
        "constraint_term": [],
        "rhs": []
    }
    if FLAGS.plot_intermediate:
        results["grad_norms"] = []
        results["lagrangian"] = []

    logging.info(f"Evaluate at xstar={xstar}...")

    logging.info(f"Evaluate {bound} bound...")
    sign = 1 if bound == "lower" else -1

    # ===========================================================================
    # OPTIMIZATION LOOP
    # ===========================================================================
    # One-time logging before first step
    # ---------------------------------------------------------------------------
    key, subkey = random.split(key)
    obj, rhs, psisum, constr = objective_rhs_psisum_constr(
        subkey, get_params(state), lmbda, tau, lhs, slack, xstar, tmp_c,
        xhats_c)
    results["objective"].append(obj)
    results["constraint_term"].append(psisum)
    results["rhs"].append(rhs)

    logging.info(f"Objective: scalar")
    logging.info(f"RHS: {rhs.shape}")
    logging.info(f"Sum over Psis: scalar")
    logging.info(f"Constraint: {constr.shape}")

    tril_idx = np.tril_indices(FLAGS.dim_theta + 1)
    count = 0
    logging.info(f"Start optimization loop...")
    for _ in tqdm(range(FLAGS.num_rounds)):
        # log current parameters
        # -------------------------------------------------------------------------
        results["lambda"].append(lmbda)
        results["tau"].append(tau)
        cur_L, cur_mu, cur_logsigma = get_params(state)
        cur_chol = make_cholesky_factor(cur_L)[tril_idx].ravel()[1:]
        results["mu"].append(cur_mu)
        results["sigma"].append(np.exp(cur_logsigma))
        results["cholesky_factor"].append(cur_chol)

        subkeys = random.split(key, num=FLAGS.opt_steps + 1)
        key = subkeys[0]
        # inner optimization for subproblem
        # -------------------------------------------------------------------------
        for j in range(FLAGS.opt_steps):
            v, grads = lagrangian_value_and_grad(subkeys[j + 1],
                                                 get_params(state), lmbda, tau,
                                                 lhs, slack, xstar, tmp, xhats,
                                                 sign)
            state = update_fun(count, grads, state)
            count += 1
            if FLAGS.plot_intermediate:
                results["lagrangian"].append(v)
                results["grad_norms"].append(
                    [np.linalg.norm(grad) for grad in grads])

        # post inner optimization logging
        # -------------------------------------------------------------------------
        key, subkey = random.split(key)
        obj, rhs, psisum, constr = objective_rhs_psisum_constr(
            subkey, get_params(state), lmbda, tau, lhs, slack, xstar, tmp_c,
            xhats_c)
        results["objective"].append(obj)
        results["constraint_term"].append(psisum)
        results["rhs"].append(rhs)

        # update lambda, tau
        # -------------------------------------------------------------------------
        lmbda = update_lambda(constr, lmbda, tau)
        tau = np.minimum(tau * FLAGS.tau_factor, FLAGS.tau_max)

    # Convert and store results
    # ---------------------------------------------------------------------------
    logging.info(f"Finished optimization loop...")

    logging.info(f"Convert all results to numpy arrays...")
    results = {k: np.array(v) for k, v in results.items()}

    logging.info(f"Add final parameters and lhs to results...")
    L, mu, log_sigma = get_params(state)
    results["final_L"] = L
    results["final_mu"] = mu
    results["final_log_sigma"] = log_sigma
    results["lhs"] = lhs

    if FLAGS.store_data:
        logging.info(f"Save result data to...")
        result_path = os.path.join(out_dir, "results.npz")
        onp.savez(result_path, **results)

    # Generate and store plots
    # ---------------------------------------------------------------------------
    if FLAGS.plot_intermediate:
        fig_dir = os.path.join(out_dir, "figures")
        logging.info(f"Generate and save all plots at {fig_dir}...")
        plotting.plot_all(results, x, y, response, fig_dir)

    # Compute last valid and last satisfied
    # ---------------------------------------------------------------------------
    maxabsdiff = np.array([np.max(np.abs(lhs - r)) for r in results["rhs"]])
    fin_i = np.sum(~np.isnan(results["objective"])) - 1
    logging.info(f"Final non-nan objective at {fin_i}.")
    fin_obj = results["objective"][fin_i]
    fin_maxabsdiff = maxabsdiff[fin_i]

    sat_i = [
        np.all((np.abs((lhs - r) / lhs) < FLAGS.slack)
               | (np.abs(lhs - r) < FLAGS.slack_abs)) for r in results["rhs"]
    ]
    sat_i = np.where(sat_i)[0]

    if len(sat_i) > 0:
        sat_i = sat_i[-1]
        logging.info(f"Final satisfied constraint at {sat_i}.")
        sat_obj = results["objective"][sat_i]
        sat_maxabsdiff = maxabsdiff[sat_i]
    else:
        sat_i = -1
        logging.info(f"Constraints were never satisfied.")
        sat_obj, sat_maxabsdiff = np.nan, np.nan

    logging.info("Finished run.")
    return fin_i, fin_obj, fin_maxabsdiff, sat_i, sat_obj, sat_maxabsdiff
Example #25
def diag_gaussian_sample(rng, mean, log_std):
    # Take a single sample from a diagonal multivariate Gaussian.
    return mean + np.exp(log_std) * random.normal(rng, mean.shape)
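# A usage sketch (assuming `np` is jax.numpy and `random` is jax.random, as in
# the snippet, and reusing diag_gaussian_logpdf from the earlier example):
# writing the sample as mean + exp(log_std) * eps lets gradients flow through
# it with respect to the parameters — the reparameterization trick used by the
# ELBO code elsewhere on this page.
import jax
import jax.numpy as np
from jax import random

def negative_logpdf_of_sample(params, rng):
    mean, log_std = params
    x = diag_gaussian_sample(rng, mean, log_std)
    return -diag_gaussian_logpdf(x, mean, log_std)

params = (np.zeros(2), np.zeros(2))
print(jax.grad(negative_logpdf_of_sample)(params, random.PRNGKey(0)))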
Example #26
def cluster_split_matching_pursuit(key,
                                   points,
                                   mask,
                                   log_VS,
                                   log_VE,
                                   kmeans_init=True,
                                   K=2):
    """
    Splits a set of points into two ellipsoids such that the enclosed volume is as close to V(S) as possible without being less than it.
    V(S) should be an estimate of the true volume contained by the points.

    Args:
        key:
        points: [N, D]
        mask: [N] only split on these points
        log_VS: logV(S) of the set of points
        log_VE: logV(E) of the parent ellipsoid
        kmeans_init: whether to use kmeans to initialise the clustering

    Returns:
        cluster_id: ids of the points, places where ~mask are random assignments
        mu1, radii1, rotation1: ellipsoid params of first subcluster
        mu2, radii2, rotation2: ellipsoid params of second subcluster

    """
    N, D = points.shape
    num_S = jnp.sum(mask)
    print(mask)
    a_k = jnp.arange(K)

    def log_ellipsoid_volume(logdetC_k, num_k, log_f_k):
        """
        Computes the volume of the ellipsoid u_k @ Lambda @ u_k <= 1
        using
            Lambda = f_k n_k C_k
            and |Lambda| = f_k^d n_k^d |C_k|
        """
        logdetLambda = D * (log_f_k + jnp.log(num_k)) + logdetC_k
        return (jnp.log(2.) + 0.5 * D * jnp.log(jnp.pi) - jnp.log(D) -
                gammaln(0.5 * D) - 0.5 * logdetLambda)

    def log_factor_k(cluster_id, log_maha_k, num_k, logdetC_k):
        """
        Computes f_k such that,
            u_k @ f0_k n_k C_k @ u_k <= 1
        and
            f_k^d V(n_k C_k) = max(V(S_k), V(f0_k n_k C_k))
            log_f_k = (log max(V(S)*n_k/n_S, V(f0_k n_k C_k)) - log V(n_k C_k))/D
            log_f_k = (max(log(V(S)*n_k/n_S), logV(n_k C_k)) - log V(n_k C_k))/D
        """
        # K
        log_f_expand_k = -jnp.max(jnp.where(cluster_id == a_k[:, None],
                                            log_maha_k, -jnp.inf),
                                  axis=-1)
        log_VE_expand_k = log_ellipsoid_volume(logdetC_k, num_k,
                                               log_f_expand_k)
        log_VE_k = log_ellipsoid_volume(logdetC_k, num_k, 0.)

        log_scale_k = (jnp.maximum(log_VS + jnp.log(num_k) - jnp.log(num_S),
                                   log_VE_expand_k) - log_VE_k) / D
        # K
        return log_scale_k

    # # calculate bounding ellipsoid
    # mu, C = bounding_ellipsoid(points, mask)
    # radii, _ = ellipsoid_params(C)
    # log_VE = log_ellipsoid_volume(radii)
    # # enlarge so that V(E) = max(V(E), V(S))
    # # (const * r**D) >= V(S) -> scale = 1 else log_scale = (log_V(S) - log(const * r**D))/D
    # log_scale = jnp.maximum(0., (log_VS - log_VE) / D)
    # C = C / jnp.exp(log_scale)
    ###
    # input is essentially log_VS
    if kmeans_init:
        # do Euclidean kmean clustering
        cluster_id, centers = kmeans(key, points, mask, K=K)
    else:
        # assign to random clusters: child0 or child1
        cluster_id = random.randint(key, shape=(N, ), minval=0, maxval=K)
    # K, N
    # log_maha_k is with f_k=1

    State = namedtuple('State', [
        'i', 'done', 'cluster_id', 'C_k', 'logdetC_k', 'mu_k', 'log_maha_k',
        'num_k', 'log_VE_k', 'log_VS_k', 'min_loss', 'delay'
    ])

    def init_state(cluster_id):
        num_k = jnp.sum(mask & (cluster_id == a_k[:, None]), axis=-1)
        mu_k = vmap(lambda k: jnp.average(
            points, axis=0, weights=k == cluster_id))(a_k)
        C_k = vmap(lambda k, mu_k: jnp.linalg.pinv(
            jnp.average((points - mu_k)[:, :, None] *
                        (points - mu_k)[:, None, :],
                        axis=0,
                        weights=k == cluster_id)))(a_k, mu_k)
        logdetC_k = vmap(
            lambda C_k: jnp.sum(jnp.log(jnp.linalg.eigvals(C_k).real)))(C_k)
        precision_k = C_k * num_k[:, None, None]
        # K, N
        log_maha_k = vmap(lambda mu_k, precision_k: jnp.log(
            vmap(lambda point: (point - mu_k) @ precision_k @ (point - mu_k))
            (points)))(mu_k, precision_k)
        log_f_k = log_factor_k(cluster_id, log_maha_k, num_k, logdetC_k)
        log_VE_k = vmap(log_ellipsoid_volume)(logdetC_k, num_k, log_f_k)

        log_VS_k = jnp.log(num_k) - jnp.log(num_S)
        return State(i=jnp.asarray(0),
                     done=num_S < K * (D + 1),
                     cluster_id=cluster_id,
                     C_k=C_k,
                     logdetC_k=logdetC_k,
                     mu_k=mu_k,
                     log_maha_k=log_maha_k,
                     num_k=num_k,
                     log_VE_k=log_VE_k,
                     log_VS_k=log_VS_k,
                     min_loss=jnp.asarray(jnp.inf),
                     delay=jnp.asarray(0))

    def body(state: State):
        new_state_date = dict()
        # upon the start of each iteration the state is consistent.
        # we use the consistent state to calculate the reassignment metrics.
        # we then reassign and update the state so that it is consistent again.
        # K, N
        # K
        log_f_k = log_factor_k(state.cluster_id, state.log_maha_k, state.num_k,
                               state.logdetC_k)

        def single_log_h(log_f_k, log_maha_k, num_k, logdetC_k):
            log_d = log_maha_k + log_f_k
            log_VS_k = log_VS + jnp.log(num_k) - jnp.log(num_S)
            return log_ellipsoid_volume(logdetC_k, num_k,
                                        log_f_k) + log_d - log_VS_k

        # K, N
        log_h_k = vmap(single_log_h)(log_f_k, state.log_maha_k, state.num_k,
                                     state.logdetC_k)
        h_k = jnp.exp(log_h_k)
        # # K, K, N
        delta_F = h_k[:, None, :] - h_k
        # Can reassign if mask says we are working on that node and there would be at least D+1 points in that cluster
        # after taking from it. And, if delta_F < 0.
        able_to_reassign = mask & (state.num_k[state.cluster_id] > D + 1)
        delta_F_masked = jnp.where(able_to_reassign, delta_F, jnp.inf)

        # (k_to, k_from, n_reassign) = jnp.where(delta_F == min_delta_F)
        (k_to, k_from,
         n_reassign) = jnp.unravel_index(jnp.argmin(delta_F_masked.flatten()),
                                         delta_F.shape)
        # dynamic update index arrays of sufficient length for all
        dyn_k_to_idx = jnp.concatenate([k_to[None], jnp.asarray([0, 0])])
        dyn_k_from_idx = jnp.concatenate([k_from[None], jnp.asarray([0, 0])])

        ###
        # update the state

        ###
        # cluster id
        cluster_id = dynamic_update_slice(state.cluster_id, dyn_k_to_idx[0:1],
                                          n_reassign[None])

        ###
        # num_k
        num_from = state.num_k[k_from] - 1
        num_to = state.num_k[k_to] + 1
        num_k = dynamic_update_slice(state.num_k, num_from[None],
                                     dyn_k_from_idx[0:1])
        num_k = dynamic_update_slice(num_k, num_to[None], dyn_k_to_idx[0:1])

        ###
        # ellipsoid parameters
        x_n = points[n_reassign, :]
        mu_from = state.mu_k[k_from, :] + (state.mu_k[k_from, :] -
                                           x_n) / (state.num_k[k_from] - 1)
        C_from, logdetC_from = rank_one_update_matrix_inv(
            state.C_k[k_from, :, :],
            state.logdetC_k[k_from],
            x_n - mu_from,
            x_n - state.mu_k[k_from, :],
            add=False)
        # print(C_from, logdetC_from)
        mu_to = state.mu_k[
            k_to, :] + (x_n - state.mu_k[k_to, :]) / (state.num_k[k_to] + 1)
        C_to, logdetC_to = rank_one_update_matrix_inv(state.C_k[k_to, :, :],
                                                      state.logdetC_k[k_to],
                                                      x_n - mu_to,
                                                      x_n -
                                                      state.mu_k[k_to, :],
                                                      add=True)
        print('from', state.logdetC_k[k_from])
        # print(C_to, logdetC_to)
        mu_k = dynamic_update_slice(state.mu_k, mu_from[None, :],
                                    dyn_k_from_idx[0:2])
        mu_k = dynamic_update_slice(mu_k, mu_to[None, :], dyn_k_to_idx[0:2])
        C_k = dynamic_update_slice(state.C_k, C_from[None, :, :],
                                   dyn_k_from_idx)
        C_k = dynamic_update_slice(C_k, C_to[None, :, :], dyn_k_to_idx)
        logdetC_k = dynamic_update_slice(state.logdetC_k, logdetC_from[None],
                                         dyn_k_from_idx[0:1])
        logdetC_k = dynamic_update_slice(logdetC_k, logdetC_to[None],
                                         dyn_k_to_idx[0:1])

        ###
        # maha

        precision_from = C_from * num_from
        precision_to = C_to * num_to
        log_maha_from = jnp.log(
            vmap(lambda point: (point - mu_from) @ precision_from @ (
                point - mu_from))(points))
        log_maha_to = jnp.log(
            vmap(lambda point:
                 (point - mu_to) @ precision_to @ (point - mu_to))(points))
        log_maha_k = dynamic_update_slice(state.log_maha_k,
                                          log_maha_from[None, :],
                                          dyn_k_from_idx[0:2])
        log_maha_k = dynamic_update_slice(log_maha_k, log_maha_to[None, :],
                                          dyn_k_to_idx[0:2])

        # estimate volumes of current clustering
        log_f_k = log_factor_k(cluster_id, log_maha_k, num_k, logdetC_k)
        log_VE_k = vmap(log_ellipsoid_volume)(logdetC_k, num_k, log_f_k)
        log_VS_k = jnp.log(num_k) - jnp.log(num_S)
        log_V_sum = logsumexp(log_VE_k)
        new_loss = log_V_sum - log_VS
        loss_decreased = new_loss < state.min_loss
        delay = jnp.where(loss_decreased, 0, state.delay + 1)
        min_loss = jnp.where(loss_decreased, new_loss, state.min_loss)
        print(jnp.min(delta_F_masked), log_V_sum, logdetC_k)
        done = jnp.all(cluster_id == state.cluster_id) \
               | (delay >= 10) \
               | jnp.any(num_k < D + 1) \
               | jnp.isnan(log_V_sum) \
               | (jnp.min(delta_F_masked) >= 0.)
        # ['i', 'done', 'cluster_id', 'C_k', 'logdetC_k',
        # 'mu_k', 'log_maha_k', 'num_k',
        # 'log_VE_k', 'log_VS_k',
        # 'min_loss', 'delay']
        state = state._replace(i=state.i + 1,
                               done=done,
                               cluster_id=cluster_id,
                               C_k=C_k,
                               logdetC_k=logdetC_k,
                               mu_k=mu_k,
                               log_maha_k=log_maha_k,
                               num_k=num_k,
                               log_VE_k=log_VE_k,
                               log_VS_k=log_VS_k,
                               min_loss=min_loss,
                               delay=delay)
        return state

    init_state = init_state(cluster_id)
    state = while_loop(lambda state: ~state.done, body, init_state)

    log_f_k = log_factor_k(state.cluster_id, state.log_maha_k, state.num_k,
                           state.logdetC_k)
    log_VE_k = vmap(log_ellipsoid_volume)(state.logdetC_k, state.num_k,
                                          log_f_k)
    log_V_sum = logsumexp(log_VE_k)

    do_split = ((log_V_sum < log_VE) | (log_VE > log_VS + jnp.log(K))) \
               & (~jnp.any(jnp.isnan(state.logdetC_k))) \
               & jnp.all(state.num_k >= D + 1)

    precision_k = state.C_k * jnp.exp(jnp.log(state.num_k) + log_f_k)[:, None,
                                                                      None]
    radii_k, rotation_k = vmap(lambda C_k: ellipsoid_params(C_k))(precision_k)

    return state.cluster_id, state.log_VS_k, state.mu_k, radii_k, rotation_k, do_split
Example #27
def funnel_log_density(params):
    return norm.logpdf(params[0], 0, np.exp(params[1])) + \
           norm.logpdf(params[1], 0, 1.35)
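# A sketch of drawing exact samples consistent with this density (assuming
# `np` is jax.numpy and `random` is jax.random): the second coordinate is
# N(0, 1.35^2) and the first is N(0, exp(params[1])^2) given the second.
import jax.numpy as np
from jax import random

key1, key2 = random.split(random.PRNGKey(0))
v = 1.35 * random.normal(key1, (1000,))       # params[1]
x = np.exp(v) * random.normal(key2, (1000,))  # params[0] given params[1]
funnel_samples = np.stack([x, v], axis=-1)    # each row has log-density funnel_log_density(row)
print(funnel_samples.shape)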
Example #28
    def body(state: State):
        new_state_date = dict()
        # upon the start of each iteration the state is consistent.
        # we use the consistent state to calculate the reassignment metrics.
        # we then reassign and update the state so that it is consistent again.
        # K, N
        # K
        log_f_k = log_factor_k(state.cluster_id, state.log_maha_k, state.num_k,
                               state.logdetC_k)

        def single_log_h(log_f_k, log_maha_k, num_k, logdetC_k):
            log_d = log_maha_k + log_f_k
            log_VS_k = log_VS + jnp.log(num_k) - jnp.log(num_S)
            return log_ellipsoid_volume(logdetC_k, num_k,
                                        log_f_k) + log_d - log_VS_k

        # K, N
        log_h_k = vmap(single_log_h)(log_f_k, state.log_maha_k, state.num_k,
                                     state.logdetC_k)
        h_k = jnp.exp(log_h_k)
        # # K, K, N
        delta_F = h_k[:, None, :] - h_k
        # Can reassign if mask says we are working on that node and there would be at least D+1 points in that cluster
        # after taking from it. And, if delta_F < 0.
        able_to_reassign = mask & (state.num_k[state.cluster_id] > D + 1)
        delta_F_masked = jnp.where(able_to_reassign, delta_F, jnp.inf)

        # (k_to, k_from, n_reassign) = jnp.where(delta_F == min_delta_F)
        (k_to, k_from,
         n_reassign) = jnp.unravel_index(jnp.argmin(delta_F_masked.flatten()),
                                         delta_F.shape)
        # dynamic update index arrays of sufficient length for all
        dyn_k_to_idx = jnp.concatenate([k_to[None], jnp.asarray([0, 0])])
        dyn_k_from_idx = jnp.concatenate([k_from[None], jnp.asarray([0, 0])])

        ###
        # update the state

        ###
        # cluster id
        cluster_id = dynamic_update_slice(state.cluster_id, dyn_k_to_idx[0:1],
                                          n_reassign[None])

        ###
        # num_k
        num_from = state.num_k[k_from] - 1
        num_to = state.num_k[k_to] + 1
        num_k = dynamic_update_slice(state.num_k, num_from[None],
                                     dyn_k_from_idx[0:1])
        num_k = dynamic_update_slice(num_k, num_to[None], dyn_k_to_idx[0:1])

        ###
        # ellipsoid parameters
        x_n = points[n_reassign, :]
        mu_from = state.mu_k[k_from, :] + (state.mu_k[k_from, :] -
                                           x_n) / (state.num_k[k_from] - 1)
        C_from, logdetC_from = rank_one_update_matrix_inv(
            state.C_k[k_from, :, :],
            state.logdetC_k[k_from],
            x_n - mu_from,
            x_n - state.mu_k[k_from, :],
            add=False)
        # print(C_from, logdetC_from)
        mu_to = state.mu_k[
            k_to, :] + (x_n - state.mu_k[k_to, :]) / (state.num_k[k_to] + 1)
        C_to, logdetC_to = rank_one_update_matrix_inv(state.C_k[k_to, :, :],
                                                      state.logdetC_k[k_to],
                                                      x_n - mu_to,
                                                      x_n -
                                                      state.mu_k[k_to, :],
                                                      add=True)
        print('from', state.logdetC_k[k_from])
        # print(C_to, logdetC_to)
        mu_k = dynamic_update_slice(state.mu_k, mu_from[None, :],
                                    dyn_k_from_idx[0:2])
        mu_k = dynamic_update_slice(mu_k, mu_to[None, :], dyn_k_to_idx[0:2])
        C_k = dynamic_update_slice(state.C_k, C_from[None, :, :],
                                   dyn_k_from_idx)
        C_k = dynamic_update_slice(C_k, C_to[None, :, :], dyn_k_to_idx)
        logdetC_k = dynamic_update_slice(state.logdetC_k, logdetC_from[None],
                                         dyn_k_from_idx[0:1])
        logdetC_k = dynamic_update_slice(logdetC_k, logdetC_to[None],
                                         dyn_k_to_idx[0:1])

        ###
        # maha

        precision_from = C_from * num_from
        precision_to = C_to * num_to
        log_maha_from = jnp.log(
            vmap(lambda point: (point - mu_from) @ precision_from @ (
                point - mu_from))(points))
        log_maha_to = jnp.log(
            vmap(lambda point:
                 (point - mu_to) @ precision_to @ (point - mu_to))(points))
        log_maha_k = dynamic_update_slice(state.log_maha_k,
                                          log_maha_from[None, :],
                                          dyn_k_from_idx[0:2])
        log_maha_k = dynamic_update_slice(log_maha_k, log_maha_to[None, :],
                                          dyn_k_to_idx[0:2])

        # estimate volumes of current clustering
        log_f_k = log_factor_k(cluster_id, log_maha_k, num_k, logdetC_k)
        log_VE_k = vmap(log_ellipsoid_volume)(logdetC_k, num_k, log_f_k)
        log_VS_k = jnp.log(num_k) - jnp.log(num_S)
        log_V_sum = logsumexp(log_VE_k)
        new_loss = log_V_sum - log_VS
        loss_decreased = new_loss < state.min_loss
        delay = jnp.where(loss_decreased, 0, state.delay + 1)
        min_loss = jnp.where(loss_decreased, new_loss, state.min_loss)
        print(jnp.min(delta_F_masked), log_V_sum, logdetC_k)
        done = jnp.all(cluster_id == state.cluster_id) \
               | (delay >= 10) \
               | jnp.any(num_k < D + 1) \
               | jnp.isnan(log_V_sum) \
               | (jnp.min(delta_F_masked) >= 0.)
        # ['i', 'done', 'cluster_id', 'C_k', 'logdetC_k',
        # 'mu_k', 'log_maha_k', 'num_k',
        # 'log_VE_k', 'log_VS_k',
        # 'min_loss', 'delay']
        state = state._replace(i=state.i + 1,
                               done=done,
                               cluster_id=cluster_id,
                               C_k=C_k,
                               logdetC_k=logdetC_k,
                               mu_k=mu_k,
                               log_maha_k=log_maha_k,
                               num_k=num_k,
                               log_VE_k=log_VE_k,
                               log_VS_k=log_VS_k,
                               min_loss=min_loss,
                               delay=delay)
        return state
Example #29
def weights_loglike(log_component_weights, alpha_prior):
    """Log likelihood of weights under Dirichlet distribution"""
    component_weights = np.exp(log_component_weights)
    component_weights = normalize_weights(component_weights)
    return stats.dirichlet.logpdf(x=component_weights, alpha=alpha_prior)
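# A usage sketch (assuming `np` is jax.numpy and `stats` is jax.scipy.stats;
# `normalize_weights` is not shown in the original, so a simple rescaling onto
# the simplex is assumed here for illustration):
import jax.numpy as np
from jax.scipy import stats

def normalize_weights(w):
    return w / np.sum(w)

log_w = np.log(np.array([0.2, 0.3, 0.5]))
alpha_prior = np.ones(3)
w = normalize_weights(np.exp(log_w))
print(stats.dirichlet.logpdf(w, alpha_prior))  # log-density under a flat Dirichlet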
Example #30
def _to_probs_bernoulli(logits):
    return 1 / (1 + jnp.exp(-logits))