Example #1
def loss_fn(params, batch, lens):
    '''
    Objective function of a hidden Markov model with discrete observations. Returns the mean
    negative log likelihood of the observation sequences.

    Parameters
    ----------
    params : HMMJax
        Hidden Markov Model

    batch: array(N, max_len)
        Minibatch consisting of observation sequences

    lens : array(N, seq_len)
        The valid length of each observation sequence in the minibatch

    Returns
    -------
    * float
        The mean negative loglikelihood of the minibatch
    '''
    params_soft = HMMJax(softmax(params.trans_mat, axis=1),
                         softmax(params.obs_mat, axis=1),
                         softmax(params.init_dist))
    return -hmm_loglikelihood_jax(params_soft, batch, lens).mean()
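
A hedged training-step sketch: assuming HMMJax is a registered JAX pytree (e.g. a NamedTuple of trans_mat, obs_mat, init_dist) and that params, batch and lens are as documented above, the loss can be differentiated directly:

from jax import value_and_grad

loss_value, grads = value_and_grad(loss_fn)(params, batch, lens)
# grads has the same HMMJax structure as params; because loss_fn applies softmax
# internally, params can be optimized without simplex or positivity constraints.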
Example #2
 def spline_params(params, upper):
     outputs = network_apply_fun(params, upper)
     outputs = np.reshape(outputs, [-1, lower_dim, 3 * K - 1])
     W, H, D = np.split(outputs, [K, 2 * K], axis=2)
     W = 2 * B * softmax(W)
     H = 2 * B * softmax(H)
     D = softplus(D)
     return W, H, D
Example #3
def get_losses(
    inputs,
    outputs,
    args,
    beta_b=.1,
    beta_z=.1,
    prior_rate=3.,
):
    """Get losses (NLL, KL divergences and neg. ELBO).

    Args:
        inputs: Padded input sequences.
        outputs: CompILE model output tuple.
        args: Argument dict from `ArgumentParser`.
        beta_b: Scaling factor for KL term of boundary variables (b).
        beta_z: Scaling factor for KL term of latents (z).
        prior_rate: Rate (lambda) for Poisson prior.
    """

    targets = inputs.reshape(-1)
    all_encs, all_recs, all_masks, all_b, all_z = outputs
    input_dim = args.num_symbols + 1

    nll = 0.
    kl_z = 0.
    for seg_id in range(args.num_segments):
        seg_prob = get_segment_probs(all_b['samples'], all_masks, seg_id)
        preds = all_recs[seg_id].reshape(-1, input_dim)
        seg_loss = cross_entropy(preds, targets, reduction='none').reshape(
            -1, inputs.shape[1])
        # print(seg_loss.shape, seg_prob.shape)

        # Ignore EOS token (last sequence element) in loss.
        nll += (seg_loss[:, :-1] * seg_prob[:, :-1]).sum(1).mean(0)

        # KL divergence on z.
        if args.latent_dist == 'gaussian':
            mu, log_var = jnp.split(all_z['logits'][seg_id], 2, axis=1)
            kl_z += kl_gaussian(mu, log_var).mean(0)
        elif args.latent_dist == 'concrete':
            kl_z += kl_categorical_uniform(
                nn.softmax(all_z['logits'][seg_id], axis=-1)).mean(0)
        else:
            raise ValueError('Invalid argument for `latent_dist`.')

    # KL divergence on b (first segment only, ignore first time step).
    # TODO(tkipf): Implement alternative prior on soft segment length.
    probs_b = nn.softmax(all_b['logits'][0], axis=-1)
    log_prior_b = poisson_categorical_log_prior(probs_b.shape[1], prior_rate)
    kl_b = args.num_segments * kl_categorical(probs_b[:, 1:],
                                              log_prior_b[:, 1:]).mean(0)

    loss = nll + beta_z * kl_z + beta_b * kl_b
    return loss, nll, kl_z, kl_b
Example #4
def single_attention_head(params, inputs, Q_inputs=None, mask=None):
    """params = tuple of weight matrices for Q, K, V,
    
    Q_inputs: if not none, queries will be constructed from these.
    mask: if not None, should be binary mask for attention selections."""
    qw, kw, vw = params

    if Q_inputs is not None:
        Q = jnp.dot(Q_inputs, qw)
    else:
        Q = jnp.dot(inputs, qw)
    K = jnp.dot(inputs, kw)
    V = jnp.dot(inputs, vw)

    scale = jnp.sqrt(K.shape[-1])

    attention_selections = jnp.matmul(Q, jnp.transpose(K, axes=[0, 2, 1])) / scale

    if mask is not None:  # mask out "off" locations with -inf
        attention_selections = jnp.where(mask, attention_selections, -jnp.inf)

    attention_selections = softmax(attention_selections, axis=-1)

    outputs = jnp.matmul(attention_selections, V)

    return outputs
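
A minimal usage sketch with illustrative shapes (the dimensions and the causal mask are assumptions, not taken from the source; the softmax used above is assumed to be jax.nn.softmax):

import jax.numpy as jnp
from jax import random
from jax.nn import softmax  # assumed module-level import used by single_attention_head

key = random.PRNGKey(0)
k1, k2, k3, k4 = random.split(key, 4)
B, T, d_model, d_head = 2, 5, 8, 4
inputs = random.normal(k1, (B, T, d_model))
params = (random.normal(k2, (d_model, d_head)),   # qw
          random.normal(k3, (d_model, d_head)),   # kw
          random.normal(k4, (d_model, d_head)))   # vw
causal_mask = jnp.tril(jnp.ones((T, T), dtype=bool))  # broadcasts over the batch dimension
out = single_attention_head(params, inputs, mask=causal_mask)  # shape (B, T, d_head)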
Example #5
 def from_params(cls, params):
     structured_params = params.reshape((-1, 3))
     unnormalized_weights = structured_params[:, 2]
     probs = list(nn.softmax(unnormalized_weights))
     component_dists = [
         cls.component_type(p[0], p[1]) for p in structured_params
     ]
     return cls(component_dists, probs)
Example #6
 def __call__(self, x):
     w_init = hk.initializers.Orthogonal(scale=1.0 / np.sqrt(3.0))
     p = nn.softmax(hk.Linear(self.num_quantiles, w_init=w_init)(x))
     cum_p = jnp.concatenate(
         [jnp.zeros((p.shape[0], 1)),
          jnp.cumsum(p, axis=1)], axis=1)
     cum_p_prime = (cum_p[:, 1:] + cum_p[:, :-1]) / 2.0
     return cum_p, cum_p_prime
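
A hedged numeric check of the fraction construction, using a hand-made p in place of the Haiku linear layer above:

import jax.numpy as jnp

p = jnp.array([[0.2, 0.3, 0.5]])   # a softmax output; rows sum to 1
cum_p = jnp.concatenate([jnp.zeros((p.shape[0], 1)), jnp.cumsum(p, axis=1)], axis=1)
cum_p_prime = (cum_p[:, 1:] + cum_p[:, :-1]) / 2.0
# cum_p       -> [[0.0, 0.2, 0.5, 1.0]]  (monotone fractions from 0 to 1)
# cum_p_prime -> [[0.1, 0.35, 0.75]]     (midpoints of consecutive fractions)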
Example #7
 def inverse_fun(params, z):
     log_det = np.zeros(z.shape[0])
     idx = dim // 2
     lower, upper = z[:, :idx], z[:, idx:]
     out = f2_apply_fun(f2_params, upper).reshape(-1, dim // 2, 3 * K - 1)
     W, H, D = onp.array_split(out, 3, axis=2)
     W, H = nn.softmax(W, axis=2), nn.softmax(H, axis=2)
     W, H = 2 * B * W, 2 * B * H
     D = nn.softplus(D)
     lower, ld = unconstrained_RQS(lower, W, H, D, inverse=True, tail_bound=B)
     log_det += np.sum(ld, axis=1)
     out = f1_apply_fun(f1_params, lower).reshape(-1, dim // 2, 3 * K - 1)
     W, H, D = onp.array_split(out, 3, axis=2)
     W, H = nn.softmax(W, axis=2), nn.softmax(H, axis=2)
     W, H = 2 * B * W, 2 * B * H
     D = nn.softplus(D)
     upper, ld = unconstrained_RQS(upper, W, H, D, inverse=True, tail_bound=B)
     log_det += np.sum(ld, axis=1)
     return np.concatenate([lower, upper], axis=1), log_det.reshape((z.shape[0],))
Example #8
def predict(params, x):
    # per-example predictions
    activations = x
    for w, b in params[:-1]:
        outputs = jnp.dot(w, activations) + b
        activations = nn.softmax(outputs)

    final_w, final_b = params[-1]
    logits = jnp.dot(final_w, activations) + final_b
    return nn.log_softmax(logits)
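
A hedged usage note: predict handles a single example, so batching is typically done with jax.vmap (an assumption here, not shown in the source):

from jax import vmap

batched_predict = vmap(predict, in_axes=(None, 0))
# log_probs = batched_predict(params, batch_of_inputs)  # one row of log-probabilities per example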
Example #9
 def attention_op(self, query, key, value, mask=None):
     d_model = query.shape[-1]
     scores = jnp.divide(
         jnp.matmul(query, key.transpose(0, 2, 1)), jnp.sqrt(d_model)
     )
     if mask is not None:
         scores = jnp.matmul(scores, mask)
     attention = nn.softmax(scores, axis=-1)
     attention = jnp.matmul(attention, value)
     return attention
Example #10
 def predict(_x: np.ndarray):
     """Predict new values from MDN."""
     logmix, mu_data, logstd = get_mdn_coef(network(params, _x))
     pi_data = softmax(logmix)
     sigma_data = np.exp(logstd)
     z = onp.random.gumbel(loc=0, scale=1, size=pi_data.shape)
     k = (onp.log(pi_data) + z).argmax(axis=1)
     indices = (onp.arange(_x.shape[0]), k)
     rn = onp.random.randn(_x.shape[0])
     sampled = rn * sigma_data[indices] + mu_data[indices]
     return sampled
Example #11
 def __call__(self, x):
     if len(x.shape) == 4:
         x = DQNBody()(x)
     x = MLP(
         self.action_space.n,
         self.hidden_units,
         hidden_activation=nn.relu,
         output_scale=0.01,
     )(x)
     pi_s = nn.softmax(x, axis=1)
     log_pi_s = jnp.log(pi_s + (pi_s == 0.0) * 1e-8)
     return pi_s, log_pi_s
Example #12
    def from_params(cls, fixed_params, opt_params, scale=None, traceable=True):
        if scale is None:
            scale = Scale(0.0, 1.0)
        xs = fixed_params["xs"]

        densities = nn.softmax(opt_params) * opt_params.size

        return cls(xs=xs,
                   densities=densities,
                   scale=scale,
                   normalized=True,
                   traceable=True)
Example #13
def spline_unconstrained_transform(thetax: jnp.ndarray, thetay: jnp.ndarray,
                                   thetad: jnp.ndarray) -> jnp.ndarray:
    """Transform the unconstrained parameters of the spline transform into their
    constrained counterparts.

    Args:
        thetax: Unconstrained x-coordinates of the spline intervals.
        thetay: Unconstrained y-coordinates of the spline intervals.
        thetad: Unconstrained derivatives at internal points.

    Returns:
        xk: The x-coordinates of the intervals on which the rational quadratics
            are defined.
        yk: The y-coordinates of the destination intervals of the rational
            quadratic transforms.
        delta: Derivatives at internal points.

    """
    xk = jnp.atleast_2d(jnp.cumsum(2 * nn.softmax(thetax), axis=-1) - 1.)
    xk = jnp.hstack((-jnp.ones((xk.shape[0], 1)), xk))
    yk = jnp.atleast_2d(jnp.cumsum(2 * nn.softmax(thetay), axis=-1) - 1.)
    yk = jnp.hstack((-jnp.ones((yk.shape[0], 1)), yk))
    delta = nn.softplus(thetad)
    return jnp.squeeze(xk), jnp.squeeze(yk), jnp.squeeze(delta)
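
A minimal usage sketch (the parameter sizes below are illustrative assumptions):

import jax.numpy as jnp

thetax = jnp.zeros(5)   # 5 unconstrained widths  -> 6 x-knots
thetay = jnp.zeros(5)   # 5 unconstrained heights -> 6 y-knots
thetad = jnp.zeros(4)   # derivatives at the 4 internal knots
xk, yk, delta = spline_unconstrained_transform(thetax, thetay, thetad)
# xk == yk == [-1., -0.6, -0.2, 0.2, 0.6, 1.]  (monotone knots spanning [-1, 1])
# delta == softplus(0) == log(2) at every internal knot (strictly positive)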
Example #14
        def epoch_step(opt_state, key):

            def train_step(opt_state, batch):
                opt_state, loss = self.update(next(itercount), opt_state, batch)
                return opt_state, loss

            batches = self._make_minibatches(observations, batch_size, key)
            opt_state, losses = scan(train_step, opt_state, batches)

            params = get_params(opt_state)
            mixing_coeffs, probs_logits = params
            probs = expit(probs_logits)
            self.model = (softmax(mixing_coeffs), probs)
            self._probs = probs

            return opt_state, (losses.mean(), *params, self.responsibilities(observations))
Example #15
        def epoch_step(opt_state, key):

            def train_step(opt_state, batch):
                opt_state, loss = self.update(next(itercount), opt_state, batch)
                return opt_state, loss

            batches = self._make_minibatches(observations, batch_size, key)
            opt_state, losses = scan(train_step, opt_state, batches)

            params = get_params(opt_state)
            mixing_coeffs, means, untransformed_cov = params
            cov_matrix = vmap(self._transform_to_covariance_matrix)(untransformed_cov)
            self.model = (softmax(mixing_coeffs), means, cov_matrix)
            responsibilities = self.responsibilities(observations)

            return opt_state, (losses.mean(), *params, responsibilities)
Example #16
    def apply_fun(params, x, adj, rng, activation=nn.elu, is_training=False, 
                  **kwargs):
        W, a1, a2 = params
        k1, k2, k3 = random.split(rng, 3) 
        x = drop_fun(None, x, is_training=is_training, rng=k1)
        x = np.dot(x, W)

        f_1 = np.dot(x, a1) 
        f_2 = np.dot(x, a2)
        logits = f_1 + f_2.T
        coefs = nn.softmax(
            nn.leaky_relu(logits, negative_slope=0.2) + np.where(adj, 0., -1e9))

        coefs = drop_fun(None, coefs, is_training=is_training, rng=k2)
        x = drop_fun(None, x, is_training=is_training, rng=k3)

        ret = np.matmul(coefs, x)

        return activation(ret)
Example #17
    def get_samples(self, rng_key, num_samples):
        """
        Draws samples from the weighted samples collected from the run.

        :param random.PRNGKey rng_key: Random number generator key to be used to draw samples.
        :param int num_samples: The number of samples.
        :return: a dict of posterior samples
        """
        if self._results is None:
            raise RuntimeError(
                "NestedSampler.run(...) method should be called first to obtain results."
            )

        samples, log_weights = self.get_weighted_samples()
        p = nn.softmax(log_weights)
        idx = random.choice(rng_key,
                            log_weights.shape[0], (num_samples, ),
                            p=p)
        return {k: v[idx] for k, v in samples.items()}
Example #18
    def loss_fn(self, params, batch):
        '''
        Calculates expected mean negative loglikelihood.

        Parameters
        ----------
        params : tuple
            Consists of mixing coefficients and probabilities of the Bernoulli distribution respectively.

        batch : array
            The subset of observations

        Returns
        -------
        * float
            Negative log likelihood
        '''
        mixing_coeffs, probs = params
        self.model = (softmax(mixing_coeffs), expit(probs))
        return -self.expected_log_likelihood(batch) / len(batch)
Example #19
    def loss_fn(self, params, batch):
        """
        Calculates expected mean negative loglikelihood.

        Parameters
        ----------
        params : tuple
            Consists of mixing coefficients' logits, means and variances of the Gaussian distributions respectively.

        batch : array
            The subset of observations

        Returns
        -------
        * float
            Negative log likelihood
        """
        mixing_coeffs, means, untransformed_cov = params
        cov_matrix = vmap(self._transform_to_covariance_matrix)(untransformed_cov)
        self.model = (softmax(mixing_coeffs), means, cov_matrix)
        return -self.expected_log_likelihood(batch) / len(batch)
Example #20
def gmm_sample(key, resps_c, means_c, logvar_c, varmin=1e-16):
    """ Sample mixture of gaussians X ~ \sum_c \pi_c N(mean_c, exp(logvar_c))

  Arguments:
    key: random.PRNGKey for random bits
    resps_c: np.array with shape c, responsibilities in the GMM, \pi in
      the above formula is softmax(resps_c).
    means_c: np.array with shape c, means in GMM
    logvar_c: np.array with shape c, log variances in GMM
    varmin: Minimum variance allowed (numerically useful).

  Returns:
    Sample from the mixture model, np.array
  """
    keys = random.split(key, 2)
    # pick gaussian to sample
    u = random.uniform(keys[0])
    cum_resps_c = np.cumsum(softmax(resps_c))
    cidx = np.argmax(u <= cum_resps_c)
    # sample that gaussian
    return diag_gaussian_sample(keys[1], means_c[cidx], logvar_c[cidx], varmin)
Example #21
 def from_params(
         cls,
         fixed_params,
         opt_params,
         scale=None,
         traceable=True):  # FIXME: traceable; why sometimes no Scale?
     if not scale:
         scale = Scale(0.0, 1.0)
     floor = fixed_params.get("floor", -np.inf)
     ceiling = fixed_params.get("ceiling", np.inf)
     # Allow logistic center to exceed the range by 20%
     loc_min = np.maximum(scale.low, floor) - 0.2 * scale.width
     loc_max = np.minimum(scale.high, ceiling) + 0.2 * scale.width
     loc_range = loc_max - loc_min
     structured_params = opt_params.reshape((-1, 3))
     locs = loc_min + scipy.special.expit(structured_params[:, 0]) * loc_range
     # Allow logistic scales between 0.01 and 0.5
     # Don't allow tiny scales outside of the visible range
     s_min = 0.01 + 0.1 * np.where(
         (locs < scale.low),
         scale.low - locs,
         np.where(locs > scale.high, locs - scale.high, 0.0),
     )
     s_max = 0.5
     s_range = s_max - s_min
     ss = s_min + scipy.special.expit(structured_params[:, 1]) * s_range
     # Allow probs > 0.01
     probs = list(0.01 + nn.softmax(structured_params[:, 2]) *
                  (1 - 0.01 * structured_params[:, 2].size))
     # Bundle up components
     component_logistics = [
         Logistic(l, s, scale, normalized=True) for (l, s) in zip(locs, ss)
     ]
     components = [
         Truncate(base_dist=cl, floor=floor, ceiling=ceiling)
         for cl in component_logistics
     ]
     mixture = cls(components=components, probs=probs)
     return mixture
Example #22
File: stax.py Project: yotarok/jax
    def apply_fun(params, inputs, **unused_kwargs):
        query_src, key_src, value_src, kv_mask = inputs
        Q_param, K_param, V_param, O_param = params

        batch_size, kv_maxlen = kv_mask.shape
        q_maxlen = query_src.shape[1]

        Q = Q_apply(Q_param, query_src)  # (B, N, query_dim)
        K = K_apply(K_param, key_src)  # (B, T, query_dim)
        V = V_apply(V_param, value_src)  # (B, T, value_dim)

        # Reshape to expand head-wise vars
        Q = Q.reshape(Q.shape[:-1] + (nhead, single_query_dim))
        Q = Q.transpose((0, 2, 1, 3))  # Q: (B, nhead, N, single_query_dim)
        K = K.reshape(K.shape[:-1] + (nhead, single_query_dim))
        K = K.transpose((0, 2, 1, 3))  # K: (B, nhead, T, single_query_dim)

        score = np.einsum('bhnd,bhtd->bhnt', Q, K)
        scaled_score = score / np.sqrt(single_query_dim)

        masked_score = (
            scaled_score +
            (1.0 - kv_mask.reshape(batch_size, 1, 1, kv_maxlen)) * logepsilon)

        if att_prob_mask_fun is not None:
            extra_mask = att_prob_mask_fun(score.shape)
            masked_score = masked_score + (1.0 - extra_mask) * logepsilon

        att_probs = softmax(masked_score)  # (B, nhead, N, T)

        V = V.reshape(V.shape[:-1] + (nhead, single_value_dim))
        V = V.transpose((0, 2, 1, 3))  # V: (B, nhead, T, single_value_dim)

        head = np.einsum('bhnt,bhtd->bhnd', att_probs, V)

        head = head.transpose((0, 2, 1, 3)).reshape(
            (batch_size, q_maxlen, value_dim))  # collapse heads

        return O_apply(O_param, head)
Example #23
def optimize_lfads(key, init_params, hps, opt_hps, train_data_fun,
                   eval_data_fun):
    """Optimize the LFADS model and print batch based optimization data.

  This loop is at the cpu nonjax-numpy level.

  Arguments:
    key: random.PRNGKey for randomness
    init_params: a dict of parameters to be trained
    hps: dict of lfads model HPs
    opt_hps: dict of optimization HPs
    train_data_fun: function that takes a key and returns
      nexamples x time x ndims np array of data for training
    eval_data_fun: function that takes a key and returns
      nexamples x time x ndims np array of data for held out error
  Returns:
    a 2-tuple of (trained parameter dict,
      dict of optimization losses through training)"""

    # Begin optimization loop.
    all_tlosses = []
    all_elosses = []

    # Build some functions used in optimization.
    kl_warmup_fun = get_kl_warmup_fun(opt_hps)
    decay_fun = optimizers.exponential_decay(opt_hps['step_size'],
                                             opt_hps['decay_steps'],
                                             opt_hps['decay_factor'])

    opt_init, opt_update, get_params = optimizers.adam(step_size=decay_fun,
                                                       b1=opt_hps['adam_b1'],
                                                       b2=opt_hps['adam_b2'],
                                                       eps=opt_hps['adam_eps'])
    opt_state = opt_init(init_params)

    def update_w_gc(i, opt_state, hps, opt_hps, key, x_bxt, kl_warmup):
        """Update fun for gradients, includes gradient clipping."""
        params = get_params(opt_state)
        grads = grad(lfads.training_loss_jit)(params, hps, key, x_bxt,
                                              kl_warmup, opt_hps['keep_rate'])
        clipped_grads = optimizers.clip_grads(grads, opt_hps['max_grad_norm'])
        return opt_update(i, clipped_grads, opt_state)

    update_w_gc_jit = jit(update_w_gc, static_argnums=(2, 3))

    # Run the optimization, pausing every so often to collect data and
    # print status.
    batch_size = hps['batch_size']
    num_batches = opt_hps['num_batches']
    print_every = opt_hps['print_every']
    num_opt_loops = int(num_batches / print_every)
    params = get_params(opt_state)
    for oidx in range(num_opt_loops):
        batch_idx_start = oidx * print_every
        start_time = time.time()
        key, tkey, dtkey1, dtkey2, dekey1, dekey2 = \
            random.split(random.fold_in(key, oidx), 6)
        opt_state = optimize_core_jit(tkey, batch_idx_start, print_every,
                                      update_w_gc_jit, kl_warmup_fun,
                                      opt_state, hps, opt_hps, train_data_fun)
        batch_time = time.time() - start_time

        # Losses
        params = get_params(opt_state)
        batch_pidx = batch_idx_start + print_every
        kl_warmup = kl_warmup_fun(batch_idx_start)
        # Training loss
        #didxs = onp.random.randint(0, train_data.shape[0], batch_size)
        #x_bxt = train_data[didxs].astype(onp.float32)
        x_bxt = train_data_fun(dtkey1)
        tlosses = lfads.losses_jit(params, hps, dtkey2, x_bxt, kl_warmup, 1.0)

        # Evaluation loss
        #didxs = onp.random.randint(0, eval_data.shape[0], batch_size)
        #ex_bxt = eval_data[didxs].astype(onp.float32)
        ex_bxt = eval_data_fun(dekey1)
        elosses = lfads.losses_jit(params, hps, dekey2, ex_bxt, kl_warmup, 1.0)
        # Saving, printing.
        resps = softmax(params['prior']['resps'])
        rmin = onp.min(resps)
        rmax = onp.max(resps)
        rmean = onp.mean(resps)
        rstd = onp.std(resps)

        all_tlosses.append(tlosses)
        all_elosses.append(elosses)
        s1 = "Batches {}-{} in {:0.2f} sec, Step size: {:0.5f}"
        s2 = "    Training losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f} "
        s3 = "        Eval losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f} "
        s4 = "        Resps: min {:0.4f}, mean {:0.4f}, max {:0.4f}, std {:0.4f}"
        print(
            s1.format(batch_idx_start + 1, batch_pidx, batch_time,
                      decay_fun(batch_pidx)))
        print(
            s2.format(tlosses['total'], tlosses['nlog_p_xgz'],
                      tlosses['kl_prescale'], tlosses['kl'], tlosses['l2'],
                      tlosses['ii_l2'], tlosses['ii_tavg']))
        print(
            s3.format(elosses['total'], elosses['nlog_p_xgz'],
                      elosses['kl_prescale'], elosses['kl'], elosses['l2'],
                      elosses['ii_l2'], elosses['ii_tavg']))
        print(s4.format(rmin, rmean, rmax, rstd))

        tlosses_thru_training = utils.merge_losses_dicts(all_tlosses)
        elosses_thru_training = utils.merge_losses_dicts(all_elosses)
        optimizer_details = {
            'tlosses': tlosses_thru_training,
            'elosses': elosses_thru_training
        }

    return params, optimizer_details
Example #24
    def fit_sgd(self, observations, batch_size, rng_key=None, optimizer=None, num_epochs=1):
        '''
        Fits the model using gradient descent algorithm with the given hyperparameters.

        Parameters
        ----------
        observations : array
            The observations on which the Bernoulli mixture model is trained

        batch_size : int
            The size of the batch

        rng_key : array
            Random key of shape (2,) and dtype uint32

        optimizer : jax.experimental.optimizers.Optimizer
            Optimizer to be used

        num_epochs : int
            The number of epochs for which training is run

        Returns
        -------
        * array
            Mean loss values found per epoch

        * array
            Mixing coefficients found per epoch

        * array
            Probabilities of Bernoulli distribution found per epoch

        * array
            Responsibilities found per epoch
        '''
        global opt_init, opt_update, get_params

        if rng_key is None:
            rng_key = PRNGKey(0)

        if optimizer is not None:
            opt_init, opt_update, get_params = optimizer

        opt_state = opt_init((softmax(self.mixing_coeffs), logit(self.probs)))
        itercount = itertools.count()

        def epoch_step(opt_state, key):

            def train_step(opt_state, batch):
                opt_state, loss = self.update(next(itercount), opt_state, batch)
                return opt_state, loss

            batches = self._make_minibatches(observations, batch_size, key)
            opt_state, losses = scan(train_step, opt_state, batches)

            params = get_params(opt_state)
            mixing_coeffs, probs_logits = params
            probs = expit(probs_logits)
            self.model = (softmax(mixing_coeffs), probs)
            self._probs = probs

            return opt_state, (losses.mean(), *params, self.responsibilities(observations))

        epochs = split(rng_key, num_epochs)
        opt_state, history = scan(epoch_step, opt_state, epochs)
        params = get_params(opt_state)
        mixing_coeffs, probs_logits = params
        probs = expit(probs_logits)
        self.model = (softmax(mixing_coeffs), probs)
        self._probs = probs
        return history
Example #25
def exponential_mechanism(rng, votes, per_example_epsilon, sensitivity=1.):
  """Exponential mechanism."""
  scores = nn.softmax(per_example_epsilon * votes / (2 * sensitivity))
  return randomly_sample(rng, scores)
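
A hedged numeric check of the score computation only (randomly_sample is defined elsewhere in the source; the vote values are illustrative):

import jax.numpy as jnp
from jax import nn

votes = jnp.array([4.0, 2.0, 0.0])
scores = nn.softmax(1.0 * votes / (2 * 1.0))  # per_example_epsilon=1, sensitivity=1
# scores sum to 1; candidates with more votes receive exponentially larger probability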
Example #26
def optimize_lfads(key,
                   init_params,
                   hps,
                   opt_hps,
                   train_data_fun,
                   eval_data_fun,
                   ncompleted_batches=0,
                   opt_state=None,
                   callback_fun=None,
                   do_print=True):
    """Optimize the LFADS model and print batch based optimization data.

  This loop is at the cpu nonjax-numpy level.

  Arguments:
    key: random.PRNGKey for randomness
    init_params: a dict of parameters to be trained
    hps: dict of lfads model HPs
    opt_hps: dict of optimization HPs
    train_data_fun: function that takes a key and returns
      nexamples x time x ndims np array of data for training
    eval_data_fun: function that takes a key and returns
      nexamples x time x ndims np array of data for held out error
    ncompleted_batches: (default 0), use this to restart training in the middle
      of the batch count. Used in tandem with opt_state (below).
    opt_state: (default None) 3-tuple (params, m - 1st moment, v - 2nd moment) 
      from jax.experimental.optimizers.adam (None value starts optimizer anew).
      The params in opt_state[0] will *override* the init_params argument.
    callback_fun: (default None) function that the optimize routine will call
      every print_every loops, in order to do whatever the user wants, typically
      saving, or reporting to a hyperparameter tuner, etc.
      callback_fun parameters are
        (current_batch_idx:int, hps:dict, opt_hps:dict, 
         params:dict, opt_state:tuple,
         tlosses:dict, elosses:dict) 
    do_print: (default True), print loss information
  Returns:
    A 3-tuple of 
      (trained_params, 
       opt_details - dictionary of optimization losses through training, 
       (opt_state - a 3-tuple of trained params in odd pytree form, 
         m 1st moment, v 2nd moment)).
  """

    # Begin optimization loop.
    all_tlosses = []
    all_elosses = []

    # Build some functions used in optimization.
    kl_warmup_fun = get_kl_warmup_fun(opt_hps)
    decay_fun = optimizers.exponential_decay(opt_hps['step_size'],
                                             opt_hps['decay_steps'],
                                             opt_hps['decay_factor'])

    opt_init, opt_update, get_params = optimizers.adam(step_size=decay_fun,
                                                       b1=opt_hps['adam_b1'],
                                                       b2=opt_hps['adam_b2'],
                                                       eps=opt_hps['adam_eps'])
    print_every = opt_hps['print_every']
    if ncompleted_batches > 0:
        print('Starting batch count at %d.' % (ncompleted_batches))
        assert ncompleted_batches % print_every == 0
        opt_loop_start_idx = int(ncompleted_batches / print_every)
    else:
        opt_loop_start_idx = 0
    if opt_state is not None:
        print('Received opt_state, ignoring init_params argument.')
    else:
        opt_state = opt_init(init_params)

    def update_w_gc(i, opt_state, hps, opt_hps, key, x_bxt, kl_warmup):
        """Update fun for gradients, includes gradient clipping."""
        params = get_params(opt_state)
        grads = grad(lfads.training_loss_jit)(params, hps, key, x_bxt,
                                              kl_warmup, opt_hps['keep_rate'])
        clipped_grads = optimizers.clip_grads(grads, opt_hps['max_grad_norm'])
        return opt_update(i, clipped_grads, opt_state)

    update_w_gc_jit = jit(update_w_gc, static_argnums=(2, 3))

    # Run the optimization, pausing every so often to collect data and
    # print status.
    batch_size = hps['batch_size']
    num_batches = opt_hps['num_batches']
    assert num_batches % print_every == 0
    num_opt_loops = int(num_batches / print_every)
    params = get_params(opt_state)
    for oidx in range(opt_loop_start_idx, num_opt_loops):
        batch_idx_start = oidx * print_every
        start_time = time.time()
        key, tkey, dtkey1, dtkey2, dekey1, dekey2 = \
            random.split(random.fold_in(key, oidx), 6)
        opt_state = optimize_core_jit(tkey, batch_idx_start, print_every,
                                      update_w_gc_jit, kl_warmup_fun,
                                      opt_state, hps, opt_hps, train_data_fun)
        batch_time = time.time() - start_time

        # Losses
        params = get_params(opt_state)
        batch_pidx = batch_idx_start + print_every
        kl_warmup = kl_warmup_fun(batch_idx_start)
        # Training loss
        x_bxt = train_data_fun(dtkey1)
        tlosses = lfads.losses_jit(params, hps, dtkey2, x_bxt, kl_warmup, 1.0)

        # Evaluation loss
        ex_bxt = eval_data_fun(dekey1)
        elosses = lfads.losses_jit(params, hps, dekey2, ex_bxt, kl_warmup, 1.0)
        # Saving, printing.
        resps = softmax(params['prior']['resps'])
        rmin = onp.min(resps)
        rmax = onp.max(resps)
        rmean = onp.mean(resps)
        rstd = onp.std(resps)

        all_tlosses.append(tlosses)
        all_elosses.append(elosses)
        if do_print:
            s1 = "Batches {}-{} in {:0.2f} sec, Step size: {:0.5f}"
            s2 = "    Training losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f} "
            s3 = "        Eval losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f} "
            s4 = "        Resps: min {:0.4f}, mean {:0.4f}, max {:0.4f}, std {:0.4f}"
            print(
                s1.format(batch_idx_start + 1, batch_pidx, batch_time,
                          decay_fun(batch_pidx)))
            print(
                s2.format(tlosses['total'], tlosses['nlog_p_xgz'],
                          tlosses['kl_prescale'], tlosses['kl'], tlosses['l2'],
                          tlosses['ii_l2'], tlosses['ii_tavg']))
            print(
                s3.format(elosses['total'], elosses['nlog_p_xgz'],
                          elosses['kl_prescale'], elosses['kl'], elosses['l2'],
                          elosses['ii_l2'], elosses['ii_tavg']))
            print(s4.format(rmin, rmean, rmax, rstd))

        if callback_fun is not None:
            callback_fun(batch_pidx, hps, opt_hps, params, opt_state, tlosses,
                         elosses)

    tlosses_thru_training = utils.merge_losses_dicts(all_tlosses)
    elosses_thru_training = utils.merge_losses_dicts(all_elosses)
    optimizer_details = {
        'tlosses': tlosses_thru_training,
        'elosses': elosses_thru_training
    }

    return params, optimizer_details, opt_state
Example #27
 def _rvs(self, p):
     if self.is_logits:
         p = softmax(p)
     return categorical_rvs(self._random_state, p, self._size)
Example #28
 def _rvs(self, n, p):
     if self.is_logits:
         p = softmax(p)
     return multinomial_rvs(self._random_state, p, n, self._size)
Example #29
 def loss(q, k, dummy_proj, attn):
     logits = q @ k.T
     probs = softmax(logits)
     return fat.kl(attn, probs).mean()
Example #30
def _to_probs_multinom(logits):
    return softmax(logits, axis=-1)
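
A minimal usage sketch, assuming the softmax above is jax.nn.softmax:

import jax.numpy as jnp

logits = jnp.array([[2.0, 1.0, 0.1]])
probs = _to_probs_multinom(logits)   # each row is a probability vector summing to 1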