def loss_fn(params, batch, lens):
    '''
    Objective function of hidden Markov models for discrete observations.
    Returns the mean negative loglikelihood of the observation sequences.

    Parameters
    ----------
    params : HMMJax
        Hidden Markov Model
    batch : array(N, max_len)
        Minibatch consisting of observation sequences
    lens : array(N,)
        Valid length of each observation sequence in the minibatch

    Returns
    -------
    * float
        The mean negative loglikelihood of the minibatch
    '''
    params_soft = HMMJax(softmax(params.trans_mat, axis=1),
                         softmax(params.obs_mat, axis=1),
                         softmax(params.init_dist))
    return -hmm_loglikelihood_jax(params_soft, batch, lens).mean()
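# A minimal usage sketch for loss_fn above. HMMJax and hmm_loglikelihood_jax
# are not reproduced here, so HMMJax is assumed to be a simple NamedTuple of
# unconstrained arrays, and the sketch only checks that the softmax
# reparameterization yields valid stochastic matrices.
from typing import NamedTuple
import jax.numpy as jnp
from jax import random
from jax.nn import softmax

class HMMJax(NamedTuple):  # stand-in container, for illustration only
    trans_mat: jnp.ndarray  # (n_states, n_states), unconstrained
    obs_mat: jnp.ndarray    # (n_states, n_obs), unconstrained
    init_dist: jnp.ndarray  # (n_states,), unconstrained

k1, k2, k3 = random.split(random.PRNGKey(0), 3)
params = HMMJax(random.normal(k1, (3, 3)),
                random.normal(k2, (3, 5)),
                random.normal(k3, (3,)))

# After the softmax reparameterization, every row is a valid distribution.
assert jnp.allclose(softmax(params.trans_mat, axis=1).sum(axis=1), 1.0)
assert jnp.allclose(softmax(params.obs_mat, axis=1).sum(axis=1), 1.0)
assert jnp.allclose(softmax(params.init_dist).sum(), 1.0)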
def spline_params(params, upper):
    # Predict rational-quadratic spline parameters from the upper half.
    outputs = network_apply_fun(params, upper)
    outputs = np.reshape(outputs, [-1, lower_dim, 3 * K - 1])
    W, H, D = np.split(outputs, [K, 2 * K], axis=2)
    W = 2 * B * softmax(W)   # bin widths: positive, summing to 2B
    H = 2 * B * softmax(H)   # bin heights: positive, summing to 2B
    D = softplus(D)          # positive derivatives at internal knots
    return W, H, D
def get_losses(inputs, outputs, args, beta_b=.1, beta_z=.1, prior_rate=3.):
    """Get losses (NLL, KL divergences and neg. ELBO).

    Args:
        inputs: Padded input sequences.
        outputs: CompILE model output tuple.
        args: Argument dict from `ArgumentParser`.
        beta_b: Scaling factor for KL term of boundary variables (b).
        beta_z: Scaling factor for KL term of latents (z).
        prior_rate: Rate (lambda) for Poisson prior.
    """
    targets = inputs.reshape(-1)
    all_encs, all_recs, all_masks, all_b, all_z = outputs
    input_dim = args.num_symbols + 1

    nll = 0.
    kl_z = 0.
    for seg_id in range(args.num_segments):
        seg_prob = get_segment_probs(all_b['samples'], all_masks, seg_id)
        preds = all_recs[seg_id].reshape(-1, input_dim)
        seg_loss = cross_entropy(
            preds, targets, reduction='none').reshape(-1, inputs.shape[1])

        # Ignore EOS token (last sequence element) in loss.
        nll += (seg_loss[:, :-1] * seg_prob[:, :-1]).sum(1).mean(0)

        # KL divergence on z.
        if args.latent_dist == 'gaussian':
            mu, log_var = jnp.split(all_z['logits'][seg_id], 2, axis=1)
            kl_z += kl_gaussian(mu, log_var).mean(0)
        elif args.latent_dist == 'concrete':
            kl_z += kl_categorical_uniform(
                nn.softmax(all_z['logits'][seg_id], axis=-1)).mean(0)
        else:
            raise ValueError('Invalid argument for `latent_dist`.')

    # KL divergence on b (first segment only, ignore first time step).
    # TODO(tkipf): Implement alternative prior on soft segment length.
    probs_b = nn.softmax(all_b['logits'][0], axis=-1)
    log_prior_b = poisson_categorical_log_prior(probs_b.shape[1], prior_rate)
    kl_b = args.num_segments * kl_categorical(
        probs_b[:, 1:], log_prior_b[:, 1:]).mean(0)

    loss = nll + beta_z * kl_z + beta_b * kl_b
    return loss, nll, kl_z, kl_b
def single_attention_head(params, inputs, Q_inputs=None, mask=None):
    """Single scaled dot-product attention head.

    params: tuple of weight matrices for Q, K, V.
    Q_inputs: if not None, queries are constructed from these instead of
        `inputs` (cross-attention).
    mask: if not None, a binary mask over attention selections.
    """
    qw, kw, vw = params
    if Q_inputs is not None:
        Q = jnp.dot(Q_inputs, qw)
    else:
        Q = jnp.dot(inputs, qw)
    K = jnp.dot(inputs, kw)
    V = jnp.dot(inputs, vw)
    scale = jnp.sqrt(K.shape[-1])
    attention_selections = jnp.matmul(
        Q, jnp.transpose(K, axes=[0, 2, 1])) / scale
    if mask is not None:
        # Mask out "off" locations with -inf before the softmax.
        attention_selections = jnp.where(mask, attention_selections, -jnp.inf)
    attention_selections = softmax(attention_selections, axis=-1)
    outputs = jnp.matmul(attention_selections, V)
    return outputs
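# A runnable sketch exercising single_attention_head above with random
# weights and a causal mask; the shapes are illustrative, and the imports
# mirror what the snippet assumes (jax.numpy as jnp, jax.nn.softmax).
import jax.numpy as jnp
from jax import random
from jax.nn import softmax

kq, kk, kv, kx = random.split(random.PRNGKey(0), 4)
d_model, seq_len, batch = 16, 10, 2
params = (random.normal(kq, (d_model, d_model)),   # qw
          random.normal(kk, (d_model, d_model)),   # kw
          random.normal(kv, (d_model, d_model)))   # vw
inputs = random.normal(kx, (batch, seq_len, d_model))

# Causal mask: position i may only attend to positions <= i.
mask = jnp.tril(jnp.ones((seq_len, seq_len), dtype=bool))
out = single_attention_head(params, inputs, mask=mask)
assert out.shape == (batch, seq_len, d_model)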
def from_params(cls, params):
    structured_params = params.reshape((-1, 3))
    unnormalized_weights = structured_params[:, 2]
    probs = list(nn.softmax(unnormalized_weights))
    component_dists = [cls.component_type(p[0], p[1])
                       for p in structured_params]
    return cls(component_dists, probs)
def __call__(self, x):
    w_init = hk.initializers.Orthogonal(scale=1.0 / np.sqrt(3.0))
    p = nn.softmax(hk.Linear(self.num_quantiles, w_init=w_init)(x))
    cum_p = jnp.concatenate(
        [jnp.zeros((p.shape[0], 1)), jnp.cumsum(p, axis=1)], axis=1)
    cum_p_prime = (cum_p[:, 1:] + cum_p[:, :-1]) / 2.0
    return cum_p, cum_p_prime
def inverse_fun(params, z):
    log_det = np.zeros(z.shape[0])
    idx = dim // 2
    lower, upper = z[:, :idx], z[:, idx:]

    # Second coupling layer (conditioned on the upper half), inverted first.
    out = f2_apply_fun(f2_params, upper).reshape(-1, dim // 2, 3 * K - 1)
    # array_split yields sizes (K, K, K-1): widths, heights, derivatives.
    # Use jax.numpy (np) rather than host numpy so the op stays traceable.
    W, H, D = np.array_split(out, 3, axis=2)
    W, H = nn.softmax(W, axis=2), nn.softmax(H, axis=2)
    W, H = 2 * B * W, 2 * B * H
    D = nn.softplus(D)
    lower, ld = unconstrained_RQS(lower, W, H, D, inverse=True, tail_bound=B)
    log_det += np.sum(ld, axis=1)

    # First coupling layer (conditioned on the lower half).
    out = f1_apply_fun(f1_params, lower).reshape(-1, dim // 2, 3 * K - 1)
    W, H, D = np.array_split(out, 3, axis=2)
    W, H = nn.softmax(W, axis=2), nn.softmax(H, axis=2)
    W, H = 2 * B * W, 2 * B * H
    D = nn.softplus(D)
    upper, ld = unconstrained_RQS(upper, W, H, D, inverse=True, tail_bound=B)
    log_det += np.sum(ld, axis=1)

    return np.concatenate([lower, upper], axis=1), log_det.reshape((z.shape[0],))
def predict(params, x):
    # Per-example predictions.
    activations = x
    for w, b in params[:-1]:
        outputs = jnp.dot(w, activations) + b
        activations = nn.softmax(outputs)
    final_w, final_b = params[-1]
    logits = jnp.dot(final_w, activations) + final_b
    return nn.log_softmax(logits)
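# A usage sketch for predict above. init_params is a hypothetical
# initializer (an assumption, not part of the snippet) producing the list of
# (w, b) pairs predict expects; vmap gives the standard batched version.
import jax.numpy as jnp
from jax import random, vmap

def init_params(key, sizes=(784, 128, 10)):
    # One (w, b) pair per layer; w has shape (out, in) to match jnp.dot(w, x).
    keys = random.split(key, len(sizes) - 1)
    return [(random.normal(k, (out, inp)) * 0.01, jnp.zeros(out))
            for k, inp, out in zip(keys, sizes[:-1], sizes[1:])]

params = init_params(random.PRNGKey(0))
x = random.normal(random.PRNGKey(1), (784,))
log_probs = predict(params, x)                      # per-example prediction
batched_predict = vmap(predict, in_axes=(None, 0))  # batch over leading axis
batch_log_probs = batched_predict(
    params, random.normal(random.PRNGKey(2), (32, 784)))
assert batch_log_probs.shape == (32, 10)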
def attention_op(self, query, key, value, mask=None):
    d_model = query.shape[-1]
    scores = jnp.divide(
        jnp.matmul(query, key.transpose(0, 2, 1)), jnp.sqrt(d_model))
    if mask is not None:
        # Suppress masked-out positions with a large negative value before
        # the softmax, so they receive (near-)zero attention weight.
        scores = jnp.where(mask, scores, -1e9)
    attention = nn.softmax(scores, axis=-1)
    attention = jnp.matmul(attention, value)
    return attention
def predict(_x: np.ndarray):
    """Predict new values from MDN."""
    logmix, mu_data, logstd = get_mdn_coef(network(params, _x))
    pi_data = softmax(logmix)
    sigma_data = np.exp(logstd)
    # Gumbel-max trick: pick one mixture component per example.
    z = onp.random.gumbel(loc=0, scale=1, size=pi_data.shape)
    k = (onp.log(pi_data) + z).argmax(axis=1)
    indices = (onp.arange(_x.shape[0]), k)
    # Sample from the chosen Gaussian component.
    rn = onp.random.randn(_x.shape[0])
    sampled = rn * sigma_data[indices] + mu_data[indices]
    return sampled
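# predict relies on the Gumbel-max trick. A standalone numpy demo showing
# that argmax(log(pi) + Gumbel noise) samples exactly from Categorical(pi):
import numpy as onp

rng = onp.random.default_rng(0)
pi = onp.array([0.1, 0.6, 0.3])
n = 100_000
z = rng.gumbel(size=(n, pi.size))
draws = (onp.log(pi) + z).argmax(axis=1)
freqs = onp.bincount(draws, minlength=pi.size) / n
print(freqs)  # approximately [0.1, 0.6, 0.3]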
def __call__(self, x):
    if len(x.shape) == 4:
        x = DQNBody()(x)
    x = MLP(
        self.action_space.n,
        self.hidden_units,
        hidden_activation=nn.relu,
        output_scale=0.01,
    )(x)
    pi_s = nn.softmax(x, axis=1)
    log_pi_s = jnp.log(pi_s + (pi_s == 0.0) * 1e-8)
    return pi_s, log_pi_s
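# The epsilon guard above avoids log(0) when the softmax saturates;
# jax.nn.log_softmax computes the same quantity stably in one pass. A small
# comparison of the two patterns:
import jax.numpy as jnp
from jax import nn

logits = jnp.array([[10.0, -10.0, 0.0]])
pi = nn.softmax(logits, axis=1)
log_pi_guarded = jnp.log(pi + (pi == 0.0) * 1e-8)  # pattern used above
log_pi_fused = nn.log_softmax(logits, axis=1)      # fused, numerically stable
print(log_pi_guarded)
print(log_pi_fused)  # agree except where the guard clamps exact zeros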
def from_params(cls, fixed_params, opt_params, scale=None, traceable=True):
    if scale is None:
        scale = Scale(0.0, 1.0)
    xs = fixed_params["xs"]
    densities = nn.softmax(opt_params) * opt_params.size
    return cls(xs=xs, densities=densities, scale=scale, normalized=True,
               traceable=traceable)
def spline_unconstrained_transform(thetax: jnp.ndarray, thetay: jnp.ndarray,
                                   thetad: jnp.ndarray) -> jnp.ndarray:
    """Transform the unconstrained parameters of the spline transform into
    their constrained counterparts.

    Args:
        thetax: Unconstrained x-coordinates of the spline intervals.
        thetay: Unconstrained y-coordinates of the spline intervals.
        thetad: Unconstrained derivatives at internal points.

    Returns:
        xk: The x-coordinates of the intervals on which the rational
            quadratics are defined.
        yk: The y-coordinates of the destination intervals of the rational
            quadratic transforms.
        delta: Derivatives at internal points.
    """
    xk = jnp.atleast_2d(jnp.cumsum(2 * nn.softmax(thetax), axis=-1) - 1.)
    xk = jnp.hstack((-jnp.ones((xk.shape[0], 1)), xk))
    yk = jnp.atleast_2d(jnp.cumsum(2 * nn.softmax(thetay), axis=-1) - 1.)
    yk = jnp.hstack((-jnp.ones((yk.shape[0], 1)), yk))
    delta = nn.softplus(thetad)
    return jnp.squeeze(xk), jnp.squeeze(yk), jnp.squeeze(delta)
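# A runnable check of spline_unconstrained_transform above: softmax makes the
# bin widths positive and sum to 2, so the cumulative knots form a monotone
# partition of [-1, 1]. K is an illustrative bin count.
import jax.numpy as jnp
from jax import random

kx, ky, kd = random.split(random.PRNGKey(0), 3)
K = 8  # number of spline bins
xk, yk, delta = spline_unconstrained_transform(
    random.normal(kx, (K,)), random.normal(ky, (K,)),
    random.normal(kd, (K - 1,)))
assert xk.shape == (K + 1,)
assert jnp.allclose(xk[0], -1.0) and jnp.allclose(xk[-1], 1.0)
assert jnp.all(jnp.diff(xk) > 0)   # strictly increasing knots
assert jnp.all(delta > 0)          # softplus keeps derivatives positive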
def epoch_step(opt_state, key):
    def train_step(opt_state, batch):
        opt_state, loss = self.update(next(itercount), opt_state, batch)
        return opt_state, loss

    batches = self._make_minibatches(observations, batch_size, key)
    opt_state, losses = scan(train_step, opt_state, batches)

    params = get_params(opt_state)
    mixing_coeffs, probs_logits = params
    probs = expit(probs_logits)
    self.model = (softmax(mixing_coeffs), probs)
    self._probs = probs
    return opt_state, (losses.mean(), *params,
                       self.responsibilities(observations))
def epoch_step(opt_state, key):
    def train_step(opt_state, batch):
        opt_state, loss = self.update(next(itercount), opt_state, batch)
        return opt_state, loss

    batches = self._make_minibatches(observations, batch_size, key)
    opt_state, losses = scan(train_step, opt_state, batches)

    params = get_params(opt_state)
    mixing_coeffs, means, untransformed_cov = params
    cov_matrix = vmap(self._transform_to_covariance_matrix)(untransformed_cov)
    self.model = (softmax(mixing_coeffs), means, cov_matrix)
    responsibilities = self.responsibilities(observations)
    return opt_state, (losses.mean(), *params, responsibilities)
def apply_fun(params, x, adj, rng, activation=nn.elu,
              is_training=False, **kwargs):
    W, a1, a2 = params
    k1, k2, k3 = random.split(rng, 3)
    x = drop_fun(None, x, is_training=is_training, rng=k1)
    x = np.dot(x, W)
    # Additive attention logits: a1 scores the source node, a2 the target;
    # broadcasting f_1 + f_2.T yields all pairwise scores.
    f_1 = np.dot(x, a1)
    f_2 = np.dot(x, a2)
    logits = f_1 + f_2.T
    # Mask non-edges with a large negative value before normalizing.
    coefs = nn.softmax(
        nn.leaky_relu(logits, negative_slope=0.2) + np.where(adj, 0., -1e9))
    coefs = drop_fun(None, coefs, is_training=is_training, rng=k2)
    x = drop_fun(None, x, is_training=is_training, rng=k3)
    ret = np.matmul(coefs, x)
    return activation(ret)
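# A toy-graph sketch for the graph-attention apply_fun above. drop_fun is not
# shown in the snippet, so an identity stand-in is used here purely for
# illustration; in the original it is presumably a stax-style Dropout apply
# function.
import jax.numpy as np
from jax import random, nn

def drop_fun(params, x, is_training=False, rng=None):
    return x  # identity stand-in for dropout (illustration only)

kw, k1, k2, kx, kr = random.split(random.PRNGKey(0), 5)
n_nodes, in_dim, out_dim = 5, 8, 4
params = (random.normal(kw, (in_dim, out_dim)),  # W
          random.normal(k1, (out_dim, 1)),       # a1
          random.normal(k2, (out_dim, 1)))       # a2
x = random.normal(kx, (n_nodes, in_dim))
# Random sparse adjacency with self-loops.
adj = np.eye(n_nodes, dtype=bool) | (random.uniform(kr, (n_nodes, n_nodes)) < 0.3)
out = apply_fun(params, x, adj, random.PRNGKey(1))
assert out.shape == (n_nodes, out_dim)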
def get_samples(self, rng_key, num_samples):
    """
    Draws samples from the weighted samples collected from the run.

    :param random.PRNGKey rng_key: Random number generator key to be used
        to draw samples.
    :param int num_samples: The number of samples.
    :return: a dict of posterior samples
    """
    if self._results is None:
        raise RuntimeError(
            "NestedSampler.run(...) method should be called first to "
            "obtain results.")
    samples, log_weights = self.get_weighted_samples()
    p = nn.softmax(log_weights)
    idx = random.choice(rng_key, log_weights.shape[0], (num_samples,), p=p)
    return {k: v[idx] for k, v in samples.items()}
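# For context, a hedged sketch of how this method is typically reached via
# numpyro's NestedSampler (which wraps jaxns); the toy model below is an
# assumption for illustration, not part of the snippet.
import jax.numpy as jnp
import numpyro
import numpyro.distributions as dist
from jax import random
from numpyro.contrib.nested_sampling import NestedSampler

def model(y):
    mu = numpyro.sample("mu", dist.Normal(0.0, 10.0))
    numpyro.sample("obs", dist.Normal(mu, 1.0), obs=y)

y = jnp.array([0.9, 1.1, 1.3])
ns = NestedSampler(model)
ns.run(random.PRNGKey(0), y)
posterior = ns.get_samples(random.PRNGKey(1), num_samples=1000)
print(posterior["mu"].mean())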
def loss_fn(self, params, batch):
    '''
    Calculates the expected mean negative loglikelihood.

    Parameters
    ----------
    params : tuple
        Consists of mixing coefficients and probabilities of the Bernoulli
        distribution, respectively.
    batch : array
        The subset of observations

    Returns
    -------
    * float
        Negative log likelihood
    '''
    mixing_coeffs, probs = params
    self.model = (softmax(mixing_coeffs), expit(probs))
    return -self.expected_log_likelihood(batch) / len(batch)
def loss_fn(self, params, batch):
    """
    Calculates the expected mean negative loglikelihood.

    Parameters
    ----------
    params : tuple
        Consists of mixing coefficients' logits, means and variances of the
        Gaussian distributions, respectively.
    batch : array
        The subset of observations

    Returns
    -------
    * float
        Negative log likelihood
    """
    mixing_coeffs, means, untransformed_cov = params
    cov_matrix = vmap(self._transform_to_covariance_matrix)(untransformed_cov)
    self.model = (softmax(mixing_coeffs), means, cov_matrix)
    return -self.expected_log_likelihood(batch) / len(batch)
def gmm_sample(key, resps_c, means_c, logvar_c, varmin=1e-16):
    r"""Sample a mixture of Gaussians X ~ \sum_c \pi_c N(mean_c, exp(logvar_c)).

    Arguments:
        key: random.PRNGKey for random bits
        resps_c: np.array with shape c, responsibilities in the GMM;
            \pi in the above formula is softmax(resps_c).
        means_c: np.array with shape c, means in GMM
        logvar_c: np.array with shape c, log variances in GMM
        varmin: Minimum variance allowed (numerically useful).

    Returns:
        Sample from the mixture model, np.array
    """
    keys = random.split(key, 2)
    # Pick which Gaussian to sample via the inverse CDF of the mixture weights.
    u = random.uniform(keys[0])
    cum_resps_c = np.cumsum(softmax(resps_c))
    cidx = np.argmax(u <= cum_resps_c)
    # Sample that Gaussian.
    return diag_gaussian_sample(keys[1], means_c[cidx], logvar_c[cidx], varmin)
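# A usage sketch for gmm_sample above. diag_gaussian_sample is not reproduced
# in the snippet, so a plausible stand-in is defined here (an assumption, for
# illustration only).
import jax.numpy as np
from jax import random
from jax.nn import softmax

def diag_gaussian_sample(key, mean, logvar, varmin):
    # Stand-in: reparameterized draw with a variance floor.
    var = np.exp(logvar) + varmin
    return mean + np.sqrt(var) * random.normal(key, np.shape(mean))

resps_c = np.array([0.0, 1.0, -1.0])   # unnormalized mixture logits
means_c = np.array([-2.0, 0.0, 2.0])
logvar_c = np.array([-1.0, -1.0, -1.0])
x = gmm_sample(random.PRNGKey(0), resps_c, means_c, logvar_c)
print(x)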
def from_params(cls, fixed_params, opt_params, scale=None, traceable=True):
    # FIXME: traceable; why sometimes no Scale?
    if not scale:
        scale = Scale(0.0, 1.0)
    floor = fixed_params.get("floor", -np.inf)
    ceiling = fixed_params.get("ceiling", np.inf)

    # Allow logistic center to exceed the range by 20%
    loc_min = np.maximum(scale.low, floor) - 0.2 * scale.width
    loc_max = np.minimum(scale.high, ceiling) + 0.2 * scale.width
    loc_range = loc_max - loc_min

    structured_params = opt_params.reshape((-1, 3))
    locs = loc_min + scipy.special.expit(structured_params[:, 0]) * loc_range

    # Allow logistic scales between 0.01 and 0.5
    # Don't allow tiny scales outside of the visible range
    s_min = 0.01 + 0.1 * np.where(
        (locs < scale.low),
        scale.low - locs,
        np.where(locs > scale.high, locs - scale.high, 0.0),
    )
    s_max = 0.5
    s_range = s_max - s_min
    ss = s_min + scipy.special.expit(structured_params[:, 1]) * s_range

    # Allow probs > 0.01
    probs = list(0.01 + nn.softmax(structured_params[:, 2]) *
                 (1 - 0.01 * structured_params[:, 2].size))

    # Bundle up components
    component_logistics = [
        Logistic(l, s, scale, normalized=True) for (l, s) in zip(locs, ss)
    ]
    components = [
        Truncate(base_dist=cl, floor=floor, ceiling=ceiling)
        for cl in component_logistics
    ]
    mixture = cls(components=components, probs=probs)
    return mixture
def apply_fun(params, inputs, **unused_kwargs):
    query_src, key_src, value_src, kv_mask = inputs
    Q_param, K_param, V_param, O_param = params
    batch_size, kv_maxlen = kv_mask.shape
    q_maxlen = query_src.shape[1]

    Q = Q_apply(Q_param, query_src)  # (B, N, query_dim)
    K = K_apply(K_param, key_src)    # (B, T, query_dim)
    V = V_apply(V_param, value_src)  # (B, T, value_dim)

    # Reshape to expand head-wise vars
    Q = Q.reshape(Q.shape[:-1] + (nhead, single_query_dim))
    Q = Q.transpose((0, 2, 1, 3))  # Q: (B, nhead, N, single_query_dim)
    K = K.reshape(K.shape[:-1] + (nhead, single_query_dim))
    K = K.transpose((0, 2, 1, 3))  # K: (B, nhead, T, single_query_dim)

    score = np.einsum('bhnd,bhtd->bhnt', Q, K)
    scaled_score = score / np.sqrt(single_query_dim)
    masked_score = (
        scaled_score +
        (1.0 - kv_mask.reshape(batch_size, 1, 1, kv_maxlen)) * logepsilon)
    if att_prob_mask_fun is not None:
        extra_mask = att_prob_mask_fun(score.shape)
        masked_score = masked_score + (1.0 - extra_mask) * logepsilon
    att_probs = softmax(masked_score)  # (B, nhead, N, T)

    V = V.reshape(V.shape[:-1] + (nhead, single_value_dim))
    V = V.transpose((0, 2, 1, 3))  # V: (B, nhead, T, single_value_dim)
    head = np.einsum('bhnt,bhtd->bhnd', att_probs, V)
    head = head.transpose((0, 2, 1, 3)).reshape(
        (batch_size, q_maxlen, value_dim))  # collapse heads
    return O_apply(O_param, head)
def optimize_lfads(key, init_params, hps, opt_hps,
                   train_data_fun, eval_data_fun):
    """Optimize the LFADS model and print batch based optimization data.

    This loop is at the cpu nonjax-numpy level.

    Arguments:
        init_params: a dict of parameters to be trained
        hps: dict of lfads model HPs
        opt_hps: dict of optimization HPs
        train_data_fun: function that takes a key and returns
            nexamples x time x ndims np array of data for training
        eval_data_fun: function that takes a key and returns
            nexamples x time x ndims np array of data for held out error

    Returns:
        a dictionary of trained parameters"""
    # Begin optimization loop.
    all_tlosses = []
    all_elosses = []

    # Build some functions used in optimization.
    kl_warmup_fun = get_kl_warmup_fun(opt_hps)
    decay_fun = optimizers.exponential_decay(opt_hps['step_size'],
                                             opt_hps['decay_steps'],
                                             opt_hps['decay_factor'])

    opt_init, opt_update, get_params = optimizers.adam(
        step_size=decay_fun,
        b1=opt_hps['adam_b1'], b2=opt_hps['adam_b2'],
        eps=opt_hps['adam_eps'])
    opt_state = opt_init(init_params)

    def update_w_gc(i, opt_state, hps, opt_hps, key, x_bxt, kl_warmup):
        """Update fun for gradients, includes gradient clipping."""
        params = get_params(opt_state)
        grads = grad(lfads.training_loss_jit)(params, hps, key, x_bxt,
                                              kl_warmup, opt_hps['keep_rate'])
        clipped_grads = optimizers.clip_grads(grads, opt_hps['max_grad_norm'])
        return opt_update(i, clipped_grads, opt_state)

    update_w_gc_jit = jit(update_w_gc, static_argnums=(2, 3))

    # Run the optimization, pausing every so often to collect data and
    # print status.
    batch_size = hps['batch_size']
    num_batches = opt_hps['num_batches']
    print_every = opt_hps['print_every']
    num_opt_loops = int(num_batches / print_every)
    params = get_params(opt_state)
    for oidx in range(num_opt_loops):
        batch_idx_start = oidx * print_every
        start_time = time.time()
        key, tkey, dtkey1, dtkey2, dekey1, dekey2 = \
            random.split(random.fold_in(key, oidx), 6)
        opt_state = optimize_core_jit(tkey, batch_idx_start, print_every,
                                      update_w_gc_jit, kl_warmup_fun,
                                      opt_state, hps, opt_hps,
                                      train_data_fun)
        batch_time = time.time() - start_time

        # Losses
        params = get_params(opt_state)
        batch_pidx = batch_idx_start + print_every
        kl_warmup = kl_warmup_fun(batch_idx_start)

        # Training loss
        x_bxt = train_data_fun(dtkey1)
        tlosses = lfads.losses_jit(params, hps, dtkey2, x_bxt, kl_warmup, 1.0)

        # Evaluation loss
        ex_bxt = eval_data_fun(dekey1)
        elosses = lfads.losses_jit(params, hps, dekey2, ex_bxt, kl_warmup, 1.0)

        # Saving, printing.
        resps = softmax(params['prior']['resps'])
        rmin = onp.min(resps)
        rmax = onp.max(resps)
        rmean = onp.mean(resps)
        rstd = onp.std(resps)

        all_tlosses.append(tlosses)
        all_elosses.append(elosses)
        s1 = "Batches {}-{} in {:0.2f} sec, Step size: {:0.5f}"
        s2 = " Training losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f}"
        s3 = " Eval losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f}"
        s4 = " Resps: min {:0.4f}, mean {:0.4f}, max {:0.4f}, std {:0.4f}"
        print(s1.format(batch_idx_start + 1, batch_pidx, batch_time,
                        decay_fun(batch_pidx)))
        print(s2.format(tlosses['total'], tlosses['nlog_p_xgz'],
                        tlosses['kl_prescale'], tlosses['kl'],
                        tlosses['l2'], tlosses['ii_l2'], tlosses['ii_tavg']))
        print(s3.format(elosses['total'], elosses['nlog_p_xgz'],
                        elosses['kl_prescale'], elosses['kl'],
                        elosses['l2'], elosses['ii_l2'], elosses['ii_tavg']))
        print(s4.format(rmin, rmean, rmax, rstd))

    tlosses_thru_training = utils.merge_losses_dicts(all_tlosses)
    elosses_thru_training = utils.merge_losses_dicts(all_elosses)
    optimizer_details = {'tlosses': tlosses_thru_training,
                         'elosses': elosses_thru_training}
    return params, optimizer_details
def fit_sgd(self, observations, batch_size, rng_key=None, optimizer=None,
            num_epochs=1):
    '''
    Fits the model using a gradient descent algorithm with the given
    hyperparameters.

    Parameters
    ----------
    observations : array
        The observation sequences which the Bernoulli Mixture Model is
        trained on
    batch_size : int
        The size of the batch
    rng_key : array
        Random key of shape (2,) and dtype uint32
    optimizer : jax.experimental.optimizers.Optimizer
        Optimizer to be used
    num_epochs : int
        The number of epochs the training process takes place

    Returns
    -------
    * array
        Mean loss values found per epoch
    * array
        Mixing coefficients found per epoch
    * array
        Probabilities of Bernoulli distribution found per epoch
    * array
        Responsibilities found per epoch
    '''
    global opt_init, opt_update, get_params

    if rng_key is None:
        rng_key = PRNGKey(0)

    if optimizer is not None:
        opt_init, opt_update, get_params = optimizer

    opt_state = opt_init((softmax(self.mixing_coeffs), logit(self.probs)))
    itercount = itertools.count()

    def epoch_step(opt_state, key):
        def train_step(opt_state, batch):
            opt_state, loss = self.update(next(itercount), opt_state, batch)
            return opt_state, loss

        batches = self._make_minibatches(observations, batch_size, key)
        opt_state, losses = scan(train_step, opt_state, batches)

        params = get_params(opt_state)
        mixing_coeffs, probs_logits = params
        probs = expit(probs_logits)
        self.model = (softmax(mixing_coeffs), probs)
        self._probs = probs
        return opt_state, (losses.mean(), *params,
                           self.responsibilities(observations))

    epochs = split(rng_key, num_epochs)
    opt_state, history = scan(epoch_step, opt_state, epochs)

    params = get_params(opt_state)
    mixing_coeffs, probs_logits = params
    probs = expit(probs_logits)
    self.model = (softmax(mixing_coeffs), probs)
    self._probs = probs
    return history
def exponential_mechanism(rng, votes, per_example_epsilon, sensitivity=1.):
    """Exponential mechanism."""
    scores = nn.softmax(per_example_epsilon * votes / (2 * sensitivity))
    return randomly_sample(rng, scores)
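# A small demo of the mechanism above. randomly_sample is not shown in the
# snippet; jax.random.choice is used here as a stand-in that draws an index
# with probabilities given by scores. The vote counts are illustrative.
import jax.numpy as jnp
from jax import random, nn

votes = jnp.array([120., 80., 30.])      # e.g. teacher vote counts per class
eps = 0.1
scores = nn.softmax(eps * votes / 2.0)   # sensitivity = 1
idx = random.choice(random.PRNGKey(0), votes.shape[0], p=scores)
print(idx)  # high-vote classes are exponentially more likely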
def optimize_lfads(key, init_params, hps, opt_hps,
                   train_data_fun, eval_data_fun,
                   ncompleted_batches=0, opt_state=None, callback_fun=None,
                   do_print=True):
    """Optimize the LFADS model and print batch based optimization data.

    This loop is at the cpu nonjax-numpy level.

    Arguments:
        key: random.PRNGKey for randomness
        init_params: a dict of parameters to be trained
        hps: dict of lfads model HPs
        opt_hps: dict of optimization HPs
        train_data_fun: function that takes a key and returns
            nexamples x time x ndims np array of data for training
        eval_data_fun: function that takes a key and returns
            nexamples x time x ndims np array of data for held out error
        ncompleted_batches: (default 0), use this to restart training in the
            middle of the batch count. Used in tandem with opt_state (below).
        opt_state: (default None) 3-tuple (params, m - 1st moment, v - 2nd
            moment) from jax.experimental.optimizers.adam (None value starts
            optimizer anew). The params in opt_state[0] will *override* the
            init_params argument.
        callback_fun: (default None) function that the optimize routine will
            call every print_every loops, in order to do whatever the user
            wants, typically saving, or reporting to a hyperparameter tuner,
            etc. callback_fun parameters are
            (current_batch_idx:int, hps:dict, opt_hps:dict, params:dict,
            opt_state:tuple, tlosses:dict, elosses:dict)
        do_print: (default True), print loss information

    Returns:
        A 3-tuple of (trained_params, opt_details - dictionary of
        optimization losses through training, (opt_state - a 3-tuple of
        trained params in odd pytree form, m 1st moment, v 2nd moment)).
    """
    # Begin optimization loop.
    all_tlosses = []
    all_elosses = []

    # Build some functions used in optimization.
    kl_warmup_fun = get_kl_warmup_fun(opt_hps)
    decay_fun = optimizers.exponential_decay(opt_hps['step_size'],
                                             opt_hps['decay_steps'],
                                             opt_hps['decay_factor'])

    opt_init, opt_update, get_params = optimizers.adam(
        step_size=decay_fun,
        b1=opt_hps['adam_b1'], b2=opt_hps['adam_b2'],
        eps=opt_hps['adam_eps'])

    print_every = opt_hps['print_every']
    if ncompleted_batches > 0:
        print('Starting batch count at %d.' % ncompleted_batches)
        assert ncompleted_batches % print_every == 0
        opt_loop_start_idx = int(ncompleted_batches / print_every)
    else:
        opt_loop_start_idx = 0

    if opt_state is not None:
        print('Received opt_state, ignoring init_params argument.')
    else:
        opt_state = opt_init(init_params)

    def update_w_gc(i, opt_state, hps, opt_hps, key, x_bxt, kl_warmup):
        """Update fun for gradients, includes gradient clipping."""
        params = get_params(opt_state)
        grads = grad(lfads.training_loss_jit)(params, hps, key, x_bxt,
                                              kl_warmup, opt_hps['keep_rate'])
        clipped_grads = optimizers.clip_grads(grads, opt_hps['max_grad_norm'])
        return opt_update(i, clipped_grads, opt_state)

    update_w_gc_jit = jit(update_w_gc, static_argnums=(2, 3))

    # Run the optimization, pausing every so often to collect data and
    # print status.
    batch_size = hps['batch_size']
    num_batches = opt_hps['num_batches']
    assert num_batches % print_every == 0
    num_opt_loops = int(num_batches / print_every)
    params = get_params(opt_state)
    for oidx in range(opt_loop_start_idx, num_opt_loops):
        batch_idx_start = oidx * print_every
        start_time = time.time()
        key, tkey, dtkey1, dtkey2, dekey1, dekey2 = \
            random.split(random.fold_in(key, oidx), 6)
        opt_state = optimize_core_jit(tkey, batch_idx_start, print_every,
                                      update_w_gc_jit, kl_warmup_fun,
                                      opt_state, hps, opt_hps,
                                      train_data_fun)
        batch_time = time.time() - start_time

        # Losses
        params = get_params(opt_state)
        batch_pidx = batch_idx_start + print_every
        kl_warmup = kl_warmup_fun(batch_idx_start)

        # Training loss
        x_bxt = train_data_fun(dtkey1)
        tlosses = lfads.losses_jit(params, hps, dtkey2, x_bxt, kl_warmup, 1.0)

        # Evaluation loss
        ex_bxt = eval_data_fun(dekey1)
        elosses = lfads.losses_jit(params, hps, dekey2, ex_bxt, kl_warmup, 1.0)

        # Saving, printing.
        resps = softmax(params['prior']['resps'])
        rmin = onp.min(resps)
        rmax = onp.max(resps)
        rmean = onp.mean(resps)
        rstd = onp.std(resps)

        all_tlosses.append(tlosses)
        all_elosses.append(elosses)
        if do_print:
            s1 = "Batches {}-{} in {:0.2f} sec, Step size: {:0.5f}"
            s2 = " Training losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f}"
            s3 = " Eval losses {:0.0f} = NLL {:0.0f} + KL {:0.1f},{:0.1f} + L2 {:0.2f} + II L2 {:0.2f} + <II> {:0.2f}"
            s4 = " Resps: min {:0.4f}, mean {:0.4f}, max {:0.4f}, std {:0.4f}"
            print(s1.format(batch_idx_start + 1, batch_pidx, batch_time,
                            decay_fun(batch_pidx)))
            print(s2.format(tlosses['total'], tlosses['nlog_p_xgz'],
                            tlosses['kl_prescale'], tlosses['kl'],
                            tlosses['l2'], tlosses['ii_l2'],
                            tlosses['ii_tavg']))
            print(s3.format(elosses['total'], elosses['nlog_p_xgz'],
                            elosses['kl_prescale'], elosses['kl'],
                            elosses['l2'], elosses['ii_l2'],
                            elosses['ii_tavg']))
            print(s4.format(rmin, rmean, rmax, rstd))

        if callback_fun is not None:
            callback_fun(batch_pidx, hps, opt_hps, params, opt_state,
                         tlosses, elosses)

    tlosses_thru_training = utils.merge_losses_dicts(all_tlosses)
    elosses_thru_training = utils.merge_losses_dicts(all_elosses)
    optimizer_details = {'tlosses': tlosses_thru_training,
                         'elosses': elosses_thru_training}
    return params, optimizer_details, opt_state
def _rvs(self, p):
    if self.is_logits:
        p = softmax(p)
    return categorical_rvs(self._random_state, p, self._size)
def _rvs(self, n, p):
    if self.is_logits:
        p = softmax(p)
    return multinomial_rvs(self._random_state, p, n, self._size)
def loss(q, k, dummy_proj, attn):
    logits = q @ k.T
    probs = softmax(logits)
    return fat.kl(attn, probs).mean()
def _to_probs_multinom(logits):
    return softmax(logits, axis=-1)