def __call__(self, inputs, use_running_stats=None):
  """Normalizes the input using batch means and variances (or running stats).

  Stats are computed over the batch and spherical dimensions: (0, 1, 2).

  Args:
    inputs: An array of dimensions (batch_size, resolution, resolution,
      n_spins_in, n_channels_in).
    use_running_stats: If true, the statistics stored in batch_stats will be
      used instead of computing the batch statistics on the input.

  Returns:
    Normalized inputs (the same shape as inputs).
  """
  use_running_stats = nn.module.merge_param(
      "use_running_stats", self.use_running_stats, use_running_stats)

  # Normalization is independent per spin per channel.
  num_spins, num_channels = inputs.shape[-2:]
  feature_shape = (1, 1, 1, num_spins, num_channels)
  reduced_feature_shape = (num_spins, num_channels)

  initializing = not self.has_variable("batch_stats", "variance")

  running_variance = self.variable("batch_stats", "variance",
                                   lambda s: jnp.ones(s, jnp.float32),
                                   reduced_feature_shape)
  if self.centered:
    running_mean = self.variable("batch_stats", "mean",
                                 lambda s: jnp.zeros(s, jnp.complex64),
                                 reduced_feature_shape)

  if use_running_stats:
    variance = running_variance.value
    if self.centered:
      mean = running_mean.value
  else:
    # Compute the spherical mean over the spherical grid dimensions, then a
    # conventional mean over the batch.
    if self.centered:
      mean = sphere_utils.spin_spherical_mean(inputs)
      mean = jnp.mean(mean, axis=0)
    # Complex variance is E[x x*] - E[x]E[x*].
    # For spin != 0, E[x] should be zero, although due to discretization this
    # is not always true. We only use E[x x*] here.
    # E[x x*]:
    mean_abs_squared = sphere_utils.spin_spherical_mean(inputs * inputs.conj())
    mean_abs_squared = jnp.mean(mean_abs_squared, axis=0)

    # Aggregate means over devices.
    if self.axis_name is not None and not initializing:
      if self.centered:
        mean = lax.pmean(mean, axis_name=self.axis_name)
      mean_abs_squared = lax.pmean(mean_abs_squared, axis_name=self.axis_name)

    # Imaginary part is negligible.
    variance = mean_abs_squared.real

    if not initializing:
      running_variance.value = (self.momentum * running_variance.value +
                                (1 - self.momentum) * variance)
      if self.centered:
        running_mean.value = (self.momentum * running_mean.value +
                              (1 - self.momentum) * mean)

  if self.centered:
    outputs = inputs - mean.reshape(feature_shape)
  else:
    outputs = inputs

  factor = lax.rsqrt(variance.reshape(feature_shape) + self.epsilon)
  if self.use_scale:
    scale = self.param("scale", self.scale_init,
                       reduced_feature_shape).reshape(feature_shape)
    factor = factor * scale
  outputs = outputs * factor

  if self.use_bias:
    bias = self.param("bias", self.bias_init,
                      reduced_feature_shape).reshape(feature_shape)
    outputs = outputs + bias

  return outputs
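# A minimal sketch (independent of the module above) of the E[x x*] variance
# convention for complex features. `grid_mean` is an unweighted stand-in for
# sphere_utils.spin_spherical_mean, and all shapes and values are made up.
import jax.numpy as jnp

x = jnp.ones((2, 4, 4, 3, 5)) * (1.0 + 0.5j)  # (batch, res, res, spins, channels)

def grid_mean(v):
  # The real spherical mean uses quadrature weights; a plain average is enough
  # to illustrate the shapes involved.
  return jnp.mean(v, axis=(1, 2))

mean_abs_squared = jnp.mean(grid_mean(x * x.conj()), axis=0)
variance = mean_abs_squared.real            # imaginary part of x * conj(x) is zero
print(variance.shape)                       # (3, 5) == (num_spins, num_channels)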
rotation_matrices = jax.vmap(jax.vmap(rotation_matrix))(
    trajectories["solution_values"])
rotation_matrices = jnp.einsum("ij,abjk", rotation_matrix(problem.x0).T,
                               rotation_matrices)
epsilon_tensor = jnp.array([
    [[0, 0, 0], [0, 0, 1], [0, -1, 0]],
    [[0, 0, -1], [0, 0, 0], [1, 0, 0]],
    [[0, 1, 0], [-1, 0, 0], [0, 0, 0]],
])
delta_u = -0.5 * jnp.einsum("kij,abij->abk", epsilon_tensor, rotation_matrices)
cor = jnp.mean(delta_u**2, axis=0)
t_a = trajectories["time_values"][0]
t_t = jnp.arange(0.0, trajectories["time_values"][0][-1], 0.005)
plt.plot(t_a, cor[:, 0])
plt.plot(t_a, cor[:, 1])
plt.plot(t_a, cor[:, 2])
D = 1.0
plt.plot(
    t_t,
    1.0 / 6.0 - (5.0 / 12.0) * jnp.exp(-6.0 * D * t_t)
    + (1.0 / 4.0) * jnp.exp(-2.0 * D * t_t),
    label="theoretical",
)
def LayerNorm(x, params, epsilon=1e-6, **unused_kwargs):  # pylint: disable=invalid-name
  (scale, bias) = params
  mean = np.mean(x, axis=-1, keepdims=True)
  variance = np.mean((x - mean)**2, axis=-1, keepdims=True)
  norm_inputs = (x - mean) / np.sqrt(variance + epsilon)
  return norm_inputs * scale + bias
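# Usage sketch for LayerNorm above (assumes np is jax.numpy and that params
# match the feature dimension of x; shapes and values are made up).
import jax.numpy as np

x = np.arange(12.0).reshape(2, 6)          # (batch, features)
params = (np.ones((6,)), np.zeros((6,)))   # (scale, bias)
y = LayerNorm(x, params)
# Each row of y now has approximately zero mean and unit variance.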
def loss(params, batch):
  inputs, targets = batch
  preds = predict(params, inputs)
  return -jnp.mean(jnp.sum(preds * targets, axis=1))
def mlp_loss(params, x, y):
  probs = mlp_predict(params, x)
  loss = jnp.mean(cross_entropy(probs, y))
  return loss
def cross_entropy_loss(logits, labels):
  return -jnp.mean(jnp.sum(onehot(labels) * logits, axis=-1))
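# Hedged usage sketch: cross_entropy_loss above multiplies one-hot labels by
# `logits` directly, so it implicitly expects log-probabilities. One common way
# to produce them (assuming `onehot` maps integer labels to one-hot vectors of
# width logits.shape[-1]):
import jax
import jax.numpy as jnp

raw_scores = jnp.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
labels = jnp.array([0, 2])
log_probs = jax.nn.log_softmax(raw_scores, axis=-1)
loss_value = cross_entropy_loss(log_probs, labels)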
def loss_fn(params, batch):
  x, y = batch
  y_hat = model_apply(params, x)
  return jnp.mean(jnp.square(y_hat - y))
def _acc_fn(logits, labels):
  """Classification accuracy of the model."""
  predicted_class = jnp.argmax(logits, axis=1)
  return jnp.mean(predicted_class == labels)
def _reg_loss_fn(reg):
  return jnp.mean(reg)
def compute_loss(params, obs, act, returns):
  logp = get_policy(params, obs).log_prob(act)
  return jnp.mean(-(logp * returns))
def metrics(dist_params, priors, beta):
  kl_div = self.f.proba_dist.kl_divergence(dist_params, priors)
  return {
      'KLDivRegularizer/beta': beta,
      'KLDivRegularizer/kl_div': jnp.mean(kl_div),
  }
def loss(params, batch, model_predict):
  """Calculate loss."""
  inputs, targets = batch
  preds = model_predict(params, inputs)
  return -np.mean(np.sum(preds * one_hot(targets, preds.shape[-1]), axis=-1))
def neg_log_perplexity(batch, model_predictions):
  """Calculate negative log perplexity."""
  _, targets = batch
  hot_targets = one_hot(targets, model_predictions.shape[-1])
  return np.mean(np.sum(model_predictions * hot_targets, axis=-1))
def accuracy(batch, model_predictions):
  """Calculate accuracy."""
  _, targets = batch
  predicted_class = np.argmax(model_predictions, axis=-1)
  return np.mean(predicted_class == targets)
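# Hedged usage sketch for the two metrics above. Shapes and values are made up,
# `np` is assumed to be the backend numpy (e.g. jax.numpy), and the repo's
# `one_hot` helper is assumed to be in scope for neg_log_perplexity.
import jax.numpy as np

targets = np.array([1, 0])                                # (batch,)
model_predictions = np.log(np.array([[0.1, 0.8, 0.1],
                                     [0.7, 0.2, 0.1]]))   # log-probs, (batch, vocab)
print(accuracy((None, targets), model_predictions))             # 1.0
print(neg_log_perplexity((None, targets), model_predictions))   # mean target log-prob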
def __call__(self, inputs, is_training, test_local_stats=False, scale=None,
             offset=None):
  """Connects the batch norm.

  Args:
    inputs: An array, where the data format is [..., C].
    is_training: Whether this is during training.
    test_local_stats: Whether local stats are used when is_training=False.
    scale: An array up to n-D. The shape of this tensor must be broadcastable
      to the shape of `inputs`. This is the scale applied to the normalized
      inputs. This cannot be passed in if the module was constructed with
      `create_scale=True`.
    offset: An array up to n-D. The shape of this tensor must be broadcastable
      to the shape of `inputs`. This is the offset applied to the normalized
      inputs. This cannot be passed in if the module was constructed with
      `create_offset=True`.

  Returns:
    The array, normalized across all but the last dimension.
  """
  rank = inputs.ndim
  channel_index = self._channel_index
  if self._channel_index < 0:
    channel_index += rank

  if self._axis:
    axis = self._axis
  else:
    axis = [i for i in range(rank) if i != channel_index]

  if is_training or test_local_stats:
    if self._cross_replica_axis:
      # Calculate global statistics - n is the number of replicas, which could
      # differ from jax.device_count() in cases of nested pmaps.
      n = jax.lax.psum(1, self._cross_replica_axis)

      mean = jnp.mean(inputs, axis, keepdims=True)
      mean = jax.lax.psum(mean, axis_name=self._cross_replica_axis) / n
      mean_of_squares = jnp.mean(inputs**2, axis, keepdims=True)
      mean_of_squares = jax.lax.psum(
          mean_of_squares, axis_name=self._cross_replica_axis) / n
      var = mean_of_squares - mean**2
    else:
      mean = jnp.mean(inputs, axis, keepdims=True)
      # This uses E[(X - E[X])^2].
      # TODO(tycai): Consider the faster, but possibly less stable
      # E[X^2] - E[X]^2 method.
      var = jnp.var(inputs, axis, keepdims=True)
  else:
    mean = self._mean_ema.average
    var = self._var_ema.average

  # Update moving averages.
  if is_training:
    self._mean_ema(mean)
    self._var_ema(var)

  params_shape = tuple(
      1 if i in axis else inputs.shape[i] for i in range(rank))

  if self._create_scale:
    if scale is not None:
      raise ValueError(
          "Cannot pass `scale` at call time if `create_scale=True`.")
    scale = base.get_parameter("scale", params_shape, inputs.dtype,
                               self._scale_init)
  elif scale is None:
    scale = 1.

  if self._create_offset:
    if offset is not None:
      raise ValueError(
          "Cannot pass `offset` at call time if `create_offset=True`.")
    offset = base.get_parameter("offset", params_shape, inputs.dtype,
                                self._offset_init)
  elif offset is None:
    offset = 0.

  # TODO(tycai): TF found the commented-out form below to be ~2x faster than
  # this naive implementation (w/o XLA). Benchmark & consider adopting it.
  # inv = scale * lax.rsqrt(var + self._eps)
  # return inputs * inv + (offset - (mean * inv))
  return scale * (inputs - mean) / jnp.sqrt(var + self._eps) + offset
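# Minimal check (outside the module above) that the moment form used in the
# cross-replica branch matches jnp.var on the same axes; inputs are made up.
import jax.numpy as jnp

x = jnp.arange(24.0).reshape(2, 3, 4)   # [..., C] with C = 4
axis = (0, 1)                           # all but the channel dimension
mean = jnp.mean(x, axis, keepdims=True)
mean_of_squares = jnp.mean(x**2, axis, keepdims=True)
var_moments = mean_of_squares - mean**2         # E[X^2] - E[X]^2
var_direct = jnp.var(x, axis, keepdims=True)    # E[(X - E[X])^2]
assert jnp.allclose(var_moments, var_direct, atol=1e-3)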
def main(unused_argv):
  using_SGD = FLAGS.using_SGD
  train_size = FLAGS.train_size
  x_train, y_train, x_test, y_test = pickle.load(
      open("data_" + str(train_size) + ".p", "rb"))
  print("Got data")
  sys.stdout.flush()
  train_size = FLAGS.train_size

  # Build the network.
  init_fn, apply_fn, _ = stax.serial(
      stax.Dense(2048, 1., 0.05),
      # stax.Erf(),
      stax.Relu(),
      stax.Dense(1, 1., 0.05))
  # NOTE: only MSE loss and 0/1 labels are implemented for now.

  # Initialize the network a first time, to compute the NTK.
  randnnn = numpy.random.random_integers(np.iinfo(np.int32).min,
                                         high=np.iinfo(np.int32).max,
                                         size=2)[0]
  key = random.PRNGKey(randnnn)
  _, params = init_fn(key, (-1, 784))

  # Create an MSE predictor to solve the NTK equation in function space.
  # We assume the NTK is approximately the same for any sample of parameters
  # (true in the limit of infinite width).
  sys.stdout.flush()
  g_dd = pickle.load(open("ntk_train_" + str(FLAGS.train_size) + ".p", "rb"))
  g_td = pickle.load(
      open("ntk_train_test_" + str(FLAGS.train_size) + ".p", "rb"))
  print("Got NTK")
  if not using_SGD:
    predictor = nt.predict.gradient_descent_mse(g_dd, y_train, g_td)

  batch_size = FLAGS.batch_size
  from mpi4py import MPI
  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  size = comm.Get_size()
  print(rank)

  for i in range(FLAGS.num_samples):
    if i % (ceil(FLAGS.num_samples / 100)) == 0:
      print(i)
      sys.stdout.flush()

    # Reinitialize the network.
    randnnn = numpy.random.random_integers(np.iinfo(np.int32).min,
                                           high=np.iinfo(np.int32).max,
                                           size=2)[0]
    key = random.PRNGKey(randnnn)
    _, params = init_fn(key, (-1, 784))

    # Get initial values of the network in function space.
    fx_train = apply_fn(params, x_train)
    fx_test = apply_fn(params, x_test)

    if using_SGD:
      error = 1
      lr = 0.1
      lr = nt.predict.max_learning_rate(g_dd)
      print(lr)
      # lr *= 0.05
      # lr *= 1
      ntk_train = g_dd.squeeze()
      ntk_train_test = g_td.squeeze()
      if batch_size == train_size:
        indices = np.array(list(range(train_size)))
      while error >= 0.5:
        if batch_size != train_size:
          indices = numpy.random.choice(range(train_size),
                                        size=batch_size,
                                        replace=False)
        fx_test = fx_test - lr * np.matmul(
            ntk_train_test[:, indices],
            (fx_train[indices] - y_train[indices])) / (2 * batch_size)
        fx_train = fx_train - lr * np.matmul(
            ntk_train[:, indices],
            (fx_train[indices] - y_train[indices])) / (2 * batch_size)
        # fx_train = jax.ops.index_add(
        #     fx_train, indices,
        #     -lr * np.matmul(ntk_train[:, indices],
        #                     (fx_train[indices] - y_train[indices])) /
        #     (2 * batch_size))
        # print(fx_train[0:10])
        error = np.dot((fx_train - y_train).squeeze(),
                       (fx_train - y_train).squeeze()) / (2 * train_size)
        # print(error)
    else:
      # Get predictions from the analytic computation.
      fx_train, fx_test = predictor(FLAGS.train_time, fx_train, fx_test)

    OUTPUT = fx_test > 0.5
    OUTPUT = OUTPUT.astype(int)
    # print(np.transpose(OUTPUT))
    fun = ''.join([str(int(i)) for i in OUTPUT])

    TRUE_OUTPUT = y_test > 0.5
    TRUE_OUTPUT = TRUE_OUTPUT.astype(int)
    # print(np.transpose(TRUE_OUTPUT))
    ''.join([str(int(i)) for i in TRUE_OUTPUT])
    print("Generalization accuracy",
          np.sum(OUTPUT == TRUE_OUTPUT) / FLAGS.test_size)

    loss = lambda fx, y_hat: 0.5 * np.mean((fx - y_hat)**2)
    # util.print_summary('train', y_train, apply_fn(params, x_train), fx_train, loss)
    # util.print_summary('test', y_test, apply_fn(params, x_test), fx_test, loss)

    OUTPUT = fx_train > 0.5
    OUTPUT = OUTPUT.astype(int)
    # print(np.transpose(OUTPUT))
    ''.join([str(int(i)) for i in OUTPUT])

    TRUE_OUTPUT = y_train > 0.5
    TRUE_OUTPUT = TRUE_OUTPUT.astype(int)
    # print(np.transpose(TRUE_OUTPUT))
    ''.join([str(int(i)) for i in TRUE_OUTPUT])
    print("Training accuracy",
          np.sum(OUTPUT == TRUE_OUTPUT) / FLAGS.train_size)
    assert np.all(OUTPUT == TRUE_OUTPUT)

    file = open('results/data_{}_large.txt'.format(rank), 'a')
    file.write(fun + '\n')
    file.close()
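# The SGD branch above performs (kernel) gradient descent directly in function
# space: f <- f - lr * K[:, idx] @ (f[idx] - y[idx]) / (2 * batch_size).
# A self-contained toy version of one such step (kernel and labels made up):
import numpy as onp

K = onp.array([[2.0, 0.5], [0.5, 1.0]])   # toy train/train NTK
y = onp.array([1.0, 0.0])                 # toy labels
f = onp.array([0.0, 0.0])                 # network outputs at init
lr, batch_size = 0.1, 2
idx = onp.arange(batch_size)
f = f - lr * K[:, idx] @ (f[idx] - y[idx]) / (2 * batch_size)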
def test_binomial_mean(n, p):
  samples = binomial(random.PRNGKey(1), p, n, shape=(100, 100))
  expected_mean = n * p
  assert_allclose(np.mean(samples), expected_mean, rtol=0.05)
rng, rng_predict = random.split(random.PRNGKey(0))
samples = run_inference(model, args, rng, X, Y, D_H, sigma)
samples_collected.append((sigma, samples))

# Predict Y_test at inputs X_test.
vmap_args = (samples, random.split(rng_predict, args["num_samples"]))
predictions = vmap(lambda samples, rng: predict(
    model, rng, samples, X_test, D_H, sigma))(*vmap_args)
predictions = predictions[..., 0]

train_predictions = vmap(lambda samples, rng: predict(
    model, rng, samples, X, D_H, sigma))(*vmap_args)
train_predictions = train_predictions[..., 0]

# Compute mean prediction and 95% confidence interval around the median.
mean_prediction = np.mean(predictions, axis=0)
percentiles = onp.percentile(predictions, [2.5, 97.5], axis=0)

# Compute mean prediction and confidence interval around the median.
train_mean_prediction = np.mean(train_predictions, axis=0)

# Plot training data.
ax[i].plot(X, Y, 'kx', c="red", alpha=0.3, label="Data samples")

# Plot 95% confidence interval of predictions.
ax[i].fill_between(X_test[:, 0], percentiles[0, :], percentiles[1, :],
                   color='lightblue', label="95% CI", step='mid')
# Plot mean prediction.
def munchausen_target_quantile_values(network, target_params, states, actions,
                                      next_states, rewards, terminals,
                                      num_tau_prime_samples,
                                      num_quantile_samples, cumulative_gamma,
                                      rng, tau, alpha, clip_value_min):
  """Build the munchausen target for return values at given quantiles."""
  rng, rng1, rng2, rng3 = jax.random.split(rng, num=4)
  target_action = network.apply(target_params, states,
                                num_quantiles=num_quantile_samples, rng=rng1)
  curr_state_representation = target_action.representation
  curr_state_representation = jnp.squeeze(curr_state_representation)
  is_terminal_multiplier = 1. - terminals.astype(jnp.float32)
  # Incorporate terminal state to discount factor.
  gamma_with_terminal = cumulative_gamma * is_terminal_multiplier
  gamma_with_terminal = jnp.tile(gamma_with_terminal, [num_tau_prime_samples])

  replay_net_target_outputs = network.apply(
      target_params, next_states, num_quantiles=num_tau_prime_samples,
      rng=rng2)
  replay_quantile_values = replay_net_target_outputs.quantile_values

  target_next_action = network.apply(target_params, next_states,
                                     num_quantiles=num_quantile_samples,
                                     rng=rng3)
  target_next_quantile_values_action = target_next_action.quantile_values
  replay_next_target_q_values = jnp.squeeze(
      jnp.mean(target_next_quantile_values_action, axis=0))

  q_state_values = target_action.quantile_values
  replay_target_q_values = jnp.squeeze(jnp.mean(q_state_values, axis=0))

  num_actions = q_state_values.shape[-1]
  replay_action_one_hot = jax.nn.one_hot(actions, num_actions)
  replay_next_log_policy = stable_scaled_log_softmax(
      replay_next_target_q_values, tau, axis=0)
  replay_next_policy = stable_softmax(replay_next_target_q_values, tau, axis=0)
  replay_log_policy = stable_scaled_log_softmax(replay_target_q_values, tau,
                                                axis=0)

  # Munchausen term: scaled log-policy of the taken action, clipped from below.
  tau_log_pi_a = jnp.sum(replay_log_policy * replay_action_one_hot, axis=0)
  tau_log_pi_a = jnp.clip(tau_log_pi_a, a_min=clip_value_min, a_max=1)
  munchausen_term = alpha * tau_log_pi_a

  # Soft (entropy-regularized) expectation over next actions.
  weighted_logits = (
      replay_next_policy * (replay_quantile_values - replay_next_log_policy))
  target_quantile_vals = jnp.sum(weighted_logits, axis=1)

  rewards += munchausen_term
  rewards = jnp.tile(rewards, [num_tau_prime_samples])
  target_quantile_vals = (
      rewards + gamma_with_terminal * target_quantile_vals)

  next_state_representation = target_next_action.representation
  next_state_representation = jnp.squeeze(next_state_representation)

  return (rng,
          jax.lax.stop_gradient(target_quantile_vals[:, None]),
          jax.lax.stop_gradient(curr_state_representation),
          jax.lax.stop_gradient(next_state_representation))
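# The function above relies on two helpers that are not shown here. Below is a
# hedged sketch of temperature-scaled, numerically stabilized versions that are
# consistent with how they are used; the actual library implementations may
# differ in detail.
import jax
import jax.numpy as jnp

def stable_scaled_log_softmax(x, tau, axis=-1):
  # tau * log_softmax(x / tau), computed with the usual max-subtraction trick.
  max_x = jnp.max(x, axis=axis, keepdims=True)
  y = x - max_x
  tau_lse = max_x + tau * jnp.log(
      jnp.sum(jnp.exp(y / tau), axis=axis, keepdims=True))
  return x - tau_lse

def stable_softmax(x, tau, axis=-1):
  # softmax with temperature tau.
  return jax.nn.softmax(x / tau, axis=axis)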
def loss_fn(params, batch):
  """The loss function."""
  x, y = batch
  y_hat = model_apply(params, x)
  return jnp.mean(jnp.square(y_hat - y))
def _update_numerical_cluster(numerical_points, assignment, k):
  return jnp.mean(jnp.where(
      assignment.reshape(-1, 1, 1) == jnp.arange(k).reshape(1, k, 1),
      numerical_points[:, jnp.newaxis, :], 0), axis=0)
def loss(params, R):
  return np.mean((vmap(energy_fn, (None, 0))(params, R) - E_gt(R, dr0))**2)
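# The (None, 0) in_axes above broadcasts `params` and maps over the leading
# axis of R (one energy per configuration). A self-contained toy analogue:
import jax
import jax.numpy as np

def toy_energy(params, r):
  # Made-up energy: scaled squared norm of all particle coordinates.
  return params * np.sum(r**2)

R_batch = np.ones((5, 3, 2))                               # 5 configs, 3 particles, 2D
energies = jax.vmap(toy_energy, (None, 0))(2.0, R_batch)   # shape (5,)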
def accuracy(params, batch):
  inputs, targets = batch
  target_class = jnp.argmax(targets, axis=1)
  predicted_class = jnp.argmax(predict(params, inputs), axis=1)
  return jnp.mean(predicted_class == target_class)
def crossentropy_loss(logpred, target):
  """Calculate crossentropy loss."""
  return -np.mean(
      np.sum(logpred * slax.one_hot(target, logpred.shape[-1]), axis=-1))
for ind in range(1):  # Jee.shape[0]
  # Jee Jei Jie Jii gE gI NMDAratio plocal sigR (or sigEE sigIE)
  params_init = np.array([
      Jee[ind] / psi, Jei[ind] / psi, Jie[ind] / psi, Jii[ind] / psi, 1,
      I2E[ind], 0.1, Plocal[ind], Plocal[ind], sigEE[ind], sigIE[ind]
  ])
  OLDSTYLE = False
  params_init = find_params_to_sigmoid(params_init, MULTI=True,
                                       OLDSTYLE=OLDSTYLE)

  spect, fs, f0, r_fp, CONVG = ssn_multi_probes.ssn_FP(params_init,
                                                       OLDSTYLE=OLDSTYLE)
  spect = np.real(spect) / np.mean(np.real(spect))

  if CONVG:
    # r_targ = r_fp[trgt, rad_inds] / np.mean(r_fp[trgt, rad_inds], axis=1)
    # softmax_r = T * logsumexp(r_targ / T)
    # suppression_index = 1 - (r_targ[-1] / softmax_r)

    # Find the suppression index for both E/I cells.
    r_targ = r_fp[trgt, :]
    r_targ = r_targ[:, rad_inds] / np.mean(r_targ[:, rad_inds], axis=1)[:, None]
    softmax_r = T * logsumexp(r_targ / T)
    suppression_index = 1 - (r_targ[:, -1] / softmax_r)

    if suppression_index[0] > SI_max:
      SI_max = suppression_index[0]
def categorical_cross_entropy_loss(logits, labels):
  onehot_labels = common_utils.onehot(labels, logits.shape[-1])
  return jnp.mean(-jnp.sum(onehot_labels * jnp.log(logits), axis=1))
def apply_fun(params, inputs, **kwargs):
  del kwargs
  (a_2, b_2) = params
  mean = np.mean(inputs, axis=-1, keepdims=True)
  std = np.std(inputs, axis=-1, keepdims=True)
  return a_2 * (inputs - mean) / (std + epsilon) + b_2
def _per_batch(inputs, labels):
  logits = predict(params, inputs)
  predicted_classes = top_k_classes(logits, 1)
  predicted_classes = predicted_classes.reshape((predicted_classes.shape[0],))
  return jnp.mean(predicted_classes == labels)
def loss(params, batch):
  inputs, targets = batch
  preds = predict(params, inputs)
  return -np.mean(preds * targets)
def accuracy(params, X, y):
  target_class = jnp.argmax(y, axis=1)
  predicted_class = jnp.argmax(batch_predict(params, X), axis=1)
  return jnp.mean(predicted_class == target_class)