def mapped_update(i, opt_state, batch, state, rng):
  """This is a multi-device version of the update function above."""
  # We assume all tensors have the first dimension = n_devices.
  weights, slots, opt_params = opt_state
  rng, subrng = jax_random.split(rng)
  grad_fn = backend.grad(model_and_loss_call, has_aux=True)
  grads, state = grad_fn(weights, batch, state, rng)
  # We do a psum(1.0) here instead of `n_devices` since `n_devices` is just
  # the number of devices on this host machine, whereas psum runs over all
  # devices of all hosts (e.g. a TPU pod) and we need to average over all
  # of them.
  grads = jax.tree_util.tree_map(
      lambda g: backend.psum(g, 'batch') / backend.psum(1.0, 'batch'), grads)
  return optimizer.tree_update(
      i, grads, weights, slots, opt_params), state, subrng
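To make the psum(1.0) trick concrete, here is a minimal, self-contained sketch of the same cross-device averaging idiom under jax.pmap with axis_name='batch': psum(1.0, 'batch') counts every device participating in the collective, across all hosts, so dividing a summed gradient by it yields a true global mean. The names average_grads and per_device_grads are illustrative only, and jax.lax.psum stands in for backend.psum, which is assumed here to map to the same primitive on the JAX backend.

from functools import partial

import jax
import jax.numpy as jnp

# Illustrative stand-in for the psum-based averaging in mapped_update above;
# the axis name 'batch' matches the one used there.
@partial(jax.pmap, axis_name='batch')
def average_grads(g):
  # psum(1.0) counts every device taking part in the collective (across all
  # hosts), so the division yields a mean over the full device set rather
  # than just over the devices attached to this host.
  return jax.lax.psum(g, 'batch') / jax.lax.psum(1.0, 'batch')

n_devices = jax.local_device_count()
per_device_grads = jnp.arange(float(n_devices))  # one scalar "gradient" per device
print(average_grads(per_device_grads))           # every entry equals the global mean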