import jax
import jax.numpy as jnp
import numpy as onp


class Squarer:
    """Minimal reconstruction, assumed from the usage below: Squarer(2.).square(x) == 2 * x**2."""

    def __init__(self, c):
        self.c = c

    def square(self, x):
        return self.c * x**2


def f(sq: Squarer, x):
    return sq.square(x)


f_jitted = jax.jit(f)


def f_no_class(x):
    """Implements the same math as Squarer.square, but without the class."""
    return 2 * x**2


f_no_class_jitted = jax.jit(f_no_class)

x_test = jnp.array([0., 1., 2., 3.])
x_test_onp = onp.array([0., 1., 2., 3.])
sq = Squarer(2.)

# Trigger jit compilation.
f_jitted(sq, x_test)
f_no_class_jitted(x_test)


def print_times(times, n_runs, n_repeats):
    print('{:d} loops, best of {:d}: {:.1f} usec per loop'.format(
        n_runs, n_repeats, 1e6 * min(times) / n_runs))


n_runs = 1000
n_repeats = 5
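# A minimal sketch of how the timing harness above might be driven, assuming
# Python's standard timeit module; block_until_ready() makes JAX's async
# dispatch comparable to the eager NumPy baseline.
import timeit

times = timeit.repeat(lambda: f_jitted(sq, x_test).block_until_ready(),
                      number=n_runs, repeat=n_repeats)
print_times(times, n_runs, n_repeats)

times = timeit.repeat(lambda: f_no_class_jitted(x_test).block_until_ready(),
                      number=n_runs, repeat=n_repeats)
print_times(times, n_runs, n_repeats)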
Tmax = 500

# Power spectrum resolution and range
fnums = 30
freq_range = [15, 100]

# SSN parameters
n = 2
k = 0.04
tauE = 20  # in ms
tauI = 10  # in ms
psi = 0.774
t_scale = 1
tau_s = np.array([3, 5, 100]) * t_scale  # in ms: AMPA, GABA, NMDA current decay time constants
contrasts = np.array([0, 25, 50, 100])

gridsizedeg = 4
dradius = gridsizedeg / 8
gridperdeg = 5
gridsize = round(gridsizedeg * gridperdeg) + 1
magnFactor = 2  # mm/deg
# Biological hypercolumn length is ~750 um; magnFactor is typically 2 mm/deg in macaque V1.
# hyper_col = 0.75 / magnFactor
hyper_col = 8
Lx = gridsizedeg
Ly = gridsizedeg
# r_cent = np.array([0.3, 0.6, 0.9, 1.2, 1.5])
def as_array(self):
    return tree_util.tree_multimap(
        lambda *args: np.array(list(args)), *self.data)
def testOneHotNonArrayInput(self):
    actual = nn.one_hot([0, 1, 2], 3)
    expected = jnp.array([[1., 0., 0.],
                          [0., 1., 0.],
                          [0., 0., 1.]])
    self.assertAllClose(actual, expected)
def testGluValue(self):
    val = nn.glu(jnp.array([1.0, 0.0]), axis=0)
    self.assertAllClose(val, jnp.array([0.5]))
def invm(Pbc):
    # Invariant mass squared with metric diag(-1, -1, -1, +1), i.e.
    # E**2 - |p|**2 for rows ordered (px, py, pz, E).
    _Pbc = Pbc * np.array([-1, -1, -1, 1])
    return np.sum(Pbc * _Pbc, axis=1)
def l2_regularizer(params, reg=reg):
    """Return the L2 regularization loss."""
    leaves, _ = tree_flatten(params)
    return reg * jnp.sum(jnp.array([jnp.vdot(x, x) for x in leaves]))
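# A minimal usage sketch for l2_regularizer above; the parameter pytree and
# the regularization strength are illustrative assumptions.
import jax.numpy as jnp

params_example = {'w': jnp.ones((3, 3)), 'b': jnp.zeros(3)}
print(l2_regularizer(params_example, reg=1e-4))  # 1e-4 * (9 + 0) = 9e-4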
def test_jit_or_pmap_broadcast(self):

    def kernel_fn(x1, x2, do_flip, keys, do_square, params, _unused=None,
                  p=0.65):
        res = np.abs(np.matmul(x1, x2))
        if do_square:
            res *= res
        if do_flip:
            res = -res
        res *= random.uniform(keys) * p
        return [res, params]

    params = (np.array([1., 0.3]), (np.array([1.2]), np.array([0.5])))
    x2 = np.arange(0, 10).reshape((10,))
    keys = random.PRNGKey(1)

    kernel_fn_pmapped = batch._jit_or_pmap_broadcast(kernel_fn, device_count=0)
    x1 = np.arange(0, 10).reshape((1, 10))
    for do_flip in [True, False]:
        for do_square in [True, False]:
            with self.subTest(do_flip=do_flip, do_square=do_square,
                              device_count=0):
                res_1 = kernel_fn(x1, x2, do_flip, keys, do_square, params,
                                  _unused=True, p=0.65)
                res_2 = kernel_fn_pmapped(x1, x2, do_flip, keys, do_square,
                                          params, _unused=True)
                self.assertAllClose(res_1, res_2, True)

    test_utils.stub_out_pmap(batch, 1)
    x1 = np.arange(0, 10).reshape((1, 10))
    kernel_fn_pmapped = batch._jit_or_pmap_broadcast(kernel_fn, device_count=1)
    for do_flip in [True, False]:
        for do_square in [True, False]:
            with self.subTest(do_flip=do_flip, do_square=do_square,
                              device_count=1):
                res_1 = kernel_fn(x1, x2, do_flip, keys, do_square, params,
                                  _unused=False, p=0.65)
                res_2 = kernel_fn_pmapped(x1, x2, do_flip, keys, do_square,
                                          params, _unused=None)
                self.assertAllClose(res_1[0], res_2[0], True)
                self.assertAllClose(
                    tree_map(partial(np.expand_dims, axis=0), res_1[1]),
                    res_2[1], True)

    kernel_fn_pmapped = batch._jit_or_pmap_broadcast(kernel_fn, device_count=2)
    x1 = np.arange(0, 20).reshape((2, 10))
    test_utils.stub_out_pmap(batch, 2)

    def broadcast(arg):
        return np.broadcast_to(arg, (2,) + arg.shape)

    for do_flip in [True, False]:
        for do_square in [True, False]:
            with self.subTest(do_flip=do_flip, do_square=do_square,
                              device_count=2):
                res_1 = kernel_fn(x1, x2, do_flip, keys, do_square, params,
                                  p=0.2)
                res_2 = kernel_fn_pmapped(x1, x2, do_flip, keys, do_square,
                                          params, _unused=None, p=0.2)
                self.assertAllClose(res_1[0][0], res_2[0][0], True)
                self.assertAllClose(res_1[0][1], res_2[0][1], True)
                self.assertAllClose(tree_map(broadcast, res_1[1]), res_2[1],
                                    True)
def integrate_tke(u, v, w, maskU, maskV, maskW, dxt, dxu, dyt, dyu, dzt, dzw,
                  cost, cosu, kbot, kappaM, mxl, forc, forc_tke_surface,
                  tke, dtke):
    tau = 0
    taup1 = 1
    taum1 = 2

    dt_tracer = 1.
    dt_mom = 1.
    AB_eps = 0.1

    alpha_tke = 1.
    c_eps = 0.7
    K_h_tke = 2000.

    flux_east = np.zeros_like(maskU)
    flux_north = np.zeros_like(maskU)
    flux_top = np.zeros_like(maskU)

    sqrttke = np.sqrt(np.maximum(0., tke[:, :, :, tau]))

    """
    integrate TKE equation on W grid with surface flux boundary condition
    """
    dt_tke = dt_mom  # use momentum time step to prevent spurious oscillations

    """
    vertical mixing and dissipation of TKE
    """
    ks = kbot - 1  # [2:-2, 2:-2]

    print("Init empty")
    # Shapes match better if we ignore the [2:-2, 2:-2] slicing.
    a_tri = np.zeros((maskU.shape[0], maskU.shape[1], maskU.shape[2]))
    b_tri = np.zeros((maskU.shape[0], maskU.shape[1], maskU.shape[2]))
    c_tri = np.zeros((maskU.shape[0], maskU.shape[1], maskU.shape[2]))
    d_tri = np.zeros((maskU.shape[0], maskU.shape[1], maskU.shape[2]))
    delta = np.zeros((maskU.shape[0], maskU.shape[1], maskU.shape[2]))
    b_tri_edge = np.zeros((maskU.shape[0], maskU.shape[1], maskU.shape[2]))

    # delta = jax.ops.index_update(
    #     delta, jax.ops.index[:, :, :-1],
    #     dt_tke / dzt[np.newaxis, np.newaxis, 1:] * alpha_tke * 0.5
    #     * (kappaM[2:-2, 2:-2, :-1] + kappaM[2:-2, 2:-2, 1:])
    # )
    print("Init delta")
    for x in range(delta.shape[0]):
        for y in range(delta.shape[1]):
            for z in range(delta.shape[2]):
                if (x >= 2 and x < delta.shape[0] - 2
                        and y >= 2 and y < delta.shape[1] - 2
                        and z < delta.shape[2] - 1):
                    delta[x, y, z] = dt_tke / dzt[z + 1] * alpha_tke * 0.5 \
                        * (kappaM[x, y, z] + kappaM[x, y, z + 1])
                # else: delta[x, y, z] = 0  # not necessary with zero init

    # a_tri = jax.ops.index_update(
    #     a_tri, jax.ops.index[:, :, 1:-1],
    #     -delta[:, :, :-2] / dzw[np.newaxis, np.newaxis, 1:-1]
    # )
    # a_tri = jax.ops.index_update(
    #     a_tri, jax.ops.index[:, :, -1],
    #     -delta[:, :, -2] / (0.5 * dzw[-1])
    # )
    print("Init a_tri")
    for x in range(a_tri.shape[0]):
        for y in range(a_tri.shape[1]):
            for z in range(a_tri.shape[2]):
                if (x >= 2 and x < a_tri.shape[0] - 2
                        and y >= 2 and y < a_tri.shape[1] - 2):
                    if z > 0 and z < a_tri.shape[2] - 1:
                        a_tri[x, y, z] = -delta[x, y, z - 1] / dzw[z]
                    elif z == a_tri.shape[2] - 1:
                        a_tri[x, y, z] = -delta[x, y, z - 1] / (0.5 * dzw[z])

    # b_tri = jax.ops.index_update(
    #     b_tri, jax.ops.index[:, :, 1:-1],
    #     1 + (delta[:, :, 1:-1] + delta[:, :, :-2]) / dzw[np.newaxis, np.newaxis, 1:-1]
    #     + dt_tke * c_eps
    #     * sqrttke[2:-2, 2:-2, 1:-1] / mxl[2:-2, 2:-2, 1:-1]
    # )
    # b_tri = jax.ops.index_update(
    #     b_tri, jax.ops.index[:, :, -1],
    #     1 + delta[:, :, -2] / (0.5 * dzw[-1])
    #     + dt_tke * c_eps / mxl[2:-2, 2:-2, -1] * sqrttke[2:-2, 2:-2, -1]
    # )
    print("Init b_tri")
    for x in range(b_tri.shape[0]):
        for y in range(b_tri.shape[1]):
            for z in range(b_tri.shape[2]):
                if (x >= 2 and x < b_tri.shape[0] - 2
                        and y >= 2 and y < b_tri.shape[1] - 2):
                    if z > 0 and z < b_tri.shape[2] - 1:
                        b_tri[x, y, z] = 1 + (delta[x, y, z] + delta[x, y, z - 1]) / dzw[z] \
                            + dt_tke * c_eps \
                            * sqrttke[x, y, z] / mxl[x, y, z]
                    elif z == b_tri.shape[2] - 1:
                        b_tri[x, y, z] = 1 + delta[x, y, z - 1] / (0.5 * dzw[z]) \
                            + dt_tke * c_eps / mxl[x, y, z] * sqrttke[x, y, z]
                    else:
                        b_tri[x, y, z] = 0  # not necessary with zero init
                else:
                    b_tri[x, y, z] = 0  # not necessary with zero init

    # b_tri_edge = 1 + delta / dzw[np.newaxis, np.newaxis, :] \
    #     + dt_tke * c_eps / mxl[2:-2, 2:-2, :] * sqrttke[2:-2, 2:-2, :]
    print("Init b_tri_edge")
    for x in range(b_tri_edge.shape[0]):
        for y in range(b_tri_edge.shape[1]):
            for z in range(b_tri_edge.shape[2]):
                if (x >= 2 and x < b_tri_edge.shape[0] - 2
                        and y >= 2 and y < b_tri_edge.shape[1] - 2):
                    b_tri_edge[x, y, z] = 1 + delta[x, y, z] / dzw[z] \
                        + dt_tke * c_eps / mxl[x, y, z] * sqrttke[x, y, z]
                else:
                    # Bug fix: the original assigned to b_tri here; redundant
                    # anyway with zero init.
                    b_tri_edge[x, y, z] = 0

    # c_tri = jax.ops.index_update(
    #     c_tri, jax.ops.index[:, :, :-1],
    #     -delta[:, :, :-1] / dzw[np.newaxis, np.newaxis, :-1]
    # )
    print("Init c_tri")
    for x in range(c_tri.shape[0]):
        for y in range(c_tri.shape[1]):
            for z in range(c_tri.shape[2]):
                if (x >= 2 and x < c_tri.shape[0] - 2
                        and y >= 2 and y < c_tri.shape[1] - 2):
                    if z < c_tri.shape[2] - 1:
                        c_tri[x, y, z] = -delta[x, y, z] / dzw[z]

    # d_tri = tke[2:-2, 2:-2, :, tau] + dt_tke * forc[2:-2, 2:-2, :]
    # d_tri = jax.ops.index_add(
    #     d_tri, jax.ops.index[:, :, -1],
    #     dt_tke * forc_tke_surface[2:-2, 2:-2] / (0.5 * dzw[-1])
    # )
    print("Init d_tri")
    for x in range(d_tri.shape[0]):
        for y in range(d_tri.shape[1]):
            for z in range(d_tri.shape[2]):
                if (x >= 2 and x < d_tri.shape[0] - 2
                        and y >= 2 and y < d_tri.shape[1] - 2):
                    d_tri[x, y, z] = tke[x, y, z, tau] + dt_tke * forc[x, y, z]
                    if z == d_tri.shape[2] - 1:
                        d_tri[x, y, z] += dt_tke * forc_tke_surface[x, y] / (0.5 * dzw[z])

    # so far so good
    print("Init masks and edge")
    # edge_mask = np.zeros(a_tri.shape)
    # water_mask = np.zeros(a_tri.shape)
    for x in range(a_tri.shape[0]):
        for y in range(a_tri.shape[1]):
            land_mask = ks[x, y] >= 0
            for z in range(a_tri.shape[2]):
                if (x >= 2 and x < a_tri.shape[0] - 2
                        and y >= 2 and y < a_tri.shape[1] - 2):
                    edge_mask = land_mask and (z == ks[x, y])
                    water_mask = land_mask and (z >= ks[x, y])
                    if edge_mask:
                        a_tri[x, y, z] = 0  # water_mask * a_tri[x,y,z] * np.logical_not(edge_mask)
                    if not water_mask:
                        a_tri[x, y, z] = 0
                        b_tri[x, y, z] = 1.
                        c_tri[x, y, z] = 0
                        d_tri[x, y, z] = 0
                    if b_tri_edge is not None:
                        if edge_mask:
                            b_tri[x, y, z] = b_tri_edge[x, y, z]
                    # if d_edge is not None:
                    #     if edge_mask:
                    #         d_tri[x, y, z] = d_edge[x, y, z]
                    # if d_edge is not None:
                    #     d_tri = where(edge_mask, d_edge, d_tri)

    print("solve tridiag")
    a_tri_jax = jnp.array(a_tri)
    b_tri_jax = jnp.array(b_tri)
    c_tri_jax = jnp.array(c_tri)
    d_tri_jax = jnp.array(d_tri)
    a_tri_jax.block_until_ready()
    b_tri_jax.block_until_ready()
    c_tri_jax.block_until_ready()
    d_tri_jax.block_until_ready()
    sol = solve_tridiag(a_tri_jax, b_tri_jax, c_tri_jax, d_tri_jax)
    print("solve tridiag done")
    sol.block_until_ready()
    sol = np.array(sol)

    # tke = jax.ops.index_update(
    #     tke, jax.ops.index[2:-2, 2:-2, :, taup1],
    #     where(water_mask, sol, tke[2:-2, 2:-2, :, taup1])
    # )
    print("integrate tridiag sol")
    for x in range(a_tri.shape[0]):
        for y in range(a_tri.shape[1]):
            for z in range(a_tri.shape[2]):
                water_mask = (ks[x, y] >= 0) and (z >= ks[x, y])
                if (x >= 2 and x < c_tri.shape[0] - 2
                        and y >= 2 and y < c_tri.shape[1] - 2):
                    if water_mask:
                        tke[x, y, z, taup1] = sol[x, y, z]

    """
    Add TKE if surface density flux drains TKE in uppermost box
    """
    # mask = tke[2:-2, 2:-2, -1, taup1] < 0.0
    # tke_surf_corr = jax.ops.index_update(
    #     tke_surf_corr, jax.ops.index[2:-2, 2:-2],
    #     where(mask,
    #           -tke[2:-2, 2:-2, -1, taup1] * 0.5 * dzw[-1] / dt_tke,
    #           0.)
    # )
    # tke = jax.ops.index_update(
    #     tke, jax.ops.index[2:-2, 2:-2, -1, taup1],
    #     np.maximum(0., tke[2:-2, 2:-2, -1, taup1])
    # )
    print("correct surf")
    tke_surf_corr = np.zeros((maskU.shape[0], maskU.shape[1]))
    for x in range(tke_surf_corr.shape[0]):
        for y in range(tke_surf_corr.shape[1]):
            if (x >= 2 and x < tke_surf_corr.shape[0] - 2
                    and y >= 2 and y < tke_surf_corr.shape[1] - 2):
                tke_val = tke[x, y, tke.shape[2] - 1, taup1]
                if tke_val < 0.0:
                    tke_surf_corr[x, y] = -tke_val * 0.5 * dzw[dzw.shape[0] - 1] / dt_tke
                    tke[x, y, tke.shape[2] - 1, taup1] = 0
                else:
                    tke_surf_corr[x, y] = 0

    # """
    # add tendency due to lateral diffusion
    # """
    # flux_east = jax.ops.index_update(
    #     flux_east, jax.ops.index[:-1, :, :],
    #     K_h_tke * (tke[1:, :, :, tau] - tke[:-1, :, :, tau])
    #     / (cost[np.newaxis, :, np.newaxis] * dxu[:-1, np.newaxis, np.newaxis]) * maskU[:-1, :, :]
    # )
    print("lateral diffusion east")
    for x in range(flux_east.shape[0]):
        for y in range(flux_east.shape[1]):
            for z in range(flux_east.shape[2]):
                if x < flux_east.shape[0] - 1:
                    flux_east[x, y, z] = K_h_tke * (tke[x + 1, y, z, tau] - tke[x, y, z, tau]) \
                        / (cost[y] * dxu[x]) * maskU[x, y, z]

    # flux_north = jax.ops.index_update(
    #     flux_north, jax.ops.index[:, :-1, :],
    #     K_h_tke * (tke[:, 1:, :, tau] - tke[:, :-1, :, tau])
    #     / dyu[np.newaxis, :-1, np.newaxis] * maskV[:, :-1, :] * cosu[np.newaxis, :-1, np.newaxis]
    # )
    print("lateral diffusion north")
    for x in range(flux_north.shape[0]):
        for y in range(flux_north.shape[1]):
            for z in range(flux_north.shape[2]):
                if y < flux_north.shape[1] - 1:
                    flux_north[x, y, z] = K_h_tke * (tke[x, y + 1, z, tau] - tke[x, y, z, tau]) \
                        / dyu[y] * maskV[x, y, z] * cosu[y]

    # tke = jax.ops.index_add(
    #     tke, jax.ops.index[2:-2, 2:-2, :, taup1],
    #     dt_tke * maskW[2:-2, 2:-2, :] *
    #     ((flux_east[2:-2, 2:-2, :] - flux_east[1:-3, 2:-2, :])
    #      / (cost[np.newaxis, 2:-2, np.newaxis] * dxt[2:-2, np.newaxis, np.newaxis])
    #      + (flux_north[2:-2, 2:-2, :] - flux_north[2:-2, 1:-3, :])
    #      / (cost[np.newaxis, 2:-2, np.newaxis] * dyt[np.newaxis, 2:-2, np.newaxis]))
    # )
    print("add lateral diffusion")
    for x in range(tke.shape[0]):
        for y in range(tke.shape[1]):
            for z in range(tke.shape[2]):
                if (x >= 2 and x < tke.shape[0] - 2
                        and y >= 2 and y < tke.shape[1] - 2):
                    tke[x, y, z, taup1] += dt_tke * maskW[x, y, z] * \
                        ((flux_east[x, y, z] - flux_east[x - 1, y, z])
                         / (cost[y] * dxt[x])
                         + (flux_north[x, y, z] - flux_north[x, y - 1, z])
                         / (cost[y] * dyt[y]))

    """
    add tendency due to advection
    """
    flux_east, flux_north, flux_top = adv_flux_superbee_wgrid(
        tke[:, :, :, tau], u[..., tau], v[..., tau], w[..., tau],
        maskW, dxt, dyt, dzw, cost, cosu, dt_tracer
    )

    # dtke = jax.ops.index_update(
    #     dtke, jax.ops.index[2:-2, 2:-2, :, tau],
    #     maskW[2:-2, 2:-2, :] * (-(flux_east[2:-2, 2:-2, :] - flux_east[1:-3, 2:-2, :])
    #                             / (cost[jnp.newaxis, 2:-2, jnp.newaxis] * dxt[2:-2, jnp.newaxis, jnp.newaxis])
    #                             - (flux_north[2:-2, 2:-2, :] - flux_north[2:-2, 1:-3, :])
    #                             / (cost[jnp.newaxis, 2:-2, jnp.newaxis] * dyt[jnp.newaxis, 2:-2, jnp.newaxis]))
    # )
    print("Adding to dtke")
    for x in range(dtke.shape[0]):
        for y in range(dtke.shape[1]):
            for z in range(dtke.shape[2]):
                if (x >= 2 and x < dtke.shape[0] - 2
                        and y >= 2 and y < dtke.shape[1] - 2):
                    dtke[x, y, z, tau] = maskW[x, y, z] * (
                        -(flux_east[x, y, z] - flux_east[x - 1, y, z])
                        / (cost[y] * dxt[x])
                        - (flux_north[x, y, z] - flux_north[x, y - 1, z])
                        / (cost[y] * dyt[y]))
                    if z == 0:
                        dtke[x, y, z, tau] -= flux_top[x, y, 0] / dzw[0]
                    if z >= 1 and z < dtke.shape[2] - 1:
                        dtke[x, y, z, tau] -= (flux_top[x, y, z] - flux_top[x, y, z - 1]) / dzw[z]
                    if z == dtke.shape[2] - 1:
                        dtke[x, y, z, tau] -= (flux_top[x, y, z] - flux_top[x, y, z - 1]) / \
                            (0.5 * dzw[z])
                    # Adam-Bashforth time stepping
                    tke[x, y, z, taup1] += dt_tracer * (
                        (1.5 + AB_eps) * dtke[x, y, z, tau]
                        - (0.5 + AB_eps) * dtke[x, y, z, taum1])

    # dtke = jax.ops.index_add(
    #     dtke, jax.ops.index[:, :, 0, tau],
    #     -flux_top[:, :, 0] / dzw[0]
    # )
    # dtke = jax.ops.index_add(
    #     dtke, jax.ops.index[:, :, 1:-1, tau],
    #     -(flux_top[:, :, 1:-1] - flux_top[:, :, :-2]) / dzw[1:-1]
    # )
    # dtke = jax.ops.index_add(
    #     dtke, jax.ops.index[:, :, -1, tau],
    #     -(flux_top[:, :, -1] - flux_top[:, :, -2]) / (0.5 * dzw[-1])
    # )
    # """
    # Adam Bashforth time stepping
    # """
    # tke = jax.ops.index_add(
    #     tke, jax.ops.index[:, :, :, taup1],
    #     dt_tracer * ((1.5 + AB_eps) * dtke[:, :, :, tau] - (0.5 + AB_eps) * dtke[:, :, :, taum1])
    # )

    return tke, dtke, tke_surf_corr
from typing import Any, Callable, Sequence, Optional

import flax
from flax.core import freeze, unfreeze
from flax import linen as nn
import jax
import jax.numpy as jnp
from jax import random

from jax.config import config
config.enable_omnistaging()  # Linen requires enabling omnistaging
# config.update("jax_enable_x64", True)  # Enable complex128

key = random.PRNGKey(42)
N = 1    # single qubit
N1 = 10  # parameter space size

sz = jnp.array([[1, 0], [0, -1]], dtype=jnp.float32)
sx = jnp.array([[0, 1], [1, 0]], dtype=jnp.float32)


class ExplicitMLP(nn.Module):
    """MLP class."""
    features: Sequence[int]

    def setup(self):
        self.layers = [nn.Dense(feat) for feat in self.features]

    def __call__(self, inputs):
        x = inputs
        # The original snippet is truncated here; the body below is the
        # standard Flax linen MLP pattern (assumed): apply each layer with a
        # relu between hidden layers.
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i != len(self.layers) - 1:
                x = nn.relu(x)
        return x
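# A minimal usage sketch for ExplicitMLP (standard Flax linen init/apply
# flow); the feature sizes and input shape below are illustrative assumptions.
model = ExplicitMLP(features=[16, 4, 1])
dummy_in = random.normal(key, (4, 8))
variables = model.init(key, dummy_in)
out = model.apply(variables, dummy_in)
print(out.shape)  # (4, 1)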
def func(y, t, *args):
    omega, params = args
    # Using the DNN output as a control field on the sigma_x term.
    return -1.0j * (omega * sz + A.apply(params, jnp.array([t, omega])) * sx) @ y
def load_prob_method_to_result(problem_ids=all_problems,
                               method_ids=all_methods,
                               problem_to_methods=None,
                               metrics=('mse',)):
    '''
    Description: Loads precomputed results for the experiment instance.

    Args:
        problem_ids (list): ids of problems to evaluate on
        method_ids (list): ids of methods to use
        problem_to_methods (dict): map of the form problem_id -> list of
            method_id. If None, then we assume that the user wants to test
            every method in method_to_params against every problem in
            problem_to_params.
        metrics (list): metrics to load. Note: a bare string default would be
            iterated character by character, so a tuple is used instead.

    Returns:
        prob_method_to_result (dict): Dictionary containing results for all
            specified metrics and performance (time and memory usage) for
            all problem-method associations.
    '''
    if problem_to_methods is None:
        problem_to_methods = create_full_problem_to_methods(
            problem_ids, method_ids)

    prob_method_to_result = {}

    # Get loss series.
    for metric in metrics:
        for problem_id in problem_ids:
            # Datapath for the current metric and problem.
            tigerforecast_dir = get_tigerforecast_dir()
            datapath = 'data/precomputed_results/' + metric + '_' + \
                problem_id[:-3] + '.csv'
            datapath = os.path.join(tigerforecast_dir, datapath)
            with open(datapath) as csvfile:
                reader = csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)
                method_no = 0
                for row in reader:
                    if all_methods[method_no] in method_ids:
                        prob_method_to_result[(
                            metric, problem_id,
                            all_methods[method_no])] = np.array(row)
                    method_no += 1

    # Get time and memory usage.
    for problem_id in problem_ids:
        tigerforecast_dir = get_tigerforecast_dir()
        datapath = 'data/precomputed_results/time_memory' + '_' + \
            problem_id[:-3] + '.csv'
        datapath = os.path.join(tigerforecast_dir, datapath)
        with open(datapath) as csvfile:
            reader = csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)
            method_no = 0
            for row in reader:
                if all_methods[method_no] in method_ids:
                    prob_method_to_result[('time', problem_id,
                                           all_methods[method_no])] = row[0]
                    prob_method_to_result[('memory', problem_id,
                                           all_methods[method_no])] = row[1]
                method_no += 1

    return prob_method_to_result
def setup(self):
    if self.goal_state is None:
        self.goal_state = jnp.array([0., -1., 0.])
def jnp_fun(x, unpacked_indexer):
    indexer = pack_indexer(unpacked_indexer)
    return jnp.array(x)[indexer]
def debug_fft():
    from jax.config import config
    config.update("jax_enable_x64", True)

    import time
    import numpy as np
    import jax
    from jax import numpy as jnp

    np.random.seed(0)
    signal = np.random.randn(2 ** 20)
    signal_jax = jnp.array(signal)
    jfft = jax.jit(jnp.fft.fft)

    import tensorflow as tf
    signal_tf = tf.constant(signal, dtype=tf.complex128)

    def tffft(x):
        return tf.signal.fft(x).numpy()

    X_np = np.fft.fft(signal)
    X_jax = jfft(signal_jax)
    X_tf = tffft(signal_tf)

    print(np.mean(np.abs(X_np)))

    print("With JAX:")
    print('max:\t', jnp.max(jnp.abs(X_np - X_jax)))
    print('mean:\t', jnp.mean(jnp.abs(X_np - X_jax)))
    print('min:\t', jnp.min(jnp.abs(X_np - X_jax)))

    print("With Tensorflow:")
    print('max:\t', jnp.max(jnp.abs(X_np - X_tf)))
    print('mean:\t', jnp.mean(jnp.abs(X_np - X_tf)))
    print('min:\t', jnp.min(jnp.abs(X_np - X_tf)))

    ### CPU
    # 907.3490574884647
    # max:  2.8773885332210747
    # mean: 0.3903197564919141
    # min:  2.4697454729898156e-05

    ### GPU
    # 907.3490574884647
    # max:  0.001166179716824765
    # mean: 0.00020841654559267488
    # min:  2.741492442122853e-07

    R = 100

    ts = time.time()
    for i in range(R):
        _ = np.fft.fft(signal)
    print('numpy fft execution time [ms]:\t', (time.time() - ts) / R * 1000)

    # Compile once before timing.
    _ = jfft(signal_jax).block_until_ready()

    ts = time.time()
    for i in range(R):
        _ = jfft(signal_jax).block_until_ready()
    print('jax fft execution time [ms]:\t', (time.time() - ts) / R * 1000)

    ts = time.time()
    for i in range(R):
        _ = tffft(signal_tf)
    print('tensorflow fft execution time [ms]:\t', (time.time() - ts) / R * 1000)
def line_search(f, xk, pk, old_fval=None, old_old_fval=None, gfk=None,
                c1=1e-4, c2=0.9, maxiter=20):
    """Inexact line search that satisfies strong Wolfe conditions.

    Algorithm 3.5 from Wright and Nocedal, 'Numerical Optimization', 1999,
    pp. 59-61.

    Args:
        f: function of the form f(x) where x is a flat ndarray and returns a
            real scalar. The function should be composed of operations with
            vjp defined.
        xk: initial guess.
        pk: direction to search in. Assumes the direction is a descent
            direction.
        old_fval, gfk: initial value and gradient at the starting position.
        old_old_fval: unused argument, only for scipy API compliance.
        maxiter: maximum number of iterations to search.
        c1, c2: Wolfe criteria constants, see ref.

    Returns:
        LineSearchResults
    """
    def restricted_func_and_grad(t):
        phi, g = jax.value_and_grad(f)(xk + t * pk)
        dphi = jnp.dot(g, pk)
        return phi, dphi, g

    if old_fval is None or gfk is None:
        phi_0, dphi_0, gfk = restricted_func_and_grad(0.)
    else:
        phi_0 = old_fval
        dphi_0 = jnp.dot(gfk, pk)

    def wolfe_one(a_i, phi_i):
        # actually negation of W1
        return phi_i > phi_0 + c1 * a_i * dphi_0

    def wolfe_two(dphi_i):
        return jnp.abs(dphi_i) <= -c2 * dphi_0

    state = _LineSearchState(
        done=False,
        failed=False,
        # algorithm begins at 1 as per Wright and Nocedal, however Scipy has
        # a bug and starts at 0. See https://github.com/scipy/scipy/issues/12157
        i=1,
        a_i1=0.,
        phi_i1=phi_0,
        dphi_i1=dphi_0,
        nfev=1 if (old_fval is None or gfk is None) else 0,
        ngev=1 if (old_fval is None or gfk is None) else 0,
        a_star=0.,
        phi_star=phi_0,
        dphi_star=dphi_0,
        g_star=gfk,
        saddle_point=False,
    )

    def body(state):
        # No amax in this version; we just double as in scipy.
        # Unlike the original algorithm we make our next choice at the start
        # of this loop.
        a_i = jnp.where(state.i == 1, 1., state.a_i1 * 2.)
        # If a_i <= 0 then something went wrong. In practice any really small
        # step length is a failure. Likely means the search direction pk is
        # not good, perhaps we are at a saddle point.
        saddle_point = a_i < 1e-5
        state = state._replace(failed=saddle_point, saddle_point=saddle_point)

        phi_i, dphi_i, g_i = restricted_func_and_grad(a_i)
        state = state._replace(nfev=state.nfev + 1, ngev=state.ngev + 1)

        star_to_zoom1 = wolfe_one(a_i, phi_i) | ((phi_i >= state.phi_i1) &
                                                 (state.i > 1))
        star_to_i = wolfe_two(dphi_i) & (~star_to_zoom1)
        star_to_zoom2 = (dphi_i >= 0.) & (~star_to_zoom1) & (~star_to_i)

        zoom1 = _zoom(restricted_func_and_grad, wolfe_one, wolfe_two,
                      state.a_i1, state.phi_i1, state.dphi_i1,
                      a_i, phi_i, dphi_i, gfk, ~star_to_zoom1)
        state = state._replace(nfev=state.nfev + zoom1.nfev,
                               ngev=state.ngev + zoom1.ngev)

        zoom2 = _zoom(restricted_func_and_grad, wolfe_one, wolfe_two,
                      a_i, phi_i, dphi_i,
                      state.a_i1, state.phi_i1, state.dphi_i1,
                      gfk, ~star_to_zoom2)
        state = state._replace(nfev=state.nfev + zoom2.nfev,
                               ngev=state.ngev + zoom2.ngev)

        state = state._replace(
            done=star_to_zoom1 | state.done,
            failed=(star_to_zoom1 & zoom1.failed) | state.failed,
            **_binary_replace(
                star_to_zoom1,
                state._asdict(),
                zoom1._asdict(),
                keys=['a_star', 'phi_star', 'dphi_star', 'g_star'],
            ),
        )
        state = state._replace(
            done=star_to_i | state.done,
            **_binary_replace(
                star_to_i,
                state._asdict(),
                dict(
                    a_star=a_i,
                    phi_star=phi_i,
                    dphi_star=dphi_i,
                    g_star=g_i,
                ),
            ),
        )
        state = state._replace(
            done=star_to_zoom2 | state.done,
            failed=(star_to_zoom2 & zoom2.failed) | state.failed,
            **_binary_replace(
                star_to_zoom2,
                state._asdict(),
                zoom2._asdict(),
                keys=['a_star', 'phi_star', 'dphi_star', 'g_star'],
            ),
        )
        state = state._replace(i=state.i + 1, a_i1=a_i, phi_i1=phi_i,
                               dphi_i1=dphi_i)
        return state

    state = while_loop(
        lambda state: (~state.done) & (state.i <= maxiter) & (~state.failed),
        body,
        state)

    status = jnp.where(
        state.failed & (~state.saddle_point),
        jnp.array(1),  # zoom failed
        jnp.where(
            state.failed & state.saddle_point,
            jnp.array(2),  # saddle point reached
            jnp.where(
                state.i > maxiter,
                jnp.array(3),  # maxiter reached
                jnp.array(0),  # passed (should be)
            ),
        ),
    )
    results = _LineSearchResults(
        failed=state.failed | (~state.done),
        nit=state.i - 1,  # because iterations started at 1
        nfev=state.nfev,
        ngev=state.ngev,
        k=state.i,
        a_k=state.a_star,
        f_k=state.phi_star,
        g_k=state.g_star,
        status=status,
    )
    return results
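# A minimal usage sketch for line_search above, assuming the helper
# definitions it relies on (_LineSearchState, _LineSearchResults, _zoom,
# _binary_replace, while_loop) are in scope. The objective is illustrative.
import jax
import jax.numpy as jnp

def rosenbrock(x):
    return jnp.sum(100. * (x[1:] - x[:-1] ** 2) ** 2 + (1. - x[:-1]) ** 2)

xk = jnp.zeros(2)
pk = -jax.grad(rosenbrock)(xk)  # steepest-descent direction at xk
res = line_search(rosenbrock, xk, pk)
print(res.a_k, res.f_k, res.status)  # step length, new value, exit code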
def invm_plus(Pb, Pc):
    # Invariant mass squared of the combined system Pb + Pc, using the same
    # metric convention as invm above.
    Pbc = Pb + Pc
    _Pbc = Pbc * np.array([-1, -1, -1, 1])
    return np.sum(Pbc * _Pbc, axis=1)
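# A minimal usage sketch for invm / invm_plus above, assuming each row is a
# four-momentum ordered (px, py, pz, E), so the metric diag(-1, -1, -1, +1)
# yields the invariant mass squared m^2 = E^2 - |p|^2.
import numpy as np

Pb_demo = np.array([[0., 0., 3., 5.]])   # m^2 = 25 - 9 = 16
Pc_demo = np.array([[0., 0., -3., 5.]])
print(invm(Pb_demo))                # [16.]
print(invm_plus(Pb_demo, Pc_demo))  # [100.]: combined system at rest, E = 10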
def supervised_optimization(self, sup_density_list, wiring_str,
                            save_supervised_result_bool, dataset_str,
                            EXPLOITATION_NUM_EPOCHS, EXPLOITATION_BATCH_SIZE,
                            OPTIMIZER_STR, STEP_SIZE, REG,
                            W_initializers_str='glorot_normal()',
                            b_initializers_str='normal()',
                            init_weight_rescale_bool=False,
                            EXPLOITATION_VALIDATION_FRACTION=0.1,
                            EXPLOIT_TRAIN_DATASET_FRACTION=1.0,
                            RECORD_ACC_FREQ=100,
                            DROPOUT_LAYER_POS=[],
                            **kwargs):
    """
    Train a neural network with loaded wiring from scratch.

    Args:
        sup_density_list: a list of network density levels
        wiring_str: a string that represents the network wiring, e.g.,
            trans, rand, snip
        dataset_str: a string used to retrieve the dataset
        EXPLOITATION_NUM_EPOCHS: the number of epochs used in supervised
            training
        EXPLOITATION_BATCH_SIZE: the batch size used in supervised training
        OPTIMIZER_STR: a string used to retrieve the optimizer
        STEP_SIZE: step size of the optimizer
        REG: l2 regularization constant
        EXPLOITATION_VALIDATION_FRACTION: the fraction of training data held
            out for validation purposes
        EXPLOIT_TRAIN_DATASET_FRACTION: the fraction of training data used
            in evaluation
        RECORD_ACC_FREQ: the frequency for recording train and test results

    Returns:
        output (dict): train_results, test_results, and trained_params,
            each keyed by density level.
    """
    for density in sup_density_list:
        if density not in self.ntt_setup_dict['NN_DENSITY_LEVEL_LIST']:
            raise ValueError(
                'The desired density level for supervised training is not used in NTT.'
            )

    dataset_info = Dataset(
        datasource=dataset_str,
        VALIDATION_FRACTION=EXPLOITATION_VALIDATION_FRACTION)
    dataset = dataset_info.dataset  # configure the dataset
    gen_batches = dataset_info.data_stream(EXPLOITATION_BATCH_SIZE)

    batch_input_shape = [-1] + self.ntt_setup_dict['instance_input_shape']

    nr_training_samples = len(dataset['train']['input'])
    nr_training_samples_subset = int(nr_training_samples *
                                     EXPLOIT_TRAIN_DATASET_FRACTION)

    train_input = dataset['train']['input'][:nr_training_samples_subset].reshape(
        batch_input_shape)
    train_label = dataset['train']['label'][:nr_training_samples_subset]

    test_input = dataset['test']['input'].reshape(batch_input_shape)
    test_label = dataset['test']['label']

    num_complete_batches, leftover = divmod(nr_training_samples,
                                            EXPLOITATION_BATCH_SIZE)
    num_mini_batches_per_epochs = num_complete_batches + bool(leftover)
    total_batch = EXPLOITATION_NUM_EPOCHS * num_mini_batches_per_epochs

    if len(DROPOUT_LAYER_POS) == 0:
        # In this case, dropout is NOT used.
        init_fun_no_dropout, f_train = model_dict[self.model_str](
            W_initializers_str=W_initializers_str,
            b_initializers_str=b_initializers_str)
        f_test = f_train
        f_no_dropout = f_train
        key_dropout = None
        subkey_dropout = None
    else:
        # In this case, dropout is used.
        _, f_train = model_dict[self.model_str + '_dropout'](
            mode='train',
            W_initializers_str=W_initializers_str,
            b_initializers_str=b_initializers_str)
        _, f_test = model_dict[self.model_str + '_dropout'](
            mode='test',
            W_initializers_str=W_initializers_str,
            b_initializers_str=b_initializers_str)
        init_fun_no_dropout, f_no_dropout = model_dict[self.model_str](
            W_initializers_str=W_initializers_str,
            b_initializers_str=b_initializers_str)
        key_dropout = random.PRNGKey(0)

    @jit
    def step(i, opt_state, x, y, masks, key):
        this_step_params = get_params(opt_state)
        masked_g = grad(softmax_cross_entropy_with_logits_l2_reg)(
            this_step_params, f_train, x, y, masks, L2_REG_COEFF=REG, key=key)
        return opt_update(i, masked_g, opt_state)

    train_results_dict = {}
    test_results_dict = {}
    trained_masked_dict = {}

    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    time.sleep(orig_random.uniform(1, 5))
    now_str = '__' + str(datetime.now().strftime("%D:%H:%M:%S")).replace(
        '/', ':')

    supervised_model_info = '[u]' + self.ntt_file_name + '_[s]' + dataset_str
    supervised_model_wiring_info = supervised_model_info + '_' + wiring_str
    supervised_model_wiring_dir = (self.supervised_result_path +
                                   supervised_model_info + '/' +
                                   supervised_model_wiring_info + now_str)

    if save_supervised_result_bool:
        while os.path.exists(supervised_model_wiring_dir):
            temp = supervised_model_wiring_dir + '_0'
            supervised_model_wiring_dir = temp
        # print(supervised_model_wiring_dir)
        os.makedirs(supervised_model_wiring_dir)
        logging.basicConfig(filename=supervised_model_wiring_dir +
                            "/supervised_learning_log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename="supervised_learning_log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w',
                            level=logging.DEBUG)

    for nn_density_level in sup_density_list:
        nn_density_level = onp.round(nn_density_level, 2)
        train_acc_list_runs = []
        test_acc_list_runs = []
        trained_masked_params_runs = []

        for run_index in range(1, self.ntt_setup_dict['NUM_RUNS'] + 1):
            if wiring_str == 'trans':
                # Load ntt masks and parameters.
                density_run_dir = '/' + 'density_' + str(nn_density_level) + \
                    '/' + 'run_' + str(run_index)
                transferred_masks_fileName = '/transferred_masks_' + \
                    self.model_str + density_run_dir.replace('/', '_') + '.npy'
                transferred_param_fileName = '/transferred_params_' + \
                    self.model_str + density_run_dir.replace('/', '_') + '.npy'
                masks = list(
                    np.load(self.ntt_result_path + density_run_dir +
                            transferred_masks_fileName,
                            allow_pickle=True))
                masked_params = list(
                    np.load(self.ntt_result_path + density_run_dir +
                            transferred_param_fileName,
                            allow_pickle=True))
            elif wiring_str == 'rand':
                # Randomly initialize masks and parameters.
                _, params = init_fun_no_dropout(random.PRNGKey(run_index),
                                                tuple(batch_input_shape))
                masks = get_masks_from_jax_params(
                    params,
                    nn_density_level,
                    global_bool=self.ntt_setup_dict['GLOBAL_PRUNE_BOOL'],
                    magnitude_base_bool=False,
                    reshuffle_seed=run_index)
                masked_params = get_sparse_params_filtered_by_masks(
                    params, masks)
            elif wiring_str == 'dense':
                # Randomly initialize parameters; no masks for a dense net.
                _, params = init_fun_no_dropout(random.PRNGKey(run_index),
                                                tuple(batch_input_shape))
                # masks = get_masks_from_jax_params(params, nn_density_level, global_bool=self.ntt_setup_dict['GLOBAL_PRUNE_BOOL'], magnitude_base_bool=False, reshuffle_seed=run_index)
                logger.info("Dense net!!")
                masks = None
                masked_params = params
            elif wiring_str == 'snip':
                # Randomly initialize masks and parameters.
                if dataset_str == 'cifar-10':
                    num_examples_snip = 128
                else:
                    num_examples_snip = 100
                snip_input = dataset['train']['input'][:num_examples_snip]
                snip_label = dataset['train']['label'][:num_examples_snip]
                snip_batch = (snip_input, snip_label)
                _, params = init_fun_no_dropout(random.PRNGKey(run_index),
                                                tuple(batch_input_shape))
                if not self.ntt_setup_dict['GLOBAL_PRUNE_BOOL']:
                    logger.info("Use layerwise snip")
                masks = get_snip_masks(
                    params, nn_density_level, f_no_dropout, snip_batch,
                    batch_input_shape,
                    self.ntt_setup_dict['GLOBAL_PRUNE_BOOL'])
                masked_params = get_sparse_params_filtered_by_masks(
                    params, masks)
            elif wiring_str == 'logit_snip':
                # Randomly initialize masks and parameters.
                if dataset_str == 'cifar-10':
                    num_examples_snip = 128
                else:
                    num_examples_snip = 100
                snip_input = dataset['train']['input'][:num_examples_snip]
                _, params = init_fun_no_dropout(random.PRNGKey(run_index),
                                                tuple(batch_input_shape))
                masks = get_logit_snip_masks(
                    params, nn_density_level, f_no_dropout, snip_input,
                    batch_input_shape,
                    self.ntt_setup_dict['GLOBAL_PRUNE_BOOL'])
                # get_snip_masks(params, nn_density_level, f_no_dropout, snip_batch, batch_input_shape)
                masked_params = get_sparse_params_filtered_by_masks(
                    params, masks)
            else:
                raise ValueError('The wiring string is undefined.')

            # Optionally, add dropout layers.
            # Test without dropout masks.
            if len(DROPOUT_LAYER_POS) > 100:
                dropout_masked_params = [()] * (len(masked_params) +
                                                len(DROPOUT_LAYER_POS))
                dropout_masks = [[]] * (len(masked_params) +
                                        len(DROPOUT_LAYER_POS))
                print(len(masked_params))  # check dropout position
                # pprint(masked_params)  # check
                num_inserted = 0
                for i in range(len(dropout_masked_params)):
                    if i in DROPOUT_LAYER_POS:
                        num_inserted += 1
                    else:
                        dropout_masked_params[i] = masked_params[i - num_inserted]
                        dropout_masks[i] = masks[i - num_inserted]
                masks = dropout_masks
                masked_params = dropout_masked_params

            if init_weight_rescale_bool == True:
                logger.info(
                    "Init weight rescaled: W_scaled = W/sqrt(nn_density_level)")
                scaled_params = []
                for i in range(len(masked_params)):
                    if len(masked_params[i]) == 2:
                        scaled_params.append(
                            (masked_params[i][0] * np.sqrt(1 / nn_density_level),
                             masked_params[i][1]))
                    else:
                        scaled_params.append(masked_params[i])
                masked_params = scaled_params

            optimizer_with_params = optimizer_dict[OPTIMIZER_STR](
                step_size=STEP_SIZE)
            opt_init, opt_update, get_params = optimizer_with_params
            opt_state = opt_init(masked_params)

            train_acc_list = []
            test_acc_list = []
            itercount = itertools.count()

            for iteration in range(total_batch):
                batch_xs, batch_ys = next(gen_batches)
                batch_xs = batch_xs.reshape(batch_input_shape)
                if key_dropout is not None:
                    key_dropout, subkey_dropout = random.split(key_dropout)
                opt_state = step(next(itercount), opt_state, batch_xs,
                                 batch_ys, masks=masks, key=subkey_dropout)

                if iteration % RECORD_ACC_FREQ == 0:
                    masked_trans_params = get_params(opt_state)
                    train_acc = accuracy(masked_trans_params, f_test,
                                         train_input, train_label, key_dropout)
                    test_acc = accuracy(masked_trans_params, f_test,
                                        test_input, test_label, key_dropout)
                    train_acc_list.append(train_acc)
                    test_acc_list.append(test_acc)
                    logger.info(
                        "NN density %.2f | Run %03d/%03d | Iteration %03d/%03d | Train acc %.2f%% | Test acc %.2f%%",
                        nn_density_level, run_index,
                        self.ntt_setup_dict['NUM_RUNS'], iteration + 1,
                        total_batch, train_acc * 100, test_acc * 100)

            trained_masked_trans_params = get_params(opt_state)
            train_acc_list_runs.append(train_acc_list)
            test_acc_list_runs.append(test_acc_list)
            trained_masked_params_runs.append(trained_masked_trans_params)

        train_acc_list_runs = np.array(train_acc_list_runs)
        test_acc_list_runs = np.array(test_acc_list_runs)

        train_results_dict[str(nn_density_level)] = train_acc_list_runs
        test_results_dict[str(nn_density_level)] = test_acc_list_runs
        trained_masked_dict[str(nn_density_level)] = trained_masked_params_runs

        if save_supervised_result_bool:
            supervised_model_wiring_dir_run = supervised_model_wiring_dir + \
                '/density_' + str(round(nn_density_level, 2)) + '/'
            while os.path.exists(supervised_model_wiring_dir_run):
                temp = supervised_model_wiring_dir_run + '_0'
                supervised_model_wiring_dir_run = temp
            os.makedirs(supervised_model_wiring_dir_run)
            model_summary_str = '[u]' + self.ntt_file_name + '_[s]' + \
                dataset_str + '_density_' + str(round(nn_density_level, 2))
            np.save(
                supervised_model_wiring_dir_run + '/' +
                'supervised_trained_' + model_summary_str,
                [
                    nn_density_level, train_acc_list_runs,
                    test_acc_list_runs, trained_masked_params_runs
                ])

    output = dict(train_results=train_results_dict,
                  test_results=test_results_dict,
                  trained_params=trained_masked_dict)
    return output
chi = 0.3
# uB associated parameter
B = 2
# constant cost
c_h = 0.5
# social welfare after the unemployment
welfare = 5
# tax rate before and after retirement
tau_L = 0.2
tau_R = 0.1
# number of states S
nS = 27
# probability of survival
Pa = jnp.array(np.load("constant/prob.npy"))
# deterministic income
detEarning = jnp.array(np.load("constant/detEarningHigh.npy"))

# Define the transition matrix of economic states S.
Ps = np.genfromtxt('constant/Ps.csv', delimiter=',')
# Renormalize each row so it sums to 1: subtract the row's excess from the
# first entry large enough to absorb it.
fix = (np.sum(Ps, axis=1) - 1)
for i in range(nS):
    for j in range(nS):
        if Ps[i, j] - fix[i] > 0:
            Ps[i, j] = Ps[i, j] - fix[i]
            break
Ps = jnp.array(Ps)

# The possible GDP growth, stock return, bond return
gkfe = np.genfromtxt('constant/gkfe.csv', delimiter=',')
gkfe = jnp.array(gkfe)
# GDP growth depending on current S state
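# A quick sanity-check sketch (assumption: run right after the loop above):
# after the renormalization, every row of the transition matrix should sum
# to 1 up to floating-point error.
assert np.allclose(np.asarray(Ps).sum(axis=1), 1.0)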
def test_sample_loss_fn(self):
    example = self._make_example()
    example = dataclasses.replace(
        example,
        edges=sparse_operator.SparseCoordOperator(
            input_indices=jnp.array([[0], [0], [0], [0], [1], [2], [0], [0]]),
            output_indices=jnp.array([[1, 2], [2, 3], [2, 2], [3, 0], [0, 2],
                                      [0, 3], [0, 0], [0, 0]]),
            values=jnp.array([1, 1, 1, 1, 1, 1, 0, 0])))

    @flax.nn.module
    def mock_model_def(example):
        del example
        side_outputs.SideOutput(
            -jnp.arange(5).astype("float32").reshape((1, 5)),
            name="one_sample_log_prob_per_edge_per_node")
        side_outputs.SideOutput(0.3, name="one_sample_reward_baseline")
        return model_util.safe_logit(
            jnp.array([
                [0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 1.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0],
            ]))

    _, params = mock_model_def.init(jax.random.PRNGKey(0), example)
    mock_model = flax.nn.Model(mock_model_def, params)

    _, _, _, loss, metrics = train_edge_supervision_lib.sample_loss_fn(
        mock_model, (example, jax.random.PRNGKey(0)),
        target_edge_index=0,
        num_edge_types=3,
        num_rollouts=1,
        leave_one_out_baseline=False)

    np.testing.assert_allclose(metrics["reward"], 0.75, rtol=1e-5)
    np.testing.assert_allclose(metrics["shifted_reward"], 0.75 - 0.3,
                               rtol=1e-5)
    np.testing.assert_allclose(metrics["policy_log_prob"], -1.5, rtol=1e-5)
    np.testing.assert_allclose(metrics["learned_baseline"], 0.3, rtol=1e-5)
    np.testing.assert_allclose(
        metrics["baseline_penalty"],
        0.001 * (0.75 * (0.7 * 0.7) + 0.25 * (0.3 * 0.3)),
        rtol=1e-5)
    np.testing.assert_allclose(
        metrics["reinforce_term"],
        (0 * 0.7 + 1 * 0.7 + 2 * 0.7 + 3 * -0.3) / 4,
        rtol=1e-5)
    np.testing.assert_allclose(
        loss, metrics["reinforce_term"] + metrics["baseline_penalty"],
        rtol=1e-5)

    (output_logits, targets, valid_mask, loss,
     metrics) = train_edge_supervision_lib.sample_loss_fn(
         mock_model, (example, jax.random.PRNGKey(0)),
         target_edge_index=0,
         num_edge_types=3,
         num_rollouts=20,
         leave_one_out_baseline=True)

    self.assertEqual(output_logits.shape, (5, 5))
    self.assertEqual(targets.shape, (5, 5))
    self.assertEqual(valid_mask.shape, (5, 5))
    np.testing.assert_allclose(metrics["reward"], 0.75, rtol=1e-5)
    np.testing.assert_allclose(metrics["shifted_reward"], 0, rtol=1e-5)
    np.testing.assert_allclose(metrics["learned_baseline"], 0.3, rtol=1e-5)
    np.testing.assert_allclose(metrics["baseline_penalty"], 0.0, rtol=1e-5)
def testOneHotOutOfBound(self):
    actual = nn.one_hot(jnp.array([-1, 3]), 3)
    expected = jnp.array([[0., 0., 0.],
                          [0., 0., 0.]])
    self.assertAllClose(actual, expected)
import neos.transforms as transforms
import jax.numpy as jnp
import neos.models as models
import jax
import scipy.optimize
import neos.fit as fit
from neos.cls import cls_maker
import pyhf
import funnyscipy

bounds = jnp.array([[0, 10], [0, 20]])

# Check that we map to inf space (i.e. -pi/2 to pi/2).
w = jnp.linspace(0, 10)
x = transforms.toinf(w, bounds[0])
print(x.min(), x.max())

# Check that we can map very large values to bounded space.
w = jnp.linspace(-1000, 1000, 1001)
x = transforms.to_bounded(w, bounds[0])
print(x.min(), x.max())


# Define NLL functions in both parameter spaces.
def make_nll_boundspace(hyperpars):
    s, b, db = hyperpars

    def nll_boundspace(pars):
        truth_pars = [0, 1]
        m = models.hepdata_like([s], [b], [db])
def testOneHotCustomDtype(self):
    actual = nn.one_hot(jnp.array([0, 1, 2]), 3, dtype=jnp.bool_)
    expected = jnp.array([[True, False, False],
                          [False, True, False],
                          [False, False, True]])
    self.assertAllClose(actual, expected)
def test_new(self):
    stat = metrics.MeanStat.new(jnp.array([2, 3, 1]), jnp.array([1, 0, 1]))
    npt.assert_array_equal(stat.accum, [2, 0, 1])
    npt.assert_array_equal(stat.weight, [1, 0, 1])
def value(P, r, pis):
    # jax doesn't seem to like changing the batch size under vmap, so fall
    # back to a Python list comprehension.
    return np.array([
        utils.value_functional(P, r, pi, discount) for pi in pis
    ])
def test_reduce(self):
    stat = metrics.MeanStat.new(jnp.array([1, 2, 4]), jnp.array([1, 1, 0]))
    reduced_stat = stat.reduce()
    self.assertEqual(reduced_stat.accum, 3)
    self.assertEqual(reduced_stat.weight, 2)
def setUp(self):
    super().setUp()
    self.init_params = (jnp.array([1., 2.]), jnp.array([3., 4.]))
    self.per_step_updates = (jnp.array([500., 5.]), jnp.array([300., 3.]))
def test_reduce(self):
    stat = metrics.SumStat.new(jnp.array([1, 2, 1]))
    reduced_stat = stat.reduce()
    self.assertEqual(reduced_stat.accum, 4)
def load_pretrained(*, pretrained_path, init_params, model_config, logger):
    """Loads/converts a pretrained checkpoint for fine tuning.

    Args:
        pretrained_path: File pointing to the pretrained checkpoint.
        init_params: Parameters from the model. Will be used for the head of
            the model and to verify that the model is compatible with the
            stored checkpoint.
        model_config: Configuration of the model. Will be used to configure
            the head and rescale the position embeddings.
        logger: Logger to use to output diagnostic messages.

    Returns:
        Parameters like `init_params`, but loaded with pretrained weights
        from `pretrained_path` and adapted accordingly.
    """
    restored_params = inspect_params(
        params=load(pretrained_path),
        expected=init_params,
        logger=logger,
        fail_if_extra=False,
        fail_if_missing=False)

    # The following allows implementing fine-tuning head variants depending
    # on the value of `representation_size` in the fine-tuning job:
    # - `None`: drop the whole head and attach a nn.Linear.
    # - Same number as in pre-training: keep the head but reset the last
    #   layer (logits) for the new task.
    if model_config.representation_size is None:
        if 'pre_logits' in restored_params:
            logger.info('load_pretrained: drop-head variant')
            restored_params['pre_logits'] = {}
    restored_params['head']['kernel'] = init_params['head']['kernel']
    restored_params['head']['bias'] = init_params['head']['bias']

    if 'posembed_input' in restored_params.get('Transformer', {}):
        # Rescale the grid of position embeddings. Param shape is (1, N, 1024).
        posemb = restored_params['Transformer']['posembed_input']['pos_embedding']
        posemb_new = init_params['Transformer']['posembed_input']['pos_embedding']
        if posemb.shape != posemb_new.shape:
            logger.info('load_pretrained: resized variant: %s to %s',
                        posemb.shape, posemb_new.shape)
            ntok_new = posemb_new.shape[1]

            if model_config.classifier == 'token':
                posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:]
                ntok_new -= 1
            else:
                posemb_tok, posemb_grid = posemb[:, :0], posemb[0]

            gs_old = int(np.sqrt(len(posemb_grid)))
            gs_new = int(np.sqrt(ntok_new))
            logger.info('load_pretrained: grid-size from %s to %s',
                        gs_old, gs_new)

            posemb_grid = posemb_grid.reshape(gs_old, gs_old, -1)
            zoom = (gs_new / gs_old, gs_new / gs_old, 1)
            posemb_grid = scipy.ndimage.zoom(posemb_grid, zoom, order=1)
            posemb_grid = posemb_grid.reshape(1, gs_new * gs_new, -1)
            posemb = jnp.array(np.concatenate([posemb_tok, posemb_grid],
                                              axis=1))
            restored_params['Transformer']['posembed_input']['pos_embedding'] = posemb

    return restored_params
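# A standalone sketch of the position-embedding resize used above: reshape
# the flat token grid to (gs, gs, C), bilinearly interpolate to the new grid
# size with scipy.ndimage.zoom, then flatten back. The shapes below are
# illustrative assumptions.
import numpy as np
import scipy.ndimage

gs_old, gs_new, C = 14, 24, 768
posemb_grid = np.random.randn(gs_old * gs_old, C)

grid = posemb_grid.reshape(gs_old, gs_old, C)
zoom_factors = (gs_new / gs_old, gs_new / gs_old, 1)
resized = scipy.ndimage.zoom(grid, zoom_factors, order=1)  # order=1: linear
posemb_new = resized.reshape(1, gs_new * gs_new, C)
assert posemb_new.shape == (1, gs_new * gs_new, C)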
def testNTK_NTKNNGPAgreement(self, train_shape, test_shape, network,
                             out_logits):
    _, x_test, x_train, y_train = self._get_inputs(out_logits, test_shape,
                                                   train_shape)
    _, _, ker_fun = _build_network(train_shape[1:], network, out_logits)
    reg = 1e-7
    predictor = predict.gradient_descent_mse_ensemble(ker_fun, x_train,
                                                      y_train, diag_reg=reg)
    ts = np.logspace(-2, 8, 10).reshape((5, 2))

    for t in (None, 'ts'):
        for x in (None, 'x_test'):
            with self.subTest(t=t, x=x):
                x = x if x is None else x_test
                t = t if t is None else ts

                ntk = predictor(t=t, get='ntk', x_test=x)

                # Test time broadcasting.
                if t is not None:
                    ntk_ind = np.array([
                        predictor(t=t, get='ntk', x_test=x)
                        for t in t.ravel()
                    ]).reshape(t.shape + ntk.shape[2:])
                    self.assertAllClose(ntk_ind, ntk)

                # Create a hacked kernel function that always returns the
                # ntk kernel.
                def always_ntk(x1, x2, get=('nngp', 'ntk')):
                    out = ker_fun(x1, x2, get=('nngp', 'ntk'))
                    if get == 'nngp' or get == 'ntk':
                        return out.ntk
                    else:
                        return out._replace(nngp=out.ntk)

                predictor_ntk = predict.gradient_descent_mse_ensemble(
                    always_ntk, x_train, y_train, diag_reg=reg)

                ntk_nngp = predictor_ntk(t=t, get='nngp', x_test=x)

                # Test that if you use nngp equations with ntk, you get the
                # same mean.
                self.assertAllClose(ntk, ntk_nngp)

                # Next test that if you go through the NTK code path, but
                # with only the NNGP kernel, we recreate the NNGP dynamics.
                # Create a hacked kernel function that always returns the
                # nngp kernel.
                def always_nngp(x1, x2, get=('nngp', 'ntk')):
                    out = ker_fun(x1, x2, get=('nngp', 'ntk'))
                    if get == 'nngp' or get == 'ntk':
                        return out.nngp
                    else:
                        return out._replace(ntk=out.nngp)

                predictor_nngp = predict.gradient_descent_mse_ensemble(
                    always_nngp, x_train, y_train, diag_reg=reg)

                nngp_cov = predictor(t=t, get='nngp', x_test=x,
                                     compute_cov=True).covariance

                # Test time broadcasting for covariance.
                nngp_ntk_cov = predictor_nngp(t=t, get='ntk', x_test=x,
                                              compute_cov=True).covariance
                if t is not None:
                    nngp_ntk_cov_ind = np.array([
                        predictor_nngp(t=t, get='ntk', x_test=x,
                                       compute_cov=True).covariance
                        for t in t.ravel()
                    ]).reshape(t.shape + nngp_cov.shape[2:])
                    self.assertAllClose(nngp_ntk_cov_ind, nngp_ntk_cov)

                # Test that if you use ntk equations with nngp, you get the
                # same cov. Although, due to accumulation of numerical
                # errors, only roughly.
                self.assertAllClose(nngp_cov, nngp_ntk_cov)