def compute_empirical(step, layers, load_kwargs, empirical):
    """Store ``utils.in_synapses`` of the squared parameters for one step.

    Loads the features saved under the ``"weight"`` and ``"bias"`` suffixes
    for ``step`` and, for every layer in ``layers``, writes
    ``utils.in_synapses(W ** 2, b ** 2)`` to ``empirical[layer][step]``.

    Args:
        step: Training step whose features are loaded.
        layers: Iterable of layer names to process.
        load_kwargs: Extra keyword arguments forwarded to
            ``utils.load_features``.
        empirical: Mapping ``layer -> {step: value}``; updated in place.

    Returns:
        None. Results are written into ``empirical``.
    """
    step_key = f"step_{step}"
    weights = utils.load_features(
        steps=[str(step)], suffix="weight", **load_kwargs
    )
    biases = utils.load_features(
        steps=[str(step)], suffix="bias", **load_kwargs
    )
    for name in layers:
        squared_weight = weights[name][step_key] ** 2
        squared_bias = biases[name][step_key] ** 2
        empirical[name][step] = utils.in_synapses(squared_weight, squared_bias)
def gradient(model, feats_dir, steps, **kwargs):
    """Gather ``utils.in_synapses`` of the saved grad-norm buffers per step.

    Only layers returned by ``utils.get_layers(model)`` whose name contains
    ``"conv"`` are considered. Step ``0`` is skipped.

    Args:
        model: Forwarded to ``utils.get_layers`` and ``utils.load_features``.
        feats_dir: Forwarded to ``utils.load_features``.
        steps: Sequence of training steps to process.
        **kwargs: Accepted but not used by this function.

    Returns:
        ``{"empirical": {layer: {step: value}}}`` where each value is
        ``utils.in_synapses`` applied to the layer's
        ``weight.grad_norm_buffer`` and ``bias.grad_norm_buffer`` features.
    """
    conv_layers = [name for name in utils.get_layers(model) if "conv" in name]
    empirical = {name: {} for name in conv_layers}

    def load_buffers(step, suffix):
        # Both buffer kinds are read from the "buffers" group with the same
        # arguments; only the suffix differs.
        return utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix=suffix,
            group="buffers",
        )

    for step in tqdm(steps):
        if step == 0:
            continue
        step_key = f"step_{step}"
        weight_buffers = load_buffers(step, "weight.grad_norm_buffer")
        bias_buffers = load_buffers(step, "bias.grad_norm_buffer")
        for name in conv_layers:
            empirical[name][step] = utils.in_synapses(
                weight_buffers[name][step_key],
                bias_buffers[name][step_key],
            )

    return {"empirical": empirical}
def compute_pos_vel(step, layers, load_kwargs, position, velocity, **kwargs):
    """Fill ``position`` and ``velocity`` for every layer at one step.

    For each layer, with ``W``/``b`` the saved parameters and ``g_W``/``g_b``
    the saved ``grad_norm_buffer`` features:

    * ``position[layer][step] = utils.in_synapses(W ** 2, b ** 2)``
    * ``velocity[layer][step] = lr * utils.in_synapses(g_W, g_b)
      - (2 * wd + lr * wd ** 2) * position[layer][step]``

    Args:
        step: Training step whose features are loaded.
        layers: Iterable of layer names to process.
        load_kwargs: Extra keyword arguments forwarded to
            ``utils.load_features``.
        position: Mapping ``layer -> {step: value}``; updated in place.
        velocity: Mapping ``layer -> {step: value}``; updated in place.
        **kwargs: Must contain ``lr`` and ``wd``; other keys are ignored.

    Returns:
        None. Results are written into ``position`` and ``velocity``.

    Raises:
        TypeError: If ``lr`` or ``wd`` is missing or ``None``.
    """
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")
    # Fail before any feature is loaded: a missing value would otherwise only
    # surface as an opaque "NoneType * ..." TypeError inside the layer loop.
    missing = [
        name for name, value in (("lr", lr), ("wd", wd)) if value is None
    ]
    if missing:
        raise TypeError(
            "compute_pos_vel() requires keyword argument(s): "
            + ", ".join(missing)
        )

    weights = utils.load_features(
        steps=[str(step)], suffix="weight", group="params", **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)], suffix="bias", group="params", **load_kwargs,
    )
    weight_buffers = utils.load_features(
        steps=[str(step)],
        suffix="weight.grad_norm_buffer",
        group="buffers",
        **load_kwargs,
    )
    bias_buffers = utils.load_features(
        steps=[str(step)],
        suffix="bias.grad_norm_buffer",
        group="buffers",
        **load_kwargs,
    )

    step_key = f"step_{step}"
    # Coefficient applied to the position term; identical for every layer.
    decay = 2 * wd + lr * wd ** 2
    for layer in layers:
        Wl_t = weights[layer][step_key]
        bl_t = biases[layer][step_key]
        position[layer][step] = utils.in_synapses(Wl_t ** 2, bl_t ** 2)

        g_Wl_t = weight_buffers[layer][step_key]
        g_bl_t = bias_buffers[layer][step_key]
        # velocity = lr * |g|^2 - (2 * wd + lr * wd^2) * |theta|^2
        velocity[layer][step] = lr * utils.in_synapses(g_Wl_t, g_bl_t)
        velocity[layer][step] -= decay * position[layer][step]
def compute_theoretical(
    step, layers, load_kwargs, theoretical, i, step_0, lr, wd, W_0, b_0,
):
    """Write the theoretical per-layer value for one step into ``theoretical``.

    With ``t = lr * step`` the stored value is
    ``exp(-2 * wd * t) * utils.in_synapses(W_0 ** 2, b_0 ** 2)``. When
    ``i > 0`` the term
    ``lr ** 2 * exp(-2 * wd * t) * utils.in_synapses(g_W, g_b)`` is added,
    where ``g_W``/``g_b`` are the ``integral_buffer`` features saved for
    ``step``.

    Args:
        step: Training step being evaluated.
        layers: Iterable of layer names to process.
        load_kwargs: Extra keyword arguments forwarded to
            ``utils.load_features``.
        theoretical: Mapping ``layer -> {step: value}``; updated in place.
        i: Index of ``step``; the integral buffers are only used if ``i > 0``.
        step_0: Step under which the initial parameters are keyed.
        lr: Learning rate.
        wd: Weight decay.
        W_0: Initial weights, indexed as ``W_0[layer][f"step_{step_0}"]``.
        b_0: Initial biases, indexed as ``b_0[layer][f"step_{step_0}"]``.

    Returns:
        None. Results are written into ``theoretical``.
    """
    t = lr * step
    use_integrals = i > 0
    if use_integrals:
        weight_integrals = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer", **load_kwargs
        )
        bias_integrals = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer", **load_kwargs
        )

    initial_key = f"step_{step_0}"
    current_key = f"step_{step}"
    for name in layers:
        initial_weight = W_0[name][initial_key]
        initial_bias = b_0[name][initial_key]
        decay = np.exp(-2 * wd * t)
        theoretical[name][step] = decay * utils.in_synapses(
            initial_weight ** 2, initial_bias ** 2
        )
        if not use_integrals:
            continue
        integral_term = utils.in_synapses(
            weight_integrals[name][current_key],
            bias_integrals[name][current_key],
        )
        # In-place accumulation, as the value was just created above.
        theoretical[name][step] += (lr ** 2) * decay * integral_term
def compute_theoretical(
    step, layers, load_kwargs, theoretical, i, step_0, lr, wd, W_0, b_0,
):
    """Write the theoretical out-minus-in value for consecutive layer pairs.

    For every layer a decayed squared weight/bias is built as
    ``exp(-2 * wd * t) * theta_0 ** 2`` with ``t = lr * step``. When ``i > 0``
    the term ``lr ** 2 * exp(-2 * wd * t) * g`` is added, ``g`` being the
    layer's ``integral_buffer`` feature for ``step``. For each layer after the
    first, ``theoretical[layer][step]`` is set to ``utils.out_synapses`` of
    that layer's weight term minus ``utils.in_synapses`` of the previous
    layer's weight and bias terms.

    Args:
        step: Training step being evaluated.
        layers: Sequence of layer names; must support ``layers[0]`` and
            ``layers[1:]``.
        load_kwargs: Extra keyword arguments forwarded to
            ``utils.load_features``.
        theoretical: Mapping ``layer -> {step: value}``; updated in place for
            every layer except the first.
        i: Index of ``step``; the integral buffers are only used if ``i > 0``.
        step_0: Step under which the initial parameters are keyed.
        lr: Learning rate.
        wd: Weight decay.
        W_0: Initial weights, indexed as ``W_0[layer][f"step_{step_0}"]``.
        b_0: Initial biases, indexed as ``b_0[layer][f"step_{step_0}"]``.

    Returns:
        None. Results are written into ``theoretical``.
    """
    t = lr * step
    if i > 0:
        weight_buffers = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer", **load_kwargs
        )
        bias_buffers = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer", **load_kwargs
        )

    def decayed_terms(name):
        # Returns the (weight, bias) terms of one layer at time ``t``.
        weight_term = np.exp(-2 * wd * t) * W_0[name][f"step_{step_0}"] ** 2
        bias_term = np.exp(-2 * wd * t) * b_0[name][f"step_{step_0}"] ** 2
        if i > 0:
            weight_integral = weight_buffers[name][f"step_{step}"]
            bias_integral = bias_buffers[name][f"step_{step}"]
            # In-place on purpose: keeps the result type of the terms above.
            weight_term += (lr ** 2) * np.exp(-2 * wd * t) * weight_integral
            bias_term += (lr ** 2) * np.exp(-2 * wd * t) * bias_integral
        return weight_term, bias_term

    incoming_weight, incoming_bias = decayed_terms(layers[0])
    for name in layers[1:]:
        outgoing_weight, outgoing_bias = decayed_terms(name)
        theoretical[name][step] = utils.out_synapses(
            outgoing_weight
        ) - utils.in_synapses(incoming_weight, incoming_bias)
        # This layer's terms become the incoming side of the next pair.
        incoming_weight, incoming_bias = outgoing_weight, outgoing_bias
def compute_empirical(step, layers, load_kwargs, empirical):
    """Write the empirical out-minus-in value for consecutive layer pairs.

    Loads the ``"weight"`` and ``"bias"`` features for ``step``, squares them,
    and for each layer after the first stores
    ``utils.out_synapses(W_layer ** 2)
    - utils.in_synapses(W_prev ** 2, b_prev ** 2)``
    in ``empirical[layer][step]``, where ``prev`` is the preceding entry of
    ``layers``.

    Args:
        step: Training step whose features are loaded.
        layers: Sequence of layer names; must support ``layers[0]`` and
            ``layers[1:]``.
        load_kwargs: Extra keyword arguments forwarded to
            ``utils.load_features``.
        empirical: Mapping ``layer -> {step: value}``; updated in place for
            every layer except the first.

    Returns:
        None. Results are written into ``empirical``.
    """
    weights = utils.load_features(
        steps=[str(step)], suffix="weight", **load_kwargs
    )
    biases = utils.load_features(
        steps=[str(step)], suffix="bias", **load_kwargs
    )

    step_key = f"step_{step}"
    first = layers[0]
    prev_sq_weight = weights[first][step_key] ** 2
    prev_sq_bias = biases[first][step_key] ** 2
    for name in layers[1:]:
        sq_weight = weights[name][step_key] ** 2
        sq_bias = biases[name][step_key] ** 2
        outgoing = utils.out_synapses(sq_weight)
        incoming = utils.in_synapses(prev_sq_weight, prev_sq_bias)
        empirical[name][step] = outgoing - incoming
        # Slide the window: this layer feeds the next pair.
        prev_sq_weight, prev_sq_bias = sq_weight, sq_bias
def compute_theoretical_momentum(
    step,
    layers,
    load_kwargs,
    theoretical,
    i,
    step_0,
    lr,
    wd,
    momentum,
    dampening,
    omega,
    gamma,
    W_0,
    b_0,
):
    """Write the theoretical per-layer value with momentum for one step.

    With ``t = lr * (1 - dampening) * step`` the value starts as
    ``scale * utils.in_synapses(W_0 ** 2, b_0 ** 2, dtype=np.float128)``.
    ``scale`` is chosen by comparing ``gamma`` with ``omega``: one closed form
    each for ``gamma < omega``, ``gamma == omega`` and the remaining case.
    When ``i > 0``, two further terms built from the ``integral_buffer_1`` and
    ``integral_buffer_2`` features are added. Each is added only if all of its
    weight and bias buffer entries are finite.

    Args:
        step: Training step being evaluated.
        layers: Iterable of layer names to process.
        load_kwargs: Extra keyword arguments forwarded to
            ``utils.load_features``.
        theoretical: Mapping ``layer -> {step: value}``; updated in place.
        i: Index of ``step``; the integral buffers are only used if ``i > 0``.
        step_0: Step under which the initial parameters are keyed.
        lr: Learning rate.
        wd: Not read by this function.
        momentum: Not read by this function.
        dampening: Dampening factor; enters ``t`` and the buffer prefactor.
        omega: Compared against ``gamma`` to select the closed form.
        gamma: Compared against ``omega`` to select the closed form.
        W_0: Initial weights, indexed as ``W_0[layer][f"step_{step_0}"]``.
        b_0: Initial biases, indexed as ``b_0[layer][f"step_{step_0}"]``.

    Returns:
        None. Results are written into ``theoretical``.
    """
    t = lr * (1 - dampening) * step
    use_integrals = i > 0

    def load_integral(suffix):
        return utils.load_features(
            steps=[str(step)], suffix=suffix, **load_kwargs
        )

    if use_integrals:
        weight_integrals_1 = load_integral("weight.integral_buffer_1")
        bias_integrals_1 = load_integral("bias.integral_buffer_1")
        weight_integrals_2 = load_integral("weight.integral_buffer_2")
        bias_integrals_2 = load_integral("bias.integral_buffer_2")

    def initial_condition_scale():
        # Factor applied to the initial squared parameters.
        if gamma < omega:
            freq = np.sqrt(omega ** 2 - gamma ** 2)
            cos = np.cos(freq * t)
            sin = np.sin(freq * t)
            return np.exp(-gamma * t) * (cos + gamma / freq * sin)
        if gamma == omega:
            return np.exp(-gamma * t) * (1 + gamma * t)
        alpha_p = -gamma + np.sqrt(gamma ** 2 - omega ** 2)
        alpha_m = -gamma - np.sqrt(gamma ** 2 - omega ** 2)
        numer = alpha_p * np.exp(alpha_m * t) - alpha_m * np.exp(alpha_p * t)
        denom = alpha_p - alpha_m
        return numer / denom

    def integral_scales():
        # Factors applied to integral buffers 1 and 2 respectively.
        if gamma < omega:
            freq = np.sqrt(omega ** 2 - gamma ** 2)
            first = np.exp(-gamma * t) * np.sin(freq * t) / freq
            second = -np.exp(-gamma * t) * np.cos(freq * t) / freq
            return first, second
        if gamma == omega:
            return np.exp(-gamma * t) * t, -np.exp(-gamma * t)
        root = np.sqrt(gamma ** 2 - omega ** 2)
        alpha_p = -gamma + root
        alpha_m = -gamma - root
        first = np.exp(alpha_p * t) / (alpha_p - alpha_m)
        second = -np.exp(alpha_m * t) / (alpha_p - alpha_m)
        return first, second

    initial_key = f"step_{step_0}"
    current_key = f"step_{step}"
    for name in layers:
        initial_weight = W_0[name][initial_key]
        initial_bias = b_0[name][initial_key]
        theoretical[name][step] = initial_condition_scale() * utils.in_synapses(
            initial_weight ** 2, initial_bias ** 2, dtype=np.float128
        )
        if not use_integrals:
            continue

        g_weight_1 = weight_integrals_1[name][current_key]
        g_bias_1 = bias_integrals_1[name][current_key]
        g_weight_2 = weight_integrals_2[name][current_key]
        g_bias_2 = bias_integrals_2[name][current_key]
        scale_1, scale_2 = integral_scales()

        # NOTE(review): this multiplies by 2, whereas the non-momentum
        # compute_theoretical uses lr ** 2 -- confirm the prefactor is intended.
        prefactor = (lr * (1 - dampening)) * 2

        # Non-finite buffers are skipped rather than propagated.
        first_is_finite = np.all(np.isfinite(g_weight_1)) and np.all(
            np.isfinite(g_bias_1)
        )
        if first_is_finite:
            theoretical[name][step] += (
                prefactor
                * scale_1
                * utils.in_synapses(g_weight_1, g_bias_1, dtype=np.float128)
            )
        second_is_finite = np.all(np.isfinite(g_weight_2)) and np.all(
            np.isfinite(g_bias_2)
        )
        if second_is_finite:
            theoretical[name][step] += (
                prefactor
                * scale_2
                * utils.in_synapses(g_weight_2, g_bias_2, dtype=np.float128)
            )