Example #1
def scale_momentum(model, feats_dir, steps, **kwargs):
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")
    momentum = kwargs.get("momentum")
    dampening = kwargs.get("dampening")

    lr = np.array(lr, dtype=np.float128)
    wd = np.array(wd, dtype=np.float128)
    momentum = np.array(momentum, dtype=np.float128)
    dampening = np.array(dampening, dtype=np.float128)

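    # Effective damping coefficient (gamma) and natural frequency (omega) of the
    # damped-oscillator form of the prediction computed in
    # compute_theoretical_momentum below; the underdamped, critically damped, and
    # overdamped regimes are separated by comparing gamma with omega.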
    denom = lr * (1 - dampening) * (1 + momentum)
    gamma = (1 - momentum) / denom
    omega = np.sqrt(4 * wd / denom)

    layers = [layer for layer in utils.get_layers(model) if "conv" in layer]
    W_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="weight",
        group="params",
    )
    b_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="bias",
        group="params",
    )

    load_kwargs = {
        "model": model,
        "feats_dir": feats_dir,
    }
    theory_kwargs = {
        "lr": lr,
        "wd": wd,
        "momentum": momentum,
        "dampening": dampening,
        "gamma": gamma,
        "omega": omega,
        "W_0": W_0,
        "b_0": b_0,
        "step_0": steps[0],
    }

    theoretical = {layer: {} for layer in layers}
    empirical = {layer: {} for layer in layers}
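    # At each saved step, fill the theoretical prediction (from the integral
    # buffers) and the empirical value (measured directly from the saved
    # parameters).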
    for i in tqdm(range(len(steps))):
        step = steps[i]
        theory_kwargs["i"] = i
        load_kwargs["group"] = "buffers"
        compute_theoretical_momentum(
            step, layers, load_kwargs, theoretical, **theory_kwargs,
        )
        load_kwargs["group"] = "params"
        compute_empirical(step, layers, load_kwargs, empirical)

    return {"empirical": empirical, "theoretical": theoretical}
Example #2
def gradient(model, feats_dir, steps, **kwargs):
    layers = [layer for layer in utils.get_layers(model) if "conv" in layer]

    empirical = {layer: {} for layer in layers}
    for i in tqdm(range(len(steps))):
        step = steps[i]
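        # Skip the initial step: no gradient-norm buffer is expected to exist there.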
        if step == 0:
            continue
        weight_buffers = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="weight.grad_norm_buffer",
            group="buffers",
        )
        bias_buffers = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="bias.grad_norm_buffer",
            group="buffers",
        )
        for layer in layers:
            wl_t = weight_buffers[layer][f"step_{step}"]
            bl_t = bias_buffers[layer][f"step_{step}"]
            empirical[layer][step] = utils.in_synapses(wl_t, bl_t)

    return {"empirical": empirical}
Example #3
def compute_empirical(step, layers, load_kwargs, empirical):
    weights = utils.load_features(steps=[str(step)], suffix="weight", **load_kwargs)
    biases = utils.load_features(steps=[str(step)], suffix="bias", **load_kwargs)
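    # Sum of squared parameters (weights and bias) over incoming synapses, per layer.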
    for layer in layers:
        Wl_t = weights[layer][f"step_{step}"]
        bl_t = biases[layer][f"step_{step}"]
        empirical[layer][step] = utils.in_synapses(Wl_t ** 2, bl_t ** 2)
Example #4
def compute_empirical(step, layers, load_kwargs, empirical):
    weights = utils.load_features(
        steps=[str(step)],
        suffix="weight",
        **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)],
        suffix="bias",
        **load_kwargs,
    )
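    # Treat the bias as an extra incoming weight (appended column) and aggregate
    # over outgoing synapses.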
    for layer in layers:
        wl_t = weights[layer][f"step_{step}"]
        bl_t = biases[layer][f"step_{step}"]
        Wl_t = np.column_stack((wl_t, bl_t))
        empirical[layer][step] = utils.out_synapses(Wl_t)
Example #5
def translation(model, feats_dir, steps, **kwargs):
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")

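    # The translation prediction is evaluated only on the classifier layers.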
    layers = [
        layer for layer in utils.get_layers(model) if "classifier" in layer
    ]
    W_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="weight",
        group="params",
    )
    b_0 = utils.load_features(
        steps=[str(steps[0])],
        feats_dir=feats_dir,
        model=model,
        suffix="bias",
        group="params",
    )

    load_kwargs = {
        "model": model,
        "feats_dir": feats_dir,
    }
    theory_kwargs = {
        "lr": lr,
        "wd": wd,
        "W_0": W_0,
        "b_0": b_0,
        "step_0": steps[0],
    }

    theoretical = {layer: {} for layer in layers}
    empirical = {layer: {} for layer in layers}
    for i in tqdm(range(len(steps))):
        step = steps[i]
        theory_kwargs["i"] = i
        load_kwargs["group"] = "buffers"
        compute_theoretical(step, layers, load_kwargs, theoretical,
                            **theory_kwargs)
        load_kwargs["group"] = "params"
        compute_empirical(step, layers, load_kwargs, empirical)

    return {"empirical": empirical, "theoretical": theoretical}
Example #6
def compute_theoretical(
    step,
    layers,
    load_kwargs,
    theoretical,
    i,
    step_0,
    lr,
    wd,
    W_0,
    b_0,
):
    t = lr * step
    if i > 0:
        weight_buffers = utils.load_features(
            steps=[str(step)],
            suffix="weight.integral_buffer",
            **load_kwargs,
        )
        bias_buffers = utils.load_features(
            steps=[str(step)],
            suffix="bias.integral_buffer",
            **load_kwargs,
        )

    W_in = np.exp(-2 * wd * t) * W_0[layers[0]][f"step_{step_0}"]**2
    b_in = np.exp(-2 * wd * t) * b_0[layers[0]][f"step_{step_0}"]**2
    if i > 0:
        g_W = weight_buffers[layers[0]][f"step_{step}"]
        g_b = bias_buffers[layers[0]][f"step_{step}"]
        W_in += (lr**2) * np.exp(-2 * wd * t) * g_W
        b_in += (lr**2) * np.exp(-2 * wd * t) * g_b
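    # For each subsequent layer, the prediction is that layer's outgoing aggregate
    # minus the previous layer's incoming aggregate; the current layer then becomes
    # the incoming side for the next iteration.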
    for layer in layers[1:]:
        W_out = np.exp(-2 * wd * t) * W_0[layer][f"step_{step_0}"]**2
        b_out = np.exp(-2 * wd * t) * b_0[layer][f"step_{step_0}"]**2
        if i > 0:
            g_W = weight_buffers[layer][f"step_{step}"]
            g_b = bias_buffers[layer][f"step_{step}"]
            W_out += (lr**2) * np.exp(-2 * wd * t) * g_W
            b_out += (lr**2) * np.exp(-2 * wd * t) * g_b
        theoretical[layer][step] = utils.out_synapses(
            W_out) - utils.in_synapses(W_in, b_in)
        W_in = W_out
        b_in = b_out
Example #7
def extract_weights_and_grads(step, layers, load_kwargs, weights_and_grads, **kwargs):
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")

    weights = utils.load_features(
        steps=[str(step)],
        suffix="weight",
        group="params",
        **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)],
        suffix="bias",
        group="params",
        **load_kwargs,
    )

    weight_buffers = utils.load_features(
        steps=[str(step)],
        suffix="weight.grad_buffer",
        group="buffers",
        **load_kwargs,
    )
    bias_buffers = utils.load_features(
        steps=[str(step)],
        suffix="bias.grad_buffer",
        group="buffers",
        **load_kwargs,
    )

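    # Flatten each layer's weights and bias (and their gradient buffers) into
    # single vectors and append them to the running per-layer lists.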
    for layer in layers:
        Wl_t = weights[layer][f"step_{step}"]
        bl_t = biases[layer][f"step_{step}"]
        weights_and_grads[layer]["weight"].append(
            np.concatenate((Wl_t.flatten(), bl_t.flatten()))
        )

        g_Wl_t = weight_buffers[layer][f"step_{step}"]
        g_bl_t = bias_buffers[layer][f"step_{step}"]
        weights_and_grads[layer]["grad"].append(
            np.concatenate((g_Wl_t.flatten(), g_bl_t.flatten()))
        )
Example #8
def compute_empirical(step, layers, load_kwargs, empirical):
    weights = utils.load_features(
        steps=[str(step)],
        suffix="weight",
        **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)],
        suffix="bias",
        **load_kwargs,
    )
    W_in = weights[layers[0]][f"step_{step}"]**2
    b_in = biases[layers[0]][f"step_{step}"]**2
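    # Difference between each layer's outgoing squared aggregate and the previous
    # layer's incoming squared aggregate; the current layer then becomes the
    # incoming side for the next iteration.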
    for layer in layers[1:]:
        W_out = weights[layer][f"step_{step}"]**2
        b_out = biases[layer][f"step_{step}"]**2
        empirical[layer][step] = utils.out_synapses(W_out) - utils.in_synapses(
            W_in, b_in)
        W_in = W_out
        b_in = b_out
Example #9
def compute_pos_vel(step, layers, load_kwargs, position, velocity, **kwargs):
    lr = kwargs.get("lr")
    wd = kwargs.get("wd")

    weights = utils.load_features(
        steps=[str(step)],
        suffix="weight",
        group="params",
        **load_kwargs,
    )
    biases = utils.load_features(
        steps=[str(step)],
        suffix="bias",
        group="params",
        **load_kwargs,
    )

    weight_buffers = utils.load_features(
        steps=[str(step)],
        suffix="weight.grad_norm_buffer",
        group="buffers",
        **load_kwargs,
    )
    bias_buffers = utils.load_features(
        steps=[str(step)],
        suffix="bias.grad_norm_buffer",
        group="buffers",
        **load_kwargs,
    )

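    # Position: squared parameters aggregated over incoming synapses.
    # Velocity: its rate of change as implied by the gradient-norm buffers
    # (formula below).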
    for layer in layers:
        Wl_t = weights[layer][f"step_{step}"]
        bl_t = biases[layer][f"step_{step}"]
        position[layer][step] = utils.in_synapses(Wl_t ** 2, bl_t ** 2)

        g_Wl_t = weight_buffers[layer][f"step_{step}"]
        g_bl_t = bias_buffers[layer][f"step_{step}"]
        # velocity = -2*lambda*|theta|^2 + eta*(|g|^2 - lambda^2*|theta|^2),
        # with eta = lr and lambda = wd
        velocity[layer][step] = lr * utils.in_synapses(g_Wl_t, g_bl_t)
        velocity[layer][step] -= (2 * wd + lr * wd ** 2) * position[layer][step]
Example #10
def network(model, feats_dir, steps, **kwargs):
    subset = kwargs.get("subset", None)
    seed = kwargs.get("seed", 0)
    layers = list(utils.get_layers(model))
    empirical = {layer: {} for layer in layers}
    for i in range(len(steps)):
        step = steps[i]
        weights = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="weight",
            group="params",
        )
        biases = utils.load_features(
            steps=[str(step)],
            feats_dir=feats_dir,
            model=model,
            suffix="bias",
            group="params",
        )
        np.random.seed(seed)
        for layer in layers:
            Wl_t = weights[layer][f"step_{step}"]
            bl_t = biases[layer][f"step_{step}"]
            all_weights = np.concatenate((Wl_t.reshape(-1), bl_t.reshape(-1)))
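            # Keep every parameter, or a fixed-seed random subset of at most
            # `subset` of them.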
            if subset is None:
                random_subset_idx = np.arange(len(all_weights))
            else:
                random_subset_idx = np.random.choice(
                    len(all_weights),
                    size=min(subset, len(all_weights)),
                    replace=False,
                )
            empirical[layer][step] = all_weights[random_subset_idx]

    return {"empirical": empirical}
Example #11
def compute_theoretical(
    step, layers, load_kwargs, theoretical, i, step_0, lr, wd, W_0, b_0,
):
    t = lr * step
    if i > 0:
        weight_buffers = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer", **load_kwargs,
        )
        bias_buffers = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer", **load_kwargs,
        )

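    # Per layer: exponentially decayed initial squared norm plus, after the first
    # step, an lr**2 * exp(-2*wd*t)-scaled contribution from the integral buffers.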
    for layer in layers:
        Wl_0 = W_0[layer][f"step_{step_0}"]
        bl_0 = b_0[layer][f"step_{step_0}"]
        theoretical[layer][step] = np.exp(-2 * wd * t) * utils.in_synapses(
            Wl_0 ** 2, bl_0 ** 2
        )
        if i > 0:
            g_Wl_t = weight_buffers[layer][f"step_{step}"]
            g_bl_t = bias_buffers[layer][f"step_{step}"]
            theoretical[layer][step] += (
                (lr ** 2) * np.exp(-2 * wd * t) * utils.in_synapses(g_Wl_t, g_bl_t)
            )
Example #12
def compute_theoretical_momentum(
    step,
    layers,
    load_kwargs,
    theoretical,
    i,
    step_0,
    lr,
    wd,
    momentum,
    dampening,
    omega,
    gamma,
    W_0,
    b_0,
):
    t = lr * (1 - dampening) * step

    if i > 0:
        weight_buffers_1 = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer_1", **load_kwargs,
        )
        bias_buffers_1 = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer_1", **load_kwargs,
        )
        weight_buffers_2 = utils.load_features(
            steps=[str(step)], suffix="weight.integral_buffer_2", **load_kwargs,
        )
        bias_buffers_2 = utils.load_features(
            steps=[str(step)], suffix="bias.integral_buffer_2", **load_kwargs,
        )

    for layer in layers:
        Wl_0 = W_0[layer][f"step_{step_0}"]
        bl_0 = b_0[layer][f"step_{step_0}"]
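        # Homogeneous (unforced) response of the damped oscillator: underdamped
        # (gamma < omega), critically damped (gamma == omega), or overdamped.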
        if gamma < omega:
            cos = np.cos(np.sqrt(omega ** 2 - gamma ** 2) * t)
            sin = np.sin(np.sqrt(omega ** 2 - gamma ** 2) * t)
            scale = np.exp(-gamma * t) * (
                cos + gamma / np.sqrt(omega ** 2 - gamma ** 2) * sin
            )
        elif gamma == omega:
            scale = np.exp(-gamma * t) * (1 + gamma * t)
        else:
            alpha_p = -gamma + np.sqrt(gamma ** 2 - omega ** 2)
            alpha_m = -gamma - np.sqrt(gamma ** 2 - omega ** 2)
            numer = alpha_p * np.exp(alpha_m * t) - alpha_m * np.exp(alpha_p * t)
            denom = alpha_p - alpha_m
            scale = numer / denom
        theoretical[layer][step] = scale * utils.in_synapses(
            Wl_0 ** 2, bl_0 ** 2, dtype=np.float128
        )
        if i > 0:
            g_Wl_t_1 = weight_buffers_1[layer][f"step_{step}"]
            g_bl_t_1 = bias_buffers_1[layer][f"step_{step}"]
            g_Wl_t_2 = weight_buffers_2[layer][f"step_{step}"]
            g_bl_t_2 = bias_buffers_2[layer][f"step_{step}"]

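            # Corresponding response factors multiplying the two gradient
            # integral buffers in each damping regime.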
            if gamma < omega:
                sqrt = np.sqrt(omega ** 2 - gamma ** 2)
                scale_1 = np.exp(-gamma * t) * np.sin(sqrt * t) / sqrt
                scale_2 = -np.exp(-gamma * t) * np.cos(sqrt * t) / sqrt

            elif gamma == omega:
                scale_1 = np.exp(-gamma * t) * t
                scale_2 = -np.exp(-gamma * t)

            else:
                sqrt = np.sqrt(gamma ** 2 - omega ** 2)
                alpha_p = -gamma + sqrt
                alpha_m = -gamma - sqrt
                scale_1 = np.exp(alpha_p * t) / (alpha_p - alpha_m)
                scale_2 = -np.exp(alpha_m * t) / (alpha_p - alpha_m)

            scale = (lr * (1 - dampening)) * 2
            if np.all(np.isfinite(g_Wl_t_1)) and np.all(np.isfinite(g_bl_t_1)):
                theoretical[layer][step] += (
                    scale
                    * scale_1
                    * utils.in_synapses(g_Wl_t_1, g_bl_t_1, dtype=np.float128)
                )
            if np.all(np.isfinite(g_Wl_t_2)) and np.all(np.isfinite(g_bl_t_2)):
                theoretical[layer][step] += (
                    scale
                    * scale_2
                    * utils.in_synapses(g_Wl_t_2, g_bl_t_2, dtype=np.float128)
                )