Example #1
    def _new_update_deltas(self, network, parameter_vws, grads):
        update_deltas = treeano.UpdateDeltas()
        for parameter_vw, grad in zip(parameter_vws, grads):
            prev_grad, _ = update_utils.update_previous(
                network, update_deltas, grad, "grad(%s)" % parameter_vw.name,
                parameter_vw.shape)

            prev_update = network.create_vw(
                "quickprop_prev_update(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[treeano.inits.ConstantInit(1)],
            ).variable

            denom = prev_grad - grad
            # TODO parameterize
            epsilon = 1e-6
            denom = denom + treeano.utils.sign_non_zero(denom) * epsilon
            parameter_delta = prev_update * grad / denom

            parameter = parameter_vw.variable
            update_deltas[parameter] = parameter_delta
            update_deltas[prev_update] = parameter_delta - prev_update
        return update_deltas
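
The core of the quickprop step above is a secant approximation: the new step is the previous step scaled by grad / (prev_grad - grad), with the denominator nudged away from zero. A minimal, self-contained NumPy sketch of that per-element update (the function name and signature are illustrative, not part of treeano; np.where stands in for treeano.utils.sign_non_zero):

    import numpy as np

    def quickprop_delta(grad, prev_grad, prev_update, epsilon=1e-6):
        # change in the gradient since the last step, kept away from zero
        denom = prev_grad - grad
        denom = denom + np.where(denom >= 0, 1.0, -1.0) * epsilon
        # scale the previous update by the secant estimate
        return prev_update * grad / denom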
Example #2
    def _new_update_deltas(self, network, parameter_vws, grads):
        # alpha / stepsize / learning rate are all the same thing
        # using alpha because that is what is used in the paper
        alpha = network.find_hyperparameter(
            ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.001)
        beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.9)
        beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
        epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"],
                                              1e-8)

        update_deltas = treeano.UpdateDeltas()

        # keep count state only once
        t_vw = network.create_vw(
            "adam_count",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        t = t_vw.variable
        new_t = t + 1
        update_deltas[t] = new_t - t

        for parameter_vw, grad in zip(parameter_vws, grads):
            # biased 1st moment estimate
            # moving average of gradient
            m_vw = network.create_vw(
                "adam_m(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )
            # 2nd moment
            # moving average of squared gradient
            v_vw = network.create_vw(
                "adam_v(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )

            m = m_vw.variable
            v = v_vw.variable

            # new value for 1st moment estimate
            new_m = beta1 * m + (1 - beta1) * grad
            # new value for 2nd moment estimate
            new_v = beta2 * v + (1 - beta2) * T.sqr(grad)

            parameter_delta = -alpha * new_m / (T.sqrt(new_v) + epsilon)

            update_deltas[m] = new_m - m
            update_deltas[v] = new_v - v
            update_deltas[parameter_vw.variable] = parameter_delta

        return update_deltas
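
Note that this variant tracks the step count t but, within this snippet, never uses it: unlike the later Adam variants (e.g. Examples 8 and 13), it applies no bias correction to the moment estimates. A minimal NumPy sketch of the per-parameter update it computes (names are illustrative):

    import numpy as np

    def adam_step(param, grad, m, v, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        # exponential moving averages of the gradient and the squared gradient
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad ** 2
        # step taken without the usual 1 / (1 - beta**t) bias-correction terms
        param = param - alpha * m / (np.sqrt(v) + eps)
        return param, m, v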
Example #3
 def _new_update_deltas(self, network, parameter_vws, grads):
     learning_rate = network.find_hyperparameter(
         ["sgd_learning_rate", "learning_rate"], 0.1)
     # HACK changes the rest of this node... mostly restructuring
     deltas = {}
     for vw, grad in zip(parameter_vws, grads):
         initial_std = np.std(vw.value)
         # prevent multiplying by 0 std
         if initial_std == 0:
             initial_std = 1.0
         factor = treeano.utils.as_fX(-learning_rate * initial_std**2)
         deltas[vw.variable] = factor * grad
     return treeano.UpdateDeltas(deltas)
Example #4
    def _new_update_deltas(self, network, parameter_vws, grads):
        learning_rate = network.find_hyperparameter(["learning_rate"], 1e-2)
        momentum = network.find_hyperparameter(["momentum"], 0.9)
        rho = network.find_hyperparameter(["rho"], 0.95)
        epsilon = network.find_hyperparameter(
            ["std_rmsprop_epsilon", "epsilon"], 1e-8)

        update_deltas = treeano.UpdateDeltas()
        for parameter_vw, grad in zip(parameter_vws, grads):
            # exponential moving average of gradients for numerator
            g_avg_numer = network.create_vw(
                "std_rmsprop_gradients_momentum(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable
            # exponential moving average of gradients for denominator
            g_avg_denom = network.create_vw(
                "std_rmsprop_gradients(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable
            # exponential moving average of gradients squared
            g2_avg = network.create_vw(
                "std_rmsprop_gradients_squared(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable

            # updated state
            new_g_avg_numer = momentum * g_avg_numer + (1 - momentum) * grad
            new_g_avg_denom = rho * g_avg_denom + (1 - rho) * grad
            new_g2_avg = rho * g2_avg + (1 - rho) * T.sqr(grad)

            # calculate update
            std = T.sqrt(new_g2_avg - T.sqr(new_g_avg_denom) + epsilon)
            deltas = -learning_rate * new_g_avg_numer / std

            update_deltas[g_avg_numer] = new_g_avg_numer - g_avg_numer
            update_deltas[g_avg_denom] = new_g_avg_denom - g_avg_denom
            update_deltas[g2_avg] = new_g2_avg - g2_avg
            update_deltas[parameter_vw.variable] = deltas

        return update_deltas
Example #5
    def _new_update_deltas(self, network, parameter_vws, grads):
        # NOTE: in the paper, learning_rate is referred to as epsilon
        # not doing that here as it would be confusing
        learning_rate = network.find_hyperparameter(["learning_rate"], 0.01)
        # NOTE: this is referred to as lambda in the paper
        # NOTE: when doing hyperparameter selection in the paper,
        # they select from 1e-4, 1e-5, 1e-6
        damping_factor = network.find_hyperparameter(["damping_factor"], 1e-2)

        update_deltas = treeano.UpdateDeltas()

        k_vw = network.create_vw(
            "esgd_count",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        k = k_vw.variable
        new_k = k + 1
        update_deltas[k] = new_k - k

        for parameter_vw, grad in zip(parameter_vws, grads):
            D_vw = network.create_vw(
                "esgd_D(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )

            # TODO ESGD update should only occur every 20 iterations
            # to amortize cost
            parameter = parameter_vw.variable
            D = D_vw.variable
            # TODO save this state so that we can seed the rng
            srng = MRG_RandomStreams()
            # noise vector
            v = srng.normal(size=parameter.shape)
            Hv = T.Rop(grad, parameter, v)
            D_delta = T.sqr(Hv)
            new_D = D + D_delta
            # new_D / new_k is essentially a mean
            denominator = damping_factor + T.sqrt(new_D / new_k)
            parameter_delta = -learning_rate * grad / denominator
            update_deltas[parameter] = parameter_delta
            update_deltas[D] = D_delta
        return update_deltas
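
The T.Rop call above computes a Hessian-vector product Hv for a random Gaussian vector v; the square root of the running mean of (Hv)^2 is the equilibration preconditioner. A self-contained NumPy sketch of one step, where hess_vec_prod is an assumed callable standing in for T.Rop(grad, parameter, v) (names are illustrative, not part of treeano):

    import numpy as np

    def esgd_step(param, grad, D, k, hess_vec_prod,
                  learning_rate=0.01, damping=1e-2):
        # probe the curvature along a random Gaussian direction
        v = np.random.normal(size=param.shape)
        D = D + hess_vec_prod(v) ** 2  # accumulate squared Hessian-vector product
        k = k + 1
        # sqrt of the running mean of (Hv)^2 rescales each coordinate of the step
        param = param - learning_rate * grad / (damping + np.sqrt(D / k))
        return param, D, k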
Example #6
    def _new_update_deltas(self, network, parameter_vws, grads):
        learning_rate = network.find_hyperparameter(["learning_rate"], 1e-4)
        rho = network.find_hyperparameter(["rho"], 0.95)
        momentum = network.find_hyperparameter(["momentum"], 0.9)
        epsilon = network.find_hyperparameter(["epsilon"], 1e-4)

        update_deltas = treeano.UpdateDeltas()
        for parameter_vw, grad in zip(parameter_vws, grads):
            # momentum term
            delta = network.create_vw(
                "graves_rmsprop_delta(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable
            # exponential moving average of gradients
            g_avg = network.create_vw(
                "graves_rmsprop_gradients(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable
            # exponential moving average of gradients squared
            g2_avg = network.create_vw(
                "graves_rmsprop_gradients_squared(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            ).variable

            # updated gradients squared
            new_g_avg = rho * g_avg + (1 - rho) * grad
            new_g2_avg = rho * g2_avg + (1 - rho) * T.sqr(grad)

            # calculate update
            std = T.sqrt(new_g2_avg - T.sqr(new_g_avg) + epsilon)
            new_delta = momentum * delta - learning_rate * grad / std

            update_deltas[g_avg] = new_g_avg - g_avg
            update_deltas[g2_avg] = new_g2_avg - g2_avg
            update_deltas[delta] = new_delta - delta
            update_deltas[parameter_vw.variable] = new_delta

        return update_deltas
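
This is the RMSProp variant in which momentum is applied to the scaled step itself and the gradient is normalized by a running estimate of its per-coordinate standard deviation (hence the graves_rmsprop naming). A minimal NumPy sketch of one step (names are illustrative):

    import numpy as np

    def graves_rmsprop_step(param, grad, delta, g_avg, g2_avg,
                            learning_rate=1e-4, rho=0.95, momentum=0.9, eps=1e-4):
        g_avg = rho * g_avg + (1 - rho) * grad
        g2_avg = rho * g2_avg + (1 - rho) * grad ** 2
        # running per-coordinate standard deviation of the gradient
        std = np.sqrt(g2_avg - g_avg ** 2 + eps)
        # momentum acts on the normalized step, not on the raw gradient
        delta = momentum * delta - learning_rate * grad / std
        return param + delta, delta, g_avg, g2_avg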
Example #7
 def _new_update_deltas(self, network, parameter_vws, grads):
     learning_rate = network.find_hyperparameter(["learning_rate"], 0.001)
     epsilon = network.find_hyperparameter(["epsilon"], 1e-16)
     update_deltas = treeano.UpdateDeltas()
     for parameter_vw, grad in zip(parameter_vws, grads):
         mem_vw = network.create_vw(
             "smorms3_mem(%s)" % parameter_vw.name,
             shape=parameter_vw.shape,
             is_shared=True,
             tags={"state"},
             default_inits=[treeano.inits.ConstantInit(1)],
         )
         g_vw = network.create_vw(
             "smorms3_g(%s)" % parameter_vw.name,
             shape=parameter_vw.shape,
             is_shared=True,
             tags={"state"},
             default_inits=[],
         )
         g2_vw = network.create_vw(
             "smorms3_g2(%s)" % parameter_vw.name,
             shape=parameter_vw.shape,
             is_shared=True,
             tags={"state"},
             default_inits=[],
         )
         parameter = parameter_vw.variable
         mem = mem_vw.variable
         g = g_vw.variable
         g2 = g2_vw.variable
         r = 1 / (mem + 1)
         new_g = (1 - r) * g + r * grad
         new_g2 = (1 - r) * g2 + r * grad**2
         term1 = (new_g**2) / (new_g2 + epsilon)
         term2 = T.sqrt(new_g2) + epsilon
         parameter_delta = -grad * T.minimum(learning_rate, term1) / term2
         new_mem = 1 + mem * (1 - term1)
         update_deltas[parameter] = parameter_delta
         update_deltas[mem] = new_mem - mem
         update_deltas[g] = new_g - g
         update_deltas[g2] = new_g2 - g2
     return update_deltas
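
SMORMS3 picks a per-coordinate step size as the minimum of the learning rate and a signal-to-noise estimate g^2 / g2, and reuses that estimate to reset its memory. A minimal NumPy sketch of the update above (names are illustrative):

    import numpy as np

    def smorms3_step(param, grad, mem, g, g2, learning_rate=0.001, eps=1e-16):
        r = 1.0 / (mem + 1.0)
        g = (1 - r) * g + r * grad
        g2 = (1 - r) * g2 + r * grad ** 2
        # signal-to-noise ratio of the smoothed gradient, roughly in [0, 1]
        snr = g ** 2 / (g2 + eps)
        param = param - grad * np.minimum(learning_rate, snr) / (np.sqrt(g2) + eps)
        # memory resets toward 1 when the gradient is consistent (high snr)
        # and grows when it is noisy (low snr)
        mem = 1 + mem * (1 - snr)
        return param, mem, g, g2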
Example #8
    def _new_update_deltas(self, network, parameter_vws, grads):
        # alpha / stepsize / learning rate are all the same thing
        # using alpha because that is what is used in the paper
        alpha = network.find_hyperparameter(["adam_learning_rate",
                                             "adam_alpha",
                                             "learning_rate"],
                                            0.001)
        beta1 = network.find_hyperparameter(["adam_beta1",
                                             "beta1"],
                                            0.9)
        beta2 = network.find_hyperparameter(["adam_beta2",
                                             "beta2"],
                                            0.999)
        epsilon = network.find_hyperparameter(["adam_epsilon",
                                               "epsilon"],
                                              1e-8)

        update_deltas = treeano.UpdateDeltas()

        # keep count state only once
        t_vw = network.create_vw(
            "adam_count",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        t = t_vw.variable
        new_t = t + 1
        update_deltas[t] = new_t - t

        # compute some values only once
        # unbias terms to take into account initializing with 0
        # NOTE: unbias terms assume constant beta1/beta2
        m_unbias_term = 1 - beta1 ** new_t
        v_unbias_term = T.sqrt(1 - beta2 ** new_t)
        epsilon_hat = epsilon * v_unbias_term
        alpha_t = alpha * v_unbias_term / m_unbias_term

        for parameter_vw, grad in zip(parameter_vws, grads):
            # biased 1st moment estimate
            # moving average of gradient
            m_vw = network.create_vw(
                "adam_m(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )
            # 2nd moment
            # moving average of squared gradient
            v_vw = network.create_vw(
                "adam_v(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )
            # another moving average of gradient
            g_vw = network.create_vw(
                "adam_g(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )

            m = m_vw.variable
            v = v_vw.variable
            g = g_vw.variable

            # new value for 1st moment estimate
            new_m = beta1 * m + (1 - beta1) * grad
            # new value for 2nd moment estimate
            new_v = beta2 * v + (1 - beta2) * T.sqr(grad)
            new_g = beta2 * g + (1 - beta2) * grad

            parameter_delta = - alpha_t * new_m / (T.sqrt(new_v - T.sqr(new_g))
                                                   + epsilon_hat)

            update_deltas[m] = new_m - m
            update_deltas[v] = new_v - v
            update_deltas[g] = new_g - g
            update_deltas[parameter_vw.variable] = parameter_delta

        return update_deltas
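
The m_unbias_term / v_unbias_term algebra above folds Adam's bias correction into alpha_t and epsilon_hat instead of dividing m and v explicitly; the same trick appears in Examples 9 and 13. A quick numeric check, using the plain Adam denominator sqrt(v), that the folding is exact (values are arbitrary, not taken from the code above):

    import numpy as np

    alpha, beta1, beta2, eps, t = 0.001, 0.9, 0.999, 1e-8, 7
    m, v = 0.03, 0.002  # current (biased) moment estimates

    # textbook form: unbias the moments, then step
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    textbook = -alpha * m_hat / (np.sqrt(v_hat) + eps)

    # folded form, as computed above
    alpha_t = alpha * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    eps_hat = eps * np.sqrt(1 - beta2 ** t)
    folded = -alpha_t * m / (np.sqrt(v) + eps_hat)

    assert np.isclose(textbook, folded)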
Example #9
    def _new_update_deltas(self, network, parameter_vws, grads):
        # alpha / stepsize / learning rate are all the same thing
        # using alpha because that is what is used in the paper
        alpha = network.find_hyperparameter(["learning_rate"], 0.001)
        beta1 = network.find_hyperparameter(["beta1"], 0.9)
        beta2 = network.find_hyperparameter(["beta2"], 0.999)
        epsilon = network.find_hyperparameter(["epsilon"], 1e-8)
        constant_root = network.find_hyperparameter(["constant_root"], None)
        normalize_denominator = network.find_hyperparameter(
            ["normalize_denominator"], True)

        update_deltas = treeano.UpdateDeltas()

        # keep count state only once
        t_vw = network.create_vw(
            "adaadam_count",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        t = t_vw.variable
        new_t = t + 1
        update_deltas[t] = new_t - t

        # compute some values only once
        # unbias terms to take into account initializing with 0
        # NOTE: unbias terms assume constant beta1/beta2
        m_unbias_term = 1 - beta1**new_t
        v_unbias_term = T.sqrt(1 - beta2**new_t)
        epsilon_hat = epsilon * v_unbias_term
        alpha_t = alpha * v_unbias_term / m_unbias_term

        if constant_root is None:
            h = network.find_hyperparameter(["half_life_batches"])
            # heuristic: set as half_life_batches by default
            c = network.find_hyperparameter(["clipped_batches"], h)
            f = 2.0**(1. / h)
            w0 = 2.0 * (1 / f)**c
            w_state = network.create_vw(
                "adaadam_w",
                shape=(),
                is_shared=True,
                tags={"state"},
                default_inits=[treeano.inits.ConstantInit(w0)],
            ).variable
            update_deltas[w_state] = w_state * f - w_state
            # TODO parameterize bounds
            w = T.clip(w_state, 2.0, 10000.0)
        else:
            w = constant_root

        for parameter_vw, grad in zip(parameter_vws, grads):
            # biased 1st moment estimate
            # moving average of gradient
            m_vw = network.create_vw(
                "adaadam_m(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )
            # 2nd moment
            # moving average of squared gradient
            v_vw = network.create_vw(
                "adaadam_v(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )

            m = m_vw.variable
            v = v_vw.variable

            # new value for 1st moment estimate
            new_m = beta1 * m + (1 - beta1) * grad
            # new value for 2nd moment estimate
            new_v = beta2 * v + (1 - beta2) * T.sqr(grad)

            orig_denom = T.sqrt(new_v)
            denom = T.pow(new_v, 1. / w)
            # FIXME try w/ and w/o normalizer
            if normalize_denominator:
                denom_normalizer = ((orig_denom.sum() + 1e-8) /
                                    (denom.sum() + 1e-8))
            else:
                denom_normalizer = 1

            if 1:
                parameter_delta = -alpha_t * new_m / (
                    (denom + epsilon_hat) * denom_normalizer)
            else:
                parameter_delta = -alpha_t * new_m / (
                    denom * denom_normalizer + epsilon_hat)

            update_deltas[m] = new_m - m
            update_deltas[v] = new_v - v
            update_deltas[parameter_vw.variable] = parameter_delta

        return update_deltas
Example #10
 def _new_update_deltas(self, network, vws, grads):
     # NOTE: `const` is not defined in this snippet; it is presumably a fixed
     # value available on the enclosing node (every parameter gets the same delta)
     return treeano.UpdateDeltas({vw.variable: const for vw in vws})
Example #11
    def _new_update_deltas(self, network, parameter_vws, grads):
        # alpha / stepsize / learning rate are all the same thing
        # using alpha because that is what is used in the paper
        alpha = network.find_hyperparameter(
            ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.002)
        beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.975)
        beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
        epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"],
                                              1e-8)

        update_deltas = treeano.UpdateDeltas()

        # keep count state only once
        t_vw = network.create_vw(
            "adam_count",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        t = t_vw.variable
        new_t = t + 1
        update_deltas[t] = new_t - t

        # compute some values only once
        # unbias terms to take into account initializing with 0
        # NOTE: unbias terms assume constant beta1/beta2
        m_unbias_term1 = 1 - beta1**new_t
        m_unbias_term2 = 1 - beta1**(new_t + 1)
        v_unbias_term = T.sqrt(1 - beta2**new_t)

        for parameter_vw, grad in zip(parameter_vws, grads):
            # biased 1st moment estimate
            # moving average of gradient
            m_vw = network.create_vw(
                "adam_m(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )
            # 2nd moment
            # moving average of squared gradient
            v_vw = network.create_vw(
                "adam_v(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )

            m = m_vw.variable
            v = v_vw.variable

            # new value for 1st moment estimate
            new_m = beta1 * m + (1 - beta1) * grad
            # new value for 2nd moment estimate
            new_v = beta2 * v + (1 - beta2) * T.sqr(grad)

            numer = (beta1 * new_m / m_unbias_term2 +
                     (1 - beta1) * grad / m_unbias_term1)
            # NOTE: nadam paper has epsilon inside sqrt, but leaving it outside
            # for consistency with adam
            denom = T.sqrt(beta2 * new_v / v_unbias_term) + epsilon
            parameter_delta = -alpha * numer / denom

            update_deltas[m] = new_m - m
            update_deltas[v] = new_v - v
            update_deltas[parameter_vw.variable] = parameter_delta

        return update_deltas
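
The numerator above mixes the updated momentum and the raw gradient, each with its own bias correction, in the Nesterov style; note that the defaults also differ from plain Adam (alpha 0.002, beta1 0.975). A NumPy sketch of one step that simply mirrors the expressions above, including the epsilon placement discussed in the comment (names are illustrative):

    import numpy as np

    def nadam_step(param, grad, m, v, t,
                   alpha=0.002, beta1=0.975, beta2=0.999, eps=1e-8):
        t = t + 1
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad ** 2
        # look-ahead: the momentum term is unbiased one step further ahead
        # than the raw-gradient term
        numer = (beta1 * m / (1 - beta1 ** (t + 1)) +
                 (1 - beta1) * grad / (1 - beta1 ** t))
        # denominator written exactly as in the example above
        denom = np.sqrt(beta2 * v / np.sqrt(1 - beta2 ** t)) + eps
        return param - alpha * numer / denom, m, v, t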
Example #12
 def new_update_deltas(self, network):
     batch_idx = network.get_vw("batch_idx").variable
     ud = treeano.UpdateDeltas()
     ud[batch_idx] = treeano.utils.as_fX(1)
     return ud
Example #13
    def _new_update_deltas(self, network, parameter_vws, grads):
        # alpha / stepsize / learning rate are all the same thing
        # using alpha because that is what is used in the paper
        alpha = network.find_hyperparameter(
            ["adam_learning_rate", "adam_alpha", "learning_rate"], 0.001)
        beta1 = network.find_hyperparameter(["adam_beta1", "beta1"], 0.9)
        beta2 = network.find_hyperparameter(["adam_beta2", "beta2"], 0.999)
        epsilon = network.find_hyperparameter(["adam_epsilon", "epsilon"],
                                              1e-8)
        # HACK part 1: different from adam
        scale_fn = network.find_hyperparameter(["scale_function"],
                                               treeano.utils.identity)

        update_deltas = treeano.UpdateDeltas()

        # keep count state only once
        t_vw = network.create_vw(
            "adam_count",
            shape=(),
            is_shared=True,
            tags={"state"},
            default_inits=[],
        )
        t = t_vw.variable
        new_t = t + 1
        update_deltas[t] = new_t - t

        # compute some values only once
        # unbias terms to take into account initializing with 0
        # NOTE: unbias terms assume constant beta1/beta2
        m_unbias_term = 1 - beta1**new_t
        v_unbias_term = T.sqrt(1 - beta2**new_t)
        epsilon_hat = epsilon * v_unbias_term
        alpha_t = alpha * v_unbias_term / m_unbias_term

        for parameter_vw, grad in zip(parameter_vws, grads):
            # biased 1st moment estimate
            # moving average of gradient
            m_vw = network.create_vw(
                "adam_m(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )
            # 2nd moment
            # moving average of squared gradient
            v_vw = network.create_vw(
                "adam_v(%s)" % parameter_vw.name,
                shape=parameter_vw.shape,
                is_shared=True,
                tags={"state"},
                default_inits=[],
            )

            m = m_vw.variable
            v = v_vw.variable

            # new value for 1st moment estimate
            new_m = beta1 * m + (1 - beta1) * grad
            # new value for 2nd moment estimate
            new_v = beta2 * v + (1 - beta2) * T.sqr(grad)

            parameter_delta = -alpha_t * new_m / (T.sqrt(new_v) + epsilon_hat)

            # HACK part 2: different from standard adam
            initial_std = treeano.utils.as_fX(np.std(parameter_vw.value))
            # prevent multiplying by 0 std
            if initial_std > 0:
                parameter_delta *= scale_fn(initial_std)

            update_deltas[m] = new_m - m
            update_deltas[v] = new_v - v
            update_deltas[parameter_vw.variable] = parameter_delta

        return update_deltas