Example #1
    def test_Adam_const_lr(self, dev=cpu_dev):
        cpu_dev.EnableGraph(False)
        opt1 = opt.Adam(lr=0.1)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(1.0)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

        # m := beta_1 * m + (1 - beta_1) * grad
        # v := beta_2 * v + (1 - beta_2) * grad * grad
        # m_norm = m / (1 - beta_1 ^ step)
        # v_norm = v / (1 - beta_2 ^ step)
        # param := param - (lr * m_norm) / (sqrt(v_norm) + epsilon)

        m = 0.1 * g
        tmp = tensor.square(g)
        v = 0.001 * tmp

        m_norm = m / 0.1
        v_norm = v / 0.001

        tmp = tensor.sqrt(v_norm) + 1e-8
        tmp = m_norm / tmp

        w_step1 = w - 0.1 * tmp
        opt1.apply(w.name, w, g)

        assertTensorEqual(w, w_step1, decimal=5)
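The test replays the same arithmetic on the host side. As a quick cross-check, here is a minimal NumPy sketch of that first Adam step (assuming the optimizer defaults beta_1=0.9, beta_2=0.999 and epsilon=1e-8 implied by the 0.1 and 0.001 factors above); every entry should move from 1.0 to roughly 0.9:

import numpy as np

w = np.full((2, 3), 1.0)
g = np.full((2, 3), 0.1)
lr, beta_1, beta_2, eps = 0.1, 0.9, 0.999, 1e-8

m = (1 - beta_1) * g                  # m starts at zero
v = (1 - beta_2) * g * g              # v starts at zero
m_norm = m / (1 - beta_1 ** 1)        # bias correction at step 1
v_norm = v / (1 - beta_2 ** 1)
w_step1 = w - lr * m_norm / (np.sqrt(v_norm) + eps)
print(w_step1)                        # ~0.9 everywhere
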
Example #2
    def test_AdaGrad_const_lr(self, dev=cpu_dev):
        cpu_dev.EnableGraph(False)
        opt1 = opt.AdaGrad(lr=0.1)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

        # history = history + param_grad * param_grad
        # param_value = param_value - lr * param_grad / sqrt(history + epsilon)

        history = tensor.square(g)
        tmp = history + 1e-8
        tmp = tensor.sqrt(tmp)
        tmp = g / tmp

        w_step1 = w - 0.1 * tmp
        opt1.apply(w.name, w, g)

        assertTensorEqual(w, w_step1)
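The same check works in plain NumPy (a minimal sketch, assuming epsilon=1e-8 as in the test); starting from 0.1, the first AdaGrad step should land at roughly 0.0:

import numpy as np

w = np.full((2, 3), 0.1)
g = np.full((2, 3), 0.1)
lr = 0.1

history = g * g                                  # accumulator starts at zero
w_step1 = w - lr * g / np.sqrt(history + 1e-8)
print(w_step1)                                   # ~0.0 everywhere
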
Example #3
    def test_RMSProp_const_lr(self, dev=cpu_dev):
        cpu_dev.EnableGraph(False)
        opt1 = opt.RMSProp(lr=0.1)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

        # running_average = running_average * rho + param_grad * param_grad * (1 - rho)
        # param_value = param_value - lr * param_grad / sqrt(running_average + epsilon)

        running_average = 0.1 * tensor.square(g)
        tmp = running_average + 1e-8
        tmp = tensor.sqrt(tmp)
        tmp = g / tmp

        w_step1 = w - 0.1 * tmp
        opt1.apply(w.name, w, g)

        assertTensorEqual(w, w_step1)
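Again, a minimal NumPy sketch of the first RMSProp step (assuming the defaults rho=0.9 and epsilon=1e-8 implied by the 0.1 factor above); each entry should end up near -0.216:

import numpy as np

w = np.full((2, 3), 0.1)
g = np.full((2, 3), 0.1)
lr, rho = 0.1, 0.9

running_average = (1 - rho) * g * g                      # running average starts at zero
w_step1 = w - lr * g / np.sqrt(running_average + 1e-8)
print(w_step1)                                           # ~-0.216 everywhere
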
Example #4
    def apply(self, param_name, param_value, param_grad):
        """Performs a single optimization step.

        Args:
                param_name(String): the name of the param
                param_value(Tensor): param values to be updated in-place
                param_grad(Tensor): param gradients; the values may be updated
                        in this function and must not be used afterwards
        """
        assert param_value.shape == param_grad.shape, ("shape mismatch",
                                                       param_value.shape,
                                                       param_grad.shape)
        self.device_check(param_value, self.step_counter, self.lr_value,
                          self.beta_1_value, self.beta_2_value,
                          self.epsilon_value, self.decay_value)

        # if self.decay_value != 0:
        if self.weight_decay.init_value != 0:
            singa.Axpy(self.decay_value.data, param_value.data,
                       param_grad.data)

        if param_name not in self.m:
            flag = param_value.device.graph_enabled()
            param_value.device.EnableGraph(False)
            self.m[param_name] = tensor.zeros_like(param_value)
            self.v[param_name] = tensor.zeros_like(param_value)
            param_value.device.EnableGraph(flag)

        # overall steps
        # m := beta_1 * m + (1 - beta_1) * grad
        # v := beta_2 * v + (1 - beta_2) * grad * grad
        # m_norm = m / (1 - beta_1 ^ step)
        # v_norm = v / (1 - beta_2 ^ step)
        # param := param - (lr * m_norm) / (sqrt(v_norm) + epsilon)

        step = self.step_counter + 1.0

        # m := beta_1 * m + (1 - beta_1) * grad
        tmp = 1.0 - self.beta_1_value
        self.m[param_name] *= self.beta_1_value
        singa.Axpy(tmp.data, param_grad.data, self.m[param_name].data)

        # v := beta_2 * v + (1 - beta_2) * grad * grad
        tmp = 1.0 - self.beta_2_value
        self.v[param_name] *= self.beta_2_value
        singa.Axpy(tmp.data, singa.Square(param_grad.data),
                   self.v[param_name].data)

        # m_norm = m / (1 - beta_1 ^ step)
        tmp = tensor.pow(self.beta_1_value, step)
        tmp = 1.0 - tmp
        m_norm = self.m[param_name] / tmp

        # v_norm = v / (1 - beta_2 ^ step)
        tmp = tensor.pow(self.beta_2_value, step)
        tmp = 1.0 - tmp
        v_norm = self.v[param_name] / tmp

        # param := param - (lr * m_norm) / (sqrt(v_norm) + epsilon)
        a = tensor.sqrt(v_norm) + self.epsilon_value
        tmp = m_norm / a

        minus_lr = 0.0 - self.lr_value
        singa.Axpy(minus_lr.data, tmp.data, param_value.data)
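
For context, a rough sketch of how this apply method is typically driven from a training loop (the constructor and tensor setup mirror the tests above; the repeated constant gradient and the step() call that advances the bias-correction counter between iterations are assumptions made for illustration):

from singa import opt, tensor

adam = opt.Adam(lr=0.1)
w = tensor.Tensor((2, 3)).set_value(1.0)

for _ in range(3):
    g = tensor.Tensor((2, 3)).set_value(0.1)   # stand-in gradient
    adam.apply(w.name, w, g)                   # updates w in place; g must not be reused
    adam.step()                                # assumed counter increment between iterations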