def test_Adam_const_lr(self, dev=cpu_dev):
    cpu_dev.EnableGraph(False)
    opt1 = opt.Adam(lr=0.1)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(1.0)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

    # m := beta_1 * m + (1 - beta_1) * grad
    # v := beta_2 * v + (1 - beta_2) * grad * grad
    # m_norm = m / (1 - beta_1 ^ step)
    # v_norm = v / (1 - beta_2 ^ step)
    # param := param - (lr * m_norm) / (sqrt(v_norm) + epsilon)
    m = 0.1 * g
    tmp = tensor.square(g)
    v = 0.001 * tmp
    m_norm = m / 0.1
    v_norm = v / 0.001
    tmp = tensor.sqrt(v_norm) + 1e-8
    tmp = m_norm / tmp
    w_step1 = w - 0.1 * tmp

    opt1.apply(w.name, w, g)
    assertTensorEqual(w, w_step1, decimal=5)
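# A worked check (illustrative, not from the original source) of the value
# asserted above, using the Adam defaults implied by the 0.1 and 0.001
# factors in the test (beta_1=0.9, beta_2=0.999, epsilon=1e-8):
#   m       = (1 - 0.9)   * 0.1      = 0.01
#   v       = (1 - 0.999) * 0.1 ** 2 = 1e-5
#   m_norm  = 0.01 / (1 - 0.9 ** 1)   = 0.1
#   v_norm  = 1e-5 / (1 - 0.999 ** 1) = 0.01
#   w_step1 = 1.0 - 0.1 * 0.1 / (sqrt(0.01) + 1e-8) ~= 0.9 (elementwise)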
def test_AdaGrad_const_lr(self, dev=cpu_dev):
    cpu_dev.EnableGraph(False)
    opt1 = opt.AdaGrad(lr=0.1)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

    # history = history + param_grad * param_grad
    # param_value = param_value - lr * param_grad / sqrt(history + epsilon)
    history = tensor.square(g)
    tmp = history + 1e-8
    tmp = tensor.sqrt(tmp)
    tmp = g / tmp
    w_step1 = w - 0.1 * tmp

    opt1.apply(w.name, w, g)
    assertTensorEqual(w, w_step1)
def test_RMSProp_const_lr(self, dev=cpu_dev):
    cpu_dev.EnableGraph(False)
    opt1 = opt.RMSProp(lr=0.1)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

    # running_average = running_average * rho + param_grad * param_grad * (1 - rho)
    # param_value = param_value - lr * param_grad / sqrt(running_average + epsilon)
    running_average = 0.1 * tensor.square(g)
    tmp = running_average + 1e-8
    tmp = tensor.sqrt(tmp)
    tmp = g / tmp
    w_step1 = w - 0.1 * tmp

    opt1.apply(w.name, w, g)
    assertTensorEqual(w, w_step1)
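# A NumPy cross-check (an illustrative sketch, not part of the original test
# suite) of the first-step values asserted in the two tests above, with all
# tensor entries constant so scalars suffice; rho=0.9 is the RMSProp default
# implied by the 0.1 factor used in the test:
#
#   import numpy as np
#   w, g, lr, eps = 0.1, 0.1, 0.1, 1e-8
#   adagrad_w1 = w - lr * g / np.sqrt(g * g + eps)         # ~= 0.0
#   rmsprop_w1 = w - lr * g / np.sqrt(0.1 * g * g + eps)   # ~= -0.216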
def apply(self, param_name, param_value, param_grad):
    """Performs a single optimization step.

    Args:
        param_name(String): the name of the param
        param_value(Tensor): param values to be updated in place
        param_grad(Tensor): param gradients; the values may be modified
            in this function and should not be used afterwards
    """
    assert param_value.shape == param_grad.shape, ("shape mismatch",
                                                   param_value.shape,
                                                   param_grad.shape)
    self.device_check(param_value, self.step_counter, self.lr_value,
                      self.beta_1_value, self.beta_2_value,
                      self.epsilon_value, self.decay_value)

    # if self.decay_value != 0:
    if self.weight_decay.init_value != 0:
        singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)

    if param_name not in self.m:
        flag = param_value.device.graph_enabled()
        param_value.device.EnableGraph(False)
        self.m[param_name] = tensor.zeros_like(param_value)
        self.v[param_name] = tensor.zeros_like(param_value)
        param_value.device.EnableGraph(flag)

    # overall steps:
    # m := beta_1 * m + (1 - beta_1) * grad
    # v := beta_2 * v + (1 - beta_2) * grad * grad
    # m_norm = m / (1 - beta_1 ^ step)
    # v_norm = v / (1 - beta_2 ^ step)
    # param := param - (lr * m_norm) / (sqrt(v_norm) + epsilon)

    step = self.step_counter + 1.0

    # m := beta_1 * m + (1 - beta_1) * grad
    tmp = 1.0 - self.beta_1_value
    self.m[param_name] *= self.beta_1_value
    singa.Axpy(tmp.data, param_grad.data, self.m[param_name].data)

    # v := beta_2 * v + (1 - beta_2) * grad * grad
    tmp = 1.0 - self.beta_2_value
    self.v[param_name] *= self.beta_2_value
    singa.Axpy(tmp.data, singa.Square(param_grad.data),
               self.v[param_name].data)

    # m_norm = m / (1 - beta_1 ^ step)
    tmp = tensor.pow(self.beta_1_value, step)
    tmp = 1.0 - tmp
    m_norm = self.m[param_name] / tmp

    # v_norm = v / (1 - beta_2 ^ step)
    tmp = tensor.pow(self.beta_2_value, step)
    tmp = 1.0 - tmp
    v_norm = self.v[param_name] / tmp

    # param := param - (lr * m_norm) / (sqrt(v_norm) + epsilon)
    a = tensor.sqrt(v_norm) + self.epsilon_value
    tmp = m_norm / a

    minus_lr = 0.0 - self.lr_value
    singa.Axpy(minus_lr.data, tmp.data, param_value.data)
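# Illustrative usage of the Adam optimizer above (a sketch modelled on the
# unit tests, not an excerpt from the original source); `dev` stands for any
# SINGA device:
#
#   adam = opt.Adam(lr=0.1)
#   w = tensor.Tensor((2, 3), device=dev).set_value(1.0)
#   g = tensor.Tensor((2, 3), device=dev).set_value(0.1)
#   adam.apply(w.name, w, g)  # updates w in place; m/v state is keyed by w.name
#
# self.step_counter is assumed to be advanced elsewhere in the optimizer API
# (once per iteration) so that the bias-correction terms use the right step.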