def test_lambda_lr(test_case):
    optimizer = flow.optim.SGD(
        [
            {"params": [Parameter(flow.Tensor([1.0]))]},
            {"params": [Parameter(flow.Tensor([1.0]))]},
        ],
        lr=TestLrScheduler.base_lr,
    )
    lambdas = [lambda step: step // 30, lambda step: 0.95 * step]

    def lambda_lr_step(base_lrs, current_step):
        return [
            base_lr * lmbda(current_step)
            for base_lr, lmbda in zip(base_lrs, lambdas)
        ]

    lambda_lr = flow.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambdas)
    for i in range(1, 21):
        lambda_lr.step()
        new_lrs = lambda_lr_step(lambda_lr.base_lrs, i)
        for lr1, lr2 in zip(lambda_lr.get_last_lr(), new_lrs):
            test_case.assertAlmostEqual(lr1, lr2, places=5)
def _apply(self, fn):
    # Apply `fn` recursively to every child module, then to this module's own
    # parameters (and their gradients) and buffers, rewrapping the results so
    # Parameter/requires_grad semantics are preserved.
    for module in self.children():
        module._apply(fn)

    for key, param in self._parameters.items():
        if param is not None:
            assert isinstance(param, Parameter)
            assert param.is_leaf
            with flow.no_grad():
                param_applied = fn(param)
            self._parameters[key] = Parameter(param_applied, param.requires_grad)

            if param.grad is not None:
                assert param.grad.is_leaf
                with flow.no_grad():
                    grad_applied = fn(param.grad)
                self._parameters[key].grad = grad_applied.requires_grad_(
                    param.grad.requires_grad
                )

    for key, buf in self._buffers.items():
        if buf is not None:
            self._buffers[key] = fn(buf)

    return self
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value))
    rmsprop = flow.optim.RMSprop(
        [{"params": [x]}],
        lr=learning_rate,
        momentum=momentum,
        scale=scale,
        alpha=alpha,
        eps=eps,
        weight_decay=weight_decay,
        centered=centered,
    )

    def train_one_iter(grad):
        grad_tensor = flow.Tensor(grad, requires_grad=False)
        # Reduce to a scalar loss so backward() needs no explicit gradient.
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        rmsprop.step()
        rmsprop.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
    return x
class TestLrScheduler(flow.unittest.TestCase):
    base_lr = 1.0
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=base_lr
    )

    def test_cosine_scheduler(test_case):
        def cosine_scheduler_step(base_lr, current_step, steps, alpha):
            if current_step < steps:
                cos_decay = 0.5 * (1 + math.cos(math.pi * current_step / steps))
                decay_factor = (1 - alpha) * cos_decay + alpha
                return base_lr * decay_factor
            else:
                return base_lr * alpha

        alpha = 0.5
        steps = 10
        cosine_scheduler = flow.optim.lr_scheduler.CosineScheduler(
            TestLrScheduler.optimizer, steps=steps, alpha=alpha
        )
        for i in range(1, 21):
            cosine_scheduler.step()
            new_lr = cosine_scheduler_step(TestLrScheduler.base_lr, i, steps, alpha)
            test_case.assertAlmostEqual(
                cosine_scheduler.get_last_lr()[0], new_lr, places=4
            )
def _apply(self, fn):
    for module in self.children():
        module._apply(fn)

    for key, param in self._parameters.items():
        if param is not None:
            assert isinstance(param, Parameter)
            assert param.is_leaf
            with flow.no_grad():
                # TODO(xuxiaoyu): remove Tensor convert after Tensor refactoring
                param_applied = Tensor(fn(param))
            self._parameters[key] = Parameter(param_applied, param.requires_grad)

            if param.grad is not None:
                assert param.grad.is_leaf
                with flow.no_grad():
                    # TODO(xuxiaoyu): remove Tensor convert after Tensor refactoring
                    grad_applied = Tensor(fn(param.grad))
                self._parameters[key].grad = grad_applied.requires_grad_(
                    param.grad.requires_grad
                )

    for key, buf in self._buffers.items():
        if buf is not None:
            # TODO(xuxiaoyu): remove Tensor convert after Tensor refactoring
            self._buffers[key] = Tensor(fn(buf))

    return self
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    adam = flow.optim.Adam(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "betas": betas,
                "eps": eps,
                "weight_decay": weight_decay,
                "scale": scale,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.Tensor(
            grad, requires_grad=False, device=flow.device(device)
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        adam.step()
        adam.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
    return x
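# A minimal NumPy sketch of the update the Adam training loop above exercises.
# It follows the textbook Adam rule with bias correction and deliberately
# ignores `weight_decay` and `scale`; the real kernel may handle those (and
# bias correction) differently, so treat this as an illustration of the math,
# not the exact reference used for comparison.
import numpy as np

def adam_reference(init_value, grad_seq, lr, betas, eps):
    x = np.array(init_value, dtype=np.float32)
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    beta1, beta2 = betas
    for t, g in enumerate(grad_seq, start=1):
        g = np.array(g, dtype=np.float32)
        m = beta1 * m + (1.0 - beta1) * g        # first-moment estimate
        v = beta2 * v + (1.0 - beta2) * g * g    # second-moment estimate
        m_hat = m / (1.0 - beta1 ** t)           # bias correction
        v_hat = v / (1.0 - beta2 ** t)
        x = x - lr * m_hat / (np.sqrt(v_hat) + eps)
    return x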
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    rmsprop = flow.optim.RMSprop(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "alpha": alpha,
                "eps": eps,
                "weight_decay": weight_decay,
                "momentum": momentum,
                "centered": centered,
                "scale": scale,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.Tensor(
            grad, requires_grad=False, device=flow.device(device)
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        rmsprop.step()
        rmsprop.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
    return x
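# A minimal NumPy sketch of the (optionally centered) RMSprop rule targeted by
# the loop above. `weight_decay`, `momentum`, and `scale` are left out to keep
# the core update readable; the actual kernel also handles those, so this is
# only an illustration of the math, not the comparison reference itself.
import numpy as np

def rmsprop_reference(init_value, grad_seq, lr, alpha, eps, centered=False):
    x = np.array(init_value, dtype=np.float32)
    sq_avg = np.zeros_like(x)  # running average of squared gradients
    g_avg = np.zeros_like(x)   # running average of gradients (centered variant)
    for g in grad_seq:
        g = np.array(g, dtype=np.float32)
        sq_avg = alpha * sq_avg + (1.0 - alpha) * g * g
        if centered:
            g_avg = alpha * g_avg + (1.0 - alpha) * g
            denom = np.sqrt(sq_avg - g_avg * g_avg) + eps
        else:
            denom = np.sqrt(sq_avg) + eps
        x = x - lr * g / denom
    return x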
def test_step_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def step_lr_step(base_lr, current_step, step_size, gamma):
        return base_lr * (gamma ** (current_step // step_size))

    gamma = 0.1
    step_size = 5
    step_lr = flow.optim.lr_scheduler.StepLR(
        optimizer, step_size=step_size, gamma=gamma
    )
    for i in range(1, 21):
        step_lr.step()
        new_lr = step_lr_step(TestLrScheduler.base_lr, i, step_size, gamma)
        test_case.assertAlmostEqual(step_lr.get_last_lr()[0], new_lr, places=5)
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value))
    sgd = flow.optim.SGD(
        [{"params": [x]}], lr=learning_rate, momentum=momentum, scale=scale
    )

    def train_one_iter(grad):
        grad_tensor = flow.Tensor(grad, requires_grad=False)
        # Reduce to a scalar loss so backward() needs no explicit gradient.
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        sgd.step()
        sgd.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
    return x
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    sgd = flow.optim.SGD(
        [{"params": [x], "lr": learning_rate, "momentum": momentum, "scale": scale}]
    )

    def train_one_iter(grad):
        grad_tensor = flow.Tensor(
            grad, requires_grad=False, device=flow.device(device)
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        sgd.step()
        sgd.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
    return x
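# A minimal NumPy sketch of SGD with momentum, matching the hyperparameters the
# loop above passes in (ignoring `scale`). The convention shown here, where the
# velocity accumulates the raw gradient and the step uses lr * velocity, is one
# common formulation; the exact kernel convention may differ slightly.
import numpy as np

def sgd_momentum_reference(init_value, grad_seq, lr, momentum):
    x = np.array(init_value, dtype=np.float32)
    velocity = np.zeros_like(x)
    for g in grad_seq:
        g = np.array(g, dtype=np.float32)
        velocity = momentum * velocity + g
        x = x - lr * velocity
    return x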
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value))
    adam = flow.optim.AdamW(
        [{"params": [x]}],
        lr=learning_rate,
        scale=scale,
        weight_decay=weight_decay,
    )

    def train_one_iter(grad):
        grad_tensor = flow.Tensor(grad, requires_grad=False)
        # Reduce to a scalar loss so backward() needs no explicit gradient.
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        adam.step()
        adam.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
    return x
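# A minimal NumPy sketch of what distinguishes AdamW from Adam with L2
# regularization: the weight decay is decoupled and applied directly to the
# weights instead of being folded into the gradient. Bias correction and
# `scale` are omitted, so this illustrates only the decoupling, not the exact
# reference the test compares against.
import numpy as np

def adamw_reference(init_value, grad_seq, lr, weight_decay,
                    beta1=0.9, beta2=0.999, eps=1e-8):
    x = np.array(init_value, dtype=np.float32)
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    for g in grad_seq:
        g = np.array(g, dtype=np.float32)
        m = beta1 * m + (1.0 - beta1) * g
        v = beta2 * v + (1.0 - beta2) * g * g
        # Decoupled weight decay: shrink weights independently of the gradient.
        x = x - lr * (m / (np.sqrt(v) + eps) + weight_decay * x)
    return x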
def __init__(self):
    super().__init__()
    self.w = Parameter(flow.Tensor(para))