def test_lambda_lr(test_case):
    optimizer = flow.optim.SGD(
        [
            {"params": [Parameter(flow.Tensor([1.0]))]},
            {"params": [Parameter(flow.Tensor([1.0]))]},
        ],
        lr=TestLrScheduler.base_lr,
    )
    lambdas = [lambda step: step // 30, lambda step: 0.95 * step]

    def lambda_lr_step(base_lrs, current_step):
        return [
            base_lr * lmbda(current_step)
            for (base_lr, lmbda) in zip(base_lrs, lambdas)
        ]

    lambda_lr = flow.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambdas)
    for i in range(1, 21):
        lambda_lr.step()
        new_lrs = lambda_lr_step(lambda_lr.base_lrs, i)
        for (lr1, lr2) in zip(lambda_lr.get_last_lr(), new_lrs):
            test_case.assertAlmostEqual(lr1, lr2, places=5)
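# A minimal standalone LambdaLR usage sketch (not part of the test above),
# assuming a throwaway flow.nn.Linear model; it illustrates what the test
# verifies: each step() scales the group's base lr by lr_lambda(step).
def _lambda_lr_usage_sketch():
    net = flow.nn.Linear(4, 4)
    opt = flow.optim.SGD(net.parameters(), lr=0.1)
    sched = flow.optim.lr_scheduler.LambdaLR(opt, lr_lambda=lambda step: 0.95 ** step)
    for step in range(5):
        loss = net(flow.randn(2, 4)).sum()
        loss.backward()
        opt.step()
        opt.zero_grad()
        sched.step()
        # sched.get_last_lr() should now be [0.1 * 0.95 ** (step + 1)]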
def rebuild_tensor(cls, tensor_data, requires_grad):
    t = flow.tensor(tensor_data)
    if cls == Parameter:
        # We have to pass requires_grad into the constructor rather than set it
        # as an attribute later, because integer tensors must be created with
        # requires_grad=False (otherwise they raise an error).
        t = Parameter(t, requires_grad=requires_grad)
    else:
        t.requires_grad = requires_grad
    return t
def test_cosine_annealing_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def cosine_annealing_lr_step(base_lr, current_step, last_lr, T_max, eta_min):
        if (current_step - 1 - T_max) % (2 * T_max) == 0:
            return last_lr + (TestLrScheduler.base_lr - eta_min) * (
                1 - math.cos(math.pi / T_max)
            ) / 2
        else:
            return (1 + math.cos(math.pi * current_step / T_max)) / (
                1 + math.cos(math.pi * (current_step - 1) / T_max)
            ) * (last_lr - eta_min) + eta_min

    T_max = 20
    eta_min = 0.5
    cosine_annealing_lr = flow.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=T_max, eta_min=eta_min
    )
    numpy_last_lr = TestLrScheduler.base_lr
    for i in range(1, 101):
        cosine_annealing_lr.step()
        numpy_last_lr = cosine_annealing_lr_step(
            TestLrScheduler.base_lr, i, numpy_last_lr, T_max, eta_min
        )
        test_case.assertAlmostEqual(
            cosine_annealing_lr.get_last_lr()[0], numpy_last_lr, places=4
        )
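# Hedged note: the chained recursion above should agree with the closed form
# eta_t = eta_min + (base_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2.
# A direct reference for cross-checking (math imported as above):
def cosine_annealing_closed_form(base_lr, t, T_max, eta_min):
    return eta_min + (base_lr - eta_min) * (1 + math.cos(math.pi * t / T_max)) / 2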
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    sgd = flow.optim.SGD(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "momentum": momentum,
                "weight_decay": weight_decay,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        sgd.step()
        sgd.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        # test state_dict/load_state_dict
        if i == reload_state_step:
            state_dict = sgd.state_dict()
            sgd = flow.optim.SGD([x])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            sgd.load_state_dict(state_dict)
    return x
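# Hedged numpy reference for the SGD run above (a sketch, not the test's
# actual comparison code), assuming the conventional update: fold L2 weight
# decay into the gradient, then v = momentum * v + g and x -= lr * v.
def train_by_numpy():
    x = np.array(init_value, dtype=np.float32)
    v = np.zeros_like(x)
    for grad in random_grad_seq[:train_iters]:
        g = grad + weight_decay * x  # L2 weight decay folded into the gradient
        v = momentum * v + g         # momentum buffer
        x = x - learning_rate * v
    return x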
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    adagrad = flow.optim.Adagrad(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "eps": eps,
                "weight_decay": weight_decay,
            }
        ],
        lr_decay=lr_decay,
        initial_accumulator_value=initial_accumulator_value,
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad, requires_grad=False, device=flow.device(device)
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        adagrad.step()
        adagrad.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = adagrad.state_dict()
            adagrad = flow.optim.Adagrad([x])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            adagrad.load_state_dict(state_dict)
    return x
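# Hedged numpy reference for the Adagrad run above, assuming the usual rule:
# accumulate squared gradients and decay the step size as lr / (1 + step * lr_decay).
def adagrad_by_numpy():
    x = np.array(init_value, dtype=np.float32)
    accum = np.full_like(x, initial_accumulator_value)
    for step, grad in enumerate(random_grad_seq[:train_iters]):
        g = grad + weight_decay * x
        accum = accum + g * g
        lr_t = learning_rate / (1 + step * lr_decay)
        x = x - lr_t * g / (np.sqrt(accum) + eps)
    return x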
def test_cosine_decay_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def cosine_decay_lr_step(base_lr, current_step, decay_steps, alpha):
        if current_step < decay_steps:
            cos_decay = 0.5 * (1 + math.cos(math.pi * current_step / decay_steps))
            decay_factor = (1 - alpha) * cos_decay + alpha
            return base_lr * decay_factor
        else:
            return base_lr * alpha

    alpha = 0.5
    decay_steps = 10
    cosine_decay_lr = flow.optim.lr_scheduler.CosineDecayLR(
        optimizer, decay_steps=decay_steps, alpha=alpha
    )
    for i in range(1, 21):
        cosine_decay_lr.step()
        new_lr = cosine_decay_lr_step(TestLrScheduler.base_lr, i, decay_steps, alpha)
        test_case.assertAlmostEqual(cosine_decay_lr.get_last_lr()[0], new_lr, places=4)
def _apply(self, fn, applied_dict=None):
    # A dict to store tensors that have already been applied; there is no need
    # to apply fn multiple times to the same tensor (e.g. shared parameters).
    if applied_dict is None:
        applied_dict = dict()
    for module in self.children():
        module._apply(fn, applied_dict)

    def can_use_assign_copy(tensor, tensor_applied):
        return tensor.is_local == tensor_applied.is_local

    for (key, param) in self._parameters.items():
        if param is None:
            continue
        need_apply = False
        if param not in applied_dict:
            need_apply = True
            assert isinstance(param, Parameter)
            assert param.is_leaf
            with flow.no_grad():
                param_applied = fn(param)
            param_applied.requires_grad = param.requires_grad
            if param.grad is not None:
                assert param.grad.is_leaf
                with flow.no_grad():
                    grad_applied = fn(param.grad)
                grad_applied.requires_grad = param.grad.requires_grad
                param_applied.grad = grad_applied
        else:
            param_applied = applied_dict[param]

        if can_use_assign_copy(param_applied, param):
            if need_apply:
                self._parameters[key].data = param_applied
                applied_dict[param] = param_applied
            else:
                # The parameter's data has already been set above when assign
                # copy is usable.
                pass
        else:
            if need_apply:
                new_param = Parameter(param_applied, param.requires_grad)
                self._parameters[key] = new_param
                applied_dict[param] = new_param
            else:
                self._parameters[key] = applied_dict[param]

    for (key, buf) in self._buffers.items():
        if buf is not None:
            if buf not in applied_dict:
                buf_applied = fn(buf)
                self._buffers[key] = buf_applied
                applied_dict[buf] = buf_applied
            else:
                self._buffers[key] = applied_dict[buf]
    return self
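# Hedged usage sketch: _apply underlies Module.to()/float()/cuda(). The
# applied_dict above ensures a parameter shared by several entries is
# converted once and stays shared. TiedPair is a hypothetical module used
# purely for illustration.
class TiedPair(flow.nn.Module):
    def __init__(self):
        super().__init__()
        self.a = flow.nn.Linear(4, 4)
        self.b = flow.nn.Linear(4, 4)
        self.b.weight = self.a.weight  # tie the two weight parameters

m = TiedPair().to(flow.device("cpu"))  # dispatches to _apply(fn) internally
assert m.a.weight is m.b.weight        # sharing survives the apply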
def rebuild_shm_parameter(shm, shape, dtype, requires_grad):
    def delete_shm():
        shm.close()
        shm.unlink()

    arr = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
    t = flow.from_numpy(arr)
    t._register_storage_delete_hook(delete_shm)
    return Parameter(t, requires_grad=requires_grad)
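# Hedged sketch of the producing side rebuild_shm_parameter is presumably
# paired with: copy the parameter into a SharedMemory block so that only the
# rebuild recipe, not the data, is pickled. reduce_parameter_via_shm is a
# hypothetical helper, not OneFlow's actual reducer.
from multiprocessing import shared_memory

def reduce_parameter_via_shm(param):
    arr = param.detach().numpy()
    shm = shared_memory.SharedMemory(create=True, size=arr.nbytes)
    np.ndarray(arr.shape, dtype=arr.dtype, buffer=shm.buf)[:] = arr
    return (rebuild_shm_parameter, (shm, arr.shape, arr.dtype, param.requires_grad))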
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    param_list = [x]
    rmsprop = flow.optim.RMSprop(
        [
            {
                "params": param_list,
                "lr": learning_rate,
                "alpha": alpha,
                "eps": eps,
                "weight_decay": weight_decay,
                "momentum": momentum,
                "centered": centered,
                "clip_grad_max_norm": clip_grad_max_norm,
                "clip_grad_norm_type": clip_grad_norm_type,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        rmsprop.clip_grad()
        rmsprop.step()
        rmsprop.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = rmsprop.state_dict()
            rmsprop = flow.optim.RMSprop([x])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            rmsprop.load_state_dict(state_dict)
    return x
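# Hedged numpy reference for the RMSprop run above (gradient clipping omitted),
# assuming the conventional centered/uncentered update with a momentum buffer.
def rmsprop_by_numpy():
    x = np.array(init_value, dtype=np.float32)
    sq_avg = np.zeros_like(x)
    g_avg = np.zeros_like(x)
    buf = np.zeros_like(x)
    for grad in random_grad_seq[:train_iters]:
        g = grad + weight_decay * x
        sq_avg = alpha * sq_avg + (1 - alpha) * g * g
        if centered:
            g_avg = alpha * g_avg + (1 - alpha) * g
            denom = np.sqrt(sq_avg - g_avg ** 2) + eps
        else:
            denom = np.sqrt(sq_avg) + eps
        if momentum > 0:
            buf = momentum * buf + g / denom
            x = x - learning_rate * buf
        else:
            x = x - learning_rate * g / denom
    return x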
def test_polynomial_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def polynomial_lr_step(base_lr, end_lr, step, decay_steps, power, cycle):
        if cycle:
            if step == 0:
                step = 1
            decay_steps = decay_steps * math.ceil(step / decay_steps)
        step = min(step, decay_steps)
        return (base_lr - end_lr) * (1 - step / decay_steps) ** power + end_lr

    decay_steps = 100
    end_learning_rate = 1e-5
    power = 2
    cycle = True
    poly_decay_lr = flow.optim.lr_scheduler.PolynomialLR(
        optimizer, decay_steps, end_learning_rate, power, cycle
    )
    # step(0) will be invoked in LrScheduler.__init__
    new_lr = polynomial_lr_step(
        TestLrScheduler.base_lr, end_learning_rate, 0, decay_steps, power, cycle
    )
    test_case.assertAlmostEqual(poly_decay_lr.get_last_lr()[0], new_lr, places=4)
    for i in range(1, 21):
        poly_decay_lr.step()
        new_lr = polynomial_lr_step(
            TestLrScheduler.base_lr, end_learning_rate, i, decay_steps, power, cycle
        )
        test_case.assertAlmostEqual(poly_decay_lr.get_last_lr()[0], new_lr, places=4)

    # Repeat without cycling to cover the other branch.
    cycle = False
    poly_decay_lr = flow.optim.lr_scheduler.PolynomialLR(
        optimizer, decay_steps, end_learning_rate, power, cycle
    )
    for i in range(1, 21):
        poly_decay_lr.step()
        new_lr = polynomial_lr_step(
            TestLrScheduler.base_lr, end_learning_rate, i, decay_steps, power, cycle
        )
        test_case.assertAlmostEqual(poly_decay_lr.get_last_lr()[0], new_lr, places=4)
def test_exponential_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def exponential_lr_step(base_lr, current_step, gamma):
        return base_lr * gamma ** current_step

    gamma = 0.1
    exponential_lr = flow.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma)
    for i in range(1, 21):
        exponential_lr.step()
        new_lr = exponential_lr_step(TestLrScheduler.base_lr, i, gamma)
        test_case.assertAlmostEqual(exponential_lr.get_last_lr()[0], new_lr, places=5)
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    adam = flow.optim.Adam(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "betas": betas,
                "eps": eps,
                "weight_decay": weight_decay,
                "clip_grad_max_norm": clip_grad_max_norm,
                "clip_grad_norm_type": clip_grad_norm_type,
            }
        ],
        do_bias_correction=do_bias_correction,
        amsgrad=amsgrad,
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        adam.clip_grad()
        adam.step()
        adam.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = adam.state_dict()
            adam = flow.optim.Adam(
                [{"params": [x]}], do_bias_correction=do_bias_correction
            )
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            adam.load_state_dict(state_dict)
    return x
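# Hedged numpy reference for the Adam run above (gradient clipping omitted),
# following a common formulation with optional bias correction and amsgrad.
def adam_by_numpy():
    x = np.array(init_value, dtype=np.float32)
    beta1, beta2 = betas
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    v_max = np.zeros_like(x)
    for t, grad in enumerate(random_grad_seq[:train_iters], start=1):
        g = grad + weight_decay * x
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g * g
        m_hat, v_hat = m, v
        if do_bias_correction:
            m_hat = m / (1 - beta1 ** t)
            v_hat = v / (1 - beta2 ** t)
        if amsgrad:
            v_max = np.maximum(v_max, v_hat)
            v_hat = v_max
        x = x - learning_rate * m_hat / (np.sqrt(v_hat) + eps)
    return x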
def test_step_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def step_lr_step(base_lr, current_step, step_size, gamma):
        return base_lr * gamma ** (current_step // step_size)

    gamma = 0.1
    step_size = 5
    step_lr = flow.optim.lr_scheduler.StepLR(
        optimizer, step_size=step_size, gamma=gamma
    )
    for i in range(1, 21):
        step_lr.step()
        new_lr = step_lr_step(TestLrScheduler.base_lr, i, step_size, gamma)
        test_case.assertAlmostEqual(step_lr.get_last_lr()[0], new_lr, places=5)
def train_by_oneflow():
    x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
    ftrl = Ftrl(
        [
            {
                "params": [x],
                "lr": learning_rate,
                "weight_decay": weight_decay,
                "lr_power": lr_power,
                "initial_accumulator_value": initial_accumulator_value,
                "lambda1": lambda1,
                "lambda2": lambda2,
                "beta": beta,
                "clip_grad_max_norm": clip_grad_max_norm,
                "clip_grad_norm_type": clip_grad_norm_type,
            }
        ]
    )

    def train_one_iter(grad):
        grad_tensor = flow.tensor(
            grad,
            dtype=flow.float32,
            requires_grad=False,
            device=flow.device(device),
        )
        loss = flow.sum(x * grad_tensor)
        loss.backward()
        ftrl.clip_grad()
        ftrl.step()
        ftrl.zero_grad()

    for i in range(train_iters):
        train_one_iter(random_grad_seq[i])
        if i == reload_state_step:
            state_dict = ftrl.state_dict()
            ftrl = Ftrl([{"params": [x]}])
            if save_load_by_pickle:
                with tempfile.TemporaryDirectory() as save_dir:
                    flow.save(state_dict, save_dir)
                    state_dict = flow.load(save_dir)
            ftrl.load_state_dict(state_dict)
    return x
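# Hedged reference: the FTRL-Proximal closed-form weight (McMahan et al. 2013),
# written for lr_power = -0.5 so the accumulator n enters as sqrt(n); lambda1
# and lambda2 are the L1/L2 strengths and beta shifts the denominator. This is
# a sketch for intuition, not the exact kernel the Ftrl optimizer runs.
def ftrl_closed_form_weight(z, n, lr, lambda1, lambda2, beta):
    w = -(z - np.sign(z) * lambda1) / ((beta + np.sqrt(n)) / lr + lambda2)
    return np.where(np.abs(z) <= lambda1, 0.0, w)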
def test_multistep_lr(test_case):
    optimizer = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}], lr=TestLrScheduler.base_lr
    )

    def multistep_lr_step(base_lr, current_step, milestones, gamma):
        count = 0
        for step in milestones:
            if current_step >= step:
                count += 1
        return base_lr * gamma ** count

    gamma = 0.1
    milestones = [5, 11, 15]
    multistep_lr = flow.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=gamma
    )
    for i in range(1, 18):
        multistep_lr.step()
        new_lr = multistep_lr_step(TestLrScheduler.base_lr, i, milestones, gamma)
        test_case.assertAlmostEqual(multistep_lr.get_last_lr()[0], new_lr, places=5)
def compare_with_torch_reduce_lr(
    test_case,
    mode,
    factor,
    patience,
    threshold,
    threshold_mode,
    cooldown,
    min_lr,
    eps,
):
    optimizer_flow = flow.optim.SGD(
        [{"params": [Parameter(flow.Tensor([1.0]))]}],
        lr=TestLrScheduler.base_lr,
        momentum=0.9,
    )
    optimizer_torch = torch.optim.SGD(
        [{"params": [torch.nn.Parameter(torch.Tensor([1.0]))]}],
        lr=TestLrScheduler.base_lr,
        momentum=0.9,
    )
    scheduler_flow = flow.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_flow,
        mode,
        factor,
        patience,
        threshold,
        threshold_mode,
        cooldown,
        min_lr,
        eps,
    )
    scheduler_torch = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer_torch,
        mode,
        factor,
        patience,
        threshold,
        threshold_mode,
        cooldown,
        min_lr,
        eps,
    )
    val_loss = 0.1
    for epoch in range(15):
        val_loss += (random.random() - 0.5) / 10
        scheduler_flow.step(val_loss)
        scheduler_torch.step(val_loss)
        for (lr1, lr2) in zip(scheduler_flow._last_lr, scheduler_torch._last_lr):
            test_case.assertAlmostEqual(lr1, lr2, places=5)
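# Hedged usage sketch: unlike the step-count schedulers above, ReduceLROnPlateau
# steps on a monitored metric. validate() is a hypothetical function returning
# a validation loss for the current epoch.
def _plateau_usage_sketch(optimizer):
    sched = flow.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=3
    )
    for epoch in range(20):
        sched.step(validate())  # lr *= factor after `patience` epochs without improvement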
def rebuild_empty_parameter(shape, dtype, requires_grad):
    t = flow.tensor([], dtype=dtype)
    t = t.reshape(*shape)
    return Parameter(t, requires_grad=requires_grad)