def test_xavier_initializer(self):
    for dtype, initializer, mode in product(
                float_dtypes,
                (tk.init.xavier_normal, tk.init.xavier_uniform),
                (None, 'fan_in', 'fan_out'),
            ):
        weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.)
        assert_equal(weight, T.full_like(weight, 0.))
        mode_arg = {'mode': mode} if mode is not None else {}

        # xavier
        fan_in, fan_out = tk.init.calculate_fan_in_and_fan_out(weight)
        xavier_std = np.sqrt(2.0 / float(fan_in + fan_out))
        tk.init.apply_initializer(weight, initializer, **mode_arg)
        self.assertLessEqual(np.abs(T.to_numpy(T.reduce_mean(weight))),
                             5.0 / xavier_std / np.sqrt(n_samples))

        # xavier with custom gain and fan_in/fan_out
        fan_in, fan_out = 23, 17
        init_gain = 1.5
        xavier_std = init_gain * np.sqrt(2.0 / float(fan_in + fan_out))
        tk.init.apply_initializer(weight, initializer,
                                  fan_in_and_fan_out=(fan_in, fan_out),
                                  gain=init_gain, **mode_arg)
        self.assertLessEqual(np.abs(T.to_numpy(T.reduce_mean(weight))),
                             5.0 / xavier_std / np.sqrt(n_samples))

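# Illustrative sketch (not part of the original tests): the bound checked above comes
# from the Xavier/Glorot standard deviation, std = gain * sqrt(2 / (fan_in + fan_out)).
# A self-contained numpy check of that formula; the helper name and its defaults are
# hypothetical, and only the module-level `np` import is assumed.
def _xavier_std_demo(fan_in=23, fan_out=17, gain=1.5, n=200000, seed=0):
    std = gain * np.sqrt(2.0 / float(fan_in + fan_out))
    samples = np.random.RandomState(seed).normal(0.0, std, size=n)
    # the empirical std of the samples should be close to the analytic value
    return std, samples.std()
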
def test_fill(self):
    for dtype in float_dtypes:
        weight = T.variable([2, 3, 4], dtype=dtype, initializer=0.)
        assert_equal(weight, T.full_like(weight, 0.))
        tk.init.apply_initializer(weight, partial(tk.init.fill, fill_value=123.))
        assert_equal(weight, T.full_like(weight, 123.))

def test_kaming_initializer(self):
    for dtype, initializer, mode in product(
                float_dtypes,
                (tk.init.kaming_normal, tk.init.kaming_uniform),
                (None, 'fan_in', 'fan_out'),
            ):
        weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.)
        assert_equal(weight, T.full_like(weight, 0.))
        mode_arg = {'mode': mode} if mode is not None else {}

        # kaming
        fan_in, fan_out = tk.init.calculate_fan_in_and_fan_out(weight)
        if mode == 'fan_out':
            kaming_std = np.sqrt(1.0 / np.sqrt(fan_out))
        else:
            kaming_std = np.sqrt(1.0 / np.sqrt(fan_in))
        tk.init.apply_initializer(weight, initializer, **mode_arg)
        self.assertLessEqual(np.abs(T.to_numpy(T.reduce_mean(weight))),
                             5.0 / kaming_std / np.sqrt(n_samples))

        # kaming with custom gain and fan_in/fan_out
        fan_in, fan_out = 23, 17
        init_gain = 1.5
        if mode == 'fan_out':
            kaming_std = init_gain * np.sqrt(1.0 / np.sqrt(fan_out))
        else:
            kaming_std = init_gain * np.sqrt(1.0 / np.sqrt(fan_in))
        tk.init.apply_initializer(weight, initializer,
                                  fan_in_and_fan_out=(fan_in, fan_out),
                                  gain=init_gain, **mode_arg)
        self.assertLessEqual(np.abs(T.to_numpy(T.reduce_mean(weight))),
                             5.0 / kaming_std / np.sqrt(n_samples))

        # test error
        with pytest.raises(
                ValueError, match='`mode` must be either "fan_in" or "fan_out"'):
            weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.)
            tk.init.apply_initializer(weight, initializer, mode='invalid')

def full_scan_average_check(ctx, factory, input_x, expected):
    weight = T.variable(T.shape(input_x)[1:], initializer=tk.init.zeros,
                        requires_grad=False)
    avg = factory([weight])
    for x in input_x:
        T.assign(weight, x)
        avg.update()
    avg.commit()
    assert_allclose(weight, expected, atol=1e-4, rtol=1e-6)

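# Illustrative sketch (not part of the original tests): `full_scan_average_check`
# assumes `factory` builds an averaging object exposing `update()` and `commit()`.
# For a plain arithmetic-mean average, the `expected` argument would simply be the
# per-element mean over the scanned batch; `make_mean_average` below is a hypothetical
# placeholder for whatever factory the concrete test case supplies.
def _full_scan_average_usage_sketch(ctx, make_mean_average):
    input_x = T.random.randn([7, 4])
    expected = np.mean(T.to_numpy(input_x), axis=0)
    full_scan_average_check(ctx, make_mean_average, input_x, expected)
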
def test_random_init(self):
    for dtype in float_dtypes:
        t = T.variable([n_samples, 2, 3], dtype=dtype)
        for fn, mean, std in [
                    (partial(T.random.normal_init, mean=1., std=2.), 1., 2.),
                    (partial(T.random.uniform_init, low=0., high=1.), 0.5,
                     1. / math.sqrt(12)),
                ]:
            fn(t)
            t_mean = np.mean(T.to_numpy(t))
            self.assertLess(abs(t_mean - mean),
                            3. * std / math.sqrt(n_samples * 2 * 3))

def test_normal(self):
    for dtype in float_dtypes:
        weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.)
        assert_equal(weight, T.full_like(weight, 0.))

        # normal with default args
        tk.init.apply_initializer(weight, tk.init.normal)
        self.assertLessEqual(np.abs(T.to_numpy(T.reduce_mean(weight))),
                             5.0 / np.sqrt(n_samples))

        # normal with customized args
        tk.init.apply_initializer(weight, partial(tk.init.normal, mean=1., std=3.))
        self.assertLessEqual(
            np.abs(T.to_numpy(T.reduce_mean(weight)) - 1.),
            5.0 * 3. / np.sqrt(n_samples))

def test_uniform(self):
    for dtype in float_dtypes:
        weight = T.variable([n_samples // 50, 50], dtype=dtype, initializer=0.)
        assert_equal(weight, T.full_like(weight, 0.))

        # uniform with default args
        tk.init.apply_initializer(weight, tk.init.uniform)
        self.assertLessEqual(
            np.abs(T.to_numpy(T.reduce_mean(weight)) - 0.5),
            5.0 / np.sqrt(12.) / np.sqrt(n_samples))

        # uniform with customized args
        tk.init.apply_initializer(
            weight, partial(tk.init.uniform, low=-4., high=3.))
        self.assertLessEqual(
            np.abs(T.to_numpy(T.reduce_mean(weight)) - (-0.5)),
            5.0 * 7.0 / np.sqrt(12.) / np.sqrt(n_samples))

def test_apply_initializer(self):
    for dtype in float_dtypes:
        weight = T.variable([5, 3], dtype=dtype)
        fan_in_and_fan_out = tk.init.calculate_fan_in_and_fan_out(weight)
        initializer = Mock()

        # test by value
        tk.init.apply_initializer(weight, 123)
        assert_equal(weight, T.full_like(weight, 123))
        tk.init.apply_initializer(weight, 124.)
        assert_equal(weight, T.full_like(weight, 124.))
        tk.init.apply_initializer(weight, np.array(125.))
        assert_equal(weight, T.full_like(weight, 125.))
        value = np.random.randn(*T.shape(weight)).astype(dtype)
        tk.init.apply_initializer(weight, value)
        assert_equal(weight, value)

        # test by initializer
        initializer.reset_mock()
        tk.init.apply_initializer(weight, initializer)
        self.assertEqual(initializer.call_args, ((weight,), {
            'gain': 1.0,
            'mode': 'fan_in',
            'fan_in_and_fan_out': fan_in_and_fan_out,
        }))

        # test fan_in_and_fan_out
        initializer.reset_mock()
        tk.init.apply_initializer(weight, initializer, fan_in_and_fan_out=(2, 3))
        self.assertEqual(initializer.call_args, ((weight,), {
            'gain': 1.0,
            'mode': 'fan_in',
            'fan_in_and_fan_out': (2, 3),
        }))

        initializer.reset_mock()
        tk.init.apply_initializer(weight, initializer, mode='fan_out')
        self.assertEqual(initializer.call_args, ((weight,), {
            'gain': 1.0,
            'mode': 'fan_out',
            'fan_in_and_fan_out': fan_in_and_fan_out,
        }))

        # test gain
        initializer.reset_mock()
        tk.init.apply_initializer(weight, initializer, gain=1.5)
        self.assertEqual(initializer.call_args, ((weight,), {
            'gain': 1.5,
            'mode': 'fan_in',
            'fan_in_and_fan_out': fan_in_and_fan_out,
        }))

        for activation in ['LeakyReLU', tk.layers.ReLU, tk.layers.Tanh()]:
            initializer.reset_mock()
            init_gain = tk.init.get_activation_gain(activation)
            tk.init.apply_initializer(weight, initializer, activation=activation)
            self.assertEqual(initializer.call_args, ((weight,), {
                'gain': init_gain,
                'mode': 'fan_in',
                'fan_in_and_fan_out': fan_in_and_fan_out,
            }))

        # unsupported initializer
        with pytest.raises(TypeError, match='Unsupported initializer'):
            tk.init.apply_initializer(weight, object())

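# Illustrative sketch (not part of the original tests): the assertions above show the
# calling convention that `apply_initializer` uses for callables, i.e.
# `initializer(weight, gain=..., mode=..., fan_in_and_fan_out=...)`, while plain
# numbers and numpy arrays are assigned to the variable directly. A hypothetical
# custom initializer matching that convention (the scaling rule below is made up,
# and `T.random.normal_init` is used as in `test_random_init`):
def _scaled_normal_initializer(weight, gain=1.0, mode='fan_in',
                               fan_in_and_fan_out=None):
    fan_in, fan_out = fan_in_and_fan_out
    fan = fan_in if mode == 'fan_in' else fan_out
    # fill `weight` with zero-mean normal noise whose std is gain / sqrt(fan)
    T.random.normal_init(weight, mean=0., std=gain / math.sqrt(fan))
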
def test_ones(self):
    for dtype in float_dtypes:
        weight = T.variable([2, 3, 4], dtype=dtype, initializer=0.)
        assert_equal(weight, T.full_like(weight, 0.))
        tk.init.apply_initializer(weight, tk.init.ones)
        assert_equal(weight, T.full_like(weight, 1.))

def optimizer_standard_check(ctx, optimizer_factory, lr):
    a = T.variable([], initializer=123.)
    b = T.variable([], initializer=456.)

    def calculate_loss(a, b):
        return (a + b) ** 2

    optimizer = optimizer_factory(iter([a]), lr)
    ctx.assertEqual(optimizer.lr, lr)
    ctx.assertEqual(list(optimizer.iter_params()), [a])
    with pytest.raises(ValueError, match='Duplicated parameter'):
        optimizer.add_params([a])
    with pytest.raises(ValueError, match='Duplicated parameter'):
        _ = optimizer_factory([a, a], lr)

    # test optimize a
    optimizer.clear_grad()
    with optimizer.capture_grad():
        loss = calculate_loss(a, b)
        optimizer.add_loss(loss)
    optimizer.step()
    ctx.assertLessEqual(calculate_loss(a, b), loss)
    assert_not_equal(a, 123.)
    assert_equal(b, 456.)

    # test optimize a and b,
    # and also using 'set_param_grad' to optimize a0 and b0
    T.random.seed(1234)
    optimizer = optimizer_factory(iter([a]), lr)
    optimizer.add_params(iter([b]))
    ctx.assertEqual(list(optimizer.iter_params()), [a, b])

    T.random.seed(1234)
    a0 = T.variable([], initializer=a)
    b0 = T.variable([], initializer=b)
    optimizer0 = optimizer_factory([a0], lr)
    optimizer0.add_params([b0])

    with optimizer.capture_grad():
        loss = calculate_loss(a, b)
        optimizer.add_loss(loss)

    # copy grads to optimizer0
    params_and_grads = list(optimizer.iter_params_and_grads())
    ctx.assertEqual(len(params_and_grads), 2)
    ctx.assertIs(params_and_grads[0][0], a)
    ctx.assertIs(params_and_grads[1][0], b)
    optimizer0.set_param_grad(
        a0, T.as_tensor(params_and_grads[0][1], force_copy=True))
    optimizer0.set_param_grad(
        b0, T.as_tensor(params_and_grads[1][1], force_copy=True))

    optimizer.step()
    ctx.assertLessEqual(calculate_loss(a, b), loss)
    assert_not_equal(a, 123.)
    assert_not_equal(b, 456.)

    optimizer0.step()
    assert_allclose(calculate_loss(a0, b0), calculate_loss(a, b))
    assert_allclose(a0, a)
    assert_allclose(b0, b)

    # save checkpoint
    with TemporaryDirectory() as temp_dir:
        ckpt_path = os.path.join(temp_dir, 'ckpt')
        checkpoint = tk.train.Checkpoint(optimizer=optimizer)
        checkpoint.save(ckpt_path)

        # test backup and restore the status
        a2 = T.variable([], initializer=a)
        b2 = T.variable([], initializer=b)
        optimizer2 = optimizer_factory([a2], lr)
        optimizer2.add_params([b2])
        checkpoint2 = tk.train.Checkpoint(optimizer=optimizer2)
        checkpoint2.restore(ckpt_path)

        with optimizer2.capture_grad():
            loss = calculate_loss(a2, b2)
            optimizer2.add_loss(loss)
        optimizer2.step()
        ctx.assertLessEqual(calculate_loss(a2, b2), loss)
        assert_not_equal(a2, a)
        assert_not_equal(b2, b)

        # test backup and restore the status, and use maximize instead of minimize
        a3 = T.variable([], initializer=a)
        b3 = T.variable([], initializer=b)
        optimizer3 = optimizer_factory([a3], lr)
        optimizer3.add_params([b3])
        checkpoint3 = tk.train.Checkpoint(optimizer=optimizer3)
        checkpoint3.restore(ckpt_path)

        with optimizer3.capture_grad():
            loss = calculate_loss(a3, b3)
            optimizer3.add_loss(-loss, maximize=True)
        optimizer3.step()
        ctx.assertLessEqual(calculate_loss(a3, b3), loss)
        assert_allclose(a3, a2)
        assert_allclose(b3, b2)
        assert_allclose(calculate_loss(a3, b3), calculate_loss(a2, b2))

        # backup and restore the status, change the learning rate and get
        # the third output, and compare to the result with optimizer2
        a4 = T.variable([], initializer=a)
        b4 = T.variable([], initializer=b)
        optimizer4 = optimizer_factory([a4], lr)
        optimizer4.add_params([b4])
        checkpoint4 = tk.train.Checkpoint(optimizer=optimizer4)
        checkpoint4.restore(ckpt_path)
        optimizer4.set_lr(lr * 0.5)
        ctx.assertEqual(optimizer4.lr, lr * 0.5)

        with optimizer4.capture_grad():
            loss = calculate_loss(a4, b4)
            optimizer4.add_loss(loss)
        optimizer4.step()
        assert_not_allclose(a4, a2)
        assert_not_allclose(b4, b2)
        assert_not_allclose(calculate_loss(a4, b4), calculate_loss(a2, b2))

    # now proceed the optimization from the first optimizer, and compare
    # the result with optimizer2
    optimizer.clear_grad()
    with optimizer.capture_grad():
        loss = calculate_loss(a, b)
        optimizer.add_loss(loss)
    optimizer.step()
    ctx.assertLessEqual(calculate_loss(a, b), loss)
    assert_allclose(a, a2)
    assert_allclose(b, b2)
    assert_allclose(calculate_loss(a, b), calculate_loss(a2, b2))

    # test context
    optimizer.clear_grad()
    with pytest.raises(RuntimeError,
                       match=r'`add_loss\(\)` must be called inside the '
                             r'`capture_grad\(\)` context'):
        optimizer.add_loss(calculate_loss(a, b))

    optimizer.clear_grad()
    with optimizer.capture_grad():
        optimizer.add_loss(calculate_loss(a, b))
        with pytest.raises(RuntimeError,
                           match=r'`step\(\)` must be called outside the '
                                 r'`capture_grad\(\)` context'):
            optimizer.step()

    # test clip grads
    def check_clip_grad(optimizer_fn, naive_fn):
        def f(g):
            a = T.variable([], initializer=123.)
            b = T.variable([], initializer=456.)
            c = T.variable([], initializer=789.)
            T.random.seed(1234)
            optimizer = optimizer_factory([a, b, c], lr)
            with optimizer.capture_grad():
                optimizer.add_loss((a + b) ** 2)
            g(optimizer)
            optimizer.step()
            return [T.to_numpy(t) for t in (a, b, c, (a + b) ** 2)]

        def h(optimizer):
            params = []
            grads = []
            for param, grad in optimizer.iter_params_and_grads():
                if grad is not None:
                    params.append(param)
                    grads.append(grad)
            grads = naive_fn(grads)
            for param, grad in zip(params, grads):
                optimizer.set_param_grad(param, grad)

        a, b, c, loss = f(lambda optimizer: optimizer_fn(optimizer))
        a0, b0, c0, loss0 = f(h)
        for t, t0 in zip((a, b, c, loss), (a0, b0, c0, loss0)):
            assert_allclose(t, t0, rtol=1e-4, atol=1e-6)

    def naive_clip_by_value(grads, clip_min, clip_max):
        return [T.clip(g, clip_min, clip_max) for g in grads]

    def naive_clip_by_norm(grads, clip_norm):
        return [T.clip_by_norm(g, clip_norm) for g in grads]

    def naive_clip_by_global_norm(grads, clip_norm):
        return T.clip_by_global_norm(grads, clip_norm)

    for v in [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, 100.0]:
        check_clip_grad(
            lambda optimizer: optimizer.clip_grad_by_value(-v, v),
            lambda grads: naive_clip_by_value(grads, -v, v),
        )
        check_clip_grad(
            lambda optimizer: optimizer.clip_grad_by_norm(v),
            lambda grads: naive_clip_by_norm(grads, v),
        )
        check_clip_grad(
            lambda optimizer: optimizer.clip_grad_by_global_norm(v),
            lambda grads: naive_clip_by_global_norm(grads, v),
        )

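# Illustrative sketch (not part of the original tests): `optimizer_standard_check` is
# meant to be driven by a concrete test case that supplies an optimizer factory and a
# learning rate. `SomeOptimizer` below is a hypothetical placeholder, not an actual
# class of the library under test, and the constructor signature is assumed.
def _optimizer_standard_check_usage_sketch(ctx, SomeOptimizer):
    optimizer_standard_check(
        ctx, lambda params, lr: SomeOptimizer(params, lr=lr), lr=0.1)
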
def stepwise_average_check(ctx, factory, update_fn, get_fn):
    def clone_state(val):
        if isinstance(val, dict):
            return {k: clone_state(v) for k, v in val.items()}
        elif isinstance(val, list):
            return [clone_state(v) for v in val]
        elif isinstance(val, (T.Tensor, T.Variable)):
            return T.copy(val)
        elif isinstance(val, np.ndarray):
            return np.copy(val)
        else:
            return copy.copy(val)

    T.random.seed(1234)
    weights = [
        T.variable(shape=[4], initializer=tk.init.zeros, requires_grad=False),
        T.variable(shape=[3], initializer=tk.init.zeros, requires_grad=False),
    ]
    answers = [clone_state(w) for w in weights]
    inputs_1 = T.random.randn([7, 4])
    inputs_2 = T.random.randn([7, 3])

    # do a scan
    avg = factory(weights)
    the_states = []
    the_outputs = []
    num_updates = 0
    for batch_vals in zip(inputs_1, inputs_2):
        for weight, val in zip(weights, batch_vals):
            T.assign(weight, val)
        the_states.append(clone_state(avg.get_state_dict()))
        avg.update()
        with avg.temporarily_commit():
            the_outputs.extend(clone_state(w) for w in weights)
            for i, val in enumerate(batch_vals):
                answers[i] = update_fn(answers[i], val, num_updates)
            num_updates += 1
            for weight, ans in zip(weights, answers):
                assert_allclose(weight, get_fn(ans, num_updates),
                                rtol=1e-4, atol=1e-6)
        for weight, val in zip(weights, batch_vals):
            assert_allclose(weight, val, rtol=1e-4, atol=1e-6)

    # test enabled = False
    avg = factory(weights, enabled=False)
    for x1, x2, state, output in zip(inputs_1, inputs_2, the_states, the_outputs):
        batch_vals = [x1, x2]
        for weight, val in zip(weights, batch_vals):
            T.assign(weight, val)
        avg.update()
    avg.commit()

    # should still affect weights even if enabled is False
    for avg_val in avg.get_state_dict()['averages']:
        assert_allclose(avg_val, T.zeros_like(avg_val), rtol=1e-4, atol=1e-6)
    for weight in weights:
        assert_allclose(weight, T.zeros_like(weight), rtol=1e-4, atol=1e-6)

    # do another scan using backup states
    avg = factory(weights, enabled=False)
    avg.set_enabled(True)
    for x1, x2, state, output in zip(inputs_1, inputs_2, the_states, the_outputs):
        batch_vals = [x1, x2]
        for weight, val in zip(weights, batch_vals):
            T.assign(weight, val)
        avg.set_state_dict(state)
        avg.update()
        with avg.temporarily_commit():
            the_outputs.extend(clone_state(w) for w in weights)
        for weight, val in zip(weights, batch_vals):
            assert_allclose(weight, val, rtol=1e-4, atol=1e-6)

    # try set bad state
    avg = factory(weights)
    state = dict(avg.get_state_dict())
    state['averages'] = []
    with pytest.raises(ValueError, match='Bad state'):
        avg.set_state_dict(state)

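# Illustrative sketch (not part of the original tests): `stepwise_average_check`
# expects `update_fn(acc, val, num_updates)` to fold one observed value into a
# reference accumulator, and `get_fn(acc, num_updates)` to turn that accumulator
# into the expected committed value. For a plain arithmetic mean, hypothetical
# choices for the two callbacks would be:
def _arithmetic_mean_update_fn(acc, val, num_updates):
    # keep a running sum of the observed values
    return acc + val


def _arithmetic_mean_get_fn(acc, num_updates):
    # the expected average is the running sum divided by the number of updates
    return acc / num_updates
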