# 7.4 Momentum
# 7.4.1 The problem with gradient descent
# %matplotlib inline
import math

import d2lzh as d2l
from mxnet import nd

eta = 0.4

def f_2d(x1, x2):
    # Objective: f(x1, x2) = 0.1 * x1^2 + 2 * x2^2
    return 0.1 * x1 ** 2 + 2 * x2 ** 2

def gd_2d(x1, x2, s1, s2):
    # Plain gradient descent; the gradient of f_2d is (0.2 * x1, 4 * x2)
    return (x1 - eta * 0.2 * x1, x2 - eta * 4 * x2, 0, 0)

d2l.show_trace_2d(f_2d, d2l.train_2d(gd_2d))

# With a larger learning rate, the iterate keeps overshooting the optimum
# in the vertical (x2) direction and gradually diverges
eta = 0.6
d2l.show_trace_2d(f_2d, d2l.train_2d(gd_2d))

# 7.4.2 Momentum
def momentum_2d(x1, x2, v1, v2):
    # The velocity accumulates an exponentially weighted sum of past gradients
    v1 = gamma * v1 + eta * 0.2 * x1
    v2 = gamma * v2 + eta * 4 * x2
    return x1 - v1, x2 - v2, v1, v2

eta, gamma = 0.4, 0.5
d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d))

# Even at the learning rate that made plain gradient descent diverge,
# momentum still converges
eta = 0.6
d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d))
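# Not in the original notes: a minimal from-scratch momentum optimizer, sketched
# in the same style as the AdaGrad/RMSProp implementations below, assuming the
# d2lzh helpers (get_data_ch7, train_ch7) used elsewhere in these notes; the
# hyperparameter values are illustrative.
features, labels = d2l.get_data_ch7()

def init_momentum_states():
    v_w = nd.zeros((features.shape[1], 1))
    v_b = nd.zeros(1)
    return (v_w, v_b)

def sgd_momentum(params, states, hyperparams):
    # v <- momentum * v + lr * grad;  p <- p - v
    for p, v in zip(params, states):
        v[:] = hyperparams['momentum'] * v + hyperparams['lr'] * p.grad
        p[:] -= v

d2l.train_ch7(sgd_momentum, init_momentum_states(),
              {'lr': 0.02, 'momentum': 0.5}, features, labels)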
# 7.5 AdaGrad
# Same objective as above: f(x1, x2) = 0.1 * x1^2 + 2 * x2^2
eta = 2  # larger than before, yet AdaGrad behaves well and does not diverge

def adagrad_2d(x1, x2, s1, s2):
    g1, g2, eps = 0.2 * x1, 4 * x2, 1e-6
    # Accumulate squared gradients and scale each step by their square root
    s1 += g1 ** 2
    s2 += g2 ** 2
    x1 -= eta / math.sqrt(s1 + eps) * g1
    x2 -= eta / math.sqrt(s2 + eps) * g2
    return x1, x2, s1, s2

d2l.plt.figure(figsize=(15, 5))  # set the figure size
d2l.show_trace_2d(f_2d, d2l.train_2d(adagrad_2d))

# Implement it from scratch
features, labels = d2l.get_data_ch7()

def init_adagrad_states():
    s_w = nd.zeros((features.shape[1], 1))
    s_b = nd.zeros(1)
    return (s_w, s_b)

def adagrad(params, states, hyperparams):
    eps = 1e-6
    for p, s in zip(params, states):
        s[:] += p.grad.square()
        p[:] -= hyperparams['lr'] * p.grad / (s + eps).sqrt()
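# A sketch of wiring the from-scratch optimizer into the book's training loop;
# it assumes d2lzh's train_ch7(trainer_fn, states, hyperparams, features, labels)
# helper, and the learning rate 0.1 is illustrative.
d2l.train_ch7(adagrad, init_adagrad_states(), {'lr': 0.1}, features, labels)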
# 7.6 RMSProp
def rmsprop_2d(x1, x2, s1, s2):
    g1, g2, eps = 0.2 * x1, 4 * x2, 1e-6
    # Exponentially weighted moving average of the squared gradients
    s1 = gamma * s1 + (1 - gamma) * g1 ** 2
    s2 = gamma * s2 + (1 - gamma) * g2 ** 2
    x1 -= eta / math.sqrt(s1 + eps) * g1
    x2 -= eta / math.sqrt(s2 + eps) * g2
    return x1, x2, s1, s2

eta, gamma = 0.4, 0.9
d2l.show_trace_2d(f_2d, d2l.train_2d(rmsprop_2d))

# 7.6.2 Implement from scratch
features, labels = d2l.get_data_ch7()

def init_rmsprop_states():
    s_w = nd.zeros((features.shape[1], 1))
    s_b = nd.zeros(1)
    return (s_w, s_b)

def rmsprop(params, states, hyperparams):
    gamma, eps = hyperparams['gamma'], 1e-6
    for p, s in zip(params, states):
        s[:] = gamma * s + (1 - gamma) * p.grad.square()
        p[:] -= hyperparams['lr'] * p.grad / (s + eps).sqrt()
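# A sketch of training with the from-scratch rmsprop above, followed by a
# concise Gluon version; both assume d2lzh helpers (train_ch7, train_gluon_ch7),
# and MXNet's built-in 'rmsprop' trainer names the decay rate gamma1.
# Hyperparameter values are illustrative.
d2l.train_ch7(rmsprop, init_rmsprop_states(), {'lr': 0.01, 'gamma': 0.9},
              features, labels)
d2l.train_gluon_ch7('rmsprop', {'learning_rate': 0.01, 'gamma1': 0.9},
                    features, labels)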