def init_adam_states():
    """Create the zero-initialised Adam accumulators for weight and bias.

    Returns:
        ``((v_w, s_w), (v_b, s_b))`` -- one ``(v, s)`` pair per parameter,
        in the order the parameters are zipped with the states in ``adam``.
        The weight accumulators have shape ``(features.shape[1], 1)`` and
        the bias accumulators have shape ``(1,)``.
    """
    v_w = nd.zeros((features.shape[1], 1))
    v_b = nd.zeros(1)
    s_w = nd.zeros((features.shape[1], 1))
    s_b = nd.zeros(1)
    return ((v_w, s_w), (v_b, s_b))


# Backward-compatible alias: the initializer was first written under this
# misspelled name, so keep it importable for existing callers.
inin_adam_states = init_adam_states


def adam(params, states, hyperparams):
    """Apply one Adam update, in place, to every parameter.

    Args:
        params: iterable of parameter arrays; each must expose ``.grad``.
        states: iterable of ``(v, s)`` pairs, one per parameter. ``v`` is the
            moving average of the gradient and ``s`` the moving average of
            the squared gradient; both are overwritten in place.
        hyperparams: dict with ``'lr'`` (learning rate) and ``'t'`` (the
            current time step, starting at 1). ``'t'`` is incremented by one
            on every call.
    """
    beta1, beta2, eps = 0.9, 0.999, 1e-6
    lr = hyperparams['lr']
    t = hyperparams['t']
    # The bias-correction denominators depend only on t, so compute them once
    # per step instead of once per parameter.
    v_scale = 1 - beta1**t
    s_scale = 1 - beta2**t
    for p, (v, s) in zip(params, states):
        v[:] = beta1 * v + (1 - beta1) * p.grad
        s[:] = beta2 * s + (1 - beta2) * p.grad.square()
        corr_v = v / v_scale
        corr_s = s / s_scale
        p[:] -= (lr * corr_v) / (corr_s.sqrt() + eps)
    # Advance the time step once per call, after all parameters were updated.
    hyperparams['t'] += 1


d2l.plt.figure(figsize=(15, 5))  # set the figure size
# Pass the *result* of init_adam_states() (note the parentheses): train_ch7
# receives the state tuples themselves, not the initializer function.
d2l.train_ch7(adam, init_adam_states(), {'lr': 0.01, 't': 1}, features, labels)

# Concise implementation
d2l.plt.figure(figsize=(15, 5))  # set the figure size
# The 'adam' optimizer is configured through 'learning_rate' here.
d2l.train_gluon_ch7('adam', {'learning_rate': 0.01}, features, labels)
d2l.plt.show()
return x1-v1, x2-v2, v1, v2 eta, gamma = 0.4, 0.5 d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d)) eta =0.6 d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d)) #指数加权移动平均-推导过程 #由指数加权移动平均理解动量法 #7.4.3-从零开始实现 features, labels = d2l.get_data_ch7() def init_momentum_states(): v_w = nd.zeros((features.shape[1], 1)) v_b = nd.zeros(1) return (v_w, v_b) def sgd_momentum(params, states, hyperparams): for p, v in zip(params, states): v[:] = hyperparams['momentum']*v + hyperparams['lr'] * p.grad p[:] -= v d2l.train_ch7(sgd_momentum, init_momentum_states(), {'lr':0.02, 'momentum':0.5}, features, labels) d2l.train_ch7(sgd_momentum, init_momentum_states(), {'lr':0.02, 'momentum':0.9}, features, labels) d2l.train_ch7(sgd_momentum, init_momentum_states(), {'lr':0.004, 'momentum':0.5}, features, labels) #7.4.4-简洁实现 d2l.train_gluon_ch7('sgd', {'learning_rate':0.004, 'momentum':0.9}, features, labels)
eta, gamma = 0.4, 0.9
d2l.show_trace_2d(f_2d, d2l.train_2d(rmsprop_2d))

# 7.6.2 - Implementation from scratch
features, labels = d2l.get_data_ch7()


def init_rmsprop_states():
    """Return zero-filled accumulators ``(s_w, s_b)`` for weight and bias.

    The weight accumulator has shape ``(features.shape[1], 1)`` and the bias
    accumulator has shape ``(1,)``.
    """
    return (nd.zeros((features.shape[1], 1)), nd.zeros(1))


def rmsprop(params, states, hyperparams):
    """Apply one RMSProp update, in place, to every parameter.

    Args:
        params: iterable of parameter arrays; each must expose ``.grad``.
        states: iterable of squared-gradient moving averages, one per
            parameter; overwritten in place.
        hyperparams: dict with ``'lr'`` and ``'gamma'`` (the decay factor of
            the moving average).
    """
    decay = hyperparams['gamma']
    eps = 1e-6
    for param, sq_avg in zip(params, states):
        grad_sq = param.grad.square()
        sq_avg[:] = decay * sq_avg + (1 - decay) * grad_sq
        # eps is added inside the square root so the divisor is never zero.
        denom = (sq_avg + eps).sqrt()
        param[:] -= hyperparams['lr'] * param.grad / denom


d2l.train_ch7(rmsprop, init_rmsprop_states(), {'lr': 0.01, 'gamma': 0.9},
              features, labels)

# 7.6.3 - Concise implementation
d2l.train_gluon_ch7('rmsprop', {'learning_rate': 0.01, 'gamma1': 0.9},
                    features, labels)
from mxnet import nd

features, labels = d2l.get_data_ch7()


def init_adadelta_states():
    """Create the zero-initialised AdaDelta state for weight and bias.

    Returns:
        ``((s_w, delta_w), (s_b, delta_b))`` -- one ``(s, delta)`` pair per
        parameter. Weight arrays have shape ``(features.shape[1], 1)`` and
        bias arrays have shape ``(1,)``.
    """
    weight_shape = (features.shape[1], 1)
    s_w = nd.zeros(weight_shape)
    s_b = nd.zeros(1)
    delta_w = nd.zeros(weight_shape)
    delta_b = nd.zeros(1)
    return ((s_w, delta_w), (s_b, delta_b))


def adadelta(params, states, hyperparams):
    """Apply one AdaDelta update, in place, to every parameter.

    Args:
        params: iterable of parameter arrays; each must expose ``.grad``.
        states: iterable of ``(s, delta)`` pairs, one per parameter. ``s`` is
            the moving average of the squared gradient and ``delta`` the
            moving average of the squared update; both are overwritten in
            place.
        hyperparams: dict with ``'rho'`` (the decay factor of both averages).
    """
    rho = hyperparams['rho']
    eps = 1e-5
    for param, (sq_grad_avg, sq_step_avg) in zip(params, states):
        sq_grad_avg[:] = rho * sq_grad_avg + (1 - rho) * param.grad.square()
        # The step is computed from the *previous* squared-update average,
        # which is only refreshed after the parameter has been moved.
        ratio = (sq_step_avg + eps).sqrt() / (sq_grad_avg + eps).sqrt()
        step = ratio * param.grad
        param[:] -= step
        sq_step_avg[:] = rho * sq_step_avg + (1 - rho) * step * step


# In[4]:

d2l.train_ch7(adadelta, init_adadelta_states(), {'rho': 0.9}, features, labels)

# In[6]:

# Concise implementation
d2l.train_gluon_ch7('adadelta', {'rho': 0.9}, features, labels)

# In[ ]: