'''
features, labels = get_data_ch7()

def init_adadelta_states():
    s_w, s_b = torch.zeros((features.shape[1], 1), dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    delta_w, delta_b = torch.zeros((features.shape[1], 1), dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    return ((s_w, delta_w), (s_b, delta_b))

def adadelta(params, states, hyperparams):
    rho, eps = hyperparams['rho'], 1e-5
    for p, (s, delta) in zip(params, states):  # update w and b in turn
        s[:] = rho * s + (1 - rho) * (p.grad.data**2)             # EWMA of the squared gradients
        g = p.grad.data * torch.sqrt((delta + eps) / (s + eps))   # rescaled step; no learning rate needed
        p.data -= g
        delta[:] = rho * delta + (1 - rho) * g * g                # EWMA of the squared updates

'''
Train the model with the hyperparameter ρ = 0.9.
'''
# d2l.train_ch7(adadelta, init_adadelta_states(), {'rho': 0.9}, features, labels)
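
'''
For reference, a minimal sketch (my addition, not the book's train_ch7) of how
init_adadelta_states() and adadelta() plug into a single manual update on a
linear least-squares model; the batch slice and parameter shapes are assumptions.
'''
# w = torch.zeros((features.shape[1], 1), dtype=torch.float32, requires_grad=True)
# b = torch.zeros(1, dtype=torch.float32, requires_grad=True)
# states = init_adadelta_states()
# X, y = features[:10], labels[:10]                          # one small batch
# l = ((torch.mm(X, w) + b - y.view(-1, 1)) ** 2 / 2).mean() # squared loss
# l.backward()
# adadelta([w, b], states, {'rho': 0.9})                     # one in-place AdaDelta step on w and b
# w.grad.data.zero_(); b.grad.data.zero_()                   # clear gradients before the next step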

'''
Concise implementation:
Via the optimizer named Adadelta, we can use the AdaDelta algorithm provided by PyTorch. Its hyperparameter is specified through rho.
'''
d2l.train_pytorch_ch7(torch.optim.Adadelta, {'rho': 0.9}, features, labels)

'''
The AdaDelta algorithm has no learning-rate hyperparameter: it replaces the learning rate in RMSProp with a term based on the exponentially weighted moving average of the squared parameter updates.
'''
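'''
Spelled out (my restatement of the rule implemented in adadelta() above), for a
parameter x with gradient g at step t:
    s_t     = rho * s_{t-1}     + (1 - rho) * g_t**2
    g'_t    = sqrt((delta_{t-1} + eps) / (s_t + eps)) * g_t
    x_t     = x_{t-1} - g'_t
    delta_t = rho * delta_{t-1} + (1 - rho) * g'_t**2
so sqrt(delta_{t-1} + eps) takes the place of RMSProp's learning rate.
'''
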
def init_adam_states():
    v_w, v_b = torch.zeros(
        (features.shape[1], 1),
        dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    s_w, s_b = torch.zeros(
        (features.shape[1], 1),
        dtype=torch.float32), torch.zeros(1, dtype=torch.float32)
    return ((v_w, s_w), (v_b, s_b))


def adam(params, states, hyperparams):
    beta1, beta2, eps = 0.9, 0.999, 1e-6
    for p, (v, s) in zip(params, states):
        v[:] = beta1 * v + (1 - beta1) * p.grad.data        # EWMA of the gradients (momentum)
        s[:] = beta2 * s + (1 - beta2) * p.grad.data**2     # EWMA of the squared gradients
        v_bias_corr = v / (1 - beta1**hyperparams['t'])     # bias correction for v
        s_bias_corr = s / (1 - beta2**hyperparams['t'])     # bias correction for s
        p.data -= hyperparams['lr'] * v_bias_corr / (torch.sqrt(s_bias_corr) + eps)
    hyperparams['t'] += 1                                   # advance the time step t


'''
Train the model with the Adam algorithm and a learning rate of 0.01.
'''
# d2l.train_ch7(adam, init_adam_states(), {'lr': 0.01, 't': 1}, features, labels)
'''
Concise implementation:
Via the optimizer instance named "Adam", we can use the Adam algorithm provided by PyTorch.
'''
d2l.train_pytorch_ch7(torch.optim.Adam, {'lr': 0.01}, features, labels)
'''
Building on RMSProp, Adam also keeps an exponentially weighted moving average of the mini-batch stochastic gradient itself.
Adam applies bias correction to both moving averages.
'''
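'''
A small numeric check (my addition) of why the bias correction matters: after the
first step with beta1 = 0.9, the raw average v is only 0.1 * g, biased toward 0;
dividing by (1 - beta1**t) restores the gradient's scale.
'''
# g = torch.tensor([1.0, -2.0])
# beta1, t = 0.9, 1
# v = (1 - beta1) * g                  # EWMA after one step from v = 0: 0.1 * g
# v_bias_corr = v / (1 - beta1 ** t)   # bias-corrected: exactly g when t = 1
# print(v, v_bias_corr)                # tensor([ 0.1000, -0.2000]) tensor([ 1., -2.])
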
'''
Usage of zip()
'''
# params = [1, 2, 3]
# states = [5, 6, 7]
# for p, v in zip(params, states):
#     print(p)
#     print('-'*100)
#     print(v)
# 1
# ----------------------------------------------------------------------------------------------------
# 5
# 2
# ----------------------------------------------------------------------------------------------------
# 6
# 3
# ----------------------------------------------------------------------------------------------------
# 7
#
# print('='*100)
# for p in zip(params, states):
#     print(p)
# (1, 5)
# (2, 6)
# (3, 7)
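'''
The loops in adadelta() and adam() additionally unpack each state tuple;
a mini-demo (my addition) of that pattern:
'''
# params = ['w', 'b']
# states = [('v_w', 's_w'), ('v_b', 's_b')]
# for p, (v, s) in zip(params, states):
#     print(p, v, s)
# w v_w s_w
# b v_b s_b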
'''
Concise implementation (SGD with momentum):
'''
d2l.train_pytorch_ch7(torch.optim.SGD, {'lr': 0.004, 'momentum': 0.9},
                      features, labels)
Example no. 4
import torch

import d2lzh_pytorch as d2l

features, labels = d2l.get_data_ch7()


def init_rmsprop_states():
    s_w = torch.zeros((features.shape[1], 1), dtype=torch.float32)
    s_b = torch.zeros(1, dtype=torch.float32)
    return (s_w, s_b)


def rmsprop(params, states, hyperparams):
    gamma, eps = hyperparams['gamma'], 1e-6
    for p, s in zip(params, states):
        s.data = gamma * s.data + (1 - gamma) * (p.grad.data)**2   # EWMA of the squared gradients
        p.data -= hyperparams['lr'] * p.grad.data / torch.sqrt(s + eps)


# d2l.train_ch7(rmsprop, init_rmsprop_states(), {'lr': 0.01, 'gamma': 0.9},
#               features, labels)
d2l.train_pytorch_ch7(torch.optim.RMSprop, {'lr': 0.01, 'alpha': 0.9},
                      features, labels)  # alpha in torch.optim.RMSprop plays the role of gamma above
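
'''
A toy check (my addition) of what the state s does: on the first step each
gradient is divided by the root of its own squared-gradient average, so two
parameters with gradients of very different size move by about the same amount.
The scalar parameters below are hypothetical, not part of the book's model.
'''
# p1 = torch.tensor([1.0], requires_grad=True)
# p2 = torch.tensor([1.0], requires_grad=True)
# (10 * p1 + 0.1 * p2).sum().backward()        # gradients: 10 for p1, 0.1 for p2
# rmsprop([p1, p2], [torch.zeros(1), torch.zeros(1)], {'lr': 0.01, 'gamma': 0.9})
# print(p1.data, p2.data)                      # ≈ tensor([0.9684]) tensor([0.9684]):
#                                              # both moved by about lr / sqrt(1 - gamma) ≈ 0.032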