import math
import time

import torch
from torch import nn

import d2lzh_pytorch as d2l  # the book's utility module; adjust to your local d2l package


def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    # Generic classification training loop: parameters are updated either with
    # the from-scratch d2l.sgd (params/lr given) or with a torch optimizer.
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            
            # Zero the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            
            l.backward()
            if optimizer is None:
                # Update from-scratch parameters with minibatch SGD
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()

            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
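
# train_ch3 above calls evaluate_accuracy(test_iter, net), which is not defined
# in this snippet. A minimal sketch of such a helper is given below (an
# assumption, not necessarily the d2l implementation): it averages the
# classification accuracy of `net` over a data iterator without tracking
# gradients.
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            # Count correct predictions in this batch
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
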
def fit_and_plot(lambd):
    # Train a linear model with an explicit L2 penalty weighted by lambd
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            # Add the L2 norm penalty term to the loss
            l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l = l.sum()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(loss(net(train_features, w, b), train_labels).mean().item())
        test_ls.append(loss(net(test_features, w, b), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
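
# fit_and_plot relies on l2_penalty and init_params (plus chapter-level globals
# such as net, loss, train_iter, num_epochs, lr and batch_size) that are not
# shown here. The definitions below are a hedged sketch of the usual
# weight-decay-from-scratch helpers; num_inputs=200 is a hypothetical default,
# not a value taken from this snippet.
def l2_penalty(w):
    # Half the sum of squared weights, so its gradient with respect to w is w
    return (w ** 2).sum() / 2

def init_params(num_inputs=200):
    w = torch.randn(num_inputs, 1, requires_grad=True)
    b = torch.zeros(1, requires_grad=True)
    return w, b
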
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    # rnn: forward function of the recurrent network
    # get_params: function that creates and initializes the model parameters
    # init_rnn_state: function that initializes the hidden state
    # num_hiddens: number of hidden units
    # vocab_size: vocabulary size (1027 in the book's lyrics dataset)
    # corpus_indices: the corpus encoded as a sequence of character indices
    # idx_to_char and char_to_idx: index <-> character mapping dictionaries
    # num_epochs: number of training epochs
    # num_steps: number of time steps per sequence
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()  # cross-entropy loss

    for epoch in range(num_epochs):
        if not is_random_iter:  # with consecutive sampling, initialize the hidden state at the start of each epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
            # With consecutive sampling, two adjacent minibatches are adjacent on the
            # original sequence, so the final hidden state of one minibatch can be used
            # to initialize the hidden state of the next, making the next minibatch's
            # outputs depend on the current minibatch's inputs, and so on.
            # This affects the implementation in two ways: (1) the hidden state only
            # needs to be initialized at the beginning of each epoch; (2) when adjacent
            # minibatches are chained together through the hidden state, the gradients
            # of the model parameters would depend on all of the chained minibatch
            # sequences.

        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, each example is a segment clipped from an
                # arbitrary position of the original sequence, so two adjacent random
                # minibatches are generally not adjacent on the original sequence and
                # the final hidden state of one minibatch cannot be used to initialize
                # the hidden state of the next. Re-initialize the hidden state before
                # every minibatch update.
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Otherwise, detach the hidden state from the computation graph so that
                # gradients only propagate through the current minibatch, not through
                # all of the previous ones.
                for s in state:
                    s.detach_()
            # inputs is a list of num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs is a list of num_steps matrices of shape (batch_size, vocab_size),
            # each predicting the next character
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a
            # vector of shape (num_steps * batch_size,) so that it lines up with the
            # rows of outputs
            y = torch.flatten(Y.T)
            # Use cross-entropy loss to compute the average classification error;
            # the targets are simply class indices, e.g. outputs of shape
            # (1020, 1027) paired with y of shape (1020,)
            l = loss(outputs, y.long())
            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip gradients
            # The loss is already a mean, so the gradient need not be averaged again
            d2l.sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, device, idx_to_char, char_to_idx))
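
# train_and_predict_rnn additionally depends on to_onehot, grad_clipping and
# predict_rnn, which are defined elsewhere in the book's code. The two sketches
# below illustrate plausible implementations under the shapes assumed above;
# they are approximations, not the exact d2l definitions.
def to_onehot(X, n_class):
    # X: (batch_size, num_steps) tensor of indices -> list of num_steps tensors,
    # each of shape (batch_size, n_class)
    return [torch.nn.functional.one_hot(X[:, i].long(), n_class).float()
            for i in range(X.shape[1])]

def grad_clipping(params, theta, device):
    # Rescale all gradients so that their global L2 norm does not exceed theta
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)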