def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    """Train a chapter-3 classifier and report per-epoch metrics.

    Either a PyTorch `optimizer` is supplied, or `params` and `lr` are given
    and plain minibatch SGD (``d2l.sgd``) is used.  After each epoch the mean
    train loss, train accuracy, and test accuracy are printed.
    """
    for epoch in range(num_epochs):
        loss_total = 0.0
        correct_total = 0.0
        sample_count = 0
        for X, y in train_iter:
            logits = net(X)
            batch_loss = loss(logits, y).sum()
            # Clear gradients before backprop so they do not accumulate.
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for p in params:
                    p.grad.data.zero_()
            batch_loss.backward()
            # Update step: hand-rolled SGD, or the supplied optimizer (the
            # latter is used by the concise softmax-regression section).
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()
            loss_total += batch_loss.item()
            correct_total += (logits.argmax(dim=1) == y).sum().item()
            sample_count += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, loss_total / sample_count,
                 correct_total / sample_count, test_acc))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train a from-scratch RNN on a character corpus and sample text.

    `rnn` is the forward function, `get_params` builds the trainable tensors,
    and `init_rnn_state` creates a fresh hidden state.  Every `pred_period`
    epochs the perplexity is printed and `predict_rnn` continues each prefix
    in `prefixes` for `pred_len` characters.
    """
    if is_random_iter:  # random sampling
        data_iter_fn = d2l.data_iter_random
    else:  # consecutive (adjacent) sampling
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state once at
            # the start of each epoch.
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, re-initialize the hidden state before
                # every minibatch (batches are not contiguous in the corpus).
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Otherwise detach the state from the graph so gradients only
                # flow through the current minibatch (keeps backprop cheap).
                for s in state:
                    s.detach_()
            inputs = to_onehot(X, vocab_size)
            # outputs is num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation: (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose then flatten to a
            # vector of length batch * num_steps so rows match `outputs`.
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            # Mean cross-entropy classification error.
            l = loss(outputs, y.long())
            # Zero gradients (they are None before the first backward pass).
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip gradients
            # Loss is already a mean, so no re-averaging inside sgd.
            d2l.sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, device, idx_to_char,
                                char_to_idx))
def fit_and_plot(lambd):
    """Train linear regression with an L2 penalty weighted by `lambd`,
    plot train/test loss curves, and print the norm of w."""
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            # Data loss (squared error of net(X, w, b) vs. the labels)
            # plus the weight-decay penalty on w.
            penalized = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            total = penalized.sum()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            total.backward()
            d2l.sgd([w, b], lr, batch_size)
        # Record the mean loss on the full train/test splits once per epoch,
        # so each list ends up with num_epochs entries.
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().item())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Section 6.3: train a scratch-built RNN; every `pred_period` epochs
    print perplexity and sample continuations of each prefix."""
    data_iter_fn = (d2l.data_iter_random if is_random_iter
                    else d2l.data_iter_consecutive)
    params = get_params()
    loss = nn.CrossEntropyLoss()
    state = None
    for epoch in range(num_epochs):
        if not is_random_iter:
            # Consecutive sampling: one fresh hidden state per epoch.
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        for X, Y in data_iter_fn(corpus_indices, batch_size, num_steps, device):
            if is_random_iter:
                # Random sampling: fresh hidden state for every minibatch.
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Detach so backprop stays within the current minibatch
                # (prevents the graph from growing across batches).
                for s in state:
                    s.detach_()
            one_hot_seq = to_onehot(X, vocab_size)
            # outputs: num_steps matrices of shape (batch_size, vocab_size).
            outputs, state = rnn(one_hot_seq, state, params)
            # Stack the per-step outputs: (num_steps * batch_size, vocab_size).
            logits = torch.cat(outputs, dim=0)
            # Flatten Y (batch_size, num_steps) -> num_steps * batch_size,
            # matching the row order of `logits`.
            targets = torch.transpose(Y, 0, 1).contiguous().view(-1)
            # CrossEntropyLoss takes class indices directly (no one-hot needed).
            l = loss(logits, targets.long())
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)
            # Loss is already averaged, so batch size 1 (like optimizer.step()).
            d2l.sgd(params, lr, 1)
            l_sum += l.item() * targets.shape[0]
            n += targets.shape[0]
        if (epoch + 1) % pred_period == 0:
            # Perplexity = exp(mean cross-entropy loss).
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens,
                                        vocab_size, device, idx_to_char,
                                        char_to_idx))
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    """Train a chapter-3 classifier and report per-epoch metrics.

    Fixes relative to the original: the passed-in `loss` function is used
    instead of a hard-coded `cross_entropy` (which also shadowed the `loss`
    parameter with the batch-loss tensor), the `optimizer` argument is
    honored when given, and gradients are zeroed *before* backward() rather
    than after the update step — matching the sibling train_ch3 variants.
    """
    for epoch in range(num_epochs):
        train_acc_sum, train_loss_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            # Summed per-sample loss for this minibatch.
            l = loss(y_hat, y).sum()
            # Zero gradients before backprop so they do not accumulate
            # (params[0].grad is None before the very first backward pass).
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            # Update: hand-rolled SGD, or the supplied optimizer.
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                optimizer.step()
            train_loss_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_loss_sum / n, train_acc_sum / n, test_acc))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train a character-level RNN built from scratch; every `pred_period`
    epochs print perplexity and sample continuations of `prefixes`."""
    sampler = (d2l.data_iter_random if is_random_iter
               else d2l.data_iter_consecutive)
    params = get_params()
    criterion = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # Consecutive sampling: one hidden state per epoch.
            state = init_rnn_state(batch_size, num_hiddens, device)
        running_loss, seen, tic = 0.0, 0, time.time()
        for X, Y in sampler(corpus_indices, batch_size, num_steps, device):
            if is_random_iter:
                # Random sampling: fresh hidden state per minibatch.
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Truncate backprop at the minibatch boundary.
                for s in state:
                    s.detach_()
            outputs, state = rnn(to_onehot(X, vocab_size), state, params)
            # (num_steps * batch_size, vocab_size) after concatenation.
            logits = torch.cat(outputs, dim=0)
            # Flatten Y so its entries line up row-for-row with `logits`.
            flat_targets = torch.transpose(Y, 0, 1).contiguous().view(-1)
            l = criterion(logits, flat_targets.long())
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)
            d2l.sgd(params, lr, 1)
            # Accumulate summed token loss so the report is per-token average.
            running_loss += l.item() * flat_targets.shape[0]
            seen += flat_targets.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('epoch %d,perplexity %f,time %.2f sec' %
                  (epoch + 1, math.exp(running_loss / seen),
                   time.time() - tic))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens,
                                        vocab_size, device, idx_to_char,
                                        char_to_idx))
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    """Chapter-3 training loop; also counts and prints the total number of
    gradient-update steps (debug aid kept from the original)."""
    times_sum = 0  # number of backward/update steps performed
    for epoch in range(num_epochs):
        epoch_loss, epoch_correct, seen = 0.0, 0.0, 0
        for X, y in train_iter:
            # Per the author's debug notes: X is (256, 1, 28, 28) and y is
            # (256,) — confirm against the data loader.
            y_hat = net(X)
            # Summed cross-entropy over the batch: loss(y_hat, y) returns one
            # value per image, and minimizing the sum is equivalent to
            # maximizing the joint predicted probability of the true labels.
            l = loss(y_hat, y).sum()
            # Zero gradients before backprop.
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            times_sum += 1
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                # Used by the concise softmax-regression section.
                optimizer.step()
            epoch_loss += l.item()
            epoch_correct += (y_hat.argmax(dim=1) == y).sum().item()
            seen += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, epoch_loss / seen, epoch_correct / seen,
                 test_acc))
    print(times_sum)
def fit_and_plot(lambd):
    """Fit linear regression with weight-decay strength `lambd` and plot the
    per-epoch train/test loss on a semilog scale."""
    w, b = init_params()
    train_history, test_history = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            # Summed data loss plus the L2 penalty on the weights.
            objective = (loss(net(X, w, b), y) + lambd * l2_penalty(w)).sum()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            objective.backward()
            d2l.sgd([w, b], lr, batch_size)
        # One mean-loss sample per epoch on each full split.
        train_history.append(
            loss(net(train_features, w, b), train_labels).mean().item())
        test_history.append(
            loss(net(test_features, w, b), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), train_history, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_history, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
def fit_and_plot(lamb):
    """Train linear regression with an L2 penalty of strength `lamb`, print
    the final losses and ||w||, and plot the loss curves.

    Fixes relative to the original: the per-sample loss is reduced with
    .sum() before backward() (autograd requires a scalar), and the recorded
    train/test losses are reduced with .mean() before .item() for the same
    reason — matching the sibling fit_and_plot variants.  Both changes are
    no-ops if `loss` already returns a scalar.
    """
    w, b = init_params()
    train_ls, test_ls = [], []
    for epoch in range(num_epochs):
        for X, y in data_iter:
            # Data loss plus the L2-norm penalty on the weights.
            l = loss(net(X, w, b), y) + lamb * l2_penalty(w)
            l = l.sum()  # reduce to a scalar so backward() is well-defined
            if w.grad is not None:  # grads are None before the first backward
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()  # compute gradients
            d2l.sgd([w, b], lr)  # update parameters
        train_ls.append(
            loss(net(features[:n_train, :], w, b),
                 labels[:n_train]).mean().item())
        test_ls.append(
            loss(net(features[n_train:, :], w, b),
                 labels[n_train:]).mean().item())
    print('final epoch: train_loss ', train_ls[-1], 'test_loss ', test_ls[-1])
    print('L2 norm of w', w.norm().item())
    # Plot the error curves.
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epoch', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
def fit_and_plot(lambd):
    """Weight-decay experiment: train with penalty strength `lambd`, plot the
    mean train/test losses per epoch, and print the norm of w."""
    w, b = init_params()
    history_train, history_test = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            # Data loss plus the L2 penalty, averaged over the batch.
            penalized = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            mean_loss = penalized.mean()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            mean_loss.backward()
            d2l.sgd([w, b], lr)
        history_train.append(
            loss(net(train_features, w, b), train_labels).mean().item())
        history_test.append(
            loss(net(test_features, w, b), test_labels).mean().item())
    d2l.semilogy(range(1, num_epochs + 1), history_train, "epochs", "loss",
                 range(1, num_epochs + 1), history_test, ["train", "test"])
    print("L2 norm of w: ", w.norm().item())
dtype=torch.float)  # continuation of a statement that starts above this chunk
# Initialize the model parameters.
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)),
                 dtype=torch.float)
b = torch.zeros(1, dtype=torch.float)
# Track gradients on the parameters.
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
# Loss function: squared loss.
loss = d2l.squared_loss
# Model: the forward computation (linear regression).
net = d2l.linreg
# Hyperparameters: learning rate, number of epochs, batch size.
lr, num_epoches, batch_size = 0.03, 3, 10
# Train the model.
for epoch in range(num_epoches):
    for X, y in d2l.data_iter(batch_size, features, labels):
        # Minibatch loss; per the author's note it is already averaged.
        # NOTE(review): l.backward() requires a scalar — confirm that
        # d2l.squared_loss returns one here (sibling code calls .sum() first).
        l = loss(net(X, w, b), y)
        l.backward()  # backprop through the loss to get gradients
        d2l.sgd([w, b], lr)  # update the parameter values
        w.grad.data.zero_()  # zero grads, otherwise they keep accumulating
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)  # loss over the full dataset
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    """Train a from-scratch RNN on a character corpus; every `pred_period`
    epochs print perplexity and sample continuations of `prefixes`."""
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # Consecutive sampling: initialize the hidden state once per epoch.
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            # Y is the next-step ground truth for X (i.e. X's labels).
            if is_random_iter:
                # Random sampling: initialize the hidden state per minibatch.
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Detach the state from the graph so gradients only depend on
                # the current minibatch (keeps backprop cost bounded).
                for s in state:
                    s.detach_()
            # Author's note (translated/condensed): in this experiment X is a
            # (32, 35) minibatch — 32 samples of 35 characters — and
            # to_onehot turns it into 35 matrices of shape (32, 1027), one
            # per time step, where 1027 is the vocabulary size; i.e. the t-th
            # matrix holds the t-th character of all 32 samples.  The author
            # wondered why the data is not kept as 32 per-sample matrices;
            # this per-step layout is what forces the transpose of Y below.
            # An experiment with the per-sample layout (ResNet_Modify.py)
            # both complicated W_xh's setup in get_params() and trained
            # poorly (output degenerated into repetition — cause unclear).
            inputs = to_onehot(X, vocab_size)
            # outputs: num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation: (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y is (batch_size, num_steps); transpose then flatten into a
            # vector of length batch * num_steps so rows match `outputs`.
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            # Mean cross-entropy classification error.
            l = loss(outputs, y.long())
            # Zero gradients (they are None before the first backward pass).
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip gradients
            # Loss is already a mean, so no re-averaging inside sgd.
            d2l.sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, device, idx_to_char,
                                char_to_idx))