def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    train_ls, test_ls = [], []
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
        train_ls.append(train_l_sum / n)
        test_ls.append(test_acc)
def train_softmax(train_iter, test_iter, net, loss, num_epochs, batch_size,
                  params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
        # running loss (should decrease), running train accuracy, sample count (60000)
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)  # forward pass; output is <NDArray 256x10 @cpu(0)>
                # loss returns a length-256 vector, so sum it here (summing later
                # at l.asscalar() would be equivalent)
                l = loss(y_hat, y).sum()
            l.backward()  # if l had not been summed above, backward() would sum it
            if trainer is None:
                d2l.sgd(params, lr, batch_size)  # update the parameters with plain SGD
            else:
                trainer.step(batch_size)  # otherwise take one optimizer step
            train_l_sum += l.asscalar()  # accumulate the training loss
            y = y.astype('float32')
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size  # adds 256 per batch, 60000 in total
        test_acc = evaluate_accuracy(test_iter, net)  # accuracy on the test set
        # cross-entropy loss, train accuracy, test accuracy
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
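# A minimal usage sketch for the training loop above (not from the original
# notebook): it assumes `net`, `cross_entropy`, `W`, and `b` are defined as in
# the softmax-regression-from-scratch section, and uses the d2lzh data loader.
import d2lzh as d2l

batch_size, num_epochs, lr = 256, 5, 0.1
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
train_softmax(train_iter, test_iter, net, cross_entropy, num_epochs,
              batch_size, params=[W, b], lr=lr)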
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype("float32")
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        print("epoch %d, loss %.4f, train_acc %.3f"
              % (epoch + 1, train_l_sum / n, train_acc_sum / n))
def train_rnn(vocab_indices, is_random, vocab_size, hidden_nums, batch_size,
              num_epochs, num_steps, params, theta, lr, prefixs, predict_step):
    if is_random:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        l_sum = 0.0
        n = 0
        start = time.time()
        if not is_random:
            H = nd.zeros(shape=(batch_size, hidden_nums))
        data_iter = data_iter_fn(vocab_indices, batch_size, num_steps)
        for X, y in data_iter:
            if not is_random:
                for s in (H, ):
                    s.detach()
            else:
                H = nd.zeros(shape=(batch_size, hidden_nums))
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                output, (H, ) = rnn(inputs, params, (H, ))
                output = nd.concat(*output, dim=0)
                y = y.T.reshape((-1, ))
                l = loss(output, y).mean()
            l.backward()
            gradient_clip(params, theta)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size
        print('epoch %d, perplexity %s, time %s'
              % (epoch, math.exp(l_sum / n), time.time() - start))
        rnn_predit(params, predict_step, prefixs, vocab_size, hidden_nums)
def train_ch3(net, w, b, num_inputs, train_iter, test_iter, loss, num_epochs,
              batch_size, params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for x, y in train_iter:
            with autograd.record():
                y_hat = net(x, w, b, num_inputs)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, w, b, num_inputs)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch, train_l_sum / n, train_acc_sum / n, test_acc))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # with consecutive sampling, initialize the hidden state at the
            # start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # with random sampling, reinitialize the hidden state before
                # each minibatch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach_()
            # inputs is num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs is num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # concatenated shape: (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose and flatten into a
            # (num_steps * batch_size,) vector that lines up with the rows of outputs
            y = torch.flatten(Y.T)
            # average classification error via cross-entropy
            l = loss(outputs, y.long())
            # zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip gradients
            d2l.sgd(params, lr, 1)  # loss already averaged, so do not average the gradient again
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        device, idx_to_char, char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            # X: one batch of batch_size char lists of length num_steps, where
            #    each char is represented by its index
            # Y: same format as X
            # print(X[0], Y[0])
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach()  # returns a new NDArray, detached from the current graph
            with autograd.record():
                # print('X.shape', X.shape)
                # print('Y.shape', Y.shape)
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # print('inputs.shape', len(inputs), inputs[0].shape)
                # print('outputs.shape', len(outputs), outputs[0].shape)
                outputs = nd.concat(*outputs, dim=0)  # concatenate all the outputs
                # print('concat_outputs', outputs.shape)
                y = Y.T.reshape((-1, ))
                # print('y', y.shape)
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            d2l.sgd(params, lr, 1)  # actually update the params
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        ctx, idx_to_char, char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    # sampling scheme
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random  # random sampling
    else:
        data_iter_fn = d2l.data_iter_consecutive  # consecutive sampling
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # with consecutive sampling, initialize the hidden state right at the start
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        # read the data
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # with random sampling, reinitialize the hidden state before each minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:  # otherwise detach it from the computation graph
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # inputs and outputs are num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # after concatenation the shape is (batch_size * num_steps, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y is (batch_size, num_steps); transpose and flatten into a row vector
                y = Y.T.reshape((-1, ))
                l = loss(outputs, y).mean()  # average classification loss
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip gradients
            d2l.sgd(params, lr, 1)  # no batch_size here: the loss was already mean()-ed
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:  # print once every pred_period epochs
            # TODO math.exp(l_sum / n) is the perplexity
            print('epoch %d, perplexity %f, time %.3f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print('-', predict_rnn(prefix, pred_len, rnn, params,
                                       init_rnn_state, num_hiddens, vocab_size,
                                       ctx, idx_to_char, char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # If using consecutive sampling, initialize the hidden state once
            # at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # If using random sampling, reinitialize the state before
                # every minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # Otherwise detach the state from the computation graph
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # Concatenate into (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Reshape Y from (batch_size, num_steps) into a vector of length
                # batch_size * num_steps for the loss computation
                y = Y.T.reshape((-1, ))
                # Use cross-entropy as the loss function
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip gradients
            d2l.sgd(params, lr, 1)  # loss already averaged, so no further scaling
            l_sum += l.asscalar() * y.size
            n += y.size
        # Report perplexity periodically and predict from the prefixes
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        ctx, idx_to_char, char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # with consecutive sampling, initialize the hidden state at the
            # start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # with random sampling, reinitialize the hidden state before
                # each minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # after concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose and reshape into
                # a vector of length batch_size * num_steps so it lines up with
                # the rows of outputs
                y = Y.T.reshape((-1, ))
                # average classification error via cross-entropy
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip gradients
            d2l.sgd(params, lr, 1)  # loss already averaged, so do not average the gradient again
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        ctx, idx_to_char, char_to_idx))
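# The RNN training loops above rely on helper functions `to_onehot` and
# `grad_clipping` that are defined elsewhere in these notebooks. Minimal
# sketches in the same MXNet/d2lzh style (assumptions, not the original code):
from mxnet import nd

def to_onehot(X, size):
    # split X of shape (batch_size, num_steps) into num_steps matrices,
    # each of shape (batch_size, size)
    return [nd.one_hot(x, size) for x in X.T]

def grad_clipping(params, theta, ctx):
    # rescale all gradients so that their global L2 norm does not exceed theta
    norm = nd.array([0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm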
def fit_and_plot(lambd):
    w, b = init_params()  # initialize model parameters
    train_l, test_l = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # add the L2 norm penalty
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_l.append(loss(net(train_features, w, b),
                            train_labels).mean().asscalar())
        test_l.append(loss(net(test_features, w, b),
                           test_labels).mean().asscalar())
    d2l.semilogy(range(1, num_epochs + 1), train_l, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_l, ['train', 'test'],
                 figsize=(15, 5))
    print('L2 norm of w:', w.norm().asscalar())
def train_batch(X, y, gpu_params, ctx, lr):
    # When ctx contains multiple GPUs (and their memories), split the minibatch
    # and copy one shard to each device
    gpu_Xs, gpu_ys = split_and_load(X, ctx), split_and_load(y, ctx)
    with autograd.record():  # compute the loss on each GPU separately
        ls = [loss(lenet(gpu_X, gpu_W), gpu_y)
              for gpu_X, gpu_y, gpu_W in zip(gpu_Xs, gpu_ys, gpu_params)]
    for l in ls:  # run backpropagation on each GPU separately
        l.backward()
    # sum the gradients across all GPU memories, then broadcast the sum back to all of them
    for i in range(len(gpu_params[0])):
        allreduce([gpu_params[c][i].grad for c in range(len(ctx))])
    for param in gpu_params:  # update the model parameters on each GPU separately
        d2l.sgd(param, lr, X.shape[0])  # use the full batch size here
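# `split_and_load` and `allreduce` are assumed by train_batch above. Minimal
# sketches of what they might look like in this MXNet style (assumptions, not
# the original definitions):
from mxnet import nd

def split_and_load(data, ctx):
    # evenly split a batch along axis 0 and copy each shard to one device
    n, k = data.shape[0], len(ctx)
    m = n // k  # assumes the batch size is divisible by the number of devices
    return [data[i * m:(i + 1) * m].as_in_context(ctx[i]) for i in range(k)]

def allreduce(data):
    # sum the arrays from all devices on the first device, then broadcast back
    for i in range(1, len(data)):
        data[0][:] += data[i].copyto(data[0].context)
    for i in range(1, len(data)):
        data[0].copyto(data[i])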
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              num_inputs, W, b, params=None, lr=None, trainer=None):
    """
    Run the training loop.
    :param net: model function
    :param train_iter: training data
    :param test_iter: test data
    :param loss: loss function
    :param num_epochs: number of epochs
    :param batch_size:
    :param num_inputs:
    :param W:
    :param b:
    :param params:
    :param lr: learning rate
    :param trainer:
    :return:
    """
    for epoch in range(num_epochs):
        train_loss_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X, num_inputs, W, b)
                # TODO: difference between putting this inside vs. outside the with block
                data_loss = loss(y_hat, y).sum()
            # automatic differentiation
            data_loss.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_loss_sum += data_loss.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, num_inputs, W, b)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_loss_sum / n, train_acc_sum / n, test_acc))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    # sampling scheme
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random  # random sampling
    else:
        data_iter_fn = d2l.data_iter_consecutive  # consecutive sampling
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # with consecutive sampling, initialize the hidden state right at the start
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1, ))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.3f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print('-', predict_rnn(prefix, pred_len, rnn, params,
                                       init_rnn_state, num_hiddens, vocab_size,
                                       ctx, idx_to_char, char_to_idx))
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    global printt
    for epoch in range(num_epochs):  # one full pass over the training data
        train_l_sum = 0.0
        train_acc_sum = 0.0
        n = 0
        # minibatches
        for X, y in train_iter:  # one training step
            with autograd.record():
                # predictions for this minibatch
                y_hat = net(X)
                l = loss(y_hat, y).sum()
                # if printt:
                #     print(y_hat[0].sum())
                #     printt = False
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        # accuracy on the test data
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, "epochs", "loss",
                 range(1, num_epochs + 1), test_ls, ["train", "test"])
    print("L2 norm of w", w.norm().asscalar())
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # add the L2 norm penalty; broadcasting turns it into a vector
                # of length batch_size
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().asscalar())
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for x, y in train_iter:
            with autograd.record():
                l = loss(net(x, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(loss(net(train_features, w, b),
                             train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features, w, b),
                            test_labels).mean().asscalar())
    print('true w: ', w.mean())
    print('L2 norm of w:', w.norm().asscalar())
    print('final epoch: train loss ', train_ls[-1], 'test loss', test_ls[-1])
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
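# The weight-decay experiments above (fit_and_plot) assume `init_params` and
# `l2_penalty`. Minimal sketches consistent with the high-dimensional linear
# regression setup used here; `num_inputs` (e.g. 200) is assumed to be defined:
from mxnet import nd

def init_params():
    w = nd.random.normal(scale=1, shape=(num_inputs, 1))
    b = nd.zeros(shape=(1,))
    w.attach_grad()
    b.attach_grad()
    return [w, b]

def l2_penalty(w):
    # half the squared L2 norm of the weights
    return (w ** 2).sum() / 2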
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    '''Train the model and generate predictions.'''
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx=ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx=ctx)
        for X, Y in data_iter:
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx=ctx)
            else:
                # otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach()
            inputs = one_hot(X, vocab_size)
            with autograd.record():
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1,))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx=ctx)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch: %d, perplexity: %f, time %.2f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        ctx, idx_to_char, char_to_idx))
def train_softmax(self, net, train_iter, test_iter, loss, num_epochs,
                  batch_size, params=None, lr=None, optimizer=None):
    W = params[0]
    b = params[1]
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            """W: weights, b: bias, X: inputs, y: labels"""
            y_hat = net(X, W, b)
            l = loss(y_hat, y).sum()
            # zero the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            if optimizer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                # used in the "concise implementation of softmax regression" section
                optimizer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = self.evaluate_accuracy(test_iter, W, b)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
    return W, b
def train(net, iter_training, iter_testing, loss, number_epochs, batch_size,
          parameters=None, learning_rate=None, trainer=None):
    for epoch in range(number_epochs):
        train_l_sum = 0.0
        train_acc_sum = 0.0
        n = 0
        for X, y in iter_training:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(parameters, learning_rate, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(iter_testing, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
def train_and_predict(self, is_random_iter, pred_period, pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    self.get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(self.num_epochs):
        if not is_random_iter:
            self.init_rnn_state()
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(self.corpus_indices, self.batch_size,
                                 self.num_steps)
        for X, Y in data_iter:
            if is_random_iter:
                # with random sampling, reinitialize the hidden state before each minibatch
                self.init_rnn_state(self.batch_size, self.num_hidden)
            else:
                # otherwise detach the hidden state from the computation graph
                for s in self.state:
                    s.detach()
            with autograd.record():
                self.inputs = self.to_onehot(X.as_in_context(self.ctx),
                                             self.vocab_size)
                outputs, state = self.rnn()
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1,))
                l = loss(outputs, y).mean()
            l.backward()
            self.grad_clipping(self.clipping_theta)
            d2l.sgd(self.params, self.lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch {}, perplexity {}, time {} sec'.format(
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', self.predict(prefix, pred_len))
def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    # run num_epochs (here 5) passes over the training set
    for epoch in range(num_epochs):
        # train_l_sum: summed cross-entropy of the correct class over the 60000
        #              training images; the per-sample loss is train_l_sum / n
        # train_acc_sum: number of correct predictions; each image scores 1 if
        #                predicted correctly and 0 otherwise, so the accuracy
        #                is (R1 + ... + R60000) / n
        # n: total number of training images (60000)
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            # each iteration fetches 256 images:
            # X <NDArray 256x1x28x28 @cpu(0)>
            # y <NDArray 256 @cpu(0)>
            with autograd.record():
                # y_hat is 256 x 10; forward pass through the model
                y_hat = net(X)
                # loss(y_hat, y) is the cross-entropy of the probability the model
                # assigns to the correct label: for probability p the value is
                # -ln p, so the smaller the entropy, the smaller the surprise and
                # the loss
                # quick -ln reference:
                # -ln0.01 = 4.605
                # -ln0.10 = 2.302
                # -ln0.50 = 0.693
                # -ln0.90 = 0.105
                # -ln0.99 = 0.010
                # l = loss(y_hat, y).sum() is the total cross-entropy over the
                # batch of 256 images
                l = loss(y_hat, y).sum()
            # compute gradients
            l.backward()
            if trainer is None:
                # gradient descent: this is where W and b are "learned"
                # batch_size 256, lr 0.1
                d2l.sgd(params, lr, batch_size)
            else:
                # used in the "concise implementation of softmax regression" section
                trainer.step(batch_size)
            # y holds the integer class indices; convert to float32
            y = y.astype('float32')
            # convert the batch loss to a scalar and accumulate it
            train_l_sum += l.asscalar()
            # count the correct predictions
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        # after each epoch, run the current model on the test set and compute
        # its accuracy (correct predictions / total test images)
        test_acc = evaluate_accuracy(test_iter, net)
        # epoch: round number; loss; train acc: training accuracy; test acc: test accuracy
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
for X, Y in data_iter:
    if is_random_iter:
        # with random sampling, reinitialize the hidden state before each minibatch
        state = init_rnn_state(batch_size, num_hiddens, ctx)
    else:
        for s in state:
            s.detach()
    with autograd.record():
        inputs = to_onehot(X, vocab_size)
        (outputs, state) = rnn(inputs, state, params)
        outputs = nd.concat(*outputs, dim=0)
        y = Y.T.reshape((-1, ))
        l = loss(outputs, y).mean()
    l.backward()
    grad_clipping(params, clipping_theta, ctx)
    d2l.sgd(params, lr, 1)
    l_sum += l.asscalar() * y.size
    n += y.size
if (epoch + 1) % pred_period == 0:
    print('epoch %d, perplexity %f, time %.3f sec'
          % (epoch + 1, math.exp(l_sum / n), time.time() - start))
    for prefix in prefixes:
        print('-', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                               num_hiddens, vocab_size, ctx, idx_to_char,
                               char_to_idx))

num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 100, 0.01
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
train_and_predict_rnn(rnn, get_param, init_rnn_state, num_hiddens, vocab_size,
                      ctx, corpus_indices, idx_to_char, char_to_idx, True,
                      num_epochs, num_steps, lr, clipping_theta, batch_size,
                      pred_period, pred_len, prefixes)
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    '''
    :param rnn: the recurrent network function
    :param get_params: returns the model parameters
    :param init_rnn_state: initializes the hidden state
    :param num_hiddens: number of hidden units
    :param vocab_size: number of distinct characters
    :param ctx:
    :param corpus_indices: character indices of the corpus
    :param idx_to_char:
    :param char_to_idx:
    :param is_random_iter: whether the data is sampled randomly
    :param num_epochs: total number of epochs
    :param num_steps:
    :param lr: learning rate
    :param clipping_theta: gradient clipping threshold
    :param batch_size: batch size
    :param pred_period: how often to run prediction
    :param pred_len:
    :param prefixes: prefixes to predict from
    :return:
    '''
    if is_random_iter:
        # each iteration returns a batch of batch_size * num_steps tokens
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params(vocab_size, num_hiddens, vocab_size, ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            # with consecutive sampling, initialize the hidden state at the start
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # random sampling: reinitialize the state before each minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # detach the hidden state from the computation graph: the node no
                # longer requires a gradient, so backpropagation stops here and
                # gradients do not flow further back in time
                for s in state:
                    s.detach()
            with autograd.record():
                # inputs is num_steps matrices of shape (batch_size, vocab_size)
                inputs = to_onehot(X, vocab_size)
                # outputs is num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # after concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose and flatten into a
                # vector of length batch_size * num_steps that matches the rows of outputs
                y = Y.T.reshape((-1, ))
                # average classification error via cross-entropy
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip gradients
            d2l.sgd(params, lr, 1)  # loss already averaged, so no further averaging
            l_sum += l.asscalar() * y.size  # average loss times number of tokens
            n += y.size
        if (epoch + 1) % pred_period == 0:
            # perplexity
            print('epoch %d, perplexity %f, time %.2f sec'
                  % (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        ctx, idx_to_char, char_to_idx))
def fit_and_plot(lambd):
    # w <NDArray 200x1 @cpu(0)>
    # b <NDArray 1x1 @cpu(0)>
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):  # num_epochs (100) passes of gradient descent
        for X, y in train_iter:
            # the training set has 20 samples and batch_size is 1, so 20 updates per epoch
            # X <NDArray 1x200 @cpu(0)>
            # y <NDArray 1x1 @cpu(0)>
            with autograd.record():
                # add the L2 norm penalty; broadcasting turns it into a vector
                # of length batch_size
                # penalized loss: loss + lambd / 2 * ||w||^2
                # penalty term: l2_penalty(w) = (w ** 2).sum() / 2, so
                # lambd * l2_penalty(w) = lambd / 2 * (w ** 2).sum()
                # loss computation: net(X, w, b) = nd.dot(X, w) + b
                # X - <NDArray 1x200 @cpu(0)>
                # w - <NDArray 200x1 @cpu(0)>
                # b - <NDArray 1 @cpu(0)>
                netResult = net(X, w, b)
                lossO = loss(netResult, y)
                l = lossO + lambd * l2_penalty(w)
                # l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            # compute gradients
            l.backward()
            # gradient descent step
            # batch_size 1, lr 0.003, [w, b] shapes [200x1, 1x1]
            d2l.sgd([w, b], lr, batch_size)
        # train_features * w + b = y
        #   20x200     200x1  1x1  20x1
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    # over the 100 epochs the training loss keeps dropping while the test loss stays high
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    # --------------------normTest--------------------
    # TTT1 = nd.array((1, 2, 3, 4)).reshape((1, 4))  # squared sum: 1 + 4 + 9 + 16 = 30
    # TTT2 = TTT1.norm().asscalar()                  # norm = sqrt(30)
    # --------------------normTest--------------------
    print('L2 norm of w:', w.norm().asscalar())
for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
    X, y = mnist.train.next_batch(batch_size)
    X, y = nd.array(X), nd.array(y)
    print(X.max())
    # print(y.shape)
    with autograd.record():
        y_hat = net(X)
        l = loss(y_hat, y).sum()
    l.backward()
    if trainer is None:
        d2l.sgd(params, lr, batch_size)
    else:
        trainer.step(batch_size)
    y = y.astype("float32")
    train_l_sum += l.asscalar()
    n += y.size
    print("epoch %d, loss %.4f" % (epoch + 1, train_l_sum / n))

net.export("my_mlp")
print("ok!")