def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    # Choose how the data is sampled
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params(vocab_size, num_hiddens, vocab_size)
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        # With consecutive sampling, initialize the hidden state once per epoch
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, re-initialize the hidden state before every minibatch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # With consecutive sampling, detach the hidden state from the computation
                # graph; otherwise backprop fails because the previous graph has been freed
                for s in state:
                    s.detach_()
            X = X.type(torch.long)
            Y = Y.type(torch.long)
            inputs = to_onehot(X, vocab_size)  # convert to one-hot representation
            # run the RNN: one output per time step, plus the new hidden state
            outputs, state = rnn(inputs, state, params)
            # concatenate the list of per-step outputs into a single tensor
            outputs = torch.cat(outputs, dim=0)
            y = Y.t().reshape(-1,)
            l = loss(outputs, y)
            l.backward()
            with torch.no_grad():
                grad_clipping(params, clipping_theta, device)  # guard against exploding gradients
                d2l.sgd(params, lr)
            l_sum += l.item() * y.size(0)
            n += y.size(0)
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state, num_hiddens,
                    vocab_size, device, idx_to_char, char_to_idx))
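# `to_onehot`, `grad_clipping` and `predict_rnn` come from elsewhere in this
# repo (MyD2l / the chapter scripts) and are not defined here. As a point of
# reference, a minimal sketch of what `grad_clipping` is assumed to do --
# rescale all parameter gradients so that their global L2 norm is at most
# `theta` -- could look like this:
def grad_clipping_sketch(params, theta, device):
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)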
def train_and_predict_rnn_nn(model, device, corpus_indices, idx_to_char,
                             char_to_idx, num_epochs, num_steps, lr,
                             clipping_theta, batch_size, pred_period,
                             pred_len, prefixes):
    loss = nn.CrossEntropyLoss()
    trainer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0, weight_decay=0)
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = d2l.data_iter_consecutive(corpus_indices, batch_size,
                                              num_steps, device)
        state = model.begin_state(batch_size=batch_size, device=device)
        for X, Y in data_iter:
            # detach the hidden state from the previous minibatch's graph;
            # LSTM states are (h, c) tuples, GRU/RNN states are single tensors
            if isinstance(state, tuple):
                for s in state:
                    s.detach_()
            else:
                state.detach_()
            trainer.zero_grad()
            output, state = model(X.type(torch.long), state)
            y = Y.t().reshape(-1,).type(torch.long)
            l = loss(output, y)
            l.backward()
            # params = [p for p in model.parameters()]
            # d2l.grad_clipping(params, clipping_theta, device)
            nn.utils.clip_grad_norm_(model.parameters(), clipping_theta)
            trainer.step()
            l_sum += l.item() * y.size(0)
            n += y.size(0)
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_nn(prefix, pred_len, model, device,
                                           idx_to_char, char_to_idx))
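# `predict_rnn_nn` is also defined elsewhere in the repo. A hedged sketch of
# what it is assumed to do, based on how it is called above: feed the prefix
# through the model one character at a time, then generate `pred_len` more
# characters greedily (argmax) and return the concatenated string.
def predict_rnn_nn_sketch(prefix, pred_len, model, device, idx_to_char, char_to_idx):
    state = model.begin_state(batch_size=1, device=device)
    output = [char_to_idx[prefix[0]]]
    with torch.no_grad():
        for t in range(pred_len + len(prefix) - 1):
            X = torch.tensor([[output[-1]]], dtype=torch.long, device=device)
            Y, state = model(X, state)
            if t < len(prefix) - 1:
                output.append(char_to_idx[prefix[t + 1]])  # still consuming the prefix
            else:
                output.append(int(Y.argmax(dim=1).item()))  # greedy decoding
    return ''.join(idx_to_char[i] for i in output)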
def train_fine_tuning(net, learning_rate, batch_size=128, num_epochs=5):
    net = net.to('cuda')
    train_imgs.transform = train_augs
    train_iter = DataLoader(train_imgs, batch_size, shuffle=True)
    test_imgs.transform = test_augs
    test_iter = DataLoader(test_imgs, batch_size)
    # `other` holds the pretrained parameters, `output` the parameters of the
    # newly added output layer, which is trained with a 10x learning rate
    trainer = torch.optim.SGD([{'params': other, 'lr': learning_rate},
                               {'params': output, 'lr': learning_rate * 10}],
                              lr=learning_rate, weight_decay=0.1)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs, device='cuda')
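# `train_imgs`, `test_imgs`, `train_augs`, `test_augs`, `other` and `output`
# are module-level globals that are not shown in this snippet. A hypothetical
# sketch of how the two parameter groups might be built from a torchvision
# ResNet (the model, the `fc` attribute split and the 2-class output layer are
# assumptions for illustration, not taken from this file):
from torch import nn
from torchvision import models

pretrained_net = models.resnet18(pretrained=True)
pretrained_net.fc = nn.Linear(pretrained_net.fc.in_features, 2)  # new output layer
output = list(pretrained_net.fc.parameters())
other = [p for name, p in pretrained_net.named_parameters()
         if not name.startswith('fc')]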
def train_ch5(net, train_iter, test_iter, trainer, num_epochs, device=None):
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time()
        for X, y in train_iter:
            if device is not None:
                X = X.to(device)
                y = y.to(device)
            trainer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y.type(torch.long))
            l.backward()
            trainer.step()
            # weight the (mean) batch loss by the batch size so that
            # train_l_sum / n is the average per-sample loss
            train_l_sum += l.item() * y.size(0)
            train_acc_sum += (y_hat.argmax(dim=1) == y.type(torch.long)).sum().item()
            n += y.size(0)
        test_acc = d2l.evaluate_accuracy(test_iter, net, device=device)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
                 time() - start))
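# `d2l.evaluate_accuracy` (with a `device` argument) lives in MyD2l, not in
# this snippet. A minimal sketch of the assumed behavior: compute the
# classification accuracy over an iterator, moving data to `device` and
# switching the net to eval mode while measuring.
def evaluate_accuracy_sketch(data_iter, net, device=None):
    acc_sum, n = 0.0, 0
    net.eval()  # disable dropout / use running batch-norm statistics
    with torch.no_grad():
        for X, y in data_iter:
            if device is not None:
                X, y = X.to(device), y.to(device)
            acc_sum += (net(X).argmax(dim=1) == y.type(torch.long)).sum().item()
            n += y.size(0)
    net.train()
    return acc_sum / n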
    def forward(self, inputs, state):
        # inputs: (batch_size, num_steps) token indices; transpose to
        # (num_steps, batch_size) and convert to one-hot vectors
        X = d2l.one_hot2(inputs.t(), self.vocab_size)
        Y, state = self.rnn(X, state)
        # flatten the time and batch dimensions, then project every hidden
        # state to vocabulary logits: (num_steps * batch_size, vocab_size)
        output = self.linear(Y.view(-1, Y.shape[-1]))
        return output, state
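# `d2l.one_hot2` is assumed to turn a (num_steps, batch_size) tensor of token
# indices into a (num_steps, batch_size, vocab_size) float one-hot tensor that
# nn.RNN / nn.GRU / nn.LSTM can consume. A minimal sketch of that assumption:
def one_hot2_sketch(indices, size):
    result = torch.zeros(*indices.shape, size,
                         dtype=torch.float, device=indices.device)
    return result.scatter_(2, indices.unsqueeze(-1), 1.0)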
    def begin_state(self, batch_size, device=torch.device('cpu'), num_state=1):
        # initial (h, c) state for the LSTM, one layer by default
        return (torch.rand(num_state, batch_size, self.num_hiddens, device=device),
                torch.rand(num_state, batch_size, self.num_hiddens, device=device))

    def forward(self, inputs, state):
        X = d2l.one_hot2(inputs.t(), self.vocab_size)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.view(-1, Y.shape[-1]))
        return output, state


if __name__ == '__main__':
    corpus_indices, char_to_idx, idx_to_char, vocab_size = d2l.load_data_jay_lyrics()
    num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
    pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']
    num_hiddens, device = 256, torch.device('cuda')
    lstm_layer = nn.LSTM(vocab_size, num_hiddens)
    model = RNNModel(lstm_layer, num_hiddens, vocab_size).to(device)
    d2l.train_and_predict_rnn_nn(model, device, corpus_indices, idx_to_char,
                                 char_to_idx, num_epochs, num_steps, lr,
                                 clipping_theta, batch_size, pred_period,
                                 pred_len, prefixes)
                grad_clipping(params, clipping_theta, device)  # guard against exploding gradients
                d2l.sgd(params, lr)
            l_sum += l.item() * y.size(0)
            n += y.size(0)
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state, num_hiddens,
                    vocab_size, device, idx_to_char, char_to_idx))


if __name__ == '__main__':
    corpus_indices, char_to_idx, idx_to_char, vocab_size = d2l.load_data_jay_lyrics()
    # one_hot(torch.LongTensor([0, 2]), vocab_size)
    # print(one_hot)
    # X = torch.arange(10).view(2, 5).cuda()
    # print(X)
    # inputs = to_onehot(X, vocab_size)
    # print(len(inputs), inputs[0].shape)
    # for input in inputs:
    #     print(input)
    # params = get_params(vocab_size, 256, vocab_size)
    # state = init_rnn_state(X.shape[0], 256, torch.device('cuda'))
    # inputs = to_onehot(X, vocab_size)
    # outputs, state_new = rnn(inputs, state, params)
    # print(len(outputs), outputs[0].shape, state_new[0].shape)
class NLeNet(nn.Module):
    def __init__(self, X_shape, in_channels=1):
        super().__init__()
        # run a dummy input through the convolutional part to infer the
        # flattened feature size for the first fully connected layer
        X_test = torch.rand(1, in_channels, *X_shape)
        self.conv_part = nn.Sequential(
            nn.Conv2d(in_channels, 6, kernel_size=5), nn.BatchNorm2d(6),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5), nn.BatchNorm2d(16),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2))
        X_test = self.conv_part(X_test)
        self.flatten = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
        self.linear_part = nn.Sequential(nn.Linear(self.flatten, 120),
                                         nn.BatchNorm1d(120), nn.Sigmoid(),
                                         nn.Linear(120, 84),
                                         nn.BatchNorm1d(84), nn.Sigmoid(),
                                         nn.Linear(84, 10))

    def forward(self, X):
        X = self.conv_part(X)
        return self.linear_part(X.view(-1, self.flatten))


if __name__ == '__main__':
    lr, num_epochs, batch_size, device = 5.0, 5, 256, torch.device("cuda")
    net = NLeNet((28, 28)).to(device)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs, device)
import torch
from torch import nn
import MyD2l as d2l

num_inputs, num_outputs, num_hiddens = 784, 10, 256
W1 = torch.normal(mean=torch.zeros(size=(num_inputs, num_hiddens)), std=0.01)
b1 = torch.zeros(num_hiddens)
W2 = torch.normal(mean=torch.zeros(size=(num_hiddens, num_outputs)), std=0.01)
b2 = torch.zeros(num_outputs)

loss = nn.CrossEntropyLoss()


def relu(X):
    return X.clamp(min=0)


def net(X):
    H = relu(torch.mm(X.view(-1, num_inputs), W1) + b1)
    return torch.mm(H, W2) + b2


if __name__ == '__main__':
    params = [W1, b1, W2, b2]
    for param in params:
        param.requires_grad_(True)
    train_iter, test_iter = d2l.data_load()
    num_epochs, lr = 5, 0.5
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, 256, params, lr)
        y_pick = torch.cat((y_pick, (y == j).view(-1, 1)), 1)
    return -torch.masked_select(y_hat, y_pick).log()


def accuracy(y_hat, y):
    acc = y_hat.argmax(dim=1) == y.type(torch.long)
    acc = acc.type(torch.float)
    return acc.mean().item()


if __name__ == '__main__':
    # y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
    # y = torch.tensor([0, 2], dtype=torch.int)
    # print(evaluate_accuracy(y_hat, y))
    num_epochs, lr = 5, 0.1
    train_iter, test_iter = d2l.data_load()
    now = time()
    d2l.train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs,
                  [W, b], lr)
    print("Elapsed time: %.4f sec" % (time() - now))
    for X, y in test_iter:
        break
    true_labels = d2l.get_fashion_mnist_labels(y.numpy())
    pred_labels = d2l.get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
    # true label on the first line, predicted label on the second
    titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]
    d2l.show_fashion_mnist(X[0:9], titles[0:9])
    plt.show()
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
b5 = nn.Sequential(
    Inception(832, 256, (160, 320), (32, 128), 128),
    Inception(832, 384, (192, 384), (48, 128), 128),
    nn.AdaptiveAvgPool2d(1)
)


class Flatten(nn.Module):
    def forward(self, X):
        flat = X.shape[1] * X.shape[2] * X.shape[3]
        return X.view(-1, flat)


if __name__ == '__main__':
    # X = torch.rand(1, 1, 96, 96)
    # for layer in net.children():
    #     X = layer(X)
    #     print('output shape:\t', X.shape, '\n')
    lr, num_epochs, batch_size, device = 0.1, 5, 128, torch.device('cuda')
    net = nn.Sequential(b1, b2, b3, b4, b5, Flatten(),
                        nn.Linear(1024, 10)).to(device)
    d2l.initial(net)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs, device)
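# `d2l.initial` is not defined in this snippet. Judging from the explicit
# initialization loop used in the LeNet script below, it is assumed to
# Xavier-initialize every Linear / Conv2d layer, roughly like this sketch:
def initial_sketch(net):
    for layer in net.modules():
        if isinstance(layer, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)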
            X = X.to(device)
            y = y.to(device)
            trainer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y.type(torch.long))
            l.backward()
            trainer.step()
            # weight the (mean) batch loss by the batch size so that
            # train_l_sum / n is the average per-sample loss
            train_l_sum += l.item() * y.size(0)
            train_acc_sum += (y_hat.argmax(dim=1) == y.type(torch.long)).sum().item()
            n += y.size(0)
        test_acc = d2l.evaluate_accuracy(test_iter, net, device=device)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
                 time() - start))


if __name__ == '__main__':
    train_iter, test_iter = d2l.data_load()
    device = torch.device('cuda')
    net = LeNet().to(device)
    for layer in net.modules():
        if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
            print('Initializing', layer)
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
    trainer = torch.optim.SGD(net.parameters(), lr=0.9)
    train_ch5(net, train_iter, test_iter, trainer, 5, device)
            torch.zeros(batch_size, num_hiddens, device=device))


def lstm(inputs, state, params):
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
     W_xc, W_hc, b_c, W_hq, b_q] = params
    (H, C) = state
    outputs = []
    for X in inputs:
        I = torch.sigmoid(torch.mm(X, W_xi) + torch.mm(H, W_hi) + b_i)  # input gate
        F = torch.sigmoid(torch.mm(X, W_xf) + torch.mm(H, W_hf) + b_f)  # forget gate
        O = torch.sigmoid(torch.mm(X, W_xo) + torch.mm(H, W_ho) + b_o)  # output gate
        C_tilda = torch.tanh(torch.mm(X, W_xc) + torch.mm(H, W_hc) + b_c)  # candidate cell
        C = F * C + I * C_tilda   # new memory cell
        H = O * C.tanh()          # new hidden state
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, C)


if __name__ == '__main__':
    corpus_indices, char_to_idx, idx_to_char, vocab_size = d2l.load_data_jay_lyrics()
    num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
    pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']
    num_hiddens, device = 256, torch.device('cuda')
    d2l.train_and_predict_rnn(lstm, get_params, init_lstm_state, num_hiddens,
                              vocab_size, device, corpus_indices, idx_to_char,
                              char_to_idx, False, num_epochs, num_steps, lr,
                              clipping_theta, batch_size, pred_period,
                              pred_len, prefixes)
        # run a dummy input through the convolutional part to infer the
        # flattened feature size for the first fully connected layer
        X_test = torch.rand(1, in_channels, *in_shape)
        self.conv_part = nn.Sequential(
            nn.Conv2d(in_channels, 6, kernel_size=5), BatchNorm(6, num_dims=4),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5), BatchNorm(16, num_dims=4),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2))
        X_test = self.conv_part(X_test)
        self.flatten = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
        self.linear_part = nn.Sequential(nn.Linear(self.flatten, 120),
                                         BatchNorm(120, num_dims=2), nn.Sigmoid(),
                                         nn.Linear(120, 84),
                                         BatchNorm(84, num_dims=2), nn.Sigmoid(),
                                         nn.Linear(84, 10))

    def forward(self, X):
        X = self.conv_part(X)
        return self.linear_part(X.view(-1, self.flatten))


if __name__ == '__main__':
    lr, num_epochs, batch_size, device = 1.0, 5, 256, torch.device("cuda")
    net = NLeNet((28, 28))
    d2l.initial(net)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs)
    # print the learned scale and shift of the first custom BatchNorm layer
    for layer in net.modules():
        if isinstance(layer, BatchNorm):
            print(layer.gamma.view(-1, ), layer.beta.view(-1, ), sep='\n')
            break
import torch
from torch import nn
import MyD2l as d2l

num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 1000, 1000
drop_prob1, drop_prob2 = 0.2, 0.5
num_epochs, lr, batch_size = 5, 0.5, 256

net = nn.Sequential(nn.Linear(num_inputs, num_hiddens1), nn.ReLU(),
                    nn.Dropout(drop_prob1),
                    nn.Linear(num_hiddens1, num_hiddens2), nn.ReLU(),
                    nn.Dropout(drop_prob2),
                    nn.Linear(num_hiddens2, num_outputs))

if __name__ == '__main__':
    trainer = torch.optim.SGD(net.parameters(), lr, weight_decay=0.01)
    loss = nn.CrossEntropyLoss()
    train_iter, test_iter = d2l.data_load()
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs,
                  trainer=trainer, input_num=num_inputs)