def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    # select how to read the data (random vs. consecutive sampling)
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params(vocab_size, num_hiddens, vocab_size)
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        # with consecutive sampling, initialize the hidden state once at the start of the epoch
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:  # random sampling: re-initialize the hidden state before every minibatch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:  # consecutive sampling: detach the hidden state from the graph, otherwise backprop fails because the previous graph has already been freed
                for s in state:
                    s.detach_()

            X = X.type(torch.long)
            Y = Y.type(torch.long)
            inputs = to_onehot(X, vocab_size)  # convert to a list of one-hot tensors, one per time step
            outputs, state = rnn(inputs, state, params)  # compute the outputs and the new hidden state
            outputs = torch.cat(outputs, dim=0)  # one output tensor per time step; concatenate them into a single tensor
            y = Y.t().reshape(-1,)
            l = loss(outputs, y)

            l.backward()
            with torch.no_grad():
                grad_clipping(params, clipping_theta, device)  # clip gradients to guard against explosion
                d2l.sgd(params, lr)
            l_sum += l.item() * y.size(0)
            n += y.size(0)

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, device, idx_to_char, char_to_idx))
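# The trainer above calls three helpers that are not shown in this listing:
# to_onehot, init_rnn_state and grad_clipping. The sketch below is only an
# assumption of what they look like, consistent with how they are called here;
# the actual MyD2l definitions may differ.
def to_onehot(X, n_class):
    # (batch_size, num_steps) index tensor -> list of num_steps one-hot
    # tensors of shape (batch_size, n_class), one per time step
    return [torch.nn.functional.one_hot(X[:, i], n_class).float()
            for i in range(X.shape[1])]

def init_rnn_state(batch_size, num_hiddens, device):
    # a single hidden state, wrapped in a tuple so the detach loop above works
    return (torch.zeros(batch_size, num_hiddens, device=device),)

def grad_clipping(params, theta, device):
    # rescale all gradients so that their global L2 norm does not exceed theta
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)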
def train_and_predict_rnn_nn(model, device, corpus_indices, idx_to_char,
                             char_to_idx, num_epochs, num_steps, lr, clipping_theta,
                             batch_size, pred_period, pred_len, prefixes):
    loss = nn.CrossEntropyLoss()
    trainer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0, weight_decay=0)

    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = d2l.data_iter_consecutive(corpus_indices, batch_size, num_steps, device)
        state = model.begin_state(batch_size=batch_size, device=device)
        for X, Y in data_iter:
            # detach the hidden state from the previous minibatch's graph;
            # nn.LSTM states are (h, c) tuples, nn.RNN/nn.GRU states are tensors
            if isinstance(state, tuple):
                for s in state:
                    s.detach_()
            else:
                state.detach_()

            output, state = model(X.type(torch.long), state)
            y = Y.t().reshape(-1,).type(torch.long)
            l = loss(output, y)
            l.backward()
            # params = [p for p in model.parameters()]
            # d2l.grad_clipping(params, clipping_theta, device)
            nn.utils.clip_grad_norm_(model.parameters(), clipping_theta)
            trainer.step()
            trainer.zero_grad()

            l_sum += l.item() * y.size(0)
            n += y.size(0)

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_nn(prefix, pred_len, model, device, idx_to_char, char_to_idx))
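# predict_rnn_nn is referenced above but not defined in this listing. A hedged
# sketch consistent with the RNNModel interface of Example #6 (begin_state plus
# a forward that takes a (batch, steps) index tensor); not the original code.
def predict_rnn_nn(prefix, pred_len, model, device, idx_to_char, char_to_idx):
    state = model.begin_state(batch_size=1, device=device)
    output = [char_to_idx[prefix[0]]]
    for t in range(pred_len + len(prefix) - 1):
        X = torch.tensor([[output[-1]]], device=device)  # shape (1, 1)
        Y, state = model(X, state)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])    # teacher-force the prefix
        else:
            output.append(int(Y.argmax(dim=1).item()))   # greedy decoding
    return ''.join(idx_to_char[i] for i in output)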
def train_fine_tuning(net, learning_rate, batch_size=128, num_epochs=5):
    net = net.to('cuda')
    train_imgs.transform = train_augs
    train_iter = DataLoader(train_imgs, batch_size, shuffle=True)
    test_imgs.transform = test_augs
    test_iter = DataLoader(test_imgs, batch_size)
    trainer = torch.optim.SGD([{
        'params': other,
        'lr': learning_rate
    }, {
        'params': output,
        'lr': learning_rate * 10
    }],
                              lr=learning_rate,
                              weight_decay=0.1)
    d2l.train_ch5(net,
                  train_iter,
                  test_iter,
                  trainer,
                  num_epochs,
                  device='cuda')
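# The parameter groups `other` and `output` above are built outside this
# function. A hypothetical setup, assuming a torchvision ResNet-18 whose final
# fc layer was replaced for a 2-class task (illustrative, not the original):
import torchvision

pretrained_net = torchvision.models.resnet18(pretrained=True)
pretrained_net.fc = nn.Linear(pretrained_net.fc.in_features, 2)  # new output layer
output = list(pretrained_net.fc.parameters())  # trained with the 10x learning rate
output_ids = set(map(id, output))
other = [p for p in pretrained_net.parameters() if id(p) not in output_ids]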
def train_ch5(net, train_iter, test_iter, trainer, num_epochs, device=None):
    loss = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time()
        for X, y in train_iter:
            if device is not None:
                X = X.to(device)
                y = y.to(device)
            trainer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y.type(torch.long))
            l.backward()
            trainer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y.type(torch.long)).sum().item()
            n += y.size(0)
        test_acc = d2l.evaluate_accuracy(test_iter, net, device=device)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
                 time() - start))
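# d2l.evaluate_accuracy is not included in this listing; a minimal sketch
# consistent with the call above (the real MyD2l version may differ):
def evaluate_accuracy(data_iter, net, device=None):
    acc_sum, n = 0.0, 0
    net.eval()  # disable dropout, use running batch-norm statistics
    with torch.no_grad():
        for X, y in data_iter:
            if device is not None:
                X, y = X.to(device), y.to(device)
            acc_sum += (net(X).argmax(dim=1) == y.type(torch.long)).sum().item()
            n += y.size(0)
    net.train()
    return acc_sum / n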
Example #5
    def forward(self, inputs, state):
        X = d2l.one_hot2(inputs.t(), self.vocab_size)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.view(-1, Y.shape[-1]))
        return output, state
Example #6
    def begin_state(self, batch_size, device=torch.device('cpu'), num_state=1):
        return (torch.rand(num_state,
                           batch_size,
                           self.num_hiddens,
                           device=device),
                torch.rand(num_state,
                           batch_size,
                           self.num_hiddens,
                           device=device))

    def forward(self, inputs, state):
        X = d2l.one_hot2(inputs.t(), self.vocab_size)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.view(-1, Y.shape[-1]))
        return output, state
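# The class these methods belong to is not shown. A minimal __init__ sketch,
# assumed from the attributes used above and the RNNModel(...) call in
# __main__ below; not the original definition.
class RNNModel(nn.Module):
    def __init__(self, rnn_layer, num_hiddens, vocab_size):
        super().__init__()
        self.rnn = rnn_layer                              # e.g. nn.LSTM(vocab_size, num_hiddens)
        self.num_hiddens = num_hiddens
        self.vocab_size = vocab_size
        self.linear = nn.Linear(num_hiddens, vocab_size)  # hidden state -> vocab logits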


if __name__ == '__main__':
    corpus_indices, char_to_idx, idx_to_char, vocab_size = d2l.load_data_jay_lyrics()
    num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
    pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']
    num_hiddens, device = 256, torch.device('cuda')

    lstm_layer = nn.LSTM(vocab_size, num_hiddens)
    model = RNNModel(lstm_layer, num_hiddens, vocab_size).to(device)
    d2l.train_and_predict_rnn_nn(model, device, corpus_indices, idx_to_char,
                                 char_to_idx, num_epochs, num_steps, lr,
                                 clipping_theta, batch_size, pred_period,
                                 pred_len, prefixes)
                grad_clipping(params, clipping_theta, device)  # clip gradients to guard against explosion
                d2l.sgd(params, lr)
            l_sum += l.item() * y.size(0)
            n += y.size(0)

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, device, idx_to_char, char_to_idx))


if __name__ == '__main__':
    corpus_indices, char_to_idx, idx_to_char, vocab_size = d2l.load_data_jay_lyrics()
    # one_hot(torch.LongTensor([0, 2]), vocab_size)
    # print(one_hot)

    # X = torch.arange(10).view(2, 5).cuda()
    # print(X)
    # inputs = to_onehot(X, vocab_size)
    # print(len(inputs), inputs[0].shape)
    # for input in inputs:
    #     print(input)
    # params = get_params(vocab_size, 256, vocab_size)

    # state = init_rnn_state(X.shape[0], 256, torch.device('cuda'))
    # inputs = to_onehot(X, vocab_size)
    # outputs, state_new = rnn(inputs, state, params)
    # print(len(outputs), outputs[0].shape, state_new[0].shape)
Example #8

class NLeNet(nn.Module):
    def __init__(self, X_shape, in_channels=1):
        super().__init__()
        X_test = torch.rand(1, in_channels, *X_shape)
        self.conv_part = nn.Sequential(
            nn.Conv2d(in_channels, 6, kernel_size=5), nn.BatchNorm2d(6),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5), nn.BatchNorm2d(16), nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        X_test = self.conv_part(X_test)
        self.flatten = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
        self.linear_part = nn.Sequential(nn.Linear(self.flatten, 120),
                                         nn.BatchNorm1d(120), nn.Sigmoid(),
                                         nn.Linear(120, 84),
                                         nn.BatchNorm1d(84), nn.Sigmoid(),
                                         nn.Linear(84, 10))

    def forward(self, X):
        X = self.conv_part(X)
        return self.linear_part(X.view(-1, self.flatten))


if __name__ == '__main__':
    lr, num_epochs, batch_size, device = 5.0, 5, 256, torch.device("cuda")
    net = NLeNet((28, 28)).to(device)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs, device)
from torch import nn

import MyD2l as d2l

num_inputs, num_outputs, num_hiddens = 784, 10, 256

W1 = torch.normal(mean=torch.zeros(size=(num_inputs, num_hiddens)), std=0.01)
b1 = torch.zeros(num_hiddens)
W2 = torch.normal(mean=torch.zeros(size=(num_hiddens, num_outputs)), std=0.01)
b2 = torch.zeros(num_outputs)

loss = nn.CrossEntropyLoss()


def relu(X):
    return X.clamp(min=0)


def net(X):
    H = relu(torch.mm(X.view(-1, num_inputs), W1) + b1)
    return torch.mm(H, W2) + b2


if __name__ == '__main__':
    params = [W1, b1, W2, b2]
    for param in params:
        param.requires_grad_(True)
    train_iter, test_iter = d2l.data_load()
    num_epochs, lr = 5, 0.5
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, 256, params,
                  lr)
        y_pick = torch.cat((y_pick, (y == j).view(-1, 1)), 1)
    return -torch.masked_select(y_hat, y_pick).log()


def accuracy(y_hat, y):
    acc = y_hat.argmax(dim=1) == y.type(torch.long)
    acc = acc.type(torch.float)
    return acc.mean().item()


if __name__ == '__main__':
    # y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
    # y = torch.tensor([0, 2], dtype=torch.int)
    # print(evaluate_accuracy(y_hat, y))
    num_epochs, lr = 5, 0.1
    train_iter, test_iter = d2l.data_load()
    now = time()
    d2l.train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs,
                  [W, b], lr)
    print("所需时间 %.4f" % (time() - now))
    for X, y in test_iter:
        break
    true_labels = d2l.get_fashion_mnist_labels(y.numpy())
    pred_labels = d2l.get_fashion_mnist_labels(net(X).argmax(dim=1).numpy())
    # true label on the first line, predicted label on the second
    titles = [
        true + '\n' + pred for true, pred in zip(true_labels, pred_labels)
    ]
    d2l.show_fashion_mnist(X[0:9], titles[0:9])
    plt.show()
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)

b5 = nn.Sequential(
    Inception(832, 256, (160, 320), (32, 128), 128),
    Inception(832, 384, (192, 384), (48, 128), 128),
    nn.AdaptiveAvgPool2d(1)
)
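# The Inception block used by b5 (and the truncated b4 above) is defined
# elsewhere. A hedged sketch of the standard GoogLeNet block matching the
# (in_c, c1, (c2_1, c2_2), (c3_1, c3_2), c4) signature used here; the output
# channels add up to 832 and 1024, matching the final nn.Linear(1024, 10) below.
class Inception(nn.Module):
    # four parallel paths: 1x1, 1x1 -> 3x3, 1x1 -> 5x5, 3x3 max-pool -> 1x1
    def __init__(self, in_c, c1, c2, c3, c4):
        super().__init__()
        self.p1_1 = nn.Conv2d(in_c, c1, kernel_size=1)
        self.p2_1 = nn.Conv2d(in_c, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        self.p3_1 = nn.Conv2d(in_c, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_c, c4, kernel_size=1)

    def forward(self, x):
        p1 = torch.relu(self.p1_1(x))
        p2 = torch.relu(self.p2_2(torch.relu(self.p2_1(x))))
        p3 = torch.relu(self.p3_2(torch.relu(self.p3_1(x))))
        p4 = torch.relu(self.p4_2(self.p4_1(x)))
        return torch.cat((p1, p2, p3, p4), dim=1)  # concatenate along channel dim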


class Flatten(nn.Module):
    def forward(self, X):
        flat = X.shape[1] * X.shape[2] * X.shape[3]
        return X.view(-1, flat)


if __name__ == '__main__':
    # X = torch.rand(1, 1, 96, 96)
    # for layer in net.children():
    #     X = layer(X)
    #     print('output shape:\t', X.shape, '\n')

    lr, num_epochs, batch_size, device = 0.1, 5, 128, torch.device('cuda')
    net = nn.Sequential(b1, b2, b3, b4, b5, Flatten(), nn.Linear(1024, 10)).to(device)
    d2l.initial(net)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs, device)


                X = X.to(device)
                y = y.to(device)
            trainer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y.type(torch.long))
            l.backward()
            trainer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y.type(torch.long)).sum().item()
            n += y.size(0)
        test_acc = d2l.evaluate_accuracy(test_iter, net, device=device)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
                 time() - start))


if __name__ == '__main__':
    train_iter, test_iter = d2l.data_load()
    device = torch.device('cuda')

    net = LeNet().to(device)
    for layer in net.modules():
        if isinstance(layer, nn.Linear) or isinstance(layer, nn.Conv2d):
            print('initializing', layer)
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    trainer = torch.optim.SGD(net.parameters(), lr=0.9)
    train_ch5(net, train_iter, test_iter, trainer, 5, device)

Example #13
            torch.zeros(batch_size, num_hiddens, device=device))


def lstm(inputs, state, params):
    W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c, W_hq, b_q = params
    (H, C) = state
    outputs = []
    for X in inputs:
        I = torch.sigmoid(torch.mm(X, W_xi) + torch.mm(H, W_hi) + b_i)
        F = torch.sigmoid(torch.mm(X, W_xf) + torch.mm(H, W_hf) + b_f)
        O = torch.sigmoid(torch.mm(X, W_xo) + torch.mm(H, W_ho) + b_o)
        C_tilda = torch.tanh(torch.mm(X, W_xc) + torch.mm(H, W_hc) + b_c)
        C = F * C + I * C_tilda
        H = O * C.tanh()
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, C)
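# get_params and init_lstm_state are passed to the trainer below, but only the
# tail of init_lstm_state survives in this listing (the torch.zeros fragment
# above). A hedged sketch of get_params matching the parameter order lstm()
# unpacks; `device` is assumed to be a module-level variable, as set in
# __main__ below before training starts.
def get_params(num_inputs, num_hiddens, num_outputs):
    def _one(shape):
        return torch.nn.Parameter(torch.randn(shape, device=device) * 0.01)

    def _three():
        return (_one((num_inputs, num_hiddens)),
                _one((num_hiddens, num_hiddens)),
                torch.nn.Parameter(torch.zeros(num_hiddens, device=device)))

    W_xi, W_hi, b_i = _three()  # input gate
    W_xf, W_hf, b_f = _three()  # forget gate
    W_xo, W_ho, b_o = _three()  # output gate
    W_xc, W_hc, b_c = _three()  # candidate cell state
    W_hq = _one((num_hiddens, num_outputs))  # output layer weight
    b_q = torch.nn.Parameter(torch.zeros(num_outputs, device=device))  # output layer bias
    return [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o,
            W_xc, W_hc, b_c, W_hq, b_q]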


if __name__ == '__main__':
    corpus_indices, char_to_idx, idx_to_char, vocab_size = d2l.load_data_jay_lyrics()
    num_epochs, num_steps, batch_size, lr, clipping_theta = 160, 35, 32, 1e2, 1e-2
    pred_period, pred_len, prefixes = 40, 50, ['分开', '不分开']
    num_hiddens, device = 256, torch.device('cuda')

    d2l.train_and_predict_rnn(lstm, get_params, init_lstm_state, num_hiddens,
                              vocab_size, device, corpus_indices, idx_to_char,
                              char_to_idx, False, num_epochs, num_steps, lr,
                              clipping_theta, batch_size, pred_period,
                              pred_len, prefixes)
Example #14
        X_test = torch.rand(1, in_channels, *in_shape)
        self.conv_part = nn.Sequential(
            nn.Conv2d(in_channels, 6, kernel_size=5), BatchNorm(6, num_dims=4),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5), BatchNorm(16, num_dims=4),
            nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2))
        X_test = self.conv_part(X_test)
        self.flatten = X_test.shape[1] * X_test.shape[2] * X_test.shape[3]
        self.linear_part = nn.Sequential(nn.Linear(self.flatten, 120),
                                         BatchNorm(120, num_dims=2),
                                         nn.Sigmoid(), nn.Linear(120, 84),
                                         BatchNorm(84, num_dims=2),
                                         nn.Sigmoid(), nn.Linear(84, 10))

    def forward(self, X):
        X = self.conv_part(X)
        return self.linear_part(X.view(-1, self.flatten))


if __name__ == '__main__':
    lr, num_epochs, batch_size, device = 1.0, 5, 256, torch.device("cuda")
    net = NLeNet((28, 28))
    d2l.initial(net)
    trainer = torch.optim.SGD(net.parameters(), lr=lr)
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, trainer, num_epochs)
    for layer in net.modules():
        if isinstance(layer, BatchNorm):
            print(layer.gamma.view(-1, ), layer.beta.view(-1, ), sep='\n')
            break
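# The from-scratch BatchNorm layer used in this NLeNet variant is not included
# in the listing; the loop above only reads its gamma and beta. A minimal
# self-contained sketch with that interface; the original MyD2l implementation
# may track running statistics differently.
class BatchNorm(nn.Module):
    def __init__(self, num_features, num_dims):
        super().__init__()
        # (1, C) for fully connected inputs, (1, C, 1, 1) for conv inputs
        shape = (1, num_features) if num_dims == 2 else (1, num_features, 1, 1)
        self.gamma = nn.Parameter(torch.ones(shape))   # learnable scale
        self.beta = nn.Parameter(torch.zeros(shape))   # learnable shift
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.ones(shape))

    def forward(self, X):
        eps, momentum = 1e-5, 0.9
        if not self.training:
            X_hat = (X - self.moving_mean) / torch.sqrt(self.moving_var + eps)
        else:
            dims = (0,) if X.dim() == 2 else (0, 2, 3)
            mean = X.mean(dim=dims, keepdim=True)
            var = ((X - mean) ** 2).mean(dim=dims, keepdim=True)
            X_hat = (X - mean) / torch.sqrt(var + eps)
            self.moving_mean = momentum * self.moving_mean + (1 - momentum) * mean.detach()
            self.moving_var = momentum * self.moving_var + (1 - momentum) * var.detach()
        return self.gamma * X_hat + self.beta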
Example #15
import torch
from torch import nn
import MyD2l as d2l

num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 1000, 1000
drop_prob1, drop_prob2 = 0.2, 0.5
num_epochs, lr, batch_size = 5, 0.5, 256

net = nn.Sequential(nn.Linear(num_inputs, num_hiddens1), nn.ReLU(),
                    nn.Dropout(drop_prob1),
                    nn.Linear(num_hiddens1, num_hiddens2), nn.ReLU(),
                    nn.Dropout(drop_prob2), nn.Linear(num_hiddens2,
                                                      num_outputs))

if __name__ == '__main__':
    trainer = torch.optim.SGD(net.parameters(), lr, weight_decay=0.01)
    loss = nn.CrossEntropyLoss()
    train_iter, test_iter = d2l.data_load()
    d2l.train_ch3(net,
                  train_iter,
                  test_iter,
                  loss,
                  num_epochs,
                  trainer=trainer,
                  input_num=num_inputs)