def train(num_gpus, batch_size, lr):
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on:', ctx)
    net.initialize(init=init.Normal(sigma=0.01), ctx=ctx, force_reinit=True)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            gpu_Xs = gutils.split_and_load(X, ctx)
            gpu_ys = gutils.split_and_load(y, ctx)
            with autograd.record():
                ls = [loss(net(gpu_X), gpu_y)
                      for gpu_X, gpu_y in zip(gpu_Xs, gpu_ys)]
            for l in ls:
                l.backward()
            trainer.step(batch_size)
        nd.waitall()
        train_time = time.time() - start
        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test_acc %.2f'
              % (epoch + 1, train_time, test_acc))
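# Illustrative invocation of the Gluon multi-GPU routine above. It assumes a
# module-level `net` has already been built and that two GPUs are visible;
# the argument values are examples, not taken from the original source.
if __name__ == '__main__':
    train(num_gpus=2, batch_size=512, lr=0.3)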
def lenet():
    """Build LeNet, print each layer's output shape, and train it on Fashion-MNIST."""
    net = nn.Sequential()
    net.add(nn.Conv2D(channels=6, kernel_size=5, activation='sigmoid'),
            nn.MaxPool2D(pool_size=2, strides=2),
            nn.Conv2D(channels=16, kernel_size=5, activation='sigmoid'),
            nn.MaxPool2D(pool_size=2, strides=2),
            # Dense converts an input of shape (batch size, channels, height, width)
            # into one of shape (batch size, channels * height * width) by default
            nn.Dense(120, activation='sigmoid'),
            nn.Dense(84, activation='sigmoid'),
            nn.Dense(10))
    X = nd.random.uniform(shape=(1, 1, 28, 28))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)
    lr, num_epochs, batch_size = 0.9, 5, 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
    net.initialize(force_reinit=True, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, mx.cpu(),
                  num_epochs)
def basic_multilayer():
    """Multilayer perceptron implemented from scratch."""

    def relu(X):
        return nd.maximum(X, 0)

    def net(X):
        X = X.reshape((-1, num_inputs))
        H = relu(nd.dot(X, W1) + b1)
        return nd.dot(H, W2) + b2

    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    num_inputs, num_outputs, num_hiddens = 784, 10, 256
    W1 = nd.random.normal(scale=0.01, shape=(num_inputs, num_hiddens))
    b1 = nd.zeros(num_hiddens)
    W2 = nd.random.normal(scale=0.01, shape=(num_hiddens, num_outputs))
    b2 = nd.zeros(num_outputs)
    params = [W1, b1, W2, b2]
    for param in params:
        param.attach_grad()
    loss = gloss.SoftmaxCrossEntropyLoss()
    num_epochs, lr = 5, 0.5
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  params, lr)
def main():
    batch_size = 256
    # Download and load the dataset
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    num_inputs = 784
    num_outputs = 10
    W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
    b = nd.zeros(num_outputs)
    # Attach gradients, i.e. allocate the gradient buffers
    W.attach_grad()
    b.attach_grad()
    # Number of epochs and learning rate
    num_epochs, lr = 5, 0.1
    train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs,
              batch_size, num_inputs, W, b, [W, b], lr)
    for X, y in test_iter:
        print(X, y)
        break
    true_labels = d2l.get_fashion_mnist_labels(y.asnumpy())
    pred_labels = d2l.get_fashion_mnist_labels(
        net(X, num_inputs, W, b).argmax(axis=1).asnumpy())
    titles = [true + '\n' + pred
              for true, pred in zip(true_labels, pred_labels)]
    show_fashion_mnist(X[0:9], titles[0:9])
def densenet():
    """
    The main building blocks of DenseNet are dense blocks and transition
    layers. The former define how inputs and outputs are concatenated, while
    the latter control the number of channels so that it does not grow too
    large.
    """

    class DenseBlock(nn.Block):
        def __init__(self, num_convs, num_channels, **kwargs):
            super(DenseBlock, self).__init__(**kwargs)
            self.net = nn.Sequential()
            for _ in range(num_convs):
                self.net.add(conv_block(num_channels))

        def forward(self, X):
            for blk in self.net:
                Y = blk(X)
                # Concatenate input and output along the channel dimension
                X = nd.concat(X, Y, dim=1)
            return X

    def conv_block(num_channels):
        blk = nn.Sequential()
        blk.add(nn.BatchNorm(), nn.Activation('relu'),
                nn.Conv2D(num_channels, kernel_size=3, padding=1))
        return blk

    def transition_block(num_channels):
        blk = nn.Sequential()
        blk.add(nn.BatchNorm(), nn.Activation('relu'),
                nn.Conv2D(num_channels, kernel_size=1),
                nn.AvgPool2D(pool_size=2, strides=2))
        return blk

    net = nn.Sequential()
    net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
            nn.BatchNorm(), nn.Activation('relu'),
            nn.MaxPool2D(pool_size=3, strides=2, padding=1))
    num_channels, growth_rate = 64, 32  # num_channels is the current number of channels
    num_convs_in_dense_blocks = [4, 4, 4, 4]
    for i, num_convs in enumerate(num_convs_in_dense_blocks):
        net.add(DenseBlock(num_convs, growth_rate))
        # Number of output channels of the previous dense block
        num_channels += num_convs * growth_rate
        # Between dense blocks, add a transition layer that halves the number of channels
        if i != len(num_convs_in_dense_blocks) - 1:
            num_channels //= 2
            net.add(transition_block(num_channels))
    net.add(nn.BatchNorm(), nn.Activation('relu'), nn.GlobalAvgPool2D(),
            nn.Dense(10))
    lr, num_epochs, batch_size, ctx = 0.1, 5, 256, d2l.try_gpu()
    net.initialize(ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
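# Quick shape check of the DenseNet building blocks described in the docstring
# above. This sketch assumes DenseBlock, conv_block and transition_block are
# available at module scope (in the code above they are nested inside densenet()).
blk = DenseBlock(2, 10)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 8, 8))
Y = blk(X)
print(Y.shape)  # (4, 23, 8, 8): 3 input channels + 2 convolutions x 10 channels each
tblk = transition_block(10)
tblk.initialize()
print(tblk(Y).shape)  # (4, 10, 4, 4): channels cut to 10, height/width halved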
def generate(self):
    net = nn.Sequential()
    net.add(nn.Dense(256, activation='relu'), nn.Dense(10))
    net.initialize(init.Normal(sigma=0.01))
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    loss = gloss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.5})
    num_epochs = 5
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  None, None, trainer)
def method1():
    num_epochs, lr, batch_size = 5, 0.5, 256
    loss = gloss.SoftmaxCrossEntropyLoss()
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    net = nn.Sequential()
    net.add(nn.Dense(num_hiddens1, activation="relu"),
            nn.Dropout(drop_prob1),
            nn.Dense(num_hiddens2, activation="relu"),
            nn.Dropout(drop_prob2),
            nn.Dense(num_outputs))
    net.initialize(init.Normal(sigma=0.01))
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  None, None, trainer)
def main():
    batch_size = 256
    # Download and load the dataset
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    # Define and initialize the model
    net = nn.Sequential()
    net.add(nn.Dense(10))
    net.initialize(init.Normal(sigma=0.01))
    # Define the loss function
    loss = gloss.SoftmaxCrossEntropyLoss()
    # Define the optimization algorithm
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.01})
    num_epochs = 5
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  None, None, trainer)
def vgg():
    """Build VGG-11, then train a channel-reduced variant on Fashion-MNIST."""

    def vgg_block(num_convs, num_channels):
        blk = nn.Sequential()
        for _ in range(num_convs):
            blk.add(nn.Conv2D(num_channels, kernel_size=3, padding=1,
                              activation='relu'))
        blk.add(nn.MaxPool2D(pool_size=2, strides=2))
        return blk

    def vgg(conv_arch):
        net = nn.Sequential()
        # Convolutional part
        for (num_convs, num_channels) in conv_arch:
            net.add(vgg_block(num_convs, num_channels))
        # Fully connected part
        net.add(nn.Dense(4096, activation='relu'), nn.Dropout(0.5),
                nn.Dense(4096, activation='relu'), nn.Dropout(0.5),
                nn.Dense(10))
        return net

    conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))
    net = vgg(conv_arch)
    net.initialize()
    X = nd.random.uniform(shape=(1, 1, 224, 224))
    for blk in net:
        X = blk(X)
        print(blk.name, 'output shape:\t', X.shape)
    ratio = 4
    small_conv_arch = [(pair[0], pair[1] // ratio) for pair in conv_arch]
    net = vgg(small_conv_arch)
    lr, num_epochs, batch_size, ctx = 0.05, 5, 128, d2l.try_gpu()
    net.initialize(ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
def dropout_gluon():
    drop_prob1, drop_prob2, lr, batch_size, num_epochs = 0.2, 0.5, 0.1, 64, 50
    net = nn.Sequential()
    net.add(nn.Dense(256, activation="relu"),
            nn.Dropout(drop_prob1),  # Add a dropout layer after the first fully connected layer
            nn.Dense(256, activation="relu"),
            nn.Dropout(drop_prob2),  # Add a dropout layer after the second fully connected layer
            nn.Dense(10))
    net.initialize(init.Normal(sigma=0.01))
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    loss = gloss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  None, None, trainer)
def simple_multilayer():
    """Concise implementation of a multilayer perceptron with Gluon."""
    net = nn.Sequential()
    net.add(nn.Dense(256, activation='relu'), nn.Dense(10))
    net.add(gluon.nn.Dropout(0.2))
    net.initialize(init.Normal(sigma=0.01))
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    loss = gloss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.5})
    num_epochs = 10
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  None, None, trainer)
def nin():
    """Build the Network in Network (NiN) model and train it on Fashion-MNIST."""

    def nin_block(num_channels, kernel_size, strides, padding):
        blk = nn.Sequential()
        blk.add(nn.Conv2D(num_channels, kernel_size, strides, padding,
                          activation='relu'),
                nn.Conv2D(num_channels, kernel_size=1, activation='relu'),
                nn.Conv2D(num_channels, kernel_size=1, activation='relu'))
        return blk

    net = nn.Sequential()
    net.add(nin_block(96, kernel_size=11, strides=4, padding=0),
            nn.MaxPool2D(pool_size=3, strides=2),
            nin_block(256, kernel_size=5, strides=1, padding=2),
            nn.MaxPool2D(pool_size=3, strides=2),
            nin_block(384, kernel_size=3, strides=1, padding=1),
            nn.MaxPool2D(pool_size=3, strides=2), nn.Dropout(0.5),
            # There are 10 label classes
            nin_block(10, kernel_size=3, strides=1, padding=1),
            # The global average pooling layer automatically sets the window
            # shape to the height and width of its input
            nn.GlobalAvgPool2D(),
            # Convert the four-dimensional output into a two-dimensional
            # output of shape (batch size, 10)
            nn.Flatten())
    X = nd.random.uniform(shape=(1, 1, 224, 224))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)
    lr, num_epochs, batch_size, ctx = 0.1, 5, 128, d2l.try_gpu()
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
def train(num_gpus, batch_size, lr):
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    print('running on:', ctx)
    # Copy the model parameters to the memory of each of the num_gpus GPUs
    gpu_params = [get_params(params, c) for c in ctx]
    for epoch in range(4):
        start = time.time()
        for X, y in train_iter:
            # Multi-GPU training on a single mini-batch
            train_batch(X, y, gpu_params, ctx, lr)
            nd.waitall()
        train_time = time.time() - start

        def net(x):  # Evaluate the model on gpu(0)
            return lenet(x, gpu_params[0])

        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx[0])
        print('epoch %d, time %.1f sec, test_acc %.2f'
              % (epoch + 1, train_time, test_acc))
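# The training loop above relies on helpers (get_params, train_batch, lenet, loss)
# defined elsewhere in the original file. For context, here is a sketch of
# train_batch and allreduce along the lines of the d2l multi-GPU chapter; the
# author's exact versions may differ.
def allreduce(data):
    # Sum a parameter's gradients over all GPUs, then broadcast the sum back
    for i in range(1, len(data)):
        data[0][:] += data[i].copyto(data[0].context)
    for i in range(1, len(data)):
        data[0].copyto(data[i])


def train_batch(X, y, gpu_params, ctx, lr):
    # Split the mini-batch across GPUs and compute the loss on each device
    gpu_Xs, gpu_ys = gutils.split_and_load(X, ctx), gutils.split_and_load(y, ctx)
    with autograd.record():
        ls = [loss(lenet(gpu_X, gpu_W), gpu_y)
              for gpu_X, gpu_y, gpu_W in zip(gpu_Xs, gpu_ys, gpu_params)]
    for l in ls:
        l.backward()
    # Aggregate the gradient of each parameter over all GPUs
    for i in range(len(gpu_params[0])):
        allreduce([gpu_params[c][i].grad for c in range(len(ctx))])
    # Update the parameter copies on every GPU with mini-batch SGD
    for param in gpu_params:
        d2l.sgd(param, lr, X.shape[0])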
def batchnormalization_simple():
    """
    The BatchNorm class provided by Gluon's nn module is simpler to use: it
    does not need the num_features and num_dims arguments required by our own
    BatchNorm class, because in Gluon these values are obtained automatically
    through deferred initialization. Below we use Gluon to implement LeNet
    with batch normalization.
    """
    net = nn.Sequential()
    net.add(nn.Conv2D(6, kernel_size=5),
            nn.BatchNorm(),
            nn.Activation('sigmoid'),
            nn.MaxPool2D(pool_size=2, strides=2),
            nn.Conv2D(16, kernel_size=5),
            nn.BatchNorm(),
            nn.Activation('sigmoid'),
            nn.MaxPool2D(pool_size=2, strides=2),
            nn.Dense(120),
            nn.BatchNorm(),
            nn.Activation('sigmoid'),
            nn.Dense(84),
            nn.BatchNorm(),
            nn.Activation('sigmoid'),
            nn.Dense(10))
    lr, num_epochs, batch_size, ctx = 1.0, 5, 256, d2l.try_gpu()
    net.initialize(ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
def train(num_gpus, batch_size, lr):
    comm = MPI.COMM_WORLD
    comm_rank = comm.Get_rank()
    comm_size = comm.Get_size()
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    # ctx = [mx.gpu(i) for i in range(num_gpus)]
    if comm_rank == 0:
        ctx = mx.gpu(0)
    else:
        ctx = mx.gpu(1)
    print('running on:', ctx)
    net.initialize(init=init.Normal(sigma=0.01), ctx=ctx, force_reinit=True)
    # SSP_FLAG and thre are not arguments of the stock Gluon Trainer; they
    # presumably belong to a customized Trainer implementing stale-synchronous
    # parallel updates (as does the trainer.step(epoch, batch_size) call below)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr}, SSP_FLAG=True, thre=2)
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(400000):
        start = time.time()
        for X, y in train_iter:
            gpu_Xs = gutils.split_and_load(X, [ctx])
            gpu_ys = gutils.split_and_load(y, [ctx])
            with autograd.record():
                ls = [loss(net(gpu_X), gpu_y)
                      for gpu_X, gpu_y in zip(gpu_Xs, gpu_ys)]
            for l in ls:
                l.backward()
            trainer.step(epoch, batch_size)
        train_time = time.time() - start
        test_acc = d2l.evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, time %.1f sec, test acc %.2f, process %d'
              % (epoch + 1, train_time, test_acc, comm_rank))
        # Tail of a weight-loading method of the vgg16 class (the enclosing
        # definition is not shown in this fragment)
        print(len(self.parameters))
        for i, k in enumerate(keys):
            if i not in [30, 31]:
                sess.run(self.parameters[i].assign(weights[k]))
        print('-------------all done------------------')


x_imgs = tf.placeholder(tf.float32, [None, 96, 96, 1])
vgg = vgg16(x_imgs, 10)
y_imgs = tf.placeholder(tf.int32, [None, 10])
fc3_cat_and_dog = vgg.probs
batch_size = 500
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=fc3_cat_and_dog,
                                            labels=y_imgs))
optimizer = tf.train.GradientDescentOptimizer(
    learning_rate=0.01).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
train_iter, test_iter = d2l.load_data_fashion_mnist(200, resize=96)
epoch = 5
for i in range(epoch):
    j = 0
    for features, labels in train_iter:
        X_train = nd.transpose(features, axes=(0, 2, 3, 1)).asnumpy()
        y_train = labels.asnumpy()
        Y_train = np_utils.to_categorical(y_train, 10)
        sess.run(optimizer, feed_dict={x_imgs: X_train, y_imgs: Y_train})
        j = j + 1
        if j % 10 == 0:
            print(sess.run(loss, feed_dict={x_imgs: X_train,
                                            y_imgs: Y_train}))
@Author : 剑怜情
'''
from mxnet.gluon import data as gdata
from mxnet.gluon import nn
import d2lzh as d2l
from mxnet import nd, autograd, gluon
from mxnet import initializer as init

mnist_train = gdata.vision.FashionMNIST(train=True)
mnist_test = gdata.vision.FashionMNIST(train=False)
print("mnist_train.shape:", len(mnist_train))
print("mnist_test.shape:", len(mnist_test))

# Turn the datasets into iterators
batch_size = 64
train_iter, test_iter = d2l.load_data_fashion_mnist(64)

# Define W and b
num_input = 784
num_outputs = 10
# W = nd.random.normal(scale=1, shape=(num_input, num_outputs))
# b = nd.zeros(shape=num_outputs)
# W.attach_grad()
# b.attach_grad()


# Define the model, the softmax classifier and the loss function
def softmax(X):
    x_exp = X.exp()
    partition = x_exp.sum(axis=1, keepdims=True)
    return x_exp / partition  # broadcasting is applied here
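# Sanity check of the softmax defined above on random data (illustrative only):
X_check = nd.random.normal(shape=(2, 5))
X_prob = softmax(X_check)
print(X_prob, X_prob.sum(axis=1))  # each row is non-negative and sums to 1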
def method0():
    num_epochs, lr, batch_size = 5, 0.5, 256
    loss = gloss.SoftmaxCrossEntropyLoss()
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
                  params, lr)
def googlenet():
    """
    GoogLeNet absorbs NiN's idea of chaining networks within a network and
    improves on it considerably. Researchers refined GoogLeNet several times
    in the following years; this section covers the first version of the
    model series.
    """

    class Inception(nn.Block):
        # c1 - c4 are the numbers of output channels of the layers in each branch
        def __init__(self, c1, c2, c3, c4, **kwargs):
            super(Inception, self).__init__(**kwargs)
            # Branch 1: a single 1 x 1 convolution
            self.p1_1 = nn.Conv2D(c1, kernel_size=1, activation='relu')
            # Branch 2: 1 x 1 convolution followed by 3 x 3 convolution
            self.p2_1 = nn.Conv2D(c2[0], kernel_size=1, activation='relu')
            self.p2_2 = nn.Conv2D(c2[1], kernel_size=3, padding=1,
                                  activation='relu')
            # Branch 3: 1 x 1 convolution followed by 5 x 5 convolution
            self.p3_1 = nn.Conv2D(c3[0], kernel_size=1, activation='relu')
            self.p3_2 = nn.Conv2D(c3[1], kernel_size=5, padding=2,
                                  activation='relu')
            # Branch 4: 3 x 3 max pooling followed by 1 x 1 convolution
            self.p4_1 = nn.MaxPool2D(pool_size=3, strides=1, padding=1)
            self.p4_2 = nn.Conv2D(c4, kernel_size=1, activation='relu')

        def forward(self, x):
            p1 = self.p1_1(x)
            p2 = self.p2_2(self.p2_1(x))
            p3 = self.p3_2(self.p3_1(x))
            p4 = self.p4_2(self.p4_1(x))
            # Concatenate the outputs along the channel dimension
            return nd.concat(p1, p2, p3, p4, dim=1)

    b1 = nn.Sequential()
    b1.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3,
                     activation='relu'),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))
    b2 = nn.Sequential()
    b2.add(nn.Conv2D(64, kernel_size=1, activation='relu'),
           nn.Conv2D(192, kernel_size=3, padding=1, activation='relu'),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))
    b3 = nn.Sequential()
    b3.add(Inception(64, (96, 128), (16, 32), 32),
           Inception(128, (128, 192), (32, 96), 64),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))
    b4 = nn.Sequential()
    b4.add(Inception(192, (96, 208), (16, 48), 64),
           Inception(160, (112, 224), (24, 64), 64),
           Inception(128, (128, 256), (24, 64), 64),
           Inception(112, (144, 288), (32, 64), 64),
           Inception(256, (160, 320), (32, 128), 128),
           nn.MaxPool2D(pool_size=3, strides=2, padding=1))
    b5 = nn.Sequential()
    b5.add(Inception(256, (160, 320), (32, 128), 128),
           Inception(384, (192, 384), (48, 128), 128),
           nn.GlobalAvgPool2D())
    net = nn.Sequential()
    net.add(b1, b2, b3, b4, b5, nn.Dense(10))
    X = nd.random.uniform(shape=(1, 1, 96, 96))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)
    lr, num_epochs, batch_size, ctx = 0.1, 5, 128, d2l.try_gpu()
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
# DLN is free software; you can redistribute it and/or modify it.
#
# DLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Contributors:
#     Xiao Wang - initial implementation

import d2lzh as d2l
from mxnet import nd
from mxnet.gluon import loss as gloss

# load data
batch_size = 256
iter_trainning, iter_testing = d2l.load_data_fashion_mnist(batch_size)

# initialize parameters
number_inputs = 784
number_outputs = 10
number_hiddens = 256

W1 = nd.random.normal(scale=0.01, shape=(number_inputs, number_hiddens))
b1 = nd.zeros(number_hiddens)
W2 = nd.random.normal(scale=0.01, shape=(number_hiddens, number_outputs))
b2 = nd.zeros(number_outputs)

parameters = [W1, b1, W2, b2]
for parameter in parameters:
    parameter.attach_grad()
class Test5:
    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

    def try_gpu(self):
        # This function is also saved in the d2lzh package for later use
        try:
            ctx = mx.gpu()
            _ = nd.zeros((1, ), ctx=ctx)
        except mx.base.MXNetError:
            ctx = mx.cpu()
        return ctx

    def evaluate_accuracy(data_iter, net, ctx):
        acc_sum, n = nd.array([0], ctx=ctx), 0
        for X, y in data_iter:
            # If ctx is a GPU and its memory, copy the data to that memory
            X, y = X.as_in_context(ctx), y.as_in_context(ctx).astype('float32')
            acc_sum += (net(X).argmax(axis=1) == y).sum()
            n += y.size
        return acc_sum.asscalar() / n

    def generate(self):
        net = nn.Sequential()
        net.add(nn.Conv2D(channels=6, kernel_size=5, activation='sigmoid'),
                nn.MaxPool2D(pool_size=2, strides=2),
                nn.Conv2D(channels=16, kernel_size=5, activation='sigmoid'),
                nn.MaxPool2D(pool_size=2, strides=2),
                # Dense converts an input of shape (batch size, channels, height, width)
                # into one of shape (batch size, channels * height * width) by default
                nn.Dense(120, activation='sigmoid'),
                nn.Dense(84, activation='sigmoid'),
                nn.Dense(10))
        X = nd.random.uniform(shape=(1, 1, 28, 28))
        net.initialize()
        for layer in net:
            X = layer(X)
            print(layer.name, 'output shape:\t', X.shape)

    def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs):
        print('training on', ctx)
        loss = gloss.SoftmaxCrossEntropyLoss()
        for epoch in range(num_epochs):
            train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
            for X, y in train_iter:
                X, y = X.as_in_context(ctx), y.as_in_context(ctx)
                with autograd.record():
                    y_hat = net(X)
                    l = loss(y_hat, y).sum()
                l.backward()
                trainer.step(batch_size)
                y = y.astype('float32')
                train_l_sum += l.asscalar()
                train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
                n += y.size
            test_acc = evaluate_accuracy(test_iter, net, ctx)
            print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
                  'time %.1f sec'
                  % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
                     time.time() - start))

    def main(self):
        lr, num_epochs = 0.9, 5
        net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
        trainer = gluon.Trainer(net.collect_params(), 'sgd',
                                {'learning_rate': lr})
        train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
def batchnormalization():
    """
    Batch normalization uses the mean and standard deviation of a mini-batch
    to continuously adjust the intermediate outputs of the network, which
    keeps the values of those intermediate outputs more stable across all
    layers.
    """

    def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
        # Use autograd to determine whether we are in training or prediction mode
        if not autograd.is_training():
            # In prediction mode, directly use the moving-average mean and variance
            X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
        else:
            assert len(X.shape) in (2, 4)
            if len(X.shape) == 2:
                # Fully connected layer: compute mean and variance over the feature dimension
                mean = X.mean(axis=0)
                var = ((X - mean) ** 2).mean(axis=0)
            else:
                # 2D convolutional layer: compute mean and variance over the
                # channel dimension (axis=1), keeping X's shape so that
                # broadcasting works later
                mean = X.mean(axis=(0, 2, 3), keepdims=True)
                var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
            # In training mode, standardize with the current mean and variance
            X_hat = (X - mean) / nd.sqrt(var + eps)
            # Update the moving-average mean and variance
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
        Y = gamma * X_hat + beta  # Scale and shift
        return Y, moving_mean, moving_var

    class BatchNorm(nn.Block):
        def __init__(self, num_features, num_dims, **kwargs):
            super(BatchNorm, self).__init__(**kwargs)
            if num_dims == 2:
                shape = (1, num_features)
            else:
                shape = (1, num_features, 1, 1)
            # Scale and shift parameters involved in gradient computation and
            # iteration, initialized to 1 and 0 respectively
            self.gamma = self.params.get('gamma', shape=shape, init=init.One())
            self.beta = self.params.get('beta', shape=shape, init=init.Zero())
            # Variables not involved in gradient computation or iteration,
            # initialized to 0 in main memory
            self.moving_mean = nd.zeros(shape)
            self.moving_var = nd.zeros(shape)

        def forward(self, X):
            # If X is not in main memory, copy moving_mean and moving_var to
            # the device memory where X lives
            if self.moving_mean.context != X.context:
                self.moving_mean = self.moving_mean.copyto(X.context)
                self.moving_var = self.moving_var.copyto(X.context)
            # Save the updated moving_mean and moving_var
            Y, self.moving_mean, self.moving_var = batch_norm(
                X, self.gamma.data(), self.beta.data(), self.moving_mean,
                self.moving_var, eps=1e-5, momentum=0.9)
            return Y

    net = nn.Sequential()
    net.add(nn.Conv2D(6, kernel_size=5),
            BatchNorm(6, num_dims=4),
            nn.Activation('sigmoid'),
            nn.MaxPool2D(pool_size=2, strides=2),
            nn.Conv2D(16, kernel_size=5),
            BatchNorm(16, num_dims=4),
            nn.Activation('sigmoid'),
            nn.MaxPool2D(pool_size=2, strides=2),
            nn.Dense(120),
            BatchNorm(120, num_dims=2),
            nn.Activation('sigmoid'),
            nn.Dense(84),
            BatchNorm(84, num_dims=2),
            nn.Activation('sigmoid'),
            nn.Dense(10))
    lr, num_epochs, batch_size, ctx = 1.0, 5, 256, d2l.try_gpu()
    net.initialize(ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
    print(net[1].gamma.data().reshape((-1, )),
          net[1].beta.data().reshape((-1, )))
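# Tiny numeric check of the batch_norm helper above, assuming it is available
# at module scope (in the code above it is nested inside batchnormalization()).
# The data values are made up for illustration.
X_check = nd.array([[1., 2.], [3., 4.], [5., 6.]])
gamma, beta = nd.ones((1, 2)), nd.zeros((1, 2))
moving_mean, moving_var = nd.zeros((1, 2)), nd.zeros((1, 2))
with autograd.record():  # record() switches autograd into training mode
    Y_check, moving_mean, moving_var = batch_norm(
        X_check, gamma, beta, moving_mean, moving_var, eps=1e-5, momentum=0.9)
print(Y_check)  # each column is standardized to roughly zero mean and unit variance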
import d2lzh as d2l
from mxnet import gluon, init
from mxnet.gluon import loss as gloss, nn

batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

net = nn.Sequential()
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

loss = gloss.SoftmaxCELoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
num_epochs = 5
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None,
              None, trainer)

for x, y in test_iter:
    break
true_labels = d2l.get_fashion_mnist_labels(y.asnumpy())
pred_labels = d2l.get_fashion_mnist_labels(net(x).argmax(axis=1).asnumpy())
titles = [true + '\n' + pred for true, pred in zip(true_labels, pred_labels)]
d2l.show_fashion_mnist(x[0:9], titles[0:9])
def resnet():
    """
    Residual blocks let data propagate across layers through shortcut
    connections, which makes it possible to train an effective deep neural
    network.
    """

    class Residual(nn.Block):
        # This class is also saved in the d2lzh package for later use
        def __init__(self, num_channels, use_1x1conv=False, strides=1,
                     **kwargs):
            super(Residual, self).__init__(**kwargs)
            self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1,
                                   strides=strides)
            self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
            if use_1x1conv:
                self.conv3 = nn.Conv2D(num_channels, kernel_size=1,
                                       strides=strides)
            else:
                self.conv3 = None
            self.bn1 = nn.BatchNorm()
            self.bn2 = nn.BatchNorm()

        def forward(self, X):
            Y = nd.relu(self.bn1(self.conv1(X)))
            Y = self.bn2(self.conv2(Y))
            if self.conv3:
                X = self.conv3(X)
            return nd.relu(Y + X)

    def resnet_block(num_channels, num_residuals, first_block=False):
        blk = nn.Sequential()
        for i in range(num_residuals):
            if i == 0 and not first_block:
                blk.add(Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                blk.add(Residual(num_channels))
        return blk

    net = nn.Sequential()
    net.add(nn.Conv2D(64, kernel_size=7, strides=2, padding=3),
            nn.BatchNorm(), nn.Activation('relu'),
            nn.MaxPool2D(pool_size=3, strides=2, padding=1))
    net.add(resnet_block(64, 2, first_block=True),
            resnet_block(128, 2),
            resnet_block(256, 2),
            resnet_block(512, 2))
    net.add(nn.GlobalAvgPool2D(), nn.Dense(10))
    X = nd.random.uniform(shape=(1, 1, 224, 224))
    net.initialize()
    for layer in net:
        X = layer(X)
        print(layer.name, 'output shape:\t', X.shape)
    lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu()
    net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
    d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
                  num_epochs)
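# Shape check for the residual block, assuming Residual is available at module
# scope (in the code above it is nested inside resnet()):
blk = Residual(3)
blk.initialize()
X = nd.random.uniform(shape=(4, 3, 6, 6))
print(blk(X).shape)  # (4, 3, 6, 6): output keeps the input shape
blk = Residual(6, use_1x1conv=True, strides=2)
blk.initialize()
print(blk(X).shape)  # (4, 6, 3, 3): channels doubled, height and width halved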
import d2lzh
from mxnet import gluon, init, nd
from mxnet.gluon import loss as gloss, nn

batch_size = 256
train_iter, test_iter = d2lzh.load_data_fashion_mnist(batch_size)

net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'), nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})
num_epoch = 5
d2lzh.train_ch3(net, train_iter, test_iter, loss, num_epoch, batch_size, None,
                None, trainer)
class MLP:
    batch_size = 256
    num_inputs, num_outputs, num_hiddens = 784, 10, 256

    W1 = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_hiddens)),
                      dtype=torch.float)
    b1 = torch.zeros(num_hiddens, dtype=torch.float)
    W2 = torch.tensor(np.random.normal(0, 0.01, (num_hiddens, num_outputs)),
                      dtype=torch.float)
    b2 = torch.zeros(num_outputs, dtype=torch.float)
    params = [W1, b1, W2, b2]
    for param in params:
        param.requires_grad_(requires_grad=True)

    loss = torch.nn.CrossEntropyLoss()
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

    @staticmethod
    def relu(X):
        '''Hand-defined activation function; ReLU is the most common choice nowadays.'''
        return torch.max(input=X, other=torch.tensor(0.0))

    def net(self, X):
        X = X.view((-1, self.num_inputs))
        H = self.relu(torch.matmul(X, self.W1) + self.b1)
        return torch.matmul(H, self.W2) + self.b2

    def train(self):
        num_epochs, lr = 5, 100.0
        d2l.train_ch3(self.net, self.train_iter, self.test_iter, self.loss,
                      num_epochs, self.batch_size, self.params, lr)

    @staticmethod
    def xyplot(x_vals, y_vals, name):
        d2l.set_figsize(figsize=(5, 2.5))
        d2l.plt.plot(x_vals.detach().numpy(), y_vals.detach().numpy())
        d2l.plt.xlabel('x')
        d2l.plt.ylabel(name + '(x)')
        plt.show()
        plt.close()

    @staticmethod
    def testxplot(mtd=0):
        """
        H = ϕ(XW_h + b_h), O = HW_o + b_o,
        where ϕ is the activation function (relu, sigmoid, tanh, etc.).
        """
        x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
        switcher = {
            0: x.relu,     # piecewise linear
            1: x.sigmoid,  # squashes values into (0, 1)
            2: x.tanh,     # squashes values into (-1, 1)
        }
        if switcher.get(mtd):
            y = switcher.get(mtd)()
            MLP.xyplot(x, y, 'relu')
            y.sum().backward()
            MLP.xyplot(x, x.grad, 'grad of relu')
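# Illustrative usage of the MLP class above (not part of the original snippet):
mlp = MLP()
mlp.train()            # trains the scratch MLP on Fashion-MNIST via d2l.train_ch3
MLP.testxplot(mtd=0)   # plots ReLU and its gradient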
def resnet_block(num_channels, num_residuals, first_block=False):
    blk = nn.Sequential()
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.add(Residual(num_channels, use_1x1conv=True, strides=2))
        else:
            blk.add(Residual(num_channels))
    return blk


net.add(resnet_block(64, 2, first_block=True),
        resnet_block(128, 2),
        resnet_block(256, 2),
        resnet_block(512, 2))
net.add(nn.GlobalAvgPool2D(), nn.Dense(10))

X = nd.random.uniform(shape=(1, 1, 224, 224))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

lr, num_epochs, batch_size, ctx = 0.05, 5, 256, d2l.try_gpu()
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)
class EasyMLP:
    num_inputs, num_outputs, num_hiddens = 784, 10, 256

    net = nn.Sequential(
        d2l.FlattenLayer(),
        nn.Linear(num_inputs, num_hiddens),
        nn.ReLU(),
        nn.Linear(num_hiddens, num_outputs),
    )
    for params in net.parameters():
        init.normal_(params, mean=0, std=0.01)

    batch_size = 256
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
    num_epochs = 5

    @staticmethod
    def sgd(params, lr, batch_size):
        # To stay consistent with the original book we divide by batch_size
        # here, although it is not strictly necessary: when computing the loss
        # with PyTorch, the mean over the batch is usually already taken.
        for param in params:
            # Note that the update goes through param.data
            param.data -= lr * param.grad / batch_size

    @staticmethod
    def evaluate_accuracy(data_iter, net, device=None):
        if device is None and isinstance(net, torch.nn.Module):
            # If no device is specified, use the device of net
            device = list(net.parameters())[0].device
        acc_sum, n = 0.0, 0
        with torch.no_grad():
            for X, y in data_iter:
                if isinstance(net, torch.nn.Module):
                    net.eval()  # Evaluation mode; this turns off dropout
                    acc_sum += (net(X.to(device)).argmax(dim=1) ==
                                y.to(device)).float().sum().cpu().item()
                    net.train()  # Switch back to training mode
                else:
                    # A custom model; not used after section 3.13, GPU not considered
                    if 'is_training' in net.__code__.co_varnames:
                        # If the model has an is_training argument, set it to False
                        acc_sum += (net(X, is_training=False).argmax(dim=1) ==
                                    y).float().sum().item()
                    else:
                        acc_sum += (net(X).argmax(dim=1) ==
                                    y).float().sum().item()
                n += y.shape[0]
        return acc_sum / n

    def train(self):
        # d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs,
        #               batch_size, None, None, optimizer)
        for epoch in range(self.num_epochs):
            train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
            for X, y in self.train_iter:
                y_hat = self.net(X)
                l = self.loss(y_hat, y).sum()
                # Zero the gradients
                if self.optimizer is not None:
                    self.optimizer.zero_grad()
                elif self.params is not None and self.params[0].grad is not None:
                    for param in self.params:
                        param.grad.data.zero_()
                l.backward()
                if self.optimizer is None:
                    self.sgd(self.params, self.lr, self.batch_size)
                else:
                    # Used in the "concise implementation of softmax regression" section
                    self.optimizer.step()
                train_l_sum += l.item()
                train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
                n += y.shape[0]
            test_acc = self.evaluate_accuracy(self.test_iter, self.net)
            print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
                  % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))