Example #1
def train_ch3(net,
              train_iter,
              test_iter,
              loss,
              num_epochs,
              batch_size,
              params=None,
              lr=None,
              trainer=None):
    train_ls, test_ls = [], []
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net)
        print("输出 %d,损失误差 %.4f 训练acc %.3f ,测试acc %.3f" %
              (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
        train_ls.append(train_l_sum / n)
        test_ls.append(test_acc)
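
A minimal usage sketch for the train_ch3 above. It assumes the d2lzh package (imported as d2l) with its load_data_fashion_mnist loader, plus an evaluate_accuracy helper like the one sketched further down this page; the hyperparameters are illustrative only.

from mxnet import autograd, gluon, init
from mxnet.gluon import loss as gloss, nn
import d2lzh as d2l

batch_size, num_epochs = 256, 5
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

net = nn.Sequential()
net.add(nn.Dense(10))                      # one Dense layer: softmax regression
net.initialize(init.Normal(sigma=0.01))

loss = gloss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

# With a Gluon Trainer, params and lr stay None and trainer.step() does the update.
train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, trainer=trainer)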
Example #2
def train_softmax(train_iter,
                  test_iter,
                  net,
                  loss,
                  num_epochs,
                  batch_size,
                  params=None,
                  lr=None,
                  trainer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0  # running loss (decreasing), running train accuracy, sample count (60000 in total)
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)  # forward pass; output is <NDArray 256x10 @cpu(0)>
                l = loss(y_hat,
                         y).sum()  # the loss is a length-256 vector, so sum it (summing later via l.asscalar() would be equivalent)
            l.backward()  # if the loss had not been summed above, backward() would sum it automatically
            if trainer is None:
                d2l.sgd(params, lr, batch_size)  # hand-written SGD update when no trainer is given
            else:
                trainer.step(batch_size)  # otherwise take one optimization step with the trainer
            train_l_sum += l.asscalar()
            # accumulate the training accuracy
            y = y.astype('float32')
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size  # adds 256 per batch, 60000 in total
        test_acc = evaluate_accuracy(test_iter, net)  # test-set accuracy
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' %
              (epoch + 1, train_l_sum / n, train_acc_sum / n,
               test_acc))  # cross-entropy loss, train accuracy, test accuracy
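
The d2l.sgd call above performs the actual parameter update when no Gluon trainer is supplied. For reference, a sketch in the spirit of the d2lzh helper (treat it as illustrative rather than the exact library code):

def sgd(params, lr, batch_size):
    # Minibatch stochastic gradient descent: the loss was summed over the batch,
    # so the gradient is divided by batch_size to average it before the update.
    for param in params:
        param[:] = param - lr * param.grad / batch_size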
Example #3
def train_ch3(net,
              train_iter,
              test_iter,
              loss,
              num_epochs,
              batch_size,
              params=None,
              lr=None,
              trainer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype("float32")
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        print("epoch %d, loss %.4f, train_acc %.3f" %
              (epoch + 1, train_l_sum / n, train_acc_sum / n))
Example #4
def train_rnn(vocab_indices, is_random, vocab_size, hidden_nums, batch_size,
              num_epochs, num_steps, params, theta, lr, prefixs, predict_step):
    if is_random:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        l_sum = 0.0
        n = 0
        start = time.time()
        if not is_random:
            H = nd.zeros(shape=(batch_size, hidden_nums))
        data_iter = data_iter_fn(vocab_indices, batch_size, num_steps)
        for X, y in data_iter:
            if not is_random:
                for s in (H, ):
                    s.detach()
            else:
                H = nd.zeros(shape=(batch_size, hidden_nums))
            with autograd.record():
                input = to_onehot(X, vocab_size)
                output, (H, ) = rnn(input, params, (H, ))
                output = nd.concat(*output, dim=0)
                y = y.T.reshape((-1, ))
                l = loss(output, y).mean()
            l.backward()
            gradient_clip(params, theta)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size

        print('epoch %d, perplexity %f, time %.2f sec' %
              (epoch, math.exp(l_sum / n), time.time() - start))
        rnn_predit(params, predict_step, prefixs, vocab_size, hidden_nums)
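
The gradient_clip helper used above (grad_clipping in the later examples) is not shown in these snippets. Below is a sketch of clipping by global gradient norm, in the style of the d2lzh version; the ctx argument and the exact signature are assumptions:

from mxnet import nd

def grad_clipping(params, theta, ctx):
    # Compute the global L2 norm of all parameter gradients and rescale them
    # if it exceeds theta, which keeps RNN gradients from exploding.
    norm = nd.array([0], ctx)
    for param in params:
        norm += (param.grad ** 2).sum()
    norm = norm.sqrt().asscalar()
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm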
Example #5
def train_ch3(net,
              w,
              b,
              num_inputs,
              train_iter,
              test_iter,
              loss,
              num_epochs,
              batch_size,
              params=None,
              lr=None,
              trainer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for x, y in train_iter:
            with autograd.record():
                y_hat = net(x, w, b, num_inputs)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, w, b, num_inputs)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' %
              (epoch, train_l_sum / n, train_acc_sum / n, test_acc))
Example #6
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with consecutive sampling, initialize the hidden state once at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:  # with random sampling, re-initialize the hidden state before every minibatch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:  # otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach_()
            # inputs is a list of num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)
            # outputs is a list of num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # after concatenation the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a
            # vector of length num_steps * batch_size so it lines up with the output rows
            y = torch.flatten(Y.T)
            # use the cross-entropy loss to compute the mean classification error
            l = loss(outputs, y.long())

            # zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradients
            d2l.sgd(params, lr, 1)  # the loss is already a mean, so the gradient needs no further averaging
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, device, idx_to_char,
                                char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            # X: one batch of data, i.e. batch_size character lists of length num_steps, with each char given by its index
            # Y: same format as X
            # print(X[0], Y[0])
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach(
                    )  # Returns a new NDArray, detached from the current graph.
            with autograd.record():
                #print('X.shape', X.shape)
                #print('Y.shape', Y.shape)
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                #print('inputs.shape', len(inputs), inputs[0].shape)
                #print('outputs.shape', len(outputs), outputs[0].shape)
                outputs = nd.concat(*outputs, dim=0)  # concatenate all the outputs
                # print('concat_outputs', outputs.shape)
                y = Y.T.reshape((-1, ))
                # print('y', y.shape)
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            d2l.sgd(params, lr, 1)  # actually update the params
            l_sum += l.asscalar() * y.size
            n += y.size
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, ctx, idx_to_char,
                                char_to_idx))
def train_and_predit_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                         vocab_size, ctx, corpus_indices, idx_to_char,
                         char_to_idx, is_random_iter, num_epochs, num_steps,
                         lr, clipping_theta, batch_size, pred_period, pred_len,
                         prefixes):
    # choose the sampling scheme
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random  # random sampling
    else:
        data_iter_fn = d2l.data_iter_consecutive  # consecutive sampling

    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with consecutive sampling, initialize the hidden state once at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)

        # read the data
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:  # with random sampling, re-initialize the hidden state before each minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:  # otherwise detach it from the computation graph
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(
                    inputs, state, params
                )  # inputs and outputs are num_steps matrices of shape (batch_size, vocab_size)
                outputs = nd.concat(
                    *outputs,
                    dim=0)  # after concatenation the shape is (num_steps * batch_size, vocab_size)
                y = Y.T.reshape((-1, ))  # Y is (batch_size, num_steps); transpose and flatten it into a vector
                l = loss(outputs, y).mean()  # mean classification loss
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            d2l.sgd(params, lr, 1)  # no batch_size here: the loss was already averaged with mean()
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:  # report every pred_period epochs
            # math.exp(l_sum / n) is the perplexity
            print('epoch %d, perplexity %f, time %.3f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    '-',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, ctx, idx_to_char,
                                char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # if using consecutive sampling, initialize the
            # hidden state once at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:  # if using random sampling, re-initialize the
                # state before every minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:  # otherwise detach the state from the computation graph
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is a list of num_steps (batch_size, vocab_size) matrices
                (outputs, state) = rnn(inputs, state, params)
                # Concat as (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # reshape Y from (batch_size, num_steps) to a vector of length
                # batch_size * num_steps for the loss computation
                y = Y.T.reshape((-1, ))
                # use the cross-entropy loss
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            d2l.sgd(params, lr, 1)  # the loss is already a mean, so no further averaging is needed
            l_sum += l.asscalar() * y.size
            n += y.size
        # Print results using perplexity and also predict the results
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, ctx, idx_to_char,
                                char_to_idx))
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:  # with random sampling, re-initialize the hidden state before every minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:  # otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                # outputs is a list of num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # after concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose and flatten it into a
                # vector of length batch_size * num_steps so it lines up with the output rows
                y = Y.T.reshape((-1, ))
                # use the cross-entropy loss to compute the mean classification error
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            d2l.sgd(params, lr, 1)  # the loss is already a mean, so the gradient needs no further averaging
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, ctx, idx_to_char,
                                char_to_idx))
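
Every RNN variant above converts the integer minibatch X into one-hot inputs through a to_onehot helper that is not part of these snippets. A sketch matching the d2lzh version, assuming X has shape (batch_size, num_steps):

from mxnet import nd

def to_onehot(X, size):
    # Returns a list of num_steps matrices of shape (batch_size, size),
    # one matrix per time step.
    return [nd.one_hot(x, size) for x in X.T]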
Example #11
def fit_and_plot(lambd):
    w, b = init_params()                       # initialize the model parameters
    train_l, test_l = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # add the L2 norm penalty
                l = loss(net(X, w, b), y) + lambd*l2_penalty(w)
            l.backward()
            d2l.sgd([w,b], lr, batch_size)
        train_l.append(loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_l.append(loss(net(test_features, w, b), test_labels).mean().asscalar())
    d2l.semilogy(range(1, num_epochs+1), train_l, 'epochs', 'loss',
                 range(1, num_epochs+1), test_l, ['train', 'test'], figsize=(15, 5))
    print('L2 norm of w:', w.norm().asscalar())
def train_batch(X, y, gpu_params, ctx, lr):
    # when ctx holds several GPUs, split the minibatch and copy each piece to its device
    gpu_Xs, gpu_ys = split_and_load(X, ctx), split_and_load(y, ctx)
    with autograd.record():  # compute the loss on each GPU separately
        ls = [
            loss(lenet(gpu_X, gpu_W), gpu_y)
            for gpu_X, gpu_y, gpu_W in zip(gpu_Xs, gpu_ys, gpu_params)
        ]
    for l in ls:  # back-propagate on each GPU separately
        l.backward()
    # sum the gradients from all devices, then broadcast the result to every device
    for i in range(len(gpu_params[0])):
        allreduce([gpu_params[c][i].grad for c in range(len(ctx))])
    for param in gpu_params:  # update the model parameters on each GPU separately
        d2l.sgd(param, lr, X.shape[0])  # the full batch size is used here
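
train_batch relies on split_and_load and allreduce helpers that are not included in the snippet. Below are sketches in the style of the d2lzh multi-GPU chapter; the exact signatures are assumptions:

from mxnet import nd

def split_and_load(data, ctx):
    # Split a minibatch evenly across the devices listed in ctx.
    n, k = data.shape[0], len(ctx)
    m = n // k
    assert m * k == n, 'batch size must be divisible by the number of devices'
    return [data[i * m:(i + 1) * m].as_in_context(ctx[i]) for i in range(k)]

def allreduce(data):
    # Sum the gradients on the first device, then broadcast the result back.
    for i in range(1, len(data)):
        data[0][:] += data[i].copyto(data[0].context)
    for i in range(1, len(data)):
        data[0].copyto(data[i])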
Example #13
def train_ch3(net,
              train_iter,
              test_iter,
              loss,
              num_epochs,
              batch_size,
              num_inputs,
              W,
              b,
              params=None,
              lr=None,
              trainer=None):
    """
    开始训练
    :param net: 模型函数
    :param train_iter: 训练数据
    :param test_iter: 测试数据
    :param loss: 损失函数
    :param num_epochs: 迭代周期数
    :param batch_size:
    :param num_inputs:
    :param W:
    :param b:
    :param params:
    :param lr: 学习率
    :param trainer:
    :return:
    """
    for epoch in range(num_epochs):
        train_loss_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X, num_inputs, W, b)
                data_loss = loss(y_hat, y).sum()
            # TODO: difference between placing this inside vs. outside the record() scope
            # compute the gradients automatically
            data_loss.backward()
            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_loss_sum += data_loss.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, num_inputs, W, b)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' %
              (epoch + 1, train_loss_sum / n, train_acc_sum / n, test_acc))
def train_and_predit_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                         vocab_size, ctx, corpus_indices, idx_to_char,
                         char_to_idx, is_random_iter, num_epochs, num_steps,
                         lr, clipping_theta, batch_size, pred_period, pred_len,
                         prefixes):
    # choose the sampling scheme
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random  # random sampling
    else:
        data_iter_fn = d2l.data_iter_consecutive  # consecutive sampling

    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)

        #
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1, ))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.3f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    '-',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, ctx, idx_to_char,
                                char_to_idx))
def train_ch3(net,
              train_iter,
              test_iter,
              loss,
              num_epochs,
              batch_size,
              params=None,
              lr=None,
              trainer=None):

    global printt

    for epoch in range(num_epochs):
        # one training epoch
        train_l_sum = 0.0
        train_acc_sum = 0.0
        n = 0

        # iterate over minibatches
        for X, y in train_iter:
            # one training step
            with autograd.record():
                # predictions for this minibatch

                y_hat = net(X)
                l = loss(y_hat, y).sum()
                # if printt:

                #     print(y_hat[0].sum())
                #     printt = False

            l.backward()

            if trainer is None:
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)

            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size

        # evaluate accuracy on the test data
        test_acc = evaluate_accuracy(test_iter, net)

        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' %
              (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
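
Most of the classification snippets on this page call an evaluate_accuracy helper that is not shown. Here is a sketch of the basic d2lzh-style version; the variants above that pass extra arguments such as W, b or num_inputs would need a correspondingly extended signature:

def evaluate_accuracy(data_iter, net):
    # Fraction of samples in data_iter whose predicted class matches the label.
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        y = y.astype('float32')
        acc_sum += (net(X).argmax(axis=1) == y).sum().asscalar()
        n += y.size
    return acc_sum / n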
Example #16
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, "epochs", "loss",
                 range(1, num_epochs + 1), test_ls, ["train", "test"])
    print("L2 norm of w", w.norm().asscalar())
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # add the L2 norm penalty; broadcasting turns it into a vector of length batch_size
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().asscalar())
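
fit_and_plot assumes init_params, l2_penalty, net and loss from the weight-decay experiment. Below are sketches of the first two in the style of d2lzh, with num_inputs (the feature dimension) assumed to be defined elsewhere:

from mxnet import nd

def init_params():
    w = nd.random.normal(scale=1, shape=(num_inputs, 1))
    b = nd.zeros(shape=(1,))
    w.attach_grad()   # allocate gradient buffers so autograd can write into them
    b.attach_grad()
    return [w, b]

def l2_penalty(w):
    # Half the squared L2 norm; multiplied by lambd it becomes the weight-decay term.
    return (w ** 2).sum() / 2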
Example #18
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for x, y in train_iter:
            with autograd.record():
                l = loss(net(x, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            d2l.sgd([w, b], lr, batch_size)
        train_ls.append(loss(net(train_features, w, b),
                             train_labels).mean().asscalar())
        test_ls.append(loss(net(test_features, w, b),
                            test_labels).mean().asscalar())
    print('mean of w:', w.mean())
    print('L2 norm of w:', w.norm().asscalar())
    print('final epoch: train loss', train_ls[-1], 'test loss', test_ls[-1])
    d2l.semilogy(range(1, num_epochs+1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs+1), test_ls, ['train', 'test'])
Example #19
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    '''Train the model and make predictions.'''
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        if not is_random_iter:
            state = init_rnn_state(batch_size, num_hiddens, ctx=ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices,batch_size, num_steps, ctx=ctx)
        for X, Y in data_iter:
            if is_random_iter:
                state = init_rnn_state(batch_size, num_hiddens, ctx=ctx)
            else:  # otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach()
            inputs = one_hot(X, vocab_size)
            with autograd.record():
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1,))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx=ctx)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar()*y.size
            n += y.size
        
        if (epoch+1) % pred_period == 0:
            print('epoch: %d, perplexity: %f, time %.2f sec' % (
                    epoch + 1, math.exp(l_sum/n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(
                        prefix, pred_len, rnn, params, init_rnn_state,
                        num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))
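
predict_rnn is called throughout the RNN examples but never defined in these snippets. Below is a sketch following the d2lzh character-level version, assuming the to_onehot helper sketched earlier:

from mxnet import nd

def predict_rnn(prefix, num_chars, rnn, params, init_rnn_state, num_hiddens,
                vocab_size, ctx, idx_to_char, char_to_idx):
    # Generate num_chars characters after the given prefix, one step at a time.
    state = init_rnn_state(1, num_hiddens, ctx)
    output = [char_to_idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # Feed the last emitted character back in as the next input.
        X = to_onehot(nd.array([output[-1]], ctx=ctx), vocab_size)
        (Y, state) = rnn(X, state, params)
        if t < len(prefix) - 1:
            output.append(char_to_idx[prefix[t + 1]])
        else:
            output.append(int(Y[0].argmax(axis=1).asscalar()))
    return ''.join([idx_to_char[i] for i in output])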
Example #20
    def train_softmax(self,
                      net,
                      train_iter,
                      test_iter,
                      loss,
                      num_epochs,
                      batch_size,
                      params=None,
                      lr=None,
                      optimizer=None):
        W = params[0]
        b = params[1]
        for epoch in range(num_epochs):
            train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
            for X, y in train_iter:
                """
                W是权重,b是偏移量, X是输入,y是输出
                """
                y_hat = net(X, W, b)
                l = loss(y_hat, y).sum()

                # zero the gradients
                if optimizer is not None:
                    optimizer.zero_grad()
                elif params is not None and params[0].grad is not None:
                    for param in params:
                        param.grad.data.zero_()

                l.backward()
                if optimizer is None:
                    d2l.sgd(params, lr, batch_size)
                else:
                    optimizer.step()  # used in the "concise implementation of softmax regression" section

                train_l_sum += l.item()
                train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
                n += y.shape[0]
            test_acc = self.evaluate_accuracy(test_iter, W, b)
            print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' %
                  (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
        return W, b
Example #21
def train(net, iter_training, iter_testing, loss, number_epochs, batch_size,
          parameters = None, learning_rate = None, trainer = None):
    for epoch in range(number_epochs):
        train_l_sum = 0.0
        train_acc_sum = 0.0
        n = 0
        for X, y in iter_training:
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            if trainer is None:
                d2l.sgd(parameters, learning_rate, batch_size)
            else:
                trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis = 1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(iter_testing, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
Example #22
    def train_and_predict(self, is_random_iter, pred_period, pred_len, prefixes):
        if is_random_iter:
            data_iter_fn = d2l.data_iter_random
        else:
            data_iter_fn = d2l.data_iter_consecutive

        self.get_params()
        loss = gloss.SoftmaxCrossEntropyLoss()

        for epoch in range(self.num_epochs):
            if not is_random_iter:
                self.init_rnn_state()
            l_sum, n, start = 0.0, 0, time.time()
            data_iter = data_iter_fn(self.corpus_indices, self.batch_size, self.num_steps)
            for X, Y in data_iter:
                if is_random_iter:  # with random sampling, re-initialize the hidden state before every minibatch
                    self.init_rnn_state(self.batch_size, self.num_hidden)
                else:  # otherwise detach the hidden state from the computation graph
                    for s in self.state:
                        s.detach()
                with autograd.record():
                    self.inputs = self.to_onehot(X.as_in_context(self.ctx), self.vocab_size)
                    outputs, state = self.rnn()
                    outputs = nd.concat(*outputs, dim=0)
                    y = Y.T.reshape((-1,))
                    l = loss(outputs, y).mean()
                l.backward()
                self.grad_clipping(self.clipping_theta)
                d2l.sgd(self.params, self.lr, 1)
                l_sum += l.asscalar() * y.size
                n += y.size

            if (epoch + 1) % pred_period == 0:
                print(
                    'epoch {}, perplexity {}, time {} sec'.format(epoch + 1, math.exp(l_sum / n), time.time() - start))
                for prefix in prefixes:
                    print(' -', self.predict(prefix, pred_len))
Example #23
def train_ch3(net,
              train_iter,
              test_iter,
              loss,
              num_epochs,
              batch_size,
              params=None,
              lr=None,
              trainer=None):
    # iterate num_epochs (5) times
    for epoch in range(num_epochs):
        # train_l_sum: sum over the 60000 training images of the cross-entropy of the
        #              correct class; the reported loss is train_l_sum / n
        # train_acc_sum: number of correctly classified training images; each image
        #                counts 1 if correct and 0 otherwise, so the accuracy is
        #                (R1 + ... + R60000) / n with n = 60000
        # n: number of training samples seen (60000 in total)
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            # each iteration draws 256 images; see the shapes below
            # X <NDArray 256x1x28x28 @cpu(0)>
            # y <NDArray 256 @cpu(0)>

            with autograd.record():
                # y_hat is 256 x 10
                # forward pass through the model
                y_hat = net(X)

                # loss(y_hat, y) is the cross-entropy of the probability assigned to the
                # correct class: if that probability is p, the entropy is -ln p, so the
                # more confident the correct prediction, the smaller the loss
                # loss(y_hat, y).sum() sums these entropies over the batch of 256 images
                # quick -ln reference:
                # -ln0.01   = 4.605
                # -ln0.10   = 2.302
                # -ln0.50   = 0.693
                # -ln0.90   = 0.105
                # -ln0.99   = 0.010
                # l is the summed cross-entropy of the correct classes over this batch of 256 images
                l = loss(y_hat, y).sum()

            # compute the gradients
            l.backward()

            if trainer is None:
                # gradient descent: this is where W and b are actually "learned"
                # batch_size    256
                # lr            0.1
                d2l.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # used in the "concise implementation of softmax regression" section

            # y holds integer class indices; convert it to float32
            y = y.astype('float32')

            # convert the batch loss to a scalar and accumulate it
            train_l_sum += l.asscalar()

            # count the correctly classified samples
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size

        # after each epoch, evaluate accuracy on the test data: run the current model,
        # mark each prediction 1 if correct and 0 otherwise, and average over the test set
        test_acc = evaluate_accuracy(test_iter, net)

        # epoch         current epoch number
        # loss          average cross-entropy loss
        # train acc     training accuracy
        # test acc      test accuracy
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' %
              (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X,Y in data_iter:
            if is_random_iter:                       # with random sampling, re-initialize the hidden state before each minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = Y.T.reshape((-1, ))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            d2l.sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch+1)%pred_period == 0:
            print('epoch %d, perplexity %f, time %.3f sec' % (epoch+1, math.exp(l_sum/n), time.time()-start))
            for prefix in prefixes:
                print('-', predict_rnn(prefix, pred_len, rnn, params, init_rnn_state, num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))


num_epochs, num_steps, batch_size, lr, clipping_theta = 250, 35, 32, 100, 0.01
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']

train_and_predit_rnn(rnn, get_param, init_rnn_state, num_hiddens, vocab_size, ctx, corpus_indices,
                     idx_to_char, char_to_idx, True, num_epochs, num_steps, lr, clipping_theta,
                     batch_size, pred_period, pred_len, prefixes)
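
The snippet above also depends on rnn, init_rnn_state and get_param being defined elsewhere in the author's file. Here are sketches of the first two, matching the d2lzh "RNN from scratch" section; the parameter names follow the book and are assumptions here:

from mxnet import nd

def init_rnn_state(batch_size, num_hiddens, ctx):
    # Returned as a tuple so the same interface also covers LSTM-style states.
    return (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx), )

def rnn(inputs, state, params):
    # inputs and outputs: num_steps matrices of shape (batch_size, vocab_size).
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H,)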
Example #25
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    '''
    :param rnn:                 the recurrent neural network function
    :param get_params:          returns the model parameters (weights)
    :param init_rnn_state:      initializes the hidden state
    :param num_hiddens:         size of the hidden layer
    :param vocab_size:          number of distinct characters
    :param ctx:
    :param corpus_indices:      character indices of the corpus
    :param idx_to_char:
    :param char_to_idx:
    :param is_random_iter:      whether to sample the data randomly
    :param num_epochs:          total number of epochs
    :param num_steps:
    :param lr:                  learning rate
    :param clipping_theta:      gradient-clipping threshold
    :param batch_size:          batch size
    :param pred_period:         prediction period
    :param pred_len:
    :param prefixes:            characters to start the predictions from
    :return:
    '''

    if is_random_iter:
        # each minibatch contains batch_size * num_steps indices, so the corpus
        # yields roughly len(corpus_indices) / (batch_size * num_steps) minibatches
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params(vocab_size, num_hiddens, vocab_size, ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:  # with consecutive sampling, initialize the hidden state at the start of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:  # with random sampling, initialize the state before each minibatch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # detach the hidden state from the computation graph
                for s in state:
                    # detach() turns this node into a variable that needs no gradient, so
                    # backpropagation stops here and does not flow further back through it
                    s.detach()
            with autograd.record():
                # inputs is num_steps matrices of shape (batch_size, vocab_size)
                inputs = to_onehot(X, vocab_size)
                # outputs is num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # after concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transposing and flattening gives a
                # vector of length batch_size * num_steps that lines up with the output rows
                y = Y.T.reshape((-1, ))  # flatten to a 1-D array
                # use the cross-entropy loss to compute the mean classification error
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            d2l.sgd(params, lr, 1)  # the loss is already a mean, so no further averaging
            l_sum += l.asscalar() * y.size  # mean loss times the number of labels
            n += y.size
        if (epoch + 1) % pred_period == 0:
            # perplexity
            print('epoch %d, perplexity %f, time %.2f sec' %
                  (epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(
                    ' -',
                    predict_rnn(prefix, pred_len, rnn, params, init_rnn_state,
                                num_hiddens, vocab_size, ctx, idx_to_char,
                                char_to_idx))
Example #26
def fit_and_plot(lambd):
    # w     <NDArray 200x1 @cpu(0)>
    # b     <NDArray 1x1 @cpu(0)>
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        # 100 epochs of training (100 rounds of gradient descent)
        for X, y in train_iter:
            # the training set has 20 samples; with batch_size 1, each epoch performs 20 updates

            # X <NDArray 1x200 @cpu(0)>
            # y <NDArray 1x1 @cpu(0)>
            with autograd.record():
                # add the L2 norm penalty; broadcasting turns it into a vector of length batch_size
                # objective: loss + λ / 2 * ||w||²   (the squared L2 norm)
                #
                # penalty term:
                # λ / 2 * (∥w∥ ** 2)
                # l2_penalty(w) = (w ** 2).sum() / 2
                # lambd * l2_penalty(w) = lambd / 2 * ((w ** 2).sum())

                # loss computation:
                # nd.dot(X, w) + b
                # X - <NDArray 1x200 @cpu(0)>
                # w - <NDArray 200x1 @cpu(0)>
                # b - <NDArray 1 @cpu(0)>
                netResult = net(X, w, b)
                lossO = loss(netResult, y)
                l = lossO + lambd * l2_penalty(w)

                # l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            # compute the gradients
            l.backward()

            # batch_size        1
            # lr                0.003
            # [w, b]            [200 x 1, 1 x 1]
            # gradient descent step
            d2l.sgd([w, b], lr, batch_size)

        # train_features    *   w   +   b       =       y
        # 20x200                200x1   1x1             20x1
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())

    # over the 100 epochs the training loss keeps dropping, but the test loss stays high
    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                 range(1, num_epochs + 1), test_ls, ['train', 'test'])

    # --------------------norm test--------------------
    # TTT1 = nd.array((1, 2, 3, 4)).reshape((1, 4))
    # sum of squares: 1 + 4 + 9 + 16 = 30, so TTT1.norm() is sqrt(30) ≈ 5.477
    # TTT2 = TTT1.norm().asscalar()
    # --------------------norm test--------------------

    print('L2 norm of w:', w.norm().asscalar())
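
The shapes in the comments above (20 training rows, 200 features, batch_size 1) match the high-dimensional linear-regression setup used for the weight-decay experiment. A sketch of that data generation in the style of d2lzh; the constants are assumptions taken from those comments:

from mxnet import nd
from mxnet.gluon import data as gdata

n_train, n_test, num_inputs, batch_size = 20, 100, 200, 1
true_w, true_b = nd.ones((num_inputs, 1)) * 0.01, 0.05

# y = 0.05 + sum(0.01 * x_i) + Gaussian noise
features = nd.random.normal(shape=(n_train + n_test, num_inputs))
labels = nd.dot(features, true_w) + true_b
labels += nd.random.normal(scale=0.01, shape=labels.shape)

train_features, test_features = features[:n_train, :], features[n_train:, :]
train_labels, test_labels = labels[:n_train], labels[n_train:]
train_iter = gdata.DataLoader(
    gdata.ArrayDataset(train_features, train_labels), batch_size, shuffle=True)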
Example #27
            d2l.sgd(params, lr, batch_size)
        else:
            trainer.step(batch_size)
        y = y.astype("float32")
        train_l_sum += l.asscalar()
        # train_acc_sum +=
        n += y.size
    print("epoch %d, loss %.4f,train_acc %.3f" % (epoch + 1, train_l_sum / n))


for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0.0
    X, y = mnist.train.next_batch(batch_size)
    X, y = nd.array(X), nd.array(y)
    print(X.max())
    # print(y.shape)
    with autograd.record():
        y_hat = net(X)
        l = loss(y_hat, y).sum()
    l.backward()
    if trainer is None:
        d2l.sgd(params, lr, batch_size)
    else:
        trainer.step(batch_size)
    y = y.astype("float32")
    train_l_sum += l.asscalar()
    n += y.size
    print("epoch %d, loss %.4f" % (epoch + 1, train_l_sum / n))
net.export("my_mlp")
print("ok!")