Example #1
def linear_regression(train_data, test_data, batch_size):
    # plot the training dataset
#    fig = plt.figure(figsize=(10,10))
#    ax = fig.add_subplot(111)
#    ax.plot(train_data[:,0], train_data[:,1], '*')
#    ax.grid()
#    ax.set_xlabel('x')
#    ax.set_ylabel('y')
#    ax.set_title('Training data')

    #---------------------------------------------------------------------#
    # TODO: Implement training of a linear regression model.
    # Here you should reuse ls_solve() from the registration mini-project.
    # The provided addones() function adds a column of all ones to a data
    # matrix X in a similar way to the c2h() function used in registration.

    trainX = train_data[:,0].reshape(-1,1)
    trainXones = util.addones(trainX)
    trainY = train_data[:,1].reshape(-1,1)
 
    Theta, _ = reg.ls_solve(trainXones, trainY) 
    print(Theta)
    #---------------------------------------------------------------------#

    fig1 = plt.figure(figsize=(10,10))
    ax1 = fig1.add_subplot(111)
    util.plot_regression_no_bars(trainX, trainY, Theta, ax1)
    ax1.grid()
    ax1.set_xlabel('x')
    ax1.set_ylabel('y')
    ax1.legend(('Original data', 'Regression curve', 'Predicted Data', 'Error'))
    ax1.set_title('Training set')
    
    fig1.savefig("Regression train with batch size {}.png".format(batch_size)) 


    testX = test_data[:,0].reshape(-1,1)
    testY = test_data[:,1].reshape(-1,1)

    fig2 = plt.figure(figsize=(10,10))
    ax2 = fig2.add_subplot(111)
    util.plot_regression_no_bars(testX, testY, Theta, ax2)
    ax2.grid()
    ax2.set_xlabel('x')
    ax2.set_ylabel('y')
    ax2.legend(('Original data', 'Regression curve', 'Predicted Data', 'Error'))
    ax2.set_title('Test set')
    
    fig2.savefig("Regression test with batch size {}.png".format(batch_size)) 

    #---------------------------------------------------------------------#
    # TODO: Compute the error for the trained model.
    predictedY_test = util.addones(testX).dot(Theta)
    E_test = np.sum(np.square(np.subtract(predictedY_test, testY)))
    #---------------------------------------------------------------------#

    return E_test, predictedY_test
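
Examples #1 and #2 rely on two helpers that are not shown in this listing: util.addones(), which the comments describe as appending a column of all ones to the data matrix X, and reg.ls_solve() from the registration mini-project. As a rough orientation, here is a minimal sketch of what such helpers might look like; the exact column order and return values of the course utilities are assumptions here.

import numpy as np

def addones(X):
    # append a column of ones so the bias can be folded into Theta
    # (assumed layout: features first, ones column last)
    return np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)

def ls_solve(A, b):
    # least-squares solution of A @ Theta = b; the residuals are returned
    # as a second value, mirroring the "Theta, _ = reg.ls_solve(...)" calls above
    Theta, residuals, rank, sv = np.linalg.lstsq(A, b, rcond=None)
    return Theta, residuals

Only the shapes matter for the snippets above: addones turns an N x d matrix into N x (d+1), and ls_solve returns a (d+1) x 1 parameter vector.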
Example #2
def linear_regression():
    
    # load the training, validation and testing datasets
    fn1 = '../data/linreg_ex_test.txt'
    fn2 = '../data/linreg_ex_train.txt'
    fn3 = '../data/linreg_ex_validation.txt'
    # shape (30,2) numpy array; x = column 0, y = column 1
    test_data = np.loadtxt(fn1)
    # shape (20,2) numpy array; x = column 0, y = column 1
    train_data = np.loadtxt(fn2)
    # shape (10,2) numpy array; x = column 0, y = column 1
    validation_data = np.loadtxt(fn3)

    # plot the training dataset
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    ax.plot(train_data[:,0], train_data[:,1], '*')
    ax.grid()
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Training data')

    #---------------------------------------------------------------------#
    # TODO: Implement training of a linear regression model.
    # Here you should reuse ls_solve() from the registration mini-project.
    # The provided addones() function adds a column of all ones to a data
    # matrix X in a similar way to the c2h() function used in registration.

    trainX = train_data[:,0].reshape(-1,1)
    trainXones = util.addones(trainX)
    trainY = train_data[:,1].reshape(-1,1)

    # fit the model parameters with least squares, as in Example #1
    Theta, _ = reg.ls_solve(trainXones, trainY)
    print(Theta)
    #---------------------------------------------------------------------#

    fig1 = plt.figure(figsize=(10,10))
    ax1 = fig1.add_subplot(111)
    util.plot_regression(trainX, trainY, Theta, ax1)
    ax1.grid()
    ax1.set_xlabel('x')
    ax1.set_ylabel('y')
    ax1.legend(('Original data', 'Regression curve', 'Predicted Data', 'Error'))
    ax1.set_title('Training set')

    testX = test_data[:,0].reshape(-1,1)
    testY = test_data[:,1].reshape(-1,1)

    fig2 = plt.figure(figsize=(10,10))
    ax2 = fig2.add_subplot(111)
    util.plot_regression(testX, testY, Theta, ax2)
    ax2.grid()
    ax2.set_xlabel('x')
    ax2.set_ylabel('y')
    ax2.legend(('Original data', 'Regression curve', 'Predicted Data', 'Error'))
    ax2.set_title('Test set')

    #---------------------------------------------------------------------#
    # TODO: Compute the error for the trained model.
    # (summed squared error on the validation and test sets, as in Example #1)
    validationX = validation_data[:,0].reshape(-1,1)
    validationY = validation_data[:,1].reshape(-1,1)
    E_validation = np.sum(np.square(util.addones(validationX).dot(Theta) - validationY))
    E_test = np.sum(np.square(util.addones(testX).dot(Theta) - testY))
    #---------------------------------------------------------------------#

    return E_validation, E_test
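
Both linear regression examples pass the fitted Theta to util.plot_regression() (or plot_regression_no_bars()) with the signature (X, Y, Theta, ax). Judging by the legend entries ('Original data', 'Regression curve', 'Predicted Data', 'Error'), a helper along the following lines is assumed; this is an illustrative sketch, not the course implementation.

import numpy as np

def plot_regression(X, Y, Theta, ax):
    # X: (N, d) features without the ones column, Y: (N, 1) targets,
    # Theta: (d+1, 1) parameters fitted on addones(X)
    Xones = np.concatenate((X, np.ones((X.shape[0], 1))), axis=1)
    predictedY = Xones.dot(Theta)
    order = np.argsort(X[:, 0])
    ax.plot(X[:, 0], Y[:, 0], '*')                    # original data
    ax.plot(X[order, 0], predictedY[order, 0], '-')   # regression curve
    ax.plot(X[:, 0], predictedY[:, 0], '.')           # predicted data
    for i in range(X.shape[0]):                       # error bars
        ax.plot([X[i, 0], X[i, 0]], [Y[i, 0], predictedY[i, 0]], 'r--', linewidth=0.5)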
Example #3
def logistic_regression():
    
    # dataset preparation
    num_training_samples = 300
    num_validation_samples = 100
    
    # here we reuse the function from the segmentation practicals
    m1 = [2, 3]
    m2 = [0, -4]
    s1 = [[8, 7], [7, 8]]
    s2 = [[8, 6], [6, 8]]

    [trainingX, trainingY] = util.generate_gaussian_data(num_training_samples, m1, m2, s1, s2)
    r,c = trainingX.shape
    print('Training sample shape: {}'.format(trainingX.shape))

    # we need a validation set to monitor for overfitting
    [validationX, validationY] = util.generate_gaussian_data(num_validation_samples, m1, m2, s1, s2)
    r_val,c_val = validationX.shape
    print('Validation sample shape: {}'.format(validationX.shape))
    
    validationXones = util.addones(validationX)

    # train a logistic regression model:
    # the learning rate for the gradient descent method
    # (the same as in intensity-based registration)
    mu = 0.001

    # we are actually using stochastic gradient descent
    batch_size = 30

    # initialize the parameters of the model with small random values,
    # we need one parameter for each feature and a bias
    Theta = 0.02*np.random.rand(c+1, 1)

    # number of gradient descent iterations
    num_iterations = 300

    # variables to keep the loss and gradient at every iteration
    # (needed for visualization)
    iters = np.arange(num_iterations)
    loss = np.full(iters.shape, np.nan)
    validation_loss = np.full(iters.shape, np.nan)

    # Create base figure
    fig = plt.figure(figsize=(15,8))
    ax1 = fig.add_subplot(121)
    im1, Xh_ones, num_range_points = util.plot_lr(trainingX, trainingY, Theta, ax1)
    util.scatter_data(trainingX, trainingY, ax=ax1);
    ax1.grid()
    ax1.set_xlabel('x_1')
    ax1.set_ylabel('x_2')
    ax1.legend()
    ax1.set_title('Training set')
    text_str1 = '{:.4f};  {:.4f};  {:.4f}'.format(0, 0, 0)
    txt1 = ax1.text(0.3, 0.95, text_str1, bbox={'facecolor': 'white', 'alpha': 1, 'pad': 10}, transform=ax1.transAxes)

    ax2 = fig.add_subplot(122)
    ax2.set_xlabel('Iteration')
    ax2.set_ylabel('Loss (average per sample)')
    ax2.set_title('mu = '+str(mu))
    h1, = ax2.plot(iters, loss, linewidth=2, label='Training loss')
    h2, = ax2.plot(iters, validation_loss, linewidth=2, label='Validation loss')
    ax2.set_ylim(0, 0.7)
    ax2.set_xlim(0, num_iterations)
    ax2.grid()
    ax2.legend()

    text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
    txt2 = ax2.text(0.3, 0.95, text_str2, bbox={'facecolor': 'white', 'alpha': 1, 'pad': 10}, transform=ax2.transAxes)

    # iterate
    for k in np.arange(num_iterations):
        
        # pick a batch at random
        idx = np.random.randint(r, size=batch_size)

        # the loss function for this particular batch
        loss_fun = lambda Theta: cad.lr_nll(util.addones(trainingX[idx,:]), trainingY[idx], Theta)

        # gradient descent:
        # here we reuse the code for numerical computation of the gradient
        # of a function
        Theta = Theta - mu*reg.ngradient(loss_fun, Theta)

        # compute the loss for the current model parameters for the
        # training and validation sets
        # note that the loss is divided by the number of samples so that
        # it is comparable across different sample sizes
        loss[k] = loss_fun(Theta)/batch_size
        validation_loss[k] = cad.lr_nll(validationXones, validationY, Theta)/r_val

        # update the visualization
        ph = cad.sigmoid(Xh_ones.dot(Theta)) > 0.5
        decision_map = ph.reshape(num_range_points, num_range_points)
        decision_map_trns = np.flipud(decision_map)
        im1.set_data(decision_map_trns)
        text_str1 = '{:.4f};  {:.4f};  {:.4f}'.format(Theta[0,0], Theta[1,0], Theta[2,0])
        txt1.set_text(text_str1)
        h1.set_ydata(loss)
        h2.set_ydata(validation_loss)
        text_str2 = 'iter.={}, loss={:.3f}, val. loss={:.3f} '.format(k, loss[k], validation_loss[k])
        txt2.set_text(text_str2)


        display(fig)
        clear_output(wait = True)
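
Example #3 trains logistic regression with stochastic gradient descent, where the loss cad.lr_nll() is the negative log-likelihood and the gradient is estimated numerically with reg.ngradient() (reused from registration). Below is a minimal sketch of those pieces, under the assumption that lr_nll returns the summed (not averaged) negative log-likelihood, which is why the snippet divides by batch_size afterwards.

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def lr_nll(X, y, Theta):
    # summed negative log-likelihood of a logistic regression model;
    # X already contains the ones column, y holds 0/1 labels, Theta is (d+1, 1)
    y = np.asarray(y).reshape(-1, 1)
    p = sigmoid(X.dot(Theta))
    eps = 1e-10  # guard against log(0)
    return -np.sum(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))

def ngradient(fun, x, h=1e-3):
    # central-difference estimate of the gradient of fun at the column vector x
    g = np.zeros_like(x, dtype=float)
    for i in range(x.shape[0]):
        step = np.zeros_like(x, dtype=float)
        step[i] = h / 2
        g[i] = (fun(x + step) - fun(x - step)) / h
    return g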
Example #4
def nuclei_classification():
    ## dataset preparation
    fn = '../data/nuclei_data_classification.mat'
    mat = scipy.io.loadmat(fn)

    test_images = mat["test_images"]  # (24, 24, 3, 20730)
    test_y = mat["test_y"]  # (20730, 1)
    training_images = mat["training_images"]  # (24, 24, 3, 14607)
    training_y = mat["training_y"]  # (14607, 1)
    validation_images = mat["validation_images"]  # (24, 24, 3, 7303)
    validation_y = mat["validation_y"]  # (7303, 1)

    ## dataset preparation
    training_x, validation_x, test_x = util.reshape_and_normalize(
        training_images, validation_images, test_images)

    ## training a logistic regression model
    #-------------------------------------------------------------------#
    # TODO: Select values for the learning rate (mu), batch size
    # (batch_size) and number of iterations (num_iterations), as well as
    # initial values for the model parameters (Theta) that will result in
    # fast training of an accurate model for this classification problem.
    # (example values, following the other snippets in this listing)
    mu = 0.00001
    batch_size = 500
    num_iterations = 300
    r, c = training_x.shape
    Theta = 0.02 * np.random.rand(c + 1, 1)
    #-------------------------------------------------------------------#

    xx = np.arange(num_iterations)
    loss = np.empty(*xx.shape)
    loss[:] = np.nan
    validation_loss = np.empty(*xx.shape)
    validation_loss[:] = np.nan
    g = np.empty(*xx.shape)
    g[:] = np.nan

    fig = plt.figure(figsize=(8, 8))
    ax2 = fig.add_subplot(111)
    ax2.set_xlabel('Iteration')
    ax2.set_ylabel('Loss (average per sample)')
    ax2.set_title('mu = ' + str(mu))
    h1, = ax2.plot(xx, loss, linewidth=2)  #'Color', [0.0 0.2 0.6],
    h2, = ax2.plot(xx, validation_loss, linewidth=2)  #'Color', [0.8 0.2 0.8],
    ax2.set_ylim(0, 0.7)
    ax2.set_xlim(0, num_iterations)
    ax2.grid()

    text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
    txt2 = ax2.text(0.3,
                    0.95,
                    text_str2,
                    bbox={
                        'facecolor': 'white',
                        'alpha': 1,
                        'pad': 10
                    },
                    transform=ax2.transAxes)

    for k in np.arange(num_iterations):
        # pick a batch at random
        idx = np.random.randint(training_x.shape[0], size=batch_size)

        training_x_ones = util.addones(training_x[idx, :])
        validation_x_ones = util.addones(validation_x)

        # the loss function for this particular batch
        loss_fun = lambda Theta: cad.lr_nll(training_x_ones, training_y[idx],
                                            Theta)

        # gradient descent
        # instead of the numerical gradient, we compute the gradient with
        # the analytical expression, which is much faster
        Theta_new = Theta - mu * cad.lr_agrad(training_x_ones, training_y[idx],
                                              Theta).T

        loss[k] = loss_fun(Theta_new) / batch_size
        validation_loss[k] = cad.lr_nll(validation_x_ones, validation_y,
                                        Theta_new) / validation_x.shape[0]

        # visualize the training
        h1.set_ydata(loss)
        h2.set_ydata(validation_loss)
        text_str2 = 'iter.: {}, loss: {:.3f}, val. loss={:.3f} '.format(
            k, loss[k], validation_loss[k])
        txt2.set_text(text_str2)

        # keep the updated parameters for the next iteration
        Theta = np.array(Theta_new)

        display(fig)
        clear_output(wait=True)
        plt.pause(.005)
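
Examples #4, #8 and #9 replace the numerical gradient by cad.lr_agrad(), the analytical gradient of the logistic-regression negative log-likelihood, X^T (p - y). The snippets transpose its result before the update (Theta - mu * ... .T), which suggests it returns a row vector; here is a hedged sketch under that assumption.

import numpy as np

def lr_agrad(X, y, Theta):
    # analytical gradient of the summed logistic-regression negative
    # log-likelihood, returned as a (1, d+1) row vector (hence the .T in
    # the parameter updates above)
    y = np.asarray(y).reshape(-1, 1)
    p = 1.0 / (1.0 + np.exp(-X.dot(Theta)))   # predicted probabilities, (N, 1)
    return np.sum((p - y) * X, axis=0, keepdims=True)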
Example #5
# dataset preparation (the fragment starts mid-script; these definitions
# mirror Example #3)
num_training_samples = 300
num_validation_samples = 100

# here we reuse the function from the segmentation practicals
m1 = [2, 3]
m2 = [0, -4]
s1 = [[8, 7], [7, 8]]
s2 = [[8, 6], [6, 8]]

[trainingX, trainingY] = seg.generate_gaussian_data(num_training_samples, m1,
                                                    m2, s1, s2)
r, c = trainingX.shape
print('Training sample shape: {}'.format(trainingX.shape))

# we need a validation set to monitor for overfitting
[validationX,
 validationY] = seg.generate_gaussian_data(num_validation_samples, m1, m2, s1,
                                           s2)
r_val, c_val = validationX.shape
print('Validation sample shape: {}'.format(validationX.shape))

validationXones = util.addones(validationX)

# train a logistic regression model:
# the learning rate for the gradient descent method
# (the same as in intensity-based registration)
mu = 0.001

# we are actually using stochastic gradient descent
batch_size = 30

# initialize the parameters of the model with small random values,
# we need one parameter for each feature and a bias
Theta = 0.02 * np.random.rand(c + 1, 1)

# number of gradient descent iterations
num_iterations = 300
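
Like Example #3, this fragment draws its toy dataset with generate_gaussian_data() from the segmentation practicals (imported as seg here, as util in Example #3). Below is a minimal sketch of such a generator, assuming it returns stacked samples with 0/1 labels and that the sample count is per class; both details are assumptions, not the course implementation.

import numpy as np

def generate_gaussian_data(n_per_class, m1, m2, s1, s2):
    # draw n_per_class samples from each of two 2D Gaussians with means
    # m1, m2 and covariances s1, s2, and return features plus 0/1 labels
    X1 = np.random.multivariate_normal(m1, s1, n_per_class)
    X2 = np.random.multivariate_normal(m2, s2, n_per_class)
    X = np.concatenate((X1, X2), axis=0)
    y = np.concatenate((np.zeros((n_per_class, 1)), np.ones((n_per_class, 1))), axis=0)
    return X, y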
Example #6
# the fragment starts mid-script; the data loading and figure setup below
# mirror Example #2
train_data = np.loadtxt('../data/linreg_ex_train.txt')
validation_data = np.loadtxt('../data/linreg_ex_validation.txt')

# plot the training dataset
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
ax.plot(train_data[:, 0], train_data[:, 1], '*')
ax.grid()
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Training data')

#---------------------------------------------------------------------#
# TODO: Implement training of a linear regression model.
# Here you should reuse ls_solve() from the registration mini-project.
# The provided addones() function adds a column of all ones to a data
# matrix X in a similar way to the c2h() function used in registration.

trainX = train_data[:, 0].reshape(-1, 1)
trainXsquared = np.square(train_data[:, 0]).reshape(-1, 1)
trainX = np.hstack((trainX, trainXsquared))
trainXones = util.addones(trainX)
trainY = train_data[:, 1].reshape(-1, 1)

validationX = validation_data[:, 0].reshape(-1, 1)
validationXsquared = np.square(validation_data[:, 0]).reshape(-1, 1)
validationX = np.hstack((validationX, validationXsquared))
validationones = util.addones(validationX)
validationY = validation_data[:, 1].reshape(-1, 1)

Theta, _ = reg.ls_solve(trainXones, trainY)
print(Theta)
#---------------------------------------------------------------------#

fig1 = plt.figure(figsize=(10, 10))
ax1 = fig1.add_subplot(111)
util.plot_regression(trainX, trainY, Theta, ax1)
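
Example #6 expands the inputs to quadratic features [x, x^2] before fitting, so the resulting Theta has three entries (x, x^2, bias). The fragment stops before validationones is used; following the error computation of Example #1, a short continuation could look like this.

# predict on the validation set with the quadratic model and compute
# the summed squared error, as in Example #1
predictedY_val = validationones.dot(Theta)
E_validation = np.sum(np.square(predictedY_val - validationY))
print(E_validation)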
Example #7
text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
txt2 = ax2.text(0.3, 0.95, text_str2, bbox={'facecolor': 'white', 'alpha': 1, 'pad': 10}, transform=ax2.transAxes)

E_validation = 1
E_validation_new = 0
k = 0
counter = 0
stopnow = 0
normgradient = 1
while normgradient > 0.1 and stopnow < 1:
    
    # pick a batch at random
    idx = np.random.randint(training_x.shape[0], size=batch_size)

    training_x_ones = util.addones(training_x[idx,:])
        
    validation_x_ones = util.addones(validation_x)

    # the loss function for this particular batch
    loss_fun = lambda Theta: cad.lr_nll(training_x_ones, training_y[idx], Theta)

    # gradient descent
    # instead of the numerical gradient, we compute the gradient with
    # the analytical expression, which is much faster
    Theta_new = Theta - mu*cad.lr_agrad(training_x_ones, training_y[idx], Theta).T

    loss[k] = loss_fun(Theta_new)/batch_size
    validation_loss[k] = cad.lr_nll(validation_x_ones, validation_y, Theta_new)/validation_x.shape[0]
    
    
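Example #7 is cut off mid-loop: the batch loss and validation loss are recorded, but Theta is never updated and neither normgradient, k nor stopnow change, so the while loop as shown cannot terminate. Example #8 below contains the complete version; a minimal sketch of the missing tail of the loop body, following that example:

    # magnitude of the current validation loss, used as the stopping criterion
    normgradient = np.linalg.norm(validation_loss[k])

    # accept the update and advance the iteration counters
    Theta = np.array(Theta_new)
    k += 1
    counter += 1

    # stop once the validation loss has plateaued
    # (compare with the value 25 iterations earlier, as in Example #8)
    if k > 100 and round(validation_loss[k - 1], 4) == round(validation_loss[k - 26], 4):
        stopnow = 1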
Example #8
def auto_nuclei_classification(mu, batch_size):
    ## dataset preparation
    fn = '../data/nuclei_data_classification.mat'
    mat = scipy.io.loadmat(fn)

    test_images = mat["test_images"]  # (24, 24, 3, 20730)
    test_y = mat["test_y"]  # (20730, 1)
    training_images = mat["training_images"]  # (24, 24, 3, 14607)
    training_y = mat["training_y"]  # (14607, 1)
    validation_images = mat["validation_images"]  # (24, 24, 3, 7303)
    validation_y = mat["validation_y"]  # (7303, 1)

    ## dataset preparation
    imageSize = training_images.shape
    # every pixel is a feature so the number of features is:
    # height x width x color channels
    numFeatures = imageSize[0] * imageSize[1] * imageSize[2]
    training_x = training_images.reshape(
        numFeatures, training_images.shape[3]).T.astype(float)
    validation_x = validation_images.reshape(
        numFeatures, validation_images.shape[3]).T.astype(float)
    test_x = test_images.reshape(numFeatures,
                                 test_images.shape[3]).T.astype(float)

    # the training will progress much better if we
    # normalize the features
    meanTrain = np.mean(training_x, axis=0).reshape(1, -1)
    stdTrain = np.std(training_x, axis=0).reshape(1, -1)

    training_x = training_x - np.tile(meanTrain, (training_x.shape[0], 1))
    training_x = training_x / np.tile(stdTrain, (training_x.shape[0], 1))

    validation_x = validation_x - np.tile(meanTrain,
                                          (validation_x.shape[0], 1))
    validation_x = validation_x / np.tile(stdTrain, (validation_x.shape[0], 1))

    test_x = test_x - np.tile(meanTrain, (test_x.shape[0], 1))
    test_x = test_x / np.tile(stdTrain, (test_x.shape[0], 1))

    ## training a logistic regression model
    #-------------------------------------------------------------------#
    # TODO: Select values for the learning rate (mu), batch size
    # (batch_size) and number of iterations (num_iterations), as well as
    # initial values for the model parameters (Theta) that will result in
    # fast training of an accurate model for this classification problem.
    #    a = 5
    #    mu_init =10**-a;
    #    mu = mu_init;

    #number of training samples
    #    batch_size = 3000
    r, c = training_x.shape

    #initial weights
    Theta = 0.02 * np.random.rand(c + 1, 1)

    #-------------------------------------------------------------------#

    xx = np.arange(100000)
    loss = np.empty(*xx.shape)
    loss[:] = np.nan
    validation_loss = np.empty(*xx.shape)
    validation_loss[:] = np.nan
    g = np.empty(*xx.shape)
    g[:] = np.nan

    idx = np.random.randint(training_x.shape[0], size=batch_size)

    # Create base figure
    fig = plt.figure(figsize=(15, 10))

    ax2 = fig.add_subplot(111)
    ax2.set_xlabel('Iteration')

    ax2.set_ylabel('Loss (average per sample)')
    ax2.set_title('mu = ' + str(mu))
    h1, = ax2.plot(xx, loss, linewidth=2)  #'Color', [0.0 0.2 0.6],
    h2, = ax2.plot(xx, validation_loss, linewidth=2)  #'Color', [0.8 0.2 0.8],
    ax2.set_ylim(0, 0.7)
    ax2.grid()

    text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
    txt2 = ax2.text(0.3,
                    0.95,
                    text_str2,
                    bbox={
                        'facecolor': 'white',
                        'alpha': 1,
                        'pad': 10
                    },
                    transform=ax2.transAxes)

    # some initial parameter settings
    k = 0  # iteration number
    counter = 0  # count number of iterations, resets after 100 iterations
    stopnow = 0  # used to stop when loss doesn't decrease any further
    normgradient = 1
    while normgradient > 0.1 and stopnow < 1:

        # pick a batch at random
        idx = np.random.randint(training_x.shape[0], size=batch_size)

        training_x_ones = util.addones(training_x[idx, :])

        validation_x_ones = util.addones(validation_x)

        # the loss function for this particular batch
        loss_fun = lambda Theta: cad.lr_nll(training_x_ones, training_y[idx],
                                            Theta)

        # gradient descent
        # instead of the numerical gradient, we compute the gradient with
        # the analytical expression, which is much faster
        Theta_new = Theta - mu * cad.lr_agrad(training_x_ones, training_y[idx],
                                              Theta).T

        # calculate the loss and the validation loss
        loss[k] = loss_fun(Theta_new) / batch_size
        validation_loss[k] = cad.lr_nll(validation_x_ones, validation_y,
                                        Theta_new) / validation_x.shape[0]

        # magnitude of the current validation loss, used as the stopping
        # criterion (despite the variable name, this is not a gradient norm)
        normgradient = np.linalg.norm(validation_loss[k])

        # visualize the training
        ax2.set_xlim(0, k)  #axis needs to be adapted every iteration
        ax2.set_title('mu = {:.2}'.format(mu))

        h1.set_ydata(loss)
        h2.set_ydata(validation_loss)

        text_str2 = 'iter.: {}, loss: {:.3f}, val. loss={:.3f} '.format(
            k, loss[k], validation_loss[k])
        txt2.set_text(text_str2)

        # keep the updated parameters for the next iteration
        # (the figure is redrawn once, further down)
        Theta = np.array(Theta_new)

        display(fig)
        clear_output(wait=True)
        plt.pause(.005)

        # stop when the validation loss has plateaued: compare the current
        # validation loss with the value 25 iterations earlier
        if k > 100:
            if round(validation_loss[k], 4) == round(validation_loss[k - 25],
                                                     4):
                stopnow = 1
                print("The validation loss has reached its equilibrium")

        #increment iteration parameters
        k += 1
        counter += 1

    #save the final loss curve
    fig.savefig("Loss curve for batch size {} and init mu {:.2}.png".format(
        batch_size, mu))

    #predict the test data with the final weights
    predictedY_test = util.addones(test_x).dot(Theta)

    # compute the summed squared error on the test set
    E_test = np.sum(np.square(np.subtract(predictedY_test, test_y)))

    return predictedY_test, E_test
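
Examples #8 and #9 z-score the features with the mean and standard deviation of the training set, tiling both to the full matrix size with np.tile. NumPy broadcasting gives the same result without materializing the tiled arrays; util.reshape_and_normalize() in Example #4 presumably wraps something similar (an assumption). An equivalent sketch:

import numpy as np

def normalize_with_training_stats(training_x, validation_x, test_x):
    # z-score all three sets using the training-set statistics only,
    # relying on broadcasting instead of np.tile
    meanTrain = np.mean(training_x, axis=0, keepdims=True)
    stdTrain = np.std(training_x, axis=0, keepdims=True)
    training_x = (training_x - meanTrain) / stdTrain
    validation_x = (validation_x - meanTrain) / stdTrain
    test_x = (test_x - meanTrain) / stdTrain
    return training_x, validation_x, test_x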
Example #9
def nuclei_classification(mu, batch_size, num_iterations):
    ## dataset preparation
    fn = '../data/nuclei_data_classification.mat'
    mat = scipy.io.loadmat(fn)

    test_images = mat["test_images"]  # (24, 24, 3, 20730)
    test_y = mat["test_y"]  # (20730, 1)
    training_images = mat["training_images"]  # (24, 24, 3, 14607)
    training_y = mat["training_y"]  # (14607, 1)
    validation_images = mat["validation_images"]  # (24, 24, 3, 7303)
    validation_y = mat["validation_y"]  # (7303, 1)

    ## dataset preparation
    imageSize = training_images.shape
    # every pixel is a feature so the number of features is:
    # height x width x color channels
    numFeatures = imageSize[0] * imageSize[1] * imageSize[2]
    training_x = training_images.reshape(
        numFeatures, training_images.shape[3]).T.astype(float)
    validation_x = validation_images.reshape(
        numFeatures, validation_images.shape[3]).T.astype(float)
    test_x = test_images.reshape(numFeatures,
                                 test_images.shape[3]).T.astype(float)

    # the training will progress much better if we
    # normalize the features
    meanTrain = np.mean(training_x, axis=0).reshape(1, -1)
    stdTrain = np.std(training_x, axis=0).reshape(1, -1)

    training_x = training_x - np.tile(meanTrain, (training_x.shape[0], 1))
    training_x = training_x / np.tile(stdTrain, (training_x.shape[0], 1))

    validation_x = validation_x - np.tile(meanTrain,
                                          (validation_x.shape[0], 1))
    validation_x = validation_x / np.tile(stdTrain, (validation_x.shape[0], 1))

    test_x = test_x - np.tile(meanTrain, (test_x.shape[0], 1))
    test_x = test_x / np.tile(stdTrain, (test_x.shape[0], 1))

    ## training a logistic regression model
    #-------------------------------------------------------------------#
    # TODO: Select values for the learning rate (mu), batch size
    # (batch_size) and number of iterations (num_iterations), as well as
    # initial values for the model parameters (Theta) that will result in
    # fast training of an accurate model for this classification problem.
    #    mu = 0.00001
    #    batch_size = 500
    #    num_iterations = 300

    r, c = training_x.shape

    Theta = 0.02 * np.random.rand(c + 1, 1)

    #-------------------------------------------------------------------#

    xx = np.arange(num_iterations)
    loss = np.empty(*xx.shape)
    loss[:] = np.nan
    validation_loss = np.empty(*xx.shape)
    validation_loss[:] = np.nan
    g = np.empty(*xx.shape)
    g[:] = np.nan

    fig = plt.figure(figsize=(8, 8))
    ax2 = fig.add_subplot(111)
    ax2.set_xlabel('Iteration')
    ax2.set_ylabel('Loss (average per sample)')
    ax2.set_title('mu = ' + str(mu))
    h1, = ax2.plot(xx, loss, linewidth=2)  #'Color', [0.0 0.2 0.6],
    h2, = ax2.plot(xx, validation_loss, linewidth=2)  #'Color', [0.8 0.2 0.8],
    ax2.set_ylim(0, 0.7)
    ax2.set_xlim(0, num_iterations)
    ax2.grid()

    text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
    txt2 = ax2.text(0.3,
                    0.95,
                    text_str2,
                    bbox={
                        'facecolor': 'white',
                        'alpha': 1,
                        'pad': 10
                    },
                    transform=ax2.transAxes)

    for k in np.arange(num_iterations):
        # pick a batch at random
        idx = np.random.randint(training_x.shape[0], size=batch_size)

        training_x_ones = util.addones(training_x[idx, :])
        validation_x_ones = util.addones(validation_x)

        # the loss function for this particular batch
        loss_fun = lambda Theta: cad.lr_nll(training_x_ones, training_y[idx],
                                            Theta)

        # gradient descent
        # instead of the numerical gradient, we compute the gradient with
        # the analytical expression, which is much faster
        Theta_new = Theta - mu * cad.lr_agrad(training_x_ones, training_y[idx],
                                              Theta).T

        loss[k] = loss_fun(Theta_new) / batch_size
        validation_loss[k] = cad.lr_nll(validation_x_ones, validation_y,
                                        Theta_new) / validation_x.shape[0]

        # visualize the training
        h1.set_ydata(loss)
        h2.set_ydata(validation_loss)
        text_str2 = 'iter.: {}, loss: {:.3f}, val. loss={:.3f}'.format(
            k, loss[k], validation_loss[k])
        txt2.set_text(text_str2)

        # keep the updated parameters for the next iteration
        Theta = np.array(Theta_new)

        display(fig)
        clear_output(wait=True)
        plt.pause(.005)

    # save the final loss curve
    fig.savefig("Loss curve for batch size {} and init mu {:.2}.png".format(
        batch_size, mu))

    #---------------------------------------------------------------------#
    # TODO: Compute the error for the trained model.
    predictedY_test = util.addones(test_x).dot(Theta)
    E_test = np.sum(np.square(np.subtract(predictedY_test, test_y)))
    #---------------------------------------------------------------------#
    return predictedY_test, E_test
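
The final error in Examples #8 and #9 is a summed squared error computed directly on the linear scores addones(test_x).dot(Theta), without applying the sigmoid. For a classification problem it is often more informative to threshold the sigmoid outputs and report accuracy instead; a hedged alternative, reusing cad.sigmoid as in Example #3:

# probability estimates and hard 0/1 labels for the test set
p_test = cad.sigmoid(util.addones(test_x).dot(Theta))
predicted_labels = (p_test > 0.5).astype(int)

# fraction of correctly classified test nuclei
test_accuracy = np.mean(predicted_labels == test_y)
print('Test accuracy: {:.3f}'.format(test_accuracy))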