Example #1
def main():
    """Compare full, stochastic and mini-batch gradient descent.

    Trains a multi-class logistic regression three times on the data
    returned by ``get_pca_normalized_data`` (once per descent variant),
    records the test cost after every parameter update and plots the three
    cost curves on a common, normalized x axis.

    Command-line arguments (read from ``sys.argv``):
        1. learning rate (float)
        2. L2 regularization strength (float)
        3. mini-batch size (int)
    """
    X_train, X_test, t_train, t_test = get_pca_normalized_data()
    print("Performing multi-class logistic regression...\n")

    N, D = X_train.shape
    K = 10
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    lr = float(sys.argv[1])
    reg = float(sys.argv[2])
    batch_size = int(sys.argv[3])
    n_epochs = 50

    ######## 1. FULL GRADIENT DESCENT ########
    print('Full Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_full = []
    t0 = datetime.now()
    for epoch in range(n_epochs):
        Y_train = forward(X_train, W, b)
        # The parameters are moved *against* the gradient, so the L2 penalty
        # has to be added to it (subtracting it would grow the weights).
        # NOTE(review): assumes gradW/gradb return the gradient of the cost.
        W -= lr * (gradW(T_train, Y_train, X_train) + reg * W)
        b -= lr * (gradb(T_train, Y_train) + reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_full.append(j_test)  # history used by the plot below

        if epoch % 10 == 0:
            acc = accuracy(predict(Y_test), t_test)
            print("Epoch {}:\tcost: {}\taccuracy: {}".format(
                epoch, round(j_test, 4), acc))
    Y_test = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test), t_test))
    print("Elapsted time for full GD: {}\n".format(datetime.now() - t0))

    ######## 2. STOCHASTIC GRADIENT DESCENT ########
    print('Stochastic Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_stochastic = []
    t0 = datetime.now()
    # Slow: the cost over the whole test set is recomputed after every sample.
    for epoch in range(n_epochs):
        tmpX, tmpT = shuffle(X_train, T_train)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            t = tmpT[n, :].reshape(1, K)
            Y_train = forward(x, W, b)

            W -= lr * (gradW(t, Y_train, x) + reg * W)
            b -= lr * (gradb(t, Y_train) + reg * b)

            Y_test = forward(X_test, W, b)
            j_test = J(T_test, Y_test)
            J_test_stochastic.append(j_test)

        if epoch % 10 == 0:
            acc = accuracy(predict(Y_test), t_test)
            print("Epoch {}:\tcost: {}\taccuracy: {}".format(
                epoch, round(j_test, 4), acc))
    Y_test_final = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsted time for SGD: {}\n".format(datetime.now() - t0))

    ######## 3. BATCH GRADIENT DESCENT ########
    print('Batch Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_batch = []
    nb_batches = N // batch_size  # a trailing partial batch is dropped
    t0 = datetime.now()
    for epoch in range(n_epochs):
        tmpX, tmpT = shuffle(X_train, T_train)
        for batch_index in range(nb_batches):
            start = batch_index * batch_size
            stop = start + batch_size
            x = tmpX[start:stop, :]
            t = tmpT[start:stop, :]
            Y_train = forward(x, W, b)

            W -= lr * (gradW(t, Y_train, x) + reg * W)
            b -= lr * (gradb(t, Y_train) + reg * b)

            Y_test = forward(X_test, W, b)
            j_test = J(T_test, Y_test)
            J_test_batch.append(j_test)

        if epoch % 10 == 0:
            acc = accuracy(predict(Y_test), t_test)
            print("Epoch {}\tcost: {}\taccuracy: {}".format(
                epoch, round(j_test, 4), acc))
    Y_test_final = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsted time for batch GD:", datetime.now() - t0)

    ######## PLOTS ########
    # The three histories have different lengths, so each one is spread over
    # [0, 1] to make the curves comparable.
    x1 = np.linspace(0, 1, len(J_test_full))
    plt.plot(x1, J_test_full, label="full")
    x2 = np.linspace(0, 1, len(J_test_stochastic))
    plt.plot(x2, J_test_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(J_test_batch))
    plt.plot(x3, J_test_batch, label="batch")
    plt.legend()
    #plt.savefig('full_vs_stoch_vs_batch_lr={}_reg={}_batch_size={}.png'.format(lr, reg, batch_size))
Example #2
from keras.models import Model
from keras.layers import Dense, Input
from util import get_normalized_data, T_indicator

import matplotlib.pyplot as plt
# Load the normalized data set (train / test inputs and label vectors).
X_train, X_test, t_train, t_test = get_normalized_data()

# N samples of dimension D; K is the number of distinct training labels.
N, D = X_train.shape
K = len(set(t_train))

# Targets passed through T_indicator for both splits.
T_train, T_test = T_indicator(t_train), T_indicator(t_test)

# Feed-forward network: D inputs -> 500 (relu) -> 300 (relu) -> K (softmax)
x = Input(shape=(D,))
hidden_1 = Dense(500, activation='relu')(x)
hidden_2 = Dense(300, activation='relu')(hidden_1)
a = Dense(K, activation='softmax')(hidden_2)

model = Model(inputs=x, outputs=a)


# note: multiple ways to choose a backend
# either theano, tensorflow, or cntk
# https://keras.io/backend/
Example #3
def main():
    """Train a one-hidden-layer network with Theano using mini-batch GD.

    Builds the symbolic graph (relu hidden layer, softmax output,
    cross-entropy cost plus an L2 penalty on every parameter), compiles a
    training function that applies one gradient-descent step per call, and
    periodically prints the cost and accuracy measured on the test set.
    """
    # STEP 1 : get the data and define all the usual variables
    X_train, X_test, t_train, t_test = get_normalized_data()
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    t_train = t_train.astype(np.float32)
    t_test = t_test.astype(np.float32)
    # NOTE(review): T_indicator receives float32 labels here - confirm that
    # the helper accepts non-integer label arrays.
    T_train = T_indicator(t_train).astype(np.float32)
    T_test = T_indicator(t_test).astype(np.float32)

    # Dimensionality
    N, D = X_train.shape
    M = 300
    K = 10
    max_iter = 20
    print_period = 10
    batch_size = 500
    nb_batches = N // batch_size  # a trailing partial batch is dropped

    # Hyperparameters
    lr = 0.0004
    reg = 0.01

    # Initialize weights
    W0_init = np.random.randn(D, M) / np.sqrt(D)
    b0_init = np.zeros(M)
    W1_init = np.random.randn(M, K) / np.sqrt(M)
    b1_init = np.zeros(K)

    # STEP 2 : define Theano variables and expressions
    # data and labels to go into the inputs
    thX = Tens.matrix('X')
    thT = Tens.matrix('T')
    # weights to be updated
    W0 = theano.shared(W0_init, 'W0')
    b0 = theano.shared(b0_init, 'b0')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    # use the built-in theano functions to forward
    thA = relu(thX.dot(W0) + b0)
    thY = Tens.nnet.softmax(thA.dot(W1) + b1)
    # cost function: cross-entropy + L2 penalty on all parameters
    J = -(thT * Tens.log(thY)).sum() + reg * ((W1 * W1).sum() +
                                              (b1 * b1).sum() +
                                              (W0 * W0).sum() +
                                              (b0 * b0).sum())
    prediction = Tens.argmax(thY, axis=1)

    # STEP 3 : training expression and function
    update_W1 = W1 - lr * Tens.grad(J, W1)
    update_b1 = b1 - lr * Tens.grad(J, b1)
    update_W0 = W0 - lr * Tens.grad(J, W0)
    update_b0 = b0 - lr * Tens.grad(J, b0)

    # One call performs one gradient-descent step on the shared variables.
    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W0, update_W0),
                 (b0, update_b0)],
    )

    # the cost and prediction over a whole dataset
    prediction_state = theano.function(inputs=[thX, thT],
                                       outputs=[J, prediction])

    Js = []  # test cost recorded every `print_period` batches
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            start = batch_index * batch_size
            stop = start + batch_size
            X_batch = X_train[start:stop]
            T_batch = T_train[start:stop]
            train(X_batch, T_batch)

            if batch_index % print_period == 0:
                J_test, prediction_test = prediction_state(X_test, T_test)
                Js.append(J_test)
                print('Epoch {}\t batch_index {}:\t J {}\t accuracy {}'.format(
                    epoch, batch_index, J_test,
                    accuracy(prediction_test, t_test)))
Example #4
def main():
    """Train a two-hidden-layer network with TensorFlow (1.x API) and RMSProp.

    Builds the graph (two relu hidden layers, linear output fed to a
    softmax cross-entropy cost), trains it with mini-batches and
    periodically prints the cost and accuracy measured on the test set.
    """
    ## STEP 1: get the data and define all the usual variables
    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)
    # Dimensionality
    max_iter = 15
    print_period = 50
    N, D = X_train.shape
    M1 = 300
    M2 = 100
    K = 10
    batch_size = 500
    nb_batches = N // batch_size  # a trailing partial batch is dropped
    print(
        'Dim: N:{}\t D:{}\t M1:{}\t M2:{}\t K:{}\t batch_size:{}\t nb_batches={}'
        .format(N, D, M1, M2, K, batch_size, nb_batches))
    # Hyperparameters
    # NOTE: `reg` is only reported; the cost J below has no penalty term.
    lr = 0.0004
    reg = 0.01
    print('HP: lr:{}\t reg:{}'.format(lr, reg))
    # Weights initialization
    W0_init = np.random.randn(D, M1) / 28
    b0_init = np.zeros(M1)
    W1_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b1_init = np.zeros(M2)
    W2_init = np.random.randn(M2, K) / np.sqrt(M2)
    b2_init = np.zeros(K)

    ## STEP 2: define Tensorflow variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')

    W0 = tf.Variable(W0_init.astype(np.float32))
    b0 = tf.Variable(b0_init.astype(np.float32))
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))

    A1 = tf.nn.relu(tf.matmul(X, W0) + b0)
    A2 = tf.nn.relu(tf.matmul(A1, W1) + b1)
    # U and not Y because softmax is taken care of while calculating the cost
    U = tf.matmul(A2, W2) + b2
    J = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=U, labels=T))

    ## STEP 3: GD update expressions, training and predict expressions
    # The optimizers are already implemented;
    # let's go with RMSprop, it includes momentum.
    # NOTE(review): the original call was cut off after `decay=0.99,`; the
    # momentum value and `.minimize(J)` are reconstructed - confirm.
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99,
                                         momentum=0.9).minimize(J)
    predict_op = tf.argmax(U, 1)

    costs = []  # test cost recorded every `print_period` batches
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        # Variables must be initialized before the first training step.
        session.run(init)
        for epoch in range(max_iter):
            for batch_index in range(nb_batches):
                start = batch_index * batch_size
                stop = start + batch_size
                X_batch = X_train[start:stop]
                T_batch = T_train[start:stop]
                session.run(train_op, feed_dict={X: X_batch, T: T_batch})
                if batch_index % print_period == 0:
                    # Evaluate cost and prediction in one pass over the graph.
                    j, prediction = session.run(
                        [J, predict_op], feed_dict={X: X_test, T: T_test})
                    acc = accuracy(prediction, t_test)
                    costs.append(j)
                    print('Epoch: {}\t cost:{}\t accuracy:{}'.format(
                        epoch, round(j, 3), round(acc, 3)))
    # NOTE(review): the original ended with an orphaned
    # ".format(N, D, M1, M2, K, batch_size, nb_batches, lr, reg))" whose
    # opening statement (probably a plot title or file name) is missing.
Example #5
def main():
    """Compare a constant learning rate with RMSProp on a one-hidden-layer net.

    Both runs use mini-batch gradient descent with the same batch size and
    number of epochs. The test cost and accuracy are recorded every
    `print_period` batches and the two cost histories are plotted.
    """
    max_iter = 20
    print_period = 10

    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    lr = 0.00004
    reg = 0.01
    N, D = X_train.shape
    batch_sz = 500
    nb_batches = N // batch_sz  # a trailing partial batch is dropped
    M = 300
    K = 10
    print(
        'N_train = {}\t N_test = 1000\t D = {}\t M = {}\t K = {}\t batch_size = {}\t nb_batches = {}\t lr_cst = {}\n'
        .format(N, D, M, K, batch_sz, nb_batches, lr))

    # 1. Constant learning rate (no regularization term in this run)
    # np.sqrt(D) ~ 28
    W0 = np.random.randn(D, M) / 28
    b0 = np.zeros(M)
    W1 = np.random.randn(M, K) / np.sqrt(M)
    b1 = np.zeros(K)

    J_constant_lr = []  # measured on test data every `print_period` batches
    accuracy_constant_lr = []  # same sampling as J_constant_lr
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            start = batch_index * batch_sz
            stop = start + batch_sz
            X_batch = X_train[start:stop]
            T_batch = T_train[start:stop]

            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # Updates (W1 is updated first; the W0/b0 gradients then use it,
            # exactly as in the original statement order)
            W1 -= lr * J_derivative_W1(T_batch, Y_batch, A_batch)
            b1 -= lr * J_derivative_b1(T_batch, Y_batch)
            W0 -= lr * J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch)
            b0 -= lr * J_derivative_b0(T_batch, Y_batch, W1, A_batch)

            if (batch_index % print_period) == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                acc = accuracy(predict(Y_test), t_test)
                J_constant_lr.append(j_test)
                accuracy_constant_lr.append(acc)
                print(
                    'Epoch n° {} batch n° {}:\t TEST COST {}\t TEST ACCURACY RATE: {}'
                    .format(epoch, batch_index, j_test, acc))
    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print('Final ACCURACY RATE on TEST data: {}\n'.format(
        accuracy(predict(Y_test_final), t_test)))

    # 2. RMSProp
    W0 = np.random.randn(D, M) / 28
    b0 = np.zeros(M)
    W1 = np.random.randn(M, K) / np.sqrt(M)
    b1 = np.zeros(K)

    J_RMSProp = []
    accuracy_RMSProp = []

    lr0 = 0.001  # if you set the initial lr too high you'll get NaN
    cache_W1 = 0
    cache_b1 = 0
    cache_W0 = 0
    cache_b0 = 0
    decay = 0.999
    eps = 0.000001
    for epoch in range(max_iter):
        for b_index in range(nb_batches):
            start = b_index * batch_sz
            stop = start + batch_sz
            X_batch = X_train[start:stop]
            T_batch = T_train[start:stop]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # Each parameter keeps its own running average of squared
            # gradients and is stepped with the RMSProp rate `lr0`.
            gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 / (np.sqrt(cache_W1) + eps) * gW1

            gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 / (np.sqrt(cache_b1) + eps) * gb1

            gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch,
                                  X_batch) + reg * W0
            cache_W0 = decay * cache_W0 + (1 - decay) * gW0 * gW0
            W0 -= lr0 / (np.sqrt(cache_W0) + eps) * gW0

            gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0
            cache_b0 = decay * cache_b0 + (1 - decay) * gb0 * gb0
            b0 -= lr0 / (np.sqrt(cache_b0) + eps) * gb0

            if (b_index % print_period) == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                acc = accuracy(predict(Y_test), t_test)
                J_RMSProp.append(j_test)
                accuracy_RMSProp.append(acc)
                print(
                    'Epoch n° {} Batch n°{}:\t TEST COST: {}\t TEST ACCURACY RATE: {}'
                    .format(epoch, b_index, j_test, acc))

    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print('Final accuracy rate on test data: {}'.format(
        accuracy(predict(Y_test_final), t_test)))

    plt.plot(J_constant_lr, label='constant lr')
    plt.plot(J_RMSProp, label='RMSProp')
    plt.legend()
Example #6
def main():
    """Compare plain, momentum and Nesterov-momentum mini-batch GD.

    The three runs start from the same initial weights of a
    one-hidden-layer network. The test cost and accuracy are recorded every
    `print_period` batches and the three cost histories are plotted.
    """
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    # Inputs and targets
    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    # Dimensionality and hyperparameters
    N, D = X_train.shape
    M = 300
    K = 10
    print('Dimensionality: N = {}\t D = {}\t M = {}\t K = {}'.format(
        N, D, M, K))
    batch_sz = 500
    n_batches = N // batch_sz  # a trailing partial batch is dropped
    lr = 0.00004
    reg = 0.01
    max_iter = 30  # make it 20 for relu
    mu = 0.9
    print(
        'Hyperparameters: lr = {}\t reg = {}\t velocity = {}\t nb_batches = {}\t batch_size={}\t nb_epochs={}'
        .format(lr, reg, mu, n_batches, batch_sz, max_iter))
    print_period = 50

    # Weights
    W0 = np.random.randn(D, M) / np.sqrt(D)
    b0 = np.zeros(M)
    W1 = np.random.randn(M, K) / np.sqrt(M)
    b1 = np.zeros(K)
    # save initial weights so that every run starts from the same point
    W0_0 = W0.copy()
    b0_0 = b0.copy()
    W1_0 = W1.copy()
    b1_0 = b1.copy()

    # 1. Batch
    t0 = datetime.now()
    losses_batch = []
    accuracies_batch = []
    for epoch in range(max_iter):
        for batch_index in range(n_batches):
            start = batch_index * batch_sz
            stop = start + batch_sz
            X_train_batch = X_train[start:stop]
            T_train_batch = T_train[start:stop]
            A1, Y_train_batch = forward(X_train_batch, W0, b0, W1, b1)

            # W1 is updated first; the W0/b0 gradients then use the new W1,
            # exactly as in the original statement order.
            W1 -= lr * (J_derivative_W1(T_train_batch, Y_train_batch, A1) +
                        reg * W1)
            b1 -= lr * (J_derivative_b1(T_train_batch, Y_train_batch) +
                        reg * b1)
            W0 -= lr * (J_derivative_W0(T_train_batch, Y_train_batch, W1, A1,
                                        X_train_batch) + reg * W0)
            b0 -= lr * (J_derivative_b0(T_train_batch, Y_train_batch, W1, A1) +
                        reg * b0)

            if batch_index % print_period == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                e = accuracy(predict(Y_test), t_test)
                losses_batch.append(j_test)
                accuracies_batch.append(e)
                print(
                    "Cost at iteration epoch={}, batch_index={}: {}\t Accuracy = {}"
                    .format(epoch, batch_index, round(j_test, 6), e))

    _, Y_test = forward(X_test, W0, b0, W1, b1)
    print("Final accuracy: {}\n".format(accuracy(predict(Y_test), t_test)))
    print("Elapsted time for batch GD: {}\n".format(datetime.now() - t0))

    # 2. Batch with momentum
    t0 = datetime.now()

    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    losses_momentum = []
    accuracies_momentum = []

    # velocities
    dW1 = 0
    db1 = 0
    dW0 = 0
    db0 = 0

    for epoch in range(max_iter):
        for batch_index in range(n_batches):
            start = batch_index * batch_sz
            stop = start + batch_sz
            X_train_batch = X_train[start:stop]
            T_train_batch = T_train[start:stop]
            A1, Y_train_batch = forward(X_train_batch, W0, b0, W1, b1)

            # gradients
            gW1 = J_derivative_W1(T_train_batch, Y_train_batch, A1) + reg * W1
            gb1 = J_derivative_b1(T_train_batch, Y_train_batch) + reg * b1
            gW0 = J_derivative_W0(T_train_batch, Y_train_batch, W1, A1,
                                  X_train_batch) + reg * W0
            gb0 = J_derivative_b0(T_train_batch, Y_train_batch, W1,
                                  A1) + reg * b0

            # update velocities
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1
            dW0 = mu * dW0 - lr * gW0
            db0 = mu * db0 - lr * gb0

            # updates
            W1 += dW1
            b1 += db1
            W0 += dW0
            b0 += db0

            if batch_index % print_period == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                e = accuracy(predict(Y_test), t_test)
                losses_momentum.append(j_test)
                accuracies_momentum.append(e)
                print(
                    "Cost at iteration epoch={}, batch_index={}: {}\tAccuracy: {}"
                    .format(epoch, batch_index, round(j_test, 6), e))
    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsted time for batch GD with Momentum: {}\n".format(
        datetime.now() - t0))

    # 3. Batch with Nesterov momentum
    t0 = datetime.now()
    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()

    losses_nesterov = []
    accuracies_nesterov = []

    vW1 = 0
    vb1 = 0
    vW0 = 0
    vb0 = 0

    for epoch in range(max_iter):
        for batch_index in range(n_batches):
            start = batch_index * batch_sz
            stop = start + batch_sz
            X_train_batch = X_train[start:stop]
            T_train_batch = T_train[start:stop]
            A1, Y_train_batch = forward(X_train_batch, W0, b0, W1, b1)

            # gradients
            gW1 = J_derivative_W1(T_train_batch, Y_train_batch, A1) + reg * W1
            gb1 = J_derivative_b1(T_train_batch, Y_train_batch) + reg * b1
            gW0 = J_derivative_W0(T_train_batch, Y_train_batch, W1, A1,
                                  X_train_batch) + reg * W0
            gb0 = J_derivative_b0(T_train_batch, Y_train_batch, W1,
                                  A1) + reg * b0

            # v update
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1
            vW0 = mu * vW0 - lr * gW0
            vb0 = mu * vb0 - lr * gb0

            # param update: the step uses the already-updated velocity
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1
            W0 += mu * vW0 - lr * gW0
            b0 += mu * vb0 - lr * gb0

            if batch_index % print_period == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                e = accuracy(predict(Y_test), t_test)
                losses_nesterov.append(j_test)
                accuracies_nesterov.append(e)
                print(
                    "Cost at iteration epoch={}, batch_index={}: {}\tAccuracy: {}"
                    .format(epoch, batch_index, round(j_test, 6), e))
    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsted time for batch GD with Nesterov Momentum: {}\n".format(
        datetime.now() - t0))

    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
Example #7
def main():
    """Compare Adam with RMSProp-with-momentum on a one-hidden-layer network.

    Both runs start from the same initial weights and use the same
    mini-batches. The test cost and accuracy are recorded every
    `print_period` batches and the two cost histories are plotted.
    """
    max_iter = 10
    print_period = 10
    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)
    # Dimensionality
    N, D = X_train.shape
    M = 300
    K = 10
    batch_size = 500
    nb_batches = N // batch_size  # a trailing partial batch is dropped
    print('N:{}\t batch_size: {}\t nb_batches: {}\t  D:{}\t M:{}\t K:{}'.format(N, batch_size, nb_batches, D, M, K))
    # hyperparameters
    reg = 0.01
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8
    print('Hyperparameters: reg:{}\t lr0:{}\t beta1:{}\t beta2:{}\t eps:{}\n'.format(reg, lr0, beta1, beta2, eps))
    # Weights initialization (kept aside so both runs start identically)
    W0_0 = np.random.randn(D, M) / np.sqrt(D)
    b0_0 = np.zeros(M)
    W1_0 = np.random.randn(M, K) / np.sqrt(M)
    b1_0 = np.zeros(K)
    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    # 1st Moment
    mW0 = 0
    mb0 = 0
    mW1 = 0
    mb1 = 0
    # 2nd Moment
    vW0 = 0
    vb0 = 0
    vW1 = 0
    vb1 = 0

    # 1. Adam
    t0 = datetime.now()
    J_adam = []
    accuracy_adam = []
    t = 1  # step counter used by the bias corrections
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            start = batch_index * batch_size
            stop = start + batch_size
            X_batch = X_train[start:stop]
            T_batch = T_train[start:stop]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)
            # gradients
            gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1
            gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1
            gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) + reg * W0
            gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0
            # 1st moment updates
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW0 = beta1 * mW0 + (1 - beta1) * gW0
            mb0 = beta1 * mb0 + (1 - beta1) * gb0
            # 2nd moment updates
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW0 = beta2 * vW0 + (1 - beta2) * gW0 * gW0
            vb0 = beta2 * vb0 + (1 - beta2) * gb0 * gb0
            # bias corrections
            corr1 = 1 - beta1 ** t
            corr2 = 1 - beta2 ** t
            mW1_c = mW1 / corr1
            mb1_c = mb1 / corr1
            mW0_c = mW0 / corr1
            mb0_c = mb0 / corr1
            vW1_c = vW1 / corr2
            vb1_c = vb1 / corr2
            vW0_c = vW0 / corr2
            vb0_c = vb0 / corr2
            # t update
            t += 1
            # gradient descent
            W1 -= lr0 * mW1_c / np.sqrt(vW1_c + eps)
            b1 -= lr0 * mb1_c / np.sqrt(vb1_c + eps)
            W0 -= lr0 * mW0_c / np.sqrt(vW0_c + eps)
            b0 -= lr0 * mb0_c / np.sqrt(vb0_c + eps)
            if (batch_index % print_period) == 0:
                _, Y_validation = forward(X_test, W0, b0, W1, b1)
                j = J(T_test, Y_validation)
                acc = accuracy(predict(Y_validation), t_test)
                J_adam.append(j)
                accuracy_adam.append(acc)
                # global iteration = completed epochs * batches + current batch
                iteration = epoch * nb_batches + batch_index
                print('Epoch {}\t batch_index {}\t iteration {} : cost {}\t accuracy: {}\t'.format(epoch, batch_index, iteration, j, acc))
    _, Y_final_test = forward(X_test, W0, b0, W1, b1)
    print('Final accuracy with Adam: {}'.format(accuracy(predict(Y_final_test), t_test)))
    print('Execution time with Adam: {}\n'.format(datetime.now() - t0))

    # 2. RMSProp with momentum
    # Copy the saved weights: the updates below are in place and must not
    # modify the stored initial values.
    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    decay_rate = 0.999
    mu = 0.9

    # caches (running average of squared gradients), all starting at 1
    cW1 = 1
    cb1 = 1
    cW0 = 1
    cb0 = 1
    # velocities, reset so nothing leaks over from the Adam run
    vW1 = 0
    vb1 = 0
    vW0 = 0
    vb0 = 0

    t0 = datetime.now()
    J_rmsprop_momentum = []
    accuracy_rmsprop_momentum = []
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            start = batch_index * batch_size
            stop = start + batch_size
            X_batch = X_train[start:stop]
            T_batch = T_train[start:stop]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # gradients
            gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1
            gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1
            gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) + reg * W0
            gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0
            # cache update
            cW1 = decay_rate * cW1 + (1 - decay_rate) * gW1 * gW1
            cb1 = decay_rate * cb1 + (1 - decay_rate) * gb1 * gb1
            cW0 = decay_rate * cW0 + (1 - decay_rate) * gW0 * gW0
            cb0 = decay_rate * cb0 + (1 - decay_rate) * gb0 * gb0
            # momentum updates: the step is scaled by the root of the cache
            vW1 = mu * vW1 + (1 - mu) * lr0 * gW1 / np.sqrt(cW1 + eps)
            vb1 = mu * vb1 + (1 - mu) * lr0 * gb1 / np.sqrt(cb1 + eps)
            vW0 = mu * vW0 + (1 - mu) * lr0 * gW0 / np.sqrt(cW0 + eps)
            vb0 = mu * vb0 + (1 - mu) * lr0 * gb0 / np.sqrt(cb0 + eps)
            # gradient descent
            W1 -= vW1
            b1 -= vb1
            W0 -= vW0
            b0 -= vb0
            if (batch_index % print_period) == 0:
                _, Y_validation = forward(X_test, W0, b0, W1, b1)
                j = J(T_test, Y_validation)
                acc = accuracy(predict(Y_validation), t_test)
                J_rmsprop_momentum.append(j)
                accuracy_rmsprop_momentum.append(acc)
                iteration = epoch * nb_batches + batch_index
                print('Epoch {}\t batch_index {}\t iteration {} : cost {}\t accuracy: {}\t'.format(epoch, batch_index, iteration, j, acc))
    _, Y_final_test = forward(X_test, W0, b0, W1, b1)
    print('Final accuracy with RMSProp with momentum: {}'.format(accuracy(predict(Y_final_test), t_test)))
    print('Execution time with RMSProp with momentum: {}\n'.format(datetime.now() - t0))

    plt.plot(J_adam, label='adam')
    plt.plot(J_rmsprop_momentum, label='rmsprop with momentum')
    plt.legend()