Example #1
def main():
	train_X, test_X, train_Y, test_Y = get_normalized_data()
	model = ANN([500, 300])
	session = tf.InteractiveSession()
	model.set_session(session)
	model.fit(train_X, train_Y, test_X, test_Y, show_fig=True)
	print("Train accuracy:", model.score(train_X, train_Y))
	print("Test accuracy:", model.score(test_X, test_Y))
Example #2
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
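    # scale the random weights by 1/sqrt(fan-in) so the initial activations stay at a reasonable size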
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    #initialize theano variables
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    W2 = theano.shared(W2_init, 'W2')
    b1 = theano.shared(b1_init, 'b1')
    b2 = theano.shared(b2_init, 'b2')

    #activation function and softmax
    tZ = relu(thX.dot(W1) + b1)
    t_pY = T.nnet.softmax(tZ.dot(W2) + b2)

    #cost and prediction functions
    cost = -(thT * T.log(t_pY)).sum() + reg * (W1 * W1).sum() + reg * (
        W2 * W2).sum() + reg * (b1 * b1).sum() + reg * (b2 * b2).sum()
    prediction = T.argmax(t_pY, axis=1)

    #training
    update_b2 = b2 - learning_rate * T.grad(cost, b2)
    update_W2 = W2 - learning_rate * T.grad(cost, W2)
    update_b1 = b1 - learning_rate * T.grad(cost, b1)
    update_W1 = W1 - learning_rate * T.grad(cost, W1)

    train = theano.function(inputs=[thX, thT],
                            updates=[(W1, update_W1), (W2, update_W2),
                                     (b1, update_b1), (b2, update_b2)])

    get_prediction = theano.function(inputs=[thX, thT],
                                     outputs=[cost, prediction])

    costs = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            train(x, y)
            if j % print_period == 0:
                cost, test_pY = get_prediction(test_X, test_Y_ind)
                error = error_rate(test_pY, test_Y)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, cost, error))
                costs.append(cost)
    plt.plot(costs)
    plt.show()
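The helpers get_normalized_data, indicator, relu, error_rate, shuffle, etc. come from the surrounding course code and are not shown here. As a rough guide, indicator is presumably a one-hot encoder and relu an elementwise rectifier; a minimal NumPy sketch under those assumptions (names and signatures are guesses, not the actual utility module):

import numpy as np

def indicator(y, K=10):
    # map integer labels of shape (N,) to a one-hot matrix of shape (N, K)
    y = np.asarray(y, dtype=np.int32)
    ind = np.zeros((len(y), K))
    ind[np.arange(len(y)), y] = 1
    return ind

def relu(a):
    # elementwise rectifier; works for NumPy arrays and theano tensors alike
    return a * (a > 0)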
Example #3
def main():
    max_iter = 10
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    #SAVE INITIAL WEIGHT AND BIAS
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    #1st moment
    mW1 = 0
    mW2 = 0
    mb1 = 0
    mb2 = 0

    #2nd moment
    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    #hyperparams
    learning_rate = 0.001
    beta1 = 0.99
    beta2 = 0.999
    eps = 1e-8

    #adam
    loss_adam = []
    error_adam = []
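    # t counts the number of parameter updates performed so far (used for Adam's bias correction)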
    t = 1
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)

            #update gradient
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            gb2 = derivative_b2(y, pY) + reg * b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1

            #update 1st moment
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            #update 2nd moment
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            #bias correction
            correction_1 = 1 - beta1**t
            correction_2 = 1 - beta2**t
            mW1_hat = mW1 / correction_1
            mW2_hat = mW2 / correction_1
            mb1_hat = mb1 / correction_1
            mb2_hat = mb2 / correction_1

            vW1_hat = vW1 / correction_2
            vW2_hat = vW2 / correction_2
            vb1_hat = vb1 / correction_2
            vb2_hat = vb2 / correction_2

            #update t
            t += 1

            #update weight
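            # note: eps is added inside the square root here; the textbook Adam update adds it outside the sqrt, but the effect is nearly identical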
            W2 -= learning_rate * mW2_hat / np.sqrt(vW2_hat + eps)
            b2 -= learning_rate * mb2_hat / np.sqrt(vb2_hat + eps)
            b1 -= learning_rate * mb1_hat / np.sqrt(vb1_hat + eps)
            W1 -= learning_rate * mW1_hat / np.sqrt(vW1_hat + eps)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_adam.append(l)
                error_adam.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    #RMSprop with momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    #hyperparams
    learning_rate = 0.001
    decay_rate = 0.999
    mu = 0.9
    eps = 1e-8

    #rmsprop cache
    cache_W1 = 1
    cache_W2 = 1
    cache_b1 = 1
    cache_b2 = 1
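    # starting the caches at 1 keeps the denominator sqrt(cache + eps) near 1 at first, so early steps behave roughly like plain momentum SGD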

    #momentum
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    loss_rmsprop_m = []
    error_rmsprop_m = []
    t = 1
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)

            #update
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 - (
                1 - mu) * learning_rate * gW2 / np.sqrt(cache_W2 + eps)
            W2 += dW2

            gb2 = derivative_b2(y, pY) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 - (
                1 - mu) * learning_rate * gb2 / np.sqrt(cache_b2 + eps)
            b2 += db2

            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 - (
                1 - mu) * learning_rate * gW1 / np.sqrt(cache_W1 + eps)
            W1 += dW1

            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 - (
                1 - mu) * learning_rate * gb1 / np.sqrt(cache_b1 + eps)
            b1 += db1
            # #update cache
            # cache_W1 = decay_rate * cache_W1 + (1-decay_rate)*gW1*gW1
            # cache_W2 = decay_rate * cache_W2 + (1-decay_rate)*gW2*gW2
            # cache_b1 = decay_rate * cache_b1 + (1-decay_rate)*gb1*gb1
            # cache_b2 = decay_rate * cache_b2 + (1-decay_rate)*gb2*gb2

            # #update momentum
            # dW2 = mu*dW2 + (1-mu) * learning_rate * gW2 / (np.sqrt(cache_W2) + eps)
            # db2 = mu*db2 + (1-mu) * learning_rate * gb2 / (np.sqrt(cache_b2) + eps)
            # dW1 = mu*dW1 + (1-mu) * learning_rate * dW1 / (np.sqrt(cache_W1) + eps)
            # db1 = mu*db1 + (1-mu) * learning_rate * db1 / (np.sqrt(cache_b1) + eps)

            # #update weights
            # W2 -= dW2
            # b2 -= db2
            # W1 -= dW1
            # b1 -= db1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_rmsprop_m.append(l)
                error_rmsprop_m.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(loss_adam, label="adam")
    plt.plot(loss_rmsprop_m, label="rmsprop with momentum")
    plt.legend()
    plt.show()
Example #4
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    #SAVE INITIAL WEIGHT AND BIAS
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    #constant learning_rate
    loss_constant = []
    error_constant = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)

            W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg * W2)
            b2 -= learning_rate * (derivative_b2(y, pY) + reg * b2)
            W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg * W1)
            b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg * b1)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_constant.append(l)
                error_constant.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    #RMSprop
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    learning_rate_0 = 0.001
    loss_non_constant = []
    error_non_constant = []
    cache_W1 = 1
    cache_W2 = 1
    cache_b1 = 1
    cache_b2 = 1
    decay_rate = 0.999
    eps = 1e-10

    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= learning_rate_0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(y, pY) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= learning_rate_0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= learning_rate_0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= learning_rate_0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_non_constant.append(l)
                error_non_constant.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(loss_constant, label="constant learning rate")
    plt.plot(loss_non_constant, label="RMSprop")
    plt.legend()
    plt.show()
Example #5
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    # learning_rate = 0.00004
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    #1st moment (same shapes as the parameters they track)
    mW1_init = np.zeros((D, M))
    mW2_init = np.zeros((M, K))
    mb1_init = np.zeros(M)
    mb2_init = np.zeros(K)

    #2nd moment
    vW1_init = np.zeros((D, M))
    vW2_init = np.zeros((M, K))
    vb1_init = np.zeros(M)
    vb2_init = np.zeros(K)

    #hyperparams
    learning_rate = 0.001
    beta1 = 0.99
    beta2 = 0.999
    eps = 1e-8

    #other parameters
    t_init = 1
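    # the step counter is kept as a theano shared variable below so it can be incremented inside the compiled train function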

    #initialize theano variables
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    W2 = theano.shared(W2_init, 'W2')
    b1 = theano.shared(b1_init, 'b1')
    b2 = theano.shared(b2_init, 'b2')
    mW1 = theano.shared(mW1_init, 'mW1')
    mW2 = theano.shared(mW2_init, 'mW2')
    mb1 = theano.shared(mb1_init, 'mb1')
    mb2 = theano.shared(mb2_init, 'mb2')
    vW1 = theano.shared(vW1_init, 'vW1')
    vW2 = theano.shared(vW2_init, 'vW2')
    vb1 = theano.shared(vb1_init, 'vb1')
    vb2 = theano.shared(vb2_init, 'vb2')
    t = theano.shared(t_init, 't')

    #activation function
    tZ = relu(thX.dot(W1) + b1)
    t_pY = T.nnet.softmax(tZ.dot(W2) + b2)

    #cost and prediction function
    cost = -(thT * T.log(t_pY)).sum() + reg * ((W1 * W1).sum() +
                                               (W2 * W2).sum() +
                                               (b1 * b1).sum() +
                                               (b2 * b2).sum())
    prediction = T.argmax(t_pY, axis=1)

    #training
    #update gradient
    gW2 = T.grad(cost, W2)
    gb2 = T.grad(cost, b2)
    gW1 = T.grad(cost, W1)
    gb1 = T.grad(cost, b1)

    #update 1st moment
    update_mW1 = beta1 * mW1 + (1 - beta1) * gW1
    update_mW2 = beta1 * mW2 + (1 - beta1) * gW2
    update_mb1 = beta1 * mb1 + (1 - beta1) * gb1
    update_mb2 = beta1 * mb2 + (1 - beta1) * gb2

    #update 2nd moment
    update_vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
    update_vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
    update_vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
    update_vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

    #bias correction (applied to the updated moments, since theano evaluates
    #all updates with the old shared values)
    correction_1 = 1 - beta1**t
    correction_2 = 1 - beta2**t
    mW1_hat = update_mW1 / correction_1
    mW2_hat = update_mW2 / correction_1
    mb1_hat = update_mb1 / correction_1
    mb2_hat = update_mb2 / correction_1

    vW1_hat = update_vW1 / correction_2
    vW2_hat = update_vW2 / correction_2
    vb1_hat = update_vb1 / correction_2
    vb2_hat = update_vb2 / correction_2

    #update
    update_t = t + 1
    update_W2 = W2 - learning_rate * mW2_hat / T.sqrt(vW2_hat + eps)
    update_b2 = b2 - learning_rate * mb2_hat / T.sqrt(vb2_hat + eps)
    update_b1 = b1 - learning_rate * mb1_hat / T.sqrt(vb1_hat + eps)
    update_W1 = W1 - learning_rate * mW1_hat / T.sqrt(vW1_hat + eps)

    train = theano.function(inputs=[thX, thT],
                            updates=[(W1, update_W1), (W2, update_W2),
                                     (b1, update_b1), (b2, update_b2),
                                     (mW1, update_mW1), (mW2, update_mW2),
                                     (mb1, update_mb1), (mb2, update_mb2),
                                     (vW1, update_vW1), (vW2, update_vW2),
                                     (vb1, update_vb1), (vb2, update_vb2),
                                     (t, update_t)])

    get_prediction = theano.function(inputs=[thX, thT],
                                     outputs=[cost, prediction])

    costs = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            train(x, y)
            if j % print_period == 0:
                cost, test_pY = get_prediction(test_X, test_Y_ind)
                error = error_rate(test_pY, test_Y)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, cost, error))
                costs.append(cost)
    plt.plot(costs)
    plt.show()
Example #6
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / np.sqrt(D)
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    #initialize tensorflow variables
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    #define model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    Y_temp = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=Y_temp, labels=T))
    train_op = tf.train.RMSPropOptimizer(learning_rate,
                                         decay=0.99,
                                         momentum=0.9).minimize(cost)

    prediction_op = tf.argmax(Y_temp, axis=1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
            for j in range(batch_num):
                x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
                y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

                session.run(train_op, feed_dict={X: x, T: y})

                if j % print_period == 0:
                    test_cost = session.run(cost,
                                            feed_dict={
                                                X: test_X,
                                                T: test_Y_ind
                                            })
                    prediction = session.run(prediction_op,
                                             feed_dict={X: test_X})
                    error = error_rate(prediction, test_Y)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                          (i, j, test_cost, error))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.show()
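This example relies on the TensorFlow 1.x graph API (tf.placeholder, tf.Session, tf.train.RMSPropOptimizer). If you are running TensorFlow 2.x, one way to keep the code as written is the v1 compatibility layer; this is an assumption about your environment, not part of the original script:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()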
Example #7
def main():
    train_X, test_X, train_Y, test_Y = get_normalized_data()
    model = ANN([500, 300], [0.8, 0.5, 0.5])
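    # the second argument is presumably the list of dropout keep-probabilities: 0.8 for the input layer and 0.5 for each hidden layer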
    model.fit(train_X, train_Y, test_X, test_Y, show_fig=True)
Example #8
def main():
	max_iter = 20
	print_period = 50

	train_X, test_X, train_Y, test_Y = get_normalized_data()
	learning_rate = 0.00004
	reg = 0.01
	train_Y_ind = indicator(train_Y)
	test_Y_ind = indicator(test_Y)

	N, D = train_X.shape
	batch_size = 500
	batch_num = N // batch_size

	M = 300
	K = 10
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	#SAVE INITIAL WEIGHT AND BIAS
	W1_copy = W1.copy()
	b1_copy = b1.copy()
	W2_copy = W2.copy()
	b2_copy = b2.copy()

	#batch
	loss_batch = []
	error_batch = []
	for i in range(max_iter):
		shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
		for j in range(batch_num):
			x = shuffle_X[j*batch_size : (j*batch_size+batch_size), :]
			y = shuffle_Y[j*batch_size : (j*batch_size+batch_size), :]
			pY, Z = forward(x, W1, W2, b1, b2)

			W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg*W2)
			b2 -= learning_rate * (derivative_b2(y, pY) + reg*b2)
			W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg*W1)
			b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg*b1)

			if j % print_period == 0:
				p_test, Z_test = forward(test_X, W1, W2, b1, b2)
				l = cost(p_test, test_Y_ind)
				e = error_rate(p_test, test_Y)
				loss_batch.append(l)
				error_batch.append(e)
				print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
				print("error_rate: ", e)
	p_final, z_final = forward(test_X, W1, W2, b1, b2)
	print("final error_rate:", error_rate(p_final, test_Y))



	#momentum
	W1 = W1_copy.copy()
	b1 = b1_copy.copy()
	W2 = W2_copy.copy()
	b2 = b2_copy.copy()

	loss_momentum = []
	error_momentum = []
	mu = 0.9
	dW1 = 0
	dW2 = 0
	db1 = 0
	db2 = 0

	for i in range(max_iter):
		shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
		for j in range(batch_num):
			x = shuffle_X[j*batch_size : (j*batch_size+batch_size), :]
			y = shuffle_Y[j*batch_size : (j*batch_size+batch_size), :]
			pY, Z = forward(x, W1, W2, b1, b2)
			# print("overflow?")
			gW2 = derivative_w2(Z, y, pY) + reg*W2
			gb2 = derivative_b2(y, pY) + reg*b2
			gW1 = derivative_w1(x, Z, y, pY, W2) + reg*W1
			gb1 = derivative_b1(Z, y, pY, W2) + reg*b1

			#UPDATE VELOCITIES
			dW2 = mu*dW2 - learning_rate*gW2
			db2 = mu*db2 - learning_rate*gb2
			dW1 = mu*dW1 - learning_rate*gW1
			db1 = mu*db1 - learning_rate*gb1

			#UPDATE WEIGHT
			W2 += dW2
			b2 += db2
			W1 += dW1
			b1 += db1

			if j % print_period == 0:
				p_test, Z_test = forward(test_X, W1, W2, b1, b2)
				l = cost(p_test, test_Y_ind)
				e = error_rate(p_test, test_Y)
				loss_momentum.append(l)
				error_momentum.append(e)
				print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
				print("error_rate: ", e)
	p_final, z_final = forward(test_X, W1, W2, b1, b2)
	print("final error_rate:", error_rate(p_final, test_Y))


	#Nesterov momentum
	W1 = W1_copy.copy()
	b1 = b1_copy.copy()
	W2 = W2_copy.copy()
	b2 = b2_copy.copy()

	loss_nesterov = []
	error_nesterov = []
	mu = 0.9
	dW1 = 0
	db1 = 0
	dW2 = 0
	db2 = 0

	for i in range(max_iter):
		shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
		for j in range(batch_num):
			x = shuffle_X[j*batch_size : (j*batch_size+batch_size), :]
			y = shuffle_Y[j*batch_size : (j*batch_size+batch_size), :]
			pY, Z = forward(x, W1, W2, b1, b2)

			gW2 = derivative_w2(Z, y, pY) + reg*W2
			gb2 = derivative_b2(y, pY) + reg*b2
			gW1 = derivative_w1(x, Z, y, pY, W2) + reg*W1
			gb1 = derivative_b1(Z, y, pY, W2) + reg*b1

			#update velocities
			dW2 = mu*dW2 - learning_rate*gW2
			db2 = mu*db2 - learning_rate*gb2
			dW1 = mu*dW1 - learning_rate*gW1
			db1 = mu*db1 - learning_rate*gb1

			#update weight
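			# with the velocity already refreshed above, the applied step equals mu^2*d_old - (1+mu)*learning_rate*g, a common Nesterov momentum approximation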
			W2 += mu*dW2 - learning_rate*gW2
			b2 += mu*db2 - learning_rate*gb2
			W1 += mu*dW1 - learning_rate*gW1
			b1 += mu*db1 - learning_rate*gb1

			if j % print_period == 0:
				p_test, Z_test = forward(test_X, W1, W2, b1, b2)
				l = cost(p_test, test_Y_ind)
				e = error_rate(p_test, test_Y)
				loss_nesterov.append(l)
				error_nesterov.append(e)
				print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
				print("error_rate: ", e)
	p_final, z_final = forward(test_X, W1, W2, b1, b2)
	print("final error_rate:", error_rate(p_final, test_Y))


	
	plt.plot(loss_batch, label="batch")
	plt.plot(loss_momentum, label="momentum")
	plt.plot(loss_nesterov, label="Nesterov")
	plt.legend()
	plt.show()