Code Example #1
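Each listing below is a standalone main() from a companion script; the shared imports and helper functions are not repeated in the listings themselves. As a minimal sketch of what they assume (the util module name and these exact imports are assumptions inferred from the call sites, not shown in the source):

import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime        # used by Code Example #2 for timing
from sklearn.utils import shuffle    # consistent shuffling of X and Y together

import theano                        # Code Example #1
import theano.tensor as T            # Code Example #1
import tensorflow as tf              # Code Example #4 (TF 1.x API)

# helpers assumed to live in a companion module, e.g.:
# from util import (get_transformed_data, y2indicator, error_rate,
#                   forward, cost, gradW, gradb,
#                   derivative_W1, derivative_b1, derivative_W2, derivative_b2)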
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = T.nnet.relu(thX.dot(W1) + b1)
    thpY = T.nnet.softmax(thZ.dot(W2) + b2)

    # define the cost function and prediction
    cost = -(thT * T.log(thpY)).sum() + reg * ((W1 * W1).sum() +
                                               (b1 * b1).sum() +
                                               (W2 * W2).sum() +
                                               (b2 * b2).sum())
    prediction = T.argmax(thpY, axis=1)

    # step 3: training expressions and functions
    update_W1 = W1 - lr * T.grad(cost, W1)
    update_b1 = b1 - lr * T.grad(cost, b1)
    update_W2 = W2 - lr * T.grad(cost, W2)
    update_b2 = b2 - lr * T.grad(cost, b2)

    train = theano.function(inputs=[thX, thT],
                            updates=[(W1, update_W1), (b1, update_b1),
                                     (W2, update_W2), (b2, update_b2)])

    # create a separate function because we evaluate cost and prediction over the whole test set
    get_prediction = theano.function(inputs=[thX, thT],
                                     outputs=[cost, prediction])

    costs_batch = []
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            train(x, y)
            if j % 10 == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                e = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, cost_val, e))
                costs_batch.append(cost_val)

    plt.plot(costs_batch)
    plt.show()
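y2indicator and error_rate are called throughout but never defined here. Note that Code Example #1 passes error_rate a vector of predicted labels, while Code Examples #2 and #3 pass it a matrix of predicted probabilities, so here is a sketch compatible with both call styles (an assumption; the actual helpers may differ per file):

def y2indicator(y):
    # one-hot encode an (N,) vector of integer labels into an (N, K) indicator matrix
    y = y.astype(np.int32)
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y] = 1
    return ind


def error_rate(p_y, t):
    # accept either an (N, K) probability matrix or an (N,) vector of predicted labels
    predictions = np.argmax(p_y, axis=1) if p_y.ndim == 2 else p_y
    return np.mean(predictions != t)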
Code Example #2
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    K = len(set(Ytrain))

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)

    costs = []
    lr = 0.0001
    reg = 0.01
    epochs = 50
    t0 = datetime.now()

    for t in range(epochs):
        pY = forward(Xtrain, W, b)

        W -= lr * (gradW(Xtrain, pY, Ytrain_ind) + reg * W)
        b -= lr * (gradb(pY, Ytrain_ind) + reg * b)

        pY_test = forward(Xtest, W, b)
        c = cost(pY_test, Ytest_ind)
        costs.append(c)

        if t % 10 == 0:
            e = error_rate(pY_test, Ytest)
            print("Cost at iteration %d: %.6f" % (t, c))
            print("Error rate:", e)

    print("Elapsted time for full GD:", datetime.now() - t0)
    print("\n")

    # 2. stochastic
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)

    costs_stochastic = []
    lr = 0.0001
    reg = 0.01
    epochs = 50
    t0 = datetime.now()

    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, K)

            pY = forward(x, W, b)

            W -= lr * (gradW(x, pY, y) + reg * W)
            b -= lr * (gradb(pY, y) + reg * b)

            pY_test = forward(Xtest, W, b)
            c = cost(pY_test, Ytest_ind)
            costs_stochastic.append(c)

        if t % 10 == 0:
            e = error_rate(pY_test, Ytest)
            print("Cost at iteration %d: %.6f" % (t, c))
            print("Error rate:", e)

    print("Elapsted time for SGD:", datetime.now() - t0)
    print("\n")

    # 3. batch
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)

    costs_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 50
    t0 = datetime.now()

    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY = forward(x, W, b)

            W -= lr * (gradW(x, pY, y) + reg * W)
            b -= lr * (gradb(pY, y) + reg * b)

            pY_test = forward(Xtest, W, b)
            c = cost(pY_test, Ytest_ind)
            costs_batch.append(c)

        if t % 10 == 0:
            e = error_rate(pY_test, Ytest)
            print("Cost at iteration %d: %.6f" % (t, c))
            print("Error rate:", e)

    print("Elapsted time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(costs))
    plt.plot(x1, costs, label="full")
    x2 = np.linspace(0, 1, len(costs_stochastic))
    plt.plot(x2, costs_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(costs_batch))
    plt.plot(x3, costs_batch, label="batch")
    plt.legend()
    plt.show()
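Code Example #2 additionally assumes the softmax-regression helpers forward, cost, gradW, and gradb. A sketch consistent with the call sites (signatures inferred, so treat names and argument order as assumptions):

def forward(X, W, b):
    # softmax output of the linear model
    A = X.dot(W) + b
    expA = np.exp(A - A.max(axis=1, keepdims=True))  # shift for numerical stability
    return expA / expA.sum(axis=1, keepdims=True)


def cost(p_y, t):
    # total cross-entropy; t is a one-hot indicator matrix
    return -(t * np.log(p_y)).sum()


def gradW(X, p_y, t):
    return X.T.dot(p_y - t)


def gradb(p_y, t):
    return (p_y - t).sum(axis=0)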
Code Example #3
def main():

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

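    # start the caches at 1 rather than 0 so the first few updates are not
    # divided by a near-zero denominator in np.sqrt(cache)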
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    lr0 = 0.001

    costs_RMS = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = (derivative_b2(pY, y) + reg * b2)
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_RMS, label="rms")
    plt.legend()
    plt.show()
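Code Examples #3, #5, and #6 rely on two-layer-network helpers that are likewise not shown. A sketch matching the call sites, assuming ReLU hidden units as in Code Example #1 (if the actual network uses sigmoid units, replace the (Z > 0) mask with Z * (1 - Z)):

def forward(X, W1, b1, W2, b2):
    Z = np.maximum(X.dot(W1) + b1, 0)  # ReLU hidden layer
    A = Z.dot(W2) + b2
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True), Z


def derivative_W2(Z, p_y, t):
    return Z.T.dot(p_y - t)


def derivative_b2(p_y, t):
    return (p_y - t).sum(axis=0)


def derivative_W1(X, W2, Z, p_y, t):
    dZ = (p_y - t).dot(W2.T) * (Z > 0)  # backprop through the ReLU
    return X.T.dot(dZ)


def derivative_b1(W2, Z, p_y, t):
    dZ = (p_y - t).dot(W2.T) * (Z > 0)
    return dZ.sum(axis=0)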
Code Example #4
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = len(set(Ytrain))

    W1_init = np.random.randn(D, M1) / np.sqrt(D)
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 15

    # define variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    pY = tf.matmul(Z2, W3) + b3  # remember, the cost function does the softmaxing!

    # define the cost function
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=T, logits=pY))

    # we choose the optimizer but don't implement the algorithm ourselves
    # let's go with RMSprop, since we just learned about it.
    # it includes momentum!
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    prediction = tf.argmax(pY, axis=1)

    costs_batch = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(epochs):
            tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
            for j in range(n_batches):
                x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
                y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

                session.run(train_op, feed_dict={X: x, T: y})
                if j % 50 == 0:
                    cost_val = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction_val = session.run(prediction, feed_dict={X: Xtest})
                    e = error_rate(prediction_val, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, e))
                    costs_batch.append(cost_val)

    plt.plot(costs_batch)
    plt.show()
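Code Example #4 targets the TensorFlow 1.x API (tf.placeholder, tf.Session, tf.train.RMSPropOptimizer), which no longer exists in TensorFlow 2's default namespace. Under TF2 the listing should run unchanged through the compatibility shim:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restores graph mode, placeholders, and Session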
Code Example #5
def main():

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 10
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1st moment
    mW2 = 0
    mb2 = 0
    mW1 = 0
    mb1 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # 1. Adam
    costs_adam = []
    t = 1
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            gb2 = (derivative_b2(pY, y) + reg * b2)
            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)

            # new m
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # new v
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1

            correction2 = 1 - beta2**t
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2

            # update t
            t += 1

            # apply updates to the params
            W2 -= lr * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 -= lr * hat_mb2 / np.sqrt(hat_vb2 + eps)
            W1 -= lr * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 -= lr * hat_mb1 / np.sqrt(hat_vb1 + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_adam.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999

    # momentum
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    costs_RMS = []
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            # updates
            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = (derivative_b2(pY, y) + reg * b2)
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_adam, label='adam')
    plt.plot(costs_RMS, label='rmsprop')
    plt.legend()
    plt.show()
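For reference, the Adam updates Code Example #5 implements for each parameter \theta with gradient g_t (one detail to note: the listing computes np.sqrt(hat_v + eps), folding eps inside the square root, whereas the Adam paper uses \sqrt{\hat{v}_t} + \epsilon; both variants appear in practice and behave almost identically):

m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
\hat{m}_t = m_t / (1 - \beta_1^t), \quad \hat{v}_t = v_t / (1 - \beta_2^t)
\theta_t = \theta_{t-1} - \eta \, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)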
Code Example #6
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    costs_batch_momentum = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            gb2 = (derivative_b2(pY, y) + reg * b2)
            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch_momentum.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    costs_batch_momentum_nesterov = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            gb2 = (derivative_b2(pY, y) + reg * b2)
            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch_momentum_nesterov.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_batch_momentum, label="momentum")
    plt.plot(costs_batch_momentum_nesterov, label="nesterov")
    plt.legend()
    plt.show()
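The Nesterov section above uses the velocity-form rewrite of Nesterov momentum rather than an explicit lookahead gradient:

v_t = \mu v_{t-1} - \eta g_t
\theta_t = \theta_{t-1} + \mu v_t - \eta g_t

Expanding both shows this is algebraically identical to the commonly quoted form \theta_t = \theta_{t-1} - \mu v_{t-1} + (1 + \mu) v_t, so the listing performs a correct Nesterov update even though it never evaluates the gradient at the lookahead point \theta + \mu v.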