Example #1
def main():
    # compare 4 scenarios:
    # 1. batch SGD with constant learning rate
    # 2. batch SGD with RMSProp
    # 3. batch SGD with AdaGrad
    # 4. batch SGD with exponential learning rate decay (commented out below)

    np.random.seed(2)

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD with constant learning rate:
    LL_batch = []
    CR_batch = []
    t0 = datetime.now()
    print('\nperforming batch SGD with constant learning rate...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_batch.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err1 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_batch)
    #plt.title('Cost for batch GD with const lr')
    #plt.show()

    # 2. batch SGD with RMSProp:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_RMSProp = []
    CR_RMSProp = []

    lr0 = 0.001  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay = 0.999
    eps = 10e-10

    t0 = datetime.now()

    print('\nperforming batch SGD with RMSProp...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_RMSProp.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_RMSProp.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err2 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_RMSProp)
    #plt.title('Cost for batch SGD with RMSProp')
    #plt.show()

    # 3. batch SGD with AdaGrad:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_AdaGrad = []
    CR_AdaGrad = []

    lr0 = 0.01  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    eps = 10e-10

    t0 = datetime.now()

    print('\nperforming batch SGD with AdaGrad...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = cache_W2 + gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = cache_b2 + gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = cache_W1 + gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = cache_b1 + gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_AdaGrad.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_AdaGrad.append(error)
                print('error rate:', error)

    dt3 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err3 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_AdaGrad)
    #plt.title('Cost for batch SGD with AdaGrad')
    #plt.show()
    '''
    # 4. batch SGD with exponential learning rate decay:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_exp = []
    CR_exp = []

    lr0 = 0.0004  # initial learning rate
    k = 1e-7
    t = 0  # iteration counter
    lr = lr0
    t0 = datetime.now()

    print('\nperforming batch SGD with lr exponential decay...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            W2 -= lr * gW2

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            b2 -= lr * gb2

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            W1 -= lr * gW1

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            b1 -= lr * gb1

            # decrease the learning rate:
            lr = lr0 * np.exp(-k * t)
            t += 1

            if j % print_period == 0:
                print('current learning rate:', lr)
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_exp.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_exp.append(error)
                print('error rate:', error)

    dt4 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for batch SGD with lr exponential decay:', dt4)

    # plot the cost
    #plt.plot(LL_exp)
    #plt.title('Cost for batch SGD with lr exponential decay')
    #plt.show()
    '''
    print('\nBatch SGD with constant learning rate:')
    print('final error rate:', final_err1)
    print('elapsed time:', dt1)

    print('\nBatch SGD with RMSProp:')
    print('final error rate:', final_err2)
    print('elapsed time:', dt2)

    print('\nBatch SGD with AdaGrad:')
    print('final error rate:', final_err3)
    print('elapsed time:', dt3)

    # plot the costs together:
    plt.plot(LL_batch, label='const_lr')
    plt.plot(LL_RMSProp, label='RMSProp')
    plt.plot(LL_AdaGrad, label='AdaGrad')
    #plt.plot(LL_exp, label='lr_exp_decay')
    plt.legend()
    plt.show()
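
For reference, the three scenarios that actually run above differ only in how the raw gradient is scaled. The sketch below is not part of the original example; the helper names are illustrative and the default learning rates roughly mirror the values used above.

import numpy as np

def sgd_step(param, grad, lr=0.00004):
    # constant learning rate: step straight down the (regularized) gradient
    return param - lr * grad

def adagrad_step(param, grad, cache, lr=0.01, eps=1e-10):
    # AdaGrad: accumulate all squared gradients, so the effective step size
    # for each parameter can only shrink over time
    cache = cache + grad * grad
    return param - lr * grad / np.sqrt(cache + eps), cache

def rmsprop_step(param, grad, cache, lr=0.001, decay=0.999, eps=1e-10):
    # RMSProp: exponentially decaying average of squared gradients, so the
    # effective step size can recover when recent gradients become small
    cache = decay * cache + (1 - decay) * grad * grad
    return param - lr * grad / np.sqrt(cache + eps), cache
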
Example #2
def main():
    # compare 2 scenarios:
    # 1. batch GD with RMSProp and momentum
    # 2. Adam optimizer

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # 1. batch GD with RMSProp and momentum:
    print('\nperforming batch GD with RMSProp and momentum...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_rm = []
    CR_rm = []

    # hyperparams:
    lr0 = 0.001
    #lr0 = 0.0001
    mu = 0.9
    decay = 0.999
    eps = 10e-9

    # momentum (velocity terms):
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    # rms-prop cache (with no bias correction):
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    t0 = datetime.now()
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            # (note: we use a slightly different formulation of momentum here)
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps))
            W2 -= dW2
            #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps)
            #W2 += dW2

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps))
            b2 -= db2
            #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps)
            #b2 += db2

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps))
            W1 -= dW1
            #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps)
            #W1 += dW1

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps))
            b1 -= db1
            #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps)
            #b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_rm.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_rm.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for batch GD with RMSProp and momentum:', dt1)

    # plot the cost
    plt.plot(LL_rm)
    plt.title('Cost for batch GD with RMSProp and momentum')
    plt.show()

    # 2. Adam optimizer
    print('\nperforming Adam optimization...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # hyperparams:
    lr = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 10e-9

    # 1st moment:
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment:
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    LL_adam = []
    CR_adam = []
    t0 = datetime.now()
    t = 1  # step counter for bias correction; starts at 1 so that (1 - beta**t) is never zero
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates:
            # gradients:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1

            # 1st moment:
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # 2nd moment:
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction:
            mW2_bc = mW2 / (1 - beta1**t)
            mb2_bc = mb2 / (1 - beta1**t)
            mW1_bc = mW1 / (1 - beta1**t)
            mb1_bc = mb1 / (1 - beta1**t)

            vW2_bc = vW2 / (1 - beta2**t)
            vb2_bc = vb2 / (1 - beta2**t)
            vW1_bc = vW1 / (1 - beta2**t)
            vb1_bc = vb1 / (1 - beta2**t)

            # weights and biases (parameters):
            W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps)
            b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps)
            W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps)
            b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps)

            t += 1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_adam.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_adam.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for Adam optimizer:', dt2)

    # plot the cost
    plt.plot(LL_adam)
    plt.title('Cost for Adam optimizer')
    plt.show()

    # plot costs from the two experiments together:
    plt.plot(LL_rm, label='RMSProp with momentum')
    plt.plot(LL_adam, label='Adam optimizer')
    plt.title('Cost')
    plt.legend()
    plt.show()
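
Apart from the momentum term, what Adam adds over RMSProp in this comparison is bias correction: m and v start at zero, so for small t they underestimate the true moments, and dividing by (1 - beta**t) compensates. A quick illustrative calculation (not part of the example) shows how large that correction is early on, with the beta values used above:

beta1, beta2 = 0.9, 0.999
for t in range(1, 6):
    # scaling factors applied to the raw first and second moments at step t
    print('t=%d  1/(1-beta1^t)=%.3f  1/(1-beta2^t)=%.1f'
          % (t, 1 / (1 - beta1**t), 1 / (1 - beta2**t)))
# at t=1 the moments are scaled up by 10x and 1000x; after a few thousand
# steps both factors are essentially 1 and the correction no longer matters
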
def main():

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    lr0 = 0.001

    costs_RMS = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = (derivative_b2(pY, y) + reg * b2)
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_RMS, label="rms")
    plt.legend()
    plt.show()
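
Unlike the first two examples, this one reshuffles the training set at the start of every epoch before slicing mini-batches. The shuffle() it calls is not shown here; it is presumably sklearn.utils.shuffle or an equivalent helper along these lines (a minimal sketch, not the original code):

import numpy as np

def shuffle(X, Y):
    # permute the rows of X and Y with the same random order so that
    # each input row stays aligned with its (one-hot) label row
    idx = np.random.permutation(len(X))
    return X[idx], Y[idx]
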
Example #4
def main():
	# load the data:
	(Xtrain, Ytrain), (Xtest, Ytest) = mnist.load_data()
	# print(Xtrain.shape)
	N, d, _ = Xtrain.shape
	D = d*d
	Ntest = len(Xtest)

	# normalize the data:
	Xtrain = Xtrain / 255.0
	Xtest = Xtest / 255.0

	# display:
	# n = np.random.choice(N)
	# plt.imshow(Xtrain[n], cmap='gray')
	# plt.title(str(Ytrain[n]))
	# plt.show()

	# reshape the data:
	Xtrain = Xtrain.reshape(N, D)
	Xtest = Xtest.reshape(Ntest, D)	

	# print('Xtrain.max():', Xtrain.max())
	# print('Xtrain.shape:', Xtrain.shape)

	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)


	# define hyperparameters:
	epochs = 30
	print_period = 10
	lr = 0.00004
	reg = 0.01

	batch_sz = 500
	n_batches = N // batch_sz

	M = 300 # the hidden layer size
	K = len(set(Ytrain))

	# randomly initialize the weights:
	W1_init = np.random.randn(D, M) / np.sqrt(D)
	b1_init = np.zeros(M)
	W2_init = np.random.randn(M, K) / np.sqrt(M)
	b2_init = np.zeros(K)

	
	# 1. mini-batch SGD:
	losses_batch = []
	errors_batch = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	print('\nmini-batch SGD')

	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# update the params:
			W2 -= lr*(derivative_W2(Z, Ybatch, pYbatch) + reg*W2)
			b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
			W1 -= lr*(derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
			b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest)
				losses_batch.append(l)
				e = error_rate(pY, Ytest)
				errors_batch.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				# print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
				# print('error_rate:', e)

	sys.stdout.flush()	
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('Elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)

	
	# 2. mini-batch SGD with momentum - version 1:
	losses_momentum1 = []
	errors_momentum1 = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	mu = 0.9 # momentum term
	# initial values for the 'velocities':
	dW2 = 0 
	db2 = 0
	dW1 = 0
	db1 = 0

	print('\nmini-batch SGD with momentum - version 1')
	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# calculate the gradients:
			gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

			# update the 'velocities':
			dW2 = mu*dW2 - lr*gW2  
			db2 = mu*db2 - lr*gb2 
			dW1 = mu*dW1 - lr*gW1 
			db1 = mu*db1 - lr*gb1 
			
			# update the params:
			W2 += dW2
			b2 += db2
			W1 += dW1
			b1 += db1

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest)
				losses_momentum1.append(l)
				e = error_rate(pY, Ytest)
				errors_momentum1.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				# print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
				# print('error_rate:', e)
	
	sys.stdout.flush()
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('Elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
	
	'''
	# 3. mini-batch SGD with momentum - version 2:
	losses_momentum2 = []
	errors_momentum2 = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	mu = 0.9 # momentum term
	# initial values for the 'velocities':
	dW2 = 0 
	db2 = 0
	dW1 = 0
	db1 = 0

	# lr = 0.0004

	print('\nmini-batch SGD with momentum - version 2')
	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# calculate the gradients:
			gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

			# # update the 'velocities':
			dW2 = mu*dW2 + (1-mu)*gW2  
			db2 = mu*db2 + (1-mu)*gb2 
			dW1 = mu*dW1 + (1-mu)*gW1 
			db1 = mu*db1 + (1-mu)*gb1 

			# update the 'velocities':
			# dW2 = mu*dW2 + gW2  
			# db2 = mu*db2 + gb2 
			# dW1 = mu*dW1 + gW1 
			# db1 = mu*db1 + gb1 
			
			# update the params:
			W2 -= lr*dW2
			b2 -= lr*db2
			W1 -= lr*dW1
			b1 -= lr*db1

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest)
				losses_momentum2.append(l)
				e = error_rate(pY, Ytest)
				errors_momentum2.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				sys.stdout.flush()				
				
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
    # best result: epochs = 25, final_error = 0.0179
	'''
	# 4. mini-batch SGD with Nesterov momentum:
	losses_nesterov_momentum = []
	errors_nesterov_momentum = []

	W1 = W1_init.copy()
	b1 = b1_init.copy()
	W2 = W2_init.copy()
	b2 = b2_init.copy()

	mu = 0.9 # momentum term
	# initial values for the 'velocities':
	dW2 = 0 
	db2 = 0
	dW1 = 0
	db1 = 0


	print('\nmini-batch SGD with Nesterov momentum')
	t0 = datetime.now()
	for i in range(epochs):
		Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			# calculate the gradients:
			gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			
			# update the 'velocities':
			dW2 = mu*dW2 - lr*gW2  
			db2 = mu*db2 - lr*gb2 
			dW1 = mu*dW1 - lr*gW1 
			db1 = mu*db1 - lr*gb1 
			
			# update the params:
			W2 += mu*dW2 - lr*gW2  
			b2 += mu*db2 - lr*gb2 
			W1 += mu*dW1 - lr*gW1 
			b1 += mu*db1 - lr*gb1 

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				l = cross_entropy(pY, Ytest)
				losses_nesterov_momentum.append(l)
				e = error_rate(pY, Ytest)
				errors_nesterov_momentum.append(e)
				sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
				sys.stdout.flush()
				# print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
				# print('error_rate:', e)

	
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('Elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20)
	
	# plot the losses:
	plt.plot(losses_batch, label='mini-batch SGD')
	plt.plot(losses_momentum1, label='+ momentum')
	plt.plot(losses_nesterov_momentum, label='+ Nesterov momentum')
	plt.xlabel('iterations')
	plt.ylabel('loss')
	plt.legend()
	plt.show()
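
The Nesterov section above first refreshes each velocity and then steps by mu*velocity - lr*gradient once more. A common equivalent way to write the same update keeps the previous velocity explicit; a minimal sketch for a single parameter array (illustrative names, with the lr and mu used above):

def nesterov_step(param, velocity, grad, lr=0.00004, mu=0.9):
    v_prev = velocity
    velocity = mu * velocity - lr * grad   # same velocity update as above
    # algebraically identical to: param += mu*velocity - lr*grad
    param = param - mu * v_prev + (1 + mu) * velocity
    return param, velocity
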
def main():

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 10
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1st moment
    mW2 = 0
    mb2 = 0
    mW1 = 0
    mb1 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # 1. Adam
    costs_adam = []
    t = 1
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            gb2 = (derivative_b2(pY, y) + reg * b2)
            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)

            # new m
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # new v
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1

            correction2 = 1 - beta2**t
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2

            # update t
            t += 1

            # apply updates to the params
            W2 -= lr * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 -= lr * hat_mb2 / np.sqrt(hat_vb2 + eps)
            W1 -= lr * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 -= lr * hat_mb1 / np.sqrt(hat_vb1 + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_adam.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999

    # momentum
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    costs_RMS = []
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            # updates
            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = (derivative_b2(pY, y) + reg * b2)
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_adam, label='adam')
    plt.plot(costs_RMS, label='rmsprop')
    plt.legend()
    plt.show()
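
A small detail worth noting in this example: the Adam updates add eps inside the square root (np.sqrt(hat_vW2 + eps)) while the RMSProp-with-momentum updates add it outside (np.sqrt(cache_W2) + eps). Both forms only guard against division by zero, and with the caches initialized to 1 the difference is negligible; a tiny check (illustrative only):

import numpy as np

eps = 1e-8
cache = 1.0                          # the caches above are initialized to 1
print(1.0 / np.sqrt(cache + eps))    # eps inside the square root
print(1.0 / (np.sqrt(cache) + eps))  # eps outside the square root
# both are ~1.0 here; the two forms only diverge when the cache itself
# is close to zero
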
Example #6
def main():
	# compare 3 scenarios:
	# 1. batch SGD
	# 2. batch SGD with momentum
	# 3. batch SGD with Nesterov momentum
	
	Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
	
	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)
	
	N, D = Xtrain.shape
	M = 300
	K = len(set(Ytrain))
		
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)
	
	# save initial weights
	W1_0 = W1.copy()
	b1_0 = b1.copy()
	W2_0 = W2.copy()
	b2_0 = b2.copy()
	
	lr = 0.00004
	reg = 0.01
	batch_sz = 500
	n_batches = N // batch_sz
	epochs = 20
	
	# 1. batch
	costs_batch = []
	for t in range(epochs):
		tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
			y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
			
			pY, Z = forward(x, W1, b1, W2, b2)
		
			W2 -= lr * (derivative_W2(Z, pY, y) + reg*W2)
			b2 -= lr * (derivative_b2(pY, y) + reg*b2)
			W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg*W1)
			b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg*b1)
			
			if j % 50 == 0:
				pY_test, _ = forward(Xtest, W1, b1, W2, b2)
				c = cost(pY_test, Ytest_ind)
				costs_batch.append(c)
				print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

				e = error_rate(pY_test, Ytest)
				print("Error rate:", e)
	print("\n")
			
	# 2. batch with momentum
	W1 = W1_0.copy()
	b1 = b1_0.copy()
	W2 = W2_0.copy()
	b2 = b2_0.copy()
	
	mu = 0.9
	dW2 = 0
	db2 = 0
	dW1 = 0
	db1 = 0
	
	costs_batch_momentum = []
	for t in range(epochs):
		tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
			y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
			
			pY, Z = forward(x, W1, b1, W2, b2)
			
			# gradients
			gW2 = (derivative_W2(Z, pY, y) + reg*W2)
			gb2 = (derivative_b2(pY, y) + reg*b2)
			gW1 = (derivative_W1(x, W2, Z, pY, y) + reg*W1)
			gb1 = (derivative_b1(W2, Z, pY, y) + reg*b1)
			
			# update velocities
			dW2 = mu*dW2 - lr*gW2
			db2 = mu*db2 - lr*gb2
			dW1 = mu*dW1 - lr*gW1
			db1 = mu*db1 - lr*gb1
			
			# updates
			W2 += dW2
			b2 += db2
			W1 += dW1
			b1 += db1
			
			if j % 50 == 0:
				pY_test, _ = forward(Xtest, W1, b1, W2, b2)
				c = cost(pY_test, Ytest_ind)
				costs_batch_momentum.append(c)
				print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

				e = error_rate(pY_test, Ytest)
				print("Error rate:", e)
	print("\n")
				
	# 3. batch with Nesterov momentum
	W1 = W1_0.copy()
	b1 = b1_0.copy()
	W2 = W2_0.copy()
	b2 = b2_0.copy()
	
	mu = 0.9
	vW2 = 0
	vb2 = 0
	vW1 = 0
	vb1 = 0
	
	costs_batch_momentum_nesterov = []
	for t in range(epochs):
		tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
		for j in range(n_batches):
			x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
			y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
			
			pY, Z = forward(x, W1, b1, W2, b2)
			
			# gradients
			gW2 = (derivative_W2(Z, pY, y) + reg*W2)
			gb2 = (derivative_b2(pY, y) + reg*b2)
			gW1 = (derivative_W1(x, W2, Z, pY, y) + reg*W1)
			gb1 = (derivative_b1(W2, Z, pY, y) + reg*b1)
			
			# v update
			vW2 = mu*vW2 - lr*gW2
			vb2 = mu*vb2 - lr*gb2
			vW1 = mu*vW1 - lr*gW1
			vb1 = mu*vb1 - lr*gb1
			
			# param update
			W2 += mu*vW2 - lr*gW2
			b2 += mu*vb2 - lr*gb2
			W1 += mu*vW1 - lr*gW1
			b1 += mu*vb1 - lr*gb1
				
			if j % 50 == 0:
				pY_test, _ = forward(Xtest, W1, b1, W2, b2)
				c = cost(pY_test, Ytest_ind)
				costs_batch_momentum_nesterov.append(c)
				print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

				e = error_rate(pY_test, Ytest)
				print("Error rate:", e)
	
	plt.plot(costs_batch, label="batch")
	plt.plot(costs_batch_momentum, label="momentum")
	plt.plot(costs_batch_momentum_nesterov, label="nesterov")
	plt.legend()
	plt.show()
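
All of the examples above rely on helper functions defined elsewhere (get_normalized_data / get_transformed_data, forward, cost, cross_entropy, error_rate, the derivative_* functions and y2indicator). They are not reproduced here; as a rough guide to how two of them are used, these are plausible minimal sketches, not the original implementations:

import numpy as np

def y2indicator(y):
    # convert integer class labels of shape (N,) into a one-hot
    # indicator matrix of shape (N, K), as the examples above expect
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), np.asarray(y, dtype=int)] = 1
    return ind

def error_rate(p_y, targets):
    # p_y: predicted class probabilities (N, K); targets: integer labels (N,)
    return np.mean(np.argmax(p_y, axis=1) != targets)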