Example #1
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
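            # (plain SGD step: each parameter moves by -lr times its
            # L2-regularized gradient, i.e. W <- W - lr * (dJ/dW + reg * W))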

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1
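            # note: this form (v = mu*v - lr*g, then W += mu*v - lr*g) is
            # algebraically equivalent to the usual Nesterov reformulation
            # W += -mu*v_prev + (1 + mu)*v_new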

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
def main():
    max_iter = 10
    print_period = 40

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    Ytrain_ind = y2indicator(Ytrain)   # Target of train data 
    Ytest_ind = y2indicator(Ytest)      # Target of test data 

    N, D = Xtrain.shape
    M = 300
    K = 10

    np.random.seed(123)
    W1 = np.random.randn(D, M) / np.sqrt(D)   
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)  
    b2 = np.zeros(K)

    batch_sz = 500
    n_batches = N // batch_sz    # 82

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.001
    reg = 0.01

    # 1. RMSProp + Momentum
    decay_rate = 0.999
    eps = 1e-8
    mu = 0.9

    # RMSProp cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    losses_rms = []
    errors_rms = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            ## Note: the momentum update here is not the classical form; it mimics
            ## Adam's first moment (moment1), so it can be seen as Adam without
            ## bias correction.
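            ## For reference, the classical momentum update would be
            ##     dW = mu*dW - lr*g;  W += dW
            ## while the form used below is
            ##     dW = mu*dW + (1-mu) * lr * g / (sqrt(cache) + eps);  W -= dW
            ## i.e. an exponential moving average of RMSProp-scaled steps.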
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu*dW2 + (1 - mu) * lr * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu*db2 + (1 - mu) * lr * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu*dW1 + (1 - mu) * lr * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu*db1 + (1 - mu)* lr * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. Adam
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    beta1 = 0.9  # 0.999  # tweaking these two values changes the results a lot -- hyperparameter tuning really matters
    beta2 = 0.999  # 0.99
    eps = 1e-8
    t = 1

    m_W2 = 0
    m_b2 = 0
    m_W1 = 0
    m_b1 = 0

    v_W2 = 0
    v_b2 = 0
    v_W1 = 0
    v_b1 = 0

    losses_adam = []
    errors_adam = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradient
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            m_W2 = beta1*m_W2 + (1 - beta1)*gW2
            m_b2 = beta1*m_b2 + (1 - beta1)*gb2
            m_W1 = beta1*m_W1 + (1 - beta1)*gW1
            m_b1 = beta1*m_b1 + (1 - beta1)*gb1

            # new v
            v_W2 = beta2*v_W2 + (1 - beta2)*gW2*gW2
            v_b2 = beta2*v_b2 + (1 - beta2)*gb2*gb2
            v_W1 = beta2*v_W1 + (1 - beta2)*gW1*gW1
            v_b1 = beta2*v_b1 + (1 - beta2)*gb1*gb1

            # m_hat get from bias correction
            m_W2_hat = m_W2 / (1 - beta1**t)
            m_b2_hat = m_b2 / (1 - beta1**t)
            m_W1_hat = m_W1 / (1 - beta1**t)
            m_b1_hat = m_b1 / (1 - beta1**t)

            # v_hat get from bias correction
            v_W2_hat = v_W2 / (1 - beta2**t)
            v_b2_hat = v_b2 / (1 - beta2**t)
            v_W1_hat = v_W1 / (1 - beta2**t)
            v_b1_hat = v_b1 / (1 - beta2**t)
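            # (m and v start at 0, so the raw estimates are biased toward 0 in the
            # first steps; dividing by 1 - beta**t removes that bias, and the
            # correction fades away as t grows)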

            # update
            W2 -= lr * m_W2_hat / np.sqrt(v_W2_hat + eps)
            b2 -= lr * m_b2_hat / np.sqrt(v_b2_hat + eps)
            W1 -= lr * m_W1_hat / np.sqrt(v_W1_hat + eps)
            b1 -= lr * m_b1_hat / np.sqrt(v_b1_hat + eps)

            t += 1
            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_adam.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_rms, label='RMSprop + Momentum')
    plt.plot(losses_adam, label='Adam')
    plt.legend()
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)



    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main():
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = int(N / batch_sz)

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. const
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. RMSprop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001  # if you set this too high you'll get NaN!
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
Example #5
def main():
    # compare 2 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    '''
    # steps of training a model
    # 0. get data
    # 1. function --> forward --> OK
    # 2. loss --> cost --> ok
    # 3. solver --> gradient descent
    '''

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)  # Target of train data
    Ytest_ind = y2indicator(Ytest)  # Target of test data

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    np.random.seed(42)
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    '''
    Dividing the initial weights by np.sqrt(D) helps the cost a lot.
    1. Without this divisor the initial weights are too large, which has the
       following consequences:
        1.1 In forward(), np.exp(-( X.dot(W1) + b1 )) inside the sigmoid is likely to
            overflow early on ("overflow encountered"), i.e. the numeric type cannot
            hold such a large number; a warning is raised but the code keeps running,
            because the sigmoid output still lies between 0 and 1.

        1.2 With relu the outputs can become extremely large, and after
            expA = np.exp(A) they blow up to infinity, usually around the second
            batch of the first epoch.

        1.3 Conclusion: with activation = relu, the initial weights must be small enough.

    2. Dividing by the large value 500 (about 20x np.sqrt(D)): in the
       "update weights with momentum" experiment the cost in the last epoch is 106.49.

    3. Dividing by np.sqrt(D): in the same momentum experiment the cost in the
       last epoch is 119.005.

    4. Dividing by the very large value 5000: in the same momentum experiment the
       cost in the last epoch is 160.68.

    5. Some references say that in most cases simply dividing by 100 already gives
       good training results.
    '''
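    # (the np.sqrt(D) and np.sqrt(M) divisors above implement the common
    # 1/sqrt(fan_in) initialization scaling, which keeps the pre-activations
    # at roughly unit scale regardless of the layer width)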

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_momentum = []
    errors_momentum = []
    mu = 0.9  # momentum coefficient; 0.9 is good enough in most cases
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print('Cost at iteration i=%d, j=%d : %.6f' % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print('Error rate:', e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('Final error rate:', error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label="momentum")
    plt.legend()
    plt.show()
Example #6
def main():
    # compare 4 scenarios:
    # 1. batch SGD with constant learning rate
    # 2. batch SGD with RMSProp
    # 3. batch SGD with AdaGrad
    # 4. batch SGD with exponential decay

    np.random.seed(2)

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD with constant learning rate:
    LL_batch = []
    CR_batch = []
    t0 = datetime.now()
    print('\nperforming batch SGD with constant learning rate...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_batch.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err1 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_batch)
    #plt.title('Cost for batch GD with const lr')
    #plt.show()

    # 2. batch GD with RMSProp:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_RMSProp = []
    CR_RMSProp = []

    lr0 = 0.001  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay = 0.999
    eps = 10e-10

    t0 = datetime.now()

    print('\nperforming batch SGD with RMSProp...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)
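            # (note: here eps sits inside the sqrt, np.sqrt(cache + eps); other
            # examples in this file use np.sqrt(cache) + eps -- both are common
            # ways to avoid division by zero)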

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_RMSProp.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_RMSProp.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err2 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_RMSProp)
    #plt.title('Cost for batch SGD with RMSProp')
    #plt.show()

    # 3. batch SGD with AdaGrad:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_AdaGrad = []
    CR_AdaGrad = []

    lr0 = 0.01  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    eps = 10e-10

    t0 = datetime.now()

    print('\nperforming batch SGD with AdaGrad...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = cache_W2 + gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = cache_b2 + gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = cache_W1 + gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = cache_b1 + gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)
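            # (unlike RMSProp, AdaGrad keeps accumulating squared gradients with
            # no decay, so the effective learning rate can only shrink over time)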

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_AdaGrad.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_AdaGrad.append(error)
                print('error rate:', error)

    dt3 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err3 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_AdaGrad)
    #plt.title('Cost for batch SGD with AdaGrad')
    #plt.show()
    '''

	# 4. batch SGD with exponential decay:
	W1 = W1_0.copy()
	b1 = b1_0.copy()
	W2 = W2_0.copy()
	b2 = b2_0.copy()

	LL_exp = []
	CR_exp = []

	
	lr0 = 0.0004 # initial learning rate
	k = 1e-7
	t = 0 # initial log
	lr = lr0 
	t0 = datetime.now()

	print('\nperforming batch SGD with lr exponential decay...')
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_size:(j+1)*batch_size, :]
			Ybatch = Ytrain_ind[j*batch_size:(j+1)*batch_size, :]
			p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
			#print(Z.shape, p_Ybatch.shape, Ybatch.shape)
			#print('First batch cost:', cost(p_Ybatch, Ybatch))
			
			# updates:
			gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg*W2)
			W2 -= lr*gW2
			
			gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg*b2)			
			b2 -= lr*gb2
			
			gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg*W1)
			W1 -= lr*gW1
			
			gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg*b1)
			b1 -= lr*gb1 

			# decrease the learning rate
			lr = lr0 * np.exp(-k*t)
			t += 1

			if j % print_period == 0:
				print('current learning rate:', lr)
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				#print('pY:', pY)
				ll = cost(pY, Ytest_ind)
				LL_exp.append(ll)
				print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

				error = error_rate(pY, Ytest)
				CR_exp.append(error)
				print('error rate:', error)

	dt4 = datetime.now() - t0
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('\nFinal error rate:', error_rate(pY, Ytest))
	print('Elapsed time for batch SGD with lr exponential decay:', dt4)

	# plot the cost
	#plt.plot(LL_exp)
	#plt.title('Cost for batch SGD with lr exponential decay')
	#plt.show()

'''
    print('\nBatch SGD with constant learning rate:')
    print('final error rate:', final_err1)
    print('elapsed time:', dt1)

    print('\nBatch SGD with RMSProp:')
    print('final error rate:', final_err2)
    print('elapsed time:', dt2)

    print('\nBatch SGD with AdaGrad:')
    print('final error rate:', final_err3)
    print('elapsed time:', dt3)

    # plot the costs together:
    plt.plot(LL_batch, label='const_lr')
    plt.plot(LL_RMSProp, label='RMSProp')
    plt.plot(LL_AdaGrad, label='AdaGrad')
    #plt.plot(LL_exp, label='lr_exp_decay')
    plt.legend()
    plt.show()
Example #7
def main():

    # Adam optimizer:
    # combines momentum (1st moment) and RMSProp (2nd moment)
    # with bias correction, all with L2 regularization

    X, Y = get_normalized_data()

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10
    ##################### IMPORTANT PARAMETER #####################
    t = 1  # Adam time step; must start at 1 (used in the bias correction)
    ################################################################
    epochs = 20
    print_period = 10
    lr0 = 0.001
    reg = 0.01
    epsilon = 1e-8  # note: 10e-8 would be 1e-7, so they are not the same

    beta1 = 0.9  # mu = 0.9
    beta2 = 0.999  # decay = 0.999
    batch_size = 500
    number_batches = int(N // batch_size)

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # first and second moment estimates (and their bias-corrected versions)

    mW2 = 0
    mW1 = 0
    mb2 = 0
    mb1 = 0

    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    mW2_hat = 0
    mW1_hat = 0
    mb2_hat = 0
    mb1_hat = 0

    vW1_hat = 0
    vW2_hat = 0
    vb1_hat = 0
    vb2_hat = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # update momentum
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # update velocity
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction
            correction1 = (1 - beta1**t)
            mW2_hat = mW2 / correction1
            mW1_hat = mW1 / correction1
            mb2_hat = mb2 / correction1
            mb1_hat = mb1 / correction1

            correction2 = (1 - beta2**t)
            vW2_hat = vW2 / correction2
            vW1_hat = vW1 / correction2
            vb2_hat = vb2 / correction2
            vb1_hat = vb1 / correction2

            # update t !!!!!!!
            t += 1

            # update
            W2 -= lr0 * (mW2_hat / np.sqrt(vW2_hat + epsilon))
            W1 -= lr0 * (mW1_hat / np.sqrt(vW1_hat + epsilon))
            b2 -= lr0 * (mb2_hat / np.sqrt(vb2_hat + epsilon))
            b1 -= lr0 * (mb1_hat / np.sqrt(vb1_hat + epsilon))

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" %
                      (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='test losses: Adam')
    #plt.plot(errors_batch, label='errors_batch')
    # plt.show()
    #    print("tr_costs", errors_batch_momentum)
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
Example #8
def main():

    dobatch = False
    dobatchwithmomentum = True
    dobatchwithnesterovmomentum = True

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # batch

    #cost = -16
    if dobatch:
        losses_batch = []
        errors_batch = []

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
                b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
                W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                            reg * W1)
                b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_batch.append(l)
                    print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                    e = error_rate(pY, Ytest)
                    errors_batch.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate:  ", error_rate(pY, Ytest))
        print()

    # batch with momentum

    if dobatchwithmomentum:
        print("momentum")

        W1 = W1_0.copy()
        b1 = b1_0.copy()
        W2 = W2_0.copy()
        b2 = b2_0.copy()

        losses_momentum = []
        errors_momentum = []

        mu = 0.9

        dW2 = 0
        db2 = 0
        dW1 = 0
        db1 = 0

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
                gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
                gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
                gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

                dW2 = mu * dW2 - lr * gW2
                db2 = mu * db2 - lr * gb2
                dW1 = mu * dW1 - lr * gW1
                db1 = mu * db1 - lr * gb1

                W2 += dW2
                b2 += db2
                W1 += dW1
                b1 += db1

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_momentum.append(l)
                    print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                    e = error_rate(pY, Ytest)
                    errors_momentum.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate:  ", error_rate(pY, Ytest))
        print()

    # Nesterov momentum
    if dobatchwithnesterovmomentum:
        W1 = W1_0.copy()
        b1 = b1_0.copy()
        W2 = W2_0.copy()
        b2 = b2_0.copy()

        losses_nesterov = []
        errors_nesterov = []

        mu = 0.9

        vW2 = 0
        vb2 = 0
        vW1 = 0
        vb1 = 0

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
                gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
                gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
                gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

                vW2 = mu * vW2 - lr * gW2
                vb2 = mu * vb2 - lr * gb2
                vW1 = mu * vW1 - lr * gW1
                vb1 = mu * vb1 - lr * gb1

                W2 += mu * vW2 - lr * gW2
                b2 += mu * vb2 - lr * gb2
                W1 += mu * vW1 - lr * gW1
                b1 += mu * vb1 - lr * gb1

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_nesterov.append(l)
                    print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                    e = error_rate(pY, Ytest)
                    errors_nesterov.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate:  ", error_rate(pY, Ytest))
        print()

    if dobatch:
        plt.plot(losses_batch, label="batch")
    if dobatchwithmomentum:
        plt.plot(losses_momentum, label="momentum")
    if dobatchwithnesterovmomentum:
        plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
def main():

    # compare 3:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    # all with L2 regularization

    X, Y = get_normalized_data()

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10

    max_iter = 20
    epochs = 20
    print_period = 10
    lr0 = 0.0004
    reg = 0.01
    epsilon = 10e-10
    decay = 0.999
    batch_size = 500
    number_batches = int(N // batch_size)

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1

    tr_costs = []
    errors_batch = []
    losses_test = []

    # 1. plain gradient descent with RMSProp
    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # # AdaGrad
            # cache_W2 += derivative_w2(z_tr, ytr, ytr_pred) * derivative_w2(z_tr, ytr, ytr_pred)
            # cache_W1 += derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) * derivative_w1(xtr, z_tr, ytr, ytr_pred, W2)
            # cache_b2 += derivative_b2(ytr, ytr_pred) * derivative_b2(ytr, ytr_pred)
            # cache_b1 += derivative_b1(z_tr, ytr, ytr_pred, W2) * derivative_b1(z_tr, ytr, ytr_pred, W2)

            # RMSProp
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1

            # regularization is already included in gW2/gb2/gW1/gb1
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon)
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon)
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon)
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon)

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs, label='tr_costs')
    plt.plot(losses_test, label='losses_test RMS')
    #plt.plot(errors_batch, label='errors_batch')
#    plt.show()
#    print("tr_costs", tr_costs)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. batch gradient descent with momentum & RMSProp
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # regular batch gradient descend

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # momentum coeficient
    mu = 0.8

    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1

    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    cW1 = 0
    cW2 = 0
    cb1 = 0
    cb2 = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # note: I may have mixed up momentum and velocity here

            # RMSProp
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1

            cW2 = gW2 / (np.sqrt(cache_W2) + epsilon)
            cb2 = gb2 / (np.sqrt(cache_b2) + epsilon)
            cW1 = gW1 / (np.sqrt(cache_W1) + epsilon)
            cb1 = gb1 / (np.sqrt(cache_b1) + epsilon)

            # update velocity
            dW2 = mu * dW2 + (1 - mu) * lr0 * cW2
            db2 = mu * db2 + (1 - mu) * lr0 * cb2
            dW1 = mu * dW1 + (1 - mu) * lr0 * cW1
            db1 = mu * db1 + (1 - mu) * lr0 * cb1

            # update
            W2 -= dW2
            W1 -= dW1
            b2 -= db2
            b1 -= db1

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='losses_test momentum RMS')
    #plt.plot(errors_batch, label='errors_batch')
    # plt.show()
#    print("tr_costs", errors_batch_momentum)
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
Example #10
def main():

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparameters
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8
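    # (these are the defaults suggested in the Adam paper by Kingma & Ba:
    # lr = 0.001, beta1 = 0.9, beta2 = 0.999, eps = 1e-8)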

    losses_adam = []
    errors_adam = []

    t = 1

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2**t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_adam.append(l)
                print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_adam.append(e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("final error rate:  ", error_rate(pY, Ytest))
    print()
    plt.plot(losses_adam, label='adam')

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    plt.legend()
    plt.show()
Example #11
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print("pY:", pY)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu * mu * dW2 - (1 + mu) * lr * (
                derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            W2 += dW2
            db2 = mu * mu * db2 - (1 + mu) * lr * (
                derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * mu * dW1 - (1 + mu) * lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * mu * db1 - (1 + mu) * lr * (
                derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print("pY:", pY)
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
Beispiel #12
0
def main():
    # Compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 30
    print_period = 10

    X_train, X_test, Y_train, Y_test = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Y_train_ind = y2indicator(Y_train)
    Y_test_ind = y2indicator(Y_test)

    N, D = X_train.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    # cost = -16

    losses_batch = []
    errors_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z,
                                      Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_batch.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_batch.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final error rate: {error_rate(pY, Y_test)}")

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # Update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1
            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_momentum.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_momentum.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final error rate: {error_rate(pY, Y_test)}")

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_nesterov = []
    errors_nesterov = []
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1
            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_nesterov.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_nesterov.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final error rate: {error_rate(pY, Y_test)}")

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label='momentum')
    plt.plot(losses_nesterov, label='nesterov')
    plt.legend()
    plt.show()
Beispiel #13
0
def main():
	max_iter = 20
	print_period = 10

	Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
	lr = 0.00004
	reg = 0.01

	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)

	N, D = Xtrain.shape
	batch_sz = 500
	n_batches = N // batch_sz

	M = 300
	K = 10
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	#const
	LL_batch = []
	CR_batch = []
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),]
			Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
			#gradient
			gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			#updates
			W2 -= lr*gW2
			b2 -= lr*gb2
			W1 -= lr*gW1
			b1 -= lr*gb1
			if j%print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_batch.append(ll)
				print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
				err = error_rate(pY, Ytest)
				CR_batch.append(err)
				print("Error rate:", err)
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print("Final error rate:", error_rate(pY, Ytest))

	#RMSprop
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	#const
	LL_rms = []
	CR_rms = []
	lr0 = 0.001
	cache_W2 = 1
	cache_b2 = 1
	cache_W1 = 1
	cache_b1 = 1
	decay_rate = 0.999
	eps = 1e-10
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),]
			Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			#updates
			gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
			cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
			W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps)

			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
			b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

			gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
			W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
			b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)
			if j%print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_rms.append(ll)
				print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
				err = error_rate(pY, Ytest)
				CR_rms.append(err)
				print("Error rate:", err)
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print("Final error rate:", error_rate(pY, Ytest))

	plt.plot(LL_batch, label = 'const')
	plt.plot(LL_rms, label = 'rms')
	plt.legend()
	plt.show()
Beispiel #14
0
def main():
    # 3 scenarios
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 15
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.0001
    reg = 0.001

    # Xtrain = X[:-1000, ]
    # Ytrain = Y[:-1000]
    # Xtest = X[-1000:, ]
    # Ytest = Y[-1000:, ]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = int(N / batch_sz)

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # Batch
    losses_batch = []
    error_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                error_batch.append(e)
                print("Ratio de error:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # Momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label='momentum')
    plt.plot(losses_nesterov, label='Nesterov')
    plt.legend()
    plt.show()
Beispiel #15
0
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    # alternate version uses dW
    # dW2 = 0
    # db2 = 0
    # dW1 = 0
    # db1 = 0
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - mu*dW(t)
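            # i.e. step to the look-ahead point W - lr*mu*v first, evaluate the
            # gradient there, fold it into the velocity v = mu*v + g, then take
            # the step W -= lr*v (the vW*/vb* updates below)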
            W1_tmp = W1 - lr * mu * vW1
            b1_tmp = b1 - lr * mu * vb1
            W2_tmp = W2 - lr * mu * vW2
            b2_tmp = b2 - lr * mu * vb2

            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # W2 += dW2
            # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            # b2 += db2
            # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            # W1 += dW1
            # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            # b1 += db1
            vW2 = mu * vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg * W2_tmp
            W2 -= lr * vW2
            vb2 = mu * vb2 + derivative_b2(Ybatch, pYbatch) + reg * b2_tmp
            b2 -= lr * vb2
            vW1 = mu * vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch,
                                           W2_tmp) + reg * W1_tmp
            W1 -= lr * vW1
            vb1 = mu * vb1 + derivative_b1(Z, Ybatch, pYbatch,
                                           W2_tmp) + reg * b1_tmp
            b1 -= lr * vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main():
    max_iter = 10
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1. Adam
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
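            # (m and v are initialized at 0, so early estimates are biased
            # toward 0; dividing by 1 - beta**t undoes that, and the correction
            # fades out as t grows)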
            correction1 = 1 - beta1 ** t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2 ** t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)


            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    loss_rms = []
    err_rms = []

    # comparable hyperparameters for fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0
    
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
Beispiel #18
0
def main():
    # 3 scenarios
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 15
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.0001
    reg = 0.001

    # Xtrain = X[:-1000, ]
    # Ytrain = Y[:-1000]
    # Xtest = X[-1000:, ]
    # Ytest = Y[-1000:, ]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = int(N / batch_sz)

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # Batch
    losses_batch = []
    error_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                error_batch.append(e)
                print("Ratio de error:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_rms = []
    errors_rms = []
    lr0 = 0.001
    cacheW2 = 0
    cacheb2 = 0
    cacheW1 = 0
    cacheb1 = 0
    decay_rate = 0.99
    eps = 0.000001

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # caches
            cacheW2 = decay_rate * cacheW2 + (1 - decay_rate) * gW2 * gW2
            cacheb2 = decay_rate * cacheb2 + (1 - decay_rate) * gb2 * gb2
            cacheW1 = decay_rate * cacheW1 + (1 - decay_rate) * gW1 * gW1
            cacheb1 = decay_rate * cacheb1 + (1 - decay_rate) * gb1 * gb1

            W2 -= lr0 * gW2 / (np.sqrt(cacheW2) + eps)
            b2 -= lr0 * gb2 / (np.sqrt(cacheb2) + eps)
            W1 -= lr0 * gW1 / (np.sqrt(cacheW1) + eps)
            b1 -= lr0 * gb1 / (np.sqrt(cacheb1) + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_rms, label='rmsprop')
    plt.legend()
    plt.show()
def main():
    max_iter = 20
    print_period = 20

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    Ytrain_ind = y2indicator(Ytrain)   # Target of train data 
    Ytest_ind = y2indicator(Ytest)      # Target of test data 

    lr = 0.00004
    reg = 0.01

    N, D = Xtrain.shape
    M = 300
    K = 10

    np.random.seed(123)
    W1 = np.random.randn(D, M) / np.sqrt(D)   
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)  
    b2 = np.zeros(K)

    batch_sz = 500
    n_batches = N // batch_sz    # 82

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. learning rate =  constant
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)    
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2.  RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_rms = []
    errors_rms = []

    '''
    In RMSprop you can use a bigger learning rate,
    but if you set it too high you'll get NaN!
    If you use the same learning rate for both RMSprop and the plain method,
    there is only a slight difference between them.
    '''
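    # RMSprop update used below, for each parameter W with gradient g:
    #   cache = decay_rate*cache + (1 - decay_rate)*g*g
    #   W -= lr0 * g / (np.sqrt(cache) + eps)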
    lr0 = 0.001   
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # # update
            # cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*np.square(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # W2 -= lr0 / (np.sqrt(cache_W2) + eps) *(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)

            # cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*np.square(derivative_b2(Ybatch, pYbatch) + reg*b2)
            # b2 -= lr0 / (np.sqrt(cache_b2) + eps) *(derivative_b2(Ybatch, pYbatch) + reg*b2)

            # cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*np.square(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2))
            # W1 -= lr0 / (np.sqrt(cache_W1) + eps) *(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)

            # cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*np.square(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            # b1 -= lr0 / (np.sqrt(cache_b1) + eps) *(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)



            # updates
            # A smarter way to write this: factor out the sub-expressions that
            # would otherwise be computed repeatedly, assign each to a variable
            # so it is computed only once; this speeds things up.
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)


            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='constant')
    plt.plot(losses_rms, label='RMSprop')
    plt.legend()
    plt.show()
Beispiel #20
0
    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                losses_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                errors_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
Beispiel #21
0
def main():
    max_iter = 20
    print_period = 10
    X_train, X_test, Y_train, Y_test = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Y_train_ind = y2indicator(Y_train)
    Y_test_ind = y2indicator(Y_test)

    N, D = X_train.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # copy weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. Constant Learning rate

    losses_batch = []
    errors_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_batch.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_batch.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)

    batch_error = error_rate(pY, Y_test)
    print(f"Final batch error rate: {batch_error}")

    # 2. RMSProp

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_RMSP = []
    errors_RMSP = []

    lr0 = 0.001
    cache_W1 = 1
    cache_b1 = 1
    cache_W2 = 1
    cache_b2 = 1
    decay = 0.999
    epsilon = 1e-10

    for i in range(max_iter):

        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_RMSP.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_RMSP.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final RMSProp error rate: {error_rate(pY, Y_test)}")
    print(f"Final batch error rate: {batch_error}")
    plt.plot(losses_batch, label='batch cost')
    plt.plot(losses_RMSP, label='RMSProp cost')
    plt.legend()
    plt.show()
Beispiel #22
0
def main():
    # compare 2 scenarios:
    # 1. batch GD with RMSProp and momentum
    # 2. Adam GD

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # 1. batch GD with RMSProp and momentum:
    print('\nperforming batch GD with RMSProp and momentum...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_rm = []
    CR_rm = []

    # hyperparams:
    lr0 = 0.001
    #lr0 = 0.0001
    mu = 0.9
    decay = 0.999
    eps = 10e-9

    # momentum (velocity terms):
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    # rms-prop cache (with no bias correction):
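    # (starting the caches at 1 instead of 0 keeps the first few updates from
    # dividing by a near-zero sqrt(cache); a simple stand-in for bias correction)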
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    t0 = datetime.now()
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            # (note: we utilize a bit different version of momentum)
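            # here the velocity is an exponentially weighted average of
            # RMSprop-scaled steps, dW = mu*dW + (1-mu)*lr0*g/sqrt(cache + eps),
            # subtracted directly, instead of the usual dW = mu*dW - lr*g; W += dW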
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps))
            W2 -= dW2
            #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps)
            #W2 += dW2

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps))
            b2 -= db2
            #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps)
            #b2 += db2

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps))
            W1 -= dW1
            #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps)
            #W1 += dW1

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps))
            b1 -= db1
            #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps)
            #b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_rm.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_rm.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for batch GD with RMSProp and momentum:', dt1)

    # plot the cost
    plt.plot(LL_rm)
    plt.title('Cost for batch GD with RMSProp and momentum')
    plt.show()

    # 2. Adam optimizer
    print('\nperforming Adam optimizer...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # hyperparams:
    lr = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 10e-9

    # 1st moment:
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment:
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    LL_adam = []
    CR_adam = []
    t0 = datetime.now()
    t = 1  # index; used instead of j, because j starts with 0
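    # (the bias corrections below divide by 1 - beta**t, which would be zero
    # at t = 0, so the counter must start at 1)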
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates:
            # gradients:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1

            # 1st moment:
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # 2nd moment:
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction:
            mW2_bc = mW2 / (1 - beta1**t)
            mb2_bc = mb2 / (1 - beta1**t)
            mW1_bc = mW1 / (1 - beta1**t)
            mb1_bc = mb1 / (1 - beta1**t)

            vW2_bc = vW2 / (1 - beta2**t)
            vb2_bc = vb2 / (1 - beta2**t)
            vW1_bc = vW1 / (1 - beta2**t)
            vb1_bc = vb1 / (1 - beta2**t)

            # weights and biases (parameters):
            W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps)
            b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps)
            W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps)
            b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps)

            t += 1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_adam.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_adam.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for Adam optimizer:', dt2)

    # plot the cost
    plt.plot(LL_adam)
    plt.title('Cost for Adam optimizer')
    plt.show()

    # plot costs from the two experiments together:
    plt.plot(LL_rm, label='RMSProp with momentum')
    plt.plot(LL_adam, label='Adam optimizer')
    plt.title('Cost')
    plt.legend()
    plt.show()
def main():

    # compare 3:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    # all with L2 regularization

    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004

    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # regular batch gradient descent

    epochs = 30

    tr_costs = []
    errors_batch = []
    losses_test = []
    batch_size = 500
    number_batches = int(N // batch_size)

    #max_iter = 30
    # 1.
    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            W2 -= lr * (derivative_w2(z_tr, ytr, ytr_pred) + reg * W2)
            b2 -= lr * (derivative_b2(ytr, ytr_pred) + reg * b2)
            W1 -= lr * (derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1)
            b1 -= lr * (derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1)

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs, label='tr_costs')
    plt.plot(losses_test, label='losses_test')
    #plt.plot(errors_batch, label='errors_batch')
#    plt.show()
#    print("tr_costs", tr_costs)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2.
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # batch gradient descent with momentum

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # momentum coefficient
    mu = 0.9

    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # update velocity
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # update
            W2 += dW2
            W1 += dW1
            b2 += db2
            b1 += db1
            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='losses_test momentum')
    #plt.plot(errors_batch, label='errors_batch')
    # plt.show()
#    print("tr_costs", errors_batch_momentum)
    print("Final error rate:", error_rate(pY, Ytest))


    # 3.

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # batch gradient descent with Nesterov momentum

    tr_costs_nesterov = []
    errors_batch_nesterov = []
    losses_test_nesterov = []

    # momentum coefficient
    mu = 0.9

    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # update velocity
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # update
            W2 += mu * vW2 - lr * gW2
            W1 += mu * vW1 - lr * gW1
            b2 += mu * vb2 - lr * gb2
            b1 += mu * vb1 - lr * gb1

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_nesterov.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_nesterov.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs_nesterov.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs_nesterov, label='tr_costs_nesterov')
    plt.plot(losses_test_nesterov, label='losses_test_nesterov')
    #plt.plot(errors_batch_nesterov, label='errors_batch')
    plt.legend()
    plt.show()
#    print("tr_costs_nesterov", errors_batch_momentum)
    print("Final error rate nesterov:", error_rate(pY, Ytest))
def main():
	'''
		RMSprop is a form of adaptive learning rate that decreases over time
	'''
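	# adaptive step used below: cache = decay_rate*cache + (1-decay_rate)*g*g,
	# then W -= lr0*g / (np.sqrt(cache) + eps), so dimensions with large recent
	# gradients get a smaller effective step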
	
	max_iter = 20  # for ReLU
	# max_iter = 30  # for sigmoid
	print_period = 10 	
	X, Y   = get_normalized_data()
	lr = 0.0004
	reg = 0.01 

	
	Xtrain = X[:-1000,]
	Ytrain = Y[:-1000]
	Xtest = X[-1000:,]
	Ytest = Y[-1000:]
	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)
	
	
	N, D = Xtrain.shape
	batch_sz = 500
	n_batches = N // batch_sz

	M = 300
	K = 10

	#1. batch SGD
	W1 = np.random.randn(D, M) / 28
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)



	LL_batch = [] 
	CR_batch = [] 
	
	
	for i in range(max_iter):
		for j in range(n_batches):
		
			Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :]
			Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) 
			 

			W2 -=  lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
			b2 -=  lr*(derivative_b2(Ybatch,pYbatch) + reg*b2)
			W1 -=  lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
			b1 -=  lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)			


			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_batch.append(ll)
				print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

				err = error_rate(pY, Ytest)
				CR_batch.append(err)
				print("Error rate:", err)

	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print("Final error rate:", error_rate(pY, Ytest))

	#2. RMSProp
	W1 = np.random.randn(D, M) / 28
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	LL_rms = [] 
	CR_rms = [] 
	lr0 = 0.001
	cache_W2 = 0 
	cache_b2 = 0 
	cache_W1 = 0 
	cache_b1 = 0 
	decay_rate = 1 - 1e-5
	eps = 1e-10
	
	
	for i in range(max_iter):
		for j in range(n_batches):
		
			Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :]
			Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) 
			 

			#updates
			gW2 =  derivative_w2(Z, Ybatch, pYbatch) + reg*W2
			cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*gW2*gW2 			
			W2 -=  lr0*gW2 /(np.sqrt(cache_W2) + eps)
			
			gb2 = derivative_b2(Ybatch,pYbatch) + reg*b2
			cache_b2  = decay_rate*cache_b2 + (1-decay_rate)*gb2*gb2 			
			b2 -=  lr0*gb2 /(np.sqrt(cache_b2) + eps)

			gW1 =  derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*gW1*gW1 			
			W1 -=  lr0*gW1 /(np.sqrt(cache_W1) + eps)
			
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			cache_b1  = decay_rate*cache_b1 + (1-decay_rate)*gb1*gb1			
			b1 -=  lr0*gb1 /(np.sqrt(cache_b1) + eps)


			if j % print_period ==0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_rms.append(ll)
				print "RMS Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

				err = error_rate(pY, Ytest)
				CR_rms.append(err)
				print "RMS Error rate:", err 

	pY, _ = forward(Xtest, W1, b1, W2, b2) 			
	print "RMS 	Final error rate:", error_rate(pY, Ytest)

	
	
	plt.plot(LL_batch, label='batch')	
	plt.plot(LL_rms, label='rms')	
	plt.legend()
	plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))



    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
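# A minimal standalone sketch (illustrative names, not part of the code above)
# of the Nesterov-style update used in part 3: refresh the velocity first,
# then step the parameter with an extra "lookahead" term that reuses the same
# gradient. Expanding the two lines gives a step of mu*mu*v_old - (1 + mu)*lr*g.
def nesterov_step(w, v, g, lr, mu):
    v_new = mu * v - lr * g           # velocity update
    w_new = w + mu * v_new - lr * g   # parameter update with lookahead term
    return w_new, v_new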
Beispiel #26
0
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    #SAVE INITIAL WEIGHT AND BIAS
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    #constant learning_rate
    lose_constant = []
    error_constant = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)

            W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg * W2)
            b2 -= learning_rate * (derivative_b2(y, pY) + reg * b2)
            W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg * W1)
            b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg * b1)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                lose_constant.append(l)
                error_constant.append(e)
                print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    #RMSprop
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    learning_rate_0 = 0.001
    lose_non_costant = []
    error_non_constant = []
    cache_W1 = 1
    cache_W2 = 1
    cache_b1 = 1
    cache_b2 = 1
    decay_rate = 0.999
    eps = 1e-10

    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= learning_rate_0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(y, pY) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= learning_rate_0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= learning_rate_0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= learning_rate_0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                lose_non_costant.append(l)
                error_non_constant.append(e)
                print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(lose_constant, label="batch")
    plt.plot(lose_non_costant, label="non_constant")
    plt.legend()
    plt.show()
Beispiel #27
0
def main():
    max_iter = 10
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1. Adam
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2**t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    loss_rms = []
    err_rms = []

    # comparable hyperparameters for fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
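# A compact, per-parameter version of the Adam block repeated above for
# W1, b1, W2 and b2 (helper name and signature are illustrative only; assumes
# numpy imported as np, as in the snippets). It keeps exponential averages of
# the gradient (m) and squared gradient (v), bias-corrects both, and scales
# the step by the corrected RMS, with the same defaults used above.
def adam_step(param, grad, m, v, t, lr0=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * grad           # update 1st moment
    v = beta2 * v + (1 - beta2) * grad * grad    # update 2nd moment
    m_hat = m / (1 - beta1 ** t)                 # bias correction
    v_hat = v / (1 - beta2 ** t)
    param = param - lr0 * m_hat / np.sqrt(v_hat + eps)
    return param, m, v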
Beispiel #28
0
def main():
    max_iter = 10
    print_period = 10
    X_train, X_test, Y_train, Y_test = get_normalized_data()
    reg = 0.01

    Y_train_ind = y2indicator(Y_train)
    Y_test_ind = y2indicator(Y_test)

    N, D = X_train.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # .1 Adam

    W1 = W1_0.copy()
    W2 = W2_0.copy()
    b1 = b1_0.copy()
    b2 = b2_0.copy()

    losses_adam = []
    errors_adam = []

    # 1st moment
    mW1 = 0
    mW2 = 0
    mb1 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    # Hyperparams
    eps = 1e-8
    lr = 0.001
    beta1 = 0.9
    beta2 = 0.999

    t = 1

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            # bias correction
            correction1 = 1 - beta1 ** t
            mW1_hat = mW1 / correction1
            mb1_hat = mb1 / correction1
            mW2_hat = mW2 / correction1
            mb2_hat = mb2 / correction1

            #
            correction2 = 1 - beta2 ** t
            vb2_hat = vb2 / correction2
            vb1_hat = vb1 / correction2
            vW2_hat = vW2 / correction2
            vW1_hat = vW1 / correction2

            t += 1
            # weights
            W1 = W1 - lr * mW1_hat / np.sqrt(vW1_hat + eps)
            b1 = b1 - lr * mb1_hat / np.sqrt(vb1_hat + eps)
            W2 = W2 - lr * mW2_hat / np.sqrt(vW2_hat + eps)
            b2 = b2 - lr * mb2_hat / np.sqrt(vb2_hat + eps)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_adam.append(l)
                print(f'Adam Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_adam.append(e)
                print("error_rate", e)

    pY, _ = forward(X_test, W1, b1, W2, b2)
    adam_error = error_rate(pY, Y_test)

    # 3. RMSProp with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_rms = []
    errors_rms = []

    # comparable hyperparameters for a fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1
            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_rms.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')
                err = error_rate(pY, Y_test)
                errors_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(X_test, W1, b1, W2, b2)

    rms_error = error_rate(pY, Y_test)

    print(f"Final RMSProp error rate: {rms_error}")
    print(f"Final Adam error rate: {adam_error}")
    plt.plot(losses_adam, label='Adam cost')
    plt.plot(losses_rms, label='RMSProp cost')
    plt.legend()
    plt.show()
Beispiel #29
0
def main():
    max_iter = 20 # make 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. constant learning rate
    LL_batch = []
    CR_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f", % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print()"Final error rate:", error_rate(pY, Ytest))

    # 2. RMS prop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    LL_rms = []
    CR_rms = []

    lr0 = 0.001 # if too high, will result with NaN
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 0.0000000001
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f", % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print()"Final error rate:", error_rate(pY, Ytest))

    # 3. batch GD w/ Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    LL_nest = []
    CR_nest = []

    mu = 0.9
    # dW2 = 0
    # db2 = 0
    # dW1 = 0
    # db1 = 0
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - mu*dW(t)
            W1_tmp = W1 - lr*mu*vW1
            b1_tmp = b1 - lr*mu*vb1
            W2_tmp = W2 - lr*mu*vW2
            b2_tmp = b2 - lr*mu*vb2

            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]

            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp
            W2 -= lr*vW2
            vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp
            b2 -= lr*vb2
            vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp
            W1 -= lr*vW1
            vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp
            b1 -= lr*vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f", % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print()"Final error rate:", error_rate(pY, Ytest))
Beispiel #30
0
def main():
    max_iter = 10
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    reg = 0.01
    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    #SAVE INITIAL WEIGHT AND BIAS
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    #1st moment
    mW1 = 0
    mW2 = 0
    mb1 = 0
    mb2 = 0

    #2nd moment
    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    #hyperparams
    learning_rate = 0.001
    beta1 = 0.99
    beta2 = 0.999
    eps = 1e-8

    #adam
    lose_adam = []
    error_adam = []
    t = 1
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)

            #update gradient
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            gb2 = derivative_b2(y, pY) + reg * b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1

            #update 1st moment
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            #update 2nd moment
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            #bias correction
            correction_1 = 1 - beta1**t
            correction_2 = 1 - beta2**t
            mW1_hat = mW1 / correction_1
            mW2_hat = mW2 / correction_1
            mb1_hat = mb1 / correction_1
            mb2_hat = mb2 / correction_1

            vW1_hat = vW1 / correction_2
            vW2_hat = vW2 / correction_2
            vb1_hat = vb1 / correction_2
            vb2_hat = vb2 / correction_2

            #update t
            t += 1

            #update weight
            W2 -= learning_rate * mW2_hat / np.sqrt(vW2_hat + eps)
            b2 -= learning_rate * mb2_hat / np.sqrt(vb2_hat + eps)
            b1 -= learning_rate * mb1_hat / np.sqrt(vb1_hat + eps)
            W1 -= learning_rate * mW1_hat / np.sqrt(vW1_hat + eps)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                lose_adam.append(l)
                error_adam.append(e)
                print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    #RMSprop with momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    #hyperparams
    learning_rate = 0.001
    decay_rate = 0.999
    mu = 0.9
    eps = 1e-8

    #rmsprop cache
    cache_W1 = 1
    cache_W2 = 1
    cache_b1 = 1
    cache_b2 = 1

    #momentum
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    lose_rmsprop_m = []
    error_rmsprop_m = []
    t = 1
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]

            pY, Z = forward(x, W1, W2, b1, b2)

            # update
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 - (
                1 - mu) * learning_rate * gW2 / np.sqrt(cache_W2 + eps)
            W2 += dW2

            gb2 = derivative_b2(y, pY) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 - (
                1 - mu) * learning_rate * gb2 / np.sqrt(cache_b2 + eps)
            b2 += db2

            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 - (
                1 - mu) * learning_rate * gW1 / np.sqrt(cache_W1 + eps)
            W1 += dW1

            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 - (
                1 - mu) * learning_rate * gb1 / np.sqrt(cache_b1 + eps)
            b1 += db1
            # #update cache
            # cache_W1 = decay_rate * cache_W1 + (1-decay_rate)*gW1*gW1
            # cache_W2 = decay_rate * cache_W2 + (1-decay_rate)*gW2*gW2
            # cache_b1 = decay_rate * cache_b1 + (1-decay_rate)*gb1*gb1
            # cache_b2 = decay_rate * cache_b2 + (1-decay_rate)*gb2*gb2

            # #update momentum
            # dW2 = mu*dW2 + (1-mu) * learning_rate * gW2 / (np.sqrt(cache_W2) + eps)
            # db2 = mu*db2 + (1-mu) * learning_rate * gb2 / (np.sqrt(cache_b2) + eps)
            # dW1 = mu*dW1 + (1-mu) * learning_rate * dW1 / (np.sqrt(cache_W1) + eps)
            # db1 = mu*db1 + (1-mu) * learning_rate * db1 / (np.sqrt(cache_b1) + eps)

            # #update weights
            # W2 -= dW2
            # b2 -= db2
            # W1 -= dW1
            # b1 -= db1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                lose_rmsprop_m.append(l)
                error_rmsprop_m.append(e)
                print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)
    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(lose_adam, label="adam")
    plt.plot(lose_rmsprop_m, label="rmsprop with momentum")
    plt.legend()
    plt.show()
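# Per-parameter form of the RMSprop-with-momentum block repeated above
# (illustrative helper; assumes numpy imported as np, as in the snippets).
# The cache tracks decayed squared gradients, the velocity mixes the previous
# step with the RMS-scaled gradient, and the parameter moves by the velocity.
def rmsprop_momentum_step(param, grad, cache, velocity,
                          lr=0.001, mu=0.9, decay_rate=0.999, eps=1e-8):
    cache = decay_rate * cache + (1 - decay_rate) * grad * grad
    velocity = mu * velocity - (1 - mu) * lr * grad / np.sqrt(cache + eps)
    param = param + velocity
    return param, cache, velocity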
def main():
    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N / batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. const
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 2. RMSprop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001 # if you set this too high you'll get NaN!
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 0.0000000001
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
# 1. Adam optimizer
loss_adam = []
err_adam = []
t = 1

for i in range(max_iter):
    for j in range(n_batch):
        X_batch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
        Y_batch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
        pY_batch, Z = forward(X_batch, W1, b1, W2, b2)

        # Compute the gradients
        gW2 = derivative_w2(Z, Y_batch, pY_batch) + reg * W2
        gb2 = derivative_b2(Y_batch, pY_batch) + reg * b2
        gW1 = derivative_w1(X_batch, Z, Y_batch, pY_batch, W2) + reg * W1
        gb1 = derivative_b1(Z, Y_batch, pY_batch, W2) + reg * b1

        # Update new Moments
        mW1 = beta1 * mW1 + (1 - beta1) * gW1
        mb1 = beta1 * mb1 + (1 - beta1) * gb1
        mW2 = beta1 * mW2 + (1 - beta1) * gW2
        mb2 = beta1 * mb2 + (1 - beta1) * gb2

        # Update the second moments (v)
        vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
        vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
        vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
        vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

        # Bias Correction Update
        correction1 = 1 - beta1**t
def main():
    # 1.batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N / batch_sz

    M = 300  # number of hidden neurons
    K = 10  # number of output classes

    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    #1. batch SGD
    LL_batch = []
    CR_batch = []

    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    #2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]

            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    #3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nesterov = []
    CR_nesterov = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * mu * dW2 - (1 + mu) * lr * (
                derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            W2 += dW2
            db2 = mu * mu * db2 - (1 + mu) * lr * (
                derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * mu * dW1 - (1 + mu) * lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * mu * db1 - (1 + mu) * lr * (
                derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nesterov.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nesterov.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label='momentum')
    plt.plot(LL_nesterov, label='nesterov')
    plt.legend()
    plt.show()
def main():

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    lr0 = 0.001

    costs_RMS = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            pY, Z = forward(x, W1, b1, W2, b2)

            gW2 = (derivative_W2(Z, pY, y) + reg * W2)
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = (derivative_b2(pY, y) + reg * b2)
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1)
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))

                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_RMS, label="rms")
    plt.legend()
    plt.show()