Code example #1
import numpy as np
import matplotlib.pyplot as plt

import theano
import theano.tensor as T

# helper functions assumed to come from LP_util, as in code example #2
from LP_util import get_normalized_data, error_rate, y2indicator


def main():
    Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
    Ttrain, Ttest = y2indicator(Ttrain), y2indicator(Ttest)

    lr = .00004
    reg = .01

    max_iter = 20
    print_period = 10

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # hidden units
    K = 10  # output classes

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    thX = T.matrix('X')  # placeholders
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')  # updateable
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    thZ = T.nnet.relu(thX.dot(W1) + b1)  # symbolic expressions for the forward pass
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    # cost function that will be differentiated
    cost = -(thT*T.log(thY)).sum() + reg*((W1**2).sum() + (b1**2).sum()
                                          + (W2**2).sum() + (b2**2).sum())
    prediction = T.argmax(thY, axis=1)

    update_W1 = W1 - lr*T.grad(cost, W1)  # derivative of cost wrt W1
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[[W1, update_W1], [b1, update_b1], [W2, update_W2],
                 [b2, update_b2]]
    )
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction]
    )

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz+batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz+batch_sz)]

            train(Xbatch, Tbatch)
            if j % print_period == 0:
                cost_val, Ptest = get_prediction(Xtest, Ttest)
                err = error_rate(Ptest, np.argmax(Ttest, axis=1))
                print("cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                      % (i, j, cost_val, err))
                LL.append(cost_val)

    cost_val, Ytest = get_prediction(Xtest, Ttest)
    print("final error rate:", error_rate(Ytest, np.argmax(Ttest, axis=1)))
    plt.plot(LL)
    plt.show()
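The helper functions pulled in from LP_util are not shown in this listing. As a rough sketch of what y2indicator and cost are assumed to do here (one-hot encoding of integer labels and total cross-entropy, matching how they are used in these examples; the actual LP_util implementations may differ in detail):

import numpy as np

def y2indicator(y):
    # turn integer class labels of shape (N,) into a one-hot matrix of shape (N, K)
    N = len(y)
    K = int(y.max()) + 1
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind

def cost(Y, T):
    # total cross-entropy between softmax outputs Y and one-hot targets T
    return -(T * np.log(Y)).sum()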
Code example #2
import numpy as np
import matplotlib.pyplot as plt

from LP_mlp import forward, derivative_w1, derivative_w2, derivative_b1, derivative_b2
from LP_util import get_normalized_data, error_rate, cost, y2indicator
from sklearn.utils import shuffle

max_iter = 20
print_period = 10

Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
lr = .00004
reg = .01

Ttrain = y2indicator(Ttrain)
Ttest = y2indicator(Ttest)

N, D = Xtrain.shape
batch_sz = 500
n_batches = N // batch_sz

M = 300  # hidden units, around the number of components according to PCA
K = Ttrain.shape[1]
W1_0 = np.random.randn(D, M) / np.sqrt(D)
b1_0 = np.zeros(M)
W2_0 = np.random.randn(M, K) / np.sqrt(M)
b2_0 = np.zeros(K)

# batch gradient descent (no momentum)
W1 = W1_0.copy()
b1 = b1_0.copy()
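Code example #2 stops mid-setup. A minimal sketch of how the batch gradient descent section presumably continues, modeled on the constant-learning-rate loop in code example #5 (the remaining parameter copies plus a plain forward/backward update per mini-batch; the list name losses is hypothetical):

W2 = W2_0.copy()
b2 = b2_0.copy()

losses = []
for i in range(max_iter):
    for j in range(n_batches):
        Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
        Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]
        Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

        # plain gradient descent with L2 regularization, no momentum
        W2 -= lr * (derivative_w2(Z, Tbatch, Ybatch) + reg*W2)
        b2 -= lr * (derivative_b2(Tbatch, Ybatch) + reg*b2)
        W1 -= lr * (derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1)
        b1 -= lr * (derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1)

        if j % print_period == 0:
            Y, _ = forward(Xtest, W1, b1, W2, b2)
            losses.append(cost(Y, Ttest))
            print('cost at iteration i=%d, j=%d %.6f' % (i, j, losses[-1]))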
Code example #3
File: tensorflow2.py  Project: geoffder/learning
import matplotlib.pyplot as plt
import tensorflow as tf

# helper functions assumed to come from LP_util, as in code example #2
from LP_util import get_normalized_data, y2indicator


def main():
    Xtrain, Xtest, Ttrain_label, Ttest_label = get_normalized_data()
    Ttrain, Ttest = y2indicator(Ttrain_label), y2indicator(Ttest_label)

    lr = .0004
    # reg = .01

    max_iter = 20
    print_period = 10

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 300  # hidden units
    M2 = 100
    K = 10  # output classes

    def init_weights(shape):
        return tf.Variable(tf.random_normal(shape, stddev=0.1))

    def forward(X, W1, b1, W2, b2, W3, b3):
        Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
        Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
        return tf.matmul(Z2, W3) + b3  # return activation, not softmax

    def error_rate(P, T):
        return (P != T).mean()

    tfX = tf.placeholder(tf.float32, [None, D])
    tfT = tf.placeholder(tf.float32, [None, K])

    W1 = init_weights([D, M1])
    b1 = init_weights([M1])
    W2 = init_weights([M1, M2])
    b2 = init_weights([M2])
    W3 = init_weights([M2, K])
    b3 = init_weights([K])

    tfY = forward(tfX, W1, b1, W2, b2, W3, b3)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=tfT, logits=tfY))
    predict_op = tf.argmax(tfY, axis=1)
    # train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    train_op = tf.train.RMSPropOptimizer(lr, decay=.99,
                                         momentum=.9).minimize(cost)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Tbatch = Ttrain[j * batch_sz:(j * batch_sz + batch_sz)]

            sess.run(train_op, feed_dict={tfX: Xbatch, tfT: Tbatch})
            if j % print_period == 0:
                Ptest = sess.run(predict_op, feed_dict={tfX: Xtest})
                err = error_rate(Ptest, Ttest_label)
                c = sess.run(cost, feed_dict={tfX: Xtest, tfT: Ttest})
                print("cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, c, err))
                LL.append(c)

    Ptest = sess.run(predict_op, feed_dict={tfX: Xtest})
    print("final error rate:", error_rate(Ptest, Ttest_label))
    plt.plot(LL)
    plt.show()
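Despite the tensorflow2.py file name, this listing is written against the TensorFlow 1.x graph API (tf.placeholder, tf.Session, tf.train.RMSPropOptimizer). One way to run it unchanged on a TensorFlow 2.x install is to go through the compatibility shim, roughly:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restores graph mode, placeholders, and sessions

# with the shim in place, the rest of main() above resolves as written
# (tf.random_normal, tf.global_variables_initializer, etc. live in compat.v1)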
Code example #4
File: adam.py  Project: geoffder/learning
import numpy as np
import matplotlib.pyplot as plt

# helpers assumed to come from LP_mlp / LP_util, as in code example #2
from LP_mlp import forward, derivative_w1, derivative_w2, derivative_b1, derivative_b2
from LP_util import get_normalized_data, error_rate, cost, y2indicator


def main():
    max_iter = 20
    print_period = 10

    Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
    reg = .01

    Ttrain, Ttest = y2indicator(Ttrain), y2indicator(Ttest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # hidden units
    K = 10  # output classes

    # use the same initial weights for adam and RMSprop+momentum
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # initial learning rate and decay
    lr0 = .001  # don't go too high, or NaNs will result
    beta1 = .9  # decay constants for first and second moment
    beta2 = .999
    eps = 1e-8

    # 1. Adam
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    loss_adam = []
    err_adam = []
    t = 1  # by convention. Otherwise, first correction is 0 (div by 0)
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz+batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz+batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Tbatch, Ybatch) + reg*W2
            gb2 = derivative_b2(Tbatch, Ybatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1
            # new first moment (m)
            # exponentially smoothed avg of the gradients
            mW1 = beta1*mW1 + (1 - beta1)*gW1  # add fraction of gradient
            mb1 = beta1*mb1 + (1 - beta1)*gb1
            mW2 = beta1*mW2 + (1 - beta1)*gW2
            mb2 = beta1*mb2 + (1 - beta1)*gb2
            # new second moment (v)
            # exponentially smoothed avg of the squared gradients
            vW1 = beta2*vW1 + (1 - beta2)*gW1**2  # add fraction of gradient^2
            vb1 = beta2*vb1 + (1 - beta2)*gb1**2
            vW2 = beta2*vW2 + (1 - beta2)*gW2**2
            vb2 = beta2*vb2 + (1 - beta2)*gb2**2
            # bias correction
            correction1 = 1 - beta1**t
            hat_mW1 = mW1/correction1  # boost first updates (against 0 bias)
            hat_mb1 = mb1/correction1
            hat_mW2 = mW2/correction1
            hat_mb2 = mb2/correction1
            correction2 = 1 - beta2**t
            hat_vW1 = vW1/correction2  # boost first updates (against 0 bias)
            hat_vb1 = vb1/correction2
            hat_vW2 = vW2/correction2
            hat_vb2 = vb2/correction2
            t += 1  # update t (each learning step)
            # update weights
            W1 -= lr0 * hat_mW1 / (np.sqrt(hat_vW1) + eps)
            b1 -= lr0 * hat_mb1 / (np.sqrt(hat_vb1) + eps)
            W2 -= lr0 * hat_mW2 / (np.sqrt(hat_vW2) + eps)
            b2 -= lr0 * hat_mb2 / (np.sqrt(hat_vb2) + eps)

            if j % print_period == 0:
                # calculate Log-Likelihood
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                loss_adam.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                err_adam.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (adam):', error_rate(Y, np.argmax(Ttest, axis=1)))

    # 2. RMSprop + momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    cache_W1 = 1  # start caches at 1 so early updates are comparable with Adam
    cache_b1 = 1
    cache_W2 = 1
    cache_b2 = 1

    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    loss_rmsmom = []
    err_rmsmom = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz+batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz+batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Tbatch, Ybatch) + reg*W2
            gb2 = derivative_b2(Tbatch, Ybatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1
            # RMSprop cache (step size shrinks as gradient history accumulates)
            cache_W1 = beta2*cache_W1 + (1 - beta2)*gW1*gW1
            cache_b1 = beta2*cache_b1 + (1 - beta2)*gb1*gb1
            cache_W2 = beta2*cache_W2 + (1 - beta2)*gW2*gW2
            cache_b2 = beta2*cache_b2 + (1 - beta2)*gb2*gb2
            # velocity term (momentum)
            # dW1 = dW1*beta1 + lr0*gW1 # this is plain velocity..
            # db1 = db1*beta1 + lr0*gb1
            # dW2 = dW2*beta1 + lr0*gW2
            # db2 = db2*beta1 + lr0*gb2
            # LP adds a (1 - beta1) factor here (discussed in the Adam lecture)
            # to make the comparison with Adam more fair (velocity is only
            # topped up rather than fully accumulated)
            # add some gradient divided by the learning-history cache
            dW1 = dW1*beta1 + (1-beta1)*lr0*gW1/(np.sqrt(cache_W1)+eps)
            db1 = db1*beta1 + (1-beta1)*lr0*gb1/(np.sqrt(cache_b1)+eps)
            dW2 = dW2*beta1 + (1-beta1)*lr0*gW2/(np.sqrt(cache_W2)+eps)
            db2 = db2*beta1 + (1-beta1)*lr0*gb2/(np.sqrt(cache_b2)+eps)
            # subtract accumulated velocity
            W2 -= dW2
            b2 -= db2
            W1 -= dW1
            b1 -= db1

            if j % print_period == 0:
                # calculate LL
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                loss_rmsmom.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                err_rmsmom.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (RMSprop+mom):',
          error_rate(Y, np.argmax(Ttest, axis=1)))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rmsmom, label='RMSprop+momentum')
    plt.legend()
    plt.show()
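The Adam bookkeeping above repeats the same three steps (moment update, bias correction, scaled parameter step) for each of W1, b1, W2, b2. A minimal sketch of a helper that bundles them (the name adam_step is hypothetical, not part of the original file):

def adam_step(param, grad, m, v, t, lr=.001, beta1=.9, beta2=.999, eps=1e-8):
    # exponentially smoothed first and second moments of the gradient
    m = beta1*m + (1 - beta1)*grad
    v = beta2*v + (1 - beta2)*grad**2
    # bias correction boosts the early updates (moments start at 0)
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    # step scaled by the root of the second moment
    param -= lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

Each (param, m, v) triple would then be threaded through the mini-batch loop, with t incremented once per step as in the code above.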
Code example #5
import numpy as np
import matplotlib.pyplot as plt

# helpers assumed to come from LP_mlp / LP_util, as in code example #2
from LP_mlp import forward, derivative_w1, derivative_w2, derivative_b1, derivative_b2
from LP_util import get_normalized_data, error_rate, cost, y2indicator


def main():
    max_iter = 20
    print_period = 10

    Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
    reg = .01
    lr = .0001

    Ttrain, Ttest = y2indicator(Ttrain), y2indicator(Ttest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # hidden units
    K = 10  # output classes

    # 1. constant learning rate
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Tbatch = Ttrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr * (derivative_w2(Z, Tbatch, Ybatch) + reg * W2)
            b2 -= lr * (derivative_b2(Tbatch, Ybatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Tbatch, Ybatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate LL
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                LL_batch.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                CR_batch.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (batch):', error_rate(Y, np.argmax(Ttest, axis=1)))

    # 2. RMSprop
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # initial learning rate and decay
    lr0 = .0005  # don't go too high, or NaNs will result
    decay_rate = .999
    eps = .000001  # epsilon (constant, see notes)
    cache_W1 = 0
    cache_b1 = 0
    cache_W2 = 0
    cache_b2 = 0

    LL_rms = []
    CR_rms = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Tbatch = Ttrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Tbatch, Ybatch) + reg * W2  # gradient
            # replace fraction of cache with new gradient
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            # divide gradient by sqrt of cache
            # (greater learning history -> less new learning)
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Tbatch, Ybatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Tbatch, Ybatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate LL
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                LL_rms.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                CR_rms.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (RMSprop):', error_rate(Y, np.argmax(Ttest,
                                                                 axis=1)))

    plt.plot(LL_batch, label='batch')
    plt.plot(LL_rms, label='RMSprop')
    plt.legend()
    plt.show()
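As with Adam, the RMSprop update is the same few lines per parameter. A compact helper capturing the cache decay and the scaled step (the name rmsprop_step is hypothetical):

def rmsprop_step(param, grad, cache, lr=.0005, decay=.999, eps=1e-6):
    # exponentially decayed cache of squared gradients
    cache = decay*cache + (1 - decay)*grad*grad
    # more accumulated history -> smaller effective step
    param -= lr * grad / (np.sqrt(cache) + eps)
    return param, cache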