def fit(self, X, Y, learning_rate=1e-7, reg=0., epochs=10000, show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Tvalid = y2indicator(Yvalid)
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)
        self.W = np.random.randn(D, K) / np.sqrt(D)
        self.b = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY = self.forward(X)

            # gradient descent step
            self.W -= learning_rate*(X.T.dot(pY - T) + reg*self.W)
            self.b -= learning_rate*((pY - T).sum(axis=0) + reg*self.b)

            if i % 10 == 0:
                pYvalid = self.forward(Xvalid)
                c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
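The fit method above leans on a few helpers (y2indicator, cost, error_rate, plus the model's own forward) that are imported from elsewhere. A minimal sketch of what they might look like for a softmax classifier follows; the exact signatures are assumptions, and note that some later examples call cost with the arguments in the opposite order (predictions first, targets second).

import numpy as np

def y2indicator(y):
    # one-hot encode integer class labels
    y = y.astype(np.int32)
    ind = np.zeros((len(y), y.max() + 1))
    ind[np.arange(len(y)), y] = 1
    return ind

def cost(T, pY):
    # cross-entropy between one-hot targets T and predicted probabilities pY
    return -(T * np.log(pY)).sum()

def error_rate(targets, predictions):
    # fraction of misclassified samples (both arguments are integer class labels)
    return np.mean(targets != predictions)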
Example #2
forward_set = np.empty((18, 1))
data_sets = []
count = 0  # number of models trained so far

for k in range(0, X.shape[1]):
    best_mse = -1  # Starting value for best mean square error
    for p in range(k, X.shape[1]):  # starting from k is the same as p-k

        count += 1  # Keep track of models trained
        temp = forward_set.copy()  # Deep copy of the forward model so the optimized model isn't changed
        X_train = np.c_[temp, X[:, p]]  # Concatenate previous forward model with the new candidate feature
        beta = util.calcBeta(X_train, y)  # Calculate beta
        mse = util.cost(X_train, y, beta)  # Calculate cost
        print(f"Training MSE: {mse}, Column: {p}")

        # Save best feature if found
        if mse < best_mse or best_mse < 0:
            best_mse = mse  # assign new best mse
            best_index = p  # index of best column
            keep_feature = X[:, p]  # actual feature

    # Add the best column, growing the forward set from M to M+1 features
    forward_set = np.c_[forward_set, keep_feature]
    # Move chosen feature to index k to avoid singular matrix when dotting and inversing
    X[:, [k, best_index]] = X[:, [best_index, k]]
    print(f"Lowest MSE for {k + 1} features: {best_mse}")
    data_sets.append(forward_set)  # Save the best 18 x n sets to a list
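Example #2 (forward feature selection for linear regression) relies on a util module that is not shown; presumably util.calcBeta solves the least-squares normal equations and util.cost returns the mean squared error. A hedged sketch under those assumptions:

import numpy as np

def calcBeta(X, y):
    # least-squares coefficients via the normal equations: (X^T X) beta = X^T y
    return np.linalg.solve(X.T.dot(X), X.T.dot(y))

def cost(X, y, beta):
    # mean squared error of the linear model X.dot(beta)
    residuals = y - X.dot(beta)
    return residuals.dot(residuals) / len(y)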
Example #3
def main():

    # train a single hidden-layer network with the Adam optimizer
    # (bias-corrected first and second moments), with L2 regularization

    X, Y = get_normalized_data()

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10
    # t is the Adam time step; it must start at 1 so the bias corrections are defined
    t = 1
    epochs = 20
    print_period = 10
    lr0 = 0.001
    reg = 0.01
    epsilon = 1e-8  # note: 1e-8, not 10e-8 (10e-8 would be 1e-7)

    beta1 = 0.9  # mu = 0.9
    beta2 = 0.999  # decay = 0.999
    batch_size = 500
    number_batches = int(N // batch_size)

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # first (m) and second (v) moment accumulators for Adam

    mW2 = 0
    mW1 = 0
    mb2 = 0
    mb1 = 0

    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    mW2_hat = 0
    mW1_hat = 0
    mb2_hat = 0
    mb1_hat = 0

    vW1_hat = 0
    vW2_hat = 0
    vb1_hat = 0
    vb2_hat = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # update momentum
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # update velocity
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction
            correction1 = (1 - beta1**t)
            mW2_hat = mW2 / correction1
            mW1_hat = mW1 / correction1
            mb2_hat = mb2 / correction1
            mb1_hat = mb1 / correction1

            correction2 = (1 - beta2**t)
            vW2_hat = vW2 / correction2
            vW1_hat = vW1 / correction2
            vb2_hat = vb2 / correction2
            vb1_hat = vb1 / correction2

            # increment the Adam time step
            t += 1

            # update
            W2 -= lr0 * (mW2_hat / np.sqrt(vW2_hat + epsilon))
            W1 -= lr0 * (mW1_hat / np.sqrt(vW1_hat + epsilon))
            b2 -= lr0 * (mb2_hat / np.sqrt(vb2_hat + epsilon))
            b1 -= lr0 * (mb1_hat / np.sqrt(vb1_hat + epsilon))

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" %
                      (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='test cost (Adam)')
    #plt.plot(errors_batch, label='errors_batch')
    # plt.show()
    #    print("tr_costs", errors_batch_momentum)
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
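The Adam loop above repeats the same moment, bias-correction, and update arithmetic for each of the four parameters. As a refactoring sketch (not part of the original example), the per-parameter update can be pulled into one helper; note the original code places epsilon inside the square root, while the standard Adam formulation, used here, adds it outside.

import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # one Adam update for a single parameter array; returns the updated (param, m, v)
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)  # bias-corrected first moment
    v_hat = v / (1 - beta2 ** t)  # bias-corrected second moment
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

# usage sketch inside the batch loop:
# W2, mW2, vW2 = adam_step(W2, gW2, mW2, vW2, t)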
def main():
    max_iter = 20
    print_period = 20

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    Ytrain_ind = y2indicator(Ytrain)   # Target of train data 
    Ytest_ind = y2indicator(Ytest)      # Target of test data 

    lr = 0.00004
    reg = 0.01

    N, D = Xtrain.shape
    M = 300
    K = 10

    np.random.seed(123)
    W1 = np.random.randn(D, M) / np.sqrt(D)   
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)  
    b2 = np.zeros(K)

    batch_sz = 500
    n_batches = N // batch_sz    # 82

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. learning rate =  constant
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)    
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2.  RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_rms = []
    errors_rms = []

    '''
    RMSprop can tolerate a larger learning rate,
    but if you set it too high you'll get NaN!
    With the same learning rate as the plain constant-learning-rate method, the difference between them is only slight.
    '''
    lr0 = 0.001   
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # # update
            # cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*np.square(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # W2 -= lr0 / (np.sqrt(cache_W2) + eps) *(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)

            # cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*np.square(derivative_b2(Ybatch, pYbatch) + reg*b2)
            # b2 -= lr0 / (np.sqrt(cache_b2) + eps) *(derivative_b2(Ybatch, pYbatch) + reg*b2)

            # cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*np.square(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2))
            # W1 -= lr0 / (np.sqrt(cache_W1) + eps) *(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)

            # cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*np.square(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            # b1 -= lr0 / (np.sqrt(cache_b1) + eps) *(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)



            # updates
            # A smarter approach: factor the sub-expressions that are computed repeatedly above into variables so each is computed only once, which speeds things up
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)


            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='constant')
    plt.plot(losses_rms, label='RMSprop')
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print "Performing logistic regression..."
    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W)
        b += lr*(gradb(Ytrain_ind, p_y) - reg*b)
        

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print "Cost at iteration %d: %.6f" % (i, ll)
            print "Error rate:", err
    p_y = forward(Xtest, W, b)
    print "Final error rate:", error_rate(p_y, Ytest)
    print "Elapsted time for full GD:", datetime.now() - t0


    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(1): # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)): # shortcut so it won't take so long...
            x = tmpX[n,:].reshape(1,D)
            y = tmpY[n,:].reshape(1,10)
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)


    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
            y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)



    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
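This example's forward, gradW, and gradb come from a shared utility module that is not shown. Since the loop does W += lr*(gradW(...) - reg*W), gradW is evidently the gradient of the log-likelihood (an ascent direction). A minimal sketch of those softmax-regression helpers, under that assumption:

import numpy as np

def forward(X, W, b):
    # softmax over the linear scores
    A = X.dot(W) + b
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True)

def gradW(T, pY, X):
    # gradient of the log-likelihood w.r.t. W for one-hot targets T
    return X.T.dot(T - pY)

def gradb(T, pY):
    # gradient of the log-likelihood w.r.t. the bias
    return (T - pY).sum(axis=0)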
Example #6
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # updates
            W2 -= lr*gW2
            b2 -= lr*gb2
            W1 -= lr*gW1
            b1 -= lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))



    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
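The Nesterov branch above uses the common approximation of updating the velocity and then stepping by mu*v - lr*g at the current point. The textbook lookahead form evaluates the gradient at param + mu*v instead; a sketch of that variant for a single parameter (grad_fn is a stand-in for whichever derivative_* call applies):

def nesterov_step(param, v, grad_fn, mu=0.9, lr=0.00004):
    # evaluate the gradient at the lookahead point, then take the momentum step
    g = grad_fn(param + mu * v)
    v = mu * v - lr * g
    return param + v, v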
Example #7
def main():
    max_iter = 10
    print_period = 10
    X_train, X_test, Y_train, Y_test = get_normalized_data()
    reg = 0.01

    Y_train_ind = y2indicator(Y_train)
    Y_test_ind = y2indicator(Y_test)

    N, D = X_train.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)  # scale by fan-in (M), matching the other examples
    b2_0 = np.zeros(K)

    # 1. Adam

    W1 = W1_0.copy()
    W2 = W2_0.copy()
    b1 = b1_0.copy()
    b2 = b2_0.copy()

    losses_adam = []
    errors_adam = []

    # 1st moment
    mW1 = 0
    mW2 = 0
    mb1 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    # Hyperparams
    eps = 1e-8
    lr = 0.001
    beta1 = 0.9
    beta2 = 0.999

    t = 1

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            # bias correction
            correction1 = 1 - beta1 ** t
            mW1_hat = mW1 / correction1
            mb1_hat = mb1 / correction1
            mW2_hat = mW2 / correction1
            mb2_hat = mb2 / correction1

            # second-moment bias correction
            correction2 = 1 - beta2 ** t
            vb2_hat = vb2 / correction2
            vb1_hat = vb1 / correction2
            vW2_hat = vW2 / correction2
            vW1_hat = vW1 / correction2

            t += 1
            # weights
            W1 = W1 - lr * mW1_hat / np.sqrt(vW1_hat + eps)
            b1 = b1 - lr * mb1_hat / np.sqrt(vb1_hat + eps)
            W2 = W2 - lr * mW2_hat / np.sqrt(vW2_hat + eps)
            b2 = b2 - lr * mb2_hat / np.sqrt(vb2_hat + eps)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_adam.append(l)
                print(f'Adam Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_adam.append(e)
                print("error_rate", e)

    pY, _ = forward(X_test, W1, b1, W2, b2)
    adam_error = error_rate(pY, Y_test)

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_rms = []
    errors_rms = []

    # comparable hyperparameters for a fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1
            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_rms.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')
                err = error_rate(pY, Y_test)
                errors_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(X_test, W1, b1, W2, b2)

    rms_error = error_rate(pY, Y_test)

    print(f"Final RMSProp error rate: {rms_error}")
    print(f"Final Adam error rate: {adam_error}")
    plt.plot(losses_adam, label='Adam cost')
    plt.plot(losses_rms, label='RMSProp cost')
    plt.legend()
    plt.show()
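Apart from the bias correction, the Adam branch and the RMSprop-with-momentum branch above keep very similar state: a decayed squared-gradient cache plus a smoothed step. The correction mainly matters during the first steps, while m and v are still close to their zero initialization, as this small check illustrates:

beta1, beta2 = 0.9, 0.999
for t in (1, 10, 100, 1000):
    print(t, 1 - beta1 ** t, 1 - beta2 ** t)
# at t=1 the corrections rescale m and v by 10x and 1000x; the effect fades as t grows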
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)



    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
Example #9
def main():
    # Compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 30
    print_period = 10

    X_train, X_test, Y_train, Y_test = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Y_train_ind = y2indicator(Y_train)
    Y_test_ind = y2indicator(Y_test)

    N, D = X_train.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    # cost = -16

    losses_batch = []
    errors_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z,
                                      Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_batch.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_batch.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final error rate: {error_rate(pY, Y_test)}")

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # Update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1
            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_momentum.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_momentum.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final error rate: {error_rate(pY, Y_test)}")

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_nesterov = []
    errors_nesterov = []
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1
            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_nesterov.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_nesterov.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final error rate: {error_rate(pY, Y_test)}")

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label='momentum')
    plt.plot(losses_nesterov, label='nesterov')
    plt.legend()
    plt.show()
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize the data:
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print('Performing logistic regression...')
    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    K = len(set(Y))

    np.random.seed()

    # 1. Full Gradient Descent:
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    LL = []  # a storage for costs
    lr = 0.0001  # learning rate
    reg = 0.01  # L2-regularization term
    t0 = datetime.now()
    print('utilizing full GD...')
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (grad_W(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (grad_b(Ytrain_ind, p_y).sum(axis=0) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)

        if i % 10 == 0:
            error = error_rate(p_y_test, Ytest)
            print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error))
    dt1 = datetime.now() - t0
    p_y_test = forward(Xtest, W, b)
    plt.plot(LL)
    plt.title('Cost for full GD')
    plt.savefig('Cost_full_GD.png')  # save before show(), which clears the figure
    plt.show()
    print('Final error rate:', error_rate(p_y_test, Ytest))
    print('Elapsed time for full GD:', dt1)

    # 2. Stochastic Gradient Descent
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    LLstochastic = []  # a storage for costs
    lr = 0.0001  # learning rate
    reg = 0.01  # L2-regularization term
    t0 = datetime.now()
    print('utilizing stochastic GD...')
    for i in range(25):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        # iterate over every training sample (this makes SGD very slow)
        for n in range(N):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, K)
            p_y = forward(x, W, b)

            W += lr * (grad_W(y, p_y, x) - reg * W)
            b += lr * (grad_b(y, p_y).sum(axis=0) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LLstochastic.append(ll)

            if n % (N // 2) == 0:
                error = error_rate(p_y_test, Ytest)
                print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error))

    dt2 = datetime.now() - t0
    p_y_test = forward(Xtest, W, b)
    plt.plot(LLstochastic)
    plt.title('Cost for stochastic GD')
    plt.savefig('Cost_stochastic_GD.png')  # save before show(), which clears the figure
    plt.show()
    print('Final error rate:', error_rate(p_y_test, Ytest))
    print('Elapsed time for stochastic GD:', dt2)

    # 3. Batch Gradient Descent:
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    LLbatch = []
    lr = 0.0001  # learning rate
    reg = 0.01  # L2-regularization term
    batch_size = 500
    n_batches = N // batch_size
    t0 = datetime.now()
    print('utilizing batch GD...')
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_size:batch_size * (j + 1), :]
            y = tmpY[j * batch_size:batch_size * (j + 1), :]
            p_y = forward(x, W, b)

            W += lr * (grad_W(y, p_y, x) - reg * W)
            b += lr * (grad_b(y, p_y).sum(axis=0) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LLbatch.append(ll)

            if j % (n_batches // 2) == 0:
                error = error_rate(p_y_test, Ytest)
                print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error))
    dt3 = datetime.now() - t0
    p_y_test = forward(Xtest, W, b)
    plt.plot(LLbatch)
    plt.title('Cost for batch GD')
    plt.savefig('Cost_batch_GD.png')  # save before show(), which clears the figure
    plt.show()
    print('Final error rate:', error_rate(p_y_test, Ytest))
    print('Elapsed time for batch GD', dt3)

    # plot all costs together:
    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label='full')

    x2 = np.linspace(0, 1, len(LLstochastic))
    plt.plot(x2, LLstochastic, label='stochastic')

    x3 = np.linspace(0, 1, len(LLbatch))
    plt.plot(x3, LLbatch, label='batch')

    plt.legend()
    plt.savefig('Costs_together.png')  # save before show(), which clears the figure
    plt.show()
Example #11
def main():
	max_iter = 20
	print_period = 10

	Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
	lr = 0.00004
	reg = 0.01

	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)

	N, D = Xtrain.shape
	batch_sz = 500
	n_batches = N // batch_sz

	M = 300
	K = 10
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	# 1. constant learning rate
	LL_batch = []
	CR_batch = []
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),]
			Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
			#gradient
			gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			#updates
			W2 -= lr*gW2
			b2 -= lr*gb2
			W1 -= lr*gW1
			b1 -= lr*gb1
			if j%print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_batch.append(ll)
				print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
				err = error_rate(pY, Ytest)
				CR_batch.append(err)
				print("Error rate:", err)
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print("Final error rate:", error_rate(pY, Ytest))

	#RMSprop
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	# RMSprop setup
	LL_rms = []
	CR_rms = []
	lr0 = 0.001
	cache_W2 = 1
	cache_b2 = 1
	cache_W1 = 1
	cache_b1 = 1
	decay_rate = 0.999
	eps = 1e-10
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),]
			Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			#updates
			gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
			cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
			W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps)

			gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
			cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
			b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

			gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
			W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
			b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)
			if j%print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_rms.append(ll)
				print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
				err = error_rate(pY, Ytest)
				CR_rms.append(err)
				print("Error rate:", err)
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print("Final error rate:", error_rate(pY, Ytest))

	plt.plot(LL_batch, label = 'const')
	plt.plot(LL_rms, label = 'rms')
	plt.legend()
	plt.show()
Example #12
def main():
    # 3 scenarios
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 15
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.0001
    reg = 0.001

    # Xtrain = X[:-1000, ]
    # Ytrain = Y[:-1000]
    # Xtest = X[-1000:, ]
    # Ytest = Y[-1000:, ]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = int(N / batch_sz)

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # Batch
    losses_batch = []
    error_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            # A = ' '
            # A = u"\n|                      |\n|----------------------|   \n(\\__/)   || \n(• v •)  || \n /   D"
            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                # print(
                # u"|----------------------|\n|                      | \n Costo
                # en i=%d, j=%d: \n      %.6f" % (i, j, l) + A)

                e = error_rate(pY, Ytest)
                error_batch.append(e)
                print("Ratio de error:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # Momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label='momentum')
    plt.plot(losses_nesterov, label='Nesterov')
    plt.legend()
    plt.show()
def main():
	max_iter = 20
	print_period = 10

	X, Y = get_normalized_data()
	lr = 0.00004
	reg = 0.01

	Xtrain, Ytrain = X[:-1000,], Y[:-1000]
	Xtest, Ytest = X[-1000:,], Y[-1000:]
	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)

	N, D = Xtrain.shape
	batch_sz = 500
	n_batches = N // batch_sz

	M = 300
	K = 10
	W1 = np.random.randn(D, M) / np.sqrt(D+M)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	#1. batch SGD
	LL_batch = []
	CR_batch = []

	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz,]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
			b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
			W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
			b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_batch.append(ll)
				print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

				err = error_rate(pY, Ytest)
				CR_batch.append(err)
				print "Error rate:", err

	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print "Final error rate:", error_rate(pY, Ytest)

	#RMSProp
	W1 = np.random.randn(D, M) / np.sqrt(D+M)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)
	LL_rms = []
	CR_rms = []
	lr_rms = 0.001
	cache_W2, cache_b2, cache_W1, cache_b1 = 0, 0, 0, 0
	decay_rate = 0.999
	eps = 1e-6
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,]
			Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz,]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

			#update
			gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
			cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2**2
			W2 -= lr_rms * gW2 / (np.sqrt(cache_W2) + eps)

			gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
			cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2**2
			b2 -= lr_rms * gb2 / (np.sqrt(cache_b2) + eps)

			gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
			cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1**2
			W1 -= lr_rms * gW1 / (np.sqrt(cache_W1) + eps)

			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
			cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1**2
			b1 -= lr_rms * gb1 / (np.sqrt(cache_b1) + eps)

			if j % print_period == 0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_rms.append(ll)
				print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

				err = error_rate(pY, Ytest)
				CR_rms.append(err)
				print "Error rate:", err

	pY, Z = forward(X, W1, b1, W2, b2)
	print "Final error rate:", error_rate(pY, Ytest)

	plt.plot(LL_batch, label='const')
	plt.plot(LL_rms, label='rms')
	plt.legend()
	plt.show()
Example #14
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    test_losses_full = []
    lr = 0.9
    # lr0 = lr # save for later
    reg = 0.
    t0 = datetime.now()
    last_dt = 0
    intervals = []
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        gW = gradW(Ytrain_ind, p_y, Xtrain) / N
        gb = gradb(Ytrain_ind, p_y) / N

        W += lr*(gW - reg*W)
        b += lr*(gb - reg*b)

        p_y_test = forward(Xtest, W, b)
        test_loss = cost(p_y_test, Ytest_ind)
        dt = (datetime.now() - t0).total_seconds()

        # save these
        dt2 = dt - last_dt
        last_dt = dt
        intervals.append(dt2)

        test_losses_full.append([dt, test_loss])
        if (i + 1) % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i + 1, test_loss))
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for full GD:", datetime.now() - t0)

    # save the max time so we don't surpass it in subsequent iterations
    max_dt = dt
    avg_interval_dt = np.mean(intervals)


    # 2. stochastic
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    test_losses_sgd = []
    lr = 0.001
    reg = 0.

    t0 = datetime.now()
    last_dt_calculated_loss = 0
    done = False
    for i in range(50): # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(N):
            x = tmpX[n,:].reshape(1,D)
            y = tmpY[n,:].reshape(1,10)
            p_y = forward(x, W, b)

            gW = gradW(y, p_y, x)
            gb = gradb(y, p_y)

            W += lr*(gW - reg*W)
            b += lr*(gb - reg*b)

            dt = (datetime.now() - t0).total_seconds()
            dt2 = dt - last_dt_calculated_loss

            if dt2 > avg_interval_dt:
                last_dt_calculated_loss = dt  # reset the timer so losses are logged at roughly equal intervals
                p_y_test = forward(Xtest, W, b)
                test_loss = cost(p_y_test, Ytest_ind)
                test_losses_sgd.append([dt, test_loss])

            # time to quit
            if dt > max_dt:
                done = True
                break
        if done:
            break

        if (i + 1) % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i + 1, test_loss))
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for SGD:", datetime.now() - t0)


    # 3. mini-batch
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    test_losses_batch = []
    batch_sz = 500
    lr = 0.08
    reg = 0.
    n_batches = N // batch_sz


    t0 = datetime.now()
    last_dt_calculated_loss = 0
    done = False
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
            y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
            p_y = forward(x, W, b)

            gW = gradW(y, p_y, x) / batch_sz
            gb = gradb(y, p_y) / batch_sz

            W += lr*(gW - reg*W)
            b += lr*(gb - reg*b)

            dt = (datetime.now() - t0).total_seconds()
            dt2 = dt - last_dt_calculated_loss

            if dt2 > avg_interval_dt:
                last_dt_calculated_loss = dt  # reset the timer so losses are logged at roughly equal intervals
                p_y_test = forward(Xtest, W, b)
                test_loss = cost(p_y_test, Ytest_ind)
                test_losses_batch.append([dt, test_loss])

            # time to quit
            if dt > max_dt:
                done = True
                break
        if done:
            break

        if (i + 1) % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i + 1, test_loss))
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for mini-batch GD:", datetime.now() - t0)


    # convert to numpy arrays
    test_losses_full = np.array(test_losses_full)
    test_losses_sgd = np.array(test_losses_sgd)
    test_losses_batch = np.array(test_losses_batch)

    
    plt.plot(test_losses_full[:,0], test_losses_full[:,1], label="full")
    plt.plot(test_losses_sgd[:,0], test_losses_sgd[:,1], label="sgd")
    plt.plot(test_losses_batch[:,0], test_losses_batch[:,1], label="mini-batch")
    plt.legend()
    plt.show()
Example #15
def main():
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. const
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. RMSprop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001  # if you set this too high you'll get NaN!
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 0.0000000001
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
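For reference, the RMSProp branch above keeps a per-parameter leaky average of squared gradients and divides the step by its square root. A one-parameter restatement (a sketch; the function name and defaults are illustrative, not part of the original code):

import numpy as np

def rmsprop_step(w, g, cache, lr0=0.001, decay=0.999, eps=1e-10):
    # leaky average of squared gradients
    cache = decay * cache + (1 - decay) * g * g
    # scale the learning rate per parameter
    w = w - lr0 * g / (np.sqrt(cache) + eps)
    return w, cache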
Example #16
def main():
    max_iter = 10
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    #1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    #2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    #hyperparameters
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    #Adam
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #gradient
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            #new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            #new
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            #bias correction
            correction1 = 1 - beta1**t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2**t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            #update t
            t += 1

            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    #RMSprop with momentum

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    loss_rms = []
    err_rms = []

    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
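The Adam loop above repeats the same moment-update and bias-correction lines for every parameter. A hedged sketch of the same update factored into a single helper; the state-dictionary layout and the function name are illustrative, not part of the original code.

import numpy as np

def adam_step(w, g, state, lr0=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # running first and second moments of the gradient
    state['m'] = beta1 * state['m'] + (1 - beta1) * g
    state['v'] = beta2 * state['v'] + (1 - beta2) * g * g
    # bias correction for the zero initialization of m and v
    m_hat = state['m'] / (1 - beta1 ** state['t'])
    v_hat = state['v'] / (1 - beta2 ** state['t'])
    state['t'] += 1
    # eps kept inside the sqrt, matching the examples above
    return w - lr0 * m_hat / np.sqrt(v_hat + eps)

# usage sketch: state = {'m': 0, 'v': 0, 't': 1}
#               W2 = adam_step(W2, gW2, state)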
Example #17
def main():

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparameters
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    losses_adam = []
    errors_adam = []

    t = 1

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2**t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_adam.append(l)
                print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_adam.append(e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("final error rate:  ", error_rate(pY, Ytest))
    print()
    plt.plot(losses_adam, label='adam')

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    plt.legend()
    plt.show()
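The bias-correction step above matters mostly during the first few updates, when the moment estimates are still dominated by their zero initialization. A small worked check with the beta1 used here (illustrative numbers only):

beta1 = 0.9
for t in [1, 2, 10, 50]:
    # correction factor 1 - beta1**t: ~0.1, ~0.19, ~0.65, ~0.995
    # at t=1 it rescales m = 0.1*g back to roughly g; by t=50 it is nearly 1
    print(t, 1 - beta1 ** t)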
Example #18
def main():
    max_iter = 20 # make 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch GD
    LL_batch = []
    CR_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f", % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print()"Final error rate:", error_rate(pY, Ytest))

    # 2. batch GD w/ momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    LL_momentum = []
    CR_momentum = []

    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1


            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f", % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print()"Final error rate:", error_rate(pY, Ytest))

    # 3. batch GD w/ Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    LL_nest = []
    CR_nest = []

    mu = 0.9
    # dW2 = 0
    # db2 = 0
    # dW1 = 0
    # db1 = 0
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - mu*dW(t)
            W1_tmp = W1 - lr*mu*vW1
            b1_tmp = b1 - lr*mu*vb1
            W2_tmp = W2 - lr*mu*vW2
            b2_tmp = b2 - lr*mu*vb2

            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]

            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp
            W2 -= lr*vW2
            vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp
            b2 -= lr*vb2
            vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp
            W1 -= lr*vW1
            vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp
            b1 -= lr*vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f", % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print()"Final error rate:", error_rate(pY, Ytest))



    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
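Example #18 implements Nesterov momentum in its "lookahead" form: the gradient is evaluated at the point the current velocity is about to carry the weights to, as the comment in the loop spells out. A one-parameter restatement (a sketch; names are illustrative):

def nesterov_step(w, v, grad_fn, lr, mu=0.9):
    # peek ahead along the current velocity
    w_ahead = w - lr * mu * v
    # accumulate the gradient evaluated at the lookahead point
    v = mu * v + grad_fn(w_ahead)
    return w - lr * v, v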
Example #19
def main():

    dobatch = False
    dobatchwithmomentum = True
    dobatchwithnesterovmomentum = True

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # batch

    #cost = -16
    if dobatch:
        losses_batch = []
        errors_batch = []

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
                b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
                W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                            reg * W1)
                b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_batch.append(l)
                    print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                    e = error_rate(pY, Ytest)
                    errors_batch.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate:  ", error_rate(pY, Ytest))
        print()

    # batch with momentum

    if dobatchwithmomentum:
        print("momentum")

        W1 = W1_0.copy()
        b1 = b1_0.copy()
        W2 = W2_0.copy()
        b2 = b2_0.copy()

        losses_momentum = []
        errors_momentum = []

        mu = 0.9

        dW2 = 0
        db2 = 0
        dW1 = 0
        db1 = 0

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
                gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
                gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
                gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

                dW2 = mu * dW2 - lr * gW2
                db2 = mu * db2 - lr * gb2
                dW1 = mu * dW1 - lr * gW1
                db1 = mu * db1 - lr * gb1

                W2 += dW2
                b2 += db2
                W1 += dW1
                b1 += db1

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_momentum.append(l)
                    print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                    e = error_rate(pY, Ytest)
                    errors_momentum.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate:  ", error_rate(pY, Ytest))
        print()

    # Nesterov momentum
    if dobatchwithnesterovmomentum:
        W1 = W1_0.copy()
        b1 = b1_0.copy()
        W2 = W2_0.copy()
        b2 = b2_0.copy()

        losses_nesterov = []
        errors_nesterov = []

        mu = 0.9

        vW2 = 0
        vb2 = 0
        vW1 = 0
        vb1 = 0

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
                gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
                gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
                gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

                vW2 = mu * vW2 - lr * gW2
                vb2 = mu * vb2 - lr * gb2
                vW1 = mu * vW1 - lr * gW1
                vb1 = mu * vb1 - lr * gb1

                W2 += mu * vW2 - lr * gW2
                b2 += mu * vb2 - lr * gb2
                W1 += mu * vW1 - lr * gW1
                b1 += mu * vb1 - lr * gb1

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_nesterov.append(l)
                    print("cost at iter i %d, j %d:  %.6f" % (i, j, l))

                    e = error_rate(pY, Ytest)
                    errors_nesterov.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate:  ", error_rate(pY, Ytest))
        print()

    if dobatch:
        plt.plot(losses_batch, label="batch")
    if dobatchwithmomentum:
        plt.plot(losses_momentum, label="momentum")
    if dobatchwithnesterovmomentum:
        plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
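Unlike Example #18, Example #19 writes Nesterov momentum without an explicit lookahead copy of the weights: the velocity is updated on the raw gradient and the parameter step then folds in the updated velocity. This is a commonly used reparameterization of the same idea. A one-parameter restatement (a sketch; names are illustrative):

def nesterov_step_inline(w, v, g, lr, mu=0.9):
    # velocity update on the raw gradient
    v = mu * v - lr * g
    # step with the *updated* velocity plus the plain gradient term folded in
    return w + mu * v - lr * g, v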
Example #20
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for full GD:", datetime.now() - t0)

    # 2. stochastic
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(50):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            # x is a single sample of shape (D,); reshape it to (1, D) so the
            # matrix-based forward pass and gradient functions still work
            x = tmpX[n, :].reshape(1, D)
            # y is the matching one-hot target row; reshape it to (1, 10) for the same reason
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for SGD:", datetime.now() - t0)

    # 3. batch
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():

    # compare 3:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    # all with L2 regularization

    X, Y = get_normalized_data()

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10

    max_iter = 20
    epochs = 20
    print_period = 10
    lr0 = 0.0004
    reg = 0.01
    epsilon = 10e-10
    decay = 0.999
    batch_size = 500
    number_batches = int(N // batch_size)

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1

    tr_costs = []
    errors_batch = []
    losses_test = []

    # 1. plain mini-batch gradient descent with RMSProp
    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # # AdaGrad
            # cache_W2 += derivative_w2(z_tr, ytr, ytr_pred) * derivative_w2(z_tr, ytr, ytr_pred)
            # cache_W1 += derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) * derivative_w1(xtr, z_tr, ytr, ytr_pred, W2)
            # cache_b2 += derivative_b2(ytr, ytr_pred) * derivative_b2(ytr, ytr_pred)
            # cache_b1 += derivative_b1(z_tr, ytr, ytr_pred, W2) * derivative_b1(z_tr, ytr, ytr_pred, W2)

            # RMSProp
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1

            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon)
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon)
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon)
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon)

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs, label='tr_costs')
    plt.plot(losses_test, label='losses_test RMS')
    #plt.plot(errors_batch, label='errors_batch')
#    plt.show()
#    print("tr_costs", tr_costs)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. mini-batch gradient descent with RMSProp and momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # track losses and errors for the RMSProp + momentum run

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # momentum coefficient
    mu = 0.8

    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1

    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    cW1 = 0
    cW2 = 0
    cb1 = 0
    cb2 = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # careful: the momentum and velocity terms are easy to mix up here

            # RMSProp
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1

            cW2 = gW2 / (np.sqrt(cache_W2) + epsilon)
            cb2 = gb2 / (np.sqrt(cache_b2) + epsilon)
            cW1 = gW1 / (np.sqrt(cache_W1) + epsilon)
            cb1 = gb1 / (np.sqrt(cache_b1) + epsilon)

            # update velocity
            dW2 = mu * dW2 + (1 - mu) * lr0 * cW2
            db2 = mu * db2 + (1 - mu) * lr0 * cb2
            dW1 = mu * dW1 + (1 - mu) * lr0 * cW1
            db1 = mu * db1 + (1 - mu) * lr0 * cb1

            # update
            W2 -= dW2
            W1 -= dW1
            b2 -= db2
            b1 -= db1

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("traning set cost", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)

    #plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='losses_test momentum RMS')
    #plt.plot(errors_batch, label='errors_batch')
    # plt.show()
#    print("tr_costs", errors_batch_momentum)
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
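In the momentum variant above, the RMSProp-scaled gradient is folded into a velocity term before the weights are touched. Restated for a single parameter (a sketch only; the function name and default hyperparameters are illustrative, not the original util code):

import numpy as np

def rmsprop_momentum_step(w, g, cache, d, lr0=0.001, mu=0.9, decay=0.999, eps=1e-8):
    # leaky average of squared gradients
    cache = decay * cache + (1 - decay) * g * g
    # velocity of RMSProp-scaled steps
    d = mu * d + (1 - mu) * lr0 * g / (np.sqrt(cache) + eps)
    return w - d, cache, d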
def main():
    max_iter = 10
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1. Adam
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1 ** t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2 ** t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)


            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    loss_rms = []
    err_rms = []

    # comparable hyperparameters for fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0
    
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
Example #24
def main():
    # compare 4 scenarios:
    # 1. batch SGD with constant learning rate
    # 2. batch SGD with RMSProp
    # 3. batch SGD with AdaGrad
    # 4. batch SGD with exponential decay

    np.random.seed(2)

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD with constant learning rate:
    LL_batch = []
    CR_batch = []
    t0 = datetime.now()
    print('\nperforming batch SGD with constant learning rate...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_batch.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err1 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_batch)
    #plt.title('Cost for batch GD with const lr')
    #plt.show()

    # 2. batch GD with RMSProp:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_RMSProp = []
    CR_RMSProp = []

    lr0 = 0.001  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay = 0.999
    eps = 10e-10

    t0 = datetime.now()

    print('\nperforming batch SGD with RMSProp...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_RMSProp.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_RMSProp.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err2 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_RMSProp)
    #plt.title('Cost for batch SGD with RMSProp')
    #plt.show()

    # 3. batch SGD with AdaGrad:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_AdaGrad = []
    CR_AdaGrad = []

    lr0 = 0.01  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    eps = 10e-10

    t0 = datetime.now()

    print('\nperforming batch SGD with AdaGrad...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = cache_W2 + gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = cache_b2 + gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = cache_W1 + gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = cache_b1 + gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_AdaGrad.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_AdaGrad.append(error)
                print('error rate:', error)

    dt3 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err3 = error_rate(pY, Ytest)

    # plot the cost
    #plt.plot(LL_AdaGrad)
    #plt.title('Cost for batch SGD with AdaGrad')
    #plt.show()
    '''

	# 4. batch SGD with exponential decay:
	W1 = W1_0.copy()
	b1 = b1_0.copy()
	W2 = W2_0.copy()
	b2 = b2_0.copy()

	LL_exp = []
	CR_exp = []

	
	lr0 = 0.0004 # initial learning rate
	k = 1e-7
	t = 0 # initial log
	lr = lr0 
	t0 = datetime.now()

	print('\nperforming batch SGD with lr exponential decay...')
	for i in range(max_iter):
		for j in range(n_batches):
			Xbatch = Xtrain[j*batch_size:(j+1)*batch_size, :]
			Ybatch = Ytrain_ind[j*batch_size:(j+1)*batch_size, :]
			p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
			#print(Z.shape, p_Ybatch.shape, Ybatch.shape)
			#print('First batch cost:', cost(p_Ybatch, Ybatch))
			
			# updates:
			gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg*W2)
			W2 -= lr*gW2
			
			gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg*b2)			
			b2 -= lr*gb2
			
			gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg*W1)
			W1 -= lr*gW1
			
			gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg*b1)
			b1 -= lr*gb1 

			# decrease the learning rate
			lr = lr0 * np.exp(-k*t)
			t += 1

			if j % print_period == 0:
				print('current learning rate:', lr)
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				#print('pY:', pY)
				ll = cost(pY, Ytest_ind)
				LL_exp.append(ll)
				print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

				error = error_rate(pY, Ytest)
				CR_exp.append(error)
				print('error rate:', error)

	dt4 = datetime.now() - t0
	pY, _ = forward(Xtest, W1, b1, W2, b2)
	print('\nFinal error rate:', error_rate(pY, Ytest))
	print('Elapsed time for batch SGD with lr exponential decay:', dt4)

	# plot the cost
	#plt.plot(LL_exp)
	#plt.title('Cost for batch SGD with lr exponential decay')
	#plt.show()

'''
    print('\nBatch SGD with constant learning rate:')
    print('final error rate:', final_err1)
    print('elapsed time:', dt1)

    print('\nBatch SGD with RMSProp:')
    print('final error rate:', final_err2)
    print('elapsed time:', dt2)

    print('\nBatch SGD with AdaGrad:')
    print('final error rate:', final_err3)
    print('elapsed time:', dt3)

    # plot the costs together:
    plt.plot(LL_batch, label='const_lr')
    plt.plot(LL_RMSProp, label='RMSProp')
    plt.plot(LL_AdaGrad, label='AdaGrad')
    #plt.plot(LL_exp, label='lr_exp_decay')
    plt.legend()
    plt.show()
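The only difference between the RMSProp and AdaGrad branches above is the cache: AdaGrad keeps the full sum of squared gradients, so its effective step size can only shrink, while the RMSProp cache decays and lets the step size recover. A one-parameter AdaGrad sketch for comparison (names and defaults are illustrative):

import numpy as np

def adagrad_step(w, g, cache, lr0=0.01, eps=1e-10):
    # accumulated squared gradients; never decays, so steps shrink monotonically
    cache = cache + g * g
    return w - lr0 * g / np.sqrt(cache + eps), cache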
Example #25
def main():
    # 2 scenarios
    # 1. batch SGD
    # 2. batch SGD with RMSProp

    max_iter = 15
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.0001
    reg = 0.001

    # Xtrain = X[:-1000, ]
    # Ytrain = Y[:-1000]
    # Xtest = X[-1000:, ]
    # Ytest = Y[-1000:, ]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = int(N / batch_sz)

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # Batch
    losses_batch = []
    error_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                error_batch.append(e)
                print("Ratio de error:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # RMSProp
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_rms = []
    errors_rms = []
    lr0 = 0.001
    cacheW2 = 0
    cacheb2 = 0
    cacheW1 = 0
    cacheb1 = 0
    decay_rate = 0.99
    eps = 0.000001

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # caches
            cacheW2 = decay_rate * cacheW2 + (1 - decay_rate) * gW2 * gW2
            cacheb2 = decay_rate * cacheb2 + (1 - decay_rate) * gb2 * gb2
            cacheW1 = decay_rate * cacheW1 + (1 - decay_rate) * gW1 * gW1
            cacheb1 = decay_rate * cacheb1 + (1 - decay_rate) * gb1 * gb1

            W2 -= lr0 * gW2 / (np.sqrt(cacheW2) + eps)
            b2 -= lr0 * gb2 / (np.sqrt(cacheb2) + eps)
            W1 -= lr0 * gW1 / (np.sqrt(cacheW1) + eps)
            b1 -= lr0 * gb1 / (np.sqrt(cacheb1) + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_rms, label='rmsprop')
    plt.legend()
    plt.show()
Example #26
def batch_grad():

    #get data and for test and train sets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)

    batchSz = 500
    #Initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 10e-5
    reg = 0.01

    no_batches = int(N / batchSz)
    print("No of bathces: ", no_batches)
    for i in range(300):
        for n in range(no_batches):
            #get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]
            #Forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            #Backprop
            W2 += learning_rate * (derivative_w2(pY, YBatch_ind, Z) - reg * W2)
            b2 += learning_rate * (derivative_b2(pY, YBatch_ind) - reg * b2)
            W1 += learning_rate * (
                derivative_w1(pY, YBatch_ind, W2, Z, XBatch) - reg * W1)
            b1 += learning_rate * (derivative_b1(pY, YBatch_ind, W2, Z) -
                                   reg * b1)

            if n % 100 == 0:
                #Forward prop
                #pY, Z = forward_relu(XBatch, W1, b1, W2, b2)
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)

                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final training error rate: ", error_rate(p, Y))

    XTest = get_test_data()
    pY, ZTest = forward_relu(XTest, W1, b1, W2, b2)
    YTest = np.argmax(pY, axis=1)

    f = open("test_result.csv", "w")
    f.write("ImageId,Label\n")
    n = YTest.shape[0]
    for i in range(n):
        f.write(str(i + 1) + "," + str(YTest[i]) + "\n")
    f.close()
def batch_grad():

    #get data and for test and train sets
    X, Y = get_normalized_data()
    #XTrain = X[:-1000, :]
    #YTrain = Y[:-1000]
    #YTrain_ind = y2indicator(YTrain)
    #XTest = X[-1000:, :]
    #YTest = Y[-1000:]
    # = y2indicator(YTest)
    Y_ind = y2indicator(Y)

    batchSz = 500
    #Initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 10e-5

    no_batches = int(N / batchSz)
    print("No of bathces: ", no_batches)
    for i in range(300):
        for n in range(no_batches):
            #get current batch
            XBatch = X[n * batchSz:(n * batchSz + batchSz), :]
            #YBatch = Y[n*batchSz:n*batchSz + batchSz]
            YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :]
            #Forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            #Backprop
            W2 += learning_rate * derivative_w2(pY, YBatch_ind, Z)
            b2 += learning_rate * derivative_b2(pY, YBatch_ind)
            W1 += learning_rate * derivative_w1(pY, YBatch_ind, W2, Z, XBatch)
            b1 += learning_rate * derivative_b1(pY, YBatch_ind, W2, Z)

            if n % 100 == 0:
                #Forward prop
                #pY, Z = forward_relu(XBatch, W1, b1, W2, b2)
                YBatch = Y[n * batchSz:n * batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)

                c = cost(YBatch_ind, pY)
                print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c)

    # pY, Z = forward_prop(XTrain, W1, b1, W2, b2)
    # P = np.argmax(pY, axis=1)
    # print("Final training error rate: ", error_rate(P, YTrain))
    #
    # pY, Z = forward_prop(XTest, W1, b1, W2, b2)
    # P = np.argmax(pY, axis=1)
    # print("Final testing error rate: ", error_rate(P, YTest))

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final Final training error rate: ", error_rate(p, Y))
Example #28
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print "Performing logistic regression..."
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in xrange(200):
        p_y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print "Cost at iteration %d: %.6f" % (i, ll)
            print "Error rate:", err
    p_y = forward(Xtest, W, b)
    print "Final error rate:", error_rate(p_y, Ytest)
    print "Elapsted time for full GD:", datetime.now() - t0

    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in xrange(
            1):  # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in xrange(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N / 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print "Cost at iteration %d: %.6f" % (i, ll)
                print "Error rate:", err
    p_y = forward(Xtest, W, b)
    print "Final error rate:", error_rate(p_y, Ytest)
    print "Elapsted time for SGD:", datetime.now() - t0

    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N / batch_sz

    t0 = datetime.now()
    for i in xrange(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in xrange(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            p_y = forward(x, W, b)

            W += lr * (gradW(y, p_y, x) - reg * W)
            b += lr * (gradb(y, p_y) - reg * b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches / 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print "Cost at iteration %d: %.6f" % (i, ll)
                print "Error rate:", err
    p_y = forward(Xtest, W, b)
    print "Final error rate:", error_rate(p_y, Ytest)
    print "Elapsted time for batch GD:", datetime.now() - t0

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
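# Note: forward, gradW, gradb and cost in the logistic-regression examples above are
# helpers defined elsewhere in the source. A hypothetical sketch consistent with the
# "+=" updates (gradients of the log-likelihood) and the call signatures used above:
import numpy as np

def forward(X, W, b):
    # softmax of the linear activation
    A = X.dot(W) + b
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True)

def gradW(T, pY, X):
    return X.T.dot(T - pY)

def gradb(T, pY):
    return (T - pY).sum(axis=0)

def cost(pY, T):
    # total cross-entropy over all samples
    return -(T * np.log(pY)).sum()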
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W)
        b += lr*(gradb(Ytrain_ind, p_y) - reg*b)
        

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for full GD:", datetime.now() - t0)


    # 2. stochastic
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(50): # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)): # shortcut so it won't take so long...
            x = tmpX[n,:].reshape(1,D)
            y = tmpY[n,:].reshape(1,10)
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for SGD:", datetime.now() - t0)


    # 3. batch
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
            y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
        if i % 1 == 0:
            err = error_rate(p_y_test, Ytest)
            if i % 10 == 0:
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsted time for batch GD:", datetime.now() - t0)



    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
Example #30
0
def main():
    max_iter = 20
    print_period = 10
    X_train, X_test, Y_train, Y_test = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Y_train_ind = y2indicator(Y_train)
    Y_test_ind = y2indicator(Y_test)

    N, D = X_train.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # copy weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. Constant Learning rate

    losses_batch = []
    errors_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_batch.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_batch.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)

    batch_error = error_rate(pY, Y_test)
    print(f"Final batch error rate: {batch_error}")

    # 2. RMSProp

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_RMSP = []
    errors_RMSP = []

    lr0 = 0.001
    cache_W1 = 1
    cache_b1 = 1
    cache_W2 = 1
    cache_b2 = 1
    decay = 0.999
    epsilon = 1e-10

    for i in range(max_iter):

        for j in range(n_batches):
            Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon)

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                losses_RMSP.append(l)
                print(f'Cost at iteration i={i}, j={j} : {l}')

                e = error_rate(pY, Y_test)
                errors_RMSP.append(e)
                print("error_rate", e)
    pY, _ = forward(X_test, W1, b1, W2, b2)
    print(f"Final RMSProp error rate: {error_rate(pY, Y_test)}")
    print(f"Final batch error rate: {batch_error}")
    plt.plot(losses_batch, label='batch cost')
    plt.plot(losses_RMSP, label='RMSProp cost')
    plt.legend()
    plt.show()
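# The same RMSProp bookkeeping is repeated for each of W1, b1, W2, b2 above; a small
# illustrative helper (names are not from the original code, numpy assumed as np)
# that factors it out:
import numpy as np

def rmsprop_update(param, grad, cache, lr=0.001, decay=0.999, eps=1e-10):
    # cache: exponentially decaying average of squared gradients
    cache = decay * cache + (1 - decay) * grad * grad
    param = param - lr * grad / (np.sqrt(cache) + eps)
    return param, cache

# usage, e.g.: W2, cache_W2 = rmsprop_update(W2, gW2, cache_W2)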
Example #31
0
def main():
    # compare 2 scenarios:
    # 1. batch GD with RMSProp and momentum
    # 2. Adam GD

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # 1. batch GD with RMSProp and momentum:
    print('\nperforming batch GD with RMSProp and momentum...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    LL_rm = []
    CR_rm = []

    # hyperparams:
    lr0 = 0.001
    #lr0 = 0.0001
    mu = 0.9
    decay = 0.999
    eps = 10e-9

    # momentum (velocity terms):
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    # rms-prop cache (with no bias correction):
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    t0 = datetime.now()
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            #print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            #print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            # (note: we utilize a bit different version of momentum)
            gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps))
            W2 -= dW2
            #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps)
            #W2 += dW2

            gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps))
            b2 -= db2
            #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps)
            #b2 += db2

            gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps))
            W1 -= dW1
            #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps)
            #W1 += dW1

            gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps))
            b1 -= db1
            #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps)
            #b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                #print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_rm.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_rm.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for batch GD with RMSProp and momentum:', dt1)

    # plot the cost
    plt.plot(LL_rm)
    plt.title('Cost for batch GD with RMSProp and momentum')
    plt.show()

    # 2. Adam optimizer
    print('\nperforming Adam optimizer...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # hyperparams:
    lr = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 10e-9

    # 1st moment:
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment:
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    LL_adam = []
    CR_adam = []
    t0 = datetime.now()
    t = 1  # index; used instead of j, because j starts with 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates:
            # gradients:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1

            # 1st moment:
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # 2nd moment:
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction:
            mW2_bc = mW2 / (1 - beta1**t)
            mb2_bc = mb2 / (1 - beta1**t)
            mW1_bc = mW1 / (1 - beta1**t)
            mb1_bc = mb1 / (1 - beta1**t)

            vW2_bc = vW2 / (1 - beta2**t)
            vb2_bc = vb2 / (1 - beta2**t)
            vW1_bc = vW1 / (1 - beta2**t)
            vb1_bc = vb1 / (1 - beta2**t)

            # weights and biases (parameters):
            W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps)
            b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps)
            W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps)
            b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps)

            t += 1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_adam.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))

                error = error_rate(pY, Ytest)
                CR_adam.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for Adam optimizer:', dt2)

    # plot the cost
    plt.plot(LL_adam)
    plt.title('Cost for Adam optimizer')
    plt.show()

    # plot costs from the two experiments together:
    plt.plot(LL_rm, label='RMSProp with momentum')
    plt.plot(LL_adam, label='Adam optimizer')
    plt.title('Cost')
    plt.legend()
    plt.show()
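# For reference, the per-parameter Adam step above can be factored into one
# illustrative helper (names are not from the original code, numpy assumed as np).
# Note the code above keeps eps inside the square root, a common minor variant.
import numpy as np

def adam_update(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * grad         # first moment (mean of gradients)
    v = beta2 * v + (1 - beta2) * grad * grad  # second moment (mean of squared gradients)
    m_hat = m / (1 - beta1 ** t)               # bias correction, t starts at 1
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v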
Example #32
0
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N / batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    # alternate version uses dW
    # dW2 = 0
    # db2 = 0
    # dW1 = 0
    # db1 = 0
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - mu*dW(t)
            W1_tmp = W1 - lr * mu * vW1
            b1_tmp = b1 - lr * mu * vb1
            W2_tmp = W2 - lr * mu * vW2
            b2_tmp = b2 - lr * mu * vb2

            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # W2 += dW2
            # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            # b2 += db2
            # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            # W1 += dW1
            # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            # b1 += db1
            vW2 = mu * vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg * W2_tmp
            W2 -= lr * vW2
            vb2 = mu * vb2 + derivative_b2(Ybatch, pYbatch) + reg * b2_tmp
            b2 -= lr * vb2
            vW1 = mu * vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch,
                                           W2_tmp) + reg * W1_tmp
            W1 -= lr * vW1
            vb1 = mu * vb1 + derivative_b1(Z, Ybatch, pYbatch,
                                           W2_tmp) + reg * b1_tmp
            b1 -= lr * vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
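# The Nesterov variant above evaluates the gradients at a lookahead point
# (W - lr*mu*v) before applying the velocity and parameter updates. Written
# generically (illustrative names only), each parameter follows this pattern:
def nesterov_step(param, velocity, grad_at_lookahead, lr, mu=0.9):
    # grad_at_lookahead must be computed at (param - lr * mu * velocity)
    velocity = mu * velocity + grad_at_lookahead
    param = param - lr * velocity
    return param, velocity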
def main():
	'''
		RMSprop is an adaptive learning rate method in which each parameter's effective step size decreases over time
	'''
	
	max_iter = 20  # for ReLU
	#max_iter = 30 #for sigmoid
	print_period = 10 	
	X, Y   = get_normalized_data()
	lr = 0.0004
	reg = 0.01 

	
	Xtrain = X[:-1000,]
	Ytrain = Y[:-1000]
	Xtest = X[-1000:,]
	Ytest = Y[-1000:]
	Ytrain_ind = y2indicator(Ytrain)
	Ytest_ind = y2indicator(Ytest)
	
	
	N, D = Xtrain.shape
	batch_sz = 500
	n_batches = N / batch_sz

	M =300
	K=10

	#1. batch SGD
	W1 = np.random.randn(D, M) / 28
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)



	LL_batch = [] 
	CR_batch = [] 
	
	
	for i in xrange(max_iter):
		for j in xrange(n_batches):
		
			Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :]
			Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) 
			 

			W2 -=  lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
			b2 -=  lr*(derivative_b2(Ybatch,pYbatch) + reg*b2)
			W1 -=  lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
			b1 -=  lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)			


			if j % print_period ==0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_batch.append(ll)
				print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

				err = error_rate(pY, Ytest)
				CR_batch.append(err)
				print "Error rate:", err 

	pY, _ = forward(Xtest, W1, b1, W2, b2) 			
	print "Final error rate:", error_rate(pY, Ytest)

	#2. RMSProp
	W1 = np.random.randn(D, M) / 28
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	LL_rms = [] 
	CR_rms = [] 
	lr0 = 0.001
	cache_W2 = 0 
	cache_b2 = 0 
	cache_W1 = 0 
	cache_b1 = 0 
	decay_rate = 1 - 1e-5
	eps = 1e-10
	
	
	for i in xrange(max_iter):
		for j in xrange(n_batches):
		
			Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :]
			Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :]
			pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) 
			 

			#updates
			gW2 =  derivative_w2(Z, Ybatch, pYbatch) + reg*W2
			cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*gW2*gW2 			
			W2 -=  lr0*gW2 /(np.sqrt(cache_W2) + eps)
			
			gb2 = derivative_b2(Ybatch,pYbatch) + reg*b2
			cache_b2  = decay_rate*cache_b2 + (1-decay_rate)*gb2*gb2 			
			b2 -=  lr0*gb2 /(np.sqrt(cache_b2) + eps)

			gW1 =  derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
			cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*gW1*gW1 			
			W1 -=  lr0*gW1 /(np.sqrt(cache_W1) + eps)
			
			gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
			cache_b1  = decay_rate*cache_b1 + (1-decay_rate)*gb1*gb1			
			b1 -=  lr0*gb1 /(np.sqrt(cache_b1) + eps)


			if j % print_period ==0:
				pY, _ = forward(Xtest, W1, b1, W2, b2)
				ll = cost(pY, Ytest_ind)
				LL_rms.append(ll)
				print "RMS Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

				err = error_rate(pY, Ytest)
				CR_rms.append(err)
				print "RMS Error rate:", err 

	pY, _ = forward(Xtest, W1, b1, W2, b2) 			
	print "RMS 	Final error rate:", error_rate(pY, Ytest)

	
	
	plt.plot(LL_batch, label='batch')	
	plt.plot(LL_rms, label='rms')	
	plt.legend()
	plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))



    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
Example #35
0
        correction2 = 1 - beta2**t
        hat_vW1 = vW1 / correction2
        hat_vb1 = vb1 / correction2
        hat_vW2 = vW2 / correction2
        hat_vb2 = vb2 / correction2

        # update t
        t += 1

        # apply updates to the params
        W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
        b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
        W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
        b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)
        pY, _ = forward(Xtest, W1, b1, W2, b2)
        l = cost(pY, Ytest_ind)
        loss_adam.append(l)

        if j % print_period == 0:
            print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
            err = error_rate(pY, Ytest)
            err_adam.append(err)
            print("Error rate:", err)

pY, _ = forward(Xtest, W1, b1, W2, b2)
print("Final error rate:", error_rate(pY, Ytest))

plt.plot(loss_adam, label='adam')
plt.show()
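# The snippet above starts mid-loop (at the second-moment bias correction), so the
# earlier part of each iteration is missing. A hypothetical reconstruction of that
# missing part, consistent with the variable names used above and in the other Adam
# example in this file (any name not shown above is an assumption), would be:
for i in range(max_iter):
    for j in range(n_batches):
        Xbatch = Xtrain[j*batch_sz:(j + 1)*batch_sz]
        Ybatch = Ytrain_ind[j*batch_sz:(j + 1)*batch_sz]
        pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

        # gradients with L2 regularization
        gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
        gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
        gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
        gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

        # first moment
        mW1 = beta1*mW1 + (1 - beta1)*gW1
        mb1 = beta1*mb1 + (1 - beta1)*gb1
        mW2 = beta1*mW2 + (1 - beta1)*gW2
        mb2 = beta1*mb2 + (1 - beta1)*gb2

        # second moment
        vW1 = beta2*vW1 + (1 - beta2)*gW1*gW1
        vb1 = beta2*vb1 + (1 - beta2)*gb1*gb1
        vW2 = beta2*vW2 + (1 - beta2)*gW2*gW2
        vb2 = beta2*vb2 + (1 - beta2)*gb2*gb2

        # bias correction for the first moment
        correction1 = 1 - beta1**t
        hat_mW1 = mW1 / correction1
        hat_mb1 = mb1 / correction1
        hat_mW2 = mW2 / correction1
        hat_mb2 = mb2 / correction1
        # ...continues with correction2 and the parameter updates shown above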

def main():
    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N / batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. const
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 2. RMSprop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001 # if you set this too high you'll get NaN!
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 0.0000000001
    for i in xrange(max_iter):
        for j in xrange(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
Example #37
0
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()

    print('logistic regression')

    # randomly assign weights

    N, D = Xtrain.shape

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 10
    scale = 28

    # full grad descent
    W, b = initwb(D, M, scale)

    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        P_Y = forward(Xtrain, W, b)

        W += lr * (gradW(Ytrain_ind, P_Y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, P_Y) - reg * b)

        P_Y_test = forward(Xtest, W, b)
        ll = cost(P_Y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(P_Y_test, Ytest)
            print("cost at iter:  %d:  %.6f" % (i, ll))
            print("error rate:  ", err, "\n")

    P_Y = forward(Xtest, W, b)
    print("final error:  ", error_rate(P_Y, Ytest))
    print("elapsed time for full GD:  ", datetime.now() - t0)

    # 2.  Stochastic
    W, b = initwb(D, M, scale)

    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(1):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, 10)
            P_Y = forward(x, W, b)

            W += lr * (gradW(y, P_Y, x) - reg * W)
            b += lr * (gradb(y, P_Y) - reg * b)

            P_Y_test = forward(Xtest, W, b)

            ll = cost(P_Y_test, Ytest_ind)

            LL_stochastic.append(ll)

            if n % (N / 2) == 0:
                err = error_rate(P_Y_test, Ytest)
                print("Cost at iteration %d:  %6.f" % (i, ll))
                print("error rate:  ", err)

    P_Y = forward(Xtest, W, b)

    print("error rate:  ", error_rate(P_Y, Ytest))
    print("elapsed time for SGD:  ", datetime.now() - t0)

    # batch
    W, b = initwb(D, M, scale)

    LL_batch = []
    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            P_Y = forward(x, W, b)

            W += lr * (gradW(y, P_Y, x) - reg * W)
            b += lr * (gradb(y, P_Y) - reg * b)
            P_Y_test = forward(Xtest, W, b)

            ll = cost(P_Y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches / 2) == 0:
                err = error_rate(P_Y_test, Ytest)
                print("Cost at iteration %d:  %6.f" % (i, ll))
                print("error rate:  ", err)
    P_Y = forward(Xtest, W, b)

    print("error rate:  ", error_rate(P_Y, Ytest))
    print("elapsed time for SGD:  ", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
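# Note: initwb used in the last example is defined elsewhere in its source; a plausible
# sketch consistent with how it is called (W, b = initwb(D, M, scale)) is shown below.
# This is an assumption, not the original implementation.
import numpy as np

def initwb(D, M, scale):
    W = np.random.randn(D, M) / scale  # e.g. scale = 28, matching the "/ 28" init used elsewhere
    b = np.zeros(M)
    return W, b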