def rmsprop():
    """
    Train a one-hidden-layer relu network on the digit data using the
    RMSProp update rule (revised from benchmark_batch.py).

    Prints the test error rate after each epoch and the total run time;
    returns nothing.
    """

    X_train, Y_train, X_test, Y_test = get_transformed_digit()

    N, D = X_train.shape
    yindi_train = y2indicator(Y_train)
    yindi_test = y2indicator(Y_test)

    M = 300  # hidden units
    K = 10   # output classes

    # scaled random init for weights, zeros for biases
    W1 = np.random.rand(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.rand(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    cost_test = []

    eta = 0.00004
    penalty = 0.001

    batch_size = 500
    batch_num = N // batch_size

    # RMSProp hyper-parameters: cache decay rate and numerical-stability term
    decay = 0.9
    eps = 1e-10

    # squared-gradient caches, initialised to 1 so the first steps stay small
    cw2 = 1
    cb2 = 1
    cw1 = 1
    cb1 = 1

    t1 = time.time()

    for i in range(10):
        X_shuffle, Y_train_shuffle = shuffle(X_train, yindi_train)
        for ii in range(batch_num):
            # bug fix: slice with the batch index ii — the original used the
            # epoch index i, so every inner iteration reused the same batch
            start = ii * batch_size
            stop = start + batch_size
            x_tem = X_shuffle[start:stop]
            y_tem = Y_train_shuffle[start:stop]

            y_fit, z = forward(x=x_tem,
                               w1=W1,
                               b1=b1,
                               w2=W2,
                               b2=b2,
                               method='relu')

            # L2-regularised gradients
            dw2 = deri_w2(z=z, y=y_fit, t=y_tem) + penalty * W2
            db2 = deri_b2(y=y_fit, t=y_tem) + penalty * b2
            dw1 = deri_w1(X=x_tem, Z=z, T=y_tem, Y=y_fit, W2=W2) + penalty * W1
            db1 = deri_b1(Z=z, T=y_tem, Y=y_fit, W2=W2) + penalty * b1

            # RMSProp: exponentially decayed cache of squared gradients ...
            cw2 = decay * cw2 + (1 - decay) * dw2 * dw2
            cb2 = decay * cb2 + (1 - decay) * db2 * db2
            cw1 = decay * cw1 + (1 - decay) * dw1 * dw1
            cb1 = decay * cb1 + (1 - decay) * db1 * db1

            # ... gives a per-parameter adaptive step size
            W2 -= eta * dw2 / (np.sqrt(cw2) + eps)
            b2 -= eta * db2 / (np.sqrt(cb2) + eps)
            W1 -= eta * dw1 / (np.sqrt(cw1) + eps)
            b1 -= eta * db1 / (np.sqrt(cb1) + eps)

            # track the test-set cost after every update
            p_y_test, _ = forward(x=X_test,
                                  w1=W1,
                                  b1=b1,
                                  w2=W2,
                                  b2=b2,
                                  method='relu')
            cost_test.append(cost(y_matrix=p_y_test, t_matrix=yindi_test))

        error_tem = error_rate(y_matrix=p_y_test, target=Y_test)
        print("the error rate in " + str(i) + "  is :" + str(error_tem))

    t2 = time.time()
    print("the whole process takes " + str(t2 - t1) + " seconds")
    p_y_final, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2, method='relu')
    error_final = error_rate(y_matrix=p_y_final, target=Y_test)
    print("the final error rate is " + str(error_final))
def benchmark_batch():
    """
    Baseline: train a one-hidden-layer relu network on the digit data with
    plain mini-batch gradient descent and L2 regularisation.

    Prints the test error rate after each epoch and the total run time;
    returns nothing.
    """

    X_train, Y_train, X_test, Y_test = get_transformed_digit()

    N, D = X_train.shape
    yindi_train = y2indicator(Y_train)
    yindi_test = y2indicator(Y_test)

    M = 300  # hidden units
    K = 10   # output classes

    # scaled random init for weights, zeros for biases
    W1 = np.random.rand(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.rand(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    cost_test = []

    eta = 0.00004
    penalty = 0.001

    batch_size = 500
    batch_num = N // batch_size

    t1 = time.time()

    for i in range(100):
        X_shuffle, Y_train_shuffle = shuffle(X_train, yindi_train)
        for ii in range(batch_num):
            # bug fix: slice with the batch index ii — the original used the
            # epoch index i, so every inner iteration reused the same batch
            start = ii * batch_size
            stop = start + batch_size
            x_tem = X_shuffle[start:stop]
            y_tem = Y_train_shuffle[start:stop]

            y_fit, z = forward(x=x_tem, w1=W1, b1=b1, w2=W2, b2=b2,
                               method='relu')

            # vanilla gradient-descent step with L2 penalty
            W2 -= eta * (deri_w2(z=z, y=y_fit, t=y_tem) + penalty * W2)
            b2 -= eta * (deri_b2(y=y_fit, t=y_tem) + penalty * b2)
            W1 -= eta * (deri_w1(X=x_tem, Z=z, T=y_tem, Y=y_fit, W2=W2)
                         + penalty * W1)
            b1 -= eta * (deri_b1(Z=z, T=y_tem, Y=y_fit, W2=W2) + penalty * b1)

            # track the test-set cost after every update
            p_y_test, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2,
                                  method='relu')
            cost_test.append(cost(y_matrix=p_y_test, t_matrix=yindi_test))

        error_tem = error_rate(y_matrix=p_y_test, target=Y_test)
        print("the error rate in " + str(i) + "  is :" + str(error_tem))

    t2 = time.time()
    print("the whole process takes " + str(t2 - t1) + " seconds")
    p_y_final, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2, method='relu')
    error_final = error_rate(y_matrix=p_y_final, target=Y_test)
    print("the final error rate is " + str(error_final))
def momentum_batch():
    """
    Train a one-hidden-layer relu network on the digit data with mini-batch
    gradient descent plus classical momentum.

    Prints the test error rate after each epoch and the total run time;
    returns nothing.
    """

    X_train, Y_train, X_test, Y_test = get_transformed_digit()

    N, D = X_train.shape
    yindi_train = y2indicator(Y_train)
    yindi_test = y2indicator(Y_test)

    M = 300  # hidden units
    K = 10   # output classes

    # scaled random init for weights, zeros for biases
    W1 = np.random.rand(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.rand(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    cost_test = []

    eta = 0.00004
    penalty = 0.001

    batch_size = 500
    batch_num = N // batch_size

    mu = 0.9  # momentum coefficient

    # velocity terms, one per parameter
    vw2 = 0
    vb2 = 0
    vw1 = 0
    vb1 = 0

    t1 = time.time()

    for i in range(100):
        X_shuffle, Y_train_shuffle = shuffle(X_train, yindi_train)
        for ii in range(batch_num):
            # bug fix: slice with the batch index ii — the original used the
            # epoch index i, so every inner iteration reused the same batch
            start = ii * batch_size
            stop = start + batch_size
            x_tem = X_shuffle[start:stop]
            y_tem = Y_train_shuffle[start:stop]

            y_fit, z = forward(x=x_tem, w1=W1, b1=b1, w2=W2, b2=b2,
                               method='relu')

            # L2-regularised gradients
            gw2 = deri_w2(z=z, y=y_fit, t=y_tem) + penalty * W2
            gb2 = deri_b2(y=y_fit, t=y_tem) + penalty * b2
            gw1 = deri_w1(X=x_tem, Z=z, T=y_tem, Y=y_fit, W2=W2) + penalty * W1
            # bug fix: the original multiplied this gradient by eta here AND
            # again in the velocity update, double-applying the learning rate
            gb1 = deri_b1(Z=z, T=y_tem, Y=y_fit, W2=W2) + penalty * b1

            # classical momentum: decay the velocity, subtract the scaled grad
            vw2 = mu * vw2 - eta * gw2
            vb2 = mu * vb2 - eta * gb2
            vw1 = mu * vw1 - eta * gw1
            vb1 = mu * vb1 - eta * gb1

            W2 += vw2
            b2 += vb2
            W1 += vw1
            b1 += vb1

            # track the test-set cost after every update
            p_y_test, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2,
                                  method='relu')
            cost_test.append(cost(y_matrix=p_y_test, t_matrix=yindi_test))

        error_tem = error_rate(y_matrix=p_y_test, target=Y_test)
        print("the error rate in " + str(i) + "  is :" + str(error_tem))

    t2 = time.time()
    print("the whole process takes " + str(t2 - t1) + " seconds")
    p_y_final, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2, method='relu')
    error_final = error_rate(y_matrix=p_y_final, target=Y_test)
    print("the final error rate is " + str(error_final))
# Beispiel #4 (example-listing marker left over from the scraped source; kept
# as a comment so the file parses)
def adam():
    """
    Train a one-hidden-layer relu network on the digit data using the Adam
    update rule (revised from benchmark_batch.py).

    Prints the test error rate after each epoch and the total run time;
    returns nothing.
    """

    X_train, Y_train, X_test, Y_test = get_transformed_digit()

    N, D = X_train.shape
    yindi_train = y2indicator(Y_train)
    yindi_test = y2indicator(Y_test)

    M = 300  # hidden units
    K = 10   # output classes

    # scaled random init for weights, zeros for biases
    W1 = np.random.rand(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.rand(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    cost_test = []

    penalty = 0.001

    batch_size = 500
    batch_num = N // batch_size

    # Adam hyper-parameters
    eta = 1e-3
    beta1 = 0.9     # first-moment (mean) decay rate
    beta2 = 0.999   # second-moment (uncentered variance) decay rate
    epsilon = 1e-8  # numerical-stability term

    # first-moment estimates
    mw2 = 0
    mb2 = 0
    mw1 = 0
    mb1 = 0

    # second-moment estimates
    vw2 = 0
    vb2 = 0
    vw1 = 0
    vb1 = 0

    t = 1  # time step for bias correction

    t1 = time.time()

    for i in range(10):
        X_shuffle, Y_train_shuffle = shuffle(X_train, yindi_train)
        for ii in range(batch_num):
            # bug fix: slice with the batch index ii — the original used the
            # epoch index i, so every inner iteration reused the same batch
            start = ii * batch_size
            stop = start + batch_size
            x_tem = X_shuffle[start:stop]
            y_tem = Y_train_shuffle[start:stop]

            y_fit, z = forward(x=x_tem, w1=W1, b1=b1, w2=W2, b2=b2,
                               method='relu')

            # ----------------------- adam -----------------------
            # L2-regularised gradients
            gW2 = deri_w2(z, y_fit, y_tem) + penalty * W2
            gb2 = deri_b2(y_fit, y_tem) + penalty * b2
            gW1 = deri_w1(x_tem, z, y_tem, y_fit, W2) + penalty * W1
            gb1 = deri_b1(z, y_tem, y_fit, W2) + penalty * b1

            # update the first-moment estimates
            mw1 = beta1 * mw1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mw2 = beta1 * mw2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # update the second-moment estimates
            vw1 = beta2 * vw1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vw2 = beta2 * vw2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction (the moments start at 0 and are biased low)
            correction1 = 1 - beta1 ** t
            hat_mw1 = mw1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mw2 = mw2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2 ** t
            hat_vw1 = vw1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vw2 = vw2 / correction2
            hat_vb2 = vb2 / correction2

            t += 1

            # apply the updates; bug fix: the original defined epsilon = 1e-8
            # but never used it, dividing by sqrt(v_hat + 1e-10) instead —
            # this is the standard Adam denominator sqrt(v_hat) + epsilon
            W1 = W1 - eta * hat_mw1 / (np.sqrt(hat_vw1) + epsilon)
            b1 = b1 - eta * hat_mb1 / (np.sqrt(hat_vb1) + epsilon)
            W2 = W2 - eta * hat_mw2 / (np.sqrt(hat_vw2) + epsilon)
            b2 = b2 - eta * hat_mb2 / (np.sqrt(hat_vb2) + epsilon)
            # ----------------------------------------------------

            # track the test-set cost after every update
            p_y_test, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2,
                                  method='relu')
            cost_test.append(cost(y_matrix=p_y_test, t_matrix=yindi_test))

        error_tem = error_rate(y_matrix=p_y_test, target=Y_test)
        print("the error rate in " + str(i) + "  is :" + str(error_tem))

    t2 = time.time()
    print("the whole process takes " + str(t2 - t1) + " seconds")
    p_y_final, _ = forward(x=X_test, w1=W1, b1=b1, w2=W2, b2=b2, method='relu')
    error_final = error_rate(y_matrix=p_y_final, target=Y_test)
    print("the final error rate is " + str(error_final))