Example #1
 def fit(self, X, Y, learning_rate=10e-8, reg=10e-12, epochs=10000, show_fig=False):
     X, Y = shuffle(X, Y)
     Xvalid, Yvalid = X[-1000:], Y[-1000:]
     Tvalid = y2indicator(Yvalid)
     X, Y = X[:-1000], Y[:-1000]
     
     N,D = X.shape
     K = len(set(Y))
     T = y2indicator(Y)
     self.W = np.random.randn(D, K) / np.sqrt(D+K)
     self.b = np.zeros(K)
     
     costs = []
     best_validation_error = 1
     for i in range(epochs):
         pY = self.forward(X)
         
         #gradient descent
         self.W -= learning_rate * (X.T.dot(pY-T) + reg*self.W)
         self.b -= learning_rate * ((pY -T).sum(axis=0) + reg*self.b)
         
         if i % 10 == 0:
             pYvalid = self.forward(Xvalid)
             c = cost(Tvalid, pYvalid)
             costs.append(c)
             e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
             print(f"i: {i}, cost: {c}, error: {e}")
             if e < best_validation_error:
                 best_validation_error = e
     print(best_validation_error)
     
     if show_fig:
         plt.plot(costs)
         plt.show()
    def fit(self, X, Y, learning_rate=1e-7, reg=0., epochs=10000, show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Tvalid = y2indicator(Yvalid)
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)
        self.W = np.random.randn(D, K) / np.sqrt(D)
        self.b = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY = self.forward(X)

            # gradient descent step
            self.W -= learning_rate*(X.T.dot(pY - T) + reg*self.W)
            self.b -= learning_rate*((pY - T).sum(axis=0) + reg*self.b)

            if i % 10 == 0:
                pYvalid = self.forward(Xvalid)
                c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
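The fit() methods above (and most of the examples that follow) lean on a few helper functions that are not shown: shuffle, y2indicator, cost and error_rate. A minimal sketch of what they are assumed to do, so the snippets can be read on their own (note that the argument order of cost varies between examples; this version matches the calls above, targets first):

import numpy as np
from sklearn.utils import shuffle  # shuffles X and Y in unison

def y2indicator(y):
    # turn integer labels of shape (N,) into an N x K one-hot indicator matrix
    y = y.astype(np.int32)
    N, K = len(y), len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y] = 1
    return ind

def cost(T, pY):
    # total cross-entropy between indicator targets T and predicted probabilities pY
    return -(T * np.log(pY)).sum()

def error_rate(targets, predictions):
    # fraction of misclassified samples
    return np.mean(targets != predictions)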
def main():
    X,Y = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(K)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)
Example #4
    def fit(self, X, Y, learning_rate=10e-7,
            reg=10e-7, epoch=10000, show_fig=False):
        #divide into train and test data
        Xtest, Ytest, Xtrain, Ytrain = self.prepare_data(X, Y, multi=True)
        Ttrain = y2indicator(Ytrain)
        Ttest = y2indicator(Ytest)

        costs = []
        best_validation_error = 1
        for i in range(epoch):
            pY, Z = self.forward_multi(Xtrain)  #forward prop

            #back prop
            pY_Y = pY - Ttrain
            self.W2 -= learning_rate * (Z.T.dot(pY_Y) + reg * self.W2)
            self.b2 -= learning_rate * (pY_Y.sum(axis=0) + reg * self.b2)

            #            dZ = np.outer(pY_Y, self.W2)* (Z > 0) #Z > 0 is derivative of ReLU
            #            print pY_Y.shape, self.W2.shape, Z.shape
            dZ = pY_Y.dot(self.W2.T) * (1 - Z * Z)
            self.W1 -= learning_rate * (Xtrain.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (np.sum(dZ, axis=0) + reg * self.b1)

            if i % 10 == 0:
                pYtest, _ = self.forward_multi(Xtest)
                c = cost(Ttest, pYtest)
                costs.append(c)
                e = error_rate(Ytest, np.argmax(pYtest, axis=1))
                print "i: ", i, "cost: ", c, "error: ", e
                if e < best_validation_error: best_validation_error = e
        print "best validation error:", best_validation_error
        self.show_fig_cost(costs, show_fig)
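The backprop step above multiplies the backpropagated error by (1 - Z*Z) because the hidden layer is assumed to use tanh, whose derivative is 1 - tanh(a)^2 = 1 - Z^2 (the commented-out line shows the ReLU counterpart, Z > 0). A quick, purely illustrative numerical check of that identity:

import numpy as np

a = np.random.randn(5)
Z = np.tanh(a)
eps = 1e-6
numeric = (np.tanh(a + eps) - np.tanh(a - eps)) / (2 * eps)  # central difference
analytic = 1 - Z * Z
assert np.allclose(numeric, analytic)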
Example #5
    def __init__(self, Xtrain, Ytrain, Xtest, Ytest):
        print("Initialising NN...")
        self.Xtrain = Xtrain
        self.Ytrain = Ytrain
        self.Xtest = Xtest
        self.Ytest = Ytest

        self.Ytest_ind = y2indicator(self.Ytest)
        self.Ytrain_ind = y2indicator(self.Ytrain)
Example #6
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    
    lr = 0.00004
    reg = 0.01
    N, D = Xtrain.shape
    K = 10
    
    max_iter = 1000
    batch_sz = 500
    n_batches = N // batch_sz
    print_period = 10
    
    W_init = np.random.randn(D, K)
    b_init = np.random.randn(K)
    
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W = tf.Variable(W_init.astype(np.float32))
    b = tf.Variable(b_init.astype(np.float32))
    
    Yish = tf.matmul(X, W) + b
    
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))
    
    #train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)
    
    predict_op = tf.argmax(Yish, 1)
    
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        
        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
                
                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)
                    
    plt.plot(LL)
    plt.show()
def benchmark_full():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    print("Performing logistic regression...")
    # lr = LogisticRegression(solver='lbfgs')

    # convert Ytrain and Ytest to (N x K) matrices of indicator variables
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # reg = 1
    # learning rate 0.0001 is too high, 0.00005 is also too high
    # 0.00003 / 2000 iterations => 0.363 error, -7630 cost
    # 0.00004 / 1000 iterations => 0.295 error, -7902 cost
    # 0.00004 / 2000 iterations => 0.321 error, -7528 cost

    # reg = 0.1, still around 0.31 error
    # reg = 0.01, still around 0.31 error
    lr = 0.00004
    reg = 0.01
    for i in range(500):
        p_y = forward(Xtrain, W, b)
        # print "p_y:", p_y
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    iters = range(len(LL))
    plt.plot(iters, LL, iters, LLtest)
    plt.show()
    plt.plot(CRtest)
    plt.show()
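benchmark_full() relies on forward, gradW and gradb helpers that are not shown. A minimal sketch of what they are assumed to compute for softmax regression; note the updates above are written as gradient ascent on the log-likelihood (W += lr * ...), so the gradients use (T - pY):

import numpy as np

def forward(X, W, b):
    # softmax output, stabilized by subtracting the row-wise max
    a = X.dot(W) + b
    expa = np.exp(a - a.max(axis=1, keepdims=True))
    return expa / expa.sum(axis=1, keepdims=True)

def gradW(T, pY, X):
    # gradient of the log-likelihood with respect to W
    return X.T.dot(T - pY)

def gradb(T, pY):
    return (T - pY).sum(axis=0)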
def sgd_batch():
    """
    use util functions to run the logistic classification with bp
    """
    
    X_train, Y_train, X_test, Y_test = get_transformed_digit()
    
    N,D = X_train.shape
    yindi_train = y2indicator(Y_train)
    yindi_test = y2indicator(Y_test)
    
    M = yindi_test.shape[1]
    
    W = np.random.rand(D,M)
    b = np.random.rand(M)
    
    cost_train = []
    cost_test = []
    error_test = []
    
    eta = 1e-4
    penalty = 1e-2

    batch_size = 500
    batch_num = N // batch_size

    #batch
    for i in range(500):
        X_shuffle,Y_train_shuffle = shuffle(X_train,yindi_train)
        for ii in range(batch_num):
            # x_tem = X_shuffle[ii].reshape(1,D)
            # y_tem = Y_train_shuffle[ii].reshape(1,10)

            # slice the batch with the inner-loop counter ii, not the epoch counter i
            x_tem = X_shuffle[ii*batch_size:(ii+1)*batch_size]
            y_tem = Y_train_shuffle[ii*batch_size:(ii+1)*batch_size]

            y_fit = forward(x = x_tem,w=W,b=b)
            
            W += eta*(deri_w(t_matrix = y_tem, y_matrix = y_fit,x = x_tem)-penalty*W)
            b += eta*(deri_b(t_matrix = y_tem, y_matrix = y_fit)-penalty*b)

            p_y_test = forward(x = X_test,w=W,b=b)
            cost_test_tem = cost(y_matrix = p_y_test,t_matrix = yindi_test)
            cost_test.append(cost_test_tem)

            if ii % 100 == 0:
                error_tem = error_rate(y_matrix = p_y_test, target = Y_test)
                print("the error rate in "+str(ii)+" iteration is :"+str(error_tem))
    
    p_y_final = forward(x = X_test,w=W,b=b)
    error_final = error_rate(y_matrix = p_y_final, target = Y_test)
    print("the final error rate is "+str(error_final))
Example #9
def get_data():
    print('loading data ...')
    data_train = []
    targets_train = []
    data_test = []
    targets_test = []
    with open('../large_files/r8-train-all-terms.txt', encoding='utf-8') as f1:
        for line in f1:
            values = line.split('\t')
            data_train.append(values[1])
            targets_train.append(values[0])
    with open('../large_files/r8-test-all-terms.txt', encoding='utf-8') as f2:
        for line in f2:
            values = line.split('\t')
            data_test.append(values[1])
            targets_test.append(values[0])

    # one-hot encode targets
    # note: LabelEncoder sorts the class names, so fitting separate encoders on train and
    # test gives the same label mapping only if both contain the same set of classes;
    # the shape check below guards against a mismatch
    Ytrain_labels = LabelEncoder().fit_transform(targets_train)
    Ytest_labels = LabelEncoder().fit_transform(targets_test)
    Ytrain = y2indicator(Ytrain_labels)
    Ytest = y2indicator(Ytest_labels)
    print('Ytrain: ', Ytrain.shape)
    print('Ytest: ', Ytest.shape)
    # possible shape problem if K test != K train
    if (Ytrain.shape[1] != Ytest.shape[1]):
        raise ValueError('A very specific bad thing happened.')

    # get an average word vector for the data
    def avgwords(data):
        tot = []
        for article in data:
            totalwordvecs = []
            for word in article.split():
                if word in word2vec:
                    wvec = word2vec[word]
                    totalwordvecs.append(wvec)
                else:
                    # if word not vectorized, return all zeros
                    totalwordvecs.append(np.zeros(int(d)))
            totalwordvecs = np.array(totalwordvecs)
            avgword = np.mean(totalwordvecs, axis=0)
            tot.append(avgword.tolist())
        return np.array(tot)

    Xtrain = avgwords(data_train)
    Xtest = avgwords(data_test)
    print('Xtrain: ', Xtrain.shape)
    print('Xtest: ', Xtest.shape)
    return Xtrain, Xtest, Ytrain, Ytest, Ytrain_labels, Ytest_labels
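avgwords() above closes over two names that are not defined in this snippet: word2vec (a dict mapping each word to its embedding vector) and d (the embedding dimensionality). One illustrative way they might be set up from a GloVe-style text file (the file name and location are assumptions, not taken from the original):

import numpy as np

word2vec = {}
with open('../large_files/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        word2vec[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
d = len(next(iter(word2vec.values())))  # embedding dimension, e.g. 50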
Example #10
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-6,
            reg=1e-6,
            epochs=10000,
            show_fig=False):
        Tvalid = y2indicator(Yvalid)

        N, D = X.shape
        K = len(set(Y) | set(Yvalid))
        T = y2indicator(Y)

        self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY, Z = self.forward(X)

            # Gradient Descent step
            ''' This dataset is where the L2 penalty is first introduced. Note that the L2
            term is already written into the loss function; after differentiating the whole
            loss, ||W||^2 becomes a first-order term in W, as in the update expressions below. '''
            pY_T = pY - T  # compute this once so the updates below are cheaper
            self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
            self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
            # dZ = pY_T.dot(self.W2.T) * (Z > 0) # relu
            dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
            self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            if i % 20 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                # c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print('best_validation_error:', best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
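To spell out the translated comment above: the loss assumed here is the cross-entropy plus an L2 penalty of the form (reg/2)*||W||^2, and differentiating the squared penalty leaves a term that is first-order in W, which is exactly the reg * W piece added to each gradient:

J = -sum_{n,k} T[n,k] * log(pY[n,k]) + (reg/2) * (||W1||^2 + ||W2||^2)
dJ/dW2 = Z.T.dot(pY - T) + reg * W2
dJ/dW1 = X.T.dot(dZ) + reg * W1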
    def fit(self,
            X,
            Y,
            learning_rate=10e-06,
            reg=10e-1,
            epochs=10000,
            show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Tvalid = y2indicator(Yvalid)
        X, Y = X[:-1000], Y[:-1000]
        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)

        self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation
            pY, Z = self.forward(X)

            #gradient descent step
            pY_T = pY - T
            self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
            self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)

            # dZ = pY_T.dot(self.W2.T) * (Z > 0) # relu
            dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
            self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (np.sum(dZ, axis=0) + reg * self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i: ", i, "cost: ", c, "error: ", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best validation error = ", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self,
            X,
            T,
            learning_rate=10e-8,
            reg=10e-12,
            epochs=10000,
            show_fig=False):
        X, T = shuffle(X, T)
        X_train, T_train = X[:-1000], T[:-1000]
        X_valid, T_valid = X[-1000:], T[-1000:]

        N, D = X_train.shape
        K = len(set(T_train))
        T_train_ind = y2indicator(T_train)

        # initialize parameters: scale W by 1/sqrt(D + K) so its variance does not depend on the layer sizes
        self.W = np.random.randn(D, K) / np.sqrt(D + K)
        self.b = np.zeros(K)

        costs = []
        best_validation_error = 1
        for n in range(epochs):
            # forwardpropogation process
            Y = self.forwardprop(X_train)

            #Gradient descent
            Y_T = Y - T_train_ind
            self.W -= learning_rate * (X_train.T.dot(Y_T) + reg * self.W)
            self.b -= learning_rate * (Y_T.sum(axis=0) + reg * self.b)

            #presentation
            if n % 10 == 0:
                Y_valid = self.forwardprop(X_valid)
                T_valid_ind = y2indicator(T_valid)
                c = cost(T_valid_ind, Y_valid)
                costs.append(c)
                er = error_rate(T_valid, self.predict(X_valid))
                print(n, 'cost', c, 'error', er)
                if er < best_validation_error:
                    best_validation_error = er

        print('Best validation error', best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.title('cross entropy loss')
            plt.show()
def main():
    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    def fit(self,
            X,
            Y,
            learning_rate=10e-6,
            regularisation=10e-1,
            epochs=10000,
            show_fig=False):
        X, Y = shuffle(X, Y)

        # print("X.shape"+str(X.shape))
        # print("Y.shape"+str(Y.shape))
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        # Tvalid = y2indicator(Yvalid)  # not needed here because cost2 works directly with the integer labels
        X, Y = X[:-1000], Y[:-1000]
        # print("X.shape"+str(X.shape))
        # print("Y.shape"+str(Y.shape))
        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)  #Need this for gradient descent

        self.W1, self.b1 = init_weight_and_bias(D, self.M)
        self.W2, self.b2 = init_weight_and_bias(self.M, K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation
            pY, Z = self.forward(X)

            # gradient descent
            pY_T = pY - T
            self.W2 -= learning_rate * (Z.T.dot(pY_T) +
                                        regularisation * self.W2)
            self.b2 -= learning_rate * (
                (pY_T).sum(axis=0) + regularisation * self.b2)

            # dZ = pY_T.dot(self.W2.T) * (Z>0) #Relu
            dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # Tanh
            self.W1 -= learning_rate * (X.T.dot(dZ) + regularisation * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) +
                                        regularisation * self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i : " + str(i) + "; Cost : " + str(c) + "; Error : " +
                      str(e))
                if e < best_validation_error:
                    best_validation_error = e

        print("Best Validation error : " + str(best_validation_error))

        if (show_fig):
            plt.plot(costs)
            plt.show()
Example #15
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
Example #16
    def fit(self,X,Y,learning_rate=10e-8,regularisation=10e-12,epochs=10000,show_fig=False):
        X,Y = shuffle(X,Y)

        # print("X.shape"+str(X.shape))
        # print("Y.shape"+str(Y.shape))
        Xvalid, Yvalid = X[-1000:],Y[-1000:]
        Tvalid = y2indicator(Yvalid)
        X,Y = X[:-1000],Y[:-1000]
        # print("X.shape"+str(X.shape))
        # print("Y.shape"+str(Y.shape))
        N,D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)


        self.W,self.b = init_weight_and_bias(D,K)


        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation
            pY = self.forward(X)

            # gradient descent
            self.W -= learning_rate*(X.T.dot(pY-T) + regularisation*self.W)
            self.b -= learning_rate*((pY-T).sum(axis=0) + regularisation*self.b)


            if i%10 ==0 :
                pYvalid = self.forward(Xvalid)
                c = cost(Tvalid,pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid,axis=1))
                print("i : "+str(i)+"; Cost : "+str(c)+"; Error : "+str(e))
                if e < best_validation_error:
                    best_validation_error = e

        print("Best Validation error : "+str(best_validation_error))

        if(show_fig):
            plt.plot(costs)
            plt.show()
def fit(X, Y, show_fig=False):

    K = len(set(Y))
    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
    tfT = tf.placeholder(tf.float32, shape=(None, K), name='T')
    prediction = neural_network(D, K, tfX)

    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=tfT))

    train_op = tf.train.RMSPropOptimizer(learning_rate,
                                         decay=decay,
                                         momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    epoch_loss = 0
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})
                if j % 20 == 0:
                    _, c = session.run([train_op, cost],
                                       feed_dict={
                                           tfX: Xbatch,
                                           tfT: Ybatch
                                       })
                    costs.append(c)
                    p = session.run(tf.argmax(prediction, 1),
                                    feed_dict={
                                        tfX: Xvalid,
                                        tfT: Yvalid
                                    })
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
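fit() above assumes a module-level neural_network(D, K, tfX) builder plus hyperparameters (learning_rate, decay, mu, batch_sz, epochs) defined elsewhere in the file. A minimal sketch of what those might look like; the hidden-layer size and the numeric values are assumptions, not taken from the original:

import numpy as np
import tensorflow as tf

learning_rate = 1e-4
decay = 0.99
mu = 0.9
batch_sz = 100
epochs = 10

def neural_network(D, K, tfX, hidden=300):
    # one hidden ReLU layer producing the logits; the softmax is applied inside the cost
    W1 = tf.Variable(tf.random_normal([D, hidden], stddev=1.0 / np.sqrt(D)))
    b1 = tf.Variable(tf.zeros([hidden]))
    W2 = tf.Variable(tf.random_normal([hidden, K], stddev=1.0 / np.sqrt(hidden)))
    b2 = tf.Variable(tf.zeros([K]))
    Z = tf.nn.relu(tf.matmul(tfX, W1) + b1)
    return tf.matmul(Z, W2) + b2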
    def fit(self,
            X,
            Y,
            learning_rate=1e-7,
            reg=0.,
            epochs=10000,
            show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Tvalid = y2indicator(Yvalid)
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)
        self.W = np.random.randn(D, K) / np.sqrt(D)
        self.b = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY = self.forward(X)

            # gradient descent step
            self.W -= learning_rate * (X.T.dot(pY - T) + reg * self.W)
            self.b -= learning_rate * ((pY - T).sum(axis=0) + reg * self.b)

            if i % 10 == 0:
                pYvalid = self.forward(Xvalid)
                c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #19
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    def fit(self,
            X,
            T,
            learning_rate=10e-7,
            reg=10e-7,
            epochs=10000,
            show_fig=False):
        X, T = shuffle(X, T)
        X_train, T_train = X[:-1000], T[:-1000]
        X_valid, T_valid = X[-1000:], T[-1000:]

        N, D = X_train.shape
        K = len(set(T_train))

        #initialize parameters
        self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for n in range(epochs):
            #forwardpropogation process
            Y, Z = self.forwardprop(X_train)

            #Gradient Descent
            T_train_ind = y2indicator(T_train)
            Y_T = Y - T_train_ind
            self.W2 -= learning_rate * (Z.T.dot(Y_T) + reg * self.W2)
            self.b2 -= learning_rate * (Y_T.sum(axis=0) + reg * self.b2)

            dZ = Y_T.dot(self.W2.T) * (1 - Z * Z)
            self.W1 -= learning_rate * (X_train.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            #representation of validation cost and error rate
            if n % 10 == 0:
                Y_valid, _ = self.forwardprop(X_valid)
                cost = cost2(T_valid, Y_valid)
                costs.append(cost)
                er = error_rate(T_valid, np.argmax(Y_valid, axis=1))
                print(n, 'cost:', cost, 'error', er)
                if er < best_validation_error:
                    best_validation_error = er
        print('Best validation error:', best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.title('cross entropy loss')
            plt.show()
Example #21
    def fit(self,
            X,
            Y,
            learning_rate=1e-6,
            reg=1e-6,
            epochs=10000,
            show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = max(Y) + 1
        T = y2indicator(Y)
        self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1

        # forward propagation and cost calculation
        for i in range(epochs):
            pY, Z = self.forward(X)

            # gradient descent step
            pY_T = pY - T
            self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
            self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
            # dZ = pY_T.dot(self.W2.T) * (Z > 0) #relu
            dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
            self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print('i: ', i, 'cost:', c, 'error: ', e)
                if e < best_validation_error:
                    best_validation_error = e
        print('best_validation_error: ', best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #22
    def fit(self,
            X_train,
            labels_train,
            X_val,
            labels_val,
            learning_rate=5e-7,
            lambda_=1e0,
            epochs=5000,
            show_fig=False):
        N, D = X_train.shape
        K = len(set(labels_train))
        Y_train = y2indicator(labels_train)
        self.W1 = np.random.randn(D, self.M) * np.sqrt(2 / (D + self.M))
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) * np.sqrt(2 / (self.M + K))
        self.b2 = np.zeros(K)

        costs = []
        best_val_error = 1
        for i in range(epochs):
            # Forward Propagation
            Y_train_pred, Z = self.forward(X_train)

            # Gradient Descent step
            delta2 = Y_train_pred - Y_train
            self.W2 -= learning_rate * (Z.T.dot(delta2) + lambda_ * self.W2)
            self.b2 -= learning_rate * (delta2.sum(axis=0) + lambda_ * self.b2)

            #delta1 = np.outer(delta2, self.W2) * (Z > 0)
            delta1 = delta2.dot(self.W2.T) * (1 - Z * Z)
            self.W1 -= learning_rate * (X_train.T.dot(delta1) +
                                        lambda_ * self.W1)
            self.b1 -= learning_rate * (delta1.sum(axis=0) + lambda_ * self.b1)

            if i % 50 == 0:
                Y_val_pred, _ = self.forward(X_val)
                c = softmax_cost2(labels_val, Y_val_pred)
                costs.append(c)
                e = error_rate(labels_val, np.argmax(Y_val_pred, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_val_error:
                    best_val_error = e
        print("best_val_error:", best_val_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #23
    def fit(self,
            X,
            Y,
            learning_rate=10e-7,
            reg=10e-7,
            epochs=10000,
            show_fig=False):
        # Tvalid = y2indicator(Yvalid)
        Xvalid, Yvalid, X, Y = splitTrainTestFromLast(X, Y, 1000)

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)
        self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY, Z = self.forward(X)

            # gradient descent step
            pY_T = pY - T
            self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
            self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
            # dZ = pY_T.dot(self.W2.T) * (Z > 0) # relu
            dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
            self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print "i:", i, "cost:", c, "error:", e
                if e < best_validation_error:
                    best_validation_error = e
        print "best_validation_error:", best_validation_error

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #24
    def fit(self,
            X,
            Y,
            learning_rate=10e-1,
            activation=tf.nn.sigmoid,
            epochs=20):
        N, T, D = X.shape
        Y_flat = np.copy(Y)
        Y = y2indicator(Y)

        self.f = activation

        batch_count = N // self.batch_size
        costs = []

        for i in range(epochs):
            batch_grp = np.arange(0, self.batch_size)

            for j in range(batch_count):
                Xbatch, Ybatch = X[batch_grp], Y[batch_grp]
                Xbatch = Xbatch.reshape(
                    (self.batch_size, self.chunk_size, self.input_size))
                batch_grp += self.batch_size

                self.session.run([self.train_op, self.cost_op, self.predict_op],
                            feed_dict={
                                self.Xin: Xbatch,
                                self.labels: Ybatch
                            })

                if j % 20 == 0:
                    testbatch_grp = np.random.choice(N,
                                                     self.batch_size,
                                                     replace=True)

                    c, p = self.session.run([self.cost_op, self.predict_op],
                                            feed_dict={
                                                self.Xin: X[testbatch_grp],
                                                self.labels: Y[testbatch_grp]
                                            })

                    a = accuracy(Y_flat[testbatch_grp], p)
                    print("i:", i, "j:", j, "nb:", batch_count, "cost:", c,
                          "accuracy:", a)
Example #25
    def fit(self,
            X,
            Y,
            learning_rate=1e-3,
            epochs=2,
            batch_size=100,
            test_size=1000):

        N, *D = X.shape

        Y_flat = np.copy(Y)
        Y = y2indicator(Y)

        batch_count = N // batch_size
        costs = []
        for i in range(epochs):
            batch_grp = np.arange(0, batch_size)
            for j in range(batch_count):

                Xbatch, Ybatch = X[batch_grp], Y[batch_grp]
                batch_grp += batch_size

                self.session.run([self.train_op, self.cost],
                                 feed_dict={
                                     self.Xin: Xbatch,
                                     self.labels: Ybatch
                                 })

                if j % 20 == 0:
                    testbatch_grp = np.random.choice(N,
                                                     test_size,
                                                     replace=True)

                    c, p = self.session.run([self.cost, self.predictions],
                                            feed_dict={
                                                self.Xin: X[testbatch_grp],
                                                self.labels: Y[testbatch_grp]
                                            })

                    costs.append(c)
                    a = accuracy(Y_flat[testbatch_grp], p)
                    print("i:", i, "j:", j, "nb:", batch_count, "cost:", c,
                          "accuracy:", a)
    def fit(self, X, Y, learning_rate=1e-6, reg=1e-6, epochs=10000, show_fig=False):
        X, Y = shuffle(X, Y)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        # Tvalid = y2indicator(Yvalid)
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape
        K = len(set(Y))
        T = y2indicator(Y)
        self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
        self.b1 = np.zeros(self.M)
        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY, Z = self.forward(X)

            # gradient descent step
            pY_T = pY - T
            self.W2 -= learning_rate*(Z.T.dot(pY_T) + reg*self.W2)
            self.b2 -= learning_rate*(pY_T.sum(axis=0) + reg*self.b2)
            # dZ = pY_T.dot(self.W2.T) * (Z > 0) # relu
            dZ = pY_T.dot(self.W2.T) * (1 - Z*Z) # tanh
            self.W1 -= learning_rate*(X.T.dot(dZ) + reg*self.W1)
            self.b1 -= learning_rate*(dZ.sum(axis=0) + reg*self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
        print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self,
            X_train,
            labels_train,
            X_val,
            labels_val,
            learning_rate=5e-7,
            lambda_=1e0,
            epochs=5000,
            show_fig=False):
        N, D = X_train.shape
        K = len(set(labels_train))
        Y_train = y2indicator(labels_train)
        self.W = np.random.randn(D, K) * np.sqrt(1 / D)
        self.b = np.zeros(K)

        costs = []
        best_val_error = 1
        for i in range(epochs):
            # Forward propagation
            Y_train_pred = self.forward(X_train)

            # Gradient descent
            self.W -= learning_rate * (X_train.T.dot(Y_train_pred - Y_train) +
                                       lambda_ * self.W)
            self.b -= learning_rate * (
                (Y_train_pred - Y_train).sum(axis=0) + lambda_ * self.b)

            if i % 50 == 0:
                Y_val_pred = self.forward(X_val)
                c = softmax_cost2(labels_val, Y_val_pred)
                costs.append(c)
                e = error_rate(labels_val, np.argmax(Y_val_pred, axis=1))
                print("Epoch:", i, "Cost:", c, "Error rate", e)
                if e < best_val_error:
                    best_val_error = e
        print("Best validation error", best_val_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
def main():
    # load the data
    K = 10
    (Xtrain, Ytrain), (Xtest, Ytest) = cifar10_test()
    Xtrain = (Xtrain / 255).astype(np.float32)
    Xtest = (Xtest / 255).astype(np.float32)
    Ytrain_ind = y2indicator(Ytrain, K).astype(np.int32)
    Ytest_ind = y2indicator(Ytest, K).astype(np.int32)

    print(Xtrain.shape)
    print(Ytrain_ind.shape)
    epoch = 200
    print_period = 10
    N = Xtrain.shape[0]
    batch_sz = 250
    n_batches = N // batch_sz
    n_batches_test = Xtest.shape[0] // batch_sz

    M = 512
    M1 = 512
    #K = 10 ABOVE
    poolsz = (2, 2)

    W1_shape = (5, 5, 3, 16)  #32 / 2 = 16
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    W2_shape = (5, 5, 16, 40)  # 16 / 2= 8
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    W3_shape = (5, 5, 40, 100)  # 8 / 2 =4
    W3_init = init_filter(W3_shape, poolsz)
    b3_init = np.zeros(W3_shape[-1], dtype=np.float32)

    W4_shape = (5, 5, 100, 196)  #4 / 2 = 2
    W4_init = init_filter(W4_shape, poolsz)
    b4_init = np.zeros(W4_shape[-1], dtype=np.float32)

    W5_init = np.random.randn(W4_shape[-1] * 2 * 2,
                              M) / np.sqrt(W4_shape[-1] * 2 * 2 + M)
    b5_init = np.zeros(M, dtype=np.float32)
    W6_init = np.random.randn(M, M1) / np.sqrt(M + M1)
    b6_init = np.zeros(M1, dtype=np.float32)
    W7_init = np.random.randn(M1, K) / np.sqrt(M1 + K)
    b7_init = np.zeros(K, dtype=np.float32)

    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T')

    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))

    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))

    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    W5 = tf.Variable(W5_init.astype(np.float32))
    b5 = tf.Variable(b5_init.astype(np.float32))

    W6 = tf.Variable(W6_init.astype(np.float32))
    b6 = tf.Variable(b6_init.astype(np.float32))

    W7 = tf.Variable(W7_init.astype(np.float32))
    b7 = tf.Variable(b7_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = convpool(Z2, W3, b3)
    Z4 = convpool(Z3, W4, b4)
    Z4_shape = Z4.get_shape().as_list()

    Z4r = tf.reshape(Z4, [-1, np.prod(Z4_shape[1:])])
    Z5 = tf.nn.relu(tf.matmul(Z4r, W5) + b5)
    Z6 = tf.nn.relu(tf.matmul(Z5, W6) + b6)
    Yish = tf.matmul(Z6, W7) + b7

    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=Yish, labels=T))
    cost_test = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=Yish, labels=T))

    train_op = tf.train.AdamOptimizer(0.001).minimize(cost)

    predict_op = tf.argmax(tf.nn.softmax(Yish), axis=1)

    t0 = time.time()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(epoch):
            for j in range(n_batches):

                Xbatch = Xtrain[j * batch_sz:batch_sz * (j + 1)]
                Ybatch = Ytrain_ind[j * batch_sz:batch_sz * (j + 1)]

                if len(Xbatch) == batch_sz:
                    session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                    if j % print_period == 0:
                        # due to RAM limitations we need to have a fixed size input
                        # so as a result, we have this ugly total cost and prediction computation
                        test_cost = 0
                        prediction = np.zeros(len(Xtest))
                        for k in range(Xtest.shape[0] // batch_sz):
                            Xtestbatch = Xtest[k * batch_sz:batch_sz * (k + 1)]
                            Ytestbatch = Ytest_ind[k * batch_sz:batch_sz *
                                                   (k + 1), ]
                            test_cost += session.run(cost_test,
                                                     feed_dict={
                                                         X: Xtestbatch,
                                                         T: Ytestbatch
                                                     })
                            prediction[k * batch_sz:batch_sz *
                                       (k + 1)] = session.run(
                                           predict_op,
                                           feed_dict={X: Xtestbatch})
                        err = error_rate(prediction,
                                         np.argmax(Ytest_ind, axis=1))
                        print(
                            f'epoch {i}: error rate {round(err, 5)}, test cost {round(test_cost, 5)}'
                        )
                        LL.append(test_cost)

    plt.plot(LL)
    plt.show()
def main():
    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. const
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 2. RMSprop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001 # if you set this too high you'll get NaN!
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 0.0000000001
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
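For reference, the hand-rolled RMSprop in the second half of main() keeps a running average of squared gradients per parameter and scales each step by its inverse square root, so each parameter effectively gets its own learning rate lr0 / (sqrt(cache) + eps). In the notation of the code, for a parameter with regularized gradient g:

cache = decay_rate * cache + (1 - decay_rate) * g**2
param -= lr0 * g / (np.sqrt(cache) + eps)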
# installation is easy! just the usual "sudo pip(3) install keras"


# get the data, same as Theano + Tensorflow examples
# no need to split now, the fit() function will do it
X, Y = get_normalized_data()

# get shapes
N, D = X.shape
K = len(set(Y))

# by default Keras wants one-hot encoded labels
# there's another cost function we can use
# where we can just pass in the integer labels directly
# just like Tensorflow / Theano
Y = y2indicator(Y)


# the model will be a sequence of layers
model = Sequential()


# ANN with layers [784] -> [500] -> [300] -> [10]
model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300)) # don't need to specify input_dim
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))

# installation is easy! just the usual "sudo pip(3) install keras"


# get the data, same as Theano + Tensorflow examples
# no need to split now, the fit() function will do it
Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

# get shapes
N, D = Xtrain.shape
K = len(set(Ytrain))

# by default Keras wants one-hot encoded labels
# there's another cost function we can use
# where we can just pass in the integer labels directly
# just like Tensorflow / Theano
Ytrain = y2indicator(Ytrain)
Ytest = y2indicator(Ytest)


# the model will be a sequence of layers
model = Sequential()


# ANN with layers [784] -> [500] -> [300] -> [10]
model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300)) # don't need to specify input_dim
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))
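Both Keras snippets above stop at the model definition. The comment notes that Keras can also accept the integer labels directly instead of one-hot targets; a minimal sketch of compiling and training either way (the optimizer, batch size and epoch count here are assumptions, not taken from the original):

# with one-hot targets, as produced by y2indicator above
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
r = model.fit(Xtrain, Ytrain, validation_data=(Xtest, Ytest), epochs=15, batch_size=32)

# alternatively, keep the integer labels and use the sparse variant of the loss
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])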
def main():
    # 1.batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # number of hidden neurons
    K = 10  # number of output classes

    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    #1. batch SGD
    LL_batch = []
    CR_batch = []

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) +
                        reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    #2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]

            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) +
                                   reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) +
                                   reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    #3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nesterov = []
    CR_nesterov = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * mu * dW2 - (1 + mu) * lr * (
                derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            W2 += dW2
            db2 = mu * mu * db2 - (1 + mu) * lr * (
                derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * mu * dW1 - (1 + mu) * lr * (
                derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * mu * db1 - (1 + mu) * lr * (
                derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nesterov.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nesterov.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label='momentum')
    plt.plot(LL_nesterov, label='nesterov')
    plt.legend()
    plt.show()
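For reference, the two momentum variants implemented above correspond to the standard velocity updates, with v the accumulated step, mu the momentum coefficient, lr the learning rate and g the regularized gradient:

v = mu * v - lr * g;                 param += v    # classical momentum
v = mu*mu * v - (1 + mu) * lr * g;   param += v    # Nesterov momentum, in the rewritten form used above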
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    mu = 0.9

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)


    # define variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu( tf.matmul(X, W1) + b1 )
    Z2 = tf.nn.relu( tf.matmul(Z1, W2) + b2 )
    Yish = tf.matmul(Z2, W3) + b3 # remember, the cost function does the softmaxing! weird, right?

    # softmax_cross_entropy_with_logits take in the "logits"
    # if you wanted to know the actual output of the neural net,
    # you could pass "Yish" into tf.nn.softmax(logits)
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    # we choose the optimizer but don't implement the algorithm ourselves
    # let's go with RMSprop, since we just learned about it.
    # it includes momentum!
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print "Cost / err at iteration i=%d, j=%d: %.6f / %.3f" % (i, j, test_cost, err)
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
def main():
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    thZ = relu( thX.dot(W1) + b1 )
    thY = T.nnet.softmax( thZ.dot(W2) + b2 )

    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                LL.append(cost_val)

    plt.plot(LL)
    plt.show()
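
# The Theano `train` function above works entirely through its `updates` list: each
# (shared_variable, expression) pair tells Theano to overwrite that shared variable with
# the expression's value after every call. A small, self-contained sketch of the mechanism
# (the classic accumulator example, not part of the original script):
import theano
import theano.tensor as T

state = theano.shared(0, name='state')   # plays the role of W1/b1 above
inc = T.iscalar('inc')
accumulate = theano.function([inc], state, updates=[(state, state + inc)])

accumulate(1)                            # state is now 1
accumulate(10)                           # state is now 11
print(state.get_value())                 # -> 11
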
    def fit(self, X, Y, lr=10e-4, mu=0.99, reg=10e-4, decay=0.99999, eps=10e-3, batch_sz=30, epochs=3, show_fig=True):
        lr = np.float32(lr)
        mu = np.float32(mu)
        reg = np.float32(reg)
        decay = np.float32(decay)
        eps = np.float32(eps)
        K = len(set(Y))

        # make a validation set
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = y2indicator(Y).astype(np.float32)

        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]
        Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate

        # initialize convpool layers
        N, width, height, c = X.shape
        mi = c
        outw = width
        outh = height
        self.convpool_layers = []
        for mo, fw, fh in self.convpool_layer_sizes:
            layer = ConvPoolLayer(mi, mo, fw, fh)
            self.convpool_layers.append(layer)
            outw = outw // 2
            outh = outh // 2
            mi = mo

        # initialize mlp layers
        self.hidden_layers = []
        M1 = self.convpool_layer_sizes[-1][0]*outw*outh # size must be same as output of last convpool layer
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1

        # logistic regression layer
        W, b = init_weight_and_bias(M1, K)
        self.W = tf.Variable(W, name='W_logreg')
        self.b = tf.Variable(b, name='b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.convpool_layers:
            self.params += h.params
        for h in self.hidden_layers:
            self.params += h.params

        # set up tensorflow functions and variables
        tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X')
        tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y')
        act = self.forward(tfX)

        rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params])
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=act, labels=tfY)) + rcost
        prediction = self.predict(tfX)

        train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)

        n_batches = N // batch_sz
        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                for j in range(n_batches):
                    Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                    Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                    session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch})

                    if j % 20 == 0:
                        c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid})
                        costs.append(c)

                        p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid})
                        e = error_rate(Yvalid_flat, p)
                        print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e

        if show_fig:
            plt.plot(costs)
            plt.show()
def main():
    max_iter = 10
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1. Adam
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1 ** t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2 ** t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)


            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    loss_rms = []
    err_rms = []

    # comparable hyperparameters for fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0
    
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
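
# Why the Adam loop above divides by (1 - beta**t): the running averages mW*/vW* start at
# zero, so for the first few steps they underestimate the true gradient statistics. A quick
# check of the correction factors (same beta values as above) shows the correction matters
# early on and fades as training proceeds:
beta1, beta2 = 0.9, 0.999
for t in (1, 10, 100, 1000):
    print(t, 1 - beta1 ** t, 1 - beta2 ** t)
# t=1    -> 0.1 and 0.001 (estimates get scaled up by 10x and 1000x)
# t=1000 -> ~1.0 and ~0.63 (almost no correction left for the first moment)
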
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(50):
        p_y = forward(Xtrain, W, b)

        W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W)
        b += lr*(gradb(Ytrain_ind, p_y) - reg*b)
        

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)


    # 2. stochastic
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(50): # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)): # shortcut so it won't take so long...
            x = tmpX[n,:].reshape(1,D)
            y = tmpY[n,:].reshape(1,10)
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)


    # 3. batch
    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
            y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)



    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
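
# The loops above call gradW/gradb, which live in the course's utility module. Assuming
# they implement the usual softmax-regression log-likelihood gradients (the ascent form,
# which is why the updates are written as W += lr*(...)), they would look roughly like
# these sketches (names suffixed with _sketch to make clear they are illustrative):
import numpy as np

def gradW_sketch(T, pY, X):
    # d(log-likelihood)/dW for softmax regression: X^T (targets - predictions)
    return X.T.dot(T - pY)

def gradb_sketch(T, pY):
    # d(log-likelihood)/db: column sums of (targets - predictions)
    return (T - pY).sum(axis=0)
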
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))


    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    losses_nesterov = []
    errors_nesterov = []

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))



    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
Exemple #39
0
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-2,
            mu=0.99,
            decay=0.999,
            reg=1e-3,
            epochs=10,
            batch_sz=100,
            show_fig=False):
        K = len(set(Y))

        # shuffle the training data and convert targets to an indicator matrix
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = y2indicator(Y).astype(np.float32)

        # keep flat labels for calculating the error rate
        Yvalid_flat = Yvalid
        Yvalid = y2indicator(Yvalid).astype(np.float32)

        #initialize hidden layers
        N, D = X.shape

        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1

        W, b = init_weight_and_bias(M1, K)
        self.W = tf.Variable(W.astype(np.float32))
        self.b = tf.Variable(b.astype(np.float32))

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up tensorflow functions and variables
        tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
        tfT = tf.placeholder(tf.float32, shape=(None, K), name='T')
        act = self.forward(tfX)

        rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
        cost = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits(logits=act,
                                                    labels=tfT)) + rcost
        prediction = self.predict(tfX)
        train_op = tf.train.RMSPropOptimizer(learning_rate,
                                             decay=decay,
                                             momentum=mu).minimize(cost)

        n_batches = N // batch_sz
        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                for j in range(n_batches):
                    Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                    Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                    session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                    if j % 20 == 0:
                        c = session.run(cost,
                                        feed_dict={
                                            tfX: Xvalid,
                                            tfT: Yvalid
                                        })
                        costs.append(c)

                        p = session.run(prediction,
                                        feed_dict={
                                            tfX: Xvalid,
                                            tfT: Yvalid
                                        })
                        e = error_rate(Yvalid_flat, p)
                        print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                              "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self, X, Y, learning_rate=1e-2, mu=0.99, decay=0.999, reg=1e-3, epochs=10, batch_sz=100, show_fig=False):
        K = len(set(Y)) # won't work later b/c we turn it into indicator

        # make a validation set
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = y2indicator(Y).astype(np.float32)
        # Y = Y.astype(np.int32)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate
        X, Y = X[:-1000], Y[:-1000]

        # initialize hidden layers
        N, D = X.shape
        
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W, b = init_weight_and_bias(M1, K)
        self.W = tf.Variable(W.astype(np.float32))
        self.b = tf.Variable(b.astype(np.float32))

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up tensorflow functions and variables
        tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
        tfT = tf.placeholder(tf.float32, shape=(None, K), name='T')
        act = self.forward(tfX)

        rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params])
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=act,
                labels=tfT
            )
        ) + rcost
        prediction = self.predict(tfX)
        train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)

        n_batches = N // batch_sz
        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                for j in range(n_batches):
                    Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                    Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                    session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                    if j % 20 == 0:
                        c = session.run(cost, feed_dict={tfX: Xvalid, tfT: Yvalid})
                        costs.append(c)

                        p = session.run(prediction, feed_dict={tfX: Xvalid, tfT: Yvalid})
                        e = error_rate(Yvalid_flat, p)
                        print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)
        
        if show_fig:
            plt.plot(costs)
            plt.show()
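
# A hypothetical usage sketch for the fit() method above (the class name, constructor
# argument, and data loader are assumptions for illustration; only the fit() keyword
# arguments come from the code itself):
#
#     X, Y = get_normalized_data()                  # features and integer labels
#     model = ANN(hidden_layer_sizes=[500, 300])
#     model.fit(X, Y, learning_rate=1e-2, epochs=10, batch_sz=100, show_fig=True)
#
# fit() shuffles the data, holds out the last 1000 rows as a validation set, and prints
# the validation cost and error rate every 20 batches.
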
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print "Performing logistic regression..."
    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W)
        b += lr*(gradb(Ytrain_ind, p_y) - reg*b)
        

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)


    # 2. stochastic
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(1): # takes very long since we're computing cost for 41k samples
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)): # shortcut so it won't take so long...
            x = tmpX[n,:].reshape(1,D)
            y = tmpY[n,:].reshape(1,10)
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)


    # 3. batch
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
            y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)



    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    # initialize variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    # the cost function does the softmaxing! SO NO SOFTMAXING HERE
    Yish = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    predict_op = tf.argmax(Yish, 1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20 # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest  = X[-1000:,]
    Ytest  = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print "Error rate:", err

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)


    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print "pY:", pY
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print "Error rate:", err
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print "Final error rate:", error_rate(pY, Ytest)



    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
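
# Note on the Nesterov branch above: instead of the two-step form used in the other
# version of this script (v = mu*v - lr*g, then W += mu*v - lr*g), it folds the
# look-ahead into a single velocity update, dW = mu*mu*dW - (1 + mu)*lr*g followed by
# W += dW. Expanding the two-step form gives a per-step change of
# mu*(mu*v_old - lr*g) - lr*g = mu*mu*v_old - (1 + mu)*lr*g, which matches the one-step
# expression up to how the velocity variable is defined, so the two variants apply
# essentially the same update.
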
    def fit(self, X, Y, Xvalid, Yvalid, lr=1e-2, mu=0.9, reg=1e-3, decay=0.99999, eps=1e-10, batch_sz=30, epochs=5, show_fig=True):
        lr = np.float32(lr)
        mu = np.float32(mu)
        reg = np.float32(reg)
        decay = np.float32(decay)
        eps = np.float32(eps)
        K = len(set(Y))

        # make a validation set
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = y2indicator(Y).astype(np.float32)

        Yvalid = y2indicator(Yvalid).astype(np.float32)
        Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate

        # initialize convpool layers
        N, width, height, c = X.shape
        mi = c
        outw = width
        outh = height
        self.convpool_layers = []
        for mo, fw, fh in self.convpool_layer_sizes:
            layer = ConvPoolLayer(mi, mo, fw, fh)
            self.convpool_layers.append(layer)
            outw = outw // 2
            outh = outh // 2
            mi = mo

        # initialize mlp layers
        self.hidden_layers = []
        M1 = self.convpool_layer_sizes[-1][0]*outw*outh # size must be same as output of last convpool layer
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1

        # logistic regression layer
        W, b = init_weight_and_bias(M1, K)
        self.W = tf.Variable(W, name='W_logreg')
        self.b = tf.Variable(b, name='b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.convpool_layers:
            self.params += h.params
        for h in self.hidden_layers:
            self.params += h.params

        # set up tensorflow functions and variables
        tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X')
        tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y')
        act = self.forward(tfX)

        rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params])
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=act,
                labels=tfY
            )
        ) + rcost
        prediction = self.predict(tfX)

        train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)

        n_batches = N // batch_sz
        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                for j in range(n_batches):
                    Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                    Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                    session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch})

                    if j % 20 == 0:
                        c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid})
                        costs.append(c)

                        p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid})
                        e = error_rate(Yvalid_flat, p)
                        print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
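
# A hypothetical usage sketch for the convolutional fit() above (the class name, constructor
# arguments, and data loader are assumptions for illustration; the fit() signature and the
# (feature maps, filter width, filter height) tuple layout come from the code itself):
#
#     Xtrain, Ytrain, Xvalid, Yvalid = get_image_data()   # X shaped (N, width, height, channels)
#     model = CNN(
#         convpool_layer_sizes=[(20, 5, 5), (20, 5, 5)],
#         hidden_layer_sizes=[500, 300],
#     )
#     model.fit(Xtrain, Ytrain, Xvalid, Yvalid, lr=1e-2, batch_sz=30, epochs=5, show_fig=True)
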
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.0004
    reg = 0.01

    Xtrain = Xtrain.astype(np.float32)
    Ytrain = Ytrain.astype(np.float32)
    Ytrain_ind = y2indicator(Ytrain).astype(np.float32)
    Ytest_ind = y2indicator(Ytest).astype(np.float32)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = relu( thX.dot(W1) + b1 ) # relu is new in version 0.7.1 but just in case you don't have it
    thY = T.nnet.softmax( thZ.dot(W2) + b2 )

    # define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    # step 3: training expressions and functions
    # we can just include regularization as part of the cost because it is also automatically differentiated!
    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    costs = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()
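
# The four update_* expressions above are repetitive; an equivalent, slightly more compact
# pattern (a sketch, not from the original file) builds the updates list in a loop, which
# scales better as layers are added:
#
#     params = [W1, b1, W2, b2]
#     updates = [(p, p - lr * T.grad(cost, p)) for p in params]
#     train = theano.function(inputs=[thX, thT], updates=updates)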