def main():

    X, Y = get_ecommerce(user_action=None)
    X, Y = shuffle(X, Y)

    N, D = X.shape
    Y = Y.astype(np.int32)
    K = len(np.unique(Y))

    Ntrain = N - 100
    Xtrain, Ytrain = X[:Ntrain, :], Y[:Ntrain]
    Ytrain_ind = y2indicator(Ytrain, K)

    Ntest = 100
    Xtest, Ytest = X[-Ntest:, :], Y[-Ntest:]
    Ytest_ind = y2indicator(Ytest, K)

    # params
    lr = 5e-3
    max_iteration = 10000
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)

    train_costs = []
    test_costs = []

    for i in range(max_iteration):
        pYtrain, pYtest = forward(W, b, Xtrain), forward(W, b, Xtest)
        # Ytrain  = predict(pYtrain)
        ctrain = cross_entropy(Ytrain_ind, pYtrain)
        ctest = cross_entropy(Ytest_ind, pYtest)

        train_costs.append(ctrain)
        test_costs.append(ctest)

        W -= lr * Xtrain.T.dot(pYtrain - Ytrain_ind)
        b -= lr * (pYtrain - Ytrain_ind).sum(axis=0)

        if i % 1000 == 0:
            print("i=%d\ttrain cost=%.3f\ttest cost=%.3f" % (i, ctrain, ctest))

    print("i=%d\ttrain cost=%.3f\ttest cost=%.3f" % (max_iteration, ctrain,
                                                     ctest))
    print("Final train classification rate", classification_rate(
        Ytrain, predict(pYtrain)))
    print("Final test  classification rate", classification_rate(
        Ytest, predict(pYtest)))

    plt.title('logistic regression + softmax')
    plt.xlabel('iterations')
    plt.ylabel('training costs')
    legend1, = plt.plot(train_costs, label='train cost')
    legend2, = plt.plot(test_costs, label='test cost')
    plt.legend([
        legend1,
        legend2,
    ])
    plt.show()
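# --- Hypothetical helper definitions (an assumption; the listings above and
# below import them from elsewhere and do not show them). A minimal sketch of
# forward(), y2indicator(), cross_entropy(), predict(), classification_rate()
# and error_rate(), assuming a softmax output layer. ---
import numpy as np

def softmax(a):
    # subtract the row max for numerical stability
    expA = np.exp(a - a.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True)

def forward(W, b, X):
    # single-layer softmax model: P(y | x)
    return softmax(X.dot(W) + b)

def y2indicator(y, K):
    # one-hot encode integer labels 0..K-1
    N = len(y)
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind

def cross_entropy(T, pY):
    # total cross-entropy between one-hot targets T and predictions pY
    return -np.sum(T * np.log(pY))

def predict(pY):
    return np.argmax(pY, axis=1)

def classification_rate(Y, P):
    return np.mean(Y == P)

def error_rate(targets, predictions):
    return np.mean(targets != predictions)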
    def fit(self,
            X,
            Y,
            learning_rate=10e-8,
            reg=10e-8,
            epochs=10000,
            show_figure=False):

        X, Y = shuffle(X, Y)
        K = len(set(Y))
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        Tvalid = y2indicator(Yvalid, K)
        X, Y = X[:-1000], Y[:-1000]

        N, D = X.shape

        T = y2indicator(Y, K)
        self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
        self.b1 = np.zeros(self.M)

        self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
        self.b2 = np.zeros(K)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            pY, Z = self.forward(X)
            # gradient descent step
            self.W2 -= learning_rate * (Z.T.dot(pY - T) + reg * self.W2)
            self.b2 -= learning_rate * ((pY - T).sum(axis=0) + reg * self.b2)

            self.W1 -= learning_rate * (X.T.dot(
                (pY - T).dot(self.W2.T) * Z * (1 - Z)) + reg * self.W1)
            self.b1 -= learning_rate * (((pY - T).dot(self.W2.T) * Z *
                                         (1 - Z)).sum(axis=0) + reg * self.b1)

            if i % 10 == 0:
                pYvalid, Zvalid = self.forward(Xvalid)

                c = cost(Tvalid, pYvalid)
                costs.append(c)
                e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))

                print "i", i, "cost:", c, "error", e
                if e < best_validation_error:
                    best_validation_error = e
        print "best_validation_error:", best_validation_error

        if show_figure:
            plt.plot(costs)
            plt.show()
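    # --- Hypothetical forward() for the class above (an assumption, inferred
    # from the Z * (1 - Z) term in the backprop step: a sigmoid hidden layer
    # followed by a softmax output; numpy assumed imported as np) ---
    def forward(self, X):
        Z = 1 / (1 + np.exp(-(X.dot(self.W1) + self.b1)))  # sigmoid hidden layer
        A = Z.dot(self.W2) + self.b2
        expA = np.exp(A - A.max(axis=1, keepdims=True))
        pY = expA / expA.sum(axis=1, keepdims=True)         # softmax output
        return pY, Z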
Example #3
    def fit(self,
            X,
            Y,
            learning_rate=10e-6,
            reg=10e-7,
            epochs=1000,
            show_figure=False):
        X, Y = shuffle(X, Y)
        x_valid = X[-10:]
        y_valid = Y[-10:]
        t_valid = utils.y2indicator(y_valid)

        x = X[:-10]
        y = Y[:-10]
        t = utils.y2indicator(y)

        N, D = x.shape
        K = len(set(y))

        self.W1 = np.random.randn(D, self.M)
        self.b1 = np.random.randn(self.M)

        self.W2 = np.random.randn(self.M, K)
        self.b2 = np.random.randn(K)

        costs = []

        for i in range(epochs):
            pY, Z = self.forward(x)

            # Updating weights (backprop); delta avoids shadowing the feature dim D
            delta = pY - t
            self.W2 -= learning_rate * (Z.T.dot(delta) + reg * self.W2)
            self.b2 -= learning_rate * (delta.sum(axis=0) + reg * self.b2)

            dZ = delta.dot(self.W2.T) * Z * (1 - Z)
            self.W1 -= learning_rate * (x.T.dot(dZ) + reg * self.W1)
            self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

            if i % 10 == 0:
                pY_valid, _ = self.forward(x_valid)
                c = utils.cost(t_valid, pY_valid)
                costs.append(c)
                e = utils.error_rate(y_valid, np.argmax(pY_valid, axis=1))
                print("i:", i, " cost: ", c, " error: ", e)

        if show_figure:
            plt.plot(costs)
            plt.show()
Example #4
    def fit(self, X, Y, learning_rate=0.01, epochs=1000, show_figure=False):
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        X_valid = X[-10:]
        Y_valid = Y[-10:]
        T_valid = utils.y2indicator(Y_valid)

        X = X[:-10]
        Y = Y[:-10]
        T = utils.y2indicator(Y)

        N, D = X.shape
        K = len(set(Y))

        tfX = tf.placeholder(tf.float32, [None, D])
        tfY = tf.placeholder(tf.float32, [None, K])

        self.W1 = self.init_weights([D, self.M])
        self.b1 = self.init_weights([self.M])

        self.W2 = self.init_weights([self.M, K])
        self.b2 = self.init_weights([K])

        py_x = self.forward(tfX)

        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=tfY, logits=py_x))
        tf.summary.scalar('cost', cost)

        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            cost)

        predict_op = tf.argmax(py_x, 1)

        sess = tf.Session()
        init = tf.global_variables_initializer()
        sess.run(init)

        for i in range(epochs):
            sess.run(train_op, feed_dict={tfX: X, tfY: T})
            prediction = sess.run(predict_op,
                                  feed_dict={
                                      tfX: X_valid,
                                      tfY: T_valid
                                  })
            if i % 10 == 0:
                print("i: ", i, "validation accuracy: ",
                      np.mean(Y_valid == prediction))
Example #5
    def fit(self, X, Y):
        X, Y = shuffle(X, Y)
        totalSampleCount, _ = X.shape
        testTrainSeperationIndex = int(self.testTrainSeperatingFactor *
                                       totalSampleCount)
        Xvalid, Yvalid = X[-testTrainSeperationIndex:], Y[
            -testTrainSeperationIndex:]
        X, Y = X[:-testTrainSeperationIndex], Y[:-testTrainSeperationIndex]

        numberOfSamples, featureVectorSize = X.shape
        classesCount = len(set(Y))
        target = y2indicator(Y)
        #input to hidden layer weights and biases
        self.W1 = np.random.randn(
            featureVectorSize, self.numberOfHiddenLayerNeurons) / np.sqrt(
                featureVectorSize + self.numberOfHiddenLayerNeurons)
        self.b1 = np.zeros(self.numberOfHiddenLayerNeurons)
        #hidden layer to output weights and biases
        self.W2 = np.random.randn(
            self.numberOfHiddenLayerNeurons,
            classesCount) / np.sqrt(self.numberOfHiddenLayerNeurons +
                                    classesCount)
        self.b2 = np.zeros(classesCount)

        costs = []
        bestValidationError = 1
        for i in range(self.epochs):
            # forward propagation and cost calculation
            output, hiddenLayerOutput = self.forward(X)

            # gradient ascent step on the log-likelihood (regularization is
            # subtracted so the weights still decay)
            distance = target - output

            self.W2 += self.learningRate * (hiddenLayerOutput.T.dot(distance) -
                                            self.reg * self.W2)
            self.b2 += self.learningRate * (distance.sum(axis=0) -
                                            self.reg * self.b2)
            dOutput = distance.dot(self.W2.T) * (hiddenLayerOutput > 0)  # ReLU derivative
            self.W1 += self.learningRate * (X.T.dot(dOutput) -
                                            self.reg * self.W1)
            self.b1 += self.learningRate * (dOutput.sum(axis=0) -
                                            self.reg * self.b1)

            if i % 10 == 0:
                pYvalid, _ = self.forward(Xvalid)
                c = cost2(Yvalid, pYvalid)
                costs.append(c)
                e = errorRate(Yvalid, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < bestValidationError:
                    bestValidationError = e
        print("bestValidationError:", bestValidationError)

        if self.showFigure:
            plt.plot(costs)
            plt.show()
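    # --- Hypothetical forward() for the class above (an assumption, inferred
    # from the (hiddenLayerOutput > 0) ReLU derivative used in the update;
    # numpy assumed imported as np) ---
    def forward(self, X):
        Z = np.maximum(X.dot(self.W1) + self.b1, 0)       # ReLU hidden layer
        A = Z.dot(self.W2) + self.b2
        expA = np.exp(A - A.max(axis=1, keepdims=True))
        return expA / expA.sum(axis=1, keepdims=True), Z  # softmax output, hidden activations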
Example #6
def main():
    X, Y = get_ecommerce(user_action=None)
    X, Y = shuffle(X, Y)

    # Running variables
    learning_rate = 5e-4
    max_iterations = 10000

    # Define dimensions
    N, D = X.shape
    M = 5
    K = len(np.unique(Y))

    Ntrain = N - 100
    Xtrain, Ytrain = X[:Ntrain, :], Y[:Ntrain]
    Ytrain_ind = y2indicator(Ytrain, K)

    Ntest = 100
    Xtest, Ytest = X[-Ntest:, :], Y[-Ntest:]
    Ytest_ind = y2indicator(Ytest, K)

    W1_init = np.random.randn(D, M) / np.sqrt(M + D)
    b1_init = np.random.randn(M) / np.sqrt(M)

    W2_init = np.random.randn(M, K) / np.sqrt(M + K)
    b2_init = np.random.randn(K) / np.sqrt(K)

    #Define theano shared
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # Define symbolic Theano variables
    thX = T.matrix('X')
    thT = T.matrix('T')

    # Define the model and the cost
    thZ = sigmoid(thX.dot(W1) + b1)
    thY = softmax(thZ.dot(W2) + b2)

    cost = -(thT * T.log(thY) + (1 - thT) * T.log(1 - thY)).sum()
    prediction = T.argmax(thY, axis=1)

    #Define updates
    W1_update = W1 - learning_rate * T.grad(cost, W1)
    b1_update = b1 - learning_rate * T.grad(cost, b1)
    W2_update = W2 - learning_rate * T.grad(cost, W2)
    b2_update = b2 - learning_rate * T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, W1_update), (b1, b1_update), (W2, W2_update),
                 (b2, b2_update)],
    )
    predict = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    LL = []
    train_errors = []
    test_errors = []
    train_costs = []
    test_costs = []
    for i in range(max_iterations):
        train(Xtrain, Ytrain_ind)
        if i % 10 == 0:
            c, pYtrain = predict(Xtrain, Ytrain_ind)
            err = error_rate(Ytrain, pYtrain)
            train_costs.append(c)
            train_errors.append(err)

            c, pYtest = predict(Xtest, Ytest_ind)
            err = error_rate(Ytest, pYtest)
            test_costs.append(c)
            test_errors.append(err)
            print "i=%d\tc=%.3f\terr==%.3f\t" % (i, c, err)

    print "i=%d\tc=%.3f\terr==%.3f\t" % (max_iterations, c, err)

    print "Final train classification rate", classification_rate(
        Ytrain, pYtrain)
    print "Final test  classification rate", classification_rate(Ytest, pYtest)

    plt.title('Multi layer perceptron: Costs')
    plt.xlabel('iterations')
    plt.ylabel('costs')
    legend1, = plt.plot(train_costs, label='train cost')
    legend2, = plt.plot(test_costs, label='test cost')
    plt.legend([
        legend1,
        legend2,
    ])
    plt.show()

    plt.title('Multi layer perceptron: Error rates')
    plt.xlabel('iterations')
    plt.ylabel('error rates')
    legend1, = plt.plot(train_errors, label='train error')
    legend2, = plt.plot(test_errors, label='test error')
    plt.legend([
        legend1,
        legend2,
    ])
    plt.show()
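# --- Hypothetical import section assumed by the Theano examples above and
# below (an assumption; the original listings do not show their imports) ---
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from theano.tensor.nnet import sigmoid, softmax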
	def fit(self, X, Y, learning_rate=10e-5, epochs=200, reg=10e-8, batch_sz=200, show_fig=False, activation=tf.tanh):
		X, Y = shuffle(X, Y)
		K = len(np.unique(Y))  

		T = y2indicator(Y, K).astype(np.float32)
		Xvalid, Yvalid, Tvalid = X[-1000:,], Y[-1000:], T[-1000:,:] 
		Xtrain, Ytrain, Ttrain = X[:-1000,:], Y[:-1000],T[:-1000,:] 

		N, D = Xtrain.shape
		

		# Variable initialization
		W1, b1 = init_weight_and_bias(D,self.M)
		W2, b2 = init_weight_and_bias(self.M,K)



		self.W1 = tf.Variable(W1.astype(np.float32), 'W1')
		self.b1 = tf.Variable(b1.astype(np.float32), 'b1')
		self.W2 = tf.Variable(W2.astype(np.float32), 'W2')
		self.b2 = tf.Variable(b2.astype(np.float32), 'b2')
		self.params = [self.W1, self.b1, self.W2, self.b2] 
		# Define placeholders
		X = tf.placeholder(tf.float32,shape=(None,D),name='X')
		T = tf.placeholder(tf.float32,shape=(None,K),name='Y')

		
		

		Z = activation(tf.matmul(X, self.W1) + self.b1) 		
		Yish = tf.matmul(Z, self.W2) + self.b2 

		rcost  = reg*tf.reduce_sum([tf.nn.l2_loss(p) for p in self.params])
		cost   = tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits(labels=T, logits=Yish) ) + rcost 
		
		
		train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
		self.predict_op = tf.argmax(Yish, 1)

		n_batches = N // batch_sz 
		costs=[] 
		errors=[] 
		init = tf.global_variables_initializer()
		with tf.Session() as session:
			session.run(init)

			for i in range(epochs):
				Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
				for j in range(n_batches):
					Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,:]
					Ybatch = Ytrain[j*batch_sz:(j+1)*batch_sz]
					Tbatch = Ttrain[j*batch_sz:(j+1)*batch_sz,:]

					session.run(train_op,
						feed_dict={
							X: Xbatch,
							T: Tbatch 
					})

					if j % 10 == 0: 
						c = session.run(cost, feed_dict={X:Xvalid, T:Tvalid} )
						pYvalid  = session.run( self.predict_op, feed_dict={X: Xvalid} )
						err = error_rate(Yvalid, pYvalid)
						print "i:%d\tj:%d\tc:%.3f\terr:%.3f\t" % (i,j,c,err)	
						costs.append(c)
						errors.append(err)

		if show_fig:
			plt.title('costs')
			plt.plot(costs)
			plt.show()

			plt.title('error rate')
			plt.plot(errors)
			plt.show()
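# --- Hypothetical init_weight_and_bias() assumed by the TensorFlow examples
# above and below (an assumption; numpy assumed imported as np) ---
def init_weight_and_bias(M1, M2):
    # scaled Gaussian weights and zero biases for a dense layer M1 -> M2
    W = np.random.randn(M1, M2) / np.sqrt(M1 + M2)
    b = np.zeros(M2)
    return W.astype(np.float32), b.astype(np.float32)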
Example #8
    def fit(self,
            Xin,
            Yin,
            learning_rate=10e-7,
            reg=10e-8,
            epochs=10000,
            show_figure=False):
        Nvalid = 500
        N, D = Xin.shape
        K = len(np.unique(Yin))
        Xin, Yin = shuffle(Xin, Yin)

        Xtrain, Ytrain = Xin[:-Nvalid, :], Yin[:-Nvalid]
        Xvalid, Yvalid = Xin[-Nvalid:, :], Yin[-Nvalid:]
        Ttrain, Tvalid = y2indicator(Ytrain, K), y2indicator(Yvalid, K)

        #Initialize Wi,bi
        W1_init = np.random.randn(D, self.M) / np.sqrt(D + self.M)
        b1_init = np.random.randn(self.M) / np.sqrt(self.M)
        W2_init = np.random.randn(self.M, K) / np.sqrt(K + self.M)
        b2_init = np.random.randn(K) / np.sqrt(K)

        #Theano shared
        W1 = theano.shared(W1_init, 'W1')
        b1 = theano.shared(b1_init, 'b1')
        W2 = theano.shared(W2_init, 'W2')
        b2 = theano.shared(b2_init, 'b2')

        #Theano variables
        thX = T.matrix('X')
        thT = T.matrix('T')
        thZ = sigmoid(thX.dot(W1) + b1)
        thY = T.nnet.softmax(thZ.dot(W2) + b2)

        # Theano cost and prediction
        costs = -(thT * T.log(thY) + (1 - thT) * T.log(1 - thY)).sum()
        prediction = T.argmax(thY, axis=1)

        W1_update = W1 - learning_rate * (T.grad(costs, W1) + reg * W1)
        b1_update = b1 - learning_rate * (T.grad(costs, b1) + reg * b1)

        W2_update = W2 - learning_rate * (T.grad(costs, W2) + reg * W2)
        b2_update = b2 - learning_rate * (T.grad(costs, b2) + reg * b2)

        self._train = theano.function(
            inputs=[thX, thT],
            updates=[(W1, W1_update), (b1, b1_update), (W2, W2_update),
                     (b2, b2_update)],
        )

        self._predict = theano.function(
            inputs=[thX, thT],
            outputs=[costs, prediction],
        )

        train_costs = []
        train_errors = []
        valid_costs = []
        valid_errors = []

        for i in range(epochs):
            self._train(Xtrain, Ttrain)
            if i % 10 == 0:
                ctrain, pYtrain = self._predict(Xtrain, Ttrain)
                err = error_rate(Ytrain, pYtrain)
                train_costs.append(ctrain)
                train_errors.append(err)

                cvalid, pYvalid = self._predict(Xvalid, Tvalid)
                err = error_rate(Yvalid, pYvalid)
                valid_costs.append(cvalid)
                valid_errors.append(err)
                print("i=%d\tc=%.3f\terr=%.3f" % (i, cvalid, err))

        cvalid, pYvalid = self._predict(Xvalid, Tvalid)
        err = error_rate(Yvalid, pYvalid)
        valid_costs.append(cvalid)
        valid_errors.append(err)

        print("i=%d\tc=%.3f\terr=%.3f" % (epochs, cvalid, err))

        print("Final train classification rate", classification_rate(
            Ytrain, pYtrain))
        print("Final valid classification rate", classification_rate(
            Yvalid, pYvalid))

        plt.title('Multi layer perceptron: Costs')
        plt.xlabel('iterations')
        plt.ylabel('costs')
        legend1, = plt.plot(train_costs, label='train cost')
        legend2, = plt.plot(valid_costs, label='valid cost')
        plt.legend([
            legend1,
            legend2,
        ])
        plt.show()

        plt.title('Multi layer perceptron: Error rates')
        plt.xlabel('iterations')
        plt.ylabel('error rates')
        legend1, = plt.plot(train_errors, label='train error')
        legend2, = plt.plot(valid_errors, label='valid error')
        plt.legend([
            legend1,
            legend2,
        ])
        plt.show()
def main():
    X, Y = get_ecommerce(user_action=None)
    X, Y = shuffle(X, Y)

    # Define dimensions
    N, D = X.shape
    M = 5
    K = len(np.unique(Y))

    Ntrain = N - 100
    Xtrain, Ytrain = X[:Ntrain, :], Y[:Ntrain]
    Ytrain_ind = y2indicator(Ytrain, K)

    Ntest = 100
    Xtest, Ytest = X[-Ntest:, :], Y[-Ntest:]
    Ytest_ind = y2indicator(Ytest, K)

    W1 = np.random.randn(D, M) / np.sqrt(M + D)
    b1 = np.random.randn(M) / np.sqrt(M)

    W2 = np.random.randn(M, K) / np.sqrt(M + K)
    b2 = np.random.randn(K) / np.sqrt(K)

    # Running variables
    lr = 5e-4
    max_iteration = 100000

    train_costs = []
    test_costs = []
    train_errors = []
    test_errors = []
    for i in range(max_iteration):
        pYtrain, Ztrain = forward(W1, b1, W2, b2, Xtrain)
        pYtest, Ztest = forward(W1, b1, W2, b2, Xtest)

        ctrain = cross_entropy(Ytrain_ind, pYtrain)
        ctest = cross_entropy(Ytest_ind, pYtest)

        etrain = error_rate(predict(pYtrain), Ytrain)
        etest = error_rate(predict(pYtest), Ytest)

        train_costs.append(ctrain)
        test_costs.append(ctest)
        train_errors.append(etrain)
        test_errors.append(etest)

        if i % 1000 == 0:
            print("i=%d\ttrain cost=%.3f\ttest cost=%.3f\ttrain error=%.3f" % (
                i, ctrain, ctest, etrain))

        W2 -= lr * Ztrain.T.dot(pYtrain - Ytrain_ind)
        b2 -= lr * (pYtrain - Ytrain_ind).sum(axis=0)
        # derivative_w1(X, Z, T, Y, W2)
        W1 -= lr * Xtrain.T.dot(
            (pYtrain - Ytrain_ind).dot(W2.T) * Ztrain * (1 - Ztrain))
        b1 -= lr * ((pYtrain - Ytrain_ind).dot(W2.T) * Ztrain *
                    (1 - Ztrain)).sum(axis=0)

    print "i=%d\ttrain cost=%.3f\ttest error=%.3f" % (max_iteration, ctrain,
                                                      ctest)
    print "Final train classification rate", classification_rate(
        Ytrain, predict(pYtrain))
    print "Final test  classification rate", classification_rate(
        Ytest, predict(pYtest))

    plt.title('Multi layer perceptron: Costs')
    plt.xlabel('iterations')
    plt.ylabel('costs')
    legend1, = plt.plot(train_costs, label='train cost')
    legend2, = plt.plot(test_costs, label='test cost')
    plt.legend([
        legend1,
        legend2,
    ])
    plt.show()

    plt.title('Multi layer perceptron: Error rates')
    plt.xlabel('iterations')
    plt.ylabel('error rates')
    legend1, = plt.plot(train_errors, label='train error')
    legend2, = plt.plot(test_errors, label='test error')
    plt.legend([
        legend1,
        legend2,
    ])
    plt.show()
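# --- Hypothetical two-layer forward() assumed by the MLP main() above (an
# assumption: sigmoid hidden layer, inferred from the Ztrain * (1 - Ztrain)
# term in the backprop updates, with a softmax output; numpy assumed imported
# as np) ---
def forward(W1, b1, W2, b2, X):
    Z = 1 / (1 + np.exp(-(X.dot(W1) + b1)))  # sigmoid hidden layer
    A = Z.dot(W2) + b2
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True), Z  # (softmax probabilities, hidden activations)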
    def fit(self,
            X,
            Y,
            learning_rate=10e-8,
            mu=0.99,
            decay=0.99,
            reg=10e-8,
            epochs=400,
            batch_sz=100,
            show_figure=False):
        X, Y = shuffle(X, Y)
        K = len(np.unique(Y))
        Y = y2indicator(Y, K).astype(np.float32)

        Xvalid, Yvalid = X[-1000:, :], Y[-1000:]
        Yvalid_flat = np.argmax(Yvalid, axis=1)
        Xtrain, Ytrain = X[:-1000, :], Y[:-1000]

        N, D = X.shape

        #Build hidden layers
        M1 = D
        self.hidden_layers = []
        self.params = []
        for an_id, M2 in enumerate(self.hidden_layer_sizes):
            h = HiddenLayer(M1, M2, an_id)
            self.hidden_layers.append(h)
            self.params += h.params
            M1 = M2

        M2 = K
        an_id = len(self.hidden_layer_sizes)
        W, b = init_weight_and_bias(M1, M2)
        self.W = tf.Variable(W.astype(np.float32), name='W%d' % an_id)
        self.b = tf.Variable(b.astype(np.float32), name='b%d' % an_id)

        self.params += [self.W, self.b]

        X = tf.placeholder(tf.float32, shape=(None, D), name='X')
        Y = tf.placeholder(tf.float32, shape=(None, K), name='Y')
        Yish = self.forward(X)

        # cost functions
        rcost = reg * tf.reduce_sum([tf.nn.l2_loss(p) for p in self.params
                                     ])  # L2 regularization costs
        cost = tf.reduce_sum(
            tf.nn.softmax_cross_entropy_with_logits(labels=Y,
                                                    logits=Yish)) + rcost

        train_op = tf.train.RMSPropOptimizer(learning_rate,
                                             decay=decay,
                                             momentum=mu).minimize(cost)
        predict_op = tf.argmax(Yish, 1)

        LL = []
        n_batches = Xtrain.shape[0] // batch_sz
        best_validation_error = 1
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)

            for i in range(epochs):
                Xtrain, Ytrain = shuffle(Xtrain, Ytrain)

                for j in range(n_batches):
                    Xbatch = Xtrain[j * (batch_sz):(j + 1) * batch_sz, :]
                    Ybatch = Ytrain[j * (batch_sz):(j + 1) * batch_sz, :]

                    session.run(train_op, feed_dict={X: Xbatch, Y: Ybatch})

                    if j % 100 == 0:
                        pY = session.run(predict_op, feed_dict={X: Xvalid})
                        c = session.run(cost, feed_dict={X: Xvalid, Y: Yvalid})
                        err = error_rate(Yvalid_flat, pY)
                        LL.append(c)
                        print "i:%d\tj:%d\tnb:%d\tc:%.3f\te:%.3f\t" % (
                            i, j, n_batches, c, err)

                    if err < best_validation_error:
                        best_validation_error = err
            print "best_validation_error:", best_validation_error

        if show_figure:
            plt.plot(LL)
            plt.show()
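    # --- Hypothetical forward() and HiddenLayer class assumed by the deep
    # network above (an assumption; the original listing defines them
    # elsewhere): ReLU hidden layers feeding a final linear (logit) layer,
    # reusing the init_weight_and_bias() sketch shown earlier. ---
    def forward(self, X):
        Z = X
        for h in self.hidden_layers:
            Z = h.forward(Z)
        return tf.matmul(Z, self.W) + self.b  # logits; softmax applied inside the loss


class HiddenLayer(object):
    def __init__(self, M1, M2, an_id):
        W, b = init_weight_and_bias(M1, M2)
        self.W = tf.Variable(W, name='W%d' % an_id)
        self.b = tf.Variable(b, name='b%d' % an_id)
        self.params = [self.W, self.b]

    def forward(self, X):
        return tf.nn.relu(tf.matmul(X, self.W) + self.b)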