Code Example #1
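Plain full-batch gradient descent for a network with four hidden layers: the forward pass returns every hidden activation, and the backpropagation through each layer is written out by hand, with L2 regularization folded into every gradient.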
    def fit(self, X, Y, learning_rate=10e-6, reg=10e-7, epochs=10001, show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY, Z_4, Z_3, Z_2, Z_1 = self.forward(X)

            Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
            Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
            Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
            Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

            # gradient descent step
            pY_T = pY - T
            self.weights[-1] -= learning_rate * (Z_4.T.dot(pY_T) + reg * self.weights[-1])
            self.biases[-1] -= learning_rate * (pY_T.sum(axis=0) + reg * self.biases[-1])

            dZ_4 = pY_T.dot((self.weights[-1]).T) * Z_4_deriv
            self.weights[-2] -= learning_rate * (Z_3.T.dot(dZ_4) + reg * self.weights[-2])
            self.biases[-2] -= learning_rate * (dZ_4.sum(axis=0) + reg * self.biases[-2])

            dZ_3 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot((self.weights[-2]).T) * Z_3_deriv
            self.weights[-3] -= learning_rate * (Z_2.T.dot(dZ_3) + reg * self.weights[-3])
            self.biases[-3] -= learning_rate * (dZ_3.sum(axis=0) + reg * self.biases[-3])

            dZ_2 = (((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot((self.weights[-2]).T) * Z_3_deriv).dot((self.weights[-3]).T)) * Z_2_deriv
            self.weights[-4] -= learning_rate * (Z_1.T.dot(dZ_2) + reg * self.weights[-4])
            self.biases[-4] -= learning_rate * (dZ_2.sum(axis=0) + reg * self.biases[-4])

            dZ_1 = (((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot((self.weights[-2]).T) * Z_3_deriv).dot((self.weights[-3]).T)) * Z_2_deriv).dot((self.weights[-4]).T)) * Z_1_deriv
            self.weights[-5] -= learning_rate * (X.T.dot(dZ_1) + reg * self.weights[-5])
            self.biases[-5] -= learning_rate * (dZ_1.sum(axis=0) + reg * self.biases[-5])


            if i % 4000 == 0:
                pYvalid, _, __, ___, ____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                e = error_rate(Y, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
                print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
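All of these snippets rely on a handful of helpers (init_weight_and_bias, y_hot_encoding, cost, error_rate) that are not shown on this page; shuffle in Code Example #4 is presumably sklearn.utils.shuffle. Below is a minimal sketch of plausible implementations, assuming cost is the total cross-entropy over one-hot targets and error_rate compares integer label vectors; the author's actual versions may differ.

import numpy as np

def init_weight_and_bias(M1, M2):
    # Small random weights scaled by fan-in, zero biases.
    W = np.random.randn(M1, M2) / np.sqrt(M1)
    b = np.zeros(M2)
    return W.astype(np.float32), b.astype(np.float32)

def y_hot_encoding(Y):
    # One-hot encode integer class labels 0..K-1.
    Y = np.asarray(Y, dtype=np.int32)
    N, K = len(Y), len(set(Y))
    T = np.zeros((N, K), dtype=np.float32)
    T[np.arange(N), Y] = 1
    return T

def cost(T, pY):
    # Total cross-entropy between one-hot targets and predicted probabilities.
    return -(T * np.log(pY)).sum()

def error_rate(targets, predictions):
    # Fraction of misclassified samples; both arguments are 1-D label arrays.
    return np.mean(targets != predictions)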
Code Example #2
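Constructor of a TensorFlow hidden-layer class: it records the input and output sizes, wraps the freshly initialized weight and bias in tf.Variable objects, and stores the activation function to apply.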
 def __init__(self, M1, M2, an_id, nonlin_func):
     self.id = an_id
     self.M1 = M1
     self.M2 = M2
     '''self.params contains W and b for this particular layer'''
     self.params = list(map(tf.Variable, init_weight_and_bias(M1, M2)))
     self.nonlin_func = nonlin_func
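Only the constructor is shown above; the layer's forward pass is not included in the snippet. A minimal sketch of what it might look like, assuming self.nonlin_func is an element-wise TensorFlow activation such as tf.nn.relu:

 def forward(self, X):
     # affine transform followed by this layer's nonlinearity
     W, b = self.params
     return self.nonlin_func(tf.matmul(X, W) + b)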
Code Example #3
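The same manually backpropagated network, now trained on mini-batches with an RMSprop-style cache combined with momentum for every weight and bias.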
    def fit(self,
            X,
            Y,
            learning_rate=5 * 10e-5,
            reg=10e-2,
            epochs=51,
            show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        batch_sz = 100
        n_batches = int(N / batch_sz)
        # RMSprop cache
        decay_rate = 0.999
        eps = 10e-10
        cache_W = [1, 1, 1, 1, 1]
        cache_b = [1, 1, 1, 1, 1]

        # momentum
        mu = 0.9
        dW = [0, 0, 0, 0, 0]
        db = [0, 0, 0, 0, 0]

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz), ]

                # forward propagation and cost calculation
                pY, Z_4, Z_3, Z_2, Z_1 = self.forward(Xbatch)

                Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
                Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
                Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
                Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

                # RMSprop-with-momentum update step
                # learning_rate=5*10e-5, reg=10e-2, epochs=51
                pY_T = pY - Tbatch
                gW5 = Z_4.T.dot(pY_T) + reg * self.weights[-1]
                gb5 = pY_T.sum(axis=0) + reg * self.biases[-1]
                cache_W[-1] = decay_rate * cache_W[-1] + (
                    1 - decay_rate) * gW5 * gW5
                cache_b[-1] = decay_rate * cache_b[-1] + (
                    1 - decay_rate) * gb5 * gb5
                dW[-1] = mu * dW[-1] - (1 - mu) * learning_rate * gW5 / (
                    np.sqrt(cache_W[-1]) + eps)
                db[-1] = mu * db[-1] - (1 - mu) * learning_rate * gb5 / (
                    np.sqrt(cache_b[-1]) + eps)
                self.weights[-1] += dW[-1]
                self.biases[-1] += db[-1]

                dZ_4 = pY_T.dot((self.weights[-1]).T) * Z_4_deriv
                gW4 = Z_3.T.dot(dZ_4) + reg * self.weights[-2]
                gb4 = dZ_4.sum(axis=0) + reg * self.biases[-2]
                cache_W[-2] = decay_rate * cache_W[-2] + (
                    1 - decay_rate) * gW4 * gW4
                cache_b[-2] = decay_rate * cache_b[-2] + (
                    1 - decay_rate) * gb4 * gb4
                dW[-2] = mu * dW[-2] - (1 - mu) * learning_rate * gW4 / (
                    np.sqrt(cache_W[-2]) + eps)
                db[-2] = mu * db[-2] - (1 - mu) * learning_rate * gb4 / (
                    np.sqrt(cache_b[-2]) + eps)
                self.weights[-2] += dW[-2]
                self.biases[-2] += db[-2]

                dZ_3 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv
                gW3 = Z_2.T.dot(dZ_3) + reg * self.weights[-3]
                gb3 = dZ_3.sum(axis=0) + reg * self.biases[-3]
                cache_W[-3] = decay_rate * cache_W[-3] + (
                    1 - decay_rate) * gW3 * gW3
                cache_b[-3] = decay_rate * cache_b[-3] + (
                    1 - decay_rate) * gb3 * gb3
                dW[-3] = mu * dW[-3] - (1 - mu) * learning_rate * gW3 / (
                    np.sqrt(cache_W[-3]) + eps)
                db[-3] = mu * db[-3] - (1 - mu) * learning_rate * gb3 / (
                    np.sqrt(cache_b[-3]) + eps)
                self.weights[-3] += dW[-3]
                self.biases[-3] += db[-3]

                dZ_2 = (((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv
                gW2 = Z_1.T.dot(dZ_2) + reg * self.weights[-4]
                gb2 = dZ_2.sum(axis=0) + reg * self.biases[-4]
                cache_W[-4] = decay_rate * cache_W[-4] + (
                    1 - decay_rate) * gW2 * gW2
                cache_b[-4] = decay_rate * cache_b[-4] + (
                    1 - decay_rate) * gb2 * gb2
                dW[-4] = mu * dW[-4] - (1 - mu) * learning_rate * gW2 / (
                    np.sqrt(cache_W[-4]) + eps)
                db[-4] = mu * db[-4] - (1 - mu) * learning_rate * gb2 / (
                    np.sqrt(cache_b[-4]) + eps)
                self.weights[-4] += dW[-4]
                self.biases[-4] += db[-4]

                dZ_1 = (((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv).dot(
                            (self.weights[-4]).T)) * Z_1_deriv
                gW1 = Xbatch.T.dot(dZ_1) + reg * self.weights[-5]
                gb1 = dZ_1.sum(axis=0) + reg * self.biases[-5]
                cache_W[-5] = decay_rate * cache_W[-5] + (
                    1 - decay_rate) * gW1 * gW1
                cache_b[-5] = decay_rate * cache_b[-5] + (
                    1 - decay_rate) * gb1 * gb1
                dW[-5] = mu * dW[-5] - (1 - mu) * learning_rate * gW1 / (
                    np.sqrt(cache_W[-5]) + eps)
                db[-5] = mu * db[-5] - (1 - mu) * learning_rate * gb1 / (
                    np.sqrt(cache_b[-5]) + eps)
                self.weights[-5] += dW[-5]
                self.biases[-5] += db[-5]

                # if j % 10 == 0:
                #    pYvalid, _, __, ___, ____ = self.forward(X)
                #    c = cost(T, pYvalid)
                #    costs.append(c)
                #    e = error_rate(Y, np.argmax(pYvalid, axis=1))
                #    print("i:", i, "cost:", c, "error:", e)
                #    if e < best_validation_error:
                #        best_validation_error = e
                #    print("best_validation_error:", best_validation_error)
            if i % 10 == 0:
                pYvalid, _, __, ___, ____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                print("i:", i, "cost:", c)
        if show_fig:
            plt.plot(costs)
            plt.show()
Code Example #4
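A TensorFlow 1.x training routine: it one-hot encodes the labels, optionally splits off a validation set, builds the hidden layers, minimizes a softmax cross-entropy cost with L2 regularization using AdamOptimizer, and saves the trained graph with tf.train.Saver.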
    def fit(self,
            X,
            Y,
            learning_rate=10e-7,
            mu=0.99,
            decay=0.999,
            reg=10e-3,
            epochs=400,
            batch_size=100,
            split=True,
            show_fig=False,
            print_every=20):
        self.epochs = epochs
        K = len(set(Y))
        X, Y = X.astype(np.float32).toarray(), y_hot_encoding(Y).astype(
            np.float32)
        X, Y = shuffle(X, Y)
        if split:
            Xvalid, Yvalid = X[-1000:], Y[-1000:]
            X, Y = X[:-1000], Y[:-1000]
        else:
            Xvalid, Yvalid = X, Y
        Yvalid_flat = np.argmax(Yvalid, axis=1)

        self.training = True
        '''Clears the default graph stack and resets the global default graph.'''
        tf.reset_default_graph()
        '''initialize hidden layers'''
        N, D = X.shape
        M1 = D
        self.hidden_layers = []
        for idx, layer_size in enumerate(self.hidden_layer_sizes):
            self.hidden_layers.append(
                HiddenLayer(M1, layer_size, idx, self.nonlin_functions[idx]))
            M1 = layer_size

        self.params = list(map(tf.Variable, init_weight_and_bias(M1, K)))
        for h in self.hidden_layers:
            self.params += h.params

        tfX = tf.placeholder(tf.float32, shape=(None, D), name="tfX")
        tfT = tf.placeholder(tf.float32, shape=(None, K), name="tfT")
        logits = self.forward(tfX)

        rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=tfT)) + rcost
        #cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tfT)) #+ rcost
        prediction = self.predict(tfX)

        #train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)
        train_op = tf.train.AdamOptimizer(learning_rate,
                                          beta1=0.99,
                                          beta2=0.999).minimize(cost)
        #train_op = tf.train.MomentumOptimizer(learning_rate, momentum=mu, use_nesterov=False).minimize(cost)
        #train_op = tf.train.ProximalGradientDescentOptimizer(learning_rate, l2_regularization_strength=0.0, use_locking=False).minimize(cost)

        n_batches = int(N / batch_size)
        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                for j in range(n_batches):
                    Xbatch = X[j * batch_size:(j * batch_size + batch_size)]
                    Ybatch = Y[j * batch_size:(j * batch_size + batch_size)]
                    session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                    if j % print_every == 0:
                        costs.append(
                            session.run(cost,
                                        feed_dict={
                                            tfX: Xvalid,
                                            tfT: Yvalid
                                        }))
                        p = session.run(prediction,
                                        feed_dict={
                                            tfX: Xvalid,
                                            tfT: Yvalid
                                        })
                        print("i:", i, "j:", j, "nb:", n_batches, "cost:",
                              costs[-1], "error_rate:",
                              error_rate(Yvalid_flat, p))
            saver = tf.train.Saver()
            '''Now, save the graph'''
            saver.save(session,
                       './my_model-' + str(self.counter),
                       global_step=self.epochs)
            print("Done!")

        if show_fig:
            plt.plot(costs)
            plt.show()
Code Example #5
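Mini-batch gradient descent with classical momentum; the additional U values returned by forward appear to be per-layer masks (dropout-style) that are multiplied into the deltas during backpropagation.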
    def fit(self,
            X,
            Y,
            batch_size,
            learning_rate=10e-6,
            reg=10e-7,
            epochs=10001,
            show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        batch_sz = batch_size
        n_batches = int(N / batch_sz)
        # momentum
        mu = 0.9
        dW = [0, 0, 0, 0, 0]
        db = [0, 0, 0, 0, 0]

        costs = []
        best_validation_error = 1
        self.training = True
        for i in range(epochs):
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz), ]

                # forward propagation and cost calculation
                pY, Z_4, Z_3, Z_2, Z_1, U = self.forward(Xbatch)

                Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
                Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
                Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
                Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

                # gradient descent step
                pY_T = pY - Tbatch
                dW[-1] = mu * dW[-1] - (1 - mu) * learning_rate * (
                    Z_4.T.dot(pY_T) + reg * self.weights[-1])
                db[-1] = mu * db[-1] - (1 - mu) * learning_rate * (
                    pY_T.sum(axis=0) + reg * self.biases[-1])
                self.weights[-1] += dW[-1]
                self.biases[-1] += db[-1]

                dZ_4 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv) * U[-1]
                dW[-2] = mu * dW[-2] - (1 - mu) * learning_rate * (
                    Z_3.T.dot(dZ_4) + reg * self.weights[-2])
                db[-2] = mu * db[-2] - (1 - mu) * learning_rate * (
                    dZ_4.sum(axis=0) + reg * self.biases[-2])
                self.weights[-2] += dW[-2]
                self.biases[-2] += db[-2]

                dZ_3 = ((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv) * U[-2]
                dW[-3] = mu * dW[-3] - (1 - mu) * learning_rate * (
                    Z_2.T.dot(dZ_3) + reg * self.weights[-3])
                db[-3] = mu * db[-3] - (1 - mu) * learning_rate * (
                    dZ_3.sum(axis=0) + reg * self.biases[-3])
                self.weights[-3] += dW[-3]
                self.biases[-3] += db[-3]

                dZ_2 = ((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv) * U[-3]
                dW[-4] = mu * dW[-4] - (1 - mu) * learning_rate * (
                    Z_1.T.dot(dZ_2) + reg * self.weights[-4])
                db[-4] = mu * db[-4] - (1 - mu) * learning_rate * (
                    dZ_2.sum(axis=0) + reg * self.biases[-4])
                self.weights[-4] += dW[-4]
                self.biases[-4] += db[-4]

                dZ_1 = ((((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv).dot(
                            (self.weights[-4]).T)) * Z_1_deriv) * U[-4]
                dW[-5] = mu * dW[-5] - (1 - mu) * learning_rate * (
                    Xbatch.T.dot(dZ_1) + reg * self.weights[-5])
                db[-5] = mu * db[-5] - (1 - mu) * learning_rate * (
                    dZ_1.sum(axis=0) + reg * self.biases[-5])
                self.weights[-5] += dW[-5]
                self.biases[-5] += db[-5]

                #if j % 10 == 0:
                #    pYvalid, _, __, ___, ____ = self.forward(X)
                #    c = cost(T, pYvalid)
                #    costs.append(c)
                #    e = error_rate(Y, np.argmax(pYvalid, axis=1))
                #    print("i:", i, "cost:", c, "error:", e)
                #    if e < best_validation_error:
                #        best_validation_error = e
                #    print("best_validation_error:", best_validation_error)
            if i % 50 == 0:
                pYvalid, _, __, ___, ____, _____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                print("i:", i, "cost:", c)
        if show_fig:
            plt.plot(costs)
            plt.show()
Code Example #6
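The same network trained with a hand-rolled Adam update: bias-corrected first and second moment estimates are maintained for every weight and bias inside the updater helper.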
    def fit(self,
            X,
            Y,
            learning_rate=5 * 10e-5,
            reg=10e-2,
            epochs=51,
            show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        batch_sz = 100
        n_batches = int(N / batch_sz)

        decay_rate = 0.999  # not used in this Adam example
        eps = 10e-10
        beta_1 = 0.9
        beta_2 = 0.999

        # first moment estimate (exponential average of the gradient)
        m_W = [0, 0, 0, 0, 0]
        m_b = [0, 0, 0, 0, 0]

        # second moment estimate (exponential average of the squared gradient)
        v_W = [0, 0, 0, 0, 0]
        v_b = [0, 0, 0, 0, 0]

        def updater(idx, gW, gb):
            # Adam: update the biased moment estimates in place, then apply the
            # bias correction to copies so the running estimates stay intact.
            m_W[idx] = beta_1 * m_W[idx] + (1 - beta_1) * gW
            m_b[idx] = beta_1 * m_b[idx] + (1 - beta_1) * gb
            v_W[idx] = beta_2 * v_W[idx] + (1 - beta_2) * gW * gW
            v_b[idx] = beta_2 * v_b[idx] + (1 - beta_2) * gb * gb
            m_W_hat = m_W[idx] / (1 - beta_1**t)
            m_b_hat = m_b[idx] / (1 - beta_1**t)
            v_W_hat = v_W[idx] / (1 - beta_2**t)
            v_b_hat = v_b[idx] / (1 - beta_2**t)
            self.weights[idx] -= learning_rate * m_W_hat / (np.sqrt(v_W_hat) + eps)
            self.biases[idx] -= learning_rate * m_b_hat / (np.sqrt(v_b_hat) + eps)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            for j in range(n_batches):
                # iteration number, used for the Adam bias correction
                t = 1 + i * n_batches + j

                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz), ]

                # forward propagation and cost calculation
                pY, Z_4, Z_3, Z_2, Z_1 = self.forward(Xbatch)

                Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
                Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
                Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
                Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

                # Adam update step
                # learning_rate=5*10e-5, reg=10e-2, epochs=51
                pY_T = pY - Tbatch
                gW5 = Z_4.T.dot(pY_T) + reg * self.weights[-1]
                gb5 = pY_T.sum(axis=0) + reg * self.biases[-1]
                updater(-1, gW5, gb5)

                dZ_4 = pY_T.dot((self.weights[-1]).T) * Z_4_deriv
                gW4 = Z_3.T.dot(dZ_4) + reg * self.weights[-2]
                gb4 = dZ_4.sum(axis=0) + reg * self.biases[-2]
                updater(-2, gW4, gb4)

                dZ_3 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv
                gW3 = Z_2.T.dot(dZ_3) + reg * self.weights[-3]
                gb3 = dZ_3.sum(axis=0) + reg * self.biases[-3]
                updater(-3, gW3, gb3)

                dZ_2 = (((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv
                gW2 = Z_1.T.dot(dZ_2) + reg * self.weights[-4]
                gb2 = dZ_2.sum(axis=0) + reg * self.biases[-4]
                updater(-4, gW2, gb2)

                dZ_1 = (((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv).dot(
                            (self.weights[-4]).T)) * Z_1_deriv
                gW1 = Xbatch.T.dot(dZ_1) + reg * self.weights[-5]
                gb1 = dZ_1.sum(axis=0) + reg * self.biases[-5]
                updater(-5, gW1, gb1)

                # if j % 10 == 0:
                #    pYvalid, _, __, ___, ____ = self.forward(X)
                #    c = cost(T, pYvalid)
                #    costs.append(c)
                #    e = error_rate(Y, np.argmax(pYvalid, axis=1))
                #    print("i:", i, "cost:", c, "error:", e)
                #    if e < best_validation_error:
                #        best_validation_error = e
                #    print("best_validation_error:", best_validation_error)

            if i % 10 == 0:
                pYvalid, _, __, ___, ____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                print("i:", i, "cost:", c)
        if show_fig:
            plt.plot(costs)
            plt.show()