Example No. 1
    def fit(self, X, Y, learning_rate=10e-6, reg=10e-7, epochs=10001, show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            # forward propagation and cost calculation
            pY, Z_4, Z_3, Z_2, Z_1 = self.forward(X)

            Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
            Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
            Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
            Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

            # gradient descent step
            pY_T = pY - T
            self.weights[-1] -= learning_rate * (Z_4.T.dot(pY_T) + reg * self.weights[-1])
            self.biases[-1] -= learning_rate * (pY_T.sum(axis=0) + reg * self.biases[-1])

            dZ_4 = pY_T.dot((self.weights[-1]).T) * Z_4_deriv
            self.weights[-2] -= learning_rate * (Z_3.T.dot(dZ_4) + reg * self.weights[-2])
            self.biases[-2] -= learning_rate * (dZ_4.sum(axis=0) + reg * self.biases[-2])

            dZ_3 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot((self.weights[-2]).T) * Z_3_deriv
            self.weights[-3] -= learning_rate * (Z_2.T.dot(dZ_3) + reg * self.weights[-3])
            self.biases[-3] -= learning_rate * (dZ_3.sum(axis=0) + reg * self.biases[-3])

            dZ_2 = (((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot((self.weights[-2]).T) * Z_3_deriv).dot((self.weights[-3]).T)) * Z_2_deriv
            self.weights[-4] -= learning_rate * (Z_1.T.dot(dZ_2) + reg * self.weights[-4])
            self.biases[-4] -= learning_rate * (dZ_2.sum(axis=0) + reg * self.biases[-4])

            dZ_1 = (((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot((self.weights[-2]).T) * Z_3_deriv).dot((self.weights[-3]).T)) * Z_2_deriv).dot((self.weights[-4]).T)) * Z_1_deriv
            self.weights[-5] -= learning_rate * (X.T.dot(dZ_1) + reg * self.weights[-5])
            self.biases[-5] -= learning_rate * (dZ_1.sum(axis=0) + reg * self.biases[-5])


            if i % 4000 == 0:
                pYvalid, _, __, ___, ____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                e = error_rate(Y, np.argmax(pYvalid, axis=1))
                print("i:", i, "cost:", c, "error:", e)
                if e < best_validation_error:
                    best_validation_error = e
                print("best_validation_error:", best_validation_error)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def score(self, X, Y):
        #if not isinstance(Y, np.ndarray):
        Y = y_hot_encoding(Y).astype(np.float32)
        p = self.make_prediction(X)
        return 1 - error_rate(np.argmax(Y, axis=1), p)
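These examples call several helpers (init_weight_and_bias, y_hot_encoding, cost, error_rate) that are not shown on this page. Below is a minimal sketch of plausible definitions consistent with how they are used above; the original implementations may differ.

import numpy as np

def init_weight_and_bias(M1, M2):
    # small random weights scaled by fan-in + fan-out, zero biases
    W = np.random.randn(M1, M2) / np.sqrt(M1 + M2)
    b = np.zeros(M2)
    return W.astype(np.float32), b.astype(np.float32)

def y_hot_encoding(Y):
    # one-hot encode integer class labels 0..K-1
    N, K = len(Y), len(set(Y))
    T = np.zeros((N, K))
    T[np.arange(N), Y] = 1
    return T

def cost(T, pY):
    # total cross-entropy between one-hot targets and predicted probabilities
    return -(T * np.log(pY)).sum()

def error_rate(targets, predictions):
    # fraction of misclassified samples, given flat integer labels
    return np.mean(targets != predictions)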
    def fit(self,
            X,
            Y,
            learning_rate=10e-7,
            mu=0.99,
            decay=0.999,
            reg=10e-3,
            epochs=400,
            batch_size=100,
            split=True,
            show_fig=False,
            print_every=20):
        self.epochs = epochs
        K = len(set(Y))
        X, Y = X.astype(np.float32).toarray(), y_hot_encoding(Y).astype(
            np.float32)
        X, Y = shuffle(X, Y)
        if split:
            Xvalid, Yvalid = X[-1000:], Y[-1000:]
            X, Y = X[:-1000], Y[:-1000]
        else:
            Xvalid, Yvalid = X, Y
        Yvalid_flat = np.argmax(Yvalid, axis=1)
        '''initialize hidden layers'''
        N, D = X.shape
        M1 = D
        self.hidden_layers = []
        for id in range(len(self.hidden_layer_sizes)):
            '''BEFORE IT WAS HiddenLayerBatchNorm'''
            self.hidden_layers.append(
                HiddenLayerBatchNorm_1(M1, self.hidden_layer_sizes[id], id,
                                       self.nonlin_functions[id]))
            M1 = self.hidden_layer_sizes[id]
        self.hidden_layers.append(
            HiddenLayer_1(M1, K, len(self.hidden_layer_sizes), "None"))

        tfX = tf.placeholder(tf.float32, shape=(None, D), name="tfX")
        tfT = tf.placeholder(tf.float32, shape=(None, K), name="tfT")
        self.test = tf.placeholder(tf.float32, shape=(None, D), name="tfTest")
        logits = self.forward(tfX, is_training=True)

        rcost = reg * sum([
            tf.nn.l2_loss(coefs) for layer in self.hidden_layers
            for coefs in layer.params
        ])
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=tfT)) + rcost
        prediction = self.predict(tfX)

        #train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)
        train_op = tf.train.AdamOptimizer(learning_rate,
                                          beta1=0.99,
                                          beta2=0.999).minimize(cost)
        #train_op = tf.train.MomentumOptimizer(learning_rate, momentum=mu, use_nesterov=False).minimize(cost)
        #train_op = tf.train.ProximalGradientDescentOptimizer(learning_rate, l2_regularization_strength=0.0, use_locking=False).minimize(cost)

        n_batches = int(N / batch_size)
        costs = []

        self.session.run(tf.global_variables_initializer())
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_size:(j * batch_size + batch_size)]
                Ybatch = Y[j * batch_size:(j * batch_size + batch_size)]
                self.session.run(train_op,
                                 feed_dict={
                                     tfX: Xbatch,
                                     tfT: Ybatch
                                 })

                if j % print_every == 0:
                    costs.append(
                        self.session.run(cost,
                                         feed_dict={
                                             tfX: Xvalid,
                                             tfT: Yvalid
                                         }))
                    p = self.session.run(prediction,
                                         feed_dict={
                                             tfX: Xvalid,
                                             tfT: Yvalid
                                         })
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:",
                          costs[-1], "error_rate:", error_rate(Yvalid_flat, p))

        print("Done!")

        if show_fig:
            plt.plot(costs)
            plt.show()
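The fit above builds its network from HiddenLayerBatchNorm_1 and HiddenLayer_1, which are not shown here. The following is a minimal, hypothetical sketch consistent with how they are used (TF 1.x; it normalizes with the current batch statistics only and keeps no running averages for inference):

import numpy as np
import tensorflow as tf  # TF 1.x API, matching the examples above

class HiddenLayerBatchNorm_1(object):
    def __init__(self, M1, M2, layer_id, nonlin):
        self.nonlin = nonlin
        W = (np.random.randn(M1, M2) / np.sqrt(M1 + M2)).astype(np.float32)
        self.W = tf.Variable(W, name="W_%d" % layer_id)
        self.gamma = tf.Variable(np.ones(M2, dtype=np.float32))
        self.beta = tf.Variable(np.zeros(M2, dtype=np.float32))
        self.params = [self.W, self.gamma, self.beta]

    def forward(self, X, is_training=True):
        a = tf.matmul(X, self.W)
        mean, var = tf.nn.moments(a, axes=[0])  # batch statistics
        a = tf.nn.batch_normalization(a, mean, var, self.beta, self.gamma, 1e-4)
        return tf.nn.relu(a) if self.nonlin == "relu" else a

class HiddenLayer_1(object):
    def __init__(self, M1, M2, layer_id, nonlin):
        self.nonlin = nonlin
        W = (np.random.randn(M1, M2) / np.sqrt(M1 + M2)).astype(np.float32)
        self.W = tf.Variable(W, name="W_%d" % layer_id)
        self.b = tf.Variable(np.zeros(M2, dtype=np.float32))
        self.params = [self.W, self.b]

    def forward(self, X, is_training=True):
        a = tf.matmul(X, self.W) + self.b
        return tf.nn.relu(a) if self.nonlin == "relu" else a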
    def fit(self,
            X,
            Y,
            learning_rate=5 * 10e-5,
            reg=10e-2,
            epochs=51,
            show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        batch_sz = 100
        n_batches = int(N / batch_sz)
        # momentum
        decay_rate = 0.999
        eps = 10e-10
        cache_W = [1, 1, 1, 1, 1]
        cache_b = [1, 1, 1, 1, 1]

        mu = 0.9
        dW = [0, 0, 0, 0, 0]
        db = [0, 0, 0, 0, 0]

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz), ]

                # forward propagation and cost calculation
                pY, Z_4, Z_3, Z_2, Z_1 = self.forward(Xbatch)

                Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
                Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
                Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
                Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

                # gradient descent step
                # learning_rate=5*10e-5, reg=10e-2, epochs=51
                pY_T = pY - Tbatch
                gW5 = Z_4.T.dot(pY_T) + reg * self.weights[-1]
                gb5 = pY_T.sum(axis=0) + reg * self.biases[-1]
                cache_W[-1] = decay_rate * cache_W[-1] + (
                    1 - decay_rate) * gW5 * gW5
                cache_b[-1] = decay_rate * cache_b[-1] + (
                    1 - decay_rate) * gb5 * gb5
                dW[-1] = mu * dW[-1] - (1 - mu) * learning_rate * gW5 / (
                    np.sqrt(cache_W[-1]) + eps)
                db[-1] = mu * db[-1] - (1 - mu) * learning_rate * gb5 / (
                    np.sqrt(cache_b[-1]) + eps)
                self.weights[-1] += dW[-1]
                self.biases[-1] += db[-1]

                dZ_4 = pY_T.dot((self.weights[-1]).T) * Z_4_deriv
                gW4 = Z_3.T.dot(dZ_4) + reg * self.weights[-2]
                gb4 = dZ_4.sum(axis=0) + reg * self.biases[-2]
                cache_W[-2] = decay_rate * cache_W[-2] + (
                    1 - decay_rate) * gW4 * gW4
                cache_b[-2] = decay_rate * cache_b[-2] + (
                    1 - decay_rate) * gb4 * gb4
                dW[-2] = mu * dW[-2] - (1 - mu) * learning_rate * gW4 / (
                    np.sqrt(cache_W[-2]) + eps)
                db[-2] = mu * db[-2] - (1 - mu) * learning_rate * gb4 / (
                    np.sqrt(cache_b[-2]) + eps)
                self.weights[-2] += dW[-2]
                self.biases[-2] += db[-2]

                dZ_3 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv
                gW3 = Z_2.T.dot(dZ_3) + reg * self.weights[-3]
                gb3 = dZ_3.sum(axis=0) + reg * self.biases[-3]
                cache_W[-3] = decay_rate * cache_W[-3] + (
                    1 - decay_rate) * gW3 * gW3
                cache_b[-3] = decay_rate * cache_b[-3] + (
                    1 - decay_rate) * gb3 * gb3
                dW[-3] = mu * dW[-3] - (1 - mu) * learning_rate * gW3 / (
                    np.sqrt(cache_W[-3]) + eps)
                db[-3] = mu * db[-3] - (1 - mu) * learning_rate * gb3 / (
                    np.sqrt(cache_b[-3]) + eps)
                self.weights[-3] += dW[-3]
                self.biases[-3] += db[-3]

                dZ_2 = (((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv
                gW2 = Z_1.T.dot(dZ_2) + reg * self.weights[-4]
                gb2 = dZ_2.sum(axis=0) + reg * self.biases[-4]
                cache_W[-4] = decay_rate * cache_W[-4] + (
                    1 - decay_rate) * gW2 * gW2
                cache_b[-4] = decay_rate * cache_b[-4] + (
                    1 - decay_rate) * gb2 * gb2
                dW[-4] = mu * dW[-4] - (1 - mu) * learning_rate * gW2 / (
                    np.sqrt(cache_W[-4]) + eps)
                db[-4] = mu * db[-4] - (1 - mu) * learning_rate * gb2 / (
                    np.sqrt(cache_b[-4]) + eps)
                self.weights[-4] += dW[-4]
                self.biases[-4] += db[-4]

                dZ_1 = (((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv).dot(
                            (self.weights[-4]).T)) * Z_1_deriv
                gW1 = Xbatch.T.dot(dZ_1) + reg * self.weights[-5]
                gb1 = dZ_1.sum(axis=0) + reg * self.biases[-5]
                cache_W[-5] = decay_rate * cache_W[-5] + (
                    1 - decay_rate) * gW1 * gW1
                cache_b[-5] = decay_rate * cache_b[-5] + (
                    1 - decay_rate) * gb1 * gb1
                dW[-5] = mu * dW[-5] - (1 - mu) * learning_rate * gW1 / (
                    np.sqrt(cache_W[-5]) + eps)
                db[-5] = mu * db[-5] - (1 - mu) * learning_rate * gb1 / (
                    np.sqrt(cache_b[-5]) + eps)
                self.weights[-5] += dW[-5]
                self.biases[-5] += db[-5]

                # if j % 10 == 0:
                #    pYvalid, _, __, ___, ____ = self.forward(X)
                #    c = cost(T, pYvalid)
                #    costs.append(c)
                #    e = error_rate(Y, np.argmax(pYvalid, axis=1))
                #    print("i:", i, "cost:", c, "error:", e)
                #    if e < best_validation_error:
                #        best_validation_error = e
                #    print("best_validation_error:", best_validation_error)
            if i % 10 == 0:
                pYvalid, _, __, ___, ____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                print("i:", i, "cost:", c)
        if show_fig:
            plt.plot(costs)
            plt.show()
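The per-parameter update in this example combines an RMSprop cache (a running average of squared gradients) with classical momentum. A minimal sketch of the same rule for a single parameter, using the constants from the example (the helper itself is hypothetical, not part of the original code):

import numpy as np

def rmsprop_momentum_step(param, g, cache, velocity,
                          learning_rate=5e-4, decay_rate=0.999, mu=0.9, eps=1e-9):
    # running average of the squared gradient (RMSprop cache)
    cache = decay_rate * cache + (1 - decay_rate) * g * g
    # momentum applied to the adaptive step, matching the update used above
    velocity = mu * velocity - (1 - mu) * learning_rate * g / (np.sqrt(cache) + eps)
    return param + velocity, cache, velocity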
Example No. 5
    def fit(self,
            X,
            Y,
            learning_rate=10e-7,
            mu=0.99,
            decay=0.999,
            reg=10e-3,
            epochs=400,
            batch_size=100,
            split=True,
            show_fig=False,
            print_every=20):
        self.epochs = epochs
        K = len(set(Y))
        X, Y = X.astype(np.float32).toarray(), y_hot_encoding(Y).astype(
            np.float32)
        X, Y = shuffle(X, Y)
        if split:
            Xvalid, Yvalid = X[-1000:], Y[-1000:]
            X, Y = X[:-1000], Y[:-1000]
        else:
            Xvalid, Yvalid = X, Y
        Yvalid_flat = np.argmax(Yvalid, axis=1)

        self.training = True
        '''Clears the default graph stack and resets the global default graph.'''
        tf.reset_default_graph()
        '''initialize hidden layers'''
        N, D = X.shape
        M1 = D
        self.hidden_layers = []
        for id in range(len(self.hidden_layer_sizes)):
            self.hidden_layers.append(
                HiddenLayer(M1, self.hidden_layer_sizes[id], id,
                            self.nonlin_functions[id]))
            M1 = self.hidden_layer_sizes[id]

        self.params = list(map(tf.Variable, init_weight_and_bias(M1, K)))
        for h in self.hidden_layers:
            self.params += h.params

        tfX = tf.placeholder(tf.float32, shape=(None, D), name="tfX")
        tfT = tf.placeholder(tf.float32, shape=(None, K), name="tfT")
        logits = self.forward(tfX)

        rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=tfT)) + rcost
        #cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=tfT)) #+ rcost
        prediction = self.predict(tfX)

        #train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)
        train_op = tf.train.AdamOptimizer(learning_rate,
                                          beta1=0.99,
                                          beta2=0.999).minimize(cost)
        #train_op = tf.train.MomentumOptimizer(learning_rate, momentum=mu, use_nesterov=False).minimize(cost)
        #train_op = tf.train.ProximalGradientDescentOptimizer(learning_rate, l2_regularization_strength=0.0, use_locking=False).minimize(cost)

        n_batches = int(N / batch_size)
        costs = []
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                for j in range(n_batches):
                    Xbatch = X[j * batch_size:(j * batch_size + batch_size)]
                    Ybatch = Y[j * batch_size:(j * batch_size + batch_size)]
                    session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                    if j % print_every == 0:
                        costs.append(
                            session.run(cost,
                                        feed_dict={
                                            tfX: Xvalid,
                                            tfT: Yvalid
                                        }))
                        p = session.run(prediction,
                                        feed_dict={
                                            tfX: Xvalid,
                                            tfT: Yvalid
                                        })
                        print("i:", i, "j:", j, "nb:", n_batches, "cost:",
                              costs[-1], "error_rate:",
                              error_rate(Yvalid_flat, p))
            saver = tf.train.Saver()
            '''Now, save the graph'''
            saver.save(session,
                       './my_model-' + str(self.counter),
                       global_step=self.epochs)
            print("Done!")

        if show_fig:
            plt.plot(costs)
            plt.show()
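This example saves the trained graph with tf.train.Saver. Restoring it later would look roughly like the sketch below (TF 1.x); the checkpoint filename is assumed here, since it depends on self.counter and self.epochs, and the tensor name matches the tfX placeholder defined above.

import tensorflow as tf

with tf.Session() as session:
    # hypothetical filename: saver.save(...) above writes
    # "./my_model-<counter>-<epochs>" plus a ".meta" graph file
    saver = tf.train.import_meta_graph("./my_model-0-400.meta")
    saver.restore(session, tf.train.latest_checkpoint("./"))
    tfX = tf.get_default_graph().get_tensor_by_name("tfX:0")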
    def fit(self,
            X,
            Y,
            batch_size,
            learning_rate=10e-6,
            reg=10e-7,
            epochs=10001,
            show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        batch_sz = batch_size
        n_batches = int(N / batch_sz)
        #momentum
        mu = 0.9
        dW = [0, 0, 0, 0, 0]
        db = [0, 0, 0, 0, 0]

        costs = []
        best_validation_error = 1
        self.training = True
        for i in range(epochs):
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz), ]

                # forward propagation and cost calculation
                pY, Z_4, Z_3, Z_2, Z_1, U = self.forward(Xbatch)

                Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
                Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
                Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
                Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

                # gradient descent step
                pY_T = pY - Tbatch
                dW[-1] = mu * dW[-1] - (1 - mu) * learning_rate * (
                    Z_4.T.dot(pY_T) + reg * self.weights[-1])
                db[-1] = mu * db[-1] - (1 - mu) * learning_rate * (
                    pY_T.sum(axis=0) + reg * self.biases[-1])
                self.weights[-1] += dW[-1]
                self.biases[-1] += db[-1]

                dZ_4 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv) * U[-1]
                dW[-2] = mu * dW[-2] - (1 - mu) * learning_rate * (
                    Z_3.T.dot(dZ_4) + reg * self.weights[-2])
                db[-2] = mu * db[-2] - (1 - mu) * learning_rate * (
                    dZ_4.sum(axis=0) + reg * self.biases[-2])
                self.weights[-2] += dW[-2]
                self.biases[-2] += db[-2]

                dZ_3 = ((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv) * U[-2]
                dW[-3] = mu * dW[-3] - (1 - mu) * learning_rate * (
                    Z_2.T.dot(dZ_3) + reg * self.weights[-3])
                db[-3] = mu * db[-3] - (1 - mu) * learning_rate * (
                    dZ_3.sum(axis=0) + reg * self.biases[-3])
                self.weights[-3] += dW[-3]
                self.biases[-3] += db[-3]

                dZ_2 = ((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv) * U[-3]
                dW[-4] = mu * dW[-4] - (1 - mu) * learning_rate * (
                    Z_1.T.dot(dZ_2) + reg * self.weights[-4])
                db[-4] = mu * db[-4] - (1 - mu) * learning_rate * (
                    dZ_2.sum(axis=0) + reg * self.biases[-4])
                self.weights[-4] += dW[-4]
                self.biases[-4] += db[-4]

                dZ_1 = ((((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv).dot(
                            (self.weights[-4]).T)) * Z_1_deriv) * U[-4]
                dW[-5] = mu * dW[-5] - (1 - mu) * learning_rate * (
                    Xbatch.T.dot(dZ_1) + reg * self.weights[-5])
                db[-5] = mu * db[-5] - (1 - mu) * learning_rate * (
                    dZ_1.sum(axis=0) + reg * self.biases[-5])
                self.weights[-5] += dW[-5]
                self.biases[-5] += db[-5]

                #if j % 10 == 0:
                #    pYvalid, _, __, ___, ____ = self.forward(X)
                #    c = cost(T, pYvalid)
                #    costs.append(c)
                #    e = error_rate(Y, np.argmax(pYvalid, axis=1))
                #    print("i:", i, "cost:", c, "error:", e)
                #    if e < best_validation_error:
                #        best_validation_error = e
                #    print("best_validation_error:", best_validation_error)
            if i % 50 == 0:
                pYvalid, _, __, ___, ____, _____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                print("i:", i, "cost:", c)
        if show_fig:
            plt.plot(costs)
            plt.show()
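In this example forward() also returns U, a list of masks that are multiplied into the backpropagated errors, which is consistent with dropout applied to the hidden layers. A minimal sketch of how such a mask could be produced (hypothetical; the original forward() is not shown, and p_keep is an assumed value):

import numpy as np

def dropout_mask(shape, p_keep=0.8):
    # Bernoulli mask of 0s and 1s; multiplying activations (and, as above,
    # the backpropagated errors) by it drops a fraction (1 - p_keep) of the units
    return (np.random.rand(*shape) < p_keep).astype(np.float32)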
Example No. 7
    def fit(self,
            X,
            Y,
            learning_rate=5 * 10e-5,
            reg=10e-2,
            epochs=51,
            show_fig=False):
        N, D = X.shape
        K = len(set(Y))
        T = y_hot_encoding(Y)
        W1, b1 = init_weight_and_bias(D, self.M_1)
        W2, b2 = init_weight_and_bias(self.M_1, self.M_2)
        W3, b3 = init_weight_and_bias(self.M_2, self.M_3)
        W4, b4 = init_weight_and_bias(self.M_3, self.M_4)
        W5, b5 = init_weight_and_bias(self.M_4, K)
        self.weights = [W1, W2, W3, W4, W5]
        self.biases = [b1, b2, b3, b4, b5]

        batch_sz = 100
        n_batches = int(N / batch_sz)

        decay_rate = 0.999
        eps = 10e-10
        beta_1 = 0.9
        beta_2 = 0.999

        # first momentum
        m_W = [0, 0, 0, 0, 0]
        m_b = [0, 0, 0, 0, 0]

        #second momentum
        v_W = [0, 0, 0, 0, 0]
        v_b = [0, 0, 0, 0, 0]

        def updater(idx, gW, gb):
            # Adam step: keep running (biased) first and second moment estimates,
            # then apply the bias correction for iteration t before updating
            m_W[idx] = beta_1 * m_W[idx] + (1 - beta_1) * gW
            m_b[idx] = beta_1 * m_b[idx] + (1 - beta_1) * gb
            v_W[idx] = beta_2 * v_W[idx] + (1 - beta_2) * gW * gW
            v_b[idx] = beta_2 * v_b[idx] + (1 - beta_2) * gb * gb
            m_W_hat = m_W[idx] / (1 - beta_1**t)
            m_b_hat = m_b[idx] / (1 - beta_1**t)
            v_W_hat = v_W[idx] / (1 - beta_2**t)
            v_b_hat = v_b[idx] / (1 - beta_2**t)
            self.weights[idx] -= learning_rate * m_W_hat / (np.sqrt(v_W_hat) + eps)
            self.biases[idx] -= learning_rate * m_b_hat / (np.sqrt(v_b_hat) + eps)

        costs = []
        best_validation_error = 1
        for i in range(epochs):
            for j in range(n_batches):
                #num of iteration
                t = 1 + i * n_batches + j

                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                Tbatch = T[j * batch_sz:(j * batch_sz + batch_sz), ]

                # forward propagation and cost calculation
                pY, Z_4, Z_3, Z_2, Z_1 = self.forward(Xbatch)

                Z_4_deriv = self.nonlinear(self.layers[-1], Z=Z_4)[1]
                Z_3_deriv = self.nonlinear(self.layers[-1], Z=Z_3)[1]
                Z_2_deriv = self.nonlinear(self.layers[-2], Z=Z_2)[1]
                Z_1_deriv = self.nonlinear(self.layers[-3], Z=Z_1)[1]

                # gradient descent step
                # learning_rate=5*10e-5, reg=10e-2, epochs=51
                pY_T = pY - Tbatch
                gW5 = Z_4.T.dot(pY_T) + reg * self.weights[-1]
                gb5 = pY_T.sum(axis=0) + reg * self.biases[-1]
                updater(-1, gW5, gb5)

                dZ_4 = pY_T.dot((self.weights[-1]).T) * Z_4_deriv
                gW4 = Z_3.T.dot(dZ_4) + reg * self.weights[-2]
                gb4 = dZ_4.sum(axis=0) + reg * self.biases[-2]
                updater(-2, gW4, gb4)

                dZ_3 = (pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv
                gW3 = Z_2.T.dot(dZ_3) + reg * self.weights[-3]
                gb3 = dZ_3.sum(axis=0) + reg * self.biases[-3]
                updater(-3, gW3, gb3)

                dZ_2 = (((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv
                gW2 = Z_1.T.dot(dZ_2) + reg * self.weights[-4]
                gb2 = dZ_2.sum(axis=0) + reg * self.biases[-4]
                updater(-4, gW2, gb2)

                dZ_1 = (((((pY_T.dot((self.weights[-1]).T) * Z_4_deriv).dot(
                    (self.weights[-2]).T) * Z_3_deriv).dot(
                        (self.weights[-3]).T)) * Z_2_deriv).dot(
                            (self.weights[-4]).T)) * Z_1_deriv
                gW1 = Xbatch.T.dot(dZ_1) + reg * self.weights[-5]
                gb1 = dZ_1.sum(axis=0) + reg * self.biases[-5]
                updater(-5, gW1, gb1)

                # if j % 10 == 0:
                #    pYvalid, _, __, ___, ____ = self.forward(X)
                #    c = cost(T, pYvalid)
                #    costs.append(c)
                #    e = error_rate(Y, np.argmax(pYvalid, axis=1))
                #    print("i:", i, "cost:", c, "error:", e)
                #    if e < best_validation_error:
                #        best_validation_error = e
                #    print("best_validation_error:", best_validation_error)

            if i % 10 == 0:
                pYvalid, _, __, ___, ____ = self.forward(X)
                c = cost(T, pYvalid)
                costs.append(c)
                print("i:", i, "cost:", c)
        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self,
            X,
            Y,
            optimizer="adam",
            optimizer_params=(10e-4, 0.99, 0.999),
            reg=10e-3,
            epochs=400,
            batch_size=100,
            split=True,
            show_fig=False,
            print_every=20,
            print_tofile=False):

        #learning_rate, mu, decay, reg = map(np.float32, [learning_rate, mu, decay, reg])
        K = len(set(Y))
        X, Y = X.astype(np.float32), y_hot_encoding(Y).astype(np.float32)
        X, Y = shuffle(X, Y)
        if split:
            Xvalid, Yvalid = X[-1000:], Y[-1000:]
            X, Y = X[:-1000], Y[:-1000]
        else:
            Xvalid, Yvalid = X, Y
        Yvalid_flat = np.argmax(Yvalid, axis=1)
        ''' initialize convpool layers '''
        N, width, height, color = X.shape
        input_feature = color
        self.convpool_layers = []
        # each entry of self.convpull_layer_sizes should be (new_feature, filter_width, filter_height)
        for index, outF_wdt_hgt in enumerate(self.convpull_layer_sizes):
            self.convpool_layers.append(
                ConvPullLayer(input_feature, *outF_wdt_hgt,
                              self.conv_nonlin_functions[index], self.poolsz))
            input_feature = outF_wdt_hgt[0]

        # shape of the image after the series of convolution + maxpool layers
        final_output_width = width / (self.poolsz[0] ** len(self.convpull_layer_sizes))
        final_output_height = height / (self.poolsz[1] ** len(self.convpull_layer_sizes))
        ''' initialize hidden layers '''
        # size of output feature of last convpull layer * shape of output image
        M1 = int(self.convpull_layer_sizes[-1][0] * final_output_width *
                 final_output_height)
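        # e.g. (hypothetical shapes) 48x48 inputs with poolsz=(2, 2) and three
        # convpool layers give 48 / 2**3 = 6, so M1 = last_feature_count * 6 * 6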
        self.hidden_layers = []
        for id in range(len(self.hidden_layer_sizes)):
            '''BEFORE IT WAS HiddenLayerBatchNorm'''
            self.hidden_layers.append(
                HiddenLayerBatchNorm(M1, self.hidden_layer_sizes[id], id,
                                     self.nonlin_functions[id]))
            M1 = self.hidden_layer_sizes[id]

        self.hidden_layers.append(
            HiddenLayer(M1, K, len(self.hidden_layer_sizes), "None"))
        tfX = tf.placeholder(tf.float32,
                             shape=(None, width, height, color),
                             name="tfX")
        tfT = tf.placeholder(tf.float32, shape=(None, K), name="tfT")
        #self.test = tf.placeholder(tf.float32, shape=(None, D), name="tfTest")
        logits = self.forward(tfX, is_training=True)

        rcost = reg * sum([
            tf.nn.l2_loss(coefs)
            for layer in (self.convpool_layers + self.hidden_layers)
            for coefs in layer.params
        ])
        cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=tfT)) + rcost
        prediction = self.predict(tfX)

        train_op = self.optimizer(optimizer=optimizer,
                                  opt_args=optimizer_params).minimize(cost)

        n_batches = int(N / batch_size)
        batch_costs = []
        valid_costs = []
        error = []

        self.session.run(tf.global_variables_initializer())

        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_size:(j * batch_size + batch_size)]
                Ybatch = Y[j * batch_size:(j * batch_size + batch_size)]
                self.session.run(train_op,
                                 feed_dict={
                                     tfX: Xbatch,
                                     tfT: Ybatch
                                 })
                if j % print_every == 0:
                    batch_costs.append(
                        self.session.run(cost,
                                         feed_dict={
                                             tfX: Xbatch,
                                             tfT: Ybatch
                                         }))
                    valid_costs.append(
                        self.session.run(cost,
                                         feed_dict={
                                             tfX: Xvalid,
                                             tfT: Yvalid
                                         }))
                    p = self.session.run(prediction,
                                         feed_dict={
                                             tfX: Xvalid,
                                             tfT: Yvalid
                                         })
                    err_rate = error_rate(Yvalid_flat, p)
                    error.append(err_rate)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:",
                          valid_costs[-1], "error_rate:", err_rate)

        print("Done!")

        if show_fig:
            plt.plot(valid_costs)
            plt.xlabel('20 * iteration', fontsize=14)
            plt.ylabel('cost', fontsize=14)
            plt.grid()
            plt.show()

        if print_tofile:
            my_df = pd.DataFrame([batch_costs, valid_costs, error])
            my_df.to_csv(print_tofile, index=False, header=False)
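The training op above comes from self.optimizer, a helper that is not shown. A plausible sketch, assuming optimizer_params is (learning_rate, beta1-or-decay, beta2-or-momentum):

    def optimizer(self, optimizer="adam", opt_args=(10e-4, 0.99, 0.999)):
        # hypothetical helper: map an optimizer name and its arguments onto a
        # TF 1.x optimizer instance; only .minimize(cost) is used above
        if optimizer == "adam":
            lr, beta1, beta2 = opt_args
            return tf.train.AdamOptimizer(lr, beta1=beta1, beta2=beta2)
        if optimizer == "rmsprop":
            lr, decay, momentum = opt_args
            return tf.train.RMSPropOptimizer(lr, decay=decay, momentum=momentum)
        raise ValueError("unknown optimizer: %s" % optimizer)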