def predict(self, image_matrix):
        X = image_matrix.reshape((1, image_matrix.size))
        mu = np.loadtxt('mu.csv')
        sigma = np.loadtxt('sigma.csv')
        X, mu, sigma = normalize(X, mu, sigma)
        hidden_size = self.config['hidden_size']
        input_size = self.config['input_size']
        num_labels = self.config['num_labels']
        theta1 = self.nn_params[:((hidden_size) * (input_size + 1))].reshape(
            (hidden_size, input_size + 1))
        theta2 = self.nn_params[((hidden_size) * (input_size + 1)):].reshape(
            (num_labels, hidden_size + 1))

        a1 = insert_bias(X)

        z2 = theta1.dot(a1.T)
        a2 = sigmoid(z2)

        a2 = insert_bias(a2.T)
        print theta2.shape

        z3 = theta2.dot(a2.T)
        h = sigmoid(z3)
        print h
        return h
    def nn_cfx(self, X, y, nn_params):
        input_size = self.config['input_size']
        num_labels = self.config['num_labels']
        hidden_size = self.config['hidden_size']
        lambda_ = self.config['lambda']

        theta1 = nn_params[:((hidden_size) * (input_size + 1))].reshape(
            (hidden_size, input_size + 1))

        theta2 = nn_params[((hidden_size) * (input_size + 1)):].reshape(
            (num_labels, hidden_size + 1))
        m = X.shape[0]

        J = 0
        theta1_grad = np.zeros(theta1.shape)
        theta2_grad = np.zeros(theta2.shape)

        a1 = insert_bias(X)

        z2 = theta1.dot(a1.T)
        a2 = sigmoid(z2)

        a2 = insert_bias(a2.T)

        z3 = theta2.dot(a2.T)
        h = sigmoid(z3)

        yk = np.zeros((num_labels, m))


        #back propagation

        for i in range(m):
            yk[int(y[i])-1, i] = 1.0

        error = (-yk) * np.log(h) - (1 - yk) * np.log(1 - h)
        J = (1.0/m)*sum(sum(error))

        t1 = np.array(theta1[:,1:])
        t2 = np.array(theta2[:,1:])

        sum1 = sum(sum(np.power(t1,2)))
        sum2 = sum(sum(np.power(t2,2)))

        r = (lambda_/(2.0*m))*(sum1 + sum2)
        J += r

        for t in range(m):
            z2 = np.matrix(theta1.dot(a1[t,:].T)).T #change to t later
            a2 = sigmoid(z2)
            a2 = insert_bias_row(a2)

            z3 = theta2.dot(a2)
            h = sigmoid(z3)

            z2 = insert_bias_row(z2)

            output = np.matrix(yk[:,t]).T #change to t later

            d3 = np.matrix(h - output)
            sg = np.matrix(sigmoid_gradient(z2))
            d2 = np.multiply(theta2.T.dot(d3),sg)
            d2 = d2[1:,:]

            theta2_grad += d3.dot(a2.T)
            theta1_grad += d2.dot(np.matrix(a1[t,:])) #change to t later

        # regularization

        theta1_grad[:,0] = np.matrix(theta1_grad[:,0]/(m*1.0))
        theta1_grad[:,1:] = (theta1_grad[:,1:]*(1/(m*1.0)) + ((lambda_/(m*1.0)*theta1[:,1:])))

        theta2_grad[:,0] = np.matrix(theta2_grad[:,0]/(m*1.0))
        theta2_grad[:,1:] = (theta2_grad[:,1:]*(1/(m*1.0)) + ((lambda_/(m*1.0)*theta2[:,1:])))
        #print accuracy(predict1(theta1_grad, theta2_grad, X), y)
        return J, wrap(theta1_grad, theta2_grad)