def predict(Theta1, Theta2, X):
    """ outputs the predicted label of X given the
    trained weights of a neural network (Theta1, Theta2)
    """

# Useful values
    m, _ = X.shape
    num_labels, _ = Theta2.shape

# ====================== YOUR CODE HERE ======================
# Instructions: Complete the following code to make predictions using
#               your learned neural network. You should set p to a 
#               vector containing labels from 1 to num_labels.
#
# Hint: The max function might come in useful. In particular, the max
#       function can also return the index of the max element, for more
#       information see 'help max'. If your examples are in rows, then, you
#       can use max(A, [], 2) to obtain the max for each row.
#
    X = np.insert(X, 0, np.ones(m), 1)
    h1 = sigmoid(X.dot(Theta1.T))
    h1 = np.insert(h1, 0, np.ones(m), 1)
    h2 = sigmoid(h1.dot(Theta2.T))
    p = np.argmax(h2, axis=1)
# =========================================================================

    return p + 1        # add 1: np.argmax is 0-based, labels run from 1 to num_labels
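# --- Added usage sketch (not part of the original solutions): a quick
# shape/range check for the predict() above, using random weights sized like
# the ex3 network (400 inputs, 25 hidden units, 10 labels). The sizes and
# random data are assumptions, and sigmoid() is assumed to be defined
# elsewhere in the module, as in the snippet above.
import numpy as np

rng = np.random.default_rng(0)
Theta1_demo = rng.normal(scale=0.1, size=(25, 401))
Theta2_demo = rng.normal(scale=0.1, size=(10, 26))
X_demo = rng.normal(size=(5, 400))
p_demo = predict(Theta1_demo, Theta2_demo, X_demo)
assert p_demo.shape == (5,)
assert p_demo.min() >= 1 and p_demo.max() <= 10   # labels are 1..num_labels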
def sigmoidGradient(z):
    """computes the gradient of the sigmoid function
    evaluated at z. This should work regardless of whether z is a matrix
    or a vector. In particular, if z is a vector or matrix, you should
    return the gradient for each element."""

    # ====================== YOUR CODE HERE ======================
    # Instructions: Compute the gradient of the sigmoid function evaluated at
    #               each value of z (z can be a matrix, vector or scalar).
    g = sigmoid(z) * (1 - sigmoid(z))
    # =============================================================

    return g
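# --- Added sanity check: the sigmoid gradient g'(z) = g(z) * (1 - g(z))
# peaks at z = 0 with value 0.25 and is applied element-wise; sigmoid() is
# assumed to be defined elsewhere in the module, as above.
import numpy as np

z_demo = np.array([-1.0, 0.0, 1.0])
g_demo = sigmoidGradient(z_demo)
assert np.isclose(g_demo[1], 0.25)        # maximum slope at z = 0
assert np.isclose(g_demo[0], g_demo[2])   # symmetric about z = 0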
def sigmoidGradient(z):
    """computes the gradient of the sigmoid function
    evaluated at z. This should work regardless of whether z is a matrix
    or a vector. In particular, if z is a vector or matrix, you should
    return the gradient for each element."""

# ====================== YOUR CODE HERE ======================
# Instructions: Compute the gradient of the sigmoid function evaluated at
#               each value of z (z can be a matrix, vector or scalar).


# =============================================================
    g = sigmoid(z) * (1 - sigmoid(z))
    return g
def predict(Theta1, Theta2, X):
    """ outputs the predicted label of X given the
    trained weights of a neural network (Theta1, Theta2)
    """

# Useful values
    m, _ = X.shape
    num_labels, _ = Theta2.shape

# ====================== YOUR CODE HERE ======================
# Instructions: Complete the following code to make predictions using
#               your learned neural network. You should set p to a 
#               vector containing labels from 1 to num_labels.
#
# Hint: The max function might come in useful. In particular, the max
#       function can also return the index of the max element, for more
#       information see 'help max'. If your examples are in rows, then, you
#       can use max(A, [], 2) to obtain the max for each row.
#

# =========================================================================
    X = np.column_stack((np.ones((m, 1)), X))
    A2 = sigmoid(np.dot(X, Theta1.T))

    A2 = np.column_stack((np.ones((m, 1)), A2))
    p = np.argmax(np.dot(A2, Theta2.T), axis=1)
    return p + 1        # add 1: np.argmax is 0-based, labels run from 1 to num_labels
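# --- Added note: the variant above applies argmax to the raw output-layer
# scores A2.dot(Theta2.T) without the final sigmoid. Because sigmoid is
# strictly increasing, the argmax (and hence the predicted label) is the
# same either way; a minimal illustration, with sigmoid() assumed defined:
import numpy as np

scores_demo = np.array([[0.2, -1.3, 2.1]])
assert np.argmax(scores_demo) == np.argmax(sigmoid(scores_demo))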
def predictOneVsAll(all_theta, X):
    """will return a vector of predictions
  for each example in the matrix X. Note that X contains the examples in
  rows. all_theta is a matrix where the i-th row is a trained logistic
  regression theta vector for the i-th class. You should set p to a vector
  of values from 1..K (e.g., p = [1 3 1 2] predicts classes 1, 3, 1, 2
  for 4 examples) """

    m = X.shape[0]

    # You need to return the following variables correctly
    p = np.zeros((m, 1))

    # Add ones to the X data matrix
    X = np.column_stack((np.ones((m, 1)), X))

    # ====================== YOUR CODE HERE ======================
    # Instructions: Complete the following code to make predictions using
    #               your learned logistic regression parameters (one-vs-all).
    #               You should set p to a vector of predictions (from 1 to
    #               num_labels).
    #
    # Hint: This code can be done all vectorized using the max function.
    #       In particular, the max function can also return the index of the
    #       max element, for more information see 'help max'. If your examples
    #       are in rows, then, you can use max(A, [], 2) to obtain the max
    #       for each row.
    #

    # =========================================================================
    a = sigmoid(X.dot(all_theta.T))
    p = np.argmax(a, axis=1)
    return p + 1  # add 1: np.argmax is 0-based, labels run from 1 to num_labels
def gradientFunction(theta, X, y):
    """
    Compute the gradient for logistic regression (without regularization)

    computes the gradient of the cost of using theta as the parameter for
    logistic regression w.r.t. the parameters.
    """

    m = len(y)  # number of training examples

    # ====================== YOUR CODE HERE ======================
    # Instructions: Compute the gradient of a particular choice of theta.
    #               Compute the partial derivatives and set grad to the partial
    #               derivatives of the cost w.r.t. each parameter in theta

    # =============================================================
    # coerce inputs to numpy arrays if needed
    if not isinstance(X, np.ndarray):
        X = np.asarray(X)
    if not isinstance(y, np.ndarray):
        y = np.asarray(y)

    h = sigmoid(X.dot(theta))

    # make sure h and y have the same shape; y may arrive as a column
    # vector (m, 1) while h is a rank-1 array (m,)
    if h.shape != y.shape:
        y = y[:, 0]

    grad = (1.0 / m) * X.T.dot(h - y)

    return grad
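# --- Added check (an assumption about intent, not from the original source):
# gradientFunction() above should match a central finite-difference
# approximation of the unregularized logistic cost, written inline here as
# _logistic_cost(). Tiny synthetic data; sigmoid() is assumed defined above.
import numpy as np


def _logistic_cost(theta, X, y):
    h = sigmoid(X.dot(theta))
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))


rng = np.random.default_rng(1)
X_check = np.column_stack((np.ones(8), rng.normal(size=(8, 2))))
y_check = rng.integers(0, 2, size=8).astype(float)
theta_check = rng.normal(size=3)

grad_analytic = gradientFunction(theta_check, X_check, y_check)
eps = 1e-6
grad_numeric = np.array([
    (_logistic_cost(theta_check + eps * e, X_check, y_check)
     - _logistic_cost(theta_check - eps * e, X_check, y_check)) / (2 * eps)
    for e in np.eye(3)
])
assert np.allclose(grad_analytic, grad_numeric)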
def gradientFunctionReg(theta, X, y, Lambda):
    """
    Compute the gradient for logistic regression with regularization

    computes the gradient of the regularized cost of using theta as the
    parameter for logistic regression w.r.t. the parameters.
    """

    # ====================== YOUR CODE HERE ======================
    # Instructions: Compute the gradient of a particular choice of theta.
    #               Compute the partial derivatives and set grad to the partial
    #               derivatives of the cost w.r.t. each parameter in theta

    m = len(y)
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    h = sigmoid(np.dot(X, theta.T))
    grad = (1.0 / m) * np.dot(X.T, (h - y))
    gradreg = grad.T + np.multiply((Lambda / m), theta)
    # the bias parameter theta[0] is not regularized
    gradreg[0, 0] = np.sum(np.multiply(X[:, 0], (h - y))) / m

    # =============================================================

    return gradreg
def predict(Theta1, Theta2, X):
    """ outputs the predicted label of X given the
    trained weights of a neural network (Theta1, Theta2)
    """

    # Useful values
    m, _ = X.shape
    num_labels, _ = Theta2.shape

    # ====================== YOUR CODE HERE ======================
    # Instructions: Complete the following code to make predictions using
    #               your learned neural network. You should set p to a
    #               vector containing labels from 1 to num_labels.
    #
    # Hint: The max function might come in useful. In particular, the max
    #       function can also return the index of the max element, for more
    #       information see 'help max'. If your examples are in rows, then, you
    #       can use max(A, [], 2) to obtain the max for each row.
    #

    # Add ones to the X data matrix
    X = np.column_stack((np.ones((m, 1)), X))

    #calculate a(2)
    z = X.dot(Theta1.T)
    a2 = sigmoid(z)

    # Add ones to the a(2) data matrix
    a2 = np.column_stack((np.ones((m, 1)), a2))

    z2 = a2.dot(Theta2.T)
    a3 = sigmoid(z2)

    p = np.argmax(a3, axis=1)

    # =========================================================================

    return p + 1  # add 1: np.argmax is 0-based, labels run from 1 to num_labels
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, Lambda):
    """computes the cost and gradient of the neural network. The
  parameters for the neural network are "unrolled" into the vector
  nn_params and need to be converted back into the weight matrices.

  The returned parameter grad should be an "unrolled" vector of the
  partial derivatives of the neural network.
    """

    # Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
    # for our 2 layer neural network
    # Obtain Theta1 and Theta2 back from nn_params
    Theta1 = np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)],
                        (hidden_layer_size, input_layer_size + 1),
                        order='F').copy()

    Theta2 = np.reshape(nn_params[hidden_layer_size * (input_layer_size + 1):],
                        (num_labels, (hidden_layer_size + 1)),
                        order='F').copy()

    # Setup some useful variables
    m, _ = X.shape

    # ====================== YOUR CODE HERE ======================
    # Instructions: You should complete the code by working through the
    #               following parts.
    #
    # Part 1: Feedforward the neural network and return the cost in the
    #         variable J. After implementing Part 1, you can verify that your
    #         cost function computation is correct by verifying the cost
    #         computed in ex4.m
    #
    # Part 2: Implement the backpropagation algorithm to compute the gradients
    #         Theta1_grad and Theta2_grad. You should return the partial derivatives of
    #         the cost function with respect to Theta1 and Theta2 in Theta1_grad and
    #         Theta2_grad, respectively. After implementing Part 2, you can check
    #         that your implementation is correct by running checkNNGradients
    #
    #         Note: The vector y passed into the function is a vector of labels
    #               containing values from 1..K. You need to map this vector into a
    #               binary vector of 1's and 0's to be used with the neural network
    #               cost function.
    #
    #         Hint: We recommend implementing backpropagation using a for-loop
    #               over the training examples if you are implementing it for the
    #               first time.
    #
    # Part 3: Implement regularization with the cost function and gradients.
    #
    #         Hint: You can implement this around the code for
    #               backpropagation. That is, you can compute the gradients for
    #               the regularization separately and then add them to Theta1_grad
    #               and Theta2_grad from Part 2.
    #
    y_categorical = pd.get_dummies(y.ravel()).to_numpy().astype(float)

    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = a1.dot(Theta1.T)
    a2 = np.column_stack((np.ones((z2.shape[0], 1)), sigmoid(z2)))
    a3 = sigmoid(a2.dot(Theta2.T))
    J = np.sum(np.log(a3) * y_categorical + np.log(1 - a3) * (1 - y_categorical)) / float(-m) \
            + Lambda * (np.sum(np.square(Theta1[:,1:])) + np.sum(np.square(Theta2[:,1:]))) / (2 * m)

    a3_grad = a3 - y_categorical
    Theta2_grad = a3_grad.T.dot(a2) / m + Lambda * np.column_stack((np.zeros(
        (Theta2.shape[0], 1)), Theta2[:, 1:])) / m
    a2_grad = (a3_grad).dot(Theta2[:, 1:]) * sigmoidGradient(z2)
    Theta1_grad = a2_grad.T.dot(a1) / m + Lambda * np.column_stack((np.zeros(
        (Theta1.shape[0], 1)), Theta1[:, 1:])) / m
    # -------------------------------------------------------------

    # =========================================================================

    # Unroll gradient
    grad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    return J, grad
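# --- Added illustration: the order='F' reshape used by every nnCostFunction
# in this collection is the inverse of the Theta.T.ravel() unroll used for
# grad. The small sizes below are arbitrary assumptions.
import numpy as np

il, hl, nl = 3, 4, 2   # input, hidden and output layer sizes
T1 = np.arange(hl * (il + 1), dtype=float).reshape(hl, il + 1)
T2 = np.arange(nl * (hl + 1), dtype=float).reshape(nl, hl + 1)
params = np.hstack((T1.T.ravel(), T2.T.ravel()))          # column-major unroll
T1_back = np.reshape(params[:hl * (il + 1)], (hl, il + 1), order='F')
T2_back = np.reshape(params[hl * (il + 1):], (nl, hl + 1), order='F')
assert np.array_equal(T1, T1_back) and np.array_equal(T2, T2_back)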
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, Lambda):
    """computes the cost and gradient of the neural network. The
  parameters for the neural network are "unrolled" into the vector
  nn_params and need to be converted back into the weight matrices.

  The returned parameter grad should be an "unrolled" vector of the
  partial derivatives of the neural network.
    """

    # Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
    # for our 2 layer neural network
    # Obtain Theta1 and Theta2 back from nn_params
    Theta1 = np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)],
                        (hidden_layer_size, input_layer_size + 1),
                        order='F').copy()

    Theta2 = np.reshape(nn_params[hidden_layer_size * (input_layer_size + 1):],
                        (num_labels, (hidden_layer_size + 1)),
                        order='F').copy()

    # Setup some useful variables
    m, _ = X.shape

    #add bias layer to X
    X = np.concatenate((np.ones((m, 1)), X), axis=1)

    # ====================== YOUR CODE HERE ======================
    # Instructions: You should complete the code by working through the
    #               following parts.
    #
    # Part 1: Feedforward the neural network and return the cost in the
    #         variable J. After implementing Part 1, you can verify that your
    #         cost function computation is correct by verifying the cost
    #         computed in ex4.m
    #
    # compute the unregularized cost by feeding forward through the network

    # layer 1 -> layer 2
    z2 = X.dot(Theta1.T)
    a2 = sigmoid(z2)

    # add the bias unit (exactly 1) to the a(2) matrix
    a2 = np.column_stack((np.ones((m, 1)), a2))

    # layer 2 -> layer 3
    z3 = a2.dot(Theta2.T)
    a3 = sigmoid(z3)

    #need to get the y values in a matrix of mx10 0s and 1s
    #create empty array of zeros
    y_vec = np.zeros((m, num_labels))
    #get 1 value for the value
    for row in range(m):
        y_vec[row, y[row] - 1] = 1

    J = (-1 / m) * np.sum((y_vec * np.log(a3)) + (1 - y_vec) *
                          (np.log(1 - a3)))

    J = J + Lambda / (2 * m) * (np.sum(Theta1[:, 1:]**2) +
                                np.sum(Theta2[:, 1:]**2))

    # Part 2: Implement the backpropagation algorithm to compute the gradients
    #         Theta1_grad and Theta2_grad. You should return the partial derivatives of
    #         the cost function with respect to Theta1 and Theta2 in Theta1_grad and
    #         Theta2_grad, respectively. After implementing Part 2, you can check
    #         that your implementation is correct by running checkNNGradients
    #
    #         Note: The vector y passed into the function is a vector of labels
    #               containing values from 1..K. You need to map this vector into a
    #               binary vector of 1's and 0's to be used with the neural network
    #               cost function.
    #
    #         Hint: We recommend implementing backpropagation using a for-loop
    #               over the training examples if you are implementing it for the
    #               first time.
    #

    del3 = a3 - y_vec  # m x num_labels (5000 x 10)
    # back-propagate through Theta2, dropping its bias column, and scale by
    # the sigmoid gradient of the hidden-layer inputs
    del2 = del3.dot(Theta2)[:, 1:] * sigmoidGradient(z2)  # m x s2 (5000 x 25)

    Theta2_grad = (1 / m) * del3.T.dot(a2)
    Theta1_grad = (1 / m) * del2.T.dot(X)

    # Part 3: Implement regularization with the cost function and gradients.
    #
    #         Hint: You can implement this around the code for
    #               backpropagation. That is, you can compute the gradients for
    #               the regularization separately and then add them to Theta1_grad
    #               and Theta2_grad from Part 2.
    #

    # -------------------------------------------------------------

    Theta1_grad[:, 1:] += (Lambda / m) * Theta1[:, 1:]
    Theta2_grad[:, 1:] += (Lambda / m) * Theta2[:, 1:]

    grad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    # =========================================================================

    return J, grad
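# --- Added gradient-check sketch, in the spirit of the checkNNGradients step
# mentioned in the comments above: compare the backpropagation gradient of
# the nnCostFunction just defined against central differences on a tiny
# network. Layer sizes, Lambda and the random data are assumptions; sigmoid()
# and sigmoidGradient() are assumed to be defined as earlier in this
# collection.
import numpy as np

il, hl, nl, m_chk = 3, 5, 3, 6
rng = np.random.default_rng(3)
params = rng.normal(scale=0.1, size=hl * (il + 1) + nl * (hl + 1))
X_chk = rng.normal(size=(m_chk, il))
y_chk = rng.integers(1, nl + 1, size=m_chk)
Lambda_chk = 1.0

_, grad_bp = nnCostFunction(params, il, hl, nl, X_chk, y_chk, Lambda_chk)

eps = 1e-5
grad_num = np.zeros_like(params)
for i in range(params.size):
    step = np.zeros_like(params)
    step[i] = eps
    J_plus, _ = nnCostFunction(params + step, il, hl, nl, X_chk, y_chk, Lambda_chk)
    J_minus, _ = nnCostFunction(params - step, il, hl, nl, X_chk, y_chk, Lambda_chk)
    grad_num[i] = (J_plus - J_minus) / (2 * eps)

assert np.allclose(grad_bp, grad_num, atol=1e-7)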
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, Lambda):
    """computes the cost and gradient of the neural network. The
  parameters for the neural network are "unrolled" into the vector
  nn_params and need to be converted back into the weight matrices.

  The returned parameter grad should be an "unrolled" vector of the
  partial derivatives of the neural network.
    """

    # Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
    # for our 2 layer neural network
    # Obtain Theta1 and Theta2 back from nn_params
    Theta1 = np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)],
                        (hidden_layer_size, input_layer_size + 1),
                        order='F').copy()

    Theta2 = np.reshape(nn_params[hidden_layer_size * (input_layer_size + 1):],
                        (num_labels, (hidden_layer_size + 1)),
                        order='F').copy()

    # Setup some useful variables
    m, _ = X.shape

    # ====================== YOUR CODE HERE ======================
    # Instructions: You should complete the code by working through the
    #               following parts.
    #
    # Part 1: Feedforward the neural network and return the cost in the
    #         variable J. After implementing Part 1, you can verify that your
    #         cost function computation is correct by verifying the cost
    #         computed in ex4.m
    #
    # Part 2: Implement the backpropagation algorithm to compute the gradients
    #         Theta1_grad and Theta2_grad. You should return the partial derivatives of
    #         the cost function with respect to Theta1 and Theta2 in Theta1_grad and
    #         Theta2_grad, respectively. After implementing Part 2, you can check
    #         that your implementation is correct by running checkNNGradients
    #
    #         Note: The vector y passed into the function is a vector of labels
    #               containing values from 1..K. You need to map this vector into a
    #               binary vector of 1's and 0's to be used with the neural network
    #               cost function.
    #
    #         Hint: We recommend implementing backpropagation using a for-loop
    #               over the training examples if you are implementing it for the
    #               first time.
    #
    # Part 3: Implement regularization with the cost function and gradients.
    #
    #         Hint: You can implement this around the code for
    #               backpropagation. That is, you can compute the gradients for
    #               the regularization separately and then add them to Theta1_grad
    #               and Theta2_grad from Part 2.
    #
    X = np.column_stack((np.ones((m, 1)), X))
    z2 = np.dot(X, Theta1.T)
    a2 = sigmoid(z2)
    a2 = np.column_stack((np.ones((m, 1)), a2))
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)
    #htheta = np.argmax(a3, axis=1)

    yVec = np.zeros(m * num_labels).reshape(m, num_labels)

    for i in range(0, num_labels):
        y_tem = np.where(y == i + 1, 1, 0)
        yVec[:, i] = y_tem

    ureg_J = (-yVec) * (np.log(a3)) - (1 - yVec) * (np.log(1 - a3))
    ureg_J = (1. / m) * np.sum(ureg_J)

    ureg_T1 = Theta1[:, 1:]
    ureg_T2 = Theta2[:, 1:]
    J = ureg_J + (Lambda / (2. * m)) * (np.sum(np.power(ureg_T1, 2)) +
                                        np.sum(np.power(ureg_T2, 2)))

    DELTA2 = np.zeros((hidden_layer_size, input_layer_size + 1))
    DELTA3 = np.zeros((num_labels, hidden_layer_size + 1))
    for i in range(m):
        a1_s = X[i, :]
        z2_s = np.dot(a1_s, Theta1.T)
        a2_s = sigmoid(z2_s)
        a2_s = np.insert(a2_s, 0, 1)
        z3_s = np.dot(a2_s, Theta2.T)
        a3_s = sigmoid(z3_s)
        delta3 = a3_s - yVec[i, :]
        delta2 = np.dot(delta3, ureg_T2) * sigmoidGradient(z2_s)
        delta2 = delta2.reshape(hidden_layer_size, 1)
        delta3 = delta3.reshape(num_labels, 1)
        DELTA2 = DELTA2 + np.dot(delta2.reshape(hidden_layer_size, 1),
                                 a1_s.reshape(input_layer_size + 1, 1).T)
        DELTA3 = DELTA3 + np.dot(delta3.reshape(num_labels, 1),
                                 a2_s.reshape(hidden_layer_size + 1, 1).T)

    Theta1_grad = (1. / m) * DELTA2
    Theta2_grad = (1. / m) * DELTA3
    #a = Theta1_grad[:,1:]
    Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (1.0 * Lambda / m) * ureg_T1
    #b = Theta1_grad[:,1:]
    Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (1.0 * Lambda / m) * ureg_T2
    # -------------------------------------------------------------

    # =========================================================================

    # Unroll gradient
    grad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    return J, grad
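# --- Added note: the per-example accumulation in the loop above
# (DELTA += outer(delta, activation)) is equivalent to a single matrix
# product over the whole batch, which is what the vectorized variants in this
# collection use. A small check with arbitrary shapes:
import numpy as np

rng = np.random.default_rng(4)
D = rng.normal(size=(6, 3))    # one row of deltas per example
A = rng.normal(size=(6, 5))    # one row of activations per example
acc = np.zeros((3, 5))
for i in range(6):
    acc += np.outer(D[i], A[i])
assert np.allclose(acc, D.T.dot(A))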
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels,
                   X, y, lambda_):
    """ computes the cost and gradient of the neural network. The
        parameters for the neural network are "unrolled" into the vector
        nn_params and need to be converted back into the weight matrices.

        The returned parameter grad should be an "unrolled" vector of the
        partial derivatives of the neural network.
    """

    # Reshape nn_params back into the parameters Theta1 and Theta2,
    # the weight matrices for our 2 layer neural network
    # Obtain Theta1 and Theta2 back from nn_params
    Theta1 = nn_params[:hidden_layer_size * (input_layer_size + 1)].reshape(
        hidden_layer_size, input_layer_size + 1, order='F')  # (25, 401)
    Theta2 = nn_params[hidden_layer_size * (input_layer_size + 1):].reshape(
        num_labels, hidden_layer_size + 1, order='F')  # (10, 26)

    # Setup some useful variables
    m = len(X)
    y = pd.get_dummies(y).to_numpy().astype(float)

    # ====================== YOUR CODE HERE ======================
    # Instructions: You should complete the code by working through the
    #               following parts.
    #
    # Part 1: Feedforward the neural network and return the cost in the
    #         variable J. After implementing Part 1, you can verify that your
    #         cost function computation is correct by verifying the cost
    #         computed in ex4.m
    #
    # Part 2: Implement the backpropagation algorithm to compute the gradients
    #         Theta1_grad and Theta2_grad. You should return the partial
    #         derivatives of the cost function with respect to Theta1 and
    #         Theta2 in Theta1_grad and Theta2_grad, respectively.
    #         After implementing Part 2, you can check that
    #         your implementation is correct by running checkNNGradients
    #
    #         Note: The vector y passed into the function is a vector of labels
    #               containing values from 1..K. You need to map this vector
    #               into a binary vector of 1's and 0's to be used with
    #               the neural network cost function.
    #
    #         Hint: We recommend implementing backpropagation using a for-loop
    #               over the training examples if you are implementing it
    #               for the first time.
    #
    # Part 3: Implement regularization with the cost function and gradients.
    #
    #         Hint: You can implement this around the code for backpropagation.
    #               That is, you can compute the gradients
    #               for the regularization separately and then add them
    #               to Theta1_grad and Theta2_grad from Part 2.
    #

    # Feedforward the neural network...
    a1 = np.c_[np.ones(m), X]  # (5000, 401)

    z2 = a1 @ Theta1.T  # (5000, 401) @ (401, 25) = (5000, 25)
    a2 = np.c_[np.ones(len(z2)), sigmoid(z2)]  # (5000, 26)

    z3 = a2 @ Theta2.T  # (5000, 26) @ (26, 10) = (5000, 10)
    a3 = sigmoid(z3)  # (5000, 10)

    # Computing cost...
    J = -np.mean(np.sum(y * np.log(a3) + (1 - y) * np.log(1 - a3), axis=1))

    # Computing regularized cost...
    J += lambda_ * (sum(np.sum(np.square(Theta1[:, 1:]), axis=1)) +
                    sum(np.sum(np.square(Theta2[:, 1:]), axis=1))) / (2 * m)

    # Computing δ(del) and ∆(delta)...
    del3 = a3 - y  # (5000, 10)
    delta2 = del3.T @ a2  # (10, 26)

    del2 = del3 @ Theta2 * sigmoidGradient(np.c_[np.ones(len(z2)), z2])
    delta1 = del2[:, 1:].T @ a1  # (25, 401)

    # Computing gradient...
    Theta1_grad = delta1 / m
    Theta2_grad = delta2 / m

    # Computing regularized gradient...
    Theta1_grad += lambda_ * np.c_[np.zeros(len(Theta1)), Theta1[:, 1:]] / m
    Theta2_grad += lambda_ * np.c_[np.zeros(len(Theta2)), Theta2[:, 1:]] / m
    # -------------------------------------------------------------

    # =========================================================================

    # Unroll gradient
    grad = np.r_[Theta1_grad.flatten(order='F'),
                 Theta2_grad.flatten(order='F')]

    return J, grad
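# --- Added note: pd.get_dummies(y) used above one-hot encodes the labels;
# for integer labels 1..K with every class present, column k corresponds to
# label k + 1, matching the manual encodings in the other snippets. A small
# equivalence check (pandas assumed available as pd):
import numpy as np
import pandas as pd

y_small = np.array([1, 3, 2, 3])
one_hot_pd = pd.get_dummies(y_small).to_numpy().astype(float)
one_hot_eye = np.eye(3)[y_small - 1]
assert np.array_equal(one_hot_pd, one_hot_eye)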
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda):

    """computes the cost and gradient of the neural network. The
  parameters for the neural network are "unrolled" into the vector
  nn_params and need to be converted back into the weight matrices.

  The returned parameter grad should be an "unrolled" vector of the
  partial derivatives of the neural network.
    """
    input_layer_size = int(input_layer_size)
    hidden_layer_size = int(hidden_layer_size)
    num_labels = int(num_labels)
# Reshape nn_params back into the parameters Theta1 and Theta2, the weight matrices
# for our 2 layer neural network
# Obtain Theta1 and Theta2 back from nn_params
    Theta1 = np.reshape(nn_params[:hidden_layer_size * (input_layer_size + 1)],
                       (hidden_layer_size, input_layer_size + 1), order='F').copy()

    Theta2 = np.reshape(nn_params[hidden_layer_size * (input_layer_size + 1):],
                       (num_labels, (hidden_layer_size + 1)), order='F').copy()



# Setup some useful variables
    m, _ = X.shape


# ====================== YOUR CODE HERE ======================
# Instructions: You should complete the code by working through the
#               following parts.
#
# Part 1: Feedforward the neural network and return the cost in the
#         variable J. After implementing Part 1, you can verify that your
#         cost function computation is correct by verifying the cost
#         computed in ex4.m
    a1 = np.column_stack((np.ones((m, 1)), X))
    z2 = np.dot(a1, Theta1.T)
    a2 = np.column_stack((np.ones((m, 1)), sigmoid(z2)))
    z3 = np.dot(a2, Theta2.T)
    a3 = sigmoid(z3)

    # flatten the hypothesis and build the matching one-hot labels,
    # both in column-major (Fortran) order
    nn_hx = a3.ravel(order='F')
    nn_y = np.zeros(0)
    for k in range(num_labels):
        nn_y = np.append(nn_y, np.asarray([1 if i == (k + 1) else 0 for i in y]))
    MatY = np.reshape(nn_y, (num_labels, m)).T

    first = -np.dot(nn_y, np.log(nn_hx))
    second = -np.dot((1 - nn_y), np.log(1 - nn_hx))
    reg = (Theta1[:, 1:]**2).sum() + (Theta2[:, 1:]**2).sum()
    J = (first + second) / m + reg * Lambda / (2. * m)

#
# Part 2: Implement the backpropagation algorithm to compute the gradients
#         Theta1_grad and Theta2_grad. You should return the partial derivatives of
#         the cost function with respect to Theta1 and Theta2 in Theta1_grad and
#         Theta2_grad, respectively. After implementing Part 2, you can check
#         that your implementation is correct by running checkNNGradients
#
#         Note: The vector y passed into the function is a vector of labels
#               containing values from 1..K. You need to map this vector into a 
#               binary vector of 1's and 0's to be used with the neural network
#               cost function.
#
#         Hint: We recommend implementing backpropagation using a for-loop
#               over the training examples if you are implementing it for the 
#               first time.
#

    delta3 = a3 - MatY

    delta2 = np.dot(delta3, Theta2)[:, 1:] * sigmoidGradient(z2)
    # zero out the bias columns so the regularization terms below skip them
    Theta1[:, 0] = np.zeros(Theta1.shape[0])
    Theta2[:, 0] = np.zeros(Theta2.shape[0])
    Theta1_grad = np.dot(delta2.T, a1) / m + Lambda * Theta1 / m
    Theta2_grad = np.dot(delta3.T, a2) / m + Lambda * Theta2 / m
# Part 3: Implement regularization with the cost function and gradients.
#
#         Hint: You can implement this around the code for
#               backpropagation. That is, you can compute the gradients for
#               the regularization separately and then add them to Theta1_grad
#               and Theta2_grad from Part 2.
#



    # -------------------------------------------------------------

    # =========================================================================
    
    #Unroll gradient
    Ngrad = np.hstack((Theta1_grad.T.ravel(), Theta2_grad.T.ravel()))

    return J, Ngrad
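# --- Added training sketch (an assumption, not from the original source):
# minimising the nnCostFunction above with SciPy on a tiny random problem,
# passing jac=True so the returned (J, grad) tuple is used directly. Layer
# sizes, Lambda, iteration count and the random data are placeholders; real
# use would substitute the actual dataset and ex4-style layer sizes.
import numpy as np
from scipy.optimize import minimize

il, hl, nl, m_trn = 4, 6, 3, 20
rng = np.random.default_rng(5)
X_trn = rng.normal(size=(m_trn, il))
y_trn = rng.integers(1, nl + 1, size=m_trn)
init_params = rng.uniform(-0.12, 0.12, size=hl * (il + 1) + nl * (hl + 1))

res = minimize(nnCostFunction, init_params,
               args=(il, hl, nl, X_trn, y_trn, 1.0),
               method='L-BFGS-B', jac=True, options={'maxiter': 50})
print('final regularized cost: %.4f' % res.fun)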