def costFunction(Theta, X, Y, lam):
    """Returns cost of Theta using logistic regression"""

    m = len(X)
    n = len(X[0])
    k = len(Y[0])
    k_h = (n + k) // 2      #average of features and categories
    Theta1 = np.reshape(Theta[0:(n+1)*k_h], (n+1, k_h))
    Theta2 = np.reshape(Theta[(n+1)*k_h:], (k_h+1, k))

    one = np.ones(m)
    one = np.reshape(one, (m, 1))
    a1 = np.concatenate((one, X), axis=1)
   
    #compute inputs to hidden layer
    a2 = sigmoid(np.dot(a1, Theta1))
    a2 = np.concatenate((one, a2), axis=1)

    #compute output layer
    a3 = sigmoid(np.dot(a2, Theta2))

    #compute cost
    J = -(1.0/m) * (np.multiply(Y, np.log(a3)) + \
        np.multiply(1.0 - Y, np.log(1.0 - a3)))
    J = J.sum()

    #compute regularization term (bias weights in the first row are not penalized)
    reg_term = (Theta1[1:, :] ** 2).sum() + (Theta2[1:, :] ** 2).sum()
    J = J + (lam / (2.0 * m)) * reg_term
    print('cost =', J)
    return J
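All of the snippets on this page call a sigmoid helper that is defined elsewhere (often in a separate sigmoid.py module or pulled in with a star import); a minimal sketch that works elementwise on scalars, arrays, and matrices:

import numpy as np

def sigmoid(z):
    """Elementwise logistic function 1 / (1 + exp(-z))."""
    return 1.0 / (1.0 + np.exp(-z))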
def lrCostFunction(theta, X, y, Lambda):
    """computes the cost of using
    theta as the parameter for regularized logistic regression and the
    gradient of the cost w.r.t. to the parameters.
    """

# ====================== YOUR CODE HERE ======================
# Instructions: Compute the cost of a particular choice of theta.
#               You should set J to the cost.
#
# Hint: The computation of the cost function and gradients can be
#       efficiently vectorized. For example, consider the computation
#
#           sigmoid(X * theta)
#
#       Each row of the resulting matrix will contain the value of the
#       prediction for that example. You can make use of this to vectorize
#       the cost function and gradient computations. 
#

    # =============================================================
    m = y.size
    
    h = sigmoid(dot(X, theta))
    J = (-dot(y, log(h)) - dot(1 - y, log(1 - h))) / m \
        + (Lambda / (2 * m)) * (sum(theta ** 2) - theta[0] ** 2)
    #theta0=np.copy(theta)
    #np.put(theta0,0,0)
    #J=dot((sigmoid(dot(X,theta))-y),X)/m+ (Lambda/m)*theta0
    #J= sum(-y*log(sigmoid(dot(X,theta.T)))-(1-y)*log(1-sigmoid(dot(X,theta.T))))/m
    return J
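A cost-only function like this can be handed straight to a SciPy optimizer, which estimates the gradient numerically. A sketch on synthetic data (the data and regularization strength are made up, and the bare dot, log and sigmoid names are assumed to resolve in the snippet's namespace, e.g. via from numpy import dot, log plus the sigmoid helper above):

import numpy as np
from scipy.optimize import minimize

np.random.seed(0)
m = 100
X = np.hstack((np.ones((m, 1)), np.random.randn(m, 2)))     # bias column + 2 features
y = (X[:, 1] - X[:, 2] + 0.3 * np.random.randn(m) > 0).astype(float)

res = minimize(lrCostFunction, np.zeros(3), args=(X, y, 1.0), method='BFGS')
print('learned theta:', res.x)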
Example #3
def predict(theta,board) :
    """
    theta - unrolled Neural Network weights
    board - n*n matrix representing board
    Returns:
        h - n*1 column vector - confidence level for performing next move
    """
    n = size(board,1)

    #neural network parameters
    input_units = n*n
    hidden_units = n*n
    output_units = n*n

    #theta1 - unrolled weights between input and hidden layer
    #theta2 - unrolled weights between hidden and output layer
    theta1 = theta[:,:hidden_units*(input_units+1)]
    theta2 = theta[:,hidden_units*(input_units+1):]

    #reshaping to obtain rolled weights
    theta1 = np.reshape(theta1,(hidden_units,input_units+1))
    theta2 = np.reshape(theta2,(output_units,hidden_units+1))

    #calculating confidence level given board
    #position and neural network weights
    X = board.flatten().T
    X = concatenate((mat(1),X))
    z2 = theta1*X
    a2 = sigmoid(z2)
    a2 = concatenate((mat(1),a2))
    z3 = theta2*a2
    h = sigmoid(z3)
    return h
Example #4
def lrCostFunction(theta, X, y, lmbda):
    # Initialize some useful values
    m = y.shape[0]  # number of training examples

    # You need to return the following variables correctly
    J = 0
    grad = np.zeros(theta.shape)

    # ====================== YOUR CODE HERE ======================

    def h(X, theta):
        return X.dot(theta)

    J = float(-y.T * np.nan_to_num(np.log(sigmoid(h(X, theta))).T) -
              (1 - y).T * np.nan_to_num(np.log(1 - sigmoid(h(X, theta))).T)) / m
    reg_cost = theta.copy()
    reg_cost[0] = 0
    J += (lmbda * reg_cost.T.dot(reg_cost)) / (2 * m)

    grad = np.asarray((sigmoid(h(X, theta)) - y.T).dot(X) / m)[0]
    reg_grad = theta * (float(lmbda) / m)
    reg_grad[0] = 0
    grad += reg_grad

    # =============================================================

    return (J, grad)
def cost_function(cost_function_parameters):
    """Cost function"""
    theta = cost_function_parameters['theta']
    input_layer_size = cost_function_parameters['input_layer_size']
    hidden_layer_size = cost_function_parameters['hidden_layer_size']
    num_labels = cost_function_parameters['number_of_labels']
    x_values = cost_function_parameters['x_values']
    y_values = cost_function_parameters['y_values']
    lambda_value = cost_function_parameters['lambda_value']

    theta_1_parameters = theta[0: (hidden_layer_size * (input_layer_size + 1))]
    theta_2_parameters = theta[(hidden_layer_size * (input_layer_size + 1)):]

    theta_1 = theta_1_parameters.reshape(hidden_layer_size, input_layer_size + 1)
    theta_2 = theta_2_parameters.reshape(num_labels, (hidden_layer_size + 1))

    input_examples_size = x_values.shape[0]

    hidden_layer_input = numpy.c_[numpy.ones(input_examples_size), x_values].dot(theta_1.T)
    hidden_layer_output = sigmoid(hidden_layer_input)

    output_layer_input = numpy.c_[numpy.ones(hidden_layer_output.shape[0]), hidden_layer_output].dot(theta_2.T)
    output = sigmoid(output_layer_input)

    first_part_of_cost = -((y_values) * numpy.log(output))
    second_part_of_cost = ((1.0 - y_values) * numpy.log(1.0-output))

    combined_thetas = numpy.append(theta_1[:, 1:].flatten(), theta_2[:, 1:].flatten())
    regularization_term = (lambda_value/(2.0 * input_examples_size)) * numpy.sum(numpy.power(combined_thetas, 2))

    j = ((1.0/input_examples_size) * numpy.sum(numpy.sum(first_part_of_cost - second_part_of_cost))) + regularization_term
    return j
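A quick smoke test of this dict-based interface on tiny random data (the layer sizes, lambda, and labels below are arbitrary, and numpy / sigmoid are assumed to be imported as in the snippet):

import numpy

numpy.random.seed(1)
input_layer_size, hidden_layer_size, num_labels = 3, 5, 2
n_params = hidden_layer_size * (input_layer_size + 1) + num_labels * (hidden_layer_size + 1)
theta = 0.1 * numpy.random.randn(n_params)

x_values = numpy.random.randn(10, input_layer_size)
y_values = numpy.eye(num_labels)[numpy.random.randint(0, num_labels, 10)]   # one-hot labels

cost_function_parameters = {
    'theta': theta,
    'input_layer_size': input_layer_size,
    'hidden_layer_size': hidden_layer_size,
    'number_of_labels': num_labels,
    'x_values': x_values,
    'y_values': y_values,
    'lambda_value': 1.0,
}
print('cost =', cost_function(cost_function_parameters))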
def charTrain():
    X = np.matrix('0,0,1,0; 0,1,0,0; 0,0,0,1; 0,0,0,1; 1,0,0,0')  # encoding for hello
    numIn, numHid, numOut = 4, 10, 4
    numInTot = numIn + numHid + 1
    theta1 = np.matrix(1 * np.sqrt(6 / (numIn + numHid)) * np.random.randn(numInTot, numHid)) 
    theta2 = np.matrix(1 * np.sqrt(6 / (numOut + numHid)) * np.random.randn(numHid + 1, numOut)) 
    theta1_grad = np.zeros((numInTot, numHid))
    theta2_grad = np.zeros((numHid + 1, numOut))
    hid_last = np.zeros((numHid, 1))
    m = X.shape[0]
    alpha = 0.05
    for ita in range(5000):
        for j in range(m-1): #for every training element except the last one, whose successor is unknown
            y = X[j+1, :]  # given the input char, the next char is expected
            # forward
            context = hid_last
            x_context = np.concatenate((X[j, :], context.T), axis=1)
            a1 = np.matrix(np.concatenate((x_context, np.matrix('[1]')), axis=1)).T
            z2 = theta1.T * a1
            a2 = np.concatenate((sigmoid(z2), np.matrix('[1]')))
            hid_last = a2[0:-1, 0]
            z3 = theta2.T * a2
            a3 = sigmoid(z3)
            # backward propagation
            d3 = np.multiply(z3.T, (a3.T - y))   # 1*4, d(loss)/d(z) = z * (a3 - y)
            theta2 = theta2 - alpha * a2 * d3  # 11*1 * 1*4 => 11*4a  d(loss)/d(theta2) = d(loss)/d(z3) * d(z3)/d(theta2)
            d2 = np.multiply((theta2 * d3.T), np.multiply(a2, (1 - a2)))  # (11*4 * 4*1) multiply ( 11*1 multiply 11*1) => 11*1
            theta1 = theta1 - alpha * a1 * d2[0:numHid,:].T  # 15*1 * 1*10 => 15*10
    return theta1, theta2, numHid, numOut
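A usage sketch for the trained weights, assuming the runForward(X, theta1, theta2, numHid, numOut) variant that appears a few examples further down (the decoding step is purely illustrative):

import numpy as np

theta1, theta2, numHid, numOut = charTrain()
X = np.matrix('0,0,1,0; 0,1,0,0; 0,0,0,1; 0,0,0,1; 1,0,0,0')   # same "hello" encoding
probs = runForward(X, theta1, theta2, numHid, numOut)
print(np.argmax(probs, axis=1))   # predicted index of the next character at each step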
def costFunction(theta, X, y, return_grad=False):
#COSTFUNCTION Compute cost and gradient for logistic regression
#   J = COSTFUNCTION(theta, X, y) computes the cost of using theta as the
#   parameter for logistic regression and the gradient of the cost
#   w.r.t. to the parameters.

    import numpy as np 
    from sigmoid import sigmoid

    # Initialize some useful values
    m = len(y) # number of training examples

    # You need to return the following variables correctly 
    J = 0
    grad = np.zeros(theta.shape)

    # ====================== YOUR CODE HERE ======================
    # Instructions: Compute the cost of a particular choice of theta.
    #               You should set J to the cost.
    #               Compute the partial derivatives and set grad to the partial
    #               derivatives of the cost w.r.t. each parameter in theta
    #
    # Note: grad should have the same dimensions as theta
    #

    # given the following dimensions:
    # theta.shape = (n+1,1)
    # X.shape     = (m,n+1)
    # the equation's 
    #	theta' times X
    # becomes
    # 	np.dot(X,theta)
    # to obtain a (m,1) vector
    # given that
    #   y.shape     = (m,)
    # we transpose the (m,1) shaped 
    #   np.log( sigmoid( np.dot(X,theta) ) )        , as well as
    #   np.log( 1 - sigmoid( np.dot(X,theta) ) )
    # to obtain (1,m) vectors to be mutually added, 
    # and whose elements are summed to form a scalar 
    one = y * np.transpose(np.log( sigmoid( np.dot(X,theta) ) ))
    two = (1-y) * np.transpose(np.log( 1 - sigmoid( np.dot(X,theta) ) ))
    J = -(1./m)*(one+two).sum()

    # here we need n+1 gradients. 
    # note that 
    #   y.shape                          = (m,)
    #   sigmoid( np.dot(X,theta) ).shape = (m, 1)
    # so we transpose the latter, subtract y, obtaining a vector of (1, m)
    # we multiply such vector by X, whose dimension is 
    #   X.shape = (m, n+1), 
    # and we obtain a (1, n+1) vector, which we also transpose
    # this last vectorized multiplication takes care of the sum
    grad = (1./m) * np.dot(sigmoid( np.dot(X,theta) ).T - y, X).T

    if return_grad:
        return J, np.transpose(grad)
    return J # for use in fmin/fmin_bfgs optimization function
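As the comment on the cost-only return suggests, this version slots into SciPy's fmin/fmin_bfgs; a sketch on made-up data (the sigmoid module imported inside the function is assumed to be importable):

import numpy as np
from scipy.optimize import fmin_bfgs

np.random.seed(0)
m = 80
X = np.hstack((np.ones((m, 1)), np.random.randn(m, 2)))    # bias column + 2 features
y = (X[:, 1] + X[:, 2] + 0.3 * np.random.randn(m) > 0).astype(float)

theta_opt = fmin_bfgs(costFunction, np.zeros(3), args=(X, y), disp=False)
print('theta:', theta_opt)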
def sigmoidGradient(z):
    """returns the gradient of the sigmoid function evaluated at z
       g = SIGMOIDGRADIENT(z) computes the gradient of the sigmoid function
       evaluated at z. This should work regardless if z is a matrix or a
       vector. In particular, if z is a vector or matrix, you should return
       the gradient for each element."""
    
    return sigmoid(z) * (1-sigmoid(z))
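A quick finite-difference check of this gradient (central differences; the probe points and step size are arbitrary, and sigmoid is the helper sketched near the top of the page):

import numpy as np

z = np.array([-2.0, -0.5, 0.0, 1.0, 3.0])
eps = 1e-6
numeric = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps)
print(np.allclose(sigmoidGradient(z), numeric))   # expect True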
def sigmoidGradient(z):

    # SIGMOIDGRADIENT returns the gradient of the sigmoid function evaluated at z


    g = zeros(z.shape)
    g = sigmoid(z) * (1.0 - sigmoid(z))
    
    return g
Example #10
def compute_cost(theta,X,y): #computes cost given predicted and actual values
	m = X.shape[0] #number of training examples
	theta = np.reshape(theta,(len(theta),1))
 
	#y = reshape(y,(len(y),1))
	J = (1./m) * (-np.transpose(y).dot(np.log(sg.sigmoid(X.dot(theta)))) - np.transpose(1-y).dot(np.log(1-sg.sigmoid(X.dot(theta)))))
	grad = np.transpose((1./m)*np.transpose(sg.sigmoid(X.dot(theta)) - y).dot(X))
	#optimize.fmin expects a single value, so cannot return grad
	return J.mean()#,grad
Example #11
def sigmoidGradient(z):

    import sigmoid as sg
    import numpy as np
    g = np.zeros(np.size(z))

    g = sg.sigmoid(z) * (1 - sg.sigmoid(z))

    return g
Example #12
def costFunctionReg(theta, X, y, lam):
	dim = X.shape
	m = dim[0]
	theta = theta.reshape(theta.shape[0], 1)
	h = sigmoid.sigmoid(numpy.dot(X, theta))
	J = -(1.0/m) * ( numpy.dot(numpy.transpose(y), utils.multimap(math.log, h)) \
	+ numpy.dot(numpy.transpose(1-y), utils.multimap(math.log, 1 - h)) ) \
	+ (lam/(2.0*m))*numpy.dot(numpy.transpose(theta[1:, :]),theta[1:,:])

	return float(J[0])
def predict(X, Theta1, Theta2):
    m, n = X.shape 

    a1 = X
    z2 = a1.dot(Theta1.T)
    a2 = sigmoid(z2)
    a2 = np.concatenate((np.ones((m, 1)), a2), axis=1)
    z3 = a2.dot(Theta2.T)
    a3 = sigmoid(z3)
    return np.argmax(a3, axis= 1)[np.newaxis]
Example #14
def predict(nn_params, layers, X, y, lam, display, path):
	m = X.shape[0]
	Theta = reshapeThetas(nn_params, layers)
	l = len(layers)

	A = []
	A_ones = []
	A_sig = []
	Z = []
	J = 0

	for i in range(0,l):
		A.append(0)
		A_ones.append(0)
		A_sig.append(0)
		Z.append(0)

	A_ones[0] = ones((m,1))+0.0
	A[0] = concatenate((A_ones[0],X),1) 
	Z[1] = dot(A[0],Theta[0].conj().T)


	for i in range(1,l-1):
		A_ones[i] = ones((Z[i].shape[0],1))+0.0
		A_sig[i] = sigmoid(Z[i])
		A[i] = concatenate((A_ones[i], A_sig[i]),1)
		Z[i+1] = dot(A[i],Theta[i].conj().T)

	A[-1] = sigmoid(Z[-1])

	predictions = A[-1].argmax(axis=1)+0.0

	# cost calculation
	if not isinstance(y, int): # if there are associated y values, calculate test results
		for i in range(0,layers[-1]):
			J_curr =  (1.0/m)*sum(-1*((y==i)*log(A[-1][:,i])) - (1-(y==i)) * log(1-A[-1][:,i]))
			J += J_curr

		if display == 1: # if the results should be displayed, do so
			print(mean(predictions == y)*100, '%')
			print("Cost:", J)

		return (J, mean(predictions == y))

	else: # if there are no y values, save predictions to file
		pfile = open(path + '/predictions.txt','w')
		for i in predictions:
			pfile.write(str(int(i))+'\n')
		pfile.close()
		ffile =  open(path + '/feature predict.txt','w')
		A_c = A[-1]
		A_c.tolist()
		for i in range(0,len(A_c)):
			ffile.write(','.join(str(elem) for elem in A_c[i])+'\n')
		ffile.close()
Example #15
def sigmoidGradient(z):

    # sigmoidGradient returns the gradient of the sigmoid function evaluated at z

    g = zeros(z.shape)
    # =========================== DONE ==================================
    # Instructions: Compute the gradient of the sigmoid function evaluated at
    #               each value of z.
    g += sigmoid(z) * (1 - sigmoid(z))
    
    return g
def predict(Theta1, Theta2, X):
    
    # Useful values
    m = X.shape[0]
    num_labels = Theta2.shape[0]

    a1 = np.vstack((np.ones(m), X.T)).T
    a2 = sigmoid(np.dot(a1, Theta1.T))
    a2 = np.vstack((np.ones(m), a2.T)).T
    a3 = sigmoid(np.dot(a2, Theta2.T))

    return np.argmax(a3, axis=1)
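Training-set accuracy follows the same pattern for every predict variant on this page; a self-contained sketch with random weights and 0-indexed labels (all sizes are arbitrary, and sigmoid is assumed to be in scope):

import numpy as np

np.random.seed(0)
m, n, hidden, num_labels = 20, 8, 5, 3
X = np.random.randn(m, n)
y = np.random.randint(0, num_labels, m)            # 0-indexed labels, purely illustrative
Theta1 = 0.1 * np.random.randn(hidden, n + 1)
Theta2 = 0.1 * np.random.randn(num_labels, hidden + 1)

pred = predict(Theta1, Theta2, X)
print('accuracy: %.1f%%' % (np.mean(pred == y) * 100))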
def costFunction(theta, X,y):
    """ computes the cost of using theta as the
    parameter for logistic regression and the
    gradient of the cost w.r.t. to the parameters."""
    from numpy import dot
# Initialize some useful values
    m = y.size # number of training examples
    first = -dot(y, log(sigmoid(dot(X,theta))))
    second = -dot((1-y), log(1-sigmoid(dot(X,theta))))
    #first = -dot(y, log(sigmoid(dot(X,theta))))
    #second = -dot((ones(m)-y), log(ones(m)-sigmoid(dot(X,theta))))
    J=(first+second)/m
    return J
Example #18
def sigmoidGradient(z):

    g = np.zeros(z.shape)

    # ====================== YOUR CODE HERE ======================
    # Instructions: Compute the gradient of the sigmoid function evaluated at
    #               each value of z (z can be a matrix, vector or scalar).

    g = np.multiply(sigmoid(z), 1 - sigmoid(z))

    # =============================================================

    return g
def sigmoidGradient(z):
    """computes the gradient of the sigmoid function
    evaluated at z. This should work regardless if z is a matrix or a
    vector. In particular, if z is a vector or matrix, you should return
    the gradient for each element."""

# ====================== YOUR CODE HERE ======================
# Instructions: Compute the gradient of the sigmoid function evaluated at
#               each value of z (z can be a matrix, vector or scalar).


# =============================================================
    g= sigmoid(z)*(1-sigmoid(z))
    return g
def lrCostFunction(theta, X, y, lambda_reg, return_grad=False):
#LRCOSTFUNCTION Compute cost and gradient for logistic regression with 
#regularization
#   J = LRCOSTFUNCTION(theta, X, y, lambda_reg) computes the cost of using
#   theta as the parameter for regularized logistic regression and the
#   gradient of the cost w.r.t. to the parameters. 

    import numpy as np
    from sigmoid import sigmoid
    import sys

    # Initialize some useful values
    m = len(y) # number of training examples

    # You need to return the following variables correctly 
    J = 0
    grad = np.zeros(theta.shape)

    # ====================== YOUR CODE HERE ======================
    # Instructions: Compute the cost of a particular choice of theta.
    #               You should set J to the cost.
    #               Compute the partial derivatives and set grad to the partial
    #               derivatives of the cost w.r.t. each parameter in theta
    #

    # taken from costFunctionReg.py
    one = y * np.transpose(np.log( sigmoid( np.dot(X,theta) ) ))
    two = (1-y) * np.transpose(np.log( 1 - sigmoid( np.dot(X,theta) ) ))
    reg = ( float(lambda_reg) / (2*m)) * np.power(theta[1:theta.shape[0]],2).sum()
    J = -(1./m)*(one+two).sum() + reg

    grad = (1./m) * np.dot(sigmoid( np.dot(X,theta) ).T - y, X).T + ( float(lambda_reg) / m )*theta

    # the case of j = 0 (recall that grad is a n+1 vector)
    grad_no_regularization = (1./m) * np.dot(sigmoid( np.dot(X,theta) ).T - y, X).T

    # and then assign only the first element of grad_no_regularization to grad
    grad[0] = grad_no_regularization[0]

    # display cost at each iteration
    sys.stdout.write("Cost: %f   \r" % (J) )
    sys.stdout.flush()

    if return_grad:
        return J, grad.flatten()
    else:
        return J

    # =============================================================
def runForward(X, theta1, theta2, numHid, numOut):
    m = X.shape[0]
    hid_last = np.zeros((numHid, 1))  # context unit last time, initialized as 0
    results = np.zeros((m, numOut))  # save output, so given 4 samples (each of which is 1*4), output 4 * 4 too
    for j in range(m):  # one sample a time
        context = hid_last
        x_context = np.concatenate((X[j,:], context.T), axis=1)  # concat( (1*4, 10*1) ) ==> 1*14
        a1 = np.matrix(np.concatenate((x_context,np.matrix('[1]')), axis=1)).T # now add bias, make it 1 * 15; then .T -> 15*1
        z2 = theta1.T * a1  # (15*10).T * 15*1 ==> 10*1
        a2 = np.concatenate((sigmoid(z2), np.matrix('[1]')))  # now add hidden layer bias, making it 11*1
        hid_last = a2[0:-1, 0]  # update hid_last
        z3 = theta2.T * a2  # (11*4).T * 11*1 ==> 4*1
        a3 = sigmoid(z3)
        results[j, :] = a3.reshape(numOut,)  # line of results is the result of the input on current step
    return results
def predict(Theta1, Theta2, X):
    """Predicts the label of an input given a trained neural network
       p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the
       trained weights of a neural network (Theta1, Theta2)"""
    
    #Useful values
    m = X.shape[0]
         
    h1 = sigmoid(np.c_[np.ones((m, 1)), X].dot(Theta1.T))
    
    h2 = sigmoid(np.c_[np.ones((m, 1)), h1].dot(Theta2.T))
    
    p = np.argmax(h2, axis=1)
    
    return p
def runForward(X, theta1, theta2, numHid):
    m = X.shape[0]
    hid_last = np.zeros((numHid, 1))  # context unit last time, initialized as 0
    results = np.zeros((m, 1))  # save output
    for j in range(m):  # one sample a time
        context = hid_last
        x_context = np.concatenate((X[j,:], context))  # concat( (1*1, 4*1) ) ==> 5*1
        a1 = np.matrix(np.concatenate((x_context,np.matrix('[1]'))))  # now add bias, make it 6*1
        z2 = theta1.T * a1  # (6*4).T * 6*1 ==> 4*1
        a2 = np.concatenate((sigmoid(z2), np.matrix('[1]')))  # now add hidden layer bias ,make it 5*1
        hid_last = a2[0:-1, 0]  # update hid_last
        z3 = theta2.T * a2  # (5*1).T * 5*1 ==> 1*1
        a3 = sigmoid(z3)
        results[j] = a3
    return results
Example #24
    def predict(self, stream):
        """Predicts the direction of movement based on the NN response"""
        input_layer_size, number_of_labels, x_value = _convert_stream_to_array(stream)
        theta1_params = self.thetas[0: (self.hidden_layer_size * (input_layer_size + 1))]
        theta2_params = self.thetas[(self.hidden_layer_size * (input_layer_size + 1)):]
        theta_1 = theta1_params.reshape(self.hidden_layer_size, input_layer_size + 1)
        theta_2 = theta2_params.reshape(number_of_labels, (self.hidden_layer_size + 1))
        first_layer_output = x_value.dot(theta_1.T)
        hidden_layer_input = sigmoid(first_layer_output)
        hidden_layer_output = c_[[1], [hidden_layer_input]].dot(theta_2.T)
        model_output = sigmoid(hidden_layer_output)

        index, value = max(enumerate(model_output[0]), key=operator.itemgetter(1))
        print(value)
        return CLASSIFICATION_LABELS[index]
Example #25
def runForward(X, theta1, theta2):
	m = X.shape[0]
	#forward propagation
	hid_last = np.zeros((numHid, 1)) #context units
	results = np.zeros((m, numOut))
	for j in range(m):#for every input element
		context = hid_last
		x_context = np.concatenate((X[j,:], context.T), axis=1)
		a1 = np.matrix(np.concatenate((x_context, np.matrix('[1]')), axis=1)).T#add bias, context units to input layer
		z2 = theta1.T * a1
		a2 = np.concatenate((sigmoid(z2), np.matrix('[1]'))) #add bias, output hidden layer
		hid_last = a2[0:-1, 0]
		z3 = theta2.T * a2
		a3 = sigmoid(z3)
		results[j, :] = a3.reshape(numOut,)
	return results
Example #26
def predict(all_theta, X):

    m, n = X.shape
    X = np.hstack((np.ones((m, 1)), X))
    prediction = np.argmax(sigmoid(np.dot(X, all_theta.T)), axis=1)

    return prediction
Example #27
 def activation(self):
     """ Applique la fonction sigmoide a tous
         les neurones de la couche """
     for it_act_neur in self.neurone_list:
         #if not (it_act_neur.in_val < 10000 and it_act_neur.in_val > -10000):
         #    print("Error in activaion, in_val : [{}], lay_type = [{}]".format(it_act_neur.in_val, self.layer_type))
         it_act_neur.in_val = sigmoid(it_act_neur.in_val)
def computeRegularizedCost(theta,X,y,lam):
    m=len(y)
    z = np.dot(theta,X)
    h = sigmoid(z)
    J = ( (lam*sum(theta[1:]*theta[1:])/2.0) + \
          sum((-y*np.log(h))-((1-y)*np.log(1-h))) )/m
    return J
def predict(theta, X):
    """PREDICT Predict whether the label is 0 or 1 using learned logistic 
    regression parameters theta
    p = PREDICT(theta, X) computes the predictions for X using a 
    threshold at 0.5 (i.e., if sigmoid(theta'*x) >= 0.5, predict 1) """ 
    
    return sigmoid(np.dot(X, theta)) >=0.5 
Example #30
def nnCostFunction(thetas, X, y, struc, lambd=1.0, bias=1):
    j = 0.0
    grad = {}
    grad_final = np.empty_like([]) 
    m,n = X.shape
    hidden = []
    t1 = 0
    t2 = 0
    
#     try:
#         my2, ny2 = y2.shape
#     except:
#         ny2 = 1
#     
#     if ny2 < 2:
#         y = np.zeros((len(y2),y2.max()+1))
#         for i in range(0,len(y2)):
#             for ii in range(0,len(y[i])):
#                 if y2[i] == ii:
#                     y[i][ii] = 1
#     else:
#         y = y2
        
    for i in range(0,len(struc)):
        m2 = struc[i][0]
        n2 = struc[i][1]
        t2 += m2 * n2
        hidden.append({'layer': i,'theta': thetas[t1:t2].reshape(n2,m2).transpose()})
        t1 = t2
    local = {'a1': X,'t': 0.0}
    c = 1
    last = ''
    if bias == 1:
        for layer in hidden:
            theta = layer['theta']
            local['Theta' + str(c)] = theta 
            local['theta' + str(c)] = theta.copy()
            local['theta' + str(c)][:,0] = 0.0
            local['t'] += (local['theta' + str(c)][:]**2).sum()
            local['a'+ str(c)] = np.hstack((np.ones((m,1)),local['a'+ str(c)]))
            c += 1
            local['z'+ str(c)] = local['a'+ str(c - 1)].dot(theta.conj().transpose())
            local['a'+ str(c)] = s.sigmoid(local['z'+ str(c)])
            last = 'a' + str(c)
            
        cost = y * np.log(local[last]) + (1 - y) * np.log(1 - local[last])
        r = (lambd / (2.0 * m)) * local['t']
        j = -(1.0 / m) * cost.sum() + r
        

        local['s' + str(c)] = local['a'+ str(c)] - y
        for i in range(1,c-1):
            local['s' + str(c-i)] = ((local['s' + str(c-i+1)]).dot(local['Theta' + str(c-i)][:,1:])) * sigg.sigmoidGradient(local['z'+ str(c-i)])
        for i in range(0,c-1):
            delta = (local['s' + str(c-i)].conj().transpose()).dot(local['a'+ str(c-(i+1))])
            r = (lambd / m) * local['theta' + str(c-(i+1))]
            grad['Theta' + str(c-(i+1))] = (1.0 / m) * delta + r
        for i in range(1,c):
            grad_final =  np.hstack((grad_final.T.ravel(), grad['Theta' + str(i)].T.ravel()))
    return (j, grad_final)
def computeGrad(theta, X, y):
    # Computes the gradient of the cost with respect to
    # the parameters.
    m = X.shape[0] # number of training examples
    grad = zeros(size(theta))

    for i in range(theta.shape[0]):
        for j in range(m):
            grad[i] += (sigmoid(dot(X[j,:],theta)) - y[j]) * X[j,i]

    grad /= m

    # =============================================================

    return grad
Example #32
def nn_cost_function(nn_params, input_layer_size, hidden_layer_size,
                     num_labels, X, y, lamb):
    Theta1 = numpy.reshape(nn_params[:hidden_layer_size *
                                     (input_layer_size + 1)],
                           (hidden_layer_size, input_layer_size + 1),
                           order="F")
    Theta2 = numpy.reshape(nn_params[hidden_layer_size *
                                     (input_layer_size + 1):],
                           (num_labels, hidden_layer_size + 1),
                           order="F")
    m = len(X)
    yvec = __formalize(y, num_labels)
    X = numpy.c_[numpy.ones((m, 1)), X]
    a2 = sigmoid(numpy.dot(X, Theta1.T))
    a2 = numpy.c_[numpy.ones((len(a2), 1)), a2]
    a3 = sigmoid(numpy.dot(a2, Theta2.T))
    first = numpy.multiply(yvec, numpy.log(a3))
    second = numpy.multiply(1 - yvec, numpy.log(1 - a3))
    cost = -numpy.sum(numpy.sum(first + second)) / m
    theta1_reg = numpy.sum(numpy.sum(numpy.power(Theta1[:, 1:], 2)))
    theta2_reg = numpy.sum(numpy.sum(numpy.power(Theta2[:, 1:], 2)))
    extra = lamb * (theta1_reg + theta2_reg) / (2 * m)

    return cost + extra
def compute_cost(thetas, X, y):
    cost = 0
    theta_T = thetas.transpose()
    m = X.shape[0]
    n = X.shape[1]

    for i in range(0, m):
        x_i = X[i:i + 1, 0:n].transpose()
        y_i = y[i, 0]
        theta_feature_product = np.dot(theta_T, x_i)[0]
        hypothesis_value = sigmoid(theta_feature_product)
        cost += (y_i * math.log(hypothesis_value)) + (
            (1 - y_i) * math.log(1 - hypothesis_value))
    cost = -(cost / m)
    return cost
Example #34
def cost_Function_Reg(X, Y, theta, lmd):
    m = X.shape[0]
    Z = np.dot(X, theta)
    g = sigmoid(Z)

    cost = -(Y.T).dot(np.log(g)) - ((1 - Y).T).dot(np.log(1 - g))
    cost = cost / m + lmd * (theta[1:].T).dot(theta[1:]) / (2 * m)
    # g.shape = [m,]
    # Y.shape = [m,1]
    # subtracting them directly would broadcast to an [m,m] array
    grad = (X.T).dot(g - Y.reshape(Y.size)) / m
    grad[0] = grad[0]
    grad[1:] = grad[1:] + (lmd * theta[1:]) / m

    return cost, grad
Example #35
def gradient(theta, X, Y, l):

    m, n = X.shape
    theta = theta.reshape((n, 1))
    grad = np.zeros((theta.shape))
    Y = Y.reshape((m, 1))
    #grad = np.zeros((theta.shape))
    h_theta = sigmoid.sigmoid(
        X @ theta)  #the hypothesis h(theta) = 1/(1 + e**(z))

    grad[0, :] = (1 / m) * (h_theta - Y).T @ X[:, 0]

    grad[1:, :] = (((1 / m) * (h_theta - Y).T @ X[:, 1:]) +
                   ((l / m) * theta[1:, :]).T).T
    return grad
Example #36
def predict(Theta1, Theta2, X):
    #PREDICT Predict the label of an input given a trained neural network
    #   p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the
    #   trained weights of a neural network (Theta1, Theta2)

    # Useful values
    m = np.shape(X)[0]  #number of examples

    # You need to return the following variables correctly
    p = np.zeros(m)

    # ====================== YOUR CODE HERE ======================
    # Instructions: Complete the following code to make predictions using
    #               your learned neural network. You should set p to a
    #               vector containing labels between 1 to num_labels.

    # add a bias to x
    X = np.hstack((np.ones((X.shape[0], 1)), X))
    # calculate the a_2
    a_2 = sigmoid(np.dot(X, np.transpose(Theta1)))
    # add a bias to a_2
    a_2 = np.hstack((np.ones((a_2.shape[0], 1)), a_2))
    # calculate the output layer
    h = sigmoid(np.dot(a_2, np.transpose(Theta2)))
    # index the maximum element of each row,
    # i.e. find the most probable label for each example
    p = np.argmax(h, axis=1)
    # argmax is 0-indexed while the labels run from 1 to num_labels (e.g. digit 10 sits at index 9)
    p = p + 1

    # Hint: The max function might come in useful. In particular, the max
    #       function can also return the index of the max element, for more
    #       information see 'help max'. If your examples are in rows, then, you
    #       can use max(A, [], 2) to obtain the max for each row.
    #
    return p
Example #37
def predict(Theta1, Theta2, X):
#PREDICT Predict the label of an input given a trained neural network
#   p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the
#   trained weights of a neural network (Theta1, Theta2)

    # turns 1D X array into 2D
    if X.ndim == 1:
        X = np.reshape(X, (-1,X.shape[0]))

    # Useful values
    m = X.shape[0]
    num_labels = Theta2.shape[0]

    # You need to return the following variables correctly 
    p = np.zeros((m,1))

    h1 = s.sigmoid( np.dot( np.column_stack( ( np.ones((m,1)), X ) ) , Theta1.T ) )
    h2 = s.sigmoid( np.dot( np.column_stack( ( np.ones((m,1)), h1) ) , Theta2.T ) )

    p = np.argmax(h2, axis=1)

    # =========================================================================

    return p + 1 # offsets python's zero notation
Example #38
def DecisionBoundary_reg(data, theta):
    PlotData(data, 'Microchip Test 1', 'Microchip Test 2', 'Accepted',
             'Not Accepted')
    X = data[:, 0:2]
    x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
    x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
    xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max),
                           np.linspace(x2_min, x2_max))
    XX = MapFeature(np.c_[xx1.ravel(), xx2.ravel()])
    XXX = np.hstack((np.ones((XX.shape[0], 1)), XX))
    h = sigmoid(
        XXX.dot(theta)
    )  #only difference from DecisionBoundary() is addition of MapFeature.....
    h = h.reshape(xx1.shape)  #.....same as MapFeature(X)*Theta
    plt.contour(xx1, xx2, h, [0.5], linewidths=1, colors='g')
 def SEAIRD(y, t):
     beta = sg.sigmoid(t - startT, beta0, beta01)
     S = y[0]
     E = y[1]
     A = y[2]
     I = y[3]
     R = y[4]
     p = 0.4
     y0 = (-(beta2 * A + beta * I) * S - mu * S)  #S
     y1 = (beta2 * A + beta * I) * S - sigma * E - mu * E  #E
     y2 = sigma * E * (1 - p) - mu * A - gamma2 * A  #A
     y3 = sigma * E * p - gamma * I - mu * I  #I
     y4 = (b * I + d * A - mu * R)  #R
     y5 = (-(y0 + y1 + y2 + y3 + y4))  #D
     return [y0, y1, y2, y3, y4, y5]
Example #40
def lrCostFunction(theta, X, y, lam):
    """
     lam: the lambda regularization (penalty) coefficient
     This is the regularized cost function.
    """
    theta = theta.T
    theta_Reg = theta[1:]  # do not penalize the first parameter

    # np.dot() is matrix multiplication in the usual linear-algebra sense
    # np.multiply() (or *) is the element-wise product
    g = S.sigmoid(np.dot(X, theta))
    J = np.sum((-y * np.log(g)) +
               (y - 1) * np.log(1 - g)) / len(X) + lam * np.sum(
                   theta_Reg * theta_Reg) / (2 * len(X))
    return J
Example #41
def gradient(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    
    error = sigmoid(X * theta.T) - y
    
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        grad[i] = np.sum(term) / len(X)
    
    return grad
Example #42
def predict(X, params, dimensions):
    assert len(dimensions) == 3

    W1, b1, W2, b2 = unpack_parmas(params, dimensions)

    z1 = np.dot(X, W1) + b1
    h = sigmoid(z1)

    z2 = np.dot(h, W2) + b2
    y = softmax(z2)
    y = y.argmax(axis=1)
    n_label = np.max(y) + 1
    y = np.eye(n_label,dtype=np.int64)[y]
    
    return y
Example #43
def Gradient(theta, X, y):
    X = np.matrix(X)
    y = np.matrix(y)
    theta = np.matrix(theta)
    parameters = int(theta.ravel().shape[1])
    G = np.zeros(parameters)
    #    G = 0
    m = len(X)

    for i in range(parameters):
        pred = sigmoid(X * theta.T)
        #    J = 1/m * ( np.sum(-np.multiply(y ,np.log(pred)) - np.multiply((1-y), np.log(1- pred))))
        G[i] = 1 / m * np.sum((np.multiply(X[:, i], (pred - y))))

    return G
Example #44
def costFunctionReg(theta, reg, X, y):
    '''returns the cost in a regularized manner,
    input is theta,lambda as reg,X and y as inputs and predicted value respectively
    np.log(a)==> returns array of elementwise log of element'''
    m = y.size
    h = sigmoid(X.dot(theta))
    theta_J = theta[1:]

    regparameter = (reg / (2 * m)) * (theta_J.T @ theta_J)  # the value added to the cost for regularization

    J = -1 * (1 / m) * ((np.log(h + epsilon).T).dot(y) +
                        np.log(1 - h + epsilon).T.dot(1 - y)) + regparameter

    return J
Example #45
def costFunction(theta, X, y):
    '''returns cost for theta, X and y
    np.log(a)==> returns array with elementwise log on array a
    use the sigmoid function that's being imported above 
    '''
    m = y.size
    h = sigmoid(X.dot(theta))
    y = np.array(y)
    h = np.array(h)
    #print(y.shape[0])
    J = -1 * (1 / m) * ((np.log(h).T).dot(y) + np.log(1 - h).T.dot(1 - y))
    if np.isnan(J):
        return np.inf
    return J
Example #46
def predictNN(X, Theta1, Theta2):
    """
        Predicts a class label p in the range 0 to K (the number of
        classes is K + 1) for each object described by the
        feature matrix X. The prediction is made with the trained
        parameter matrices Theta1, Theta2 of a three-layer neural network.
    """

    m = X.shape[0]
    p = np.zeros([m, 1])

    a1 = X
    a2 = sigmoid(np.dot(a1, Theta1.transpose()))
    a2 = np.concatenate((np.ones((m, 1)), a2), axis=1)

    a3 = sigmoid(np.dot(a2, Theta2.transpose()))
    h = a3

    p = np.argmax(h, axis=1)
    p = np.array([p]).transpose().astype('uint8')

    return p
Example #47
def costfunc(theta, X, Y, m):
    m = float(m)
    res = 0

    #print np.dot(theta.T, X)
    h = sigmoid(np.dot(X, theta))

    #tmp = ( Y.T * np.log(h) ) + ((1 - Y.T) * np.log(1 - h))
    tmp = np.dot(Y.T, np.log(h)) + np.dot((1 - Y.T), np.log(1 - h))

    res = -(1 / m * tmp)

    #tmp = 1 / m * np.dot(X.T, (h - Y))

    return res
Example #48
def predict(Theta1, Theta2, X):
    # Useful values
    m, n = X.shape
    num_labels = Theta2.shape[0]

    # You need to return the following variables correctly
    p = np.zeros((m, 1))

    # ====================== YOUR CODE HERE ======================

    def h(X, theta):
        return X.dot(theta)

    X = np.hstack((np.ones((m, 1)), X))

    a2 = sigmoid(h(X, Theta1.T))
    a2 = np.hstack((np.ones((m, 1)), a2))
    a3 = sigmoid(h(a2, Theta2.T))

    p = np.argmax(a3, axis=1) + 1

    # =============================================================

    return p
Example #49
def gradFunction(theta, *args):
    X, y, l = args

    #reshape theta
    theta = np.reshape(theta, (len(theta), 1))

    m = len(X)

    h = sigmoid(X.dot(theta))

    reg_theta = np.concatenate(([[0.]], theta[1:]))

    grad = 1.0 / m * (X.T.dot(h - y)) + l * reg_theta * (1.0 / m)

    return grad.flatten()
Example #50
def predict_function(theta, X, y=None):
    """
    Compute predictions on X using the parameters theta. If y is provided
    computes and returns the accuracy of the classifier as well.
    """

    preds = None
    accuracy = None
    threshold = 0.5
    score = np.dot(X, theta)
    preds_1 = sigmoid(score)
    preds = np.where(preds_1 >= threshold, 1, 0)
    accuracy = np.mean(y == preds)

    return preds, accuracy
Example #51
def predict(Theta1, Theta2, X):
    m, n = X.shape
    num_labels = Theta2.shape[0]

    X = np.concatenate((np.ones((m, 1)), X), axis=1)

    # print('num_labels: ', num_labels)
    # print('Theta1: ', Theta1.shape)
    # print('Theta2: ', Theta2.shape)
    # print('X: ', X.shape)

    a2 = sigmoid(X.dot(Theta1.T))
    m, n = a2.shape
    a2 = np.concatenate((np.ones((m, 1)), a2), axis=1)

    # print('a2: ', a2.shape)

    a3 = sigmoid(a2.dot(Theta2.T))

    # print('a3: ', a3.shape)

    p = np.argmax(a3, axis=1)

    return p + 1  # Matlab data is 1-indexed
def lr_cost_function(theta, X, y, lmd):
    m = len(y)
    # You need to return the following values correctly
    theta = theta.reshape((len(theta), 1))
    g = np.array(sigmoid(X.dot(theta)))
    cost = (np.sum(-y * np.log(g) - (1 - y) * np.log(1 - g)) +
            lmd / 2 * np.sum(np.power(theta[1:], 2))) / m

    # ===================== Your Code Here =====================
    # Instructions : Compute the cost of a particular choice of theta
    #                You should set cost and grad correctly.
    #

    # ===========================================================
    return cost
Example #53
def cost_function_reg(theta, X, y, l):
    """
    Compute cost and gradient for logistic regression with regularization.

    Parameters
    ----------
    theta : ndarray, shape (n_features,)
        Linear regression parameter.
    X : ndarray, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and n_features is the number of features.
    y : ndarray, shape (n_samples,)
        Labels.
    l : float
        Regularization parameter.

    Returns
    -------
    J : numpy.float64
        The cost of using theta as the parameter for regularized logistic regression w.r.t. the parameters.
    grad: ndarray, shape (n_features,)
        Partial derivatives of the cost w.r.t. each parameter in theta.
    """
    m, n = X.shape

    x_dot_theta = X.dot(theta)
    mask = np.eye(len(theta))
    # Skip the theta[0, 0] parameter when performing regularization
    mask[0, 0] = 0

    J = 1.0 / m * (np.dot(-y.T, np.log(sigmoid(x_dot_theta))) - np.dot((1 - y).T, np.log(1 - sigmoid(x_dot_theta)))) \
        + 1.0 * l / (2 * m) * np.sum(np.power((mask.dot(theta)), 2))

    grad = 1.0 / m * np.dot(
        (sigmoid(x_dot_theta) - y).T, X).T + 1.0 * l / m * (mask.dot(theta))

    return J, grad
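Because the function returns both the cost and its gradient, it can be passed to scipy.optimize.minimize with jac=True; a sketch on synthetic data (data and regularization strength are made up, and sigmoid is assumed to be in scope):

import numpy as np
from scipy.optimize import minimize

np.random.seed(0)
m = 100
X = np.hstack((np.ones((m, 1)), np.random.randn(m, 2)))
y = (X[:, 1] - X[:, 2] + 0.3 * np.random.randn(m) > 0).astype(float)

res = minimize(cost_function_reg, np.zeros(X.shape[1]), args=(X, y, 1.0),
               jac=True, method='BFGS')
print('theta:', res.x)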
Example #54
def gradientDescent(X, y, theta, alpha, num_iters):
    m = len(y)
    nfeatures = len(theta)
    J_history = np.zeros(num_iters)
    for ii in range(num_iters):
        z = np.dot(theta, X)
        h = sigmoid(z)
        for jj in range(nfeatures):
            theta[jj] -= alpha * sum((h - y) * X[jj, :]) / m
        if plotJ:
            J_history[ii] = computeCost(X, y, theta)
    if plotJ:
        p.plot(J_history)
        p.show()
    return theta
Example #55
def predict(theta, X):
    """ computes the predictions for X using a threshold at 0.5
    (i.e., if sigmoid(theta'*x) >= 0.5, predict 1)
    """
    p = 0
    # ====================== YOUR CODE HERE ======================
    # Instructions: Complete the following code to make predictions using
    #               your learned logistic regression parameters.
    #               You should set p to a vector of 0's and 1's
    #

    # =========================================================================
    p = sigmoid(np.dot(X, theta)) >= 0.5

    return p
Example #56
def predict(theta, X):
    '''Predict whether the label is 0 or 1 using learned logistic regression parameters theta'''
    #   p = PREDICT(theta, X) computes the predictions for X using a
    #   threshold at 0.5 (i.e., if sigmoid(theta'*x) >= 0.5, predict 1)

    # You need to return the following variables correctly

    # ====================== YOUR CODE HERE ======================
    # Instructions: Complete the following code to make predictions using
    #               your learned logistic regression parameters.
    #               You should set p to a vector of 0's and 1's
    #

    p = sigmoid(X.dot(theta)) >= 0.5
    return p
def costFunction(theta, X,y):
    """ computes the cost of using theta as the
    parameter for logistic regression and the
    gradient of the cost w.r.t. to the parameters."""

# Initialize some useful values
    m = len(y) # number of training examples
    J=0


# ====================== YOUR CODE HERE ======================
# Instructions: Compute the cost of a particular choice of theta.
#               You should set J to the cost.
#               Compute the partial derivatives and set grad to the partial
#               derivatives of the cost w.r.t. each parameter in theta
#
# Note: grad should have the same dimensions as theta
#theta.shape=(3,), X.Shape=(100,3), y.shape=(100,) 

    term_1 = y * np.transpose(np.log(sigmoid(np.dot(X, theta))))
    term_2 = (1 - y) * np.transpose(np.log(1 - sigmoid(np.dot(X, theta))))
    J = -(1./m)*(term_1 + term_2).sum()
    
    return J
def costFunction(nn_weights, layers, X, y, num_labels, lambd):
    # Computes the cost function of the neural network.
    # nn_weights: Neural network parameters (vector)
    # layers: a list with the number of units per layer.
    # X: a matrix where every row is a training example for a handwritten digit image
    # y: a vector with the labels of each instance
    # num_labels: the number of units in the output layer
    # lambd: regularization factor
    
    # Setup some useful variables
    m = X.shape[0]
    num_layers = len(layers)

    # Unroll Params
    Theta = roll_params(nn_weights, layers)

    # ================================ TODO ================================
    # The vector y passed into the function is a vector of labels
    # containing values from 1..K. You need to map this vector into a 
    # binary vector of 1's and 0's to be used with the neural network
    # cost function.
    yv = np.zeros((num_labels, m))
    for i in range(len(y)):
        yv[int(y[i]), i] = 1
    yv = np.transpose(yv)

    # ================================ TODO ================================
    # In this point calculate the cost of the neural network (feedforward)
    x = np.copy(X)

    for i in range(num_layers - 1):
        s = np.shape(Theta[i])
        theta = Theta[i][:, 1:s[1]]
        x = np.dot(x, np.transpose(theta))
        x = x + Theta[i][:, 0]
        x = sigmoid(x)

    cost = (yv * np.log(x) + (1 - yv) * np.log(1 - x)) / m
    cost = -np.sum(cost)

    somme = 0

    for i in range(num_layers - 1):
        somme += lambd * np.sum(Theta[i][:, 1:] ** 2) / (2 * m)  # skip the bias column

    cost += somme

    return cost
def logistic_SGD(X, y, num_iter=10000, alpha=0.01):
    """
    Perform logistic regression with stochastic gradient descent.

    Args:
        theta_0: Initial value for parameters of shape [num_features]
        X: Data matrix of shape [num_train, num_features]
        y: Labels corresponding to X of size [num_train, 1]
        num_iter: Number of iterations of SGD
        alpha: The learning rate

    Returns:
        theta: The value of the parameters after logistic regression

    """

    theta = np.zeros(X.shape[1])
    losses = []
    new_loss = cost_function(theta, X, y)
    for i in range(num_iter):
        start = time.time()

        N = len(X)
        predictions = sigmoid(np.dot(X, theta))
        gradient = np.dot(X.T, predictions - y) / N
        theta -= alpha * gradient

        if i % 1000 == 0:
            exec_time = time.time() - start
            loss = cost_function(theta, X, y)
            losses.append(loss)
            print('Iter {}/{}: cost = {}  ({}s)'.format(
                i, num_iter, loss, exec_time))
            alpha *= 0.9

    return theta, losses
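A usage sketch on synthetic data (a cost_function(theta, X, y) helper in the style of the logistic costFunction snippets above, plus sigmoid and the time module, are assumed to be available in the same namespace):

import numpy as np

np.random.seed(0)
m = 500
X = np.hstack((np.ones((m, 1)), np.random.randn(m, 2)))
y = (2 * X[:, 1] - X[:, 2] + 0.5 * np.random.randn(m) > 0).astype(float)

theta, losses = logistic_SGD(X, y, num_iter=5000, alpha=0.1)
print('final theta:', theta, 'final cost:', losses[-1])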
Example #60
def forward_backward_prop(data, labels, params, dimensions):
    """
    Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    and backward propagation for the gradients for all parameters.

    Arguments:
    data -- M x Dx matrix, where each row is a training example.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension
    """

    # Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # YOUR CODE HERE: forward propagation
    z1 = np.dot(data, W1) + b1
    h = sigmoid(z1)
    z2 = np.dot(h, W2) + b2
    y_guess = softmax(z2)
    cost = -np.sum(labels * np.log(y_guess))  # cross entropy loss
    # END YOUR CODE

    # YOUR CODE HERE: backward propagation
    diff_labels = y_guess - labels
    gradb2 = np.sum(diff_labels, axis=0)
    gradW2 = np.dot(h.T, diff_labels)
    gradb1 = np.sum(np.dot(diff_labels, W2.T) * sigmoid_grad(h), axis=0)
    gradW1 = np.dot(data.T, np.dot(diff_labels, W2.T) * sigmoid_grad(h))
    # END YOUR CODE

    # Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(),
                           gradW2.flatten(), gradb2.flatten()))

    return cost, grad
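A finite-difference check of these gradients on a tiny network (random data and dimensions; the sigmoid, sigmoid_grad and softmax helpers the snippet relies on are assumed to be defined; the printed difference should be tiny when the analytic and numeric gradients agree):

import numpy as np

np.random.seed(0)
dimensions = (3, 4, 2)                                   # Dx, H, Dy
Dx, H, Dy = dimensions
params = np.random.randn(Dx * H + H + H * Dy + Dy)
data = np.random.randn(5, Dx)
labels = np.eye(Dy)[np.random.randint(0, Dy, 5)]         # one-hot rows

cost, grad = forward_backward_prop(data, labels, params, dimensions)

eps = 1e-5
numgrad = np.zeros_like(params)
for i in range(len(params)):
    step = np.zeros_like(params)
    step[i] = eps
    c_plus = forward_backward_prop(data, labels, params + step, dimensions)[0]
    c_minus = forward_backward_prop(data, labels, params - step, dimensions)[0]
    numgrad[i] = (c_plus - c_minus) / (2 * eps)

print('max |grad - numgrad| =', np.max(np.abs(grad - numgrad)))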