def checkNNGradients(lambd):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    layers = [3, 5, 3]

    # At this point we generate some random test data
    Theta = []
    Theta.append(debugInitializeWeights(hidden_layer_size, input_layer_size))
    Theta.append(debugInitializeWeights(num_labels, hidden_layer_size))
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = remainder(arange(m) + 1, num_labels)

    # Unroll parameters
    nn_params = unroll_params(Theta)

    # Compute Numerical Gradient
    numgrad = computeNumericalGradient(nn_params, layers, X, y, num_labels, lambd)

    # Compute Analytical Gradient (BackPropagation)
    truegrad = backwards(nn_params, layers, X, y, num_labels, lambd)

    print(concatenate(([numgrad], [truegrad]), axis=0).transpose())
    print("The above two columns must be very similar.\n"
          "(Left-Numerical Gradient, Right-Analytical Gradient (BackPropagation))\n")

    diff = linalg.norm(numgrad - truegrad) / linalg.norm(numgrad + truegrad)
    print("\nNote: If the implementation of the backpropagation is correct, "
          "the relative difference must be quite small (less than 1e-09).")
    print("Relative difference: " + str(diff) + "\n")
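
# The unroll_params helper used above is not shown in this snippet; a minimal
# sketch, under the assumption that it simply flattens the list of weight
# matrices into one 1-d parameter vector (bare numpy names as in the function
# above):
def unroll_params(Theta):
    # Concatenate every weight matrix, flattened, into a single vector
    return concatenate([T.flatten() for T in Theta])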
def checkNumericalGradient():
    """This code can be used to check your numerical gradient implementation
    in computeNumericalGradient.m.

    It analytically evaluates the gradient of a very simple function called
    simpleQuadraticFunction (see below) and compares the result with your
    numerical solution. Your numerical gradient implementation is incorrect
    if your numerical solution deviates too much from the analytical solution.
    """
    # Evaluate the function and gradient at x = [4; 10]. (Here, x is a 2d vector.)
    x = np.array([4, 10])
    _, grad = simpleQuadraticFunction(x)

    # Use your code to numerically compute the gradient of
    # simpleQuadraticFunction at x.
    numgrad = computeNumericalGradient(simpleQuadraticFunction, x)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.stack((numgrad, grad)).T)
    print('The above two columns you get should be very similar.')
    print('(Left-Your Numerical Gradient, Right-Analytical Gradient)\n')

    # Evaluate the norm of the difference between the two solutions.
    # If you have a correct implementation, and assuming you used epsilon = 0.0001
    # in computeNumericalGradient.m, then diff below should be 2.1452e-12.
    diff = norm(numgrad - grad) / norm(numgrad + grad)
    print(diff)
    print('Norm of the difference between numerical and analytical gradient '
          '(should be < 1e-9)\n')
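
# The helper above is referenced but not defined in this snippet. A minimal
# sketch, assuming the quadratic from the UFLDL exercise,
# f(x) = x1^2 + 3*x1*x2; your own simpleQuadraticFunction may differ.
def simpleQuadraticFunction(x):
    # value: f(x); grad: its analytical gradient [2*x1 + 3*x2, 3*x1]
    value = x[0] ** 2 + 3 * x[0] * x[1]
    grad = np.array([2 * x[0] + 3 * x[1], 3 * x[0]])
    return value, grad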
def checkCostFunction(lamda=0):
    # Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = X_t.dot(Theta_t.T)
    Y[np.where(np.random.random_sample(Y.shape) > 0.5)] = 0
    R = np.zeros(Y.shape)
    R[np.where(Y != 0)] = 1

    # Run Gradient Checking
    X = np.random.random_sample(X_t.shape)
    Theta = np.random.random_sample(Theta_t.shape)
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    # params = np.hstack((X.T.flatten(), Theta.T.flatten()))
    costFunc = lambda X, Theta: cofiCostFunc(X, Theta, Y, R, lamda)
    costFunc_w = lambda X, Theta: costFunc(X, Theta)[0]

    numgrad = computeNumericalGradient(costFunc_w, X, Theta)
    cost, grad = cofiCostFunc(X, Theta, Y, R, lamda)

    print(grad)
    print(numgrad)
def checkNumericalGradient():
    # This code can be used to check your numerical gradient implementation
    # in computeNumericalGradient.m.
    # It analytically evaluates the gradient of a very simple function called
    # simpleQuadraticFunction (see below) and compares the result with your
    # numerical solution. Your numerical gradient implementation is incorrect
    # if your numerical solution deviates too much from the analytical solution.

    # Evaluate the function and gradient at x = [4; 10]. (Here, x is a 2d vector.)
    x = np.array([4, 10]).reshape((-1, 1))
    value, grad = simpleQuadraticFunction(x)

    # Use your code to numerically compute the gradient of
    # simpleQuadraticFunction at x.
    # (In the original Octave code, "@simpleQuadraticFunction" denotes a
    # function handle; in Python the function object is passed directly.)
    numgrad = computeNumericalGradient(simpleQuadraticFunction, x)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print([numgrad, grad])
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n')

    # Evaluate the norm of the difference between the two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be 2.1452e-12.
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print(diff)
    print('Norm of the difference between numerical and analytical gradient '
          '(should be < 1e-9)\n')
def checkNNGradients(Lambda=0):
    """Creates a small neural network to check the backpropagation gradients,
    it will output the analytical gradients produced by your backprop code
    and the numerical gradients (computed using computeNumericalGradient).
    These two gradient computations should result in very similar values.
    """
    input_layer_size = 4
    hidden_layer_size = 5
    num_labels = 3
    m = 10

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod(range(1, m + 1), num_labels)

    # Unroll parameters
    nn_params = np.hstack((Theta1.T.ravel(), Theta2.T.ravel()))

    # Short hand for cost function
    costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size,
                                        num_labels, X, y, Lambda)

    numgrad = computeNumericalGradient(costFunc, nn_params)
    grad = costFunc(nn_params)[1]

    print(numgrad)
    print(grad)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9.
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: %g\n' % diff)

    return (nn_params, grad)
def checkNNGradients(lambda_reg=0):
    #CHECKNNGRADIENTS Creates a small neural network to check the
    #backpropagation gradients
    #   CHECKNNGRADIENTS(lambda_reg) Creates a small neural network to check the
    #   backpropagation gradients, it will output the analytical gradients
    #   produced by your backprop code and the numerical gradients (computed
    #   using computeNumericalGradient). These two gradient computations should
    #   result in very similar values.
    from decimal import Decimal  # used for the high-precision ratio below

    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = diw.debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = diw.debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = diw.debugInitializeWeights(m, input_layer_size - 1)
    y = 1 + np.mod(range(m), num_labels).T

    # Unroll parameters
    nn_params = np.concatenate((Theta1.reshape(Theta1.size, order='F'),
                                Theta2.reshape(Theta2.size, order='F')))

    # Short hand for cost function
    def costFunc(p):
        return nncf.nnCostFunction(p, input_layer_size, hidden_layer_size,
                                   num_labels, X, y, lambda_reg)

    _, grad = costFunc(nn_params)
    numgrad = cng.computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    # code from http://stackoverflow.com/a/27663954/583834
    fmt = '{:<25}{}'
    print(fmt.format('Numerical Gradient', 'Analytical Gradient'))
    for numerical, analytical in zip(numgrad, grad):
        print(fmt.format(numerical, analytical))

    print('The above two columns you get should be very similar.\n'
          '(Left Col.: Your Numerical Gradient, Right Col.: Analytical Gradient)')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9.
    diff = Decimal(np.linalg.norm(numgrad - grad)) / Decimal(np.linalg.norm(numgrad + grad))

    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: {:.10E}'.format(diff))
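
# A minimal sketch of the central-difference routine these checks assume; the
# epsilon value mirrors the EPSILON = 0.0001 mentioned in the comments above,
# but the exact signature varies across the snippets in this file.
def computeNumericalGradient(J, theta, epsilon=1e-4):
    """Numerically approximate the gradient of J at theta.

    J returns either the cost alone or a (cost, grad) tuple; only the cost is
    used here. Each coordinate is perturbed by +/- epsilon and the centered
    difference (J(theta + e) - J(theta - e)) / (2 * epsilon) is taken.
    """
    numgrad = np.zeros_like(theta, dtype=float)
    perturb = np.zeros_like(theta, dtype=float)
    for i in range(theta.size):
        perturb.flat[i] = epsilon
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        # If J returns (cost, grad), keep only the cost
        if isinstance(loss1, tuple):
            loss1, loss2 = loss1[0], loss2[0]
        numgrad.flat[i] = (loss2 - loss1) / (2.0 * epsilon)
        perturb.flat[i] = 0.0
    return numgrad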
def checkNNGradients(NNlambda=0.0):
    input_layer_size = 3
    hidden1_layer_size = 5
    hidden2_layer_size = 4
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(input_layer_size, hidden1_layer_size)
    Theta2 = debugInitializeWeights(hidden1_layer_size, hidden2_layer_size)
    Theta3 = debugInitializeWeights(hidden2_layer_size, num_labels)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = 1.0 + transpose(mod(range(0, m), num_labels))

    # Unroll parameters
    nn_params = concatenate((Theta1.flatten(), Theta2.flatten(), Theta3.flatten()))

    miniBatchSize = 1000.0
    theta = nn_params
    counter = 0
    numberOfIterations = range(int(ceil(X.shape[0] / miniBatchSize)))
    for i in numberOfIterations:
        values2Train = list(range(counter, counter + int(miniBatchSize)))
        counter = max(values2Train) + 1
        while X.shape[0] <= max(values2Train):
            values2Train.remove(values2Train[-1])
        arguments = (input_layer_size, hidden1_layer_size, hidden2_layer_size,
                     num_labels, X[values2Train, :], y[values2Train, :], NNlambda)
        theta = optimize.fmin_l_bfgs_b(nnCostFunction, x0=theta,
                                       fprime=nnGradFunction, args=arguments,
                                       maxiter=20, disp=True, iprint=0)
        theta = array(theta[0])

    cost = nnCostFunction(theta, input_layer_size, hidden1_layer_size,
                          hidden2_layer_size, num_labels, X, y, NNlambda)
    grad = nnGradFunction(theta, input_layer_size, hidden1_layer_size,
                          hidden2_layer_size, num_labels, X, y, NNlambda)
    numgrad = computeNumericalGradient(theta, input_layer_size, hidden1_layer_size,
                                       hidden2_layer_size, num_labels, X, y, NNlambda)

    # Visually examine the two gradient computations. The two columns you get
    # should be very similar.
    print(hstack((numgrad, grad)))
    print('The above two columns you get should be very similar.')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9.
    diff = linalg.norm(numgrad - grad) / linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then the relative '
          'difference will be small (less than 1e-9).')
    print('Relative Difference:')
    print(diff)
    return diff
def run_training(FLAGS, images, labels):
    # For debugging purposes, you may wish to reduce the size of the input data
    # in order to speed up gradient checking.
    # Here, we create a synthetic dataset using random data for testing.
    if FLAGS.debug:
        inputSize = 8
        images = randn(8, 100)
        labels = randint(0, numClasses, 100, dtype=np.uint8)
    else:
        inputSize = FLAGS.visibleSize

    numClasses = 5
    decay = FLAGS.decay

    # Randomly initialise theta
    theta = 0.005 * randn(numClasses * inputSize)

    # Implement softmaxCost in softmax.py.
    cost, grad = softmaxCost(theta, numClasses, inputSize, decay, images, labels)

    # As with any learning algorithm, you should always check that your
    # gradients are correct before learning the parameters.
    if FLAGS.debug:
        # First, let's make sure your numerical gradient computation is correct
        # for a simple function. After you have implemented
        # computeNumericalGradient.py, run the following:
        # checkNumericalGradient()

        numGrad = computeNumericalGradient(
            lambda x: softmaxCost(x, numClasses, inputSize, decay, images, labels),
            theta)

        # Use this to visually compare the gradients side by side.
        print(np.stack((numGrad, grad)).T)

        # Compare numerically computed gradients with those computed analytically.
        diff = norm(numGrad - grad) / norm(numGrad + grad)
        print(diff)
        sys.exit(1)
        # The difference should be small.
        # In our implementation, these values are usually less than 1e-7.

    # Once you have verified that your gradients are correct,
    # you can start training your softmax regression code using L-BFGS.
    theta, _, _ = fmin_l_bfgs_b(softmaxCost, theta,
                                args=(numClasses, inputSize, decay, images, labels),
                                maxiter=400, disp=1)

    # Fold parameters into a matrix format.
    theta = np.reshape(theta, (numClasses, inputSize))

    return theta
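
# softmaxCost is left for the reader to implement in the exercise; a minimal
# sketch of one common formulation, assuming labels are integers in
# [0, numClasses) and data has shape (inputSize, m). Conventions (such as
# returning the gradient flattened) vary between ports.
def softmaxCost(theta, numClasses, inputSize, decay, data, labels):
    theta = theta.reshape(numClasses, inputSize)
    m = data.shape[1]

    # Column-wise softmax, with the usual max-subtraction for numerical stability
    scores = theta.dot(data)
    scores -= scores.max(axis=0, keepdims=True)
    probs = np.exp(scores)
    probs /= probs.sum(axis=0, keepdims=True)

    # One-hot ground-truth matrix, shape (numClasses, m)
    ground_truth = np.zeros((numClasses, m))
    ground_truth[labels, np.arange(m)] = 1.0

    # Cross-entropy loss plus weight decay
    cost = -np.sum(ground_truth * np.log(probs)) / m \
           + 0.5 * decay * np.sum(theta ** 2)
    grad = -(ground_truth - probs).dot(data.T) / m + decay * theta
    return cost, grad.ravel()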
def checkCostFunction(lambda_var=0):
    #CHECKCOSTFUNCTION Creates a collaborative filtering problem
    #to check your cost function and gradients
    #   CHECKCOSTFUNCTION(lambda_var) Creates a collaborative filtering problem
    #   to check your cost function and gradients, it will output the
    #   analytical gradients produced by your code and the numerical gradients
    #   (computed using computeNumericalGradient). These two gradient
    #   computations should result in very similar values.

    # Set lambda_var
    # if not lambda_var or not 'lambda_var' in locals():
    #     lambda_var = 0

    ## Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.random.rand(Y.shape[0], Y.shape[1]) > 0.5] = 0
    R = np.zeros(Y.shape)
    R[Y != 0] = 1

    ## Run Gradient Checking
    X = np.random.randn(X_t.shape[0], X_t.shape[1])
    Theta = np.random.randn(Theta_t.shape[0], Theta_t.shape[1])
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    params = np.concatenate((X.reshape(X.size, order='F'),
                             Theta.reshape(Theta.size, order='F')))

    # Short hand for cost function
    def costFunc(p):
        return cofiCostFunc(p, Y, R, num_users, num_movies, num_features, lambda_var)

    numgrad = computeNumericalGradient(costFunc, params)
    cost, grad = cofiCostFunc(params, Y, R, num_users, num_movies,
                              num_features, lambda_var)

    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).'
          '\nRelative Difference: {:e}'.format(diff))
def checkNNGradients(_lambda=0):
    #CHECKNNGRADIENTS Creates a small neural network to check the
    #backpropagation gradients
    #   CHECKNNGRADIENTS(lambda) Creates a small neural network to check the
    #   backpropagation gradients, it will output the analytical gradients
    #   produced by your backprop code and the numerical gradients (computed
    #   using computeNumericalGradient). These two gradient computations should
    #   result in very similar values.

    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod(np.arange(0, m), num_labels).reshape(-1, 1)

    # Unroll parameters
    nn_params = np.hstack((Theta1.flatten(), Theta2.flatten()))

    # Short hand for cost function
    costFunc = functools.partial(nnCostFunction,
                                 input_layer_size=input_layer_size,
                                 hidden_layer_size=hidden_layer_size,
                                 num_labels=num_labels,
                                 X=X, y=y, _lambda=_lambda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.vstack((numgrad, grad)).T)
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9.
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          'Relative Difference:', diff)
def checkCostFunction(Lambda=0):
    """Creates a collaborative filtering problem to check your cost function
    and gradients, it will output the analytical gradients produced by your
    code and the numerical gradients (computed using
    computeNumericalGradient). These two gradient computations should result
    in very similar values.
    """
    ## Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = X_t.dot(Theta_t.T)
    Y[np.where(np.random.random_sample(Y.shape) > 0.5, True, False)] = 0
    R = np.zeros(Y.shape)
    R[np.where(Y != 0, True, False)] = 1

    ## Run Gradient Checking
    X = np.random.random_sample(X_t.shape)
    Theta = np.random.random_sample(Theta_t.shape)
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    # Unroll parameters
    params = np.hstack((X.T.flatten(), Theta.T.flatten()))

    costFunc = lambda t: cofiCostFunc(t, Y, R, num_users, num_movies,
                                      num_features, Lambda)

    def costFunc_w(t):
        Jgrad = costFunc(t)
        return Jgrad

    numgrad = computeNumericalGradient(costFunc_w, params)
    cost, grad = cofiCostFunc(params, Y, R, num_users, num_movies,
                              num_features, Lambda)

    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: %g\n' % diff)
def checkCostFunction(lambda_var=0):
    #CHECKCOSTFUNCTION Creates a collaborative filtering problem
    #to check your cost function and gradients
    #   CHECKCOSTFUNCTION(lambda_var) Creates a collaborative filtering problem
    #   to check your cost function and gradients, it will output the
    #   analytical gradients produced by your code and the numerical gradients
    #   (computed using computeNumericalGradient). These two gradient
    #   computations should result in very similar values.

    # Set lambda_var
    # if not lambda_var or not 'lambda_var' in locals():
    #     lambda_var = 0

    ## Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.random.rand(Y.shape[0], Y.shape[1]) > 0.5] = 0
    R = np.zeros(Y.shape)
    R[Y != 0] = 1

    ## Run Gradient Checking
    X = np.random.randn(X_t.shape[0], X_t.shape[1])
    Theta = np.random.randn(Theta_t.shape[0], Theta_t.shape[1])
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    params = np.concatenate((X.reshape(X.size, order='F'),
                             Theta.reshape(Theta.size, order='F')))

    # Short hand for cost function
    def costFunc(p):
        return ccf.cofiCostFunc(p, Y, R, num_users, num_movies,
                                num_features, lambda_var)

    numgrad = cng.computeNumericalGradient(costFunc, params)
    cost, grad = ccf.cofiCostFunc(params, Y, R, num_users, num_movies,
                                  num_features, lambda_var)

    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).'
          '\nRelative Difference: {:e}'.format(diff))
def checkNNGradients(reg_lambda=0):
    """ Creates a small neural network to check the backpropagation gradients
    CHECKNNGRADIENTS(reg_lambda) Creates a small neural network to check the
    backpropagation gradients, it will output the analytical gradients
    produced by your backprop code and the numerical gradients (computed
    using computeNumericalGradient). These two gradient computations should
    result in very similar values."""
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod(np.arange(m), num_labels).T.reshape(m, 1)

    # Unroll parameters
    nn_params = np.r_[Theta1.ravel(), Theta2.ravel()]

    # Short hand for cost function
    costFunc = lambda params: nnCostFunction(
        params, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.py, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference:\n', diff)
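
# debugInitializeWeights is used throughout these checks but never shown; a
# minimal sketch, assuming the course convention of a deterministic sin-based
# initialization so that gradient checks are reproducible. The exact layout
# may differ in your version.
def debugInitializeWeights(fan_out, fan_in):
    # One extra column handles the bias term; values are small and fixed.
    W = np.sin(np.arange(1, fan_out * (fan_in + 1) + 1)) / 10.0
    return W.reshape(fan_out, fan_in + 1, order='F')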
def checkNNGradients(lambda_=0):
    """ Creates a small neural network to check the backpropagation gradients,
    it will output the analytical gradients produced by your backprop code and
    the numerical gradients (computed using computeNumericalGradient).
    These two gradient computations should result in very similar values.
    """
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.arange(1, m + 1) % num_labels

    # Unroll parameters
    nn_params = np.r_[Theta1.flatten(order='F'), Theta2.T.flatten(order='F')]

    # Short hand for cost function
    def costFunction(p):
        return nnCostFunction(p, input_layer_size, hidden_layer_size,
                              num_labels, X, y, lambda_)

    numgrad = computeNumericalGradient(costFunction, nn_params)
    _, grad = costFunction(nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.c_[numgrad, grad])
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 1e-4
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9). \n'
          '\nRelative Difference: %g\n' % diff)
def checkNNGradients(Lambda=0):
    """Creates a small neural network to check the backpropagation gradients,
    it will output the analytical gradients produced by your backprop code
    and the numerical gradients (computed using computeNumericalGradient).
    These two gradient computations should result in very similar values.
    """
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod(range(1, m + 1), num_labels)

    # Unroll parameters
    nn_params = np.hstack((Theta1.T.ravel(), Theta2.T.ravel()))

    # Short hand for cost function
    costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size,
                                        num_labels, X, y, Lambda)

    numgrad = computeNumericalGradient(costFunc, nn_params)
    grad = costFunc(nn_params)[1]

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9). \n'
          '\nRelative Difference: %g\n' % diff)
def checkCostFunction(reg_lambda=0):
    """ Creates a collaborative filtering problem to check your cost function
    and gradients
    checkCostFunction(lambda) Creates a collaborative filtering problem
    to check your cost function and gradients, it will output the
    analytical gradients produced by your code and the numerical gradients
    (computed using computeNumericalGradient). These two gradient
    computations should result in very similar values."""
    # Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = X_t.dot(Theta_t.T)
    rand_data = np.random.randn(*Y.shape)
    Y[np.where(rand_data > 0.5)] = 0
    R = np.zeros(Y.shape)
    R[np.where(Y != 0)] = 1

    # Run Gradient Checking
    X = np.random.randn(*X_t.shape)
    Theta = np.random.randn(*Theta_t.shape)
    num_movies, num_users = Y.shape
    num_features = Theta_t.shape[1]

    # build params
    params = np.r_[X.flatten(), Theta.flatten()].reshape(-1, 1)

    costFunc = lambda t: cofiCostFunc(t, Y, R, num_users, num_movies,
                                      num_features, reg_lambda)

    numgrad = computeNumericalGradient(costFunc, params)
    cost, grad = costFunc(params)

    # make sure both gradients have the same shape
    grad = grad.reshape(numgrad.shape)

    print(np.c_[numgrad.ravel(), grad.ravel()])
    print('The above two columns you get should be very similar. '
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your cost function implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference:\n', diff)
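
# cofiCostFunc itself is not shown in this file; a minimal sketch of the
# regularized collaborative-filtering cost and gradient, assuming the
# parameter vector packs X (num_movies x num_features) first and Theta
# (num_users x num_features) second, both row-major:
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, reg_lambda):
    X = params[:num_movies * num_features].reshape(num_movies, num_features)
    Theta = params[num_movies * num_features:].reshape(num_users, num_features)

    # Error counts only on rated entries (R == 1)
    err = (X.dot(Theta.T) - Y) * R
    J = 0.5 * np.sum(err ** 2) \
        + reg_lambda / 2.0 * (np.sum(Theta ** 2) + np.sum(X ** 2))

    X_grad = err.dot(Theta) + reg_lambda * X
    Theta_grad = err.T.dot(X) + reg_lambda * Theta
    grad = np.r_[X_grad.ravel(), Theta_grad.ravel()]
    return J, grad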
def checkNNGradients(lmbda=0):
    #CHECKNNGRADIENTS Creates a small neural network to check the
    #backpropagation gradients
    #   CHECKNNGRADIENTS(lmbda) Creates a small neural network to check the
    #   backpropagation gradients, it will output the analytical gradients
    #   produced by your backprop code and the numerical gradients (computed
    #   using computeNumericalGradient). These two gradient computations should
    #   result in very similar values.

    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = (np.reshape(np.mod(range(0, m), num_labels), (1, m))).flatten()

    # Unroll parameters
    nn_params = np.concatenate((Theta1.ravel(), Theta2.ravel()), axis=0)

    # Short hand for cost function
    costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size,
                                        num_labels, X, y, lmbda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.concatenate((np.reshape(numgrad, (1, numgrad.size)).T,
                          np.reshape(grad, (1, grad.size)).T), axis=1))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          'Relative Difference: {}\n'.format(diff))
def checkCostFunction(xlambda=0):
    # Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.where(np.random.rand(Y.shape[0], Y.shape[1]) > 0.5)] = 0
    R = np.zeros(np.shape(Y))
    R[np.where(Y != 0)] = 1

    # Run Gradient Checking
    X = np.random.randn(X_t.shape[0], X_t.shape[1])
    Theta = np.random.randn(Theta_t.shape[0], Theta_t.shape[1])
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    # cost function
    def cost_func(p):
        return cCF.cofiCostFunc(p, Y, R, num_users, num_movies, num_features, xlambda)

    nn_params = np.r_[(X.ravel().reshape(num_movies * num_features, 1),
                       Theta.ravel().reshape(num_users * num_features, 1))]

    numgrad = cNG.computeNumericalGradient(cost_func, nn_params)
    cost, grad = cCF.cofiCostFunc(nn_params, Y, R, num_users, num_movies,
                                  num_features, xlambda)

    # Visually examine the two gradient computations. The two columns you get
    # should be very similar.
    print(np.c_[numgrad, grad])
    print('The above two columns you get should be very similar.\n'
          '(Left: Numerical Gradient\tRight: Analytical Gradient)')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your cost function implementation is correct, then the\n'
          'relative difference will be small (less than 1e-9).\n'
          'Relative Difference: %.16f' % diff)
def checkNNGradients(_lambda=None):
    if _lambda is None:
        _lambda = 0

    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = (1 + np.arange(m)) % num_labels
    y = y.reshape(-1, 1)

    # Unroll parameters
    nn_params = np.append(Theta1.flatten(), Theta2.flatten())

    # Short hand for cost function
    costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size,
                                        num_labels, X, y, _lambda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(grad)
    print(numgrad)
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: %g\n' % diff)
def checkCostFunction(_lambda=None):
    if _lambda is None:
        _lambda = 0

    ## Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.where(np.random.rand(Y.shape[0], Y.shape[1]) > 0.5)] = 0
    R = np.zeros(Y.shape)
    R[np.where(Y != 0)] = 1
    R = R.astype(int)

    ## Run Gradient Checking
    X = np.random.randn(X_t.shape[0], X_t.shape[1])
    Theta = np.random.randn(Theta_t.shape[0], Theta_t.shape[1])
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    func = lambda t: cofiCostFunc(t, Y, R, num_users, num_movies,
                                  num_features, _lambda)

    numgrad = computeNumericalGradient(func, np.append(X.flatten(), Theta.flatten()))
    cost, grad = cofiCostFunc(np.append(X.flatten(), Theta.flatten()),
                              Y, R, num_users, num_movies, num_features, _lambda)

    print(numgrad)
    print(grad)
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: %g\n' % diff)
def checkNNGradients(lambda_value=0):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = 1 + np.transpose(np.mod(range(1, m + 1), num_labels))
    # y = np.expand_dims(y, axis=1)

    # Unroll parameters
    Theta1_1d = np.reshape(Theta1, Theta1.size, order='F')
    Theta2_1d = np.reshape(Theta2, Theta2.size, order='F')
    nn_params = np.hstack((Theta1_1d, Theta2_1d))

    # Short hand for cost function
    costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size,
                                        num_labels, X, y, lambda_value)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, np.expand_dims(nn_params, axis=1))

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.column_stack((numgrad, grad)))
    print('The above two columns you get should be very similar.\n'
          '(Left-Numerical Gradient, Right-(Your) Analytical Gradient)\n\n')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: ', diff)
def checkCostFunction(_lambda=0):
    from decimal import Decimal  # used for the high-precision ratio below

    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.random.rand(Y.shape[0], Y.shape[1]) > 0.5] = 0
    R = np.zeros(Y.shape)
    R[Y != 0] = 1

    # Run Gradient Checking
    X = np.random.randn(X_t.shape[0], X_t.shape[1])
    Theta = np.random.randn(Theta_t.shape[0], Theta_t.shape[1])
    num_users = Y.shape[1]
    num_movies = Y.shape[0]
    num_features = Theta_t.shape[1]

    params = np.concatenate((X.reshape(X.size, order='F'),
                             Theta.reshape(Theta.size, order='F')))

    def costFunc(t):
        return cofiCostFunc(t, Y, R, num_users, num_movies, num_features,
                            _lambda, True)

    _, grad = costFunc(params)
    numgrad = computeNumericalGradient(costFunc, params)

    print('Numerical Gradient', 'Analytical Gradient')
    for numerical, analytical in zip(numgrad, grad):
        print(numerical, analytical)

    print('The above two columns you get should be very similar.\n'
          '(Left Col.: Your Numerical Gradient, Right Col.: Analytical Gradient)')

    diff = Decimal(np.linalg.norm(numgrad - grad)) / Decimal(np.linalg.norm(numgrad + grad))
    print('If your backpropagation implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          '\nRelative Difference: {:.10E}'.format(diff))
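
# As an aside: SciPy ships a ready-made version of this check,
# scipy.optimize.check_grad, which returns the 2-norm of the difference
# between the forward-difference numerical gradient and your analytical one.
# A self-contained example on the simple quadratic used elsewhere in this file:
import numpy as np
from scipy.optimize import check_grad

def f(x):
    return x[0] ** 2 + 3 * x[0] * x[1]

def grad_f(x):
    return np.array([2 * x[0] + 3 * x[1], 3 * x[0]])

err = check_grad(f, grad_f, np.array([4.0, 10.0]))
print(err)  # should be tiny (roughly 1e-6 or smaller for this function)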
def checkNNGradients(lmbda=0):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod(range(m), num_labels)

    # Unroll parameters
    nn_params = np.hstack((Theta1.flatten(), Theta2.flatten()))

    # Short hand for cost function
    costFunc = lambda nn_params: nnCostFunction(nn_params, input_layer_size,
                                                hidden_layer_size, num_labels,
                                                X, y, lmbda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.vstack((numgrad, grad)).T)
    print('The above two columns you get should be very similar.')
    print('(Left-Your Numerical Gradient, Right-Analytical Gradient)')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then')
    print('the relative difference will be small (less than 1e-9).')
    print('Relative Difference: %g' % (diff))
def checkCostFunction(lambda_value=0):
    #CHECKCOSTFUNCTION Creates a collaborative filtering problem
    #to check your cost function and gradients
    #   CHECKCOSTFUNCTION(lambda) Creates a collaborative filtering problem
    #   to check your cost function and gradients, it will output the
    #   analytical gradients produced by your code and the numerical gradients
    #   (computed using computeNumericalGradient). These two gradient
    #   computations should result in very similar values.

    ## Create small problem
    X_t = np.random.rand(4, 3)
    Theta_t = np.random.rand(5, 3)

    # Zap out most entries
    Y = np.dot(X_t, Theta_t.T)
    Y[np.random.rand(*Y.shape) > 0.5] = 0
    R = np.zeros(Y.shape)
    R[Y != 0] = 1

    ## Run Gradient Checking
    X = np.random.randn(*X_t.shape)
    Theta = np.random.randn(*Theta_t.shape)
    num_movies, num_users = Y.shape
    num_features = Theta_t.shape[1]

    numgrad = computeNumericalGradient(
        lambda x: cofiCostFunc(x, Y, R, num_users, num_movies, num_features,
                               lambda_value),
        np.concatenate([X.ravel(), Theta.ravel()]))
    cost, grad = cofiCostFunc(np.concatenate([X.ravel(), Theta.ravel()]),
                              Y, R, num_users, num_movies, num_features,
                              lambda_value)

    print(np.stack([numgrad, grad], axis=1))
    print('The above two columns you get should be very similar.\n'
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n')

    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your cost function implementation is correct, then\n'
          'the relative difference will be small (less than 1e-9).\n'
          'Relative Difference: %g' % diff)
def checkNNGradients(lambd):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    layers = [3, 5, 3]

    # At this point we generate some random test data
    Theta = []
    Theta.append(debugInitializeWeights(hidden_layer_size, input_layer_size))
    Theta.append(debugInitializeWeights(num_labels, hidden_layer_size))
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.remainder(np.arange(m) + 1, num_labels)

    # Unroll parameters
    nn_params = unroll_params(Theta)

    # Compute Numerical Gradient
    numgrad = computeNumericalGradient(nn_params, layers, X, y, num_labels, lambd)

    # Compute Analytical Gradient (BackPropagation)
    truegrad = backwards(nn_params, layers, X, y, num_labels, lambd)

    print(np.concatenate(([numgrad], [truegrad]), axis=0).transpose())
    print("The above two columns must be very similar.\n"
          "(Left-Numerical Gradient, Right-Analytical Gradient (BackPropagation))\n")

    diff = np.linalg.norm(numgrad - truegrad) / np.linalg.norm(numgrad + truegrad)
    print("\nNote: If the implementation of the backpropagation is correct, "
          "the relative difference must be quite small (less than 1e-09).")
    print("Relative difference: " + str(diff) + "\n")
def checkNNGradients(lmbda=0):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    # Reusing debugInitializeWeights to generate X
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.mod(range(m), num_labels)

    # Unroll parameters
    nn_params = np.hstack((Theta1.flatten(), Theta2.flatten()))

    # Short hand for cost function
    costFunc = lambda nn_params: nnCostFunction(nn_params, input_layer_size,
                                                hidden_layer_size, num_labels,
                                                X, y, lmbda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.vstack((numgrad, grad)).T)
    print("The above two columns you get should be very similar.")
    print("(Left-Your Numerical Gradient, Right-Analytical Gradient)")

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.m, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print("If your backpropagation implementation is correct, then")
    print("the relative difference will be small (less than 1e-9).")
    print("Relative Difference: %g" % (diff))
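
# Typical usage, mirroring the course exercise: run the check once without and
# once with regularization (lambda = 3 is simply the value the exercise
# happens to use).
checkNNGradients()       # unregularized gradients
checkNNGradients(3.0)    # regularized gradients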
def checkNNGradients(lamda=0):
    input_layer_size = 3
    hidden_layer_size = 3
    num_labels = 3
    m = 5

    # We generate some 'random' test data
    Theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    Theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = 1 + np.mod(range(1, m + 1), num_labels).reshape((m, 1))

    # Unroll parameters
    nn_params = np.hstack((Theta1.T.ravel(), Theta2.T.ravel()))

    # Short hand for cost function
    costFunc = lambda p: nnCostFunction(p, input_layer_size, hidden_layer_size,
                                        num_labels, X, y, lamda)

    cost, grad = costFunc(nn_params)
    numgrad = computeNumericalGradient(costFunc, nn_params)

    # Visually examine the two gradient computations. The two columns
    # you get should be very similar.
    print(np.vstack((numgrad, grad)).T)
    print('The above two columns you get should be very similar.\n' +
          '(Left-Your Numerical Gradient, Right-Analytical Gradient)')

    # Evaluate the norm of the difference between two solutions.
    # If you have a correct implementation, and assuming you used EPSILON = 0.0001
    # in computeNumericalGradient.py, then diff below should be less than 1e-9
    diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
    print('If your backpropagation implementation is correct, then \n' +
          'the relative difference will be small (less than 1e-9). \n' +
          'Relative Difference: %g\n' % diff)
cost, grad = softmaxCost(theta, numClasses, inputSize, decay, images, labels)

##======================================================================
## STEP 3: Gradient checking
#
#  As with any learning algorithm, you should always check that your
#  gradients are correct before learning the parameters.
#
if FLAGS.debug:
    # First, let's make sure your numerical gradient computation is correct for a
    # simple function. After you have implemented computeNumericalGradient.py,
    # run the following:
    checkNumericalGradient()

    numGrad = computeNumericalGradient(
        lambda x: softmaxCost(x, numClasses, inputSize, decay, images, labels),
        theta)

    # Use this to visually compare the gradients side by side.
    print(np.stack((numGrad, grad)).T)

    # Compare numerically computed gradients with those computed analytically.
    diff = norm(numGrad - grad) / norm(numGrad + grad)
    print(diff)
    sys.exit(1)
    # The difference should be small.
    # In our implementation, these values are usually less than 1e-7.
    # When your gradients are correct, congratulations!

##======================================================================
## STEP 4: Learning parameters
# Hint: If you are debugging your code, performing gradient checking on smaller
# models and smaller training sets (e.g., using only 10 training examples and
# 1-2 hidden units) may speed things up.

# First, let's make sure your numerical gradient computation is correct for a
# simple function. After you have implemented computeNumericalGradient.m,
# run the following:
from checkNumericalGradient import checkNumericalGradient
checkNumericalGradient()

# Now we can use it to check your cost function and derivative calculations
# for the sparse autoencoder.
from computeNumericalGradient import computeNumericalGradient
numgrad = computeNumericalGradient(
    lambda x: sparseAutoencoderCost(x, visibleSize=visibleSize,
                                    hiddenSize=hiddenSize, lam=lambdaval,
                                    sparsityParam=sparsityParam, beta=beta,
                                    data=patches)[0],
    theta)

# Use this to visually compare the gradients side by side
print([numgrad, grad])

# Compare numerically computed gradients with the ones obtained from backpropagation
diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
print('the difference in the gradients is: ', diff)
# Should be small. In our implementation, these values are
# usually less than 1e-9.
# When you get this working, congratulations!!!

##======================================================================
W = 0.01 * randn(numClasses, inputSize)
b = np.zeros((numClasses, 1))

##======================================================================
## STEP 2: Gradient checking
#
#  As with any learning algorithm, you should always check that your
#  gradients are correct before learning the parameters.
#
if FLAGS.debug:
    decay = 0.001
    cost, dW, db = softmaxCost(W, b, numClasses, inputSize, decay, instances, labels)
    W_numGrad = computeNumericalGradient(
        lambda x: softmaxCost(x, b, numClasses, inputSize, decay, instances, labels),
        W)

    # Use this to visually compare the gradients side by side.
    print(np.stack((W_numGrad.ravel(), dW.ravel())).T)

    # Compare numerically computed gradients with those computed analytically.
    diff = norm(W_numGrad - dW) / norm(W_numGrad + dW)
    print(diff)
    sys.exit(0)
    # The difference should be small.
    # In our implementation, these values are usually less than 1e-7.

##======================================================================
## STEP 3: Learning parameters
#
def softmax_scipy():
    FLAGS = parse_args()

    # Initialize values
    inputSize = 28 * 28  # Size of input vector (MNIST images are 28x28)
    numClasses = 10      # Number of classes (MNIST images fall into 10 classes)
    decay = 1e-4         # Weight decay parameter

    # Load training data
    images = np.load(FLAGS.input_data_dir + 'train-images.npy')
    labels = np.load(FLAGS.input_data_dir + 'train-labels.npy')
    print("\n\n For MNIST train data")
    print("images.shape = {}".format(images.shape))  # (784, 55000)
    print("labels.shape = {}".format(labels.shape))  # (55000,)
    print("\n\n")

    # -------------------------------------------------------
    # Create data for debugging
    if FLAGS.debug:
        inputSize = 8
        np.random.seed(100)
        images = randn(8, 100)
        labels = randint(0, 10, 100, dtype=np.uint8)

    # Randomly initialise theta (theta is a 1d array)
    np.random.seed(100)
    theta_init = 0.005 * randn(numClasses * inputSize)

    # Get cost and grad
    cost, grad = softmaxCost(theta_init, numClasses, inputSize, decay, images, labels)

    # ---------------- debug: Gradient Checking Start ------------------------
    if FLAGS.debug:
        checkNumericalGradient()
        numGrad = computeNumericalGradient(
            lambda x: softmaxCost(x, numClasses, inputSize, decay, images, labels),
            theta_init)

        # Use this to visually compare the gradients side by side.
        print(np.stack((numGrad, grad)).T)

        # Compare numerically computed gradients with those computed analytically.
        diff = norm(numGrad - grad) / norm(numGrad + grad)
        print(diff)
        sys.exit(1)
    # ---------------- debug: Gradient Checking End ------------------------

    max_iters = 2000
    learning_rate = 0.1
    batchsize = 100
    batches = shuffle_and_split(images, labels, batchsize)
    # print("batches[0].shape = {}".format(batches[0].shape))  # (100, 785)

    # Fit the parameter theta
    theta, cost_lst = minibatch_grad_desc(theta_init, max_iters, batches,
                                          numClasses, inputSize, decay,
                                          learning_rate, batchsize)

    # Test the data
    images = np.load(FLAGS.input_data_dir + 'test-images.npy')
    labels = np.load(FLAGS.input_data_dir + 'test-labels.npy')
    print("\n\n For MNIST test data")
    print("images.shape = {}".format(images.shape))  # (784, 10000)
    print("labels.shape = {}".format(labels.shape))  # (10000,)
    print("\n\n")

    # Get predictions for test data
    theta = np.reshape(theta, (numClasses, inputSize))
    pred = softmaxPredict(theta, images)
    acc = np.mean(labels == pred)
    print('Accuracy: %0.3f%%.' % (acc * 100))  # 92.630% (for eta = 10)
def run_training(FLAGS, patches):
    ##======================================================================
    ## STEP 1: Here we provide the relevant parameter values that will
    #  allow your sparse autoencoder to get good filters; you do not need to
    #  change the parameters below.
    visibleSize = FLAGS.visibleSize  # number of input units
    hiddenSize = FLAGS.hiddenSize    # number of hidden units
    sparsityParam = FLAGS.rho        # desired average activation \rho of the hidden units
    decay = FLAGS.decay              # weight decay parameter
    beta = FLAGS.beta                # weight of sparsity penalty term

    # Obtain random parameters theta
    theta = initializeParameters(hiddenSize, visibleSize)

    ##======================================================================
    ## STEP 2: Implement sparseAutoencoderCost
    #
    #  You can implement all of the components (squared error cost, weight decay
    #  term, sparsity penalty) in the cost function at once, but it may be easier
    #  to do it step-by-step and run gradient checking (see STEP 3) after each
    #  step. We suggest implementing the sparseAutoencoderCost function using the
    #  following steps (a sketch of the sparsity penalty follows this function):
    #
    #  (a) Implement forward propagation in your neural network, and implement the
    #      squared error term of the cost function. Implement backpropagation to
    #      compute the derivatives. Then (using lambda = beta = 0), run Gradient
    #      Checking to verify that the calculations corresponding to the squared
    #      error cost term are correct.
    #
    #  (b) Add in the weight decay term (in both the cost function and the
    #      derivative calculations), then re-run Gradient Checking to verify
    #      correctness.
    #
    #  (c) Add in the sparsity penalty term, then re-run Gradient Checking to
    #      verify correctness.
    #
    #  Feel free to change the training settings when debugging your
    #  code. (For example, reducing the training set size or
    #  number of hidden units may make your code run faster; and setting beta
    #  and/or lambda to zero may be helpful for debugging.) However, in your
    #  final submission of the visualized weights, please use parameters we
    #  gave in Step 0 above.
    cost, grad = sparseAutoencoderCost(theta, visibleSize, hiddenSize, decay,
                                       sparsityParam, beta, patches)

    ##======================================================================
    ## STEP 3: Gradient Checking
    #
    # Hint: If you are debugging your code, performing gradient checking on
    # smaller models and smaller training sets (e.g., using only 10 training
    # examples and 1-2 hidden units) may speed things up.
    if FLAGS.debug:
        # Now we can use it to check your cost function and derivative
        # calculations for the sparse autoencoder.
        cost, grad = sparseAutoencoderCost(theta, visibleSize, hiddenSize, decay,
                                           sparsityParam, beta, patches)
        numGrad = computeNumericalGradient(
            lambda x: sparseAutoencoderCost(x, visibleSize, hiddenSize, decay,
                                            sparsityParam, beta, patches),
            theta)

        # Use this to visually compare the gradients side by side
        print(np.stack((numGrad, grad)).T)

        # Compare numerically computed gradients with the ones obtained from
        # backpropagation
        diff = norm(numGrad - grad) / norm(numGrad + grad)
        print(diff)
        # Should be small. In our implementation, these values are
        # usually less than 1e-9.
        sys.exit(1)
        # When you got this working, Congratulations!!!

    ##======================================================================
    ## STEP 4: After verifying that your implementation of
    #  sparseAutoencoderCost is correct, you can start training your sparse
    #  autoencoder with minFunc (L-BFGS).

    # Randomly initialize the parameters.
    theta = initializeParameters(hiddenSize, visibleSize)

    # Use L-BFGS to minimize the function.
    theta, _, _ = fmin_l_bfgs_b(sparseAutoencoderCost, theta,
                                args=(visibleSize, hiddenSize, decay,
                                      sparsityParam, beta, patches),
                                maxiter=400, disp=1)

    # Save the learned parameters to an external file
    pickle.dump(theta, open(FLAGS.log_dir + '/' + FLAGS.params_file, 'wb'))

    ##======================================================================
    ## STEP 5: Visualization

    # Fold W1 parameters into a matrix format.
    W1 = np.reshape(theta[:hiddenSize * visibleSize], (hiddenSize, visibleSize))

    # Save the visualization to a file.
    displayNetwork(W1.T, file_name='weights_digits.jpg')

    return theta
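
# A sketch of the sparsity penalty mentioned in STEP 2(c) above: the KL
# divergence between the target activation sparsityParam and the mean hidden
# activation. The activation matrix name a2 is hypothetical; shapes follow the
# (hiddenSize, m) convention used in the exercise.
def sparsity_penalty(a2, sparsityParam, beta):
    rho = sparsityParam
    rho_hat = np.mean(a2, axis=1)  # mean activation of each hidden unit
    kl = np.sum(rho * np.log(rho / rho_hat)
                + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
    return beta * kl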
# Obtain random parameters theta
theta = initializeParameters(hiddenSize, visibleSize)

##======================================================================
## Gradient Checking
#
# Hint: If you are debugging your code, performing gradient checking on smaller
# models and smaller training sets (e.g., using only 10 training examples and
# 1-2 hidden units) may speed things up.
if FLAGS.debug:
    # Check your cost function and derivative calculations for the sparse autoencoder.
    cost, grad = sparseAutoencoderCost(theta, visibleSize, hiddenSize, decay,
                                       sparsityParam, beta, patches)
    numGrad = computeNumericalGradient(
        lambda x: sparseAutoencoderCost(x, visibleSize, hiddenSize, decay,
                                        sparsityParam, beta, patches),
        theta)

    # Use this to visually compare the gradients side by side
    print(np.stack((numGrad, grad)).T)

    # Compare numerically computed gradients with the ones obtained from backpropagation
    diff = norm(numGrad - grad) / norm(numGrad + grad)
    print(diff)
    # Should be small. In our implementation, these values are
    # usually less than 1e-9.
    sys.exit(1)
    # When you got this working, Congratulations!!!

##======================================================================
## After verifying that your implementation of sparseAutoencoderCost is
## correct, you can start training your sparse autoencoder with minFunc (L-BFGS).
theta = initializeParameters(hiddenSize, visibleSize)

##======================================================================
## Gradient Checking
#
# Hint: If you are debugging your code, performing gradient checking on smaller
# models and smaller training sets (e.g., using only 10 training examples and
# 1-2 hidden units) may speed things up.
if FLAGS.debug:
    # Now we can use it to check your cost function and derivative calculations
    # for the sparse autoencoder.
    cost, grad = sparseAutoencoderCost(theta, visibleSize, hiddenSize, decay,
                                       sparsityParam, beta, patches)
    numGrad = computeNumericalGradient(
        lambda x: sparseAutoencoderCost(x, visibleSize, hiddenSize, decay,
                                        sparsityParam, beta, patches),
        theta)

    # Use this to visually compare the gradients side by side
    print(np.stack((numGrad, grad)).T)

    # Compare numerically computed gradients with the ones obtained from backpropagation
    diff = norm(numGrad - grad) / norm(numGrad + grad)
    print(diff)
    # Should be small. In our implementation, these values are
    # usually less than 1e-9.
    sys.exit(1)
    # When you got this working, Congratulations!!!

##======================================================================
## After verifying that your implementation of sparseAutoencoderCost is
## correct, you can start training your sparse autoencoder with L-BFGS.
# Randomly initialize parameters
params = nn1Layer.initParams(n_x, n_h, n_y)

##======================================================================
## STEP 2: Gradient checking
#
#  As with any learning algorithm, you should always check that your
#  gradients are correct before learning the parameters.
#
if FLAGS.debug:
    a3, cache = nn1Layer.forward(X, params)
    dParams = nn1Layer.backward(X, y, params, cache, decay)
    dNumParams = computeNumericalGradient(
        lambda p: nn1Layer.cost(X, y, p, decay), params)

    rdParams = nn1Layer.ravelGrads(dParams)
    rdnParams = nn1Layer.ravelGrads(dNumParams)

    # Use this to visually compare the gradients side by side.
    print(rdnParams.shape)
    print(rdParams.shape)
    print(np.stack((rdnParams, rdParams)).T)

    # Compare numerically computed gradients with those computed analytically.
    diff = norm(rdnParams - rdParams) / norm(rdnParams + rdParams)
    print(diff)
    sys.exit(0)
    # The difference should be small.
    # In our implementation, these values are usually less than 1e-7.
cost, grad = sparseAutoencoderCost(theta, visibleSize, hiddenSize, _lambda,
                                   sparsityParam, beta, patches.T)

##======================================================================
## STEP 3: Gradient Checking
# Hint: If you are debugging your code, performing gradient checking on smaller
# models and smaller training sets (e.g., using only 10 training examples and
# 1-2 hidden units) may speed things up.

# Short hand for cost function
costFunc = lambda p: sparseAutoencoderCost(p, visibleSize, hiddenSize, _lambda,
                                           sparsityParam, beta, patches.T[0:10])

cost, grad = costFunc(theta)
numgrad = computeNumericalGradient(costFunc, theta)

# Visually examine the two gradient computations. The two columns
# you get should be very similar.
print(grad)
print(numgrad)
print('The above two columns you get should be very similar.\n'
      '(Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n')

# Evaluate the norm of the difference between two solutions.
# If you have a correct implementation, and assuming you used EPSILON = 0.0001
# in computeNumericalGradient.m, then diff below should be less than 1e-9
diff = np.linalg.norm(numgrad - grad) / np.linalg.norm(numgrad + grad)
print('If your backpropagation implementation is correct, then\n'
      'the relative difference will be small (less than 1e-9).\n'
      '\nRelative Difference: %g\n' % diff)