def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        ### START CODE HERE ### (approx. 3 lines)
        thetaplus = np.copy(parameters_values)                                          # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                     # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # Step 3
        ### END CODE HERE ###

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        thetaminus = np.copy(parameters_values)                                         # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                   # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # Step 3
        ### END CODE HERE ###

        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        ### END CODE HERE ###

    # Compare gradapprox to backward propagation gradients by computing difference.
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'
    ### END CODE HERE ###

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))
    e = epsilon

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        ### START CODE HERE ### (approx. 3 lines)
        thetaplus = np.copy(parameters_values)                                          # Step 1
        thetaplus[i][0] = thetaplus[i][0] + e                                           # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # Step 3
        ### END CODE HERE ###

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        thetaminus = np.copy(parameters_values)                                         # Step 1
        thetaminus[i][0] = thetaminus[i][0] - e                                         # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # Step 3
        ### END CODE HERE ###

        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * e)
        ### END CODE HERE ###

    # Compare gradapprox to backward propagation gradients by computing difference.
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(gradapprox - grad)                      # Step 1'
    denominator = np.linalg.norm(gradapprox) + np.linalg.norm(grad)    # Step 2'
    difference = numerator / denominator                               # Step 3'
    ### END CODE HERE ###

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)
    grad = gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        # Compute J_plus[i]
        theta_plus = np.copy(parameters_values)
        theta_plus[i][0] = theta_plus[i][0] + epsilon
        J_plus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(theta_plus))

        # Compute J_minus[i]
        theta_minus = np.copy(parameters_values)
        theta_minus[i][0] = theta_minus[i][0] - epsilon
        J_minus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(theta_minus))

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference < 1e-7:
        print("Gradient check: the gradient is fine")
    else:
        print("Gradient check: the gradient exceeds the threshold")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    difference = np.linalg.norm(grad - gradapprox) / (np.linalg.norm(grad) + np.linalg.norm(gradapprox))

    if difference > 1e-7:
        print("There is a mistake in the backward propagation! difference = " + str(difference))
    else:
        print("Your backward propagation works perfectly fine! difference = " + str(difference))

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)                                          # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                     # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # Step 3

        thetaminus = np.copy(parameters_values)                                         # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                   # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # Step 3

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(gradapprox - grad)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'

    if difference > 1e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Sets up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Computes gradapprox
    for i in range(num_parameters):
        # Computes J_plus[i]
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        # Computes J_minus[i]
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        # Computes gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compares gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks whether backward_propagation_n correctly computes the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains the gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (number of input nodes, 1)
    Y -- true label
    epsilon -- tiny shift to the input used to compute the approximated gradient

    Returns:
    difference -- difference between the approximated gradient and the backward propagation gradient
    """
    # Set up variables
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)  # keys is not used
    print("parameters" + str(parameters))
    print("parameters_values" + str(parameters_values))
    grad = gc_utils.gradients_to_vector(gradients)
    print("gradients" + str(gradients))
    print("grad" + str(grad))
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output: "J_plus[i]"
        thetaplus = np.copy(parameters_values)                                                     # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                                # Step 2
        J_plus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaplus))   # Step 3, cache is not used

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output: "J_minus[i]"
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaminus))

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox with the backward propagation gradient by computing the difference
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference < 1e-7:
        print("Gradient check: the gradient is fine!")
    else:
        print("Gradient check: the gradient exceeds the threshold!")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, 1)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute approximated gradient

    Returns:
    difference -- difference between approximated gradient and the backward propagation gradient
    """

    # Set up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output: "J_plus[i]"
        # "_" is used because the function outputs two values but we only care about the first one
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] += epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output: "J_minus[i]".
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] -= epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(gradapprox - grad)
    denominator = np.linalg.norm(gradapprox) + np.linalg.norm(grad)
    difference = numerator / denominator

    if difference > 1.2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks whether backward_propagation_n correctly computes the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing the parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains the gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (number of input nodes, 1)
    Y -- true label
    epsilon -- tiny shift to the input used to compute the approximated gradient

    Returns:
    difference -- difference between the approximated gradient and the backward propagation gradient
    """
    # Set up variables
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)  # keys is not used
    grad = gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]"
        thetaplus = np.copy(parameters_values)                                                      # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                                 # Step 2
        J_plus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaplus))    # Step 3, cache is not used

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]"
        thetaminus = np.copy(parameters_values)                                                     # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                               # Step 2
        J_minus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaminus))  # Step 3, cache is not used

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox with the backward propagation gradient by computing the difference
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'

    if difference < 1e-7:
        print("Gradient check: the gradient is fine!")
    else:
        print("Gradient check: the gradient exceeds the threshold!")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    check if backward_propagation_n computes correctly the gradient of the cost output

    arguments:
    parameters -- dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n
    X -- input datapoint, shape (input size, 1)
    Y -- true label
    epsilon -- tiny shift to the input to compute the approximated gradient

    returns:
    difference -- difference between the approximated gradient and the backward propagation gradient
    """
    # set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # compute gradapprox
    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    print('grad.shape = ', grad.shape)
    print('gradapprox.shape = ', gradapprox.shape)

    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference > 1e-7:
        print('there is a mistake in the backward propagation, difference = ' + str(difference))
    else:
        print('your backward propagation works perfectly fine! difference = ' + str(difference))

    return difference
def gradients_check(X, Y, lambd=0, keep_prob=1, init_method='he'):
    layers_dims = [X.shape[0], 5, 3, 1]

    # initialize params
    if init_method == 'zeros':
        params = init_zeros(layers_dims)
    elif init_method == 'random':
        params = init_random(layers_dims)
    elif init_method == 'he':
        params = init_he(layers_dims)
    else:
        print('Error: unexpected init_method!')

    # compute grads
    a3, cache = forward_propagate_with_reg(X, params, keep_prob=keep_prob)
    grads = backward_propagate_with_reg(X, Y, cache, lambd=lambd, keep_prob=keep_prob)
    grads_vector = gc_utils.gradients_to_vector(grads)

    theta, keys = gc_utils.dictionary_to_vector(params)  # convert to a vector of shape (n, 1) for easy indexing
    n = theta.shape[0]                                   # number of parameters
    grads_approx_vector = np.zeros((n, 1))

    # compute grads_approx
    for i in range(n):
        theta_p = np.copy(theta)
        theta_p[i, 0] += 1e-7
        params_p = gc_utils.vector_to_dictionary(theta_p)

        theta_m = np.copy(theta)
        theta_m[i, 0] -= 1e-7
        params_m = gc_utils.vector_to_dictionary(theta_m)

        a3_, cache_ = forward_propagate_with_reg(X, params_p, keep_prob=keep_prob)
        J_p = compute_loss_with_reg(a3_, Y, params_p, lambd=lambd)
        a3_, cache_ = forward_propagate_with_reg(X, params_m, keep_prob=keep_prob)
        J_m = compute_loss_with_reg(a3_, Y, params_m, lambd=lambd)

        d_approx = (J_p - J_m) / (2 * 1e-7)
        grads_approx_vector[i, 0] = d_approx

    # compute difference
    numerator = np.linalg.norm(grads_vector - grads_approx_vector)
    denominator = np.linalg.norm(grads_vector) + np.linalg.norm(grads_approx_vector)
    diff = numerator / denominator

    return diff
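A sketch of how this regularized variant might be driven follows; the data below is made up, and the 2e-7 pass threshold is the same one used by several snippets in this section. The check is only meaningful with keep_prob=1, because dropout re-samples its mask on every forward pass and the two cost evaluations per parameter would no longer be comparable.

# Hypothetical smoke test for gradients_check; X_check and Y_check are made-up data
# with 4 input features, matching the [4, 5, 3, 1] architecture that the gc_utils
# vector_to_dictionary helper used above appears to assume.
np.random.seed(3)
X_check = np.random.randn(4, 5)
Y_check = (np.random.rand(1, 5) > 0.5).astype(int)

diff = gradients_check(X_check, Y_check, lambd=0.7, keep_prob=1, init_method='he')
if diff < 2e-7:
    print('gradients look consistent, difference = ' + str(diff))
else:
    print('possible bug in backward_propagate_with_reg, difference = ' + str(diff))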
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """
    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        plus_copy = parameters_values.copy()
        plus_copy[i] = plus_copy[i] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(plus_copy))

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        minus_copy = parameters_values.copy()
        minus_copy[i] = minus_copy[i] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(minus_copy))

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    diff = numerator / denominator

    return diff
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    Jplus = np.zeros((num_parameters, 1))
    Jminus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] += epsilon
        Jplus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] -= epsilon
        Jminus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradapprox[i] = (Jplus[i] - Jminus[i]) / (2. * epsilon)

    diff = np.linalg.norm(grad - gradapprox) / (np.linalg.norm(grad) + np.linalg.norm(gradapprox))

    if diff > 1e-7:
        print('There is a mistake in the backward propagation! diff = {}'.format(diff))
    else:
        print('Your backward propagation works well, diff = {}'.format(diff))

    return diff
def gradient_check_n(parameters, grads, X, Y, epsilon=1e-7):
    '''
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    grads -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    '''
    parameters_values, _ = dictionary_to_vector(parameters)
    grads_value = gradients_to_vector(grads)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradsapprox = np.zeros((num_parameters, 1))

    for i in range(num_parameters):
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))

        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))

        gradsapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    numerator = np.linalg.norm(grads_value - gradsapprox)
    denominator = np.linalg.norm(grads_value) + np.linalg.norm(gradsapprox)
    difference = numerator / denominator

    if difference > 1e-7:
        print('There is a mistake in the backward propagation! difference = {}'.format(difference))
    else:
        print('Your backward propagation works perfectly fine! difference = {}'.format(difference))

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    '''
    Checks whether backward_propagation_n correctly computes the gradient of the cost output by forward_propagation_n

    :param parameters: python dictionary containing the parameters 'W1', 'b1', 'W2', 'b2', 'W3', 'b3'
    :param gradients: output of backward_propagation_n, contains the gradients of the cost with respect to the parameters
    :param X: input datapoint, of shape (number of input nodes, 1)
    :param Y: true label
    :param epsilon: tiny shift to the input used to compute the approximated gradient
    :return: difference between the approximated gradient and the backward propagation gradient
    '''
    # Set up variables
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)  # keys is not used; parameters_values is an (n, 1) matrix
    grad = gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: 'parameters_values, epsilon'. Output = 'J_plus[i]'
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaplus))

        # Compute J_minus[i]. Inputs: 'parameters_values, epsilon'. Output = 'J_minus[i]'
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaminus))

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox with the backward propagation gradient by computing the difference
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference < 1e-7:
        print('Gradient check: the gradient is fine')
    else:
        print('Gradient check: the gradient exceeds the threshold')

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        thetaplus = np.copy(parameters_values)                                          # Step 1
        thetaplus[i][0] += epsilon                                                      # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # Step 3

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        thetaminus = np.copy(parameters_values)                                         # Step 1
        thetaminus[i][0] -= epsilon                                                     # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # Step 3

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2. * epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator

    if difference > 1e-6:
        print("There is a mistake in the backward propagation! difference = " + str(difference))
    else:
        print("Your backward propagation works perfectly fine! difference = " + str(difference))

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    # Set up variables
    parameters_values, keys = gc_utils.dictionary_to_vector(parameters)  # convert the parameters dictionary into an array
    grad = gc_utils.gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):  # loop over all parameters
        # Compute J_plus[i]
        thetaplus = np.copy(parameters_values)
        thetaplus[i][0] = thetaplus[i][0] + epsilon
        J_plus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaplus))  # cache is not used

        # Compute J_minus[i]
        thetaminus = np.copy(parameters_values)
        thetaminus[i][0] = thetaminus[i][0] - epsilon
        J_minus[i], cache = forward_propagation_n(X, Y, gc_utils.vector_to_dictionary(thetaminus))  # cache is not used

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox with the backward propagation gradient by computing the difference
    numerator = np.linalg.norm(grad - gradapprox)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)
    difference = numerator / denominator

    if difference < 1e-7:
        print("Gradient Checking: the gradient is fine!")
    else:
        print("Gradient Checking: the gradient exceeds the threshold!")

    return difference
How does gradient checking work?

As in 1) and 2), you want to compare "gradapprox" to the gradient computed by backpropagation. The formula is still:

$$ \frac{\partial J}{\partial \theta} = \lim_{\varepsilon \to 0} \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon} \tag{1} $$

However, $\theta$ is not a scalar anymore. It is a dictionary called "parameters". We implemented a function "dictionary_to_vector()" for you. It converts the "parameters" dictionary into a vector called "values", obtained by reshaping all parameters (W1, b1, W2, b2, W3, b3) into vectors and concatenating them. The inverse function is "vector_to_dictionary", which outputs back the "parameters" dictionary.

[Figure 2: dictionary_to_vector() and vector_to_dictionary(). You will need these functions in gradient_check_n().]

We have also converted the "gradients" dictionary into a vector "grad" using gradients_to_vector(). You don't need to worry about that.

Exercise: Implement gradient_check_n().

Instructions: Here is pseudo-code that will help you implement the gradient check.

For each i in num_parameters:

- To compute J_plus[i]:
    - Set $\theta^{+}$ to np.copy(parameters_values)
    - Set $\theta^{+}_i$ to $\theta^{+}_i + \varepsilon$
    - Calculate $J^{+}_i$ using forward_propagation_n(x, y, vector_to_dictionary($\theta^{+}$)).
- To compute J_minus[i]: do the same thing with $\theta^{-}$.
- Compute $gradapprox[i] = \frac{J^{+}_i - J^{-}_i}{2\varepsilon}$

Thus, you get a vector gradapprox, where gradapprox[i] is an approximation of the gradient with respect to parameter_values[i]. You can now compare this gradapprox vector to the gradients vector from backpropagation. Just like for the 1D case (Steps 1', 2', 3'), compute:

$$ difference = \frac{\| grad - gradapprox \|_2}{\| grad \|_2 + \| gradapprox \|_2} \tag{3} $$
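The conversion helpers are provided for you by the assignment (dictionary_to_vector, vector_to_dictionary, gradients_to_vector). As a rough, shape-agnostic sketch of what such helpers do, not the course's exact implementation (which returns only values and keys, and hardcodes the W1..b3 shapes), they might look like this:

import numpy as np

def dict_to_vector(params):
    # Flatten a parameter dictionary into an (n, 1) column vector and remember each shape.
    keys, shapes, chunks = [], [], []
    for key in sorted(params):                      # fixed order so the inverse is well defined
        keys.append(key)
        shapes.append(params[key].shape)
        chunks.append(params[key].reshape(-1, 1))
    return np.concatenate(chunks, axis=0), keys, shapes

def vector_to_dict(theta, keys, shapes):
    # Inverse of dict_to_vector: rebuild the dictionary from the (n, 1) column vector.
    params, offset = {}, 0
    for key, shape in zip(keys, shapes):
        size = int(np.prod(shape))
        params[key] = theta[offset:offset + size].reshape(shape)
        offset += size
    return params

Because every W and b ends up as a contiguous slice of one column vector, nudging a single entry theta[i] by epsilon perturbs exactly one scalar parameter, which is what the loop over num_parameters in gradient_check_n relies on.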
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        thetaplus = np.copy(parameters_values)                                          # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                     # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # Step 3

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        thetaminus = np.copy(parameters_values)                                         # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                   # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # Step 3

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference


# What you should remember from this notebook:
# Gradient checking verifies closeness between the gradients from backpropagation
# and the numerical approximation of the gradient (computed using forward propagation).
# Gradient checking is slow, so we don't run it in every iteration of training.
# You would usually run it only to make sure your code is correct,
# then turn it off and use backprop for the actual learning process.
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, number of examples)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """

    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)  # converts the "parameters" dictionary into a vector called "values"
    grad = gradients_to_vector(gradients)                    # converts the "gradients" dictionary into a vector "grad"
    num_parameters = parameters_values.shape[0]              # number of entries in the parameter vector
    J_plus = np.zeros((num_parameters, 1))                   # initialize J_plus with zeros, one entry per parameter
    J_minus = np.zeros((num_parameters, 1))                  # initialize J_minus with zeros, one entry per parameter
    gradapprox = np.zeros((num_parameters, 1))               # initialize gradapprox with zeros, one entry per parameter

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        thetaplus = np.copy(parameters_values)                                          # set thetaplus to a copy of the parameter vector
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                     # nudge entry i up by epsilon
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # calculate J_plus[i] using forward propagation

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        thetaminus = np.copy(parameters_values)                                         # set thetaminus to a copy of the parameter vector
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                   # nudge entry i down by epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # calculate J_minus[i] using forward propagation

        # Compute gradapprox[i]
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)

    # Compare gradapprox to backward propagation gradients by computing difference.
    numerator = np.linalg.norm(grad - gradapprox)                      # compute the numerator using np.linalg.norm(...)
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # compute the denominator (np.linalg.norm(...) called twice)
    difference = numerator / denominator                               # divide them

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Gradient check: verifies that backward propagation correctly computes the gradient of the cost output by forward propagation

    :param parameters: parameter dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"
    :param gradients: output of backward propagation, contains the gradients of the cost with respect to the parameters
    :param X: input datapoint, of shape (input size, 1)
    :param Y: true label
    :param epsilon: tiny shift to the input used to compute the approximated gradient
    :return difference: difference between the approximated gradient and the gradient computed by backward propagation
    """
    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because the function outputs two values but we only care about the first one
        ### START CODE HERE ### (approx. 3 lines)
        thetaplus = np.copy(parameters_values)                                          # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                     # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))     # Step 3
        ### END CODE HERE ###

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        thetaminus = np.copy(parameters_values)                                         # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                   # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))   # Step 3
        ### END CODE HERE ###

        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        ### END CODE HERE ###

    # Compare the gradapprox gradient with the gradient computed by backward propagation
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                      # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)    # Step 2'
    difference = numerator / denominator                               # Step 3'
    ### END CODE HERE ###

    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
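For completeness, here is a sketch of how gradient_check_n is typically driven end to end. forward_propagation_n and backward_propagation_n are the assignment functions referenced throughout this section; the concrete data, seed, and parameter initialization below are made up for illustration and only mirror the 3-layer shapes (4 input features, layer sizes 5, 3, 1) that the snippets above assume.

# Hypothetical smoke test; the data and parameter values are illustrative only.
np.random.seed(1)
X = np.random.randn(4, 3)                      # 4 input features, 3 examples
Y = np.array([[1, 1, 0]])
parameters = {
    "W1": np.random.randn(5, 4) * 0.1, "b1": np.zeros((5, 1)),
    "W2": np.random.randn(3, 5) * 0.1, "b2": np.zeros((3, 1)),
    "W3": np.random.randn(1, 3) * 0.1, "b3": np.zeros((1, 1)),
}

cost, cache = forward_propagation_n(X, Y, parameters)   # analytic forward pass
gradients = backward_propagation_n(X, Y, cache)         # gradients to be checked
difference = gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7)

If the reported difference is above the chosen threshold (1e-7 to 2e-7 in the variants above), the usual suspects are a wrong sign or a missing factor in one of the dW/db formulas inside backward_propagation_n.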