def compute_gradients(dL_da2, da2_dz2, dz2_dW2, dz2_db2, dz2_da1, da1_dz1, dz1_dW1, dz1_db1):
    '''
    Given the local gradients, compute the gradient of the loss function L
    w.r.t. model parameters: the weights W1, W2 and biases b1 and b2.

    Input: see details in the above functions.
    Output:
        dL_dW2: the gradient of the loss function L w.r.t. the weight matrix W2
        dL_db2: the gradient of the loss function L w.r.t. the biases b2
        dL_dW1: the gradient of the loss function L w.r.t. the weight matrix W1
        dL_db1: the gradient of the loss function L w.r.t. the biases b1

    Hint: you could re-use the functions in problem2, such as sr.compute_dL_dz(...)
    '''
    #########################################
    ## INSERT YOUR CODE HERE
    # dL/dz2 is needed by all four parameter gradients below; compute it once
    # instead of re-evaluating sr.compute_dL_dz three times as before.
    dL_dz2 = sr.compute_dL_dz(dL_da2, da2_dz2)

    # the 2nd layer: chain rule through z2
    dL_dW2 = sr.compute_dL_dW(dL_dz2, dz2_dW2)
    dL_db2 = sr.compute_dL_db(dL_dz2, dz2_db2)

    # the 1st layer: chain rule through a1 -> z1.
    # dz2_da1.T * dL_dz2 back-propagates dL/dz2 into dL/da1, then the
    # element-wise products apply da1/dz1 and the local parameter gradients.
    dL_dW1 = np.multiply(np.multiply(dz1_dW1, da1_dz1), dz2_da1.T * dL_dz2)
    dL_db1 = np.multiply(np.multiply(dz1_db1, da1_dz1), dz2_da1.T * dL_dz2)
    #########################################
    return dL_dW2, dL_db2, dL_dW1, dL_db1
def compute_gradients(dL_da2, da2_dz2, dz2_dW2, dz2_db2, dz2_da1, da1_dz1, dz1_dW1, dz1_db1): ''' Given the local gradients, compute the gradient of the loss function L w.r.t. model parameters: the weights W1, W2 and biases b1 and b2. Input: see details in the above functions. Output: dL_dW2: the gradient of the loss function L w.r.t. the weight matrix W2 dL_db2: the gradient of the loss function L w.r.t. the biases b2 dL_dW1: the gradient of the loss function L w.r.t. the weight matrix W1 dL_db1: the gradient of the loss function L w.r.t. the biases b1 Hint: you could re-use the functions in problem2, such as sr.compute_dL_dz(...) ''' ######################################### ## INSERT YOUR CODE HERE # the 2nd layer dL_dW2 = sr.compute_dL_dW(sr.compute_dL_dz(dL_da2, da2_dz2), dz2_dW2) dL_db2 = sr.compute_dL_db(sr.compute_dL_dz(dL_da2, da2_dz2), dz2_db2) # the 1st layer import problem2 as lr dL_dW1 = np.asmatrix(np.zeros(dz1_dW1.shape)) dL_db1 = np.asmatrix(np.zeros((dz1_dW1.shape[0], 1))) dL_da1 = dL_da2.T * da2_dz2 * dz2_da1 for i in range(dz1_dW1.shape[0]): dL_dW1[:, i] = lr.compute_dL_dw(dL_da1.T[i, 0], da1_dz1[i, 0], dz1_dW1[i, 0]) dL_db1[i, 0] = lr.compute_dL_db(dL_da1.T[i, 0], da1_dz1[i, 0], dz1_db1[i, 0]) # another solution # dL_dW1 = np.multiply(dL_da2.T * da2_dz2 * dz2_da1 * da1_dz1, dz1_dW1) # dL_db1 = np.multiply(dL_da2.T * da2_dz2 * dz2_da1 * da1_dz1, dz1_db1) ######################################### return dL_dW2, dL_db2, dL_dW1, dL_db1
def compute_gradients(dL_da2, da2_dz2, dz2_dW2, dz2_db2, dz2_da1, da1_dz1, dz1_dW1, dz1_db1):
    '''
    Back-propagate the local gradients into the gradients of the loss L
    w.r.t. the model parameters: weights W1, W2 and biases b1, b2.

    Input: see details in the above functions.
    Output:
        dL_dW2: the gradient of the loss function L w.r.t. the weight matrix W2
        dL_db2: the gradient of the loss function L w.r.t. the biases b2
        dL_dW1: the gradient of the loss function L w.r.t. the weight matrix W1
        dL_db1: the gradient of the loss function L w.r.t. the biases b1
    '''
    # --- layer 2: chain dL/da2 through the activation to get dL/dz2, then
    # pair it with the local parameter gradients of that layer ---
    dL_dz2 = sr.compute_dL_dz(dL_da2, da2_dz2)
    dL_dW2, dL_db2 = sr.compute_dL_dW(dL_dz2, dz2_dW2), sr.compute_dL_db(dL_dz2, dz2_db2)

    # --- layer 1: push dL/dz2 back through a1 and z1 in one chained call,
    # then pair the result with layer 1's local parameter gradients ---
    dL_dz1 = compute_dL_dz1(compute_dL_da1(dL_dz2, dz2_da1), da1_dz1)
    dL_dW1, dL_db1 = sr.compute_dL_dW(dL_dz1, dz1_dW1), sr.compute_dL_db(dL_dz1, dz1_db1)

    return dL_dW2, dL_db2, dL_dW1, dL_db1