    def backward_with_dropout(self, A, Y, keep_prob):
        assert A.shape == Y.shape

        m = A.shape[1]
        L = len(self.params) // 2
        # shortcut for the cross-entropy loss at the output layer: dZ[L] = A[L] - Y
        dZ = A - Y
        for l in range(L, 0, -1):  # [L, L-1, ..., 2, 1]
            # from dZ[l] we compute dW[l] and db[l]
            A = self.caches['A' + str(l - 1)]
            self.grads['dW' + str(l)] = np.dot(dZ, A.T) / float(m)
            self.grads['db' + str(l)] = np.sum(dZ, axis=1, keepdims=True) / float(m)

            # stop at l == 1: there is no dW[0], db[0], dZ[0], or dA[0] to compute
            if l == 1:
                break

            # compute dZ[l-1] (implicitly dA[l-1] also) for the next iteration
            W = self.params['W' + str(l)]
            Z = self.caches['Z' + str(l - 1)]
            D = self.caches['D' + str(l - 1)]
            dA = np.dot(W.T, dZ)
            dA = dA * D            # re-apply the dropout mask saved in the forward pass
            dA = dA / keep_prob    # rescale, matching inverted dropout in the forward pass
            self.grads['dA' + str(l - 1)] = dA
            dZ = np.multiply(dA, d_relu(Z))
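
    # backward_with_dropout() and backward() both assume a module-level d_relu(Z)
    # helper that returns the elementwise derivative of ReLU with respect to the
    # pre-activation Z.  That helper is not part of this listing; a minimal
    # sketch (shown as a comment) could be:
    #
    #     def d_relu(Z):
    #         # 1 where Z > 0, 0 elsewhere
    #         return (Z > 0).astype(Z.dtype)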

    def loss(self, x, y):
        """Compute the loss and gradients for the fully-connected net."""
        if len(y.shape) == 2:
            N, M = y.shape  # N is n_samples, M is dims of each sample
        elif len(y.shape) == 1:
            N, M = 1, y.shape[0]  # treat a 1-D y as a single sample
        else:
            raise ValueError("y has incorrect shape")

        output, caches = self.prediction_save_cache(x)  # Forward pass
        grads = {}

        # Calculate the loss for the current batch =============================
        # Get the mean squared error loss (1/2 to simplify derivative)
        loss = 0.5 * np.mean((output - y)**2)
        # Add an L2 regularization term for each layer's weights and biases
        for l in range(self.n_hidden):
            loss += 0.5 * self.reg * np.sum(self.params[f"w{l}"]**2)
            loss += 0.5 * self.reg * np.sum(self.params[f"b{l}"]**2)
        loss += 0.5 * self.reg * np.sum(self.params["w_out"]**2)
        loss += 0.5 * self.reg * np.sum(self.params["b_out"]**2)

        # Get the gradients through backprop ===================================
        # Gradient of the MSE loss: d(0.5 * mean((output - y)^2)) / d(output)
        dout = (output - y) / (N * M)
        # Backprop through output layer
        dout, dw, db = d_affine(dout, caches["affine_out"])
        grads["w_out"] = dw + self.reg * self.params["w_out"]
        grads["b_out"] = db + self.reg * self.params["b_out"]
        # Backprop through each hidden layer
        for l in reversed(range(self.n_hidden)):
            l = str(l)
            dout = d_dropout(dout, caches["dropout" + l])
            dout = d_relu(dout, caches["relu" + l])
            dout, dw, db = d_affine(dout, caches["affine" + l])

            # Save gradients into a dictionary where the key matches the param key
            grads["w" + l] = dw + self.reg * self.params["w" + l]
            grads["b" + l] = db + self.reg * self.params["b" + l]

        # Clip gradients if enabled - really helps stability! ==================
        if self.grad_clip:
            for key in grads:
                grads[key] = np.clip(grads[key], -self.grad_clip,
                                     self.grad_clip)

        return loss, grads
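
    # loss() relies on cache-based backward helpers (d_affine, d_relu, d_dropout)
    # whose forward counterparts are not shown here.  Note that this d_relu takes
    # (dout, cache), unlike the single-argument d_relu used by backward() below.
    # A minimal sketch (as comments), assuming the forward pass cached (x, w, b)
    # for each affine layer, the layer input x for each relu, and the already
    # keep_prob-scaled mask for each dropout layer:
    #
    #     def d_affine(dout, cache):
    #         x, w, b = cache
    #         dx = np.dot(dout, w.T)
    #         dw = np.dot(x.T, dout)
    #         db = np.sum(dout, axis=0)
    #         return dx, dw, db
    #
    #     def d_relu(dout, cache):
    #         x = cache
    #         return dout * (x > 0)
    #
    #     def d_dropout(dout, cache):
    #         mask = cache
    #         return dout * mask
    #
    # The exact cache contents are an assumption; the real helpers must match
    # whatever prediction_save_cache() stores.
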
    def backward(self, A, Y):
        assert A.shape == Y.shape

        L = len(self.params) // 2
        m = Y.shape[1]

        # shortcut for the cross-entropy loss at the output layer: dZ[L] = A[L] - Y
        dZ = A - Y
        for l in range(L, 0, -1):  # [L, L-1, ..., 2, 1]
            # from dZ[l] we compute dW[l] and db[l]
            A = self.caches['A' + str(l - 1)]
            self.grads['dW' + str(l)] = np.dot(dZ, A.T) / float(m)
            self.grads['db' + str(l)] = np.sum(dZ, axis=1, keepdims=True) / float(m)

            # stop at l == 1: there is no dW[0], db[0], dZ[0], or dA[0] to compute
            if l == 1:
                break

            # compute dZ[l-1] (implicitly dA[l-1] also) for the next iteration
            W = self.params['W' + str(l)]
            Z = self.caches['Z' + str(l - 1)]
            dZ = np.multiply(np.dot(W.T, dZ), d_relu(Z))  # dA[l-1] = W.T @ dZ, then dZ[l-1] = dA[l-1] * relu'(Z[l-1])
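
    # Both backward() and backward_with_dropout() assume that a forward pass has
    # already filled self.caches with the activations ('A0' = the network input,
    # 'A1', ...), the pre-activations ('Z1', 'Z2', ...) and, for the dropout
    # variant, the 0/1 masks ('D1', 'D2', ...).  The forward code is not shown
    # here; a minimal sketch of one hidden-layer step consistent with the keys
    # read above, using inverted dropout, could be:
    #
    #     Z = np.dot(W, A_prev) + b
    #     A = np.maximum(0, Z)                        # ReLU
    #     D = (np.random.rand(*A.shape) < keep_prob)  # 0/1 dropout mask
    #     A = A * D / keep_prob                       # inverted dropout
    #     self.caches['Z' + str(l)] = Z
    #     self.caches['D' + str(l)] = D
    #     self.caches['A' + str(l)] = A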