        def conv_layer(x):
            """
            the derivative check in the gradient checker relates to the input of the function
            hence, the input should be z - since the backward step computes @loss / @z
            """

            conv1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=2)
            relu1 = nn.Relu()
            conv2 = nn.Conv2d(in_channels=2, out_channels=4, kernel_size=2)
            relu2 = nn.Relu()
            flatten = nn.Flatten()
            linear = nn.Linear(4, 2)
            softmax = nn.Softmax()

            # forward pass
            a = relu1(conv1(x))
            a = relu2(conv2(a))
            a_flatten = flatten(a)
            dist = softmax(linear(a_flatten))

            # backward
            labels = np.zeros(dist.shape)
            labels[:, 1] = 1
            loss = -np.log(np.sum(dist * labels, axis=1))
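            # cross-entropy with one-hot labels (class 1 taken as the target):
            # per-sample loss_i = -log(dist_i[1])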

            softmax_grad = softmax.backward(labels)
            linear_grad = linear.backward(softmax_grad)
            flatten_grad = flatten.backward(linear_grad)
            relu2_grad = relu2.backward(flatten_grad)
            conv2_grad = conv2.backward(relu2_grad)
            relu1_grad = relu1.backward(conv2_grad)
            conv1_grad = conv1.backward(relu1_grad)

            return loss, conv1_grad
        def conv(b):
            """
            the derivative check in the gradient checker relates to the input of the function
            hence, the input should be z - since the backward step computes @loss / @z
            """

            # simulate the end of a classification network
            conv = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=2)
            relu = nn.Relu()
            flatten = nn.Flatten()
            linear = nn.Linear(in_dimension=12, out_dimension=4)
            softmax = nn.Softmax()

            conv.set_biases(b.reshape(3, 1))

            # forward
            a = flatten(relu(conv(x)))
            dist = softmax(linear(a))

            # backward
            labels = np.zeros(dist.shape)
            labels[:, 1] = 1
            loss = -np.log(np.sum(dist * labels, axis=1))

            softmax_grad = softmax.backward(labels)
            linear_grad = linear.backward(softmax_grad)
            flatten_grad = flatten.backward(linear_grad)
            relu_grad = relu.backward(flatten_grad)
            conv_grad = conv.backward(relu_grad)

            b_grad = conv.b_grad

            return loss, b_grad
        def relu_layer(x):
            relu = nn.Relu()
            softmax = nn.Softmax()

            a = softmax(relu(x))

            labels = np.zeros(a.shape)
            labels[:, 0] = 1

            loss = -np.log(np.sum(a * labels, axis=1))
            softmax_grad = softmax.backward(labels)
            relu_grad = relu.backward(softmax_grad)
            return loss, relu_grad
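
        # A minimal sketch of the numerical side of the check, assuming a
        # central-difference gradient checker; the suite's actual checker (not
        # shown in this excerpt) may well differ. `numerical_grad` is a
        # hypothetical helper, not part of the nn module: it perturbs each
        # entry of the argument and estimates d(sum(loss))/d(arg), which can be
        # compared against the analytic gradient returned by the closures above.
        def numerical_grad(f, arg, eps=1e-6):
            arg = np.asarray(arg, dtype=float).copy()
            grad = np.zeros_like(arg)
            it = np.nditer(arg, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                idx = it.multi_index
                orig = arg[idx]
                arg[idx] = orig + eps       # loss at arg + eps * e_idx
                loss_plus, _ = f(arg)
                arg[idx] = orig - eps       # loss at arg - eps * e_idx
                loss_minus, _ = f(arg)
                arg[idx] = orig             # restore the original entry
                grad[idx] = (np.sum(loss_plus) - np.sum(loss_minus)) / (2 * eps)
                it.iternext()
            return grad
        # e.g. np.testing.assert_allclose(
        #          conv_layer(x)[1], numerical_grad(conv_layer, x), atol=1e-4)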
    def test_linear_module_relu_2(self):
        x = np.array([[1, 2, 0, -1], [1, 2, -1, -2]])
        w = np.array([[0., 1., 0., 0.], [0., 2., 2., 0.]])
        b = np.array([-5., -1.])
        expected_res = np.array([[0., 3.], [0., 1.]])

        linear_layer = nn.Linear(4, 2)
        linear_layer.set_weights(w)
        linear_layer.set_biases(b)

        relu_layer = nn.Relu()
        z = linear_layer(x)
        a = relu_layer(z)

        np.testing.assert_allclose(a, expected_res, atol=0.0001)
        def linear_layer(z):
            """
            the derivative check in the gradient checker relates to the input of the function
            hence, the input should be z - since the backward step computes @loss / @z
            """

            # simulate the end of a classification network
            relu_layer = nn.Relu()
            linear = nn.Linear(in_dimension=2, out_dimension=5)
            softmax = nn.Softmax()

            a_L_mins_1 = relu_layer(z)
            z_L = linear(a_L_mins_1)
            a_L = softmax(z_L)

            labels = np.zeros(a_L.shape)
            labels[:, 1] = 1
            loss = -np.log(np.sum(a_L * labels, axis=1))

            softmax_grad = softmax.backward(labels)
            layer_L_grad = linear.backward(softmax_grad)
            relu_grad = relu_layer.backward(layer_L_grad)

            return loss, relu_grad
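
        # Presumably this closure is then handed to the suite's gradient checker
        # (not shown in this excerpt), which would compare relu_grad against a
        # finite-difference estimate of ∂loss/∂z at a probe input z.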