def test_linear_fuse_relu_backward(self):
    """Fused Linear+ReLU must produce the same gradients as the separate ops."""
    ipex.enable_auto_dnnl()
    rand_seed = int(get_rand_seed())
    print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                   rand_seed))
    torch.manual_seed(rand_seed)
    in_features = torch.randint(3, 10, (1, )).item()
    out_features = torch.randint(3, 100, (1, )).item()
    for dtype in [torch.bfloat16, torch.float]:
        base = (torch.randn(3, in_features) * 10).to(dtype).to('dpcpp')
        for bias in [True, False]:
            # Reference path: a plain Linear followed by a separate ReLU.
            linear = torch.nn.Linear(in_features, out_features,
                                     bias=bias).to('dpcpp').to(dtype)
            relu = torch.nn.ReLU()
            # Fused implementation under test, seeded with identical weights.
            fused = intel_pytorch_extension.LinearFuseRelu(
                in_features, out_features, bias=bias)
            fused.weight.data = linear.weight.clone()
            if bias:
                fused.bias.data = linear.bias.clone()
            ref_in = base.clone().requires_grad_()
            fused_in = base.clone().requires_grad_()
            ref_loss = relu(linear(ref_in).float()).sum()
            fused_loss = fused(fused_in).sum()
            ref_loss.backward()
            fused_loss.backward()
            self.assertEqual(ref_in.grad.float(), fused_in.grad.float())
            self.assertEqual(linear.weight.grad.float(),
                             fused.weight.grad.float())
            if bias:
                self.assertEqual(linear.bias.grad.float(),
                                 fused.bias.grad.float())
    def test_mul(self):
        """Element-wise multiplication on dpcpp tensors matches the CPU result.

        Covers the operator form (``x * y``), the functional form
        (``torch.mul``) and the ``out=`` variant.
        """
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        N = torch.randint(3, 10, (1, )).item()
        C = torch.randint(3, 100, (1, )).item()
        # Fix: removed an unused local `alpha` that was computed but never
        # read anywhere in this test.

        x_cpu = torch.randn(N, C, 35, 45, dtype=torch.float32) * 10
        y_cpu = torch.randn(N, C, 35, 45, dtype=torch.float32) * 10
        x_dpcpp = x_cpu.to(device=device)
        y_dpcpp = y_cpu.to(device=device)

        # mul
        self.assertEqual(x_cpu * y_cpu, x_dpcpp * y_dpcpp)

        self.assertEqual(torch.mul(x_cpu, y_cpu), torch.mul(x_dpcpp, y_dpcpp))

        # mul_out
        out_cpu = x_cpu.clone()
        out_dpcpp = out_cpu.to(device=device)
        torch.mul(x_cpu, y_cpu, out=out_cpu)
        torch.mul(x_dpcpp, y_dpcpp, out=out_dpcpp)
        self.assertEqual(out_cpu, out_dpcpp)
    def test_view(self):
        """view() on a DIL tensor reshapes metadata without breaking later math."""
        ipex.enable_auto_dnnl()
        old_shape = (4, 16)
        new_shape = (1, 4, 4, 4)

        x_cpu = torch.randn(old_shape)
        x_dpcpp = x_cpu.to(device=device).clone()
        self.assertTrue(ipex.is_dil_tensor(x_dpcpp))
        self.assertEqual(ipex.get_dil_tensor_sizes(x_dpcpp), [4, 16])
        self.assertEqual(ipex.get_dil_tensor_strides(x_dpcpp), [16, 1])

        cpu_view = x_cpu.view(new_shape)
        self.assertEqual(cpu_view.size(), [1, 4, 4, 4])
        self.assertEqual(cpu_view.stride(), [64, 16, 4, 1])

        dil_view = x_dpcpp.view(new_shape)
        self.assertTrue(ipex.is_dil_tensor(dil_view))

        # The view must be broadcast-compatible with a tensor of the new shape.
        rhs = torch.randn(new_shape)
        expected = cpu_view * rhs
        actual = dil_view * rhs
        self.assertTrue(ipex.is_dil_tensor(actual))
        self.assertEqual(ipex.get_dil_tensor_sizes(actual), [1, 4, 4, 4])
        self.assertEqual(ipex.get_dil_tensor_strides(actual),
                         [64, 16, 4, 1])
        self.assertEqual(expected, actual)

        # Creating the view must not have altered the source tensor's metadata.
        rhs = torch.randn(old_shape)
        expected = x_cpu * rhs
        actual = x_dpcpp * rhs
        self.assertEqual(expected, actual)
    def test_conv_add_relu_(self):
        """conv+add+relu agrees across auto-dnnl, dpcpp fallback and cpu paths."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        # The same seed is reused so each path sees identical data.
        dnnl_res, dnnl_in, _ = self._test_conv_add_relu_(
            "dpcpp:0", rand_seed)

        ipex.disable_auto_dnnl()
        fallback_res, fallback_in, _ = self._test_conv_add_relu_(
            "dpcpp:0", rand_seed)

        cpu_res, cpu_in, _ = self._test_conv_add_relu_("cpu", rand_seed)
        self.assertEqual(cpu_res, fallback_res.to('cpu'))
        self.assertEqual(cpu_res, dnnl_res.to('cpu'))

        ipex.enable_auto_dnnl()
        dnnl_res.sum().backward()
        fallback_res.sum().backward()
        cpu_res.sum().backward()

        # Gradients must match bit-exactly (prec=0.0).
        self.assertEqual(dnnl_in.grad.to('cpu'),
                         cpu_in.grad,
                         prec=0.0)
        self.assertEqual(fallback_in.grad.to('cpu'),
                         cpu_in.grad,
                         prec=0.0)
    def test_max_pool3d_backward(self):
        """MaxPool3d backward on dpcpp matches CPU across strides and shapes."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        batch = torch.randint(3, 10, (1, )).item()
        chans = torch.randint(3, 10, (1, )).item()

        shapes = [(64, 64, 64), (35, 39, 35), (16, 19, 20), (7, 8, 9)]
        for stride in (1, 2, 3):
            for D, H, W in shapes:
                src = torch.randn(batch, chans, D, H, W,
                                  dtype=torch.float32) * 10
                cpu_in = src.clone().requires_grad_()
                dev_in = src.clone().to(device=device).requires_grad_()

                # Gradients accumulate across both ceil_mode runs on the same
                # leaves, identically on each side, so the comparison holds.
                for ceil_mode in (False, True):
                    pool = torch.nn.MaxPool3d(
                        kernel_size=7 if ceil_mode else 3,
                        stride=stride,
                        padding=1,
                        ceil_mode=ceil_mode)
                    pool(cpu_in).sum().backward()
                    pool(dev_in).sum().backward()
                    self.assertEqual(cpu_in.grad, dev_in.grad)
 def test_split_backward(self):
     """torch.split backward on dpcpp matches CPU for list and int splits."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     src = torch.randn(5, 5, dtype=torch.float32) * 10
     cpu_in = src.clone().requires_grad_()
     dev_in = src.clone().to(device=device).requires_grad_()
     # Each case: (argument passed to torch.split, number of chunks summed).
     cases = [((2, 3), 2), (3, 2), (2, 3)]
     for dim in [0, 1]:
         for split_arg, nchunks in cases:
             loss_cpu = sum(torch.split(cpu_in, split_arg, dim=dim)[i].sum()
                            for i in range(nchunks))
             loss_dev = sum(torch.split(dev_in, split_arg, dim=dim)[i].sum()
                            for i in range(nchunks))
             loss_cpu.backward()
             loss_dev.backward()
             # Grads accumulate identically on both leaves across cases.
             self.assertEqual(cpu_in.grad, dev_in.grad)
 def test_relu_(self):
     """In-place relu_ on the dpcpp device matches the CPU result."""
     ipex.enable_auto_dnnl()
     seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    seed))
     dev_res = self._test_relu_(device, seed)
     cpu_res = self._test_relu_('cpu', seed)
     self.assertEqual(cpu_res, dev_res.to('cpu'))
 def _test_conv_relu_(self, device, rand_seed):
     """Run a small Conv2d followed by in-place ReLU on *device* and return it."""
     ipex.enable_auto_dnnl()
     torch.manual_seed(rand_seed)
     conv = torch.nn.Conv2d(1, 1, (7, 7)).to(device=device)
     inp = torch.rand((1, 1, 10, 10)).to(device=device)
     out = conv(inp)
     out.relu_()
     return out
 def test_relu(self):
     """Out-of-place relu on dpcpp matches CPU."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     cpu_t = torch.randn((4, 5), dtype=torch.float32) * 10
     dev_t = cpu_t.to(device=device)
     self.assertEqual(torch.relu(cpu_t), torch.relu(dev_t))
 def test_layer_norm(self):
     """LayerNorm forward on dpcpp matches CPU.

     Fix: the local previously named ``input`` shadowed the builtin of the
     same name; renamed to ``x``.
     """
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     x = torch.randn(2, 5, 10, 10, dtype=torch.float32)
     x_dpcpp = x.to(device=device)
     m = torch.nn.LayerNorm([10, 10])
     self.assertEqual(m(x), m(x_dpcpp))
    def test_seq_conv(self):
        """A sequence of convolutions gives the same result on cpu and dpcpp."""
        ipex.disable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        cpu_out = self._seq_conf('cpu', rand_seed)

        ipex.enable_auto_dnnl()
        dev_out = self._seq_conf(device, rand_seed)
        self.assertEqual(cpu_out, dev_out.to('cpu'))
 def test_softmax(self):
     """Softmax forward matches CPU along every dim of a 3-d tensor."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     cpu_t = torch.randn(3, 4, 5, dtype=torch.float32) * 10
     dev_t = cpu_t.to(device=device)
     for axis in range(cpu_t.ndim):
         op = torch.nn.Softmax(dim=axis)
         self.assertEqual(op(cpu_t), op(dev_t))
    def test_batch_norm3d(self):
        """BatchNorm3d forward on dpcpp matches CPU."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        cpu_t = torch.randn(4, 3, 30, 30, 30, dtype=torch.float32) * 10
        dev_t = cpu_t.to(device=device)

        bn_cpu = torch.nn.BatchNorm3d(3)
        # Deep-copied so both modules start from identical parameters.
        bn_dev = copy.deepcopy(bn_cpu).to(device=device)
        self.assertEqual(bn_cpu(cpu_t), bn_dev(dev_t))
    def test_add_(self):
        """In-place add agrees across auto-dnnl, dpcpp fallback and cpu paths."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        dnnl_res = self._test_add_("dpcpp:0", rand_seed)

        ipex.disable_auto_dnnl()
        fallback_res = self._test_add_("dpcpp:0", rand_seed)

        cpu_res = self._test_add_("cpu", rand_seed)
        self.assertEqual(cpu_res, fallback_res.to('cpu'))
        self.assertEqual(cpu_res, dnnl_res.to('cpu'))
 def test_cat_backward(self):
     """torch.cat backward on dpcpp matches CPU."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     base = torch.randn((4, 5), dtype=torch.float32) * 10
     cpu_in = base.clone().requires_grad_()
     dev_in = base.clone().to(device=device).requires_grad_()
     torch.cat((cpu_in, cpu_in, cpu_in)).sum().backward()
     torch.cat((dev_in, dev_in, dev_in)).sum().backward()
     self.assertEqual(cpu_in.grad, dev_in.grad)
 def test_transpose(self):
     """transpose() over every (dim1, dim2) pair matches CPU."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     cpu_t = torch.randn(3, 4, 5, dtype=torch.float32) * 10
     dev_t = cpu_t.clone().to(device=device)
     ndim = cpu_t.ndim
     for a in range(ndim):
         for b in range(ndim):
             self.assertEqual(
                 cpu_t.transpose(a, b),
                 dev_t.transpose(a, b),
             )
    def test_linear(self):
        """Linear forward on dpcpp matches CPU, with and without bias."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        in_features = torch.randint(3, 10, (1, )).item()
        out_features = torch.randint(3, 100, (1, )).item()
        cpu_t = torch.randn(3, in_features, dtype=torch.float32) * 10
        dev_t = cpu_t.to(device=device)

        for bias in (True, False):
            fc = torch.nn.Linear(in_features, out_features, bias=bias)
            # Deep-copied so both modules share identical parameters.
            fc_dev = copy.deepcopy(fc).to(device=device)
            self.assertEqual(fc(cpu_t), fc_dev(dev_t))
    def test_adaptive_avg_pool2d(self):
        """AdaptiveAvgPool2d forward on dpcpp matches CPU."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        batch = torch.randint(3, 10, (1, )).item()
        chans = torch.randint(3, 10, (1, )).item()
        cpu_t = torch.randn(batch, chans, 224, 224, dtype=torch.float32) * 100
        dev_t = cpu_t.to(device=device)

        pool = torch.nn.AdaptiveAvgPool2d(7)

        self.assertEqual(pool(cpu_t), pool(dev_t))
# Beispiel #19
# 0
def enable_auto_optimization(mixed_dtype=None, train=False):
    r""" Enable auto-mixed-precision to improve performance for global scope.

    The auto-mixed-precision auto reorders the tensor to the specified low precision data type.
    You don't need to convert the input tensors and the model to the specified data type manually,
    the extension will do it automatically and then dispatch the extension backend to accelerate
    computation

    Args:
        mixed_dtype(torch.dtype): Auto reorder the input tensors to the specified low precision data type
            and dispatch to oneDNN backend for computation, can be torch.bfloat16 or None.
        train(bool): Select the AMP running mode: True for 'training',
            False (default) for 'inference'.
    """
    # Fix: compare with None via identity (`is not`), not equality (PEP 8).
    if mixed_dtype is not None:
        core.enable_auto_dnnl()
    running_mode = 'training' if train else 'inference'
    # Deliberately enter the context manager without exiting it so the
    # mixed-precision configuration stays active for the whole process.
    enable_auto_mix_precision(AmpConf(mixed_dtype), running_mode).__enter__()
 def test_softmax_backward(self):
     """Softmax backward matches CPU along every dim."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     base = torch.randn(3, 4, 5, dtype=torch.float32) * 10
     for axis in range(base.ndim):
         cpu_in = base.clone().requires_grad_()
         dev_in = base.clone().to(device=device).requires_grad_()
         op = torch.nn.Softmax(dim=axis)
         op(cpu_in).sum().backward()
         op(dev_in).sum().backward()
         self.assertEqual(cpu_in.grad, dev_in.grad)
 def test_layer_norm_backward(self):
     """LayerNorm backward on dpcpp matches CPU.

     Fix: the local previously named ``input`` shadowed the builtin of the
     same name; renamed to ``x``.
     """
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     x = torch.randn(2, 5, 10, 10, dtype=torch.float32)
     input_cpu = x.clone().requires_grad_()
     input_dpcpp = x.clone().to(device=device).requires_grad_()
     m = torch.nn.LayerNorm([10, 10])
     m_dpcpp = copy.deepcopy(m).to(device=device)
     y_cpu = m(input_cpu).sum()
     y_cpu.backward()
     y_dpcpp = m_dpcpp(input_dpcpp).sum()
     y_dpcpp.backward()
     self.assertEqual(input_cpu.grad, input_dpcpp.grad)
def enable_auto_optimization(mixed_dtype=None):
    r""" Enable auto-mixed-precision to improve performance.

    The auto-mixed-precision auto reorders the tensor to the specified low precision data type.
    You don't need to convert the input tensors and the model to the specified data type manually,
    the extension will do it automatically and then dispatch the extension backend to accelerate
    computation

    Args:
        mixed_dtype(torch.dtype): Auto reorder the input tensors to the specified low precision data type
            and dispatch to oneDNN backend for computation

    """
    # Fix: compare with None via identity (`is not`), not equality (PEP 8).
    if mixed_dtype is not None:
        core.enable_auto_dnnl(True)
    enable_auto_mix_precision(mixed_dtype)
    def test_batch_norm2d_backward(self):
        """BatchNorm2d backward on dpcpp matches CPU."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        base = torch.randn(64, 3, 35, 45, dtype=torch.float32) * 10
        cpu_in = base.clone().requires_grad_()
        dev_in = base.clone().to(device=device).requires_grad_()

        bn_cpu = torch.nn.BatchNorm2d(3)
        # Deep-copied so both modules start from identical parameters.
        bn_dev = copy.deepcopy(bn_cpu).to(device=device)
        loss_cpu = bn_cpu(cpu_in).sum()
        loss_dev = bn_dev(dev_in).sum()
        loss_cpu.backward()
        loss_dev.backward()
        self.assertEqual(cpu_in.grad, dev_in.grad)
    def test_adaptive_avg_pool2d_backward(self):
        """AdaptiveAvgPool2d backward on dpcpp matches CPU."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        base = torch.randn(10, 3, 224, 224, dtype=torch.float32) * 100

        cpu_in = base.clone().requires_grad_()
        dev_in = base.clone().to(device=device).requires_grad_()
        pool = torch.nn.AdaptiveAvgPool2d(7)

        loss_cpu = pool(cpu_in).sum()
        loss_dev = pool(dev_in).sum()
        loss_cpu.backward()
        loss_dev.backward()
        self.assertEqual(cpu_in.grad, dev_in.grad)
    def test_addmm(self):
        """torch.addmm (functional and out= forms) matches CPU over an
        alpha/beta grid."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        M, N, O = 23, 8, 12
        for i in range(8, 12, 2):
            for j in range(8, 12, 2):
                alpha, beta = i / 10, j / 10
                mat1_cpu = torch.randn(M, N, dtype=torch.float32)
                mat2_cpu = torch.randn(N, O, dtype=torch.float32)
                base_cpu = torch.randn(M, O, dtype=torch.float32)
                mat1_dev = mat1_cpu.to(device=device)
                mat2_dev = mat2_cpu.to(device=device)
                base_dev = base_cpu.to(device=device)

                self.assertEqual(
                    torch.addmm(input=base_cpu,
                                mat1=mat1_cpu,
                                mat2=mat2_cpu,
                                alpha=alpha,
                                beta=beta),
                    torch.addmm(input=base_dev,
                                mat1=mat1_dev,
                                mat2=mat2_dev,
                                alpha=alpha,
                                beta=beta))

                # out= variant.
                out_cpu = torch.randn(M, O, dtype=torch.float32)
                out_dev = out_cpu.to(device=device)
                torch.addmm(input=base_cpu,
                            mat1=mat1_cpu,
                            mat2=mat2_cpu,
                            alpha=alpha,
                            beta=beta,
                            out=out_cpu)
                torch.addmm(input=base_dev,
                            mat1=mat1_dev,
                            mat2=mat2_dev,
                            alpha=alpha,
                            beta=beta,
                            out=out_dev)
                self.assertEqual(out_cpu, out_dev)
    def test_baddbmm(self):
        """torch.baddbmm (functional and out= forms) matches CPU over an
        alpha/beta grid."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        num_batches = 10
        M, N, O = 23, 8, 12
        for i in range(8, 12, 2):
            for j in range(8, 12, 2):
                alpha = i / 10
                beta = j / 10
                b1_cpu = torch.randn(num_batches, M, N, dtype=torch.float32)
                b2_cpu = torch.randn(num_batches, N, O, dtype=torch.float32)
                res_cpu = torch.randn(num_batches, M, O, dtype=torch.float32)
                b1_dpcpp = b1_cpu.to(device=device)
                b2_dpcpp = b2_cpu.to(device=device)
                res_dpcpp = res_cpu.to(device=device)

                baddbmm_cpu = torch.baddbmm(res_cpu,
                                            b1_cpu,
                                            b2_cpu,
                                            alpha=alpha,
                                            beta=beta)
                baddbmm_dpcpp = torch.baddbmm(res_dpcpp,
                                              b1_dpcpp,
                                              b2_dpcpp,
                                              alpha=alpha,
                                              beta=beta)
                self.assertEqual(baddbmm_cpu, baddbmm_dpcpp)
                y_cpu = torch.randn(num_batches, M, O, dtype=torch.float32)
                y_dpcpp = y_cpu.to(device=device)
                # Fix: removed stray trailing commas that followed the next
                # two calls; they wrapped the (discarded) results in 1-tuples.
                torch.baddbmm(res_cpu,
                              b1_cpu,
                              b2_cpu,
                              alpha=alpha,
                              beta=beta,
                              out=y_cpu)
                torch.baddbmm(res_dpcpp,
                              b1_dpcpp,
                              b2_dpcpp,
                              alpha=alpha,
                              beta=beta,
                              out=y_dpcpp)
                self.assertEqual(y_cpu, y_dpcpp)
 def test_linear_backward(self):
     """Linear backward on dpcpp matches CPU, with and without bias."""
     ipex.enable_auto_dnnl()
     rand_seed = int(get_rand_seed())
     print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                    rand_seed))
     torch.manual_seed(rand_seed)
     in_features = torch.randint(3, 10, (1, )).item()
     out_features = torch.randint(3, 100, (1, )).item()
     base = torch.randn(3, in_features, dtype=torch.float32) * 10
     for bias in (True, False):
         cpu_in = base.clone().requires_grad_()
         dev_in = base.clone().to(device=device).requires_grad_()
         fc = torch.nn.Linear(in_features, out_features, bias=bias)
         # Deep-copied so both modules share identical parameters.
         fc_dev = copy.deepcopy(fc).to(device=device)
         fc(cpu_in).sum().backward()
         fc_dev(dev_in).sum().backward()
         self.assertEqual(cpu_in.grad, dev_in.grad)
    def test_Conv2d_backward(self):
        """Conv2d backward on dpcpp matches native CPU (mkldnn disabled)."""
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        ipex.enable_auto_dnnl()
        # Disable PyTorch's own mkldnn path so the CPU side is the plain
        # native reference implementation.
        with torch.backends.mkldnn.flags(enabled=False):
            base = torch.rand((1, 1, 7, 7))
            for bias in (True, False):
                cpu_in = base.clone().requires_grad_()
                dev_in = base.clone().to(device=device).requires_grad_()
                conv = torch.nn.Conv2d(1, 1, (3, 3), bias=bias)
                conv_dev = copy.deepcopy(conv).to(device=device)
                conv(cpu_in).sum().backward()
                conv_dev(dev_in).sum().backward()

                self.assertEqual(cpu_in.grad, dev_in.grad)
    def test_avg_pool3d(self):
        """AvgPool3d forward matches CPU for both count_include_pad modes."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        batch = torch.randint(3, 10, (1, )).item()
        chans = torch.randint(3, 10, (1, )).item()
        cpu_t = torch.randn(batch, chans, 64, 64, 64,
                            dtype=torch.float32) * 10
        dev_t = cpu_t.to(device=device)

        for count_include_pad in (True, False):
            pool = torch.nn.AvgPool3d(
                kernel_size=3,
                stride=2,
                padding=1,
                count_include_pad=count_include_pad)
            self.assertEqual(pool(cpu_t), pool(dev_t))
    def test_max_pool2d_backward(self):
        """MaxPool2d backward on dpcpp matches CPU."""
        ipex.enable_auto_dnnl()
        rand_seed = int(get_rand_seed())
        print("{} rand sed: {}".format(sys._getframe().f_code.co_name,
                                       rand_seed))
        torch.manual_seed(rand_seed)
        base = torch.randn(10, 3, 64, 64, dtype=torch.float32) * 10
        # NOTE(review): only ceil_mode=True is exercised here; confirm whether
        # ceil_mode=False was excluded intentionally.
        for ceil_mode in [True]:
            pool = torch.nn.MaxPool2d(kernel_size=3,
                                      stride=2,
                                      padding=1,
                                      ceil_mode=ceil_mode)

            cpu_in = base.clone().requires_grad_()
            dev_in = base.clone().to(device=device).requires_grad_()

            pool(cpu_in).sum().backward()
            pool(dev_in).sum().backward()
            self.assertEqual(cpu_in.grad, dev_in.grad)