예제 #1
0
    def test_forward_compression(self):
        """Time one standard conv2d call against one FFT-based forward call.

        A random input batch and a random filter bank are created on the GPU
        when one is available (CPU otherwise).  ``torch.nn.functional.conv2d``
        and ``Conv2dfftFunction.forward`` are each run once and their
        wall-clock times, plus the ratio FFT time / standard time, are
        printed.  The outputs themselves are not checked.
        """
        dtype = torch.float
        if torch.cuda.is_available():
            print("cuda is available")
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        print("device used: ", str(device))

        def wait_for_device():
            # CUDA kernels are queued asynchronously; without this barrier the
            # timer could be read before the queued work has finished.
            if device.type == "cuda":
                torch.cuda.synchronize()

        N, C, H, W = 128, 16, 32, 32
        K, HH, WW = 16, 3, 3
        x = torch.randn(N, C, H, W, dtype=dtype, device=device)
        y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)

        wait_for_device()
        start = time.perf_counter()
        torch.nn.functional.conv2d(input=x, weight=y, stride=1)
        wait_for_device()
        convStandardTime = time.perf_counter() - start
        print("convStandard time: ", convStandardTime)

        conv = Conv2dfftFunction()
        wait_for_device()
        start = time.perf_counter()
        conv.forward(ctx=None,
                     input=x,
                     filter=y,
                     stride=1,
                     args=Arguments(stride_type=StrideType.STANDARD,
                                    preserved_energy=100))
        wait_for_device()
        convFFTtime = time.perf_counter() - start
        print("convFFT time: ", convFFTtime)
        # Ratio > 1 means the standard PyTorch convolution was faster.
        speedup = convFFTtime / convStandardTime
        print(f"Pytorch speedup is: {speedup} X")
예제 #2
0
 def __init__(self,
              in_channels=None,
              out_channels=None,
              kernel_size=None,
              stride=1,
              padding=0,
              dilation=None,
              groups=None,
              bias=False,
              weight_value=None,
              bias_value=None,
              is_manual=tensor([0]),
              args=Arguments(),
              out_size=None):
     """Pass every constructor argument, unchanged, up the class hierarchy.

     All parameters are forwarded by keyword to the ``__init__`` that
     follows ``Conv2dfft`` in the method resolution order; nothing else is
     done here.
     """
     # NOTE(review): the defaults for ``is_manual`` and ``args`` are built
     # once, when this function is defined, and are then shared by every
     # call that omits them.
     forwarded = {
         "in_channels": in_channels,
         "out_channels": out_channels,
         "kernel_size": kernel_size,
         "stride": stride,
         "padding": padding,
         "dilation": dilation,
         "groups": groups,
         "bias": bias,
         "weight_value": weight_value,
         "bias_value": bias_value,
         "is_manual": is_manual,
         "args": args,
         "out_size": out_size,
     }
     super(Conv2dfft, self).__init__(**forwarded)
예제 #3
0
    def test_forward_correctness(self):
        """
        Compare the FFT-based forward pass with PyTorch's standard conv2d.

        Both convolutions run on the same random input and filter bank, are
        timed, and their outputs must agree to 3 decimal places.  The FFT
        path uses the execution type stored in ``self.conv_exec_type``.

        Timings recorded earlier by the author:

        exec: CUDA
        convStandard time:  0.004092693328857422
        convFFT time:  0.19140386581420898
        Pytorch speedup is: 46.76721426074799 X

        convStandard time:  0.0039272308349609375
        convFFT time:  0.03870105743408203
        Pytorch speedup is: 9.854541039339486 X

        exec: SGEMM
        convStandard time:  0.004120588302612305
        convFFT time:  0.029807090759277344
        Pytorch speedup is: 7.233697853381936 X

        """
        dtype = torch.float
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        print("device used: ", str(device))

        def wait_for_device():
            # CUDA kernels are queued asynchronously; without this barrier the
            # timer could be read before the queued work has finished.
            if device.type == "cuda":
                torch.cuda.synchronize()

        N, C, H, W = 128, 16, 32, 32
        K, HH, WW = 16, 3, 3
        x = torch.randn(N, C, H, W, dtype=dtype, device=device)
        y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)

        repetitions = 1

        wait_for_device()
        start = time.perf_counter()
        for _ in range(repetitions):
            convStandard = torch.nn.functional.conv2d(input=x,
                                                      weight=y,
                                                      stride=1)
        wait_for_device()
        convStandardTime = time.perf_counter() - start
        print("convStandard time: ", convStandardTime)

        conv = Conv2dfftFunction()
        wait_for_device()
        start = time.perf_counter()
        for _ in range(repetitions):
            convFFT = conv.forward(ctx=None,
                                   input=x,
                                   filter=y,
                                   stride=1,
                                   args=Arguments(
                                       stride_type=StrideType.STANDARD,
                                       conv_exec_type=self.conv_exec_type,
                                   ))
        wait_for_device()
        convFFTtime = time.perf_counter() - start
        print("convFFT time: ", convFFTtime)
        # Ratio > 1 means the standard PyTorch convolution was faster.
        speedup = convFFTtime / convStandardTime
        print(f"Pytorch speedup is: {speedup} X")

        # The two arrays are passed positionally so the call does not depend
        # on the keyword names NumPy gives to its first two parameters.
        np.testing.assert_array_almost_equal(
            convStandard.cpu().detach().numpy(),
            convFFT.cpu().detach().numpy(),
            decimal=3,
            err_msg="The expected array x and computed y are not almost equal."
        )
예제 #4
0
 def test_AutogradForwardWithCompression(self):
     """Check the autograd FFT convolution's forward output on a 3x3 input.

     One 3x3 single-channel input map is convolved with one 2x2 filter
     through ``Conv2dfftAutograd`` configured with ``index_back=1``,
     ``next_power2=False`` and ``preserve_energy=100``; the result is
     compared with hard-coded expected values.
     """
     # A single input 2D map.
     x = tensor([[[[1.0, 2.0, 3.0], [3.0, 4.0, 1.0], [1., 2., 1.]]]])
     # A single filter.
     y = tensor([[[[1.0, 2.0], [3.0, 2.0]]]])
     b = tensor([0.0])
     # NOTE(review): the bias tensor is passed as ``bias`` here, whereas
     # other tests in this file pass it as ``bias_value`` - confirm which
     # keyword Conv2dfftAutograd expects.
     conv = Conv2dfftAutograd(weight_value=y, bias=b,
                              args=Arguments(index_back=1,
                                             next_power2=False,
                                             preserve_energy=100))
     result = conv.forward(input=x)
     # The exact cross-correlation of x with y is [[22, 22], [18, 14]]; the
     # values below differ from it, presumably because of the index_back=1
     # compression - TODO confirm.
     expect = np.array([[[[21.75, 21.75], [18.75, 13.75]]]])
     # The two arrays are passed positionally so the call does not depend on
     # the keyword names NumPy gives to its first two parameters.
     np.testing.assert_array_almost_equal(
         expect, result.detach().numpy(),
         err_msg="The expected array x and computed y are not almost equal")
예제 #5
0
 def test_FunctionForwardCompression(self):
     """Check ``Conv2dfftFunction.forward`` on a 3x3 input with compression.

     One 3x3 single-channel input map and one 2x2 filter are created on
     ``self.device`` with ``self.dtype`` and convolved with
     ``index_back=1`` and ``preserve_energy=100``; the result is compared
     with hard-coded expected values.
     """
     # A single 2D input map.
     x = tensor([[[[1.0, 2.0, 3.0], [3.0, 4.0, 1.0], [1., 2., 1.]]]],
                device=self.device, dtype=self.dtype)
     # A single filter.
     y = tensor([[[[1.0, 2.0], [3.0, 2.0]]]], device=self.device,
                dtype=self.dtype)
     b = tensor([0.0], device=self.device, dtype=self.dtype)
     conv = Conv2dfftFunction()
     result = conv.forward(ctx=None, input=x, filter=y, bias=b,
                           args=Arguments(index_back=1, preserve_energy=100))
     # The exact cross-correlation of x with y is [[22, 22], [18, 14]]; the
     # values below differ from it, presumably because of the index_back=1
     # compression - TODO confirm.
     expect = np.array([[[[21.75, 21.75], [18.75, 13.75]]]])
     # The two arrays are passed positionally so the call does not depend on
     # the keyword names NumPy gives to its first two parameters.
     np.testing.assert_array_almost_equal(
         expect, get_numpy(result),
         err_msg="The expected array x and computed y are not almost equal.")
예제 #6
0
    def test_FunctionForwardCompressionConvFFTIndexBackCifar10LeNet1stLayer(
            self):
        """Report the error of index_back-compressed FFT convolution.

        ``cifar10_image`` is convolved with ``cifar10_lenet_filter`` once
        with ``F.conv2d`` (the reference) and once with ``Conv2dfft`` for
        each ``index_back`` in ``range(1, 2)``.  For every setting the summed
        absolute error and a relative error in percent,
        ``100 * sum|result - expected| / sum(|expected| + |result|)``, are
        printed.  Nothing is asserted; the test only reports.
        """
        start = time.time()
        x = cifar10_image
        print("shape of the input image: ", x.size())
        y = cifar10_lenet_filter
        print("shape of the filter: ", y.size())
        b = torch.tensor([0.0])

        # Reference output from PyTorch's own convolution.
        expected_result_tensor = F.conv2d(input=x, weight=y, bias=b)

        for index_back in range(1, 2):
            print("index back: ", index_back)
            conv = Conv2dfft(weight_value=y,
                             bias_value=b,
                             args=Arguments(
                                 index_back=index_back,
                                 preserve_energy=100,
                                 is_debug=True,
                                 next_power2=False,
                                 compress_type=CompressType.STANDARD))
            result = conv.forward(input=x)

            result = result.float()
            abs_error = torch.sum(
                torch.abs(result - expected_result_tensor)).item()
            print("abs error: ", abs_error)
            expected_total = torch.sum(
                torch.abs(expected_result_tensor) + torch.abs(result))
            # Divide as tensors (so a zero total does not raise) and only
            # then convert to a Python float, so the value prints as a plain
            # number rather than as a tensor repr.
            relative_error = (100.0 * abs_error / expected_total).item()
            print("relative error: ", relative_error)
            print(f"absolute divergence for index back,{index_back},"
                  f"absolute error,{abs_error},"
                  f"relative error (%),{relative_error}")
        print("elapsed: ", time.time() - start)
예제 #7
0
    def test_FunctionForwardCompressionConvFFTPreserveEnergyCifar10LeNet1stLayer(
            self):
        """Report the FFT-convolution error for a sweep of preserved energies.

        ``cifar10_image`` is convolved with ``cifar10_lenet_filter`` by
        ``F.conv2d`` (the reference) and by ``Conv2dfft`` for each preserved
        energy level in the list below.  For each level one line is printed
        with the summed absolute error and the relative error in percent,
        ``100 * sum|result - expected| / sum|expected|``.  Nothing is
        asserted.
        """
        print("\n")
        image = cifar10_image
        print("shape of the input image: ", image.size())
        kernel = cifar10_lenet_filter
        print("shape of the filter: ", kernel.size())
        zero_bias = torch.tensor([0.0])

        # Energy levels (in percent) to sweep, from lossless downwards.
        energy_levels = [
            100., 99., 98.5, 98., 97., 96., 95., 94., 93., 92., 91., 90.,
            89., 87., 85., 80., 70., 60., 50., 40., 10., 5., 1.,
        ]

        # Reference output from PyTorch's own convolution.
        reference = F.conv2d(input=image, weight=kernel, bias=zero_bias)

        for energy in energy_levels:
            layer_args = Arguments(preserve_energy=energy,
                                   index_back=0,
                                   is_debug=True,
                                   next_power2=True,
                                   compress_type=CompressType.STANDARD)
            layer = Conv2dfft(weight_value=kernel,
                              bias_value=zero_bias,
                              args=layer_args)
            output = layer.forward(input=image).float()

            abs_error = (output - reference).abs().sum().item()
            reference_total = reference.abs().sum()
            relative_error = abs_error / reference_total * 100.0
            print(
                f"absolute divergence for preserved energy,{energy}"
                f",absolute error,{abs_error},"
                f"relative error (%),{relative_error}")
예제 #8
0
    return model


class Args:
    """Empty class; its instances accept arbitrary attributes."""


if __name__ == "__main__":
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))

    args = Arguments()
    args.in_channels = 3
    # args.conv_type = "FFT2D"
    args.conv_type = ConvType.STANDARD2D
    args.compress_rate = None
    args.preserve_energy = None
    args.is_debug = False
    args.next_power2 = True
    args.compress_type = CompressType.STANDARD
    args.tensor_type = TensorType.FLOAT32
    args.num_classes = 10
    args.min_batch_size = 16
    args.test_batch_size = 16

    batch_size = 16
    inputs = torch.randn(batch_size,
예제 #9
0
    def test_FunctionBackwardWithPooling(self):
        """Check forward and backward passes when ``out_size`` is requested.

        A 5x5 single-channel input is convolved with one 2x2 filter and the
        output is requested at ``out_size=3``.  The forward results of the
        manual ``Conv2dfftFunction`` and of ``Conv2dfftAutograd`` are both
        compared with hard-coded expected values.  After back-propagating a
        fixed ``dout`` the gradients of the manual path are compared with
        hard-coded approximate values (4 decimals) and, through
        ``self._check_delta2D``, with the gradients returned by
        ``conv_backward_naive``.  The bias gradient must match the naive one.
        """
        x = np.array([[[[1., 2., 3., 4., 5.],
                        [6., 7., 8., 1., 2.],
                        [2., 3., 1., 0., 1.],
                        [1., 2., 3., -1., -2.],
                        [0., 1., 3., 1., 2.]
                        ]]])
        y = np.array([[[[2., 1.], [-1.0, 2.0]]]])
        b = np.array([0.0])

        # Full result from the naive reference; its cache feeds the naive
        # backward pass further down.
        conv_param = {'pad': 0, 'stride': 1}
        full_expected_result, cache = conv_forward_naive(x=x, w=y, b=b,
                                                         conv_param=conv_param)
        print()
        print("full expected result: ", full_expected_result)

        # Hard-coded expected forward output for out_size=3.
        expected_result = np.array([[[[12.2992, 13.6678, 10.92],
                                      [15.9293, 16.679, 11.7282],
                                      [13.3441, 13.755, 8.7778]]]])
        conv = Conv2dfftFunction()

        x_torch = torch.tensor(data=x, requires_grad=True)
        y_torch = torch.tensor(data=y, requires_grad=True)
        b_torch = torch.tensor(data=b, requires_grad=True)

        out_size = 3

        result_torch = conv.forward(ctx=None, input=x_torch, filter=y_torch,
                                    bias=b_torch, out_size=out_size,
                                    args=Arguments())
        result = result_torch.detach().numpy()
        # Here and below the two arrays are passed positionally so the calls
        # do not depend on the keyword names NumPy gives to its first two
        # parameters.
        np.testing.assert_array_almost_equal(
            np.array(expected_result), result, decimal=4,
            err_msg="Manual: Expected x is different from computed y.")

        # Prevent any interference/overlap of variables in manual and auto
        # differentiation.
        x_torch_auto = torch.tensor(data=x, requires_grad=True)
        y_torch_auto = torch.tensor(data=y, requires_grad=True)
        b_torch_auto = torch.tensor(data=b, requires_grad=True)

        convAuto = Conv2dfftAutograd(weight_value=y_torch_auto,
                                     bias=b_torch_auto,
                                     out_size=out_size)
        resultAuto = convAuto.forward(input=x_torch_auto)
        np.testing.assert_array_almost_equal(
            np.array(expected_result), resultAuto.cpu().detach().numpy(),
            decimal=4,
            err_msg="Auto: Expected x is different from computed y.")

        # Upstream gradient, one value per element of the 3x3 output.
        dout_np = np.array([[[[0.1, -0.2, 0.3],
                              [-0.1, 0.1, 0.2],
                              [-0.2, 1.1, -1.2]]]])
        dout = tensor(dout_np)

        resultAuto.backward(dout)
        print("x auto grad: ", x_torch_auto.grad)
        print("y auto grad: ", y_torch_auto.grad)
        print("b auto grad: ", b_torch_auto.grad)

        # get the expected result from the backward pass
        expected_dx, expected_dw, expected_db = \
            conv_backward_naive(dout.numpy(), cache)

        result_torch.backward(dout)

        # Hard-coded gradient w.r.t. the input that the manual path is
        # expected to reproduce to 4 decimals.
        approximate_expected_dx = np.array([[[
            [-0.0148, 0.0503, 0.1306, 0.1655, 0.1288],
            [0.1054, 0.1526, 0.1158, 0.0227, -0.0567],
            [0.1963, 0.2130, 0.0595, -0.1488, -0.2549],
            [0.1895, 0.1861, 0.0040, -0.2197, -0.3165],
            [0.0901, 0.0920, -0.0089, -0.1367, -0.1952]]]])

        print("manual torch grad: ", x_torch.grad)

        # Are the gradients correct?
        np.testing.assert_array_almost_equal(
            approximate_expected_dx, x_torch.grad, decimal=4,
            err_msg="Expected x is different from computed y.")

        # Looser comparison against the naive (uncompressed) gradient.
        self._check_delta2D(actual_result=x_torch.grad,
                            accurate_expected_result=expected_dx, delta=5.4)

        print("Expected fully correct dw: ", expected_dw)
        print("actual result for dw from y_torch.grad: ", y_torch.grad)

        # Hard-coded gradient w.r.t. the filter that the manual path is
        # expected to reproduce to 4 decimals.
        approximate_expected_dw = np.array([[[[1.2042, 2.0410],
                                              [1.6021, 1.6371]]]])

        np.testing.assert_array_almost_equal(
            approximate_expected_dw, y_torch.grad, decimal=4,
            err_msg="Expected x is different from computed y.")

        self._check_delta2D(actual_result=y_torch.grad,
                            accurate_expected_result=expected_dw, delta=4.0)

        np.testing.assert_array_almost_equal(b_torch.grad,
                                             expected_db)
예제 #10
0
'''VGG11/13/16/19 in Pytorch.'''
import torch.nn as nn
from cnns.nnlib.pytorch_layers.conv_picker import Conv
from cnns.nnlib.utils.arguments import Arguments
from cnns.nnlib.utils.general_utils import ConvType
from cnns.nnlib.utils.general_utils import CompressType
import torch

# Module-level default configuration.  conv3x3 below uses this object as the
# default value of its ``args`` parameter and ``args.compress_rate`` (as set
# here) as the default value of its ``compress_rate`` parameter.
args = Arguments()
args.conv_type = ConvType.FFT2D
args.compress_rate = 80.0
args.dtype = torch.float
args.preserve_energy = None
args.next_power2 = True
args.is_debug = False
args.compress_type = CompressType.STANDARD


def conv3x3(in_planes, out_planes, compress_rate = args.compress_rate,
            stride=1, padding=1, args=args):
    """Build a bias-free 3x3 convolution layer through ``Conv``.

    The requested ``compress_rate`` is written onto ``args`` before the
    layer is created, so the ``args`` object passed in (by default the
    module-level one) is modified as a side effect.

    :param in_planes: number of input channels.
    :param out_planes: number of output channels.
    :param compress_rate: value stored in ``args.compress_rate``.
    :param stride: stride handed to ``Conv``.
    :param padding: padding handed to ``Conv``.
    :param args: configuration object handed to ``Conv``.
    :return: whatever ``Conv(...).get_conv()`` returns.
    """
    args.compress_rate = compress_rate
    picker = Conv(
        kernel_sizes=[3],
        in_channels=in_planes,
        out_channels=[out_planes],
        strides=[stride],
        padding=[padding],
        args=args,
        is_bias=False,
    )
    return picker.get_conv()


cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
예제 #11
0
    def __init__(self,
                 in_channels=None,
                 out_channels=None,
                 kernel_size=None,
                 stride=1,
                 padding=0,
                 dilation=None,
                 groups=None,
                 bias=False,
                 weight_value=None,
                 bias_value=None,
                 is_manual=tensor([0]),
                 args=Arguments(),
                 out_size=None):
        """

        2D convolution using DCT.

        :param in_channels: (int) – Number of channels in the input series.
        :param out_channels: (int) – Number of channels produced by the
        convolution (equal to the number of filters in the given conv layer).
        :param kernel_size: (int) - Size of the convolving kernel (the width and
        height of the filter).
        :param stride: what is the stride for the convolution (the pattern for
        omitted values).
        :param padding: the padding added to the (top and bottom) and to the
        (left and right) of the input signal.
        :param dilation: (int) – Spacing between kernel elements. Values
        greater than 1 are rejected.
        :param groups: (int) – Number of blocked connections from input channels
        to output channels. Values greater than 1 are rejected.
        :param bias: (bool) - add bias or not
        :param weight_value: you can provide the initial filter, i.e.
        filter weights of shape (F, C, HH, WW), where
        F - number of filters, C - number of channels, HH - height of the
        filter, WW - width of the filter
        :param bias_value: you can provide the initial value of the bias,
        of shape (F,)
        :param is_manual: to check if the backward computation of convolution
        was computed manually.
        :param args: configuration object; this constructor reads its
        ``dtype``, ``stride_type``, ``compress_rate`` (how many frequency
        coefficients should be discarded), ``preserve_energy`` (how much
        energy should be preserved in the input image), ``next_power2``
        (should we extend the size of the input to the next power of 2),
        ``is_debug`` and ``compress_type`` attributes.
        :param out_size: what is the expected output size of the
        operation (when compression is used and the out_size is
        smaller than the size of the input to the convolution, then
        the max pooling can be omitted and the compression
        in this layer can serve as the frequency-based (spectral)
        pooling.

        :raises NotImplementedError: if dilation > 1 or groups > 1.
        :raises ValueError: if neither ``weight_value`` nor all of
        ``out_channels``, ``in_channels`` and ``kernel_size`` are given.
        :raises Exception: for an unsupported ``args.dtype`` or for
        asymmetric padding, stride or output size.

        Regarding the stride parameter: the number of pixels between
        adjacent receptive fields in the horizontal and vertical
        directions, we can generate the full output, and then remove the
        redundant elements according to the stride parameter. The more relevant
        method is to apply spectral pooling as a means to achieving the strided
        convolution.
        """
        # NOTE(review): the defaults for ``is_manual`` and ``args`` are built
        # once, when this function is defined, and are then shared by every
        # instance that omits them.
        super(ConvDCT, self).__init__()
        self.args = args

        if dilation is not None and dilation > 1:
            raise NotImplementedError("dilation > 1 is not supported.")
        if groups is not None and groups > 1:
            raise NotImplementedError("groups > 1 is not supported.")

        self.is_weight_value = None  # Was the filter value provided?
        if weight_value is None:
            self.is_weight_value = False
            if out_channels is None or in_channels is None or \
                    kernel_size is None:
                raise ValueError(
                    "Either specify weight_value or provide all "
                    "the required parameters (out_channels, "
                    "in_channels and kernel_size) to generate the "
                    "filter.")
            self.kernel_height, self.kernel_width = get_pair(kernel_size)
            if args.dtype is torch.float:
                weight = torch.randn(out_channels,
                                     in_channels,
                                     self.kernel_height,
                                     self.kernel_width,
                                     dtype=args.dtype)
            elif args.dtype is torch.half:
                # Sampled in the default dtype and converted afterwards.
                weight = torch.randn(out_channels, in_channels,
                                     self.kernel_height,
                                     self.kernel_width).to(torch.half)
            else:
                raise Exception(f"Unknown dtype in args: {args.dtype}")
            self.weight = Parameter(weight)
        else:
            # The provided filter is stored as given; its shape determines
            # the channel counts and the kernel size.
            self.is_weight_value = True
            self.weight = weight_value
            out_channels = weight_value.shape[0]
            in_channels = weight_value.shape[1]
            self.kernel_height = weight_value.shape[2]
            self.kernel_width = weight_value.shape[3]

        self.is_bias_value = None  # Was the bias value provided.
        if bias_value is None:
            self.is_bias_value = False
            if bias is True:
                self.bias = Parameter(
                    torch.randn(out_channels, dtype=args.dtype))
            else:
                self.register_parameter('bias', None)
                self.bias = None
        else:
            self.is_bias_value = True
            self.bias = bias_value

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.pad_H, self.pad_W = get_pair(value=padding,
                                          val_1_default=0,
                                          val2_default=0,
                                          name="padding")

        if self.pad_H != self.pad_W:
            raise Exception(
                "We only support a symmetric padding in the frequency domain.")

        self.stride_type = args.stride_type

        self.stride_H, self.stride_W = get_pair(value=stride,
                                                val_1_default=None,
                                                val2_default=None,
                                                name="stride")

        if self.stride_H != self.stride_W:
            raise Exception(
                "We only support a symmetric striding in the frequency domain."
            )

        self.is_manual = is_manual
        # Each instance takes the next value of the class-level counter.
        self.conv_index = ConvDCT.conv_index_counter
        ConvDCT.conv_index_counter += 1
        self.out_size = out_size

        self.out_size_H, self.out_size_W = get_pair(value=out_size,
                                                    val_1_default=None,
                                                    val2_default=None,
                                                    name="out_size")

        if self.out_size_H != self.out_size_W:
            raise Exception(
                "We only support a symmetric outputs in the frequency domain.")

        # NOTE(review): ``args`` is already dereferenced above
        # (``args.stride_type``), so passing ``args=None`` fails before this
        # point and the first branch below is currently unreachable.
        if args is None:
            self.compress_rate = None
            self.preserve_energy = None
            self.is_debug = False
            self.next_power2 = False
            self.compress_type = CompressType.STANDARD
        else:
            self.compress_rate = args.compress_rate
            self.preserve_energy = args.preserve_energy
            self.next_power2 = args.next_power2
            self.is_debug = args.is_debug
            self.compress_type = args.compress_type

        self.reset_parameters()
예제 #12
0
    def test_forward_pass_resnet18(self):
        """
        Time the forward pass of ResNet-18 over the CIFAR-10 training loader.

        The test returns early when no CUDA device is available.  For each
        (conv type, exec type) pair enabled below, a model is built with
        ``resnet18(args=args)``, put in eval mode on the GPU and run over
        every batch of the training loader; the total wall-clock time is
        printed.  Nothing is asserted.

        Timings recorded earlier by the author:

        total time for (ConvType.STANDARD2D-ConvExecType.SERIAL): 6.813918352127075
        total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.35197567939758
        total time for (ConvType.FFT2D-ConvExecType.SGEMM): 55.51149845123291

        total time for (ConvType.STANDARD2D-ConvExecType.SERIAL): 6.736859083175659
        total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.84979581832886
        total time for (ConvType.FFT2D-ConvExecType.SGEMM): 56.26755166053772

        global init time:  0.24471688270568848
        global pad time:  4.250756025314331
        (r)fft time:  8.754997730255127
        conjugate time:  3.734828233718872
        correlation time:  25.324009656906128
        restore time (de-compress/concat output):  0.021800994873046875
        i(r)fft time:  8.525353193283081
        total time for (ConvType.FFT2D-ConvExecType.SGEMM): 56.27733850479126
        GPU mem: 2903

        global init time:  0.2371835708618164
        global pad time:  4.492943286895752
        (r)fft time:  9.08437442779541
        conjugate time:  3.8394811153411865
        correlation time:  25.043412446975708
        restore time (de-compress/concat output):  0.021334409713745117
        i(r)fft time:  5.491833925247192
        total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.804604053497314
        GPU mem: 2679
        """
        if not torch.cuda.is_available():
            print("CUDA device is not available.")
            return

        device = torch.device("cuda")
        print("\ndevice used: ", str(device))

        C = 3
        args = Arguments()
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

        # Settings passed to the data loader factory.
        args.sample_count_limit = 10000
        args.min_batch_size = 32
        args.dataset_name = "cifar10"
        args.test_batch_size = args.min_batch_size
        args.network_type = NetworkType.ResNet18
        from cnns.nnlib.datasets.cifar import get_cifar
        train_loader, _, _, _ = get_cifar(
            args=args, dataset_name=args.dataset_name)

        repetition = 1

        # Settings for the model; assigned after get_cifar has run.
        args.in_channels = C
        args.compress_rate = None
        args.preserve_energy = 100
        args.is_debug = True
        args.next_power2 = True
        args.compress_type = CompressType.STANDARD
        args.tensor_type = TensorType.FLOAT32
        args.num_classes = 10
        args.test_batch_size = args.min_batch_size
        args.dtype = torch.float32
        conv_exec_types = [  # (ConvType.STANDARD2D, ConvExecType.SERIAL),
            # (ConvType.FFT2D, ConvExecType.CUDA),
            (ConvType.FFT2D, ConvExecType.SGEMM),
            # (ConvType.FFT2D, ConvExecType.CUDA_SHARED_LOG),
            # (ConvType.FFT2D, ConvExecType.CUDA_DEEP),
            # (ConvType.FFT2D, ConvExecType.SERIAL),
            # (ConvType.FFT2D, ConvExecType.BATCH),
        ]

        for conv_type, conv_exec_type in conv_exec_types:
            args.conv_type = conv_type
            args.conv_exec_type = conv_exec_type
            model = resnet18(args=args)
            model.to(device)
            model.eval()
            # CUDA work is queued asynchronously: drain the queue before
            # starting the timer and again before reading it, so the interval
            # covers exactly the forward passes below.
            torch.cuda.synchronize()
            start_eval = time.perf_counter()
            for _ in range(repetition):
                for inputs, _ in train_loader:
                    inputs = inputs.to(device)
                    model(inputs)
            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_eval
            print(f"total time for ({conv_type}-{conv_exec_type}):"
                  f" {elapsed}")
예제 #13
0
    def test_forward_backward_performance(self):
        dtype = torch.float
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        print("device used: ", str(device))

        N, C, H, W, K, HH, WW, padding = 32, 3, 32, 32, 64, 3, 3, 0

        natural_image = True
        if natural_image:
            x = cifar10_image[:, :1, :H, :W]
            x_new = x.expand(N, C, -1, -1).clone()  # specifies new size
            del x
            print("x size: ", x_new.size())
            x = x_new.to(device)
            x.requires_grad_(True)
        else:
            x = torch.randn(N,
                            C,
                            H,
                            W,
                            dtype=dtype,
                            device=device,
                            requires_grad=True)
        x_expect = x.clone().detach().requires_grad_(True)
        y = torch.randn(K,
                        C,
                        HH,
                        WW,
                        dtype=dtype,
                        device=device,
                        requires_grad=True)
        y_expect = y.clone().detach().requires_grad_(True)

        print("input size: ", x.size())
        print("filter size: ", y.size())
        print("padding: ", padding)
        from .conv2D_fft import global_threshold
        repetitions = global_threshold
        print("repetitions: ", repetitions)
        preserve_energy = 80
        print("preserve energy: ", preserve_energy)
        stride = 1
        print("stride: ", stride)
        next_power2 = True
        print("next_power2: ", str(next_power2))
        print("cuda exec type: ", self.conv_exec_type.name)
        compress_rate = 0.0
        print("compress rate: ", compress_rate)

        # warm-up
        torch.nn.functional.conv2d(input=x_expect,
                                   weight=y_expect,
                                   stride=stride,
                                   padding=padding)

        start = time.time()
        for _ in range(repetitions):
            convStandard = torch.nn.functional.conv2d(input=x_expect,
                                                      weight=y_expect,
                                                      stride=stride,
                                                      padding=padding)
        convStandardTime = time.time() - start
        print("convStandard time: ", convStandardTime)

        conv = Conv2dfft(weight_value=y,
                         stride=stride,
                         bias=False,
                         padding=padding,
                         args=Arguments(stride_type=StrideType.STANDARD,
                                        min_batch_size=N,
                                        is_debug=True,
                                        preserved_energy=preserve_energy,
                                        next_power2=next_power2,
                                        conv_exec_type=self.conv_exec_type,
                                        compress_rate=compress_rate,
                                        compress_rates=[compress_rate]))

        # warm-up
        conv.forward(input=x)

        start = time.time()
        for _ in range(repetitions):
            convFFT = conv.forward(input=x)
        convFFTtime = time.time() - start
        print("convFFT time: ", convFFTtime)
        speedup = convFFTtime / convStandardTime
        print(f"Pytorch forward pass speedup is: {speedup}")

        if compress_rate == 0.0 and preserve_energy == 100:
            np.testing.assert_array_almost_equal(
                x=convStandard.cpu().detach().numpy(),
                y=convFFT.cpu().detach().numpy(),
                decimal=1,
                err_msg=
                "The expected array x and computed y are not almost equal.")

        dout = torch.randn(list(convStandard.size()),
                           device=device,
                           dtype=dtype)
        dout_clone = dout.clone()

        # warm-up
        convStandard.backward(dout, retain_graph=True)

        standard_back_time_start = time.time()
        for _ in range(repetitions):
            convStandard.backward(dout, retain_graph=True)
        standard_back_time = time.time() - standard_back_time_start
        print("standard back time: ", standard_back_time)

        # warm-up
        convFFT.backward(dout_clone, retain_graph=True)

        fft_back_time_start = time.time()
        for _ in range(repetitions):
            convFFT.backward(dout_clone, retain_graph=True)
        conv_fft_back_time = time.time() - fft_back_time_start
        assert conv.is_manual[0] == 1
        print("conv fft back time: ", conv_fft_back_time)
        speedup = conv_fft_back_time / standard_back_time
        print(f"Pytorch speedup for backprop: {speedup}")

        full_pass_fft = convFFTtime + conv_fft_back_time
        print("full pass fft:", full_pass_fft)
        full_pass_pytorch = convStandardTime + standard_back_time
        print("full pass pytorch: ", full_pass_pytorch)
        speedup_full_pass = full_pass_fft / full_pass_pytorch
        print(f"Pytorch speedup for full pass: {speedup_full_pass}")

        if compress_rate == 0.0 and preserve_energy == 100:
            np.testing.assert_array_almost_equal(
                x.grad.cpu().detach().numpy(),
                x_expect.grad.cpu().detach().numpy(),
                decimal=1)

            np.testing.assert_array_almost_equal(
                y.grad.cpu().detach().numpy(),
                y_expect.grad.cpu().detach().numpy(),
                decimal=1)
예제 #14
0
    def test_forward_backward(self):
        """Compare Conv2dfft with torch.nn.functional.conv2d, forward and back.

        Random input of size (128, 16, 32, 32) and random filters of size
        (16, 16, 3, 3) are pushed through both implementations with stride 1.
        The forward outputs and the gradients with respect to the input and
        the filters must agree to 3 decimal places. The wall-clock time of
        each single pass is printed for information only; nothing is asserted
        about speed.

        Timings use ``time.perf_counter`` and, on CUDA, are bracketed by
        ``torch.cuda.synchronize()``. CUDA kernels are launched
        asynchronously, so without the synchronisation only the launch
        overhead would be measured.
        """
        dtype = torch.float
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device used: ", str(device))

        def timed(fn):
            """Run ``fn()`` and return ``(result, elapsed_seconds)``."""
            on_cuda = device.type == "cuda"
            if on_cuda:
                # Drain previously queued kernels so they are not billed
                # to the call being measured.
                torch.cuda.synchronize()
            start = time.perf_counter()
            result = fn()
            if on_cuda:
                # Wait for the kernels launched by fn() to really finish.
                torch.cuda.synchronize()
            return result, time.perf_counter() - start

        N, C, H, W = 128, 16, 32, 32
        K, HH, WW = 16, 3, 3
        x = torch.randn(N, C, H, W,
                        dtype=dtype,
                        device=device,
                        requires_grad=True)
        # Independent leaf copies, so that each implementation accumulates
        # its gradients into its own tensors.
        x_expect = x.clone().detach().requires_grad_(True)
        y = torch.randn(K, C, HH, WW,
                        dtype=dtype,
                        device=device,
                        requires_grad=True)
        y_expect = y.clone().detach().requires_grad_(True)

        # Reference forward pass.
        convStandard, convStandardTime = timed(
            lambda: torch.nn.functional.conv2d(input=x_expect,
                                               weight=y_expect,
                                               stride=1))
        print("convStandard time: ", convStandardTime)

        # FFT-based forward pass.
        conv = Conv2dfft(weight_value=y,
                         stride=1,
                         bias=False,
                         args=Arguments(stride_type=StrideType.STANDARD))
        convFFT, convFFTtime = timed(lambda: conv.forward(input=x))
        print("convFFT time: ", convFFTtime)
        # The ratio is FFT time over standard time, i.e. how many times
        # faster the built-in PyTorch convolution is.
        speedup = convFFTtime / convStandardTime
        print(f"Pytorch forward pass speedup is: {speedup} X")

        np.testing.assert_array_almost_equal(
            x=convStandard.cpu().detach().numpy(),
            y=convFFT.cpu().detach().numpy(),
            decimal=3,
            err_msg="The expected array x and computed y are not almost equal."
        )

        # The same upstream gradient is fed to both graphs.
        dout = torch.randn(list(convStandard.size()),
                           device=device,
                           dtype=dtype)
        dout_clone = dout.clone()

        _, standard_back_time = timed(lambda: convStandard.backward(dout))
        print("standard back time: ", standard_back_time)

        _, conv_fft_back_time = timed(lambda: convFFT.backward(dout_clone))
        # NOTE(review): is_manual is presumably set to 1 by the layer's own
        # (manual) backward implementation - confirm in Conv2dfft.
        assert conv.is_manual[0] == 1
        print("conv fft back time: ", conv_fft_back_time)
        speedup = conv_fft_back_time / standard_back_time
        print(f"Pytorch speedup for backprop: {speedup} X")

        # Gradient w.r.t. the input.
        np.testing.assert_array_almost_equal(
            x.grad.cpu().detach().numpy(),
            x_expect.grad.cpu().detach().numpy(),
            decimal=3)

        # Gradient w.r.t. the filters.
        np.testing.assert_array_almost_equal(
            y.grad.cpu().detach().numpy(),
            y_expect.grad.cpu().detach().numpy(),
            decimal=3)
예제 #15
0
    def test_forward_timing(self):
        """
        Benchmark the forward pass of Conv2dfft against torch.nn.Conv2d.

        For every (N, C, H, W, K, HH, WW, padding) configuration in the list
        below, the standard layer and the FFT layer are each run
        ``repetitions`` times on the same input and the total wall-clock
        times are printed together with their ratio (FFT time divided by
        standard time). Nothing is asserted; the test only reports timings.

        Both layers get one untimed warm-up pass, and on CUDA the timed
        regions are bracketed by ``torch.cuda.synchronize()`` because CUDA
        kernels run asynchronously with respect to the host clock.

        Sample output of an earlier recorded run (taken before the warm-up
        and synchronisation were added to the measurement):

        device used:  cuda
        x size:  torch.Size([32, 3, 32, 32])
        input size:  torch.Size([32, 3, 32, 32])
        filter size:  torch.Size([64, 3, 3, 3])
        padding:  0
        repetitions:  1000
        preserve energy:  100
        next_power2:  False
        cuda exec type:  CUDA
        output size:  torch.Size([32, 64, 30, 30])
        PyTorch conv2D:  0.6601831912994385
        compress rate:  80.0
        conv FFT time:  16.30516004562378
        Pytorch speedup: 24.697932726112484
        """
        dtype = torch.float
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print("\nTorch CUDA is available")
        else:
            device = torch.device("cpu")
        print("device used: ", str(device))

        def sync():
            """Block until all queued CUDA work is done (no-op on CPU)."""
            if device.type == "cuda":
                torch.cuda.synchronize()

        # Alternative configurations are kept commented out so they can be
        # switched on by hand:
        #   1st layer:  32, 3, 32, 32, 64, 3, 3
        #   7th layer:  32, 256, 4, 4, 256, 3, 3
        #   last layer: 32, 256, 4, 4, 512, 3, 3
        for N, C, H, W, K, HH, WW, padding in [
            (32, 3, 32, 32, 64, 3, 3, 0),
                # (32, 3, 32, 32, 64, 3, 3, 1),
                # (32, 3, 32, 32, 64, 7, 7, 3),
                # (32, 64, 16, 16, 64, 3, 3, 1),
                # (32, 256, 4, 4, 256, 3, 3, 1),
                # (32, 512, 2, 2, 512, 3, 3, 1),
        ]:
            natural_image = True
            if natural_image:
                # Take one channel of the image, crop it to H x W and
                # replicate it to the requested batch and channel counts.
                # NOTE(review): expand() needs the batch dimension of
                # cifar10_image to be 1 (or N) - confirm.
                x = cifar10_image[:, :1, :H, :W].expand(N, C, -1, -1).clone()
                print("x size: ", x.size())
                x = x.to(device)
            else:
                x = torch.randn(N, C, H, W, dtype=dtype, device=device)

            y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)

            print("input size: ", x.size())
            print("filter size: ", y.size())
            print("padding: ", padding)
            repetitions = 1000
            print("repetitions: ", repetitions)
            preserve_energy = 100
            print("preserve energy: ", preserve_energy)
            stride = 1
            next_power2 = False
            print("next_power2: ", str(next_power2))
            print("cuda exec type: ", self.conv_exec_type.name)

            convStandard = torch.nn.Conv2d(in_channels=C,
                                           out_channels=K,
                                           kernel_size=(HH, WW),
                                           stride=stride,
                                           padding=padding)
            convStandard.to(device)
            # This first pass doubles as the warm-up for the standard layer.
            out_standard = convStandard.forward(x)
            print("output size: ", out_standard.size())

            sync()
            start = time.perf_counter()
            for _ in range(repetitions):
                convStandard.forward(x)
            sync()
            convStandardTime = time.perf_counter() - start
            print("PyTorch conv2D: ", convStandardTime)

            # To sweep compression rates use: range(0, 86, 5)
            for compress_rate in [80.0]:
                compress_rate = float(compress_rate)
                print("compress rate: ", compress_rate)

                conv = Conv2dfft(weight_value=y,
                                 stride=stride,
                                 padding=padding,
                                 args=Arguments(
                                     stride_type=StrideType.STANDARD,
                                     min_batch_size=N,
                                     is_debug=True,
                                     preserved_energy=preserve_energy,
                                     next_power2=next_power2,
                                     conv_exec_type=self.conv_exec_type,
                                     compress_rate=compress_rate,
                                     compress_rates=[compress_rate]))
                conv.to(device)

                # Warm-up, so one-off setup cost is not billed to the FFT
                # layer only (the standard layer had its warm-up above).
                conv.forward(input=x)

                sync()
                start = time.perf_counter()
                for _ in range(repetitions):
                    conv.forward(input=x)
                sync()
                convFFTtime = time.perf_counter() - start
                print("conv FFT time: ", convFFTtime)
                # FFT time over standard time: how many times faster the
                # built-in PyTorch layer is.
                speedup = convFFTtime / convStandardTime
                print(f"Pytorch speedup: {speedup}")