def test_FunctionForwardCompressionConvFFTIndexBackCifar10LeNet1stLayer(
        self):
    start = time.time()
    x = cifar10_image
    print("shape of the input image: ", x.size())
    y = cifar10_lenet_filter
    print("shape of the filter: ", y.size())
    b = torch.tensor([0.0])
    # Get the expected result from PyTorch's conv2d.
    expected_result_tensor = F.conv2d(input=x, weight=y, bias=b)
    N, C, H, W = x.size()
    K, C, HH, WW = y.size()
    out_size = H - HH + 1
    fft_size = H + out_size - 1
    half_fft_size = fft_size // 2 + 1
    fft_numel = half_fft_size * fft_size * C
    # for compress_rate in range(1, fft_numel, 10):
    for index_back in range(1, 2):
        print("index back: ", index_back)
        conv = Conv2dfft(weight_value=y, bias_value=b,
                         args=Arguments(
                             index_back=index_back,
                             preserve_energy=100,
                             is_debug=True,
                             next_power2=False,
                             compress_type=CompressType.STANDARD))
        result = conv.forward(input=x)
        result = result.float()
        abs_error = torch.sum(
            torch.abs(result - expected_result_tensor)).item()
        print("abs error: ", abs_error)
        expected_total = torch.sum(
            torch.abs(expected_result_tensor) + torch.abs(result))
        relative_error = 100.0 * abs_error / expected_total
        print("relative error: ", relative_error)
        print(f"absolute divergence for index back,{index_back},"
              f"absolute error,{abs_error},"
              f"relative error (%),{relative_error}")
    print("elapsed: ", time.time() - start)
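# A minimal sketch (not part of the test above) checking the fft_numel
# arithmetic against the modern torch.fft API, assumed available
# (PyTorch >= 1.8). The shapes are hypothetical stand-ins for the CIFAR-10
# LeNet 1st-layer case; the real values come from cifar10_image and
# cifar10_lenet_filter.
def _check_fft_numel_sketch():
    C, H, HH = 3, 32, 5  # hypothetical: channels, input size, filter size
    out_size = H - HH + 1
    fft_size = H + out_size - 1
    x = torch.zeros(C, fft_size, fft_size)
    # rfft2 keeps only the non-redundant half of the last dimension.
    spectrum = torch.fft.rfft2(x)  # shape: (C, fft_size, fft_size // 2 + 1)
    assert spectrum.numel() == (fft_size // 2 + 1) * fft_size * C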
def test_FunctionForwardCompressionConvFFTPreserveEnergyCifar10LeNet1stLayer(
        self):
    print("\n")
    x = cifar10_image
    print("shape of the input image: ", x.size())
    y = cifar10_lenet_filter
    print("shape of the filter: ", y.size())
    b = torch.tensor([0.0])
    preserved_energies = [100., 99., 98.5, 98., 97., 96., 95., 94., 93.,
                          92., 91., 90., 89., 87., 85., 80., 70., 60.,
                          50., 40., 10., 5., 1.]
    # preserved_energies = [1.0]
    # compress_rates = [1, 2, 4, 8, 16, 32, 64, 128, 256]
    # Get the expected result from PyTorch's conv2d.
    expected_result_tensor = F.conv2d(input=x, weight=y, bias=b)
    for preserve_energy in preserved_energies:
        conv = Conv2dfft(weight_value=y, bias_value=b,
                         args=Arguments(
                             preserve_energy=preserve_energy,
                             index_back=0,
                             is_debug=True,
                             next_power2=True,
                             compress_type=CompressType.STANDARD))
        result = conv.forward(input=x)
        result = result.float()
        abs_error = torch.sum(
            torch.abs(result - expected_result_tensor)).item()
        expected_total = torch.sum(torch.abs(expected_result_tensor))
        relative_error = abs_error / expected_total * 100.0
        print(f"absolute divergence for preserved energy,{preserve_energy},"
              f"absolute error,{abs_error},"
              f"relative error (%),{relative_error}")
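# A hedged sketch (an illustration, not the library's implementation) of what
# an energy-preserving cutoff can look like: keep the largest-magnitude
# spectral coefficients until `preserve_energy` percent of the total squared
# magnitude (energy) is retained, and report how many coefficients survive.
def _energy_cutoff_count_sketch(spectrum, preserve_energy=90.0):
    energies = (spectrum.abs() ** 2).flatten()
    sorted_energies, _ = torch.sort(energies, descending=True)
    cumulative = torch.cumsum(sorted_energies, dim=0)
    threshold = cumulative[-1].item() * preserve_energy / 100.0
    # Smallest prefix of the sorted energies whose sum reaches the threshold.
    kept = int((cumulative < threshold).sum().item()) + 1
    return kept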
def run():
    N, C, H, W = 16, 3, 32, 32
    F = 32  # number of filters (note: shadows torch.nn.functional as F here)
    HH, WW = 3, 3
    if torch.cuda.is_available():
        print("Cuda is available.")
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    x = torch.randn(N, C, H, W, device=device)
    y = torch.randn(F, C, HH, WW, device=device)
    b = torch.randn(N, F, H, W, device=device)
    layer_cpp = Conv2dfftCpp(weight_value=y, padding=2)
    layer_python = Conv2dfft(weight_value=y, padding=2)
    time_it(layer=layer_cpp, name="cpp")
    time_it(layer=layer_python, name="python")
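# `time_it` is called above but not defined in this section. A minimal sketch
# of what it is assumed to do: the `layer` and `name` keywords are taken from
# the calls in run(); the input shape, repetition count, and printed format
# are assumptions for illustration only.
def time_it(layer, name, repetitions=10):
    device = (torch.device('cuda') if torch.cuda.is_available()
              else torch.device('cpu'))
    # Hypothetical input matching the shapes used in run().
    x = torch.randn(16, 3, 32, 32, device=device)
    start = time.time()
    for _ in range(repetitions):
        layer.forward(x)
    print(f"{name} forward time for {repetitions} repetitions: ",
          time.time() - start)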
def get_conv(self, param_index=0, compress_rate=None):
    if param_index == 0:
        in_channels = self.in_channels
    else:
        in_channels = self.out_channels[param_index - 1]
    if compress_rate is None:
        compress_rate = self.compress_rate
    if self.conv_type is ConvType.STANDARD:
        return nn.Conv1d(in_channels=in_channels,
                         out_channels=self.out_channels[param_index],
                         stride=self.strides[param_index],
                         kernel_size=self.kernel_sizes[param_index],
                         padding=self.padding[param_index],
                         bias=self.is_bias)
    elif self.conv_type is ConvType.STANDARD2D:
        return nn.Conv2d(in_channels=in_channels,
                         out_channels=self.out_channels[param_index],
                         stride=self.strides[param_index],
                         kernel_size=self.kernel_sizes[param_index],
                         padding=self.padding[param_index],
                         bias=self.is_bias)
    elif self.conv_type is ConvType.FFT1D:
        return Conv1dfft(in_channels=in_channels,
                         out_channels=self.out_channels[param_index],
                         stride=self.strides[param_index],
                         kernel_size=self.kernel_sizes[param_index],
                         padding=self.padding[param_index],
                         bias=self.is_bias,
                         args=self.args)
    elif self.conv_type is ConvType.FFT2D:
        return Conv2dfft(in_channels=in_channels,
                         out_channels=self.out_channels[param_index],
                         stride=self.strides[param_index],
                         kernel_size=self.kernel_sizes[param_index],
                         padding=self.padding[param_index],
                         bias=self.is_bias,
                         args=self.args)
    elif self.conv_type is ConvType.DCT:
        return ConvDCT(in_channels=in_channels,
                       out_channels=self.out_channels[param_index],
                       stride=self.strides[param_index],
                       kernel_size=self.kernel_sizes[param_index],
                       padding=self.padding[param_index],
                       bias=self.is_bias,
                       args=self.args)
    elif self.conv_type is ConvType.AUTOGRAD:
        return Conv1dfftAutograd(
            in_channels=in_channels,
            out_channels=self.out_channels[param_index],
            stride=self.strides[param_index],
            kernel_size=self.kernel_sizes[param_index],
            padding=self.padding[param_index],
            index_back=compress_rate,
            bias=self.is_bias)
    elif self.conv_type is ConvType.AUTOGRAD2D:
        return Conv2dfftAutograd(
            in_channels=in_channels,
            out_channels=self.out_channels[param_index],
            stride=self.strides[param_index],
            kernel_size=self.kernel_sizes[param_index],
            padding=self.padding[param_index],
            bias=self.is_bias,
            args=self.args)
    elif self.conv_type is ConvType.SIMPLE_FFT:
        return Conv1dfftSimple(
            in_channels=in_channels,
            out_channels=self.out_channels[param_index],
            stride=self.strides[param_index],
            kernel_size=self.kernel_sizes[param_index],
            padding=self.padding[param_index],
            index_back=compress_rate,
            bias=self.is_bias)
    elif self.conv_type is ConvType.SIMPLE_FFT_FOR_LOOP:
        return Conv1dfftSimpleForLoop(
            in_channels=in_channels,
            out_channels=self.out_channels[param_index],
            stride=self.strides[param_index],
            kernel_size=self.kernel_sizes[param_index],
            padding=self.padding[param_index],
            index_back=compress_rate,
            bias=self.is_bias)
    elif self.conv_type is ConvType.COMPRESS_INPUT_ONLY:
        return Conv1dfftCompressSignalOnly(
            in_channels=in_channels,
            out_channels=self.out_channels[param_index],
            stride=self.strides[param_index],
            kernel_size=self.kernel_sizes[param_index],
            padding=self.padding[param_index],
            index_back=compress_rate,
            preserve_energy=self.preserve_energy,
            bias=self.is_bias)
    else:
        raise Exception(CONV_TYPE_ERROR)
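# A hedged design note: the elif chain above can be collapsed into a dispatch
# table keyed by ConvType. A minimal sketch covering the two standard
# branches only; the FFT/DCT variants differ in their extra `args` and
# `index_back` keywords and would be added analogously. This is an
# alternative formulation, not the method the class actually uses.
def get_conv_via_dispatch_sketch(self, param_index=0):
    in_channels = (self.in_channels if param_index == 0
                   else self.out_channels[param_index - 1])
    kwargs = dict(in_channels=in_channels,
                  out_channels=self.out_channels[param_index],
                  stride=self.strides[param_index],
                  kernel_size=self.kernel_sizes[param_index],
                  padding=self.padding[param_index],
                  bias=self.is_bias)
    dispatch = {ConvType.STANDARD: nn.Conv1d,
                ConvType.STANDARD2D: nn.Conv2d}
    if self.conv_type not in dispatch:
        raise Exception(CONV_TYPE_ERROR)
    return dispatch[self.conv_type](**kwargs)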
def test_forward_backward_performance(self):
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    N, C, H, W, K, HH, WW, padding = 32, 3, 32, 32, 64, 3, 3, 0
    natural_image = True
    if natural_image:
        x = cifar10_image[:, :1, :H, :W]
        # Expand to (N, C, H, W); -1 keeps the existing spatial sizes.
        x_new = x.expand(N, C, -1, -1).clone()
        del x
        print("x size: ", x_new.size())
        x = x_new.to(device)
        x.requires_grad_(True)
    else:
        x = torch.randn(N, C, H, W, dtype=dtype, device=device,
                        requires_grad=True)
    x_expect = x.clone().detach().requires_grad_(True)
    y = torch.randn(K, C, HH, WW, dtype=dtype, device=device,
                    requires_grad=True)
    y_expect = y.clone().detach().requires_grad_(True)
    print("input size: ", x.size())
    print("filter size: ", y.size())
    print("padding: ", padding)
    from .conv2D_fft import global_threshold
    repetitions = global_threshold
    print("repetitions: ", repetitions)
    preserve_energy = 80
    print("preserve energy: ", preserve_energy)
    stride = 1
    print("stride: ", stride)
    next_power2 = True
    print("next_power2: ", str(next_power2))
    print("cuda exec type: ", self.conv_exec_type.name)
    compress_rate = 0.0
    print("compress rate: ", compress_rate)

    # Warm-up before timing the standard convolution.
    torch.nn.functional.conv2d(input=x_expect, weight=y_expect,
                               stride=stride, padding=padding)
    start = time.time()
    for _ in range(repetitions):
        convStandard = torch.nn.functional.conv2d(input=x_expect,
                                                  weight=y_expect,
                                                  stride=stride,
                                                  padding=padding)
    convStandardTime = time.time() - start
    print("convStandard time: ", convStandardTime)

    conv = Conv2dfft(weight_value=y, stride=stride, bias=False,
                     padding=padding,
                     args=Arguments(stride_type=StrideType.STANDARD,
                                    min_batch_size=N,
                                    is_debug=True,
                                    preserved_energy=preserve_energy,
                                    next_power2=next_power2,
                                    conv_exec_type=self.conv_exec_type,
                                    compress_rate=compress_rate,
                                    compress_rates=[compress_rate]))
    # Warm-up before timing the FFT convolution.
    conv.forward(input=x)
    start = time.time()
    for _ in range(repetitions):
        convFFT = conv.forward(input=x)
    convFFTtime = time.time() - start
    print("convFFT time: ", convFFTtime)
    speedup = convFFTtime / convStandardTime
    print(f"Pytorch forward pass speedup is: {speedup}")

    # The correctness check only makes sense without compression.
    if compress_rate == 0.0 and preserve_energy == 100:
        np.testing.assert_array_almost_equal(
            x=convStandard.cpu().detach().numpy(),
            y=convFFT.cpu().detach().numpy(), decimal=1,
            err_msg="The expected array x and computed y are not almost "
                    "equal.")

    dout = torch.randn(list(convStandard.size()), device=device,
                       dtype=dtype)
    dout_clone = dout.clone()

    # Warm-up before timing the standard backward pass.
    convStandard.backward(dout, retain_graph=True)
    standard_back_time_start = time.time()
    for _ in range(repetitions):
        convStandard.backward(dout, retain_graph=True)
    standard_back_time = time.time() - standard_back_time_start
    print("standard back time: ", standard_back_time)

    # Warm-up before timing the FFT backward pass.
    convFFT.backward(dout_clone, retain_graph=True)
    fft_back_time_start = time.time()
    for _ in range(repetitions):
        convFFT.backward(dout_clone, retain_graph=True)
    conv_fft_back_time = time.time() - fft_back_time_start
    assert conv.is_manual[0] == 1
    print("conv fft back time: ", conv_fft_back_time)
    speedup = conv_fft_back_time / standard_back_time
    print(f"Pytorch speedup for backprop: {speedup}")

    full_pass_fft = convFFTtime + conv_fft_back_time
    print("full pass fft:", full_pass_fft)
    full_pass_pytorch = convStandardTime + standard_back_time
    print("full pass pytorch: ", full_pass_pytorch)
    speedup_full_pass = full_pass_fft / full_pass_pytorch
    print(f"Pytorch speedup for full pass: {speedup_full_pass}")

    if compress_rate == 0.0 and preserve_energy == 100:
        np.testing.assert_array_almost_equal(
            x.grad.cpu().detach().numpy(),
            x_expect.grad.cpu().detach().numpy(), decimal=1)
        np.testing.assert_array_almost_equal(
            y.grad.cpu().detach().numpy(),
            y_expect.grad.cpu().detach().numpy(), decimal=1)
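# A hedged note on the timing above: time.time() measures host time only, and
# CUDA kernel launches are asynchronous, so on GPU a loop can return before
# the device work finishes. A minimal synchronized-timer sketch (an addition
# for illustration, not part of the original test):
def _timed_sketch(fn, repetitions, device):
    if device.type == "cuda":
        torch.cuda.synchronize()  # drain pending kernels before starting
    start = time.time()
    for _ in range(repetitions):
        fn()
    if device.type == "cuda":
        torch.cuda.synchronize()  # wait for all timed kernels to finish
    return time.time() - start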
def test_forward_backward(self):
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    N, C, H, W = 128, 16, 32, 32
    K, HH, WW = 16, 3, 3
    x = torch.randn(N, C, H, W, dtype=dtype, device=device,
                    requires_grad=True)
    x_expect = x.clone().detach().requires_grad_(True)
    y = torch.randn(K, C, HH, WW, dtype=dtype, device=device,
                    requires_grad=True)
    y_expect = y.clone().detach().requires_grad_(True)

    start = time.time()
    convStandard = torch.nn.functional.conv2d(input=x_expect,
                                              weight=y_expect, stride=1)
    convStandardTime = time.time() - start
    print("convStandard time: ", convStandardTime)

    conv = Conv2dfft(weight_value=y, stride=1, bias=False,
                     args=Arguments(stride_type=StrideType.STANDARD))
    start = time.time()
    convFFT = conv.forward(input=x)
    convFFTtime = time.time() - start
    print("convFFT time: ", convFFTtime)
    speedup = convFFTtime / convStandardTime
    print(f"Pytorch forward pass speedup is: {speedup} X")

    np.testing.assert_array_almost_equal(
        x=convStandard.cpu().detach().numpy(),
        y=convFFT.cpu().detach().numpy(), decimal=3,
        err_msg="The expected array x and computed y are not almost equal.")

    dout = torch.randn(list(convStandard.size()), device=device,
                       dtype=dtype)
    dout_clone = dout.clone()

    standard_back_time_start = time.time()
    convStandard.backward(dout)
    standard_back_time = time.time() - standard_back_time_start
    print("standard back time: ", standard_back_time)

    fft_back_time_start = time.time()
    convFFT.backward(dout_clone)
    conv_fft_back_time = time.time() - fft_back_time_start
    # Confirm that the custom (manual) backward pass was actually used.
    assert conv.is_manual[0] == 1
    print("conv fft back time: ", conv_fft_back_time)
    speedup = conv_fft_back_time / standard_back_time
    print(f"Pytorch speedup for backprop: {speedup} X")

    np.testing.assert_array_almost_equal(
        x.grad.cpu().detach().numpy(),
        x_expect.grad.cpu().detach().numpy(), decimal=3)
    np.testing.assert_array_almost_equal(
        y.grad.cpu().detach().numpy(),
        y_expect.grad.cpu().detach().numpy(), decimal=3)
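# A hedged aside, not in the original tests: torch.autograd.gradcheck gives a
# stricter verification of a custom backward than comparing gradients against
# conv2d at decimal=3, but it requires float64 inputs; whether Conv2dfft
# supports float64 end-to-end is an assumption here, so the sketch is left
# commented out.
#
# tiny_x = torch.randn(1, 2, 8, 8, dtype=torch.double, requires_grad=True)
# tiny_w = torch.randn(2, 2, 3, 3, dtype=torch.double)
# torch.autograd.gradcheck(Conv2dfft(weight_value=tiny_w, bias=False),
#                          (tiny_x,))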
def test_forward_timing(self):
    """
    Sample output from a CUDA run:

    device used: cuda
    x size: torch.Size([32, 3, 32, 32])
    input size: torch.Size([32, 3, 32, 32])
    filter size: torch.Size([64, 3, 3, 3])
    padding: 0
    repetitions: 1000
    preserve energy: 100
    next_power2: False
    cuda exec type: CUDA
    output size: torch.Size([32, 64, 30, 30])
    PyTorch conv2D: 0.6601831912994385
    compress rate: 80.0
    conv FFT time: 16.30516004562378
    Pytorch speedup: 24.697932726112484
    """
    dtype = torch.float
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("\nTorch CUDA is available")
    else:
        device = torch.device("cpu")
    print("device used: ", str(device))
    # 1st layer:  N, C, H, W, K, HH, WW = 32, 3, 32, 32, 64, 3, 3
    # 7th layer:  N, C, H, W, K, HH, WW = 32, 256, 4, 4, 256, 3, 3
    # last layer: N, C, H, W, K, HH, WW = 32, 256, 4, 4, 512, 3, 3
    for N, C, H, W, K, HH, WW, padding in [
        (32, 3, 32, 32, 64, 3, 3, 0),
        # (32, 3, 32, 32, 64, 3, 3, 1),
        # (32, 3, 32, 32, 64, 7, 7, 3),
        # (32, 64, 16, 16, 64, 3, 3, 1),
        # (32, 256, 4, 4, 256, 3, 3, 1),
        # (32, 512, 2, 2, 512, 3, 3, 1),
    ]:
        natural_image = True
        if natural_image:
            x = cifar10_image[:, :1, :H, :W]
            # Expand to (N, C, H, W); -1 keeps the existing spatial sizes.
            x_new = x.expand(N, C, -1, -1).clone()
            del x
            print("x size: ", x_new.size())
            x = x_new.to(device)
        else:
            x = torch.randn(N, C, H, W, dtype=dtype, device=device)
        y = torch.randn(K, C, HH, WW, dtype=dtype, device=device)
        print("input size: ", x.size())
        print("filter size: ", y.size())
        print("padding: ", padding)
        repetitions = 1000
        print("repetitions: ", repetitions)
        preserve_energy = 100
        print("preserve energy: ", preserve_energy)
        stride = 1
        next_power2 = False
        print("next_power2: ", str(next_power2))
        print("cuda exec type: ", self.conv_exec_type.name)

        convStandard = torch.nn.Conv2d(in_channels=C, out_channels=K,
                                       kernel_size=(HH, WW), stride=stride,
                                       padding=padding)
        convStandard.to(device)
        out_standard = convStandard.forward(x)
        print("output size: ", out_standard.size())

        start = time.time()
        for repeat in range(repetitions):
            convStandard.forward(x)
        convStandardTime = time.time() - start
        print("PyTorch conv2D: ", convStandardTime)

        # for compress_rate in range(0, 86, 5):
        for compress_rate in [80.0]:
            compress_rate = float(compress_rate)
            print("compress rate: ", compress_rate)
            conv = Conv2dfft(weight_value=y, stride=stride, padding=padding,
                             args=Arguments(
                                 stride_type=StrideType.STANDARD,
                                 min_batch_size=N,
                                 is_debug=True,
                                 preserved_energy=preserve_energy,
                                 next_power2=next_power2,
                                 conv_exec_type=self.conv_exec_type,
                                 compress_rate=compress_rate,
                                 compress_rates=[compress_rate]))
            conv.to(device)
            start = time.time()
            for repeat in range(repetitions):
                conv.forward(input=x)
            convFFTtime = time.time() - start
            print("conv FFT time: ", convFFTtime)
            speedup = convFFTtime / convStandardTime
            print(f"Pytorch speedup: {speedup}")
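# A quick arithmetic check of the docstring's output size above: with
# H = W = 32, HH = WW = 3, padding = 0 and stride = 1, the standard
# convolution output is (H + 2 * padding - HH) // stride + 1 = 30 per spatial
# dimension, matching torch.Size([32, 64, 30, 30]). The helper name below is
# an illustration, not part of the original file.
def _expected_out_size(size, filter_size, padding, stride):
    # Standard cross-correlation output-size formula.
    return (size + 2 * padding - filter_size) // stride + 1

assert _expected_out_size(32, 3, padding=0, stride=1) == 30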