device = torch.device("cpu") print("device used: ", str(device)) args = Arguments() args.in_channels = 3 # args.conv_type = "FFT2D" args.conv_type = ConvType.STANDARD2D args.compress_rate = None args.preserve_energy = None args.is_debug = False args.next_power2 = True args.compress_type = CompressType.STANDARD args.tensor_type = TensorType.FLOAT32 args.num_classes = 10 args.min_batch_size = 16 args.test_batch_size = 16 batch_size = 16 inputs = torch.randn(batch_size, args.in_channels, 32, 32, dtype=dtype, device=device) model = resnet18(args=args) model.to(device) model.eval() start_eval = time.time() outputs_standard = model(inputs) standard_time = time.time() - start_eval
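# A hedged sketch (not part of the original snippet): repeat the same timing
# with the FFT-based convolution so the two execution paths can be compared
# on CPU. This assumes Arguments provides a usable default conv_exec_type for
# ConvType.FFT2D. The model below is freshly (randomly) initialized, so only
# the timing, not the outputs, is comparable to outputs_standard above.
args.conv_type = ConvType.FFT2D
model_fft = resnet18(args=args)
model_fft.to(device)
model_fft.eval()
with torch.no_grad():  # no gradients needed for inference timing
    start_eval = time.time()
    outputs_fft = model_fft(inputs)
    fft_time = time.time() - start_eval
print("standard conv time: ", standard_time, " FFT conv time: ", fft_time)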
def test_forward_pass_resnet18(self):
    """
    Results recorded from earlier runs of this test:

    total time for (ConvType.STANDARD2D-ConvExecType.SERIAL): 6.813918352127075
    total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.35197567939758
    total time for (ConvType.FFT2D-ConvExecType.SGEMM): 55.51149845123291

    total time for (ConvType.STANDARD2D-ConvExecType.SERIAL): 6.736859083175659
    total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.84979581832886
    total time for (ConvType.FFT2D-ConvExecType.SGEMM): 56.26755166053772

    global init time: 0.24471688270568848
    global pad time: 4.250756025314331
    (r)fft time: 8.754997730255127
    conjugate time: 3.734828233718872
    correlation time: 25.324009656906128
    restore time (de-compress/concat output): 0.021800994873046875
    i(r)fft time: 8.525353193283081
    total time for (ConvType.FFT2D-ConvExecType.SGEMM): 56.27733850479126
    GPU mem: 2903

    global init time: 0.2371835708618164
    global pad time: 4.492943286895752
    (r)fft time: 9.08437442779541
    conjugate time: 3.8394811153411865
    correlation time: 25.043412446975708
    restore time (de-compress/concat output): 0.021334409713745117
    i(r)fft time: 5.491833925247192
    total time for (ConvType.FFT2D-ConvExecType.CUDA): 53.804604053497314
    GPU mem: 2679
    """
    if not torch.cuda.is_available():
        print("CUDA device is not available.")
        return
    device = torch.device("cuda")
    print("\ndevice used: ", str(device))
    C = 3
    # dtype = torch.float
    # random mini batch imitating cifar-10
    # N, H, W = 128, 32, 32
    # inputs = torch.randn(N, C, H, W, dtype=dtype, device=device,
    #                      requires_grad=True)
    args = Arguments()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    args.sample_count_limit = 10000
    args.min_batch_size = 32
    args.dataset_name = "cifar10"
    args.test_batch_size = args.min_batch_size
    args.network_type = NetworkType.ResNet18
    from cnns.nnlib.datasets.cifar import get_cifar
    train_loader, test_loader, _, _ = get_cifar(
        args=args, dataset_name=args.dataset_name)
    repetition = 1
    args.in_channels = C
    args.compress_rate = None
    args.preserve_energy = 100
    args.is_debug = True
    args.next_power2 = True
    args.compress_type = CompressType.STANDARD
    args.tensor_type = TensorType.FLOAT32
    args.num_classes = 10
    args.dtype = torch.float32
    conv_exec_types = [
        # (ConvType.STANDARD2D, ConvExecType.SERIAL),
        # (ConvType.FFT2D, ConvExecType.CUDA),
        (ConvType.FFT2D, ConvExecType.SGEMM),
        # (ConvType.FFT2D, ConvExecType.CUDA_SHARED_LOG),
        # (ConvType.FFT2D, ConvExecType.CUDA_DEEP),
        # (ConvType.FFT2D, ConvExecType.SERIAL),
        # (ConvType.FFT2D, ConvExecType.BATCH),
    ]
    for conv_type, conv_exec_type in conv_exec_types:
        args.conv_type = conv_type
        args.conv_exec_type = conv_exec_type
        model = resnet18(args=args)
        model.to(device)
        model.eval()
        # Time a full pass over the train loader for this conv configuration.
        start_eval = time.time()
        for _ in range(repetition):
            for inputs, _ in train_loader:
                inputs = inputs.to(device)
                outputs_standard = model(inputs)
        standard_time = time.time() - start_eval
        print(f"total time for ({conv_type}-{conv_exec_type}):"
              f" {standard_time}")
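# A hedged sketch (not part of the original test): the "GPU mem" figures in
# the docstring above could be approximated with PyTorch's built-in CUDA
# memory counters. The original numbers may have come from another tool
# (e.g. nvidia-smi), so treat this only as a rough equivalent. It would be
# called after each timed loop, before building the next model.
def report_gpu_mem(device):
    # Peak memory allocated by tensors on this device since the last reset,
    # reported in MB.
    peak_mb = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
    print(f"GPU mem (peak allocated, MB): {peak_mb:.0f}")
    # Reset the peak counter so the next (conv_type, conv_exec_type) pair
    # is measured in isolation.
    torch.cuda.reset_peak_memory_stats(device)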