        [512, 512, 2, 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short'],
)

cat_configs_long = op_bench.cross_product_configs(
    M=[128],
    N=[128, 1024],
    K=[1, 2],
    dim=[0, 1, 2],
    device=['cpu', 'cuda'],
    tags=['long']
)


class CatBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, dim, device):
        self.input_one = torch.rand(M, N, K, device=device)
        self.dim = dim
        self.set_module_name('cat')

    def forward(self):
        return torch.cat((self.input_one, self.input_one), dim=self.dim)


op_bench.generate_pt_test(cat_configs_short + cat_configs_long, CatBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
import torch.nn.functional as F


"""Microbenchmarks for batchnorm operator."""

configs = op_bench.config_list(
    attrs=[
        [1, 256, 3136],
        [1, 2**16, 1],
        [128, 2048, 1],
    ],
    attr_names=["M", "N", "K"],
    tags=["short"],
)


class BatchNormBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K):
        self.input_one = torch.rand(M, N, K)
        self.mean = torch.rand(N)
        self.var = torch.rand(N)
        self.weight = torch.rand(N)
        self.bias = torch.rand(N)
        self.set_module_name("batchnorm")

    def forward(self):
        return F.batch_norm(self.input_one, self.mean, self.var, self.weight, self.bias)


op_bench.generate_pt_test(configs, BatchNormBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
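# Illustrative sketch, separate from the benchmark file above: with `training`
# left at its default of False, F.batch_norm normalizes with the supplied
# running statistics, so every per-channel tensor must match N (dim 1 of the input).
import torch
import torch.nn.functional as F

out = F.batch_norm(torch.rand(1, 4, 8), torch.zeros(4), torch.ones(4),
                   torch.ones(4), torch.zeros(4))
assert out.shape == (1, 4, 8)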
                                requires_grad=False,
                                dtype=torch.float)

    def forward(self):
        return self.input.to(torch.half)


class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, device):
        self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.half)

    def forward(self):
        return self.input.to(torch.float)


op_bench.generate_pt_test(tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark)
op_bench.generate_pt_test(tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark)
op_bench.generate_pt_test(tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark)
op_bench.generate_pt_test(tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
    def forward(self, input):
        # Assume that `self.input` is set in the child class.
        return self.qlinear(input)


class QLinearBenchmark(_QLinearBenchmarkBase):
    def init(self, N, IN, OUT, device):
        super(QLinearBenchmark, self).init(N, IN, OUT, nnq.Linear(IN, OUT))
        self.inputs = {"input": self.qX}
        self.set_module_name("QLinear")


class QDynamicLinearBenchmark(_QLinearBenchmarkBase):
    def init(self, N, IN, OUT, device):
        super(QDynamicLinearBenchmark, self).init(N, IN, OUT, nnqd.Linear(IN, OUT))
        self.inputs = {"input": self.X}
        self.set_module_name("QDynamicLinear")


op_bench.generate_pt_test(
    configs.remove_cuda(configs.linear_configs_short + configs.linear_configs_long),
    QLinearBenchmark)
op_bench.generate_pt_test(
    configs.remove_cuda(configs.linear_configs_short + configs.linear_configs_long),
    QDynamicLinearBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
)

as_strided_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    size=[(16, 16), (128, 128)],
    stride=[(1, 1), (2, 2)],
    storage_offset=[0, 1],
    device=['cpu', 'cuda'],
    tags=['long']
)


class As_stridedBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, size, stride, storage_offset, device):
        self.input_one = torch.rand(M, N, device=device)
        self.size = size
        self.stride = stride
        self.storage_offset = storage_offset
        self.set_module_name('as_strided')

    def forward(self):
        return torch.as_strided(
            self.input_one, self.size, self.stride, self.storage_offset)


op_bench.generate_pt_test(as_strided_configs_short + as_strided_configs_long,
                          As_stridedBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
        'device': ['cpu'],
    },
    tags=["short"],
)

mm_long_configs = op_bench.cross_product_configs(
    M=[64, 128, 256],
    N=range(2, 10, 3),
    K=[128, 512, 1024],
    trans_a=[True, False],
    trans_b=[True, False],
    device=['cpu'],
    tags=["long"]
)


class MatMulBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, trans_a, trans_b, device):
        self.input_one = torch.rand(M, N, device=device) if trans_a \
            else torch.rand(N, M, device=device).t()
        self.input_two = torch.rand(N, K, device=device) if trans_b \
            else torch.rand(K, N, device=device).t()
        self.set_module_name("matmul")

    def forward(self):
        return torch.matmul(self.input_one, self.input_two)


op_bench.generate_pt_test(mm_long_configs + mm_short_configs, MatMulBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
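# Illustrative sketch, separate from the benchmark file above: the trans_a/trans_b
# branches build the same logical (M, N) and (N, K) operands, but the `.t()` path
# produces non-contiguous views, so matmul is exercised on both memory layouts.
import torch

a = torch.rand(3, 4).t()  # logical shape (4, 3), strides swapped
assert a.shape == (4, 3) and not a.is_contiguous()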
        [1, 1, 1],
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=["short"],
)


class LinearUnpackFP16Benchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        # The input to the unpack operator must be the output of the prepack operator.
        self.inputs = {
            "input_one": torch.ops.quantized.linear_prepack_fp16(
                torch.rand(M, N, K, device=device, requires_grad=False, dtype=torch.float32))
        }
        self.set_module_name("linear_unpack_fp16")

    def forward(self, input_one):
        return torch.ops.quantized.linear_unpack_fp16(input_one)


# The generated test names based on linear_unpack_fp16_short_configs will be in the following pattern:
# linear_unpack_fp16_M8_N16_K32_devicecpu
op_bench.generate_pt_test(linear_unpack_fp16_long_configs + linear_unpack_fp16_short_configs,
                          LinearUnpackFP16Benchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
        [512, 512, 2],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)

chunks_long_configs = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    chunks=[2, 4],
    device=['cpu', 'cuda'],
    tags=['long']
)


class ChunkBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, chunks, device):
        self.input_one = torch.rand(M, N, device=device)
        self.chunks = chunks
        self.set_module_name('chunk')

    def forward(self):
        return torch.chunk(self.input_one, self.chunks)


op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs, ChunkBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
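# Illustrative sketch, separate from the benchmark file above: torch.chunk splits
# along dim 0 by default, so a (512, 512) input chunked into 2 returns two
# (256, 512) views of the original tensor.
import torch

parts = torch.chunk(torch.rand(512, 512), 2)
assert len(parts) == 2 and parts[0].shape == (256, 512)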
        self.input = qX

        self.qconv2d = nnq.Conv2d(IC, OC, kernel, stride=stride, padding=pad, groups=G)
        self.qconv2d.weight = qW
        self.qconv2d.scale = torch.tensor([scale], dtype=torch.double)
        self.qconv2d.zero_point = torch.tensor([zero_point], dtype=torch.int)

        W2 = torch.randn(OC, OC // G, kernel, kernel, dtype=torch.float32)
        qW2 = torch.quantize_per_tensor(W2, scale=scale, zero_point=0, dtype=torch.qint8)

        self.qconv2d2 = nnq.Conv2d(OC, OC, kernel, stride=stride, padding=pad, groups=G)
        self.qconv2d2.weight = qW2
        self.qconv2d2.scale = torch.tensor([scale], dtype=torch.double)
        self.qconv2d2.zero_point = torch.tensor([zero_point], dtype=torch.int)

        self.set_module_name("QConv2dChained")

    def forward(self):
        # Test that layout propagation works fine.
        x = self.qconv2d(self.input)
        x = x.relu()
        return self.qconv2d2(x)


op_bench.generate_pt_test(qconv_1d_configs, QConv1dBenchmark)
op_bench.generate_pt_test(qconv_2d_configs, QConv2dBenchmark)
op_bench.generate_pt_test(resnext_32_4d_shape_configs, QConv2dBenchmark)
op_bench.generate_pt_test(qconv_2d_configs, QConv2dChainedBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
import operator_benchmark as op_bench
import torch

add_configs = op_bench.cross_product_configs(
    M=[8],
    N=[8],
    K=[8],
    device=["cuda", "cpu"],
    tags=["short"]
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        self.input_one = torch.rand(M, N, K, device=device, requires_grad=True)
        self.input_two = torch.rand(M, N, K, device=device, requires_grad=True)
        self.set_module_name("add")

    def forward(self):
        return torch.add(self.input_one, self.input_two)


op_bench.generate_pt_test(add_configs, AddBenchmark)
op_bench.generate_pt_gradient_test(add_configs, AddBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
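# Illustrative sketch, separate from the benchmark file above, of why the inputs
# are created with requires_grad=True: generate_pt_gradient_test also times the
# backward pass, which in eager mode corresponds roughly to the following
# (the exact reduction used inside the harness is an assumption here).
import torch

x = torch.rand(8, 8, 8, requires_grad=True)
y = torch.rand(8, 8, 8, requires_grad=True)
out = torch.add(x, y)
out.sum().backward()
assert x.grad is not None and x.grad.shape == x.shape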
    def init(self, sizes, N, dim, device):
        random.seed(42)
        inputs = []
        gen_sizes = []
        if type(sizes) == list and N == -1:
            gen_sizes = sizes
        else:
            for i in range(N):
                gen_sizes.append([
                    old_size() if callable(old_size) else old_size
                    for old_size in sizes
                ])

        for s in gen_sizes:
            inputs.append(torch.rand(s, device=device))
        result = torch.empty(0, device=device)

        self.inputs = {"result": result, "inputs": inputs, "dim": dim}
        self.set_module_name('cat')

    def forward(self, result: torch.Tensor, inputs: List[torch.Tensor], dim: int):
        return torch.cat(inputs, dim=dim, out=result)


op_bench.generate_pt_test(
    cat_configs_short + cat_configs_long + cat_configs_multidim +
    cat_configs_manyinputs + cat_configs_static_runtime,
    CatBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
            bidirectional=D,
        )

        cell_temp = nn.Sequential(cell_nn)
        self.cell = torch.quantization.quantize_dynamic(
            cell_temp, {nn.LSTM, nn.Linear}, dtype=dtype)[0]

        self.x = torch.randn(
            sequence_len,  # sequence length
            batch_size,    # batch size
            I)             # number of features in X

        self.h = torch.randn(
            NL * (D + 1),  # layer_num * dir_num
            batch_size,    # batch size
            H)             # hidden size

        self.c = torch.randn(
            NL * (D + 1),  # layer_num * dir_num
            batch_size,    # batch size
            H)             # hidden size

        self.set_module_name("QLSTM")

    def forward(self):
        return self.cell(self.x, (self.h, self.c))


op_bench.generate_pt_test(qrnn_configs, LSTMBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
    device=['cpu', 'cuda'],
    tags=['long']
)


class SumBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, R, V, dim, contiguous, device):
        shape = (R, V) if dim == 0 else (V, R)
        tensor = torch.rand(shape, device=device)

        if not contiguous:
            storage = torch.empty([s * 2 for s in shape], device=device)
            storage[::2, ::2] = tensor
            self.input_tensor = storage[::2, ::2]
        else:
            self.input_tensor = tensor

        self.inputs = {
            "input_tensor": self.input_tensor,
            "dim": dim
        }
        self.set_module_name("sum")

    def forward(self, input_tensor, dim: int):
        return input_tensor.sum(dim=dim)


op_bench.generate_pt_test(sum_configs, SumBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
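# Illustrative sketch, separate from the benchmark file above, of the
# non-contiguous construction used when `contiguous` is False: writing into every
# other element of a 2x larger buffer and slicing it back yields a tensor with
# the same values but non-contiguous strides.
import torch

t = torch.rand(4, 4)
buf = torch.empty(8, 8)
buf[::2, ::2] = t
view = buf[::2, ::2]
assert torch.equal(view, t) and not view.is_contiguous()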
tags=["short"], ) @torch.jit.script def torch_sumall(a, iterations): # type: (Tensor, int) result = 0.0 for _ in range(iterations): result += float(torch.sum(a)) a[0][0] += 0.01 return result class TorchSumBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N): self.input_one = torch.rand(M, N) self.set_module_name("sum") # This is a very temporary method and will be removed soon, so # don't use this method in your benchmark # TODO(mingzhe): use one forward method for both JIT and Eager def jit_forward(self, iters): return torch_sumall(self.input_one, iters) op_bench.generate_pt_test(intraop_bench_configs, TorchSumBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
            N, K, 1, device=device, requires_grad=self.auto_set())
        x_scale = 0.1
        x_zero_point = 0
        self.q_input_one = torch.quantize_per_tensor(
            self.input_one, scale=x_scale, zero_point=x_zero_point, dtype=dtype)
        self.mean = torch.rand(N)
        self.var = torch.rand(N)
        self.weight = torch.rand(N)
        self.bias = torch.rand(N)
        self.eps = 1e-5
        self.Y_scale = 0.1
        self.Y_zero_point = 0

    def forward(self):
        return torch.ops.quantized.batch_norm2d(
            self.q_input_one, self.weight, self.bias, self.mean, self.var,
            self.eps, self.Y_scale, self.Y_zero_point)


op_bench.generate_pt_test(batchnorm_configs_short, QBatchNormBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
        [8, 1, 2, 3, torch.int32],
        [9, 1, 2, 4, torch.int32],
        [10, 1, 2, 5, torch.int32],
    ],
    attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)


class ClipRangesBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, LENGTH, M, N, MAX_LENGTH, device, dtype):
        self.input = torch.rand(LENGTH, M, N, device=device).type(dtype)
        self.max_length = MAX_LENGTH
        self.set_module_name("clip_ranges")

    def forward(self):
        output = torch.ops.fb.clip_ranges(self.input, self.max_length)
        return output


op_bench.generate_pt_test(
    clip_ranges_long_configs + clip_ranges_short_configs,
    ClipRangesBenchmark
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
    SBS=((1, 4), (1, 8), (4, 1), (8, 1)),  # Sparse block shape
    ZPB=(0, 1, 2, 3, 4, None),             # Zeros per block
    tags=("long",)
)


class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, SL, SBS, ZPB):
        weight = torch.ones(M)
        model = nn.Module()
        model.register_buffer("weight", weight)

        sparse_config = [{"tensor_fqn": "weight"}]
        self.sparsifier = sparsity.WeightNormSparsifier(
            sparsity_level=SL,
            sparse_block_shape=SBS,
            zeros_per_block=ZPB,
        )
        self.sparsifier.prepare(model, config=sparse_config)
        self.inputs = {}  # All benchmarks need inputs :)
        self.set_module_name("weight_norm_sparsifier_step")

    def forward(self):
        self.sparsifier.step()


all_tests = sparse_configs_short + sparse_configs_long
op_bench.generate_pt_test(all_tests, WeightNormSparsifierBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
# An example input from this configuration is M=4, N=4, dim=0.
configs = op_bench.config_list(
    attrs=[
        [4, 4, 0],
        [256, 256, 1],
    ],
    attr_names=["M", "N", "dim"],
    tags=["short"]
)


class GatherBenchmark(op_bench.TorchBenchmarkBase):
    # TODO (mingzhe0908): should we have a global seed for all ops?
    def init(self, M, N, dim):
        self.input_one = torch.rand(M, N)
        self.dim = dim
        min_val = M if dim == 0 else N
        numpy.random.seed((1 << 32) - 1)
        self.index = torch.tensor(numpy.random.randint(0, min_val, (M, N)))
        self.set_module_name("gather")

    def forward(self):
        return torch.gather(self.input_one, self.dim, self.index)


op_bench.generate_pt_test(configs, GatherBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
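# Illustrative sketch, separate from the benchmark file above: for dim=0, gather
# picks input[index[i, j], j] for every output position, which is why the random
# indices above are drawn from [0, M) when dim == 0 (and [0, N) when dim == 1).
import torch

src = torch.arange(16.).reshape(4, 4)
idx = torch.zeros(4, 4, dtype=torch.long)
assert torch.equal(torch.gather(src, 0, idx), src[0].expand(4, 4))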
""" class QEmbeddingBagBenchmark(op_bench.TorchBenchmarkBase): def init(self, embeddingbags, dim, mode, input_size, offset, sparse, include_last_offset, device): self.embedding = nnq.EmbeddingBag( num_embeddings=embeddingbags, embedding_dim=dim, mode=mode, include_last_offset=include_last_offset).to(device=device) numpy.random.seed((1 << 32) - 1) self.input = torch.tensor(numpy.random.randint(0, embeddingbags, input_size), device=device).long() offset = torch.LongTensor([offset], device=device) self.offset = torch.cat( (offset, torch.tensor([self.input.size(0)], dtype=torch.long)), 0) self.inputs = {"input": self.input, "offset": self.offset} self.set_module_name('qEmbeddingBag') def forward(self, input, offset): return self.embedding(input, offset) op_bench.generate_pt_test(configs.embeddingbag_short_configs, QEmbeddingBagBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
diag_configs_short = op_bench.config_list(
    attr_names=['dim', 'M', 'N', 'diagonal', 'out'],
    attrs=[
        [1, 64, 64, 0, True],
        [2, 128, 128, -10, False],
        [1, 256, 256, 20, True],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short'],
)


class DiagBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, dim, M, N, diagonal, out, device):
        self.input = torch.rand(M, N, device=device) if dim == 2 else torch.rand(M, device=device)
        self.diagonal = diagonal
        self.out = torch.tensor(()) if out else None
        self.set_module_name('diag')

    def forward(self):
        return torch.diag(self.input, diagonal=self.diagonal, out=self.out)


op_bench.generate_pt_test(diag_configs_short, DiagBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
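# Illustrative sketch, separate from the benchmark file above, of the two
# torch.diag behaviors the dim=1 and dim=2 configs exercise: a 1-D input builds a
# diagonal matrix, while a 2-D input extracts the diagonal selected by `diagonal`.
import torch

assert torch.diag(torch.ones(3)).shape == (3, 3)
assert torch.diag(torch.ones(3, 3), diagonal=1).shape == (2,)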
        self.input = torch.rand(C, M, N)
        self.dtype = dtype
        self.op = nnq.Quantize(scale=1.0, zero_point=0, dtype=dtype)
        self.set_module_name('QuantizePerTensor')

        if mode == 'D':
            self.input = self.op(self.input)
            self.op = nnq.DeQuantize()
            self.set_module_name('DequantizePerTensor')

    def forward(self):
        return self.op(self.input)


op_bench.generate_pt_test(
    quantize_per_tensor_configs_short + quantize_per_tensor_configs_long,
    QuantizePerTensorBenchmark)

# === Per Channel quantization ===

quantize_per_channel_configs_short = op_bench.config_list(
    cross_product_configs={
        'axis': (0,)
    },
    **quantize_configs_short_dict
)

quantize_per_channel_configs_long = op_bench.cross_product_configs(
    axis=(0, 1, 2),
    **quantize_configs_long_dict
)
    def forward(self):
        return self.conv1d(self.input)


class ConvTranspose1dBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, IC, OC, kernel, stride, N, L, device):
        self.input = torch.rand(N, IC, L, device=device)
        self.convtranspose1d = nn.ConvTranspose1d(IC, OC, kernel, stride=stride).to(device=device)
        self.set_module_name('ConvTranspose1d')

    def forward(self):
        return self.convtranspose1d(self.input)


op_bench.generate_pt_test(conv_1d_configs_short + conv_1d_configs_long,
                          Conv1dBenchmark)
op_bench.generate_pt_test(conv_1d_configs_short + conv_1d_configs_long,
                          ConvTranspose1dBenchmark)


"""
Microbenchmarks for Conv2d and ConvTranspose2d operators.
"""

# Configs for Conv2d and ConvTranspose2d
conv_2d_configs_short = op_bench.config_list(
    attr_names=[
        'IC', 'OC', 'kernel', 'stride', 'N', 'H',
        inputs = []
        gen_sizes = []
        if type(sizes) == list and N == -1:
            gen_sizes = sizes
        else:
            for i in range(N):
                gen_sizes.append([
                    old_size() if callable(old_size) else old_size
                    for old_size in sizes
                ])

        for s in gen_sizes:
            inputs.append(torch.rand(s, device=device))
        result = torch.rand(gen_sizes[0], device=device)

        self.inputs = {
            "result": result,
            "inputs": inputs,
            "dim": dim
        }
        self.set_module_name('stack')

    def forward(self, result: torch.Tensor, inputs: List[torch.Tensor], dim: int):
        return torch.stack(inputs, dim=dim, out=result)


op_bench.generate_pt_test(stack_configs_static_runtime + stack_configs_short +
                          stack_configs_long + stack_configs_multidim,
                          StackBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch
import torch.nn as nn


"""Microbenchmarks for Linear operator."""

configs = op_bench.config_list(
    attrs=[
        [1, 32, 10],
        [4, 256, 100],
        [16, 1024, 256],
    ],
    attr_names=["N", "IN", "OUT"],
    tags=["short"],
)


class LinearBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, N, IN, OUT):
        self.input_one = torch.rand(N, IN)
        self.linear = nn.Linear(IN, OUT)
        self.set_module_name("linear")

    def forward(self):
        return self.linear(self.input_one)


op_bench.generate_pt_test(configs, LinearBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
                torch.rand((B, M, K), device=device, requires_grad=self.auto_set()),
            "batch2":
                torch.rand((B, K, N), device=device, requires_grad=self.auto_set())
        }
        self.set_module_name("bmm")

    def forward(self, batch1, batch2):
        return torch.bmm(batch1, batch2)


bmm_configs = op_bench.cross_product_configs(
    B=[2, 100],
    M=[8, 256],
    N=[256, 16],
    K=[16, 32],
    device=['cpu'],
    tags=["short"],
)

op_bench.generate_pt_test(bmm_configs, BmmBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
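# Illustrative sketch, separate from the benchmark file above: torch.bmm contracts
# over K batch-wise, mapping (B, M, K) x (B, K, N) -> (B, M, N), which is the
# shape relationship encoded in bmm_configs.
import torch

out = torch.bmm(torch.rand(2, 8, 16), torch.rand(2, 16, 256))
assert out.shape == (2, 8, 256)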
class QAvgPool2dBenchmark(_QPool2dBenchmarkBase):
    def init(self, N, C, H, W, k, s, p, contig, dtype):
        self.pool_op = torch.nn.AvgPool2d(kernel_size=k, stride=s, padding=p,
                                          ceil_mode=False)
        super(QAvgPool2dBenchmark, self).setup(N, C, H, W, dtype, contig)


class QAdaptiveAvgPool2dBenchmark(_QPool2dBenchmarkBase):
    def init(self, N, C, input_size, output_size, contig, dtype):
        self.pool_op = torch.nn.AdaptiveAvgPool2d(output_size=output_size)
        super(QAdaptiveAvgPool2dBenchmark, self).setup(N, C, *input_size,
                                                       dtype=dtype, contig=contig)


op_bench.generate_pt_test(
    qadaptive_avgpool2d_short_configs + qadaptive_avgpool2d_long_configs,
    QAdaptiveAvgPool2dBenchmark)
op_bench.generate_pt_test(qpool2d_short_configs + qpool2d_long_configs,
                          QAvgPool2dBenchmark)
op_bench.generate_pt_test(qpool2d_short_configs + qpool2d_long_configs,
                          QMaxPool2dBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
class QInterpolateBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, dtype, mode, scale, contig):
        f_input = (torch.rand(1, M, N, K) - 0.5) * 256
        # Fixed quantization parameters; `scale` from the config is the
        # interpolation scale factor, not the quantization scale.
        q_scale = 0.1
        q_zero_point = 42
        self.q_input = torch.quantize_per_tensor(f_input, scale=q_scale,
                                                 zero_point=q_zero_point,
                                                 dtype=dtype)
        if not contig:
            permute_dims = list(range(self.q_input.ndim))[::-1]
            self.q_input = self.q_input.permute(permute_dims)

        self.mode = mode
        self.scale_factor = scale
        self.set_module_name('q_interpolate')

    def forward(self):
        return torch.nn.quantized.functional.interpolate(
            self.q_input, scale_factor=self.scale_factor, mode=self.mode)


op_bench.generate_pt_test(
    qinterpolate_short_configs + qinterpolate_long_configs,
    QInterpolateBenchmark)


if __name__ == '__main__':
    op_bench.benchmark_runner.main()
    # no channels_last for 3D tensors
    attr_names=["input_size", "output_size"],
    attrs=[
        [(4, 512, 320), (256,)],
        [(4, 512, 320), (512,)],
    ],
    cross_product_configs={
        'mode': ["nearest", "linear"],
    },
    tags=["long"],
)

config_5d = op_bench.config_list(
    attr_names=["input_size", "output_size"],
    attrs=[
        [(1, 3, 16, 320, 320), (8, 256, 256)],
        [(1, 3, 16, 320, 320), (32, 512, 512)],
    ],
    cross_product_configs={
        'channels_last': [True, False],
        'mode': ["nearest", "linear"],
    },
    tags=["long"],
)

for config in (config_short, config_long, config_3d, config_5d):
    op_bench.generate_pt_test(config, InterpolateBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
    K=[2 ** x for x in range(0, 3)],
    tags=["long"]
)

add_short_configs = op_bench.config_list(
    attrs=[
        [8, 16, 32],
        [16, 32, 64],
    ],
    attr_names=["M", "N", "K"],
    tags=["short"],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K):
        self.input_one = torch.rand(M, N, K)
        self.input_two = torch.rand(M, N, K)
        self.set_module_name("add")

    def forward(self):
        return torch.add(self.input_one, self.input_two)


op_bench.generate_pt_test(add_long_configs + add_short_configs, AddBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
)


class QInstanceNormBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, dims, dtype):
        X = (torch.rand(*dims) - 0.5) * 256
        num_channels = dims[1]
        scale = 1.0
        zero_point = 0
        self.qX = torch.quantize_per_tensor(
            X, scale=scale, zero_point=zero_point, dtype=dtype)
        self.weight = torch.rand(num_channels, dtype=torch.float)
        self.bias = torch.rand(num_channels, dtype=torch.float)
        self.eps = 1e-5
        self.Y_scale = 0.1
        self.Y_zero_point = 0

    def forward(self):
        return torch.ops.quantized.instance_norm(
            self.qX, weight=self.weight, bias=self.bias, eps=self.eps,
            output_scale=self.Y_scale, output_zero_point=self.Y_zero_point)


op_bench.generate_pt_test(instancenorm_configs_short, QInstanceNormBenchmark)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()