attr_names=["N"], attrs=[ [1024], [2048], ], cross_product_configs={ 'device': ['cpu', 'cuda'], 'dtype': [torch.int32], }, tags=["short"], ) fill_long_configs = op_bench.cross_product_configs( N=[10, 1000], device=torch.testing.get_all_device_types(), dtype=[ torch.bool, torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64, torch.half, torch.float, torch.double ], tags=["long"]) class Fill_Benchmark(op_bench.TorchBenchmarkBase): def init(self, N, device, dtype): self.input_one = torch.zeros(N, device=device).type(dtype) self.set_module_name("fill_") def forward(self): return self.input_one.fill_(10) op_bench.generate_pt_test(fill_short_configs + fill_long_configs,
torch.rand((B, M, K), device=device, requires_grad=self.auto_set()), "batch2": torch.rand(( B, K, N, ), device=device, requires_grad=self.auto_set()) } self.set_module_name("bmm") def forward(self, batch1, batch2): return torch.bmm(batch1, batch2) bmm_configs = op_bench.cross_product_configs( B=[2, 100], M=[8, 256], N=[256, 16], K=[16, 32], device=['cpu'], tags=["short"], ) op_bench.generate_pt_test(bmm_configs, BmmBenchmark) if __name__ == "__main__": op_bench.benchmark_runner.main()
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase  # noqa
from caffe2.python import core


"""Microbenchmarks for BatchBoxCox operator."""

# Long-running sweep for the C2 BatchBoxCox operator: the cross product of
# every M, N and dtype value listed below.
batch_box_cox_long_configs = op_bench.cross_product_configs(
    M=[32, 64, 128],
    N=range(32, 128, 32),
    dtype=["float", "double"],
    tags=["long"],
)

# Quick sweep: an explicit list of (M, N, dtype) combinations.
batch_box_cox_short_configs = op_bench.config_list(
    attrs=[
        [16, 16, "float"],
        [16, 16, "double"],
        [64, 64, "float"],
        [64, 64, "double"],
    ],
    attr_names=["M", "N", "dtype"],
    tags=["short"],
)


class BatchBoxCoxBenchmark(op_bench_c2.Caffe2BenchmarkBase):
    """Benchmark fixture for the Caffe2 BatchBoxCox operator."""

    def init(self, M, N, dtype):
        """Create the blobs used by the benchmark through ``self.tensor``.

        Args:
            M: First dimension of the ``data`` blob.
            N: Second dimension of ``data``; also the length of both
                lambda blobs.
            dtype: Element type name forwarded unchanged to ``self.tensor``.
        """
        # Creation order is kept stable on purpose: data, lambda1, lambda2,
        # then output.
        self.data = self.tensor([M, N], dtype)
        self.lambda1 = self.tensor([N], dtype)
        self.lambda2 = self.tensor([N], dtype)
        # The output blob is created with shape [1, 1].
        self.output = self.tensor([1, 1], dtype)
[8, 8, 1], [256, 512, 1], [512, 512, 1], [8, 8, 2], [256, 512, 2], [512, 512, 2], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["short"]) batch_gather_configs_long = op_bench.cross_product_configs( M=[128, 1024], N=[128, 1024], K=[1, 2], device=['cpu', 'cuda'], tags=["long"]) class BatchGatherBenchmark(op_bench_c2.Caffe2BenchmarkBase): def init(self, M, N, K, device): self.input_one = self.tensor([M, N, K], device=device) max_val = N numpy.random.seed((1 << 32) - 1) index_dim = numpy.random.randint(0, N) self.index = self.feed_tensor(numpy.random.randint( 0, max_val, index_dim), device=device) self.output = self.tensor([M, index_dim, K], device=device) self.set_module_name("batch_gather")
quantize_configs_long_dict = {
    # 'C' is reused for per-channel: avoid single channel test
    'C': [3, 5, 8],
    'M': [256, 1024],
    'N': [256, 1024],
    'dtype': [torch.quint8, torch.qint8, torch.qint32],
    'mode': ['D', 'Q'],
    'tags': ['long'],
}

# Per-tensor config objects are built by unpacking the shared dictionaries.
quantize_per_tensor_configs_short = op_bench.config_list(
    **quantize_configs_short_dict
)
quantize_per_tensor_configs_long = op_bench.cross_product_configs(
    **quantize_configs_long_dict
)


class QuantizePerTensorBenchmark(op_bench.TorchBenchmarkBase):
    r"""Benchmarks both quantization and dequantization."""

    def init(self, C, M, N, dtype, mode):
        """Prepare the input tensor and the op under test.

        Args:
            C, M, N: Dimensions of the random float input ``(C, M, N)``.
            dtype: Quantized dtype handed to ``nnq.Quantize``.
            mode: ``'Q'`` to benchmark quantization, ``'D'`` to benchmark
                dequantization.
        """
        assert mode in ('Q', 'D')
        self.dtype = dtype

        float_input = torch.rand(C, M, N)
        quantize = nnq.Quantize(scale=1.0, zero_point=0, dtype=dtype)
        self.input, self.op = float_input, quantize
        self.set_module_name('QuantizePerTensor')

        if mode == 'D':
            # Dequantization needs an already-quantized tensor: quantize the
            # float input once here and make DeQuantize the op under test.
            self.input, self.op = quantize(float_input), nnq.DeQuantize()
"""Microbenchmarks for sparsifier.""" sparse_configs_short = op_bench.config_list( attr_names=["M", "SL", "SBS", "ZPB"], attrs=[ [(32, 16), 0.3, (4, 1), 2], [(32, 16), 0.6, (1, 4), 4], [(17, 23), 0.9, (1, 1), 1] ], tags=("short",) ) sparse_configs_long = op_bench.cross_product_configs( M=((128, 128), (255, 324)), # Mask shape SL=(0.0, 1.0, 0.3, 0.6, 0.9, 0.99), # Sparsity level SBS=((1, 4), (1, 8), (4, 1), (8, 1)), # Sparse block shape ZPB=(0, 1, 2, 3, 4, None), # Zeros per block tags=("long",) ) class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, SL, SBS, ZPB): weight = torch.ones(M) model = nn.Module() model.register_buffer("weight", weight) sparse_config = [{"tensor_fqn": "weight"}] self.sparsifier = sparsity.WeightNormSparsifier( sparsity_level=SL, sparse_block_shape=SBS, zeros_per_block=ZPB,
from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import operator_benchmark as op_bench import torch """Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch.""" # Configs for PT add operator add_long_configs = op_bench.cross_product_configs( M=[8, 64, 128], N=range(2, 128, 64), K=[8**x for x in range(0, 3)], device=['cpu', 'cuda'], tags=["long"]) add_short_configs = op_bench.config_list( attr_names=["M", "N", "K"], attrs=[ [64, 64, 64], [64, 64, 128], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["short"], ) class AddBenchmark(op_bench.TorchBenchmarkBase):
# Configs for pool-1d ops pool_1d_configs_short = op_bench.config_list( attr_names=['kernel', 'stride', 'N', 'C', 'L'], attrs=[ [3, 1, 8, 256, 256], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=['short']) pool_1d_configs_long = op_bench.cross_product_configs(kernel=[3], stride=[1, 2], N=[8, 16], C=[3], L=[128, 256], device=['cpu', 'cuda'], tags=['long']) pool_1d_ops_list = op_bench.op_list( attr_names=['op_name', 'op_func'], attrs=[ ['MaxPool1d', nn.MaxPool1d], ['AvgPool1d', nn.AvgPool1d], ], ) class Pool1dBenchmark(op_bench.TorchBenchmarkBase): def init(self, kernel, stride, N, C, L, device, op_func):
chunk_short_configs = op_bench.config_list( attr_names=["M", "N", "chunks"], attrs=[ [8, 8, 2], [256, 512, 2], [512, 512, 2], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["short"], ) chunks_long_configs = op_bench.cross_product_configs(M=[128, 1024], N=[128, 1024], chunks=[2, 4], device=['cpu', 'cuda'], tags=['long']) class ChunkBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, chunks, device): self.input_one = torch.rand(M, N, device=device) self.chunks = chunks self.set_module_name('chunk') def forward(self): return torch.chunk(self.input_one, self.chunks) op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs,
import operator_benchmark as op_bench import torch """Microbenchmarks for sum reduction operator.""" # Configs for PT add operator sum_configs = op_bench.cross_product_configs( R=[64, 256], # Length of reduced dimension V=[32, 512], # Length of other dimension dim=[0, 1], contiguous=[True, False], device=['cpu', 'cuda'], tags=['short'] ) + op_bench.cross_product_configs( R=[1024, 8192], V=[512, 1024], dim=[0, 1], contiguous=[True, False], device=['cpu', 'cuda'], tags=['long'] ) class SumBenchmark(op_bench.TorchBenchmarkBase): def init(self, R, V, dim, contiguous, device): shape = (R, V) if dim == 0 else (V, R) tensor = torch.rand(shape, device=device) if not contiguous: storage = torch.empty([s * 2 for s in shape], device=device) storage[::2, ::2] = tensor
import operator_benchmark as op_bench import torch import numpy """EmbeddingBag Operator Benchmark""" embeddingbag_short_configs = op_bench.cross_product_configs( embeddingbags=[80, 120, 1000, 2300], dim=[64], mode=['sum'], input_size=[8, 16, 64], offset=[0], sparse=[True], tags=['short'] ) class EmbeddingBagBenchmark(op_bench.TorchBenchmarkBase): def init(self, embeddingbags, dim, mode, input_size, offset, sparse): self.embegging = torch.nn.EmbeddingBag( num_embeddings=embeddingbags, embedding_dim=dim, mode=mode, sparse=sparse) numpy.random.seed((1 << 32) - 1) self.input = torch.tensor(numpy.random.randint(0, embeddingbags, input_size)).long() self.offset = torch.LongTensor([offset]) self.set_module_name('embeddingbag') def forward(self):
# Quick sweep: explicit (M, N, K, dim) rows, each crossed with every device.
cat_configs_short = op_bench.config_list(
    attr_names=['M', 'N', 'K', 'dim'],
    attrs=[
        [1, 1, 1, 0],
        [256, 512, 1, 0],
        [512, 512, 2, 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short'],
)

# Long sweep: full cross product of the listed values.
cat_configs_long = op_bench.cross_product_configs(
    M=[128],
    N=[128, 1024],
    K=[1, 2],
    dim=[0, 1, 2],
    device=['cpu', 'cuda'],
    tags=['long'],
)


class CatBenchmark(op_bench.TorchBenchmarkBase):
    """Benchmark ``torch.cat`` by concatenating a tensor with itself."""

    def init(self, M, N, K, dim, device):
        """Create the ``(M, N, K)`` random input on ``device`` and store ``dim``."""
        self.input_one = torch.rand(M, N, K, device=device)
        self.dim = dim
        self.set_module_name('cat')

    def forward(self):
        """Concatenate two references to the same input along ``self.dim``."""
        operands = (self.input_one,) * 2
        return torch.cat(operands, dim=self.dim)


op_bench.generate_pt_test(cat_configs_short + cat_configs_long,
                          CatBenchmark)
import operator_benchmark as op_bench import torch """Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch.""" # Configs for PT add operator add_long_configs = op_bench.cross_product_configs(M=[8, 128], N=[32, 64], K=[256, 512], device=['cpu', 'cuda'], tags=["long"]) add_short_configs = op_bench.config_list( attr_names=["M", "N", "K"], attrs=[ [1, 1, 1], [64, 64, 64], [64, 64, 128], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["short"], ) class AddBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K, device): self.input_one = torch.rand(M, N, K, device=device,
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch


"""Microbenchmarks for quantized instancenorm operator."""

# One rank-3 and one rank-4 input shape, qint8 only.
instancenorm_configs_short = op_bench.cross_product_configs(
    dims=(
        (32, 8, 16),
        (32, 8, 56, 56),
    ),
    dtype=(torch.qint8,),
    tags=["short"],
)


class QInstanceNormBenchmark(op_bench.TorchBenchmarkBase):
    """Benchmark fixture for the quantized instancenorm operator."""

    def init(self, dims, dtype):
        """Build the quantized input and the float weight.

        Args:
            dims: Shape of the input tensor; ``dims[1]`` is taken as the
                number of channels.
            dtype: Quantized dtype used for the input tensor.
        """
        num_channels = dims[1]
        scale = 1.0
        zero_point = 0

        # Uniform samples from [0, 1) shifted and scaled to [-128, 128).
        X = torch.rand(*dims).sub(0.5).mul(256)
        self.qX = torch.quantize_per_tensor(
            X,
            scale=scale,
            zero_point=zero_point,
            dtype=dtype,
        )
        self.weight = torch.rand(num_channels, dtype=torch.float)
], attrs=[ [128, 256, 3, 1, 1, 64], [256, 256, 3, 2, 4, 64], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=['short'] ) conv_1d_configs_long = op_bench.cross_product_configs( IC=[128, 512], OC=[128, 512], kernel=[3], stride=[1, 2], N=[8], L=[128], device=['cpu', 'cuda'], tags=["long"] ) # Configs for Conv2d and ConvTranspose1d conv_2d_configs_short = op_bench.config_list( attr_names=[ 'IC', 'OC', 'kernel', 'stride', 'N', 'H', 'W', 'G', 'pad', ], attrs=[ [256, 256, 3, 1, 1, 16, 16, 1, 0], ], cross_product_configs={ 'device': ['cpu', 'cuda'],
import operator_benchmark as op_bench import torch """Microbenchmarks for ClipRanges operator.""" torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators") # Configs for C2 ClipRanges operator clip_ranges_long_configs = op_bench.cross_product_configs( LENGTH=range(1, 100), M=[1], N=[2], MAX_LENGTH=range(1, 100), device=['cpu', 'cuda'], dtype=[torch.int32], tags=["long"], ) clip_ranges_short_configs = op_bench.config_list( attrs=[ [6, 1, 2, 1, torch.int32], [7, 1, 2, 2, torch.int32], [8, 1, 2, 3, torch.int32], [9, 1, 2, 4, torch.int32], [10, 1, 2, 5, torch.int32], ], attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"], cross_product_configs={ 'device': ['cpu', 'cuda'], },
cross_product_configs={ 'N': (2, ), 'contig': (True, ), 'dtype': (torch.qint32, torch.qint8, torch.quint8), }, tags=('short', )) qadaptive_avgpool2d_long_configs = op_bench.cross_product_configs( input_size=( # VGG16 pools with original input shape: (-1, 3, 224, 224) (112, 112), # MaxPool2d-9 # noqa ), output_size=( (448, 448), # VGG16 pools with original input shape: (-1, 3, 224, 224) (224, 224), # MaxPool2d-4 # noqa (112, 112), # MaxPool2d-9 # noqa (56, 56), # MaxPool2d-16 # noqa (14, 14), # MaxPool2d-30 # noqa ), N=(1, 4), C=(1, 3, 64, 128), contig=(False, True), dtype=(torch.quint8, ), tags=('long', )) qadaptive_avgpool2d_short_configs = op_bench.config_list( attrs=((4, 3, (224, 224), (112, 112), True), ), attr_names=('N', 'C', 'input_size', 'output_size', 'contig'), cross_product_configs={ 'dtype': (torch.qint32, torch.qint8, torch.quint8), },
import operator_benchmark as op_bench
from caffe2.python import core


# A single 8x8x8 shape, crossed with both devices.
add_configs = op_bench.cross_product_configs(
    M=[8],
    N=[8],
    K=[8],
    tags=["short"],
    device=["cuda", "cpu"],
)


class AddBenchmark(op_bench.Caffe2BenchmarkBase):
    """Caffe2 ``Add`` benchmark with forward and gradient operators."""

    def init(self, M, N, K, device):
        """Create the input, gradient and output blobs, all shaped [M, N, K]."""
        self.set_module_name("add")

        def new_blob():
            # Every blob in this benchmark has the same shape and device.
            return self.tensor([M, N, K], device=device)

        # Creation order is preserved: inputs, their gradients, then output.
        self.input_one = new_blob()
        self.input_two = new_blob()
        self.input_one_grad = new_blob()
        self.input_two_grad = new_blob()
        self.output = new_blob()

    def forward(self):
        """Return the ``Add`` operator reading both inputs into ``output``."""
        return core.CreateOperator(
            "Add",
            [self.input_one, self.input_two],
            self.output,
            **self.args
        )

    def backward(self):
        """Return the ``AddGradient`` operator producing both input grads."""
        grad_inputs = [self.output, self.input_one, self.input_two]
        grad_outputs = [self.input_one_grad, self.input_two_grad]
        return core.CreateOperator(
            "AddGradient",
            grad_inputs,
            grad_outputs,
            **self.args
        )
# Quick sweep: explicit (M, N, K, trans_a, trans_b) rows, CPU only.
mm_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K", "trans_a", "trans_b"],
    attrs=[
        [128, 128, 128, True, False],
        [256, 256, 256, False, True],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=["short"],
)

# Long sweep: full cross product of the listed values, CPU only.
mm_long_configs = op_bench.cross_product_configs(
    M=[64, 128, 256],
    N=range(2, 10, 3),
    K=[128, 512, 1024],
    trans_a=[True, False],
    trans_b=[True, False],
    device=['cpu'],
    tags=["long"],
)


class MatMulBenchmark(op_bench.TorchBenchmarkBase):
    """Benchmark ``torch.matmul`` on an (M, N) by (N, K) product."""

    def init(self, M, N, K, trans_a, trans_b, device):
        """Create both operands on ``device``.

        For each operand the flag selects how the same logical shape is
        produced: when the flag is true the tensor is allocated directly
        in that shape; otherwise it is allocated with the two dimensions
        swapped and then transposed with ``.t()``.
        """
        # Left operand, logical shape (M, N).
        if trans_a:
            self.input_one = torch.rand(M, N, device=device)
        else:
            self.input_one = torch.rand(N, M, device=device).t()

        # Right operand, logical shape (N, K).
        if trans_b:
            self.input_two = torch.rand(N, K, device=device)
        else:
            self.input_two = torch.rand(K, N, device=device).t()

        self.set_module_name("matmul")

    def forward(self):
        """Multiply the two prepared operands."""
        lhs, rhs = self.input_one, self.input_two
        return torch.matmul(lhs, rhs)
import torch import torch.nn.quantized as nnq import operator_benchmark as op_bench r"""Microbenchmarks for the quantized activations.""" qactivation_long_configs = op_bench.cross_product_configs( dims=( # VGG-16 relu's with original shape: (-1, 3, 224, 224) (64, 224, 224), # ReLU-1 # noqa: E201 (128, 112, 112), # ReLU-6 (256, 56, 56), # ReLU-11 # noqa: E241 (512, 28, 28), # ReLU-18 # noqa: E241 (512, 14, 14), # ReLU-25 # noqa: E241 # Batch = 16 (16, 64, 224, 224), # ReLU-1 # noqa: E241 (16, 128, 112, 112), # ReLU-6 (16, 256, 56, 56), # ReLU-11 # noqa: E241 (16, 512, 28, 28), # ReLU-18 # noqa: E241 (16, 512, 14, 14), # ReLU-25 # noqa: E241 ), contig=(False, True), inplace=(False, True), dtype=(torch.quint8, ), tags=('long', )) qactivation_short_configs = op_bench.cross_product_configs( dims=( (3, 4, 5), # Rank=3 (2, 3, 4, 5), # Rank=4, # Dimensions from the floating point benchmarks
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
from caffe2.python import core


"""Microbenchmarks for MatMul operator"""

# Long sweep for the C2 MatMul operator: cross product of every value below.
# K takes the powers of two 1, 2 and 4.
mm_long_configs = op_bench.cross_product_configs(
    M=[8, 64, 128],
    N=range(2, 10, 3),
    K=[2 ** exponent for exponent in range(3)],
    trans_a=[True, False],
    trans_b=[True, False],
    tags=["long"],
)

# Quick sweep: explicit (M, N, K, trans_a, trans_b) rows.
mm_short_configs = op_bench.config_list(
    attrs=[
        [128, 128, 128, False, True],
        [1024, 1024, 256, True, False],
        [8192, 8192, 1024, True, False],
    ],
    attr_names=["M", "N", "K", "trans_a", "trans_b"],
    tags=["short"],
)
# Configs for conv-1d ops
conv_1d_configs_short = op_bench.config_list(
    attr_names=['in_c', 'out_c', 'kernel', 'stride', 'N', 'L'],
    attrs=[
        [256, 256, 3, 1, 1, 64],
        [256, 256, 3, 2, 16, 128],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=['short'],
)

conv_1d_configs_long = op_bench.cross_product_configs(
    in_c=[128, 512],
    out_c=[128, 512],
    kernel=[3],
    stride=[1, 2],
    N=[4, 8],
    L=[64, 128],
    device=['cpu'],
    tags=["long"],
)


class Conv1dBenchmark(op_bench.TorchBenchmarkBase):
    """Benchmark ``nn.Conv1d`` on a random ``(N, in_c, L)`` input."""

    def init(self, in_c, out_c, kernel, stride, N, L, device):
        """Create the input tensor and the convolution module.

        Args:
            in_c: Number of input channels.
            out_c: Number of output channels.
            kernel: Convolution kernel size.
            stride: Convolution stride.
            N: Batch size of the input.
            L: Length of the input sequence.
            device: Device on which both the input and the module live.
        """
        self.input = torch.rand(N, in_c, L, device=device)
        # The module must live on the same device as its input; without the
        # explicit move its parameters stay on the default device and any
        # non-default `device` would raise a device-mismatch error.
        self.conv1d = nn.Conv1d(
            in_c, out_c, kernel, stride=stride
        ).to(device=device)
        self.set_module_name('Conv1d')

    def forward(self):
        """Apply the convolution to the prepared input."""
        return self.conv1d(self.input)
import operator_benchmark as op_bench import torch """Microbenchmarks for linear_unpack_fp16_ operator. Supports both Caffe2/PyTorch.""" # Configs for PT linear_unpack_fp16 operator linear_unpack_fp16_long_configs = op_bench.cross_product_configs( M=[8, 128], N=[32, 64], K=[256, 512], device=['cpu'], tags=["long"] ) linear_unpack_fp16_short_configs = op_bench.config_list( attr_names=["M", "N", "K"], attrs=[ [1, 1, 1], [64, 64, 64], [64, 64, 128], ], cross_product_configs={ 'device': ['cpu'], }, tags=["short"], ) class LinearUnpackFP16Benchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K, device): # input to unpack operator must be what the output is for prepack operator self.inputs = {
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch

import operator_benchmark as op_bench


# Cross product of sizes, quantized dtypes and the boolean switches below.
qcomparators_configs = op_bench.cross_product_configs(
    N=(8, 64),
    dtype=(torch.quint8, torch.qint8, torch.qint32),
    contig=(False, True),
    other_scalar=(False, True),
    out_variant=(False, True),
    tags=('short',),
)

# (op_name, op_func) pairs for the comparison operators under test.
qcomparators_ops = op_bench.op_list(
    attrs=(
        ('eq', torch.eq),
        ('ne', torch.ne),
        ('lt', torch.lt),
        ('gt', torch.gt),
        ('le', torch.le),
        ('ge', torch.ge),
    ),
    attr_names=('op_name', 'op_func'),
)
as_strided_configs_short = op_bench.config_list( attr_names=["M", "N", "size", "stride", "storage_offset"], attrs=[ [256, 256, (32, 32), (1, 1), 0], [512, 512, (64, 64), (2, 2), 1], ], cross_product_configs={ 'device': ['cpu', 'cuda'], }, tags=["short"], ) as_strided_configs_long = op_bench.cross_product_configs( M=[128, 1024], N=[128, 1024], size=[(16, 16), (128, 128)], stride=[(1, 1), (2, 2)], storage_offset=[0, 1], device=['cpu', 'cuda'], tags=['long']) class As_stridedBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, size, stride, storage_offset, device): self.input_one = torch.rand(M, N, device=device) self.size = size self.stride = stride self.storage_offset = storage_offset self.set_module_name('as_strided') def forward(self): return torch.as_strided(self.input_one, self.size, self.stride,
import torch # Configs for pointwise and reduction unary ops qmethods_configs_short = op_bench.config_list(attr_names=['M', 'N'], attrs=[ [32, 32], ], cross_product_configs={ 'dtype': [torch.quint8], 'contig': [False, True], }, tags=['short']) qmethods_configs_long = op_bench.cross_product_configs( M=[256, 1024], N=[256, 1024], dtype=[torch.qint8, torch.qint32], contig=[False, True], tags=['long']) qmethods_tensor_input_list = op_bench.op_list( attr_names=['op_name', 'op_func'], attrs=[ ['q_copy', 'copy_'], ], ) class _QMethodBenchmarkBase(op_bench.TorchBenchmarkBase): def init(self, M, N, dtype, contig, op_func): f_input = torch.rand(M, N) scale = 1.0
import operator_benchmark as op_bench import torch tensor_conversion_short_configs = op_bench.cross_product_configs( M=( 8, 16, 32, ), N=( 16, 64, 128, ), device=['cpu', 'cuda'], tags=['short'], ) tensor_conversion_long_configs = op_bench.cross_product_configs( M=( 64, 128, 256, 512, ), N=( 256, 512, 1024, 2048, ),
import operator_benchmark as op_bench import torch import numpy as np embeddingbag_conversion_short_configs = op_bench.cross_product_configs( num_embeddings=(80, ), embedding_dim=(128, 256, 512), tags=('short', )) embeddingbag_conversion_long_configs = op_bench.cross_product_configs( num_embeddings=(100, 120, 1000), embedding_dim=(16, 64, 128, 256, 512, 1024, 2048), tags=('long', )) conversion_ops = op_bench.op_list( attrs=( ('qembeddingbag_byte_prepack', torch.ops.quantized.embedding_bag_byte_prepack), ('qembeddingbag_4bit_prepack', torch.ops.quantized.embedding_bag_4bit_prepack), ('qembeddingbag_2bit_prepack', torch.ops.quantized.embedding_bag_2bit_prepack), ), attr_names=('op_name', 'op_func'), ) unpack_ops = op_bench.op_list( attrs=( ('qembeddingbag_byte_unpack', torch.ops.quantized.embedding_bag_byte_unpack), ('qembeddingbag_4bit_unpack', torch.ops.quantized.embedding_bag_4bit_unpack), ('qembeddingbag_2bit_unpack',
from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals import operator_benchmark as op_bench import torch """Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch.""" # Configs for PT add operator add_long_configs = op_bench.cross_product_configs( M=[8, 64, 128], N=range(2, 10, 3), K=[2 ** x for x in range(0, 3)], tags=["long"] ) add_short_configs = op_bench.config_list( attrs=[ [8, 16, 32], [16, 32, 64], ], attr_names=["M", "N", "K"], tags=["short"], ) class AddBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K):
attrs=[ [256, 512, 1, 2, 0], [512, 512, 2, 1, 1], ], cross_product_configs={ 'contig': ('all', 'one', 'none'), 'dtype': (torch.quint8, torch.qint8, torch.qint32), }, tags=['short'], ) qcat_configs_long = op_bench.cross_product_configs( M=[128, 1024], N=[128, 1024], K=[1, 2], L=[5, 7], dim=[0, 1, 2], contig=['all', 'one', 'none'], dtype=[torch.quint8], tags=['long'] ) class QCatBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K, L, dim, contig, dtype): f_input = (torch.rand(M, N, K) - 0.5) * 256 self.qf = nnq.QFunctional() scale = 1.0 zero_point = 0 self.qf.scale = scale self.qf.zero_point = zero_point