attr_names=["N"],
    attrs=[
        [1024],
        [2048],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
        'dtype': [torch.int32],
    },
    tags=["short"],
)

fill_long_configs = op_bench.cross_product_configs(
    N=[10, 1000],
    device=torch.testing.get_all_device_types(),
    dtype=[
        torch.bool, torch.int8, torch.uint8, torch.int16, torch.int32,
        torch.int64, torch.half, torch.float, torch.double
    ],
    tags=["long"])


class Fill_Benchmark(op_bench.TorchBenchmarkBase):
    def init(self, N, device, dtype):
        self.input_one = torch.zeros(N, device=device).type(dtype)
        self.set_module_name("fill_")

    def forward(self):
        return self.input_one.fill_(10)


op_bench.generate_pt_test(fill_short_configs + fill_long_configs,
                          Fill_Benchmark)
Example #2
class BmmBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, B, M, N, K, device):
        self.inputs = {
            "batch1":
            torch.rand((B, M, K), device=device,
                       requires_grad=self.auto_set()),
            "batch2":
            torch.rand((
                B,
                K,
                N,
            ),
                       device=device,
                       requires_grad=self.auto_set())
        }
        self.set_module_name("bmm")

    def forward(self, batch1, batch2):
        return torch.bmm(batch1, batch2)


bmm_configs = op_bench.cross_product_configs(
    B=[2, 100],
    M=[8, 256],
    N=[256, 16],
    K=[16, 32],
    device=['cpu'],
    tags=["short"],
)

op_bench.generate_pt_test(bmm_configs, BmmBenchmark)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()
Example #3
import benchmark_caffe2 as op_bench_c2
import operator_benchmark as op_bench
from benchmark_caffe2 import Caffe2BenchmarkBase  # noqa
from caffe2.python import core
"""Microbenchmarks for BatchBoxCox operator."""

# Configs for C2 BatchBoxCox operator
batch_box_cox_long_configs = op_bench.cross_product_configs(
    M=[32, 64, 128],
    N=range(32, 128, 32),
    dtype=["float", "double"],
    tags=["long"])

batch_box_cox_short_configs = op_bench.config_list(
    attrs=[
        [16, 16, "float"],
        [16, 16, "double"],
        [64, 64, "float"],
        [64, 64, "double"],
    ],
    attr_names=["M", "N", "dtype"],
    tags=["short"],
)


class BatchBoxCoxBenchmark(op_bench_c2.Caffe2BenchmarkBase):
    def init(self, M, N, dtype):
        self.data = self.tensor([M, N], dtype)
        self.lambda1 = self.tensor([N], dtype)
        self.lambda2 = self.tensor([N], dtype)
        self.output = self.tensor([1, 1], dtype)
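
    def forward(self):
        # (sketch) the preview ends before forward; a Caffe2 benchmark of this
        # style usually returns the operator built over the prepared blobs, as
        # in the Add example further down this page. The input/output lists
        # below are assumptions, not taken from the original snippet.
        op = core.CreateOperator(
            "BatchBoxCox",
            [self.data, self.lambda1, self.lambda2],
            self.output,
        )
        return op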
Example #4
batch_gather_configs_short = op_bench.config_list(attr_names=["M", "N", "K"],
                                                  attrs=[
                                                      [8, 8, 1],
                                                      [256, 512, 1],
                                                      [512, 512, 1],
                                                      [8, 8, 2],
                                                      [256, 512, 2],
                                                      [512, 512, 2],
                                                  ],
                                                  cross_product_configs={
                                                      'device':
                                                      ['cpu', 'cuda'],
                                                  },
                                                  tags=["short"])

batch_gather_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    K=[1, 2],
    device=['cpu', 'cuda'],
    tags=["long"])


class BatchGatherBenchmark(op_bench_c2.Caffe2BenchmarkBase):
    def init(self, M, N, K, device):
        self.input_one = self.tensor([M, N, K], device=device)
        max_val = N
        numpy.random.seed((1 << 32) - 1)
        index_dim = numpy.random.randint(0, N)
        self.index = self.feed_tensor(numpy.random.randint(
            0, max_val, index_dim),
                                      device=device)
        self.output = self.tensor([M, index_dim, K], device=device)
        self.set_module_name("batch_gather")
Example #5
quantize_configs_long_dict = {
    'C': [3, 5, 8],  # this is reused for per-channel: avoid single channel test
    'M': [256, 1024],
    'N': [256, 1024],
    'dtype': [torch.quint8, torch.qint8, torch.qint32],
    'mode': ['D', 'Q'],
    'tags': ['long'],
}


quantize_per_tensor_configs_short = op_bench.config_list(
    **quantize_configs_short_dict
)

quantize_per_tensor_configs_long = op_bench.cross_product_configs(
    **quantize_configs_long_dict
)


class QuantizePerTensorBenchmark(op_bench.TorchBenchmarkBase):
    r"""Benchmarks both quantization and dequantization."""
    def init(self, C, M, N, dtype, mode):
        assert(mode in ('Q', 'D'))
        self.input = torch.rand(C, M, N)
        self.dtype = dtype
        self.op = nnq.Quantize(scale=1.0, zero_point=0, dtype=dtype)
        self.set_module_name('QuantizePerTensor')

        if mode == 'D':
            self.input = self.op(self.input)
            self.op = nnq.DeQuantize()
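
    def forward(self):
        # (sketch) the preview is cut here; given init above, forward would
        # simply apply self.op (Quantize or DeQuantize) to the stored input.
        return self.op(self.input)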
Example #6
"""Microbenchmarks for sparsifier."""

sparse_configs_short = op_bench.config_list(
    attr_names=["M", "SL", "SBS", "ZPB"],
    attrs=[
        [(32, 16), 0.3, (4, 1), 2],
        [(32, 16), 0.6, (1, 4), 4],
        [(17, 23), 0.9, (1, 1), 1]
    ],
    tags=("short",)
)

sparse_configs_long = op_bench.cross_product_configs(
    M=((128, 128), (255, 324)),  # Mask shape
    SL=(0.0, 1.0, 0.3, 0.6, 0.9, 0.99),  # Sparsity level
    SBS=((1, 4), (1, 8), (4, 1), (8, 1)),  # Sparse block shape
    ZPB=(0, 1, 2, 3, 4, None),  # Zeros per block
    tags=("long",)
)

class WeightNormSparsifierBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, SL, SBS, ZPB):
        weight = torch.ones(M)
        model = nn.Module()
        model.register_buffer("weight", weight)

        sparse_config = [{"tensor_fqn": "weight"}]
        self.sparsifier = sparsity.WeightNormSparsifier(
            sparsity_level=SL,
            sparse_block_shape=SBS,
            zeros_per_block=ZPB,
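        )
        # (sketch) the constructor call is cut above; a likely continuation,
        # based on the sparsifier API, attaches it to the module so it can
        # compute masks (assumed, not part of the original snippet):
        self.sparsifier.prepare(model, config=sparse_config)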
Example #7
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch
"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""

# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(
    M=[8, 64, 128],
    N=range(2, 128, 64),
    K=[8**x for x in range(0, 3)],
    device=['cpu', 'cuda'],
    tags=["long"])

add_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
Example #8
# Configs for pool-1d ops
pool_1d_configs_short = op_bench.config_list(
    attr_names=['kernel', 'stride', 'N', 'C', 'L'],
    attrs=[
        [3, 1, 8, 256, 256],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short'])

pool_1d_configs_long = op_bench.cross_product_configs(kernel=[3],
                                                      stride=[1, 2],
                                                      N=[8, 16],
                                                      C=[3],
                                                      L=[128, 256],
                                                      device=['cpu', 'cuda'],
                                                      tags=['long'])

pool_1d_ops_list = op_bench.op_list(
    attr_names=['op_name', 'op_func'],
    attrs=[
        ['MaxPool1d', nn.MaxPool1d],
        ['AvgPool1d', nn.AvgPool1d],
    ],
)


class Pool1dBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, kernel, stride, N, C, L, device, op_func):
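        # (sketch) the preview ends at the signature; a likely body, following
        # the Conv1d example further down this page (attribute names assumed):
        self.input = torch.rand(N, C, L, device=device)
        self.op = op_func(kernel, stride=stride)
        self.set_module_name('pool_1d')

    def forward(self):
        return self.op(self.input)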
Example #9
chunk_short_configs = op_bench.config_list(
    attr_names=["M", "N", "chunks"],
    attrs=[
        [8, 8, 2],
        [256, 512, 2],
        [512, 512, 2],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)

chunks_long_configs = op_bench.cross_product_configs(M=[128, 1024],
                                                     N=[128, 1024],
                                                     chunks=[2, 4],
                                                     device=['cpu', 'cuda'],
                                                     tags=['long'])


class ChunkBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, chunks, device):
        self.input_one = torch.rand(M, N, device=device)
        self.chunks = chunks
        self.set_module_name('chunk')

    def forward(self):
        return torch.chunk(self.input_one, self.chunks)


op_bench.generate_pt_test(chunk_short_configs + chunks_long_configs,
                          ChunkBenchmark)
Example #10
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for sum reduction operator."""

# Configs for PT sum operator
sum_configs = op_bench.cross_product_configs(
    R=[64, 256],  # Length of reduced dimension
    V=[32, 512],  # Length of other dimension
    dim=[0, 1],
    contiguous=[True, False],
    device=['cpu', 'cuda'],
    tags=['short']
) + op_bench.cross_product_configs(
    R=[1024, 8192],
    V=[512, 1024],
    dim=[0, 1],
    contiguous=[True, False],
    device=['cpu', 'cuda'],
    tags=['long']
)


class SumBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, R, V, dim, contiguous, device):
        shape = (R, V) if dim == 0 else (V, R)
        tensor = torch.rand(shape, device=device)

        if not contiguous:
            storage = torch.empty([s * 2 for s in shape], device=device)
            storage[::2, ::2] = tensor
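            # (sketch) likely continuation: keep only the strided view so the
            # non-contiguous case really sums over a strided tensor; attribute
            # names below are assumptions, the preview is cut here.
            self.input_tensor = storage[::2, ::2]
        else:
            self.input_tensor = tensor
        self.dim = dim
        self.set_module_name('sum')

    def forward(self):
        return self.input_tensor.sum(dim=self.dim)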
Example #11
import operator_benchmark as op_bench
import torch
import numpy

"""EmbeddingBag Operator Benchmark"""

embeddingbag_short_configs = op_bench.cross_product_configs(
    embeddingbags=[80, 120, 1000, 2300],
    dim=[64],
    mode=['sum'],
    input_size=[8, 16, 64],
    offset=[0],
    sparse=[True],
    tags=['short']
)


class EmbeddingBagBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, embeddingbags, dim, mode, input_size, offset, sparse):
        self.embedding = torch.nn.EmbeddingBag(
            num_embeddings=embeddingbags,
            embedding_dim=dim,
            mode=mode,
            sparse=sparse)
        numpy.random.seed((1 << 32) - 1)
        self.input = torch.tensor(numpy.random.randint(0, embeddingbags, input_size)).long()
        self.offset = torch.LongTensor([offset])

        self.set_module_name('embeddingbag')

    def forward(self):
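        # (sketch) likely body, applying the bag to the indices and offsets
        # prepared in init; the preview is cut here.
        return self.embedding(self.input, self.offset)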
Example #12
cat_configs_short = op_bench.config_list(
    attr_names=['M', 'N', 'K', 'dim'],
    attrs=[
        [1, 1, 1, 0],
        [256, 512, 1, 0],
        [512, 512, 2, 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short'],
)

cat_configs_long = op_bench.cross_product_configs(M=[128],
                                                  N=[128, 1024],
                                                  K=[1, 2],
                                                  dim=[0, 1, 2],
                                                  device=['cpu', 'cuda'],
                                                  tags=['long'])


class CatBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, dim, device):
        self.input_one = torch.rand(M, N, K, device=device)
        self.dim = dim
        self.set_module_name('cat')

    def forward(self):
        return torch.cat((self.input_one, self.input_one), dim=self.dim)


op_bench.generate_pt_test(cat_configs_short + cat_configs_long, CatBenchmark)
Example #13
import operator_benchmark as op_bench
import torch
"""Microbenchmarks for add_ operator. Supports both Caffe2/PyTorch."""

# Configs for PT add operator
add_long_configs = op_bench.cross_product_configs(M=[8, 128],
                                                  N=[32, 64],
                                                  K=[256, 512],
                                                  device=['cpu', 'cuda'],
                                                  tags=["long"])

add_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [1, 1, 1],
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        self.input_one = torch.rand(M,
                                    N,
                                    K,
                                    device=device,
                                    requires_grad=self.auto_set())
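        # (sketch) likely remainder of init/forward, following the same pattern
        # as the other Add benchmarks on this page; the preview is cut here.
        self.input_two = torch.rand(M, N, K, device=device,
                                    requires_grad=self.auto_set())
        self.set_module_name("add")

    def forward(self):
        return torch.add(self.input_one, self.input_two)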
Example #14
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


import operator_benchmark as op_bench
import torch


"""Microbenchmarks for quantized instancenorm operator."""

instancenorm_configs_short = op_bench.cross_product_configs(
    dims=(
        (32, 8, 16),
        (32, 8, 56, 56),
    ),
    dtype=(torch.qint8,),
    tags=["short"],
)


class QInstanceNormBenchmark(op_bench.TorchBenchmarkBase):

    def init(self, dims, dtype):
        X = (torch.rand(*dims) - 0.5) * 256
        num_channels = dims[1]
        scale = 1.0
        zero_point = 0
        self.qX = torch.quantize_per_tensor(
            X, scale=scale, zero_point=zero_point, dtype=dtype)
        self.weight = torch.rand(num_channels, dtype=torch.float)
Example #15
conv_1d_configs_short = op_bench.config_list(
    attr_names=[
        'IC', 'OC', 'kernel', 'stride', 'N', 'L',
    ],
    attrs=[
        [128, 256, 3, 1, 1, 64],
        [256, 256, 3, 2, 4, 64],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=['short']
)

conv_1d_configs_long = op_bench.cross_product_configs(
    IC=[128, 512],
    OC=[128, 512],
    kernel=[3],
    stride=[1, 2],
    N=[8],
    L=[128],
    device=['cpu', 'cuda'],
    tags=["long"]
)

# Configs for Conv2d and ConvTranspose2d
conv_2d_configs_short = op_bench.config_list(
    attr_names=[
        'IC', 'OC', 'kernel', 'stride', 'N', 'H', 'W', 'G', 'pad',
    ],
    attrs=[
        [256, 256, 3, 1, 1, 16, 16, 1, 0],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
Example #16
import operator_benchmark as op_bench
import torch


"""Microbenchmarks for ClipRanges operator."""
torch.ops.load_library("//caffe2/torch/fb/sparsenn:sparsenn_operators")

# Configs for C2 ClipRanges operator
clip_ranges_long_configs = op_bench.cross_product_configs(
    LENGTH=range(1, 100),
    M=[1],
    N=[2],
    MAX_LENGTH=range(1, 100),
    device=['cpu', 'cuda'],
    dtype=[torch.int32],
    tags=["long"],
)


clip_ranges_short_configs = op_bench.config_list(
    attrs=[
        [6, 1, 2, 1, torch.int32],
        [7, 1, 2, 2, torch.int32],
        [8, 1, 2, 3, torch.int32],
        [9, 1, 2, 4, torch.int32],
        [10, 1, 2, 5, torch.int32],
    ],
    attr_names=["LENGTH", "M", "N", "MAX_LENGTH", "dtype"],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
Example #17
    cross_product_configs={
        'N': (2, ),
        'contig': (True, ),
        'dtype': (torch.qint32, torch.qint8, torch.quint8),
    },
    tags=('short', ))

qadaptive_avgpool2d_long_configs = op_bench.cross_product_configs(
    input_size=(
        # VGG16 pools with original input shape: (-1, 3, 224, 224)
        (112, 112),  # MaxPool2d-9  # noqa
    ),
    output_size=(
        (448, 448),
        # VGG16 pools with original input shape: (-1, 3, 224, 224)
        (224, 224),  # MaxPool2d-4  # noqa
        (112, 112),  # MaxPool2d-9  # noqa
        (56, 56),  # MaxPool2d-16 # noqa
        (14, 14),  # MaxPool2d-30 # noqa
    ),
    N=(1, 4),
    C=(1, 3, 64, 128),
    contig=(False, True),
    dtype=(torch.quint8, ),
    tags=('long', ))

qadaptive_avgpool2d_short_configs = op_bench.config_list(
    attrs=((4, 3, (224, 224), (112, 112), True), ),
    attr_names=('N', 'C', 'input_size', 'output_size', 'contig'),
    cross_product_configs={
        'dtype': (torch.qint32, torch.qint8, torch.quint8),
    },
Example #18
import operator_benchmark as op_bench
from caffe2.python import core

add_configs = op_bench.cross_product_configs(M=[8],
                                             N=[8],
                                             K=[8],
                                             tags=["short"],
                                             device=["cuda", "cpu"])


class AddBenchmark(op_bench.Caffe2BenchmarkBase):
    def init(self, M, N, K, device):
        self.set_module_name("add")
        self.input_one = self.tensor([M, N, K], device=device)
        self.input_two = self.tensor([M, N, K], device=device)
        self.input_one_grad = self.tensor([M, N, K], device=device)
        self.input_two_grad = self.tensor([M, N, K], device=device)
        self.output = self.tensor([M, N, K], device=device)

    def forward(self):
        op = core.CreateOperator("Add", [self.input_one, self.input_two],
                                 self.output, **self.args)
        return op

    def backward(self):
        grad_op = core.CreateOperator(
            "AddGradient", [self.output, self.input_one, self.input_two],
            [self.input_one_grad, self.input_two_grad], **self.args)
        return grad_op
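
# (sketch) the usual Caffe2 registration/run boilerplate follows in the full
# file; the preview is cut before it, so these exact calls are assumptions:
op_bench.generate_c2_test(add_configs, AddBenchmark)
op_bench.generate_c2_gradient_test(add_configs, AddBenchmark)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()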

Example #19
mm_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K", "trans_a", "trans_b"],
    attrs=[
        [128, 128, 128, True, False],
        [256, 256, 256, False, True],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=["short"],
)

mm_long_configs = op_bench.cross_product_configs(M=[64, 128, 256],
                                                 N=range(2, 10, 3),
                                                 K=[128, 512, 1024],
                                                 trans_a=[True, False],
                                                 trans_b=[True, False],
                                                 device=['cpu'],
                                                 tags=["long"])


class MatMulBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, trans_a, trans_b, device):
        self.input_one = torch.rand(M, N, device=device) if trans_a \
            else torch.rand(N, M, device=device).t()
        self.input_two = torch.rand(N, K, device=device) if trans_b \
            else torch.rand(K, N, device=device).t()
        self.set_module_name("matmul")

    def forward(self):
        return torch.matmul(self.input_one, self.input_two)
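
# (sketch) the registration/run boilerplate that normally follows; the preview
# is cut before it, so the exact call below is an assumption:
op_bench.generate_pt_test(mm_long_configs + mm_short_configs, MatMulBenchmark)

if __name__ == "__main__":
    op_bench.benchmark_runner.main()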
Example #20
import torch
import torch.nn.quantized as nnq

import operator_benchmark as op_bench
r"""Microbenchmarks for the quantized activations."""

qactivation_long_configs = op_bench.cross_product_configs(
    dims=(
        # VGG-16 relu's with original shape: (-1, 3, 224, 224)
        (64, 224, 224),  # ReLU-1   # noqa: E201
        (128, 112, 112),  # ReLU-6
        (256, 56, 56),  # ReLU-11  # noqa: E241
        (512, 28, 28),  # ReLU-18  # noqa: E241
        (512, 14, 14),  # ReLU-25  # noqa: E241
        # Batch = 16
        (16, 64, 224, 224),  # ReLU-1   # noqa: E241
        (16, 128, 112, 112),  # ReLU-6
        (16, 256, 56, 56),  # ReLU-11  # noqa: E241
        (16, 512, 28, 28),  # ReLU-18  # noqa: E241
        (16, 512, 14, 14),  # ReLU-25  # noqa: E241
    ),
    contig=(False, True),
    inplace=(False, True),
    dtype=(torch.quint8, ),
    tags=('long', ))

qactivation_short_configs = op_bench.cross_product_configs(
    dims=(
        (3, 4, 5),  # Rank=3
        (2, 3, 4, 5),  # Rank=4,
        # Dimensions from the floating point benchmarks
Example #21
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals


import operator_benchmark as op_bench
from caffe2.python import core 

"""Microbenchmarks for MatMul operator"""

# Configs for C2 Matmul operator
mm_long_configs = op_bench.cross_product_configs(
    M=[8, 64, 128],
    N=range(2, 10, 3),
    K=[2 ** x for x in range(0, 3)], 
    trans_a=[True, False],
    trans_b=[True, False],
    tags=["long"]
)


mm_short_configs = op_bench.config_list(
    attrs=[
        [128, 128, 128, False, True],
        [1024, 1024, 256, True, False],
        [8192, 8192, 1024, True, False],
    ],
    attr_names=["M", "N", "K", "trans_a", "trans_b"], 
    tags=["short"], 
)
Example #22
# Configs for conv-1d ops
conv_1d_configs_short = op_bench.config_list(
    attr_names=['in_c', 'out_c', 'kernel', 'stride', 'N', 'L'],
    attrs=[
        [256, 256, 3, 1, 1, 64],
        [256, 256, 3, 2, 16, 128],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=['short'])

conv_1d_configs_long = op_bench.cross_product_configs(in_c=[128, 512],
                                                      out_c=[128, 512],
                                                      kernel=[3],
                                                      stride=[1, 2],
                                                      N=[4, 8],
                                                      L=[64, 128],
                                                      device=['cpu'],
                                                      tags=["long"])


class Conv1dBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, in_c, out_c, kernel, stride, N, L, device):
        self.input = torch.rand(N, in_c, L, device=device)
        self.conv1d = nn.Conv1d(in_c, out_c, kernel, stride=stride)
        self.set_module_name('Conv1d')

    def forward(self):
        return self.conv1d(self.input)
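
# (sketch) likely registration for the Conv1d benchmark above; the preview is
# cut before it, so the exact call is an assumption:
op_bench.generate_pt_test(conv_1d_configs_short + conv_1d_configs_long,
                          Conv1dBenchmark)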

Example #23
import operator_benchmark as op_bench
import torch

"""Microbenchmarks for linear_unpack_fp16_ operator. Supports both Caffe2/PyTorch."""

# Configs for PT linear_unpack_fp16 operator
linear_unpack_fp16_long_configs = op_bench.cross_product_configs(
    M=[8, 128],
    N=[32, 64],
    K=[256, 512],
    device=['cpu'],
    tags=["long"]
)

linear_unpack_fp16_short_configs = op_bench.config_list(
    attr_names=["M", "N", "K"],
    attrs=[
        [1, 1, 1],
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu'],
    },
    tags=["short"],
)

class LinearUnpackFP16Benchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device):
        # input to unpack operator must be what the output is for prepack operator
        self.inputs = {
Example #24
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch

import operator_benchmark as op_bench

qcomparators_configs = op_bench.cross_product_configs(
    N=(8, 64),
    dtype=(torch.quint8, torch.qint8, torch.qint32),
    contig=(False, True),
    other_scalar=(False, True),
    out_variant=(False, True),
    tags=('short',)
)

qcomparators_ops = op_bench.op_list(
    attrs=(
        ('eq', torch.eq),
        ('ne', torch.ne),
        ('lt', torch.lt),
        ('gt', torch.gt),
        ('le', torch.le),
        ('ge', torch.ge),
    ),
    attr_names=('op_name', 'op_func'),
)

Example #25
as_strided_configs_short = op_bench.config_list(
    attr_names=["M", "N", "size", "stride", "storage_offset"],
    attrs=[
        [256, 256, (32, 32), (1, 1), 0],
        [512, 512, (64, 64), (2, 2), 1],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
    },
    tags=["short"],
)

as_strided_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    size=[(16, 16), (128, 128)],
    stride=[(1, 1), (2, 2)],
    storage_offset=[0, 1],
    device=['cpu', 'cuda'],
    tags=['long'])


class As_stridedBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, size, stride, storage_offset, device):
        self.input_one = torch.rand(M, N, device=device)
        self.size = size
        self.stride = stride
        self.storage_offset = storage_offset
        self.set_module_name('as_strided')

    def forward(self):
        return torch.as_strided(self.input_one, self.size, self.stride,
                                self.storage_offset)
Example #26
import operator_benchmark as op_bench
import torch

# Configs for pointwise and reduction unary ops
qmethods_configs_short = op_bench.config_list(attr_names=['M', 'N'],
                                              attrs=[
                                                  [32, 32],
                                              ],
                                              cross_product_configs={
                                                  'dtype': [torch.quint8],
                                                  'contig': [False, True],
                                              },
                                              tags=['short'])

qmethods_configs_long = op_bench.cross_product_configs(
    M=[256, 1024],
    N=[256, 1024],
    dtype=[torch.qint8, torch.qint32],
    contig=[False, True],
    tags=['long'])

qmethods_tensor_input_list = op_bench.op_list(
    attr_names=['op_name', 'op_func'],
    attrs=[
        ['q_copy', 'copy_'],
    ],
)


class _QMethodBenchmarkBase(op_bench.TorchBenchmarkBase):
    def init(self, M, N, dtype, contig, op_func):
        f_input = torch.rand(M, N)
        scale = 1.0
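        # (sketch) likely continuation: quantize the input once so the method
        # under test runs on a ready quantized tensor (details assumed; the
        # preview is cut here):
        zero_point = 0
        self.q_input = torch.quantize_per_tensor(f_input, scale=scale,
                                                 zero_point=zero_point,
                                                 dtype=dtype)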
Example #27
import operator_benchmark as op_bench
import torch

tensor_conversion_short_configs = op_bench.cross_product_configs(
    M=(
        8,
        16,
        32,
    ),
    N=(
        16,
        64,
        128,
    ),
    device=['cpu', 'cuda'],
    tags=['short'],
)

tensor_conversion_long_configs = op_bench.cross_product_configs(
    M=(
        64,
        128,
        256,
        512,
    ),
    N=(
        256,
        512,
        1024,
        2048,
    ),
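    # (sketch) likely remaining arguments, mirroring the short configs above
    # (the preview is cut here):
    device=['cpu', 'cuda'],
    tags=['long'],
)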
Example #28
import operator_benchmark as op_bench
import torch
import numpy as np

embeddingbag_conversion_short_configs = op_bench.cross_product_configs(
    num_embeddings=(80, ), embedding_dim=(128, 256, 512), tags=('short', ))

embeddingbag_conversion_long_configs = op_bench.cross_product_configs(
    num_embeddings=(100, 120, 1000),
    embedding_dim=(16, 64, 128, 256, 512, 1024, 2048),
    tags=('long', ))

conversion_ops = op_bench.op_list(
    attrs=(
        ('qembeddingbag_byte_prepack',
         torch.ops.quantized.embedding_bag_byte_prepack),
        ('qembeddingbag_4bit_prepack',
         torch.ops.quantized.embedding_bag_4bit_prepack),
        ('qembeddingbag_2bit_prepack',
         torch.ops.quantized.embedding_bag_2bit_prepack),
    ),
    attr_names=('op_name', 'op_func'),
)

unpack_ops = op_bench.op_list(
    attrs=(
        ('qembeddingbag_byte_unpack',
         torch.ops.quantized.embedding_bag_byte_unpack),
        ('qembeddingbag_4bit_unpack',
         torch.ops.quantized.embedding_bag_4bit_unpack),
        ('qembeddingbag_2bit_unpack',
         torch.ops.quantized.embedding_bag_2bit_unpack),
    ),
    attr_names=('op_name', 'op_func'),
)
Example #29
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import operator_benchmark as op_bench
import torch

"""Microbenchmarks for element-wise Add operator. Supports both Caffe2/PyTorch."""

# Configs for PT add operator 
add_long_configs = op_bench.cross_product_configs(
    M=[8, 64, 128],
    N=range(2, 10, 3),
    K=[2 ** x for x in range(0, 3)], 
    tags=["long"]
)


add_short_configs = op_bench.config_list(
    attrs=[
        [8, 16, 32],
        [16, 32, 64],
    ],
    attr_names=["M", "N", "K"], 
    tags=["short"], 
)


class AddBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K): 
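        # (sketch) likely body, matching the other element-wise Add examples on
        # this page; the preview is cut here.
        self.input_one = torch.rand(M, N, K)
        self.input_two = torch.rand(M, N, K)
        self.set_module_name("add")

    def forward(self):
        return torch.add(self.input_one, self.input_two)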
Example #30
qcat_configs_short = op_bench.config_list(
    attr_names=['M', 'N', 'K', 'L', 'dim'],
    attrs=[
        [256, 512, 1, 2, 0],
        [512, 512, 2, 1, 1],
    ],
    cross_product_configs={
        'contig': ('all', 'one', 'none'),
        'dtype': (torch.quint8, torch.qint8, torch.qint32),
    },
    tags=['short'],
)

qcat_configs_long = op_bench.cross_product_configs(
    M=[128, 1024],
    N=[128, 1024],
    K=[1, 2],
    L=[5, 7],
    dim=[0, 1, 2],
    contig=['all', 'one', 'none'],
    dtype=[torch.quint8],
    tags=['long']
)


class QCatBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, L, dim, contig, dtype):
        f_input = (torch.rand(M, N, K) - 0.5) * 256
        self.qf = nnq.QFunctional()
        scale = 1.0
        zero_point = 0
        self.qf.scale = scale
        self.qf.zero_point = zero_point