Example #1
    def test_tensordot_autotune_pybind(self):
        tensordot_str = """
        def tensordot(float(N, C1, C2, H, W) I0, float(N, C2, C3, H, W) I1)
            -> (O)
        {
            O(n, c1, c3, h, w) +=! I0(n, c1, c2, h, w) * I1(n, c2, c3, h, w)
        }
        """
        entry_point = "tensordot"

        N, C1, C2, C3, H, W = 40, 16, 8, 20, 13, 15
        with tempfile.NamedTemporaryFile() as cache_file:
            I0 = torch.randn(N, C1, C2, H, W, device='cuda')
            I1 = torch.randn(N, C2, C3, H, W, device='cuda')

            tuner = tc.Tuner(tensordot_str, cache_file.name)
            top1 = tuner.tune(entry_point, (I0, I1),
                              tc.MappingOptions('naive'), tuner_config)

            import tensor_comprehensions.tclib as tclib
            executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                     top1)
            O = executor.run((I0, I1), ())

            cache = tc.MappingOptionsCache(cache_file.name)
            best_options, = cache.load(tensordot_str, entry_point, (I0, I1),
                                       10)
            assert str(top1) == str(best_options), (
                "Expected the same but found {}\nand\n{}".format(
                    top1, best_options))

            executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                     best_options)
            O = executor.run((I0, I1), ())

            # No simple torch baseline, compare against naive
            executor = tclib.compile(tensordot_str, entry_point, (I0, I1),
                                     tc.MappingOptions('naive'))
            ref = executor.run((I0, I1), ())

            tc.assert_almost_equal(ref, O, I0, I1, operations=C2)
Example #2
def compile(tc: str, entry_point: str,
            mapping_options: Union[str, MappingOptions],
            *inputs: torch.Tensor) -> Executor:
    r"""Returns a compiled, callable, low-overhead :class:`Executor`.

        An example of usage is provided in :class:`Executor`.

        :param tc: a string containing one or more TC defs.
        :param entry_point: the name of the TC def to compile and execute.
        :param mapping_options: the options to use for compilation.
        :param inputs: PyTorch Tensors for which the compiled kernel is specialized.

        :rtype: :class:`Executor`, a low-overhead callable class to launch the
            kernel compiled from the :code:`entry_point`.
    """
    mapping_options = (MappingOptions(mapping_options) if isinstance(
        mapping_options, str) else mapping_options)
    return Executor(tclib.compile(tc, entry_point, inputs, mapping_options))
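# Hypothetical usage sketch for the wrapper above (not part of the original
# example). It assumes a CUDA device; the call convention of the returned
# Executor (inputs passed positionally, outputs returned) is an assumption,
# not shown in the source.
import torch

mm = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
"""
A = torch.randn(3, 4, device='cuda')
B = torch.randn(4, 5, device='cuda')
executor = compile(mm, "matmul", 'naive', A, B)  # a str is promoted to MappingOptions
C = executor(A, B)  # assumed: the Executor is callable on the input tensors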
Example #3
    def test_matmul_pybind(self):
        mm_str = """
        def matmul(float(M,N) A, float(N,K) B) -> (C) {
            C(m, k) +=! A(m, r_n) * B(r_n, k)
        }
        """

        A, B = (torch.randn(3, 4, device='cuda'),
                torch.randn(4, 5, device='cuda'))

        import tensor_comprehensions.tclib as tclib
        executor = tclib.compile(mm_str, "matmul", (A, B),
                                 tc.MappingOptions('naive'))
        C = executor.run((A, B), ())
        torch.cuda.synchronize()
        expected = torch.mm(A, B)
        torch.cuda.synchronize()
        tc.assert_almost_equal(C, expected, A, B, operations=4)

        C = executor.run((A, B), (C, ))
        tc.assert_almost_equal(C, torch.mm(A, B), A, B, operations=4)
Example #4
compilation_cache = CompilationCache(mm)

tuner_config = (TunerConfig().threads(8).pop_size(25)
                .generations(3).devices("0"))

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when output sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction, see
#    below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
C = executor.run((A, B))

time_tc(100, "simple API (in place)\t",
        lambda name, ins: executor.unchecked_run(ins, (C, )), "matmul", (A, B))

time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins), "matmul", (A, B))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
# Compilation returns an allocated tuple of outputs with the proper shapes.
# Allocation overhead is negligible compared to compilation overhead.
compilation_cache.compile("matmul", (A, B), MappingOptions('naive'))
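# A possible continuation (not part of the original excerpt): once "matmul" is
# compiled into the cache, launches can be timed by entry-point name. The
# unchecked_run(name, inputs, outputs) signature on CompilationCache is an
# assumption borrowed from the TC pybind example, not shown above.
time_tc(100, "compilation cache (in place)\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, (C, )),
        "matmul", (A, B))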
Example #5
"""
mat1, mat2 = torch.randn(300, 400).cuda(), torch.randn(400, 500).cuda()

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when output sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction, see
#    below.
################################################################################
from tensor_comprehensions.tclib import compile

executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
outputs = executor.run((mat1, mat2), ())
outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))
time_tc(100, "simple API\t",
        lambda name, ins: executor.unchecked_run(ins, tuple(outputs)),
        "matmul", (mat1, mat2))
time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins,
                                                 ()), "matmul", (mat1, mat2))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
from tensor_comprehensions.tclib import CompilationCache

compilation_cache = CompilationCache(mm)
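# A possible continuation (not part of the original excerpt): compile the entry
# point into the cache, then launch it by name with negligible per-call
# overhead. The compile/unchecked_run signatures on CompilationCache are
# assumptions borrowed from the TC pybind example, not shown above.
compilation_cache.compile("matmul", (mat1, mat2), MappingOptions('naive'))
time_tc(100, "compilation cache (in place)\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins,
                                                          tuple(outputs)),
        "matmul", (mat1, mat2))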