# NOTE(review): time_tc, executor, A, B, C, mm and tuner_config are defined
# earlier in the file (not visible in this chunk) -- confirm against the
# full script.

# Benchmark the simple (executor) API: 100 timed runs writing into the
# preallocated output tensor C, so no per-call output allocation.
time_tc(100, "simple API (in place)\t",
        lambda name, ins: executor.unchecked_run(ins, (C, )), "matmul", (A, B))

# Same benchmark, but unchecked_run allocates its outputs on every call.
time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins), "matmul", (A, B))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
# Compilation returns an allocated tuple of outputs with the proper shapes.
# Allocation overhead is negligible compared to compilation overhead.
compilation_cache.compile("matmul", (A, B), MappingOptions('naive'))

# Run once without timing, so one-time warm-up costs don't skew the numbers.
compilation_cache.unchecked_run("matmul", (A, B))

# unchecked_run on tensors: time the cached executor directly.
time_tc(100, "raw unchecked_run naive options\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins), "matmul",
        (A, B))

################################################################################
# 3. Short tuning run saving to file then load the best option to create a
#    compilation cache
################################################################################
# The tuner persists results to cache_file; MappingOptionsCache then reads the
# best-performing options back from that same file.
with tempfile.NamedTemporaryFile() as cache_file:
    tuner = Tuner(mm, cache_file.name)
    top1 = tuner.tune("matmul", (A, B), MappingOptions('naive'), tuner_config)
    cache = MappingOptionsCache(cache_file.name)
    # Load up to the 10 best recorded options for these input sizes.
    top10 = cache.load(mm, "matmul", (A, B), 10)
# ---- Example 2 (scraped separator; original marker: "Ejemplo n.º 2") ----
class TC(object):
    """Callable wrapper around a Tensor Comprehensions (TC) source string.

    Every ``def`` parsed out of the TC string is exposed as a method of the
    instance, so that::

        T = TC("def add(...) ...", factory)
        T.add(x, y)

    works. Executors are compiled lazily per (entry point, input sizes) and
    cached in a CompilationCache.
    """

    def __init__(
        self, tc: str,
        mapping_options_factory: (Callable[[str, str, Iterable[torch.Tensor]],
                                           MappingOptions])):
        """
        Args:
            tc: TC language source containing one or more ``def``s.
            mapping_options_factory: called as
                ``factory(tc, entry_point, *inputs)`` to produce the
                MappingOptions used when a call triggers implicit compilation.
        """
        self.tc = tc
        self.mapping_options_factory = mapping_options_factory
        self.compilation_cache = CompilationCache(self.tc)

        # Make each TC def in the tc str a method of the TC object so we can:
        #     T = tc.define("def add() ...")
        #     T.add()
        #
        # A closure factory is required here: a lambda written directly in
        # the loop below would late-bind tc_def_name, making every generated
        # method dispatch to the last parsed def.
        def make_closure(obj: TC, tc_def_name: str):
            def fun(*inputs: torch.Tensor,
                    outputs: Optional[Tuple[torch.Tensor]] = None,
                    unchecked: Optional[bool] = False) -> List[torch.Tensor]:
                return obj(tc_def_name,
                           *inputs,
                           outputs=outputs,
                           unchecked=unchecked)

            return fun

        for tc_def in tclib.parse_defs(self.tc):
            setattr(self, tc_def, make_closure(self, tc_def))

    def __call__(self,
                 entry_point: str,
                 *inputs: torch.Tensor,
                 outputs: Optional[Tuple[torch.Tensor]] = None,
                 unchecked: Optional[bool] = False) -> List[torch.Tensor]:
        """Run ``entry_point`` on ``inputs``, compiling first if necessary.

        Args:
            entry_point: name of the TC def to run.
            *inputs: input tensors; their sizes key the compilation cache.
            outputs: optional preallocated output tensors to write into;
                when None the cache allocates outputs itself.
            unchecked: when True, call ``unchecked_run`` (skips argument
                checking) instead of ``run``.

        Returns:
            The list of output tensors produced by the executor.
        """

        # Locally scoped implicit compilation
        def implicit_compile(tc_obj: TC, entry_point: str,
                             *inputs: torch.Tensor):
            # Nothing to do if an executor already exists for these sizes.
            already_compiled = tc_obj.compilation_cache.is_compiled(
                entry_point, inputs)

            if already_compiled:
                return

            global SILENT
            if not SILENT:
                sizes = "".join(str(i.size()) + " " for i in inputs)
                print("TC \"{}\" was not explicitly compiled for ".format(
                    entry_point) + "inputs of sizes:\n  {}\n".format(sizes) +
                      "....Generate implicit MappingOptions")

            mapping_options = tc_obj.mapping_options_factory(
                tc_obj.tc, entry_point, *inputs)

            assert mapping_options is not None, (
                "No options found for TC {} ".format(entry_point) +
                "with inputs of sizes:\n  {}\n".format("".join(
                    str(i.size()) + " " for i in inputs)))

            # Compile best options to set the executor for the current
            #     (entry point, inputs)
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # recommended monotonic timer for measuring elapsed wall time.
            start = time.perf_counter()
            tc_obj.compilation_cache.compile(entry_point, inputs,
                                             mapping_options)
            if not SILENT:
                print("Done compiling TC \"{}\" (compile time: {}ms)".format(
                    entry_point, int((time.perf_counter() - start) * 10**3)))

        implicit_compile(self, entry_point, *inputs)

        # Fix: the original accepted `outputs` but silently dropped it.
        # Forward it when provided; the 3-argument unchecked_run form matches
        # how the compilation cache is invoked elsewhere in this file.
        # NOTE(review): run() is assumed to accept the symmetric 3-arg form --
        # confirm against tclib's binding.
        if unchecked:
            if outputs is None:
                return self.compilation_cache.unchecked_run(entry_point,
                                                            inputs)
            return self.compilation_cache.unchecked_run(entry_point, inputs,
                                                        outputs)
        if outputs is None:
            return self.compilation_cache.run(entry_point, inputs)
        return self.compilation_cache.run(entry_point, inputs, outputs)
# ---- Example 3 (scraped separator; original marker: "Ejemplo n.º 3") ----
        "matmul", (mat1, mat2))
# NOTE(review): time_tc, executor, mat1, mat2 and mm are defined earlier in
# the file (not visible in this chunk) -- confirm against the full script.

# Benchmark the simple (executor) API; the empty tuple handed to
# unchecked_run presumably means "allocate outputs on every call" -- TODO
# confirm against tclib's binding.
time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins,
                                                 ()), "matmul", (mat1, mat2))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
from tensor_comprehensions.tclib import CompilationCache

compilation_cache = CompilationCache(mm)
# Compilation returns an allocated tuple of outputs with the proper shapes.
# Allocation overhead is negligible compared to compilation overhead.
compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
# Run once without timing, so one-time warm-up costs don't skew the numbers.
compilation_cache.unchecked_run("matmul", (mat1, mat2), ())
# unchecked_run on tensors: time the cached executor directly.
time_tc(100, "raw unchecked_run naive options\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
        "matmul", (mat1, mat2))

################################################################################
# 3. Short tuning run saving to file then load the best option to create a
#    compilation cache
################################################################################
from tensor_comprehensions.tclib import Tuner
from tensor_comprehensions.tclib import MappingOptionsCache
from tensor_comprehensions.tclib import TunerConfig

# Unique path for the on-disk options cache the tuner will write to.
import uuid
unique_filename = "/tmp/" + str(uuid.uuid4())