Example #1
    def __init__(
        self, tc: str,
        mapping_options_factory: (Callable[[str, str, Iterable[torch.Tensor]],
                                           MappingOptions])):
        self.tc = tc
        self.mapping_options_factory = mapping_options_factory
        self.compilation_cache = CompilationCache(self.tc)

        # Make each TC def in the tc str a method of the TC object so we can:
        #     T = tc.define("def add() ...")
        #     T.add()
        #
        def make_closure(obj: TC, tc_def_name: str):
            def fun(*inputs: torch.Tensor,
                    outputs: Optional[Tuple[torch.Tensor]] = None,
                    unchecked: Optional[bool] = False) -> List[torch.Tensor]:
                return obj(tc_def_name,
                           *inputs,
                           outputs=outputs,
                           unchecked=unchecked)

            return fun

        for tc_def in tclib.parse_defs(self.tc):
            self.__setattr__(tc_def, make_closure(self, tc_def))
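A minimal usage sketch of the pattern the comment above describes, assuming the module-level define and make_naive_options_factory helpers from the TC Python bindings (names taken from the library's README, not from this excerpt):

import torch
import tensor_comprehensions as tc

# Each def parsed from the TC string becomes a method of the returned object.
T = tc.define(
    "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
    tc.make_naive_options_factory())
A, B = torch.randn(100, device='cuda'), torch.randn(100, device='cuda')
C, = T.add(A, B)  # equivalent to T("add", A, B); run returns a list of outputs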
Example #2
    def __init__(self,
                 tc="",
                 forward_name="",
                 forward_force_reinforcement_tuning=False,
                 backward_name="",
                 backward_force_reinforcement_tuning=False,
                 check_output_shapes=True,
                 tuner_cache_file="",
                 tuner_config=TunerConfig(),
                 debug=False):
        if debug:
            assert isinstance(tc, str), type(tc)
            assert isinstance(forward_name, str), type(forward_name)
            assert isinstance(forward_force_reinforcement_tuning,
                              bool), type(forward_force_reinforcement_tuning)
            assert isinstance(backward_name, str), type(backward_name)
            assert isinstance(backward_force_reinforcement_tuning,
                              bool), type(backward_force_reinforcement_tuning)
            assert isinstance(check_output_shapes,
                              bool), type(check_output_shapes)
            assert isinstance(tuner_cache_file, str), type(tuner_cache_file)
            assert isinstance(tuner_config, TunerConfig), type(tuner_config)

        self.tc = tc
        self.forward_name = forward_name
        self.forward_force_reinforcement_tuning = forward_force_reinforcement_tuning
        self.backward_name = backward_name
        self.backward_force_reinforcement_tuning = backward_force_reinforcement_tuning
        self.check_output_shapes = check_output_shapes
        self.tuner_cache_file = tuner_cache_file
        self.tuner_config = tuner_config
        self.debug = debug
        self.compilation_cache = CompilationCache(self.tc)
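The enclosing class is not shown in this excerpt; a hypothetical construction, assuming a TcBuilder-style class around this __init__ and reusing the mm TC string defined in Example #4 below:

# Hypothetical sketch: wire the matmul defs from Example #4 into the builder.
builder = TcBuilder(
    tc=mm,
    forward_name="matmul",
    backward_name="matmul_agrad",
    tuner_cache_file="/tmp/tc_tuner_cache",  # hypothetical path
    tuner_config=TunerConfig().generations(3).pop_size(25),
    debug=True)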
Example #3
class TC(object):
    def __init__(
        self, tc: str,
        mapping_options_factory: (Callable[[str, str, Iterable[torch.Tensor]],
                                           MappingOptions])):
        self.tc = tc
        self.mapping_options_factory = mapping_options_factory
        self.compilation_cache = CompilationCache(self.tc)

        # Make each TC def in the tc str a method of the TC object so we can:
        #     T = tc.define("def add() ...")
        #     T.add()
        #
        def make_closure(obj: TC, tc_def_name: str):
            def fun(*inputs: torch.Tensor,
                    outputs: Optional[Tuple[torch.Tensor]] = None,
                    unchecked: Optional[bool] = False) -> List[torch.Tensor]:
                return obj(tc_def_name,
                           *inputs,
                           outputs=outputs,
                           unchecked=unchecked)

            return fun

        for tc_def in tclib.parse_defs(self.tc):
            self.__setattr__(tc_def, make_closure(self, tc_def))

    def __call__(self,
                 entry_point: str,
                 *inputs: torch.Tensor,
                 outputs: Optional[Tuple[torch.Tensor]] = None,
                 unchecked: Optional[bool] = False) -> List[torch.Tensor]:

        # Locally scoped implicit compilation
        def implicit_compile(tc_obj: TC, entry_point: str,
                             *inputs: torch.Tensor):
            already_compiled = tc_obj.compilation_cache.is_compiled(
                entry_point, inputs)

            if already_compiled:
                return

            global SILENT
            if not SILENT:
                sizes = "".join(str(i.size()) + " " for i in inputs)
                print("TC \"{}\" was not explicitly compiled for ".format(
                    entry_point) + "inputs of sizes:\n  {}\n".format(sizes) +
                      "....Generate implicit MappingOptions")

            mapping_options = tc_obj.mapping_options_factory(
                tc_obj.tc, entry_point, *inputs)

            assert mapping_options is not None, (
                "No options found for TC {} ".format(entry_point) +
                "with inputs of sizes:\n  {}\n".format("".join(
                    str(i.size()) + " " for i in inputs)))

            # Compile best options to set the executor for the current
            #     (entry point, inputs)
            start = time.perf_counter()
            tc_obj.compilation_cache.compile(entry_point, inputs,
                                             mapping_options)
            if not SILENT:
                print("Done compiling TC \"{}\" (compile time: {}ms)".format(
                    entry_point, int((time.perf_counter() - start) * 10**3)))

        implicit_compile(self, entry_point, *inputs)

        if unchecked:
            return self.compilation_cache.unchecked_run(entry_point, inputs)

        return self.compilation_cache.run(entry_point, inputs)
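The first call for a given (entry_point, input sizes) pair goes through implicit_compile above; later calls with the same shapes hit the CompilationCache directly. A sketch, assuming tc_str is a TC string defining matmul and options_factory matches the constructor's signature:

# First call compiles implicitly, subsequent calls reuse the cached executor.
T = TC(tc_str, options_factory)
C, = T("matmul", A, B)               # explicit entry point
C, = T.matmul(A, B)                  # same cache entry via the generated method
C, = T.matmul(A, B, unchecked=True)  # skip shape checks on the hot path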
Example #4
# Define a TC string for matmul and some input torch CUDA tensors
mm = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
def matmul_agrad(float(N,K) B, float(M,K) d_C) -> (d_A) {
    d_A(m, n) +=! d_C(  m, r_k) * B(  n, r_k)
}
def matmul_bgrad(float(M,N) A, float(M,K) d_C) -> (d_B) {
    d_B(n, k) +=! d_C(r_m,   k) * A(r_m,   n)
}
"""
A, B = (torch.randn(300, 400, device='cuda', requires_grad=True),
        torch.randn(400, 500, device='cuda', requires_grad=True))

compilation_cache = CompilationCache(mm)

tuner_config = TunerConfig().threads(8).pop_size(25).generations(3).devices(
    "0")

################################################################################
# 1. Use the simple high-overhead compile/run C++ API.
#    If you can keep state in your layer or wish to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when output sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized, either
#    by holding on to the executor or by using the low-overhead abstraction
#    shown below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
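Running the executor mirrors Example #6 below: run infers and allocates outputs when passed an empty tuple for them, while unchecked_run skips shape checking:

C, = executor.run((A, B), ())              # allocates the output tensor
C, = executor.unchecked_run((A, B), (C,))  # reuses it and skips checks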
Example #5
class MultiTcBuilder():
    def __init__(self,
                 tc="",
                 forward_names=(),
                 forward_input_indices=(),
                 forward_force_reinforcement_tunings=(),
                 backward_names=(),
                 backward_input_indices=(),
                 backward_force_reinforcement_tunings=(),
                 check_output_shapes=True,
                 tuner_cache_file="",
                 tuner_config=TunerConfig(),
                 debug=False):
        if debug:
            assert isinstance(tc, str), type(tc)
            assert isinstance(forward_names, tuple), type(forward_names)
            assert isinstance(forward_input_indices,
                              tuple), type(forward_input_indices)
            assert isinstance(forward_force_reinforcement_tunings,
                              tuple), type(forward_force_reinforcement_tunings)
            assert isinstance(backward_names, tuple), type(backward_names)
            assert isinstance(backward_input_indices,
                              tuple), type(backward_input_indices)
            assert isinstance(
                backward_force_reinforcement_tunings,
                tuple), type(backward_force_reinforcement_tunings)
            assert isinstance(check_output_shapes,
                              bool), type(check_output_shapes)
            assert isinstance(tuner_cache_file, str), type(tuner_cache_file)
            assert isinstance(tuner_config, TunerConfig), type(tuner_config)

        self.tc = tc
        self.forward_names = forward_names
        self.forward_input_indices = forward_input_indices
        self.forward_force_reinforcement_tunings = forward_force_reinforcement_tunings
        self.backward_names = backward_names
        self.backward_input_indices = backward_input_indices
        self.backward_force_reinforcement_tunings = backward_force_reinforcement_tunings
        self.check_output_shapes = check_output_shapes
        self.tuner_cache_file = tuner_cache_file
        self.tuner_config = tuner_config
        self.debug = debug
        self.compilation_cache = CompilationCache(self.tc)

    def compileOrTune(self,
                      name="",
                      force_reinforcement_tuning=False,
                      inputs=()):
        if self.debug:
            print(
                "On Tc: {}\ncompile def {}, force_reinforcement_tuning {}, inputs: {}"
                .format(
                    self.tc, name, force_reinforcement_tuning,
                    "".join("{}/{}, ".format(t.size().__str__(),
                                             t.stride().__str__())
                            for t in inputs)))

        if not self.compilation_cache.is_compiled(name, inputs):
            cache = MappingOptionsCache(self.tuner_cache_file)
            mapping_options = None
            base_options_list = cache.load(self.tc, name, inputs, 1)
            if len(base_options_list) > 0 and not force_reinforcement_tuning:
                mapping_options = base_options_list[0]
                if self.debug:
                    print("Found best options in {}:\n{}".format(
                        self.tuner_cache_file, mapping_options))
            else:
                if self.debug:
                    print(
                        "########################################################"
                        "########################################################"
                    )
                    print(
                        "force_reinforcement_tuning = {} was specified, {} options loaded from "
                        "{}".format(force_reinforcement_tuning,
                                    len(base_options_list),
                                    self.tuner_cache_file))
                    print(
                        "Starting a tuning run (abort it with Ctrl+C when "
                        "performance is satisfactory).\nYou can always "
                        "reinforce the results later by passing a proper tuner "
                        "cache file and specifying "
                        "force_reinforcement_tuning=True")
                    print(
                        "########################################################"
                        "########################################################"
                    )

                if len(base_options_list) == 0:
                    mapping_options = MappingOptions()
                else:
                    mapping_options = base_options_list[0]

                tuner = Tuner(self.tc, self.tuner_cache_file)
                mapping_options = tuner.tune(name, inputs, mapping_options,
                                             self.tuner_config)

            self.compilation_cache.compile(name, inputs, mapping_options)
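A hypothetical end-to-end use of the builder, reusing the A, B tensors and mm string from Example #4 (cache file path and tuner settings are illustrative only):

builder = MultiTcBuilder(
    tc=mm,
    forward_names=("matmul",),
    forward_input_indices=((0, 1),),
    tuner_cache_file="/tmp/multi_tc_cache",  # hypothetical path
    tuner_config=TunerConfig().generations(1).pop_size(10),
    debug=True)
builder.compileOrTune(name="matmul", inputs=(A, B))
C, = builder.compilation_cache.run("matmul", (A, B))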
Example #6
executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
outputs = executor.run((mat1, mat2), ())
outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))
time_tc(100, "simple API\t",
        lambda name, ins: executor.unchecked_run(ins, tuple(outputs)),
        "matmul", (mat1, mat2))
time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins,
                                                 ()), "matmul", (mat1, mat2))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
from tensor_comprehensions.tclib import CompilationCache

compilation_cache = CompilationCache(mm)
# Compilation returns an allocated tuple of outputs with the proper shapes.
# Allocation overhead is negligible compared to compilation overhead.
compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
# Run once without timing
compilation_cache.unchecked_run("matmul", (mat1, mat2), ())
# unchecked_run on tensors
time_tc(100, "raw unchecked_run naive options\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
        "matmul", (mat1, mat2))

################################################################################
# 3. Short tuning run saving to file then load the best option to create a
#    compilation cache
################################################################################
from tensor_comprehensions.tclib import Tuner
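The excerpt ends at the import; a sketch of the step it introduces, built only from the Tuner and MappingOptionsCache calls already shown in Example #5 (the cache file path is hypothetical, and TunerConfig is assumed importable from tclib like the other names):

from tensor_comprehensions.tclib import MappingOptionsCache, TunerConfig

cache_file = "/tmp/matmul_tuner_cache"  # hypothetical path
tuner_config = TunerConfig().threads(8).pop_size(25).generations(3)
tuner = Tuner(mm, cache_file)
# Short tuning run starting from naive options; the best candidates are saved
# to cache_file as tuning progresses.
tuner.tune("matmul", (mat1, mat2), MappingOptions('naive'), tuner_config)
# Load the best options back and seed a low-overhead compilation cache.
best_options = MappingOptionsCache(cache_file).load(
    mm, "matmul", (mat1, mat2), 1)[0]
compilation_cache = CompilationCache(mm)
compilation_cache.compile("matmul", (mat1, mat2), best_options)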