Example #1
import torch
from tensor_comprehensions.tclib import MappingOptions, MappingOptionsCache


def generate(tc: str, entry_point: str,
             *inputs: torch.Tensor) -> MappingOptions:
    # `cache_filename` is a free variable: this generator is meant to be built
    # by a factory that closes over the cache file path (sketch below).
    cache = MappingOptionsCache(cache_filename)
    loaded = cache.load(tc, entry_point, inputs, 1)
    if len(loaded) > 0:
        return loaded[0]
    # Fall back to naive options when the cache has no matching entry.
    return MappingOptions('naive')
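
For context, a minimal sketch of the enclosing factory that would bind `cache_filename` (the factory name is illustrative, not necessarily the library's):

def make_load_from_cache_options_factory(cache_filename: str):
    # Bind one cache file; fall back to naive options on a cache miss.
    def generate(tc: str, entry_point: str,
                 *inputs: torch.Tensor) -> MappingOptions:
        loaded = MappingOptionsCache(cache_filename).load(
            tc, entry_point, inputs, 1)
        return loaded[0] if len(loaded) > 0 else MappingOptions('naive')
    return generate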
Example #2
    def compileOrTune(self,
                      name="",
                      force_reinforcement_tuning=False,
                      inputs=()):
        """Compile TC def `name` for `inputs`, autotuning first when no
        suitable options are cached or reinforcement tuning is forced."""
        if self.debug:
            print(
                "On Tc: {}\ncompile def {}, force_reinforcement_tuning {}, inputs: {}"
                .format(
                    self.tc, name, force_reinforcement_tuning,
                    "".join("{}/{}, ".format(t.size().__str__(),
                                             t.stride().__str__())
                            for t in inputs)))

        if not self.compilation_cache.is_compiled(name, inputs):
            cache = MappingOptionsCache(self.tuner_cache_file)
            mapping_options = None
            base_options_list = cache.load(self.tc, name, inputs, 1)
            if len(base_options_list) > 0 and not force_reinforcement_tuning:
                mapping_options = base_options_list[0]
                if self.debug:
                    print("Found best options in {}:\n{}".format(
                        self.tuner_cache_file, mapping_options))
            else:
                if self.debug:
                    print(
                        "########################################################"
                        "########################################################"
                    )
                    print(
                        "force_reinforcement_tuning = {} was specified, {} options loaded from "
                        "{}".format(force_reinforcement_tuning,
                                    len(base_options_list),
                                    self.tuner_cache_file))
                    print(
                        "Starting a tuning run (abort it with Ctrl+C when "
                        "performance is satisfactory).\nYou can always reinforce "
                        "the results later by passing a proper tuner cache file "
                        "and specifying force_reinforcement_tuning=True")
                    print(
                        "########################################################"
                        "########################################################"
                    )

                if len(base_options_list) == 0:
                    mapping_options = MappingOptions()
                else:
                    mapping_options = base_options_list[0]

                tuner = Tuner(self.tc, self.tuner_cache_file)
                mapping_options = tuner.tune(name, inputs, mapping_options,
                                             self.tuner_config)

            self.compilation_cache.compile(name, inputs, mapping_options)
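
A minimal sketch of a holder class this method could live on, inferred from the attributes the body uses (the class itself is an assumption, not the library's API):

class TcBuilder:
    def __init__(self, tc, tuner_cache_file, tuner_config=TunerConfig(),
                 debug=False):
        self.tc = tc
        self.tuner_cache_file = tuner_cache_file
        self.tuner_config = tuner_config
        self.debug = debug
        # Low-overhead per-(name, input shapes) executor cache, see Example #6.
        self.compilation_cache = CompilationCache(tc)

    # ... compileOrTune as defined above ...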
Example #3
def compile(tc: str, entry_point: str, mapping_options: Union[str,
                                                              MappingOptions],
            *inputs: torch.Tensor) -> Executor:
    r"""Returns a compiled, callable, low-overhead :class:`Executor`.

        An example of usage is provided in :class:`Executor`.

        :param tc: a string containing one or more TC defs.
        :param entry_point: the name of the TC def to compile and execute.
        :param mapping_options: the options to use for compilation.
        :param inputs: PyTorch Tensors for which the compiled kernel is specialized.

        :rtype: :class:`Executor`, a low-overhead callable class to launch the
            kernel compiled from the :code:`entry_point`.
    """
    if isinstance(mapping_options, str):
        mapping_options = MappingOptions(mapping_options)
    return Executor(tclib.compile(tc, entry_point, inputs, mapping_options))
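
A short usage sketch: the TC def below is the canonical matmul from the library's examples, and the tensors are assumptions; the docstring above documents the returned :class:`Executor` as callable:

import torch

mm = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
"""
A, B = torch.randn(3, 4, device='cuda'), torch.randn(4, 5, device='cuda')

# A plain string is accepted and converted to MappingOptions internally.
executor = compile(mm, "matmul", 'naive', A, B)
C = executor(A, B)  # launch the compiled kernel via the callable Executor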
Example #4
def generate(tc: str, entry_point: str,
             *inputs: torch.Tensor) -> MappingOptions:
    # Always map naively, regardless of the TC def or the input sizes.
    return MappingOptions('naive')
Example #5
def autotune(tc: str,
             entry_point: str,
             *inputs: torch.Tensor,
             starting_options: Optional[Union[str, MappingOptions]] = None,
             tuner_config: Optional[TunerConfig] = TunerConfig(),
             cache_filename: Optional[str] = None,
             load_from_cache: Optional[bool] = False,
             store_to_cache: Optional[bool] = False) -> MappingOptions:
    r"""Tunes the defined TC function for given inputs.

        The MappingOptions from which tuning starts is either passed explicitly via
        :code:`starting_options` or loaded from a cache file (when both
        :code:`cache_filename` and :code:`load_from_cache` are properly
        specified). Exactly one of :code:`starting_options` and
        :code:`load_from_cache` must be specified.

        Reinforcement tuning behavior can be obtained by tuning over multiple
        executions and specifying both :code:`load_from_cache` and
        :code:`store_to_cache`. It is recommended to use a single cache
        file for all TC defs and reinforce it over time.

        An example of usage is provided with :func:`autotune_and_compile`.

        :param tc: a string containing one or more TC defs.
        :param entry_point: the name of the TC def to compile and execute.
        :param inputs: PyTorch Tensors for which TC should tune. The inputs must be
            passed in the same order as in the definition of the TC function.
        :param starting_options: :class:`~tclib.MappingOptions` from which tuning should start.
        :param tuner_config: :class:`~tclib.TunerConfig` to control the behavior of the autotuner.
        :param cache_filename: the backing cache file from which starting options
            may be loaded and to which tuned options may be stored.
        :param load_from_cache: Get the starting :class:`~tclib.MappingOptions` by loading from
            :code:`cache_filename`. If loading fails to recover an entry
            from the cache file for the given input sizes, an assertion error
            will trigger.
        :param store_to_cache: Optionally store the best result by appending it to
            the backing cache file.

        :return: the best options found during this tuning run.
    """

    if cache_filename is not None:
        assert load_from_cache or store_to_cache, (
            "cache_filename specified, "
            "must also specify load_from_cache or store_to_cache")
    if load_from_cache or store_to_cache:
        assert cache_filename is not None, (
            "load_from_cache or store_to_cache" +
            " specified, must also specify cache_filename")
    assert starting_options is not None or load_from_cache, (
        "Must specify either starting_options or load_from_cache, choose one!")
    assert starting_options is None or not load_from_cache, (
        "Cannot specify both starting_options and load_from_cache, choose one!"
    )

    base_options = None
    if load_from_cache:
        cache = MappingOptionsCache(cache_filename)
        loaded = cache.load(tc, entry_point, inputs, 1)
        assert len(loaded) > 0, (
            "Could not load from cache for TC {} and sizes {}".format(
                entry_point, "".join(str(i.size()) + " " for i in inputs)))
        base_options = loaded[0]
    else:
        base_options = (MappingOptions(starting_options)
                        if isinstance(starting_options, str)
                        else starting_options)

    # TODO: This is still an implicit store behavior in the C++ API,
    #     make it explicit...
    tuner = Tuner(tc, cache_filename if store_to_cache else "")
    return tuner.tune(entry_point, inputs, base_options, tuner_config)
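
A usage sketch of the reinforcement pattern described in the docstring (the TC string `mm`, the tensors, and the cache path are assumptions; the keyword arguments are the documented ones):

# First run: seed the cache from explicit starting options.
best = autotune(mm, "matmul", A, B,
                starting_options='naive',
                tuner_config=TunerConfig().generations(3).pop_size(25),
                cache_filename="/tmp/matmul_options",
                store_to_cache=True)

# Later runs: load the stored result and reinforce it instead of restarting.
best = autotune(mm, "matmul", A, B,
                tuner_config=TunerConfig().generations(3).pop_size(25),
                cache_filename="/tmp/matmul_options",
                load_from_cache=True,
                store_to_cache=True)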
Example #6
# Assumes `mm` (a TC string containing a "matmul" def), CUDA tensors `A` and
# `B`, and a small `time_tc` benchmarking helper are defined earlier.
from tensor_comprehensions.tclib import (CompilationCache, MappingOptions,
                                         TunerConfig, compile)

compilation_cache = CompilationCache(mm)

tuner_config = TunerConfig().threads(8).pop_size(25).generations(3).devices("0")

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when outputs sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction, see
#    below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
# run takes (inputs, outputs); an empty outputs tuple makes the executor infer
# shapes and allocate. matmul has a single output, unpacked here into C.
C, = executor.run((A, B), ())

time_tc(100, "simple API (in place)\t",
        lambda name, ins: executor.unchecked_run(ins, (C, )), "matmul", (A, B))

time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins), "matmul", (A, B))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
# Compilation returns an allocated tuple of outputs with the proper shapes.
# Allocation overhead is negligible compared to compilation overhead.
compilation_cache.compile("matmul", (A, B), MappingOptions('naive'))
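
Once compiled, kernels can be launched straight from the cache. A timing sketch, assuming the cache exposes an unchecked_run(name, inputs, outputs) entry point mirroring the executor's:

time_tc(100, "compilation cache (with allocation overhead)\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
        "matmul", (A, B))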
Example #7
"""
mat1, mat2 = torch.randn(300, 400).cuda(), torch.randn(400, 500).cuda()

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when outputs sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction, see
#    below
################################################################################
from tensor_comprehensions.tclib import compile

executor = compile(mm, "matmul", (mat1, mat2), MappingOptions())
# The first run infers output shapes and allocates; unchecked_run then reuses
# the returned outputs and skips input shape checking.
outputs = executor.run((mat1, mat2), ())
outputs = executor.unchecked_run((mat1, mat2), tuple(outputs))
time_tc(100, "simple API\t",
        lambda name, ins: executor.unchecked_run(ins, tuple(outputs)),
        "matmul", (mat1, mat2))
time_tc(100, "simple API (with allocation overhead)\t",
        lambda name, ins: executor.unchecked_run(ins, ()),
        "matmul", (mat1, mat2))

################################################################################
# 2. Use the C++ API to build a low-overhead compilation cache and time it
################################################################################
from tensor_comprehensions.tclib import CompilationCache

compilation_cache = CompilationCache(mm)
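
As in Example #6, kernels are compiled once into the cache and then launched with low overhead; a sketch mirroring that example (unchecked_run's exact signature is an assumption):

compilation_cache.compile("matmul", (mat1, mat2), MappingOptions())
time_tc(100, "compilation cache\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
        "matmul", (mat1, mat2))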