Example #1
def make_autotuned_options_factory(
    starting_options: Optional[Union[str, MappingOptions]] = None,
    tuner_config: TunerConfig = TunerConfig(),
    cache_filename: Optional[str] = None,
    load_from_cache: Optional[bool] = False,
    store_to_cache: Optional[bool] = False
) -> (Callable[[str, str, Iterable[torch.Tensor]], MappingOptions]):
    r"""Return a factory that runs autotuning to determine the best :class:`~tclib.MappingOptions`.

        The returned factory just calls the :func:`autotune` function; see
        its documentation for more information.

        :rtype: a function that takes a string with multiple
            TC defs, an entry_point and input PyTorch Tensors, and produces a
            :class:`~tclib.MappingOptions`.
    """
    def generate(tc: str, entry_point: str,
                 *inputs: torch.Tensor) -> MappingOptions:
        return autotune(tc,
                        entry_point,
                        *inputs,
                        starting_options=starting_options,
                        tuner_config=tuner_config,
                        cache_filename=cache_filename,
                        load_from_cache=load_from_cache,
                        store_to_cache=store_to_cache)

    return generate
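
# A possible usage sketch for the factory above (not part of this module):
# it assumes the top-level `tensor_comprehensions` package and its
# `tc.define` entry point, which consumes such an options factory.
import torch
import tensor_comprehensions as tc

T = tc.define(
    "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
    tc.make_autotuned_options_factory(
        starting_options='naive',
        tuner_config=tc.TunerConfig().threads(5).generations(3).pop_size(5)))
A, B = torch.ones(1024, device='cuda'), torch.ones(1024, device='cuda')
# The first call for these input sizes triggers autotuning and compilation.
C = T.add(A, B)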
Example #2
    def __init__(self,
                 tc="",
                 forward_name="",
                 forward_force_reinforcement_tuning=False,
                 backward_name="",
                 backward_force_reinforcement_tuning=False,
                 check_output_shapes=True,
                 tuner_cache_file="",
                 tuner_config=TunerConfig(),
                 debug=False):
        if debug:
            assert isinstance(tc, str), type(tc)
            assert isinstance(forward_name, str), type(forward_name)
            assert isinstance(forward_force_reinforcement_tuning,
                              bool), type(forward_force_reinforcement_tuning)
            assert isinstance(backward_name, str), type(backward_name)
            assert isinstance(backward_force_reinforcement_tuning,
                              bool), type(backward_force_reinforcement_tuning)
            assert isinstance(check_output_shapes,
                              bool), type(check_output_shapes)
            assert isinstance(tuner_cache_file, str), type(tuner_cache_file)
            assert isinstance(tuner_config, TunerConfig), type(tuner_config)

        self.tc = tc
        self.forward_name = forward_name
        self.forward_force_reinforcement_tuning = forward_force_reinforcement_tuning
        self.backward_name = backward_name
        self.backward_force_reinforcement_tuning = backward_force_reinforcement_tuning
        self.check_output_shapes = check_output_shapes
        self.tuner_cache_file = tuner_cache_file
        self.tuner_config = tuner_config
        self.debug = debug
        self.compilation_cache = CompilationCache(self.tc)
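
# A minimal construction sketch for the builder above (a sketch only: the TC
# string reuses the matmul def shown elsewhere in these examples and the
# cache path is illustrative).
from tensor_comprehensions.tclib import TunerConfig

matmul_tc = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
"""
builder = TcBuilder(
    tc=matmul_tc,
    forward_name="matmul",
    tuner_cache_file="/tmp/some_tc_cache",
    tuner_config=TunerConfig().threads(8).generations(3).pop_size(25),
    debug=True)  # debug=True enables the isinstance checks shown above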
Example #3
def autotune_and_compile(
        tc: str,
        entry_point: str,
        *inputs: torch.Tensor,
        starting_options: Optional[Union[str, MappingOptions]] = None,
        tuner_config: Optional[TunerConfig] = TunerConfig(),
        cache_filename: Optional[str] = None,
        load_from_cache: Optional[bool] = False,
        store_to_cache: Optional[bool] = False) -> Executor:
    r"""Calls autotune, compiles with best options then returns an Executor.

    Takes the same arguments as the :func:`autotune` function.

    Example:
        >>> A, B = (
        ... torch.randn(10 ** 5, device='cuda').fill_(1.0),
        ... torch.randn(10 ** 5, device='cuda').fill_(1.0))
        >>> add = tc.autotune_and_compile(
        ...    "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }",
        ...    "add",
        ...    A, B,
        ...    starting_options='naive',
        ...    tuner_config=tc.TunerConfig().threads(5).generations(3).pop_size(5)
        ... )
        >>> C = add(A, B)
        >>> print(C.min(), C.max())
        tensor(2., device='cuda:0') tensor(2., device='cuda:0')
    """
    best = autotune(
        tc,
        entry_point,
        *inputs,
        starting_options=starting_options,
        tuner_config=tuner_config,
        cache_filename=cache_filename,
        load_from_cache=load_from_cache,
        store_to_cache=store_to_cache)
    if best is None:
        return None
    return compile(tc, entry_point, best, *inputs)
Example #4
def autotune(tc: str,
             entry_point: str,
             *inputs: torch.Tensor,
             starting_options: Optional[Union[str, MappingOptions]] = None,
             tuner_config: Optional[TunerConfig] = TunerConfig(),
             cache_filename: Optional[str] = None,
             load_from_cache: Optional[bool] = False,
             store_to_cache: Optional[bool] = False) -> MappingOptions:
    r"""Tunes the defined TC function for given inputs.

        The MappingOptions from which tuning starts is either passed explicitly via
        :code:`starting_options` or loaded from a cache file (when both
        :code:`cache_filename` and :code:`load_from_cache` are properly
        specified). Exactly one of :code:`starting_options` and
        :code:`load_from_cache` must be specified.

        It is possible to obtain a reinforcement tuning behavior by tuning over
        multiple executions and specifying both :code:`load_from_cache` and
        :code:`store_to_cache`. It is recommended to only use a single cache
        file for all TC defs and reinforce it over time.

        An example of usage is provided with :func:`autotune_and_compile`.

        :param tc: a string containing one or more TC defs.
        :param entry_point: the name of the TC def to tune.
        :param inputs: PyTorch Tensors that TC should tune for. The inputs must be
            passed in the same order as in the definition of the TC function.
        :param starting_options: :class:`~tclib.MappingOptions` from which tuning should start.
        :param tuner_config: :class:`~tclib.TunerConfig` to control the behavior of the autotuner.
        :param cache_filename: the backing cache file used when
            :code:`load_from_cache` or :code:`store_to_cache` is set.
        :param load_from_cache: get the starting :class:`~tclib.MappingOptions` by loading from
            :code:`cache_filename`. If no entry can be recovered from the cache
            file for the given input sizes, an assertion error is triggered.
        :param store_to_cache: optionally store the best result by appending it to
            the backing cache file.

        Returns:
            The best options found during this tuning run.
    """

    if cache_filename is not None:
        assert load_from_cache or store_to_cache, (
            "cache_filename specified" +
            "must also specify load_from_cache or store_to_cache")
    if load_from_cache or store_to_cache:
        assert cache_filename is not None, (
            "load_from_cache or store_to_cache" +
            " specified, must also specify cache_filename")
    assert starting_options is not None or load_from_cache, (
        "Must specify either starting_options or load_from_cache, choose one!")
    assert starting_options is None or not load_from_cache, (
        "Cannot specify both starting_options and load_from_cache, choose one!"
    )

    base_options = None
    if load_from_cache:
        cache = MappingOptionsCache(cache_filename)
        loaded = cache.load(tc, entry_point, inputs, 1)
        assert len(loaded) > 0, (
            "Could not load from cache for TC {} and sizes {}".format(
                entry_point, "".join(str(i.size()) + " " for i in inputs)))
        base_options = loaded[0]
    else:
        base_options = (MappingOptions(starting_options) if isinstance(
            starting_options, str) else starting_options)

    # TODO: This is still an implicit store behavior in the C++ API,
    #     make it explicit...
    tuner = Tuner(tc, cache_filename if store_to_cache else "")
    return tuner.tune(entry_point, inputs, base_options, tuner_config)
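
# A possible reinforcement-tuning sketch based on the docstring above: tune
# repeatedly against the same cache file, loading the previously stored best
# options as the starting point and storing the improved result back.
# The TC def and the cache path are illustrative.
import torch

add_tc = "def add(float(N) A, float(N) B) -> (C) { C(i) = A(i) + B(i) }"
A, B = (torch.randn(10 ** 5, device='cuda'),
        torch.randn(10 ** 5, device='cuda'))
cache_file = "/tmp/add_tuning_cache"

# First run seeds the cache starting from 'naive' options.
autotune(add_tc, "add", A, B,
         starting_options='naive',
         cache_filename=cache_file,
         store_to_cache=True)

# Later runs reinforce the cached state for these input sizes.
for _ in range(3):
    best = autotune(add_tc, "add", A, B,
                    cache_filename=cache_file,
                    load_from_cache=True,
                    store_to_cache=True)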
Example #5
mm = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
def matmul_agrad(float(N,K) B, float(M,K) d_C) -> (d_A) {
    d_A(m, n) +=! d_C(  m, r_k) * B(  n, r_k)
}
def matmul_bgrad(float(M,N) A, float(M,K) d_C) -> (d_B) {
    d_B(n, k) +=! d_C(r_m,   k) * A(r_m,   n)
}
"""
A, B = (torch.randn(300, 400, device='cuda', requires_grad=True),
        torch.randn(400, 500, device='cuda', requires_grad=True))

compilation_cache = CompilationCache(mm)

tuner_config = TunerConfig().threads(8).pop_size(25).generations(3).devices(
    "0")

################################################################################
# 1. Use the simple high-overhead compile/run C++ API
#    If one can keep state in their layer or wishes to experiment with TC,
#    this is a simple entry point.
#    If state cannot be kept, be aware that this API has a non-trivial overhead
#    when output sizes need to be inferred and outputs allocated.
#    Compilation itself has a prohibitive cost and needs to be memoized either
#    by holding on to the executor or by using the low-overhead abstraction, see
#    below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
C = executor.run((A, B))

time_tc(100, "simple API (in place)\t",
        # a plausible completion of the timing call (assumption): reuse the
        # already-allocated output C on every run instead of reallocating it
        lambda name, ins: executor.unchecked_run(ins, (C, )),
        "matmul", (A, B))
Example #6
#    compilation cache
################################################################################
from tensor_comprehensions.tclib import Tuner
from tensor_comprehensions.tclib import MappingOptionsCache
from tensor_comprehensions.tclib import TunerConfig

import uuid
unique_filename = "/tmp/" + str(uuid.uuid4())
print("Tune with cache @", unique_filename)
print("Note that if you pass a fixed filename, you can reinforce an " +
      "existing tuning state")

tuner = Tuner(mm, unique_filename)
top1 = tuner.tune(
    "matmul", (mat1, mat2), MappingOptions(),
    TunerConfig(threads=8, pop_size=25, generations=3, devices="0"))
cache = MappingOptionsCache(unique_filename)
top10 = cache.load(mm, "matmul", (mat1, mat2), 10)
assert top1.__str__() == top10[0].__str__()

# Compile and run with the new options
compilation_cache.compile("matmul", (mat1, mat2), top1)
time_tc(100, "raw unchecked_run tuned options\t",
        lambda name, ins: compilation_cache.unchecked_run(name, ins, ()),
        "matmul", (mat1, mat2))


################################################################################
# 4. Simple TC builder
################################################################################
class TcBuilder():