def __init__(self, tc="", forward_name="", forward_force_reinforcement_tuning=False, backward_name="", backward_force_reinforcement_tuning=False, check_output_shapes=True, tuner_cache_file="", tuner_config=TunerConfig(), debug=False): if debug: assert isinstance(tc, str), type(tc) assert isinstance(forward_name, str), type(forward_name) assert isinstance(forward_force_reinforcement_tuning, bool), type(forward_force_reinforcement_tuning) assert isinstance(backward_name, str), type(backward_name) assert isinstance(backward_force_reinforcement_tuning, bool), type(backward_force_reinforcement_tuning) assert isinstance(check_output_shapes, bool), type(tuner_cache_file) assert isinstance(tuner_cache_file, str), type(tuner_cache_file) assert isinstance(tuner_config, TunerConfig), type(tuner_config) self.tc = tc self.forward_name = forward_name self.forward_force_reinforcement_tuning = forward_force_reinforcement_tuning self.backward_name = backward_name self.backward_force_reinforcement_tuning = backward_force_reinforcement_tuning self.check_output_shapes = check_output_shapes self.tuner_cache_file = tuner_cache_file self.tuner_config = tuner_config self.debug = debug self.compilation_cache = CompilationCache(self.tc)
# Module-level verbosity flag used by TC.__call__ below.
SILENT = False


class TC(object):
    def __init__(self,
                 tc: str,
                 mapping_options_factory: Callable[
                     [str, str, Iterable[torch.Tensor]], MappingOptions]):
        self.tc = tc
        self.mapping_options_factory = mapping_options_factory
        self.compilation_cache = CompilationCache(self.tc)

        # Make each TC def in the tc string a method of the TC object so that
        # the following works:
        #   T = tc.define("def add() ...")
        #   T.add()
        def make_closure(obj: 'TC', tc_def_name: str):
            def fun(*inputs: torch.Tensor,
                    outputs: Optional[Tuple[torch.Tensor]] = None,
                    unchecked: Optional[bool] = False) -> List[torch.Tensor]:
                return obj(tc_def_name,
                           *inputs,
                           outputs=outputs,
                           unchecked=unchecked)

            return fun

        for tc_def in tclib.parse_defs(self.tc):
            self.__setattr__(tc_def, make_closure(self, tc_def))

    def __call__(self,
                 entry_point: str,
                 *inputs: torch.Tensor,
                 outputs: Optional[Tuple[torch.Tensor]] = None,
                 unchecked: Optional[bool] = False) -> List[torch.Tensor]:
        # Locally scoped implicit compilation.
        def implicit_compile(tc_obj: 'TC',
                             entry_point: str,
                             *inputs: torch.Tensor):
            already_compiled = tc_obj.compilation_cache.is_compiled(
                entry_point, inputs)
            if already_compiled:
                return

            global SILENT
            if not SILENT:
                sizes = "".join(str(i.size()) + " " for i in inputs)
                print("TC \"{}\" was not explicitly compiled for ".format(
                    entry_point) +
                      "inputs of sizes:\n  {}\n".format(sizes) +
                      "....Generate implicit MappingOptions")

            mapping_options = tc_obj.mapping_options_factory(
                tc_obj.tc, entry_point, *inputs)

            assert mapping_options is not None, (
                "No options found for TC {} ".format(entry_point) +
                "with inputs of sizes:\n  {}\n".format("".join(
                    str(i.size()) + " " for i in inputs)))

            # Compile the best options to set the executor for the current
            # (entry point, inputs) pair.
            start = time.perf_counter()
            tc_obj.compilation_cache.compile(
                entry_point, inputs, mapping_options)
            if not SILENT:
                print("Done compiling TC \"{}\" (compile time: {}ms)".format(
                    entry_point, int((time.perf_counter() - start) * 10 ** 3)))

        implicit_compile(self, entry_point, *inputs)

        # Forward pre-allocated outputs when given; an empty tuple lets the
        # backend infer shapes and allocate.
        if unchecked:
            return self.compilation_cache.unchecked_run(
                entry_point, inputs, outputs if outputs is not None else ())

        return self.compilation_cache.run(
            entry_point, inputs, outputs if outputs is not None else ())
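As a usage sketch (assuming the `mm`, `A`, and `B` defined in the example below), a `TC` object can be driven directly with a factory that always returns naive options; real code would typically consult a tuner or an options cache instead:

# Each def parsed from mm becomes a method on T, so the call below triggers
# implicit compilation on first use and returns the output tensors.
T = TC(mm, lambda tc_str, entry_point, *inputs: MappingOptions('naive'))
C, = T.matmul(A, B)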
# Define a TC string for matmul and some input torch cuda tensors.
mm = """
def matmul(float(M,N) A, float(N,K) B) -> (C) {
    C(m, k) +=! A(m, r_n) * B(r_n, k)
}
def matmul_agrad(float(N,K) B, float(M,K) d_C) -> (d_A) {
    d_A(m, n) +=! d_C(m, r_k) * B(n, r_k)
}
def matmul_bgrad(float(M,N) A, float(M,K) d_C) -> (d_B) {
    d_B(n, k) +=! d_C(r_m, k) * A(r_m, n)
}
"""
A, B = (torch.randn(300, 400, device='cuda', requires_grad=True),
        torch.randn(400, 500, device='cuda', requires_grad=True))

tuner_config = TunerConfig().threads(8).pop_size(25).generations(3).devices("0")

################################################################################
# 1. Use the simple high-overhead compile/run C++ API.
#    If you can keep state in your layer or wish to experiment with TC, this
#    is a simple entry point. If state cannot be kept, be aware that this API
#    has a non-trivial overhead when output sizes need to be inferred and
#    outputs allocated. Compilation itself has a prohibitive cost and needs to
#    be memoized, either by holding on to the executor or by using the
#    low-overhead abstraction of section 2 below.
################################################################################
executor = compile(mm, "matmul", (A, B), MappingOptions('naive'))
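The timing calls below use a `time_tc` helper that the original example file defines earlier and that is not part of this excerpt. A minimal sketch of what such a helper might look like (the min/median reporting format is an assumption, not the library's exact output):

def time_tc(iters, prepend, run_fun, tc_name, inputs):
    # Time iters runs of run_fun(tc_name, inputs), synchronizing on the GPU
    # so that kernel time is actually captured, then report min and median.
    timings = []
    for _ in range(iters):
        torch.cuda.synchronize()
        start = time.perf_counter()
        run_fun(tc_name, inputs)
        torch.cuda.synchronize()
        timings.append(time.perf_counter() - start)
    timings.sort()
    print("{} min: {:.0f}us, p50: {:.0f}us".format(
        prepend, timings[0] * 1e6, timings[len(timings) // 2] * 1e6))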
class MultiTcBuilder():
    def __init__(self,
                 tc="",
                 forward_names=(),
                 forward_input_indices=(()),
                 forward_force_reinforcement_tunings=(),
                 backward_names=(),
                 backward_input_indices=(()),
                 backward_force_reinforcement_tunings=(),
                 check_output_shapes=True,
                 tuner_cache_file="",
                 tuner_config=TunerConfig(),
                 debug=False):
        if debug:
            assert isinstance(tc, str), type(tc)
            assert isinstance(forward_names, tuple), type(forward_names)
            assert isinstance(forward_input_indices, tuple), type(
                forward_input_indices)
            assert isinstance(forward_force_reinforcement_tunings, tuple), type(
                forward_force_reinforcement_tunings)
            assert isinstance(backward_names, tuple), type(backward_names)
            assert isinstance(backward_input_indices, tuple), type(
                backward_input_indices)
            assert isinstance(backward_force_reinforcement_tunings, tuple), type(
                backward_force_reinforcement_tunings)
            assert isinstance(check_output_shapes, bool), type(
                check_output_shapes)
            assert isinstance(tuner_cache_file, str), type(tuner_cache_file)
            assert isinstance(tuner_config, TunerConfig), type(tuner_config)

        self.tc = tc
        self.forward_names = forward_names
        self.forward_input_indices = forward_input_indices
        self.forward_force_reinforcement_tunings = forward_force_reinforcement_tunings
        self.backward_names = backward_names
        self.backward_input_indices = backward_input_indices
        self.backward_force_reinforcement_tunings = backward_force_reinforcement_tunings
        self.check_output_shapes = check_output_shapes
        self.tuner_cache_file = tuner_cache_file
        self.tuner_config = tuner_config
        self.debug = debug
        self.compilation_cache = CompilationCache(self.tc)

    def compileOrTune(self, name="", force_reinforcement_tuning=False, inputs=()):
        if self.debug:
            print(
                "On TC: {}\ncompile def {}, force_reinforcement_tuning {}, "
                "inputs: {}".format(
                    self.tc, name, force_reinforcement_tuning,
                    "".join("{}/{}, ".format(str(t.size()), str(t.stride()))
                            for t in inputs)))

        if not self.compilation_cache.is_compiled(name, inputs):
            cache = MappingOptionsCache(self.tuner_cache_file)
            mapping_options = None
            base_options_list = cache.load(self.tc, name, inputs, 1)
            if len(base_options_list) > 0 and not force_reinforcement_tuning:
                mapping_options = base_options_list[0]
                if self.debug:
                    print("Found best options in {}:\n{}".format(
                        self.tuner_cache_file, mapping_options))
            else:
                if self.debug:
                    print(
                        "########################################################"
                        "########################################################")
                    print(
                        "force_reinforcement_tuning = {} was specified, {} "
                        "options loaded from {}".format(
                            force_reinforcement_tuning,
                            len(base_options_list),
                            self.tuner_cache_file))
                    print(
                        "Starting a tuning run (abort it with Ctrl+C when "
                        "performance is satisfactory.\nYou can always reinforce "
                        "the results later by passing a proper tuner cache file "
                        "and specifying force_reinforcement_tuning=True)")
                    print(
                        "########################################################"
                        "########################################################")

                if len(base_options_list) == 0:
                    mapping_options = MappingOptions('naive')
                else:
                    mapping_options = base_options_list[0]

                tuner = Tuner(self.tc, self.tuner_cache_file)
                mapping_options = tuner.tune(
                    name, inputs, mapping_options, self.tuner_config)

            self.compilation_cache.compile(name, inputs, mapping_options)
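For illustration, a minimal way one might drive `MultiTcBuilder` by hand, reusing the `mm`, `A`, `B`, and `tuner_config` from the example above. The cache file path is hypothetical, and `forward_input_indices` here merely assumes the convention that each tuple lists the positions of a def's inputs:

# Sketch: compile (or tune, on a cache miss) the forward matmul def, then run
# it through the underlying compilation cache.
builder = MultiTcBuilder(
    tc=mm,
    forward_names=("matmul",),
    forward_input_indices=((0, 1),),         # assumed: matmul reads inputs 0, 1
    tuner_cache_file="/tmp/multi_tc_cache",  # hypothetical path
    tuner_config=tuner_config,
    debug=True)
builder.compileOrTune(name="matmul", inputs=(A, B))
C, = builder.compilation_cache.run("matmul", (A, B))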
executor = compile(mm, "matmul", (mat1, mat2), MappingOptions()) outputs = executor.run((mat1, mat2), ()) outputs = executor.unchecked_run((mat1, mat2), tuple(outputs)) time_tc(100, "simple API\t", lambda name, ins: executor.unchecked_run(ins, tuple(outputs)), "matmul", (mat1, mat2)) time_tc(100, "simple API (with allocation overhead)\t", lambda name, ins: executor.unchecked_run(ins, ()), "matmul", (mat1, mat2)) ################################################################################ # 2. Use the C++ API to build a low-overhead compilation cache and time it ################################################################################ from tensor_comprehensions.tclib import CompilationCache compilation_cache = CompilationCache(mm) # Compilation returns an allocated tuple of outputs with the proper shapes. # Allocation overhead is negligible compared to compilation overhead. compilation_cache.compile("matmul", (mat1, mat2), MappingOptions()) # Run once without timing compilation_cache.unchecked_run("matmul", (mat1, mat2), ()) # unchecked_run on tensors time_tc(100, "raw unchecked_run naive options\t", lambda name, ins: compilation_cache.unchecked_run(name, ins, ()), "matmul", (mat1, mat2)) ################################################################################ # 3. Short tuning run saving to file then load the best option to create a # compilation cache ################################################################################ from tensor_comprehensions.tclib import Tuner