time_tc(100, "simple API (in place)\t", lambda name, ins: executor.unchecked_run(ins, (C, )), "matmul", (A, B)) time_tc(100, "simple API (with allocation overhead)\t", lambda name, ins: executor.unchecked_run(ins), "matmul", (A, B)) ################################################################################ # 2. Use the C++ API to build a low-overhead compilation cache and time it ################################################################################ # Compilation returns an allocated tuple of outputs with the proper shapes. # Allocation overhead is negligible compared to compilation overhead. compilation_cache.compile("matmul", (A, B), MappingOptions('naive')) # Run once without timing compilation_cache.unchecked_run("matmul", (A, B)) # unchecked_run on tensors time_tc(100, "raw unchecked_run naive options\t", lambda name, ins: compilation_cache.unchecked_run(name, ins), "matmul", (A, B)) ################################################################################ # 3. Short tuning run saving to file then load the best option to create a # compilation cache ################################################################################ with tempfile.NamedTemporaryFile() as cache_file: tuner = Tuner(mm, cache_file.name) top1 = tuner.tune("matmul", (A, B), MappingOptions('naive'), tuner_config) cache = MappingOptionsCache(cache_file.name) top10 = cache.load(mm, "matmul", (A, B), 10)
class TC(object):
    def __init__(
            self,
            tc: str,
            mapping_options_factory: (
                Callable[[str, str, Iterable[torch.Tensor]], MappingOptions])):
        self.tc = tc
        self.mapping_options_factory = mapping_options_factory
        self.compilation_cache = CompilationCache(self.tc)

        # Make each TC def in the tc str a method of the TC object so we can:
        #     T = tc.define("def add() ...")
        #     T.add()
        #
        def make_closure(obj: TC, tc_def_name: str):
            def fun(*inputs: torch.Tensor,
                    outputs: Optional[Tuple[torch.Tensor]] = None,
                    unchecked: Optional[bool] = False) -> List[torch.Tensor]:
                return obj(tc_def_name,
                           *inputs,
                           outputs=outputs,
                           unchecked=unchecked)

            return fun

        for tc_def in tclib.parse_defs(self.tc):
            self.__setattr__(tc_def, make_closure(self, tc_def))

    def __call__(self,
                 entry_point: str,
                 *inputs: torch.Tensor,
                 outputs: Optional[Tuple[torch.Tensor]] = None,
                 unchecked: Optional[bool] = False) -> List[torch.Tensor]:
        # Locally scoped implicit compilation
        def implicit_compile(tc_obj: TC,
                             entry_point: str,
                             *inputs: torch.Tensor):
            already_compiled = tc_obj.compilation_cache.is_compiled(
                entry_point, inputs)

            if already_compiled:
                return

            global SILENT
            if not SILENT:
                sizes = "".join(str(i.size()) + " " for i in inputs)
                print("TC \"{}\" was not explicitly compiled for ".format(
                    entry_point) +
                    "inputs of sizes:\n  {}\n".format(sizes) +
                    "....Generate implicit MappingOptions")

            mapping_options = tc_obj.mapping_options_factory(
                tc_obj.tc, entry_point, *inputs)

            assert mapping_options is not None, (
                "No options found for TC {} ".format(entry_point) +
                "with inputs of sizes:\n  {}\n".format("".join(
                    str(i.size()) + " " for i in inputs)))

            # Compile best options to set the executor for the current
            #     (entry point, inputs)
            start = time.clock()
            tc_obj.compilation_cache.compile(
                entry_point, inputs, mapping_options)
            if not SILENT:
                print("Done compiling TC \"{}\" (compile time: {}ms)".format(
                    entry_point, int((time.clock() - start) * 10**3)))

        implicit_compile(self, entry_point, *inputs)

        if unchecked:
            return self.compilation_cache.unchecked_run(entry_point, inputs)

        return self.compilation_cache.run(entry_point, inputs)
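# Usage sketch (an illustration added here, not part of the library source):
# the TC string, tensors, and factory below are hypothetical; only the TC
# class and MappingOptions come from the code above.
#
#   import torch
#
#   mm = """
#   def matmul(float(M,N) A, float(N,K) B) -> (C) {
#       C(m, k) +=! A(m, r_n) * B(r_n, k)
#   }
#   """
#   A, B = torch.randn(32, 64).cuda(), torch.randn(64, 128).cuda()
#
#   # The factory receives (tc string, entry point, *inputs) and must return a
#   # MappingOptions; here it always falls back to the naive strategy.
#   T = TC(mm, lambda tc, entry_point, *inputs: MappingOptions('naive'))
#
#   C, = T.matmul(A, B)                  # first call compiles implicitly
#   C, = T.matmul(A, B, unchecked=True)  # later calls can skip shape checks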
"matmul", (mat1, mat2)) time_tc(100, "simple API (with allocation overhead)\t", lambda name, ins: executor.unchecked_run(ins, ()), "matmul", (mat1, mat2)) ################################################################################ # 2. Use the C++ API to build a low-overhead compilation cache and time it ################################################################################ from tensor_comprehensions.tclib import CompilationCache compilation_cache = CompilationCache(mm) # Compilation returns an allocated tuple of outputs with the proper shapes. # Allocation overhead is negligible compared to compilation overhead. compilation_cache.compile("matmul", (mat1, mat2), MappingOptions()) # Run once without timing compilation_cache.unchecked_run("matmul", (mat1, mat2), ()) # unchecked_run on tensors time_tc(100, "raw unchecked_run naive options\t", lambda name, ins: compilation_cache.unchecked_run(name, ins, ()), "matmul", (mat1, mat2)) ################################################################################ # 3. Short tuning run saving to file then load the best option to create a # compilation cache ################################################################################ from tensor_comprehensions.tclib import Tuner from tensor_comprehensions.tclib import MappingOptionsCache from tensor_comprehensions.tclib import TunerConfig import uuid unique_filename = "/tmp/" + str(uuid.uuid4())