class TcAutotuner(object):
    def __init__(self, tc_lang, **kwargs):
        # tuner_cache will look like:
        # hash_key -> {"forward": options1, "backward": options2}
        self.tuner_cache = {}
        self.kwargs = kwargs
        self.tc_lang = tc_lang
        self.autotuner = ATenAutotuner(tc_lang)
        self.set_autotuner_parameters(**kwargs)

    def set_autotuner_parameters(
        self, pop_size=20, crossover_rate=80, mutation_rate=7,
        generations=10, number_elites=1, threads=8, gpus="0",
        proto="/tmp/tuner.txt", restore_from_proto=False, restore_number=10,
        log_generations=False, tuner_min_launch_total_threads=64, **kwargs
    ):
        self.autotuner.pop_size(pop_size)
        self.autotuner.crossover_rate(crossover_rate)
        self.autotuner.mutation_rate(mutation_rate)
        self.autotuner.generations(generations)
        self.autotuner.number_elites(number_elites)
        self.autotuner.threads(threads)
        self.autotuner.gpus(gpus)
        self.autotuner.proto(proto)
        self.autotuner.restore_from_proto(restore_from_proto)
        self.autotuner.restore_number(restore_number)
        self.autotuner.log_generations(log_generations)
        self.autotuner.tuner_min_launch_total_threads(
            tuner_min_launch_total_threads)

    # We need to pass the inputs so that we can load the correct options from
    # the cache, i.e. the ones corresponding to the input sizes. This is
    # useful when the cache contains multiple kernels and multiple sizes for
    # each kernel.
    def load(self, filename, tc_name, inputs, num_candidates=1):
        best_options = self.autotuner.load(
            filename, tc_name, inputs, num_candidates)
        if num_candidates == 1:
            return best_options[0]
        return best_options

    # If cache_file is not "", the tuning results will be saved to that file.
    def tune_and_store(self, tc_name, inputs, mapping_options, cache_file=""):
        options = mapping_options
        if not isinstance(options, Options):
            options = Options(options)
        try:
            best_options = self.autotuner.tune(
                cache_file, tc_name, inputs, options, [options])
            return best_options
        except Exception as e:
            logger.error('Raised exception: {}'.format(e))
            return options

    def autotune(self, *inputs, **kwargs):
        input_tensors = get_tensors(list(inputs))
        kwargs.update(self.kwargs)
        name, backward_name = get_tc_names_from_kwargs(**kwargs)
        kwargs.pop("name", None)
        backward = backward_name is not None
        hash_key = get_tc_hash_key(name, *input_tensors)

        # Look up the options in the cache. Whenever autotune is called,
        # tuning must happen; but if the kernel has been tuned earlier, the
        # previous options can be used to seed the tuning.
        if hash_key in self.tuner_cache:
            options_cache = self.tuner_cache[hash_key]
        else:
            options_cache = {}

        # We give priority to the options the user may have passed via a file
        # or an Options object.
        cache_file = ""
        if "cache" in kwargs and kwargs["cache"]:
            if isinstance(kwargs["cache"], bool):
                cache_file = "/tmp/{}_{}".format(hash_key, str(uuid.uuid4()))
            elif isinstance(kwargs["cache"], str):
                cache_file = kwargs["cache"]
            logger.info(
                'Autotuning cache will be saved to: {}.cuda/options'.format(
                    cache_file))
        else:
            logger.warning(
                "Autotuning results won't be cached. "
                "'cache' option is not set")

        # We first run the autotuning on the forward layer: its inputs are
        # given, so it can be tuned directly.
        kwargs["type"] = "forward"
        # We pass this tuner object so that options can be loaded from file
        # without having to create a special object.
        kwargs["tuner"] = self.autotuner
        options = get_options_from_kwargs_and_tuner_cache(
            name, cache_file, options_cache, *input_tensors, **kwargs)
        forward_best_options = self.tune_and_store(
            name, input_tensors, mapping_options=options,
            cache_file=cache_file)
        # Update the cache with the options.
        options_cache["forward"] = forward_best_options
        if not backward:
            self.tuner_cache[hash_key] = options_cache
            return forward_best_options

        # Now we have to tune the backward layer. For that, we need to run
        # the forward layer first and collect its outputs.
        logger.info('Autotuning the backward layer now')
        cu = TcCompilationUnit()
        cu.define(self.tc_lang)
        if "options" in kwargs:
            orig_options = kwargs["options"]
            kwargs["options"] = forward_best_options
            outputs = cu.compile_and_run(name, input_tensors, **kwargs)
            kwargs["options"] = orig_options
        else:
            outputs = cu.compile_and_run(
                name, input_tensors, options=forward_best_options, **kwargs)
        # Now that we have the outputs of the forward pass, we have the
        # inputs for the backward layer and can tune it.
        reorder_function = kwargs.get("reorder_function", None)
        rearranged_outputs = list(outputs)
        if reorder_function is not None:
            rearranged_outputs = reorder_function(list(outputs))
        inputs = make_contiguous(
            unpack_variables(input_tensors + list(rearranged_outputs)))

        if cache_file:
            cache_file = cache_file + "_backward"
            logger.info(
                'Backwards autotuning cache will be saved to: '
                '{}.cuda/options'.format(cache_file))

        kwargs["type"] = "backward"
        options = get_options_from_kwargs_and_tuner_cache(
            backward_name, cache_file, options_cache, *inputs, **kwargs)
        backward_best_options = self.tune_and_store(
            backward_name, inputs, mapping_options=options,
            cache_file=cache_file)
        # Update the cache with the options.
        options_cache["backward"] = backward_best_options
        self.tuner_cache[hash_key] = options_cache
        return [forward_best_options, backward_best_options]
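
# A minimal usage sketch (illustrative only: the TC definition, tensor
# shapes, and tuning parameters below are assumptions, not part of this
# module; they only show how TcAutotuner is meant to be driven):
#
#     import torch
#
#     lang = """
#     def matmul(float(M, K) A, float(K, N) B) -> (C) {
#         C(m, n) +=! A(m, k) * B(k, n)
#     }
#     """
#     A, B = torch.randn(32, 64).cuda(), torch.randn(64, 128).cuda()
#     tuner = TcAutotuner(lang, name="matmul", generations=2, pop_size=10)
#     # "naive" seeds the search; tune_and_store converts the string into an
#     # Options object. cache=True stores the tuned options under a uniquely
#     # named file in /tmp, while a path string stores them at that path.
#     best_options = tuner.autotune(A, B, options="naive", cache=True)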