def check_argument_lists(self, kernel_name, arguments):
    """Verify the kernel arguments against every kernel source.

    Each non-callable kernel source is retrieved as a string and handed to
    util.check_argument_list for type checking. Callable sources (code
    generators) produce their source only at compile time and are skipped.
    """
    for source_index, source in enumerate(self.kernel_sources):
        if callable(source):
            # generated code is not available yet, nothing to check here
            logging.debug("Checking of arguments list not supported yet for code generators.")
        else:
            util.check_argument_list(kernel_name, self.get_kernel_string(source_index), arguments)
def run_kernel(kernel_name, kernel_string, problem_size, arguments, params,
               grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None,
               device=0, platform=0, cmem_args=None, compiler_options=None,
               block_size_names=None, quiet=False):
    """Compile and run a single kernel once, returning the output arguments.

    The kernel is compiled for the given parameters, executed once on the
    selected device, and the contents of all non-scalar arguments are copied
    back to the host and returned as a list (scalars are passed through).

    :raises Exception: when the kernel instance cannot be created, the kernel
        cannot be compiled, or a runtime error occurs during execution.
    """
    # split the local options into the dicts the device interface expects
    all_opts = locals()
    kernel_options = Options([(key, all_opts[key]) for key in _kernel_options.keys()])
    device_options = Options([(key, all_opts[key]) for key in _device_options.keys()])

    # language detection happens inside the device interface
    dev = core.DeviceInterface(kernel_string, iterations=1, **device_options)

    # NOTE(review): other call sites in this file pass (kernel_name,
    # kernel_string, arguments) to check_argument_list -- confirm this
    # single-argument form matches the util API in use here.
    util.check_argument_list(arguments)

    # move data to the GPU
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        # create the kernel instance for this parameter configuration
        instance = dev.create_kernel_instance(kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        # compile the kernel
        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # add constant memory arguments to the compiled module
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)
    finally:
        # temp files are no longer needed once compilation finished (or failed)
        if instance is not None:
            for temp_file in instance.temp_files.values():
                util.delete_temp_file(temp_file)

    # run the kernel once
    if not dev.run_kernel(func, gpu_args, instance):
        raise Exception("runtime error occured, too many resources requested")

    # copy data in GPU memory back to the host; scalars are returned as-is
    results = []
    for arg_index, argument in enumerate(arguments):
        if numpy.isscalar(argument):
            results.append(argument)
        else:
            host_buffer = numpy.zeros_like(argument)
            dev.memcpy_dtoh(host_buffer, gpu_args[arg_index])
            results.append(host_buffer)
    return results
def _check_user_input(kernel_name, kernel_string, arguments, block_size_names):
    """Sanity-check user-supplied input before any tuning or running starts.

    :param kernel_name: name of the kernel as it appears in the source.
    :param kernel_string: kernel source code, a filename, a list of filenames,
        or a callable code generator.
    :param arguments: list of kernel arguments whose types are verified.
    :param block_size_names: names used for the thread block dimensions.

    Errors are raised by util.check_argument_list and
    util.check_block_size_names when the input is inconsistent.
    """
    # see if the kernel arguments have correct type
    if callable(kernel_string):
        # code generators produce their source only at compile time, so the
        # argument list cannot be verified up front
        logging.debug(
            "Checking of arguments list not supported yet for code generators."
        )
    elif isinstance(kernel_string, list):
        # FIX: loop variable used to be named 'file', shadowing the builtin
        for filename in kernel_string:
            util.check_argument_list(kernel_name, util.get_kernel_string(filename), arguments)
    else:
        util.check_argument_list(kernel_name, util.get_kernel_string(kernel_string), arguments)

    # check for types and length of block_size_names
    util.check_block_size_names(block_size_names)
def run_kernel(kernel_name, kernel_string, problem_size, arguments, params,
               grid_div_x=None, grid_div_y=None, grid_div_z=None, lang=None,
               device=0, platform=0, cmem_args=None, texmem_args=None,
               compiler=None, compiler_options=None, block_size_names=None,
               quiet=False, log=None):
    """Compile and run a kernel once and return the output arguments.

    The kernel source is wrapped in a KernelSource, checked against the
    argument list, compiled for the given parameters, and executed once.
    Non-scalar arguments are copied back from the GPU and returned in a
    list; scalar arguments are passed through unchanged.

    :raises Exception: when the kernel instance cannot be created, the
        kernel cannot be compiled, or a runtime error occurs.
    """
    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    kernel_source = core.KernelSource(kernel_string, lang)
    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    # gather the options into the dicts the device interface expects
    local_args = locals()
    kernel_options = Options([(name, local_args[name]) for name in _kernel_options.keys()])
    device_options = Options([(name, local_args[name]) for name in _device_options.keys()])

    # language detection happens inside the device interface
    dev = core.DeviceInterface(kernel_source, iterations=1, **device_options)

    # move data to the GPU
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        # create a kernel instance for this parameter configuration
        instance = dev.create_kernel_instance(kernel_source, kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        # the generated source exists now, so the argument types can be verified
        util.check_argument_list(instance.name, instance.kernel_string, arguments)

        # compile the kernel
        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # attach constant and texture memory arguments to the compiled module
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)
        if texmem_args is not None:
            dev.copy_texture_memory_args(texmem_args)
    finally:
        # temp files are no longer needed once compilation finished (or failed)
        if instance is not None:
            instance.delete_temp_files()

    # run the kernel once
    if not dev.run_kernel(func, gpu_args, instance):
        raise Exception("runtime error occured, too many resources requested")

    # copy GPU memory back to the host; scalars are returned as-is
    results = []
    for arg_index, argument in enumerate(arguments):
        if numpy.isscalar(argument):
            results.append(argument)
        else:
            host_buffer = numpy.zeros_like(argument)
            dev.memcpy_dtoh(host_buffer, gpu_args[arg_index])
            results.append(host_buffer)

    # dropping the device interface here makes run_kernel work nicely
    # with the Nvidia Visual Profiler
    del dev
    return results
def tune_kernel(kernel_name, kernel_string, problem_size, arguments, tune_params,
                grid_div_x=None, grid_div_y=None, grid_div_z=None, restrictions=None,
                answer=None, atol=1e-6, verify=None, verbose=False, lang=None,
                device=0, platform=0, cmem_args=None, num_threads=1, use_noodles=False,
                sample_fraction=False, compiler_options=None, log=None, iterations=7,
                block_size_names=None, quiet=False, strategy=None, method=None):
    """Tune the kernel over the search space defined by tune_params.

    Selects a search strategy and a runner based on the user options, runs
    the tuning process, prints the best performing configuration, and
    returns the full results along with the benchmarking environment.

    :returns: a tuple (results, env) where results is a list of dicts with
        one entry per benchmarked configuration, and env records the
        environment the benchmarking took place in.
    :raises ValueError: for inconsistent or unrecognized strategy, method,
        or runner options.
    """
    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    # see if the kernel arguments have correct type
    util.check_argument_list(arguments)

    # sort all the options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    # select strategy based on user options
    if sample_fraction and strategy not in [None, 'sample_fraction']:
        # FIX: the message used to recommend strategy="random_sample", which is
        # never an accepted strategy string ('random_sample' is the imported
        # strategy module); only None and "sample_fraction" pass the check above
        raise ValueError("It's not possible to use both sample_fraction in combination with other strategies. "
                         'Please set strategy=None or strategy="sample_fraction", when using sample_fraction')

    if strategy in [None, 'sample_fraction', 'brute_force']:
        # sample_fraction switches the exhaustive search to random sampling
        use_strategy = random_sample if sample_fraction else brute_force
    elif strategy in ["minimize", "basinhopping"]:
        # both strategies delegate to scipy.optimize, which accepts these
        # method names or a custom callable
        if method:
            if not (method in ["Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "SLSQP"] or callable(method)):
                raise ValueError("method option not recognized")
        else:
            method = "L-BFGS-B"
        use_strategy = minimize if strategy == "minimize" else basinhopping
    elif strategy == "diff_evo":
        use_strategy = diff_evo
        if method:
            if method not in ["best1bin", "best1exp", "rand1exp", "randtobest1exp", "best2exp", "rand2exp",
                              "randtobest1bin", "best2bin", "rand2bin", "rand1bin"]:
                raise ValueError("method option not recognized")
    else:
        raise ValueError("strategy option not recognized")
    strategy = use_strategy

    # select runner based on user options
    if num_threads == 1 and not use_noodles:
        from kernel_tuner.runners.sequential import SequentialRunner
        runner = SequentialRunner(kernel_options, device_options, iterations)
    elif num_threads > 1 and not use_noodles:
        raise ValueError("Using multiple threads requires the Noodles runner, use use_noodles=True")
    elif use_noodles:
        # check if Python version matches what Noodles requires
        if sys.version_info[0] < 3 or (sys.version_info[0] == 3 and sys.version_info[1] < 5):
            raise ValueError("Using multiple threads requires Noodles, Noodles requires Python 3.5 or higher")
        # check if noodles is installed in a way that works with Python 3.4 or newer
        noodles_installed = importlib.util.find_spec("noodles") is not None
        if not noodles_installed:
            raise ValueError("Using multiple threads requires Noodles, please use 'pip install noodles'")
        # import the NoodlesRunner only when it is actually going to be used
        from kernel_tuner.runners.noodles import NoodlesRunner
        runner = NoodlesRunner(device_options, num_threads)
    else:
        raise ValueError("Somehow no runner was selected, this should not happen, please file a bug report")

    # call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options, tuning_options)

    # finished iterating over the search space; report the best configuration
    if results:    # checks if results is not empty
        best_config = min(results, key=lambda x: x['time'])
        print("best performing configuration:", util.get_config_string(best_config))
    else:
        print("no results to report")

    del runner.dev
    return results, env