def create_kernel_instance(self, kernel_source, kernel_options, params, verbose):
    """Build a KernelInstance for a single parameter configuration.

    Thread block and grid dimensions are derived from the problem size, the
    grid divisors and ``params``; the kernel source object is then asked to
    prepare the kernel string and any additional files.

    Note that ``kernel_options.block_size_names`` is overwritten with
    ``util.default_block_size_names`` when it is empty.

    :param kernel_source: object providing ``prepare_list_of_files``
    :param kernel_options: options object holding problem size, grid
        divisors, block size names, kernel name and arguments
    :param params: the tunable parameters of this configuration
    :param verbose: when truthy, print the reason a configuration is skipped
    :returns: a KernelInstance, or None when the thread block would hold more
        threads than ``self.dev.max_threads``
    """
    config_label = util.get_instance_string(params)
    divisors = (
        kernel_options.grid_div_x,
        kernel_options.grid_div_y,
        kernel_options.grid_div_z,
    )

    # fall back to the default block size names when none were supplied
    if not kernel_options.block_size_names:
        kernel_options.block_size_names = util.default_block_size_names

    # derive thread block and grid dimensions for this configuration
    threads, grid = util.setup_block_and_grid(
        kernel_options.problem_size,
        divisors,
        params,
        kernel_options.block_size_names,
    )

    # configurations whose thread block exceeds the device limit are skipped
    block_is_too_large = numpy.prod(threads) > self.dev.max_threads
    if block_is_too_large and verbose:
        print("skipping config", config_label, "reason: too many threads per block")
    if block_is_too_large:
        return None

    # let the kernel source produce the kernel string and extra files, if any
    prepared = kernel_source.prepare_list_of_files(
        kernel_options.kernel_name,
        params,
        grid,
        threads,
        kernel_options.block_size_names,
    )
    name, kernel_string, temp_files = prepared

    # bundle everything known about this instance
    return KernelInstance(
        name,
        kernel_source,
        kernel_string,
        temp_files,
        threads,
        grid,
        params,
        kernel_options.arguments,
    )
def create_kernel_instance(self, kernel_options, params, verbose):
    """Create a kernel instance from kernel options and tunable parameters.

    Thread block and grid dimensions are derived from the problem size, the
    grid divisors and ``params``; the kernel source(s) in
    ``kernel_options.kernel_string`` are then handed to
    ``util.prepare_list_of_files``.

    Note that ``kernel_options.block_size_names`` is overwritten with
    ``util.default_block_size_names`` when it is empty.

    :param kernel_options: options object holding the kernel string(s),
        problem size, grid divisors, block size names, kernel name and
        arguments
    :param params: the tunable parameters of this configuration
    :param verbose: when truthy, print the reason a configuration is skipped
    :returns: a KernelInstance, or None when the thread block would hold more
        threads than ``self.dev.max_threads``
    """
    instance_string = util.get_instance_string(params)
    grid_div = (
        kernel_options.grid_div_x,
        kernel_options.grid_div_y,
        kernel_options.grid_div_z,
    )

    # insert default block_size_names if needed
    if not kernel_options.block_size_names:
        kernel_options.block_size_names = util.default_block_size_names

    # setup thread block and grid dimensions
    threads, grid = util.setup_block_and_grid(
        kernel_options.problem_size,
        grid_div,
        params,
        kernel_options.block_size_names,
    )
    if numpy.prod(threads) > self.dev.max_threads:
        if verbose:
            print("skipping config", instance_string, "reason: too many threads per block")
        return None

    # a single kernel source is wrapped so the helper always receives a list
    kernel_source = kernel_options.kernel_string
    if not isinstance(kernel_source, list):
        kernel_source = [kernel_source]

    # obtain the kernel_string and prepare additional files, if any
    name, kernel_string, temp_files = util.prepare_list_of_files(
        kernel_options.kernel_name,
        kernel_source,
        params,
        grid,
        threads,
        kernel_options.block_size_names,
    )

    # collect everything we know about this instance and return it
    return KernelInstance(
        name,
        kernel_string,
        temp_files,
        threads,
        grid,
        params,
        kernel_options.arguments,
    )
def _select_best_common_config(results, objective, objective_higher_is_better):
    """Return the most common config among results from different problem sizes.

    Configurations are identified by ``util.get_instance_string`` of their
    tunable parameters. Among the configurations that occur most often, the
    one with the best summed ``objective`` value is selected; on a tie the
    configuration that appeared first in ``results`` wins.

    :param results: iterable of dicts, each with a ``"tunable_parameters"``
        entry and an entry named by ``objective``
    :param objective: key of the value to sum per configuration
    :param objective_higher_is_better: pick the largest summed objective when
        truthy, the smallest otherwise
    :returns: the tunable parameters of the selected configuration
    :raises ValueError: when ``results`` contains no configurations
    """
    occurrences = {}
    total_performance = {}
    params_by_config = {}

    # for each configuration in the list
    for config in results:
        params = config["tunable_parameters"]
        config_str = util.get_instance_string(params)
        # count occurrences
        occurrences[config_str] = occurrences.get(config_str, 0) + 1
        # add to performance
        total_performance[config_str] = total_performance.get(config_str, 0) + config[objective]
        # store mapping from config_str to the parameters
        params_by_config[config_str] = params

    # checked after the loop so that one-shot iterables are handled as well
    if not occurrences:
        raise ValueError("results is empty, cannot select a best common config")

    # keep only the configurations that occur most often; iterating the dict
    # preserves first-seen order, which determines tie-breaking below
    top_freq = max(occurrences.values())
    candidates = [key for key, count in occurrences.items() if count == top_freq]

    # pick the candidate with the best summed objective
    select = max if objective_higher_is_better else min
    best_config_str = select(candidates, key=total_performance.__getitem__)

    # lookup the tunable parameters of this configuration and return them
    return params_by_config[best_config_str]
def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, tuning_options):
    """Compile and benchmark a kernel instance based on kernel strings and parameters.

    :param kernel_source: forwarded to ``create_kernel_instance``
    :param gpu_args: forwarded to ``check_kernel_output`` and ``benchmark``
    :param params: the tunable parameters of this configuration
    :param kernel_options: options object; its ``smem_args``, ``cmem_args``
        and ``texmem_args`` are copied to the device when not None
    :param tuning_options: options object providing ``verbose``, ``answer``,
        ``atol`` and ``verify``
    :returns: the value returned by ``self.benchmark``, or None when no
        instance could be created or ``compile_kernel`` returned None
    :raises Exception: any exception raised while compiling, verifying or
        benchmarking is re-raised after the source file names are printed
    """
    instance_string = util.get_instance_string(params)
    logging.debug('compile_and_benchmark %s', instance_string)
    # NOTE(review): the units of ru_maxrss are platform dependent; dividing by
    # 1024 yields MB only where the value is reported in kilobytes - confirm
    mem_usage = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)
    logging.debug('Memory usage : %2.2f MB', mem_usage)

    verbose = tuning_options.verbose

    instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose)
    if instance is None:
        return None

    result = None
    try:
        # compile the kernel
        func = self.compile_kernel(instance, verbose)

        # a None func means there is nothing to run; fall through to the
        # cleanup below instead of returning here and leaking temp files
        if func is not None:
            # add shared memory arguments to compiled module
            if kernel_options.smem_args is not None:
                self.dev.copy_shared_memory_args(
                    util.get_smem_args(kernel_options.smem_args, params))

            # add constant memory arguments to compiled module
            if kernel_options.cmem_args is not None:
                self.dev.copy_constant_memory_args(kernel_options.cmem_args)

            # add texture memory arguments to compiled module
            if kernel_options.texmem_args is not None:
                self.dev.copy_texture_memory_args(kernel_options.texmem_args)

            # test kernel for correctness
            if tuning_options.answer is not None or tuning_options.verify is not None:
                self.check_kernel_output(
                    func, gpu_args, instance, tuning_options.answer,
                    tuning_options.atol, tuning_options.verify, verbose)

            # benchmark
            result = self.benchmark(func, gpu_args, instance, verbose)

    except Exception:
        # keep the source files around so the failure can be inspected
        temp_filenames = instance.prepare_temp_files_for_error_msg()
        print("Error while compiling or benchmarking, see source files: "
              + " ".join(temp_filenames))
        # bare raise re-raises the active exception unchanged
        raise

    # clean up any temporary files, if no error occurred
    instance.delete_temp_files()

    return result
def compile_and_benchmark(self, gpu_args, params, kernel_options, tuning_options):
    """Compile and benchmark a kernel instance based on kernel strings and parameters.

    :param gpu_args: forwarded to ``check_kernel_correctness`` and
        ``benchmark``
    :param params: the tunable parameters of this configuration
    :param kernel_options: options object; its ``cmem_args`` are copied to
        the device when not None
    :param tuning_options: options object providing ``verbose``, ``answer``,
        ``atol`` and ``verify``
    :returns: the value returned by ``self.benchmark``, or None when no
        instance could be created or ``compile_kernel`` returned None
    :raises Exception: any exception raised while compiling, verifying or
        benchmarking is re-raised after the kernel string is dumped to a
        temporary file and the source file names are printed
    """
    instance_string = util.get_instance_string(params)
    logging.debug('compile_and_benchmark %s', instance_string)
    # NOTE(review): the units of ru_maxrss are platform dependent; dividing by
    # 1024 yields MB only where the value is reported in kilobytes - confirm
    mem_usage = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)
    logging.debug('Memory usage : %2.2f MB', mem_usage)

    verbose = tuning_options.verbose

    instance = self.create_kernel_instance(kernel_options, params, verbose)
    if instance is None:
        return None

    # named 'elapsed' rather than 'time' to avoid shadowing the stdlib module
    elapsed = None
    try:
        # compile the kernel
        func = self.compile_kernel(instance, verbose)

        # a None func means there is nothing to run; fall through to the
        # cleanup below instead of returning here and leaking temp files
        if func is not None:
            # add constant memory arguments to compiled module
            if kernel_options.cmem_args is not None:
                self.dev.copy_constant_memory_args(kernel_options.cmem_args)

            # test kernel for correctness
            if tuning_options.answer is not None:
                self.check_kernel_correctness(
                    func, gpu_args, instance, tuning_options.answer,
                    tuning_options.atol, tuning_options.verify, verbose)

            # benchmark
            elapsed = self.benchmark(func, gpu_args, instance, verbose)

    except Exception:
        # dump kernel_string to temp file
        temp_filename = util.get_temp_filename(suffix=".c")
        util.write_file(temp_filename, instance.kernel_string)
        # join all names with spaces so the dump file is not glued to the
        # first of the additional temp files
        source_files = [temp_filename] + list(instance.temp_files.values())
        print("Error while compiling or benchmarking, see source files: "
              + " ".join(source_files))
        # bare raise re-raises the active exception unchanged
        raise

    # clean up any temporary files, if no error occurred
    for temp_file in instance.temp_files.values():
        util.delete_temp_file(temp_file)

    return elapsed
def compile_and_benchmark(self, gpu_args, params, kernel_options, tuning_options):
    """Compile and benchmark a kernel instance based on kernel strings and parameters.

    :param gpu_args: forwarded to ``check_kernel_output`` and ``benchmark``
    :param params: the tunable parameters of this configuration
    :param kernel_options: options object; its ``cmem_args`` and
        ``texmem_args`` are copied to the device when not None
    :param tuning_options: options object providing ``verbose``, ``answer``,
        ``atol``, ``verify`` and ``times``
    :returns: the value returned by ``self.benchmark``, or None when no
        instance could be created or ``compile_kernel`` returned None
    :raises Exception: any exception raised while compiling, verifying or
        benchmarking is re-raised after the kernel string is dumped to a
        temporary file and the source file names are printed
    """
    instance_string = util.get_instance_string(params)
    logging.debug('compile_and_benchmark %s', instance_string)
    # NOTE(review): the units of ru_maxrss are platform dependent; dividing by
    # 1024 yields MB only where the value is reported in kilobytes - confirm
    mem_usage = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)
    logging.debug('Memory usage : %2.2f MB', mem_usage)

    verbose = tuning_options.verbose

    instance = self.create_kernel_instance(kernel_options, params, verbose)
    if instance is None:
        return None

    # named 'elapsed' rather than 'time' to avoid shadowing the stdlib module
    elapsed = None
    try:
        # compile the kernel
        func = self.compile_kernel(instance, verbose)

        # a None func means there is nothing to run; fall through to the
        # cleanup below instead of returning here and leaking temp files
        if func is not None:
            # add constant memory arguments to compiled module
            if kernel_options.cmem_args is not None:
                self.dev.copy_constant_memory_args(kernel_options.cmem_args)

            # add texture memory arguments to compiled module
            if kernel_options.texmem_args is not None:
                self.dev.copy_texture_memory_args(kernel_options.texmem_args)

            # test kernel for correctness
            if tuning_options.answer is not None:
                self.check_kernel_output(
                    func, gpu_args, instance, tuning_options.answer,
                    tuning_options.atol, tuning_options.verify, verbose)

            # benchmark
            elapsed = self.benchmark(func, gpu_args, instance, tuning_options.times, verbose)

    except Exception:
        # dump kernel_string to temp file
        temp_filename = util.get_temp_filename(suffix=".c")
        util.write_file(temp_filename, instance.kernel_string)
        # join all names with spaces so the dump file is not glued to the
        # first of the additional temp files
        source_files = [temp_filename] + list(instance.temp_files.values())
        print("Error while compiling or benchmarking, see source files: "
              + " ".join(source_files))
        # bare raise re-raises the active exception unchanged
        raise

    # clean up any temporary files, if no error occurred
    for temp_file in instance.temp_files.values():
        util.delete_temp_file(temp_file)

    return elapsed
def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, tuning_options):
    """Log the requested configuration and raise the stored device access error.

    This variant never compiles or benchmarks anything: after writing a debug
    log entry for ``params`` it always raises ``self.device_access_error``.
    The remaining arguments are accepted but not used.
    """
    logging.debug('compile_and_benchmark ' + get_instance_string(params))
    raise self.device_access_error