help= "Path to model trained for algorithm 'medium'. If not given, ignore this algorithm.", ) parser.add_argument( "--largeDB1", default=None, help= "Path to model trained for algorithm 'largeDB1'. If not given, ignore this algorithm.", ) parser.add_argument( "--largeDB2", default=None, help= "Path to model trained for algorithm 'largeDB2'. If not given, ignore this algorithm.", ) parser.add_argument( "-c", "--chunk_size", type=int, default=2000, help= "Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number", ) args = parser.parse_args() paths_to_models = dict() for algo in kernel_algorithm.keys(): paths_to_models[algo] = args.__dict__[algo] main(args.params, args.njobs, args.baseline, paths_to_models, args.chunk_size)
def get_optimal_kernels(
    mnks_to_predict,
    njobs,
    chunk_size,
    paths_to_models,
    gpu_properties,
    autotuning_properties,
    top_k,
):
    """Predict optimal kernel parameters for each (m, n, k) triplet.

    For every algorithm with a trained model available, evaluate the model on
    all requested (m, n, k) triplets (in checkpointed chunks), then keep, per
    triplet, the candidate kernel with the highest predicted performance.

    :param mnks_to_predict: iterable of (m, n, k) tuples to find kernels for
    :param njobs: number of joblib jobs (1 = run serially, without joblib)
    :param chunk_size: number of (mnk, algorithm) tasks per checkpointed chunk
    :param paths_to_models: dict: algorithm name -> path to trained model, or
                            None to skip that algorithm
    :param gpu_properties: passed through to find_optimal_kernel
    :param autotuning_properties: passed through to find_optimal_kernel
    :param top_k: number of best candidates to keep per (m, n, k)
    :return: dict mapping (m, n, k) -> best Kernel object found
    """
    # optimal_kernels_list is a list of dictionaries
    # - keys: (m, n, k),
    # - values: Kernel object describing best parameters
    # - number of elements in each dictionary = top_k
    # each element of the list corresponds to the search of optimal kernels
    # for a given mnk and a given algorithm
    print("Getting optimal kernels")

    # ===============================================================================
    # Load predictive trees and feature list
    tree = dict()
    kernel_to_investigate = dict()
    for algo in kernel_algorithm.keys():
        path_to_model = paths_to_models[algo]
        if path_to_model is not None:
            print(
                "Algorithm: {:<8}, loading model from: {}".format(algo, path_to_model)
            )
            tree[algo] = dict()
            tree[algo]["file"] = path_to_model
            features, tree[algo]["tree"] = safe_pickle_load(tree[algo]["file"])
            # NOTE(review): features appears to be a numpy array (has .tolist()) — confirm
            tree[algo]["features"] = features.tolist()
            kernel_to_investigate[algo] = kernel_algorithm[algo]
        else:
            print("Algorithm: {:<8}, no model found.".format(algo))

    if len(kernel_to_investigate) == 0:
        # BUGFIX: the original message was cut off after "using "
        print(
            "No model found. Specify paths to predictive models using the "
            "per-algorithm command-line options."
        )
        sys.exit(1)

    # ===============================================================================
    # Get mnks_by_algo to compute:
    mnks_by_algo = list(product(mnks_to_predict, kernel_to_investigate.keys()))
    num_mnks_by_algo = len(mnks_by_algo)
    optimal_kernels_list = list()
    ckpt_folder_name = "predict_genpars_ckpt"

    if not os.path.exists(ckpt_folder_name):
        os.mkdir(ckpt_folder_name)
    print("Caching intermediate results to:", ckpt_folder_name)

    # BUGFIX: the loop previously ran range(0, num_mnks_by_algo + 1, chunk_size)
    # with end = min(..., num_mnks_by_algo + 1), which produced a trailing empty
    # chunk (and empty checkpoint file) whenever the task count was a multiple
    # of chunk_size.
    for chunk_start in range(0, num_mnks_by_algo, chunk_size):

        # Chunk up tasks
        start_chunk = chunk_start
        end_chunk = int(min(start_chunk + chunk_size, num_mnks_by_algo))
        print("Completed {:,} tasks out of {:,}".format(chunk_start, num_mnks_by_algo))

        # Create checkpoint file or load checkpointed data from it
        checkpoint_file_name = os.path.join(
            ckpt_folder_name, "chunk_{}-{}.json".format(start_chunk, end_chunk)
        )
        if os.path.exists(checkpoint_file_name):
            # Restore Kernel objects from the JSON-serialized checkpoint
            with open(checkpoint_file_name, "r") as f:
                optimal_kernels_list__ = json.load(f)
            optimal_kernels_list_ = list()
            # NOTE: loop variable renamed from "i" — it used to shadow the
            # outer chunk index
            for idx, optker in enumerate(optimal_kernels_list__):
                optimal_kernels_list_.append({})
                for k, v in optker.items():
                    algo = v.pop("algorithm")
                    optimal_kernels_list_[idx][to_tuple(k)] = kernel_algorithm[algo](
                        **v
                    )
            print("Read chunk {}-{}\n".format(start_chunk, end_chunk))

        else:
            if njobs == 1:
                # Ignore joblib and run serially:
                # BUGFIX: this branch previously iterated over ALL of
                # mnks_by_algo (not just this chunk) and re-assigned
                # optimal_kernels_list_ on every iteration, so only the very
                # last task's result survived. Now it processes exactly this
                # chunk and accumulates the results, mirroring the joblib
                # branch below.
                optimal_kernels_list_ = list()
                for mnk, algo in mnks_by_algo[start_chunk:end_chunk]:
                    gc.collect()
                    print("Find optimal kernels for mnk=", mnk, ", algo=", algo)
                    optimal_kernels_list_.append(
                        find_optimal_kernel(
                            mnk,
                            algo,
                            tree[algo]["tree"],
                            tree[algo]["features"],
                            gpu_properties,
                            autotuning_properties,
                        )
                    )
            else:
                # Run prediction tasks in parallel with joblib
                optimal_kernels_list_ = Parallel(n_jobs=njobs, verbose=2)(
                    delayed(find_optimal_kernel, check_pickle=True)(
                        mnk,
                        algo,
                        tree[algo]["tree"],
                        tree[algo]["features"],
                        gpu_properties,
                        autotuning_properties,
                    )
                    for mnk, algo in mnks_by_algo[start_chunk:end_chunk]
                )

            optimal_kernels_list_ = remove_empty_entries(optimal_kernels_list_)

            # Serialize this chunk's results so a rerun can skip recomputation
            with open(checkpoint_file_name, "w") as f:
                optimal_kernels_list__ = list()
                for idx, optker in enumerate(optimal_kernels_list_):
                    optimal_kernels_list__.append({})
                    for k, v in optker.items():
                        optimal_kernels_list__[idx][to_string(k)] = v.as_dict
                json.dump(optimal_kernels_list__, f)

        optimal_kernels_list += optimal_kernels_list_

    print("Finished gathering candidates for optimal parameter space")

    # Group optimal kernel candidates by (m,n,k) in a dictionary
    optimal_kernels_mnk_algo = dict()
    for optimal_kernel_mnk in optimal_kernels_list:
        for mnk, kernels_mnk in optimal_kernel_mnk.items():
            m, n, k = mnk
            if (m, n, k) in optimal_kernels_mnk_algo.keys():
                optimal_kernels_mnk_algo[(m, n, k)].append(kernels_mnk)
            else:
                optimal_kernels_mnk_algo[(m, n, k)] = [kernels_mnk]

    # Find optimal kernel per mnk among the different algorithm possibilities
    optimal_kernels = dict()
    for mnk, candidate_kernels in optimal_kernels_mnk_algo.items():
        m, n, k = mnk
        optimal_kernel_mnk = sorted(
            candidate_kernels, key=lambda x: x.perf, reverse=True
        )[:top_k]
        optimal_kernels[(m, n, k)] = optimal_kernel_mnk[0]

    return optimal_kernels
def gen_benchmark(outdir, gpu_properties, autotuning_properties, compiler, m, n, k):
    """Generate benchmark source files for all promising kernels of one (m, n, k).

    Collects the parameter sets of every kernel algorithm compatible with the
    given size, then writes, into ``outdir``:
    - one or more "part" source files containing the kernel launcher code, and
    - one "main" source file per executable that declares the launchers and
      drives libsmm_acc_benchmark over them.

    :param outdir: output directory for the generated source files
    :param gpu_properties: passed through to promising_parameters
    :param autotuning_properties: passed through to promising_parameters
    :param compiler: "nvcc" (CUDA) or anything else, treated as hipcc (HIP)
    :param m, n, k: block-multiplication dimensions to benchmark
    """
    includes = []
    launcher_codes = []
    launchers = []
    kernel_descr = []
    indent = " "
    file_extension = get_file_extension_from_compiler(compiler)

    # Get the kernel algorithms compatible with the given size:
    compatible_kernels = [
        kernel_algorithm[kernclass]
        for kernclass in kernel_algorithm.keys()
        if compatible_mnk(kernclass, m, n, k)
    ]

    # Get the parameter sets to measure for this (m,n,k)
    for kernclass in compatible_kernels:
        params = kernclass.promising_parameters(
            m, n, k, gpu_properties, autotuning_properties
        )
        # BUGFIX: was "if params == 0", which is never true when
        # promising_parameters returns a (possibly empty) list; "not params"
        # skips both a literal 0 and an empty list.
        if not params:
            continue

        for p in params:
            kern = kernclass(**p, source="autotuning_candidate", perf=0)
            includes.append("../../kernels/" + kern.include)
            launcher_codes.append(kern.launcher_code(compiler))
            launchers.append("launch_" + kern.name)
            kernel_descr.append(kernclass.__name__ + format_params(p))

    print("Found %d parameter sets for %dx%dx%d" % (len(launchers), m, n, k))
    if len(launchers) == 0:
        return

    # Compose the "include" line of the benchmark code
    incl_output = '#include "../../kernels/smm_acc_common.h"\n'
    for i in set(includes):
        incl_output += '#include "%s"\n' % i
    incl_output += "\n\n"

    # Compose the benchmark code
    # The benchmark is broken down in
    # - n_exe_files executables
    # - each executable is made of n_obj_files object files
    # - each object file is made up of launchers_per_obj launchers
    # - each launcher launches 1 GPU kernel with a certain set of kernel parameters
    # the hipcc compiler is very slow -> make a larger number of smaller executables
    max_launchers_per_exe = 10000 if compiler == "nvcc" else 100
    launchers_per_obj = 100 if compiler == "nvcc" else 10
    n_exe_files = int(len(launcher_codes) / max_launchers_per_exe) + 1
    launchers_per_exe = int(len(launcher_codes) / n_exe_files) + 1

    # Compose source code for each executable file
    for i in range(n_exe_files):
        chunk_a = i * launchers_per_exe
        chunk_b = min((i + 1) * launchers_per_exe, len(launcher_codes))
        n_obj_files = math.ceil((chunk_b - chunk_a) / launchers_per_obj)

        # Compose source code for each object file
        for j in range(n_obj_files):
            a = chunk_a + j * launchers_per_obj
            b = min(chunk_a + (j + 1) * launchers_per_obj, chunk_b)
            output = incl_output
            output += "\n\n".join(launcher_codes[a:b])
            fn = outdir + "/tune_%dx%dx%d_exe%d_part%d%s" % (
                m,
                n,
                k,
                i,
                j,
                file_extension,
            )
            writefile(fn, output)

        # Compose source code for "main" of executable file:
        # forward-declare every launcher of this executable ...
        output = '#include "../../libsmm_acc_benchmark.h"\n\n'
        for j in range(chunk_b - chunk_a):
            output += (
                "int " + launchers[chunk_a + j] + "(int *param_stack, int stack_size, "
            )
            if compiler == "nvcc":
                output += "cudaStream_t stream, "
            else:
                output += "hipStream_t stream, "
            output += (
                "int m_max, int n_max, int k_max,"
                + " double *a_data, double *b_data, double *c_data);\n"
            )

        # ... then emit a main() that registers them and runs the benchmark
        output += "\n"
        output += "int main(int argc, char** argv){\n"
        if compiler == "nvcc":
            output += (
                indent
                + "cudaError_t err = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);\n"
            )
            output += indent + "if(err != cudaSuccess) return(-1);\n"
        else:  # i.e. compiler = hipcc
            output += (
                indent
                + "hipError_t err = hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte);\n"
            )
            output += indent + "if(err != hipSuccess) return(-1);\n"
        output += indent + "libsmm_acc_benchmark_t* handle;\n"
        output += indent + "KernelLauncher launchers[%d];\n" % (chunk_b - chunk_a)
        output += indent + "char *kernel_descr[%d];\n" % (chunk_b - chunk_a)
        for j in range(chunk_b - chunk_a):
            output += indent + "launchers[%d] = %s;\n" % (j, launchers[chunk_a + j])
            output += indent + 'kernel_descr[%d] = (char *) "%s";\n' % (
                j,
                kernel_descr[chunk_a + j],
            )
        output += indent + "libsmm_acc_benchmark_init(&handle, tune, %d, %d, %d);\n" % (
            m,
            n,
            k,
        )
        output += (
            indent
            + "int result = libsmm_acc_benchmark(handle, %d, %d, %d, %d, launchers, kernel_descr);\n"
            % (m, n, k, chunk_b - chunk_a)
        )
        output += indent + "libsmm_acc_benchmark_finalize(handle);\n"
        output += indent + "return result;"
        output += "}\n"
        fn = outdir + "/tune_%dx%dx%d_exe%d_main%s" % (m, n, k, i, file_extension)
        writefile(fn, output)