def merge_data_files(tunedir):
    """
    Merge the partial raw training-data CSV files of every algorithm.

    For each algorithm in ``kernel_algorithm``, all files matching
    ``<tunedir>/tune_*/raw_training_data_*_<algorithm>.csv`` are concatenated
    into ``<tunedir>/raw_training_data_<algorithm>.csv``. The header line is
    written once (taken from the first file merged) and skipped in all other
    files. An algorithm whose merged file already exists is skipped, as is an
    algorithm for which no partial file is found.

    The merged file is first written under a temporary name and only renamed
    to its final name once the merge succeeded, so that an interrupted or
    failed merge is never mistaken for a complete one on the next run.

    :param tunedir: directory containing the ``tune_*`` folders; the merged
                    files are written directly into it
    :raises AssertionError: if the header line of a partial file differs from
                            the header line of the first file merged
    """
    for algorithm in kernel_algorithm.keys():
        training_data_file = os.path.join(
            tunedir, "raw_training_data_{algorithm}.csv".format(algorithm=algorithm)
        )

        if os.path.exists(training_data_file):
            print("\nFound {}, skipping ... ".format(training_data_file))
            continue

        print("\nMerging partial CSV files into {} ... ".format(training_data_file))
        filenames_pattern = os.path.join(
            tunedir,
            "tune_*/raw_training_data_*_{algorithm}.csv".format(algorithm=algorithm),
        )
        print("Merging all files with pattern:", filenames_pattern)

        # glob returns its matches in arbitrary order: sort them so that the
        # merged file is identical from one run to the next
        filenames = sorted(glob.glob(filenames_pattern))
        if not filenames:
            print("Found no files matching this pattern, skipping ...")
            continue

        print("Found {} files matching this pattern".format(len(filenames)))
        fn_1 = filenames[0]
        other_filenames = filenames[1:]

        # Write to a temporary file, renamed only after a successful merge
        partial_output_file = training_data_file + ".part"
        try:
            with open(partial_output_file, "w") as out:
                # Write the first file, including its header
                with open(fn_1) as f:
                    header_line_ref = next(f)  # read header line
                    out.write(header_line_ref)  # write header line
                    out.writelines(f)  # stream the rest of the file

                # Write the rest of the files, skipping the header line each time
                for i, fn in enumerate(other_filenames):
                    print(
                        "writing from {} ({}/{})".format(
                            fn, i + 1, len(other_filenames)
                        )
                    )
                    with open(fn) as f:
                        header_line = next(f)  # skip header line
                        # Raised explicitly (rather than with an `assert`
                        # statement) so the check also runs under `python -O`
                        if header_line != header_line_ref:
                            raise AssertionError(
                                'Cannot merge file "'
                                + fn
                                + '", because its header line:\n'
                                + header_line
                                + 'is different from the header line of file "'
                                + fn_1
                                + '":\n'
                                + header_line_ref
                            )
                        out.writelines(f)
        except BaseException:
            # Do not leave a half-written file behind; the error is re-raised
            if os.path.exists(partial_output_file):
                os.remove(partial_output_file)
            raise

        os.replace(partial_output_file, training_data_file)
        print("Wrote to {}".format(training_data_file))
def main(data_path, algorithms_to_prep, arch, n_jobs, chunk_size, skip_derived_data):
    """
    Write the performance records and, unless skipped, the derived data.

    :param data_path: passed as first argument to every writer function
    :param algorithms_to_prep: algorithms to process
    :param arch: forwarded to the per-algorithm writers
    :param n_jobs: forwarded to the per-algorithm writers
    :param chunk_size: forwarded to the per-algorithm writers
    :param skip_derived_data: if true, stop after the performance records
    """
    # Baseline and maximum performance records, one algorithm at a time
    for algo in algorithms_to_prep:
        write_baseline_and_max_records_per_algorithm(
            data_path, algo, arch, n_jobs, chunk_size
        )

    # The records spanning all algorithms are only written when the requested
    # algorithms are exactly the ones known to `kernel_algorithm`
    all_algorithms_requested = set(algorithms_to_prep) == set(kernel_algorithm.keys())
    if all_algorithms_requested:
        write_baseline_record(data_path, algorithms_to_prep)
        write_max_by_algo_record(data_path, algorithms_to_prep)
        write_max_record(data_path, algorithms_to_prep)

    if skip_derived_data:
        return

    # Derived data, written to Parquet right after it is computed
    for algo in algorithms_to_prep:
        write_derived_data(data_path, algo, arch, n_jobs, chunk_size)
        write_to_parquet(data_path, algo)
def main(data_path, algorithms_to_prep, arch, n_jobs, chunk_size, skip_derived_data):
    """
    Entry point of this step of the workflow for predictive modelling of
    optimal libcusmm parameters (see predict.md for more details).

    Writes the baseline and maximum performance records of each algorithm in
    `algorithms_to_prep`, then the records spanning all algorithms if every
    algorithm of `kernel_algorithm` was requested, and finally the derived
    data of each algorithm unless `skip_derived_data` is true.
    """
    # --- Baseline and maximum performance records ---------------------------
    for current_algorithm in algorithms_to_prep:
        write_baseline_and_max_records_per_algorithm(
            data_path, current_algorithm, arch, n_jobs, chunk_size
        )

    known_algorithms = set(kernel_algorithm.keys())
    if set(algorithms_to_prep) == known_algorithms:
        # Only meaningful when no algorithm has been left out
        write_baseline_record(data_path, algorithms_to_prep)
        write_max_by_algo_record(data_path, algorithms_to_prep)
        write_max_record(data_path, algorithms_to_prep)

    # --- Derived data -------------------------------------------------------
    if skip_derived_data:
        return
    for current_algorithm in algorithms_to_prep:
        write_derived_data(data_path, current_algorithm, arch, n_jobs, chunk_size)
        write_to_parquet(data_path, current_algorithm)
"Number of parallel jobs that Joblib will launch. If you run into out-of-memory errors, reduce this.", ) parser.add_argument( "-c", "--chunk_size", type=int, default=20000, help= "Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number", ) parser.add_argument( "-s", "--skip_derived_data", type=bool, default=False, help= "Skip the computation of derived data. Set to true if computing baseline & max records for each algoseparately", ) args = parser.parse_args() algorithms_to_prep = (kernel_algorithm.keys() if args.algorithm == "" else [args.algorithm]) main( args.folder, algorithms_to_prep, args.arch, args.njobs, args.chunk_size, args.skip_derived_data, )
def get_optimal_kernels(
    mnks_to_predict,
    njobs,
    chunk_size,
    paths_to_models,
    gpu_properties,
    autotuning_properties,
    top_k,
):
    """
    Search, for every (m, n, k), the best kernel among all algorithms that
    have a predictive model.

    :param mnks_to_predict: iterable of (m, n, k) to find a kernel for
    :param njobs: 1 to run serially, anything else is handed to joblib
    :param chunk_size: number of (mnk, algorithm) tasks dispatched per
                       joblib call
    :param paths_to_models: per-algorithm path to a pickled
                            (features, tree) pair, or None to ignore the
                            algorithm
    :param gpu_properties: forwarded to `find_optimal_kernel`
    :param autotuning_properties: forwarded to `find_optimal_kernel`
    :param top_k: size of the best-candidates slice from which the first
                  (best) candidate is taken
    :return: dictionary mapping (m, n, k) to the candidate with the highest
             `perf` found for it

    Exits the program with status 1 if no algorithm has a model.
    """
    print("Getting optimal kernels")

    # ===============================================================================
    # Load the predictive tree and the feature list of each algorithm with a model
    tree = {}
    kernel_to_investigate = {}
    for algo in kernel_algorithm.keys():
        path_to_model = paths_to_models[algo]
        if path_to_model is None:
            print("Algorithm: {:<8}, no model found.".format(algo))
            continue

        print("Algorithm: {:<8}, loading model from: {}".format(algo, path_to_model))
        features, predictive_tree = safe_pickle_load(path_to_model)
        tree[algo] = {
            "file": path_to_model,
            "tree": predictive_tree,
            "features": features.tolist(),
        }
        kernel_to_investigate[algo] = kernel_algorithm[algo]

    if not kernel_to_investigate:
        print("No model found. Specify path to predictive models using ")
        sys.exit(1)

    # ===============================================================================
    # One search task per (mnk, algorithm) pair. Each task result is expected to
    # be a dictionary keyed by (m, n, k), as it is consumed through `.items()`
    # further down.
    optimal_kernels_list = []
    mnk_by_algo = list(product(mnks_to_predict, kernel_to_investigate.keys()))
    num_mnks_by_algo = len(mnk_by_algo)

    if njobs == 1:
        # Ignore joblib and run serially
        for mnk, algo in mnk_by_algo:
            gc.collect()
            print("Find optimal kernels for mnk=", mnk, ", algo=", algo)
            model = tree[algo]
            candidates = find_optimal_kernel(
                mnk,
                algo,
                model["tree"],
                model["features"],
                gpu_properties,
                autotuning_properties,
            )
            optimal_kernels_list.append(candidates)
    else:
        # Dispatch the tasks to joblib chunk by chunk
        for start_chunk in range(0, num_mnks_by_algo + 1, chunk_size):
            end_chunk = int(min(start_chunk + chunk_size, num_mnks_by_algo + 1))
            print(
                "Completed {:,} tasks out of {:,}".format(start_chunk, num_mnks_by_algo)
            )

            chunk_tasks = (
                delayed(find_optimal_kernel, check_pickle=True)(
                    mnk,
                    algo,
                    tree[algo]["tree"],
                    tree[algo]["features"],
                    gpu_properties,
                    autotuning_properties,
                )
                for mnk, algo in mnk_by_algo[start_chunk:end_chunk]
            )
            optimal_kernels_list.extend(Parallel(n_jobs=njobs, verbose=2)(chunk_tasks))

    print("Finished gathering candidates for optimal parameter space")

    # Gather the candidates of all algorithms under their (m, n, k)
    optimal_kernels_mnk_algo = {}
    for task_result in optimal_kernels_list:
        for mnk, kernels_mnk in task_result.items():
            m, n, k = mnk
            optimal_kernels_mnk_algo.setdefault((m, n, k), []).append(kernels_mnk)

    # Keep, for each (m, n, k), the candidate with the highest performance
    optimal_kernels = {}
    for mnk, candidate_kernels in optimal_kernels_mnk_algo.items():
        m, n, k = mnk
        ranked_candidates = sorted(
            candidate_kernels, key=lambda x: x.perf, reverse=True
        )
        optimal_kernels[(m, n, k)] = ranked_candidates[:top_k][0]

    return optimal_kernels
help= "Path to model trained for algorithm 'medium'. If not given, ignore this algorithm.", ) parser.add_argument( "--largeDB1", default=None, help= "Path to model trained for algorithm 'largeDB1'. If not given, ignore this algorithm.", ) parser.add_argument( "--largeDB2", default=None, help= "Path to model trained for algorithm 'largeDB2'. If not given, ignore this algorithm.", ) parser.add_argument( "-c", "--chunk_size", type=int, default=20000, help= "Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number", ) args = parser.parse_args() paths_to_models = dict() for algo in kernel_algorithm.keys(): paths_to_models[algo] = args.__dict__[algo] main(args.params, args.njobs, args.baseline, paths_to_models, args.chunk_size)
type=int, help="Number of parallel jobs that Joblib will launch. If you run into out-of-memory errors, reduce this.", ) parser.add_argument( "-c", "--chunk_size", type=int, default=20000, help="Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number", ) parser.add_argument( "-s", "--skip_derived_data", type=bool, default=False, help="Skip the computation of derived data. Set to true if computing baseline & max records for each algoseparately", ) args = parser.parse_args() algorithms_to_prep = ( kernel_algorithm.keys() if args.algorithm == "" else [args.algorithm] ) main( args.folder, algorithms_to_prep, args.arch, args.njobs, args.chunk_size, args.skip_derived_data, )
def print_merging_commands(kernel_folders, kernel_folder_pattern, tunedir):
    """
    Print commands to execute in order to merge CSV files

    For every algorithm of `kernel_algorithm` and for both kinds of training
    data (raw, and for predictive modelling), print the shell commands that
    append the partial CSV files found in the `tune_*` folders to a single
    training data file. If this file does not exist yet, a command creating
    it from the header line of an existing partial file is printed first; if
    no partial file exists, a notice is printed instead of the commands.

    :param kernel_folders: folders searched for an existing partial file
    :param kernel_folder_pattern: compiled regular expression whose first
                                  three groups are m, n and k; it must match
                                  every folder of `kernel_folders`
    :param tunedir: directory containing the `tune_*` folders
    """
    for algorithm in kernel_algorithm.keys():
        for data_type in ("raw_", ""):
            data_type_name = (
                "raw" if data_type == "raw_" else "for predictive modelling"
            )
            print(
                "\n$ # Merge instructions for algorithm",
                algorithm,
                "(",
                data_type_name,
                ")",
            )
            training_data_file = "{data_type}training_data_{algorithm}.csv".format(
                data_type=data_type, algorithm=algorithm
            )

            if os.path.exists(training_data_file):
                print(
                    "$ # Found {}, append new training data to this file:".format(
                        training_data_file
                    )
                )
            else:
                # Find an (m, n, k) for this algorithm to get its header line
                for kernel_folder in kernel_folders:
                    # Find (m, n, k)
                    match = kernel_folder_pattern.search(kernel_folder).groups()
                    m = int(match[0])
                    n = int(match[1])
                    k = int(match[2])

                    file_name = os.path.join(
                        kernel_folder,
                        "{data_type}training_data_{mnk}_{algorithm}.csv".format(
                            data_type=data_type,
                            mnk=to_string(m, n, k),
                            algorithm=algorithm,
                        ),
                    )
                    if os.path.exists(file_name):
                        print(
                            "$ head -1 {base_file} > {training_data_file}".format(
                                base_file=file_name,
                                training_data_file=training_data_file,
                            )
                        )
                        break
                else:
                    # No folder holds a partial file: nothing to merge
                    print(
                        "None: did not find any existing files for algorithm",
                        algorithm,
                        "and data",
                        data_type_name,
                    )
                    continue

            # Join the path components instead of concatenating them, so that
            # the pattern is also valid when tunedir has no trailing separator
            # (e.g. "." must give "./tune_*/...", not ".tune_*/...")
            partial_files_pattern = os.path.join(
                tunedir,
                "tune_*/{data_type}training_data_*_{algorithm}.csv".format(
                    data_type=data_type, algorithm=algorithm
                ),
            )
            print(
                "$ tail -n +2 -q {partial_files_pattern} >> {training_data_file}".format(
                    partial_files_pattern=partial_files_pattern,
                    training_data_file=training_data_file,
                )
            )
def gen_benchmark(outdir, gpu_properties, autotuning_properties, m, n, k):
    """
    Generate the source files of the benchmark executables for one (m, n, k).

    The parameter sets to measure are collected from every kernel algorithm
    compatible with (m, n, k). Their launchers are spread over one or more
    executables, and for each executable the following files are written:

    - `tune_<m>x<n>x<k>_exe<i>_part<j>.cu`: launcher codes, in groups of
      `launchers_per_obj`
    - `tune_<m>x<n>x<k>_exe<i>_main.cu`: `main` function registering the
      launchers of this executable and running the benchmark

    Nothing is written if no parameter set is found.

    :param outdir: directory the files are written to
    :param gpu_properties: forwarded to `promising_parameters`
    :param autotuning_properties: forwarded to `promising_parameters`
    :param m, n, k: matrix sizes the benchmark is generated for
    """
    includes = []
    launcher_codes = []
    launchers = []
    kernel_descr = []

    # Get the kernel algorithms compatible with the given size:
    compatible_kernels = [
        kernel_algorithm[kernclass]
        for kernclass in kernel_algorithm.keys()
        if compatible_mnk(kernclass, m, n, k)
    ]

    # Get the parameter sets to measure for this (m,n,k)
    for kernclass in compatible_kernels:
        params = kernclass.promising_parameters(
            m, n, k, gpu_properties, autotuning_properties
        )
        if params == 0:
            continue

        for p in params:
            kern = kernclass(**p, source="autotuning_candidate", perf=0)
            includes.append("../kernels/" + kern.include)
            launcher_codes.append(kern.launcher_code)
            launchers.append("launch_" + kern.name)
            kernel_descr.append(kernclass.__name__ + format_params(p))

    print("Found %d parameter sets for %dx%dx%d" % (len(launchers), m, n, k))
    if len(launchers) == 0:
        return

    # Compose the "include" line of the benchmark code.
    # The unique includes are sorted: the iteration order of a set of strings
    # may change from run to run, which would make the generated files differ
    # between otherwise identical runs.
    incl_lines = ['#include "../kernels/cusmm_common.h"\n']
    incl_lines.extend('#include "%s"\n' % incl for incl in sorted(set(includes)))
    incl_lines.append("\n\n")
    incl_output = "".join(incl_lines)

    # Compose the benchmark code
    max_launchers_per_exe = 10000
    launchers_per_obj = 100
    n_exe_files = len(launcher_codes) // max_launchers_per_exe + 1
    launchers_per_exe = len(launcher_codes) // n_exe_files + 1

    # Compose source code for each executable file
    for i in range(n_exe_files):
        chunk_a = i * launchers_per_exe
        chunk_b = min((i + 1) * launchers_per_exe, len(launcher_codes))
        n_launchers = chunk_b - chunk_a
        n_obj_files = n_launchers // launchers_per_obj + 1

        # Compose source code for each object file
        for j in range(n_obj_files):
            a = chunk_a + j * launchers_per_obj
            b = min(chunk_a + (j + 1) * launchers_per_obj, chunk_b)
            output = incl_output + "\n\n".join(launcher_codes[a:b])
            fn = outdir + "/tune_%dx%dx%d_exe%d_part%d.cu" % (m, n, k, i, j)
            writefile(fn, output)

        # Compose source code for "main" of executable file
        exe_launchers = launchers[chunk_a:chunk_b]
        exe_kernel_descr = kernel_descr[chunk_a:chunk_b]

        main_parts = ['#include "../libcusmm_benchmark.h"\n\n']
        # Forward-declare only the launchers this executable refers to
        for launcher in exe_launchers:
            main_parts.append(
                "int "
                + launcher
                + "(int *param_stack, int stack_size, cudaStream_t stream, int m_max, int n_max, int k_max,"
                + " double *a_data, double *b_data, double *c_data);\n"
            )
        main_parts.append("\n")
        main_parts.append("int main(int argc, char** argv){\n")
        main_parts.append("libcusmm_benchmark_t* handle;\n")
        main_parts.append("KernelLauncher launchers[%d];\n" % n_launchers)
        main_parts.append("char *kernel_descr[%d];\n" % n_launchers)
        for j, (launcher, descr) in enumerate(zip(exe_launchers, exe_kernel_descr)):
            main_parts.append("launchers[%d] = %s;\n" % (j, launcher))
            main_parts.append('kernel_descr[%d] = (char *) "%s";\n' % (j, descr))
        main_parts.append(
            "libcusmm_benchmark_init(&handle, tune, %d, %d, %d);\n" % (m, n, k)
        )
        main_parts.append(
            "int result = libcusmm_benchmark(handle, %d, %d, %d, %d, launchers, kernel_descr);\n"
            % (m, n, k, n_launchers)
        )
        main_parts.append("libcusmm_benchmark_finalize(handle);\n")
        main_parts.append("return result;")
        main_parts.append("}\n")

        fn = outdir + "/tune_%dx%dx%d_exe%d_main.cu" % (m, n, k, i)
        writefile(fn, "".join(main_parts))