Esempio n. 1
0
def merge_data_files(tunedir):
    """
    Merge the partial raw training data CSV files found below `tunedir`.

    For each algorithm in `kernel_algorithm`, all files matching
    `<tunedir>/tune_*/raw_training_data_*_<algorithm>.csv` are concatenated into
    `<tunedir>/raw_training_data_<algorithm>.csv`, keeping a single header line.
    An algorithm is skipped if its merged file already exists or if no partial
    file matches the pattern.

    :param tunedir: directory containing the `tune_*` folders; the merged files
                    are written directly into this directory
    :raises AssertionError: if the header line of a partial file differs from
                            the header line of the first merged file
    """
    for algorithm in kernel_algorithm.keys():

        training_data_file = os.path.join(
            tunedir, "raw_training_data_{algorithm}.csv".format(algorithm=algorithm)
        )

        if os.path.exists(training_data_file):
            print("\nFound {}, skipping ... ".format(training_data_file))
            continue

        print("\nMerging partial CSV files into {} ... ".format(training_data_file))

        filenames_pattern = os.path.join(
            tunedir,
            "tune_*/raw_training_data_*_{algorithm}.csv".format(algorithm=algorithm),
        )
        print("Merging all files with pattern:", filenames_pattern)
        # glob returns files in arbitrary order: sort them so that the merged
        # file is reproducible from one run to the next
        filenames = sorted(glob.glob(filenames_pattern))
        if not filenames:
            print("Found no files matching this pattern, skipping ...")
            continue

        print("Found {} files matching this pattern".format(len(filenames)))

        try:
            _concatenate_csv_files(filenames, training_data_file)
        except BaseException:
            # Do not leave a truncated merged file behind: the next run would
            # find it, assume the merge was already done and skip it.
            if os.path.exists(training_data_file):
                os.remove(training_data_file)
            raise

        print("Wrote to {}".format(training_data_file))


def _concatenate_csv_files(filenames, merged_file):
    """
    Concatenate the CSV files `filenames` into `merged_file`, writing the header once.

    :param filenames: non-empty list of paths to CSV files sharing the same header line
    :param merged_file: path of the file to write
    :raises AssertionError: if a file's header line differs from the first file's
    """
    fn_1 = filenames[0]
    remaining_filenames = filenames[1:]

    with open(merged_file, "w") as out:
        # Write the first file, including its header
        with open(fn_1) as f:
            header_line_ref = next(f)  # read header line
            out.write(header_line_ref)  # write header line
            out.write(f.read())  # write the rest of the file

        # Write the rest of the files, skipping the header line each time
        for i, fn in enumerate(remaining_filenames):
            print(
                "writing from {} ({}/{})".format(fn, i + 1, len(remaining_filenames))
            )
            with open(fn) as f:
                header_line = next(f)  # skip header line
                # Raised explicitly (rather than via `assert`) so that the check
                # is still performed when Python runs with -O
                if header_line != header_line_ref:
                    raise AssertionError(
                        'Cannot merge file "'
                        + fn
                        + '", because its header line:\n'
                        + header_line
                        + 'is different from the header line of file "'
                        + fn_1
                        + '":\n'
                        + header_line_ref
                    )
                out.write(f.read())
Esempio n. 2
0
def main(data_path, algorithms_to_prep, arch, n_jobs, chunk_size,
         skip_derived_data):
    """
    Write the performance records and, unless skipped, the derived data.

    :param data_path: passed through to every writer function
    :param algorithms_to_prep: iterable of algorithm names to process
    :param arch: passed through to the per-algorithm writers
    :param n_jobs: passed through to the per-algorithm writers
    :param chunk_size: passed through to the per-algorithm writers
    :param skip_derived_data: if true, stop after the performance records
    """
    # Baseline and maximum performance records, one algorithm at a time
    for algo in algorithms_to_prep:
        write_baseline_and_max_records_per_algorithm(
            data_path, algo, arch, n_jobs, chunk_size)

    # The combined records are only written when the requested algorithms
    # are exactly the ones known to `kernel_algorithm`
    covers_all_algorithms = (
        set(algorithms_to_prep) == set(kernel_algorithm.keys()))
    if covers_all_algorithms:
        write_baseline_record(data_path, algorithms_to_prep)
        write_max_by_algo_record(data_path, algorithms_to_prep)
        write_max_record(data_path, algorithms_to_prep)

    if skip_derived_data:
        return

    # Derived data and its parquet export, one algorithm at a time
    for algo in algorithms_to_prep:
        write_derived_data(data_path, algo, arch, n_jobs, chunk_size)
        write_to_parquet(data_path, algo)
Esempio n. 3
0
def main(data_path, algorithms_to_prep, arch, n_jobs, chunk_size,
         skip_derived_data):
    """
    Prepare the training data: write performance records and derived data.

    This script is part of the workflow for predictive modelling of optimal
    libcusmm parameters. For more details, see predict.md

    The per-algorithm baseline/maximum records are always written. The
    combined records are written only when `algorithms_to_prep` contains
    exactly the algorithms of `kernel_algorithm`. The derived data and the
    parquet files are written unless `skip_derived_data` is true.
    """
    # ---- Performance records, per algorithm ----
    for algo_name in algorithms_to_prep:
        write_baseline_and_max_records_per_algorithm(
            data_path, algo_name, arch, n_jobs, chunk_size)

    # ---- Performance records, across algorithms ----
    requested = set(algorithms_to_prep)
    known = set(kernel_algorithm.keys())
    if requested == known:
        write_baseline_record(data_path, algorithms_to_prep)
        write_max_by_algo_record(data_path, algorithms_to_prep)
        write_max_record(data_path, algorithms_to_prep)

    # ---- Derived data ----
    if skip_derived_data:
        return
    for algo_name in algorithms_to_prep:
        write_derived_data(data_path, algo_name, arch, n_jobs, chunk_size)
        write_to_parquet(data_path, algo_name)
Esempio n. 4
0
        "Number of parallel jobs that Joblib will launch. If you run into out-of-memory errors, reduce this.",
    )
    def _str_to_bool(value):
        """Convert a command-line string such as "true" or "0" to a bool."""
        normalized = value.strip().lower()
        if normalized in ("1", "true", "t", "yes", "y"):
            return True
        if normalized in ("", "0", "false", "f", "no", "n"):
            return False
        # argparse turns a ValueError raised by a `type` callable into a usage error
        raise ValueError("expected a boolean value, got {!r}".format(value))

    parser.add_argument(
        "-c",
        "--chunk_size",
        type=int,
        default=20000,
        help=
        "Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number",
    )
    # NOTE: `type=bool` must not be used here: bool() of any non-empty string,
    # including "False", is True, so the option could never be switched off
    # explicitly on the command line.
    parser.add_argument(
        "-s",
        "--skip_derived_data",
        type=_str_to_bool,
        default=False,
        help=
        "Skip the computation of derived data. Set to true if computing baseline & max records for each algo separately",
    )

    args = parser.parse_args()
    # An empty algorithm name means "process all known algorithms"
    algorithms_to_prep = (kernel_algorithm.keys()
                          if args.algorithm == "" else [args.algorithm])
    main(
        args.folder,
        algorithms_to_prep,
        args.arch,
        args.njobs,
        args.chunk_size,
        args.skip_derived_data,
    )
Esempio n. 5
0
def get_optimal_kernels(
    mnks_to_predict,
    njobs,
    chunk_size,
    paths_to_models,
    gpu_properties,
    autotuning_properties,
    top_k,
):
    """
    Find the best predicted kernel for each (m, n, k) across all algorithms with a model.

    :param mnks_to_predict: iterable of (m, n, k) triplets to find kernels for
    :param njobs: number of joblib jobs; with 1, the search runs serially without joblib
    :param chunk_size: number of (mnk, algorithm) tasks dispatched to joblib at a time
    :param paths_to_models: dictionary mapping each algorithm name of `kernel_algorithm`
                            to the path of its model, or to None to ignore this algorithm
    :param gpu_properties: passed through to `find_optimal_kernel`
    :param autotuning_properties: passed through to `find_optimal_kernel`
    :param top_k: number of best candidates retained per (m, n, k) before the single
                  best one is picked
    :return: dictionary with keys (m, n, k) and, as values, the candidate with the
             highest `perf` attribute among all algorithms

    Exits the program with status 1 if no model path is given for any algorithm.
    """
    # optimal_kernels_list is a list of dictionaries
    # - keys: (m, n, k),
    # - values: Kernel object describing best parameters
    # - number of elements in each dictionary = top_k
    # each element of the list corresponds to the search of optimal kernels for a given mnk and a given algorithm

    print("Getting optimal kernels")

    # ===============================================================================
    # Load predictive trees and feature list
    tree = dict()
    kernel_to_investigate = dict()
    for algo in kernel_algorithm.keys():
        path_to_model = paths_to_models[algo]
        if path_to_model is None:
            print("Algorithm: {:<8}, no model found.".format(algo))
            continue

        print("Algorithm: {:<8}, loading model from: {}".format(
            algo, path_to_model))
        features, model = safe_pickle_load(path_to_model)
        tree[algo] = {
            "file": path_to_model,
            "tree": model,
            "features": features.tolist(),
        }
        kernel_to_investigate[algo] = kernel_algorithm[algo]

    if len(kernel_to_investigate) == 0:
        print("No model found. Specify path to predictive models using ")
        sys.exit(1)

    # ===============================================================================
    optimal_kernels_list = list()
    mnk_by_algo = list(product(mnks_to_predict, kernel_to_investigate.keys()))
    num_mnks_by_algo = len(mnk_by_algo)
    if njobs == 1:

        # Ignore joblib and run serially:
        for mnk, algo in mnk_by_algo:
            gc.collect()
            print("Find optimal kernels for mnk=", mnk, ", algo=", algo)
            optimal_kernels_list.append(
                find_optimal_kernel(
                    mnk,
                    algo,
                    tree[algo]["tree"],
                    tree[algo]["features"],
                    gpu_properties,
                    autotuning_properties,
                ))
    else:

        # Chunk up tasks. The range stops at the number of tasks (not one past
        # it), so that no empty trailing chunk is dispatched when the number
        # of tasks is a multiple of chunk_size.
        for start_chunk in range(0, num_mnks_by_algo, chunk_size):
            end_chunk = min(start_chunk + chunk_size, num_mnks_by_algo)
            print("Completed {:,} tasks out of {:,}".format(
                start_chunk, num_mnks_by_algo))

            # Run prediction tasks in parallel with joblib.
            # `delayed` is called without the `check_pickle` keyword: it is
            # not accepted by recent joblib releases, and True was its default
            # in the releases that did accept it.
            optimal_kernels_list_ = Parallel(n_jobs=njobs, verbose=2)(
                delayed(find_optimal_kernel)(
                    mnk,
                    algo,
                    tree[algo]["tree"],
                    tree[algo]["features"],
                    gpu_properties,
                    autotuning_properties,
                ) for mnk, algo in mnk_by_algo[start_chunk:end_chunk])

            optimal_kernels_list += optimal_kernels_list_

    print("Finished gathering candidates for optimal parameter space")

    # Group optimal kernel candidates by (m,n,k) in a dictionary
    optimal_kernels_mnk_algo = dict()
    for optimal_kernel_mnk in optimal_kernels_list:
        for mnk, kernels_mnk in optimal_kernel_mnk.items():
            m, n, k = mnk
            optimal_kernels_mnk_algo.setdefault((m, n, k), []).append(kernels_mnk)

    # Find optimal kernel per mnk among the different algorithm possibilities
    optimal_kernels = dict()
    for mnk, candidate_kernels in optimal_kernels_mnk_algo.items():
        m, n, k = mnk
        best_candidates = sorted(candidate_kernels,
                                 key=lambda x: x.perf,
                                 reverse=True)[:top_k]
        optimal_kernels[(m, n, k)] = best_candidates[0]

    return optimal_kernels
Esempio n. 6
0
        help=
        "Path to model trained for algorithm 'medium'. If not given, ignore this algorithm.",
    )
    parser.add_argument(
        "--largeDB1",
        default=None,
        help=("Path to model trained for algorithm 'largeDB1'. "
              "If not given, ignore this algorithm."),
    )
    parser.add_argument(
        "--largeDB2",
        default=None,
        help=("Path to model trained for algorithm 'largeDB2'. "
              "If not given, ignore this algorithm."),
    )
    parser.add_argument(
        "-c",
        "--chunk_size",
        type=int,
        default=20000,
        help=("Chunk size for dispatching joblib jobs. "
              "If memory errors are experienced, reduce this number"),
    )

    args = parser.parse_args()

    # Each algorithm name is looked up as an attribute of the parsed
    # arguments; its value is the path to the algorithm's model
    parsed_arguments = vars(args)
    paths_to_models = {
        algo_name: parsed_arguments[algo_name]
        for algo_name in kernel_algorithm.keys()
    }
    main(
        args.params,
        args.njobs,
        args.baseline,
        paths_to_models,
        args.chunk_size,
    )
Esempio n. 7
0
        type=int,
        help="Number of parallel jobs that Joblib will launch. If you run into out-of-memory errors, reduce this.",
    )
    def _str_to_bool(value):
        """Convert a command-line string such as "true" or "0" to a bool."""
        normalized = value.strip().lower()
        if normalized in ("1", "true", "t", "yes", "y"):
            return True
        if normalized in ("", "0", "false", "f", "no", "n"):
            return False
        # argparse turns a ValueError raised by a `type` callable into a usage error
        raise ValueError("expected a boolean value, got {!r}".format(value))

    parser.add_argument(
        "-c",
        "--chunk_size",
        type=int,
        default=20000,
        help="Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number",
    )
    # NOTE: `type=bool` must not be used here: bool() of any non-empty string,
    # including "False", is True, so the option could never be switched off
    # explicitly on the command line.
    parser.add_argument(
        "-s",
        "--skip_derived_data",
        type=_str_to_bool,
        default=False,
        help="Skip the computation of derived data. Set to true if computing baseline & max records for each algo separately",
    )

    args = parser.parse_args()
    # An empty algorithm name means "process all known algorithms"
    algorithms_to_prep = (
        kernel_algorithm.keys() if args.algorithm == "" else [args.algorithm]
    )
    main(
        args.folder,
        algorithms_to_prep,
        args.arch,
        args.njobs,
        args.chunk_size,
        args.skip_derived_data,
    )
Esempio n. 8
0
def print_merging_commands(kernel_folders, kernel_folder_pattern, tunedir):
    """
    Print the shell commands to run in order to merge the partial CSV files.

    For each algorithm of `kernel_algorithm` and each data type ("raw_" and ""),
    a `head` command creating the merged file from a header line is printed
    (unless the merged file already exists), followed by a `tail` command
    appending the content of all partial files.

    :param kernel_folders: folders to search for a partial file providing the header
    :param kernel_folder_pattern: compiled regular expression whose first three
                                  groups are m, n and k when applied to a folder name
    :param tunedir: prefix put directly in front of `tune_*` in the printed `tail` command
    """
    for algorithm in kernel_algorithm.keys():
        for data_type in ("raw_", ""):

            if data_type == "raw_":
                data_type_name = "raw"
            else:
                data_type_name = "for predictive modelling"

            print(
                "\n$ # Merge instructions for algorithm",
                algorithm,
                "(",
                data_type_name,
                ")",
            )
            training_data_file = "{data_type}training_data_{algorithm}.csv".format(
                data_type=data_type, algorithm=algorithm)

            if os.path.exists(training_data_file):
                found_message = "$ # Found {}, append new training data to this file:"
                print(found_message.format(training_data_file))

            else:

                # Look for one existing partial file of this algorithm: its
                # header line is used to start the merged file
                header_source_found = False
                for kernel_folder in kernel_folders:

                    # Extract (m, n, k) from the folder name
                    groups = kernel_folder_pattern.search(kernel_folder).groups()
                    m = int(groups[0])
                    n = int(groups[1])
                    k = int(groups[2])

                    partial_file_name = "{data_type}training_data_{mnk}_{algorithm}.csv".format(
                        data_type=data_type,
                        mnk=to_string(m, n, k),
                        algorithm=algorithm,
                    )
                    file_name = os.path.join(kernel_folder, partial_file_name)
                    if not os.path.exists(file_name):
                        continue

                    head_command = "$ head -1 {base_file} > {training_data_file}"
                    print(head_command.format(
                        base_file=file_name,
                        training_data_file=training_data_file,
                    ))
                    header_source_found = True
                    break

                if not header_source_found:
                    print(
                        "None: did not find any existing files for algorithm",
                        algorithm,
                        "and data",
                        data_type_name,
                    )
                    # Nothing to merge for this algorithm and data type
                    continue

            tail_command = "$ tail -n +2 -q {tunedir}tune_*/{data_type}training_data_*_{algorithm}.csv >> {training_data_file}"
            print(tail_command.format(
                tunedir=tunedir,
                data_type=data_type,
                algorithm=algorithm,
                training_data_file=training_data_file,
            ))
Esempio n. 9
0
def gen_benchmark(outdir, gpu_properties, autotuning_properties, m, n, k):
    """
    Generate the CUDA source files of the benchmark for one (m, n, k).

    The launchers of all parameter sets to measure are spread over one or more
    executables. For each executable, the launcher codes are written to
    `tune_<m>x<n>x<k>_exe<i>_part<j>.cu` files and the `main` function is
    written to `tune_<m>x<n>x<k>_exe<i>_main.cu`, all in `outdir`.
    Nothing is written if no parameter set is found.

    :param outdir: directory in which the source files are written
    :param gpu_properties: passed through to `promising_parameters`
    :param autotuning_properties: passed through to `promising_parameters`
    :param m: first dimension of the (m, n, k) triplet
    :param n: second dimension of the (m, n, k) triplet
    :param k: third dimension of the (m, n, k) triplet
    """
    includes = []
    launcher_codes = []
    launchers = []
    kernel_descr = []

    # Get the kernel algorithms compatible with the given size:
    compatible_kernels = []
    for algo_name in kernel_algorithm.keys():
        if compatible_mnk(algo_name, m, n, k):
            compatible_kernels.append(kernel_algorithm[algo_name])

    # Get the parameter sets to measure for this (m,n,k)
    for kernclass in compatible_kernels:
        params = kernclass.promising_parameters(m, n, k, gpu_properties,
                                                autotuning_properties)
        if params == 0:
            continue

        for p in params:
            kern = kernclass(**p, source="autotuning_candidate", perf=0)
            includes.append("../kernels/" + kern.include)
            launcher_codes.append(kern.launcher_code)
            launchers.append("launch_" + kern.name)
            kernel_descr.append(kernclass.__name__ + format_params(p))

    print("Found %d parameter sets for %dx%dx%d" % (len(launchers), m, n, k))
    if not launchers:
        return

    # Compose the "include" lines shared by all the object files
    include_lines = ['#include "../kernels/cusmm_common.h"\n']
    include_lines.extend('#include "%s"\n' % inc for inc in set(includes))
    include_lines.append("\n\n")
    incl_output = "".join(include_lines)

    # Split the launchers over executables, and each executable over object files
    max_launchers_per_exe = 10000
    launchers_per_obj = 100
    num_launchers = len(launcher_codes)
    n_exe_files = num_launchers // max_launchers_per_exe + 1
    launchers_per_exe = num_launchers // n_exe_files + 1

    # Forward declarations of all the launchers, repeated in every "main" file
    launcher_declarations = "".join(
        "int " + launcher +
        "(int *param_stack, int stack_size, cudaStream_t stream, int m_max, int n_max, int k_max,"
        + " double *a_data, double *b_data, double *c_data);\n"
        for launcher in launchers)

    # Compose source code for each executable file
    for exe_idx in range(n_exe_files):
        chunk_a = exe_idx * launchers_per_exe
        chunk_b = min(chunk_a + launchers_per_exe, num_launchers)
        n_launchers_exe = chunk_b - chunk_a
        n_obj_files = n_launchers_exe // launchers_per_obj + 1

        # Compose source code for each object file
        for obj_idx in range(n_obj_files):
            a = chunk_a + obj_idx * launchers_per_obj
            b = min(a + launchers_per_obj, chunk_b)
            part_source = incl_output + "\n\n".join(launcher_codes[a:b])
            part_fn = outdir + "/tune_%dx%dx%d_exe%d_part%d.cu" % (
                m, n, k, exe_idx, obj_idx)
            writefile(part_fn, part_source)

        # Compose source code for "main" of executable file
        main_source = [
            '#include "../libcusmm_benchmark.h"\n\n',
            launcher_declarations,
            "\n",
            "int main(int argc, char** argv){\n",
            "libcusmm_benchmark_t* handle;\n",
            "KernelLauncher launchers[%d];\n" % n_launchers_exe,
            "char *kernel_descr[%d];\n" % n_launchers_exe,
        ]
        exe_launchers = zip(launchers[chunk_a:chunk_b],
                            kernel_descr[chunk_a:chunk_b])
        for j, (launcher, descr) in enumerate(exe_launchers):
            main_source.append("launchers[%d]    = %s;\n" % (j, launcher))
            main_source.append('kernel_descr[%d] = (char *) "%s";\n' % (j, descr))
        main_source.append(
            "libcusmm_benchmark_init(&handle, tune, %d, %d, %d);\n" % (m, n, k))
        main_source.append(
            "int result = libcusmm_benchmark(handle, %d, %d, %d, %d, launchers, kernel_descr);\n"
            % (m, n, k, n_launchers_exe))
        main_source.append("libcusmm_benchmark_finalize(handle);\n")
        main_source.append("return result;")
        main_source.append("}\n")

        main_fn = outdir + "/tune_%dx%dx%d_exe%d_main.cu" % (m, n, k, exe_idx)
        writefile(main_fn, "".join(main_source))