Exemple #1
0
    def check_argument_lists(self, kernel_name, arguments):
        """ Validate the kernel arguments against every kernel source string

        Every entry of self.kernel_sources that is not a callable is fetched
        with self.get_kernel_string and handed to util.check_argument_list
        together with the kernel name and the arguments. Callable entries
        are skipped; only a debug message is logged for them.
        """
        for index, source in enumerate(self.kernel_sources):
            if callable(source):
                # callables cannot be inspected as text, so only log and move on
                logging.debug("Checking of arguments list not supported yet for code generators.")
                continue
            util.check_argument_list(kernel_name, self.get_kernel_string(index), arguments)
Exemple #2
0
def run_kernel(kernel_name, kernel_string, problem_size, arguments,
               params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
               lang=None, device=0, platform=0, cmem_args=None, compiler_options=None,
               block_size_names=None, quiet=False):
    """Compile and run one kernel instance and return its arguments afterwards.

    The call arguments are split into kernel options and device options,
    a core.DeviceInterface is created, the arguments are checked and readied
    for the device, and a kernel instance is created, compiled and run.

    :returns: A list with one entry per element of ``arguments``. Entries for
        which ``numpy.isscalar`` is true are returned as passed in; every
        other entry is a new array created with ``numpy.zeros_like`` and
        filled through ``memcpy_dtoh`` from the matching device argument.

    :raises Exception: When the kernel instance cannot be created, when the
        kernel cannot be compiled, or when running the kernel reports failure.
    """
    # Snapshot of the call arguments; must happen here so that every option
    # name can be looked up by key below.
    opts = locals()
    kernel_options = Options([(key, opts[key]) for key in _kernel_options.keys()])
    device_options = Options([(key, opts[key]) for key in _device_options.keys()])

    # detect language and create the right device function interface
    device_interface = core.DeviceInterface(kernel_string, iterations=1, **device_options)

    # validate the arguments, then move the data to the GPU
    util.check_argument_list(arguments)
    device_args = device_interface.ready_argument_list(arguments)

    kernel_instance = None
    try:
        kernel_instance = device_interface.create_kernel_instance(kernel_options, params, False)
        if kernel_instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        compiled_kernel = device_interface.compile_kernel(kernel_instance, False)
        if compiled_kernel is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # add constant memory arguments to compiled module
        if cmem_args is not None:
            device_interface.copy_constant_memory_args(cmem_args)
    finally:
        # temp files are removed whether or not setup succeeded
        if kernel_instance is not None:
            for temp_file in kernel_instance.temp_files.values():
                util.delete_temp_file(temp_file)

    launched = device_interface.run_kernel(compiled_kernel, device_args, kernel_instance)
    if not launched:
        raise Exception("runtime error occured, too many resources requested")

    # copy data in GPU memory back to the host
    outputs = []
    for index, host_arg in enumerate(arguments):
        if not numpy.isscalar(host_arg):
            host_copy = numpy.zeros_like(host_arg)
            outputs.append(host_copy)
            device_interface.memcpy_dtoh(host_copy, device_args[index])
        else:
            outputs.append(host_arg)

    return outputs
Exemple #3
0
def _check_user_input(kernel_name, kernel_string, arguments, block_size_names):
    """Check the kernel arguments and the block size names supplied by the user.

    ``kernel_string`` may be a callable, a list of sources, or a single
    source. Callables are not checked (a debug message is logged instead);
    otherwise each source is resolved with ``util.get_kernel_string`` and
    passed to ``util.check_argument_list``. ``block_size_names`` is always
    passed to ``util.check_block_size_names`` afterwards.
    """
    if callable(kernel_string):
        logging.debug(
            "Checking of arguments list not supported yet for code generators."
        )
    else:
        # treat a single source as a one-element list so both cases share a loop
        sources = kernel_string if isinstance(kernel_string, list) else [kernel_string]
        for source in sources:
            util.check_argument_list(kernel_name,
                                     util.get_kernel_string(source),
                                     arguments)

    # check for types and length of block_size_names
    util.check_block_size_names(block_size_names)
def run_kernel(kernel_name, kernel_string, problem_size, arguments,
               params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
               lang=None, device=0, platform=0, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None,
               block_size_names=None, quiet=False, log=None):
    """Compile and run one kernel instance and return its arguments afterwards.

    Optionally configures logging to a file, wraps ``kernel_string`` in a
    ``core.KernelSource``, checks the user input, splits the call arguments
    into kernel and device options, and then creates, compiles and runs a
    kernel instance through a ``core.DeviceInterface``.

    :returns: A list with one entry per element of ``arguments``. Entries for
        which ``numpy.isscalar`` is true are returned as passed in; every
        other entry is a new array created with ``numpy.zeros_like`` and
        filled through ``memcpy_dtoh`` from the matching device argument.

    :raises Exception: When the kernel instance cannot be created, when the
        kernel cannot be compiled, or when running the kernel reports failure.
    """
    if log:
        timestamp = datetime.now().strftime('%Y%m%d-%H:%M:%S')
        logging.basicConfig(filename=kernel_name + timestamp + '.log', level=log)

    # NOTE: the name kernel_source is part of the locals() snapshot taken
    # below and must therefore not be renamed.
    kernel_source = core.KernelSource(kernel_string, lang)

    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    # sort options into separate dicts, looked up by name in the snapshot
    opts = locals()
    kernel_options = Options([(key, opts[key]) for key in _kernel_options.keys()])
    device_options = Options([(key, opts[key]) for key in _device_options.keys()])

    # detect language and create the right device function interface
    device_interface = core.DeviceInterface(kernel_source, iterations=1, **device_options)

    # move data to the GPU
    device_args = device_interface.ready_argument_list(arguments)

    kernel_instance = None
    try:
        kernel_instance = device_interface.create_kernel_instance(
            kernel_source, kernel_options, params, False)
        if kernel_instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        # see if the kernel arguments have correct type
        util.check_argument_list(kernel_instance.name, kernel_instance.kernel_string, arguments)

        compiled_kernel = device_interface.compile_kernel(kernel_instance, False)
        if compiled_kernel is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        # add constant and texture memory arguments to the compiled module
        if cmem_args is not None:
            device_interface.copy_constant_memory_args(cmem_args)
        if texmem_args is not None:
            device_interface.copy_texture_memory_args(texmem_args)
    finally:
        # temp files are removed whether or not setup succeeded
        if kernel_instance is not None:
            kernel_instance.delete_temp_files()

    launched = device_interface.run_kernel(compiled_kernel, device_args, kernel_instance)
    if not launched:
        raise Exception("runtime error occured, too many resources requested")

    # copy data in GPU memory back to the host
    outputs = []
    for index, host_arg in enumerate(arguments):
        if not numpy.isscalar(host_arg):
            host_copy = numpy.zeros_like(host_arg)
            outputs.append(host_copy)
            device_interface.memcpy_dtoh(host_copy, device_args[index])
        else:
            outputs.append(host_arg)

    # drop the device interface explicitly; this is an attempt to make
    # run_kernel work nicely with the Nvidia Visual Profiler
    del device_interface

    return outputs
Exemple #5
0
def tune_kernel(kernel_name, kernel_string, problem_size, arguments,
                tune_params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
                restrictions=None, answer=None, atol=1e-6, verify=None, verbose=False,
                lang=None, device=0, platform=0, cmem_args=None,
                num_threads=1, use_noodles=False, sample_fraction=False, compiler_options=None, log=None,
                iterations=7, block_size_names=None, quiet=False, strategy=None, method=None):
    """Tune a kernel by running a search strategy on top of a runner.

    The call arguments are sorted into kernel, tuning and device options.
    A strategy and a runner are then selected from the user options and the
    strategy's ``tune`` function is called. If any results come back, the
    configuration with the lowest ``'time'`` is printed.

    :param sample_fraction: When truthy, the random_sample strategy is used.
        May only be combined with ``strategy=None`` or
        ``strategy="sample_fraction"``.

    :param strategy: One of ``None``, ``"sample_fraction"``,
        ``"brute_force"``, ``"minimize"``, ``"basinhopping"`` or
        ``"diff_evo"``.

    :param method: Optional method name validated against the chosen
        strategy. For ``"minimize"`` and ``"basinhopping"`` a callable is
        also accepted, and ``"L-BFGS-B"`` is assigned when nothing is given.

    :param num_threads: Must be 1 unless ``use_noodles`` is set.

    :param use_noodles: Select the NoodlesRunner instead of the
        SequentialRunner; requires Python 3.5 or newer and an importable
        ``noodles`` package.

    :param log: When truthy, passed as ``level`` to ``logging.basicConfig``;
        the log file is named after the kernel name plus a timestamp.

    :returns: The ``(results, env)`` pair returned by the strategy.

    :raises ValueError: For an unrecognized or conflicting ``strategy``,
        ``method``, ``sample_fraction``, ``num_threads`` or ``use_noodles``
        combination, or when the Noodles requirements are not met.
    """
    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    #see if the kernel arguments have correct type
    util.check_argument_list(arguments)

    #sort all the options into separate dicts; the values are read from this
    #snapshot of the call arguments by name
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    #select strategy based on user options
    if sample_fraction and strategy not in [None, 'sample_fraction']:
        #the message names "sample_fraction" because that is the only strategy
        #string, besides None, that this check accepts
        raise ValueError("It's not possible to use both sample_fraction in combination with other strategies. "
                         'Please set strategy=None or strategy="sample_fraction", when using sample_fraction')

    if strategy in [None, 'sample_fraction', 'brute_force']:
        if sample_fraction:
            use_strategy = random_sample
        else:
            use_strategy = brute_force
    elif strategy in ["minimize", "basinhopping"]:
        if method:
            if not (method in ["Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B",
                               "TNC", "COBYLA", "SLSQP"] or callable(method)):
                raise ValueError("method option not recognized")
        else:
            # NOTE(review): tuning_options was already built from the locals()
            # snapshot above, so this default may not be visible through
            # tuning_options - confirm that the strategies do not rely on it.
            method = "L-BFGS-B"
        if strategy == "minimize":
            use_strategy = minimize
        else:
            use_strategy = basinhopping
    elif strategy == "diff_evo":
        use_strategy = diff_evo
        if method:
            if method not in ["best1bin", "best1exp", "rand1exp", "randtobest1exp", "best2exp",
                              "rand2exp", "randtobest1bin", "best2bin", "rand2bin", "rand1bin"]:
                raise ValueError("method option not recognized")
    else:
        raise ValueError("strategy option not recognized")
    strategy = use_strategy

    #select runner based on user options
    if num_threads == 1 and not use_noodles:
        from kernel_tuner.runners.sequential import SequentialRunner
        runner = SequentialRunner(kernel_options, device_options, iterations)
    elif num_threads > 1 and not use_noodles:
        raise ValueError("Using multiple threads requires the Noodles runner, use use_noodles=True")
    elif use_noodles:
        #check if Python version matches required by Noodles
        if sys.version_info < (3, 5):
            raise ValueError("Using multiple threads requires Noodles, Noodles requires Python 3.5 or higher")
        #check if noodles is installed in a way that works with Python 3.4 or newer
        noodles_installed = importlib.util.find_spec("noodles") is not None
        if not noodles_installed:
            raise ValueError("Using multiple threads requires Noodles, please use 'pip install noodles'")
        #import the NoodlesRunner
        from kernel_tuner.runners.noodles import NoodlesRunner
        runner = NoodlesRunner(device_options, num_threads)
    else:
        #reached when num_threads is below 1 and use_noodles is not set
        raise ValueError("Somehow no runner was selected, this should not happen, please file a bug report")

    #call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options, tuning_options)

    #finished iterating over search space
    if results:     #checks if results is not empty
        best_config = min(results, key=lambda x: x['time'])
        print("best performing configuration:", util.get_config_string(best_config))
    else:
        print("no results to report")

    del runner.dev

    return results, env