コード例 #1
0
def env():
    kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) {
            int i = blockIdx.x * block_size_x + threadIdx.x;
            if (i<n) {
                c[i] = a[i] + b[i];
            }
        } """
    args = get_vector_add_args()
    params = {"block_size_x": 128}

    lang = "CUDA"
    kernel_source = core.KernelSource(kernel_string, lang)
    verbose = True
    kernel_options = Options(kernel_name="vector_add",
                             kernel_string=kernel_string,
                             problem_size=args[-1],
                             arguments=args,
                             lang=lang,
                             grid_div_x=None,
                             grid_div_y=None,
                             grid_div_z=None,
                             cmem_args=None,
                             texmem_args=None,
                             block_size_names=None)
    device_options = Options(device=0,
                             platform=0,
                             lang=lang,
                             quiet=False,
                             compiler=None,
                             compiler_options=None)
    dev = core.DeviceInterface(kernel_source, iterations=7, **device_options)
    instance = dev.create_kernel_instance(kernel_source, kernel_options,
                                          params, verbose)

    return dev, instance
コード例 #2
0
ファイル: kernelbuilder.py プロジェクト: wjp/kernel_tuner
    def __init__(self, kernel_name, kernel_string, problem_size, arguments, params, inputs=None, outputs=None, device=0, platform=0,
                 block_size_names=None, grid_div_x=None, grid_div_y=None, grid_div_z=None, verbose=True, lang=None):
        """ Construct Python helper object to compile and call the kernel from Python

            This object compiles a GPU kernel parameterized using the parameters in params.
            GPU memory is allocated for each argument using its size and type as listed in arguments.
            The object can be called directly as a function with the kernel arguments as function arguments.
            Kernel arguments marked as inputs will be copied to the GPU on every kernel launch.
            Only the kernel arguments marked as outputs will be returned, note that the result is always
            returned in a list, even when there is only one output.

            Most of the arguments to this function are the same as with tune_kernel or run_kernel in Kernel Tuner,
            and are therefore not duplicated here. The two new arguments are:

            :param inputs: a boolean list of length arguments to signal whether an argument is input to the kernel
            :type inputs: list(bool)

            :param outputs: a boolean list of length arguments to signal whether an argument is output of the kernel
            :type outputs: list(bool)

        """
        #construct device interface
        kernel_source = core.KernelSource(kernel_string, lang)
        self.dev = core.DeviceInterface(kernel_source, device=device)

        #construct kernel_options to hold information about the kernel
        opts = locals()
        kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys() if k in opts.keys()])

        #instantiate the kernel given the parameters in params
        self.kernel_instance = self.dev.create_kernel_instance(kernel_source, kernel_options, params, verbose)

        #compile the kernel
        self.func = self.dev.compile_kernel(self.kernel_instance, verbose)

        #setup GPU memory
        self.gpu_args = self.dev.ready_argument_list(arguments)
        if inputs:
            self.inputs = inputs
        else:
            self.inputs = [True for _ in arguments]
        if outputs:
            self.outputs = outputs
        else:
            self.outputs = [True for _ in arguments]
コード例 #3
0
def test_check_kernel_output(dev_func_interface):
    dev_func_interface.configure_mock(**mock_config)

    dev = core.DeviceInterface(core.KernelSource("", lang="CUDA"))
    dfi = dev.dev

    answer = [np.zeros(4).astype(np.float32)]
    instance = core.KernelInstance("name", None, "kernel_string", "temp_files",
                                   (256, 1, 1), (1, 1, 1), {}, answer)
    wrong = [np.array([1, 2, 3, 4]).astype(np.float32)]
    atol = 1e-6

    test = dev.check_kernel_output('func', answer, instance, answer, atol,
                                   None, True)

    dfi.memcpy_htod.assert_called_once_with(answer[0], answer[0])
    dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1),
                                           (1, 1, 1))

    print(dfi.mock_calls)

    assert dfi.memcpy_dtoh.called == 1

    for name, args, _ in dfi.mock_calls:
        if name == 'memcpy_dtoh':
            assert all(args[0] == answer[0])
            assert all(args[1] == answer[0])
    assert test

    #the following call to check_kernel_output is expected to fail because
    #the answer is non-zero, while the memcpy_dtoh function on the Mocked object
    #obviously does not result in the result_host array containing anything
    #non-zero
    try:
        dev.check_kernel_output('func', wrong, instance, wrong, atol, None,
                                True)
        print("check_kernel_output failed to throw an exception")
        assert False
    except Exception:
        assert True
コード例 #4
0
def run_kernel(kernel_name, kernel_string, problem_size, arguments,
               params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
               lang=None, device=0, platform=0, cmem_args=None, texmem_args=None, compiler=None, compiler_options=None,
               block_size_names=None, quiet=False, log=None):

    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    kernel_source = core.KernelSource(kernel_string, lang)

    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    #sort options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    #detect language and create the right device function interface
    dev = core.DeviceInterface(kernel_source, iterations=1, **device_options)

    #move data to the GPU
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        #create kernel instance
        instance = dev.create_kernel_instance(kernel_source, kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        # see if the kernel arguments have correct type
        util.check_argument_list(instance.name, instance.kernel_string, arguments)

        #compile the kernel
        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        #add constant memory arguments to compiled module
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)
        #add texture memory arguments to compiled module
        if texmem_args is not None:
            dev.copy_texture_memory_args(texmem_args)
    finally:
        #delete temp files
        if instance is not None:
            instance.delete_temp_files()

    #run the kernel
    if not dev.run_kernel(func, gpu_args, instance):
        raise Exception("runtime error occured, too many resources requested")

    #copy data in GPU memory back to the host
    results = []
    for i, arg in enumerate(arguments):
        if numpy.isscalar(arg):
            results.append(arg)
        else:
            results.append(numpy.zeros_like(arg))
            dev.memcpy_dtoh(results[-1], gpu_args[i])

    #trying to make run_kernel work nicely with the Nvidia Visual Profiler
    del dev

    return results
コード例 #5
0
def tune_kernel(kernel_name, kernel_string, problem_size, arguments,
                tune_params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
                restrictions=None, answer=None, atol=1e-6, verify=None, verbose=False,
                lang=None, device=0, platform=0, cmem_args=None, texmem_args=None,
                compiler=None, compiler_options=None, log=None,
                iterations=7, block_size_names=None, quiet=False, strategy=None, strategy_options=None,
                cache=None):

    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    kernel_source = core.KernelSource(kernel_string, lang)

    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    # check for forbidden names in tune parameters
    util.check_tune_params_list(tune_params)

    # check whether block_size_names are used as expected
    util.check_block_size_params_names_list(block_size_names, tune_params)

    if iterations < 1:
        raise ValueError("Iterations should be at least one!")

    #sort all the options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    if strategy:
        if strategy in strategy_map:
            strategy = strategy_map[strategy]
        else:
            raise ValueError("Strategy %s not recognized" % strategy)

        #make strategy_options into an Options object
        if tuning_options.strategy_options:
            if not isinstance(strategy_options, Options):
                tuning_options.strategy_options = Options(strategy_options)

            #select strategy based on user options
            if "fraction" in tuning_options.strategy_options and not tuning_options.strategy == 'random_sample':
                raise ValueError('It is not possible to use fraction in combination with strategies other than "random_sample". ' \
                                 'Please set strategy="random_sample", when using "fraction" in strategy_options')

            #check if method is supported by the selected strategy
            if "method" in tuning_options.strategy_options:
                method = tuning_options.strategy_options.method
                if not method in strategy.supported_methods:
                    raise ValueError('Method %s is not supported for strategy %s' % (method, tuning_options.strategy))

        #if no strategy_options dict has been passed, create empty dictionary
        else:
            tuning_options.strategy_options = Options({})

    #if no strategy selected
    else:
        strategy = brute_force


    runner = SequentialRunner(kernel_source, kernel_options, device_options, iterations)

    #the user-specified function may or may not have an optional atol argument;
    #we normalize it so that it always accepts atol.
    tuning_options.verify = util.normalize_verify_function(tuning_options.verify)

    #process cache
    if cache:
        if cache[-5:] != ".json":
            cache += ".json"

        util.process_cache(cache, kernel_options, tuning_options, runner)
    else:
        tuning_options.cache = {}
        tuning_options.cachefile = None

    #call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options, tuning_options)

    #finished iterating over search space
    if not device_options.quiet:
        if results:     #checks if results is not empty
            best_config = min(results, key=lambda x: x['time'])
            units = getattr(runner, "units", None)
            print("best performing configuration:", util.get_config_string(best_config, list(tune_params.keys()) + ['time'], units=units))
        else:
            print("no results to report")

    if cache:
        util.close_cache(cache)

    del runner.dev

    return results, env
コード例 #6
0
def test_get_device_interface2():
    lang = "OpenCL"
    dev = core.DeviceInterface(core.KernelSource("", lang=lang))
    assert isinstance(dev, core.DeviceInterface)
    assert isinstance(dev.dev, opencl.OpenCLFunctions)
コード例 #7
0
def test_get_device_interface1():
    lang = "CUDA"
    dev = core.DeviceInterface(core.KernelSource("", lang=lang))
    assert isinstance(dev, core.DeviceInterface)
    assert isinstance(dev.dev, cuda.CudaFunctions)
コード例 #8
0
def test_get_device_interface1():
    lang = "CUDA"
    with core.DeviceInterface(core.KernelSource("", "", lang=lang)) as dev:
        assert isinstance(dev, core.DeviceInterface)
        assert isinstance(dev.dev, pycuda.PyCudaFunctions)