Code Example #1
File: core.py  Project: stijnh/kernel_tuner
    def check_kernel_output(self, func, gpu_args, instance, answer, atol,
                            verify, verbose):
        """runs the kernel once and checks the result against answer"""
        logging.debug('check_kernel_output')

        #if not using custom verify function, check if the length is the same
        if not verify and len(instance.arguments) != len(answer):
            raise TypeError(
                "The length of argument list and provided results do not match."
            )

        #re-copy original contents of output arguments to GPU memory, to overwrite any changes
        #by earlier kernel runs
        for i, arg in enumerate(instance.arguments):
            if (verify or answer[i] is not None) and isinstance(
                    arg, (np.ndarray, cp.ndarray, torch.Tensor)):
                self.dev.memcpy_htod(gpu_args[i], arg)

        #run the kernel
        check = self.run_kernel(func, gpu_args, instance)
        if not check:
            return True  #runtime failure occurred that should be ignored, skip correctness check

        #retrieve gpu results to host memory
        result_host = []
        for i, arg in enumerate(instance.arguments):
            if (verify or answer[i] is not None) and isinstance(
                    arg, (np.ndarray, cp.ndarray)):
                result_host.append(np.zeros_like(arg))
                self.dev.memcpy_dtoh(result_host[-1], gpu_args[i])
            elif isinstance(arg, torch.Tensor) and isinstance(
                    answer[i], torch.Tensor):
                if not answer[i].is_cuda:
                    #if the answer is on the host, copy gpu output to host as well
                    result_host.append(torch.zeros_like(answer[i]))
                    self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor)
                else:
                    result_host.append(gpu_args[i].tensor)
            else:
                result_host.append(None)

        #if the user has specified a custom verify function, then call it, else use default based on numpy allclose
        if verify:
            correct = verify(answer, result_host, atol=atol)
        else:
            correct = _default_verify_function(instance, answer, result_host,
                                               atol, verbose)

        if not correct:
            raise RuntimeError("Kernel result verification failed for: " +
                               util.get_config_string(instance.params))
        return True
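
The method above re-copies output arguments to the device, runs the kernel, pulls the results back into result_host, and then either calls a user-supplied verify function as verify(answer, result_host, atol=atol) or falls back on the numpy.allclose-based default. Below is a minimal sketch of what the caller-side answer list and a custom verify callable can look like; the vector_add-style argument list and the names are illustrative assumptions, not taken from the project.

import numpy as np

# illustrative argument list: c is the output, a, b and n are inputs
a = np.random.randn(1024).astype(np.float32)
b = np.random.randn(1024).astype(np.float32)
c = np.zeros_like(a)
args = [c, a, b, np.int32(a.size)]

# one entry per kernel argument; None marks arguments the default check skips
answer = [a + b, None, None, None]

# a custom verify callable receives the full answer and result lists plus atol,
# mirroring the call verify(answer, result_host, atol=atol) above
def verify(answer, result_host, atol=None):
    return np.allclose(answer[0], result_host[0], atol=atol)
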
Code Example #2
File: sequential.py  Project: tdwebste/kernel_tuner
    def run(self, parameter_space, kernel_options, tuning_options):
        """ Iterate through the entire parameter space using a single Python process

        :param parameter_space: The parameter space as an iterable.
        :type parameter_space: iterable

        :param kernel_options: A dictionary with all options for the kernel.
        :type kernel_options: kernel_tuner.interface.Options

        :param tuning_options: A dictionary with all options regarding the tuning
            process.
        :type tuning_options: kernel_tuner.interface.Options

        :returns: A list of dictionaries for executed kernel configurations and their
            execution times, and a dictionary that contains information
            about the hardware/software environment on which the tuning took place.
        :rtype: list(dict()), dict()

        """
        logging.debug('sequential runner started for ' +
                      kernel_options.kernel_name)

        results = []

        #iterate over parameter space
        for element in parameter_space:
            params = OrderedDict(
                zip(tuning_options.tune_params.keys(), element))

            time = self.dev.compile_and_benchmark(self.gpu_args, params,
                                                  kernel_options,
                                                  tuning_options)

            if time is None:
                logging.debug(
                    'received time is None, kernel configuration was skipped silently due to compile or runtime failure'
                )
                continue

            #print and append to results
            params['time'] = time
            output_string = get_config_string(params, self.units)
            logging.debug(output_string)
            if not self.quiet:
                print(output_string)
            results.append(params)

        return results, self.dev.get_environment()
Code Example #3
    def run(self, parameter_space, kernel_options, tuning_options):
        """ Iterate through the entire parameter space using a single Python process

        :param parameter_space: The parameter space as an iterable.
        :type parameter_space: iterable

        :param kernel_options: A dictionary with all options for the kernel.
        :type kernel_options: kernel_tuner.interface.Options

        :param tuning_options: A dictionary with all options regarding the tuning
            process.
        :type tuning_options: kernel_tuner.interface.Options

        :returns: A list of dictionaries for executed kernel configurations and their
            execution times, and a dictionary that contains information
            about the hardware/software environment on which the tuning took place.
        :rtype: list(dict()), dict()

        """
        logging.debug('sequential runner started for ' + kernel_options.kernel_name)

        results = []

        #iterate over parameter space
        for element in parameter_space:
            params = OrderedDict(zip(tuning_options.tune_params.keys(), element))

            time = self.dev.compile_and_benchmark(self.gpu_args, params, kernel_options, tuning_options)

            if time is None:
                logging.debug('received time is None, kernel configuration was skipped silently due to compile or runtime failure')
                continue

            #print and append to results
            params['time'] = time
            output_string = get_config_string(params, self.units)
            logging.debug(output_string)
            if not self.quiet:
                print(output_string)
            results.append(params)

        return results, self.dev.get_environment()
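
Both sequential-runner variants above expect parameter_space to be an iterable of value tuples in the same key order as tuning_options.tune_params. The following is a minimal sketch of how such an iterable can be built with itertools.product and consumed the way run() does; the parameter names are illustrative assumptions, and the real strategies may filter the product through restrictions first.

import itertools
from collections import OrderedDict

tune_params = OrderedDict()
tune_params["block_size_x"] = [32, 64, 128, 256]
tune_params["use_shared_mem"] = [0, 1]

# cartesian product of all tunable parameter values
parameter_space = itertools.product(*tune_params.values())

# the same zip that run() performs for every element
for element in parameter_space:
    params = OrderedDict(zip(tune_params.keys(), element))
    print(params)
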
Code Example #4
File: interface.py  Project: tdwebste/kernel_tuner
def tune_kernel(kernel_name,
                kernel_string,
                problem_size,
                arguments,
                tune_params,
                grid_div_x=None,
                grid_div_y=None,
                grid_div_z=None,
                restrictions=None,
                answer=None,
                atol=1e-6,
                verify=None,
                verbose=False,
                lang=None,
                device=0,
                platform=0,
                cmem_args=None,
                num_threads=1,
                use_noodles=False,
                sample_fraction=False,
                compiler=None,
                compiler_options=None,
                log=None,
                iterations=7,
                times=False,
                block_size_names=None,
                quiet=False,
                strategy=None,
                method=None):

    if log:
        logging.basicConfig(filename=kernel_name +
                            datetime.now().strftime('%Y%m%d-%H:%M:%S') +
                            '.log',
                            level=log)

    _check_user_input(kernel_name, kernel_string, arguments, block_size_names)

    # check for forbidden names in tune parameters
    util.check_tune_params_list(tune_params)

    # check whether block_size_names are used as expected
    util.check_block_size_params_names_list(block_size_names, tune_params)

    if iterations < 1:
        raise ValueError("Iterations should be at least one!")

    #sort all the options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    #select strategy based on user options
    if sample_fraction and strategy not in [None, 'sample_fraction']:
        raise ValueError("It is not possible to use sample_fraction in combination with other strategies. " \
                         'Please set strategy=None or strategy="sample_fraction" when using sample_fraction')

    if strategy in [None, 'sample_fraction', 'brute_force']:
        if sample_fraction:
            use_strategy = random_sample
        else:
            use_strategy = brute_force
    elif strategy in ["minimize", "basinhopping"]:
        if method:
            if not (method in [
                    "Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC",
                    "COBYLA", "SLSQP"
            ] or callable(method)):
                raise ValueError("method option not recognized")
        else:
            method = "L-BFGS-B"
        if strategy == "minimize":
            use_strategy = minimize
        else:
            use_strategy = basinhopping
    elif strategy == "diff_evo":
        use_strategy = diff_evo
        if method:
            if not method in [
                    "best1bin", "best1exp", "rand1exp", "randtobest1exp",
                    "best2exp", "rand2exp", "randtobest1bin", "best2bin",
                    "rand2bin", "rand1bin"
            ]:
                raise ValueError("method option not recognized")
    else:
        raise ValueError("strategy option not recognized")
    strategy = use_strategy

    #select runner based on user options
    if num_threads == 1 and not use_noodles:
        from kernel_tuner.runners.sequential import SequentialRunner
        runner = SequentialRunner(kernel_options, device_options, iterations)
    elif num_threads > 1 and not use_noodles:
        raise ValueError(
            "Using multiple threads requires the Noodles runner, use use_noodles=True"
        )
    elif use_noodles:
        #check if Python version matches required by Noodles
        if sys.version_info[0] < 3 or (sys.version_info[0] == 3
                                       and sys.version_info[1] < 5):
            raise ValueError(
                "Using multiple threads requires Noodles, Noodles requires Python 3.5 or higher"
            )
        #check if noodles is installed in a way that works with Python 3.4 or newer
        noodles_installed = importlib.util.find_spec("noodles") is not None
        if not noodles_installed:
            raise ValueError(
                "Using multiple threads requires Noodles, please use 'pip install noodles'"
            )
        #import the NoodlesRunner
        from kernel_tuner.runners.noodles import NoodlesRunner
        runner = NoodlesRunner(device_options, num_threads)
    else:
        raise ValueError(
            "Somehow no runner was selected, this should not happen, please file a bug report"
        )

    #call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options,
                                 tuning_options)

    #finished iterating over search space
    if not device_options.quiet:
        if results:  #checks if results is not empty
            best_config = min(results, key=lambda x: x['time'])
            units = getattr(runner, "units", None)
            print("best performing configuration:",
                  util.get_config_string(best_config, units=units))
        else:
            print("no results to report")

    del runner.dev

    return results, env
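
For reference, a hedged sketch of how this tune_kernel variant might be called with one of the strategies it recognizes ("minimize", "basinhopping", "diff_evo"); the vector_add kernel string, sizes, and parameter values are illustrative assumptions, not taken from the project.

import numpy
from collections import OrderedDict
from kernel_tuner import tune_kernel

kernel_string = """
__global__ void vector_add(float *c, float *a, float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
"""

size = 1000000
a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
c = numpy.zeros_like(a)
args = [c, a, b, numpy.int32(size)]

tune_params = OrderedDict()
tune_params["block_size_x"] = [32 * i for i in range(1, 33)]

# basinhopping with one of the scipy.optimize methods accepted above
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                           strategy="basinhopping", method="Nelder-Mead")
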
Code Example #5
def _default_verify_function(instance, answer, result_host, atol, verbose):
    """default verify function based on numpy.allclose"""

    #first check if the length is the same
    if len(instance.arguments) != len(answer):
        raise TypeError("The length of argument list and provided results do not match.")
    #for each element in the argument list, check if the types match
    for i, arg in enumerate(instance.arguments):
        if answer[i] is not None:    #skip None elements in the answer list
            if isinstance(answer[i], numpy.ndarray) and isinstance(arg, numpy.ndarray):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i) + " of the expected results list is not of the same dtype as the kernel output: " +
                                    str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
                if answer[i].size != arg.size:
                    raise TypeError("Element " + str(i) + " of the expected results list has a size different from " + "the kernel argument: " +
                                    str(answer[i].size) + " != " + str(arg.size) + ".")
            elif isinstance(answer[i], numpy.number) and isinstance(arg, numpy.number):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i) + " of the expected results list is not the same as the kernel output: " + str(answer[i].dtype) +
                                    " != " + str(arg.dtype) + ".")
            else:
                #either answer[i] and argument have different types or answer[i] is not a numpy type
                if not isinstance(answer[i], (numpy.ndarray, numpy.number)):
                    raise TypeError("Element " + str(i) + " of expected results list is not a numpy array or numpy scalar.")
                else:
                    raise TypeError("Element " + str(i) + " of expected results list and kernel arguments have different types.")

    def _ravel(a):
        if hasattr(a, 'ravel') and len(a.shape) > 1:
            return a.ravel()
        return a

    def _flatten(a):
        if hasattr(a, 'flatten'):
            return a.flatten()
        return a

    correct = True
    for i, arg in enumerate(instance.arguments):
        expected = answer[i]
        if expected is not None:

            result = _ravel(result_host[i])
            expected = _flatten(expected)
            output_test = numpy.allclose(expected, result, atol=atol)

            if not output_test and verbose:
                print("Error: " + util.get_config_string(instance.params) + " detected during correctness check")
                print("this error occured when checking value of the %oth kernel argument" % (i, ))
                print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
                numpy.set_printoptions(edgeitems=50)
                print("Kernel output:")
                print(result)
                print("Expected:")
                print(expected)
            correct = correct and output_test

    if not correct:
        logging.debug('correctness check has found a correctness issue')

    return correct
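
The default verify function above flattens each non-None answer entry and the corresponding result, then compares them with numpy.allclose(expected, result, atol=atol), leaving numpy's default relative tolerance in place. A small self-contained sketch of that comparison, to make the role of atol concrete; the values are illustrative.

import numpy

expected = numpy.zeros(4, dtype=numpy.float32)
result = expected + numpy.float32(1e-7)

# passes: the absolute error of 1e-7 is within atol=1e-6
print(numpy.allclose(expected, result, atol=1e-6))   # True

# fails: near zero the relative-tolerance term is negligible, so atol=1e-9 rejects it
print(numpy.allclose(expected, result, atol=1e-9))   # False
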
Code Example #6
def tune_kernel(kernel_name, kernel_string, problem_size, arguments,
                tune_params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
                restrictions=None, answer=None, atol=1e-6, verify=None, verbose=False,
                lang=None, device=0, platform=0, cmem_args=None, texmem_args=None,
                compiler=None, compiler_options=None, log=None,
                iterations=7, block_size_names=None, quiet=False, strategy=None, strategy_options=None,
                cache=None):

    if log:
        logging.basicConfig(filename=kernel_name + datetime.now().strftime('%Y%m%d-%H:%M:%S') + '.log', level=log)

    kernel_source = core.KernelSource(kernel_string, lang)

    _check_user_input(kernel_name, kernel_source, arguments, block_size_names)

    # check for forbidden names in tune parameters
    util.check_tune_params_list(tune_params)

    # check whether block_size_names are used as expected
    util.check_block_size_params_names_list(block_size_names, tune_params)

    if iterations < 1:
        raise ValueError("Iterations should be at least one!")

    #sort all the options into separate dicts
    opts = locals()
    kernel_options = Options([(k, opts[k]) for k in _kernel_options.keys()])
    tuning_options = Options([(k, opts[k]) for k in _tuning_options.keys()])
    device_options = Options([(k, opts[k]) for k in _device_options.keys()])

    logging.debug('tune_kernel called')
    logging.debug('kernel_options: %s', util.get_config_string(kernel_options))
    logging.debug('tuning_options: %s', util.get_config_string(tuning_options))
    logging.debug('device_options: %s', util.get_config_string(device_options))

    if strategy:
        if strategy in strategy_map:
            strategy = strategy_map[strategy]
        else:
            raise ValueError("Strategy %s not recognized" % strategy)

        #make strategy_options into an Options object
        if tuning_options.strategy_options:
            if not isinstance(strategy_options, Options):
                tuning_options.strategy_options = Options(strategy_options)

            #select strategy based on user options
            if "fraction" in tuning_options.strategy_options and not tuning_options.strategy == 'random_sample':
                raise ValueError('It is not possible to use fraction in combination with strategies other than "random_sample". ' \
                                 'Please set strategy="random_sample", when using "fraction" in strategy_options')

            #check if method is supported by the selected strategy
            if "method" in tuning_options.strategy_options:
                method = tuning_options.strategy_options.method
                if not method in strategy.supported_methods:
                    raise ValueError('Method %s is not supported for strategy %s' % (method, tuning_options.strategy))

        #if no strategy_options dict has been passed, create empty dictionary
        else:
            tuning_options.strategy_options = Options({})

    #if no strategy selected
    else:
        strategy = brute_force


    runner = SequentialRunner(kernel_source, kernel_options, device_options, iterations)

    #the user-specified function may or may not have an optional atol argument;
    #we normalize it so that it always accepts atol.
    tuning_options.verify = util.normalize_verify_function(tuning_options.verify)

    #process cache
    if cache:
        if cache[-5:] != ".json":
            cache += ".json"

        util.process_cache(cache, kernel_options, tuning_options, runner)
    else:
        tuning_options.cache = {}
        tuning_options.cachefile = None

    #call the strategy to execute the tuning process
    results, env = strategy.tune(runner, kernel_options, device_options, tuning_options)

    #finished iterating over search space
    if not device_options.quiet:
        if results:     #checks if results is not empty
            best_config = min(results, key=lambda x: x['time'])
            units = getattr(runner, "units", None)
            print("best performing configuration:", util.get_config_string(best_config, list(tune_params.keys()) + ['time'], units=units))
        else:
            print("no results to report")

    if cache:
        util.close_cache(cache)

    del runner.dev

    return results, env
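
A hedged usage sketch of this newer tune_kernel interface, exercising the paths shown above: a "random_sample" strategy combined with a "fraction" entry in strategy_options, and a cache filename that gets ".json" appended when the extension is missing. It reuses the vector_add setup from the sketch after Code Example #4; kernel, sizes, and values remain illustrative assumptions.

# kernel_string, size, args and tune_params prepared as in the earlier vector_add sketch;
# randomly sample 10% of the search space and persist results to vector_add_cache.json
results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params,
                           strategy="random_sample",
                           strategy_options={"fraction": 0.1},
                           cache="vector_add_cache")
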
Code Example #7
def tune():
    with open('reduction.cl', 'r') as f:
        kernel_string = f.read()

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5, 11)]
    tune_params["vector"] = [2**i for i in range(3)]
    tune_params["num_blocks"] = [2**i for i in range(5, 11)]

    problem_size = "num_blocks"
    size = 80000000
    max_blocks = max(tune_params["num_blocks"])

    x = numpy.random.rand(size).astype(numpy.float32)
    sum_x = numpy.zeros(max_blocks).astype(numpy.float32)
    n = numpy.int32(size)

    args = [sum_x, x, n]

    #prepare output verification with custom function
    reference = [numpy.sum(x), None, None]

    def verify_partial_reduce(cpu_result, gpu_result, atol=None):
        return numpy.isclose(cpu_result, numpy.sum(gpu_result), atol=atol)

    #tune the first kernel
    first_kernel, _ = tune_kernel("sum_floats",
                                  kernel_string,
                                  problem_size,
                                  args,
                                  tune_params,
                                  grid_div_x=[],
                                  verbose=True,
                                  answer=reference,
                                  verify=verify_partial_reduce)

    #tune the second kernel for different input sizes
    #depending on the number of blocks used in the first kernel

    #store the parameter list used in the first kernel
    num_blocks = tune_params["num_blocks"]
    #fix num_blocks parameter to only 1 for the second kernel
    tune_params["num_blocks"] = [1]
    second_kernel = dict()
    for nblocks in num_blocks:
        #change the input size to nblocks
        args = [sum_x, x, numpy.int32(nblocks)]
        #tune the second kernel with n=nblocks
        result, _ = tune_kernel("sum_floats",
                                kernel_string,
                                problem_size,
                                args,
                                tune_params,
                                grid_div_x=[],
                                verbose=True)
        with open("reduce-kernel2-" + str(nblocks) + ".json", 'w') as fp:
            json.dump(result, fp)
        #only keep the best performing config
        second_kernel[nblocks] = min(result, key=lambda x: x['time'])

    #combine the results from the first kernel with best
    #second kernel that uses the same num_blocks
    for i, instance in enumerate(first_kernel):
        first_kernel[i]["total"] = instance["time"] + second_kernel[
            instance["num_blocks"]]["time"]

    best_config = min(first_kernel, key=lambda x: x['total'])

    print("Best performing config: \n" + get_config_string(best_config))
    print("uses the following config for the secondary kernel:")
    print(get_config_string(second_kernel[best_config["num_blocks"]]))

    with open("reduce.json", 'w') as fp:
        json.dump(first_kernel, fp)

    return first_kernel, second_kernel
Code Example #8
File: core.py  Project: benvanwerkhoven/kernel_tuner
def _default_verify_function(instance, answer, result_host, atol, verbose):
    """default verify function based on numpy.allclose"""

    #first check if the length is the same
    if len(instance.arguments) != len(answer):
        raise TypeError("The length of argument list and provided results do not match.")
    #for each element in the argument list, check if the types match
    for i, arg in enumerate(instance.arguments):
        if answer[i] is not None: #skip None elements in the answer list
            if isinstance(answer[i], numpy.ndarray) and isinstance(arg, numpy.ndarray):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i)
                                    + " of the expected results list is not of the same dtype as the kernel output: "
                                    + str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
                if answer[i].size != arg.size:
                    raise TypeError("Element " + str(i)
                                    + " of the expected results list has a size different from "
                                    + "the kernel argument: "
                                    + str(answer[i].size) + " != " + str(arg.size) + ".")
            elif isinstance(answer[i], numpy.number) and isinstance(arg, numpy.number):
                if answer[i].dtype != arg.dtype:
                    raise TypeError("Element " + str(i)
                                    + " of the expected results list is not the same as the kernel output: "
                                    + str(answer[i].dtype) + " != " + str(arg.dtype) + ".")
            else:
                #either answer[i] and argument have different types or answer[i] is not a numpy type
                if not isinstance(answer[i], (numpy.ndarray, numpy.number)):
                    raise TypeError("Element " + str(i)
                                    + " of expected results list is not a numpy array or numpy scalar.")
                else:
                    raise TypeError("Element " + str(i)
                                    + " of expected results list and kernel arguments have different types.")

    def _ravel(a):
        if hasattr(a, 'ravel') and len(a.shape) > 1:
            return a.ravel()
        return a

    def _flatten(a):
        if hasattr(a, 'flatten'):
            return a.flatten()
        return a

    correct = True
    for i, arg in enumerate(instance.arguments):
        expected = answer[i]
        if expected is not None:

            result = _ravel(result_host[i])
            expected = _flatten(expected)
            output_test = numpy.allclose(expected, result, atol=atol)

            if not output_test and verbose:
                print("Error: " + util.get_config_string(instance.params) + " detected during correctness check")
                print("this error occured when checking value of the %oth kernel argument" % (i,))
                print("Printing kernel output and expected result, set verbose=False to suppress this debug print")
                numpy.set_printoptions(edgeitems=50)
                print("Kernel output:")
                print(result)
                print("Expected:")
                print(expected)
            correct = correct and output_test

    if not correct:
        logging.debug('correctness check has found a correctness issue')
        raise Exception("Error: " + util.get_config_string(instance.params) + " failed correctness check")

    return correct
Code Example #9
File: hyper.py  Project: stijnh/kernel_tuner
def tune_hyper_params(target_strategy, hyper_params, *args, **kwargs):
    """ Tune hyperparameters for a given strategy and kernel

    This function is to be called just like tune_kernel, except that you specify a strategy
    and a dictionary with hyperparameters in front of the arguments you pass to tune_kernel.

    The arguments to tune_kernel should contain a cachefile. To compute the optimum, the hyperparameter
    tuner first tunes the kernel with a brute-force search. If your cachefile is not yet complete,
    this may take a very long time.

    :param target_strategy: Specify the strategy for which to tune hyperparameters
    :type target_strategy: string

    :param hyper_params: A dictionary containing the hyperparameters as keys and
        lists of possible values per key
    :type hyper_params: dict(string: list)

    :param args: all positional arguments used to call tune_kernel
    :type args: various

    :param kwargs: other keyword arguments to pass to tune_kernel
    :type kwargs: dict

    """
    if not "cache" in kwargs:
        raise ValueError(
            "Please specify a cachefile to store benchmarking data when tuning hyperparameters"
        )

    def put_if_not_present(d, key, value):
        d[key] = value if not key in d else d[key]

    put_if_not_present(kwargs, "verbose", False)
    put_if_not_present(kwargs, "quiet", True)
    put_if_not_present(kwargs, "simulation_mode", True)
    kwargs['strategy'] = 'brute_force'

    #last positional argument is tune_params
    tune_params = args[-1]

    #find optimum
    kwargs["strategy"] = "brute_force"
    results, env = kernel_tuner.tune_kernel(*args, **kwargs)
    optimum = min(results, key=lambda p: p["time"])["time"]

    #could throw a warning for the kwargs that will be overwritten, strategy(_options)
    kwargs["strategy"] = target_strategy

    parameter_space = itertools.product(*hyper_params.values())
    all_results = []

    for params in parameter_space:
        strategy_options = dict(zip(hyper_params.keys(), params))

        kwargs["strategy_options"] = strategy_options

        fevals = []
        p_of_opt = []
        for _ in range(100):
            #measure
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                results, env = kernel_tuner.tune_kernel(*args, **kwargs)

            #get unique function evaluations
            unique_fevals = {
                ",".join(
                    [str(v) for k, v in record.items() if k in tune_params])
                for record in results
            }

            fevals.append(len(unique_fevals))
            # p_of_opt.append(optimum / min(results, key=lambda p: p["time"])["time"] * 100)
            p_of_opt.append(
                min(results, key=lambda p: p["time"])["time"] / optimum * 100)

        strategy_options["fevals"] = np.average(fevals)
        strategy_options["fevals_std"] = np.std(fevals)

        strategy_options["p_of_opt"] = np.average(p_of_opt)
        strategy_options["p_of_opt_std"] = np.std(p_of_opt)

        print(get_config_string(strategy_options))
        all_results.append(strategy_options)

    return all_results
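
A hedged sketch of how tune_hyper_params might be invoked, following its docstring: the target strategy name and the hyperparameter dictionary come first, the usual positional tune_kernel arguments follow (the last one being tune_params), and a cache keyword is mandatory. The import path is inferred from the file name shown above, and the strategy name "genetic_algorithm" and the hyperparameter keys are assumptions for illustration only; kernel_string, size, args and tune_params are assumed to be set up as in the earlier vector_add sketch.

from collections import OrderedDict
from kernel_tuner.hyper import tune_hyper_params

# hypothetical hyperparameters of the target strategy and their candidate values
hyper_params = OrderedDict()
hyper_params["popsize"] = [10, 20, 30]
hyper_params["maxiter"] = [50, 100]

# kernel_string, size, args and tune_params prepared exactly as for tune_kernel;
# cache must point at a (preferably complete) cachefile for the brute-force reference run
all_results = tune_hyper_params("genetic_algorithm", hyper_params,
                                "vector_add", kernel_string, size, args, tune_params,
                                cache="vector_add_cache.json")
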
Code Example #10
def tune():
    with open('reduction.cl', 'r') as f:
        kernel_string = f.read()

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["vector"] = [2**i for i in range(3)]
    tune_params["num_blocks"] = [2**i for i in range(5,11)]

    problem_size = "num_blocks"
    size = 80000000
    max_blocks = max(tune_params["num_blocks"])

    x = numpy.random.rand(size).astype(numpy.float32)
    sum_x = numpy.zeros(max_blocks).astype(numpy.float32)
    n = numpy.int32(size)

    args = [sum_x, x, n]

    #prepare output verification with custom function
    reference = [numpy.sum(x), None, None]
    def verify_partial_reduce(cpu_result, gpu_result, atol=None):
        return numpy.isclose(cpu_result, numpy.sum(gpu_result), atol=atol)

    #tune the first kernel
    first_kernel, _ = tune_kernel("sum_floats", kernel_string, problem_size,
        args, tune_params, grid_div_x=[], verbose=True, answer=reference, verify=verify_partial_reduce)

    #tune the second kernel for different input sizes
    #depending on the number of blocks used in the first kernel

    #store the parameter list used in the first kernel
    num_blocks = tune_params["num_blocks"]
    #fix num_blocks parameter to only 1 for the second kernel
    tune_params["num_blocks"] = [1]
    second_kernel = dict()
    for nblocks in num_blocks:
        #change the input size to nblocks
        args = [sum_x, x, numpy.int32(nblocks)]
        #tune the second kernel with n=nblocks
        result, _ = tune_kernel("sum_floats", kernel_string, problem_size,
                                args, tune_params, grid_div_x=[], verbose=True)
        with open("reduce-kernel2-" + str(nblocks) + ".json", 'w') as fp:
            json.dump(result, fp)
        #only keep the best performing config
        second_kernel[nblocks] = min(result, key=lambda x:x['time'])

    #combine the results from the first kernel with best
    #second kernel that uses the same num_blocks
    for i, instance in enumerate(first_kernel):
        first_kernel[i]["total"] = instance["time"] + second_kernel[instance["num_blocks"]]["time"]

    best_config = min(first_kernel, key=lambda x:x['total'])

    print("Best performing config: \n" + get_config_string(best_config))
    print("uses the following config for the secondary kernel:")
    print(get_config_string(second_kernel[best_config["num_blocks"]]))

    with open("reduce.json", 'w') as fp:
        json.dump(first_kernel, fp)

    return first_kernel, second_kernel
Code Example #11
def tune(algorithm, do_strategy):

    result_summary = {}

    tune_func = algorithms[algorithm]['method']

    test_methods = strategy_options[do_strategy]

    for method in test_methods:

        if method:
            experiment_name = do_strategy + "_" + method
        else:
            experiment_name = do_strategy

        summary = OrderedDict()
        summary["best"] = []
        summary["best_times"] = []
        summary["execution_time"] = []

        try:

            #test all methods multiple times because some methods are stochastic
            for i in range(32 if do_strategy != "brute_force" else 1):

                outfile = algorithm + "/" + algorithm + "_" + experiment_name
                if do_strategy != "brute_force":
                    outfile += "_" + str(i)
                outfile += ".json"

                if os.path.isfile(outfile):
                    print("output file %s already exists, skipping this experiment" % outfile)
                    continue

                start = time.time()

                if 'options' in algorithms[algorithm]:
                    results, env = tune_func(do_strategy, method, algorithms[algorithm]['options'])
                else:
                    results, env = tune_func(do_strategy, method)

                end = time.time()
                env['execution_time'] = end-start

                gc.collect()

                with open(outfile, 'w') as fp:
                    json.dump(results, fp)

                best_config = min(results, key=lambda x:x['time'])
                summary["best"].append(best_config)
                summary["best_times"].append(best_config['time'])
                summary["execution_time"].append(env['execution_time'])

        finally:
            if len(summary["best"]) > 0:
                result_summary[experiment_name] = summary
                update_results_db(algorithm, result_summary)



    #print some output at end of run, not strictly necessary
    with open(algorithm + "/" + algorithm + "_summary.json", 'r') as fp:
        result_summary = json.load(fp)

    total_ops = algorithms[algorithm]['total_ops']
    unit = algorithms[algorithm]['unit']

    for k, d in result_summary.items():
        print(k)
        for i, config in enumerate(d["best"]):
            print(get_config_string(config), str(total_ops / (config['time'] /1e3)) + " " + unit, str(d["execution_time"][i]) + " sec")
        print("average best performance: " + str(numpy.average(d["best_times"])))
        print("average execution_time: " + str(numpy.average(d["execution_time"])))
Code Example #12
File: core.py  Project: TaihuLight/kernel_tuner
    def check_kernel_correctness(self, func, gpu_args, instance, answer, atol,
                                 verify, verbose):
        """runs the kernel once and checks the result against answer"""
        logging.debug('check_kernel_correctness')
        params = instance.params

        #zero GPU memory for output arguments
        for i, arg in enumerate(instance.arguments):
            if answer[i] is not None:
                self.dev.memset(gpu_args[i], 0, arg.nbytes)

        #run the kernel
        if not self.run_kernel(func, gpu_args, instance):
            return True  #runtime failure occurred that should be ignored, skip correctness check

        def _ravel(a):
            if hasattr(a, 'ravel') and len(a.shape) > 1:
                return a.ravel()
            return a

        def _flatten(a):
            if hasattr(a, 'flatten'):
                return a.flatten()
            return a

        #check correctness of each output argument
        correct = True
        for i, arg in enumerate(instance.arguments):
            expected = answer[i]
            if expected is not None:
                result_host = numpy.zeros_like(arg)
                self.dev.memcpy_dtoh(result_host, gpu_args[i])

                result_host = _ravel(result_host)
                expected = _flatten(expected)
                if verify is None:
                    output_test = numpy.allclose(expected,
                                                 result_host,
                                                 atol=atol)
                else:
                    try:
                        output_test = verify(expected, result_host, atol=atol)
                    except TypeError:
                        output_test = verify(expected, result_host)

                if not output_test and verbose:
                    print("Error: " + util.get_config_string(params) +
                          " detected during correctness check")
                    print(
                        "Printing kernel output and expected result, set verbose=False to suppress this debug print"
                    )
                    numpy.set_printoptions(edgeitems=50)
                    print("Kernel output:")
                    print(result_host)
                    print("Expected:")
                    print(expected)
                correct = correct and output_test
                del result_host
        if not correct:
            logging.debug('correctness check has found a correctness issue')
            raise Exception("Error: " + util.get_config_string(params) +
                            " failed correctness check")
        return correct
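
The try/except TypeError around the verify call above exists so that user-supplied verify functions with or without an atol keyword both work: the function is first called as verify(expected, result_host, atol=atol) and, if that raises TypeError, as verify(expected, result_host). A minimal sketch of both variants, assuming nothing beyond these call sites.

import numpy

# accepts atol: called as verify(expected, result_host, atol=atol)
def verify_close(expected, result, atol=1e-6):
    return numpy.allclose(expected, result, atol=atol)

# takes no atol keyword: the TypeError fallback calls it as verify(expected, result_host)
def verify_exact(expected, result):
    return numpy.array_equal(expected, result)
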