def tune():

    size = int(80e6)

    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["N"] = [size]
    tune_params["NTHREADS"] = [16, 8, 4, 2, 1]

    print("compile with ftn using intel on cray")
    result, env = tune_kernel("time_vector_add", "vector_add.F90", size,
                              args, tune_params, lang="C", compiler="ftn")

    print("compile with gfortran")
    result, env = tune_kernel("time_vector_add", "vector_add.F90", size,
                              args, tune_params, lang="C", compiler="gfortran")

    print("compile with pgfortran")
    result, env = tune_kernel("time_vector_add", "vector_add.F90", size,
                              args, tune_params, lang="C", compiler="pgfortran")

    return result
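
tune_kernel returns a list of records, one per benchmarked configuration; a minimal sketch of picking the fastest result from the last run above (assuming, as in kernel_tuner, that each record holds the tunable parameter values plus the measured 'time' in milliseconds):

# sketch: select the configuration with the lowest runtime
best = min(result, key=lambda record: record["time"])
print("fastest:", best["NTHREADS"], "thread(s) at", best["time"], "ms")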
Example #2
def tune_expdist():

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,10)]
    tune_params["block_size_x"] = [2**i for i in range(5,10)]
    tune_params["block_size_y"] = [2**i for i in range(6)]
    tune_params["tile_size_x"] = [2**i for i in range(4)]
    tune_params["tile_size_y"] = [2**i for i in range(4)]
    tune_params["use_shared_mem"] = [0, 1]

    #setup test input
    alloc_size = 3000
    size = numpy.int32(2000)
    max_blocks = numpy.int32( numpy.ceil(size / float(numpy.amin(tune_params["block_size_x"]))) *
                              numpy.ceil(size / float(numpy.amin(tune_params["block_size_y"]))) )
    ndim = numpy.int32(2)
    A = numpy.random.randn(alloc_size*ndim).astype(numpy.float64)
    B = A+0.00001*numpy.random.randn(alloc_size*ndim).astype(numpy.float64)
    scale_A = numpy.absolute(0.01*numpy.random.randn(alloc_size).astype(numpy.float64))
    scale_B = numpy.absolute(0.01*numpy.random.randn(alloc_size).astype(numpy.float64))
    cost = numpy.zeros((max_blocks)).astype(numpy.float64)

    #setup kernel
    with open('expdist.cu', 'r') as f:
        kernel_string = f.read()
    arguments = [A, B, size, size, scale_A, scale_B, cost]
    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    #tune using the Noodles runner for parallel tuning using 8 threads
    kernel1 = tune_kernel("ExpDist", kernel_string, (size, size), arguments, tune_params,
                          grid_div_x=grid_div_x, grid_div_y=grid_div_y,
                          num_threads=8, use_noodles=True, verbose=True)

    #dump the tuning results to a json file
    with open("expdist.json", 'w') as fp:
        json.dump(kernel1, fp)

    #get the number of blocks used by the best configuration in the first kernel
    best_config1 = min(kernel1[0], key=lambda x:x['time'])
    nblocks = numpy.int32( numpy.ceil(size / float(best_config1["block_size_x"]*best_config1["tile_size_x"])) *
                           numpy.ceil(size / float(best_config1["block_size_y"]*best_config1["tile_size_y"])) )

    #tunable parameters for the second kernel
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)]

    #tune the second kernel, again in parallel with 8 threads
    arguments = [numpy.zeros(1).astype(numpy.float64), cost, size, size, nblocks]
    kernel2 = tune_kernel("reduce_cross_term", kernel_string, 1, arguments, tune_params,
                grid_div_x=[], num_threads=8, use_noodles=True, verbose=True)

    best_config2 = min(kernel2[0], key=lambda x:x['time'])
    print("best GPU configuration, total time=", best_config1['time'] + best_config2['time'])
    print(best_config1)
    print(best_config2)
Example #3
def tune():
    with open('convolution.cu', 'r') as f:
        kernel_string = f.read()

    #setup tunable parameters
    tune_params = OrderedDict()
    tune_params["filter_height"] = [i for i in range(3,19,2)]
    tune_params["filter_width"] = [i for i in range(3,19,2)]
    tune_params["block_size_x"] = [16*i for i in range(1,65)]
    tune_params["block_size_y"] = [2**i for i in range(6)]
    tune_params["tile_size_x"] = [i for i in range(1,11)]
    tune_params["tile_size_y"] = [i for i in range(1,11)]

    tune_params["use_padding"] = [0,1]  #toggle the insertion of padding in shared memory
    tune_params["read_only"] = [0,1]    #toggle using the read-only cache

    #limit the search to only use padding when it's effective, and require at least 32 threads per block
    restrict = ["use_padding==0 or (block_size_x % 32 != 0)", "block_size_x*block_size_y >= 32"]

    #setup input and output dimensions
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)
    largest_fh = max(tune_params["filter_height"])
    largest_fw = max(tune_params["filter_width"])
    input_size = ((problem_size[0]+largest_fw-1) * (problem_size[1]+largest_fh-1))

    #create input data
    output_image = numpy.zeros(size).astype(numpy.float32)
    input_image = numpy.random.randn(input_size).astype(numpy.float32)
    filter_weights = numpy.random.randn(largest_fh * largest_fw).astype(numpy.float32)

    #setup kernel arguments
    cmem_args = {'d_filter': filter_weights}
    args = [output_image, input_image, filter_weights]

    #tell the Kernel Tuner how to compute grid dimensions
    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    #start tuning separable convolution (row)
    tune_params["filter_height"] = [1]
    tune_params["tile_size_y"] = [1]
    results_row = tune_kernel("convolution_kernel", kernel_string,
        problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x, cmem_args=cmem_args, verbose=True, restrictions=restrict)

    #start tuning separable convolution (col)
    tune_params["filter_height"] = tune_params["filter_width"][:]
    tune_params["file_size_y"] = tune_params["tile_size_x"][:]
    tune_params["filter_width"] = [1]
    tune_params["tile_size_x"] = [1]
    results_col = tune_kernel("convolution_kernel", kernel_string,
        problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x, cmem_args=cmem_args, verbose=True, restrictions=restrict)

    return results_row, results_col
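
The grid_div_x and grid_div_y lists name the tunable parameters whose product divides the problem size in each dimension. A simplified sketch of that computation (kernel_tuner also accepts expressions in these lists, not just parameter names, as the spmv example further down shows):

import math

def grid_size(problem_dim, grid_div, params):
    # divide the problem dimension by the product of its divisor parameters
    divisor = 1
    for name in grid_div:
        divisor *= params[name]
    return math.ceil(problem_dim / divisor)

# e.g. block_size_x=16 and tile_size_x=2 on a 4096-wide problem:
# grid_size(4096, ["block_size_x", "tile_size_x"],
#           {"block_size_x": 16, "tile_size_x": 2}) == 128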
Example #4
def tune_correlate_full_kernel(kernel_name):

    with open(get_kernel_path()+'correlate_full.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(1e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N, factor=1750.0)

    #setup kernel arguments
    row_idx = np.zeros(10).astype(np.int32)         #not used in first kernel
    col_idx = np.zeros(10).astype(np.int32)         #not used in first kernel
    prefix_sums = np.zeros(10).astype(np.int32)     #not used in first kernel
    sums = np.zeros(N).astype(np.int32)
    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    #run the sums kernel once
    params = {"block_size_x": 256, "write_sums": 1}
    answer = run_kernel(kernel_name, kernel_string, problem_size, args, params)
    reference = [None for _ in range(len(args))]
    reference[3] = answer[3]
    sums = reference[3].astype(np.int32)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)] #multiples of 32
    tune_params["write_sums"] = [1]
    tune_params["write_spm"] = [0]

    kernel_1 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True)

    #tune kernel #2
    total_correlated_hits = sums.sum()
    print("total_correlated_hits", total_correlated_hits)
    print("density", total_correlated_hits/(float(N)*sliding_window_width))

    col_idx = np.zeros(total_correlated_hits).astype(np.int32)
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    prefix_sums = np.cumsum(sums).astype(np.int32)
    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    tune_params["write_sums"] = [0]
    tune_params["write_spm"] = [1]

    kernel_2 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True)

    return kernel_1, kernel_2
Example #5
def tune_degrees_dense():

    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N)
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        qd_string = f.read()
    data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args, {"block_size_x": 512, "write_sums": 1})
    correlations = data[0]
    sums = data[1]  #per-hit count of correlated hits occurring later in time

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["window_width"] = [sliding_window_width]

    args = [sums, correlations, N]
    return tune_kernel("degrees_dense", kernel_string, problem_size, args, tune_params, verbose=True)
def test_sequential_runner_alt_block_size_names():

    kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_dim_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    c, a, b, n = get_vector_add_args()
    args = [c, a, b, n]
    tune_params = {"block_dim_x": [128 + 64 * i for i in range(5)],
                   "block_size_y": [1], "block_size_z": [1]}

    ref = (a+b).astype(np.float32)
    answer = [ref, None, None, None]

    block_size_names = ["block_dim_x"]

    result, _ = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, int(n), args,
        tune_params, grid_div_x=["block_dim_x"], answer=answer,
        block_size_names=block_size_names)

    assert len(result) == len(tune_params["block_dim_x"])
def tune():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128+64*i for i in range(15)]

    result = tune_kernel("vector_add", kernel_string, size, args, tune_params)

    with open("vector_add.json", 'w') as fp:
        json.dump(result, fp)

    return result
Example #8
def tune_minimum_degree():

    with open(get_kernel_path()+'minimum_degree.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #tune params here
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["threshold"] = [3]

    max_blocks = int(np.ceil(N / float(min(tune_params["block_size_x"]))))

    #generate input data with an expected density of correlated hits
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)
    row_idx, col_idx, prefix_sums = create_sparse_matrix(correlations, sums)

    #setup all kernel inputs
    minimum = np.zeros(max_blocks).astype(np.int32)
    num_nodes = np.zeros(max_blocks).astype(np.int32)

    #call the CUDA kernel
    args = [minimum, num_nodes, sums, row_idx, col_idx, prefix_sums, N]
    return tune_kernel("minimum_degree", kernel_string, problem_size, args, tune_params, verbose=True)
def test_noodles_runner():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 100
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)

    args = [c, a, b, n]
    tune_params = {"block_size_x": [128+64*i for i in range(15)]}

    result, _ = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, size, args, tune_params,
        use_noodles=True, num_threads=4)

    assert len(result) == len(tune_params["block_size_x"])
Example #10
def tune():
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)

    A = numpy.random.randn(*problem_size).astype(numpy.float32)
    B = numpy.random.randn(*problem_size).astype(numpy.float32)
    C = numpy.zeros_like(A)

    args = [C, A, B]
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [16*2**i for i in range(3)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    tune_params["tile_size_x"] = [2**i for i in range(4)]
    tune_params["tile_size_y"] = [2**i for i in range(4)]

    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    restrict = ["block_size_x==block_size_y*tile_size_y"]

    answer = [numpy.dot(A,B), None, None]

    return kernel_tuner.tune_kernel("matmul_kernel", "matmul.cl",
        problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x,
        restrictions=restrict, verbose=True, answer=answer, atol=1e-3)
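
Each string in restrictions is a boolean expression over the tunable parameters; configurations that evaluate to False are skipped before benchmarking. A rough equivalent of the pruning step (a sketch, not kernel_tuner's actual implementation):

import itertools

def prune(tune_params, restrictions):
    names = list(tune_params)
    for values in itertools.product(*tune_params.values()):
        config = dict(zip(names, values))
        # keep only configurations that satisfy every restriction
        if all(eval(expr, {}, config) for expr in restrictions):
            yield config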
Example #11
def tune_quadratic_difference_kernel():

    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N)

    #setup kernel arguments
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)] #multiples of 32
    tune_params["f_unroll"] = [i for i in range(1,20) if 1500/float(i) == 1500//i] #divisors of 1500
    tune_params["tile_size_x"] = [2**i for i in range(5)] #powers of 2
    tune_params["write_sums"] = [1]

    return tune_kernel("quadratic_difference_linear", kernel_string, problem_size, args, tune_params, verbose=True)
Example #12
def tune_dense2sparse():

    with open(get_kernel_path()+'dense2sparse.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input
    correlations, sums = generate_large_correlations_table(N, sliding_window_width)

    #setup all kernel inputs
    prefix_sums = np.cumsum(sums).astype(np.int32)
    total_correlated_hits = sums.sum()
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    col_idx = np.zeros(total_correlated_hits).astype(np.int32)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)] #factors of 32 up to 1024
    tune_params["window_width"] = [sliding_window_width]
    tune_params["use_shared"] = [0, 1]
    tune_params["f_unroll"] = [i for i in range(1,5) if 1500/float(i) == 1500//i] #divisors of 1500

    #call the tuner
    args = [row_idx, col_idx, prefix_sums, correlations, N]
    return tune_kernel("dense2sparse_kernel", kernel_string, problem_size, args, tune_params, verbose=True)
Example #13
def test_strategies(env):

    options = dict(popsize=5, max_fevals=15)

    for strategy in strategy_map:
        print(f"testing {strategy}")
        result, _ = kernel_tuner.tune_kernel(*env, strategy=strategy, strategy_options=options,
                                             verbose=False, cache=cache_filename, simulation_mode=True)
        assert len(result) > 0
Example #14
def test_genetic_algorithm(env):
    options = dict(method="uniform", popsize=10, maxiter=2, mutation_change=1)
    result, _ = kernel_tuner.tune_kernel(*env,
                                         strategy="genetic_algorithm",
                                         strategy_options=options,
                                         verbose=True,
                                         cache=cache_filename,
                                         simulation_mode=True)
    assert len(result) > 0
Example #15
def test_simulation_runner(env):
    cache_filename = os.path.dirname(
        os.path.realpath(__file__)) + "/test_cache_file.json"
    result, _ = kernel_tuner.tune_kernel(*env,
                                         cache=cache_filename,
                                         simulation_mode=True,
                                         verbose=True)
    tune_params = env[-1]
    assert len(result) == len(tune_params["block_size_x"])
Example #16
def test_nvml_observer(env):
    nvmlobserver = NVMLObserver(["nvml_energy", "temperature"])
    env[-1]["block_size_x"] = [128]

    result, _ = kernel_tuner.tune_kernel(*env, observers=[nvmlobserver])

    assert "nvml_energy" in result[0]
    assert "temperature" in result[0]
    assert result[0]["temperature"] > 0
def tune(nodes, edges, elements, max_levels, max_tile, real_type, quiet=True):
    numpy_real_type = None
    if real_type == "float":
        numpy_real_type = numpy.float32
    elif real_type == "double":
        numpy_real_type = numpy.float64
    else:
        raise ValueError
    # Tuning and code generation parameters
    tuning_parameters = dict()
    tuning_parameters["int_type"] = ["unsigned_int", "int"]
    tuning_parameters["real_type"] = [real_type]
    tuning_parameters["max_levels"] = [str(max_levels)]
    tuning_parameters["block_size_x"] = [32 * i for i in range(1, 33)]
    tuning_parameters["tiling_x"] = [i for i in range(1, max_tile)]
    constraints = list()
    constraints.append("block_size_x * tiling_x <= max_levels")
    # Memory allocation and initialization
    fct_adf_h = numpy.random.randn(edges * max_levels).astype(numpy_real_type)
    fct_adf_h_control = numpy.copy(fct_adf_h)
    fct_plus = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    fct_minus = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    levels = numpy.zeros(elements).astype(numpy.int32)
    for element in range(0, elements):
        levels[element] = numpy.random.randint(3, max_levels)
    nodes_per_edge = numpy.zeros(edges * 2).astype(numpy.int32)
    elements_per_edge = numpy.zeros(edges * 2).astype(numpy.int32)
    for edge in range(0, edges):
        nodes_per_edge[edge * 2] = numpy.random.randint(1, nodes + 1)
        nodes_per_edge[(edge * 2) + 1] = numpy.random.randint(1, nodes + 1)
        elements_per_edge[edge * 2] = numpy.random.randint(1, elements + 1)
        elements_per_edge[(edge * 2) + 1] = numpy.random.randint(
            0, elements + 1)
    arguments = [
        numpy.int32(max_levels), levels, nodes_per_edge, elements_per_edge,
        fct_adf_h, fct_plus, fct_minus
    ]
    # Reference
    memory_bytes = reference(edges, nodes_per_edge, elements_per_edge, levels,
                             max_levels, fct_adf_h_control, fct_plus,
                             fct_minus, numpy_real_type)
    arguments_control = [None, None, None, None, fct_adf_h_control, None, None]
    # Tuning
    results, _ = tune_kernel("fct_ale_b3_horizontal",
                             generate_code,
                             "{} * block_size_x".format(edges),
                             arguments,
                             tuning_parameters,
                             lang="CUDA",
                             answer=arguments_control,
                             restrictions=constraints,
                             quiet=quiet)
    # Memory bandwidth
    for result in results:
        result["memory_bandwidth"] = memory_bytes / (result["time"] / 10**3)
    return results
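
kernel_tuner reports time in milliseconds, so dividing by 10**3 converts it to seconds and memory_bandwidth comes out in bytes per second. A quick worked example:

# moving 1 GB in 2 ms: 1e9 / (2.0 / 10**3) = 5e11 bytes/s, i.e. 500 GB/s
bandwidth = 1e9 / (2.0 / 10**3)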
Example #18
def tune(nodes, max_levels, max_tile, real_type, quiet=True):
    numpy_real_type = None
    if real_type == "float":
        numpy_real_type = numpy.float32
    elif real_type == "double":
        numpy_real_type = numpy.float64
    else:
        raise ValueError
    # Tuning and code generation parameters
    tuning_parameters = dict()
    tuning_parameters["int_type"] = ["unsigned_int", "int"]
    tuning_parameters["real_type"] = [real_type]
    tuning_parameters["max_levels"] = [str(max_levels)]
    tuning_parameters["block_size_x"] = [32 * i for i in range(1, 33)]
    tuning_parameters["tiling_x"] = [i for i in range(1, max_tile)]
    constraints = list()
    constraints.append("block_size_x * tiling_x <= max_levels")
    # Memory allocation and initialization
    fct_low_order = numpy.random.randn(nodes *
                                       max_levels).astype(numpy_real_type)
    ttf = numpy.random.randn(nodes * max_levels).astype(numpy_real_type)
    fct_ttf_max = numpy.zeros(nodes * max_levels).astype(numpy_real_type)
    fct_ttf_min = numpy.zeros_like(fct_ttf_max).astype(numpy_real_type)
    fct_ttf_max_control = numpy.zeros_like(fct_ttf_max).astype(numpy_real_type)
    fct_ttf_min_control = numpy.zeros_like(fct_ttf_min).astype(numpy_real_type)
    levels = numpy.zeros(nodes).astype(numpy.int32)
    used_levels = 0
    for node in range(0, nodes):
        levels[node] = numpy.random.randint(3, max_levels)
        used_levels = used_levels + (levels[node] - 1)
    arguments = [
        numpy.int32(max_levels), fct_low_order, ttf, levels, fct_ttf_max,
        fct_ttf_min
    ]
    # Reference
    reference(nodes, levels, max_levels, fct_low_order, ttf,
              fct_ttf_max_control, fct_ttf_min_control)
    arguments_control = [
        None, None, None, None, fct_ttf_max_control, fct_ttf_min_control
    ]
    # Tuning
    results, _ = tune_kernel("fct_ale_a1",
                             generate_code,
                             "{} * block_size_x".format(nodes),
                             arguments,
                             tuning_parameters,
                             lang="CUDA",
                             answer=arguments_control,
                             restrictions=constraints,
                             quiet=quiet)
    # Memory bandwidth
    memory_bytes = ((nodes * 4) +
                    (used_levels * 4 * numpy.dtype(numpy_real_type).itemsize))
    for result in results:
        result["memory_bandwidth"] = memory_bytes / (result["time"] / 10**3)
    return results
def tune(number_of_frequencies):

    N = 61
    T = 20 
    K = 150
    F = number_of_frequencies
    B = N*(N-1)//2 * T

    print('N', N, 'B', B, 'T', T, 'K', K, 'F', F)

    args = generate_input_data(B, N, T, K, F)

    problem_size = B

    tune_params = OrderedDict()
    tune_params['block_size_x'] = [2**i for i in range(5,10)]

    print("First call the reference kernel")
    ref = call_reference_kernel(N, B, T, K, F, args)
    answer = [None for _ in args]
    answer[-2] = ref[-2]

    tolerance = 1e-2
    verbosity = False
    print("Next, we call the modified kernel, with (use_kernel = 1) and without (use_kernel = 0) the slave kernel")
    print("With slave kernel:")
    # tune_kernel("kernel_coherencies", get_kernel_path()+"predict_model.cu",
    #                           problem_size, args, {'block_size_x': [32], 'use_kernel': [1]}, compiler_options=cp, verbose=True, answer=answer, atol=tolerance)

    tune_params['use_kernel'] = [1]
    results, env = tune_kernel("kernel_coherencies", get_kernel_path()+"predict_model.cu",
                               problem_size, args, tune_params, compiler_options=cp, verbose=verbosity, answer=answer, atol=tolerance)

    min_time_with_slave = min([item['time'] for item in results])

    print("Without slave kernel:")
    tune_params['use_kernel'] = [0]
    results, env = tune_kernel("kernel_coherencies", get_kernel_path()+"predict_model.cu",
                               problem_size, args, tune_params, compiler_options=cp, verbose=verbosity, answer=answer, atol=tolerance)

    min_time_without_slave = min([item['time'] for item in results])

    return min_time_with_slave/min_time_without_slave
Example #20
def tune():

    size = int(80e6)

    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["N"] = [size]
    tune_params["NTHREADS"] = [16, 8, 4, 2, 1]

    print("compile with ftn using intel on cray")
    result, env = tune_kernel("time_vector_add",
                              "vector_add.F90",
                              size,
                              args,
                              tune_params,
                              lang="C",
                              compiler="ftn")

    print("compile with gfortran")
    result, env = tune_kernel("time_vector_add",
                              "vector_add.F90",
                              size,
                              args,
                              tune_params,
                              lang="C",
                              compiler="gfortran")

    print("compile with pgfortran")
    result, env = tune_kernel("time_vector_add",
                              "vector_add.F90",
                              size,
                              args,
                              tune_params,
                              lang="C",
                              compiler="pgfortran")

    return result
Example #21
def tune_complex_and_flip(kernel_string, height, width, image, image2):
    """step 1 convert to complex data structure and flip pattern"""
    problem_size = (width, height)
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    image_freq = np.zeros((height,width,2), dtype=np.float32)
    image2_freq = np.zeros((height,width,2), dtype=np.float32)

    args = [height, width, image_freq, image2_freq, image, image2]
    params = {"block_size_x": 32, "block_size_y": 16}
    output = run_kernel("toComplexAndFlip2",
        kernel_string, problem_size, args, params, grid_div_y=["block_size_y"])

    tune_kernel("toComplexAndFlip2", kernel_string, problem_size,
        args, tune_params, grid_div_y=["block_size_y"])

    return output[2], output[3]
def tune_pnpoly():

    #change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    points_x = points[::2]
    points_y = points[1::2]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    vertex_xy = numpy.array(list(zip(vertex_x, vertex_y))).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()

    #tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32

    tune_params["tile_size"] = [2**i for i in range(6)]
    tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, [bitmap, points, size], params, cmem_args={"d_vertices": vertex_xy})
    result = [result[0], None, None, None]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, answer=result, lang="C", verbose=True)

    return results, tune_params
def tune(elements, nodes, max_levels, max_tile, real_type, quiet=True):
    numpy_real_type = None
    if real_type == "float":
        numpy_real_type = numpy.float32
    elif real_type == "double":
        numpy_real_type = numpy.float64
    else:
        raise ValueError
    # Tuning and code generation parameters
    tuning_parameters = dict()
    tuning_parameters["int_type"] = ["unsigned_int", "int"]
    tuning_parameters["real_type"] = [real_type]
    tuning_parameters["max_levels"] = [str(max_levels)]
    tuning_parameters["block_size_x"] = [32 * i for i in range(1, 33)]
    tuning_parameters["tiling_x"] = [i for i in range(1, max_tile)]
    tuning_parameters["vector_size"] = [1, 2]
    constraints = list()
    constraints.append("block_size_x * tiling_x <= max_levels")
    # Memory allocation and initialization
    uv_rhs = numpy.zeros(elements * max_levels * 2).astype(numpy_real_type)
    uv_rhs_control = numpy.zeros_like(uv_rhs).astype(numpy_real_type)
    fct_ttf_max = numpy.random.randn(nodes *
                                     max_levels).astype(numpy_real_type)
    fct_ttf_min = numpy.random.randn(nodes *
                                     max_levels).astype(numpy_real_type)
    levels = numpy.zeros(elements).astype(numpy.int32)
    element_nodes = numpy.zeros(elements * 3).astype(numpy.int32)
    for element in range(0, elements):
        levels[element] = numpy.random.randint(3, max_levels)
        element_nodes[(element * 3)] = numpy.random.randint(1, nodes + 1)
        element_nodes[(element * 3) + 1] = numpy.random.randint(1, nodes + 1)
        element_nodes[(element * 3) + 2] = numpy.random.randint(1, nodes + 1)
    arguments = [
        numpy.int32(max_levels), levels, element_nodes, uv_rhs, fct_ttf_max,
        fct_ttf_min
    ]
    # Reference
    memory_bytes = reference(elements, levels, max_levels, element_nodes,
                             uv_rhs_control, fct_ttf_max, fct_ttf_min,
                             real_type)
    arguments_control = [None, None, None, uv_rhs_control, None, None]
    # Tuning
    results, _ = tune_kernel("fct_ale_a2",
                             generate_code,
                             "{} * block_size_x".format(elements),
                             arguments,
                             tuning_parameters,
                             lang="CUDA",
                             answer=arguments_control,
                             restrictions=constraints,
                             quiet=quiet)
    # Memory bandwidth
    for result in results:
        result["memory_bandwidth"] = memory_bytes / (result["time"] / 10**3)
    return results
Example #24
def test_random_sample(env):
    result, _ = kernel_tuner.tune_kernel(*env,
                                         strategy="random_sample",
                                         strategy_options={"fraction": 0.1},
                                         cache=cache_filename,
                                         simulation_mode=True)
    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 2
    # check all returned results make sense
    for v in result:
        assert v['time'] > 0.0 and v['time'] < 1.0
Example #25
def test_custom_observer(env):
    env[-1]["block_size_x"] = [128]

    class MyObserver(BenchmarkObserver):
        def get_results(self):
            return {"name": self.dev.name}

    result, _ = kernel_tuner.tune_kernel(*env, observers=[MyObserver()])

    assert "name" in result[0]
    assert len(result[0]["name"]) > 0
Example #26
def tune():

    #set the number of points and the number of vertices
    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    #allocate device mapped host memory and generate input data
    points = allocate(2 * size, numpy.float32)
    numpy.copyto(points, numpy.random.randn(2 * size).astype(numpy.float32))

    bitmap = allocate(size, numpy.int32)
    numpy.copyto(bitmap, numpy.zeros(size).astype(numpy.int32))
    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices) * 2.0 *
                              numpy.pi)[::-1]
    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    vertex_xy = allocate(2 * vertices, numpy.float32)
    numpy.copyto(
        vertex_xy,
        numpy.array(list(zip(vertex_x,
                             vertex_y))).astype(numpy.float32).ravel())

    #kernel arguments
    args = [bitmap, points, vertex_xy, size]

    #setup tunable parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i
                                   for i in range(1, 32)]  #multiple of 32
    tune_params["tile_size"] = [1] + [2 * i for i in range(1, 11)]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    #tell the Kernel Tuner how to compute the grid dimensions from the problem_size
    grid_div_x = ["block_size_x", "tile_size"]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host",
                                       ['pnpoly_host.cu', 'pnpoly.cu'],
                                       problem_size,
                                       args,
                                       tune_params,
                                       grid_div_x=grid_div_x,
                                       lang="C",
                                       compiler_options=["-arch=sm_52"],
                                       verbose=True,
                                       log=logging.DEBUG)

    return results
Example #27
def tune():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]

    # This example illustrates how to use metrics
    # metrics can be either specified as functions or using strings

    # metrics need to be OrderedDicts because we can compose
    # earlier defined metrics into new metrics
    metrics = OrderedDict()

    # This metric is the well-known GFLOP/s performance metric
    # we can define the value of the metric using a function that accepts 1 argument
    # this argument is a dictionary with all the tunable parameters and benchmark results
    # the value of the metric is calculated directly after obtaining the benchmark results
    metrics["GFLOP/s"] = lambda p: (size / 1e9) / (p["time"] / 1000)

    # Alternatively you can specify the metric using strings
    # in these strings you can use the names of the kernel parameters and benchmark results
    # directly as they will be replaced by the tuner before evaluating this string
    metrics["GB/s"] = f"({size}*4*2/1e9) / (time/1000)"

    result = tune_kernel("vector_add",
                         kernel_string,
                         size,
                         args,
                         tune_params,
                         metrics=metrics)

    with open("vector_add.json", 'w') as fp:
        json.dump(result, fp)

    return result
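
Because metrics is an OrderedDict and entries are evaluated in order, a later metric can be composed from earlier ones by name; a small illustrative sketch (the 'FLOP/byte' name is hypothetical):

# composed metric: reuses the GFLOP/s and GB/s values computed above
metrics["FLOP/byte"] = lambda p: p["GFLOP/s"] / p["GB/s"]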
Example #28
def tune():
    with open('spmv.cu', 'r') as f:
        kernel_string = f.read()

    nrows = numpy.int32(128 * 1024)
    ncols = 64 * 1024
    nnz = int(nrows * ncols * 0.001)
    #problem_size = (nrows, 1)
    problem_size = nrows

    #generate sparse matrix in CSR
    rows = numpy.asarray([0] + sorted(numpy.random.rand(nrows - 1) * nnz) +
                         [nnz]).astype(numpy.int32)
    cols = (numpy.random.rand(nnz) * ncols).astype(numpy.int32)
    vals = numpy.random.randn(nnz).astype(numpy.float32)

    #input and output vector  (y = matrix * x)
    x = numpy.random.randn(ncols).astype(numpy.float32)
    y = numpy.zeros(nrows).astype(numpy.float32)

    args = [y, rows, cols, vals, x, nrows]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 33)]
    tune_params["threads_per_row"] = [1, 32]
    tune_params["read_only"] = [0, 1]

    grid_div_x = ["block_size_x/threads_per_row"]

    #compute reference answer using scipy.sparse
    row_ind = list(
        chain.from_iterable([[i] * (rows[i + 1] - rows[i])
                             for i in range(nrows)]))
    matrix = csr_matrix((vals, (row_ind, cols)), shape=(nrows, ncols))
    start = time.perf_counter()
    expected_y = matrix.dot(x)
    end = time.perf_counter()
    print("computing reference using scipy.sparse took: " +
          str((end - start) * 1000.0) + " ms.")

    answer = [expected_y, None, None, None, None, None]

    return kernel_tuner.tune_kernel("spmv_kernel",
                                    kernel_string,
                                    problem_size,
                                    args,
                                    tune_params,
                                    grid_div_x=grid_div_x,
                                    verbose=True,
                                    answer=answer,
                                    atol=1e-4)
Example #29
def tune_horizontal(kernel_string, image, height, width):
    args = [height, width, image]

    #use only one column of thread blocks
    problem_size = (1, height)
    grid_div_x = []
    grid_div_y = ["block_size_y"]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,9)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    return tune_kernel("computeMeanHorizontally", kernel_string, problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x)
Example #30
def test_bayesian_optimization(env):
    for method in [
            "poi", "ei", "lcb", "lcb-srinivas", "multi", "multi-advanced",
            "multi-fast"
    ]:
        print(method, flush=True)
        options = dict(popsize=5, max_fevals=10, method=method)
        result, _ = kernel_tuner.tune_kernel(*env,
                                             strategy="bayes_opt",
                                             strategy_options=options,
                                             verbose=True,
                                             cache=cache_filename,
                                             simulation_mode=True)
        assert len(result) > 0
def test_sequential_runner_not_matching_answer1():
    kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) {
            int i = blockIdx.x * block_size_x + threadIdx.x;
            if (i<n) {
                c[i] = a[i] + b[i];
            }
        } """
    args = get_vector_add_args()
    answer = [args[1] + args[2]]
    tune_params = {"block_size_x": [128 + 64 * i for i in range(5)]}

    try:
        kernel_tuner.tune_kernel(
            "vector_add", kernel_string, args[-1], args, tune_params,
            method="diff_evo", verbose=True, answer=answer)
        print("Expected a TypeError to be raised")
        assert False
    except TypeError as expected_error:
        print(str(expected_error))
        assert "The length of argument list and provided results do not match." == str(expected_error)
    except Exception:
        print("Expected a TypeError to be raised")
        assert False
Example #32
def tune_vertical(kernel_string, image, height, width):
    args = [height, width, image]

    #only one row of thread-blocks is to be created
    problem_size = (width, 1)
    grid_div_x = ["block_size_x"]
    grid_div_y = []

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,9)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    return tune_kernel("computeMeanVertically", kernel_string, problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x)
def test_sequential_runner_not_matching_answer2():
    kernel_string = """__global__ void vector_add(float *c, float *a, float *b, int n) {
            int i = blockIdx.x * block_size_x + threadIdx.x;
            if (i<n) {
                c[i] = a[i] + b[i];
            }
        } """
    args = get_vector_add_args()
    answer = [np.ubyte([12]), None, None, None]
    tune_params = {"block_size_x": [128 + 64 * i for i in range(5)]}

    try:
        kernel_tuner.tune_kernel(
            "vector_add", kernel_string, args[-1], args, tune_params,
            method="diff_evo", verbose=True, answer=answer)

        print("Expected a TypeError to be raised")
        assert False
    except TypeError as expected_error:
        print(str(expected_error))
        assert "Element 0" in str(expected_error)
    except Exception:
        print("Expected a TypeError to be raised")
        assert False
Example #34
def tune():
    with open('convolution.cu', 'r') as f:
        kernel_string = f.read()

    filter_size = (17, 17)
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)
    border_size = (filter_size[0]//2*2, filter_size[1]//2*2)
    input_size = ((problem_size[0]+border_size[0]) * (problem_size[1]+border_size[1]))

    output = numpy.zeros(size).astype(numpy.float32)
    input = numpy.random.randn(input_size).astype(numpy.float32)

    filter = numpy.random.randn(filter_size[0]*filter_size[1]).astype(numpy.float32)
    cmem_args= {'d_filter': filter }

    args = [output, input, filter]
    tune_params = OrderedDict()
    tune_params["filter_width"] = [filter_size[0]]
    tune_params["filter_height"] = [filter_size[1]]

    tune_params["block_size_x"] = [16*i for i in range(1,9)]
    tune_params["block_size_y"] = [2**i for i in range(1,6)]

    tune_params["tile_size_x"] = [2**i for i in range(3)]
    tune_params["tile_size_y"] = [2**i for i in range(3)]

    tune_params["use_padding"] = [0,1]  #toggle the insertion of padding in shared memory
    tune_params["read_only"] = [0,1]    #toggle using the read-only cache

    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    #compute the answer using a naive kernel
    params = { "block_size_x": 16, "block_size_y": 16}
    tune_params["filter_width"] = [filter_size[0]]
    tune_params["filter_height"] = [filter_size[1]]
    results = kernel_tuner.run_kernel("convolution_naive", kernel_string,
        problem_size, args, params,
        grid_div_y=["block_size_y"], grid_div_x=["block_size_x"])

    #set non-output fields to None
    answer = [results[0], None, None]

    #start kernel tuning with correctness verification
    return kernel_tuner.tune_kernel("convolution_kernel", kernel_string,
        problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True, cmem_args=cmem_args, answer=answer)
Example #35
def tune():
    kernel_string = _rotation_kernel_source
    problem_size = (1024, 1024)
    x = 128 * numpy.ones(problem_size, dtype=numpy.float32)
    out = numpy.zeros(problem_size, dtype=numpy.uint8)

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [ 16, 32 ]
    tune_params["block_size_y"] = [ 16, 32 ]
    tune_params["oldiw"] = [ problem_size[0] ]
    tune_params["oldih"] = [ problem_size[1] ]
    tune_params["newiw"] = [ problem_size[0] ]
    tune_params["newih"] = [ problem_size[1] ]

    args = [ numpy.float32(0.5), numpy.float32(20), out ]

    return kernel_tuner.tune_kernel("copy_texture_kernel", kernel_string, problem_size, args, tune_params, texmem_args = { 'tex': { 'array': x, 'address_mode': 'border' } })
def test_diff_evo():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    } """

    args = get_vector_add_args()
    tune_params = {"block_size_x": [128+64*i for i in range(5)]}

    result, _ = kernel_tuner.tune_kernel(
        "vector_add", kernel_string, args[-1], args, tune_params,
        method="diff_evo", verbose=True)

    print(result)
    assert len(result) > 0
def test_random_sample():

    kernel_string = "float test_kernel(float *a) { return 1.0f; }"
    a = np.arange(4, dtype=np.float32)

    tune_params = {"block_size_x": range(1, 25)}
    print(tune_params)

    result, _ = kernel_tuner.tune_kernel(
        "test_kernel", kernel_string, (1, 1), [a], tune_params, sample_fraction=0.1)

    print(result)

    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 3

    # check all returned results make sense
    for v in result:
        assert v['time'] == 1.0
Example #38
def test_random_sample():

    kernel_string = "float test_kernel(float *a) { return 1.0f; }"
    a = np.arange(4, dtype=np.float32)

    tune_params = {"block_size_x": range(1, 25)}
    print(tune_params)

    result, _ = kernel_tuner.tune_kernel(
        "test_kernel", kernel_string, (1, 1), [a], tune_params,
        strategy="random_sample", strategy_options={"fraction": 0.1})

    print(result)

    # check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 3

    # check all returned results make sense
    for v in result:
        assert v['time'] == 1.0
def tune():

    size = int(72*1024*1024)

    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["N"] = [size]
    tune_params["block_size_x"] = [32, 64, 128, 256, 512]

    result, env = tune_kernel("time_vector_add", "vector_add_acc.F90", size, args,
                              tune_params, lang="C", compiler="pgfortran",
                              compiler_options=["-acc=verystrict", "-ta=tesla,lineinfo"])

    return result
Example #40
def tune_transpose(kernel_string, image, height, width):
    output = np.zeros((width, height), dtype=np.float32)
    args = [height, width, output, image]

    #tune the transpose kernel
    problem_size = (width, height)
    grid_div_x = ["block_size_x"]
    grid_div_y = ["block_size_y"]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32 * i for i in range(1, 9)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    return tune_kernel("transpose",
                       kernel_string,
                       problem_size,
                       args,
                       tune_params,
                       grid_div_y=grid_div_y,
                       grid_div_x=grid_div_x)
Example #41
def tune():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 80000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]

    nvmlobserver = NVMLObserver(["nvml_energy", "temperature"])

    metrics = OrderedDict()
    metrics["GFLOPS/W"] = lambda p: (size / 1e9) / p["nvml_energy"]

    results, env = tune_kernel("vector_add",
                               kernel_string,
                               size,
                               args,
                               tune_params,
                               observers=[nvmlobserver],
                               metrics=metrics,
                               iterations=32)

    with open("vector_add.json", 'w') as fp:
        json.dump(results, fp)

    return results
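
The NVMLObserver reports nvml_energy in joules, and GFLOP per joule is numerically equal to GFLOPS per watt (W = J/s, FLOPS = FLOP/s), which is why the metric above needs no time term. A quick check of the arithmetic:

# 100 GFLOP of work for 2 J of energy: 50 GFLOP/J == 50 GFLOPS/W
gflops_per_watt = (100e9 / 1e9) / 2.0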
Example #42
def tune():
    with open('spmv.cu', 'r') as f:
        kernel_string = f.read()

    nrows = numpy.int32(128*1024)
    ncols = 64*1024
    nnz = int(nrows*ncols*0.001)
    #problem_size = (nrows, 1)
    problem_size = nrows

    #generate sparse matrix in CSR
    rows = numpy.asarray([0]+sorted(numpy.random.rand(nrows-1)*nnz)+[nnz]).astype(numpy.int32)
    cols = (numpy.random.rand(nnz)*ncols).astype(numpy.int32)
    vals = numpy.random.randn(nnz).astype(numpy.float32)

    #input and output vector  (y = matrix * x)
    x = numpy.random.randn(ncols).astype(numpy.float32)
    y = numpy.zeros(nrows).astype(numpy.float32)

    args = [y, rows, cols, vals, x, nrows]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)]
    tune_params["threads_per_row"] = [1, 32]
    tune_params["read_only"] = [0, 1]

    grid_div_x = ["block_size_x/threads_per_row"]

    #compute reference answer using scipy.sparse
    row_ind = list(chain.from_iterable([[i] * (rows[i+1]-rows[i]) for i in range(nrows)]))
    matrix = csr_matrix((vals, (row_ind, cols)), shape=(nrows, ncols))
    start = time.perf_counter()
    expected_y = matrix.dot(x)
    end = time.perf_counter()
    print("computing reference using scipy.sparse took: " + str((end - start) * 1000.0) + " ms.")

    answer = [expected_y, None, None, None, None, None]

    return kernel_tuner.tune_kernel("spmv_kernel", kernel_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, verbose=True, answer=answer, atol=1e-4)
Example #43
def test_noodles_runner():

    skip_if_no_cuda_device()

    if sys.version_info[0] < 3 or (sys.version_info[0] == 3
                                   and sys.version_info[1] < 5):
        raise SkipTest("Noodles runner test requires Python 3.5 or newer")

    import importlib.util
    noodles_installed = importlib.util.find_spec("noodles") is not None

    if not noodles_installed:
        raise SkipTest("Noodles runner test requires Noodles")

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 100
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]
    tune_params = {"block_size_x": [128 + 64 * i for i in range(15)]}

    result, _ = kernel_tuner.tune_kernel("vector_add",
                                         kernel_string,
                                         size,
                                         args,
                                         tune_params,
                                         use_noodles=True,
                                         num_threads=4)

    assert len(result) == len(tune_params["block_size_x"])
Example #44
def tune():

    size = 10000000

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128+64*i for i in range(15)]

    result = tune_kernel("vector_add", my_fancy_generator, size, args,
        tune_params, lang="OpenCL")

    with open("vector_add.json", 'w') as fp:
        json.dump(result, fp)

    return result
Example #45
def test_random_sample():

    kernel_string = "float test_kernel(float *a) { return 1.0f; }"
    a = numpy.array([1, 2, 3]).astype(numpy.float32)

    tune_params = {"block_size_x": range(1, 25)}
    print(tune_params)

    result, _ = kernel_tuner.tune_kernel("test_kernel",
                                         kernel_string, (1, 1), [a],
                                         tune_params,
                                         sample_fraction=0.1)

    print(result)

    #check that number of benchmarked kernels is 10% (rounded up)
    assert len(result) == 3

    #check all returned results make sense
    for v in result:
        assert v['time'] == 1.0
Example #46
def tune():
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)

    A = numpy.random.randn(*problem_size).astype(numpy.float32)
    B = numpy.random.randn(*problem_size).astype(numpy.float32)
    C = numpy.zeros_like(A)

    args = [C, A, B]
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [16 * 2**i for i in range(3)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    tune_params["tile_size_x"] = [2**i for i in range(4)]
    tune_params["tile_size_y"] = [2**i for i in range(4)]

    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    restrict = ["block_size_x==block_size_y*tile_size_y"]

    answer = [numpy.dot(A, B), None, None]

    metrics = OrderedDict()
    metrics["GFLOP/s"] = lambda p: (2 * 4096**3 / 1e9) / (p["time"] / 1e3)

    res, env = kernel_tuner.tune_kernel("matmul_kernel",
                                        "matmul.cu",
                                        problem_size,
                                        args,
                                        tune_params,
                                        grid_div_y=grid_div_y,
                                        grid_div_x=grid_div_x,
                                        restrictions=restrict,
                                        verbose=True,
                                        iterations=32,
                                        metrics=metrics)

    with open("matmul.json", 'w') as fp:
        json.dump(res, fp)
Example #47
def tune():

    with open('stencil.cl', 'r') as f:
        kernel_string = f.read()

    problem_size = (4096, 2048)
    size = numpy.prod(problem_size)

    x_old = numpy.random.randn(size).astype(numpy.float32)
    x_new = numpy.copy(x_old)
    args = [x_new, x_old]

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,9)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    grid_div_x = ["block_size_x"]
    grid_div_y = ["block_size_y"]

    return kernel_tuner.tune_kernel("stencil_kernel", kernel_string, problem_size,
        args, tune_params, grid_div_x=grid_div_x, grid_div_y=grid_div_y,
        verbose = True)
Example #48
def tune():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000

    a = cp.random.randn(size).astype(cp.float32)
    b = cp.random.randn(size).astype(cp.float32)
    c = cp.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["block_size_x"] = [128 + 64 * i for i in range(15)]

    answer = [a + b, None, None, None]

    result = tune_kernel("vector_add",
                         kernel_string,
                         size,
                         args,
                         tune_params,
                         answer=answer,
                         verbose=True,
                         lang="Cupy")

    with open("vector_add.json", 'w') as fp:
        json.dump(result, fp)

    return result
Example #49
def tune_prefix_sum_kernel():

    with open(get_kernel_path()+'prefixsum.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    problem_size = (N, 1)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)]

    max_blocks = np.ceil(N/float(min(tune_params["block_size_x"]))).astype(np.int32)
    x = np.ones(N).astype(np.int32)

    #setup kernel arguments
    prefix_sums = np.zeros(N).astype(np.int32)
    block_carry = np.zeros(max_blocks).astype(np.int32)
    args = [prefix_sums, block_carry, x, N]

    #tune only the first kernel that computes the thread block-wide prefix sums
    #and outputs the block carry values
    return tune_kernel("prefix_sum_block", kernel_string, problem_size, args, tune_params, verbose=True)
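#A hedged coda (not part of the original example): assuming prefix_sum_block
#leaves each block's inclusive scan in prefix_sums and the block total in
#block_carry, the global scan can be completed on the host like this:
import numpy as np
def complete_scan(prefix_sums, block_carry, block_size):
    offsets = np.cumsum(block_carry) - block_carry  #exclusive scan of the block totals
    out = prefix_sums.copy()
    for b, off in enumerate(offsets):
        out[b*block_size:(b+1)*block_size] += off
    return out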
Example #50
def test_diff_evo():

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    } """

    args = get_vector_add_args()
    tune_params = {"block_size_x": [128 + 64 * i for i in range(5)]}

    result, _ = kernel_tuner.tune_kernel("vector_add",
                                         kernel_string,
                                         args[-1],
                                         args,
                                         tune_params,
                                         method="diff_evo",
                                         verbose=True)

    print(result)
    assert len(result) > 0
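#Hedged sketch of the helper assumed above (a hypothetical reconstruction):
#it returns the argument list with n last, which is why args[-1] doubles as
#the problem size in the call above.
import numpy
def get_vector_add_args(size=int(1e6)):
    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    return [c, a, b, numpy.int32(size)]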
Example #51
def tune():
    with open('convolution.cl', 'r') as f:
        kernel_string = f.read()

    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)
    input_size = ((problem_size[0]+16) * (problem_size[1]+16))

    output = numpy.zeros(size).astype(numpy.float32)
    input = numpy.random.randn(input_size).astype(numpy.float32)

    filter = numpy.random.randn(17*17).astype(numpy.float32)
    args = [output, input, filter]
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [16*i for i in range(1,9)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    tune_params["tile_size_x"] = [2**i for i in range(3)]
    tune_params["tile_size_y"] = [2**i for i in range(3)]

    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    #compute the answer using a naive kernel
    params = { "block_size_x": 16, "block_size_y": 16 }
    results = kernel_tuner.run_kernel("convolution_naive", kernel_string,
        problem_size, args, params,
        grid_div_y=["block_size_y"], grid_div_x=["block_size_x"])

    #set non-output fields to None
    answer = [results[0], None, None]

    #start kernel tuning with correctness verification
    return kernel_tuner.tune_kernel("convolution_kernel", kernel_string,
        problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True, answer=answer)
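#Hedged alternative to the naive-kernel reference above (assumes the kernel
#computes a plain 17x17 cross-correlation over the padded input and that
#scipy is available; adjust to the kernel's real semantics):
def convolution_reference(inp, filt, problem_size):
    from scipy.signal import correlate2d
    padded = inp.reshape(problem_size[0]+16, problem_size[1]+16)
    out = correlate2d(padded, filt.reshape(17, 17), mode="valid")
    return [out.ravel().astype(numpy.float32), None, None]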
def tune(number_of_sources):

    N = 61
    T = 20
    #K = 150
    K = number_of_sources
    F = 1

    print('N', N, 'T', T, 'K', K, 'F', F)

    args = generate_input_data(N, T, K, F)

    problem_size = (T * K * F, N)

    ref = call_reference_kernel(N, T, K, F, args, cp)
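    #note: `cp` is assumed to be the list of compiler options defined
    #alongside generate_input_data (it is passed as compiler_options in the
    #call below), not the cupy module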

    #print(ref[17][:20])

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5, 11)]
    tune_params["use_kernel"] = [0]
    tune_params["use_shared_mem"] = [0, 1]

    #restrict = ["use_kernel == 0 or block_size_x<=64"]
    results, env = tune_kernel("kernel_tuner_host_array_beam",
                               [get_kernel_path() + "predict_model.cu"],
                               problem_size,
                               args,
                               tune_params,
                               lang="C",
                               compiler_options=cp,
                               verbose=True,
                               answer=ref,
                               atol=1e-4)

    return results
def tune_pnpoly_kernel():

    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0

    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    points_x = points[::2]
    points_y = points[1::2]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    #list() is required on Python 3, where zip returns a lazy iterator
    vertex_xy = numpy.array(list(zip(vertex_x, vertex_y))).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y)
    slopes = numpy.zeros(vertices).astype(numpy.float32)
    for i in range(len(slopes)):
        if i == 0:
            slopes[i] = (vertex_x[-1] - vertex_x[i]) / (vertex_y[-1] - vertex_y[i])
        else:
            slopes[i] = (vertex_x[i-1] - vertex_x[i]) / (vertex_y[i-1] - vertex_y[i])
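    #equivalent vectorised formulation (sketch): numpy.roll pairs each vertex
    #with its predecessor, with index 0 wrapping to the last vertex exactly
    #as the loop above does:
    #    slopes = ((numpy.roll(vertex_x, 1) - vertex_x) /
    #              (numpy.roll(vertex_y, 1) - vertex_y)).astype(numpy.float32)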

    cmem_args= {'d_vertices': vertex_xy, "d_slopes": slopes }

    tune_params = OrderedDict()

    tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    #tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32
    #tune_params["block_size_x"] = [256]                        #fixed size

    tune_params["tile_size"] = [2**i for i in range(6)]
    #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, args, params, cmem_args=cmem_args)
    result = [result[0], None, None]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, cmem_args=cmem_args, answer=result)

    return results, tune_params
with open('transpose.cu', 'r') as f:
    kernel_string = f.read()

width = 4096
height = 8192

problem_size = (width, height)
size = numpy.prod(problem_size)

A = numpy.random.randn(size).astype(numpy.float32)
AT = numpy.zeros_like(A)

args = [AT, A, width, height]
tune_params = dict()
tune_params["block_size_x"] = [16*2**i for i in range(3)]
tune_params["block_size_y"] = [2**i for i in range(6)]

tune_params["tile_size_x"] = [2**i for i in range(4)]
tune_params["tile_size_y"] = [2**i for i in range(4)]

grid_div_x = ["block_size_x", "tile_size_x"]
grid_div_y = ["block_size_y", "tile_size_y"]

restrict = ["block_size_x*tile_size_x==block_size_y*tile_size_y"]

kernel_tuner.tune_kernel("transpose_kernel", kernel_string,
    problem_size, args, tune_params,
    grid_div_y=grid_div_y, grid_div_x=grid_div_x,
    restrictions=restrict, verbose=True)
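#Hedged note on the restriction above: it forces every thread block to cover
#a square tile of the matrix, so the read and transposed-write access
#patterns match. One configuration from this search space that satisfies it:
config = {"block_size_x": 32, "tile_size_x": 2, "block_size_y": 8, "tile_size_y": 8}
assert config["block_size_x"]*config["tile_size_x"] == config["block_size_y"]*config["tile_size_y"]  #64 == 64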

import numpy
from collections import OrderedDict
from kernel_tuner import tune_kernel

#(header reconstructed from the body below, an assumption; vecsize, like
#nthreads, reaches the C code as a compile-time define)
kernel_string = """
#include <omp.h>

extern "C" float vector_add(float *c, float *a, float *b, int n) {
    double start = omp_get_wtime();
    int chunk = n/nthreads;

    #pragma omp parallel num_threads(nthreads)
    {
        int offset = omp_get_thread_num()*chunk;
        for (int i = offset; i<offset+chunk && i<n; i++) {
            c[i] = a[i] + b[i];
        }
    }

    return (float)((omp_get_wtime() - start)*1e3);
}
"""

size = 72*1024*1024

a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
c = numpy.zeros_like(b)
n = numpy.int32(size)

args = [c, a, b, n]

tune_params = OrderedDict()
tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32]
tune_params["vecsize"] = [1, 2, 4, 8, 16]

answer = [a+b, None, None, None]

tune_kernel("vector_add", kernel_string, size, args, tune_params,
    answer=answer, compiler_options=['-O3'])
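#Hedged note: kernel_tuner exposes each tunable parameter to the compiled
#code as a compile-time constant, so for nthreads=8 and vecsize=4 the source
#is effectively built with
#
#    #define nthreads 8
#    #define vecsize 4
#
#prepended, which is how the C code above can use nthreads directly.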
import numpy
import kernel_tuner
from collections import OrderedDict

with open('convolution.cl', 'r') as f:
    kernel_string = f.read()

problem_size = (4096, 4096)
size = numpy.prod(problem_size)
input_size = (problem_size[0]+16) * (problem_size[1]+16)

output = numpy.zeros(size).astype(numpy.float32)
input = numpy.random.randn(input_size).astype(numpy.float32)
filter = numpy.random.randn(17*17).astype(numpy.float32)

args = [output, input, filter]
tune_params = OrderedDict()
tune_params["block_size_x"] = [16*i for i in range(1,9)]
tune_params["block_size_y"] = [2**i for i in range(6)]

tune_params["tile_size_x"] = [2**i for i in range(3)]
tune_params["tile_size_y"] = [2**i for i in range(3)]

grid_div_x = ["block_size_x", "tile_size_x"]
grid_div_y = ["block_size_y", "tile_size_y"]

kernel_tuner.tune_kernel("convolution_kernel", kernel_string,
    problem_size, args, tune_params,
    grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True)
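#Optional hedged coda, mirroring the matmul example's metric: report
#throughput per configuration (2 flops per filter tap per output pixel) by
#passing metrics=metrics to the tune_kernel call above.
metrics = OrderedDict()
metrics["GFLOP/s"] = lambda p: (2 * 17 * 17 * size / 1e9) / (p["time"] / 1e3)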