Beispiel #1
0
def tune_energy(kernel_string, height, width, crosscorr, loc):
    """step 4 compute energy"""

    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["num_blocks"] = [2**i for i in range(5,11)]
    max_blocks = max(tune_params["num_blocks"])

    params = {"block_size_x": 512, "num_blocks": 64}
    num_blocks = np.int32(params["num_blocks"])
    problem_size = ("num_blocks", 1)

    energy_part = np.zeros((max_blocks), dtype=np.float64)
    args = [height, width, energy_part, loc, crosscorr]

    output3 = run_kernel("computeEnergy",
        kernel_string, problem_size, args, params, grid_div_x=[])

    tune_kernel("computeEnergy",
        kernel_string, problem_size, args, tune_params, grid_div_x=[])

    energy_part = output3[2]
    energy = np.zeros((1), dtype=np.float64)

    args = [energy, energy_part, num_blocks]
    output4 = run_kernel("sumDoubles",
        kernel_string, (1,1), args, params, grid_div_x=[])

    energy = output4[0]
    return energy
Beispiel #2
0
def tune_find_peak(kernel_string, height, width, crosscorr):
    """step 3 find peak"""
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["num_blocks"] = [2**i for i in range(5,11)]
    max_blocks = max(tune_params["num_blocks"])

    params = {"block_size_x": 512, "num_blocks": 64}
    num_blocks = np.int32(params["num_blocks"])
    problem_size = ("num_blocks", 1)

    peakval = np.zeros((1), dtype=np.float32)
    peakvals = np.zeros((max_blocks), dtype=np.float32)
    peakindx = np.zeros((max_blocks), dtype=np.int32)
    loc = np.zeros((1), dtype=np.int32)
    val = np.zeros((1), dtype=np.float32)

    args = [height, width, peakval, peakvals, peakindx, crosscorr]
    output1 = run_kernel("findPeak",
        kernel_string, problem_size, args, params, grid_div_x=[])

    tune_kernel("findPeak",
        kernel_string, problem_size, args, tune_params, grid_div_x=[])

    peakvals = output1[2]
    peakindx = output1[3]

    args = [loc, val, peakindx, peakvals, num_blocks]
    output2 = run_kernel("maxlocFloats",
        kernel_string, (1,1), args, params, grid_div_x=[])

    loc = output2[0]
    val = output2[1]
    return loc, val
Beispiel #3
0
def test_wiener():

    with open(get_kernel_path() + 'wienerfilter.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output = np.zeros(problem_size, dtype=np.float32)

    args = [height, width, output, image]

    params = OrderedDict()
    params["block_size_x"] = 32
    params["block_size_y"] = 8
    params["reuse_computation"] = 1

    answer = run_kernel("computeVarianceEstimates",
                        kernel_string,
                        problem_size,
                        args,
                        params,
                        grid_div_y=["block_size_y"])

    reference = run_kernel("computeVarianceEstimates_naive",
                           kernel_string,
                           problem_size,
                           args,
                           params,
                           grid_div_y=["block_size_y"])

    assert np.allclose(answer[2], reference[2], atol=1e-6)
def test_prefix_sum_kernel():
    skip_if_no_cuda_device()

    with open(get_kernel_path()+'prefixsum.cu', 'r') as f:
        kernel_string = f.read()

    size = 256
    problem_size = (size, 1)
    params = {"block_size_x": 128}
    max_blocks = size//params["block_size_x"]
    x = np.ones(size).astype(np.int32)

    #compute reference answer
    reference = np.cumsum(x)

    #setup kernel inputs
    prefix_sums = np.zeros(size).astype(np.int32)
    block_carry = np.zeros(max_blocks).astype(np.int32)
    n = np.int32(size)

    args = [prefix_sums, block_carry, x, n]

    #call the first kernel that computes the incomplete prefix sums
    #and outputs the block carry values
    result = run_kernel("prefix_sum_block", kernel_string,
        problem_size, args, params)

    prefix_sums = result[0]
    block_filler = np.zeros(max_blocks).astype(np.int32)
    block_out = np.zeros(max_blocks).astype(np.int32)

    args = [block_out, block_filler, result[1], np.int32(max_blocks)]

    #call the kernel again, but this time on the block carry values
    #one thread block should be sufficient
    if max_blocks > params["block_size_x"]:
        print("warning: block size too small")

    result = run_kernel("prefix_sum_block", kernel_string,
        (1, 1), args, params,
        grid_div_x=[])

    block_carry = result[0]
    args = [prefix_sums, block_carry, n]

    #call a simple kernel to propagate the block carry values to all
    #elements
    answer = run_kernel("propagate_block_carry", kernel_string,
        problem_size, args, params)

    #verify
    test_result = np.sum(answer[0] - reference) == 0

    print("answer")
    print(answer[0])
    print("reference")
    print(reference)

    assert test_result
Beispiel #5
0
def test_find_peak():

    with open(get_kernel_path() + 'peaktocorrelationenergy.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test_small.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    #generate some bogus crosscorr data
    crosscorr = np.random.randn(height, width, 2).astype(np.float32)

    #compute reference in Python
    peak_index = np.argmax(np.absolute(crosscorr[:, :, 0]))
    peak_value = np.absolute(crosscorr[:, :, 0].flatten()[peak_index])

    params = {"block_size_x": 512, "num_blocks": 64}
    problem_size = ("num_blocks", 1)
    num_blocks = np.int32(params["num_blocks"])

    peakval = np.zeros((1), dtype=np.float32)
    peakvals = np.zeros((num_blocks), dtype=np.float32)
    peakindx = np.zeros((num_blocks), dtype=np.int32)
    loc = np.zeros((1), dtype=np.int32)
    val = np.zeros((1), dtype=np.float32)

    args = [height, width, peakval, peakvals, peakindx, crosscorr]
    output1 = run_kernel("findPeak",
                         kernel_string,
                         problem_size,
                         args,
                         params,
                         grid_div_x=[])

    peakvals = output1[3]
    peakindx = output1[4]

    args = [loc, val, peakindx, peakvals, num_blocks]
    output2 = run_kernel("maxlocFloats",
                         kernel_string, (1, 1),
                         args,
                         params,
                         grid_div_x=[])

    loc = output2[0][0]
    val = output2[1][0]

    print("answer")
    print("loc=", loc, "val=", val)

    print("reference")
    print("loc=", peak_index, "val=", peak_value)

    assert loc == peak_index
    assert np.isclose(val, peak_value, atol=1e-6)
Beispiel #6
0
def test_against_reference(size, ndim, A, B, scale, grad, cost):

    numpy.set_printoptions(edgeitems=50)

    #call the reference function
    ref_cost, ref_gradient = call_reference_function(size, ndim, A, B, scale,
                                                     grad, cost)

    #call the GPU function
    with open(get_kernel_path() + 'kernels.cu', 'r') as f:
        kernel_string = f.read()

    scale_sq = (scale * scale).astype(numpy.float64)
    m = numpy.int32(size)
    n = numpy.int32(B.shape[0])
    arguments = [A, B, m, n, scale_sq, grad, cost]

    params = {"block_size_x": 256}
    answer = run_kernel("GaussTransform",
                        kernel_string,
                        size,
                        arguments,
                        params,
                        compiler_options=compiler_options,
                        grid_div_x=[])

    #collect the results from the first kernel
    grad_i = answer[5]
    gradient = grad_i
    cross_term = answer[6]

    #call the second kernel to reduce the per thread block cross terms to a single value
    out = numpy.zeros(1).astype(numpy.float64)
    arguments = [out, cross_term, m, n, m]
    answer = run_kernel("reduce_cross_term",
                        kernel_string,
                        1,
                        arguments,
                        params,
                        compiler_options=compiler_options,
                        grid_div_x=[])

    #final cross term
    cost = answer[0]

    print("answer")
    print(cost)

    print(gradient)

    assert numpy.isclose(ref_cost, cost, atol=1e-8)
    assert numpy.allclose(ref_gradient, gradient, atol=1e-8)

    return cost, gradient
def test_vector_add():
    #Check pycuda is installed and if a CUDA capable device is present, if not skip the test
    try:
        import pycuda.driver as drv
        drv.init()
    except (ImportError, Exception):
        pytest.skip("PyCuda not installed or no CUDA device detected")

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000
    problem_size = (size, 1)

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]
    params = {"block_size_x": 512}

    answer = run_kernel("vector_add", kernel_string, problem_size, args, params)

    assert numpy.allclose(answer[0], a+b, atol=1e-8)
Beispiel #8
0
def tune_degrees_dense():

    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(4.5e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N)
    problem_size = (N,1)
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        qd_string = f.read()
    data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args, {"block_size_x": 512, "write_sums": 1})
    correlations = data[0]
    sums = data[1]  #partial sum of the # of correlated hits to hits later in time

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [2**i for i in range(5,11)]
    tune_params["window_width"] = [sliding_window_width]

    args = [sums, correlations, N]
    return tune_kernel("degrees_dense", kernel_string, problem_size, args, tune_params, verbose=True)
Beispiel #9
0
def test_radix(radix, c_type):
    # this test runs 256 instances of the radix n function
    # it does not use twiddle factors, so as a test
    # it's not to be relied upon fully
    n = numpy.int32(256)
    m = {'float2': 1, 'float4': 2, 'float8': 4}[c_type]
    x = numpy.random.normal(size=(n, radix, m, 2)).astype(numpy.float32)
    y = numpy.zeros_like(x)

    y_ref = numpy.fft.fft(x[..., 0] + 1j * x[..., 1], axis=1)

    parity_splitting = parity.ParitySplitting(radix * n, radix)
    codelets = "{}\n{}".format(
        generator.generate_preprocessor(parity_splitting, False,
                                        c_type=c_type),
        generator.generate_fma_codelets(parity_splitting, False,
                                        c_type=c_type))
    args = [x, y, n]
    answer = run_kernel(f"test_radix_{radix}",
                        codelets,
                        1,
                        args, {},
                        compiler_options=["-DTESTING_RADIX"])

    y = answer[1]
    y = y[..., 0] + 1j * y[..., 1]

    numpy.testing.assert_almost_equal(y, y_ref, decimal=5)
Beispiel #10
0
def test_hostfunction():

    #setup test input
    size, ndim, A, B, scale, grad, cost = get_real_data()
    #size, ndim, A, B, scale, grad, cost = generate_inputs()

    #call the reference function
    ref_cost, ref_gradient = call_reference_function(size, ndim, A, B, scale,
                                                     grad, cost)

    #call the host function
    m = numpy.int32(size)
    n = numpy.int32(B.shape[0])
    arguments = [cost, A, B, m, n, ndim, scale, grad]
    with open(get_kernel_path() + 'gausstransform.cu', 'r') as f:
        kernel_string = f.read()
    answer = run_kernel("test_GaussTransformHost",
                        kernel_string,
                        size,
                        arguments, {},
                        lang="C",
                        compiler_options=compiler_options + ['-arch=sm_30'])
    cost = answer[0][0]
    print("reference")
    print(ref_cost)
    gradient = answer[7]
    print(ref_gradient)

    print("answer")
    print(cost)

    print(gradient)

    assert numpy.isclose(ref_cost, cost, atol=1e-8)
    assert numpy.allclose(ref_gradient, gradient, atol=1e-8)
Beispiel #11
0
def test_vector_add():
    #Check pycuda is installed and if a CUDA capable device is present, if not skip the test
    try:
        import pycuda.driver as drv
        drv.init()
    except (ImportError, Exception):
        raise SkipTest("PyCuda not installed or no CUDA device detected")

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * block_size_x + threadIdx.x;
        if (i<n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 10000000
    problem_size = (size, 1)

    a = numpy.random.randn(size).astype(numpy.float32)
    b = numpy.random.randn(size).astype(numpy.float32)
    c = numpy.zeros_like(b)
    n = numpy.int32(size)

    args = [c, a, b, n]
    params = {"block_size_x": 512}

    answer = run_kernel("vector_add", kernel_string, problem_size, args,
                        params)

    assert numpy.allclose(answer[0], a + b, atol=1e-8)
Beispiel #12
0
def test_2n():
    N = 1024

    signal1, signal2 = np.random.normal(size=(2, N)).astype(np.float32)
    x = signal1 + 1j * signal2

    y = np.fft.fft(x).astype(np.complex64)

    Xa, Xb = np.zeros((2, N), dtype=np.float32)
    _, Xa, Xb = run_kernel("test_fix_2n", "fft1024_mc_fma.cl", 1, [y, Xa, Xb],
                           {
                               "TESTING": 1,
                               "block_size_x": 1
                           })

    signal1_ref = np.fft.rfft(signal1)[:-1]
    signal2_ref = np.fft.rfft(signal2)[:-1]

    print(Xa)
    print(signal1_ref)

    assert abs(Xa.view(np.complex64) - signal1_ref).max() < 1e-3

    print(Xb)
    print(signal2_ref)

    assert abs(Xb.view(np.complex64) - signal2_ref).max() < 1e-3
Beispiel #13
0
def test_2N_r2cfft():
    N = 1024

    signal1, signal2 = np.random.normal(size=(2, N)).astype(np.float32)
    x = np.c_[signal1, signal2]
    y = np.zeros_like(x)
    _, y = run_kernel("fft_1024", "fft1024_mc_fma.cl", 1, [x, y], {
        "TESTING": 1,
        "block_size_x": 1024
    })

    c = y[..., 0] + 1j * y[..., 1]
    Xa_r, Xa_i, Xb_r, Xb_i = fix_2n(c, N)
    signal1_ans = Xa_r + 1j * Xa_i
    signal2_ans = Xb_r + 1j * Xb_i

    signal1_ref = np.fft.rfft(signal1)[:-1]
    signal2_ref = np.fft.rfft(signal2)[:-1]

    print(signal1_ans)
    print(signal1_ref)

    assert abs(signal1_ans - signal1_ref).max() < 1e-3

    print(signal2_ans)
    print(signal2_ref)

    assert abs(signal2_ans - signal2_ref).max() < 1e-3
Beispiel #14
0
def test():

    with open('vector_add.F90', 'r') as f:
        kernel_string = f.read()

    size = 10000000

    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(b)
    n = np.int32(size)

    args = [c, a, b, n]

    tune_params = dict()
    tune_params["N"] = size
    tune_params["NTHREADS"] = 4

    answer = run_kernel("vector_add",
                        kernel_string,
                        size,
                        args,
                        tune_params,
                        lang="C",
                        compiler="pgfortran")

    assert np.allclose(answer[0], a + b, atol=1e-8)
def test_quadratic_difference_kernel():
    skip_if_no_cuda_device()

    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(300)
    sliding_window_width = np.int32(150)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N)

    correlations_ref = np.zeros((sliding_window_width, N), 'uint8')
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)

    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

    #call the CUDA kernel
    params = { "block_size_x": 256, "write_sums": 1, 'window_width': sliding_window_width }
    answer = run_kernel("quadratic_difference_linear", kernel_string, problem_size, args, params)

    #compute reference answer
    correlations_ref = correlations_cpu(correlations_ref, x, y, z, ct)

    test_result = np.sum(answer[0] - correlations_ref) == 0
    if not test_result == True:
        print("test quadratic_difference_linear FAILED, attempting to create a plot for visual comparison")
        create_plot(correlations_ref, answer[0])

    assert test_result
def test():

    cp = [
        "-I/home/bwn200/eigen-git-mirror/", "-I/home/bwn200/cxxopts/include/",
        "-lcublas", "-lcurand"
    ]

    size = np.int32(1024)
    problem_size = (size, size)

    #C program assumes data is stored column-major
    A = np.random.randn(*problem_size).astype(np.float32, order='F')
    B = np.random.randn(*problem_size).astype(np.float32, order='F')
    C = np.zeros_like(A)

    args = [C, A, B, size]

    answer = run_kernel("call_cublas_gemm_basic_version",
                        "gemm_cublas.cpp",
                        1,
                        args,
                        params={},
                        compiler_options=cp,
                        compiler="nvcc",
                        lang="C",
                        log=logging.DEBUG)

    #numpy insists on returning the result in row-major, regardless of input
    #using a transpose as a quick fix, there should be a better solution
    expected = np.dot(A, B).T

    assert np.allclose(expected, answer[0], atol=1e-3)
Beispiel #17
0
def test_expdist_ref():

    size = numpy.int32(100)
    ndim = numpy.int32(2)
    cost, A, B, scale_A, scale_B = generate_inputs(size, ndim, 1)

    arguments = [cost, A, B, size, size, ndim, scale_A, scale_B]

    with open(get_kernel_path() + 'expdist_c.cpp', 'r') as f:
        kernel_string = f.read()

    answer = run_kernel("call_expdist",
                        kernel_string,
                        size,
                        arguments, {},
                        lang="C",
                        compiler_options=['-I' + get_kernel_path()])

    cost = call_reference_function(size, ndim, A, B, scale_A, scale_B, cost)

    print("cost")
    print(cost)

    print("A")
    print(A)
    print("B")
    print(B)
    print("scale_A")
    print(scale_A)
    print("scale_B")
    print(scale_B)

    assert 100.0 < cost and cost < 200.0
def test_dot_product():

    function_name = "dot_product"

    a = np.random.randn(3).astype(np.float64)
    b = np.random.randn(3).astype(np.float64)
    c = np.zeros((1), dtype=np.float64)

    args = [c, a, b]
    convert = [True, True, True]

    kernel_string = generate_wrapper(function_name, filename, args, convert_to_array=convert)

    answer = run_kernel("call_function", kernel_string, 1, args, {},
               lang="C", compiler_options=cp, compiler="nvcc")

    expected = a.dot(b)

    print("answer")
    print(answer[0])

    print("expected")
    print(expected)

    assert np.allclose(answer[0], expected, atol=1e-6)
def test_multiply_matrix():

    function_name = "multiply_matrix"

    a = np.random.randn(9).astype(np.float64)
    b = np.random.randn(9).astype(np.float64)
    c = np.zeros_like(a)

    #args = [c, a, b, np.int32(3)]
    args = [c, a, b]
    convert = [True for _ in args]
    #convert[-1] = False

    template_parameters = "double, 9, 3"

    kernel_string = generate_wrapper(function_name, filename, args, convert_to_array=convert,
                        template_parameters=template_parameters)

    answer = run_kernel("call_function", kernel_string, 1, args, {},
               lang="C", compiler_options=cp, compiler="nvcc")

    expected = a.reshape(3,3).dot(b.reshape(3,3))

    print("answer")
    print(answer[0].reshape(3,3))

    print("expected")
    print(expected)

    assert np.allclose(answer[0].reshape(3,3), expected, atol=1e-6)
Beispiel #20
0
def test_multiply_matrix():

    function_name = "multiply_matrix"

    with open('matrix_multiply.cpp', 'r') as f:
        kernel_string = f.read()

    a = np.random.randn(9).astype(np.float64)
    b = np.random.randn(9).astype(np.float64)
    c = np.zeros_like(a)

    args = [c, a, b, np.int32(3)]
    convert = [True for _ in args]
    convert[-1] = False

    #generate a wrapper function with "extern C" binding that can be called from Python
    kernel_string = wrappers.cpp(function_name,
                                 kernel_string,
                                 args,
                                 convert_to_array=convert)

    answer = run_kernel(function_name + "_wrapper",
                        kernel_string,
                        1,
                        args, {},
                        lang="C")

    #compute expected answer of matrix multiplication with Numpy
    expected = a.reshape(3, 3).dot(b.reshape(3, 3))

    assert np.allclose(answer[0].reshape(3, 3), expected)
def test_add_matrix():

    function_name = "add_matrix"

    a = np.random.randn(9).astype(np.float64)
    b = np.random.randn(9).astype(np.float64)
    c = np.zeros_like(a)

    args = [c, a, b]
    convert = [True for _ in args]

    kernel_string = generate_wrapper(function_name, filename, args, convert_to_array=convert)

    answer = run_kernel("call_function", kernel_string, 1, args, {},
               lang="C", compiler_options=cp, compiler="nvcc")

    expected = a + b

    print("answer")
    print(answer[0])

    print("expected")
    print(expected)

    assert np.allclose(answer[0], expected, atol=1e-6)
def test_prefix_sum_single_block():
    skip_if_no_cuda_device()

    with open(get_kernel_path()+'prefixsum.cu', 'r') as f:
        kernel_string = f.read()

    size = 487
    problem_size = (size, 1)
    params = {"block_size_x": 128}
    max_blocks = size//params["block_size_x"]
    x = np.ones(size).astype(np.int32)

    #compute reference answer
    reference = np.cumsum(x)

    #setup kernel inputs
    prefix_sums = np.zeros(size).astype(np.int32)
    block_carry = np.zeros(max_blocks).astype(np.int32)
    n = np.int32(size)

    args = [prefix_sums, block_carry, x, n]

    #call the first kernel that computes the incomplete prefix sums
    #and outputs the block carry values
    answer = run_kernel("prefix_sum_single_block", kernel_string, (1,1), args, params)

    #verify
    test_result = np.sum(answer[0] - reference) == 0

    print("answer")
    print(answer[0])
    print("reference")
    print(reference)

    assert test_result
def test_propagate_block_carry():
    skip_if_no_cuda_device()

    with open(get_kernel_path()+'prefixsum.cu', 'r') as f:
        kernel_string = f.read()

    size = 1000
    n = np.int32(size)

    params = {"block_size_x": 256}
    inputs = (np.random.rand(size)*100.0).astype(np.int32)
    block_carry = (np.random.rand(size//params["block_size_x"]+1)*100.0).astype(np.int32)

    args = [inputs, block_carry, n]
    answer = run_kernel("propagate_block_carry", kernel_string, (size,1), args, params)

    reference = inputs.copy()
    bs = params["block_size_x"]
    reference[:bs] = inputs[:bs]
    reference[bs:] = [inputs[i] + block_carry[i//bs-1] for i in range(bs,size)]

    print(block_carry)
    print(answer[0])
    print(reference)

    assert all(answer[0] == reference)
Beispiel #24
0
def test_gausstransform_ref():

    size = numpy.int32(2000)
    ndim = numpy.int32(2)
    A = numpy.random.randn(size * ndim).astype(numpy.float64)
    B = numpy.random.randn(size * ndim).astype(numpy.float64)
    scale = numpy.float64(10.0)
    grad = numpy.zeros(size * ndim).astype(numpy.float64)
    cost = numpy.zeros((1)).astype(numpy.float64)

    arguments = [cost, A, B, size, size, ndim, scale, grad]

    with open(get_kernel_path('gausstransform') + 'gausstransform_c.cpp',
              'r') as f:
        kernel_string = f.read()

    answer = run_kernel(
        "call_GaussTransform",
        kernel_string,
        size,
        arguments, {},
        lang="C",
        compiler_options=['-I' + get_kernel_path('gausstransform')])

    cost = answer[0]
    print(cost)

    assert 1.0 > cost and cost > 0.0

    gradient = answer[7]
    print(gradient)
Beispiel #25
0
def tune_crosscorr(kernel_string, height, width, image_freq, image2_freq):
    """step 2 Fourier transforms and cross correlation"""
    problem_size = (width, height)
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)]
    tune_params["block_size_y"] = [2**i for i in range(6)]

    image_freq = image_freq.reshape(height,width,2)
    image_freq = image_freq[:,:,0] + 1j * image_freq[:,:,1]
    image_freq = fft2(image_freq).astype(np.complex64)

    image2_freq = image2_freq.reshape(height,width,2)
    image2_freq = image2_freq[:,:,0] + 1j * image2_freq[:,:,1]
    image2_freq = fft2(image2_freq).astype(np.complex64)

    crosscorr = np.zeros((height,width,2), dtype=np.float32)

    args = [height, width, crosscorr, image_freq, image2_freq]
    params = {"block_size_x": 32, "block_size_y": 16}
    output = run_kernel("computeCrossCorr",
        kernel_string, problem_size, args, params, grid_div_y=["block_size_y"])

    tune_kernel("computeCrossCorr",
        kernel_string, problem_size, args, tune_params, grid_div_y=["block_size_y"])

    crosscorr = output[2].reshape(height,width,2)
    crosscorr_invert = crosscorr[:,:,0] + 1j * crosscorr[:,:,1]
    crosscorr_invert = ifft2(crosscorr_invert)

    crosscorr[:,:,0] = crosscorr_invert.real
    crosscorr[:,:,1] = crosscorr_invert.imag

    return crosscorr
def call_reference_kernel(N, B, T, K, F, args):

    problem_size = B

    params = {'block_size_x': 32, "use_kernel": 1}
    answer = run_kernel("kernel_coherencies", get_kernel_path()+"predict_model.cu",
                               problem_size, args, params, compiler_options=cp)

    return answer
def test_dense2sparse_kernel():
    skip_if_no_cuda_device()

    with open(get_kernel_path()+'dense2sparse.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(300)
    sliding_window_width = np.int32(150)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    correlations = generate_correlations_table(N, sliding_window_width, cutoff=2.87)

    #obtain full correlation matrix for reference
    dense_matrix = get_full_matrix(correlations)

    #setup all kernel inputs
    node_degrees = dense_matrix.sum(axis=0)
    prefix_sums = np.cumsum(node_degrees).astype(np.int32)
    total_correlated_hits = np.sum(node_degrees.sum())
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    col_idx = np.zeros(total_correlated_hits).astype(np.int32)

    #call the CUDA kernel
    args = [row_idx, col_idx, prefix_sums, correlations, N]
    params = { "block_size_x": 256, 'window_width': sliding_window_width, "use_shared": 1 }
    answer = run_kernel("dense2sparse_kernel", kernel_string, problem_size, args, params)

    row_idx = answer[0]
    col_idx = answer[1]

    print("computed")
    print("row_idx", row_idx)
    print("col_idx", col_idx)

    #obtain Python objects for the sparse representations of both matrices
    answer = csr_matrix((np.ones_like(row_idx), (row_idx, col_idx)), shape=(N,N))
    reference = csr_matrix(dense_matrix)

    print("reference")
    print("row_idx", reference.nonzero()[0])
    print("col_idx", reference.nonzero()[1])

    #subtract both sparse matrices and test
    #if number of non zero elements is zero, i.e. matrix is empty
    diff = reference - answer
    test_result = diff.nnz == 0
    print("diff")
    print(diff)

    #verify
    if not test_result == True:
        print("test dense2sparse FAILED, attempting to create a plot for visual comparison")
        create_plot(answer.todense(), reference.todense())

    assert test_result
Beispiel #28
0
def generate_large_correlations_table(N, sliding_window_width):
    """ generate a larget set of input data with an expected density of correlated hits

    This function is for testing purposes. It generates a large correlations
    table of size N by sliding_window_width, which is filled with zeros
    or ones when two hits are considered correlated. This function has no cutoff
    parameter but uses generate_input_data() to get input data. The correlations
    table is reconstructed on the GPU, for which a kernel is compiled and ran
    on the fly.

    :param N: The number of hits to be considerd by this correlation table
    :type N: int

    :param sliding_window_width: The sliding window width used for this
            correlation table.
    :type sliding_window_width: int

    :returns: correlations table of size N by sliding_window_width and an array
            storing the number of correlated hits per hit of size N.
    :rtype: numpy ndarray of type numpy.uint8, a numpy array of type numpy.int32

    """
    #generating a very large correlations table takes hours on the CPU
    #reconstruct input data on the GPU
    x,y,z,ct = generate_input_data(N)
    problem_size = (N,1)
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        qd_string = f.read()
    data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args, {"block_size_x": 512, "write_sums": 1})
    correlations = data[0]
    sums = data[1]

    #now I cant compute the node degrees on the CPU anymore, so using another GPU kernel
    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        degrees_string = f.read()
    args = [sums, correlations, N]
    data = run_kernel("degrees_dense", degrees_string, problem_size, args, {"block_size_x": 512})
    sums = data[0]

    return correlations, sums
Beispiel #29
0
def generate_large_correlations_table(N, sliding_window_width):
    """ generate a larget set of input data with an expected density of correlated hits

    This function is for testing purposes. It generates a large correlations
    table of size N by sliding_window_width, which is filled with zeros
    or ones when two hits are considered correlated. This function has no cutoff
    parameter but uses generate_input_data() to get input data. The correlations
    table is reconstructed on the GPU, for which a kernel is compiled and ran
    on the fly.

    :param N: The number of hits to be considerd by this correlation table
    :type N: int

    :param sliding_window_width: The sliding window width used for this
            correlation table.
    :type sliding_window_width: int

    :returns: correlations table of size N by sliding_window_width and an array
            storing the number of correlated hits per hit of size N.
    :rtype: numpy ndarray of type numpy.uint8, a numpy array of type numpy.int32

    """
    #generating a very large correlations table takes hours on the CPU
    #reconstruct input data on the GPU
    x,y,z,ct = generate_input_data(N)
    problem_size = (N,1)
    correlations = np.zeros((sliding_window_width, N), 'uint8')
    sums = np.zeros(N).astype(np.int32)
    args = [correlations, sums, N, sliding_window_width, x, y, z, ct]
    with open(get_kernel_path()+'quadratic_difference_linear.cu', 'r') as f:
        qd_string = f.read()
    data = run_kernel("quadratic_difference_linear", qd_string, problem_size, args, {"block_size_x": 512, "write_sums": 1})
    correlations = data[0]
    sums = data[1]

    #now I cant compute the node degrees on the CPU anymore, so using another GPU kernel
    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        degrees_string = f.read()
    args = [sums, correlations, N]
    data = run_kernel("degrees_dense", degrees_string, problem_size, args, {"block_size_x": 512})
    sums = data[0]

    return correlations, sums
def tune_pnpoly():

    #change to dir with source files because of includes in pnpoly_host.cu
    os.chdir(get_kernel_path())

    with open('pnpoly_host.cu', 'r') as f:
        host_string = f.read()
    with open('pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0
    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    points_x = points[::2]
    points_y = points[1::2]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32)

    args = [bitmap, points, vertex_xy, size]

    tune_params = OrderedDict()

    #tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32

    tune_params["tile_size"] = [2**i for i in range(6)]
    tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, [bitmap, points, size], params, cmem_args={"d_vertices": vertex_xy})
    result = [result[0], None, None]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly_host", host_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, answer=result, lang="C", verbose=True)

    return results, tune_params
Beispiel #31
0
def call_reference_function(size, ndim, A, B, scale_A, scale_B, cost):
    arguments = [cost, A, B, size, size, ndim, scale_A, scale_B]
    with open(get_kernel_path() + 'expdist_c.cpp', 'r') as f:
        kernel_string = f.read()
    answer = run_kernel("call_expdist",
                        kernel_string,
                        size,
                        arguments, {},
                        lang="C",
                        compiler_options=['-I' + get_kernel_path()])
    ref_cost = answer[0][0]
    return ref_cost
Beispiel #32
0
def test_fastnoise():

    with open(get_kernel_path()+'fastnoisefilter.cu', 'r') as f:
        kernel_string = f.read()

    image = imread(get_testdata_path() + "test.jpg", mode="F")

    height = np.int32(image.shape[0])
    width = np.int32(image.shape[1])
    problem_size = (width, height)

    output1 = np.zeros_like(image)
    output2 = np.zeros_like(image)
    output3 = np.zeros_like(image)

    args = [height, width, output1, output2, image]

    params = OrderedDict()
    params["block_size_x"] = 32
    params["block_size_y"] = 16

    d = np.gradient(image)
    norm = np.sqrt( (d[0]*d[0]) + (d[1]*d[1]) )
    scale = 1.0 / (1.0 + norm)
    dys = d[0] * scale
    dxs = d[1] * scale

    answer = run_kernel("normalized_gradient",
        kernel_string, problem_size, args, params)

    assert np.allclose(answer[2], dxs, atol=1e-6)
    assert np.allclose(answer[3], dys, atol=1e-6)

    args = [height, width, output3, dxs, dys]
    answer = run_kernel("gradient",
        kernel_string, problem_size, args, params)

    reference = np.gradient(dys, axis=0) + np.gradient(dxs, axis=1)

    assert np.allclose(answer[2], reference, atol=1e-6)
def call_reference_kernel(Nelem, r1, r2, r3, x, y, z, tar):
    with open('predict_model_snippet.cu', 'r') as f:
        kernel_string = f.read()
    blockDim_2 = np.int32(power_bit_length(Nelem))
    args = [np.int32(Nelem), r1, r2, r3, x, y, z, tar, blockDim_2]
    params = {"block_size_x": int(Nelem)}
    reference = kernel_tuner.run_kernel(
        "kernel_array_beam_slave_sincos_original",
        kernel_string,
        1,
        args,
        params,
        grid_div_x=[])
    return reference[7]
Beispiel #34
0
def tune_correlate_full_kernel(kernel_name):

    with open(get_kernel_path()+'correlate_full.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(1e6)
    sliding_window_width = np.int32(1500)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    x,y,z,ct = generate_input_data(N, factor=1750.0)

    #setup kernel arguments
    row_idx = np.zeros(10).astype(np.int32)         #not used in first kernel
    col_idx = np.zeros(10).astype(np.int32)         #not used in first kernel
    prefix_sums = np.zeros(10).astype(np.int32)     #not used in first kernel
    sums = np.zeros(N).astype(np.int32)
    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    #run the sums kernel once
    params = {"block_size_x": 256, "write_sums": 1}
    answer = run_kernel(kernel_name, kernel_string, problem_size, args, params)
    reference = [None for _ in range(len(args))]
    reference[3] = answer[3]
    sums = reference[3].astype(np.int32)

    #setup tuning parameters
    tune_params = OrderedDict()
    tune_params["block_size_x"] = [32*i for i in range(1,33)] #multiples of 32
    tune_params["write_sums"] = [1]
    tune_params["write_spm"] = [0]

    kernel_1 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True)

    #tune kernel #2
    total_correlated_hits = sums.sum()
    print("total_correlated_hits", total_correlated_hits)
    print("density", total_correlated_hits/(float(N)*sliding_window_width))

    col_idx = np.zeros(total_correlated_hits).astype(np.int32)
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    prefix_sums = np.cumsum(sums).astype(np.int32)
    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    tune_params["write_sums"] = [0]
    tune_params["write_spm"] = [1]

    kernel_2 = tune_kernel(kernel_name, kernel_string, problem_size, args, tune_params, verbose=True)

    return kernel_1, kernel_2
Beispiel #35
0
def test_fft_1024():
    x = np.random.normal(size=(1024, 2)).astype(np.float32)
    y = np.zeros_like(x)
    _, y = run_kernel("fft_1024", "fft1024_mc_fma.cl", 1, [x, y], {
        "TESTING": 1,
        "block_size_x": 1024
    })
    y_Z = y[..., 0] + 1j * y[..., 1]
    y_ref = np.fft.fft(x[..., 0] + 1j * x[..., 1])

    print(y_Z)
    print(y_ref)

    assert abs(y_Z - y_ref).max() < 1e-3
def call_reference_kernel(N, T, K, F, args, cp):

    problem_size = (T * K * F, N)

    params = {"block_size_x": 32, "use_kernel": 1}
    answer = run_kernel("kernel_tuner_host_array_beam",
                        [get_kernel_path() + "predict_model.cu"],
                        problem_size,
                        args,
                        params,
                        lang="C",
                        compiler_options=cp)
    ref = [None for _ in answer]
    ref[17] = answer[17]
    return ref
Beispiel #37
0
def tune():
    with open('convolution.cu', 'r') as f:
        kernel_string = f.read()

    filter_size = (17, 17)
    problem_size = (4096, 4096)
    size = numpy.prod(problem_size)
    border_size = (filter_size[0]//2*2, filter_size[1]//2*2)
    input_size = ((problem_size[0]+border_size[0]) * (problem_size[1]+border_size[1]))

    output = numpy.zeros(size).astype(numpy.float32)
    input = numpy.random.randn(input_size).astype(numpy.float32)

    filter = numpy.random.randn(filter_size[0]*filter_size[1]).astype(numpy.float32)
    cmem_args= {'d_filter': filter }

    args = [output, input, filter]
    tune_params = OrderedDict()
    tune_params["filter_width"] = [filter_size[0]]
    tune_params["filter_height"] = [filter_size[1]]

    tune_params["block_size_x"] = [16*i for i in range(1,9)]
    tune_params["block_size_y"] = [2**i for i in range(1,6)]

    tune_params["tile_size_x"] = [2**i for i in range(3)]
    tune_params["tile_size_y"] = [2**i for i in range(3)]

    tune_params["use_padding"] = [0,1]  #toggle the insertion of padding in shared memory
    tune_params["read_only"] = [0,1]    #toggle using the read-only cache

    grid_div_x = ["block_size_x", "tile_size_x"]
    grid_div_y = ["block_size_y", "tile_size_y"]

    #compute the answer using a naive kernel
    params = { "block_size_x": 16, "block_size_y": 16}
    tune_params["filter_width"] = [filter_size[0]]
    tune_params["filter_height"] = [filter_size[1]]
    results = kernel_tuner.run_kernel("convolution_naive", kernel_string,
        problem_size, args, params,
        grid_div_y=["block_size_y"], grid_div_x=["block_size_x"])

    #set non-output fields to None
    answer = [results[0], None, None]

    #start kernel tuning with correctness verification
    return kernel_tuner.tune_kernel("convolution_kernel", kernel_string,
        problem_size, args, tune_params,
        grid_div_y=grid_div_y, grid_div_x=grid_div_x, verbose=True, cmem_args=cmem_args, answer=answer)
Beispiel #38
0
def test_parity(parity_splitting: ParitySplitting):
    kernel = generate_preprocessor(
        parity_splitting, False) + generate_parity_function(parity_splitting)
    x = np.arange(parity_splitting.N, dtype=np.int32)
    y = np.zeros_like(x)
    kernel_args = [x, y]

    results = run_kernel("test_parity_{}".format(parity_splitting.radix),
                         kernel,
                         parity_splitting.N,
                         kernel_args, {},
                         compiler_options=["-DTESTING"])
    y_ref = np.array(
        [parity(parity_splitting.radix, i) for i in range(parity_splitting.N)])

    assert np.all(results[1] == y_ref)
Beispiel #39
0
def test_fft_4():
    x = np.random.normal(size=(1024, 4, 2)).astype(np.float32)
    y = np.zeros_like(x)
    for cycle in range(4):
        y_ref = np.fft.fft(np.roll(x[..., 0] + 1j * x[..., 1], -cycle, axis=1))
        _, _, y = run_kernel("test_fft_4", "fft1024_mc_fma.cl", 1,
                             [np.int32(cycle), x, y], {
                                 "TESTING": 1,
                                 "block_size_x": 1024
                             })
        y_Z = np.roll(y[..., 0] + 1j * y[..., 1], -cycle, axis=1)

        print(y_Z)
        print(y_ref)

        assert abs(y_Z - y_ref).max() < 1e-4
Beispiel #40
0
def test_transpose(parity_splitting: ParitySplitting):
    kernel = generate_preprocessor(
        parity_splitting,
        False) + generate_transpose_function(parity_splitting)
    x = np.arange(parity_splitting.N, dtype=np.int32)
    y = np.zeros_like(x)
    kernel_args = [x, y]

    results = run_kernel("test_transpose_{}".format(parity_splitting.radix),
                         kernel,
                         parity_splitting.N,
                         kernel_args, {},
                         compiler_options=["-DTESTING"])
    y_ref = x.reshape(parity_splitting.factors).T.flatten()

    assert np.all(results[1] == y_ref)
def test_quadratic_difference_full_sums(kernel_name, mode="qd"):
    skip_if_no_cuda_device()

    with open(get_kernel_path()+"correlate_full.cu", 'r') as f:
        kernel_string = f.read()

    N = np.int32(600)
    sliding_window_width = np.int32(150)
    problem_size = (N, 1)

    x,y,z,ct = generate_input_data(N, factor=18.0)

    correlations_ref = np.zeros((sliding_window_width, N), 'uint8')
    #compute reference answer
    if mode == "qd":
        correlations_ref = correlations_cpu(correlations_ref, x, y, z, ct)
    elif mode == "3b":
        ct = ct / 0.299792458
        correlations_ref = correlations_cpu_3B(correlations_ref, x, y, z, ct)
    corr_matrix = get_full_matrix(correlations_ref)

    sums = np.zeros(N).astype(np.int32)
    row_idx = np.zeros(10).astype(np.int32)         #not used in this test
    col_idx = np.zeros(10).astype(np.int32)         #not used in this test
    prefix_sums = np.zeros(10).astype(np.int32)     #not used in this test

    #call the CUDA kernel
    params = { "block_size_x": 256, "write_sums": 1, 'window_width': sliding_window_width, 'tile_size_x': 1 }

    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]
    answer = run_kernel(kernel_name, kernel_string, problem_size, args, params, compiler_options=["--std=c++11"])

    sums_ref = np.sum(corr_matrix, axis=0)
    #sums_ref = np.sum(correlations_ref, axis=0)
    print("reference", sums_ref.sum())
    print(sums_ref)

    sums = answer[3]
    print("answer", sums.sum())
    print(sums)

    diff = (sums_ref - sums).astype(np.int8)
    print("diff")
    print(diff)

    assert all(diff == 0)
Beispiel #42
0
def test_degrees_kernel():
    skip_if_no_cuda_device()

    def in_degrees(correlations):
        degrees = np.zeros(correlations.shape[1])
        for i in range(correlations.shape[1]):
            in_degree = 0
            for j in range(correlations.shape[0]):
                col = i-j-1
                if col>=0:
                    in_degree += correlations[j, col]
            degrees[i] = in_degree
        return degrees

    with open(get_kernel_path()+'degrees.cu', 'r') as f:
        kernel_string = f.read()

    N = np.int32(400)
    sliding_window_width = np.int32(150)
    problem_size = (N, 1)

    #generate input data with an expected density of correlated hits
    correlations = generate_correlations_table(N, sliding_window_width, cutoff=2.87)

    #compute reference answer
    in_degree = in_degrees(correlations)
    out_degree = np.sum(correlations, axis=0).astype(np.int32)
    reference = (in_degree+out_degree)

    #call the CUDA kernel
    args = [out_degree, correlations, N]
    params = { "block_size_x": 256, 'window_width': sliding_window_width }
    answer = run_kernel("degrees_dense", kernel_string, problem_size, args, params)

    print("answer", answer[0])
    print("reference", reference)

    #verify
    test_result = np.sum(answer[0] - reference) == 0
    if not test_result == True:
        print("test degrees_dense FAILED, attempting to create a plot for visual comparison")
        create_plot(reference.reshape(20,20), answer[0].reshape(20,20))

    assert test_result
Beispiel #43
0
def create_sparse_matrix(correlations, sums):
    """ call GPU kernel to transform a correlations table into a spare matrix

    This function compiles the dense2sparse GPU kernel and calls it convert a
    densely stored correlations table into a sparsely stored correlation matrix.
    The sparse notation used is CSR.

    This routine uses a transposed correlations table of N by window_width, whereas
    most other routines use window_width by N, this needs to be fixed.

    :param correlations: A correlations table of size N by sliding_window_width
    :type correlations: a 2d numpy array of type numpy.uint8

    :param sums: An array with the number of correlated hits per hit
    :type sums: numpy array of type numpy.int32

    :returns: This function returns three arrays that together form the sparse matrix

        * row_idx: the row index of each entry in the column index array
        * col_idx: the column index of each correlation in the sparse matrix
        * prefix_sums: the offset into the column index array for each row

    :rtype: numpy ndarray of type numpy.int32
    """
    N = np.int32(correlations.shape[0])
    prefix_sums = np.cumsum(sums).astype(np.int32)
    total_correlated_hits = np.sum(sums.sum())
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    col_idx = np.zeros(total_correlated_hits).astype(np.int32)
    with open(get_kernel_path()+'dense2sparse.cu', 'r') as f:
        kernel_string = f.read()
    args = [row_idx, col_idx, prefix_sums, correlations, N]
    params = { "block_size_x": 256, "window_width": correlations.shape[1],
                "write_sums": 1, "use_shared": 1}
    data = run_kernel("dense2sparse_kernel", kernel_string, (N,1), args, params)
    return data[0], data[1], prefix_sums
def tune_pnpoly_kernel():

    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    size = numpy.int32(2e7)
    problem_size = (size, 1)
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #as test input we use a circle with radius 1 as polygon and
    #a large set of normally distributed points around 0,0

    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    points_x = points[::2]
    points_y = points[1::2]

    vertex_x = numpy.cos(vertex_seeds)
    vertex_y = numpy.sin(vertex_seeds)
    vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32)

    args = [bitmap, points, size]

    # (vk.x-vj.x) / (vk.y-vj.y)
    slopes = numpy.zeros(vertices).astype(numpy.float32)
    for i in range(len(slopes)):
        if i == 0:
            slopes[i] = (vertex_x[-1] - vertex_x[i]) / (vertex_y[-1] - vertex_y[i])
        else:
            slopes[i] = (vertex_x[i-1] - vertex_x[i]) / (vertex_y[i-1] - vertex_y[i])

    cmem_args= {'d_vertices': vertex_xy, "d_slopes": slopes }

    tune_params = OrderedDict()

    tune_params["block_size_x"] = [2**i for i in range(6,10)]   #powers of two
    #tune_params["block_size_x"] = [32*i for i in range(1,32)]  #multiple of 32
    #tune_params["block_size_x"] = [256]                        #fixed size

    tune_params["tile_size"] = [2**i for i in range(6)]
    #tune_params["f_unroll"] = [i for i in range(1,20) if float(vertices)/i==vertices//i]
    tune_params["between_method"] = [0, 1, 2, 3]
    tune_params["use_precomputed_slopes"] = [0, 1]
    tune_params["use_method"] = [0, 1]

    grid_div_x = ["block_size_x", "tile_size"]

    #compute a reference answer using naive kernel
    params = {"block_size_x": 512}
    result = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, args, params, cmem_args=cmem_args)
    result = [result[0], None, None]

    #start tuning
    results = kernel_tuner.tune_kernel("cn_pnpoly", kernel_string,
        problem_size, args, tune_params,
        grid_div_x=grid_div_x, cmem_args=cmem_args, answer=result)

    return results, tune_params
args_old = [correlations, N, sliding_window_width, x, y, z, ct]
args = [correlations, sums, N, sliding_window_width, x, y, z, ct]

tune_params = dict()
tune_params["block_size_x"] = [2**i for i in range(7)]
tune_params["block_size_y"] = [2**i for i in range(7)]

grid_div_x = ["block_size_x"]
grid_div_y = ["block_size_y"]

restrict = ["block_size_x*block_size_y >= 32"]

#run the kernel once for with parameters known to produce correct output
#the result list can be used to verify the output of the quadratic_difference_linear kernel
params = { "block_size_x": 16, "block_size_y": 16 }
result = run_kernel("quadratic_difference", kernel_string, problem_size, args_old, params, grid_div_x=grid_div_x, grid_div_y=grid_div_y)



#uncomment the following to tune the old kernel
#tune_kernel("quadratic_difference", kernel_string, problem_size, args, tune_params,
#    grid_div_x=grid_div_x, grid_div_y=grid_div_y, restrictions=restrict)



#now tune the quadratic_difference_linear kernel
kernel_name = "quadratic_difference_full_shfl"



args = [col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]
max_temp = numpy.zeros(num_blocks).astype(numpy.int32)
locations = numpy.zeros(size).astype(numpy.int32)
use_index = numpy.int32(1)
n = numpy.int32(size)

args = [max_loc, max_temp, locations, x, use_index, n]

params = dict()
params["block_size_x"] = 64
params["num_blocks"] = 8
params["use_shuffle"] = 1
params["vector"] = 4

#call the first kernel that computes the incomplete max locs
result = run_kernel("max_loc", kernel_string,
    problem_size, args, params,
    grid_div_x=[])

#then call the kernel again on the intermediate result with 1 thread block
args = [max_loc, max_temp, result[0], result[1], numpy.int32(0), num_blocks]

params["num_blocks"] = 1

result_final = run_kernel("max_loc", kernel_string,
    (1, 1), args, params,
    grid_div_x=[])

print "expected", numpy.argmax(x), x.max()
print "intermediate answer", result[0], result[1]
print "kernel answer", result_final[0][0], result_final[1][0]
x = numpy.ones(size).astype(numpy.float32)
print x

prefix_sums = numpy.zeros(size).astype(numpy.float32)
block_carry = numpy.zeros(max_blocks).astype(numpy.float32)
n = numpy.int32(size)

args = [prefix_sums, block_carry, x, n]

params = dict()
params["block_size_x"] = 64

#call the first kernel that computes the incomplete prefix sums
#and outputs the block carry values
result = run_kernel("prefix_sum_block", kernel_string,
    problem_size, args, params,
    grid_div_x=["block_size_x"])

prefix_sums = result[0]
print result[0]
print result[1]

block_filler = numpy.zeros(max_blocks).astype(numpy.float32)
block_out = numpy.zeros(max_blocks).astype(numpy.float32)

args = [block_out, block_filler, result[1], numpy.int32(max_blocks)]

#call the kernel again, but this time on the block carry values
#one thread block should be sufficient
if max_blocks > params["block_size_x"]:
    print("warning: block size too small")
def test_pnpoly_kernel():

    skip_if_no_cuda_device()

    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    problem_size = (int(2e6), 1)
    size = numpy.int32(numpy.prod(problem_size))
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #to verify the output of the gpu kernel
    #we use a circle with radius 1 as polygon and
    #do a simple distance to 0,0 check for all points

    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    points_x = points[::2]
    points_y = points[1::2]

    print "points_x min max", points_x.min(), points_x.max()
    print "points_y min max", points_y.min(), points_y.max()

    vertex_x = numpy.cos(vertex_seeds)
    vertex_x[-1] = vertex_x[0]
    vertex_y = numpy.sin(vertex_seeds)
    vertex_y[-1] = vertex_y[0]
    vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32)

    args = [bitmap, points, size]

    print "vertex_x min max", vertex_x.min(), vertex_x.max()
    print "vertex_y min max", vertex_y.min(), vertex_y.max()

    #from matplotlib import pyplot
    #plot all points
    #pyplot.scatter(points_x, points_y)
    #plot the outline of the polygon
    #pyplot.plot(vertex_x, vertex_y)
    #pyplot.show()

    cmem_args= {'d_vertices': vertex_xy }

    params = dict()
    params["block_size_x"] = 64
    params["tile_size"] = 1
    params["between_method"] = 2
    params["use_method"] = 0

    kernel_name = "cn_pnpoly"

    #compute kernel output
    result = kernel_tuner.run_kernel(kernel_name, kernel_string,
        problem_size, args, params,
        cmem_args=cmem_args)

    answer = result[0]
    answer_sum = numpy.sum(answer)
    print("answer sum=", answer_sum)
    print(result[0])

    #compute reference answer using naive kernel
    reference = kernel_tuner.run_kernel("cn_pnpoly_naive", kernel_string,
        problem_size, args, params, cmem_args=cmem_args)
    reference = reference[0]
    #reference = [numpy.sqrt(x*x + y*y) < 1.0 for x,y in zip(points_x, points_y)]
    #reference = numpy.array(reference).astype(numpy.int32)
    reference_sum = numpy.sum(reference)
    print("reference sum =", reference_sum)
    print(reference)


    diff = answer - reference
    print("diff abs sum=", numpy.sum(numpy.absolute(diff)) )

    for i in range(len(diff)):
        if diff[i] != 0:
            x = points[i*2]
            y = points[i*2+1]
            print("diff=",diff[i],"error on point i=", i, "(x,y)=", (x,y), "dist to 0,0=", numpy.sqrt(x*x+y*y) )
            if y in vertex_y:
                print ("y equals y-coordinate of a vertex")



    #we assert with a small margin because the test
    #and the kernel compute different things
    assert numpy.sum(numpy.absolute(answer - reference)) < 5
Beispiel #49
0
#!/usr/bin/env python
import numpy
import kernel_tuner

problem_size = (4096, 4096)
size = numpy.prod(problem_size)

A = numpy.random.randn(size).astype(numpy.float32)
B = numpy.random.randn(size).astype(numpy.float32)
C = numpy.zeros_like(A)

args = [C, A, B]

params = {"block_size_x": 32, "block_size_y": 8, "tile_size_x": 4, "tile_size_y": 4}

grid_div_x = ["block_size_x", "tile_size_x"]
grid_div_y = ["block_size_y", "tile_size_y"]

results = kernel_tuner.run_kernel("matmul_kernel", "../examples/cuda/matmul.cu",
                                  problem_size, args, params,
                                  grid_div_x=grid_div_x, grid_div_y=grid_div_y)
grid_div_x = ["block_size_x", "tile_size"]

params = dict()
params["block_size_x"] = 512
params["prefetch"] = 0
#params["use_bitmap"] = 0
#params["coalesce_bitmap"] = 0
params["tile_size"] = 1

#kernel_name = "cn_PnPoly"
kernel_name = "cn_PnPoly_naive"
#kernel_name = "pnpoly_cn_gpu"

result = kernel_tuner.run_kernel(kernel_name, kernel_string,
    problem_size, args, params,
    grid_div_x=grid_div_x, cmem_args=cmem_args)

result = [result[0], None, None]

#result = kernel_tuner.run_kernel("pnpoly_cn", kernel_string,
#    problem_size, args, params,
#    grid_div_x=grid_div_x, lang="C")

print "sum=" + str(numpy.sum(result[0]))

params["prefetch"] = 1
res = kernel_tuner.run_kernel("cn_PnPoly", kernel_string,
    problem_size, args, params,
    grid_div_x=grid_div_x, cmem_args=cmem_args)
#!/usr/bin/env python
import numpy
import kernel_tuner

problem_size = (4096, 4096)
size = numpy.prod(problem_size)

A = numpy.random.randn(*problem_size).astype(numpy.float32)
B = numpy.random.randn(*problem_size).astype(numpy.float32)
C = numpy.zeros_like(A)

args = [C, A, B]

answer = [numpy.dot(A,B), None, None]

params = {"block_size_x": 16, "block_size_y": 32}

results = kernel_tuner.run_kernel("matmul_kernel", "matmul_naive.cu",
                                   problem_size, args, params)

# answer = run_kernel("matmul_kernel", [get_kernel_path()+"matmul_naive.cu"], problem_size, args, params, lang="C", compiler_options=cp)
prefix_sum = np.cumsum(sum_rows).astype(np.int32)

print prefix_sum.shape
print prefix_sum

print "bliep", num_correlated_hits, np.sum(sum_rows)

row_idx = np.zeros(num_correlated_hits).astype(np.int32)
col_idx = np.zeros(num_correlated_hits).astype(np.int32)

args = [row_idx, col_idx, prefix_sum, correlations, N]

params = dict()
params["block_size_x"] = 256

result = run_kernel("dense2sparse_kernel", kernel_string, problem_size,
    args, params, grid_div_x=["block_size_x"])

row_idx = result[0]
col_idx = result[1]

correlations_restored = np.zeros((sliding_window_width, N), 'uint8')

correlations_restored[col_idx, row_idx] = 1
#for i in range(num_correlated_hits):
#    correlations_restored[col_idx[i], row_idx[i]] = 1


print "restored_hits", np.sum(correlations_restored)

if False:
    from matplotlib import pyplot
def test_quadratic_difference_full_sparse_matrix(kernel_name, mode):
    skip_if_no_cuda_device()

    with open(get_kernel_path()+"correlate_full.cu", 'r') as f:
        kernel_string = f.read()

    #N,x,y,z,ct = get_real_input_data("/var/scratch/bwn200/KM3Net/event1-crop.txt")
    N = np.int32(600)
    sliding_window_width = np.int32(150)
    problem_size = (N, 1)
    x,y,z,ct = generate_input_data(N, factor=18.0)

    #compute reference answer
    correlations_ref = np.zeros((sliding_window_width, N), 'uint8')
    if mode == "qd":
        correlations_ref = correlations_cpu(correlations_ref, x, y, z, ct)
    elif mode == "3b":
        ct = ct / 0.299792458
        correlations_ref = correlations_cpu_3B(correlations_ref, x, y, z, ct)
    corr_matrix = get_full_matrix(correlations_ref)
    sums_ref = np.sum(corr_matrix, axis=1)
    total_correlated_hits = corr_matrix.sum()

    sums = sums_ref.astype(np.int32)
    row_idx = np.zeros(total_correlated_hits).astype(np.int32)
    col_idx = np.zeros(total_correlated_hits).astype(np.int32)
    prefix_sums = np.cumsum(sums_ref).astype(np.int32)

    args = [row_idx, col_idx, prefix_sums, sums, N, sliding_window_width, x, y, z, ct]

    #call the CUDA kernel
    params = { "block_size_x": 256, "write_spm": 1, 'write_rows': 1, 'window_width': sliding_window_width, 'tile_size_x': 1 }
    answer = run_kernel(kernel_name, kernel_string, problem_size, args, params)

    reference = csr_matrix(corr_matrix)
    col_idx_ref = reference.nonzero()[1]

    row_idx = answer[0]
    print("row_idx")
    print(row_idx)
    col_idx = answer[1]
    print("col_idx")
    print(col_idx)

    print("reference")
    print(list(zip(reference.nonzero()[0], reference.nonzero()[1])))

    answer = csr_matrix((np.ones_like(row_idx), (row_idx, col_idx)), shape=(N,N))

    print("answer")
    print(list(zip(answer.nonzero()[0], answer.nonzero()[1])))

    diff = reference - answer

    print("diff")
    print(list(zip(diff.nonzero()[0], diff.nonzero()[1])))
    print("diff.nnz", diff.nnz)

    answer2 = csr_matrix(sparse_to_dense(prefix_sums, col_idx), shape=(N,N))
    diff2 = reference - answer2
    print("diff2")
    print(list(zip(diff2.nonzero()[0], diff2.nonzero()[1])))
    print("diff2.nnz", diff2.nnz)

    if False:
        create_plot(get_full_matrix(reference), get_full_matrix(answer))

    assert diff.nnz == 0
    assert diff2.nnz == 0
def test_pnpoly_naive_kernel():

    skip_if_no_cuda_device()

    with open(get_kernel_path()+'pnpoly.cu', 'r') as f:
        kernel_string = f.read()

    problem_size = (20000, 1)
    size = numpy.int32(numpy.prod(problem_size))
    vertices = 600

    points = numpy.random.randn(2*size).astype(numpy.float32)
    bitmap = numpy.zeros(size).astype(numpy.int32)

    #to verify the output of the gpu kernel
    #we use a circle with radius 1 as polygon and
    #do a simple distance to 0,0 check for all points

    vertex_seeds = numpy.sort(numpy.random.rand(vertices)*2.0*numpy.pi)[::-1]

    points_x = points[::2]
    points_y = points[1::2]

    print "points_x min max", points_x.min(), points_x.max()
    print "points_y min max", points_y.min(), points_y.max()

    vertex_x = numpy.cos(vertex_seeds)
    vertex_x[-1] = vertex_x[0]
    vertex_y = numpy.sin(vertex_seeds)
    vertex_y[-1] = vertex_y[0]
    vertex_xy = numpy.array( zip(vertex_x, vertex_y) ).astype(numpy.float32)

    args = [bitmap, points, size]

    print "vertex_x min max", vertex_x.min(), vertex_x.max()
    print "vertex_y min max", vertex_y.min(), vertex_y.max()

    #from matplotlib import pyplot
    #plot all points
    #pyplot.scatter(points_x, points_y)
    #plot the outline of the polygon
    #pyplot.plot(vertex_x, vertex_y)
    #pyplot.show()

    cmem_args= {'d_vertices': vertex_xy }

    params = dict()
    params["block_size_x"] = 512

    kernel_name = "cn_pnpoly_naive"

    #compute kernel output
    result = kernel_tuner.run_kernel(kernel_name, kernel_string,
        problem_size, args, params,
        cmem_args=cmem_args)

    answer = result[0]
    answer_sum = numpy.sum(answer)
    print("answer sum=", answer_sum)
    print(result[0])

    #compute reference answer
    reference = [numpy.sqrt(x*x + y*y) < 1.0 for x,y in zip(points_x, points_y)]
    reference = numpy.array(reference).astype(numpy.int32)
    reference_sum = numpy.sum(reference)
    print("reference sum =", reference_sum)
    print(reference)

    #we assert with a small margin because the test
    #and the kernel compute different things
    assert numpy.sum(numpy.absolute(answer - reference)) < 5