Example #1
        def math(a, b, c, d, e, f):
            a_gpu = drv.mem_alloc(a.nbytes)
            b_gpu = drv.mem_alloc(b.nbytes)
            c_gpu = drv.mem_alloc(c.nbytes)
            d_gpu = drv.mem_alloc(d.nbytes)
            e_gpu = drv.mem_alloc(e.nbytes)
            f_gpu = drv.mem_alloc(f.nbytes)

            drv.memcpy_htod(a_gpu, a)
            drv.memcpy_htod(b_gpu, b)

            from pycuda.compiler import DynamicSourceModule

            mod = DynamicSourceModule(cuda_string, keep=True)

            func = mod.get_function("math")
            func(
                a_gpu,
                b_gpu,
                c_gpu,
                d_gpu,
                e_gpu,
                f_gpu,
                block=(100, 1, 1),
                grid=(1, 1, 1),
            )

            drv.memcpy_dtoh(c, c_gpu)
            drv.memcpy_dtoh(d, d_gpu)
            drv.memcpy_dtoh(e, e_gpu)
            drv.memcpy_dtoh(f, f_gpu)
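Example #1 is a fragment: drv, cuda_string, and the six arrays come from code that is not shown here. A self-contained sketch of the same allocate/copy/launch/copy-back pattern, with a hypothetical kernel standing in for cuda_string, could look like this:

# Hypothetical, minimal version of the pattern above (kernel and data invented).
import numpy
import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
import pycuda.driver as drv
from pycuda.compiler import DynamicSourceModule

cuda_string = r"""
__global__ void math(const float *a, const float *b,
                     float *c, float *d, float *e, float *f)
{
    const int i = threadIdx.x;
    c[i] = a[i] + b[i];
    d[i] = a[i] - b[i];
    e[i] = a[i] * b[i];
    f[i] = a[i] / b[i];
}
"""

a = numpy.random.rand(100).astype(numpy.float32)
b = numpy.random.rand(100).astype(numpy.float32) + 1.0  # avoid division by zero
c, d, e, f = (numpy.empty_like(a) for _ in range(4))

a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu = (drv.mem_alloc(x.nbytes)
                                            for x in (a, b, c, d, e, f))
drv.memcpy_htod(a_gpu, a)
drv.memcpy_htod(b_gpu, b)

mod = DynamicSourceModule(cuda_string)
func = mod.get_function("math")
func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu,
     block=(100, 1, 1), grid=(1, 1, 1))

drv.memcpy_dtoh(c, c_gpu)
drv.memcpy_dtoh(d, d_gpu)
drv.memcpy_dtoh(e, e_gpu)
drv.memcpy_dtoh(f, f_gpu)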
Example #2
def main(argv):
    max_depth = 2
    if len(argv) > 1:
        if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8:
            max_depth = int(argv[1])
        else:
            print("Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)." % argv[0])
            sys.exit(0)

    print("starting Simple Print (CUDA Dynamic Parallelism)")

    mod = DynamicSourceModule(cdpSimplePrint_cu)
    cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call

    print("***************************************************************************")
    print("The CPU launches 2 blocks of 2 threads each. On the device each thread will")
    print("launch 2 blocks of 2 threads each. The GPU we will do that recursively")
    print("until it reaches max_depth=%d\n" % max_depth)
    print("In total 2")
    num_blocks, sum = 2, 2
    for i in range(1, max_depth):
        num_blocks *= 4
        print("+%d" % num_blocks)
        sum += num_blocks
    print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2))
    print("***************************************************************************\n")

    pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth)

    print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n")
    cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1)
Example #3
def main(argv):
    max_depth = 2
    if len(argv) > 1:
        if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(
                argv[1]) <= 8:
            max_depth = int(argv[1])
        else:
            print(
                "Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)."
                % argv[0])
            sys.exit(0)

    print("starting Simple Print (CUDA Dynamic Parallelism)")

    mod = DynamicSourceModule(cdpSimplePrint_cu)
    cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call

    print(
        "***************************************************************************"
    )
    print(
        "The CPU launches 2 blocks of 2 threads each. On the device each thread will"
    )
    print(
        "launch 2 blocks of 2 threads each. The GPU we will do that recursively"
    )
    print("until it reaches max_depth=%d\n" % max_depth)
    print("In total 2")
    num_blocks, sum = 2, 2
    for i in range(1, max_depth):
        num_blocks *= 4
        print("+%d" % num_blocks)
        sum += num_blocks
    print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum - 2))
    print(
        "***************************************************************************\n"
    )

    pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH,
                                      max_depth)

    print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n")
    cdp_kernel((2, 1), (2, 1, 1), max_depth, 0, 0, -1)
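In Examples #2 and #3, cdp_kernel is a prepared call: prepare('iiii') declares four int32 arguments, and prepared_call takes the grid and block tuples first, followed by those arguments. A tiny sketch of the same mechanism with a made-up kernel (the real cdpSimplePrint_cu source is not shown on this page):

# Hypothetical kernel illustrating prepare()/prepared_call(); not the original
# cdpSimplePrint kernel.
import pycuda.autoinit  # noqa: F401
from pycuda.compiler import DynamicSourceModule

mod = DynamicSourceModule(r"""
__global__ void show_args(int a, int b, int c, int d)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        printf("a=%d b=%d c=%d d=%d\n", a, b, c, d);
}
""")
show_args = mod.get_function("show_args").prepare("iiii").prepared_call
#             grid    block     the four 'i' arguments
show_args((2, 1), (2, 1, 1), 2, 0, 0, -1)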
Example #4
        def math(a, b, c, d, e, f):
            a_gpu = drv.mem_alloc(a.nbytes)
            b_gpu = drv.mem_alloc(b.nbytes)
            c_gpu = drv.mem_alloc(c.nbytes)
            d_gpu = drv.mem_alloc(d.nbytes)
            e_gpu = drv.mem_alloc(e.nbytes)
            f_gpu = drv.mem_alloc(f.nbytes)

            drv.memcpy_htod(a_gpu, a)
            drv.memcpy_htod(b_gpu, b)

            from pycuda.compiler import DynamicSourceModule
            mod = DynamicSourceModule(cuda_string, keep=True)

            func = mod.get_function("math")
            func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu,
                    block=(100, 1, 1), grid=(1, 1, 1))

            drv.memcpy_dtoh(c, c_gpu)
            drv.memcpy_dtoh(d, d_gpu)
            drv.memcpy_dtoh(e, e_gpu)
            drv.memcpy_dtoh(f, f_gpu)
Example #5
    class_x = classes.eval(feed_dict={x: x_train[idx:idx + 1]})
    print('Predicted class of input {}: {}'.format(idx, class_x))
    # if args.prof:
    #     pr.enable()
    grad_val = gradsign.eval(feed_dict={
        x: x_train[idx:idx + 1],
        y: y_train[idx:idx + 1]
    })
    grad_flat = np.squeeze(grad_val).flatten()
    x_flat = x_train[idx].flatten()
    with open('parallel_fgsm.cu') as f:
        src = f.read()
    src_comp = DynamicSourceModule(src)
    grid = (1, 1)
    block = (1, 1, 1)
    gen_examples_fgsm = src_comp.get_function("gen_examples_fgsm")
    # gen_examples_fgsm.prepare("PPPPII")

    start = time.time()
    gen = curand.MRG32k3aRandomNumberGenerator()
    epsilon_gpu = gpuarray.GPUArray((args.numgens, ), dtype=np.float32)
    gen.fill_uniform(epsilon_gpu)
    # epsilon_gpu = curand.rand((args.numgens,))
    epsilon_gpu = epsilon_gpu * (args.epsmax - args.epsmin) + args.epsmin
    x_gpu = gpuarray.to_gpu(x_flat)
    grad_gpu = gpuarray.to_gpu(grad_flat)
    res_gpu = gpuarray.GPUArray((args.numgens * 28 * 28, ), dtype=np.float32)

    gen_examples_fgsm(res_gpu,
                      x_gpu,
                      grad_gpu,
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}
}
""".replace('real', real_cpp),
                          no_extern_c=True)
reduce_cp = mod.get_function("reduce_cp")

NUM_REPEATS = 10
N = 100000000
BLOCK_SIZE = 128
NUM_ROUNDS = 10

h_x = numpy.full((N, 1), 1.23, dtype=real_py)
d_x = drv.mem_alloc(h_x.nbytes)
drv.memcpy_htod(d_x, h_x)

grid_size = (N + BLOCK_SIZE - 1) // BLOCK_SIZE
grid_size = (grid_size + NUM_ROUNDS - 1) // NUM_ROUNDS
size_real = numpy.dtype(real_py).itemsize

d_y = drv.mem_alloc(size_real * grid_size)
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {

        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}""".replace('real', real_cpp))
reduce_global = mod.get_function("reduce_global")
reduce_shared = mod.get_function("reduce_shared")
reduce_dynamic = mod.get_function("reduce_dynamic")


def timing(method):
    NUM_REPEATS = 10
    N = 100000000
    BLOCK_SIZE = 128
    grid_size = (N - 1) // 128 + 1
    h_x = numpy.full((N, 1), 1.23, dtype=real_py)
    d_x = drv.mem_alloc(h_x.nbytes)
    h_y = numpy.zeros((grid_size, 1), dtype=real_py)
    d_y = drv.mem_alloc(h_y.nbytes)
    size_real = numpy.dtype(real_py).itemsize
    t_sum = 0
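The reduction examples in this part of the page are truncated on both ends (the snippet above stops inside timing(), and the next fragment resumes mid-kernel). For context, a self-contained sketch of the shared-memory block reduction they are built around, with sizes and dtype assumed (this is not the original benchmark code):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import DynamicSourceModule

mod = DynamicSourceModule(r"""
__global__ void reduce_shared(const double *d_x, double *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int n = blockIdx.x * blockDim.x + tid;
    __shared__ double s_y[128];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    // tree reduction in shared memory: halve the active range each step
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
            s_y[tid] += s_y[tid + offset];
        __syncthreads();
    }
    if (tid == 0)
        d_y[blockIdx.x] = s_y[0];  // one partial sum per block
}
""")
reduce_shared = mod.get_function("reduce_shared")

N, BLOCK_SIZE = 1 << 20, 128
grid_size = (N + BLOCK_SIZE - 1) // BLOCK_SIZE
h_x = numpy.full(N, 1.23, dtype=numpy.float64)
h_y = numpy.zeros(grid_size, dtype=numpy.float64)
d_x = drv.mem_alloc(h_x.nbytes)
d_y = drv.mem_alloc(h_y.nbytes)
drv.memcpy_htod(d_x, h_x)
reduce_shared(d_x, d_y, numpy.int32(N),
              block=(BLOCK_SIZE, 1, 1), grid=(grid_size, 1, 1))
drv.memcpy_dtoh(h_y, d_y)
print(h_y.sum())  # the host finishes the reduction; ~1.23 * N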
    {
        B[ny * N + nx] = A[nx * N + ny];
    }
}

__global__ void transpose3(const real *A, real *B, const int N)
{
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    if (nx < N && ny < N)
    {
        B[ny * N + nx] = __ldg(&A[nx * N + ny]);
    }
}'''.replace('real', real_cpp))

copy = mod.get_function("copy")
transpose1 = mod.get_function("transpose1")
transpose2 = mod.get_function("transpose2")
transpose3 = mod.get_function("transpose3")

def timing(d_A, d_B, N, task):
    NUM_REPEATS = 10
    TILE_DIM = 32
    grid_size_x = (N + TILE_DIM - 1) // TILE_DIM
    grid_size_y = grid_size_x
    block_size = (TILE_DIM, TILE_DIM, 1)
    grid_size = (grid_size_x, grid_size_y, 1)

    t_sum = 0
    t2_sum = 0
    for repeat in range(NUM_REPEATS+1):
mod1 = DynamicSourceModule(r"""
double __device__ add1_device(double x, double y)
{
    return (x + y);
}

void __global__ add1(double *x, double *y, double *z, int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if(n < N) 
    {
        z[n] = add1_device(x[n], y[n]);
    }
}
""")
add1 = mod1.get_function("add1")

mod2 = DynamicSourceModule(r"""
void __device__ add2_device(double x, double y, double *z)
{
    *z = x + y;
}

void __global__ add2(double *x, double *y, double *z, int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if(n < N) 
    {
        add2_device(x[n], y[n], &z[n]);
    }
}
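The listing for mod2 is cut off before the source string is closed; presumably it ends like mod1's and is followed by add2 = mod2.get_function("add2"). A hypothetical host-side driver for the add1 kernel shown above (data and sizes invented):

import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv

N = 1 << 20
h_x = numpy.full(N, 1.23, dtype=numpy.float64)
h_y = numpy.full(N, 2.34, dtype=numpy.float64)
h_z = numpy.zeros(N, dtype=numpy.float64)
d_x, d_y, d_z = (drv.mem_alloc(h.nbytes) for h in (h_x, h_y, h_z))
drv.memcpy_htod(d_x, h_x)
drv.memcpy_htod(d_y, h_y)

block = (128, 1, 1)
grid = ((N + 127) // 128, 1, 1)
add1(d_x, d_y, d_z, numpy.int32(N), block=block, grid=grid)
drv.memcpy_dtoh(h_z, d_z)
print(h_z[:3])  # ~[3.57 3.57 3.57]: the __device__ helper returned x + y by value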
Example #10
from __future__ import division
import numpy as np
from pycuda.compiler import DynamicSourceModule
import pycuda.autoinit

DynamicParallelismCode = '''
__global__ void dynamic_hello_ker(int depth)
{
 printf("Hello from thread %d, recursion depth %d!\\n", threadIdx.x, depth);
 if (threadIdx.x == 0 && blockIdx.x == 0 && blockDim.x > 1)
  {
   printf("Launching a new kernel from depth %d .\\n", depth);
   printf("-----------------------------------------\\n");
   dynamic_hello_ker<<< 1, blockDim.x - 1 >>>(depth + 1);
  }
}'''

dp_mod = DynamicSourceModule(DynamicParallelismCode)

hello_ker = dp_mod.get_function('dynamic_hello_ker')

hello_ker(np.int32(0), grid=(1, 1, 1), block=(4, 1, 1))
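Example #10 is self-contained: each device-side launch spawns a child grid with one thread fewer, so the recursion stops once blockDim.x reaches 1. Using DynamicSourceModule rather than SourceModule is what allows the <<<...>>> launch inside the kernel, since it compiles with relocatable device code and links against the CUDA device runtime.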
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        atomicAdd(d_y, s_y[0]);
    }
}""".replace('real', real_cpp))
reducef = mod.get_function("reduce")


def timing():
    NUM_REPEATS = 10
    N = 100000000
    BLOCK_SIZE = 128
    grid_size = (N - 1) // 128 + 1
    h_x = numpy.full((N, 1), 1.23, dtype=real_py)
    d_x = drv.mem_alloc(h_x.nbytes)
    drv.memcpy_htod(d_x, h_x)
    size_real = numpy.dtype(real_py).itemsize
    t_sum = 0
    t2_sum = 0
    for repeat in range(NUM_REPEATS + 1):
        start = drv.Event()
        const real x1 = d_x[n1];
        const real y1 = d_y[n1];
        for (int n2 = 0; n2 < N; ++n2)
        {
            const real x12 = d_x[n2] - x1;
            const real y12 = d_y[n2] - y1;
            const real distance_square = x12 * x12 + y12 * y12;
            if ((distance_square < cutoff_square) && (n2 != n1))
            {
                d_NL[(count++) * N + n1] = n2;
            }
        }
        d_NN[n1] = count;
    }
}""".replace('real', real_cpp))
find_neighbor_atomic = mod.get_function("find_neighbor_atomic")
find_neighbor_no_atomic = mod.get_function("find_neighbor_no_atomic")


def timing(d_NN, d_NL, d_x, d_y, N, MN, atomic):
    cutoff = 1.9
    cutoff_square = cutoff * cutoff
    NUM_REPEATS = 10
    t_sum = 0
    t2_sum = 0
    for repeat in range(NUM_REPEATS + 1):
        start = drv.Event()
        stop = drv.Event()
        start.record()
        if atomic:
            find_neighbor_atomic(d_NN,
    if (tid == 1) printf("all_sync     (mask1): %d\n", result);

    result = __any_sync(FULL_MASK, tid);
    if (tid == 0) printf("any_sync (FULL_MASK): %d\n", result);

    result = __any_sync(mask2, tid);
    if (tid == 0) printf("any_sync     (mask2): %d\n", result);

    int value = __shfl_sync(FULL_MASK, tid, 2, WIDTH);
    if (tid == 0) printf("shfl:      ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_up_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_up:   ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_down_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_down: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_xor_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_xor:  ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");
}""")
test_warp_primitives = mod.get_function('test_warp_primitives')

test_warp_primitives(block=(16, 1, 1))
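The warp-primitive example above is missing its top (the kernel header and the FULL_MASK, WIDTH, mask1, and mask2 definitions are not visible). A small self-contained sketch of one of the primitives it exercises, __shfl_down_sync used for a warp-level sum (my own kernel, not the original):

import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import DynamicSourceModule

mod = DynamicSourceModule(r"""
#define FULL_MASK 0xffffffff
__global__ void warp_sum(void)
{
    int tid = threadIdx.x;
    int value = tid;
    // tree reduction within one warp: halve the shuffle offset each step
    for (int offset = 16; offset > 0; offset >>= 1)
        value += __shfl_down_sync(FULL_MASK, value, offset);
    if (tid == 0)
        printf("warp sum = %d\n", value);  // 0 + 1 + ... + 31 = 496
}
""")
warp_sum = mod.get_function("warp_sum")
warp_sum(block=(32, 1, 1), grid=(1, 1, 1))
drv.Context.synchronize()  # flush device-side printf output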
Example #14
class CudaFunctions(object):
    """Class that groups the CUDA functions on maintains state about the device"""

    def __init__(self, device=0, iterations=7, compiler_options=None):
        """instantiate CudaFunctions object used for interacting with the CUDA device

        Instantiating this object will inspect and store certain device properties at
        runtime, which are used during compilation and/or execution of kernels by the
        kernel tuner. It also maintains a reference to the most recently compiled
        source module for copying data to constant memory before kernel launch.

        :param device: Number of CUDA device to use for this context
        :type device: int

        :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
        :type iterations: int
        """
        if not drv:
            raise ImportError("Error: pycuda not installed, please install e.g. using 'pip install pycuda'.")

        drv.init()
        self.context = drv.Device(device).make_context()

        #inspect device properties
        devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
        self.max_threads = devprops['MAX_THREADS_PER_BLOCK']
        self.cc = str(devprops['COMPUTE_CAPABILITY_MAJOR']) + str(devprops['COMPUTE_CAPABILITY_MINOR'])
        self.iterations = iterations
        self.current_module = None
        self.compiler_options = compiler_options or []

        #collect environment information
        env = dict()
        env["device_name"] = self.context.get_device().name()
        env["cuda_version"] = ".".join([str(i) for i in drv.get_version()])
        env["compute_capability"] = self.cc
        env["iterations"] = self.iterations
        env["compiler_options"] = compiler_options
        env["device_properties"] = devprops
        self.env = env
        self.name = env["device_name"]
        self.allocations = []

    def __del__(self):
        for gpu_mem in self.allocations:
            if hasattr(gpu_mem, 'free'): # needed when using mocks during testing
                gpu_mem.free()
        if hasattr(self, 'context'):
            self.context.pop()

    def ready_argument_list(self, arguments):
        """ready argument list to be passed to the kernel, allocates gpu mem

        :param arguments: List of arguments to be passed to the kernel.
            The order should match the argument list on the CUDA kernel.
            Allowed values are numpy.ndarray, and/or numpy.int32, numpy.float32, and so on.
        :type arguments: list(numpy objects)

        :returns: A list of arguments that can be passed to a CUDA kernel.
        :rtype: list( pycuda.driver.DeviceAllocation, numpy.int32, ... )
        """
        gpu_args = []
        for arg in arguments:
            # if arg is a numpy array, copy it to device memory
            if isinstance(arg, numpy.ndarray):
                alloc = drv.mem_alloc(arg.nbytes)
                self.allocations.append(alloc)
                gpu_args.append(alloc)
                drv.memcpy_htod(gpu_args[-1], arg)
            else: # if not an array, just pass argument along
                gpu_args.append(arg)
        return gpu_args


    def compile(self, kernel_name, kernel_string):
        """call the CUDA compiler to compile the kernel, return the device function

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The CUDA kernel code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: A CUDA kernel that can be called directly.
        :rtype: pycuda.driver.Function
        """
        try:
            no_extern_c = 'extern "C"' in kernel_string

            compiler_options = ['-Xcompiler=-Wall']
            if self.compiler_options:
                compiler_options += self.compiler_options

            self.current_module = DynamicSourceModule(kernel_string, options=compiler_options + ["-e", kernel_name],
                                               arch='compute_' + self.cc, code='sm_' + self.cc,
                                               cache_dir=False, no_extern_c=no_extern_c)
            func = self.current_module.get_function(kernel_name)
            return func
        except drv.CompileError as e:
            if "uses too much shared data" in e.stderr:
                raise Exception("uses too much shared data")
            else:
                raise e


    def benchmark(self, func, gpu_args, threads, grid):
        """runs the kernel and measures time repeatedly, returns average time

        Runs the kernel and measures kernel execution time repeatedly; the number
        of iterations is set when the CudaFunctions object is created. Benchmark
        returns a robust average: the fastest and slowest runs are discarded and
        the remaining measurements are averaged. This guards against
        initialization artifacts and other outliers.

        :param func: A PyCuda kernel compiled for this specific kernel configuration
        :type func: pycuda.driver.Function

        :param gpu_args: A list of arguments to the kernel, order should match the
            order in the code. Allowed values are either variables in global memory
            or single values passed by value.
        :type gpu_args: list( pycuda.driver.DeviceAllocation, numpy.int32, ...)

        :param threads: A tuple listing the number of threads in each dimension of
            the thread block
        :type threads: tuple(int, int, int)

        :param grid: A tuple listing the number of thread blocks in each dimension
            of the grid
        :type grid: tuple(int, int)

        :returns: A robust average for the kernel execution time.
        :rtype: float
        """
        start = drv.Event()
        end = drv.Event()
        times = []
        for _ in range(self.iterations):
            self.context.synchronize()
            start.record()
            self.run_kernel(func, gpu_args, threads, grid)
            end.record()
            self.context.synchronize()
            times.append(end.time_since(start))
        times = sorted(times)
        return numpy.mean(times[1:-1])

    def copy_constant_memory_args(self, cmem_args):
        """adds constant memory arguments to the most recently compiled module

        :param cmem_args: A dictionary containing the data to be passed to the
            device constant memory. The format to be used is as follows: A
            string key is used to name the constant memory symbol to which the
            value needs to be copied. Similar to regular arguments, these need
            to be numpy objects, such as numpy.ndarray or numpy.int32, and so on.
        :type cmem_args: dict( string: numpy.ndarray, ... )
        """
        logging.debug('copy_constant_memory_args called')
        logging.debug('current module: ' + str(self.current_module))
        for k, v in cmem_args.items():
            symbol = self.current_module.get_global(k)[0]
            logging.debug('copying to symbol: ' + str(symbol))
            logging.debug('array to be copied: ')
            logging.debug(v.nbytes)
            logging.debug(v.dtype)
            logging.debug(v.flags)
            drv.memcpy_htod(symbol, v)

    def run_kernel(self, func, gpu_args, threads, grid):
        """runs the CUDA kernel passed as 'func'

        :param func: A PyCuda kernel compiled for this specific kernel configuration
        :type func: pycuda.driver.Function

        :param gpu_args: A list of arguments to the kernel, order should match the
            order in the code. Allowed values are either variables in global memory
            or single values passed by value.
        :type gpu_args: list( pycuda.driver.DeviceAllocation, numpy.int32, ...)

        :param threads: A tuple listing the number of threads in each dimension of
            the thread block
        :type threads: tuple(int, int, int)

        :param grid: A tuple listing the number of thread blocks in each dimension
            of the grid
        :type grid: tuple(int, int)
        """
        func(*gpu_args, block=threads, grid=grid)

    def memset(self, allocation, value, size):
        """set the memory in allocation to the value in value

        :param allocation: A GPU memory allocation unit
        :type allocation: pycuda.driver.DeviceAllocation

        :param value: The value to set the memory to
        :type value: a single 8-bit unsigned int

        :param size: The size of the allocation unit in bytes
        :type size: int

        """
        drv.memset_d8(allocation, int(value), int(size))

    def memcpy_dtoh(self, dest, src):
        """perform a device to host memory copy

        :param dest: A numpy array in host memory to store the data
        :type dest: numpy.ndarray

        :param src: A GPU memory allocation unit
        :type src: pycuda.driver.DeviceAllocation
        """
        if isinstance(src, drv.DeviceAllocation):
            drv.memcpy_dtoh(dest, src)
        else:
            dest = src
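A minimal usage sketch for the wrapper above (the kernel, sizes, and launch configuration are made up for illustration):

import numpy

dev = CudaFunctions(device=0, iterations=7)

kernel_string = """
__global__ void vector_add(float *c, const float *a, const float *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}
"""
n = numpy.int32(1024)
a = numpy.random.randn(1024).astype(numpy.float32)
b = numpy.random.randn(1024).astype(numpy.float32)
c = numpy.zeros_like(a)

gpu_args = dev.ready_argument_list([c, a, b, n])   # arrays are copied to device memory
func = dev.compile("vector_add", kernel_string)
time_ms = dev.benchmark(func, gpu_args, threads=(128, 1, 1), grid=(8, 1))
dev.memcpy_dtoh(c, gpu_args[0])                    # fetch the result array
print(time_ms, numpy.allclose(c, a + b))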
Example #15
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        S[threadIdx.y][threadIdx.x] = A[ny1 * N + nx1];
    }
    __syncthreads();

    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        B[nx2 * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}'''.replace('real', real_cpp))

transpose1 = mod.get_function("transpose1")
transpose2 = mod.get_function("transpose2")


def timing(d_A, d_B, N, task):
    NUM_REPEATS = 10
    TILE_DIM = 32
    grid_size_x = (N + TILE_DIM - 1) // TILE_DIM
    grid_size_y = grid_size_x
    block_size = (TILE_DIM, TILE_DIM, 1)
    grid_size = (grid_size_x, grid_size_y, 1)

    t_sum = 0
    t2_sum = 0
    for repeat in range(NUM_REPEATS + 1):
        start = drv.Event()
    real_cpp = 'double'
else:
    real_py = 'float32'
    real_cpp = 'float'

mod = DynamicSourceModule(r"""
void __global__ add(const real *x, const real *y, real *z, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if(n < N) 
    {
        z[n] = x[n] + y[n];
    }
}
""".replace('real', real_cpp))
add = mod.get_function("add")

EPSILON = 1e-15
NUM_REPEATS = 10
a = 1.23
b = 2.34
c = 3.57
N = 100000000
h_x = numpy.full((N, 1), a, dtype=real_py)
h_y = numpy.full((N, 1), b, dtype=real_py)
h_z = numpy.zeros_like(h_x, dtype=real_py)
d_x = drv.mem_alloc(h_x.nbytes)
d_y = drv.mem_alloc(h_y.nbytes)
d_z = drv.mem_alloc(h_z.nbytes)
drv.memcpy_htod(d_x, h_x)
drv.memcpy_htod(d_y, h_y)
    real_py = 'float64'
    real_cpp = 'double'
else:
    real_py = 'float32'
    real_cpp = 'float'

mod = DynamicSourceModule(r"""
void __global__ gpu_sum(const real *x, const real *y, real *z, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        z[n] = x[n] + y[n];
    }
}""".replace('real', real_cpp))
gpu_sum = mod.get_function("gpu_sum")


def cpu_sum(x, y, N_host):
    z = numpy.empty_like(x, dtype=real_py)
    for n in range(N_host):
        z[n] = x[n] + y[n]
    return z


def timing(h_x, h_y, h_z, d_x, d_y, d_z, ratio, overlap):
    NUM_REPEATS = 10
    N = h_x.size
    t_sum = 0
    t2_sum = 0
    for repeat in range(NUM_REPEATS + 1):
Example #18
def sequential_col_scan():
    # TODO: can add a main function to cuda code to scan manually across cols, instead of doing in python?

    cuda_string = """
    #include <stdio.h>
    __global__ void line_scan(unsigned char *img, int *counter, const int x, int flag[1])
    {
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        //if(y > 852) { return; }
        //int bgr_pass = (img[x*3 + y*1918*3] <= 4)*(153 <= img[1 + x*3 + y*1918*3])*(img[1 + x*3 + y*1918*3] <= 180)*
        //(196 <= img[2 + x*3 + y*1918*3])*(img[2 + x*3 + y*1918*3] <= 210);
        //counter[y] = counter[y]*bgr_pass + bgr_pass;
        if(y < 853) {
            if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
            && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                counter[y] += 1;
                if(counter[y] == 50) {
                    flag[0] = y;
                }
            } else { counter[y] = 0; }
        }
    }
    
    __global__ void start_scan(unsigned char *img, int *counter, int flag[1])
    {   
        dim3 blocks(1, 1, 1);
        dim3 threads(1, 853, 1);
        //for(int i=0; i<853; i++) {counter[i] = 0;}
        for(int col=0; col<1918; col++) {
            line_scan<<<blocks, threads>>>(img, counter, col, flag);
        }
    }
    """

    scantest = SourceModule("""
    #include <stdio.h>
    __global__ void line_scan(unsigned char *img, int *counter, const int x, int flag[1])
    {
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        //if(y > 852) { return; }
        //int bgr_pass = (img[x*3 + y*1918*3] <= 4)*(153 <= img[1 + x*3 + y*1918*3])*(img[1 + x*3 + y*1918*3] <= 180)*
        //(196 <= img[2 + x*3 + y*1918*3])*(img[2 + x*3 + y*1918*3] <= 210);
        //counter[y] = counter[y]*bgr_pass + bgr_pass;
        if(y < 853) {
            if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
            && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                counter[y] += 1;
                if(counter[y] == 50) {
                    flag[0] = y;
                }
            } else { counter[y] = 0; }
        }
    }
    """)

    image = cv.imread("test images/crop2.png")
    counter = np.zeros(853, np.int32)
    flag = np.array([0], dtype=np.int32)  # kernel expects an int flag[1]
    # scantest.get_function("line_scan")  # , options=["-rdc=true", "-lcudadevrt", "-lcublas_device"
    start = DynamicSourceModule(cuda_string)
    scancol = start.get_function("start_scan")
    image_gpu = gpuarray.to_gpu_async(image)
    # start(image_gpu)
    # test = gpuarray.take()
    # counter_gpu = gpuarray.to_gpu_async(counter)
    # flag_gpu = cuda.mem_alloc(flag.nbytes)
    counter_gpu = cuda.mem_alloc(counter.nbytes)
    cuda.memcpy_htod(counter_gpu, counter)
    # cuda.memcpy_htod(flag_gpu, flag)
    timer = time.perf_counter()  # time.clock() was removed in Python 3.8
    # stream = cuda.Stream()
    # TODO: as we are scanning columns, is it faster to transpose array to col major order in memory first?
    scancol(image_gpu, counter_gpu, cuda.InOut(flag), block=(1, 1, 1))
    print(time.perf_counter() - timer)
    quit()
    for i in range(1918):
        scancol(image_gpu,
                counter_gpu,
                np.uintc(i),
                cuda.InOut(flag),
                block=(1, 32, 1),
                grid=(1, 28))
        # context.synchronize()
        if flag[0] > 0:
            print(time.perf_counter() - timer)
            print(i, flag[0])
            return
    print(time.perf_counter() - timer)
Example #19
 int mid = partition(a, lo, hi);
  
 if(mid - 1 - lo > 0)
   quicksort_ker<<< 1, 1, 0, s_left >>>(a, lo, mid - 1);
 if(hi - (mid + 1) > 0)
   quicksort_ker<<< 1, 1, 0, s_right >>>(a, mid + 1, hi);
    
 cudaStreamDestroy(s_left);
 cudaStreamDestroy(s_right);

}
'''

qsort_mod = DynamicSourceModule(DynamicQuicksortCode)

qsort_ker = qsort_mod.get_function('quicksort_ker')

if __name__ == '__main__':
    a = list(range(100))  # materialize so shuffle() can permute in place
    shuffle(a)

    a = np.int32(a)

    d_a = gpuarray.to_gpu(a)

    print('Unsorted array: %s' % a)

    qsort_ker(d_a,
              np.int32(0),
              np.int32(a.size - 1),
              grid=(1, 1, 1),
mod = DynamicSourceModule(r"""
void __global__ arithmetic(real *d_x, const real x0, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        real x_tmp = d_x[n];
        while (sqrt(x_tmp) < x0)
        {
            ++x_tmp;
        }
        d_x[n] = x_tmp;
    }
}""".replace('real', real_cpp))
arithmetic = mod.get_function("arithmetic")

NUM_REPEATS = 10
x0 = numpy.__dict__[real_py](100)

h_x = numpy.zeros((N, 1), dtype=real_py)
d_x = drv.mem_alloc(h_x.nbytes)

t_sum = 0
t2_sum = 0
for repeat in range(NUM_REPEATS + 1):
    drv.memcpy_htod(d_x, h_x)
    start = drv.Event()
    stop = drv.Event()
    start.record()
Example #21
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}
}
""".replace('real', real_cpp),
                          no_extern_c=True)
reduce_syncwarp = mod.get_function("reduce_syncwarp")
reduce_shfl = mod.get_function("reduce_shfl")
reduce_cp = mod.get_function("reduce_cp")


def timing(method):
    NUM_REPEATS = 10
    N = 100000000
    BLOCK_SIZE = 128
    grid_size = (N - 1) // 128 + 1
    h_x = np.full((N, 1), 1.23, dtype=real_py)
    d_x = drv.mem_alloc(h_x.nbytes)
    drv.memcpy_htod(d_x, h_x)
    size_real = np.dtype(real_py).itemsize
    t_sum = 0
    t2_sum = 0