Example #1
0
        def math(a, b, c, d, e, f):
            a_gpu = drv.mem_alloc(a.nbytes)
            b_gpu = drv.mem_alloc(b.nbytes)
            c_gpu = drv.mem_alloc(c.nbytes)
            d_gpu = drv.mem_alloc(d.nbytes)
            e_gpu = drv.mem_alloc(e.nbytes)
            f_gpu = drv.mem_alloc(f.nbytes)

            drv.memcpy_htod(a_gpu, a)
            drv.memcpy_htod(b_gpu, b)

            from pycuda.compiler import DynamicSourceModule

            mod = DynamicSourceModule(cuda_string, keep=True)

            func = mod.get_function("math")
            func(
                a_gpu,
                b_gpu,
                c_gpu,
                d_gpu,
                e_gpu,
                f_gpu,
                block=(100, 1, 1),
                grid=(1, 1, 1),
            )

            drv.memcpy_dtoh(c, c_gpu)
            drv.memcpy_dtoh(d, d_gpu)
            drv.memcpy_dtoh(e, e_gpu)
            drv.memcpy_dtoh(f, f_gpu)
Example #2
0
    def compile(self, kernel_name, kernel_string):
        """call the CUDA compiler to compile the kernel, return the device function

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The CUDA kernel code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: An CUDA kernel that can be called directly.
        :rtype: pycuda.driver.Function
        """
        try:
            no_extern_c = 'extern "C"' in kernel_string

            compiler_options = ['-Xcompiler=-Wall']
            if self.compiler_options:
                compiler_options += self.compiler_options

            self.current_module = DynamicSourceModule(kernel_string, options=self.compiler_options + ["-e", kernel_name],
                                               arch='compute_' + self.cc, code='sm_' + self.cc,
                                               cache_dir=False, no_extern_c=no_extern_c)
            func = self.current_module.get_function(kernel_name)
            return func
        except drv.CompileError as e:
            if "uses too much shared data" in e.stderr:
                raise Exception("uses too much shared data")
            else:
                raise e
Example #3
0
def main(argv):
    max_depth = 2
    if len(argv) > 1:
        if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8:
            max_depth = int(argv[1])
        else:
            print("Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)." % argv[0])
            sys.exit(0)

    print("starting Simple Print (CUDA Dynamic Parallelism)")

    mod = DynamicSourceModule(cdpSimplePrint_cu)
    cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call

    print("***************************************************************************")
    print("The CPU launches 2 blocks of 2 threads each. On the device each thread will")
    print("launch 2 blocks of 2 threads each. The GPU we will do that recursively")
    print("until it reaches max_depth=%d\n" % max_depth)
    print("In total 2")
    num_blocks, sum = 2, 2
    for i in range(1, max_depth):
        num_blocks *= 4
        print("+%d" % num_blocks)
        sum += num_blocks
    print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2))
    print("***************************************************************************\n")

    pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth)

    print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n")
    cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1)
Example #4
0
def main(argv):
    max_depth = 2
    if len(argv) > 1:
        if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(
                argv[1]) <= 8:
            max_depth = int(argv[1])
        else:
            print(
                "Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)."
                % argv[0])
            sys.exit(0)

    print("starting Simple Print (CUDA Dynamic Parallelism)")

    mod = DynamicSourceModule(cdpSimplePrint_cu)
    cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call

    print(
        "***************************************************************************"
    )
    print(
        "The CPU launches 2 blocks of 2 threads each. On the device each thread will"
    )
    print(
        "launch 2 blocks of 2 threads each. The GPU we will do that recursively"
    )
    print("until it reaches max_depth=%d\n" % max_depth)
    print("In total 2")
    num_blocks, sum = 2, 2
    for i in range(1, max_depth):
        num_blocks *= 4
        print("+%d" % num_blocks)
        sum += num_blocks
    print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum - 2))
    print(
        "***************************************************************************\n"
    )

    pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH,
                                      max_depth)

    print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n")
    cdp_kernel((2, 1), (2, 1, 1), max_depth, 0, 0, -1)
Example #5
0
        def math(a, b, c, d, e, f):
            a_gpu = drv.mem_alloc(a.nbytes)
            b_gpu = drv.mem_alloc(b.nbytes)
            c_gpu = drv.mem_alloc(c.nbytes)
            d_gpu = drv.mem_alloc(d.nbytes)
            e_gpu = drv.mem_alloc(e.nbytes)
            f_gpu = drv.mem_alloc(f.nbytes)

            drv.memcpy_htod(a_gpu, a)
            drv.memcpy_htod(b_gpu, b)

            from pycuda.compiler import DynamicSourceModule
            mod = DynamicSourceModule(cuda_string, keep=True)

            func = mod.get_function("math")
            func(a_gpu, b_gpu, c_gpu, d_gpu, e_gpu, f_gpu,
                    block=(100, 1, 1), grid=(1, 1, 1))

            drv.memcpy_dtoh(c, c_gpu)
            drv.memcpy_dtoh(d, d_gpu)
            drv.memcpy_dtoh(e, e_gpu)
            drv.memcpy_dtoh(f, f_gpu)
Example #6
0
    })
    print('Accuracy of model on test data: {}'.format(acc_test))
    print('Correct Class: {}'.format(y_train[idx]))
    class_x = classes.eval(feed_dict={x: x_train[idx:idx + 1]})
    print('Predicted class of input {}: {}'.format(idx, class_x))
    # if args.prof:
    #     pr.enable()
    grad_val = gradsign.eval(feed_dict={
        x: x_train[idx:idx + 1],
        y: y_train[idx:idx + 1]
    })
    grad_flat = np.squeeze(grad_val).flatten()
    x_flat = x_train[idx].flatten()
    with open('parallel_fgsm.cu') as f:
        src = f.read()
    src_comp = DynamicSourceModule(src)
    grid = (1, 1)
    block = (1, 1, 1)
    gen_examples_fgsm = src_comp.get_function("gen_examples_fgsm")
    # gen_examples_fgsm.prepare("PPPPII")

    start = time.time()
    gen = curand.MRG32k3aRandomNumberGenerator()
    epsilon_gpu = gpuarray.GPUArray((args.numgens, ), dtype=np.float32)
    gen.fill_uniform(epsilon_gpu)
    # epsilon_gpu = curand.rand((args.numgens,))
    epsilon_gpu = epsilon_gpu * (args.epsmax - args.epsmin) + args.epsmin
    x_gpu = gpuarray.to_gpu(x_flat)
    grad_gpu = gpuarray.to_gpu(grad_flat)
    res_gpu = gpuarray.GPUArray((args.numgens * 28 * 28, ), dtype=np.float32)
mod = DynamicSourceModule(r"""
#include <cooperative_groups.h>
using namespace cooperative_groups;

extern "C"{
void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    extern __shared__ real s_y[];

    real y = 0.0;
    const int stride = blockDim.x * gridDim.x;
    for (int n = bid * blockDim.x + tid; n < N; n += stride)
    {
        y += d_x[n];
    }
    s_y[tid] = y;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        d_y[bid] = y;
    }
}
}
""".replace('real', real_cpp),
                          no_extern_c=True)
mod = DynamicSourceModule(r"""
void __global__ reduce_global(real *d_x, real *d_y)
{
    const int tid = threadIdx.x;
    real *x = d_x + blockDim.x * blockIdx.x;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            x[tid] += x[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[blockIdx.x] = x[0];
    }
}

void __global__ reduce_shared(real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    __shared__ real s_y[128];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {

        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}

void __global__ reduce_dynamic(real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {

        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];
    }
}""".replace('real', real_cpp))
mod = DynamicSourceModule(r'''
__global__ void copy(const real *A, real *B, const int N, const int TILE_DIM)
{
    const int nx = blockIdx.x * TILE_DIM + threadIdx.x;
    const int ny = blockIdx.y * TILE_DIM + threadIdx.y;
    const int index = ny * N + nx;
    if (nx < N && ny < N)
    {
        B[index] = A[index];
    }
}

__global__ void transpose1(const real *A, real *B, const int N)
{
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    if (nx < N && ny < N)
    {
        B[nx * N + ny] = A[ny * N + nx];
    }
}

__global__ void transpose2(const real *A, real *B, const int N)
{
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    if (nx < N && ny < N)
    {
        B[ny * N + nx] = A[nx * N + ny];
    }
}

__global__ void transpose3(const real *A, real *B, const int N)
{
    const int nx = blockIdx.x * blockDim.x + threadIdx.x;
    const int ny = blockIdx.y * blockDim.y + threadIdx.y;
    if (nx < N && ny < N)
    {
        B[ny * N + nx] = __ldg(&A[nx * N + ny]);
    }
}'''.replace('real', real_cpp))
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import DynamicSourceModule

import numpy

mod1 = DynamicSourceModule(r"""
double __device__ add1_device(double x, double y)
{
    return (x + y);
}

void __global__ add1(double *x, double *y, double *z, int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if(n < N) 
    {
        z[n] = add1_device(x[n], y[n]);
    }
}
""")
add1 = mod1.get_function("add1")

mod2 = DynamicSourceModule(r"""
double __device__ add2_device(double x, double y, double *z)
{
    *z = x + y;
}

void __global__ add2(double *x, double *y, double *z, int N)
Example #11
0
from __future__ import division
import numpy as np
from pycuda.compiler import DynamicSourceModule
import pycuda.autoinit

DynamicParallelismCode = '''
__global__ void dynamic_hello_ker(int depth)
{
 printf("Hello from thread %d, recursion depth %d!\\n", threadIdx.x, depth);
 if (threadIdx.x == 0 && blockIdx.x == 0 && blockDim.x > 1)
  {
   printf("Launching a new kernel from depth %d .\\n", depth);
   printf("-----------------------------------------\\n");
   dynamic_hello_ker<<< 1, blockDim.x - 1 >>>(depth + 1);
  }
}'''

dp_mod = DynamicSourceModule(DynamicParallelismCode)

hello_ker = dp_mod.get_function('dynamic_hello_ker')

hello_ker(np.int32(0), grid=(1, 1, 1), block=(4, 1, 1))
else:
    real_py = 'float32'
    real_cpp = 'float'

mod = DynamicSourceModule(r"""
void __global__ reduce(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        atomicAdd(d_y, s_y[0]);
    }
}""".replace('real', real_cpp))
reducef = mod.get_function("reduce")


def timing():
mod = DynamicSourceModule(r"""
void __global__ find_neighbor_atomic
(int *d_NN, int *d_NL, const real *d_x, const real *d_y, 
const int N, const int MN, const real cutoff_square)
{
    const int n1 = blockIdx.x * blockDim.x + threadIdx.x;
    if (n1 < N)
    {
        d_NN[n1] = 0;
        const real x1 = d_x[n1];
        const real y1 = d_y[n1];
        for (int n2 = n1 + 1; n2 < N; ++n2)
        {
            const real x12 = d_x[n2] - x1;
            const real y12 = d_y[n2] - y1;
            const real distance_square = x12 * x12 + y12 * y12;
            if (distance_square < cutoff_square)
            {
                d_NL[n1 * MN + atomicAdd(&d_NN[n1], 1)] = n2;
                d_NL[n2 * MN + atomicAdd(&d_NN[n2], 1)] = n1;
            }
        }
    }
}

void __global__ find_neighbor_no_atomic
(int *d_NN, int *d_NL, const real *d_x, const real *d_y, 
const int N, const real cutoff_square)
{
    const int n1 = blockIdx.x * blockDim.x + threadIdx.x;
    if (n1 < N)
    {
        int count = 0;
        const real x1 = d_x[n1];
        const real y1 = d_y[n1];
        for (int n2 = 0; n2 < N; ++n2)
        {
            const real x12 = d_x[n2] - x1;
            const real y12 = d_y[n2] - y1;
            const real distance_square = x12 * x12 + y12 * y12;
            if ((distance_square < cutoff_square) && (n2 != n1))
            {
                d_NL[(count++) * N + n1] = n2;
            }
        }
        d_NN[n1] = count;
    }
}""".replace('real', real_cpp))
mod = DynamicSourceModule(r"""
const unsigned WIDTH = 8;
const unsigned FULL_MASK = 0xffffffff;
void __global__ test_warp_primitives(void)
{
    int tid = threadIdx.x;
    int lane_id = tid % WIDTH;

    if (tid == 0) printf("threadIdx.x: ");
    printf("%2d ", tid);
    if (tid == 0) printf("\n");

    if (tid == 0) printf("lane_id:     ");
    printf("%2d ", lane_id);
    if (tid == 0) printf("\n");

    unsigned mask1 = __ballot_sync(FULL_MASK, tid > 0);
    unsigned mask2 = __ballot_sync(FULL_MASK, tid == 0);
    if (tid == 0) printf("FULL_MASK = %x\n", FULL_MASK);
    if (tid == 1) printf("nask1     = %x\n", mask1);
    if (tid == 0) printf("mask2     = %x\n", mask2);

    int result = __all_sync(FULL_MASK, tid);
    if (tid == 0) printf("all_sync (FULL_MASK): %d\n", result);

    result = __all_sync(mask1, tid);
    if (tid == 1) printf("all_sync     (mask1): %d\n", result);

    result = __any_sync(FULL_MASK, tid);
    if (tid == 0) printf("any_sync (FULL_MASK): %d\n", result);

    result = __any_sync(mask2, tid);
    if (tid == 0) printf("any_sync     (mask2): %d\n", result);

    int value = __shfl_sync(FULL_MASK, tid, 2, WIDTH);
    if (tid == 0) printf("shfl:      ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_up_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_up:   ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_down_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_down: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_xor_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_xor:  ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");
}""")
Example #15
0
class CudaFunctions(object):
    """Class that groups the CUDA functions on maintains state about the device"""

    def __init__(self, device=0, iterations=7, compiler_options=None):
        """instantiate CudaFunctions object used for interacting with the CUDA device

        Instantiating this object will inspect and store certain device properties at
        runtime, which are used during compilation and/or execution of kernels by the
        kernel tuner. It also maintains a reference to the most recently compiled
        source module for copying data to constant memory before kernel launch.

        :param device: Number of CUDA device to use for this context
        :type device: int

        :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
        :type iterations: int
        """
        if not drv:
            raise ImportError("Error: pycuda not installed, please install e.g. using 'pip install pycuda'.")

        drv.init()
        self.context = drv.Device(device).make_context()

        #inspect device properties
        devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
        self.max_threads = devprops['MAX_THREADS_PER_BLOCK']
        self.cc = str(devprops['COMPUTE_CAPABILITY_MAJOR']) + str(devprops['COMPUTE_CAPABILITY_MINOR'])
        self.iterations = iterations
        self.current_module = None
        self.compiler_options = compiler_options or []

        #collect environment information
        env = dict()
        env["device_name"] = self.context.get_device().name()
        env["cuda_version"] = ".".join([str(i) for i in drv.get_version()])
        env["compute_capability"] = self.cc
        env["iterations"] = self.iterations
        env["compiler_options"] = compiler_options
        env["device_properties"] = devprops
        self.env = env
        self.name = env["device_name"]
        self.allocations = []

    def __del__(self):
        for gpu_mem in self.allocations:
            if hasattr(gpu_mem, 'free'): #if needed for when using mocks during testing
                gpu_mem.free()
        if hasattr(self, 'context'):
            self.context.pop()

    def ready_argument_list(self, arguments):
        """ready argument list to be passed to the kernel, allocates gpu mem

        :param arguments: List of arguments to be passed to the kernel.
            The order should match the argument list on the CUDA kernel.
            Allowed values are numpy.ndarray, and/or numpy.int32, numpy.float32, and so on.
        :type arguments: list(numpy objects)

        :returns: A list of arguments that can be passed to an CUDA kernel.
        :rtype: list( pycuda.driver.DeviceAllocation, numpy.int32, ... )
        """
        gpu_args = []
        for arg in arguments:
            # if arg i is a numpy array copy to device
            if isinstance(arg, numpy.ndarray):
                alloc = drv.mem_alloc(arg.nbytes)
                self.allocations.append(alloc)
                gpu_args.append(alloc)
                drv.memcpy_htod(gpu_args[-1], arg)
            else: # if not an array, just pass argument along
                gpu_args.append(arg)
        return gpu_args


    def compile(self, kernel_name, kernel_string):
        """call the CUDA compiler to compile the kernel, return the device function

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The CUDA kernel code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: An CUDA kernel that can be called directly.
        :rtype: pycuda.driver.Function
        """
        try:
            no_extern_c = 'extern "C"' in kernel_string

            compiler_options = ['-Xcompiler=-Wall']
            if self.compiler_options:
                compiler_options += self.compiler_options

            self.current_module = DynamicSourceModule(kernel_string, options=self.compiler_options + ["-e", kernel_name],
                                               arch='compute_' + self.cc, code='sm_' + self.cc,
                                               cache_dir=False, no_extern_c=no_extern_c)
            func = self.current_module.get_function(kernel_name)
            return func
        except drv.CompileError as e:
            if "uses too much shared data" in e.stderr:
                raise Exception("uses too much shared data")
            else:
                raise e


    def benchmark(self, func, gpu_args, threads, grid):
        """runs the kernel and measures time repeatedly, returns average time

        Runs the kernel and measures kernel execution time repeatedly, number of
        iterations is set during the creation of CudaFunctions. Benchmark returns
        a robust average, from all measurements the fastest and slowest runs are
        discarded and the rest is included in the returned average. The reason for
        this is to be robust against initialization artifacts and other exceptional
        cases.

        :param func: A PyCuda kernel compiled for this specific kernel configuration
        :type func: pycuda.driver.Function

        :param gpu_args: A list of arguments to the kernel, order should match the
            order in the code. Allowed values are either variables in global memory
            or single values passed by value.
        :type gpu_args: list( pycuda.driver.DeviceAllocation, numpy.int32, ...)

        :param threads: A tuple listing the number of threads in each dimension of
            the thread block
        :type threads: tuple(int, int, int)

        :param grid: A tuple listing the number of thread blocks in each dimension
            of the grid
        :type grid: tuple(int, int)

        :returns: A robust average for the kernel execution time.
        :rtype: float
        """
        start = drv.Event()
        end = drv.Event()
        times = []
        for _ in range(self.iterations):
            self.context.synchronize()
            start.record()
            self.run_kernel(func, gpu_args, threads, grid)
            end.record()
            self.context.synchronize()
            times.append(end.time_since(start))
        times = sorted(times)
        return numpy.mean(times[1:-1])

    def copy_constant_memory_args(self, cmem_args):
        """adds constant memory arguments to the most recently compiled module

        :param cmem_args: A dictionary containing the data to be passed to the
            device constant memory. The format to be used is as follows: A
            string key is used to name the constant memory symbol to which the
            value needs to be copied. Similar to regular arguments, these need
            to be numpy objects, such as numpy.ndarray or numpy.int32, and so on.
        :type cmem_args: dict( string: numpy.ndarray, ... )
        """
        logging.debug('copy_constant_memory_args called')
        logging.debug('current module: ' + str(self.current_module))
        for k, v in cmem_args.items():
            symbol = self.current_module.get_global(k)[0]
            logging.debug('copying to symbol: ' + str(symbol))
            logging.debug('array to be copied: ')
            logging.debug(v.nbytes)
            logging.debug(v.dtype)
            logging.debug(v.flags)
            drv.memcpy_htod(symbol, v)

    def run_kernel(self, func, gpu_args, threads, grid):
        """runs the CUDA kernel passed as 'func'

        :param func: A PyCuda kernel compiled for this specific kernel configuration
        :type func: pycuda.driver.Function

        :param gpu_args: A list of arguments to the kernel, order should match the
            order in the code. Allowed values are either variables in global memory
            or single values passed by value.
        :type gpu_args: list( pycuda.driver.DeviceAllocation, numpy.int32, ...)

        :param threads: A tuple listing the number of threads in each dimension of
            the thread block
        :type threads: tuple(int, int, int)

        :param grid: A tuple listing the number of thread blocks in each dimension
            of the grid
        :type grid: tuple(int, int)
        """
        func(*gpu_args, block=threads, grid=grid)

    def memset(self, allocation, value, size):
        """set the memory in allocation to the value in value

        :param allocation: A GPU memory allocation unit
        :type allocation: pycuda.driver.DeviceAllocation

        :param value: The value to set the memory to
        :type value: a single 8-bit unsigned int

        :param size: The size of to the allocation unit in bytes
        :type size: int

        """
        drv.memset_d8(allocation, int(value), int(size))

    def memcpy_dtoh(self, dest, src):
        """perform a device to host memory copy

        :param dest: A numpy array in host memory to store the data
        :type dest: numpy.ndarray

        :param src: A GPU memory allocation unit
        :type src: pycuda.driver.DeviceAllocation
        """
        if isinstance(src, drv.DeviceAllocation):
            drv.memcpy_dtoh(dest, src)
        else:
            dest = src
Example #16
0
mod = DynamicSourceModule(r'''
const int TILE_DIM = 32;
__global__ void transpose1(const real *A, real *B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM];
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;

    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        S[threadIdx.y][threadIdx.x] = A[ny1 * N + nx1];
    }
    __syncthreads();

    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        B[nx2 * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}

__global__ void transpose2(const real *A, real *B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM + 1];
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;

    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        S[threadIdx.y][threadIdx.x] = A[ny1 * N + nx1];
    }
    __syncthreads();

    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        B[nx2 * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}'''.replace('real', real_cpp))
import pycuda.driver as drv
import numpy, math, sys
from pycuda.compiler import DynamicSourceModule

if len(sys.argv) == 2 and sys.argv[1] == '-double':
    real_py = 'float64'
    real_cpp = 'double'
else:
    real_py = 'float32'
    real_cpp = 'float'

mod = DynamicSourceModule(r"""
void __global__ add(const real *x, const real *y, real *z, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if(n < N) 
    {
        z[n] = x[n] + y[n];
    }
}
""".replace('real', real_cpp))
add = mod.get_function("add")

EPSILON = 1e-15
NUM_REPEATS = 10
a = 1.23
b = 2.34
c = 3.57
N = 100000000
h_x = numpy.full((N, 1), a, dtype=real_py)
h_y = numpy.full((N, 1), b, dtype=real_py)
h_z = numpy.zeros_like(h_x, dtype=real_py)
Example #18
0
def sequential_col_scan():
    # TODO: can add a main function to cuda code to scan manually across cols, instead of doing in python?

    cuda_string = """
    #include <stdio.h>
    __global__ void line_scan(unsigned char *img, int *counter, const int x, int flag[1])
    {
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        //if(y > 852) { return; }
        //int bgr_pass = (img[x*3 + y*1918*3] <= 4)*(153 <= img[1 + x*3 + y*1918*3])*(img[1 + x*3 + y*1918*3] <= 180)*
        //(196 <= img[2 + x*3 + y*1918*3])*(img[2 + x*3 + y*1918*3] <= 210);
        //counter[y] = counter[y]*bgr_pass + bgr_pass;
        if(y < 853) {
            if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
            && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                counter[y] += 1;
                if(counter[y] == 50) {
                    flag[0] = y;
                }
            } else { counter[y] = 0; }
        }
    }
    
    __device__ void start_scan(unsigned char *img, int *counter, int flag[1])
    {   
        dim3 blocks = (1, 1, 1);
        dim3 threads = (1, 853, 1);
        //for(int i=0; i<853; i++) {counter[i] = 0;}
        for(int col=0; col<1918; col++) {
            line_scan<<<blocks, threads>>>(img, counter, col, flag);
        }
    }
    """

    scantest = SourceModule("""
    #include <stdio.h>
    __global__ void line_scan(unsigned char *img, int *counter, const int x, int flag[1])
    {
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        //if(y > 852) { return; }
        //int bgr_pass = (img[x*3 + y*1918*3] <= 4)*(153 <= img[1 + x*3 + y*1918*3])*(img[1 + x*3 + y*1918*3] <= 180)*
        //(196 <= img[2 + x*3 + y*1918*3])*(img[2 + x*3 + y*1918*3] <= 210);
        //counter[y] = counter[y]*bgr_pass + bgr_pass;
        if(y < 853) {
            if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
            && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                counter[y] += 1;
                if(counter[y] == 50) {
                    flag[0] = y;
                }
            } else { counter[y] = 0; }
        }
    }
    """)

    image = cv.imread("test images/crop2.png")
    counter = np.zeros(853, np.int32)
    flag = np.array([0])
    # scantest.get_function("line_scan")  # , options=["-rdc=true", "-lcudadevrt", "-lcublas_device"
    start = DynamicSourceModule(cuda_string)
    scancol = start.get_function("start_scan")
    image_gpu = gpuarray.to_gpu_async(image)
    # start(image_gpu)
    # test = gpuarray.take()
    # counter_gpu = gpuarray.to_gpu_async(counter)
    # flag_gpu = cuda.mem_alloc(flag.nbytes)
    counter_gpu = cuda.mem_alloc(counter.nbytes)
    cuda.memcpy_htod(counter_gpu, counter)
    # cuda.memcpy_htod(flag_gpu, flag)
    timer = time.clock()
    # stream = cuda.Stream()
    # TODO: as we are scanning columns, is it faster to transpose array to col major order in memory first?
    scancol(image_gpu, counter_gpu, flag, block=(1, 1, 1))
    print(time.clock() - timer)
    quit()
    for i in range(1918):
        scancol(image_gpu,
                counter_gpu,
                np.uintc(i),
                cuda.InOut(flag),
                block=(1, 32, 1),
                grid=(1, 28))
        # context.synchronize()
        if flag[0] > 0:
            print(time.clock() - timer)
            print(i, flag[0])
            return
    print(time.clock() - timer)
Example #19
0
 cudaStreamCreateWithFlags(&s_right, cudaStreamNonBlocking);
 
 int mid = partition(a, lo, hi);
  
 if(mid - 1 - lo > 0)
   quicksort_ker<<< 1, 1, 0, s_left >>>(a, lo, mid - 1);
 if(hi - (mid + 1) > 0)
   quicksort_ker<<< 1, 1, 0, s_right >>>(a, mid + 1, hi);
    
 cudaStreamDestroy(s_left);
 cudaStreamDestroy(s_right);

}
'''

qsort_mod = DynamicSourceModule(DynamicQuicksortCode)

qsort_ker = qsort_mod.get_function('quicksort_ker')

if __name__ == '__main__':
    a = range(100)
    shuffle(a)

    a = np.int32(a)

    d_a = gpuarray.to_gpu(a)

    print 'Unsorted array: %s' % a

    qsort_ker(d_a,
              np.int32(0),
Example #20
0
import pycuda.driver as drv
from pycuda.compiler import DynamicSourceModule

if len(sys.argv) > 2 and sys.argv[1] == '-double':
    real_py = 'float64'
    real_cpp = 'double'
else:
    real_py = 'float32'
    real_cpp = 'float'

mod = DynamicSourceModule(r"""
void __global__ add(real *d_x, real *d_y, real *d_z, const int N1)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N1)
    {
        for (int i = 0; i < 100000; ++i)
        {
            d_z[n] = d_x[n] + d_y[n];
        }
    }
}""".replace('real', real_cpp))
add = mod.get_function("add")

global NUM_REPEATS
global N1
global MAX_NUM_STREAMS
global N
global block_size
global streams
NUM_REPEATS = 10
N1 = 1024
    real_cpp = 'float'

if len(sys.argv) == 3:
    N = int(sys.argv[2])
else:
    N = 1000000

print('Type:{}; Number:{};'.format(real_cpp, N))

mod = DynamicSourceModule(r"""
void __global__ arithmetic(real *d_x, const real x0, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        real x_tmp = d_x[n];
        while (sqrt(x_tmp) < x0)
        {
            ++x_tmp;
        }
        d_x[n] = x_tmp;
    }
}""".replace('real', real_cpp))
arithmetic = mod.get_function("arithmetic")

NUM_REPEATS = 10
x0 = numpy.__dict__[real_py](100)

h_x = numpy.zeros((N, 1), dtype=real_py)
d_x = drv.mem_alloc(h_x.nbytes)

t_sum = 0
Example #22
0
mod = DynamicSourceModule(r"""
#include <cooperative_groups.h>
using namespace cooperative_groups;
const unsigned FULL_MASK = 0xffffffff;

extern "C"{void __global__ reduce_syncwarp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    for (int offset = 16; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncwarp();
    }

    if (tid == 0)
    {
        atomicAdd(d_y, s_y[0]);
    }
}}

extern "C"{void __global__ reduce_shfl(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    real y = s_y[tid];

    for (int offset = 16; offset > 0; offset >>= 1)
    {
        y += __shfl_down_sync(FULL_MASK, y, offset);
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}
}

extern "C"{void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    real y = s_y[tid];

    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}
}
""".replace('real', real_cpp),
                          no_extern_c=True)