Ejemplo n.º 1
0
def convolution_cuda(sourceImage, fil):
    sourceImage = np.float32(sourceImage)
    # Perform separable convolution on sourceImage using CUDA.
    destImage = sourceImage.copy()

    (imageHeight, imageWidth) = sourceImage.shape
    # print(imageWidth,imageHeight)
    fil = np.float32(fil)
    DATA_H = imageHeight;
    DATA_W = imageWidth
    DATA_H = np.int32(DATA_H)
    DATA_W = np.int32(DATA_W)
    # Prepare device arrays

    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    fil_gpu = cuda.mem_alloc_like(fil)
    destImage_gpu = cuda.mem_alloc_like(sourceImage)

    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(fil_gpu, fil)
    convolutionGPU(destImage_gpu, sourceImage_gpu, fil_gpu, DATA_W, DATA_H, block=(imageHeight,1 , 1), grid=(1,imageWidth))
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)

    return destImage
Ejemplo n.º 2
0
def erode_cuda(sourceImage):
    fil = np.ones((7, 7))
    # binary = th2 = cv2.adaptiveThreshold(sourceImage,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY,3,2)#cv2.threshold(sourceImage, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    ret, binary = cv2.threshold(sourceImage, 0, 255,
                                cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    sourceImage = np.float32(binary)

    # Perform separable convolution on sourceImage using CUDA.
    destImage = sourceImage.copy()
    (imageHeight, imageWidth) = sourceImage.shape
    # print(imageWidth,imageHeight)
    fil = np.float32(fil)
    DATA_H = imageHeight
    DATA_W = imageWidth
    DATA_H = np.int32(DATA_H)
    DATA_W = np.int32(DATA_W)
    # Prepare device arrays

    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    fil_gpu = cuda.mem_alloc_like(fil)
    destImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(fil_gpu, fil)
    erodeGPU(destImage_gpu,
             sourceImage_gpu,
             fil_gpu,
             DATA_W,
             DATA_H,
             block=(imageHeight, 1, 1),
             grid=(1, imageWidth))
    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)

    destImage = np.uint8(destImage)
    return destImage
def convolution_cuda(sourceImage,  filterx,  filtery):
    # Perform separable convolution on sourceImage using CUDA.
    # Operates on floating point images with row-major storage.
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32',  'source image must be float32'
    (imageHeight,  imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (KERNEL_W, ) ,  'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    BYTES_PER_WORD = 4  # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(d_Kernel_rows,  filterx) # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_columns,  filtery)
    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    convolutionRowGPU(intermediateImage_gpu,  sourceImage_gpu,  DATA_W,  DATA_H,  grid=[int(e) for e in blockGridRows],  block=[int(e) for e in threadBlockRows])    
    convolutionColumnGPU(destImage_gpu,  intermediateImage_gpu,  DATA_W,  DATA_H,  numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),  numpy.int32(DATA_W * threadBlockColumns[1]),  grid=[int(e) for e in blockGridColumns],  block=[int(e) for e in threadBlockColumns])

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage,  destImage_gpu)
    return destImage
Ejemplo n.º 4
0
def convolution_cuda(sourceImage, filterx, filtery):
    # Perform separable convolution on sourceImage using CUDA.
    # Operates on floating point images with row-major storage.
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    (imageHeight, imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (
        KERNEL_W,
    ), 'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    BYTES_PER_WORD = 4
    # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(
        d_Kernel_rows, filterx
    )  # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_columns, filtery)
    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W,
                               COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1,
                       1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)
    grid_rows = tuple([int(e) for e in blockGridRows])
    block_rows = tuple([int(e) for e in threadBlockRows])
    grid_cols = tuple([int(e) for e in blockGridColumns])
    block_cols = tuple([int(e) for e in threadBlockColumns])
    #TESTING CODE
    # print("Block rows \n",block_rows)
    #  print("BLock columns \n",block_cols)
    convolutionRowGPU(intermediateImage_gpu,
                      sourceImage_gpu,
                      DATA_W,
                      DATA_H,
                      grid=grid_rows,
                      block=block_rows)
    convolutionColumnGPU(destImage_gpu,
                         intermediateImage_gpu,
                         DATA_W,
                         DATA_H,
                         numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),
                         numpy.int32(DATA_W * threadBlockColumns[1]),
                         grid=grid_cols,
                         block=block_cols)

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
Ejemplo n.º 5
0
    def __init__(self, pts, axis, split, sigma):
        if split[0] < 2 or split[1] < 2:
            raise ValueError("Split needs to be at least 2x2")

        if not pts.flags['C_CONTIGUOUS']:
            pts = np.require(pts, dtype=pts.dtype, requirements=['C'])
            if not pts.flags['C_CONTIGUOUS']:
                raise Exception("Points are not contiguous")

        self.axis = axis
        self.sigma = sigma
        self.pts = pts
        self.pts_gpu = None

        # Initiates all of cuda stuff
        self.grid = np.zeros(split).astype(pts.dtype)
        self.grid_gpu = cuda.mem_alloc_like(self.grid)
        cuda.memcpy_htod(self.grid_gpu, self.grid)

        kernel = SourceModule(self.__cuda_code)
        self.gpu_gaussian = kernel.get_function("gpu_gaussian")

        self.dx = 1 / float(split[0] - 1)
        self.dy = 1 / float(split[1] - 1)

        self.grid_size, self.block_size = self.__setup_cuda_sizes(split)
Ejemplo n.º 6
0
    def __compute_guassian_on_pts(self):
        view = self.view_tile.get_View()

        for dset in self.data_sets:
            _data = np.array(dset.getDataSet(), copy=True)
            _data[:, 0] = (_data[:, 0] - view.left)/view.width()
            _data[:, 1] = (_data[:, 1] - view.bottom)/view.height()

            for row in range(self.grid_size[0]):
                for col in range(self.grid_size[1]):
                    # 3 * SIGMA give the 95%
                    left = 1 / float(self.grid_size[1]) * col - (3 * self.sigma)
                    right = 1 / float(self.grid_size[1]) * (col + 1) + (3 * self.sigma)
                    bottom = 1 / float(self.grid_size[0]) * row - (3 * self.sigma)
                    top = 1 / float(self.grid_size[0]) * (row + 1) + (3 * self.sigma)
                    pts = getFilteredDataSet(_data, (left, right, bottom, top))

                    if len(pts) > 0:
                        self.pts_gpu = cuda.mem_alloc_like(pts)
                        cuda.memcpy_htod(self.pts_gpu, pts)

                        self.gpu_gaussian(self.grid_gpu,  # Grid
                                          self.pts_gpu,  # Points
                                          np.int32(col),  # Block Index x
                                          np.int32(row),  # Block Index y
                                          np.int32(self.grid_size[1]),  # Grid Dimensions x
                                          np.int32(self.grid_size[0]),  # Grid Dimensions y
                                          np.int32(pts.shape[0]),  # Point Length
                                          np.float32(self.dx),  # dx
                                          np.float32(self.dy),  # dy
                                          np.float32(self.sigma),  # Sigma
                                          block=self.block_size)

                        self.pts_gpu.free()
Ejemplo n.º 7
0
    def __init__(self, view_tile, size, sigma, debug=False):
        self.debug = debug
        if size[0] < 2 or size[1] < 2:
            raise ValueError("Split needs to be at least 2x2")

        self.data_sets = view_tile.get_Data()
        for dset in self.data_sets:
            data = dset.getDataSet()
            if not data.flags['C_CONTIGUOUS']:
                print "NOT CONTIGUOUS, trying to reformat the points"
                data = np.require(data, dtype=data.dtype, requirements=['C'])
                if not data.flags['C_CONTIGUOUS']:
                    raise Exception("Points are not contiguous")
                dset.setDataSet(data)

        self.view_tile = view_tile
        self.sigma = sigma
        self.pts_gpu = None

        # Initiates all of cuda stuff
        self.grid = np.zeros(size).astype(np.float32)
        self.grid_gpu = cuda.mem_alloc_like(self.grid)
        cuda.memcpy_htod(self.grid_gpu, self.grid)

        kernel = SourceModule(self.__cuda_code)
        self.gpu_gaussian = kernel.get_function("gpu_gaussian")

        self.view = self.view_tile.get_View()

        self.grid_size, self.block_size = self.__setup_cuda_sizes(size)

        self.dx = 1 / float(size[1] - 1)
        self.dy = 1 / float(size[0] - 1)
Ejemplo n.º 8
0
def test_compare_order():
    '''
    compare_order between C(row-major), F(column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')


    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    print ''
    print 'f_1d_C\n\n', f_1d
    print 'f_2d_C\n', f_2d_C
    print 'f_2d_F\n', f_2d_F

    print ''
    print 'after cuda'
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_C\n', ret_f_1d

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_F\n', ret_f_1d
Ejemplo n.º 9
0
    def test_constant_memory(self):
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function("copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0,255,(32,)).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(
                global_result_array,
                grid=(1, 1), block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        assert (host_result_array == host_array).all
Ejemplo n.º 10
0
    def test_constant_memory(self):
        # contributed by Andrew Wagner

        module = SourceModule("""
        __constant__ float const_array[32];

        __global__ void copy_constant_into_global(float* global_result_array)
        {
            global_result_array[threadIdx.x] = const_array[threadIdx.x];
        }
        """)

        copy_constant_into_global = module.get_function(
            "copy_constant_into_global")
        const_array, _ = module.get_global('const_array')

        host_array = np.random.randint(0, 255, (32, )).astype(np.float32)

        global_result_array = drv.mem_alloc_like(host_array)
        drv.memcpy_htod(const_array, host_array)

        copy_constant_into_global(global_result_array,
                                  grid=(1, 1),
                                  block=(32, 1, 1))

        host_result_array = np.zeros_like(host_array)
        drv.memcpy_dtoh(host_result_array, global_result_array)

        assert (host_result_array == host_array).all
Ejemplo n.º 11
0
    def __init__(self, n, L, CUDA=True):
        self.dtype = dtype
        self.r = np.zeros([n, 3], dtype=dtype)
        self.v = np.zeros([n, 3], dtype=dtype)
        self.m = np.ones([n, 1], dtype=dtype)
        self.a = np.zeros([n, 3], dtype=dtype)
        self.f = np.zeros([n, 3], dtype=dtype)
        self.n = n
        self.nc = L
        self.L = L
        self.Lh = L / 2.0
        self.max_nei = 10
        self.rc = 1.0
        self.CUDA = True

        if (self.CUDA):
            self.h_r = np.zeros([n * 3], dtype=dtype)
            self.h_v = np.zeros([n * 3], dtype=dtype)
            self.h_m = np.zeros([n], dtype=dtype)
            self.h_a = np.zeros([n * 3], dtype=dtype)
            self.h_f = np.zeros([n * 3], dtype=dtype)

            self.h_cells = np.zeros([self.nc * self.nc**3], dtype=np.int32)
            self.h_narray = np.zeros([self.nc**3], dtype=np.int32)
            self.h_nei_index = np.zeros([n], dtype=np.int32)
            self.h_nei_list = np.zeros([n * self.max_nei], dtype=np.int32)

            self.d_r = cuda.mem_alloc_like(self.h_r)
            self.d_v = cuda.mem_alloc_like(self.h_v)
            self.d_m = cuda.mem_alloc_like(self.h_m)
            self.d_a = cuda.mem_alloc_like(self.h_a)
            self.d_f = cuda.mem_alloc_like(self.h_f)

            self.d_nei_index = cuda.mem_alloc_like(self.h_nei_index)
            self.d_nei_list = cuda.mem_alloc_like(self.h_nei_list)
            self.d_cells = cuda.mem_alloc_like(self.h_cells)
            self.d_narray = cuda.mem_alloc_like(self.h_narray)

            cuda.memcpy_htod(self.d_r, self.h_r)
            cuda.memcpy_htod(self.d_v, self.h_v)
            cuda.memcpy_htod(self.d_m, self.h_m)
            cuda.memcpy_htod(self.d_a, self.h_a)
            cuda.memcpy_htod(self.d_f, self.h_f)

            cuda.memcpy_htod(self.d_cells, self.h_cells)
            cuda.memcpy_htod(self.d_narray, self.h_narray)
            cuda.memcpy_htod(self.d_nei_list, self.h_nei_list)
            cuda.memcpy_htod(self.d_nei_index, self.h_nei_index)
Ejemplo n.º 12
0
def batch_memcpy_cmp(size: int, batch: int):
    event_start_1 = cuda.Event()
    event_stop_1 = cuda.Event()
    event_start_2 = cuda.Event()
    event_stop_2 = cuda.Event()

    array = np.random.rand(size, 9)
    array.astype(np.float32)

    mem = cuda.aligned_zeros_like(array)
    mem = cuda.register_host_memory(mem,
                                    cuda.mem_host_register_flags.DEVICEMAP)

    mem_d = cuda.mem_alloc_like(mem)

    event_start_1.record()
    cuda.memcpy_htod(mem_d, mem)
    event_stop_1.record()
    event_stop_1.synchronize()

    mem2 = []
    this_mem = []
    size_per_batch = int(size / batch)
    for i in range(batch):
        mem2.append(
            cuda.mem_alloc_like(array[i * size_per_batch:(i + 1) *
                                      size_per_batch]))
        this_mem.append(array[i * size_per_batch:(i + 1) * size_per_batch])
        this_mem[i] = cuda.register_host_memory(
            this_mem[i], cuda.mem_host_register_flags.DEVICEMAP)

    event_start_2.record()
    for i in range(batch):
        cuda.memcpy_htod(mem2[i], this_mem[i])
    event_stop_2.record()
    event_stop_2.synchronize()

    t1 = event_stop_1.time_since(event_start_1)
    t2 = event_stop_2.time_since(event_start_2)
    print("batch_memcpy_cmp size", size, " batch ", batch)
    print(t1)
    print(t2)
Ejemplo n.º 13
0
def line_cuda(sourceImage):
    time_e = time.time()
    gray = cv2.cvtColor(sourceImage, cv2.COLOR_BGR2GRAY)
    # binary = th2 = cv2.adaptiveThreshold(sourceImage,255,cv2.ADAPTIVE_THRESH_MEAN_C,cv2.THRESH_BINARY,3,2)#cv2.threshold(sourceImage, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    ret, binary = cv2.threshold(gray, 0, 255,
                                cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
    # gray = sourceImage
    binary = np.float32(gray)

    destImage = np.float32(gray * 5)
    (imageHeight, imageWidth) = gray.shape
    DATA_H = np.int32(imageHeight)
    DATA_W = np.int32(imageWidth)

    # for i in range( -90,90):
    #     sourceImage_gpu = cuda.mem_alloc_like(binary)
    #     destImage_gpu = cuda.mem_alloc_like(binary)
    #     cuda.memcpy_htod(sourceImage_gpu, binary)
    #     theta = np.int32(i)
    #     HoffGPU(destImage_gpu, sourceImage_gpu, theta, DATA_W, DATA_H, block=(imageHeight,1 , 1), grid=(1,imageWidth))
    #     cuda.memcpy_dtoh(destImage, destImage_gpu)
    #     ans = np.sort(destImage)

    sourceImage_gpu = cuda.mem_alloc_like(binary)
    destImage_gpu = cuda.mem_alloc_like(destImage)
    cuda.memcpy_htod(sourceImage_gpu, binary)
    HoffGPU(destImage_gpu,
            sourceImage_gpu,
            DATA_W,
            DATA_H,
            block=(1, 1, 1),
            grid=(imageWidth, imageHeight))
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    ans = np.sort(destImage)

    time_b = time.time()
    print("GPU mode time:", (time_b - time_e))
    return binary
Ejemplo n.º 14
0
    def matmul(self, mat, return_time=False):
        '''Matrix multiplication between two matrices'''

        # # allocate memory on device for both matrices
        # self.allocate_memory()
        # mat.allocate_memory()

        # JIT compile the cuda kernel and source module
        # with dimension parameters
        mod = SourceModule(self.kernel_matmul % {
            "a_nrows": self.nrows,
            "a_ncols": self.ncols,
            "b_ncols": mat.ncols
        })

        # check dimensions first:
        if self.ncols != mat.nrows:
            raise ValueError("Dimensions {0} and {1} do not match.".format(
                self.ncols, mat.nrows))

        # allocate gpu memory for product yield
        prod_arr = np.zeros((self.nrows, mat.ncols)).astype(np.float32)
        prod_arr_gpu = cuda.mem_alloc_like(prod_arr)
        cuda.memcpy_htod(prod_arr_gpu, prod_arr)
        # cuda.In(prod_arr)

        # get matrix multiplication function
        mmul = mod.get_function("kernel_matmul")

        # also record the time it takes for the evaluation
        t0 = time.perf_counter_ns()
        # evaluate the matrix multiplication
        mmul(prod_arr_gpu,
             self.arr_gpu,
             mat.arr_gpu,
             block=self.block_dim,
             grid=self.grid_dim)

        t1 = time.perf_counter_ns()

        eval_time = (t1 - t0) * (1e-9)  # time for each matmul evaluation

        # move from device to host
        # cuda.Out(prod_arr)
        prod_arr = np.zeros((self.nrows, mat.ncols)).astype(np.float32)
        cuda.memcpy_dtoh(prod_arr, prod_arr_gpu)

        return (cuMatrix(prod_arr),
                eval_time) if return_time else cuMatrix(prod_arr)
Ejemplo n.º 15
0
    def test_register_host_memory(self):
        if drv.get_version() < (4, ):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20, ), np.float64)
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Ejemplo n.º 16
0
    def test_register_host_memory(self):
        if drv.get_version() < (4,):
            from py.test import skip
            skip("register_host_memory only exists on CUDA 4.0 and later")

        import sys
        if sys.platform == "darwin":
            from py.test import skip
            skip("register_host_memory is not supported on OS X")

        a = drv.aligned_empty((2**20,), np.float64)
        a_pin = drv.register_host_memory(a)

        gpu_ary = drv.mem_alloc_like(a)
        stream = drv.Stream()
        drv.memcpy_htod_async(gpu_ary, a_pin, stream)
        drv.Context.synchronize()
Ejemplo n.º 17
0
def LPF_cuda(fft, D_0=20):
    fft = np.fft.fftshift(fft)
    destImage = np.float32(fft)
    (imageHeight, imageWidth) = destImage.shape
    # print(imageWidth,imageHeight)
    D_0 = np.int32(D_0)
    DATA_H = np.int32(imageHeight)
    DATA_W = np.int32(imageWidth)
    destImage_gpu = cuda.mem_alloc_like(destImage)
    LPFGPU(destImage_gpu,
           D_0,
           DATA_W,
           DATA_H,
           block=(imageHeight, 1, 1),
           grid=(1, imageWidth))
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    ans = np.multiply(destImage, fft)
    ans = np.fft.ifftshift(ans)
    ans = np.fft.ifft2(ans)
    ans = np.uint8(ans)
    return ans
Ejemplo n.º 18
0
def price_options(strike_info_array, driver_price, forward_anchor, vol_time, bank_time,
                  forward_time, driver_time, decay_percent, today_vol_time,
                  one_day_vol_time, zero_rate, carry_cost, atm_vol, skew, put_curve, call_curve,
                  put_linear_knot, call_linear_knot, put_taper_knot, call_taper_knot,
                  put_taper, call_taper, vol_kernel, pricing_kernel, *args, **kwargs):
    # can't use record arrays with pycuda, so need to copy strikes to scalar array
    strikes = strike_info_array['strikes'].copy()

    # initialize results_array to store calculation results
    results_array = get_results_array(strikes.shape)
    results_array['strikes'] = strikes
    results_array['call_ids'] = strike_info_array['call_ids']
    results_array['put_ids'] = strike_info_array['put_ids']

    # initialize temporary scalar arrays to copy results from gpu before moving to results_array
    call_prices = np.zeros_like(strikes)
    put_prices = np.zeros_like(strikes)
    vols = np.zeros_like(strikes)
    up_call_prices = np.zeros_like(strikes)
    up_put_prices = np.zeros_like(strikes)
    down_call_prices = np.zeros_like(strikes)
    down_put_prices = np.zeros_like(strikes)

    # initialize memory on gpu to be used by kernel
    gpu_strikes = cuda.mem_alloc_like(strikes)
    gpu_vols = cuda.mem_alloc_like(strikes)
    gpu_call_prices = cuda.mem_alloc_like(strikes)
    gpu_put_prices = cuda.mem_alloc_like(strikes)

    # copy strikes to gpu
    cuda.memcpy_htod(gpu_strikes, strikes)

    ####################

    # price options vs current driver_price
    current_vol_time = vol_time - today_vol_time * decay_percent
    current_driver_time = driver_time - ONE_DAY_BANK_TIME * decay_percent
    current_forward_time = forward_time - ONE_DAY_BANK_TIME * decay_percent
    current_bank_time = bank_time - ONE_DAY_BANK_TIME * decay_percent

    ####################

    # calculate forward given driver_price
    forward = calculate_forward(driver_price, current_driver_time, current_forward_time, zero_rate, carry_cost)
    forward = np.float32(forward)
    # calculate current atm vol given driver_price and forward_anchor
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time, atm_vol, skew, put_curve,
                               call_curve, put_linear_knot, call_linear_knot, put_taper, call_taper, put_taper_knot,
                               call_taper_knot)
    atm = np.float32(atm)

    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes, forward, current_vol_time,
                             atm, skew, put_curve, call_curve, put_linear_knot, call_linear_knot, put_taper,
                             call_taper, put_taper_knot, call_taper_knot, np.int32(strikes.size))
    # copy vol results from gpu
    cuda.memcpy_dtoh(vols, gpu_vols)

    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices, gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time, current_bank_time, np.int32(strikes.size))
    # copy pricing results from gpu
    cuda.memcpy_dtoh(call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(put_prices, gpu_put_prices)

    # copy vol results to results_array
    results_array['vols'] = vols

    # copy price results to results array
    results_array['call_prices'] = call_prices
    results_array['put_prices'] = put_prices

    ####################

    # now estimate vegas by permuting vols used in the options calculations
    # we don't need to recalculate forward or ATM vol as we can use the values calculated earlier
    # also, we're just going to directly permute strike vols by 50bp and use those new vols to calculate the vega

    vol_increment = .005
    # increment vols up
    vols += vol_increment
    # copy to gpu
    cuda.memcpy_htod(gpu_vols, vols)
    #run pricing kernel
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices, gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time, current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(up_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(up_put_prices, gpu_put_prices)

    # permute vols downwards by double the increment to undo the increase then increment vols downwards by same amount
    vols -= 2 * vol_increment
    #copy to gpu
    cuda.memcpy_htod(gpu_vols, vols)
    #run pricing kernel
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices, gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time, current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(down_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(down_put_prices, gpu_put_prices)

    # estimate vegas then store results in results_array
    results_array['call_vegas'] = up_call_prices - down_call_prices / (.01 / (2 * vol_increment))
    results_array['put_vegas'] = up_put_prices - down_put_prices / (.01 / (2 * vol_increment))

    ####################

    # use the day weights to estimate theta

    # first we need to figure out what the adjusted calc times will be at this time during the next trading day
    decay_vol_time = current_vol_time  - one_day_vol_time
    decay_driver_time = current_bank_time - ONE_DAY_BANK_TIME
    decay_forward_time = current_forward_time - ONE_DAY_BANK_TIME
    decay_bank_time = current_bank_time - ONE_DAY_BANK_TIME

    # calculate adjusted forward given new calculation time
    forward = calculate_forward(driver_price, decay_driver_time, decay_forward_time, zero_rate, carry_cost)
    forward = np.float32(forward)
    # calculate atm vol using today_vol_time to account for the fact that as we adjust our skews
    # due to decay, we will be moving the atm_vol using the previous vol path
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time, atm_vol, skew, put_curve,
                               call_curve, put_linear_knot, call_linear_knot, put_taper, call_taper, put_taper_knot,
                               call_taper_knot)
    atm = np.float32(atm)
    # now run vol kernel with the tomorrow_vol_time and tomorrow_bank_time with the
    # previously calculated atm_vol
    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes, forward, decay_vol_time,
                             atm, skew, put_curve, call_curve, put_linear_knot, call_linear_knot, put_taper,
                             call_taper, put_taper_knot, call_taper_knot, np.int32(strikes.size))
    # run pricing kernel with new times
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices, gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, decay_vol_time, decay_bank_time, np.int32(strikes.size))
    # copy pricing results from gpu
    cuda.memcpy_dtoh(down_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(down_put_prices, gpu_put_prices)

    # estimate thetas and store in results_array
    results_array['call_thetas'] = down_call_prices - call_prices
    results_array['put_thetas'] = down_put_prices - put_prices


    ####################

    # now estimate deltas and gammas by permuting driver price

    # choose driver_increment to give relatively smooth gamma/delta profile
    driver_increment = np.float32(sqrt(3 * vol_time))

    # calculate new forward and vols after incrementing driver_price upwards
    forward = calculate_forward(driver_price+driver_increment, current_driver_time,
                                current_forward_time, zero_rate, carry_cost)
    forward = np.float32(forward)
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time, atm_vol, skew, put_curve,
                               call_curve, put_linear_knot, call_linear_knot, put_taper, call_taper, put_taper_knot,
                               call_taper_knot)
    atm = np.float32(atm)
    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes, forward, current_vol_time,
                             atm, skew, put_curve, call_curve, put_linear_knot, call_linear_knot, put_taper, call_taper,
                             put_taper_knot, call_taper_knot, np.int32(strikes.size))
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices, gpu_put_prices, gpu_strikes, gpu_vols,
                                 forward, zero_rate, current_vol_time, current_bank_time, np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(up_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(up_put_prices, gpu_put_prices)

    # calculate new forward and vols after incrementing driver_price downwards
    forward = calculate_forward(driver_price - driver_increment, current_driver_time, current_forward_time,
                                zero_rate, carry_cost)
    forward = np.float32(forward)
    atm = linear_two_sided_vol(forward, forward_anchor, current_vol_time, atm_vol, skew, put_curve,
                               call_curve, put_linear_knot, call_linear_knot, put_taper, call_taper,
                               put_taper_knot, call_taper_knot)
    atm = np.float32(atm)
    vol_kernel.prepared_call(VOL_GRID, VOL_BLOCK, gpu_vols, gpu_strikes, forward, current_vol_time,
                             atm, skew, put_curve, call_curve, put_linear_knot, call_linear_knot,
                             put_taper, call_taper, put_taper_knot, call_taper_knot, np.int32(strikes.size))
    pricing_kernel.prepared_call(OPTION_GRID, PRICING_BLOCK, gpu_call_prices, gpu_put_prices, gpu_strikes,
                                 gpu_vols, forward, zero_rate, current_vol_time, current_bank_time,
                                 np.int32(strikes.size))
    # copy results from gpu
    cuda.memcpy_dtoh(down_call_prices, gpu_call_prices)
    cuda.memcpy_dtoh(down_put_prices, gpu_put_prices)

    # estimate deltas and gammas then store in results_array
    results_array['call_deltas'] = (up_call_prices - down_call_prices) / (2 * driver_increment)
    results_array['put_deltas'] = (up_put_prices - down_put_prices) / (2 * driver_increment)
    results_array['call_gammas'] = (up_call_prices + down_call_prices - 2 * call_prices) / (2 * driver_increment) ** 2
    results_array['put_gammas'] = (up_put_prices + down_put_prices - 2 * put_prices) / (2 * driver_increment) ** 2

    return results_array
Ejemplo n.º 19
0
def convolve_gpu(sourceImage, convFilter, convType):
    """
    convType is the same as in:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.convolve.html#scipy.signal.convolve
    """
    
    # Cuda C code
    template = """

    #define FILTER_W $FILTER_W
    #define FILTER_H $FILTER_H
    
    #include <stdio.h>
    
    __device__ __constant__ float d_Kernel_filter[FILTER_H*FILTER_W];

    __global__ void ConvolutionKernel(
                float* img, int imgW, int imgH,
                float* out
                )
    {       
        const int nThreads = blockDim.x * gridDim.x;            
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;      
        
        const int outW = imgW - FILTER_W + 1;
        const int outH = imgH - FILTER_H + 1;
        
        const int nPixels = outW * outH;            

        for(int curPixel = idx; curPixel < nPixels; curPixel += nThreads) 
        {
            int x = curPixel % outW;
            int y = curPixel / outW;                    
            float sum = 0;
            for (int filtY = 0; filtY < FILTER_H; filtY++)
                for (int filtX = 0; filtX < FILTER_W; filtX++)
                    {
                    int sx = x + filtX;
                    int sy = y + filtY;
                    sum+= img[sy*imgW + sx] * d_Kernel_filter[filtY*FILTER_W + filtX];
                    }               
            out[y * outW + x] = sum;
        }   
    }
    """
    convFilter = np.flipud(np.fliplr(convFilter))
    (DATA_H,  DATA_W) = sourceImage.shape
    (outH, outW) = (0, 0)
    
    # -- Add zero paddings
    (padWl, padWr, padHt, padHb) = (0, 0, 0, 0) 
    (filtH, filtW) = (convFilter.shape[0], convFilter.shape[1])
    if convType == 'full':
        padWl = filtW-1
        padWr = filtW-1
        padHt = filtH-1
        padHb = filtH-1
        (outH, outW) = (DATA_H+filtH-1, DATA_W+filtW-1)
    elif convType == 'same':
        padWl = filtW/2
        padWr = filtW/2 - (1-filtW%2)
        padHt = filtH/2
        padHb = filtH/2 - (1-filtH%2)        
        (outH, outW) = (DATA_H, DATA_W)
    elif convType == 'valid':
        (outH, outW) = (sourceImage.shape[0]-convFilter.shape[0]+1, sourceImage.shape[1]-convFilter.shape[1]+1)
    
    # -- zero padding
    tmpImg = np.zeros((padHt+DATA_H+padHb, padWl+DATA_W+padWr))
    tmpImg[padHt:padHt+DATA_H, padWl:padWl+DATA_W] = sourceImage
    sourceImage = tmpImg
    (DATA_H,  DATA_W) = sourceImage.shape

    destImage = np.float32(np.zeros((outH, outW)))
    #assert sourceImage.dtype == 'float32',  'source image must be float32'
    #assert convFilter.dtype == 'float32',  'convFilter must be float32'
    
    # -- interface stuff to Cuda C
    template = string.Template(template)
    code = template.substitute(FILTER_H = convFilter.shape[0], FILTER_W = convFilter.shape[1])
    module = SourceModule(code)
    
    # -- change the numpy arrays to row vectors of float32
    sourceImage = np.float32(sourceImage.reshape(sourceImage.size))
    convFilter = np.float32(convFilter.reshape(convFilter.size))
    
    convolutionGPU = module.get_function('ConvolutionKernel')
    d_Kernel_filter = module.get_global('d_Kernel_filter')[0]

    # -- Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    
    cuda.memcpy_htod(d_Kernel_filter,  convFilter) # The kernel goes into constant memory via a symbol defined in the kernel

    convolutionGPU(sourceImage_gpu,  np.int32(DATA_W),  np.int32(DATA_H), destImage_gpu,  block=(400,1,1), grid=(1,1))

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage,  destImage_gpu)
    return destImage
def convolutional_degrid_GPU(kernel_list,
                             vshape,
                             uvgrid,
                             vuvwmap,
                             vfrequencymap,
                             vpolarisationmap=None):
    mod = SourceModule("""
    #include<stdio.h>
    #include<stdlib.h>
    __global__ void convol_degird_kernels2(float *visReal,
                    float *visImag,
                    float *uvgridReal,
                    float *uvgridImag,
                    float *ckernel0Real,
                    float *ckernel0Imag,
                    int *vfrequencymap,
                    int *x,
                    int *y,
                    int *xf,
                    int *yf,
                    int gh,
                    int gw,
                    int nx,
                    int vnpol,
                    int length)
    {
          
          for(int pol=0;pol<vnpol;pol++)
          {
             int row=threadIdx.x+blockIdx.x*blockDim.x;
             int col=threadIdx.y+blockIdx.y*blockDim.y;
             int slience=threadIdx.z+blockIdx.z*blockDim.z;
             int i=row+col*blockDim.x*gridDim.x+slience*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
             if(i<length)
             {
                int chan=vfrequencymap[i];
                int xx=x[i];
                int yy=y[i];
                int xxf=xf[i];
                int yyf=yf[i];
                float sumReal=0.0;
                float sumImag=0.0;
          
                int t1=chan*vnpol*nx*nx+pol*nx*nx;
                int t2=yyf*gh*gh*gh+xxf*gw*gh;
                for(int j=yy;j<yy+gh;j++)
                {
                   for(int k=xx;k<xx+gw;k++)
                   {
                      int t3=t1+j*nx+k;
                      int t4=t2+(j-yy)*gh+k-xx;
                      sumReal+=(uvgridReal[t3]*ckernel0Real[t4]-uvgridImag[t3]*ckernel0Imag[t4]);
                      sumImag+=(uvgridReal[t3]*ckernel0Imag[t4]+uvgridImag[t3]*ckernel0Real[t4]);
                   }
                }
               visReal[i*vnpol+pol]=sumReal;
               visImag[i*vnpol+pol]=sumImag;
             }  
          }
    }
    """)
    kernel_indices, kernels = kernel_list
    kernel_oversampling, _, gh, gw = kernels[0].shape
    assert gh % 2 == 0, "Convolution kernel must have even number of pixels"
    assert gw % 2 == 0, "Convolution kernel must have even number of pixels"
    inchan, inpol, ny, nx = uvgrid.shape
    vnpol = vshape[1]
    nvis = vshape[0]
    vis = numpy.zeros(vshape, dtype='complex')
    wt = numpy.zeros(vshape)

    # uvw -> fraction of grid mapping
    y, yf = frac_coord(ny, kernel_oversampling, vuvwmap[:, 1])
    y -= gh // 2
    x, xf = frac_coord(nx, kernel_oversampling, vuvwmap[:, 0])
    x -= gw // 2
    uvgridReal = uvgrid.real
    uvgridImag = uvgrid.imag

    if len(kernels) > 1:
        ckernels = numpy.conjugate(kernels)
        length = min(len(kernel_indices), len(vfrequencymap), len(x), len(y),
                     len(xf), len(yf))

        for pol in range(vnpol):
            for i in range(length):
                kind = kernel_indices[i]
                chan = vfrequencymap[i]
                xx = x[i]
                yy = y[i]
                xxf = xf[i]
                yyf = yf[i]
                vis[i, pol] = numpy.sum(
                    uvgrid[chan, pol, yy:yy + gh, xx:xx + gw] *
                    ckernels[kind][yyf, xxf, :, :])

    else:
        ckernel0 = numpy.conjugate(kernels[0])
        ckernel0Real = ckernel0.real
        ckernel0Imag = ckernel0.imag
        length = min(len(vfrequencymap), len(x), len(y), len(xf), len(yf))

        visReal = np.zeros_like(wt, dtype=np.float32)
        visIamg = np.zeros_like(wt, dtype=np.float32)
        vis_real = visReal.reshape(-1)
        vis_iamg = visIamg.reshape(-1)
        uvgridReal = np.array(uvgridReal)
        uvgrid_real = uvgridReal.reshape(-1)
        uvgridImag = np.array(uvgridImag)
        uvgrid_imag = uvgridImag.reshape(-1)
        ckernel0Real = np.array(ckernel0Real)
        ckernel0_real = ckernel0Real.reshape(-1)
        ckernel0Imag = np.array(ckernel0Imag)
        ckernel0_imag = ckernel0Imag.reshape(-1)

        # vis_real_gpu=drv.mem_alloc_like(vis_real)
        # vis_iamg_gpu=drv.mem_alloc_like(vis_iamg)
        uvgrid_real_gpu = drv.mem_alloc_like(uvgrid_real)
        uvgrid_imag_gpu = drv.mem_alloc_like(uvgrid_imag)
        ckernel0_real_gpu = drv.mem_alloc_like(ckernel0_real)
        ckernel0_imag_gpu = drv.mem_alloc_like(ckernel0_imag)
        vfrequencymap_gpu = drv.mem_alloc_like(vfrequencymap)
        x_gpu = drv.mem_alloc_like(x)
        y_gpu = drv.mem_alloc_like(y)
        xf_gpu = drv.mem_alloc_like(xf)
        yf_gpu = drv.mem_alloc_like(yf)

        strm = drv.Stream()
        drv.memcpy_htod_async(uvgrid_real_gpu, np.array(uvgrid_real), strm)
        drv.memcpy_htod_async(uvgrid_imag_gpu, np.array(uvgrid_imag), strm)
        drv.memcpy_htod_async(ckernel0_real_gpu, np.array(ckernel0_real), strm)
        drv.memcpy_htod_async(ckernel0_imag_gpu, np.array(ckernel0_imag), strm)
        drv.memcpy_htod_async(vfrequencymap_gpu, np.array(vfrequencymap), strm)
        drv.memcpy_htod_async(x_gpu, np.array(x), strm)
        drv.memcpy_htod_async(y_gpu, np.array(y), strm)
        drv.memcpy_htod_async(xf_gpu, np.array(xf), strm)
        drv.memcpy_htod_async(yf_gpu, np.array(yf), strm)
        strm.synchronize()
        uvgrid_real = np.array(uvgrid_real)
        uvgrid_imag = np.array(uvgrid_imag)
        vis_real = np.array(vis_real)
        vis_iamg = np.array(vis_iamg)
        convol_degird_kernels2 = mod.get_function("convol_degird_kernels2")

        convol_degird_kernels2(drv.Out(vis_real),
                               drv.Out(vis_iamg),
                               uvgrid_real_gpu,
                               uvgrid_imag_gpu,
                               ckernel0_real_gpu,
                               ckernel0_imag_gpu,
                               vfrequencymap_gpu,
                               x_gpu,
                               y_gpu,
                               xf_gpu,
                               yf_gpu,
                               np.int32(gh),
                               np.int32(gw),
                               np.int32(nx),
                               np.int32(vnpol),
                               np.int32(length),
                               block=(32, 32, 1),
                               grid=(1024, 96, 1))
        vis = numpy.ones((length, vnpol), dtype='complex')
        vis_real_2D = vis_real.reshape(-1, vnpol)
        vis_iamg_2D = vis_iamg.reshape(-1, vnpol)
        vis.real = vis_real_2D
        vis.imag = vis_iamg_2D
    return numpy.array(vis)
def convolutional_grid_GPU(kernel_list,
                           uvgrid,
                           vis,
                           visweights,
                           vuvwmap,
                           vfrequencymap,
                           vpolarisationmap=None):
    mod = SourceModule("""
    #include<stdio.h>
    #include<stdlib.h>
    __global__ void convol_grid_kernel1(float *uvgrid_real,
               float *uvgrid_imag,
               float *sumwt,
               float *kernels_real,
               float *kernels_imag,
               float *viswt_real,
               float *viswt_imag,
               float *wts,
               int *kernel_indices,
               int *vfrequencypam,
               int *x,
               int *y,
               int *xf,
               int *yf,
               int nx,
               int gh,
               int gw,
               int npol,
               int length)
    {
        for(int pol=0;pol<npol;pol++)
        {
           int row=threadIdx.x+blockIdx.x*blockDim.x;
           int col=threadIdx.y+blockIdx.y*blockDim.y;
           int slience=threadIdx.z+blockIdx.z*blockDim.z;
           int i=row+col*blockDim.x*gridDim.x+
               slience*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
           if(i<length)
           {
              float v_real=viswt_real[i*npol+pol];
              float v_imag=viswt_imag[i*npol+pol];
              float vwt=wts[i*npol+pol];
              int kind=kernel_indices[i];
              int chan=vfrequencypam[i];
              int xx=x[i];
              int yy=y[i];
              int xxf=xf[i];
              int yyf=yf[i];
              for(int j=yy;j<yy+gh;j++)
                 for(int k=xx;k<xx+gw;k++)
                 {
                    int w=chan*npol*nx*nx+pol*nx*nx+j*nx+k;
                    int q=kind*gh*gh*gh*gh+yyf*gh*gh*gh+xxf*gh*gh+j*gh+k;
                    uvgrid_real[w] +=(kernels_real[q]*v_real-
                        kernels_imag[q]*v_imag);
                    uvgrid_imag[w] +=(kernels_real[q]*v_imag+
                        kernels_imag[q]*v_real);
                 }
              sumwt[chan*npol+pol]+=vwt;
           }
        }
    }
    
    __global__ void convol_grid_kernel2(float *uvgrid_real,
               float *uvgrid_imag,
               float *sumwt,
               float *kernel0_real,
               float *kernel0_imag,
               float *viswt_real,
               float *viswt_imag,
               float *wts,
               int *vfrequencymap,
               int *x,
               int *y,
               int *xf,
               int *yf,
               int nx,
               int gh,
               int gw,
               int npol,
               int length)
    {
       for(int pol=0;pol<npol;pol++)
       {
          int row=threadIdx.x+blockIdx.x*blockDim.x;
          int col=threadIdx.y+blockIdx.y*blockDim.y;
          int slience=threadIdx.z+blockIdx.z*blockDim.z;
          int i=row+col*blockDim.x*gridDim.x+
               slience*blockDim.x*gridDim.x*blockDim.y*gridDim.y;
          if(i<length)
          {
             float v_real=viswt_real[i*npol+pol];
             float v_imag=viswt_imag[i*npol+pol];
             float vwt=wts[i*npol+pol];
             int chan=vfrequencymap[i];//89
             int xx=x[i];
             int yy=y[i];
             int xxf=xf[i];
             int yyf=yf[i];
             for(int j=yy;j<yy+gh;j++)
                for(int k=xx;k<xx+gw;k++)
                {
                   int w=chan*pol*nx*nx+pol*nx*nx+j*nx+k;
                   int q=yyf*gh*gh*gh+xxf*gh*gh+j*gh+k;
                   uvgrid_real[w]+=(kernel0_real[q]*v_real-
                            kernel0_imag[q]*v_imag);
                   uvgrid_imag[w]+=(kernel0_real[q]*v_imag+
                            kernel0_imag[q]*v_real);
                      
                }
             sumwt[chan*npol+pol] += vwt;
          }
       }
    }
    """)
    kernel_indices, kernels = kernel_list
    kernel_oversampling, _, gh, gw = kernels[0].shape
    assert gh % 2 == 0, "Convolution kernel must have even number of pixels"
    assert gw % 2 == 0, "Convolution kernel must have even number of pixels"
    inchan, inpol, ny, nx = uvgrid.shape

    # Construct output grids (in uv space)
    sumwt = numpy.zeros([inchan, inpol])

    # uvw -> fraction of grid mapping
    y, yf = frac_coord(ny, kernel_oversampling, vuvwmap[:, 1])
    y -= gh // 2
    x, xf = frac_coord(nx, kernel_oversampling, vuvwmap[:, 0])
    x -= gw // 2

    # About 228k samples per second for standard kernel so about 10 million CMACs per second

    # Now we can loop over all rows
    wts = visweights[...]
    viswt = vis[...] * visweights[...]
    npol = vis.shape[-1]

    uvgrid_array = np.array(uvgrid)
    uvgrid_real = uvgrid_array.real.reshape(-1)
    uvgrid_imag = uvgrid_array.imag.reshape(-1)
    viswt_array = np.array(viswt)
    viswt_real = viswt_array.real.reshape(-1)
    viswt_imag = viswt_array.imag.reshape(-1)
    wts1 = np.array(wts).reshape(-1)

    if len(kernels) > 1:

        for pol in range(npol):
            minlen = min(len(viswt[..., pol]), len(wts[..., pol]),
                         len(kernel_indices), len(list(vfrequencymap)), len(x),
                         len(y), len(xf), len(yf))
            for i in range(minlen):
                print(pol, "->", i)
                v = viswt[i, :, pol]
                vwt = wts[i, :, pol]
                kind = kernel_indices[i]
                chan = vfrequencymap[i]
                xx = x[i]
                yy = y[i]
                xxf = xf[i]
                yyf = yf[i]
                uvgrid[chan, pol, yy:yy + gh,
                       xx:xx + gw] += kernels[kind][yyf, xxf, :, :] * v
                sumwt[chan, pol] += vwt

    else:
        kernel0 = kernels[0]

        kernel0_array = np.array(kernel0)
        kernel0_real = kernel0_array.real.reshape(-1)

        kernel0_imag = kernel0_array.imag.reshape(-1)
        convol_grid_kernel2 = mod.get_function("convol_grid_kernel2")
        length = len(x)
        cublock = (32, 32, 1)
        if length // (32 * 32) > 32:
            cugrid = (32, length // (32 * 32 * 32) + 1 *
                      (length % (32 * 32 * 32) != 0), 1)
        else:
            cugrid = (length // (32 * 32) + 1 * (length % (32 * 32) != 0), 1,
                      1)

        kernel0_real_gpu = drv.mem_alloc_like(kernel0_real)
        kernel0_imag_gpu = drv.mem_alloc_like(kernel0_imag)
        viswt_real_gpu = drv.mem_alloc_like(viswt_real)
        viswt_imag_gpu = drv.mem_alloc_like(viswt_imag)
        wts1_gpu = drv.mem_alloc_like(wts1)
        vfrequencymap_gpu = drv.mem_alloc_like(vfrequencymap)
        x_gpu = drv.mem_alloc_like(x)
        y_gpu = drv.mem_alloc_like(y)
        xf_gpu = drv.mem_alloc_like(xf)
        yf_gpu = drv.mem_alloc_like(yf)
        #print(kernel0_real)
        strm = drv.Stream()
        drv.memcpy_htod_async(kernel0_real_gpu, np.array(kernel0_real), strm)
        drv.memcpy_htod_async(kernel0_imag_gpu, np.array(kernel0_imag), strm)
        drv.memcpy_htod_async(viswt_real_gpu, np.array(viswt_real), strm)
        drv.memcpy_htod_async(viswt_imag_gpu, np.array(viswt_imag), strm)
        drv.memcpy_htod_async(wts1_gpu, np.array(wts1), strm)
        drv.memcpy_htod_async(vfrequencymap_gpu, np.array(vfrequencymap), strm)
        drv.memcpy_htod_async(x_gpu, np.array(x), strm)
        drv.memcpy_htod_async(y_gpu, np.array(y), strm)
        drv.memcpy_htod_async(xf_gpu, np.array(xf), strm)
        drv.memcpy_htod_async(yf_gpu, np.array(yf), strm)
        strm.synchronize()
        uvgrid_real = np.array(uvgrid_real)
        uvgrid_imag = np.array(uvgrid_imag)
        convol_grid_kernel2(drv.Out(uvgrid_real),
                            drv.Out(uvgrid_imag),
                            drv.Out(sumwt),
                            kernel0_real_gpu,
                            kernel0_imag_gpu,
                            viswt_real_gpu,
                            viswt_imag_gpu,
                            wts1_gpu,
                            vfrequencymap_gpu,
                            x_gpu,
                            y_gpu,
                            xf_gpu,
                            yf_gpu,
                            np.int32(nx),
                            np.int32(gh),
                            np.int32(gw),
                            np.int32(npol),
                            np.int32(length),
                            block=(32, 32, 1),
                            grid=(100, 100, 1))
        uvgrid_real_4D = uvgrid_real.reshape(inchan, inpol, nx, ny)
        uvgrid_imag_4D = uvgrid_imag.reshape(inchan, inpol, nx, ny)
        uvgrid.real = uvgrid_real_4D
        uvgrid.imag = uvgrid_imag_4D
    return uvgrid, sumwt
Ejemplo n.º 22
0
                bin[(i+1)*3 + (j+1)]=0;
            else
                bin[(i+1)*3 + (j+1)]=1;
        }

	lbpvals[row*numCols+col] = (bin[0]*128) + (bin[1]*64) + (bin[2]*32) + (bin[3]) + (bin[5]*16) + (bin[6]*2) + (bin[7]*4) + (bin[8]*8);
}
""")

kernel = mod.get_function("lbpval_data")

img = cv2.imread("image.jpg")
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

rows, cols = img.shape
d_img = cuda.mem_alloc_like(img)
d_res = cuda.mem_alloc_like(img)
bins = numpy.zeros((3, 3), numpy.uint8)
d_bins = cuda.mem_alloc_like(bins)

numRows = numpy.int32(rows)
numCols = numpy.int32(cols)

cuda.memcpy_htod(d_img, img)
kernel(d_img, d_res, numRows, numCols, grid=(int(numpy.ceil(cols / 32.0)), int(numpy.ceil(rows / 32.0))), block=(32, 32, 1))

h_res = numpy.zeros((rows, cols), numpy.uint8)
cuda.memcpy_dtoh(h_res, d_res)

cv2.imshow('Final', h_res)
cv2.imwrite('LBP_RES.png', h_res)
Ejemplo n.º 23
0
                d_u[2*NN+iind] = ZER;
            }else{
                d_ddu[2*NN+iind] = ddwn;
                d_du[2*NN+iind] = dw + dt/TWO*(ddw + ddwn);
                d_u[2*NN+iind] = wi + dt*dw + dt*dt/TWO*ddw;
            }
        }
    }
    """,
    options=["--use_fast_math"])

d_dil = cuda.mem_alloc(NN * L.dtype.itemsize)
d_u = cuda.mem_alloc(3 * NN * L.dtype.itemsize)
d_du = cuda.mem_alloc(3 * NN * L.dtype.itemsize)
d_ddu = cuda.mem_alloc(3 * NN * L.dtype.itemsize)
d_Sf = cuda.mem_alloc_like(Sf)
d_dmg = cuda.mem_alloc(((NB) * NN + 7) // 8)
cuda.memcpy_htod(d_Sf, Sf)

d_calcForceState = mod.get_function("calcForceState")
d_calcDilation = mod.get_function("calcDilation")

d_calcForceState.set_cache_config(cuda.func_cache.PREFER_L1)
d_calcDilation.set_cache_config(cuda.func_cache.PREFER_L1)
dil = np.empty((NN))

print("Begining simulation: ", NN)

t0 = time.time()
for tt in range(100):
    d_calcDilation(d_Sf,
Ejemplo n.º 24
0
def single_advance_gpu(state, num_points, grid_space):

    rhs = cuda.aligned_zeros((num_moments, num_points), dtype=np.float32)
    time_before = cuda.Event()
    time_1 = cuda.Event()
    time_after = cuda.Event()
    ## allocate GPU memory 
    indices_device = cuda.mem_alloc_like(indices)
    cuda.memcpy_htod(indices_device, indices)

    f_min = cuda.mem_alloc(int(sizeof_float * 
                num_moments * num_nodes * num_points))
    f_max = cuda.mem_alloc(int(sizeof_float * 
                num_moments*num_nodes*num_points))
    
    flux_1 = cuda.mem_alloc_like(state)
    flux_2 = cuda.mem_alloc_like(state)

    ## compile GPU kernel 
    BlockSize = (256, 1, 1)
    GridSize = (num_points +BlockSize[0] - 1) /BlockSize[0];
    GridSize = (int(GridSize), 1, 1)

    domain_get_flux = QUAD.get_function('domain_get_flux_3d')
    fsum = QUAD.get_function('fsum_3d')
    flux_out = QUAD.get_function('flux_3d')
    ## compute_rhs 

    time_before.record()
    # grid_inversion(state)
    # output are pointer object to GPU memory 
    _, w, x, y, z = chyqmom27(state, num_points)

    time_1.record()

    # domain_get_fluxes(weights, abscissas, qbmm_mgr.indices,
    #                 num_points, qbmm_mgr.num_moments,
    #                 qbmm_mgr.num_nodes, flux)
    domain_get_flux(w, x, y, z, indices_device,
                    f_min, f_max, 
                    np.int32(num_moments), 
                    np.int32(num_nodes), 
                    np.int32(num_points),
                    block=BlockSize, grid=GridSize)

    fsum(flux_1, f_min, f_max, 
                    np.int32(num_moments), 
                    np.int32(num_nodes), 
                    np.int32(num_points),
                    block=BlockSize, grid=GridSize)
    flux_out(flux_1, flux_2, np.float32(grid_space), 
                    np.int32(num_moments), 
                    np.int32(num_points),
                    block=BlockSize, grid=GridSize)
    
    time_after.record()
    time_1.synchronize()
    time_after.synchronize()

    total_time = time_after.time_since(time_before)
    quad_time = time_after.time_since(time_1)
    
    cuda.memcpy_dtoh(rhs, flux_2)
    w.free()
    x.free()
    y.free()
    z.free()
    return rhs, total_time, quad_time
Ejemplo n.º 25
0
    destImage = sourceImage.copy()
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    (imageHeight, imageWidth) = sourceImage.shape
    assert filterx.shape == filtery.shape == (
        KERNEL_W,
    ), 'Kernel is compiled for a different kernel size! Try changing KERNEL_W'
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight
    BYTES_PER_WORD = 4
    # 4 for float32
    DATA_SIZE = DATA_W * DATA_H * BYTES_PER_WORD
    KERNEL_SIZE = KERNEL_W * BYTES_PER_WORD
    # Prepare device arrays
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    cuda.memcpy_htod(
        d_Kernel_rows, filterx
    )  # The kernel goes into constant memory via a symbol defined in the kernel
    cuda.memcpy_htod(d_Kernel_columns, filtery)

    # Call the kernels for convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W,
                               COLUMN_TILE_W), iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1,
                       1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)
Ejemplo n.º 26
0
def _thread(pid, tid, cuda_context, cuda_kernel, dispatcher, temp_storage,
            total_edge_count, log_lock, merge_lock, exit_signal, exit_state):
    try:
        with log_lock:
            logging.debug('Clustering subprocess {} thread {} started.'.format(
                pid, tid))

        cuda_context.push()

        ref_block_height, ref_block_width = block_dimensions
        edg_path = Path(temp_storage, 'edg')
        dps_path = Path(temp_storage, 'dps')
        ranked_spectra = session.ranked_spectra
        cuda_stream = drv.Stream()

        allocation_size_divisor = allocation_size_initial_divisor
        allocation_size = int(ref_block_height * ref_block_width /
                              allocation_size_divisor)
        reallocated = False

        with log_lock:
            logging.debug(
                'Clustering subprocess {} thread {}: Allocating host and device memory.'
                .format(pid, tid))
        # allocate host pagelocked memory
        # input
        plm_precursor_mass = drv.pagelocked_empty(
            ref_block_height + ref_block_width,
            dtype=CG_PRECURSOR_MASS_DATA_TYPE)
        plm_mz = drv.pagelocked_empty(
            (ref_block_height + ref_block_width, num_of_peaks),
            dtype=CG_MZ_DATA_TYPE)
        plm_intensity = drv.pagelocked_empty(
            (ref_block_height + ref_block_width, num_of_peaks),
            dtype=CG_INTENSITY_DATA_TYPE)
        plm_block_dimensions = drv.pagelocked_empty(
            2, dtype=CG_BLOCK_DIMENSIONS_DATA_TYPE)
        plm_offset = drv.pagelocked_empty(2, dtype=CG_OFFSET_DATA_TYPE)
        plm_allocation_size = drv.pagelocked_empty(
            1, dtype=CG_ALLOCATION_SIZE_DATA_TYPE)
        # output
        plm_counter = drv.pagelocked_empty(1, dtype=CG_COUNTER_DATA_TYPE)
        plm_edge = drv.pagelocked_empty((allocation_size, 2),
                                        dtype=CG_EDGE_DATA_TYPE)
        plm_dot_product = drv.pagelocked_empty(allocation_size,
                                               dtype=CG_DOT_PRODUCT_DATA_TYPE)
        plm_overflowed = drv.pagelocked_empty(1, dtype=CG_OVERFLOWED_DATA_TYPE)

        # allocate device memory
        # input
        dvp_precursor_mass = drv.mem_alloc_like(plm_precursor_mass)
        dvp_mz = drv.mem_alloc_like(plm_mz)
        dvp_intensity = drv.mem_alloc_like(plm_intensity)
        dvp_block_dimensions = drv.mem_alloc_like(plm_block_dimensions)
        dvp_offset = drv.mem_alloc_like(plm_offset)
        dvp_allocation_size = drv.mem_alloc_like(plm_allocation_size)
        # output
        dvp_counter = drv.mem_alloc_like(plm_counter)
        dvp_edge = drv.mem_alloc_like(plm_edge)
        dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
        dvp_overflowed = drv.mem_alloc_like(plm_overflowed)

        with log_lock:
            logging.debug(
                'Clustering subprocess {} thread {}: Start iterating dispatcher.'
                .format(pid, tid))
        previous_row_id = -1
        dispatcher.connect(pid, tid)

        # iterate dispatcher to get blocks
        for row_id, column_id, block in dispatcher.iterate(pid, tid):
            if exit_signal.value:
                with log_lock:
                    logging.debug(
                        'Subprocess {} thread {}: Received exit signal, exits now.'
                        .format(pid, tid))
                break

            try:
                y_range, x_range = block
                block_height = y_range[1] - y_range[0]
                block_width = x_range[1] - x_range[0]
                if row_id != previous_row_id:
                    with log_lock:
                        logging.debug(
                            '\033[92mSubprocess {} thread {}: Processing row {} (y:{}->{}).\033[0m'
                            .format(pid, tid, row_id, *y_range))
                    previous_row_id = row_id

                # get necessary data
                plm_precursor_mass[:
                                   block_height] = ranked_spectra.precursor_mass[
                                       y_range[0]:y_range[1]]
                plm_precursor_mass[
                    block_height:block_height +
                    block_width] = ranked_spectra.precursor_mass[
                        x_range[0]:x_range[1]]
                plm_mz[:block_height] = ranked_spectra.mz[
                    y_range[0]:y_range[1]]
                plm_mz[block_height:block_height +
                       block_width] = ranked_spectra.mz[x_range[0]:x_range[1]]
                plm_intensity[:block_height] = ranked_spectra.intensity[
                    y_range[0]:y_range[1]]
                plm_intensity[block_height:block_height +
                              block_width] = ranked_spectra.intensity[
                                  x_range[0]:x_range[1]]
                plm_block_dimensions[:] = (block_height, block_width)
                plm_offset[:] = (y_range[0], x_range[0])
                # upload data
                drv.memcpy_htod_async(dvp_precursor_mass, plm_precursor_mass,
                                      cuda_stream)
                drv.memcpy_htod_async(dvp_mz, plm_mz, cuda_stream)
                drv.memcpy_htod_async(dvp_intensity, plm_intensity,
                                      cuda_stream)
                drv.memcpy_htod_async(dvp_block_dimensions,
                                      plm_block_dimensions, cuda_stream)
                drv.memcpy_htod_async(dvp_offset, plm_offset, cuda_stream)

                if reallocated:
                    allocation_size_divisor = allocation_size_initial_divisor
                    allocation_size = int(ref_block_height * ref_block_width /
                                          allocation_size_divisor)
                    # reallocate host pagelocked memory
                    del plm_edge
                    del plm_dot_product
                    plm_edge = drv.pagelocked_empty((allocation_size, 2),
                                                    dtype=CG_EDGE_DATA_TYPE)
                    plm_dot_product = drv.pagelocked_empty(
                        allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE)
                    # reallocate device memory
                    del dvp_edge
                    del dvp_dot_product
                    dvp_edge = drv.mem_alloc_like(plm_edge)
                    dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
                    with log_lock:
                        logging.debug(
                            '\033[92mSubprocess {} thread {}: Reset memory allocation size divisor to {}.\033[0m'
                            .format(pid, tid, allocation_size_divisor))
                    reallocated = False

                cublockdim = (cuda_block_dimensions[1],
                              cuda_block_dimensions[0], 1)
                cugriddim = (math.ceil(block_width / cuda_block_dimensions[1]),
                             math.ceil(block_height /
                                       cuda_block_dimensions[0]))
                while True:
                    plm_allocation_size[0] = allocation_size
                    plm_counter[0] = 0
                    plm_overflowed[0] = False
                    drv.memcpy_htod_async(dvp_allocation_size,
                                          plm_allocation_size, cuda_stream)
                    drv.memcpy_htod_async(dvp_counter, plm_counter,
                                          cuda_stream)
                    drv.memcpy_htod_async(dvp_overflowed, plm_overflowed,
                                          cuda_stream)

                    cuda_kernel.prepared_async_call(
                        cugriddim, cublockdim, cuda_stream, dvp_precursor_mass,
                        dvp_mz, dvp_intensity, dvp_block_dimensions,
                        dvp_offset, dvp_allocation_size, dvp_counter, dvp_edge,
                        dvp_dot_product, dvp_overflowed)

                    # transfer computation result from device to host
                    drv.memcpy_dtoh_async(plm_edge, dvp_edge, cuda_stream)
                    drv.memcpy_dtoh_async(plm_counter, dvp_counter,
                                          cuda_stream)
                    drv.memcpy_dtoh_async(plm_overflowed, dvp_overflowed,
                                          cuda_stream)
                    drv.memcpy_dtoh_async(plm_dot_product, dvp_dot_product,
                                          cuda_stream)
                    cuda_stream.synchronize()

                    if plm_overflowed[0]:
                        allocation_size_divisor = int(allocation_size_divisor /
                                                      2)
                        if allocation_size_divisor < 1:
                            err_msg = (
                                '\nSubprocess {} thread {}: Allocation size divisor reached to the impossible value of {}.'
                                .format(pid, tid, allocation_size_divisor))
                            with log_lock:
                                logging.error(err_msg)
                            raise Exception(err_msg)
                        with log_lock:
                            logging.debug(
                                '\033[92mSubprocess {} thread {}: Edge list overflowed, '
                                'decreases allocation size divisor to {}.\033[0m'
                                .format(pid, tid, allocation_size_divisor))
                        allocation_size = int(block_width * block_height /
                                              allocation_size_divisor)
                        # reallocate host pagelocked memory
                        del plm_edge
                        del plm_dot_product
                        plm_edge = drv.pagelocked_empty(
                            (allocation_size, 2), dtype=CG_EDGE_DATA_TYPE)
                        plm_dot_product = drv.pagelocked_empty(
                            allocation_size, dtype=CG_DOT_PRODUCT_DATA_TYPE)
                        # reallocate device memory
                        del dvp_edge
                        del dvp_dot_product
                        dvp_edge = drv.mem_alloc_like(plm_edge)
                        dvp_dot_product = drv.mem_alloc_like(plm_dot_product)
                        reallocated = True
                        continue
                    else:
                        break

                if abs(plm_precursor_mass[block_height - 1] -
                       plm_precursor_mass[block_height + block_width -
                                          1]) > precursor_tolerance:
                    dispatcher.next_row(pid, tid)
                with merge_lock:
                    edge_list_size = int(plm_counter[0])
                    if edge_list_size != 0:
                        total_edge_count.value += edge_list_size
                        edg = np.memmap(str(edg_path),
                                        dtype=CG_EDGE_DATA_TYPE,
                                        mode='r+',
                                        shape=(total_edge_count.value, 2))
                        dps = np.memmap(str(dps_path),
                                        dtype=CG_DOT_PRODUCT_DATA_TYPE,
                                        mode='r+',
                                        shape=total_edge_count.value)
                        edg[-edge_list_size:] = plm_edge[:edge_list_size]
                        dps[-edge_list_size:] = plm_dot_product[:
                                                                edge_list_size]

            except Exception:
                err_msg = '\nSubprocess {} thread {}: Failed to clustering block (y:{}->{}, x:{}->{}).' \
                    .format(pid, tid, y_range[0], y_range[1], x_range[0], x_range[1])
                with log_lock:
                    logging.error(err_msg)
                raise

        with log_lock:
            if not exit_signal.value:
                logging.debug(
                    'Subprocess {} thread {}: Reached the end of iteration, work done.'
                    .format(pid, tid))
        cuda_context.pop()

    except (Exception, KeyboardInterrupt) as e:
        if type(e) is KeyboardInterrupt:
            with log_lock:
                logging.debug(
                    'Subprocess {} thread {}: Received KeyboardInterrupt, exits now.'
                    .format(pid, tid))
        else:
            with log_lock:
                logging.exception(
                    '\nSubprocess {} thread {}: Ended unexpectedly. Logging traceback:\n'
                    '==========TRACEBACK==========\n'.format(pid, tid))
            exit_signal.value = True
            exit_state.value = 1
        cuda_context.pop()
    return
Ejemplo n.º 27
0
""")

# Kernel Function Declaration
createKernel = mod.get_function("createKernel")
gaussianBlur = mod.get_function("gaussianBlur")
sobelFilter = mod.get_function("sobelFilter")
nonMaxSuppress = mod.get_function("nonMaxSuppress")
threshold = mod.get_function("threshold")

# Gaussian Filter Create
size = numpy.uint8(5)
sig = numpy.int32(2)
s = numpy.int32(0)
k = numpy.zeros(size * size, dtype=numpy.float32)
kernel = cuda.mem_alloc_like(k)
createKernel(kernel,
             size,
             sig,
             grid=(1, 1),
             block=(int(size), 1, 1),
             shared=int(size * size))

# Gaussian Blur
colorImg = cv2.imread('Original.jpg')
img = cv2.cvtColor(colorImg, cv2.COLOR_BGR2GRAY)
blur = numpy.zeros(img.shape, dtype=numpy.uint8)
width, height = img.shape
i = copy.deepcopy(img)
d_i = cuda.mem_alloc_like(i)
d_res = cuda.mem_alloc_like(i)