Example #1
def CFPLF_Hebbian_Sparse_GPU(projection):
    """
    Sparse CF Projection learning function applying Hebbian learning
    to the weights in a projection.
    """
    single_conn_lr = projection.learning_rate / projection.n_units
    # Transferring source and destination activities:
    src_activity_gpu = gpuarray.to_gpu_async(
        np.ravel(projection.src.activity).astype(np.float32))
    dest_activity_gpu = gpuarray.to_gpu_async(
        np.ravel(projection.dest.activity).astype(np.float32))

    # Computing Hebbian learning weights:
    projection.hebbian_kernel(single_conn_lr,
                              projection.nzrows_gpu,
                              projection.nzcols_gpu,
                              src_activity_gpu,
                              dest_activity_gpu,
                              projection.weights_gpu.Val,
                              range=slice(0, projection.nzcount, 1))

    # Normalisation values:
    projection.weights_gpu.mv(projection.norm_ones_gpu,
                              y=projection.norm_total_gpu,
                              autosync=False)
    projection.has_norm_total = True
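The kernel above applies an elementwise Hebbian update to the nonzero connections only. For reference, a minimal dense NumPy sketch of the same rule (the function and argument names here are illustrative, not part of the projection API):

import numpy as np

def hebbian_update_dense(weights, src_activity, dest_activity, lr):
    # w_ij += lr * dest_i * src_j -- the dense equivalent of what
    # hebbian_kernel computes for the sparse nonzero entries only.
    weights += lr * np.outer(dest_activity, src_activity)
    return weights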
Example #2
def cuda_sobel(m_wi):
	m_sobel = np.zeros_like(m_wi, np.float32)
	
	# Transfer image asynchronously.
	cu_m_wi = gpu.to_gpu_async(m_wi)
	cu_m_sobel = gpu.to_gpu_async(m_sobel)

	# Get block/grid size for steps 1-3.
	height, width = m_wi.shape

	block_hw = 32
	block_size = (block_hw, block_hw, 1)

	filter_radius = 1
	tile_size = block_hw - filter_radius * 2

	grid_width  = (width  + tile_size - 1) // tile_size
	grid_height = (height + tile_size - 1) // tile_size
	grid_size = (grid_width, grid_height)
	
	width	= np.int32(width)
	height = np.int32(height)

	kernel_sobel(cu_m_wi, cu_m_sobel, width, height, block=block_size, grid=grid_size)
	
	m_sobel = cu_m_sobel.get()
	return m_sobel
Example #3
def cuda_sobel(m_wi):
    m_sobel = np.zeros_like(m_wi, np.float32)

    # Transfer image asynchronously.
    cu_m_wi = gpu.to_gpu_async(m_wi)
    cu_m_sobel = gpu.to_gpu_async(m_sobel)

    # Get block/grid size for steps 1-3.
    height, width = m_wi.shape

    block_hw = 32
    block_size = (block_hw, block_hw, 1)

    filter_radius = 1
    tile_size = block_hw - filter_radius * 2

    grid_width = (width + tile_size - 1) // tile_size
    grid_height = (height + tile_size - 1) // tile_size
    grid_size = (grid_width, grid_height)

    width = np.int32(width)
    height = np.int32(height)

    kernel_sobel(cu_m_wi,
                 cu_m_sobel,
                 width,
                 height,
                 block=block_size,
                 grid=grid_size)

    m_sobel = cu_m_sobel.get()
    return m_sobel
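Both versions size the grid with the (n + tile_size - 1) / tile_size idiom, i.e. ceiling division, so enough tiles (shrunk by the filter halo) are launched to cover the whole image. A small helper making the intent explicit, assuming integer inputs:

def ceil_div(n, tile):
    # Smallest number of tiles of width `tile` needed to cover `n` pixels.
    return (n + tile - 1) // tile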
Example #4
    def __init__(self, Y, verbose=False):
        """Initialize instance."""
        self._verbose = verbose
        self._Y_gpu = gpuarray.to_gpu_async(Y)
        self._Y_gpu_scratch = gpuarray.to_gpu_async(Y)
        self._featuredim = Y.shape[0]
        self._framecount = Y.shape[1]
Example #5
def fft_stitch(N, plan2d, plan1d, hostarr, largebox_d):
	w = hostarr.shape[0]
	META_GRID_SIZE = w/N
	fftbatch = 2
	for meta_z in xrange(META_GRID_SIZE): #fft along x
		largebox_d = gpuarray.to_gpu_async(hostarr[:, :, meta_z*N:(meta_z+1)*N].transpose(1,2,0))
		#print largebox_d.shape
		plan1d.execute(largebox_d, batch=fftbatch, inverse=True)
		hostarr[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.real.get_async().transpose(2,0,1)
	for meta_x in xrange(META_GRID_SIZE): #fft along y, z
		largebox_d = gpuarray.to_gpu_async(hostarr[meta_x*N:(meta_x+1)*N, :, :].copy())
		plan2d.execute(largebox_d, batch=fftbatch, inverse=True)
		hostarr[meta_x*N:(meta_x+1)*N, :, :] = largebox_d.real.get_async()
	return hostarr
Example #6
    def __init__(self,
                 num_inputs=None,
                 num_outputs=None,
                 weights=None,
                 b=None,
                 stream=None,
                 relu=False,
                 sigmoid=False,
                 delta=None):
        self.stream = stream

        if delta is None:
            self.delta = np.float32(0.001)
        else:
            self.delta = np.float32(delta)

        if weights is None:
            weights = (np.random.rand(num_outputs, num_inputs) - 0.5)
            self.num_inputs = np.int32(num_inputs)
            self.num_outputs = np.int32(num_outputs)

        if type(weights) != pycuda.gpuarray.GPUArray:
            self.weights = gpuarray.to_gpu_async(np.array(weights,
                                                          dtype=np.float32),
                                                 stream=self.stream)
        else:
            self.weights = weights

        if num_inputs is None or num_outputs is None:
            self.num_inputs = np.int32(self.weights.shape[1])
            self.num_outputs = np.int32(self.weights.shape[0])
        else:
            self.num_inputs = np.int32(num_inputs)
            self.num_outputs = np.int32(num_outputs)

        if b is None:
            b = gpuarray.zeros((self.num_outputs, ), dtype=np.float32)

        if type(b) != pycuda.gpuarray.GPUArray:
            self.b = gpuarray.to_gpu_async(np.array(b, dtype=np.float32),
                                           stream=self.stream)
        else:
            self.b = b

        self.relu = np.int32(relu)
        self.sigmoid = np.int32(sigmoid)

        self.block = (32, 1, 1)
        self.grid = (int(np.ceil(self.num_outputs / 32)), 1, 1)
Example #7
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):

    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
    imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64)    
    
    # copy complex lattice to the GPU
    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)

    # synchronize in current context
    pycuda.autoinit.context.synchronize()

    # allocate an empty array on the GPU
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32)

    mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound))

    pycuda.autoinit.context.synchronize()
              
    mandelbrot_graph = mandelbrot_graph_gpu.get_async()
    
    pycuda.autoinit.context.synchronize()

    return mandelbrot_graph
Example #8
def gpu_r2c_fft(in1, is_gpuarray=False, store_on_gpu=False):
    """
    This function makes use of the scikits implementation of the FFT for GPUs to take the real to complex FFT.

    INPUTS:
    in1             (no default):       The array on which the FFT is to be performed.
    is_gpuarray     (default=False):    Boolean specifier for whether or not input is on the gpu.
    store_on_gpu    (default=False):    Boolean specifier for whether the result is to be left on the gpu or not.

    OUTPUTS:
    gpu_out1                            The gpu array containing the result.
    OR
    gpu_out1.get()                      The result from the gpu array.
    """

    if is_gpuarray:
        gpu_in1 = in1
    else:
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32))

    output_size = np.array(in1.shape)
    output_size[1] = output_size[1]//2 + 1

    gpu_out1 = gpuarray.empty([output_size[0], output_size[1]], np.complex64)
    gpu_plan = Plan(gpu_in1.shape, np.float32, np.complex64)
    fft(gpu_in1, gpu_out1, gpu_plan)

    if store_on_gpu:
        return gpu_out1
    else:
        return gpu_out1.get()
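The output_size[1] = output_size[1]//2 + 1 line reflects the usual real-to-complex FFT packing: only the non-redundant half of the spectrum along the last transformed axis is stored. A quick NumPy cross-check of that shape, assuming a 2-D input:

import numpy as np

a = np.random.rand(8, 16).astype(np.float32)
out = np.fft.rfft2(a)  # real-to-complex FFT
assert out.shape == (8, 16 // 2 + 1)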
Example #9
def CFPLF_Hebbian_Sparse_GPU(projection):
    """
    Sparse CF Projection learning function applying Hebbian learning
    to the weights in a projection.
    """
    single_conn_lr = projection.learning_rate/projection.n_units
    # Transferring source and destination activities:
    src_activity_gpu = gpuarray.to_gpu_async(np.ravel(projection.src.activity).astype(np.float32))
    dest_activity_gpu = gpuarray.to_gpu_async(np.ravel(projection.dest.activity).astype(np.float32))

    # Computing Hebbian learning weights:
    projection.hebbian_kernel(single_conn_lr, projection.nzrows_gpu, projection.nzcols_gpu, src_activity_gpu, dest_activity_gpu, projection.weights_gpu.Val, range=slice(0, projection.nzcount, 1))

    # Normalisation values:
    projection.weights_gpu.mv(projection.norm_ones_gpu, y=projection.norm_total_gpu, autosync=False)
    projection.has_norm_total = True
Example #10
def batch_get_sol_params(x_nd, K_nn, bend_coefs, rot_coef=np.r_[1e-4, 1e-4, 1e-1]):
    n, d = x_nd.shape

    x_gpu = gpuarray.to_gpu(x_nd)

    H_arr_gpu = []
    for b in bend_coefs:
        cur_offset = np.zeros((1 + d + n, 1 + d + n), np.float32)
        cur_offset[d+1:, d+1:] = b * K_nn
        cur_offset[1:d+1, 1:d+1] = np.diag(rot_coef)
        H_arr_gpu.append(gpuarray.to_gpu(cur_offset))
    H_ptr_gpu = get_gpu_ptrs(H_arr_gpu)

    A = np.r_[np.zeros((d+1,d+1)), np.c_[np.ones((n,1)), x_nd]].T
    n_cnts = A.shape[0]
    _u,_s,_vh = np.linalg.svd(A.T)
    N = _u[:,n_cnts:]
    F = np.zeros((n + d + 1, d), np.float32)
    F[1:d+1, :d] -= np.diag(rot_coef)
    
    Q = np.c_[np.ones((n,1)), x_nd, K_nn].astype(np.float32)
    F = F.astype(np.float32)
    N = N.astype(np.float32)

    Q_gpu     = gpuarray.to_gpu(Q)
    Q_arr_gpu = [Q_gpu for _ in range(len(bend_coefs))]
    Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu)

    F_gpu     = gpuarray.to_gpu(F)
    F_arr_gpu = [F_gpu for _ in range(len(bend_coefs))]
    F_ptr_gpu = get_gpu_ptrs(F_arr_gpu)

    N_gpu = gpuarray.to_gpu(N)
    N_arr_gpu = [N_gpu for _ in range(len(bend_coefs))]
    N_ptr_gpu = get_gpu_ptrs(N_arr_gpu)
    
    dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu,
                      Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu,
                      transa = 'T')
    # N'HN
    NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'),
                                           (H_arr_gpu, H_ptr_gpu, 'N'),
                                           (N_arr_gpu, N_ptr_gpu, 'N'))
    iH_arr = []
    for NHN in NHN_arr_gpu:
        iH_arr.append(scipy.linalg.inv(NHN.get()).copy())
    iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr]
    iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu)

    proj_mats   = m_dot_batch((N_arr_gpu,  N_ptr_gpu,   'N'),
                              (iH_arr_gpu, iH_ptr_gpu, 'N'),
                              (N_arr_gpu,  N_ptr_gpu,   'T'),
                              (Q_arr_gpu,  Q_ptr_gpu,   'T'))

    offset_mats = m_dot_batch((N_arr_gpu,  N_ptr_gpu,   'N'),
                              (iH_arr_gpu, iH_ptr_gpu, 'N'),
                              (N_arr_gpu,  N_ptr_gpu,   'T'),
                              (F_arr_gpu,  F_ptr_gpu,   'N'))

    return proj_mats, offset_mats
Example #11
def full_scan():
    # TODO: testing how slow a single full scan is with no parallelism
    sequential = SourceModule("""
        #include <stdio.h>
        __global__ void full_scan(unsigned char *img, int line[2])
        {
            int counter = 0;
            for(int y=0; y<853; y++) {
                for(int x=0; x<1918; x++) {
                    if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
                    && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                        counter++;
                        if(counter == 50) {
                            line[0] = x;
                            line[1] = y;
                            return;
                        }
                    } else { counter = 0; }
                }
            }

        }
        """)

    image = cv.imread("test images/crop2.png")
    seq = sequential.get_function("full_scan")
    image_gpu = gpuarray.to_gpu_async(image)
    line = np.array([0, 0])
    timer = time.clock()
    seq(image_gpu, cuda.InOut(line), block=(1, 1, 1))
    print(time.clock() - timer)
    print(line)
Example #12
def CFPRF_DotProduct_Sparse_GPU(projection):
    """
    Sparse CF Projection response function calculating the dot-product
    between incoming activities and CF weights. Uses GPU.
    """
    projection.input_buffer_pagelocked[:] = np.ravel(projection.input_buffer).astype(np.float32)  
    projection.input_buffer_gpu = gpuarray.to_gpu_async(projection.input_buffer_pagelocked, stream=projection.pycuda_stream)
    projection.weights_gpu.mv(projection.input_buffer_gpu, alpha=projection.strength, y=projection.activity_gpu_buffer, autosync=False, stream=projection.pycuda_stream)
    projection.activity_gpu_buffer.get_async(ary=projection.activity, stream=projection.pycuda_stream)
Example #13
    def eval_(self,
              x,
              y=None,
              batch_size=None,
              stream=None,
              delta=None,
              w_t=None,
              b_t=None):
        if stream is None:
            stream = self.stream

        if type(x) != pycuda.gpuarray.GPUArray:
            x = gpuarray.to_gpu_async(np.array(x, dtype=np.float32),
                                      stream=self.stream)

        if batch_size is None:
            if len(x.shape) == 2:
                batch_size = np.int32(x.shape[0])
            else:
                batch_size = np.int32(1)

        if delta is None:
            delta = self.delta

        delta = np.float32(delta)

        if w_t is None:
            w_t = np.int32(-1)

        if b_t is None:
            b_t = np.int32(-1)

        if y is None:
            if batch_size == 1:
                y = gpuarray.empty((self.num_outputs, ), dtype=np.float32)
            else:
                y = gpuarray.empty((batch_size, self.num_outputs),
                                   dtype=np.float32)

        eval_ker(self.num_outputs,
                 self.num_inputs,
                 self.relu,
                 self.sigmoid,
                 self.weights,
                 self.b,
                 x,
                 y,
                 np.int32(batch_size),
                 w_t,
                 b_t,
                 delta,
                 block=self.block,
                 grid=self.grid,
                 stream=stream)

        return y
Example #14
def magnitude(vec, vec2):
    #, fn = mod.get_function('magnitude')):
    #gpu_vec = drv.mem_alloc(vec.nbytes)
    #drv.memcpy_htod(gpu_vec, vec)

    #fn(gpu_vec, block=(512, 1, 1))

    #dest = drv.from_device_like(gpu_vec, vec)

    #print 'Dot product: ', dest[0]
    
    gpu_arry = gpuarr.to_gpu_async(vec)
    gpu_arry2 = gpuarr.to_gpu_async(vec2)
    mag = cumath.sqrt(gpuarr.dot(gpu_arry, gpu_arry, dtype=np.float32))
    mag2 = cumath.sqrt(gpuarr.dot(gpu_arry2, gpu_arry2, dtype=np.float32))

    # Normalised dot product: divide by the product of the two magnitudes.
    product = gpuarr.dot(gpu_arry, gpu_arry2, dtype=np.float32) / (mag * mag2)
    print product
    return product.get()
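A quick NumPy cross-check of the normalised dot product computed above, assuming 1-D float32 inputs:

import numpy as np

v = np.random.rand(4).astype(np.float32)
w = np.random.rand(4).astype(np.float32)
expected = np.dot(v, w) / (np.linalg.norm(v) * np.linalg.norm(w))
# `expected` should match what magnitude(v, w) returns.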
Example #15
def row_scan():
    # scan full rows each thread, multiple rows in parallel
    # TODO: this is bad for memory coalescing?

    # int bgr_pass = (img[x * 3 + y * 1918 * 3] <= 4) * (153 <= img[1 + x * 3 + y * 1918 * 3]) * (
    #           img[1 + x * 3 + y * 1918 * 3] <= 180) *
    # (196 <= img[2 + x * 3 + y * 1918 * 3]) * (img[2 + x * 3 + y * 1918 * 3] <= 210);
    # counter = counter * bgr_pass + bgr_pass;
    startscan = SourceModule("""
    #include <stdio.h>
    __global__ void line_scan(unsigned char *img, int *flag)
    {
        int y = threadIdx.y + blockIdx.y * blockDim.y;
        int counter = 0;
        if(y > 852) { return; }
        for(int x=0; x<1918; x++) {
            int pix_g = img[1 + x*3 + y*1918*3];
            int pix_r = img[2 + x*3 + y*1918*3];
            if((img[x*3 + y*1918*3] < 5) & (152 < pix_g) & (pix_g < 181)
            & (195 < pix_r) & (pix_r < 211)) {
                counter++;
                if(counter == 50) {
                    flag[0] = x;
                    flag[1] = y;
                }
            } else { counter = 0; }
        }
    }
    """)

    image = cv.imread("test images/crop2.png")
    scan = startscan.get_function("line_scan")
    timer = time.clock()
    flag = np.array([0, 0])
    image_gpu = gpuarray.to_gpu_async(image)
    flag_gpu = gpuarray.to_gpu_async(flag)

    scan(image_gpu, flag_gpu, block=(1, 32, 1), grid=(1, 27))
    print(flag_gpu.get())
    print(time.clock() - timer)
Example #16
    def _gpu_init(self, debug):
        self.dev_n = cuda.to_gpu_async(np.array([self.n]).astype(np.int32),
                                       stream=self.stream2)

        self.neighbors_index = cuda.to_gpu_async(self.num_neighbors,
                                                 stream=self.stream1)
        exclusiveScan(self.neighbors_index, stream=self.stream1)

        self.dev_num_neighbors = cuda.to_gpu_async(self.num_neighbors,
                                                   stream=self.stream2)

        self.dev_states = cuda.to_gpu_async(self.states, stream=self.stream2)
        self.dev_waypoints = cuda.to_gpu_async(self.waypoints,
                                               stream=self.stream2)

        self.dev_neighbors = cuda.to_gpu_async(self.neighbors,
                                               stream=self.stream2)

        self.dev_Gindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype)
        self.dev_xindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype)

        self.dev_xindicator_zeros = cuda.GPUArray(self.Vopen.shape,
                                                  self.Vopen.dtype)

        self.zero_val = np.zeros((), np.int32)

        self.dev_xindicator_zeros.fill(self.zero_val, stream=self.stream1)

        # self.stream1.synchronize()
        self.stream2.synchronize()
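exclusiveScan turns the per-node neighbour counts into start offsets into the flat neighbors array. What it computes, sketched with NumPy (the counts below are illustrative):

import numpy as np

num_neighbors = np.array([2, 0, 3, 1], dtype=np.int32)
index = np.concatenate(([0], np.cumsum(num_neighbors)[:-1]))
# index -> [0, 2, 2, 5]: where each node's neighbour list begins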
Example #17
    def __init__(self, array, dtype=None, allocator=mem_alloc, stream=None):

        self.dtype = array.dtype if dtype is None else dtype
        self.nnz = array.nnz
        self.shape = array.shape

        if self.nnz == 0:  # let's not waste time
            return

        if not sparse.isspmatrix_csr(array):
            array = sparse.csr_matrix(array, dtype=self.dtype)

        if not array.has_sorted_indices:
            array = array.sorted_indices()

        if stream is not None:
            self.data = gpuarray.to_gpu_async(array.data.astype(dtype=self.dtype), allocator=allocator, stream=stream)
            self.indptr = gpuarray.to_gpu_async(array.indptr, allocator=allocator, stream=stream)
            self.indices = gpuarray.to_gpu_async(array.indices, allocator=allocator, stream=stream)
        else:
            self.data = gpuarray.to_gpu(array.data.astype(dtype=self.dtype), allocator=allocator)
            self.indptr = gpuarray.to_gpu(array.indptr, allocator=allocator)
            self.indices = gpuarray.to_gpu(array.indices, allocator=allocator)
        self.descr = cusparse.cusparseCreateMatDescr()
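The three device arrays populated here are exactly SciPy's CSR buffers. A small sketch of what data, indices and indptr contain, assuming SciPy is available:

import numpy as np
from scipy import sparse

m = sparse.csr_matrix(np.array([[1., 0., 2.],
                                [0., 0., 3.]]))
print(m.data)     # nonzero values          -> [1. 2. 3.]
print(m.indices)  # column index per value  -> [0 2 2]
print(m.indptr)   # row start offsets       -> [0 2 3]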
Example #18
    def add_new_frame(self, x):
        x = x.flatten()
        assert len(x) == self._featuredim
        
        x_gpu = gpuarray.to_gpu_async(x)
        BLOCK_SIZE = (256,1,1)
        nblocks = int(np.ceil(float(self._featuredim) / BLOCK_SIZE[0]))
        GRID_SIZE = (nblocks, self._framecount, 1)

        cudabuffer.cyclebuffer(self._Y_gpu_scratch, x_gpu, self._Y_gpu,
                               np.int32(self._featuredim), np.int32(self._framecount),
                               block=BLOCK_SIZE, grid=GRID_SIZE)

        # Copy self._Y_gpu into self._Y_gpu_scratch
        cuda.memcpy_dtod_async(self._Y_gpu_scratch.gpudata, self._Y_gpu.gpudata, self._Y_gpu.nbytes)
Example #19
    def __init__(self, array, dtype=None, allocator=mem_alloc, stream=None):

        self.dtype = array.dtype if dtype is None else dtype
        self.nnz = array.nnz
        self.shape = array.shape

        if self.nnz == 0:  # let's not waste time
            return

        if not sparse.isspmatrix_csr(array):
            array = sparse.csr_matrix(array, dtype=self.dtype)

        if not array.has_sorted_indices:
            array = array.sorted_indices()

        if stream is not None:
            self.data = gpuarray.to_gpu_async(array.data.astype(dtype=self.dtype), allocator=allocator, stream=stream)
            self.indptr = gpuarray.to_gpu_async(array.indptr, allocator=allocator, stream=stream)
            self.indices = gpuarray.to_gpu_async(array.indices, allocator=allocator, stream=stream)
        else:
            self.data = gpuarray.to_gpu(array.data.astype(dtype=self.dtype), allocator=allocator)
            self.indptr = gpuarray.to_gpu(array.indptr, allocator=allocator)
            self.indices = gpuarray.to_gpu(array.indices, allocator=allocator)
        self.descr = cusparse.cusparseCreateMatDescr()
Example #20
def parrallel_scan():
    # Array stored in memory in linear fashion, row by row, col by col, pix/bgr ie r0c0pb, r0c0pg, r0c0pr, r0c1pb
    # We should parallelize blocks of ~284y 1x 1z, keep array[853] as a counter for each row; then we
    # scan across the image column by column, keeping a counter for each row
    image = cv.imread("test images/crop2.png")
    modtest = SourceModule("""
    #include <stdio.h>
    __global__ void line_scan(const unsigned char *img, int *counter)
    {
        //int x = blockIdx.x;
        //int y = threadIdx.y + blockIdx.y * blockDim.y;
        // transposed image height and width against x and y to get threads to run down then across columns of image
        int x = blockIdx.y;
        int y = 852 - (threadIdx.x + (blockIdx.x * 288));
        if(y < 0) { return; }
        // attempt at removing thread divergence, doesn't seem to make any difference in speed
        //int bgr_pass = (img[x*3 + y*1918*3] <= 4)*(153 <= img[1 + x*3 + y*1918*3])*(img[1 + x*3 + y*1918*3] <= 180)*
        //(196 <= img[2 + x*3 + y*1918*3])*(img[2 + x*3 + y*1918*3] <= 210);
        //counter[y] = counter[y]*bgr_pass + bgr_pass;
        if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180) 
        && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
            counter[y] += 1;
            if(counter[y] == 50) {
                counter[853] = x;
                counter[854] = y;
            }
        } else { counter[y] = 0; }
        __syncthreads();
    }
    """)
    func = modtest.get_function("line_scan")

    # TODO: added syncing of threads before they finish to try keep columns in sequence...not guaranteed to work?
    # blocks can still run out of order due to SMMs eg 50 line: last+1 line resets counter before last line is scanned
    test = np.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]],
                     [[12, 13, 14], [15, 16, 17], [18, 19, 20], [21, 22, 23]]])
    counter = np.array([0 for _ in range(855)])
    image_gpu = gpuarray.to_gpu_async(image)
    # counter_gpu = gpuarray.to_gpu_async(counter)
    # image_gpu = cuda.mem_alloc(image.nbytes)
    counter_gpu = cuda.mem_alloc(counter.nbytes)
    # cuda.memcpy_htod(image_gpu, image)
    cuda.memcpy_htod(counter_gpu, counter)
    timer = time.clock()
    func(image_gpu, counter_gpu, block=(288, 1, 1), grid=(3, 1918))
    print(time.clock() - timer)
    cuda.memcpy_dtoh(counter, counter_gpu)
    print(counter[-2:])
Example #21
def CFPRF_DotProduct_Sparse_GPU(projection):
    """
    Sparse CF Projection response function calculating the dot-product
    between incoming activities and CF weights. Uses GPU.
    """
    projection.input_buffer_pagelocked[:] = np.ravel(
        projection.input_buffer).astype(np.float32)
    projection.input_buffer_gpu = gpuarray.to_gpu_async(
        projection.input_buffer_pagelocked, stream=projection.pycuda_stream)
    projection.weights_gpu.mv(projection.input_buffer_gpu,
                              alpha=projection.strength,
                              y=projection.activity_gpu_buffer,
                              autosync=False,
                              stream=projection.pycuda_stream)
    projection.activity_gpu_buffer.get_async(ary=projection.activity,
                                             stream=projection.pycuda_stream)
Example #22
    def step_init(self, iter_parameters, debug):
        self.cost[self.start] = np.inf
        self.Vunexplored[self.start] = 1
        self.Vopen[self.start] = 0

        if self.start != iter_parameters['start'] and len(self.route) > 2:
            del self.route[-1]

        self.obstacles = iter_parameters['obstacles']
        self.num_obs = iter_parameters['num_obs']
        self.parent = np.full(self.n, -1).astype(np.int32)

        self.start = iter_parameters['start']
        self.goal = iter_parameters['goal']
        self.radius = iter_parameters['radius']
        self.threshold = np.array([iter_parameters['threshold']]).astype(np.float32)

        self.cost[self.start] = 0
        self.Vunexplored[self.start] = 0
        self.Vopen[self.start] = 1

        if debug:
            print('parents:', self.parent)
            print('cost: ', self.cost)
            print('Vunexplored: ', self.Vunexplored)
            print('Vopen: ', self.Vopen)

        self.dev_open = cuda.to_gpu_async(self.Vopen, stream=self.stream2)

        self.dev_threshold = cuda.to_gpu_async(self.threshold,
                                               stream=self.stream1)

        self.dev_radius = cuda.to_gpu_async(
            np.array([self.radius]).astype(np.float32), stream=self.stream2)
        self.dev_obstacles = cuda.to_gpu_async(self.obstacles,
                                               stream=self.stream2)
        self.dev_num_obs = cuda.to_gpu_async(self.num_obs, stream=self.stream2)

        self.dev_parent = cuda.to_gpu_async(self.parent, stream=self.stream2)
        self.dev_cost = cuda.to_gpu_async(self.cost, stream=self.stream1)

        self.dev_unexplored = cuda.to_gpu_async(self.Vunexplored,
                                                stream=self.stream1)
Example #23
def to_gpu_async(array, stream=None):
    """Copies the given CPU array asynchronously to the current device.

    Args:
        array: Array to be sent to GPU. If it is :class:`~numpy.ndarray`, then
            its memory must be pagelocked.
        stream (~pycuda.driver.Stream): CUDA stream.

    Returns:
        ~pycuda.gpuarray.GPUArray: Array on GPU.

        If given ``array`` is already on GPU, then this function just returns
        ``array`` without performing any copy.

    """
    if isinstance(array, GPUArray):
        return array
    return gpuarray.to_gpu_async(array, allocator=mem_alloc, stream=stream)
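As the docstring stresses, the host array must be page-locked for the copy to be truly asynchronous. A minimal usage sketch with pinned memory and an explicit stream, assuming a context initialised via pycuda.autoinit:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray

stream = cuda.Stream()
host = cuda.pagelocked_empty((1024,), dtype=np.float32)  # pinned host buffer
host[:] = np.arange(1024, dtype=np.float32)
dev = gpuarray.to_gpu_async(host, stream=stream)
stream.synchronize()  # ensure the copy has finished before using `dev`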
Example #24
def to_gpu_async(array, stream=None):
    """Copies the given CPU array asynchronously to the current device.

    Args:
        array: Array to be sent to GPU. If it is :class:`~numpy.ndarray`, then
            its memory must be pagelocked.
        stream (~pycuda.driver.Stream): CUDA stream.

    Returns:
        ~pycuda.gpuarray.GPUArray: Array on GPU.

        If given ``array`` is already on GPU, then this function just returns
        ``array`` without performing any copy.

    """
    if isinstance(array, GPUArray):
        return array
    return gpuarray.to_gpu_async(array, allocator=mem_alloc, stream=stream)
Example #25
def batch_sum(a_arr_gpu, a_ptr_gpu):
    """
    computes a sum of all of the arrays pointed to by a_arr_gpu and a_ptr_gpu
    """
    if len(a_arr_gpu[0].shape) != 1:
        n, m       = a_arr_gpu[0].shape
        total_size = n * m
        flat_a_gpu = [a.ravel() for a in a_arr_gpu]
    else:
        total_size = a_arr_gpu[0].shape[0]
        flat_a_gpu = a_arr_gpu

    ones_vec      = gpuarray.to_gpu_async(np.ones((total_size, 1), dtype=np.float32))
    ones_arr_gpu  = [ones_vec for i in range(len(a_arr_gpu))]
    ones_ptr_gpu  = get_gpu_ptrs(ones_arr_gpu)

    res_arr, res_ptrs = dot_batch(flat_a_gpu, ones_arr_gpu, a_ptr_gpu, ones_ptr_gpu)
    return [r.get()[0] for r in res_arr]
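The trick here is that a sum is just a dot product with a vector of ones, which maps the reduction onto the batched dot machinery. In NumPy terms:

import numpy as np

a = np.arange(6, dtype=np.float32)
assert a.sum() == np.dot(a, np.ones_like(a))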
Example #26
    def __init__(self, Xpoints, numMixtures):
        print "GPU Implementation Chosen"
        LikelihoodEvaluator.__init__(self, Xpoints, numMixtures)

        #Pycuda imports
        import pycuda.autoinit
        from pycuda import gpuarray
        from pycuda.compiler import SourceModule

        self.gpuarray = gpuarray

        with open("KernelV2.cu") as f:

            if self.numPoints >= 1024:
                mod = SourceModule(f.read().replace('512', '1024'))
                self.numThreads = 1024
            else:
                mod = SourceModule(f.read())
                self.numThreads = 512

        if self.numPoints > self.numThreads:
            self.numBlocks = self.numPoints / self.numThreads
            if self.numPoints % self.numThreads != 0: self.numBlocks += 1
        else:
            self.numBlocks = 1

        print "numBlocks: {}, numPoints: {}".format(self.numBlocks, self.numPoints)
        #Set the right number of threads and blocks given the datasize
        #Using a max of 1024 threads, fix correct blocksize


        self.likelihoodKernel = mod.get_function("likelihoodKernel")
        self.likelihoodKernel.prepare('PPPPiiiP')

        self.Xpoints = self.Xpoints.astype(np.float32)
        self.Xpoints = gpuarray.to_gpu_async(self.Xpoints)

        self.means_gpu = gpuarray.zeros(shape = (self.numMixtures, self.dim), dtype = np.float32)
        self.diagCovs_gpu = gpuarray.zeros(shape = (self.numMixtures, self.dim), dtype = np.float32)
        self.weights_gpu = gpuarray.zeros(shape = self.numMixtures, dtype = np.float32)

        self.llVal = gpuarray.zeros(shape = self.numBlocks,  dtype=np.float32)
Example #27
def batch_sum(a_arr_gpu, a_ptr_gpu):
    """
    computes a sum of all of the arrays pointed to by a_arr_gpu and a_ptr_gpu
    """
    if len(a_arr_gpu[0].shape) != 1:
        n, m = a_arr_gpu[0].shape
        total_size = n * m
        flat_a_gpu = [a.ravel() for a in a_arr_gpu]
    else:
        total_size = a_arr_gpu[0].shape[0]
        flat_a_gpu = a_arr_gpu

    ones_vec = gpuarray.to_gpu_async(np.ones((total_size, 1),
                                             dtype=np.float32))
    ones_arr_gpu = [ones_vec for i in range(len(a_arr_gpu))]
    ones_ptr_gpu = get_gpu_ptrs(ones_arr_gpu)

    res_arr, res_ptrs = dot_batch(flat_a_gpu, ones_arr_gpu, a_ptr_gpu,
                                  ones_ptr_gpu)
    return [r.get()[0] for r in res_arr]
Example #28
    def eval_(self, x, y=None, batch_size=None, stream=None):

        if stream is None:
            stream = self.stream

        if type(x) != pycuda.gpuarray.GPUArray:
            temp = np.array(x, dtype=np.float32)
            x = gpuarray.to_gpu_async(temp, stream=stream)

        if batch_size is None:
            if len(x.shape) == 2:
                batch_size = np.int32(x.shape[0])
            else:
                batch_size = np.int32(1)
        else:
            batch_size = np.int32(batch_size)

        if y is None:
            if batch_size == 1:
                y = gpuarray.empty((self.num, ), dtype=np.float32)
            else:
                y = gpuarray.empty((batch_size, self.num), dtype=np.float32)

        exp_ker(self.num,
                x,
                y,
                batch_size,
                block=(32, 1, 1),
                grid=(int(np.ceil(self.num / 32)), 1, 1),
                stream=stream)

        mean_ker(self.num,
                 y,
                 y,
                 batch_size,
                 block=(32, 1, 1),
                 grid=(int(np.ceil(batch_size / 32)), 1, 1),
                 stream=stream)

        return y
Example #29
    def __init__(self, Xpoints, numMixtures):
        super().__init__(Xpoints, numMixtures)

        import pycuda.autoinit  # must be run

        from pycuda import gpuarray
        from pycuda.compiler import SourceModule

        self.gpuarray = gpuarray
        self.numThreads, self.numBlocks = chooseGridThread(self.numPoints)
        with open(self.cuFile) as f:
            # have to use a replace here since we can't guarantee the format working
            # with the c syntax. A bit hacky sadly, but easier than mallocing
            mod = SourceModule(f.read().replace("MAX_THREADS",
                                                str(self.numThreads)))

        # print("numBlocks: {}, numPoints: {}".format(self.numBlocks, self.numPoints))
        # Set the right number of threads and blocks given the datasize

        self.likelihoodKernel = mod.get_function("likelihoodKernel")
        # this is telling PyCUDA what args I'll be passing through later
        # same syntax as struct module (think pack unpack)
        self.likelihoodKernel.prepare('PPPPiiiP')

        self.Xpoints = self.Xpoints.astype(np.float32)
        # dump the X in the GPU memory so we don't need to keep transferring
        # note we assume size(Xpoints) < GPU memory
        self.Xpoints = gpuarray.to_gpu_async(self.Xpoints)

        # we allocate memory for the parameters so this is only done on startup
        self.means_gpu = gpuarray.zeros(shape=(self.numMixtures, self.dim),
                                        dtype=np.float32)
        self.diagCovs_gpu = gpuarray.zeros(shape=(self.numMixtures, self.dim),
                                           dtype=np.float32)
        self.weights_gpu = gpuarray.zeros(shape=self.numMixtures,
                                          dtype=np.float32)

        # Allocate Memory for all our computations
        self.llVal = gpuarray.zeros(shape=self.numBlocks, dtype=np.float32)
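The replace("MAX_THREADS", ...) call is plain string templating of the CUDA source before compilation. A minimal sketch of the same trick with a made-up kernel (the FACTOR placeholder is illustrative):

kernel_template = """
__global__ void scale(float *x) { x[threadIdx.x] *= FACTOR; }
"""
kernel_source = kernel_template.replace("FACTOR", "2.0f")
# mod = SourceModule(kernel_source)  # then compile as in the examples above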
Example #30
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high,
                   max_iters, upper_bound):
    # we set up our complex lattice as such
    real_vals = np.matrix(np.linspace(real_low, real_high, width),
                          dtype=np.complex64)
    imag_vals = np.matrix(np.linspace(imag_high, imag_low, height),
                          dtype=np.complex64) * 1j
    mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(),
                                  dtype=np.complex64)

    # copy complex lattice to the GPU
    # We copy to the GPU asynchronously with to_gpu_async, and then
    # synchronize explicitly in the current context as follows:
    mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice)
    # We can access the current context object with pycuda.autoinit.context,
    # and we can synchronize in our current context by calling the pycuda.autoinit.context.synchronize() function.
    # synchronize in current context
    pycuda.autoinit.context.synchronize()

    # allocate an empty array on the GPU
    # allocates memory on the GPU with the gpuarray.empty function. Memory allocation in CUDA is,
    # by the nature of the GPU architecture, automatically synchronized; there is no asynchronous memory
    # allocation equivalent here.
    mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape,
                                          dtype=np.float32)

    mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu,
               np.int32(max_iters), np.float32(upper_bound))

    pycuda.autoinit.context.synchronize()

    mandelbrot_graph = mandelbrot_graph_gpu.get_async()

    pycuda.autoinit.context.synchronize()

    return mandelbrot_graph
Example #31
    def _gpu_init(self, debug):
        self.dev_n = cuda.to_gpu_async(np.array([self.n]).astype(np.int32),
                                       stream=self.stream2)

        self.neighbors_index = cuda.to_gpu_async(self.num_neighbors,
                                                 stream=self.stream1)
        exclusiveScan(self.neighbors_index, stream=self.stream1)

        self.dev_num_neighbors = cuda.to_gpu_async(self.num_neighbors,
                                                   stream=self.stream2)

        self.dev_states = cuda.to_gpu_async(self.states, stream=self.stream2)
        self.dev_waypoints = cuda.to_gpu_async(self.waypoints,
                                               stream=self.stream2)

        self.dev_neighbors = cuda.to_gpu_async(self.neighbors,
                                               stream=self.stream2)

        self.stream1.synchronize()
        self.stream2.synchronize()
Example #32
x_pin = [
    cuda.register_host_memory(x[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]
y_pin = [
    cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)

    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])

    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1,
                       y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])

cublas.cublasDestroy(h)

# Uncomment to check for errors in the calculation
#y_gpu = np.array([yg.get() for yg in y_gpu])
#y_gpu = np.array(y_gpu).reshape(y.shape)
#print np.allclose(y_gpu, a*x + y)

e.record()
e.synchronize()
Example #33
def gpu_source_extraction(in1, tolerance, store_on_gpu, neg_comp):
    """
    The following function determines connectivity within a given wavelet decomposition. These connected and labelled
    structures are thresholded to within some tolerance of the maximum coefficient at the scale. This determines
whether or not an object is to be considered significant. Significant objects are extracted and factored into
    a mask which is finally multiplied by the wavelet coefficients to return only wavelet coefficients belonging to
    significant objects across all scales. This GPU accelerated version speeds up the extraction process.

    INPUTS:
    in1         (no default):   Array containing the wavelet decomposition.
    tolerance   (no default):   Percentage of maximum coefficient at which objects are deemed significant.
    store_on_gpu(no default):   Boolean specifier for whether the decomposition is stored on the gpu or not.
    neg_comp    (no default):   Boolean specifier for whether negative coefficients count towards significance.

    OUTPUTS:
    objects*in1                 The wavelet coefficients of the significant structures.
    objects                     The mask of the significant structures - if store_on_gpu is True, returns a gpuarray.
    """

    # The following are pycuda kernels which are executed on the gpu. Specifically, these both perform thresholding
    # operations. The gpu is much faster at this on large arrays due to their massive parallel processing power.

    ker1 = SourceModule("""
                        __global__ void gpu_mask_kernel1(int *in1, int *in2)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int tid2 = i + j;

                            if (in1[tid2] == in2[0])
                                { in1[tid2] = -1; }
                        }
                       """, keep=True)

    ker2 = SourceModule("""
                        __global__ void gpu_mask_kernel2(int *in1)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int tid2 = i + j;

                            if (in1[tid2] >= 0)
                                { in1[tid2] = 0; }
                            else
                                { in1[tid2] = 1; }
                        }
                       """, keep=True)

    ker3 = SourceModule("""
                        __global__ void gpu_store_objects(int *in1, float *out1, int *scale)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int k = (blockDim.z * blockIdx.z + threadIdx.z)*(len*len);
                            const int tid2 = i + j;
                            const int tid3 = i + j + k;

                            if (blockIdx.z==scale[0])
                                { out1[tid3] = in1[tid2]; }
                        }
                       """, keep=True)

    # The following initialises some variables for storing the labelled image and the number of labels. The per scale
    # maxima are also initialised here.

    scale_maxima = np.empty([in1.shape[0],1], dtype=np.float32)
    objects = np.empty_like(in1, dtype=np.int32)
    object_count = np.empty([in1.shape[0],1], dtype=np.int32)

    # The following loop uses functionality from the ndimage module to assess connectivity. The maxima are also
    # calculated here.

    for i in range(in1.shape[0]):
        if neg_comp:
            scale_maxima[i] = np.max(abs(in1[i,:,:]))
        else:
            scale_maxima[i] = np.max(in1[i,:,:])
        objects[i,:,:], object_count[i] = ndimage.label(in1[i,:,:], structure=[[1,1,1],[1,1,1],[1,1,1]])

    # The following bind the pycuda kernels to the expressions on the left.

    gpu_mask_kernel1 = ker1.get_function("gpu_mask_kernel1")
    gpu_mask_kernel2 = ker2.get_function("gpu_mask_kernel2")

    # If store_on_gpu is True, the following handles some initialisation.

    if store_on_gpu:
        gpu_store_objects = ker3.get_function("gpu_store_objects")
        gpu_objects = gpuarray.empty(objects.shape, np.float32)
        gpu_idx = gpuarray.zeros([1], np.int32)
        gpu_idx += (objects.shape[0]-1)

    # The following removes the insignificant objects and then extracts the remaining ones.

    for i in range(-1,-in1.shape[0]-1,-1):

        condition = tolerance*scale_maxima[i]

        if neg_comp:
            if i==(-1):
                tmp = (abs(in1[i,:,:])>=condition)*objects[i,:,:]
            else:
                tmp = (abs(in1[i,:,:])>=condition)*objects[i,:,:]*objects[i+1,:,:]
        else:
            if i==(-1):
                tmp = (in1[i,:,:]>=condition)*objects[i,:,:]
            else:
                tmp = (in1[i,:,:]>=condition)*objects[i,:,:]*objects[i+1,:,:]

        labels = (np.unique(tmp[tmp>0])).astype(np.int32)

        gpu_objects_page = gpuarray.to_gpu_async(objects[i,:,:].astype(np.int32))

        for j in labels:
            label = gpuarray.to_gpu_async(np.array(j))
            gpu_mask_kernel1(gpu_objects_page, label, block=(32,32,1), grid=(in1.shape[1]//32, in1.shape[1]//32))

        gpu_mask_kernel2(gpu_objects_page, block=(32,32,1), grid=(in1.shape[1]//32, in1.shape[1]//32))

        objects[i,:,:] = gpu_objects_page.get()

        # In the event that all operations are to be done on the GPU, the following stores a version of the objects
        # on the GPU. A handle to the gpuarray is then returned.

        if store_on_gpu:
            gpu_store_objects(gpu_objects_page, gpu_objects, gpu_idx, block=(32,32,1), grid=(objects.shape[2]//32,
                                                                                             objects.shape[1]//32, objects.shape[0]))
            gpu_idx -= 1

    if store_on_gpu:
        return objects*in1, gpu_objects
    else:
        return objects*in1, objects
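The connectivity test relies on ndimage.label with a 3x3 structuring element of ones, i.e. 8-connectivity. A tiny illustration, assuming SciPy:

import numpy as np
from scipy import ndimage

img = np.array([[0, 1, 1, 0],
                [0, 0, 0, 1]])
labels, count = ndimage.label(img, structure=np.ones((3, 3)))
# 8-connectivity joins the diagonal pixel into one object: count == 1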
Example #34
def init_stitch(N):
	"""outputs the high resolution k-box, and the smoothed r box

	Input
	-----------
	N:  int32
		size of box to load onto the GPU, should be related to DIM by powers of 2

	"""
	if N is None:
		N = np.int32(HII_DIM) #prepare for stitching
	META_GRID_SIZE = DIM/N
	M = np.int32(HII_DIM/META_GRID_SIZE)
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN/DIM)
	print 'scale', scale
	HII_scale = np.float32(BOX_LEN/HII_DIM)
	shape = (DIM,DIM,N)
	stitch_grid_size = (DIM/(block_size[0]),
						DIM/(block_size[0]),
						N/(block_size[0]))
	HII_stitch_grid_size = (HII_DIM/(block_size[0]),
						HII_DIM/(block_size[0]),
						M/(block_size[0]))
	#ratio of large box to small size
	kernel_source = open(cmd_folder+"/initialize_stitch.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'DIM': DIM, 
		'VOLUME': VOLUME,
		'META_BLOCKDIM': N
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_stitch = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)
	plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64)
	plan1d = Plan((np.int64(DIM)), dtype=np.complex64)
	print "init pspec"
	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)
	#hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64')
	hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
	hbox_small = np.zeros(HII_shape, dtype=np.float32)
	#hbox_large = n
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)

	# Set up pinned memory for transfer
	#largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize())
	largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32)
	largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	print "init boxes"
	for meta_z in xrange(META_GRID_SIZE):
		# MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3)
		init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy()
	#if want to get velocity need to use this
	if True:
		print "saving kbox"
		np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)

	print "Executing FFT on device"
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print hbox_large.dtype
	print "Finished FFT on device"
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)
	
	if True:
		print "loading kbox"
		hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		#cuda.memcpy_htod_async(largebox_d, largebox_pin)
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size);
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async()
	#import IPython; IPython.embed()
	print "Executing FFT on host"
	#hbox_large = hifft(hbox_large).astype(np.complex64).real
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print "Finished FFT on host"
	#import IPython; IPython.embed()

	# for meta_x in xrange(META_GRID_SIZE):
	# 	for meta_y in xrange(META_GRID_SIZE):
	# 		for meta_z in xrange(META_GRID_SIZE):
	# 			largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N])
	# 			HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size);
	# 			hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get()
	#plan = Plan(shape, dtype=np.complex64)
	#plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)


	# This saves a large resolution deltax
	print "downsampling"
	smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32)
	for meta_z in xrange(META_GRID_SIZE):
		largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()
		cuda.memcpy_dtoh_async(largecbox_pin, largebox_d)
		#largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		largebox_d /= scale**3 #
		subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space
		hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async()
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small)
	#import IPython; IPython.embed()


	# To get velocities: reload the k-space box
	hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32)
	hvbox_small = np.zeros(HII_shape, dtype=np.float32)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64)
	smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		for meta_z in xrange(META_GRID_SIZE):
			largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
			velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size)
			HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size)
			print hvbox_large.shape, largevbox_d.shape
			hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async()
		hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real
		for meta_z in xrange(META_GRID_SIZE):
			largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size)
			hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async()
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get())

	return
Example #35
def allocate_globals(O):
    d_O = gpuarray.to_gpu_async(O)
    return d_O
Example #36
    def __init__(self, coordinate, mass, box_size, cutoff, eps, sig, dt=1):
        '''
        Constructor
        '''

        autoinit.context.set_cache_config(driver.func_cache.PREFER_L1)

        # Scalars
        self.dfloat = 'float32'
        self.dint = 'int32'
        self.iter = np.int32(0)
        self.kb = np.float32(8.6173324e-5)

        self.ions = np.int32(coordinate.shape[0])
        self.mass = np.float32(mass)
        self.cutoff = np.float32(cutoff)

        self.eps = np.float32(eps)
        self.sig = np.float32(sig)
        self.dt = np.float32(dt)

        self.threads = 256
        self.block = (self.threads, 1, 1)
        self.grid = (ceil(self.ions / self.threads), 1, 1)

        self.ion_type = ['Ar'] * self.ions

        # float3
        self.box_size = np.array(box_size).astype(self.dfloat)

        # 1D arrays
        self.potential_energy = gpuarray.zeros(self.ions, np.float32)
        self.kinetic_energy = gpuarray.zeros(self.ions, np.float32)

        # 3D arrays
        self.coordinate = gpuarray.to_gpu_async(coordinate.astype(self.dfloat))
        self.coordinate_sorted = gpuarray.to_gpu_async(
            coordinate.astype(self.dfloat))
        self.velocity = gpuarray.zeros_like(self.coordinate)
        self.velocity_sorted = gpuarray.zeros_like(self.coordinate)
        self.force = gpuarray.zeros_like(self.coordinate)

        # Timers
        self.start = driver.Event()
        self.end = driver.Event()
        self.timer = 0

        # System data
        self.system_pe = np.array([]).astype(self.dfloat)
        self.system_ke = np.array([]).astype(self.dfloat)

        # Create the bins
        self.create_bins()

        # Load kernels
        float3 = gpuarray.vec.float3
        int3 = gpuarray.vec.int3

        self.prefixsum = DoubleBuffer(self.bins, self.threads)
        self.fill_bins = self.load_function('lj_force.cu', 'fillBins',
                                            ('PPP', float3, int3, 'i'))
        self.counting_sort = self.load_function('lj_force.cu', 'countingSort',
                                                'PPPPPPi')
        self.lj_force = self.load_function(
            'lj_force.cu', 'ljForce', ('PPPPP', float3, float3, int3, 'fffi'))
        self.verlet_pre = self.load_function('lj_force.cu', 'verletPre',
                                             ('PPPPP', float3, 'ffi'))
        self.verlet_pos = self.load_function('lj_force.cu', 'verletPos',
                                             'PPPffi')

        return
Example #37
import skcuda.cublas as cublas
import skcuda

s = cuda.Event()
e = cuda.Event()
s.record()

nStreams = 8
stream = [cuda.Stream() for i in range(nStreams)]
N = 8192
print skcuda.misc.get_current_device()

x = [np.asarray(np.random.rand(N/nStreams), np.float32) for i in range(nStreams)]
#x_pin = cuda.register_host_memory(x)
#xf = np.fft.fft(x)
x_gpu = [gpuarray.to_gpu_async(x[i], stream=stream[i]) for i in range(nStreams)]

xf_gpu = [gpuarray.empty((N/nStreams)/2 + 1, np.complex64) for i in range(nStreams)]
plan = [Plan(x[0].shape, np.float32, np.complex64, stream=stream[i]) for i in range(nStreams)]
print skcuda.misc.get_current_device()
for i in range(nStreams):
	fft(x_gpu[i], xf_gpu[i], plan[i])
	print skcuda.misc.get_current_device()

x_pin = [xf_gpu[i].get_async(stream=stream[i]) for i in range(nStreams)]

#print np.allclose(xf[0:N/2 + 1], xf_gpu.get(), atol=1e-6)

e.record()
e.synchronize()
print s.time_till(e), "ms"
Example #38
def gpu_iuwt_recomposition(in1, scale_adjust, store_on_gpu, smoothed_array):
    """
    This function calls the a trous algorithm code to recompose the input into a single array. This is the
    implementation of the isotropic undecimated wavelet transform recomposition for a GPU.

    INPUTS:
    in1             (no default):   Array containing wavelet coefficients.
    scale_adjust    (no default):   Indicates the number of omitted array pages.
    store_on_gpu    (no default):   Boolean specifier for whether the decomposition is stored on the gpu or not.

    OUTPUTS:
    recomposition                   Array containing the reconstructed array.
    """

    wavelet_filter = (1./16)*np.array([1,4,6,4,1], dtype=np.float32)    # Filter-bank for use in the a trous algorithm.
    wavelet_filter = gpuarray.to_gpu_async(wavelet_filter)

    # Determines scale with adjustment and creates a zero array on the GPU to store the output,unless smoothed_array
    # is given.

    max_scale = in1.shape[0] + scale_adjust

    if smoothed_array is None:
        recomposition = gpuarray.zeros([in1.shape[1], in1.shape[2]], np.float32)
    else:
        recomposition = gpuarray.to_gpu(smoothed_array.astype(np.float32))

    # Determines whether the array is already on the GPU or not. If not, moves it to the GPU.

    try:
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32))
    except:
        gpu_in1 = in1

    # Creates a working array on the GPU.

    gpu_tmp = gpuarray.empty_like(recomposition)

    # Creates and fills an array with the appropriate scale value.

    gpu_scale = gpuarray.zeros([1], np.int32)
    gpu_scale += max_scale-1

    # Fetches the a trous kernels.

    gpu_a_trous_row_kernel, gpu_a_trous_col_kernel = gpu_a_trous()

    grid_rows = int(in1.shape[1]//32)
    grid_cols = int(in1.shape[2]//32)

    # The following loops call the a trous algorithm code to recompose the input. The first loop assumes that there are
    # non-zero wavelet coefficients at scales above scale_adjust, while the second loop completes the recomposition
    # on the scales less than scale_adjust.

    for i in range(max_scale-1, scale_adjust-1, -1):
        gpu_a_trous_row_kernel(recomposition, gpu_tmp, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

        gpu_a_trous_col_kernel(gpu_tmp, recomposition, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

        recomposition = recomposition[:,:] + gpu_in1[i-scale_adjust,:,:]

        gpu_scale -= 1

    if scale_adjust>0:
        for i in range(scale_adjust-1, -1, -1):
            gpu_a_trous_row_kernel(recomposition, gpu_tmp, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

            gpu_a_trous_col_kernel(gpu_tmp, recomposition, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

            gpu_scale -= 1

    # Return values depend on mode.

    if store_on_gpu:
        return recomposition
    else:
        return recomposition.get()
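A hypothetical round trip through this function and the decomposition shown in the next example (argument values are illustrative; assumes a 2D float32 image whose dimensions are multiples of 32):

coeffs, smooth = gpu_iuwt_decomposition(image, 4, 0, True, False)  # keep the smoothed plane
restored = gpu_iuwt_recomposition(coeffs, 0, False, smooth)        # approximately recovers image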
Example #39
def gpu_iuwt_decomposition(in1, scale_count, scale_adjust, store_smoothed, store_on_gpu):
    """
    This function calls the a trous algorithm code to decompose the input into its wavelet coefficients. This is
    the isotropic undecimated wavelet transform implemented for a GPU.

    INPUTS:
    in1                 (no default):   Array on which the decomposition is to be performed.
    scale_count         (no default):   Maximum scale to be considered.
    scale_adjust        (no default):   Adjustment to scale value if first scales are of no interest.
    store_smoothed      (no default):   Boolean specifier for whether the smoothed image is stored or not.
    store_on_gpu        (no default):   Boolean specifier for whether the decomposition is stored on the gpu or not.

    OUTPUTS:
    detail_coeffs                       Array containing the detail coefficients.
    C0                  (optional):     Array containing the smoothest version of the input.
    """

    # The following simple kernel just allows for the construction of a 3D decomposition on the GPU.

    ker = SourceModule("""
                        __global__ void gpu_store_detail_coeffs(float *in1, float *in2, float* out1, int *scale, int *adjust)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int k = (blockDim.z * blockIdx.z + threadIdx.z)*(len*len);
                            const int tid2 = i + j;
                            const int tid3 = i + j + k;

                            if ((blockIdx.z + adjust[0])==scale[0])
                                { out1[tid3] = in1[tid2] - in2[tid2]; }

                        }
                       """)

    wavelet_filter = (1./16)*np.array([1,4,6,4,1], dtype=np.float32)    # Filter-bank for use in the a trous algorithm.
    wavelet_filter = gpuarray.to_gpu_async(wavelet_filter)

    # Initialises an empty array to store the detail coefficients.

    detail_coeffs = gpuarray.empty([scale_count-scale_adjust, in1.shape[0], in1.shape[1]], np.float32)

    # Determines whether the array is already on the GPU or not. If not, moves it to the GPU.

    if isinstance(in1, gpuarray.GPUArray):
        gpu_in1 = in1
    else:
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32))

    # Sets up some working arrays on the GPU to prevent memory transfers.

    gpu_tmp = gpuarray.empty_like(gpu_in1)
    gpu_out1 = gpuarray.empty_like(gpu_in1)
    gpu_out2 = gpuarray.empty_like(gpu_in1)

    # Sets up some parameters required by the algorithm on the GPU.

    gpu_scale = gpuarray.zeros([1], np.int32)
    gpu_adjust = gpuarray.zeros([1], np.int32)
    gpu_adjust += scale_adjust

    # Fetches the a trous kernels and sets up the unique storing kernel.

    gpu_a_trous_row_kernel, gpu_a_trous_col_kernel = gpu_a_trous()
    gpu_store_detail_coeffs = ker.get_function("gpu_store_detail_coeffs")

    grid_rows = int(in1.shape[0]//32)
    grid_cols = int(in1.shape[1]//32)

    # The following loop, which iterates up to scale_adjust, applies the a trous algorithm to the scales which are
    # considered insignificant. This is important as each set of wavelet coefficients depends on the last smoothed
    # version of the input.

    if scale_adjust>0:
        for i in range(0, scale_adjust):
            gpu_a_trous_row_kernel(gpu_in1, gpu_tmp, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

            gpu_a_trous_col_kernel(gpu_tmp, gpu_out1, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

            gpu_in1, gpu_out1 = gpu_out1, gpu_in1
            gpu_scale += 1

    # The meat of the algorithm - two sequential applications of the a trous filter, followed by determination and
    # storing of the detail coefficients. C0 is reassigned the value of C on each loop - C0 is always the smoothest
    # version of the input image.

    for i in range(scale_adjust, scale_count):

        gpu_a_trous_row_kernel(gpu_in1, gpu_tmp, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

        gpu_a_trous_col_kernel(gpu_tmp, gpu_out1, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))           # Approximation coefficients.

        gpu_a_trous_row_kernel(gpu_out1, gpu_tmp, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))

        gpu_a_trous_col_kernel(gpu_tmp, gpu_out2, wavelet_filter, gpu_scale,
                                block=(32,32,1), grid=(grid_cols, grid_rows))           # Approximation coefficients.

        gpu_store_detail_coeffs(gpu_in1, gpu_out2, detail_coeffs, gpu_scale, gpu_adjust,
                                block=(32,32,1), grid=(grid_cols, grid_rows, int(scale_count))) # Detail coefficients.

        gpu_in1, gpu_out1 = gpu_out1, gpu_in1
        gpu_scale += 1

    # Return values depend on mode. NOTE: store_smoothed does not work if the result stays on the gpu.

    if store_on_gpu:
        return detail_coeffs
    elif store_smoothed:
        return detail_coeffs.get(), gpu_in1.get()
    else:
        return detail_coeffs.get()
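For reference, each row/column kernel pair above computes one separable a trous smoothing pass: convolution with the B3-spline filter dilated by 2**scale. A minimal NumPy sketch (the zero-padded boundary handling from np.convolve is an assumption and may differ from the GPU kernels):

def a_trous_pass(img, scale):
    base = (1./16)*np.array([1, 4, 6, 4, 1], dtype=np.float32)
    step = 2 ** scale
    filt = np.zeros(4 * step + 1, dtype=np.float32)  # step-1 zeros between taps
    filt[::step] = base
    rows = np.apply_along_axis(np.convolve, 1, img, filt, mode='same')
    return np.apply_along_axis(np.convolve, 0, rows, filt, mode='same')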
Example #40
		self.stream.synchronize()

	def fromDevice(self, buf, shape=None):
		res = buf.get()
		if shape is not None:
			res = res.reshape(shape)
		return res

	def toDevice(self, buf, shape=None, async=False, dest=None):
		if shape is not None:
			buf = buf.reshape(shape)

		if dest is None:
			if async:
				# FIXME: there must be a warning in docs that buf has to be pagelocked
				return gpuarray.to_gpu_async(buf, stream=self.stream)
			else:
				return gpuarray.to_gpu(buf)
		else:
			cuda.memcpy_htod_async(dest.gpudata, buf,
				stream=self.stream if async else None)

	def copyBuffer(self, buf, dest=None, src_offset=0, dest_offset=0, length=None):

		elem_size = buf.dtype.itemsize
		size = buf.nbytes if length is None else elem_size * length
		src_offset *= elem_size
		dest_offset *= elem_size

		if dest is None:
			ddest = self.allocate(buf.shape, buf.dtype)
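Expanding on the FIXME above: to_gpu_async only overlaps with other work when the host buffer is page-locked. A sketch using pycuda's pinned allocator (`env` is a hypothetical instance of this class):

buf = cuda.pagelocked_empty((1024,), np.float32)  # page-locked host buffer
buf[:] = np.arange(1024, dtype=np.float32)
gpu_buf = env.toDevice(buf, async=True)           # this transfer can now overlap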
Example #41
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from time import time

num_arrays = 200       # example sizes; assumed for this excerpt
array_len = 1024 ** 2
streams, data, data_gpu, gpu_out = [], [], [], []

# We can get a stream object from the pycuda.driver submodule with the Stream class
for _ in range(num_arrays):
    streams.append(drv.Stream())

# generate random arrays.
for _ in range(num_arrays):
    data.append(np.random.randn(array_len).astype('float32'))

t_start = time()

# copy arrays to GPU.
# switch to the asynchronous and stream-friendly version of this function,
# gpuarray.to_gpu_async, instead. (We must now also specify which stream
# each memory operation should use with the stream parameter)
for k in range(num_arrays):
    data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k]))

# process arrays.
# This is exactly as before, only we must specify what stream to use by using the stream parameter
for k in range(num_arrays):
    mult_ker(data_gpu[k],
             np.int32(array_len),
             block=(64, 1, 1),
             grid=(1, 1, 1),
             stream=streams[k])

# copy arrays from GPU.
# We can do this by switching the gpuarray get function to get_async, and again using the stream parameter
for k in range(num_arrays):
    gpu_out.append(data_gpu[k].get_async(stream=streams[k]))
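This excerpt assumes a compiled elementwise kernel named mult_ker. A minimal definition consistent with the call above (the kernel body is an assumption, just enough work to make the streams visible):

from pycuda.compiler import SourceModule

ker = SourceModule("""
__global__ void mult_ker(float *array, int array_len)
{
    int thd = blockIdx.x * blockDim.x + threadIdx.x;
    int num_iters = array_len / blockDim.x / gridDim.x;
    for (int j = 0; j < num_iters; j++)
    {
        int i = j * blockDim.x * gridDim.x + thd;
        array[i] *= 2.0f;  // toy elementwise work
    }
}
""")
mult_ker = ker.get_function("mult_ker")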
Example #42
def watershed(I):

  # Get contiguous image + shape.
  height, width = I.shape
  I = np.float32(I.copy())

  # Get block/grid size for steps 1-3 (each block computes an interior
  # tile with a one-pixel halo, hence the division by block width - 2).
  block_size =  (6,6,1)
  grid_size =   (width/(block_size[0]-2),
                height/(block_size[1]-2))

  # Get block/grid size for step 4.
  block_size2 = (16,16,1)
  grid_size2  = (width/(block_size2[0]-2),
                height/(block_size2[1]-2))

  # Initialize variables.
  labeled       = np.zeros([height,width]) 
  labeled       = np.float32(labeled)
  width         = np.int32(width)
  height        = np.int32(height)
  count         = np.int32([0])

  # Transfer labels asynchronously.
  labeled_d = gpu.to_gpu_async(labeled)
  counter_d = gpu.to_gpu_async(count)

  # Bind CUDA textures.
  I_cu = cu.matrix_to_array(I, order='C')
  cu.bind_array_to_texref(I_cu, image_texture)

  # Step 1.
  descent_kernel(labeled_d, width, height,
                 block=block_size, grid=grid_size)
  
  start_time = cu.Event()
  end_time = cu.Event()
  start_time.record()

  # Step 2.
  increment_kernel(labeled_d, width, height,
                   block=block_size2, grid=grid_size2)
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2

  while old != new:
    old = new
    minima_kernel(labeled_d, counters_d, width, height,
                  block=block_size, grid=grid_size)
    new = counters_d.get()[0]

  # Step 3.
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2
  while old != new:
    old = new
    plateau_kernel(labeled_d, counters_d, width, height,
                   block=block_size, grid=grid_size)
    new = counters_d.get()[0]
  
  # Step 4.
  counters_d = gpu.to_gpu(np.int32([0]))
  old, new = -1, -2
  while old != new:
    old = new
    flood_kernel(labeled_d, counters_d, width, height,
                 block=block_size2, grid=grid_size2)
    new = counters_d.get()[0]

  result = labeled_d.get()
  
  # End GPU timers.
  end_time.record()
  end_time.synchronize()
  gpu_time = start_time.time_till(end_time) * 1e-3

  # print str(gpu_time)

  return result
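The halo arithmetic above truncates, so images whose dimensions are not multiples of the tile size lose a border strip; a ceil-division variant (grid_for is a hypothetical helper) avoids that:

def grid_for(width, height, block_hw, halo=1):
    # Each block of block_hw x block_hw threads produces an interior tile of
    # (block_hw - 2*halo)**2 pixels; round the grid up so the edges are covered.
    tile = block_hw - 2 * halo
    return ((width + tile - 1) // tile, (height + tile - 1) // tile)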
Example #43
def gpu_source_extraction(in1, tolerance, store_on_gpu, neg_comp):
    """
    The following function determines connectivity within a given wavelet decomposition. These connected and labelled
    structures are thresholded to within some tolerance of the maximum coefficient at the scale. This determines
    whether or not an object is to be considered as significant. Significant objects are extracted and factored into
    a mask which is finally multiplied by the wavelet coefficients to return only wavelet coefficients belonging to
    significant objects across all scales. This GPU accelerated version speeds up the extraction process.

    INPUTS:
    in1         (no default):   Array containing the wavelet decomposition.
    tolerance   (no default):   Percentage of maximum coefficient at which objects are deemed significant.
    store_on_gpu(no default):   Boolean specifier for whether the decomposition is stored on the gpu or not.
    neg_comp    (no default):   Boolean specifier for whether negative coefficients are considered significant.

    OUTPUTS:
    objects*in1                 The wavelet coefficients of the significant structures.
    objects                     The mask of the significant structures - if store_on_gpu is True, returns a gpuarray.
    """

    # The following are pycuda kernels which are executed on the gpu. Specifically, these both perform thresholding
    # operations. The gpu is much faster at this on large arrays due to its massively parallel architecture.

    ker1 = SourceModule("""
                        __global__ void gpu_mask_kernel1(int *in1, int *in2)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int tid2 = i + j;

                            if (in1[tid2] == in2[0])
                                { in1[tid2] = -1; }
                        }
                       """, keep=True)

    ker2 = SourceModule("""
                        __global__ void gpu_mask_kernel2(int *in1)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int tid2 = i + j;

                            if (in1[tid2] >= 0)
                                { in1[tid2] = 0; }
                            else
                                { in1[tid2] = 1; }
                        }
                       """, keep=True)

    ker3 = SourceModule("""
                        __global__ void gpu_store_objects(int *in1, float *out1, int *scale)
                        {
                            const int len = gridDim.x*blockDim.x;
                            const int i = (blockDim.x * blockIdx.x + threadIdx.x);
                            const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len;
                            const int k = (blockDim.z * blockIdx.z + threadIdx.z)*(len*len);
                            const int tid2 = i + j;
                            const int tid3 = i + j + k;

                            if (blockIdx.z==scale[0])
                                { out1[tid3] = in1[tid2]; }
                        }
                       """, keep=True)

    # The following initialises some variables for storing the labelled image and the number of labels. The per scale
    # maxima are also initialised here.

    scale_maxima = np.empty([in1.shape[0],1], dtype=np.float32)
    objects = np.empty_like(in1, dtype=np.int32)
    object_count = np.empty([in1.shape[0],1], dtype=np.int32)

    # The following loop uses functionality from the ndimage module to assess connectivity. The maxima are also
    # calculated here.

    for i in range(in1.shape[0]):
        if neg_comp:
            scale_maxima[i] = np.max(abs(in1[i,:,:]))
        else:
            scale_maxima[i] = np.max(in1[i,:,:])
        objects[i,:,:], object_count[i] = ndimage.label(in1[i,:,:], structure=[[1,1,1],[1,1,1],[1,1,1]])

    # The following bind the pycuda kernels to the expressions on the left.

    gpu_mask_kernel1 = ker1.get_function("gpu_mask_kernel1")
    gpu_mask_kernel2 = ker2.get_function("gpu_mask_kernel2")

    # If store_on_gpu is True, the following handles some initialisation.

    if store_on_gpu:
        gpu_store_objects = ker3.get_function("gpu_store_objects")
        gpu_objects = gpuarray.empty(objects.shape, np.float32)
        gpu_idx = gpuarray.zeros([1], np.int32)
        gpu_idx += (objects.shape[0]-1)

    # The following removes the insignificant objects and then extracts the remaining ones.

    for i in range(-1,-in1.shape[0]-1,-1):

        condition = tolerance*scale_maxima[i]

        if neg_comp:
            if i==(-1):
                tmp = (abs(in1[i,:,:])>=condition)*objects[i,:,:]
            else:
                tmp = (abs(in1[i,:,:])>=condition)*objects[i,:,:]*objects[i+1,:,:]
        else:
            if i==(-1):
                tmp = (in1[i,:,:]>=condition)*objects[i,:,:]
            else:
                tmp = (in1[i,:,:]>=condition)*objects[i,:,:]*objects[i+1,:,:]

        labels = (np.unique(tmp[tmp>0])).astype(np.int32)

        gpu_objects_page = gpuarray.to_gpu_async(objects[i,:,:].astype(np.int32))

        for j in labels:
            label = gpuarray.to_gpu_async(np.array(j))
            gpu_mask_kernel1(gpu_objects_page, label, block=(32,32,1), grid=(in1.shape[2]//32, in1.shape[1]//32))

        gpu_mask_kernel2(gpu_objects_page, block=(32,32,1), grid=(in1.shape[2]//32, in1.shape[1]//32))

        objects[i,:,:] = gpu_objects_page.get()

        # In the event that all operations are to be done on the GPU, the following stores a version of the objects
        # on the GPU. A handle to the gpuarray is then returned.

        if store_on_gpu:
            gpu_store_objects(gpu_objects_page, gpu_objects, gpu_idx, block=(32,32,1), grid=(objects.shape[2]//32,
                                                                                             objects.shape[1]//32, objects.shape[0]))
            gpu_idx -= 1

    if store_on_gpu:
        return objects*in1, gpu_objects
    else:
        return objects*in1, objects
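For a single page, the two masking kernels together amount to a label-membership test; a CPU equivalent is a one-liner (a sketch, assuming NumPy >= 1.13 for np.isin):

page = objects[i, :, :]
mask = np.isin(page, labels).astype(np.int32)  # 1 where the label is significant, else 0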
Example #44
func(cuda.InOut(a_cpu), block=(4, 4, 1))

print a_cpu

#
# ---- prepared call -------------------------------------------
#
grid = (1, 1)
block = (4, 4, 1)
func.prepare("P")
func.prepared_call(grid, block, a_gpu)

#
# ---- GPUArray -------------------------------------------
#
b_gpu = gpuarray.to_gpu(np.random.randn(4, 4).astype(np.float32))
b_doubled = (2 * b_gpu).get()

print b_gpu
print b_doubled

#
# ---- GPUArray (async) ----------------------------------------
#
# NOTE: a genuinely asynchronous copy requires page-locked host memory;
# with a plain ndarray this transfer may not overlap with other work.
b_gpu = gpuarray.to_gpu_async(np.random.randn(100, 100).astype(np.float32))
b_doubled = (2 * b_gpu).get_async()

print "This is Async..........."
print b_gpu
print b_doubled
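A hedged sketch of the page-locked variant: register_host_memory pins an existing array in place so the async copy can genuinely overlap (assumes the pycuda.driver alias `cuda` from earlier in this example):

h = np.random.randn(100, 100).astype(np.float32)
h_pin = cuda.register_host_memory(h)  # pin the existing host array
b_gpu = gpuarray.to_gpu_async(h_pin)
b_doubled = (2 * b_gpu).get_async()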
Example #46
def test_batch_get_sol_params(f, bend_coefs, rot_coef, atol=1e-7, index=0):
    seg_info = f.items()[index][1]
    inv_group = seg_info['inv']
    ds_key = 'DS_SIZE_{}'.format(DS_SIZE)
    x_nd = inv_group[ds_key]['scaled_cloud_xyz'][:]
    K_nn = inv_group[ds_key]['scaled_K_nn'][:]

    n, d = x_nd.shape

    x_gpu = gpuarray.to_gpu(x_nd)

    H_arr_gpu = []
    for b in bend_coefs:
        cur_offset = np.zeros((1 + d + n, 1 + d + n), np.float64)
        cur_offset[d+1:, d+1:] = b * K_nn
        cur_offset[1:d+1, 1:d+1] = np.diag(rot_coef)
        H_arr_gpu.append(gpuarray.to_gpu(cur_offset))
    H_ptr_gpu = get_gpu_ptrs(H_arr_gpu)

    A = np.r_[np.zeros((d+1,d+1)), np.c_[np.ones((n,1)), x_nd]].T
    n_cnts = A.shape[0]
    _u,_s,_vh = np.linalg.svd(A.T)
    N = _u[:,n_cnts:]
    F = np.zeros((n + d + 1, d), np.float64)
    F[1:d+1, :d] += np.diag(rot_coef)
    
    Q = np.c_[np.ones((n,1)), x_nd, K_nn].astype(np.float64)
    F = F.astype(np.float64)
    N = N.astype(np.float64)

    Q_gpu     = gpuarray.to_gpu(Q)
    Q_arr_gpu = [Q_gpu for _ in range(len(bend_coefs))]
    Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu)

    F_gpu     = gpuarray.to_gpu(F)
    F_arr_gpu = [F_gpu for _ in range(len(bend_coefs))]
    F_ptr_gpu = get_gpu_ptrs(F_arr_gpu)

    N_gpu = gpuarray.to_gpu(N)
    N_arr_gpu = [N_gpu for _ in range(len(bend_coefs))]
    N_ptr_gpu = get_gpu_ptrs(N_arr_gpu)
    
    dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu,
                      Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu,
                      transa = 'T')
    QTQ = Q.T.dot(Q)
    H_list = []
    for i, bend_coef in enumerate(bend_coefs):
        H = QTQ.copy()   # copy, so the per-coefficient offsets don't accumulate
        H[d+1:,d+1:] += bend_coef * K_nn
        rot_coefs = np.ones(d) * rot_coef if np.isscalar(rot_coef) else rot_coef
        H[1:d+1, 1:d+1] += np.diag(rot_coefs)
        H_list.append(H)

    # N'HN
    NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'),
                                           (H_arr_gpu, H_ptr_gpu, 'N'),
                                           (N_arr_gpu, N_ptr_gpu, 'N'))

    NHN_list = [N.T.dot(H.dot(N)) for H in H_list]
    for i, NHN in enumerate(NHN_list):
        assert(np.allclose(NHN, NHN_arr_gpu[i].get(), atol=atol))

    iH_arr = []
    for NHN in NHN_arr_gpu:
        iH_arr.append(scipy.linalg.inv(NHN.get()).copy())

    h_inv_list = [scipy.linalg.inv(NHN) for NHN in NHN_list]
    assert(np.allclose(iH_arr, h_inv_list, atol=atol))

    iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr]
    iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu)

    proj_mats   = m_dot_batch((N_arr_gpu,  N_ptr_gpu,   'N'),
                              (iH_arr_gpu, iH_ptr_gpu, 'N'),
                              (N_arr_gpu,  N_ptr_gpu,   'T'),
                              (Q_arr_gpu,  Q_ptr_gpu,   'T'))
    
    proj_mats_list = [N.dot(h_inv.dot(N.T.dot(Q.T))) for h_inv in h_inv_list]
    assert(np.allclose(proj_mats_list, proj_mats[0][index].get(), atol=atol))

    offset_mats = m_dot_batch((N_arr_gpu,  N_ptr_gpu,   'N'),
                              (iH_arr_gpu, iH_ptr_gpu, 'N'),
                              (N_arr_gpu,  N_ptr_gpu,   'T'),
                              (F_arr_gpu,  F_ptr_gpu,   'N'))
    offset_mats_list = [N.dot(h_inv.dot(N.T.dot(F))) for h_inv in h_inv_list]
    assert(np.allclose(offset_mats_list, offset_mats[0][index].get(), atol=atol))
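This test leans on a get_gpu_ptrs helper that is not shown. A plausible sketch (an assumption, not necessarily the project's implementation) packs each array's device pointer into a GPU array, which is the layout batched cuBLAS routines expect:

def get_gpu_ptrs(arr_list):
    # One device pointer per matrix, stored on the GPU as an array of addresses.
    ptrs = np.array([int(a.gpudata) for a in arr_list], dtype=np.intp)
    return gpuarray.to_gpu(ptrs)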