Beispiel #1
0
    def _create_halo_arrays(self):

        # Allocate space for the halos: two per face,
        # one for sending and one for receiving.

        nz, ny, nx = self.local_dims
        sw = self.stencil_width
        # create two halo regions for each face, one holding
        # the halo values to send, and the other holding
        # the halo values to receive.

        self.left_recv_halo = gpuarray.empty([nz,ny,sw], dtype=np.float64)
        self.left_send_halo = self.left_recv_halo.copy()
        self.right_recv_halo = self.left_recv_halo.copy()
        self.right_send_halo = self.left_recv_halo.copy()
    
        self.bottom_recv_halo = gpuarray.empty([nz,sw,nx], dtype=np.float64)
        self.bottom_send_halo = self.bottom_recv_halo.copy()
        self.top_recv_halo = self.bottom_recv_halo.copy()
        self.top_send_halo = self.bottom_recv_halo.copy()

        self.back_recv_halo = gpuarray.empty([sw,ny,nx], dtype=np.float64)
        self.back_send_halo = self.back_recv_halo.copy()
        self.front_recv_halo = self.back_recv_halo.copy()
        self.front_send_halo = self.back_recv_halo.copy()
Beispiel #2
0
def generate_shifts_2d(width, height, n_samples, with_hot=False):
    x_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (width - 0.01)
    x_shifts = x_shifts.astype(np.uint32)

    y_shifts = gpu_rng.gen_uniform((n_samples,), np.float32) * (height - 0.01)
    y_shifts = y_shifts.astype(np.uint32)

    if with_hot:
        shifts_hot = gp.empty((width * height, n_samples), np.float32)
        threads_per_block = 32
        n_blocks = int(math.ceil(n_samples / threads_per_block))
        gpu_shift_to_hot_2d(x_shifts, y_shifts, shifts_hot,
                            np.uint32(shifts_hot.strides[0]/4),
                            np.uint32(shifts_hot.strides[1]/4),
                            np.uint32(width), np.uint32(height), np.uint32(n_samples),
                            block=(threads_per_block, 1, 1), grid=(n_blocks, 1))
        return x_shifts, y_shifts, shifts_hot
    else:
        shifts = gp.empty((2, n_samples), np.float32)
        threads_per_block = 32
        n_blocks = int(math.ceil(n_samples / threads_per_block))
        gpu_vstack(y_shifts, x_shifts, shifts,
                   np.uint32(shifts.strides[0]/4), np.uint32(shifts.strides[1]/4),
                   np.uint32(n_samples),
                   block=(threads_per_block, 1, 1), grid=(n_blocks, 1))
        return x_shifts, y_shifts, shifts
Beispiel #3
0
def _minmax_impl(a_gpu, axis, min_or_max, stream=None):
    ''' Returns both max and argmax (min/argmin) along an axis.'''
    assert len(a_gpu.shape) < 3
    if iscomplextype(a_gpu.dtype):
        raise ValueError("Cannot compute min/max of complex values")

    if axis is None:  ## Note: PyCUDA doesn't have an overall argmax/argmin!
        if min_or_max == 'max':
            return gpuarray.max(a_gpu).get()
        else:
            return gpuarray.min(a_gpu).get()
    else:
        if axis < 0:
            axis += 2
    assert axis in (0, 1)

    global _global_cublas_allocator
    alloc = _global_cublas_allocator

    n, m = a_gpu.shape if a_gpu.flags.c_contiguous else (a_gpu.shape[1], a_gpu.shape[0])
    col_kernel, row_kernel = _get_minmax_kernel(a_gpu.dtype, min_or_max)
    if (axis == 0 and a_gpu.flags.c_contiguous) or (axis == 1 and a_gpu.flags.f_contiguous):
        target = gpuarray.empty(m, dtype=a_gpu.dtype, allocator=alloc)
        idx = gpuarray.empty(m, dtype=np.uint32, allocator=alloc)
        col_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                   block=(32, 1, 1), grid=(m, 1, 1), stream=stream)
    else:
        target = gpuarray.empty(n, dtype=a_gpu, allocator=alloc)
        idx = gpuarray.empty(n, dtype=np.uint32, allocator=alloc)
        row_kernel(a_gpu, target, idx, np.uint32(m), np.uint32(n),
                block=(32, 1, 1), grid=(n, 1, 1), stream=stream)
    return target, idx
Beispiel #4
0
    def get(self, V_gpu, xcl_gpu, xcr_gpu, W_gpu, x_gpu, stream=None):
        """
        """
        if stream is None:
            stream = cuda.Stream()

        # Temporary variables
        z_gpu = gpuarray.empty((self.params['V_d'], 
                                self.params['V_w']), self.params['dtype'])
        xc2_gpu = gpuarray.empty(2*self.params['w_d'], self.params['dtype'])
            
        blockDim_x = self.params['V_w']
        
        self._func[0](xcl_gpu, xcr_gpu, xc2_gpu,
                      block=(blockDim_x, 1, 1),
                      stream=stream)            
        
        gridDim_z = self.params['V_d']
        blockDim_y = self.params['V_w']
        
        self._func[1](V_gpu, xc2_gpu, z_gpu,
                      block=(1, blockDim_y, 1),
                      grid=(1, 1, gridDim_z),
                      stream=stream)

        blockDim_y = self.params['W_h']
        
        self._func[2](W_gpu, xc2_gpu, z_gpu, x_gpu,
                      block=(1, blockDim_y, 1),
                      grid=(1, 1, 1),
                      stream=stream)                   
Beispiel #5
0
def sample_dropout_mask(x, dropout_probability=.5, columns=None, stream=None, target=None,
                        dropout_mask=None, dropout_prob_array=None):
    """ Samples a dropout mask and applies it in place"""

    assert x.flags.c_contiguous

    if columns is not None:
        assert len(columns) == 2
        x_tmp = x
        x = extract_columns(x, columns[0], columns[1])

    shape = x.shape

    if dropout_prob_array is None:
        dropout_prob_array = gpuarray.empty(shape, x.dtype)
    sampler.fill_uniform(dropout_prob_array, stream)

    if dropout_mask is None:
        dropout_mask = gpuarray.empty(shape, np.int8)

    if target is None: target = x
    
    all_kernels['sample_dropout_mask'](
        x, target, dropout_mask, dropout_prob_array,
        np.float32(dropout_probability))

    if columns is not None:
        insert_columns(x, x_tmp, columns[0])

    return dropout_mask
Beispiel #6
0
    def __init__(self, res=(640, 480)):
        mod = cuda.SourceModule(file("cpp/trace.cu").read(), keep=True, options=['-I../cpp'], no_extern_c=True)
        self.InitEyeRays = mod.get_function("InitEyeRays")
        self.InitFishEyeRays = mod.get_function("InitFishEyeRays")
        self.Trace = mod.get_function("Trace")
        self.ShadeSimple = mod.get_function("ShadeSimple")
        self.mod = mod

        self.block = (16, 32, 1)  # 15: 32, 18: 28, 19: 24
        self.grid = ( res[0]/self.block[0], res[1]/self.block[1] )
        self.resx, self.resy = (self.grid[0]*self.block[0], self.grid[1]*self.block[1])

        self.smallblock = (16, 16, 1)
        self.smallgrid = ( res[0]/self.smallblock[0], res[1]/self.smallblock[1] )

        self.d_img = ga.empty( (self.resy, self.resx, 4), uint8 )

        '''
        struct RayData
        {
          float3 dir;
          float t;
          VoxNodeId endNode;
          int endNodeChild;
          float endNodeSize;
        };
        '''
        raySize = struct.calcsize("3f f i i f")
        self.d_rays = ga.empty( (self.resy, self.resx, raySize), uint8 )

        self.setLightPos((0.5, 0.5, 1))
        self.detailCoef = 10.0
Beispiel #7
0
    def enable3d(self):
        self.point1 = self.point-(self.mesh_diagonal_norm/60)*self.axis2
        self.point2 = self.point+(self.mesh_diagonal_norm/60)*self.axis2

        self.viewing_angle = 0.0

        pos1, dir1 = from_film(self.point1, axis1=self.axis1, axis2=self.axis2,
                               size=self.size, width=self.film_width)
        pos2, dir2 = from_film(self.point2, axis1=self.axis1, axis2=self.axis2,
                               size=self.size, width=self.film_width)

        self.rays1 = gpu.GPURays(pos1, dir1,
                                 max_alpha_depth=self.max_alpha_depth)
        self.rays2 = gpu.GPURays(pos2, dir2,
                                 max_alpha_depth=self.max_alpha_depth)

        scope_size = (self.size[0]//4, self.size[0]//4)

        scope_pos, scope_dir = from_film(self.point, axis1=self.axis1,
                                         axis2=self.axis2, size=scope_size,
                                         width=self.film_width/4.0)

        self.scope_rays = gpu.GPURays(scope_pos, scope_dir)

        self.scope_pixels_gpu = ga.empty(self.scope_rays.pos.size, dtype=np.uint32)

        self.pixels1_gpu = ga.empty(self.width*self.height, dtype=np.uint32)
        self.pixels2_gpu = ga.empty(self.width*self.height, dtype=np.uint32)
        
        self.distances_gpu = ga.empty(self.scope_rays.pos.size,
                                      dtype=np.float32)
        self.display3d = True
    def test_cublasSgetriBatched(self):
        l,m = 11,7
        np.random.seed(1)
        A = np.random.rand(l,m, m).astype(np.float32)
        
        a_gpu = gpuarray.to_gpu(A)
        a_arr = bptrs(a_gpu)
        c_gpu = gpuarray.empty((l,m,m), np.float32)
        c_arr = bptrs(c_gpu)

        p_gpu = gpuarray.empty((l,m), np.int32)
        i_gpu = gpuarray.zeros(l, np.int32)

        cublas.cublasSgetrfBatched(self.cublas_handle, 
                    m, a_arr.gpudata, m, p_gpu.gpudata, 
                    i_gpu.gpudata, l)

        cublas.cublasSgetriBatched(self.cublas_handle, 
                    m, a_arr.gpudata, m, p_gpu.gpudata, c_arr.gpudata,m,
                    i_gpu.gpudata, l)
        
        X = np.array(map(np.linalg.inv,A))
        X_ = c_gpu.get()

        assert np.allclose(X,X_,6)
Beispiel #9
0
def test_cublas_bug():
    '''
    The SGEMM call would cause all calls after it to fail for some unknown
    reason. Likely this is caused swaprows causing memory corruption.

    NOTE: this was confirmed by nvidia to be a bug within CUDA, and should be
          fixed in CUDA 6.5
    '''
    from pycuda.driver import Stream
    from skcuda.cublas import cublasSgemm
    from skcuda.misc import _global_cublas_handle as handle

    n = 131

    s = slice(128, n)
    X = gpuarray.to_gpu(np.random.randn(n, 2483).astype(np.float32))
    a = gpuarray.empty((X.shape[1], 3), dtype=np.float32)
    c = gpuarray.empty((a.shape[0], X.shape[1]), dtype=np.float32)
    b = gpuarray.empty_like(X)

    m, n = a.shape[0], b[s].shape[1]
    k = a.shape[1]
    lda = m
    ldb = k
    ldc = m
    #cublasSgemm(handle, 0, 0, m, n, k, 0.0, b.gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    cublasSgemm(handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc)
    #print handle, 'n', 'n', m, n, k, 1.0, b[s].gpudata, lda, a.gpudata, ldb, 0.0, c.gpudata, ldc

    #gpuarray.dot(d, Xoutd[s])
    #op.sgemm(a, b[s], c)

    stream = Stream()
    stream.synchronize()
Beispiel #10
0
def gradient_gpu(y_gpu, mode='valid'):

  shape = np.array(y_gpu.shape).astype(np.uint32)
  dtype = y_gpu.dtype
  block_size = (16,16,1)
  grid_size = (int(np.ceil(float(shape[1])/block_size[0])),
               int(np.ceil(float(shape[0])/block_size[1])))
  shared_size = int((1+block_size[0])*(1+block_size[1])*dtype.itemsize)

  preproc = _generate_preproc(dtype, shape)
  mod = SourceModule(preproc + kernel_code, keep=True)

  if mode == 'valid':
    gradient_gpu = mod.get_function("gradient_valid")

    gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]-1), y_gpu.dtype)
    grady_gpu = cua.empty((y_gpu.shape[0]-1, y_gpu.shape[1]), y_gpu.dtype)

  if mode == 'same':
    gradient_gpu = mod.get_function("gradient_same")

    gradx_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
    grady_gpu = cua.empty((y_gpu.shape[0], y_gpu.shape[1]), y_gpu.dtype)
    
  gradient_gpu(gradx_gpu.gpudata, grady_gpu.gpudata, y_gpu.gpudata,
               block=block_size, grid=grid_size, shared=shared_size)

  return (gradx_gpu, grady_gpu)
Beispiel #11
0
    def get(self, V_gpu, d_gpu, xcl_gpu, xcr_gpu,
            W_gpu, dpl_gpu, dpr_gpu, stream=None):  
            
        S_gpu = gpuarray.empty(self.params['V_w'], self.params['dtype'])
        VVT_gpu = gpuarray.to_gpu(np.ndarray(shape=self.params['V_shape'],
                                             dtype=self.params['dtype'],
                                             strides=self.params['V_strides']))
        tmp_gpu = gpuarray.empty((self.params['V_w'], self.params['w_d']), 
                                  self.params['dtype'])
        dp2_gpu = gpuarray.empty(self.params['W_w'], self.params['dtype'])

        if stream is None:
            stream = cuda.Stream()

        gridDim_x = 2
        gridDim_y = 2
        gridDim_z = self.params['V_d']        
        blockDim_x = self.params['V_w']//gridDim_x
        blockDim_y = self.params['V_h']//gridDim_y

        self._func[0](V_gpu, VVT_gpu, 
                      block=(blockDim_x, blockDim_y, 1), 
                      grid=(gridDim_x, gridDim_y, gridDim_z),
                      stream=stream)
                   
        gridDim_x = 1
        gridDim_y = 2
        blockDim_x = self.params['V_d']
        blockDim_y = self.params['V_h']//gridDim_y

        self._func[1](VVT_gpu, d_gpu, xcl_gpu, xcr_gpu, tmp_gpu,
                      block=(blockDim_x, blockDim_y, 1), 
                      grid=(gridDim_x, gridDim_y, 1),
                      stream=stream)

        gridDim_x = 1
        gridDim_y = 2
        blockDim_x = self.params['V_d']
        blockDim_y = self.params['V_h']//gridDim_y

        self._func[2](tmp_gpu, S_gpu,
                      block=(blockDim_x, blockDim_y, 1), 
                      grid=(gridDim_x, gridDim_y, 1),
                      stream=stream)

        gridDim_x = 2
        gridDim_y = 1
        blockDim_x = self.params['V_w']//gridDim_x
        blockDim_y = self.params['V_d']

        self._func[3](W_gpu, d_gpu, S_gpu, xcl_gpu, xcr_gpu, dp2_gpu,
                      block=(blockDim_x, blockDim_y, 1), 
                      grid=(gridDim_x, gridDim_y, 1),
                      stream=stream)
                      
        blockDim_x = self.params['V_w']
        self._func[4](dp2_gpu, dpl_gpu, dpr_gpu,
                      block=(blockDim_x, 1, 1), 
                      grid=(1, 1, 1),
                      stream=stream)                     
Beispiel #12
0
 def initializeGpuMemory(self):
     K = self.modelParams["proc_id_model","K"]
     
     # Sufficient statistics for the parameters of G kernels
     self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32)
     self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) 
     self.gpuPtrs["impulse_model","GS"] = gpuarray.empty_like(self.base.dSS["dS"])
Beispiel #13
0
    def _init_comm_bufs(self):
        """
        Buffers for sending/receiving data from other modules.

        Notes
        -----
        Must be executed after `_init_port_dicts()`.
        """

        # Buffers (and their interfaces and MPI types) for receiving data
        # transmitted from source modules:
        self._in_buf = {}
        self._in_buf['gpot'] = {}
        self._in_buf['spike'] = {}
        self._in_buf_int = {}
        self._in_buf_int['gpot'] = {}
        self._in_buf_int['spike'] = {}        
        self._in_buf_mtype = {}
        self._in_buf_mtype['gpot'] = {}
        self._in_buf_mtype['spike'] = {}        
        for in_id in self._in_ids:
            self._in_buf['gpot'][in_id] = \
                gpuarray.empty(len(self._in_port_dict_ids['gpot'][in_id]),
                               self.pm['gpot'].dtype)
            self._in_buf_int['gpot'][in_id] = bufint(self._in_buf['gpot'][in_id])
            self._in_buf_mtype['gpot'][in_id] = \
                dtype_to_mpi(self._in_buf['gpot'][in_id].dtype)
            self._in_buf['spike'][in_id] = \
                gpuarray.empty(len(self._in_port_dict_ids['spike'][in_id]),
                               self.pm['spike'].dtype)
            self._in_buf_int['spike'][in_id] = bufint(self._in_buf['spike'][in_id])
            self._in_buf_mtype['spike'][in_id] = \
                dtype_to_mpi(self._in_buf['spike'][in_id].dtype)

        # Buffers (and their interfaces and MPI types) for transmitting data to
        # destination modules:
        self._out_buf = {}
        self._out_buf['gpot'] = {}
        self._out_buf['spike'] = {}
        self._out_buf_int = {}
        self._out_buf_int['gpot'] = {}
        self._out_buf_int['spike'] = {}
        self._out_buf_mtype = {}
        self._out_buf_mtype['gpot'] = {}
        self._out_buf_mtype['spike'] = {}
        for out_id in self._out_ids:
            self._out_buf['gpot'][out_id] = \
                gpuarray.empty(len(self._out_port_dict_ids['gpot'][out_id]),
                               self.pm['gpot'].dtype)
            self._out_buf_int['gpot'][out_id] = bufint(self._out_buf['gpot'][out_id])
            self._out_buf_mtype['gpot'][out_id] = \
                dtype_to_mpi(self._out_buf['gpot'][out_id].dtype)

            self._out_buf['spike'][out_id] = \
                gpuarray.empty(len(self._out_port_dict_ids['spike'][out_id]),
                               self.pm['spike'].dtype)
            self._out_buf_int['spike'][out_id] = bufint(self._out_buf['spike'][out_id])
            self._out_buf_mtype['spike'][out_id] = \
                dtype_to_mpi(self._out_buf['spike'][out_id].dtype)
 def getFields(self,x,y):
   outX = gpuarray.empty((self.Nfields,x,y),np.float32)
   outY = gpuarray.empty((self.Nfields,x,y),np.float32)
   grid = (int(ceil(x/32)),int(ceil(y/32)))
   block = (int(ceil(x/grid[0])),int(ceil(y/grid[1])),1)
   for i in range(self.Nfields):
     self.resampleF[i].prepared_call(grid,block,outX[i,:,:].gpudata,outY[i,:,:].gpudata,np.int32(x),np.int32(y))
   return outX,outY
Beispiel #15
0
def show_values(matrix_size, threads_per_block):
    a_cpu = np.random.randn(matrix_size, matrix_size).astype(np.float32)

    # transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu)
    id_groups_x = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_groups_y = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_threads_x = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_threads_y = gpuarray.empty((matrix_size, matrix_size), np.float32)
    id_cell = gpuarray.empty((matrix_size, matrix_size), np.float32)

    blocks = (threads_per_block, 1, 1)

    blocks_per_side = int(matrix_size / threads_per_block)

    if (blocks_per_side * threads_per_block) < matrix_size:
        blocks_per_side = blocks_per_side + 1

    grid = (blocks_per_side, matrix_size, 1)

    print("Blocks: ", blocks)
    print("Grid: ", grid)

    kernel_code = kernel_source_code % {'MATRIX_SIZE': matrix_size, 'BLOCK_SIZE': threads_per_block}

    compiled_kernel = compiler.SourceModule(kernel_code)

    kernel_binary = compiled_kernel.get_function("markThreadID")

    kernel_binary(
        # inputs
        a_gpu,
        # outputs
        id_groups_x, id_groups_y, id_threads_x, id_threads_y, id_cell,
        block=blocks,
        grid=grid
    )

    id_blocks_x_cpu = id_groups_x.get()
    id_blocks_y_cpu = id_groups_y.get()
    id_threads_x_cpu = id_threads_x.get()
    id_threads_y_cpu = id_threads_y.get()
    id_cell_cpu = id_cell.get()

    print("id_blocks_x_cpu")
    print(id_blocks_x_cpu)

    print("id_blocks_y_cpu")
    print(id_blocks_y_cpu)

    print("id_threads_x_cpu")
    print(id_threads_x_cpu)

    print("id_threads_y_cpu")
    print(id_threads_y_cpu)

    print("id_cell_cpu")
    print(id_cell_cpu)
 def initializeGpuMemory(self):
     """
     Allocate GPU memory for the base model parameters
     """
     N = self.base.data.N
     K = self.base.data.K
             
     self.gpuPtrs["proc_id_model","C"] = gpuarray.empty((N,), dtype=np.int32)
     self.gpuPtrs["proc_id_model","Ns"] = gpuarray.empty((K,), dtype=np.int32)
Beispiel #17
0
 def removeProcessEventHandler(self, procId):
     """
     Remove process procID from the set of processes and update data structures
     accordingly. We can assume that the base model has updated K.
     """
     K = self.modelParams["proc_id_model","K"]
     
     self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32)
     self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) 
Beispiel #18
0
 def test_work_area(self):
     x = np.asarray(np.random.rand(self.N), np.float32)
     xf = np.fft.rfftn(x)
     x_gpu = gpuarray.to_gpu(x)
     xf_gpu = gpuarray.empty(self.N // 2 + 1, np.complex64)
     plan = fft.Plan(x.shape, np.float32, np.complex64, auto_allocate=False)
     work_area = gpuarray.empty((plan.worksize,), np.uint8)
     plan.set_work_area(work_area)
     fft.fft(x_gpu, xf_gpu, plan)
     assert np.allclose(xf, xf_gpu.get(), atol=atol_float32)
 def initializeGpuMemory(self):
     K = self.modelParams["proc_id_model","K"]
     N = self.base.data.N
     
     self.gpuPtrs["graph_model","A"] = gpuarray.empty((K,K), dtype=np.bool)
     self.gpuPtrs["graph_model","WGS"]   = gpuarray.empty((K,N), dtype=np.float32)
     
     qratio_width = int(np.ceil(np.float32(self.base.data.N)/self.params["blockSz"]))
     self.gpuPtrs["graph_model","qratio"] = gpuarray.empty((qratio_width,), dtype=np.float64)
     self.gpuPtrs["graph_model","lkhd_ratio"] = gpuarray.empty((1,), dtype=np.float32)
Beispiel #20
0
    def get_workspace(self, n):
        from pyfft.cuda import Plan as pycufftplan
        import pycuda.gpuarray as gpuarray

        ws = self.get(n)
        if ws: return ws
        return self.setdefault(n,
            (pycufftplan(int(n), stream=self.stream, normalize=False),
             gpuarray.empty(n, dtype=complex64(0.).dtype),
             gpuarray.empty(n, dtype=complex64(0.).dtype)))
Beispiel #21
0
    def benchmark(self):
        discr = self.discr
        given = self.plan.given

        from hedge.backends.cuda.tools import int_ceiling
        block_count = int_ceiling(
                len(discr.mesh.elements)/self.plan.elements_per_block())
        all_fluxes_on_faces = [gpuarray.empty(
            (block_count * self.plan.microblocks_per_block()
                * given.aligned_face_dofs_per_microblock(),),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
                for i in range(len(self.fluxes))]

        field = gpuarray.empty(
                (self.plan.input_dofs_per_block() * block_count,),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        fdata = self.fake_flux_face_data_block(block_count)
        ilist_data = self.fake_index_list_data()

        block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=True)

        for dep_expr in self.all_deps:
            field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        for i in range(count):
            if block_count >= 2**16:
                return None

            try:
                gather.prepared_call(
                        (block_count, 1), block,
                        0,
                        fdata.device_memory,
                        *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                        )
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        return 1e-3/count * stop.time_since(start)
Beispiel #22
0
    def __init__(self, A1, A2, left, use_batch=False):
        """Creates a new LinearOperator interface to the superoperator E.
        
        This is a wrapper to be used with SciPy's sparse linear algebra routines.
        
        Parameters
        ----------
        A1 : ndarray
            Ket parameter tensor. 
        A2 : ndarray
            Bra parameter tensor.
        left : bool
            Whether to multiply with a vector to the left (or to the right).
        """
        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))
        
        self.use_batch = use_batch
        self.left = left
        
        self.D = A1[0].shape[1]        
        self.shape = (self.D**2, self.D**2)        
        self.dtype = sp.dtype(A1[0][0].dtype)
        
        self.calls = 0        
        
        self.out = garr.empty((self.D, self.D), dtype=self.dtype)        
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None

            self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            
            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())
        
        self.hdl = cb.cublasCreate()
Beispiel #23
0
def genCks( allValidSpikVec, MATRIX_SIZE, TILE_WIDTH, configVec_str, spikTransMatFile) :
	#using all generated valid spiking vector files, 'feed' the files to the CUDA C kernels to evaluate (1)
	for spikVec in  allValidSpikVec[ 0 ] :
		# string concatenation of the configVec, Ck-1, from configVec = [ '2', '2', '1', '0', '0', ...]
		# to configVec = 211 <string>
		Ck_1_str = configVec_str 
		#write into total list of Ckspri
		#allGenCk = addTotalCk( allGenCk, Ck_1_str )
		#print spikVec		
		#form the filenames of the Cks and the Sks
		Ck = 'c_' + Ck_1_str + '_' + spikVec
		Ck_1 = 'c_' + Ck_1_str
		Sk = 's_' + spikVec
		#print ' Ck, Ck_1, Sk: ', Ck, Ck_1, Sk
		#import the vectors/Matrix as numpy ND arrays 
		Ck_1 = toNumpyArr( Ck_1, MATRIX_SIZE )
		Sk = toNumpyArr( Sk, MATRIX_SIZE )
		M = toNumpyArr( spikTransMatFile, MATRIX_SIZE )
		#allocate memory in the GPU
		Ck_1gpu = gpuarray.to_gpu( Ck_1 )
		Skgpu = gpuarray.to_gpu( Sk )
		Mgpu = gpuarray.to_gpu( M )
		SkMgpu = gpuarray.empty( ( MATRIX_SIZE, MATRIX_SIZE), np.int32 )
		Ckgpu = gpuarray.empty( ( MATRIX_SIZE, MATRIX_SIZE), np.int32 )
		#get kernel code from template by specifying the constant MATRIX_SIZE
		#matmul_kernel = matmul_kernel_temp % { 'MATRIX_SIZE': MATRIX_SIZE}
		matmul_kernel = matmul_kernel_temp %{'MATRIX_SIZE': MATRIX_SIZE, 'TILE_WIDTH':TILE_WIDTH}
		#matadd_kernel = matadd_kernel_temp % { 'MATRIX_SIZE': MATRIX_SIZE}
		matadd_kernel = matadd_kernel_temp %{'MATRIX_SIZE': MATRIX_SIZE, 'TILE_WIDTH':TILE_WIDTH}
		# compile the kernel code 
		mulmod = compiler.SourceModule(matmul_kernel)
		addmod = compiler.SourceModule(matadd_kernel)
		matrixmul = mulmod.get_function( "MatrixMulKernel" )
		matrixadd = addmod.get_function( "MatrixAddKernel" )
		#call kernel functions
		#matrixmul( Skgpu, Mgpu, SkMgpu, block = ( MATRIX_SIZE, MATRIX_SIZE, 1 ), )
		#print ' BEFORE DEVICE CALLS. Time is '
		#print str(datetime.now())
		#create PyCUDA events to record time of kernel execution
		startTime = driver.Event()
		endTime = driver.Event()
		startTime.record( ) #start the timer
		matrixmul( Skgpu, Mgpu, SkMgpu, block = ( TILE_WIDTH, TILE_WIDTH, 1 ), grid = ( MATRIX_SIZE / TILE_WIDTH, MATRIX_SIZE / TILE_WIDTH ) )
		#matrixadd( Ck_1gpu, SkMgpu, Ckgpu, block = ( MATRIX_SIZE, MATRIX_SIZE, 1 ), )
		matrixadd( Ck_1gpu, SkMgpu, Ckgpu, block = ( TILE_WIDTH, TILE_WIDTH, 1 ), grid = ( MATRIX_SIZE / TILE_WIDTH, MATRIX_SIZE / TILE_WIDTH ) )
		endTime.record( ) #start the end time timer.
		endTime.synchronize( ) # synchronize end of threads
		simTime = startTime.time_till( endTime ) * 1e-3
		print " Kernel call exec time is ", simTime
		#print ' AFTER DEVICE CALLS. Time is '
		#print str(datetime.now())
		#print Ck_1gpu.get()[ 4 ] #this is a numpy ND array
		#write ND array into a file
		NDarrToFile( Ck, Ckgpu )
Beispiel #24
0
 def initializeGpuMemory(self):
     """
     Allocate GPU memory for the base model parameters
     """
     N = self.base.data.N
     K = self.modelParams["proc_id_model","K"]
     
     self.gpuPtrs["parent_model","Z"] = gpuarray.empty((N,), dtype=np.int32)
     self.gpuPtrs["parent_model","WGS"]   = gpuarray.empty((K,N), dtype=np.float32)
     self.gpuPtrs["parent_model","urand_Z_gpu"] = gpuarray.empty((N,), dtype=np.float32)
     self.gpuPtrs["parent_model","Zi_temp_gpu"] = gpuarray.empty((N,), dtype=np.int32)
 def initializeGpuMemory(self):
     K = self.params["K"]
     N = self.base.data.N
     D = self.base.data.D
     gridx = int(np.ceil(np.float32(N)/self.params["blockSz"]))
     
     self.gpuPtrs["proc_id_model","C"] = gpuarray.empty((N,), dtype=np.int32)
     self.gpuPtrs["proc_id_model","Ns"] = gpuarray.empty((K,), dtype=np.int32)
     
     self.gpuPtrs["proc_id_model","Xstats"] = gpuarray.empty((K,gridx), dtype=np.float32)
     self.gpuPtrs["proc_id_model","Xmean"] = gpuarray.empty((K,D), dtype=np.float32)
     self.gpuPtrs["proc_id_model","Xprec"] = gpuarray.empty((K,D,D), dtype=np.float32)      
Beispiel #26
0
    def todense(self, out=None, allocator=mem_alloc, stream=None):
        if out is None:
            out = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C")

        if self.nnz == 0:  # weird but happens
            out.fill(0.0, stream=stream)
            return out

        # we need to out-of-place transpose if we want rowmajor outputs
        # thus we need a temporary to store our results
        if out.flags.c_contiguous:
            tmp = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C")
        else:
            tmp = out

        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, stream.handle)
            cublas.cublasSetStream(cublas_handle, stream.handle)

        cusparse.cusparseScsr2dense(
            cusparse_handle,
            self.shape[0],
            self.shape[1],
            self.descr,
            self.data.gpudata,
            self.indptr.gpudata,
            self.indices.gpudata,
            tmp.gpudata,
            tmp.shape[0],
        )

        if out.flags.c_contiguous:
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                tmp.shape[1],
                tmp.shape[0],
                1.0,
                tmp.gpudata,
                tmp.shape[0],
                0.0,
                0,
                tmp.shape[0],
                out.gpudata,
                out.shape[1],
            )
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, 0)
            cublas.cublasSetStream(cublas_handle, 0)

        return out
Beispiel #27
0
    def _init_comm_bufs(self):
        """
        Buffers for receiving data from other modules.

        Notes
        -----
        Must be executed after `_init_port_dicts()`.
        """

        # Buffer interface to and MPI type of this module's port data array:
        self._data_int = {}
        self._data_int['gpot'] = bufint(self.data['gpot'])
        self._data_int['spike'] = bufint(self.data['spike'])
        self._data_mtype = {}        
        self._data_mtype['gpot'] = dtype_to_mpi(self.data['gpot'].dtype)
        self._data_mtype['spike'] = dtype_to_mpi(self.data['spike'].dtype)

        # Buffers (and their interfaces and MPI types) for receiving data
        # transmitted from source modules:
        self._in_buf = {}
        self._in_buf['gpot'] = {}
        self._in_buf['spike'] = {}
        self._in_buf_int = {}
        self._in_buf_int['gpot'] = {}
        self._in_buf_int['spike'] = {}        
        self._in_buf_mtype = {}
        self._in_buf_mtype['gpot'] = {}
        self._in_buf_mtype['spike'] = {}        
        for in_id in self._in_ids:

            # Get interfaces of pattern connecting the current module to
            # source module `in_id`; `int_1` is connected to the current
            # module, `int_0` is connected to the other module:
            pat = self.routing_table[in_id, self.id]['pattern']
            int_0 = self.routing_table[in_id, self.id]['int_0']
            int_1 = self.routing_table[in_id, self.id]['int_1']

            # The buffers must be the same size as the port data arrays of the
            # modules from which they received data:
            self._in_buf['gpot'][in_id] = \
                gpuarray.empty(len(self.pm_all['gpot'][in_id]),
                               self.pm['gpot'].dtype)                               
            self._in_buf_int['gpot'][in_id] = bufint(self._in_buf['gpot'][in_id])
            self._in_buf_mtype['gpot'][in_id] = \
                dtype_to_mpi(self._in_buf['gpot'][in_id])

            self._in_buf['spike'][in_id] = \
                gpuarray.empty(len(self.pm_all['spike'][in_id]),
                               self.pm['spike'].dtype)
            self._in_buf_int['spike'][in_id] = bufint(self._in_buf['spike'][in_id])
            self._in_buf_mtype['spike'][in_id] = \
                dtype_to_mpi(self._in_buf['spike'][in_id])
Beispiel #28
0
 def addNewProcessEventHandler(self, newProcParams):
     """
     If a new process is added the parameters will be in the given dict.
     We need to update all our data structures that depend on K. We can
     assume that the base model has updated K
     """
     del self.gpuPtrs["impulse_model","nnz_Z"]
     del self.gpuPtrs["impulse_model","g_suff_stats"]
     
     K = self.modelParams["proc_id_model","K"]
     
     self.gpuPtrs["impulse_model","nnz_Z"] = gpuarray.empty((K,K), dtype=np.int32)
     self.gpuPtrs["impulse_model","g_suff_stats"] = gpuarray.empty((K,K), dtype=np.float32) 
 def DT_GPU(self, X, c):
     DIM = X.size
     floatSize = X.dtype.itemsize
     q = gpuarray.empty(DIM / 2, X.dtype)
     p = gpuarray.empty(DIM / 2, X.dtype)
     XNext = gpuarray.empty(DIM, X.dtype)
     cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM / 2, floatSize * DIM / 2)
     qNext = q + c * self.dt * self.dTdp(p)
     pNext = p
     cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(XNext.ptr + floatSize * DIM / 2, pNext.ptr, floatSize * DIM / 2)
     return XNext
 def DV_GPU(self, X, d):
     DIM = X.size
     floatSize = X.dtype.itemsize
     q = gpuarray.empty(DIM / 2, X.dtype)
     p = gpuarray.empty(DIM / 2, X.dtype)
     XNext = gpuarray.empty(DIM, X.dtype)
     cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM / 2, floatSize * DIM / 2)
     qNext = q
     pNext = p - d * self.dt * self.dVdq(q)
     cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM / 2)
     cuda.memcpy_dtod(XNext.ptr + floatSize * DIM / 2, pNext.ptr, floatSize * DIM / 2)
     return XNext
Beispiel #31
0
def concatenate_layers(layers):
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset
        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            nthreads_per_block = 256
            nmax_blocks = 1
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks):
            #print "   ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds
def register_multiple_images_subpix_cuda(stack, template):

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as drv
    import pycuda.cumath as cumath
    import skcuda.fft as cu_fft
    import skcuda.linalg as lin
    import skcuda.cublas as cub
    from numpy import pi, newaxis, floor
    import cmath
    from pycuda.elementwise import ElementwiseKernel
    from pycuda.compiler import SourceModule

    from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix
    from numpy.fft import fftshift, ifftshift
    fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1,
                                                   use_numpy_fft=False)

    mod = SourceModule("""
   #include <pycuda-complex.hpp>"
   
    __global__ void load_convert(unsigned short *a, float *b,int f, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = f * imlen;
        if (idx <imlen)
        {
            b[idx] = (float)a[offset+idx];
        }
    }
        
    __global__ void convert_export(float *a, unsigned short *b,int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ;
        }
    }
        
    __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx] = x[idx] * y[idx];
        }
    }
        
    __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = x[idx]._M_re;
            y[idx]._M_im = -x[idx]._M_im;
        }
    }
        
        
    __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = 0;
            y[idx]._M_im = x[idx] * sx;
        }
    }
        
    __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen,  int nlargeh, int nh)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = imlenl*3/4;
        if (idx<imlen)
        {
            int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl;
            x[target_ind] = y[idx];
        }      
    
    }    
        
    __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx]._M_re = 0;
            z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg;
        }
    }
        
    __global__ void sub_float(float *x, float *y, float sv,  int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            x[idx] = y[idx]-sv;
        }
    }
        

    """)

    load_convert_kernel = mod.get_function('load_convert')
    convert_export_kernel = mod.get_function('convert_export')
    convert_multiply_kernel = mod.get_function('convert_multiply')
    multiply_float_kernel = mod.get_function('multiply_comp_float')
    transfer_array_kernel = mod.get_function('transfer_array')
    calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix')
    conj_kernel = mod.get_function('calc_conj')
    sub_float_kernel = mod.get_function('sub_float')

    Z = stack.shape[0]
    M = stack.shape[1]
    N = stack.shape[2]
    max_memsize = 4200000000

    imlen = M * N
    half_imlen = M * (N // 2 + 1)
    grid_dim = (64, int(imlen / (512 * 64)) + 1, 1)
    block_dim = (512, 1, 1)  #512 threads per block

    stack_bin = int(max_memsize / (M * N * stack.itemsize))
    stack_ite = int(Z / stack_bin) + 1

    usfac = 100  ## needs to be bigger than 10

    if not template.shape == stack.shape[1:]:
        raise ValueError("Images must have same shape.")

    if np.any(np.isnan(template)):
        template = template.copy()
        template[template != template] = 0
    if np.any(np.isnan(stack)):
        stack = stack.copy()
        stack[stack != stack] = 0

    mlarge = M * 2
    nlarge = N * 2

    t = time.time()

    plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64)
    plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32)
    plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32)
    cub_h = cub.cublasCreate()

    template_gpu = gpuarray.to_gpu(template.astype('float32'))
    source_gpu = gpuarray.empty((M, N), np.float32)
    ifft_gpu = gpuarray.empty((M, N), np.float32)
    result_gpu = gpuarray.empty((M, N), np.uint16)

    templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64)

    cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True)
    templatef_gpu = templatef_gpu.conj()

    move_list = np.zeros((Z, 2))

    largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64)
    largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32)
    imlenl = mlarge * (nlarge // 2 + 1)

    zoom_factor = 1.5
    dftshiftG = trunc(ceil(usfac * zoom_factor) / 2)
    #% Center of output array at dftshift+1
    upsample_dim = int(ceil(usfac * zoom_factor))

    term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)).
              T[:, newaxis]) / N  # fftfreq  # output points
    term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :]
    term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis]
    term2r = (ifftshift(np.arange(M, dtype='float')) -
              floor(M / 2))[newaxis, :]  # fftfreq
    term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) +
                                             1), :].astype('float32'))
    term2c_gpu = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu = gpuarray.to_gpu(term1r.astype('float32'))
    term2r_gpu = gpuarray.to_gpu(term2r.astype('float32'))
    term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32'))

    kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32)
    kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32)
    kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64)
    kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64)

    Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M))
    Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N))
    [Nc, Nr] = np.meshgrid(Nc, Nr)
    Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32'))
    Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32'))

    upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64)
    upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64)

    source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype)
    copy = drv.Memcpy3D()
    copy.set_src_host(stack.data)
    copy.set_dst_device(source_stack.gpudata)
    copy.width_in_bytes = copy.src_pitch = stack.strides[1]
    copy.src_height = copy.height = M

    for zb in range(stack_ite):

        zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z))
        copy.depth = len(zrange)
        copy.src_z = int(zrange[0])
        copy()

        for i in range(len(zrange)):

            t = zb * stack_bin + i
            load_convert_kernel(source_stack,
                                source_gpu.gpudata,
                                np.int32(i),
                                np.int32(imlen),
                                block=block_dim,
                                grid=grid_dim)
            cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True)

            multiply_float_kernel(sourcef_gpu,
                                  templatef_gpu,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            transfer_array_kernel(largearray1_gpu,
                                  prod_gpu1,
                                  np.int32(imlenl),
                                  np.int32(half_imlen),
                                  np.int32(nlarge // 2 + 1),
                                  np.int32(N // 2 + 1),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(largearray1_gpu,
                        largearray2_gpu,
                        plan_inverse_big,
                        scale=True)
            peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size,
                                       largearray2_gpu.gpudata, 1)
            rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape)

            md2 = trunc(mlarge / 2)
            nd2 = trunc(nlarge / 2)
            if rloc > md2:
                row_shift2 = rloc - mlarge
            else:
                row_shift2 = rloc
            if cloc > nd2:
                col_shift2 = cloc - nlarge
            else:
                col_shift2 = cloc
            row_shiftG = row_shift2 / 2.
            col_shiftG = col_shift2 / 2.

            # Initial shift estimate in upsampled grid

            row_shiftG0 = round(row_shiftG * usfac) / usfac
            col_shiftG0 = round(col_shiftG * usfac) / usfac
            # Matrix multiply DFT around the current shift estimate
            roffG = dftshiftG - row_shiftG0 * usfac
            coffG = dftshiftG - col_shiftG0 * usfac

            sub_float_kernel(term2c_gpu,
                             term2c_gpu_ori,
                             np.float32(coffG / usfac),
                             np.int32(term2c_gpu.size),
                             block=block_dim,
                             grid=grid_dim)
            sub_float_kernel(term1r_gpu,
                             term1r_gpu_ori,
                             np.float32(roffG),
                             np.int32(term1r_gpu.size),
                             block=block_dim,
                             grid=grid_dim)

            lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu)
            lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu)
            convert_multiply_kernel(kernc_gpu,
                                    kernc_gpuc,
                                    np.float32(-2 * pi),
                                    np.int32(kernc_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            convert_multiply_kernel(kernr_gpu,
                                    kernr_gpuc,
                                    np.float32(-2 * pi / (M * usfac)),
                                    np.int32(kernr_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(kernc_gpuc, out=kernc_gpuc)
            cumath.exp(kernr_gpuc, out=kernr_gpuc)

            conj_kernel(prod_gpu1,
                        prod_gpu2,
                        np.int32(half_imlen),
                        block=block_dim,
                        grid=grid_dim)

            lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1)
            lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2)

            CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2)
            rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape)
            CCGmax = CCG[rlocG, clocG]

            rlocG = rlocG - dftshiftG  #+ 1 # +1 # questionable/failed hack + 1;
            clocG = clocG - dftshiftG  #+ 1 # -1 # questionable/failed hack - 1;
            row_shiftG = row_shiftG0 + rlocG / usfac
            col_shiftG = col_shiftG0 + clocG / usfac

            diffphaseG = arctan2(imag(CCGmax), real(CCGmax))

            # Compute registered version of source stack
            calc_shiftmatrix_kernel(Nr_gpu,
                                    Nc_gpu,
                                    shiftmatrix,
                                    np.float32(row_shiftG * 2 * np.pi),
                                    np.float32(col_shiftG * 2 * np.pi),
                                    np.float32(diffphaseG),
                                    np.int32(half_imlen),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(shiftmatrix, out=shiftmatrix)
            multiply_float_kernel(sourcef_gpu,
                                  shiftmatrix,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse)
            convert_export_kernel(ifft_gpu,
                                  result_gpu,
                                  np.int32(imlen),
                                  block=block_dim,
                                  grid=grid_dim)

            move_list[t, :] = (row_shiftG, col_shiftG)
            stack[t, :, :] = result_gpu.get()

    cub.cublasDestroy(cub_h)
    return (stack, move_list)
Beispiel #33
0
    def test_curand_wrappers(self):
        from pycuda.curandom import get_curand_version

        if get_curand_version() is None:
            from pytest import skip

            skip("curand not installed")

        generator_types = []
        if get_curand_version() >= (3, 2, 0):
            from pycuda.curandom import (
                XORWOWRandomNumberGenerator,
                Sobol32RandomNumberGenerator,
            )

            generator_types.extend(
                [XORWOWRandomNumberGenerator, Sobol32RandomNumberGenerator])
        if get_curand_version() >= (4, 0, 0):
            from pycuda.curandom import (
                ScrambledSobol32RandomNumberGenerator,
                Sobol64RandomNumberGenerator,
                ScrambledSobol64RandomNumberGenerator,
            )

            generator_types.extend([
                ScrambledSobol32RandomNumberGenerator,
                Sobol64RandomNumberGenerator,
                ScrambledSobol64RandomNumberGenerator,
            ])
        if get_curand_version() >= (4, 1, 0):
            from pycuda.curandom import MRG32k3aRandomNumberGenerator

            generator_types.extend([MRG32k3aRandomNumberGenerator])

        if has_double_support():
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for gen_type in generator_types:
            gen = gen_type()

            for dtype in dtypes:
                gen.gen_normal(10000, dtype)
                # test non-Box-Muller version, if available
                gen.gen_normal(10001, dtype)

                if get_curand_version() >= (4, 0, 0):
                    gen.gen_log_normal(10000, dtype, 10.0, 3.0)
                    # test non-Box-Muller version, if available
                    gen.gen_log_normal(10001, dtype, 10.0, 3.0)

                x = gen.gen_uniform(10000, dtype)
                x_host = x.get()
                assert (-1 <= x_host).all()
                assert (x_host <= 1).all()

            gen.gen_uniform(10000, np.uint32)
            if get_curand_version() >= (5, 0, 0):
                gen.gen_poisson(10000, np.uint32, 13.0)
                for dtype in dtypes + [np.uint32]:
                    a = gpuarray.empty(1000000, dtype=dtype)
                    v = 10
                    a.fill(v)
                    gen.fill_poisson(a)
                    tmp = (a.get() == (v - 1)).sum() / a.size  # noqa: F841
Beispiel #34
0
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np

import skcuda.fft as cu_fft

print('Testing fft/ifft..')
N = 4096 * 16
batch_size = 16

x = np.asarray(np.random.rand(batch_size, N), np.float32)
xf = np.fft.fft(x)
y = np.real(np.fft.ifft(xf))

x_gpu = gpuarray.to_gpu(x)
xf_gpu = gpuarray.empty((batch_size, N // 2 + 1), np.complex64)
plan_forward = cu_fft.Plan(N, np.float32, np.complex64, batch_size)
cu_fft.fft(x_gpu, xf_gpu, plan_forward)

y_gpu = gpuarray.empty_like(x_gpu)
plan_inverse = cu_fft.Plan(N, np.complex64, np.float32, batch_size)
cu_fft.ifft(xf_gpu, y_gpu, plan_inverse, True)

print('Success status: ', np.allclose(y, y_gpu.get(), atol=1e-6))

print('Testing in-place fft..')
x = np.asarray(
    np.random.rand(batch_size, N) + 1j * np.random.rand(batch_size, N),
    np.complex64)
x_gpu = gpuarray.to_gpu(x)
Beispiel #35
0
def setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to resample.
    W : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, W will be modified appropriately for use
        with filter.fft_multiply().

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False,
                     fft_plan=None,
                     ifft_plan=None,
                     x_fft=None,
                     x=None,
                     y_fft=None,
                     y=None)
    n_fft_x, n_fft_y = len(W), new_len
    cuda_fft_len_x = int((n_fft_x - (n_fft_x % 2)) // 2 + 1)
    cuda_fft_len_y = int((n_fft_y - (n_fft_y % 2)) // 2 + 1)
    if n_jobs == 'cuda':
        n_jobs = 1
        if cuda_capable:
            # try setting up for float64
            try:
                # do the IFFT normalization now so we don't have to later
                W = gpuarray.to_gpu(W[:cuda_fft_len_x].astype('complex_') /
                                    n_fft_y)
                cuda_dict.update(
                    use_cuda=True,
                    fft_plan=cudafft.Plan(n_fft_x, np.float64, np.complex128),
                    ifft_plan=cudafft.Plan(n_fft_y, np.complex128, np.float64),
                    x_fft=gpuarray.zeros(max(cuda_fft_len_x, cuda_fft_len_y),
                                         np.complex128),
                    x=gpuarray.empty(max(int(n_fft_x), int(n_fft_y)),
                                     np.float64))
                logger.info('Using CUDA for FFT resampling')
            except Exception:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large), falling back to '
                            'n_jobs=1')
        else:
            logger.info('CUDA not used, CUDA has not been initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict, W
Beispiel #36
0
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import skcuda.cusolver as solver

handle = solver.cusolverDnCreate()
x = np.random.randn(1024, 1024) + 1j * np.random.rand(1024, 1024)
x = x + x.conj().T

# Need to reverse dimensions because CUSOLVER expects column-major matrices:
n, m = x.shape
x_gpu = gpuarray.to_gpu(x.T.copy())

# Set up output buffers:
w = gpuarray.empty(n, dtype=np.double)

# Set up parameters
params = solver.cusolverDnCreateSyevjInfo()
solver.cusolverDnXsyevjSetTolerance(params, 1e-7)
solver.cusolverDnXsyevjSetMaxSweeps(params, 15)

# Set up work buffers:
lwork = solver.cusolverDnZheevj_bufferSize(handle, 'CUSOLVER_EIG_MODE_VECTOR',
                                           'u', n, x_gpu.gpudata, m, w.gpudata,
                                           params)

workspace_gpu = gpuarray.zeros(lwork, dtype=x.dtype)
info = gpuarray.zeros(1, dtype=np.int32)

# Compute:
Beispiel #37
0
transpose_naive_multi = mod.get_function("transpose_naive_multi")
transpose_share = mod.get_function("transpose_share")
transpose_share_multi = mod.get_function("transpose_share_multi")
transpose_share_multi_conflict = mod.get_function("transpose_share_multi_conflict")

TEST = False
if TEST:
	block_size = 32
	i = 2
	size = i * 32
	matrix = np.random.random(size = (size, size)).astype(np.float32)
	matrix_out_cpu = np.empty_like(matrix).astype(np.float32)

	matrix_in = gpuarray.to_gpu(matrix)

	matrix_out_naive = gpuarray.empty((size, size), np.float32)
	matrix_out_naive_multi = gpuarray.empty((size, size), np.float32)
	matrix_out_share = gpuarray.empty((size, size), np.float32)
	matrix_out_share_multi = gpuarray.empty((size, size), np.float32)
	matrix_out_share_multi_conflict = gpuarray.empty((size, size), np.float32)


	matrix_out_cpu = np.transpose(matrix)

	transpose_naive(matrix_out_naive, matrix_in, np.uint32(size), block = (32, 32, 1), grid = (size / block_size, size / block_size, 1 ))
	
	transpose_naive_multi(matrix_out_naive_multi, matrix_in, np.uint32(size), block = (32, 8, 1), grid = (size / block_size, size / block_size, 1 ))

	transpose_share(matrix_out_share, matrix_in, np.uint32(size), block = (32, 32, 1), grid = (size / block_size, size / block_size, 1 ))

	transpose_share_multi(matrix_out_share_multi, matrix_in, np.uint32(size), block = (32, 8, 1), grid = (size / block_size, size / block_size, 1 ))
Beispiel #38
0
def transpose(src):
    w, h = src.shape

    result = gpuarray.empty((h, w), dtype=src.dtype)
    _transpose(result, src)
    return result
Beispiel #39
0
GPU = True
NANT = 128
#NANT = 1024
START_JD = 2458000
END_JD = 2458001
INT_TIME = 21600
NSIDE = 512
#NSIDE = 8

times = np.arange(START_JD, END_JD, INT_TIME / aipy.const.s_per_day)

a_cpu = np.zeros(shape=(NANT, 12 * NSIDE**2), dtype=np.complex64)
a_cpu[:, :] = 1. + 1j
aa_cpu = np.empty(shape=(NANT, NANT), dtype=a_cpu.dtype)

a_gpu = gpuarray.empty(a_cpu.shape, a_cpu.dtype)
aa_gpu = gpuarray.empty((a_cpu.shape[0], a_cpu.shape[0]), a_cpu.dtype)
h = skcuda.cublas.cublasCreate()

print '# Antennas:', NANT
print 'NSIDE:', NSIDE
print 'Starting', time.time()
for ti, jd in enumerate(times):
    print ti, '/', len(times)
    t1 = time.time()
    if GPU:
        a_gpu.set(a_cpu)
        t2 = time.time()
        #skcuda.cublas.cublasSgemm(h, 'n', 't', a_gpu.shape[0], a_gpu.shape[0], a_gpu.shape[1], 1., a_gpu.gpudata, a_gpu.shape[0], a_gpu.gpudata, a_gpu.shape[0], 0., aa_gpu.gpudata, aa_gpu.shape[0])
        skcuda.cublas.cublasCgemm(h, 'n', 't', a_gpu.shape[0], a_gpu.shape[0],
                                  a_gpu.shape[1], 1., a_gpu.gpudata,
Beispiel #40
0
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a new GPUPhoton object containing only photons that
        have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon + first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu,
                                         self.flags,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)

        # And finaly copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon +
                                                     first_photon),
                                            np.int32(photons_this_round),
                                            np.uint32(target_flag),
                                            index_counter_gpu,
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            self.evidx,
                                            pos,
                                            dir,
                                            wavelengths,
                                            pol,
                                            t,
                                            flags,
                                            last_hit_triangles,
                                            weights,
                                            evidx,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)
Beispiel #41
0
    def __init__(self,
                 photons,
                 ncopies=1,
                 copy_flags=True,
                 copy_triangles=True,
                 copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                self.evidx,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
Beispiel #42
0
    def get_flat_hits(self,
                      gpu_detector,
                      target_flag=(0x1 << 2),
                      nthreads_per_block=64,
                      max_blocks=1024,
                      start_photon=None,
                      nphotons=None,
                      no_map=False):
        '''GPUPhoton objects containing only photons that
        have a particular bit set in their history word and were detected by
        a channel.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(np.int32(start_photon +
                                                      first_photon),
                                             np.int32(photons_this_round),
                                             np.uint32(target_flag),
                                             self.flags,
                                             gpu_detector.solid_id_map,
                                             self.last_hit_triangles,
                                             gpu_detector.detector_gpu,
                                             index_counter_gpu,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finaly copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    self.evidx,
                    pos,
                    dir,
                    wavelengths,
                    pol,
                    t,
                    flags,
                    last_hit_triangles,
                    weights,
                    evidx,
                    channels,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        evidx = evidx.get()
        channels = channels.get()
        hitmap = {}
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                             flags, weights, evidx, channels)
Beispiel #43
0
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      cutools.Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1),
                       np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
                gpu_time1 = time.time()
                # transfer host (CPU) memory to device (GPU) memory
                a_gpu = gpuarray.to_gpu(a_cpu)
                d_gpu = gpuarray.to_gpu(d_cpu)
                kr1_gpu = gpuarray.to_gpu(kr1_cpu)
                kr2_gpu = gpuarray.to_gpu(kr2_cpu)
                threshold_bl_gpu = gpuarray.to_gpu(threshold_bl)
                threshold_donut_gpu = gpuarray.to_gpu(threshold_donut)
                threshold_h_gpu = gpuarray.to_gpu(threshold_h)
                threshold_v_gpu = gpuarray.to_gpu(threshold_v)
                bound1array_gpu = gpuarray.to_gpu(bound1array)
                bound3array_gpu = gpuarray.to_gpu(bound3array)

                # create empty gpu array for the result
                expected_bl_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                expected_donut_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                expected_h_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                expected_v_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                observed_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                bin_bl_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                bin_donut_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                bin_h_gpu = gpuarray.empty(
                    (np.shape(a_cpu)[0], np.shape(a_cpu)[1]), np.float32)
                bin_v_gpu = gpuarray.empty(
Beispiel #45
0
def gpuarray_factory(shape, dtype):
    import pycuda.gpuarray as gpuarray
    return gpuarray.empty(shape=shape, dtype=dtype)
Beispiel #46
0
 def test_ndarray_shape(self):
     gpuarray.empty(np.array(3), np.float32)
     gpuarray.empty(np.array([3]), np.float32)
     gpuarray.empty(np.array([2, 3]), np.float32)
Beispiel #47
0
    b = np.asarray(np.random.rand(n, n), t)

    start = time.time()
    c = np.dot(a, b)
    time_cpu.append(time.time() - start)

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)

    start = time.time()
    c_gpu = culinalg.dot(a_gpu, b_gpu)
    time_linalg.append(time.time() - start)

    a_gpu2 = gpuarray.to_gpu(a)
    b_gpu2 = gpuarray.to_gpu(b)
    c_gpu2 = gpuarray.empty((n, n), np.float32)

    h = cublasCreate()

    start = time.time()
    #cublasSgemm(h, 'n', 'n', np.int32(n), np.int32(n), np.int32(n), np.float32(1.0), a_gpu2, np.int32(n), b_gpu2, np.int32(n), np.float32(1.0), c_gpu2, np.int32(n))
    cublasSgemm(h, 'n', 'n', a.shape[0], a.shape[0], a.shape[0], 1.0,
                a_gpu2.gpudata, a.shape[0], b_gpu2.gpudata, a.shape[0], 1.0,
                c_gpu2.gpudata, a.shape[0])
    time_cula.append(time.time() - start)

    cublasDestroy(h)

    #cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc)

MAKE_PLOT = True
Beispiel #48
0
def setup_cuda_fft_multiply_repeated(n_jobs, h, n_fft):
    """Set up repeated CUDA FFT multiplication with a given filter.

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT multiplication.
    h : array
        The filtering function that will be used repeatedly.
    n_fft : int
        The number of points in the FFT.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to filter.
    h_fft : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or ndarray.

    Notes
    -----
    This function is designed to be used with fft_multiply_repeated().
    """
    cuda_dict = dict(use_cuda=False,
                     fft_plan=None,
                     ifft_plan=None,
                     x_fft=None,
                     x=None)
    h_fft = rfft(h, n=n_fft)
    if n_jobs == 'cuda':
        n_jobs = 1
        init_cuda()
        if _cuda_capable:
            from pycuda import gpuarray
            cudafft = _get_cudafft()
            # set up all arrays necessary for CUDA
            # try setting up for float64
            try:
                # do the IFFT normalization now so we don't have to later
                h_fft = gpuarray.to_gpu(h_fft.astype('complex_') / n_fft)
                cuda_dict.update(
                    use_cuda=True,
                    fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128),
                    ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64),
                    x_fft=gpuarray.empty(len(h_fft), np.complex128),
                    x=gpuarray.empty(n_fft, np.float64))
                logger.info('Using CUDA for FFT FIR filtering')
            except Exception as exp:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large: "%s"), falling back to '
                            'n_jobs=1' % str(exp))
        else:
            logger.info('CUDA not used, CUDA could not be initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict, h_fft
Beispiel #49
0
#  for more information on how to get this number for your device
MATRIX_SIZE = 2

# create two random square matrices
a_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)
b_cpu = np.random.randn(MATRIX_SIZE, MATRIX_SIZE).astype(np.float32)

# compute reference on the CPU to verify GPU computation
c_cpu = np.dot(a_cpu, b_cpu)

# transfer host (CPU) memory to device (GPU) memory
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# create empty gpu array for the result (C = A * B)
c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

# get the kernel code from the template
# by specifying the constant MATRIX_SIZE
kernel_code = kernel_code_template % {'MATRIX_SIZE': MATRIX_SIZE}

# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# get the kernel function from the compiled module
matrixmul = mod.get_function("MatrixMulKernel")

# call the kernel on the card
matrixmul(
    # inputs
    a_gpu,
Beispiel #50
0
    def initialize_kernel(self):
        self.kernel_code = """
			// Update each cell of the grid
			// Any live cell with less than two live neighbors dies
			// Any live cell with two or three live neighbors lives
			// Any live cell with four or more live neighbors dies
			// Any dead cell with three neighbors becomes a live cell
			__global__ void life_step(float *board, float *board2)
			{{

				// Matrix size
				unsigned int m_size = {};
				unsigned int num_cells = {};

				// Column index of the element
				unsigned int x = threadIdx.x + blockIdx.x * blockDim.x;
				// Row index of the element
				unsigned int y = threadIdx.y + blockIdx.y * blockDim.y;
				// Thread ID in the board array
				unsigned int thread_id = y * m_size + x;

				// Game of life classically takes place on an infinite grid
				// I've used a toroidal geometry for the problem
				// The matrix wraps from top to bottom and from left to right
				unsigned int above = (thread_id - m_size) % num_cells;
				unsigned int below = (thread_id + m_size) % num_cells;
				unsigned int left;
				if (thread_id % m_size == 0) {{
					left = thread_id + m_size - 1;
				}} else {{
					left = thread_id - 1;
				}}
				unsigned int right;
				if (thread_id % m_size == m_size - 1) {{
					right = thread_id - m_size + 1;
				}} else {{
					right = thread_id + 1;
				}}
				unsigned int above_left;
				if (thread_id % m_size == 0) {{
					above_left = (thread_id - 1) % num_cells;
				}} else {{
					above_left = (thread_id - m_size - 1) % num_cells;
				}}
				unsigned int above_right;
				if (thread_id % m_size == m_size - 1) {{
					above_right = (thread_id - blockDim.x * m_size + 1) % num_cells;
				}} else {{
					above_right = (thread_id - m_size + 1) % num_cells;
				}}
				unsigned int below_left;
				if (thread_id % m_size == 0) {{
					below_left = (thread_id + blockDim.x * m_size - 1) % num_cells;
				}} else {{
					below_left = (thread_id + m_size - 1) % num_cells;
				}}
				unsigned int below_right;
				if (thread_id % m_size == m_size - 1) {{
					below_right = (thread_id + 1) % num_cells;
				}} else {{
					below_right = (thread_id + m_size + 1) % num_cells;
				}}

				unsigned int num_neighbors = board[above] + board[below] + board[left] + board[right] +
					board[above_left] + board[above_right] + board[below_left] + board[below_right];

				unsigned int live_and2 = board[thread_id] && (num_neighbors == 2);			// Live cell with 2 neighbors
				unsigned int live_and3 = board[thread_id] && (num_neighbors == 3);			// Live cell with 3 neighbors
				unsigned int dead_and3 = !(board[thread_id]) && (num_neighbors == 3);		// Dead cell with 3 neighbors
				board2[thread_id] = live_and2 || live_and3 || dead_and3;
			}}
		"""

        # Transfer CPU memory to GPU memory
        self.board_gpu = gpuarray.to_gpu(self.board)
        self.next_board = gpuarray.empty((self.size, self.size), np.float32)

        self.kernel = self.kernel_code.format(self.size, self.size * self.size)

        # Compile kernel code
        self.mod = SourceModule(self.kernel)

        # Get kernel function from compiled module
        self.game = self.mod.get_function('life_step')
Beispiel #51
0
def setup_cuda_fft_multiply_repeated(n_jobs, h_fft):
    """Set up repeated CUDA FFT multiplication with a given filter

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT multiplication.
    h_fft : array
        The filtering function that will be used repeatedly.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to filter.
    h_fft : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or np.ndarray.
        If CUDA is enabled, h_fft will be modified appropriately for use
        with filter.fft_multiply().

    Notes
    -----
    This function is designed to be used with fft_multiply_repeated().
    """
    cuda_dict = dict(use_cuda=False,
                     fft_plan=None,
                     ifft_plan=None,
                     x_fft=None,
                     x=None)
    n_fft = len(h_fft)
    cuda_fft_len = int((n_fft - (n_fft % 2)) / 2 + 1)
    if n_jobs == 'cuda':
        n_jobs = 1
        if cuda_capable:
            # set up all arrays necessary for CUDA
            # try setting up for float64
            try:
                # do the IFFT normalization now so we don't have to later
                h_fft = gpuarray.to_gpu(
                    h_fft[:cuda_fft_len].astype('complex_') / len(h_fft))
                cuda_dict.update(
                    use_cuda=True,
                    fft_plan=cudafft.Plan(n_fft, np.float64, np.complex128),
                    ifft_plan=cudafft.Plan(n_fft, np.complex128, np.float64),
                    x_fft=gpuarray.empty(cuda_fft_len, np.complex128),
                    x=gpuarray.empty(int(n_fft), np.float64))
                logger.info('Using CUDA for FFT FIR filtering')
            except Exception:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large), falling back to '
                            'n_jobs=1')
        else:
            logger.info('CUDA not used, CUDA has not been initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict, h_fft
Beispiel #52
0
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Load Memory
    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
        gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
        gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

        nparent = len(first_child)
        gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)
        gpu_first_child = ga.to_device(queue, first_child.astype(np.int32))
        gpu_nchild = ga.to_device(queue, nchild.astype(np.int32))
        nparent = len(first_child)
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run Kernel
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes,
                                            gpu_parent_nodes,
                                            gpu_first_child,
                                            gpu_nchild,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(nblocks_this_iter, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.make_parents_detailed(queue, (elements_this_iter, 1, 1),
                                            None, np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes.data,
                                            gpu_parent_nodes.data,
                                            gpu_first_child.data,
                                            gpu_nchild.data).wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return gpu_parent_nodes.get()
Beispiel #53
0
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import numpy

free_bytes, total_bytes = cuda.mem_get_info()
exp = 10
while True:
    fill_floats = free_bytes / 4 - (1 << exp)
    if fill_floats < 0:
        raise RuntimeError("couldn't find allocatable size")
    try:
        print "alloc", fill_floats
        ary = gpuarray.empty((fill_floats, ), dtype=numpy.float32)
        break
    except:
        pass

    exp += 1

ary.fill(float("nan"))

print "filled %d out of %d bytes with NaNs" % (fill_floats * 4, free_bytes)
Beispiel #54
0
def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory
    # unfortunately, opencl and cuda has different methods for managing memory here
    # we have to write divergent code
    if gpuapi.is_gpu_api_cuda():
        # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another.
        # no explicit requests for transfers here
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print first_index, elements_this_iter, nblocks_this_iter
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
Beispiel #55
0
 def test_numpy_integer_shape(self):
     gpuarray.empty(np.int32(17), np.float32)
     gpuarray.empty((np.int32(17), np.int32(17)), np.float32)
Beispiel #56
0
def _diffusion_child(comm, bm=None):

    rank = comm.Get_rank()
    ngpus = comm.Get_size()

    nodename = socket.gethostname()
    name = '%s %s' % (nodename, rank)
    print(name)

    if rank == 0:

        # split indices on GPUs
        indices_split = _split_indices(bm.indices, ngpus)
        print('Indices:', indices_split)

        # send data to GPUs
        for k in range(1, ngpus):
            sendToChild(comm, bm.indices, indices_split[k], k, bm.data,
                        bm.labels, bm.label.nbrw, bm.label.sorw,
                        bm.label.allaxis)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank)
        ctx = dev.make_context()

        # select the desired script
        if bm.label.allaxis:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(bm.data, bm.labels, bm.indices, indices_split[0],
                       bm.label.nbrw, bm.label.sorw, name)
        tac = time.time()
        print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')

        # gather data
        zsh_tmp = bm.argmax_z - bm.argmin_z
        ysh_tmp = bm.argmax_y - bm.argmin_y
        xsh_tmp = bm.argmax_x - bm.argmin_x
        if ngpus > 1:
            final_zero = np.empty((bm.nol, zsh_tmp, ysh_tmp, xsh_tmp),
                                  dtype=np.float32)
            for k in range(bm.nol):
                sendbuf = np.copy(walkmap[k])
                recvbuf = np.empty((zsh_tmp, ysh_tmp, xsh_tmp),
                                   dtype=np.float32)
                comm.Barrier()
                comm.Reduce([sendbuf, MPI.FLOAT], [recvbuf, MPI.FLOAT],
                            root=0,
                            op=MPI.SUM)
                final_zero[k] = recvbuf
        else:
            final_zero = walkmap

        # block and grid size
        block = (32, 32, 1)
        x_grid = (xsh_tmp // 32) + 1
        y_grid = (ysh_tmp // 32) + 1
        grid = (int(x_grid), int(y_grid), int(zsh_tmp))
        xsh_gpu = np.int32(xsh_tmp)
        ysh_gpu = np.int32(ysh_tmp)

        # smooth
        if bm.label.smooth:
            try:
                update_gpu = _build_update_gpu()
                curvature_gpu = _build_curvature_gpu()
                a_gpu = gpuarray.empty((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
                b_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
            except Exception as e:
                print(
                    'Warning: GPU out of memory to allocate smooth array. Process starts without smoothing.'
                )
                bm.label.smooth = 0

        if bm.label.smooth:
            final_smooth = np.copy(final_zero)
            for k in range(bm.nol):
                a_gpu = gpuarray.to_gpu(final_smooth[k])
                for l in range(bm.label.smooth):
                    curvature_gpu(a_gpu,
                                  b_gpu,
                                  xsh_gpu,
                                  ysh_gpu,
                                  block=block,
                                  grid=grid)
                    update_gpu(a_gpu,
                               b_gpu,
                               xsh_gpu,
                               ysh_gpu,
                               block=block,
                               grid=grid)
                final_smooth[k] = a_gpu.get()
            final_smooth = np.argmax(final_smooth, axis=0).astype(np.uint8)
            final_smooth = get_labels(final_smooth, bm.allLabels)
            final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
            final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
                  bm.argmin_x:bm.argmax_x] = final_smooth
            final = final[1:-1, 1:-1, 1:-1]
            save_data(bm.path_to_smooth, final, bm.header, bm.final_image_type,
                      bm.label.compression)

        # uncertainty
        if bm.label.uncertainty:
            try:
                max_gpu = gpuarray.zeros((3, zsh_tmp, ysh_tmp, xsh_tmp),
                                         dtype=np.float32)
                a_gpu = gpuarray.zeros((zsh_tmp, ysh_tmp, xsh_tmp),
                                       dtype=np.float32)
                kernel_uncertainty = _build_kernel_uncertainty()
                kernel_max = _build_kernel_max()
                for k in range(bm.nol):
                    a_gpu = gpuarray.to_gpu(final_zero[k])
                    kernel_max(max_gpu,
                               a_gpu,
                               xsh_gpu,
                               ysh_gpu,
                               block=block,
                               grid=grid)
                kernel_uncertainty(max_gpu,
                                   a_gpu,
                                   xsh_gpu,
                                   ysh_gpu,
                                   block=block,
                                   grid=grid)
                uq = a_gpu.get()
                uq *= 255
                uq = uq.astype(np.uint8)
                final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
                final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
                      bm.argmin_x:bm.argmax_x] = uq
                final = final[1:-1, 1:-1, 1:-1]
                save_data(bm.path_to_uq, final, compress=bm.label.compression)
            except Exception as e:
                print(
                    'Warning: GPU out of memory to allocate uncertainty array. Process starts without uncertainty.'
                )
                bm.label.uncertainty = False

        # free device
        ctx.pop()
        del ctx

        # argmax
        final_zero = np.argmax(final_zero, axis=0).astype(np.uint8)

        # save finals
        final_zero = get_labels(final_zero, bm.allLabels)
        final = np.zeros((bm.zsh, bm.ysh, bm.xsh), dtype=np.uint8)
        final[bm.argmin_z:bm.argmax_z, bm.argmin_y:bm.argmax_y,
              bm.argmin_x:bm.argmax_x] = final_zero
        final = final[1:-1, 1:-1, 1:-1]
        save_data(bm.path_to_final, final, bm.header, bm.final_image_type,
                  bm.label.compression)

        # computation time
        t = int(time.time() - bm.TIC)
        if t < 60:
            time_str = str(t) + ' sec'
        elif 60 <= t < 3600:
            time_str = str(t // 60) + ' min ' + str(t % 60) + ' sec'
        elif 3600 < t:
            time_str = str(t // 3600) + ' h ' + str(
                (t % 3600) // 60) + ' min ' + str(t % 60) + ' sec'
        print('Computation time:', time_str)

    else:

        data_z, data_y, data_x, data_dtype = comm.recv(source=0, tag=0)
        data = np.empty((data_z, data_y, data_x), dtype=data_dtype)
        if data_dtype == 'uint8':
            comm.Recv([data, MPI.BYTE], source=0, tag=1)
        else:
            comm.Recv([data, MPI.FLOAT], source=0, tag=1)
        allx, nbrw, sorw = comm.recv(source=0, tag=2)
        if allx:
            labels = []
            for k in range(3):
                labels_z, labels_y, labels_x = comm.recv(source=0, tag=k + 3)
                labels_tmp = np.empty((labels_z, labels_y, labels_x),
                                      dtype=np.int32)
                comm.Recv([labels_tmp, MPI.INT], source=0, tag=k + 6)
                labels.append(labels_tmp)
        else:
            labels_z, labels_y, labels_x = comm.recv(source=0, tag=3)
            labels = np.empty((labels_z, labels_y, labels_x), dtype=np.int32)
            comm.Recv([labels, MPI.INT], source=0, tag=6)
        indices = comm.recv(source=0, tag=9)
        indices_child = comm.recv(source=0, tag=10)

        # init cuda device
        cuda.init()
        dev = cuda.Device(rank % cuda.Device.count())
        ctx = dev.make_context()

        # select the desired script
        if allx:
            from pycuda_small_allx import walk
        else:
            from pycuda_small import walk

        # run random walks
        tic = time.time()
        walkmap = walk(data, labels, indices, indices_child, nbrw, sorw, name)
        tac = time.time()
        print('Walktime_%s: ' % (name) + str(int(tac - tic)) + ' ' + 'seconds')

        # free device
        ctx.pop()
        del ctx

        # send data
        for k in range(walkmap.shape[0]):
            datatemporaer = np.copy(walkmap[k])
            comm.Barrier()
            comm.Reduce([datatemporaer, MPI.FLOAT], None, root=0, op=MPI.SUM)
Beispiel #57
0
A = np.empty((n * batchSize, n), dtype=np.complex64)
B = np.empty((n * batchSize, n), dtype=A.dtype)

for i in range(batchSize):
    x = np.random.randn(n, n) + 1j * np.random.randn(n, n)
    x = x + x.conj().T
    x = x.astype(np.complex64)
    A[i * n:(i + 1) * n, :] = x
    # Need to reverse dimensions because CUSOLVER expects column-major matrices:
    B[i * n:(i + 1) * n, :] = x.T.copy()

x_gpu = gpuarray.to_gpu(B)

# Set up output buffers:
w_gpu = gpuarray.empty((batchSize, n), dtype=np.float32)

# Set up parameters
params = solver.cusolverDnCreateSyevjInfo()
solver.cusolverDnXsyevjSetTolerance(params, 1e-7)
solver.cusolverDnXsyevjSetMaxSweeps(params, 15)

# Set up work buffers:
lwork = solver.cusolverDnCheevjBatched_bufferSize(handle,
                                                  'CUSOLVER_EIG_MODE_VECTOR',
                                                  'u', n, x_gpu.gpudata, n,
                                                  w_gpu.gpudata, params,
                                                  batchSize)

workspace_gpu = gpuarray.zeros(lwork, dtype=A.dtype)
info = gpuarray.zeros(batchSize, dtype=np.int32)
Beispiel #58
0
def _sub_kmeans_gpu(X, k):
    import skcuda
    import skcuda.linalg as LA
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    LA.init()

    n, d = X.shape
    X = X.astype(np.float32)
    V_gpu = random_V(d, mode='gpu')
    m = d / 2
    X_gpu = gpuarray.to_gpu(X)
    mu_D_gpu = skcuda.misc.mean(X_gpu, axis=0, keepdims=True)
    sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu)
    S_D_gpu = LA.dot(sub_gpu, sub_gpu, transa='T')
    mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)])
    itr = 1
    assignment_unchanged = 0
    C_gpu = None
    MAX_ITER = 100
    while itr < MAX_ITER:
        Pc_gpu = projection_matrix(d, m, mode='gpu')
        PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T')
        PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32)

        for i in range(k):
            PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel()

        global_temp = LA.dot(X_gpu, PcV_gpu, transb='T')
        if itr % 2 == 0:
            C_old = C_gpu.get()
        X_transformed_gpu = gpuarray.empty(
            (n, k, m), dtype=np.float32)
        for i in xrange(n):
            temp = global_temp[i]
            X_transformed_gpu[i] = skcuda.misc.subtract(
                PcVmu_is_gpu, temp)

        X_transformed_squared_gpu = LA.multiply(
            X_transformed_gpu, X_transformed_gpu)
        X_transformed_squared_gpu = X_transformed_squared_gpu.reshape(
            (n * k, m))
        X_transformed_sum_gpu = skcuda.misc.sum(
            X_transformed_squared_gpu, axis=-1, keepdims=True)
        X_transformed_sum_gpu = X_transformed_sum_gpu.reshape((n, k))
        C_gpu = skcuda.misc.argmin(
            X_transformed_sum_gpu, axis=1)
        if itr % 2 == 0:
            Cnew = C_gpu.get()
            points_changed = np.sum(1 - np.equal(C_old, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('[i] Itr %d: %d points changed' % (itr, points_changed))
        C = C_gpu.get()
        counts = {i: 0 for i in range(k)}
        mu_is = np.zeros((k, d)).astype(np.float32)
        for i in range(n):
            C_id = np.int(C[i])
            mu_is[C_id] += X[i]
            counts[C_id] += 1

        mu_is = np.array([mu_is[i] / counts[i] for i in range(k)])
        mu_is_gpu = gpuarray.to_gpu(mu_is)
        S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32)

        maxv = np.max(counts.values())
        storage = np.empty((k, np.int(maxv), d)).astype(np.float32)
        counter = np.zeros(k, dtype=np.uint32)

        for i in range(n):
            C_id = np.int(C[i])
            X_minus_mu_isi = (X[i] - mu_is[C_id])[:, None]
            storage[C_id, np.int(counter[C_id]), :] = X_minus_mu_isi.ravel()
            counter[C_id] += 1

        storage_gpu = gpuarray.to_gpu(storage)
        for i in range(k):
            curr_cluster_points = storage_gpu[i,
                                              :np.int(counter[i]), :]
            S_is_gpu[i] = LA.dot(curr_cluster_points,
                                 curr_cluster_points, transa='T')

        S_is_sum_gpu = S_is_gpu.reshape((k, d * d))
        S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True)
        S_is_sum_gpu = S_is_sum_gpu.reshape((d, d))

        S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu)

        w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu')

        maxVal = min(w)
        m = np.sum([1 for i in w if i / maxVal > 1e-3])
        m = max(1, m)

        itr += 1
    return C_gpu.get(), V_gpu.get(), m
Beispiel #59
0
    def _allocate_gpu_vectors(self, p0, tile_names, tile_names_map, matches,
                              matches_num):
        """
        Allocates anbd initializes the arrays on the gpu that will be used for the optimization process
        """
        self._matches_num = matches_num
        self._params_num = p0.shape[0]
        self._tiles_num = p0.shape[0] // 3

        # Allocate the parameters and gradients arrays, and copy the initial parameters
        self._cur_params_gpu = gpuarray.to_gpu(p0.astype(np.float32))
        self._next_params_gpu = gpuarray.empty(p0.shape, np.float32, order='C')
        self._gradients_gpu = gpuarray.zeros(p0.shape, np.float32, order='C')
        self._diff_params_gpu = gpuarray.empty(p0.shape, np.float32, order='C')

        # Allocate and copy matches and indexes mappers - TODO - should be async
        self._src_matches_gpu = cuda.mem_alloc(
            int(np.dtype(np.float32).itemsize * 2 * matches_num))
        assert (self._src_matches_gpu is not None)
        self._dst_matches_gpu = cuda.mem_alloc(
            int(np.dtype(np.float32).itemsize * 2 * matches_num))
        assert (self._dst_matches_gpu is not None)
        self._src_idx_to_tile_idx_gpu = cuda.mem_alloc(
            int(np.dtype(int).itemsize * matches_num))
        assert (self._src_idx_to_tile_idx_gpu is not None)
        self._dst_idx_to_tile_idx_gpu = cuda.mem_alloc(
            int(np.dtype(int).itemsize * matches_num))
        assert (self._dst_idx_to_tile_idx_gpu is not None)

        #         counter = 0
        #         for pair_name, pair_matches in matches.items():
        #             pair_matches_len = len(pair_matches[0])
        #             cuda.py_memcpy_htoa(self._src_matches_gpu, counter, pair_matches[0].astype(np.float32, order='C'))
        #             cuda.py_memcpy_htoa(self._dst_matches_gpu, counter, pair_matches[1].astype(np.float32, order='C'))
        #             # copy the mapping to tile idx to the gpu TODO - note that the numpy array is reused, so should be careful in async mode
        #             tile_idx = np.empty((pair_matches_len, ), dtype=np.int32)
        #             tile_idx.fill(tile_names_map[pair_name[0]]) # fill with src tile idx
        #             cuda.py_memcpy_htoa(self._src_idx_to_tile_idx_gpu, counter, tile_idx)
        #             tile_idx.fill(tile_names_map[pair_name[1]]) # fill with dst tile idx
        #             cuda.py_memcpy_htoa(self._dst_idx_to_tile_idx_gpu, counter, tile_idx)
        #             counter += pair_matches_len
        counter = 0
        src_matches_all = np.empty((matches_num, 2),
                                   dtype=np.float32,
                                   order='C')
        dst_matches_all = np.empty((matches_num, 2),
                                   dtype=np.float32,
                                   order='C')
        src_tiles_idxs_all = np.empty((matches_num, ),
                                      dtype=np.int32,
                                      order='C')
        dst_tiles_idxs_all = np.empty((matches_num, ),
                                      dtype=np.int32,
                                      order='C')
        for pair_name, pair_matches in matches.items():
            pair_matches_len = len(pair_matches[0])
            src_matches_all[counter:counter +
                            pair_matches_len] = pair_matches[0].astype(
                                np.float32)
            dst_matches_all[counter:counter +
                            pair_matches_len] = pair_matches[1].astype(
                                np.float32)
            src_tiles_idxs_all[counter:counter +
                               pair_matches_len] = tile_names_map[pair_name[0]]
            dst_tiles_idxs_all[counter:counter +
                               pair_matches_len] = tile_names_map[pair_name[1]]
            counter += pair_matches_len

        cuda.memcpy_htod(self._src_matches_gpu, src_matches_all)
        cuda.memcpy_htod(self._dst_matches_gpu, dst_matches_all)
        cuda.memcpy_htod(self._src_idx_to_tile_idx_gpu, src_tiles_idxs_all)
        cuda.memcpy_htod(self._dst_idx_to_tile_idx_gpu, dst_tiles_idxs_all)

        # Allocate memory for the residuals
        self._residuals_gpu = gpuarray.empty((matches_num, ),
                                             np.float32,
                                             order='C')
Beispiel #60
0
def _sub_kmeans_gpu_custom(X, k):
    import skcuda
    import skcuda.linalg as LA
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import custom_kernels as CC
    LA.init()
    CC.init()

    n, d = X.shape
    X = X.astype(np.float32)
    V_gpu = random_V(d, mode='gpu')

    m = d / 2

    X_gpu = gpuarray.to_gpu(X)
    mu_D_gpu = CC.column_mean(X_gpu)
    sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu)
    sub_gpu_T = LA.transpose(sub_gpu)
    S_D_gpu = CC.matmul(sub_gpu_T, sub_gpu)
    mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)])
    itr = 1
    assignment_unchanged = 0
    C_gpu = None
    MAX_ITER = 100

    while itr < MAX_ITER:
        Pc_gpu = projection_matrix(d, m, mode='gpu')
        PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T')
        PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32)

        for i in range(k):
            PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel()

        global_temp = LA.dot(X_gpu, PcV_gpu, transb='T')
        if itr % 2 == 0:
            C_old = C_gpu.get()
        C_gpu = CC.argmin_mu_diff(global_temp, PcVmu_is_gpu)
        if itr % 2 == 0:
            Cnew = C_gpu.get()
            points_changed = np.sum(1 - np.equal(C_old, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('[i] Itr %d: %d points changed' % (itr, points_changed))

        C = C_gpu.get()
        counts = {i: 0 for i in range(k)}

        for i in xrange(n):
            C_id = np.int(C[i])
            counts[C_id] += 1
        maxv = np.max(counts.values())
        storage = np.zeros((k, np.int(maxv), d)).astype(np.float32)

        counter = np.zeros(k, dtype=np.uint32)  # k
        for i in range(n):
            C_id = np.int(C[i])
            storage[C_id, np.int(counter[C_id]), :] = X[i].ravel()
            counter[C_id] += 1

        storage_gpu = gpuarray.to_gpu(storage)

        mu_is_gpu = CC.sum_axis2(storage_gpu)
        counter_gpu = gpuarray.to_gpu(counter)[:, None]

        mu_is_gpu = skcuda.misc.divide(
            mu_is_gpu, counter_gpu.astype(np.float32))
        S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32)  # k,d,d

        for i in range(k):
            storage_gpu[i] = skcuda.misc.subtract(storage_gpu[i], mu_is_gpu[i])
            curr_cluster_points = storage_gpu[i,
                                              :np.int(counter[i]), :]  # |k|,d
            S_is_gpu[i] = LA.dot(curr_cluster_points,
                                 curr_cluster_points, transa='T')

        S_is_sum_gpu = S_is_gpu.reshape((k, d * d))
        S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True)
        S_is_sum_gpu = S_is_sum_gpu.reshape((d, d))

        S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu)

        w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu')

        maxVal = min(w)
        m = np.sum([1 for i in w if i / maxVal > 1e-3])
        m = max(1, m)

        itr += 1
    return C_gpu.get(), V_gpu.get(), m