Exemple #1
0
 def allocation(self):
     super(DGModalGpu, self).allocation()
     self.ul_gpu = cuda.to_device(self.ul)
     self.ul_prev_gpu = cuda.to_device(self.ul)
     self.ul_tmp_gpu = cuda.to_device(self.ul)
     self.kl_gpu = cuda.to_device(self.ul)
     self.el_sum_gpu = cuda.to_device(np.zeros(self.ne))
def test_compare_order():
    '''
    compare_order between C(row-major), F(column-major)
    '''
    compare_order = mod_cu.get_function('compare_order')


    nx, ny = 3, 4
    f_1d = np.arange(nx*ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx,ny), order='C')
    f_2d_F = f_1d.reshape((nx,ny), order='F')

    print ''
    print 'f_1d_C\n\n', f_1d
    print 'f_2d_C\n', f_2d_C
    print 'f_2d_F\n', f_2d_F

    print ''
    print 'after cuda'
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_C\n', ret_f_1d

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, block=(nx*ny,1,1), grid=(1,1))
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print 'f_1d from f_2d_F\n', ret_f_1d
    def train_gpu(self, num_iter, model_file_path):
        if self.batch == 0:
            # Prepare to send the numpy array to gpu
            self.syn1_gpu = cuda.to_device(self.syn1)
            # Create word idx and related data-structure.
            self.base_word_rep = cuda.mem_alloc(len(self.dictionary)*WordRep.memsize)
            word_rep_ptr = int(self.base_word_rep)
            self.word_reps = {}
            for w_idx, word in sorted(self.dictionary.items()):
                word_code = 1-2*self.words_rep[word][0].astype(dtype=np.int32)
                word_point = self.words_rep[word][1].astype(dtype=np.int32)
                self.word_reps[w_idx] = WordRep(word_code, word_point, word_rep_ptr)
                word_rep_ptr += WordRep.memsize
            print "GPU transfers done."


        self.sent_reps_gpu = cuda.to_device(self.sent_reps)
        # Prepare sentences for GPU transfer.
        idx_sentences = [[self.dictionary.token2id[word] for word in sentence if word in self.dictionary]
                         for sentence in self.sentences]

        # Prepare the kernel function
        kernel = self.kernel_str.get_function("train_sg")
        words = np.empty(self.num_sents, dtype=np.int32)
        # sent_reps = np.copy(self.sent_reps)
        for iter in range(num_iter):
            # Sample words for each sentence and transfer to GPU
            for s_idx in range(self.num_sents):
                words[s_idx] = random.choice(idx_sentences[s_idx])
            words_gpu = cuda.to_device(words)
            kernel(self.sent_reps_gpu, np.float32(self.alpha), words_gpu, self.base_word_rep, self.syn1_gpu,
                   block=(self.size, 1, 1), grid=(self.num_sents, 1, 1))
            # autoinit.context.synchronize()
        self.sent_reps = cuda.from_device(self.sent_reps_gpu, self.sent_reps.shape, self.sent_reps.dtype)
        pickle_dump(self.sent_reps, model_file_path)
Exemple #4
0
    def test_simple_kernel_2(self):
        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        a = np.random.randn(400).astype(np.float32)
        b = np.random.randn(400).astype(np.float32)
        a_gpu = drv.to_device(a)
        b_gpu = drv.to_device(b)

        dest = np.zeros_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=(400, 1, 1))
        assert la.norm(dest-a*b) == 0

        drv.Context.synchronize()
        # now try with offsets
        dest = np.zeros_like(a)
        multiply_them(
                drv.Out(dest), np.intp(a_gpu)+a.itemsize, b_gpu,
                block=(399, 1, 1))

        assert la.norm((dest[:-1]-a[1:]*b[:-1])) == 0
Exemple #5
0
def multiply_csr(matrix, vector, block_size, repeat=1):
    '''
    Method multiply matrix by vector using CUDA module for CSR.
    Calculation executed on nVidia GPU.

    Parameters
    ==========
    matrix : Scipy matrix or numpy array
        Matrix to multiplication.
    vector : numpy array
        Vector to multiplication. His length must equal number of columns
        matrix.
    block_size : int (recommended 128 or 256)
        Size of block CUDA.
    repeat : int > 0
        Number of repetitions multiplications. It has no effect on
        result. Specifies the length of returned list of execution times.

    Returns
    =======
    Tuple of result multiplication and list of execution times.
    '''
    if len(vector) != matrix.shape[1]:
        raise ArithmeticError('Length of the vector is not equal to the'
                              'number of columns of the matrix.')
    matrix = mf.convert_to_scipy_csr(matrix)
    data = numpy.array(matrix.data, dtype=numpy.float32)
    indices = numpy.array(matrix.indices, dtype=numpy.int32)
    indptr = numpy.array(matrix.indptr, dtype=numpy.int32)
    data = cuda.to_device(data)
    indices = cuda.to_device(indices)
    indptr = cuda.to_device(indptr)
    num_rows = matrix.shape[0]
    result = numpy.zeros(num_rows, dtype=numpy.float32)
    time_list = []

    grid_size = int(numpy.ceil((num_rows+0.0)/block_size))
    block = (block_size, 1, 1)
    grid = (grid_size, 1)
    g_vector = cuda.to_device(vector)
    num_rows = numpy.int32(num_rows)

    kernel, texref = cudacodes.get_cuda_csr(block_size=block_size)
    texref.set_address(g_vector, vector.nbytes)
    tex = [texref]

    for _ in range(repeat):
        start.record()
        kernel(data,
               indices,
               indptr,
               cuda.Out(result),
               num_rows,
               block=block,
               grid=grid,
               texrefs=tex)
        end.record()
        end.synchronize()
        time_list.append(start.time_till(end))
    return (result, time_list)
Exemple #6
0
def get_phir_gpu (XK, XV, surface, field, par_reac, kernel):

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    MV = numpy.zeros(len(XK))
    L = numpy.sqrt(2*surface.Area) # Representative length
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w    = getWeights(K)
    X_V = numpy.zeros(N*K)
    X_Kx = numpy.zeros(N*K)
    X_Ky = numpy.zeros(N*K)
    X_Kz = numpy.zeros(N*K)
    X_Kc = numpy.zeros(N*K)
    X_Vc = numpy.zeros(N*K)

    for i in range(N*K):
        X_V[i]   = XV[i/K]*w[i%K]*surface.Area[i/K]
        X_Kx[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,0]
        X_Ky[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,1]
        X_Kz[i]  = XK[i/K]*w[i%K]*surface.Area[i/K]*surface.normal[i/K,2]
        X_Kc[i]  = XK[i/K]
        X_Vc[i]  = XV[i/K]

    toc = time.time()
    time_set = toc - tic
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu   = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu  = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu  = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu  = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))


    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq)/par_reac.BSZ))

    get_phir(phir, field.xq_gpu, field.yq_gpu, field.zq_gpu, m_gpu, mx_gpu, my_gpu, mz_gpu, mKc_gpu, mVc_gpu, 
            surface.xjDev, surface.yjDev, surface.zjDev, surface.AreaDev, surface.kDev, surface.vertexDev, 
            numpy.int32(len(surface.xj)), numpy.int32(Nq), numpy.int32(par_reac.K), xkDev, wkDev, REAL(par_reac.threshold),
             AI_int_gpu, numpy.int32(len(surface.xk)), surface.XskDev, surface.WskDev, block=(par_reac.BSZ,1,1), grid=(GSZ,1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
 def __init__(self, code, point, struct_ptr):
     self.code = cuda.to_device(code)
     self.point = cuda.to_device(point)
     self.code_shape, self.code_dtype = code.shape, code.dtype
     self.point_shape, self.point_dtype = point.shape, point.dtype
     cuda.memcpy_htod(int(struct_ptr), np.int32(code.size))
     cuda.memcpy_htod(int(struct_ptr) + 8, np.intp(int(self.code)))
     cuda.memcpy_htod(int(struct_ptr) + 8 + np.intp(0).nbytes, np.intp(int(self.point)))
 def sync_to_device(self):
     self.object_array = np.array([f.as_array()
             for f in self.object_list])
     self.d_object_array = cuda.to_device(self.object_array)
     self.d_object_count = cuda.to_device(np.array([self.object_count],
                                                dtype=np.int32))
     self.device_ptr = cuda.to_device(np.array([self.d_object_array,
                                                self.d_object_count],
                                               dtype=np.intp))
     return self.device_ptr
Exemple #9
0
  def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.
    
    @param n: number of outputs which should be above the threshold
    @type params: int

    @return list of thresholds, in order of individuals, which delimit the top
            n output values
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    passSizes = []
    while n > 0:
      nextSize = min(self.maxHeapFloats, n)
      passSizes.append(nextSize)
      n -= nextSize

    log.debug("pass sizes: %r", passSizes)
    
    thresholdsMat = np.ones(shape=(self.popSize,),
                            dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    uintBytes = np.dtype(np.uint32).itemsize
    thresholdCounts = np.zeros(shape=(self.popSize,),
                               dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
      log.debug("begin pass size %d", passSize)
      self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                        self.outputs,
                                        self.trainSet.size,
                                        self.popSize,
                                        passSize,
                                        self.thresholds,
                                        self.thresholdCounts)

      driver.Context.synchronize()

      if log.isEnabledFor(logging.DEBUG):
        thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
        log.debug("thresholds: %s", str(thresholdsMat))
        
        thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
        log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
Exemple #10
0
def P2PKt_gpu(surfSrc, surfTar, m, mKtc, Ktx_gpu, Kty_gpu, Ktz_gpu, 
            surf, LorY, w, param, timing, kernel):

    if param.GPU==1:
        tic = cuda.Event() 
        toc = cuda.Event() 
    else:
        tic = Event()
        toc = Event()

    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mKtcDev = cuda.to_device(mKtc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3


    tic.record()
    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    directKt_gpu = kernel.get_function("P2PKt")
    AI_int = cuda.to_device(numpy.zeros(param.Nround, dtype=numpy.int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays


    directKt_gpu(Ktx_gpu, Kty_gpu, Ktz_gpu, 
                surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mKtcDev, 
                surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, 
                surfSrc.vertexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), 
                numpy.int32(LorY), REAL(param.kappa), REAL(param.threshold),
                numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    tic.record()
    AI_aux = numpy.zeros(param.Nround, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=numpy.int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return Ktx_gpu, Kty_gpu, Ktz_gpu
 def K(self, Q, P, angles, quadratures):
     drv.memcpy_htod(self.mod_K.get_global("cos_phi")[0], cos(angles).astype(scipy.float32))
     drv.memcpy_htod(self.mod_K.get_global("sin_phi")[0], sin(angles).astype(scipy.float32))
     Nx = Q.shape[0]
     Ny = int(floor(quadratures.size / 1024.))
     K = scipy.empty((Nx,), dtype=scipy.float32)
     Kb = drv.mem_alloc(4*Ny*Nx)
     Q_gpu = drv.to_device(Q)
     P_gpu = drv.to_device(P)
     self.K_gpu(drv.In(quadratures), Q_gpu, P_gpu, Kb,
                block=(1, 1024, 1), grid=(Nx, Ny), shared=1024*4)
     self.reduction_gpu(Kb, drv.Out(K), block=(1, Ny, 1), grid=(Nx, 1), shared=Ny*4)
     return K/self.L
Exemple #12
0
    def __init__(self):
        self.stream = cuda.Stream()
        self.pool = pycuda.tools.PageLockedMemoryPool()
        self._clear()

        # These resources rely on the slots/ringbuffer mechanism for sharing,
        # and so can be shared across any number of launches, genomes, and
        # render kernels. Notably, seeds are self-synchronizing, so they're not
        # attached to either stream object.
        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
        seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
        self.d_seeds = cuda.to_device(seeds)
        self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)
Exemple #13
0
def M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, surf, ind0, param, LorY, timing, kernel):

    if param.GPU==1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL

    tic.record()
    M2P_size = surfTar.offsetMlt[surf,len(surfTar.twig)]
    MSort  = numpy.zeros(param.Nm*M2P_size)
    MdSort = numpy.zeros(param.Nm*M2P_size)

    i = -1
    for C in surfTar.M2P_list[surf,0:M2P_size]:
        i+=1
        MSort[i*param.Nm:i*param.Nm+param.Nm] = surfSrc.tree[C].M
        MdSort[i*param.Nm:i*param.Nm+param.Nm] = surfSrc.tree[C].Md

#    (free, total) = cuda.mem_get_info()
#    print 'Global memory occupancy: %f%% free'%(free*100/total)
    MDev = cuda.to_device(MSort.astype(REAL))
    MdDev = cuda.to_device(MdSort.astype(REAL))
#    (free, total) = cuda.mem_get_info()
#    print 'Global memory occupancy: %f%% free'%(free*100/total)

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays

    GSZ = int(numpy.ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    multipole_gpu = kernel.get_function("M2P")

    multipole_gpu(K_gpu, V_gpu, surfTar.offMltDev, surfTar.sizeTarDev,  
                    surfTar.xcDev, surfTar.ycDev, surfTar.zcDev,
                    MDev, MdDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, 
                    ind0.indexDev, numpy.int32(ptr_offset), numpy.int32(ptr_list), REAL(param.kappa), 
                    numpy.int32(param.BlocksPerTwig), numpy.int32(param.NCRIT), numpy.int32(LorY), 
                    block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_M2P += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
Exemple #14
0
    def test_multichannel_linear_texture(self):
        mod = SourceModule("""
        #define CHANNELS 4
        texture<float4, 1, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int i = threadIdx.x+blockDim.x*threadIdx.y;
          float4 texval = tex1Dfetch(mtx_tex, i);
          dest[i*CHANNELS + 0] = texval.x;
          dest[i*CHANNELS + 1] = texval.y;
          dest[i*CHANNELS + 2] = texval.z;
          dest[i*CHANNELS + 3] = texval.w;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        shape = (16, 16)
        channels = 4
        a = np.random.randn(*(shape+(channels,))).astype(np.float32)
        a_gpu = drv.to_device(a)
        mtx_tex.set_address(a_gpu, a.nbytes)
        mtx_tex.set_format(drv.array_format.FLOAT, 4)

        dest = np.zeros(shape+(channels,), dtype=np.float32)
        copy_texture(drv.Out(dest),
                block=shape+(1,),
                texrefs=[mtx_tex]
                )
        #print a
        #print dest
        assert la.norm(dest-a) == 0
Exemple #15
0
    def index_list_backend(self, ilists):
        from pytools import single_valued

        ilist_length = single_valued(len(il) for il in ilists)
        assert ilist_length == self.plan.dofs_per_face

        from cgen import Typedef, POD

        from pytools import flatten

        flat_ilists_uncast = numpy.array(list(flatten(ilists)))

        if numpy.max(flat_ilists_uncast) >= 256:
            tp = numpy.uint16
        else:
            tp = numpy.uint8

        flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
        assert (flat_ilists == flat_ilists_uncast).all()

        return GPUIndexLists(
            type=tp,
            code=[Typedef(POD(tp, "index_list_entry_t"))],
            device_memory=cuda.to_device(flat_ilists),
            bytes=flat_ilists.size * flat_ilists.itemsize,
        )
Exemple #16
0
    def batch_indexing(self, planes, data_points):

        data_size = data_points.shape[0] / 128

        self.benchmark_begin('preparing')

        gpu_alloc_objs = []

        # for data points

        #addresses = [] 
        #for point in data_points:
        #    point_addr = drv.to_device(point)
        #    gpu_alloc_objs.append(point_addr)
        #    addresses.append(int(point_addr))

        #np_addresses = numpy.array(addresses).astype(numpy.uint64)

        # 64 bit addressing space. each point costs 8 bytes
        #arrays_gpu = drv.mem_alloc(np_addresses.shape[0] * 8)
        #drv.memcpy_htod(arrays_gpu, np_addresses)

        # for planes

        planes_addresses = [] 
        for plane in planes:
            plane_addr = drv.to_device(plane)
            gpu_alloc_objs.append(plane_addr)
            planes_addresses.append(int(plane_addr))

        planes_np_addresses = numpy.array(planes_addresses).astype(numpy.uint64)

        # 64 bit addressing space. each point costs 8 bytes
        planes_arrays_gpu = drv.mem_alloc(planes_np_addresses.shape[0] * 8)
        drv.memcpy_htod(planes_arrays_gpu, planes_np_addresses)

        # projections
 
        projections = numpy.zeros(data_size).astype(numpy.uint64)

        length = numpy.array([data_size]).astype(numpy.uint64)
 
        print "total: " + str(data_size) + " data points to indexing." 

        self.benchmark_end('preparing')
        self.benchmark_begin('cudaing')

        self.indexing_kernel(
            planes_arrays_gpu, drv.In(data_points), drv.Out(projections), drv.In(length),
            block = self.block, grid = self.grid)
        
        self.benchmark_end('cudaing')

        #count = 0
        #for pro in projections:
        #    print "count: " + str(count) + " " + str(pro)
        #    count += 1
        #print projections.shape

        return projections
Exemple #17
0
def go_sort_old(count, stream=None):
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
            #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)
    #if not stream:
        #split = cuda.from_device_like(dsplit, pfxs)
        #split_ = cuda.from_device_like(dsplit_, pfxs)
        #print np.all(split == split_)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            dshortseg_pfxs, dshortseg_sums,
            block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8', ddata, dsorted, dpfxs,
            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
            block=(32, 32, 1), grid=(grids, 1), stream=stream)
    if not stream:
        sorted = cuda.from_device(dsorted, (count,), np.int32)
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp: fp.write(sort_stat)

        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp: fp.write(np_stat)

        print 'is_sorted?', np.all(sorted == sorted_np)
Exemple #18
0
    def cls_init(self,kernel_nr,y_cls,cls1,cls2,cls1_n,cls2_n):
        """
        Prepare cuda kernel call for kernel_nr, copy data for particular binary classifier, between class 1 vs 2.
         
        Parameters
        ------------
        kernel_nr : int
            concurrent kernel number
        y_cls : array-like
            binary class labels (1,-1)
        cls1: int
            first class number
        cls2: int
            second class number
        cls1_n : int
            number of elements of class 1
        cls2_n : int
            number of elements of class 2
        kernel_out : array-like
            array for gpu kernel result, size=2*len(y_cls)
        
        """
        warp=32
        align_cls1_n =  cls1_n+(warp-cls1_n%warp)%warp
        align_cls2_n =  cls2_n+(warp-cls2_n%warp)%warp
        
        self.cls1_N_aligned=align_cls1_n

        sum_cls= align_cls1_n+align_cls2_n   
        self.sum_cls[kernel_nr] = sum_cls
              
        
        self.cls_count[kernel_nr] = np.array([cls1_n,cls2_n],dtype=np.int32)
        self.cls[kernel_nr] = np.array([cls1,cls2],dtype=np.int32)  
        
        self.g_cls_count[kernel_nr] = cuda.to_device(self.cls_count[kernel_nr])
        
        self.g_cls[kernel_nr] = cuda.to_device(self.cls[kernel_nr])
        
        self.bpg[kernel_nr] =int( np.ceil( (self.threadsPerRow*sum_cls+0.0)/self.tpb ))
        
        self.g_y[kernel_nr] =  cuda.to_device(y_cls)
        
        self.kernel_out[kernel_nr] = np.zeros(2*y_cls.shape[0],dtype=np.float32)
        
        ker_out = self.kernel_out[kernel_nr]      
        self.g_out[kernel_nr] = cuda.to_device(ker_out) # cuda.mem_alloc_like(ker_out)
Exemple #19
0
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields={}):
    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
    def __init__(self, fields, str_f, pt0, pt1):
        """
        """

        common.check_type('fields', fields, Fields)
        common.check_type('str_f', str_f, (str, list, tuple), str)
        common.check_type('pt0', pt0, (list, tuple), (int, float))
        common.check_type('pt1', pt1, (list, tuple), (int, float))

        pt0 = list( common.convert_indices(fields.ns, pt0) )
        pt1 = list( common.convert_indices(fields.ns, pt1) )

        # local variables
        str_fs = common.convert_to_tuple(str_f)
        dtype_str_list = fields.dtype_str_list

        for strf in str_fs:
            strf_list = ['ex', 'ey', 'ez', 'hx', 'hy', 'hz']
            common.check_value('str_f', strf, strf_list)

        for axis, n, p0, p1 in zip(['x', 'y', 'z'], fields.ns, pt0, pt1):
            common.check_value('pt0 %s' % axis, p0, range(n))
            common.check_value('pt1 %s' % axis, p1, range(n))

        # program
        macros = ['NMAX', 'XID', 'YID', 'ZID', \
                'ARGS', \
                'TARGET', 'SOURCE', 'OVERWRITE', \
                'DTYPE']

        values = common_gpu.macro_replace_list(pt0, pt1) + \
                ['DTYPE *source', \
                'target[sub_idx]', 'source[idx]', '='] + \
                dtype_str_list

        ksrc = common.replace_template_code( \
                open(common_gpu.src_path + 'copy.cu').read(), macros, values)
        program = SourceModule(ksrc)
        kernel_copy = program.get_function('copy')

        # allocation
        source_bufs = [fields.get_buf(str_f) for str_f in str_fs]
        shape = common.shape_two_points(pt0, pt1, len(str_fs))

        host_array = np.zeros(shape, fields.dtype)
        split_host_array = np.split(host_array, len(str_fs))
        split_host_array_dict = dict( zip(str_fs, split_host_array) ) 
        target_buf = cuda.to_device(host_array)

        # global variables
        self.mainf = fields
        self.kernel_copy = kernel_copy
        self.source_bufs = source_bufs
        self.target_buf = target_buf
        self.host_array = host_array
        self.split_host_array_dict = split_host_array_dict
Exemple #21
0
    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate buffers for query set magnitudes
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
Exemple #22
0
def get_heartbeat(d_lead, length, sampling_rate):
    # Kernel Parameters
    threads_per_block = 200
    num_blocks = length / threads_per_block


    # Get RR
    reduce_by = 32
    edge_signal = cuda.mem_alloc(4 * length)
    
    edge_detect(edge_signal, d_lead,
                grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    indecies = numpy.zeros(length / reduce_by).astype(numpy.int32)
    masks = cuda.to_device(numpy.zeros(length / reduce_by).astype(numpy.int32))
    d_index = cuda.to_device(indecies)
    index_of_peak(d_index, masks, edge_signal,
                  grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    cd_index, c_length = compact_sparse_with_mask(d_index, masks, length / reduce_by)

    # Allocate output
    # full_rr_signal = numpy.zeros(c_length).astype(numpy.int32)
    dev_rr = cuda.mem_alloc(c_length * 4)

    num_blocks = (c_length / threads_per_block) + 1
    get_compact_rr(dev_rr,
                   cd_index,
                   numpy.int32(sampling_rate),
                   numpy.int32(c_length),
                   grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    clean_result(dev_rr, numpy.int32(120), numpy.int32(40),
                 numpy.int32(1), numpy.int32(c_length),
                 grid=(num_blocks, 1), block=(threads_per_block, 1, 1))

    moving_average_filter(dev_rr, c_length, 250)

    index = cuda.from_device(cd_index, (c_length,), numpy.int32)
    rr = cuda.from_device(dev_rr, (c_length,), numpy.int32)
    index[0] = index[1]

    return rr, index / float(sampling_rate * 3600)
Exemple #23
0
    def __init__(self, array, struct_arr_ptr):
        print "copying data to device"

        self.data = cuda.to_device(array)
        self.shape, self.dtype = array.shape, array.dtype

        cuda.memcpy_htod(int(struct_arr_ptr),
                         numpy.getbuffer(numpy.int32(len(array[0]))))
        cuda.memcpy_htod(int(struct_arr_ptr) + 8,
                         numpy.getbuffer(numpy.intp(int(self.data))))
Exemple #24
0
def P2P_gpu(surfSrc, surfTar, m, mx, my, mz, mKc, mVc, K_gpu, V_gpu, 
            surf, LorY, K_diag, IorE, L, w, param, timing, kernel):

    tic = cuda.Event() 
    toc = cuda.Event() 

    tic.record()
    REAL = param.REAL
    mDev   = cuda.to_device(m.astype(REAL))
    mxDev  = cuda.to_device(mx.astype(REAL))
    myDev  = cuda.to_device(my.astype(REAL))
    mzDev  = cuda.to_device(mz.astype(REAL))
    mKcDev = cuda.to_device(mKc.astype(REAL))
    mVcDev = cuda.to_device(mVc.astype(REAL))
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3


    tic.record()
    GSZ = int(ceil(float(param.Nround)/param.NCRIT)) # CUDA grid size
    direct_gpu = kernel.get_function("P2P")
    AI_int = cuda.to_device(zeros(param.Nround, dtype=int32))

    # GPU arrays are flattened, need to point to first element 
    ptr_offset  = surf*len(surfTar.offsetTwigs[surf])  # Pointer to first element of offset arrays 
    ptr_list    = surf*len(surfTar.P2P_list[surf])     # Pointer to first element in lists arrays

    # Check if internal or external to send correct singular integral
    if IorE==1:
        sglInt = surfSrc.sglInt_intDev
    else:
        sglInt = surfSrc.sglInt_extDev


    direct_gpu(K_gpu, V_gpu, surfSrc.offSrcDev, surfTar.offTwgDev, surfTar.P2P_lstDev, surfTar.sizeTarDev,
                surfSrc.kDev, surfSrc.xjDev, surfSrc.yjDev, surfSrc.zjDev, mDev, mxDev, myDev, mzDev, 
                mKcDev, mVcDev, surfTar.xiDev, surfTar.yiDev, surfTar.ziDev, surfSrc.AreaDev, sglInt,
                surfSrc.vertexDev, int32(ptr_offset), int32(ptr_list), 
                int32(LorY), REAL(param.kappa), REAL(param.threshold),
                int32(param.BlocksPerTwig), int32(param.NCRIT), REAL(K_diag), AI_int, 
                surfSrc.XskDev, surfSrc.WskDev, block=(param.BSZ,1,1), grid=(GSZ,1))

    toc.record()
    toc.synchronize()
    timing.time_P2P += tic.time_till(toc)*1e-3


    tic.record()
    AI_aux = zeros(param.Nround, dtype=int32)
    AI_aux = cuda.from_device(AI_int, param.Nround, dtype=int32)
    timing.AI_int += sum(AI_aux[surfTar.unsort])
    toc.record()
    toc.synchronize()
    timing.time_trans += tic.time_till(toc)*1e-3

    return K_gpu, V_gpu
	def allocate_gpu(s):
		a = np.zeros((6,5),'f')
		if s.num_device == 0:
			a[-2,:] = 1.5
		if s.num_device == 1:
			a[1,:] = 2.0
			a[-2,:] = 2.5
		if s.num_device == 2:
			a[1,:] = 3.0
		s.arr_gpu = cuda.to_device(a)
		s.nx, s.ny = s.shape = a.shape
		s.dtype = a.dtype
Exemple #26
0
def transfer_leads(*h_leads):
    length = len(h_leads[0])
    result = []
    grid = ((length / 1024)+1, 1)
    block = (1024, 1, 1)
    for h_lead in h_leads:
        d_lead16 = cuda.to_device(h_lead)
        d_lead32 = cuda.mem_alloc(h_lead.nbytes * 2)
        to_float(d_lead32, d_lead16, numpy.int32(length),
                 grid=grid, block=block)
        result.append(d_lead32)
    return tuple(result) + (length,)
    def __init__(self, array, struct_arr_ptr):
        self.data = cuda.to_device(array)
        self.shape, self.dtype = array.shape, array.dtype
        """
        numpy.getbuffer() needed due to lack of new-style buffer interface for
        scalar numpy arrays as of numpy version 1.9.1

        see: https://github.com/inducer/pycuda/pull/60
        """
        cuda.memcpy_htod(int(struct_arr_ptr),
                         numpy.getbuffer(numpy.int32(array.size)))
        cuda.memcpy_htod(int(struct_arr_ptr) + 8,
                         numpy.getbuffer(numpy.uintp(int(self.data))))
	def local_to_device(self, early_free=True):
		"""
		    Copies locally set array data to device memory.
                    Parameters
                    ----------
		    early_free: boolean 
                            When True, will automatically free all device memory
		            before allocation, preventing double-allocation for a short time.
		"""

		if self.device_ptr != None and early_free == True:
			self.device_ptr.free()

		self.device_ptr = drv.to_device(self.local_array)
Exemple #29
0
def make_blocks(devdata, data):
    from hedge.backends.cuda.tools import pad_and_join

    blocks = ["".join(b) for b in data]
    block_size = devdata.align(max(len(b) for b in blocks))

    class BlockedDataStructure(Record):
        pass

    return BlockedDataStructure(
            blocks=cuda.to_device(pad_and_join(blocks, block_size)),
            max_per_block=max(len(b) for b in data),
            block_size=block_size,
            )
	def alloc_eh_fields(s):
		f = np.zeros((s.nx, s.ny, s.nz), 'f')
		s.ex_gpu = cuda.to_device(f)
		s.ey_gpu = cuda.to_device(f)
		s.ez_gpu = cuda.to_device(f)
		s.hx_gpu = cuda.to_device(f)
		s.hy_gpu = cuda.to_device(f)
		s.hz_gpu = cuda.to_device(f)
		s.eh_fields = [s.ex_gpu, s.ey_gpu, s.ez_gpu, s.hx_gpu, s.hy_gpu, s.hz_gpu]
Exemple #31
0
tpb = 256
bpg = (nx * ny * nz) / tpb

print 'dim (%d, %d, %d)' % (nx, ny, nz)
total_bytes = nx * ny * nz * 4 * 9
if total_bytes / (1024**3) == 0:
    print 'mem %d MB' % (total_bytes / (1024**2))
else:
    print 'mem %1.2f GB' % (float(total_bytes) / (1024**3))

# memory allocate
f = np.random.randn(nx * ny * nz).astype(np.float32).reshape((nx, ny, nz))
#f = np.zeros((nx,ny,nz), 'f')

ex_gpu = cuda.to_device(f)
ey_gpu = cuda.to_device(f)
ez_gpu = cuda.to_device(f)
hx_gpu = cuda.to_device(f)
hy_gpu = cuda.to_device(f)
hz_gpu = cuda.to_device(f)

cex = set_c(f, 'yz')
cey = set_c(f, 'zx')
cez = set_c(f, 'xy')

cex_gpu = cuda.to_device(cex)
cey_gpu = cuda.to_device(cey)
cez_gpu = cuda.to_device(cez)

descr = cuda.ArrayDescriptor3D()
Exemple #32
0
def go_sort_old(count, stream=None):
    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
    ddata = cuda.to_device(data)
    print 'Done seeding'

    grids = count / 8192
    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
    dpfxs = cuda.to_device(pfxs)

    launch('prefix_scan_8_0_shmem_shortseg',
           ddata,
           dpfxs,
           block=(32, 16, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    #dsplit = cuda.to_device(pfxs)
    #launch('crappy_split', dpfxs, dsplit,
    #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split',
           dsplit,
           dpfxs,
           block=(32, 1, 1),
           grid=(grids / 32, 1),
           stream=stream)
    #if not stream:
    #split = cuda.from_device_like(dsplit, pfxs)
    #split_ = cuda.from_device_like(dsplit_, pfxs)
    #print np.all(split == split_)

    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
    dshortseg_sums = cuda.mem_alloc(256 * 4)
    launch('prefix_sum',
           dpfxs,
           np.int32(grids * 256),
           dshortseg_pfxs,
           dshortseg_sums,
           block=(32, 8, 1),
           grid=(1, 1),
           stream=stream,
           l1=1)

    dsorted = cuda.mem_alloc(count * 4)
    launch('sort_8',
           ddata,
           dsorted,
           dpfxs,
           block=(32, 16, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    launch('sort_8_a',
           ddata,
           dsorted,
           dpfxs,
           dsplit,
           block=(32, 32, 1),
           grid=(grids, 1),
           stream=stream)
    if not stream:
        sorted = cuda.from_device(dsorted, (count, ), np.int32)
        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
        sort_stat = f(rle(sorted))
        with open('dev.txt', 'w') as fp:
            fp.write(sort_stat)

        sorted_np = np.sort(data)
        np_stat = f(rle(sorted_np))
        with open('cpu.txt', 'w') as fp:
            fp.write(np_stat)

        print 'is_sorted?', np.all(sorted == sorted_np)
def make_frame(frame_dir,
               frame,
               image_dir,
               frames,
               global_vars,
               find_total_max=False,
               save_density=False,
               save_entanglement=False,
               save_expectation_value=False,
               exp_range=[0, 0],
               **kwargs):
    global yMax, rhoMax, rhoMin
    frame_number = (frame.split("_")[1]).split(".")[0]
    print "Plotting", frame_dir
    QuantumState = np.load(frame_dir)
    # for i in xrange(len(QuantumState)):
    #     # if QuantumState[i,0,0,0]*QuantumState[i,0,0,0].conjugate()>0.:
    #     print i, QuantumState[i,0,0,0]
    # print "Info prob:",np.sum(QuantumState*QuantumState.conjugate())
    # QuantumState+= 1.
    fig = plt.figure(figsize=(15, 12))
    oldQuantumState = None
    oldQuantumState = QuantumState.copy()
    xSize = global_vars["Qx"] / 2
    ySize = global_vars["Qy"] / 2
    zSize = 1
    blockX = 256
    x_nSize = xSize * ySize * (2 * xSize * ySize - 1)
    while x_nSize % blockX != 0:
        blockX /= 2
    gridX = x_nSize / blockX
    blockY = 1
    blockZ = 1
    gridY = 1
    gridZ = 1
    blockX_r1_r2 = 16
    blockY_r1_r2 = 16
    blockZ_r1_r2 = 1
    while xSize % blockX_r1_r2 != 0:
        blockX_r1_r2 /= 2
        blockY_r1_r2 /= 2
    gridX_r1_r2 = xSize * xSize / blockX_r1_r2
    gridY_r1_r2 = ySize * ySize / blockY_r1_r2
    gridZ_r1_r2 = 1

    QuantumField_r1_r2_real = np.zeros((xSize, ySize, xSize, ySize),
                                       dtype=np.float64)
    QuantumField_r1_r2_imag = np.zeros((xSize, ySize, xSize, ySize),
                                       dtype=np.float64)
    Rho_projected_real = np.zeros((xSize, ySize), dtype=np.float64)
    # AnalyticFieldReal = np.zeros((xSize, xSize), dtype = np.float64)
    # AnalyticFieldImag = np.zeros((xSize, xSize), dtype = np.float64)
    Lattice = np.zeros(3, dtype=np.int_)
    Lattice[0], Lattice[1], Lattice[2] = xSize, ySize, zSize
    Time = np.zeros(1, dtype=np.float64)
    Time[0] = int(frame_number)
    # purity = np.zeros(1, dtype = np.float64)

    # gpuPurity = drv.to_device(purity)
    gpuLattice = drv.to_device(Lattice)
    gpuTime = drv.to_device(Time)
    gpuQField = drv.to_device(QuantumState)
    gpuQF_r1_r2_real = drv.to_device(QuantumField_r1_r2_real)
    gpuQF_r1_r2_imag = drv.to_device(QuantumField_r1_r2_imag)
    gpuRho_projected_real = drv.to_device(Rho_projected_real)
    # gpuAnalyticFieldReal = drv.to_device(AnalyticFieldReal)
    # gpuAnalyticFieldImag = drv.to_device(AnalyticFieldImag)
    get_QS(gpuQField,
           gpuQF_r1_r2_real,
           gpuQF_r1_r2_imag,
           gpuLattice,
           block=(blockX, blockY, blockZ),
           grid=(gridX, gridY, gridZ))
    get_Rho_projected(gpuQF_r1_r2_real,
                      gpuQF_r1_r2_imag,
                      gpuRho_projected_real,
                      gpuLattice,
                      block=(blockX_r1_r2, blockY_r1_r2, blockZ_r1_r2),
                      grid=(gridX_r1_r2, gridY_r1_r2, gridZ_r1_r2))
    # if save_entanglement:
    #     calcPurity(gpuQF_r1_r2_real, gpuQF_r1_r2_imag, gpuPurity, gpuLattice, block=(blockX_r1_r2,blockY_r1_r2,blockZ_r1_r2), grid=(gridX_r1_r2,gridY_r1_r2,gridZ_r1_r2))
    # get_Analytic(gpuAnalyticFieldReal, gpuAnalyticFieldImag, gpuAn, gpuBn, gpuAn2, gpuBn2, gpuLattice, gpuTime, block=(blockX,blockY,blockZ), grid=(gridX,gridY,gridZ))
    QuantumField_r1_r2_real = drv.from_device(gpuQF_r1_r2_real,
                                              QuantumField_r1_r2_real.shape,
                                              np.float64)
    QuantumField_r1_r2_imag = drv.from_device(gpuQF_r1_r2_imag,
                                              QuantumField_r1_r2_imag.shape,
                                              np.float64)
    Rho_projected_real = drv.from_device(gpuRho_projected_real,
                                         Rho_projected_real.shape, np.float64)
    # AnalyticFieldReal = drv.from_device(gpuAnalyticFieldReal, AnalyticFieldReal.shape, np.float64)
    # AnalyticFieldImag = drv.from_device(gpuAnalyticFieldImag, AnalyticFieldImag.shape, np.float64)
    RhoFieldProjected = np.zeros((xSize, ySize), dtype=DTYPE)
    # RhoFieldProjectedAnalytic = np.zeros((xSize), dtype = DTYPE)

    rho_total = np.sum(Rho_projected_real)
    # print rho_total
    # print rho_total
    for x in xrange(xSize):
        for y in xrange(ySize):
            RhoFieldProjected[x, y] = Rho_projected_real[x, y] / rho_total

    #Get Rho and Phase from QField
    # RhoField = (QuantumField_r1_r2_real+1.j*QuantumField_r1_r2_imag)*(QuantumField_r1_r2_real+1.j*QuantumField_r1_r2_imag).conjugate()
    # PhaseField = np.arctan2(QuantumField_r1_r2_imag,QuantumField_r1_r2_real)+np.pi

    if int(frame_number) == 0:
        # rhoMin, rhoMax = np.amin(RhoField).real, np.amax(RhoField).real
        yMax = np.amax(RhoFieldProjected.real)

    time = int(frame_number)
    # purity = drv.from_device(gpuPurity, purity.shape, np.float64)
    # entanglement = 1-2.*purity[0]/(rho_total*rho_total)
    xAxis = setXaxis(xSize)
    yAxis = setXaxis(ySize)

    time_text = plt.suptitle(r'$\tau = $' + str(time),
                             fontsize=14,
                             horizontalalignment='center',
                             verticalalignment='top')
    # if save_entanglement:
    #     time_text = plt.suptitle(r'$\tau = $' + str(time) + '    ' + r'$\mathcal{E} = $' + str(entanglement) ,fontsize=14,horizontalalignment='center',verticalalignment='top')
    # full_text = plt.suptitle(r'$\tau = $' + str(time) + '    ' + r'$P_{f} = $' + str('{:1.15f}'.format(prob)) + '     ' + r'$P_{p} = $' + str('{:1.15f}'.format(probP)),fontsize=14,horizontalalignment='center',verticalalignment='top')
    # gs = gridspec.GridSpec(1,1)
    # ax = fig.add_subplot(gs[0,0], xlim=(0,xSize), xlabel=r'$x(\ell)$', ylim=(-0.0000001, 1.1*yMax), ylabel=r'${| \Psi |}^{2}$')

    ############################### PLOTTING ####################################################
    # print "Rho: ", RhoFieldProjected
    if yMax == 0.:
        yMax += 1.
    # Density
    plt.subplot(111)
    plt.imshow(((RhoFieldProjected.real).T),
               extent=(np.amin(xAxis), np.amax(xAxis), np.amin(yAxis),
                       np.amax(yAxis)),
               origin='lower',
               cmap=colorMapRho,
               norm=colors.SymLogNorm(linthresh=linThresh * yMax,
                                      linscale=linScale,
                                      vmin=0.,
                                      vmax=yMax))
    putLabels(r'$x\ \ (\ell)$', r'$y\ \ (\ell)$',
              r'$\rho \ \ (\frac{1}{\ell^2})$')
    ax = plt.gca()
    ax.set_aspect('equal')

    # #Phase
    # plt.subplot(122)
    # plt.imshow((PhaseField.real), extent=(np.amin(xAxis), np.amax(xAxis), np.amin(xAxis), np.amax(xAxis)), origin = 'lower',
    #     cmap=colorMapPhase, norm=colors.Normalize(vmin=0.,vmax=2.*np.pi))
    # putLabels(r'$x_{1}\ \ (\ell)$', r'$x_{2}\ \ (\ell)$', r'$\theta \ \ (Radians)$')
    # # Any value whose absolute value is > .0001 will have zero transparency
    # alphas = Normalize(0, rhoMax, clip=True)((rhoMax-RhoField.real))
    # # alphas = colors.SymLogNorm(linthresh=linThresh*rhoMax,linscale=linScale,vmin=0.,vmax=10., clip=True)((rhoMax-RhoField.real).T)
    # alphas = np.clip(alphas**4, 0.0, 1)  # alpha value clipped at the bottom at .4
    # # Normalize the colors b/w 0 and 1, we'll then pass an MxNx4 array to imshow
    # cmap = plt.cm.gist_gray
    # colorsMap = colors.SymLogNorm(linthresh=linThresh*rhoMax,linscale=linScale,vmin=0.,vmax=rhoMax)((0.*RhoField.real))
    # colorsMap = cmap(colorsMap)

    # # Now set the alpha channel to the one we created above
    # colorsMap[..., -1] = alphas
    # plt.imshow(colorsMap, extent=(np.amin(xAxis), np.amax(xAxis), np.amin(xAxis), np.amax(xAxis)), origin = 'lower')
    # ax = plt.gca()
    # ax.set_aspect('equal')

    # Screen density

    fig.tight_layout(pad=0.4, w_pad=5.0, h_pad=1.0, rect=[.05, .05, .95, .95])

    if save_density:
        rho_dir = frame_dir.split("Data")[0] + "Density"
        if not os.path.exists(rho_dir + '/'):
            os.makedirs(rho_dir + '/')
        print "Saving density to", rho_dir + '/Frame_' + frame_number + '.npy'
        np.save(rho_dir + '/Frame_' + frame_number, RhoFieldProjected.real)

# if save_entanglement:
# 	ent_dir = frame_dir.split("Data")[0] + "Entanglement"
# 	if not os.path.exists(ent_dir+ '/'):
# 		os.makedirs(ent_dir + '/')
# 	print "Saving entanglement to", ent_dir + '/entanglement.npy'
# 	if time==0:
# 		entanglementArray = np.asarray([[entanglement, time]])
# 	else:
# 		entArrayOld = np.load(ent_dir + '/entanglement.npy'  )
# 		entArrayNew = np.asarray([[entanglement, time]])
# 		entanglementArray = np.append(entArrayOld, entArrayNew, axis=0)
# 	np.save(ent_dir + '/entanglement' , entanglementArray)

#    if save_expectation_value:
#        exp_dir = frame_dir.split("Data")[0] + "expectation"
#        if not os.path.exists(exp_dir+ '/'):
#            os.makedirs(exp_dir + '/')
#        print "Saving expectation to", exp_dir + '/expectation.npy'
#        exp_val = 0.
#        if exp_range==[0,0]:
#            for x in xrange(xSize):
#                exp_val += x*RhoFieldProjected[x].real
#        else:
#            for x in xrange(int(exp_range[0]*xSize),int(exp_range[1]*xSize),1):
#                exp_val += x*RhoFieldProjected[x].real
#        if time==0:
#            expectationArray = np.asarray([exp_val])
#        else:
#            expArrayOld = np.load(exp_dir + '/expectation.npy'  )
#            expArrayNew = np.asarray([exp_val])
#            expectationArray = np.append(expArrayOld, expArrayNew, axis=0)
#        np.save(exp_dir + '/expectation' , expectationArray)

#Free memory
    gpuLattice.free()
    gpuQField.free()
    gpuQF_r1_r2_real.free()
    gpuQF_r1_r2_imag.free()
    gpuRho_projected_real.free()

    gpuTime.free()
    # gpuPurity.free()
    # gpuAnalyticFieldReal.free()
    # gpuAnalyticFieldImag.free()

    if not os.path.exists(image_dir + '/'):
        os.makedirs(image_dir + '/')
    fig.savefig(image_dir + '/Frame_' + frame_number + ".png")
    plt.close(fig)
Exemple #34
0
import numpy
import numpy.linalg as la
from pycuda.compiler import SourceModule
from six.moves import range

thread_strides = 16
block_size = 256
macroblock_count = 33

total_size = thread_strides * block_size * macroblock_count
dtype = numpy.float32

a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

from cgen import FunctionBody, \
        FunctionDeclaration, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                                arg_decls=[
                                    Pointer(POD(dtype, name))
                                    for name in ["tgt", "op1", "op2"]
Exemple #35
0
def get_phir_gpu(XK, XV, surface, field, par_reac, kernel):
    """
    It computes the reaction potential on the GPU  and it brings the data
    to the cpu.

    Arguments
    ----------
    XK      : array, input for the double layer potential.
    XV      : array, input for the single layer potential.
    surface : class, surface where we are computing the reaction potential.
    field   : class, information about the different regions in the molecule.
    par_reac: class, fine parameters related to the surface.

    Returns
    --------
    phir_cpu: array, reaction potential brought from the GPU to the cpu.
    AI_int  : int, counter of the amount of near singular integrals solved.
    """

    REAL = par_reac.REAL
    Nq = len(field.xq)
    N = len(XK)
    AI_int = 0

    # Setup vector
    K = par_reac.K
    tic = time.time()
    w = getWeights(K)
    X_V = numpy.zeros(N * K)
    X_Kx = numpy.zeros(N * K)
    X_Ky = numpy.zeros(N * K)
    X_Kz = numpy.zeros(N * K)
    X_Kc = numpy.zeros(N * K)
    X_Vc = numpy.zeros(N * K)

    for i in range(N * K):
        X_V[i] = XV[i // K] * w[i % K] * surface.area[i // K]
        X_Kx[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 0]
        X_Ky[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 1]
        X_Kz[i] = XK[i // K] * w[i % K] * surface.area[
            i // K] * surface.normal[i // K, 2]
        X_Kc[i] = XK[i // K]
        X_Vc[i] = XV[i // K]

    toc = time.time()
    sort = surface.sortSource
    phir = cuda.to_device(numpy.zeros(Nq, dtype=REAL))
    m_gpu = cuda.to_device(X_V[sort].astype(REAL))
    mx_gpu = cuda.to_device(X_Kx[sort].astype(REAL))
    my_gpu = cuda.to_device(X_Ky[sort].astype(REAL))
    mz_gpu = cuda.to_device(X_Kz[sort].astype(REAL))
    mKc_gpu = cuda.to_device(X_Kc[sort].astype(REAL))
    mVc_gpu = cuda.to_device(X_Vc[sort].astype(REAL))
    AI_int_gpu = cuda.to_device(numpy.zeros(Nq, dtype=numpy.int32))
    xkDev = cuda.to_device(surface.xk.astype(REAL))
    wkDev = cuda.to_device(surface.wk.astype(REAL))

    get_phir = kernel.get_function("get_phir")
    GSZ = int(numpy.ceil(float(Nq) / par_reac.BSZ))

    get_phir(phir,
             field.xq_gpu,
             field.yq_gpu,
             field.zq_gpu,
             m_gpu,
             mx_gpu,
             my_gpu,
             mz_gpu,
             mKc_gpu,
             mVc_gpu,
             surface.xjDev,
             surface.yjDev,
             surface.zjDev,
             surface.AreaDev,
             surface.kDev,
             surface.vertexDev,
             numpy.int32(len(surface.xj)),
             numpy.int32(Nq),
             numpy.int32(par_reac.K),
             xkDev,
             wkDev,
             REAL(par_reac.threshold),
             AI_int_gpu,
             numpy.int32(len(surface.xk)),
             surface.XskDev,
             surface.WskDev,
             block=(par_reac.BSZ, 1, 1),
             grid=(GSZ, 1))

    AI_aux = numpy.zeros(Nq, dtype=numpy.int32)
    AI_aux = cuda.from_device(AI_int_gpu, Nq, dtype=numpy.int32)
    AI_int = numpy.sum(AI_aux)

    phir_cpu = numpy.zeros(Nq, dtype=REAL)
    phir_cpu = cuda.from_device(phir, Nq, dtype=REAL)

    return phir_cpu, AI_int
Exemple #36
0
	if( idx < nx ) f[ijk] += sin(0.1*tn);
}
"""


if __name__ == '__main__':
	nx, ny, nz = 320, 320, 320

	print 'dim (%d, %d, %d)' % (nx, ny, nz)
	print 'mem %1.2f GB' % ( nx*ny*nz*4*12./(1024**3) )

	# memory allocate
	f = np.zeros((nx,ny,nz), 'f', order='F')
	cf = np.ones_like(f)*0.5

	ex_gpu = cuda.to_device(f)
	ey_gpu = cuda.to_device(f)
	ez_gpu = cuda.to_device(f)
	hx_gpu = cuda.to_device(f)
	hy_gpu = cuda.to_device(f)
	hz_gpu = cuda.to_device(f)

	cex_gpu = cuda.to_device(cf)
	cey_gpu = cuda.to_device(cf)
	cez_gpu = cuda.to_device(cf)
	chx_gpu = cuda.to_device(cf)
	chy_gpu = cuda.to_device(cf)
	chz_gpu = cuda.to_device(cf)

	# prepare kernels
	from pycuda.compiler import SourceModule
Exemple #37
0
	sys.exit()
if (nx*ny)%Dy != 0:
	print "Error: nx*ny is not multiple of %d" % (Dy)
	sys.exit()

Bx, By = nz/Dx, nx*ny/Dy	# number of block
MBy = MAX_BLOCK/Bx
bpg_list = [(Bx,MBy) for i in range(By/MBy)]
if By%MBy != 0: bpg_list.append( (Bx,By%MBy) )
#print bpg_list

# memory allocate
f = np.zeros((nx,ny,nz), 'f')
cf = np.ones_like(f)*0.5

ex_gpu = cuda.to_device(f)
ey_gpu = cuda.to_device(f)
ez_gpu = cuda.to_device(f)
hx_gpu = cuda.to_device(f)
hy_gpu = cuda.to_device(f)
hz_gpu = cuda.to_device(f)

cex = set_c(f,'yz')
cey = set_c(f,'zx')
cez = set_c(f,'xy')

descr = cuda.ArrayDescriptor3D()
descr.width = nz
descr.height = ny
descr.depth = nx
descr.format = cuda.dtype_to_array_format(f.dtype)
Exemple #38
0
        
        plt.imshow(J)
        plt.show()
        
        cu.init()
        d = cu.Device(1)
        ctx = d.make_context()

        kernel_size = 3
        block_size = (16, 16)
        #grid_size = calculate_grid_size((height, width), block_size)
        grid_size = (32, 32)
        #print(I.shape)
        #print(grid_size)

        I_gpu = cu.to_device(I.astype('float32'))
        J_gpu = cu.mem_alloc(J.nbytes)

        source = cu.module_from_file("sobel.cubin")

        kernel_naive = source.get_function("sobel_filter")
        kernel_naive.prepare(['P', 'P', 'Q', 'Q', 'Q', 'Q'])
        time = kernel_naive.prepared_timed_call(grid_size, block_size, I_gpu, J_gpu, height, width, kernel_size, 4)
        J1 = cu.from_device(J_gpu, shape=J.shape, dtype="float32")
        print("Time spent in kernel1: {}s".format(time() * 1e-3))

        print("L1 norm: {}".format( np.sum(np.sum( np.abs(J - J1) )) ))

        plt.imshow(J1)
        plt.show()
        
total_bytes = nx*ny*nz*8*9
if total_bytes/(1024**3) == 0:
	print '%d MB' % ( total_bytes/(1024**2) )
else:
	print '%1.2f GB' % ( float(total_bytes)/(1024**3) )

if nz%32 != 0:
	print "Error: nz is not multiple of 32"
	sys.exit()


# memory allocate
f = np.zeros((nx,ny,nz), dtype=np.float64)
cf = np.ones_like(f)*0.5

eh_gpus = ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu = [cuda.to_device(f) for i in range(6)]
ce_gpus = cex_gpu, cey_gpu, cez_gpu = [cuda.to_device(cf) for i in range(3)]


# prepare kernels
tpb = 256
for bpg in xrange(65535, 0, -1):
	if (nx * ny * nz / tpb) % bpg == 0: break
print 'tpb = %d, bpg = %g' % (tpb, bpg)

from pycuda.compiler import SourceModule
mod = SourceModule( kernels.replace('Dx',str(tpb)).replace('nxyz',str(nx*ny*nz)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)), options=['-m 64'] )
update_h = mod.get_function("update_h")
update_e = mod.get_function("update_e")
update_src = mod.get_function("update_src")
Exemple #40
0
 def test_memory(self):
     z = np.random.randn(400).astype(np.float32)
     new_z = drv.from_device_like(drv.to_device(z), z)
     assert la.norm(new_z - z) == 0
Exemple #41
0
"""

kernels = """"""

import numpy as np
import sys
import pycuda.driver as cuda
import pycuda.autoinit

if __name__ == '__main__':
    size = 100  # MBtye
    nx = size * (1024**2) / 4
    nloop = 50

    # memory allocate
    fw_gpu = cuda.to_device(np.zeros(nx, 'f'))
    fr_gpu = cuda.to_device(np.random.randn(nx).astype(np.float32))

    # prepare kernels
    from pycuda.compiler import SourceModule
    body = 'fr[idx]'
    for i in range(nloop):
        if (i > 0): body += ' + fr[idx]'
        kernels += kernel_template.replace('NAME', 'func%.2d' % i).replace(
            'BODY', body)
    print kernels

    mod = SourceModule(kernels)
    kern_list = []
    for i in range(nloop):
        kern_list.append(mod.get_function("func%.2d" % i))
Exemple #42
0
    if pt[2] != None: cf[:, :, pt[2]] = 0

    return cf


if __name__ == '__main__':
    nx, ny, nz = 320, 320, 320

    print 'dim (%d, %d, %d)' % (nx, ny, nz)
    print 'mem %1.2f GB' % (nx * ny * nz * 4 * 12. / (1024**3))

    # memory allocate
    f = np.zeros((nx, ny, nz), 'f')
    cf = np.zeros_like(f)

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1)))
    cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1)))
    cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None)))
    chx_gpu = cuda.to_device(set_c(cf, (None, 0, 0)))
    chy_gpu = cuda.to_device(set_c(cf, (0, None, 0)))
    chz_gpu = cuda.to_device(set_c(cf, (0, 0, None)))

    # prepare kernels
    from pycuda.compiler import SourceModule
if __name__ == '__main__':
    digit_level = 25  #################################可変パラメーター############################# 最大25
    digitN = 1 << digit_level
    gsz, lsz, gsz2, lsz2 = Creategszlsz()  #GPUカーネルのgridサイズblockサイズ計算

    print("A,Bの要素数=", digitN)
    FMT_P0 = FMTClass(MODP=469762049,
                      MODP_WnSqrt=CreateWnSqrt(60733, 469762049, 26))
    FMT_P1 = FMTClass(MODP=1811939329,
                      MODP_WnSqrt=CreateWnSqrt(59189, 1811939329, 26))
    FMT_P2 = FMTClass(MODP=2013265921,
                      MODP_WnSqrt=CreateWnSqrt(52278, 2013265921, 27))

    host_E = np.zeros(digitN * 2).astype(np.uint32)  #結果格納用
    E = drv.to_device(host_E)  # gpuメモリ確保。かならず0に初期化しておかないといけない

    print("初期値生成")
    host_A, host_B = InitializeAB()
    A0 = drv.to_device(host_A)  # gpuメモリ確保&転送
    B0 = drv.to_device(host_B)  # gpuメモリ確保&転送
    A1 = drv.to_device(host_A)  # gpuメモリ確保&転送
    B1 = drv.to_device(host_B)  # gpuメモリ確保&転送
    A2 = drv.to_device(host_A)  # gpuメモリ確保&転送
    B2 = drv.to_device(host_B)  # gpuメモリ確保&転送

    print("GPU計算開始")
    stime = time.time()
    E0 = FMT_P0.Convolution(A0, B0)
    E1 = FMT_P1.Convolution(A1, B1)
    E2 = FMT_P2.Convolution(A2, B2)
Exemple #44
0
def go_sort(count, stream=None):
    grids = count / 8192

    keys = np.fromstring(np.random.bytes(count * 2), dtype=np.uint16)
    #keys = np.arange(count, dtype=np.uint16)
    #np.random.shuffle(keys)
    mkeys = np.reshape(keys, (grids, 8192))
    vals = np.arange(count, dtype=np.uint32)
    dkeys = cuda.to_device(keys)
    dvals = cuda.to_device(vals)
    print 'Done seeding'

    dpfxs = cuda.mem_alloc(grids * 256 * 4)
    doffsets = cuda.mem_alloc(count * 2)
    launch('prefix_scan_8_0',
           doffsets,
           dpfxs,
           dkeys,
           block=(512, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split',
           dsplit,
           dpfxs,
           block=(32, 1, 1),
           grid=(grids / 32, 1),
           stream=stream)

    # This stage will be rejiggered along with the split
    launch('prefix_sum',
           dpfxs,
           np.int32(grids * 256),
           block=(256, 1, 1),
           grid=(1, 1),
           stream=stream,
           l1=1)

    launch('convert_offsets',
           doffsets,
           dsplit,
           dkeys,
           i32(0),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream)
    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0)
        #print frle(tkeys & 0xff)

    d_skeys = cuda.mem_alloc(count * 2)
    d_svals = cuda.mem_alloc(count * 4)
    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)
    launch('radix_sort_maybe',
           d_skeys,
           d_svals,
           dkeys,
           dvals,
           doffsets,
           dpfxs,
           dsplit,
           i32(0),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    if not stream:
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        # Test integrity of sort (keys and values kept together):
        #   skeys[i] = keys[svals[i]] for all i
        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

    dkeys, d_skeys = d_skeys, dkeys
    dvals, d_svals = d_svals, dvals

    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('prefix_scan_8_8',
           doffsets,
           dpfxs,
           dkeys,
           block=(512, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)
    launch('better_split',
           dsplit,
           dpfxs,
           block=(32, 1, 1),
           grid=(grids / 32, 1),
           stream=stream)
    launch('prefix_sum',
           dpfxs,
           np.int32(grids * 256),
           block=(256, 1, 1),
           grid=(1, 1),
           stream=stream,
           l1=1)
    if not stream:
        pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
    launch('convert_offsets',
           doffsets,
           dsplit,
           dkeys,
           i32(8),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream)
    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = np.reshape(tkeys, (grids, 8192))

        new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8)
        print np.nonzero(new_offs != offsets)
        fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8)
        #print frle(fkeys)

    launch('radix_sort_maybe',
           d_skeys,
           d_svals,
           dkeys,
           dvals,
           doffsets,
           dpfxs,
           dsplit,
           i32(8),
           block=(1024, 1, 1),
           grid=(grids, 1),
           stream=stream,
           l1=1)

    if not stream:
        #print cuda.from_device(doffsets, (4, 8192), np.uint16)
        #print cuda.from_device(dkeys, (4, 8192), np.uint16)
        #print cuda.from_device(d_skeys, (4, 8192), np.uint16)

        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

        sorted_keys = np.sort(keys)
        # Test that ordering is correct. (Note that we don't need 100%
        # correctness, so this test should be made "soft".)
        print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL'
if __name__ == '__main__':
    nx, ny, nz = 240, 256, 256
    tpb = 256
    bpg = (nx * ny * nz) / tpb

    print 'dim (%d, %d, %d)' % (nx, ny, nz)
    total_bytes = nx * ny * nz * 4 * 9
    if total_bytes / (1024**3) == 0:
        print 'mem %d MB' % (total_bytes / (1024**2))
    else:
        print 'mem %1.2f GB' % (float(total_bytes) / (1024**3))

    # memory allocate
    f = np.zeros((nx, ny, nz), 'f', order='F')

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(set_c(f, 'yz'))
    cey_gpu = cuda.to_device(set_c(f, 'zx'))
    cez_gpu = cuda.to_device(set_c(f, 'xy'))

    # prepare kernels
    from pycuda.compiler import SourceModule
    mod = SourceModule(
        kernels.replace('TPB', str(tpb)).replace('nxy', str(nx * ny)).replace(
            'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz)))
Exemple #46
0
    def test_two_directions(self):

        IMAGE_DIR = "Backpack-perfect"
        im1 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im1.png"))
        im2 = cv2.imread(os.path.join("../data", IMAGE_DIR, "im0.png"))

        stereo = SemiGlobalMatching(im1,
                                    im2,
                                    os.path.join("../data", IMAGE_DIR,
                                                 "calib.txt"),
                                    window_size=3,
                                    resize=(640, 480))

        params = {
            "p1": 5,
            "p2": 90000,
            "census_kernel_size": 7,
            "reversed": True
        }
        stereo.set_params(params)
        stereo.params['ndisp'] = 50
        t1 = time()

        assert stereo.p1 is not None, "parameters have not been set"
        t1 = time()
        cim1 = stereo.census_transform(stereo.im1)
        cim2 = stereo.census_transform(stereo.im2)
        #print(f"census transform time {time() - t1}")

        if not stereo.reversed:
            D = range(int(stereo.params['ndisp']))
        else:
            D = reversed(range(int(-stereo.params['ndisp']), 1))
        cost_images = stereo.compute_disparity_img(cim1, cim2, D)
        cost_images = np.float32(cost_images)
        m, n, D = cost_images.shape
        # direction == (1,0)
        stereo.directions = [(1, 0), (-1, 0), (1, 1), (-1, 1), (1, -1),
                             (-1, -1)]
        #stereo.directions = [(0,1)]
        t1 = time()
        L = stereo.aggregate_cost(cost_images)
        print("python aggregate cost %f" % (time() - t1))
        L = L.transpose((2, 0, 1))

        cost_images = cost_images.transpose((2, 0, 1))
        cost_images = np.ascontiguousarray(cost_images, dtype=np.float32)
        d, rows, cols = cost_images.shape
        d_step = 1
        rows = np.int32(rows)
        cols = np.int32(cols)

        compiler_constants = {
            'D_STEP': d_step,
            'D': d,
            'ARR_SIZE': math.floor(d / d_step),
            'P1': 5,
            'P2': 90000,
            'SHMEM_SIZE': 64
        }
        build_options = [format_compiler_constants(compiler_constants)]
        mod = SourceModule(open("../lib/sgbm_helper.cu").read(),
                           options=build_options)

        shmem_size = 16
        vertical_blocks = int(math.ceil(rows / shmem_size))

        #r_aggregate = mod.get_function('r_aggregate')
        vertical_aggregate_down = mod.get_function('vertical_aggregate_down')
        vertical_aggregate_up = mod.get_function('vertical_aggregate_up')
        diagonal_br_tl_aggregate = mod.get_function('diagonal_br_tl_aggregate')
        diagonal_tl_br_aggregate = mod.get_function('diagonal_tl_br_aggregate')
        diagonal_tr_bl_aggregate = mod.get_function('diagonal_tr_bl_aggregate')
        diagonal_bl_tr_aggregate = mod.get_function('diagonal_bl_tr_aggregate')
        #l_aggregate = mod.get_function('l_aggregate')

        t1 = time()
        cost_images_ptr = drv.to_device(cost_images)
        dp_ptr = drv.mem_alloc(cost_images.nbytes)

        vertical_aggregate_down(dp_ptr,
                                cost_images_ptr,
                                rows,
                                cols,
                                block=(256, 1, 1),
                                grid=(1, 1))
        vertical_aggregate_up(dp_ptr,
                              cost_images_ptr,
                              rows,
                              cols,
                              block=(256, 1, 1),
                              grid=(1, 1))

        #r_aggregate(dp_ptr, cost_images_ptr, rows, cols, block = (shmem_size, shmem_size, 1), grid = (1, vertical_blocks))
        #l_aggregate(dp_ptr, cost_images_ptr, rows, cols, block = (shmem_size, shmem_size, 1), grid = (1, vertical_blocks))

        diagonal_tl_br_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))
        diagonal_bl_tr_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))

        diagonal_tr_bl_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))
        diagonal_br_tl_aggregate(dp_ptr,
                                 cost_images_ptr,
                                 rows,
                                 cols,
                                 block=(256, 1, 1),
                                 grid=(1, 1))

        print("cuda aggregate cost %f" % (time() - t1))
        drv.stop_profiler()

        agg_image = drv.from_device(dp_ptr,
                                    cost_images.shape,
                                    dtype=np.float32)
        s1 = np.sum(np.float64(L))
        s2 = np.sum(np.float64(agg_image))

        print("L sum: %f" % s1)
        print("out sum: %f" % s2)
        self.assertTrue(np.all(np.isclose(agg_image, L)))
Exemple #47
0
"""

kernels = """"""

import numpy as np
import sys
import pycuda.driver as cuda
import pycuda.autoinit

if __name__ == '__main__':
    size = 100  # MBtye
    nx = size * (1024**2) / 4
    nloop = 30

    # memory allocate
    fw_gpu = cuda.to_device(np.zeros(nx, 'f'))
    f = np.random.randn(nx).astype(np.float32)
    fr_gpu_list = []
    for i in range(nloop):
        fr_gpu_list.append(cuda.to_device(f))

    # prepare kernels
    from pycuda.compiler import SourceModule
    args = 'float *fr00'
    body = 'f = 0.543*fr00[idx];\n\t__syncthreads();\n'
    for i in range(nloop):
        if (i > 0):
            args += ', float *fr%.2d' % i
            body += '\tf += %1.3f*fr%.2d[idx];\n\t__syncthreads();\n' % (
                np.random.ranf(), i)
        kernels += kernel_template.replace('NAME', 'func%.2d' % i).replace(
Exemple #48
0
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from magma_wrapper import magma_spotrf_gpu_wrap, magma_get_spotrf_nb_wrap
    
n = 1000

# Create matrix to be factored
A = np.eye(n, dtype='float32') + np.ones((n,n))*.1
A_gpu = cuda.to_device(A)

# Allocate pagelocked work array
nwork = magma_get_spotrf_nb_wrap(n)
work_gpu = cuda.pagelocked_empty((nb,nb), dtype='float32')

# Do Cholesky factorization
info = magma_spotrf_gpu_wrap('U', n, A_gpu, n, work_gpu)

# Copy back the Cholesy factor and check for correctness
L = cuda.from_device(A_gpu, (n,n), 'float32')
print np.abs(np.dot(L,L.T)-A).max()

Exemple #49
0
        gsz = 1 << (digit_level - 1)  # gpu global_work_size
        lsz = min(gsz, 256)  # gpu local_work_size

        gsz2 = digitN
        lsz2 = min(gsz2, 256)

        #print("A,Bの要素数=",digitN)
        FMT_P0 = FMTClass(MODP=469762049,
                          MODP_WnSqrt=CreateWnSqrt(60733, 469762049, 26))
        FMT_P1 = FMTClass(MODP=1811939329,
                          MODP_WnSqrt=CreateWnSqrt(59189, 1811939329, 26))
        FMT_P2 = FMTClass(MODP=2013265921,
                          MODP_WnSqrt=CreateWnSqrt(52278, 2013265921, 27))

        host_E = np.zeros(digitN * 2).astype(np.uint32)  #結果格納用
        E = drv.to_device(host_E)  # gpuメモリ確保。かならず0に初期化しておかないといけない

        #print("初期値生成")
        host_A, host_B = InitializeAB()
        A0 = drv.to_device(host_A % np.uint32(FMT_P0.MODP))  # gpuメモリ確保&転送
        B0 = drv.to_device(host_B % np.uint32(FMT_P0.MODP))  # gpuメモリ確保&転送
        A1 = drv.to_device(host_A % np.uint32(FMT_P1.MODP))  # gpuメモリ確保&転送
        B1 = drv.to_device(host_B % np.uint32(FMT_P1.MODP))  # gpuメモリ確保&転送
        A2 = drv.to_device(host_A % np.uint32(FMT_P2.MODP))  # gpuメモリ確保&転送
        B2 = drv.to_device(host_B % np.uint32(FMT_P2.MODP))  # gpuメモリ確保&転送

        #print("GPU計算開始")
        stime = time.time()
        E0 = FMT_P0.Convolution(A0, B0)
        E1 = FMT_P1.Convolution(A1, B1)
        E2 = FMT_P2.Convolution(A2, B2)
Exemple #50
0
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0, timing,
               kernel):
    """
    It computes the adjoint double layer potential.

    Arguments
    ----------
    XKt    : array, input for the adjoint double layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation points.
    Kt_diag: array, diagonal elements of the adjoint double layer integral
                    operator.
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode computation.
    timing : class, it contains timing information for different parts of the
                    code.
    kernel : pycuda source module.

    Returns
    --------
    Kt_lyr: array, adjoint double layer potential.
    """

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)

    tic.record()
    K = param.K
    w = getWeights(K)
    X_Kt = numpy.zeros(Ns * K)
    X_Ktc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_Kt[:] = XKt[NsK // K] * w[NsK % K] * surfSrc.area[NsK // K]
    X_Ktc[:] = XKt[NsK // K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns * K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_Kt,
                 X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_Kt = X_Kt[surfSrc.sortSource]
    X_Ktc = X_Ktc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    Ktx_aux = numpy.zeros(param.Nround)
    Kty_aux = numpy.zeros(param.Nround)
    Ktz_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(surfSrc, surfTar, Ktx_aux,
                                                   Kty_aux, Ktz_aux, self,
                                                   ind0.index_large, param,
                                                   LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(surfSrc, surfTar, X_Kt, X_Ktc,
                                               Ktx_aux, Kty_aux, Ktz_aux, self,
                                               LorY, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(surfSrc, surfTar, Ktx_gpu,
                                                  Kty_gpu, Ktz_gpu, self, ind0,
                                                  param, LorY, timing, kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(surfSrc, surfTar, X_Kt, X_Ktc,
                                              Ktx_gpu, Kty_gpu, Ktz_gpu, self,
                                              LorY, w, param, timing, kernel)

        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    Kt_lyr = (Ktx_aux[surfTar.unsort] * surfTar.normal[:, 0] +
              Kty_aux[surfTar.unsort] * surfTar.normal[:, 1] +
              Ktz_aux[surfTar.unsort] * surfTar.normal[:, 2])

    if abs(Kt_diag) > 1e-12:  # if same surface
        Kt_lyr += Kt_diag * XKt

    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return Kt_lyr
Exemple #51
0
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param,
            ind0, timing, kernel):
    """
    It computes the single and double layer potentials.

    Arguments
    ----------
    XK     : array, input for the double layer potential.
    XV     : array, input for the single layer potential.
    LorY   : int, Laplace (1) or Yukawa (2).
    surfSrc: class, source surface, the one that contains the gauss points.
    surfTar: class, target surface, the one that contains the collocation
                    points.
    K_diag : array, diagonal elements of the double layer integral operator.
    V_diag : array, diagonal elements of the single layer integral operator.
    IorE   : int, internal (1) or external (2).
    self   : int, position in the surface array of the source surface.
    param  : class, parameters related to the surface.
    ind0   : array, it contains the indices related to the treecode computation.
    timing : class, it contains timing information for different parts of the
                    code.
    kernel : pycuda source module.

    Returns
    --------
    K_lyr  : array, double layer potential.
    V_lyr  : array, single layer potential.
    """

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    L = numpy.sqrt(2 * surfSrc.area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V = numpy.zeros(Ns * K)
    X_Kx = numpy.zeros(Ns * K)
    X_Ky = numpy.zeros(Ns * K)
    X_Kz = numpy.zeros(Ns * K)
    X_Kc = numpy.zeros(Ns * K)
    X_Vc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_V[:] = XV[NsK // K] * w[NsK % K] * surfSrc.area[NsK // K]
    X_Kx[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 0]
    X_Ky[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 1]
    X_Kz[:] = XK[NsK // K] * w[NsK % K] * surfSrc.area[
        NsK // K] * surfSrc.normal[NsK // K, 2]
    X_Kc[:] = XK[NsK // K]
    X_Vc[:] = XV[NsK // K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_V,
                 X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    K_aux = numpy.zeros(param.Nround)
    V_aux = numpy.zeros(param.Nround)

    ### CPU code
    if param.GPU == 0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc,
                                X_Vc, K_aux, V_aux, self, LorY, K_diag, V_diag,
                                IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self, ind0,
                                   param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc,
                               X_Vc, K_gpu, V_gpu, self, LorY, K_diag, IorE, L,
                               w, param, timing, kernel)

        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return K_lyr, V_lyr