Code example #1
File: scattering.py Project: jaisw7/dgfs1D_gpu
    def perform_precomputation(self):
        # Precompute aa, bb1, bb2 (required for kernel)
        # compute l
        Nv = self.vm.Nv()
        Nrho = self.vm.Nrho()
        M = self.vm.M()
        L = self.vm.L()
        qz = self.vm.qz()
        qw = self.vm.qw()
        sz = self.vm.sz()
        sw = self.vm.sw()
        vsize = self.vm.vsize()
        szpre = self._szpre
        swpre = self._swpre

        check(self.cfg.dtype == np.float64,
              "Need to extend for single precision")

        # precision control
        dint = np.int32
        dfloat = np.float64
        dcplx = np.complex128

        if self.cfg.dtype == np.float32:
            dfloat = np.float32
            dcplx = np.complex64

        l0 = np.concatenate(
            (np.arange(0, Nv / 2, dtype=dint), np.arange(-Nv / 2,
                                                         0,
                                                         dtype=dint)))
        l = np.zeros((3, vsize), dtype=dint)
        for idv in range(vsize):
            I = int(idv / (Nv * Nv))
            J = int((idv % (Nv * Nv)) / Nv)
            K = int((idv % (Nv * Nv)) % Nv)
            l[0, idv] = l0[I]
            l[1, idv] = l0[J]
            l[2, idv] = l0[K]
        d_lx = gpuarray.to_gpu(np.ascontiguousarray(l[0, :]))
        d_ly = gpuarray.to_gpu(np.ascontiguousarray(l[1, :]))
        d_lz = gpuarray.to_gpu(np.ascontiguousarray(l[2, :]))

        # transfer sphere points to gpu
        d_sz_x = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 0]))
        d_sz_y = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 1]))
        d_sz_z = gpuarray.to_gpu(np.ascontiguousarray(sz[:, 2]))

        # define complex to complex plan
        rank = 3
        n = np.array([Nv, Nv, Nv], dtype=np.int32)

        #planD2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_D2Z)
        self.planZ2Z_MNrho = cufftPlanMany(rank, n.ctypes.data, None, 1, vsize,
                                           None, 1, vsize, CUFFT_Z2Z, M * Nrho)
        self.planZ2Z = cufftPlan3d(Nv, Nv, Nv, CUFFT_Z2Z)

        dfltargs = dict(
            dtype=self.cfg.dtypename,
            Nrho=Nrho,
            M=M,
            vsize=vsize,
            sw=sw,
            prefac=self._prefactor,
            cases=self._cases,
            masses=self.vm.masses(),
            qw=qw,
            qz=qz,
            L=L,
            sz=sz,
            gamma=self._gamma,
            eta=self._eta,
            Mpre=self._Mpre,
            szpre=szpre,
            swpre=swpre  #, Ne=self._Ne
        )
        src = DottedTemplateLookup('dgfs1D.bi.kernels.scattering',
                                   dfltargs).get_template(
                                       self.scattering_model).render()

        # Compile the source code and retrieve the kernel
        print("\nCompiling scattering kernels, this may take some time ...")
        module = compiler.SourceModule(src)

        self.block = (256, 1, 1)
        self.grid = get_grid_for_block(self.block, vsize)

        print("Starting precomputation, this may take some time ...")
        start, end = cuda.Event(), cuda.Event()
        cuda.Context.synchronize()
        start.record()
        start.synchronize()

        self.d_aa = gpuarray.empty(Nrho * M * vsize, dtype=dfloat)
        precompute_aa = get_kernel(module, "precompute_a", 'PPPP')
        precompute_aa.prepared_call(self.grid, self.block, d_lx.ptr, d_ly.ptr,
                                    d_lz.ptr, self.d_aa.ptr)

        self.d_bb1 = {}
        self.d_bb2 = {}
        precompute_bb = {}
        for cp, cq in self._cases:
            cpcq = str(cp) + str(cq)
            self.d_bb1[cpcq] = gpuarray.empty(Nrho * M * vsize, dtype=dcplx)
            self.d_bb2[cpcq] = gpuarray.zeros(vsize, dtype=dcplx)
            precompute_bb[cpcq] = module.get_function("precompute_bc_" + cpcq)
            precompute_bb[cpcq].prepare('IIdddPPPPPPPP')
            precompute_bb[cpcq].set_cache_config(cuda.func_cache.PREFER_L1)

            for p in range(Nrho):
                fac = np.pi / L * qz[p]
                fac_b = swpre * pow(qz[p], self._gamma[cpcq] + 2)
                fac_c = qw[p] * sw * fac_b
                for q in range(M):
                    precompute_bb[cpcq].prepared_call(
                        self.grid, self.block, dint(p), dint(q), dfloat(fac),
                        dfloat(fac_b), dfloat(fac_c), d_lx.ptr, d_ly.ptr,
                        d_lz.ptr, d_sz_x.ptr, d_sz_y.ptr, d_sz_z.ptr,
                        self.d_bb1[cpcq].ptr, self.d_bb2[cpcq].ptr)

        end.record()
        end.synchronize()
        secs = start.time_till(end) * 1e-3
        print("Finished precomputation in: %fs" % (secs))

        # transform scalar to complex
        self.r2zKern = module.get_function("r2z")
        self.r2zKern.prepare('IIIPP')
        self.r2zKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the cosSinMul kernel for execution
        self.cosSinMultKern = {}
        #self.computeQGKern = {}
        self.outKern = {}
        for cp, cq in self._cases:
            idx = str(cp) + str(cq)
            self.cosSinMultKern[idx] = module.get_function("cosSinMul_" + idx)
            self.cosSinMultKern[idx].prepare('PPPPP')
            self.cosSinMultKern[idx].set_cache_config(
                cuda.func_cache.PREFER_L1)

            #self.computeQGKern[idx] = module.get_function("computeQG_"+idx)
            #self.computeQGKern[idx].prepare('PPP')
            #self.computeQGKern[idx].set_cache_config(
            #    cuda.func_cache.PREFER_L1)

            self.outKern[idx] = module.get_function("output_" + idx)
            self.outKern[idx].prepare('IIIIPPPP')
            self.outKern[idx].set_cache_config(cuda.func_cache.PREFER_L1)

        # prepare the computeQG kernel
        self.computeQGKern = module.get_function("computeQG")
        self.computeQGKern.prepare('PPP')
        self.computeQGKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the prodKern kernel for execution
        self.prodKern = module.get_function("prod")
        self.prodKern.prepare('PPP')
        self.prodKern.set_cache_config(cuda.func_cache.PREFER_L1)

        # Prepare the ax kernel for execution
        self.ax2Kern = module.get_function("ax2")
        self.ax2Kern.prepare('PPP')
        self.ax2Kern.set_cache_config(cuda.func_cache.PREFER_L1)

        # define scratch  spaces
        self.d_FTf = gpuarray.empty(vsize, dtype=dcplx)
        self.d_FTg = gpuarray.empty(vsize, dtype=dcplx)
        self.d_f1C = gpuarray.empty_like(self.d_FTf)
        self.d_f2C = gpuarray.empty_like(self.d_FTf)
        self.d_QG = gpuarray.empty_like(self.d_FTf)
        self.d_t1 = gpuarray.empty(M * Nrho * vsize, dtype=dcplx)
        self.d_t2 = gpuarray.empty_like(self.d_t1)
        self.d_t3 = gpuarray.empty_like(self.d_t1)
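Example #1 leans on PyCUDA's prepared-call interface: a kernel is fetched with `get_function`, given an argument-type signature via `prepare` ('P' for a pointer, 'I' for an unsigned int, 'd' for a double, as in the struct module), and launched with `prepared_call(grid, block, ...)`. Below is a minimal, self-contained sketch of that pattern with a hypothetical `scale` kernel; it is not taken from the dgfs1D_gpu project.

# Minimal sketch of the prepare()/prepared_call() pattern (hypothetical kernel, not from the project).
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda import compiler

mod = compiler.SourceModule("""
__global__ void scale(double *x, unsigned int n, double a)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
""")

scale = mod.get_function("scale")
scale.prepare('PId')                               # pointer, unsigned int, double
scale.set_cache_config(cuda.func_cache.PREFER_L1)

n = 1 << 20
d_x = gpuarray.to_gpu(np.ones(n, dtype=np.float64))
block = (256, 1, 1)
grid = ((n + block[0] - 1) // block[0], 1)

scale.prepared_call(grid, block, d_x.ptr, np.uint32(n), np.float64(2.0))
assert np.allclose(d_x.get(), 2.0)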
Code example #2
File: fft.py Project: Jackmastr/cuda_sandbox
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import numpy as np 
from skcuda.fft import fft, Plan
import pycuda.driver as cuda
import skcuda.cublas as cublas
import skcuda

s = cuda.Event()
e = cuda.Event()
s.record()

nStreams = 8
stream = [cuda.Stream() for i in range(nStreams)]
N = 8192
print(skcuda.misc.get_current_device())

x = [np.asarray(np.random.rand(N // nStreams), np.float32) for i in range(nStreams)]
#x_pin = cuda.register_host_memory(x)
#xf = np.fft.fft(x)
x_gpu = [gpuarray.to_gpu_async(x[i], stream=stream[i]) for i in range(nStreams)]

xf_gpu = [gpuarray.empty((N // nStreams) // 2 + 1, np.complex64) for i in range(nStreams)]
plan = [Plan(x[0].shape, np.float32, np.complex64, stream=stream[i]) for i in range(nStreams)]
print(skcuda.misc.get_current_device())
for i in range(nStreams):
    fft(x_gpu[i], xf_gpu[i], plan[i])
    print(skcuda.misc.get_current_device())

x_pin = [xf_gpu[i].get_async(stream=stream[i]) for i in range(nStreams)]
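A hedged follow-up to the example above: `get_async` only enqueues the device-to-host copies, so the streams still need to be synchronized before `x_pin` is read on the host. A minimal sketch reusing the names from the snippet; the comparison against `np.fft.rfft` is an added check, not part of the original project.

# Hedged sketch: wait for the asynchronous copies, then compare against numpy's rfft.
for i in range(nStreams):
    stream[i].synchronize()

for i in range(nStreams):
    ref = np.fft.rfft(x[i]).astype(np.complex64)
    print(i, np.max(np.abs(x_pin[i] - ref)))

e.record()
e.synchronize()
print("total elapsed: %fs" % (s.time_till(e) * 1e-3))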
Code example #3
# reference: https://medium.com/@CIulius/five-different-ways-to-sum-vectors-in-pycuda-3f2d9409b139
import numpy as np

# --- PyCUDA initialization
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit

########
# MAIN #
########

start = cuda.Event()
end   = cuda.Event()

N = 100000

# --- Create random vectors on the CPU
h_a = np.random.randn(1, N)
h_b = np.random.randn(1, N)

# --- Set CPU arrays as single precision
h_a = h_a.astype(np.float32)
h_b = h_b.astype(np.float32)
h_c = np.empty_like(h_a)

d_a = gpuarray.to_gpu(h_a)
d_b = gpuarray.to_gpu(h_b)

start.record()
d_c = (d_a + d_b)
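The excerpt above records only the `start` event. A hedged sketch of how the timing is typically completed, reusing `start`, `end`, and the arrays from the snippet (this completion is not part of the code shown here):

# Hedged completion sketch: finish the event timing started above.
end.record()                          # mark the end of the GPU work
end.synchronize()                     # block the host until the end event is reached
secs = start.time_till(end) * 1e-3    # elapsed time in seconds

h_c = d_c.get()                       # copy the result back to the host
print("GPU sum of %d elements took %fs" % (N, secs))
print("max abs error vs. CPU:", np.max(np.abs(h_c - (h_a + h_b))))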
Code example #4
File: SparseSolve.py Project: bbkiwi/SpyderWork
def main_cg():
    from optparse import OptionParser

    parser = OptionParser(usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option(
        "-s",
        "--is-symmetric",
        action="store_true",
        help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1 / csr_mat.diagonal()

    print "building..."
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
            spmv.permute(
                gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print "start solve"
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count) +
                     csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print "residual norm: %g" % (la.norm(csr_mat * res - rhs) /
                                     la.norm(rhs))
        print(
            "size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
            "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                             it_count / elapsed, est_flops / elapsed / 1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
Code example #5
            if(con_prob[i]<min)
            {
                min = con_prob[i];
                predict = i;
            }
        }
        if (predict != (int)test_labels[idx]) {error[idx] += 1;}
    }
""")

discrete_predict = mod.get_function("discrete_predict")
discrete_likelihood = mod.get_function("discrete_likelihood")
gaussian_predict = mod.get_function("gaussian_predict")
gaussian_likelihood_mean = mod.get_function("gaussian_likelihood_mean")
gaussian_likelihood_var = mod.get_function("gaussian_likelihood_var")
end = drv.Event()

parser = ArgumentParser()
parser.add_argument("--mode",
                    type=int,
                    default=0,
                    help="descrete:0, continuous:1")
args = parser.parse_args()


def readmnist(mnist_dir, mode='training'):
    if mode == 'training':
        image_dir = os.path.join(mnist_dir, 'train-images-idx3-ubyte')
        label_dir = os.path.join(mnist_dir, 'train-labels-idx1-ubyte')
    elif mode == 'testing':
        image_dir = os.path.join(mnist_dir, 't10k-images-idx3-ubyte')
Code example #6
def main():
    import pycuda.gpuarray as gpuarray

    sizes = []
    times_gpu = []
    flops_gpu = []
    flops_cpu = []
    times_cpu = []

    from pycuda.tools import bitlog2

    max_power = bitlog2(drv.mem_get_info()[0]) - 2
    # they're floats, i.e. 4 bytes each
    for power in range(10, max_power):
        size = 1 << power
        print(size)
        sizes.append(size)
        a = gpuarray.zeros((size, ), dtype=numpy.float32)
        b = gpuarray.zeros((size, ), dtype=numpy.float32)
        b.fill(1)

        if power > 20:
            count = 100
        else:
            count = 1000

        # gpu -----------------------------------------------------------------
        start = drv.Event()
        end = drv.Event()
        start.record()

        for i in range(count):
            a + b

        end.record()
        end.synchronize()

        secs = start.time_till(end) * 1e-3

        times_gpu.append(secs / count)
        flops_gpu.append(size)
        del a
        del b

        # cpu -----------------------------------------------------------------
        a_cpu = numpy.random.randn(size).astype(numpy.float32)
        b_cpu = numpy.random.randn(size).astype(numpy.float32)

        # start timer
        from time import time

        start = time()
        for i in range(count):
            a_cpu + b_cpu
        secs = time() - start

        times_cpu.append(secs / count)
        flops_cpu.append(size)

    # calculate pseudo flops
    flops_gpu = [f / t for f, t in zip(flops_gpu, times_gpu)]
    flops_cpu = [f / t for f, t in zip(flops_cpu, times_cpu)]

    from pytools import Table

    tbl = Table()
    tbl.add_row((
        "Size",
        "Time GPU",
        "Size/Time GPU",
        "Time CPU",
        "Size/Time CPU",
        "GPU vs CPU speedup",
    ))
    for s, t, f, t_cpu, f_cpu in zip(sizes, times_gpu, flops_gpu, times_cpu,
                                     flops_cpu):
        tbl.add_row((s, t, f, t_cpu, f_cpu, f / f_cpu))
    print(tbl)
Code example #7
File: benchmark_gui.py Project: xmzzaa/PyTom
    stdTGPU2 = stdUnderMask(tempGPU, maskGPU, np.float32(meanVal), p=p)

    linearAdd(tempGPU, maskGPU, np.float64(meanTGPU.get()),
              np.float64(1. / stdTGPU.get()))
    tgpu = tempGPU.get()
    print(tvol.mean(), tvol.max(), tvol.min(), tvol.std())
    print(tgpu.mean(), tgpu.max(), tgpu.min(), tgpu.std())
    print()

    print(stdVal, meanVal, stdTGPU, meanTGPU, stdTGPU2)

    import sys
    sys.exit()
    #mask = paste_in_center(mask, np.zeros_like(volume))

    start = driver.Event()
    end = driver.Event()

    start.record()

    #scores, angles, plan = template_matching_gpu(volume, temp, mask, wedgeV, np.fft.fftshift(stdV).astype(np.float32), [[0,0,0],]*num_angles, return_cpu=True)

    end.record()
    end.synchronize()
    print('exec time (s):',
          start.time_till(end) * 1e-3,
          7000 * start.time_till(end) * 1e-3 / num_angles / 60)

    map = ((plan.templatePadded.get()))
    print(map.max(), map.min(), map.mean())
    mrcfile.new('templatePadded.mrc', map.astype(np.float32), overwrite=True)
Code example #8
def convolution_example(context):

    #create input data
    image_width = 1024
    image_height = 1024
    filter_width = 17
    filter_height = 17
    input_width = image_width + 2*(filter_width//2)
    input_height = image_height + 2*(filter_height//2)

    input_image = numpy.random.randn(input_width, input_height).astype(numpy.float32)
    filter = numpy.random.randn(filter_width, filter_height).astype(numpy.float32)
    output_image = numpy.zeros((image_width, image_height), dtype=numpy.float32)

    #move data to the GPU
    args = [output_image, input_image, filter]
    gpu_args = []
    for arg in args:
        gpu_args.append(drv.mem_alloc(arg.nbytes))
        drv.memcpy_htod(gpu_args[-1], arg)

    #read kernel into string
    with open('convolution.cu', 'r') as f:
        kernel_string = f.read()

    #get compute capability for compiling CUDA kernels
    devprops = { str(k): v for (k, v) in context.get_device().get_attributes().items() }
    cc = str(devprops['COMPUTE_CAPABILITY_MAJOR']) + str(devprops['COMPUTE_CAPABILITY_MINOR'])

    #compile the kernels
    module = SourceModule(kernel_string, arch='compute_' + cc, code='sm_' + cc,
                    cache_dir=False, no_extern_c=True)
    convolution = module.get_function("convolution_kernel")
    convolution_naive = module.get_function("convolution_kernel_naive")

    #setup thread block sizes
    threads = (32, 16, 1)
    grid = (int(numpy.ceil(image_width/float(threads[0]))), int(numpy.ceil(image_height/float(threads[1]))), 1)

    #compute reference using naive kernel
    reference = numpy.zeros_like(output_image)
    start = drv.Event()
    end = drv.Event()
    context.synchronize()
    start.record()
    convolution_naive(*gpu_args, block=threads, grid=grid, stream=None, shared=0)
    end.record()
    context.synchronize()
    print("convolution_kernel_naive took", end.time_since(start), "ms.")
    drv.memcpy_dtoh(reference, gpu_args[0])
    drv.memcpy_htod(gpu_args[0], output_image)

    #launch the kernel
    context.synchronize()
    start.record()
    convolution(*gpu_args, block=threads, grid=grid, stream=None, shared=0)
    end.record()
    context.synchronize()
    print("convolution_kernel took", end.time_since(start), "ms.")

    #copy output data back from GPU
    drv.memcpy_dtoh(output_image, gpu_args[0])

    #compare output with reference
    correct = numpy.allclose(output_image, reference, atol=1e-6)
    if not correct:
        print("TEST FAILED!")
    else:
        print("TEST PASSED!")
Code example #9
    hog_output = np.zeros((height, width)).ravel().astype(np.float32)

    # Number of threads per block
    n_TPB_x = int(16)
    n_TPB_y = int(16)

    # Number of thread blocks
    n_blocks_x = int(np.ceil(width / n_TPB_x))
    n_blocks_y = int(np.ceil(height / n_TPB_y))

    ###
    # Patch-based HoG kernel
    ###

    # Start the timer
    gpu_start_time = cu.Event()
    gpu_end_time = cu.Event()
    gpu_start_time.record()

    # Create device variables
    input_im_device = cu.mem_alloc(t_image.nbytes)
    cu.memcpy_htod(input_im_device, t_image)

    x_gradient_device = cu.mem_alloc(x_gradient.nbytes)
    cu.memcpy_htod(x_gradient_device, x_gradient)

    y_gradient_device = cu.mem_alloc(y_gradient.nbytes)
    cu.memcpy_htod(y_gradient_device, y_gradient)

    gradient_mag_device = cu.mem_alloc(gradient_mag.nbytes)
    cu.memcpy_htod(gradient_mag_device, gradient_mag)
Code example #10
def project_Kt(XKt, LorY, surfSrc, surfTar, Kt_diag, self, param, ind0, timing,
               kernel):

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    Nt = len(surfTar.triangle)
    L = numpy.sqrt(2 * surfSrc.Area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_Kt = numpy.zeros(Ns * K)
    X_Ktc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_Kt[:] = XKt[NsK // K] * w[NsK % K] * surfSrc.Area[NsK // K]
    X_Ktc[:] = XKt[NsK // K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    X_aux = numpy.zeros(Ns * K)
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_Kt,
                 X_aux, X_aux, X_aux, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_Kt = X_Kt[surfSrc.sortSource]
    X_Ktc = X_Ktc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    Ktx_aux = numpy.zeros(param.Nround)
    Kty_aux = numpy.zeros(param.Nround)
    Ktz_aux = numpy.zeros(param.Nround)
    AI_int = 0

    ### CPU code
    if param.GPU == 0:
        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_aux, Kty_aux, Ktz_aux = M2PKt_sort(surfSrc, surfTar, Ktx_aux,
                                                   Kty_aux, Ktz_aux, self,
                                                   ind0.index_large, param,
                                                   LorY, timing)

        Ktx_aux, Kty_aux, Ktz_aux = P2PKt_sort(surfSrc, surfTar, X_Kt, X_Ktc,
                                               Ktx_aux, Kty_aux, Ktz_aux, self,
                                               LorY, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        Ktx_gpu = cuda.to_device(Ktx_aux.astype(REAL))
        Kty_gpu = cuda.to_device(Kty_aux.astype(REAL))
        Ktz_gpu = cuda.to_device(Ktz_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            Ktx_gpu, Kty_gpu, Ktz_gpu = M2PKt_gpu(surfSrc, surfTar, Ktx_gpu,
                                                  Kty_gpu, Ktz_gpu, self, ind0,
                                                  param, LorY, timing, kernel)

        Ktx_gpu, Kty_gpu, Ktz_gpu = P2PKt_gpu(surfSrc, surfTar, X_Kt, X_Ktc,
                                              Ktx_gpu, Kty_gpu, Ktz_gpu, self,
                                              LorY, w, param, timing, kernel)

        tic.record()
        Ktx_aux = cuda.from_device(Ktx_gpu, len(Ktx_aux), dtype=REAL)
        Kty_aux = cuda.from_device(Kty_gpu, len(Kty_aux), dtype=REAL)
        Ktz_aux = cuda.from_device(Ktz_gpu, len(Ktz_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    Kt_lyr = Ktx_aux[surfTar.unsort]*surfTar.normal[:,0] \
           + Kty_aux[surfTar.unsort]*surfTar.normal[:,1] \
           + Ktz_aux[surfTar.unsort]*surfTar.normal[:,2]

    if abs(Kt_diag) > 1e-12:  # if same surface
        Kt_lyr += Kt_diag * XKt

    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return Kt_lyr
Code example #11
def project(XK, XV, LorY, surfSrc, surfTar, K_diag, V_diag, IorE, self, param,
            ind0, timing, kernel):

    if param.GPU == 1:
        tic = cuda.Event()
        toc = cuda.Event()
    else:
        tic = Event()
        toc = Event()

    REAL = param.REAL
    Ns = len(surfSrc.triangle)
    Nt = len(surfTar.triangle)
    L = numpy.sqrt(2 * surfSrc.Area)  # Representative length

    tic.record()
    K = param.K
    w = getWeights(K)
    X_V = numpy.zeros(Ns * K)
    X_Kx = numpy.zeros(Ns * K)
    X_Ky = numpy.zeros(Ns * K)
    X_Kz = numpy.zeros(Ns * K)
    X_Kc = numpy.zeros(Ns * K)
    X_Vc = numpy.zeros(Ns * K)

    NsK = numpy.arange(Ns * K)
    X_V[:] = XV[NsK // K] * w[NsK % K] * surfSrc.Area[NsK // K]
    X_Kx[:] = XK[NsK // K] * w[NsK % K] * surfSrc.Area[NsK // K] * surfSrc.normal[NsK // K, 0]
    X_Ky[:] = XK[NsK // K] * w[NsK % K] * surfSrc.Area[NsK // K] * surfSrc.normal[NsK // K, 1]
    X_Kz[:] = XK[NsK // K] * w[NsK % K] * surfSrc.Area[NsK // K] * surfSrc.normal[NsK // K, 2]
    X_Kc[:] = XK[NsK // K]
    X_Vc[:] = XV[NsK // K]

    toc.record()
    toc.synchronize()
    timing.time_mass += tic.time_till(toc) * 1e-3

    tic.record()
    C = 0
    getMultipole(surfSrc.tree, C, surfSrc.xj, surfSrc.yj, surfSrc.zj, X_V,
                 X_Kx, X_Ky, X_Kz, ind0, param.P, param.NCRIT)
    toc.record()
    toc.synchronize()
    timing.time_P2M += tic.time_till(toc) * 1e-3

    tic.record()
    for C in reversed(range(1, len(surfSrc.tree))):
        PC = surfSrc.tree[C].parent
        upwardSweep(surfSrc.tree, C, PC, param.P, ind0.II, ind0.JJ, ind0.KK,
                    ind0.index, ind0.combII, ind0.combJJ, ind0.combKK,
                    ind0.IImii, ind0.JJmjj, ind0.KKmkk, ind0.index_small,
                    ind0.index_ptr)
    toc.record()
    toc.synchronize()
    timing.time_M2M += tic.time_till(toc) * 1e-3

    tic.record()
    X_V = X_V[surfSrc.sortSource]
    X_Kx = X_Kx[surfSrc.sortSource]
    X_Ky = X_Ky[surfSrc.sortSource]
    X_Kz = X_Kz[surfSrc.sortSource]
    X_Kc = X_Kc[surfSrc.sortSource]
    X_Vc = X_Vc[surfSrc.sortSource]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    param.Nround = len(surfTar.twig) * param.NCRIT
    K_aux = numpy.zeros(param.Nround)
    V_aux = numpy.zeros(param.Nround)
    AI_int = 0

    ### CPU code
    if param.GPU == 0:
        K_aux, V_aux = M2P_sort(surfSrc, surfTar, K_aux, V_aux, self,
                                ind0.index_large, param, LorY, timing)

        K_aux, V_aux = P2P_sort(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc,
                                X_Vc, K_aux, V_aux, self, LorY, K_diag, V_diag,
                                IorE, L, w, param, timing)

    ### GPU code
    elif param.GPU == 1:
        K_gpu = cuda.to_device(K_aux.astype(REAL))
        V_gpu = cuda.to_device(V_aux.astype(REAL))

        if surfTar.offsetMlt[self, len(surfTar.twig)] > 0:
            K_gpu, V_gpu = M2P_gpu(surfSrc, surfTar, K_gpu, V_gpu, self, ind0,
                                   param, LorY, timing, kernel)

        K_gpu, V_gpu = P2P_gpu(surfSrc, surfTar, X_V, X_Kx, X_Ky, X_Kz, X_Kc,
                               X_Vc, K_gpu, V_gpu, self, LorY, K_diag, IorE, L,
                               w, param, timing, kernel)

        tic.record()
        K_aux = cuda.from_device(K_gpu, len(K_aux), dtype=REAL)
        V_aux = cuda.from_device(V_gpu, len(V_aux), dtype=REAL)
        toc.record()
        toc.synchronize()
        timing.time_trans += tic.time_till(toc) * 1e-3

    tic.record()
    K_lyr = K_aux[surfTar.unsort]
    V_lyr = V_aux[surfTar.unsort]
    toc.record()
    toc.synchronize()
    timing.time_sort += tic.time_till(toc) * 1e-3

    return K_lyr, V_lyr
Code example #12
def estimate_diameter_gpu(skel,
                          data,
                          lat_data,
                          azth_data,
                          n_scan_angles,
                          max_iters=150,
                          do_reshape=True):
    """Computes 3D diameter at every point of a skeleton of data using GPU.

    Estimates 3D diameter at every point of the skeleton `skel` extracted from the binary
    data `data` with the help of orientation information provided by `lat_data` and `azth_data`
    arrays. The diameter is evaluated with a ray casting approach `cast_ray` adapted for
    a 3D case.

    Parameters
    ----------
    skel : 3D array
        Indicates the skeleton of the binary data.

    data : 3D array
        Indicates the 3D binary data.

    lat_data : 3D array
        Indicates the 3D array containing latitude / elevation angle at every point of
        the skeleton in radians.

    azth_data : 3D array
        Indicates the 3D array containing azimuth angle at every point of the skeleton
        in radians.

    n_scan_angles : int
        Indicates the number of scanning angles on a range [0, 360] degrees.

    max_iters : int
        Indicates the maximum length of a ray in each direction.

    do_reshape : boolean
        Specifies if the output array should be reshaped immediately after estimation.

    Returns
    -------
    out : dict
        The dictionary of the 3D array of estimated diameter and the execution time.
    """
    if not cuda_available:
        print('The pycuda package is not found. The diameter estimation cannot be done.')
        return None

    program, diameter3d = _diameter_kernel()

    Z, Y, X = np.int32(skel.nonzero())
    depth, height, width = np.uint32(skel.shape)

    scan_angl_arr = np.deg2rad(
        np.float32(np.linspace(0, 360, num=n_scan_angles, endpoint=False)))
    radius_arr = np.zeros_like(Z, dtype=np.float32)

    lat_data_1d = lat_data[skel.nonzero()]
    azth_data_1d = azth_data[skel.nonzero()]

    gpu_X = gpuarray.to_gpu(X)
    gpu_Y = gpuarray.to_gpu(Y)
    gpu_Z = gpuarray.to_gpu(Z)

    gpu_lat_data_1d = gpuarray.to_gpu(lat_data_1d)
    gpu_azth_data_1d = gpuarray.to_gpu(azth_data_1d)

    gpu_radius_arr = gpuarray.to_gpu(radius_arr)
    gpu_scan_angl_arr = gpuarray.to_gpu(scan_angl_arr)

    gpu_rad_tex = program.get_texref('tex_data')
    gpu_data = numpy3d_to_array(data)
    gpu_rad_tex.set_array(gpu_data)

    n_scan_angles = np.uint32(n_scan_angles)
    n_points = np.uint32(len(Z))
    max_iters = np.uint32(max_iters)
    norm_factor = np.float32(1. / n_scan_angles)

    block = (16, 16, 1)
    n_blocks = np.ceil(float(n_points) / (block[0] * block[1]))
    g_cols = 2
    g_rows = int(np.ceil(n_blocks / g_cols))
    grid = (g_rows, g_cols, 1)

    start = cuda.Event()
    end = cuda.Event()

    start.record()  # start timing
    diameter3d(width,
               height,
               depth,
               n_points,
               norm_factor,
               max_iters,
               n_scan_angles,
               gpu_X,
               gpu_Y,
               gpu_Z,
               gpu_scan_angl_arr,
               gpu_azth_data_1d,
               gpu_lat_data_1d,
               gpu_radius_arr,
               block=block,
               grid=grid)
    end.record()  # end timing
    end.synchronize()

    dm_time = start.time_till(end) * 1e-3
    print "Diameter estimation time: %fs" % (dm_time)

    radius_arr = gpu_radius_arr.get()

    if do_reshape:
        radius_arr = np.reshape(radius_arr, data.shape, order='C')

    out = {'diameter': radius_arr * 2., 'time': dm_time}
    return out
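A hedged usage sketch for `estimate_diameter_gpu`, with small synthetic inputs; the array shapes and values below are placeholders for illustration and are not taken from the project.

import numpy as np

# Hypothetical synthetic inputs: a straight tube along the z-axis and its one-voxel skeleton.
data = np.zeros((64, 64, 64), dtype=np.float32)
data[28:36, 28:36, :] = 1.0
skel = np.zeros(data.shape, dtype=np.uint8)
skel[32, 32, :] = 1
lat_data = np.zeros(data.shape, dtype=np.float32)   # elevation angle (radians) at skeleton points
azth_data = np.zeros(data.shape, dtype=np.float32)  # azimuth angle (radians) at skeleton points

result = estimate_diameter_gpu(skel, data, lat_data, azth_data,
                               n_scan_angles=32, do_reshape=False)
if result is not None:
    # one value per skeleton voxel, in the order of skel.nonzero()
    print("median diameter (voxels):", np.median(result['diameter']))
    print("estimation time: %fs" % result['time'])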
Code example #13
                                    onembed.ctypes.data, 1,
                                    39 * BENG_CHANNELS_ + 1, cufft.CUFFT_R2C,
                                    1)

# Turn trimmed spectrum into 2048 timeseries
n = array([32 * 2 * BENG_CHANNELS_], int32)
inembed = array([39 * BENG_CHANNELS_ + 1], int32)
onembed = array([32 * 2 * BENG_CHANNELS_], int32)
plan_interp_B = cufft.cufftPlanMany(1, n.ctypes.data, inembed.ctypes.data, 1,
                                    39 * BENG_CHANNELS_ + 1,
                                    onembed.ctypes.data, 1,
                                    32 * 2 * BENG_CHANNELS_, cufft.CUFFT_C2R,
                                    1)

# event timers
tic = cuda.Event()
toc = cuda.Event()

# allocate device memory
# reader
gpu_vdif_buf = cuda.mem_alloc(cpu_vdif_buf.nbytes)
gpu_beng_data_0 = cuda.mem_alloc(8 * BENG_CHANNELS_ * BENG_SNAPSHOTS *
                                 BENG_BUFFER_IN_COUNTS)
gpu_beng_data_1 = cuda.mem_alloc(8 * BENG_CHANNELS_ * BENG_SNAPSHOTS *
                                 BENG_BUFFER_IN_COUNTS)
gpu_fid = cuda.mem_alloc(4 * VDIF_PER_BENG * BENG_BUFFER_IN_COUNTS)
gpu_cid = cuda.mem_alloc(4 * VDIF_PER_BENG * BENG_BUFFER_IN_COUNTS)
gpu_bcount = cuda.mem_alloc(4 * VDIF_PER_BENG * BENG_BUFFER_IN_COUNTS)
gpu_beng_frame_completion = cuda.mem_alloc(4 * BENG_BUFFER_IN_COUNTS)
# reorder/resample
# buffers
Code example #14
File: gpu_test9.py Project: eretana/hera_sandbox
def vis_gpu(antpos,
            freq,
            eq2tops,
            crd_eq,
            I_sky,
            bm_cube,
            nthreads=NTHREADS,
            max_memory=MAX_MEMORY,
            real_dtype=np.float32,
            complex_dtype=np.complex64,
            verbose=False):
    # ensure shapes
    nant = antpos.shape[0]
    assert (antpos.shape == (nant, 3))
    npix = crd_eq.shape[1]
    assert (crd_eq.shape == (3, npix))
    assert (I_sky.shape == (npix, ))
    beam_px = bm_cube.shape[1]
    assert (bm_cube.shape == (nant, beam_px, beam_px))
    ntimes = eq2tops.shape[0]
    assert (eq2tops.shape == (ntimes, 3, 3))
    # ensure data types
    antpos = antpos.astype(real_dtype)
    eq2tops = eq2tops.astype(real_dtype)
    crd_eq = crd_eq.astype(real_dtype)
    Isqrt = np.sqrt(I_sky).astype(real_dtype)
    bm_cube = bm_cube.astype(real_dtype)  # XXX complex?
    chunk = max(min(npix, MIN_CHUNK),
                2**int(ceil(np.log2(float(nant * npix) / max_memory / 2))))
    npixc = npix // chunk
    # blocks of threads are mapped to (pixels,ants,freqs)
    block = (max(1, nthreads // nant), min(nthreads, nant), 1)
    grid = (int(ceil(npixc / float(block[0]))),
            int(ceil(nant / float(block[1]))))
    gpu_code = GPU_TEMPLATE % {
        'NANT': nant,
        'NPIX': npixc,
        'BEAM_PX': beam_px,
        'BLOCK_PX': block[0],
    }
    gpu_module = compiler.SourceModule(gpu_code)
    bm_interp = gpu_module.get_function("InterpolateBeam")
    meas_eq = gpu_module.get_function("MeasEq")
    bm_texref = gpu_module.get_texref("bm_tex")
    import pycuda.autoinit
    h = cublasCreate()  # handle for managing cublas
    # define GPU buffers and transfer initial values
    bm_texref.set_array(
        numpy3d_to_array(bm_cube)
    )  # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT)
    antpos_gpu = gpuarray.to_gpu(
        antpos)  # never changes, set to -2*pi*antpos/c
    Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype)
    A_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=real_dtype)  # will be set on GPU by bm_interp
    crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype)
    eq2top_gpu = gpuarray.empty(shape=(3, 3),
                                dtype=real_dtype)  # sent from CPU each time
    crdtop_gpu = gpuarray.empty(shape=(3, npixc),
                                dtype=real_dtype)  # will be set on GPU
    tau_gpu = gpuarray.empty(shape=(nant, npixc),
                             dtype=real_dtype)  # will be set on GPU
    v_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=complex_dtype)  # will be set on GPU
    vis_gpus = [
        gpuarray.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in range(chunk)
    ]
    # output CPU buffers for downloading answers
    vis_cpus = [
        np.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in range(chunk)
    ]
    streams = [driver.Stream() for i in range(chunk)]
    event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate',
                   'meas_eq', 'vis', 'end')
    vis = np.empty((ntimes, nant, nant), dtype=complex_dtype)
    for t in range(ntimes):
        if verbose: print('%d/%d' % (t + 1, ntimes))
        eq2top_gpu.set(
            eq2tops[t])  # defines sky orientation for this time step
        events = [{e: driver.Event()
                   for e in event_order} for i in range(chunk)]
        for c in range(chunk + 2):
            cc = c - 1
            ccc = c - 2
            if 0 <= ccc < chunk:
                stream = streams[ccc]
                vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream)
                events[ccc]['end'].record(stream)
            if 0 <= cc < chunk:
                stream = streams[cc]
                cublasSetStream(h, stream.handle)
                ## compute crdtop = dot(eq2top,crd_eq)
                # cublas arrays are in Fortran order, so P=M*N is actually
                # performed as P.T = N.T * M.T
                cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata,
                            npixc, eq2top_gpu.gpudata, 3, 0.,
                            crdtop_gpu.gpudata, npixc)
                events[cc]['eq2top'].record(stream)
                ## compute tau = dot(antpos,crdtop)
                cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1.,
                            crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3,
                            0., tau_gpu.gpudata, npixc)
                events[cc]['tau'].record(stream)
                ## interpolate bm_tex at specified topocentric coords, store interpolation in A
                ## threads are parallelized across pixel axis
                bm_interp(crdtop_gpu,
                          A_gpu,
                          grid=grid,
                          block=block,
                          stream=stream)
                events[cc]['interpolate'].record(stream)
                # compute v = A * I * exp(1j*tau*freq)
                meas_eq(A_gpu,
                        Isqrt_gpu,
                        tau_gpu,
                        real_dtype(freq),
                        v_gpu,
                        grid=grid,
                        block=block,
                        stream=stream)
                events[cc]['meas_eq'].record(stream)
                # compute vis = dot(v, v.T)
                # transpose below incurs about 20% overhead
                cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata,
                            npixc, v_gpu.gpudata, npixc, 0.,
                            vis_gpus[cc].gpudata, nant)
                events[cc]['vis'].record(stream)
            if c < chunk:
                stream = streams[c]
                events[c]['start'].record(stream)
                crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc],
                                     stream=stream)
                Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc],
                                    stream=stream)
                events[c]['upload'].record(stream)
        events[chunk - 1]['end'].synchronize()
        vis[t] = sum(vis_cpus)
        if verbose:
            for c in range(chunk):
                print('%d:%d START->END:' % (c, chunk),
                      events[c]['start'].time_till(events[c]['end']) * 1e-3)
                #for i,e in enumerate(event_order[:-1]):
                #    print(c, e, '->', event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3)
            print('TOTAL:', events[0]['start'].time_till(
                events[chunk - 1]['end']) * 1e-3)
    # teardown GPU configuration
    cublasDestroy(h)
    return vis
Code example #15
File: tmpl8_driver.py Project: DeLaVlag/tvb-root
	def run_simulation(self):

		# setup data#{{{
		data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T }
		base_shape = self.n_work_items,
		for name, shape in dict(
			tavg0=(self.exposures, self.args.n_regions,),
			tavg1=(self.exposures, self.args.n_regions,),
			state=(self.buf_len, self.states * self.args.n_regions),
			).items():
			# memory error exception for compute device
			try:
				data[name] = np.zeros(shape + base_shape, 'f')
			except MemoryError as e:
				self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are too large '
							 'for this compute device',
							 e, self.args.n_sweep_arg0, self.args.n_sweep_arg1)
				exit(1)

		gpu_data = self.make_gpu_data(data)#{{{

		# setup CUDA stuff#{{{
		step_fn = self.make_kernel(
			source_file=self.args.filename,
			warp_size=32,
			# block_dim_x=self.args.n_sweep_arg0,
			# ext_options=preproccesor_defines,
			# caching=args.caching,
			args=self.args,
			lineinfo=self.args.lineinfo,
			nh=self.buf_len,
			)#}}}

		# setup simulation#{{{
		tic = time.time()

		n_streams = 32
		streams = [drv.Stream() for i in range(n_streams)]
		events = [drv.Event() for i in range(n_streams)]
		tavg_unpinned = []

		try:
			tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32)
		except drv.MemoryError as e:
			self.logger.error(
				'%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU',
				e, self.params.size)
			exit(1)

		# determine optimal grid recursively
		def dog(fgd):
			maxgd, mingd = max(fgd), min(fgd)
			maxpos = fgd.index(max(fgd))
			if (maxgd - 1) * mingd * bx * by >= nwi:
				fgd[maxpos] = fgd[maxpos] - 1
				dog(fgd)
			else:
				return fgd

		# n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y
		# form an optimal grid recursively
		bx, by = self.args.blockszx, self.args.blockszy
		nwi = self.n_work_items
		rootnwi = int(np.ceil(np.sqrt(nwi)))
		gridx = int(np.ceil(rootnwi / bx))
		gridy = int(np.ceil(rootnwi / by))

		final_block_dim = bx, by, 1

		fgd = [gridx, gridy]
		dog(fgd)
		final_grid_dim = fgd[0], fgd[1]

		assert gridx * gridy * bx * by >= nwi

		self.logger.info('history shape %r', gpu_data['state'].shape)
		self.logger.info('gpu_data %s', gpu_data['tavg0'].shape)
		self.logger.info('on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, ))
		self.logger.info('final block dim %r', final_block_dim)
		self.logger.info('final grid dim %r', final_grid_dim)

		# run simulation#{{{
		nstep = self.args.n_time

		self.gpu_mem_info() if self.args.verbose else None

		try:
			for i in tqdm.trange(nstep, file=sys.stdout):

				try:
					event = events[i % n_streams]
					stream = streams[i % n_streams]

					if i > 0:
						stream.wait_for_event(events[(i - 1) % n_streams])

					step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len),
							np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt),
							gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'],
							gpu_data['tavg%d' % (i%2,)],
							block=final_block_dim, grid=final_grid_dim)

					event.record(streams[i % n_streams])
				except drv.LaunchError as e:
					self.logger.error('%s', e)
					exit(1)

				tavgk = 'tavg%d' % ((i + 1) % 2,)

				# async wrt. other streams & host, but not this stream.
				if i >= n_streams:
					stream.synchronize()
					tavg_unpinned.append(tavg[i % n_streams].copy())

				drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream)

			# recover uncopied data from pinned buffer
			if nstep > n_streams:
				for i in range(nstep % n_streams, n_streams):
					stream.synchronize()
					tavg_unpinned.append(tavg[i].copy())

			for i in range(nstep % n_streams):
				stream.synchronize()
				tavg_unpinned.append(tavg[i].copy())

		except drv.LogicError as e:
			self.logger.error('%s. Check the number of states of the model or '
						 'GPU block shape settings blockdim.x/y %r, griddim %r.',
						 e, final_block_dim, final_grid_dim)
			exit(1)
		except drv.RuntimeError as e:
			self.logger.error('%s', e)
			exit(1)


		# self.logger.info('kernel finish..')
		# release pinned memory
		tavg = np.array(tavg_unpinned)

		# also release gpu_data
		self.release_gpumem(gpu_data)

		self.logger.info('kernel finished')
		return tavg
Code example #16
File: chyqmom27_script.py Project: qwang3/PyQBMMlib
def chyqmom27(
    moments: np.ndarray, 
    size: int):

    mem_d_size_in_byte = np.ones(size).astype(np.float32).nbytes
    sizeof_float = np.int32(np.dtype(np.float32).itemsize)
    size = np.int32(size)

    BlockSize = (256, 1, 1)
    GridSize = (size + BlockSize[0] - 1) // BlockSize[0]
    GridSize = (int(GridSize), 1, 1)

    # compile kernel
    HYQ = SourceModule(HYQMOM)
    CHY27 = SourceModule(CHYQMOM27)
    hyqmom3 = HYQ.get_function('hyqmom3')

    c_kernel = CHY27.get_function('chyqmom27_cmoments')
    chyqmom27_rho_yf = CHY27.get_function('chyqmom27_rho_yf')
    chyqmom27_zf = CHY27.get_function('chyqmom27_zf')
    chyqmom27_mu = CHY27.get_function('chyqmom27_mu')
    float_value_set = CHY27.get_function('float_value_set')
    float_array_set = CHY27.get_function('float_array_set')
    chyqmom27_set_m = CHY27.get_function('chyqmom27_set_m')
    print_device = CHY27.get_function('print_device')
    chyqmom27_wout = CHY27.get_function('chyqmom27_wout')
    chyqmom27_xout = CHY27.get_function('chyqmom27_xout')
    chyqmom27_yout = CHY27.get_function('chyqmom27_yout')
    chyqmom27_zout = CHY27.get_function('chyqmom27_zout')

    w = cuda.aligned_zeros((27, int(size)), dtype=np.float32)
    x = cuda.aligned_zeros((27, int(size)), dtype=np.float32)
    y = cuda.aligned_zeros((27, int(size)), dtype=np.float32)
    z = cuda.aligned_zeros((27, int(size)), dtype=np.float32)

    # Allocate memory 
    moments_device = cuda.mem_alloc(int(sizeof_float * size * 16))
    c_moments = cuda.mem_alloc(int(sizeof_float * size * 12))

    m = cuda.mem_alloc(int(sizeof_float * size * 10))
    float_value_set(m, np.float32(1), size, np.int32(0), block=BlockSize, grid=GridSize)
    float_value_set(m, np.float32(0), size, size, block=BlockSize, grid=GridSize)

    w1 = cuda.mem_alloc(int(sizeof_float * size * 3))
    x1 = cuda.mem_alloc(int(sizeof_float * size * 3))

    w2 = cuda.mem_alloc(int(sizeof_float * size * 9))
    x2 = cuda.mem_alloc(int(sizeof_float * size * 9))
    y2 = cuda.mem_alloc(int(sizeof_float * size * 9))

    rho = cuda.mem_alloc(int(sizeof_float * size * 9))
    yf = cuda.mem_alloc(int(sizeof_float * size * 3))
    yp = cuda.mem_alloc(int(sizeof_float * size * 9))
    zf = cuda.mem_alloc(int(sizeof_float * size * 3))

    w3 = cuda.mem_alloc(int(sizeof_float * size * 3))
    x3 = cuda.mem_alloc(int(sizeof_float * size * 3))

    mu = cuda.mem_alloc(int(sizeof_float * size * 3))

    w_dev = cuda.mem_alloc(int(sizeof_float * size * 27))
    x_dev = cuda.mem_alloc(int(sizeof_float * size * 27))
    y_dev = cuda.mem_alloc(int(sizeof_float * size * 27))
    z_dev = cuda.mem_alloc(int(sizeof_float * size * 27))

    cuda.memcpy_htod(moments_device, moments)
    # Is this faster? 

    time_before = cuda.Event()
    time_after = cuda.Event()

    time_before.record()

    c_kernel(moments_device, c_moments, size, block=BlockSize, grid=GridSize)
    float_array_set(m, c_moments, size, np.int32(2) * size, np.int32(0), block=BlockSize, grid=GridSize)
    float_array_set(m, c_moments, size, np.int32(3) * size, np.int32(6) * size, block=BlockSize, grid=GridSize)
    float_array_set(m, c_moments, size, np.int32(4) * size, np.int32(9) * size, block=BlockSize, grid=GridSize)

    # print("What is m1?")
    # print_device(m, np.int32(5), block=BlockSize, grid=GridSize)

    hyqmom3(m, x1, w1, size, block=BlockSize, grid=GridSize)

    # Is this faster? 
    chyqmom27_set_m(m, c_moments, size, block=BlockSize, grid=GridSize)

    # this_context.synchronize()
    # print_device(m, np.int32(10), block=BlockSize, grid=GridSize)
    # this_context.synchronize()
    # print("Entering CHYQMOM9")
    chyqmom9(m, size, w2, x2, y2)

    # this_context.synchronize()
    # print("What is w2?")
    # print_device(w2, np.int32(10), block=BlockSize, grid=GridSize)


    chyqmom27_rho_yf(c_moments, y2, w2, rho, yf, yp, size, block=BlockSize, grid=GridSize)
    chyqmom27_zf(c_moments, x1, zf, size, block=BlockSize, grid=GridSize) 
    chyqmom27_mu(c_moments, rho, zf, mu, size, block=BlockSize, grid=GridSize)

    float_array_set(m, mu, size, np.int32(2) * size, np.int32(0), block=BlockSize, grid=GridSize)
    float_array_set(m, mu, size, np.int32(3) * size, np.int32(1) * size, block=BlockSize, grid=GridSize)
    float_array_set(m, mu, size, np.int32(4) * size, np.int32(2) * size, block=BlockSize, grid=GridSize)
    hyqmom3(m, x3, w3, size, block=BlockSize, grid=GridSize)

    chyqmom27_wout(moments_device, w1, rho, w3, w_dev, size, block=BlockSize, grid=GridSize)
    chyqmom27_xout(moments_device, x1, x_dev, size, block=BlockSize, grid=GridSize)
    chyqmom27_yout(moments_device, yf, yp, y_dev, size, block=BlockSize, grid=GridSize)
    chyqmom27_zout(moments_device, zf, x3, z_dev, block=BlockSize, grid=GridSize)

    time_after.record()
    time_after.synchronize()
    elapsed_time =  time_after.time_since(time_before)
    
    cuda.memcpy_dtoh(w, w_dev)
    cuda.memcpy_dtoh(x, x_dev)
    cuda.memcpy_dtoh(y, y_dev)
    cuda.memcpy_dtoh(z, z_dev)

    # this_context.synchronize()
    # print("Entering rho")
    # print_device(rho, np.int32(9*2), block=BlockSize, grid=GridSize)
    # this_context.synchronize()
    # print("Entering mu")
    # print_device(mu, np.int32(3*2), block=BlockSize, grid=GridSize)
    # this_context.synchronize()
    # print("Entering w1")
    # print_device(w1, np.int32(3*2), block=BlockSize, grid=GridSize)
    # this_context.synchronize()
    # print("Entering rho")
    # print_device(rho, np.int32(9*2), block=BlockSize, grid=GridSize)
    # this_context.synchronize()
    # print("Entering w3")
    # print_device(w3, np.int32(3*2), block=BlockSize, grid=GridSize)
    # this_context.synchronize()
    # print("Final w_dev")
    # print_device(w_dev, np.int32(27*1), block=BlockSize, grid=GridSize)

    moments_device.free()
    c_moments.free()

    m.free()
    w1.free()
    x1.free()

    w2.free()
    x2.free()
    y2.free()

    rho.free()
    yf.free()
    yp.free()
    zf.free()

    w3.free()
    x3.free()

    mu.free()

    return elapsed_time, w_dev, x_dev, y_dev, z_dev
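A hedged usage note for `chyqmom27`: the returned `w_dev`, `x_dev`, `y_dev`, `z_dev` are raw device allocations that the caller is expected to read from and free. A minimal sketch follows; the moment values and their (16, size) float32 layout are placeholders, not a physically meaningful moment set.

import numpy as np
import pycuda.driver as cuda

# Placeholder input: `size` points with 16 float32 moments each (illustrative values only).
size = 1024
moments = np.ones((16, size), dtype=np.float32)

elapsed_ms, w_dev, x_dev, y_dev, z_dev = chyqmom27(moments, size)
print("chyqmom27 kernel time: %f ms" % elapsed_ms)   # Event.time_since reports milliseconds

w = np.empty((27, size), dtype=np.float32)
cuda.memcpy_dtoh(w, w_dev)          # copy one of the outputs back to the host

for buf in (w_dev, x_dev, y_dev, z_dev):
    buf.free()                      # device buffers are not freed inside chyqmom27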
Code example #17
        prev_layer = None
        for layer in layers:
            layer.fprop()
            if layer.weights is not None:
                mean = layer.get_activation_mean()
                scale = .5  #if prev_layer is None else prev_layer.reduction_factor()
                print("Scale weights: %.3f (%.3f) %s" %
                      (scale / mean, scale, layer))
                layer.weights *= scale / mean
                layer.fprop()

            prev_layer = layer

        ng.bench = layer_bench

        start = drv.Event()
        end = drv.Event()

        fprop_time = 0
        bprop_time = 0
        fprop_flops = 0
        bprop_flops = 0

        # We throw away the first run as it includes pycuda kernel loading times.
        # So add 1 to our loop count.
        for loop in range(loops + 1):

            start.record()
            flops = 0

            #fprop
Code example #18
def sync_only():
    sync_gpu(driver.Event())
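`sync_gpu` itself is not shown in this snippet; a hedged guess at what such a helper does, based on the event API used throughout these examples (record the event, then block the host until it fires):

def sync_gpu(evt):
    # Record the event in the default stream and block the host until it completes.
    evt.record()
    evt.synchronize()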
Code example #19
File: model_build_run.py Project: msdey/gprMax
def solve_gpu(currentmodelrun, modelend, G):
    """Solving using FDTD method on GPU. Implemented using Nvidia CUDA.

    Args:
        currentmodelrun (int): Current model run number.
        modelend (int): Number of last model to run.
        G (class): Grid class instance - holds essential parameters describing the model.

    Returns:
        tsolve (float): Time taken to execute solving
    """

    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    drv.init()

    # Suppress nvcc warnings on Windows
    if sys.platform == 'win32':
        compiler_opts = ['-w']
    else:
        compiler_opts = None

    # Create device handle and context on specific GPU device (and make it current context)
    dev = drv.Device(G.gpu.deviceID)
    ctx = dev.make_context()

    # Electric and magnetic field updates - prepare kernels, and get kernel functions
    if Material.maxpoles > 0:
        kernels_fields = SourceModule(kernels_template_fields.substitute(
            REAL=cudafloattype,
            COMPLEX=cudacomplextype,
            N_updatecoeffsE=G.updatecoeffsE.size,
            N_updatecoeffsH=G.updatecoeffsH.size,
            NY_MATCOEFFS=G.updatecoeffsE.shape[1],
            NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1],
            NX_FIELDS=G.Ex.shape[0],
            NY_FIELDS=G.Ex.shape[1],
            NZ_FIELDS=G.Ex.shape[2],
            NX_ID=G.ID.shape[1],
            NY_ID=G.ID.shape[2],
            NZ_ID=G.ID.shape[3],
            NX_T=G.Tx.shape[1],
            NY_T=G.Tx.shape[2],
            NZ_T=G.Tx.shape[3]),
                                      options=compiler_opts)
    else:  # Set to one any substitutions for dispersive materials
        kernels_fields = SourceModule(kernels_template_fields.substitute(
            REAL=cudafloattype,
            COMPLEX=cudacomplextype,
            N_updatecoeffsE=G.updatecoeffsE.size,
            N_updatecoeffsH=G.updatecoeffsH.size,
            NY_MATCOEFFS=G.updatecoeffsE.shape[1],
            NY_MATDISPCOEFFS=1,
            NX_FIELDS=G.Ex.shape[0],
            NY_FIELDS=G.Ex.shape[1],
            NZ_FIELDS=G.Ex.shape[2],
            NX_ID=G.ID.shape[1],
            NY_ID=G.ID.shape[2],
            NZ_ID=G.ID.shape[3],
            NX_T=1,
            NY_T=1,
            NZ_T=1),
                                      options=compiler_opts)
    update_e_gpu = kernels_fields.get_function("update_e")
    update_h_gpu = kernels_fields.get_function("update_h")

    # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels
    updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0]
    updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0]
    if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem:
        raise GeneralError(
            'Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'
            .format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name))
    else:
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)

    # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU
    if Material.maxpoles > 0:  # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values).
        update_e_dispersive_A_gpu = kernels_fields.get_function(
            "update_e_dispersive_A")
        update_e_dispersive_B_gpu = kernels_fields.get_function(
            "update_e_dispersive_B")
        G.gpu_initialise_dispersive_arrays()

    # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU
    G.gpu_set_blocks_per_grid()
    G.gpu_initialise_arrays()

    # PML updates
    if G.pmls:
        # Prepare kernels
        kernels_pml = SourceModule(kernels_template_pml.substitute(
            REAL=cudafloattype,
            N_updatecoeffsE=G.updatecoeffsE.size,
            N_updatecoeffsH=G.updatecoeffsH.size,
            NY_MATCOEFFS=G.updatecoeffsE.shape[1],
            NY_R=G.pmls[0].ERA.shape[1],
            NX_FIELDS=G.Ex.shape[0],
            NY_FIELDS=G.Ex.shape[1],
            NZ_FIELDS=G.Ex.shape[2],
            NX_ID=G.ID.shape[1],
            NY_ID=G.ID.shape[2],
            NZ_ID=G.ID.shape[3]),
                                   options=compiler_opts)
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels
        updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        # Set block per grid, initialise arrays on GPU, and get kernel functions
        for pml in G.pmls:
            pml.gpu_set_blocks_per_grid(G)
            pml.gpu_initialise_arrays()
            pml.gpu_get_update_funcs(kernels_pml)

    # Receivers
    if G.rxs:
        # Initialise arrays on GPU
        rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G)
        # Prepare kernel and get kernel function
        kernel_store_outputs = SourceModule(
            kernel_template_store_outputs.substitute(REAL=cudafloattype,
                                                     NY_RXCOORDS=3,
                                                     NX_RXS=6,
                                                     NY_RXS=G.iterations,
                                                     NZ_RXS=len(G.rxs),
                                                     NX_FIELDS=G.Ex.shape[0],
                                                     NY_FIELDS=G.Ex.shape[1],
                                                     NZ_FIELDS=G.Ex.shape[2]),
            options=compiler_opts)
        store_outputs_gpu = kernel_store_outputs.get_function("store_outputs")

    # Sources - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.voltagesources + G.hertziandipoles + G.magneticdipoles:
        kernels_sources = SourceModule(kernels_template_sources.substitute(
            REAL=cudafloattype,
            N_updatecoeffsE=G.updatecoeffsE.size,
            N_updatecoeffsH=G.updatecoeffsH.size,
            NY_MATCOEFFS=G.updatecoeffsE.shape[1],
            NY_SRCINFO=4,
            NY_SRCWAVES=G.iterations,
            NX_FIELDS=G.Ex.shape[0],
            NY_FIELDS=G.Ex.shape[1],
            NZ_FIELDS=G.Ex.shape[2],
            NX_ID=G.ID.shape[1],
            NY_ID=G.ID.shape[2],
            NZ_ID=G.ID.shape[3]),
                                       options=compiler_opts)
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels
        updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        if G.hertziandipoles:
            srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(
                G.hertziandipoles, G)
            update_hertzian_dipole_gpu = kernels_sources.get_function(
                "update_hertzian_dipole")
        if G.magneticdipoles:
            srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(
                G.magneticdipoles, G)
            update_magnetic_dipole_gpu = kernels_sources.get_function(
                "update_magnetic_dipole")
        if G.voltagesources:
            srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(
                G.voltagesources, G)
            update_voltage_source_gpu = kernels_sources.get_function(
                "update_voltage_source")

    # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.snapshots:
        # Initialise arrays on GPU
        snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(
            G)
        # Prepare kernel and get kernel function
        kernel_store_snapshot = SourceModule(
            kernel_template_store_snapshot.substitute(REAL=cudafloattype,
                                                      NX_SNAPS=Snapshot.nx_max,
                                                      NY_SNAPS=Snapshot.ny_max,
                                                      NZ_SNAPS=Snapshot.nz_max,
                                                      NX_FIELDS=G.Ex.shape[0],
                                                      NY_FIELDS=G.Ex.shape[1],
                                                      NZ_FIELDS=G.Ex.shape[2]),
            options=compiler_opts)
        store_snapshot_gpu = kernel_store_snapshot.get_function(
            "store_snapshot")

    # Iteration loop timer
    iterstart = drv.Event()
    iterend = drv.Event()
    iterstart.record()

    for iteration in tqdm(range(G.iterations),
                          desc='Running simulation, model ' +
                          str(currentmodelrun) + '/' + str(modelend),
                          ncols=get_terminal_width() - 1,
                          file=sys.stdout,
                          disable=not G.progressbars):

        # Get GPU memory usage on final iteration
        if iteration == G.iterations - 1:
            memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0]

        # Store field component values for every receiver
        if G.rxs:
            store_outputs_gpu(np.int32(len(G.rxs)),
                              np.int32(iteration),
                              rxcoords_gpu.gpudata,
                              rxs_gpu.gpudata,
                              G.Ex_gpu.gpudata,
                              G.Ey_gpu.gpudata,
                              G.Ez_gpu.gpudata,
                              G.Hx_gpu.gpudata,
                              G.Hy_gpu.gpudata,
                              G.Hz_gpu.gpudata,
                              block=(1, 1, 1),
                              grid=(round32(len(G.rxs)), 1, 1))

        # Store any snapshots
        for i, snap in enumerate(G.snapshots):
            if snap.time == iteration + 1:
                store_snapshot_gpu(np.int32(i),
                                   np.int32(snap.xs),
                                   np.int32(snap.xf),
                                   np.int32(snap.ys),
                                   np.int32(snap.yf),
                                   np.int32(snap.zs),
                                   np.int32(snap.zf),
                                   np.int32(snap.dx),
                                   np.int32(snap.dy),
                                   np.int32(snap.dz),
                                   G.Ex_gpu.gpudata,
                                   G.Ey_gpu.gpudata,
                                   G.Ez_gpu.gpudata,
                                   G.Hx_gpu.gpudata,
                                   G.Hy_gpu.gpudata,
                                   G.Hz_gpu.gpudata,
                                   snapEx_gpu.gpudata,
                                   snapEy_gpu.gpudata,
                                   snapEz_gpu.gpudata,
                                   snapHx_gpu.gpudata,
                                   snapHy_gpu.gpudata,
                                   snapHz_gpu.gpudata,
                                   block=Snapshot.tpb,
                                   grid=Snapshot.bpg)
                if G.snapsgpu2cpu:
                    gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(),
                                           snapEz_gpu.get(), snapHx_gpu.get(),
                                           snapHy_gpu.get(), snapHz_gpu.get(),
                                           i, snap)

        # Update magnetic field components
        update_h_gpu(np.int32(G.nx),
                     np.int32(G.ny),
                     np.int32(G.nz),
                     G.ID_gpu.gpudata,
                     G.Hx_gpu.gpudata,
                     G.Hy_gpu.gpudata,
                     G.Hz_gpu.gpudata,
                     G.Ex_gpu.gpudata,
                     G.Ey_gpu.gpudata,
                     G.Ez_gpu.gpudata,
                     block=G.tpb,
                     grid=G.bpg)

        # Update magnetic field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_magnetic(G)

        # Update magnetic field components for magnetic dipole sources
        if G.magneticdipoles:
            update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)),
                                       np.int32(iteration),
                                       floattype(G.dx),
                                       floattype(G.dy),
                                       floattype(G.dz),
                                       srcinfo1_magnetic_gpu.gpudata,
                                       srcinfo2_magnetic_gpu.gpudata,
                                       srcwaves_magnetic_gpu.gpudata,
                                       G.ID_gpu.gpudata,
                                       G.Hx_gpu.gpudata,
                                       G.Hy_gpu.gpudata,
                                       G.Hz_gpu.gpudata,
                                       block=(1, 1, 1),
                                       grid=(round32(len(G.magneticdipoles)),
                                             1, 1))

        # Update electric field components
        # If all materials are non-dispersive do standard update
        if Material.maxpoles == 0:
            update_e_gpu(np.int32(G.nx),
                         np.int32(G.ny),
                         np.int32(G.nz),
                         G.ID_gpu.gpudata,
                         G.Ex_gpu.gpudata,
                         G.Ey_gpu.gpudata,
                         G.Ez_gpu.gpudata,
                         G.Hx_gpu.gpudata,
                         G.Hy_gpu.gpudata,
                         G.Hz_gpu.gpudata,
                         block=G.tpb,
                         grid=G.bpg)
        # If there are any dispersive materials do 1st part of dispersive update
        # (it is split into two parts as it requires present and updated electric field values).
        else:
            update_e_dispersive_A_gpu(np.int32(G.nx),
                                      np.int32(G.ny),
                                      np.int32(G.nz),
                                      np.int32(Material.maxpoles),
                                      G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata,
                                      G.Ty_gpu.gpudata,
                                      G.Tz_gpu.gpudata,
                                      G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata,
                                      G.Ey_gpu.gpudata,
                                      G.Ez_gpu.gpudata,
                                      G.Hx_gpu.gpudata,
                                      G.Hy_gpu.gpudata,
                                      G.Hz_gpu.gpudata,
                                      block=G.tpb,
                                      grid=G.bpg)

        # Update electric field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_electric(G)

        # Update electric field components for voltage sources
        if G.voltagesources:
            update_voltage_source_gpu(np.int32(len(G.voltagesources)),
                                      np.int32(iteration),
                                      floattype(G.dx),
                                      floattype(G.dy),
                                      floattype(G.dz),
                                      srcinfo1_voltage_gpu.gpudata,
                                      srcinfo2_voltage_gpu.gpudata,
                                      srcwaves_voltage_gpu.gpudata,
                                      G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata,
                                      G.Ey_gpu.gpudata,
                                      G.Ez_gpu.gpudata,
                                      block=(1, 1, 1),
                                      grid=(round32(len(G.voltagesources)), 1,
                                            1))

        # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last)
        if G.hertziandipoles:
            update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)),
                                       np.int32(iteration),
                                       floattype(G.dx),
                                       floattype(G.dy),
                                       floattype(G.dz),
                                       srcinfo1_hertzian_gpu.gpudata,
                                       srcinfo2_hertzian_gpu.gpudata,
                                       srcwaves_hertzian_gpu.gpudata,
                                       G.ID_gpu.gpudata,
                                       G.Ex_gpu.gpudata,
                                       G.Ey_gpu.gpudata,
                                       G.Ez_gpu.gpudata,
                                       block=(1, 1, 1),
                                       grid=(round32(len(G.hertziandipoles)),
                                             1, 1))

        # If there are any dispersive materials do 2nd part of dispersive update
        # (it is split into two parts as it requires present and updated electric
        # field values). Therefore it can only be completely updated after the
        # electric field has been updated by the PML and source updates.
        if Material.maxpoles > 0:
            update_e_dispersive_B_gpu(np.int32(G.nx),
                                      np.int32(G.ny),
                                      np.int32(G.nz),
                                      np.int32(Material.maxpoles),
                                      G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata,
                                      G.Ty_gpu.gpudata,
                                      G.Tz_gpu.gpudata,
                                      G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata,
                                      G.Ey_gpu.gpudata,
                                      G.Ez_gpu.gpudata,
                                      block=G.tpb,
                                      grid=G.bpg)

    # Copy output from receivers array back to correct receiver objects
    if G.rxs:
        gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G)

    # Copy data from any snapshots back to correct snapshot objects
    if G.snapshots and not G.snapsgpu2cpu:
        for i, snap in enumerate(G.snapshots):
            gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(),
                                   snapEz_gpu.get(), snapHx_gpu.get(),
                                   snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

    iterend.record()
    iterend.synchronize()
    tsolve = iterstart.time_till(iterend) * 1e-3

    # Remove context from top of stack and delete
    ctx.pop()
    del ctx

    return tsolve, memsolve
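
A minimal, self-contained sketch (not part of gprMax) of the Event-based timing
pattern used for tsolve above, assuming only PyCUDA and a trivial gpuarray
operation in place of the FDTD iteration loop:

import numpy as np
import pycuda.autoinit          # hypothetical standalone setup: creates a context
import pycuda.driver as drv
from pycuda import gpuarray

start, end = drv.Event(), drv.Event()
a = gpuarray.to_gpu(np.random.randn(1 << 20).astype(np.float32))

start.record()
b = (a * 2.0) + 1.0             # stand-in for the per-iteration GPU work
end.record()
end.synchronize()
print("elapsed: {:.3f} s".format(start.time_till(end) * 1e-3))  # time_till() returns ms
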
Code Example #20
File: vector_add.py Project: lkampoli/gpu-course
def vector_add_example(context):

    #create input data
    n = numpy.int32(5e7)
    a = numpy.random.randn(n).astype(numpy.float32)
    b = numpy.random.randn(n).astype(numpy.float32)
    c = numpy.zeros_like(b)

    #measure CPU time
    start = timer()
    d = a + b
    end = timer()
    print("a+b took", (end - start) * 1000.0, "ms")

    #move data to the GPU
    args = [c, a, b]
    gpu_args = []
    for arg in args:
        gpu_args.append(drv.mem_alloc(arg.nbytes))
        drv.memcpy_htod(gpu_args[-1], arg)
    gpu_args.append(n)

    #read kernel into string
    with open('vector_add.cu', 'r') as f:
        kernel_string = f.read()

    #get compute capability for compiling CUDA kernels
    devprops = {
        str(k): v
        for (k, v) in context.get_device().get_attributes().items()
    }
    cc = str(devprops['COMPUTE_CAPABILITY_MAJOR']) + str(
        devprops['COMPUTE_CAPABILITY_MINOR'])

    #compile the kernel
    vector_add = SourceModule(kernel_string,
                              arch='compute_' + cc,
                              code='sm_' + cc,
                              cache_dir=False,
                              no_extern_c=True,
                              options=['-Xcompiler=--std=c++11'
                                       ]).get_function("vec_add_kernel")

    #launch the kernel
    threads = (1024, 1, 1)
    grid = (int(numpy.ceil(n / float(threads[0]))), 1, 1)

    context.synchronize()
    start = drv.Event()
    end = drv.Event()
    start.record()
    vector_add(*gpu_args, block=threads, grid=grid, stream=None, shared=0)
    end.record()
    context.synchronize()

    print("vec_add_kernel took", end.time_since(start), "ms.")

    #copy output data back from GPU
    drv.memcpy_dtoh(c, gpu_args[0])

    #compare output with reference
    correct = numpy.allclose(c, a + b, atol=1e-6)
    if not correct:
        print("TEST FAILED!")
        print(c)
        print(a + b)
    else:
        print("TEST PASSED!")
Code Example #21
    def _thread_enqueue_solve_batch(self, cpu_slice_map, gpu_slice_map,
                                    **kwargs):
        """
        Enqueue CUDA memory transfer and kernel execution operations on a CUDA stream.

        CPU and GPU arrays are sliced given the dimension:slice mapping specified in
        cpu_slice_map and gpu_slice_map.

        Returns a (event, X2, model_vis, pool_refs, pool_lock, cpu_slice_map,
        gpu_slice_map) tuple, where event is a CUDA event recorded at the end of
        this sequence of operations and X2 is a pinned memory array that will
        hold the result of the chi-squared operation.
        """

        tl = self.thread_local
        i, subslvr = tl.subslvr_gen.next()

        # A dictionary of references to memory pool allocated objects,
        # ensuring that said objects remain allocated until
        # after compute has been performed. Returned from
        # this function, this object should be discarded when
        # reading the result of the enqueued operations.
        pool_refs = defaultdict(list)

        # Cache keyed by array names and contained indices
        # This is used to avoid unnecessary CPU to GPU copies
        # by caching the last index of the CPU array
        dirty = tl.dirty[i]

        # Guard pool allocations with a coarse-grained mutex
        with subslvr.pool_lock:
            # Now, iterate over our source chunks, enqueuing
            # memory transfers and CUDA kernels
            for src_cpu_slice_map, src_gpu_slice_map in self._gen_source_slices(
            ):
                # Update our maps with source slice information
                cpu_slice_map.update(src_cpu_slice_map)
                gpu_slice_map.update(src_gpu_slice_map)

                # Configure dimension extents and global size on the sub-solver
                for name, slice_ in cpu_slice_map.iteritems():
                    subslvr.update_dimension(
                        name=name,
                        global_size=self.dimension(name).global_size,
                        lower_extent=slice_.start,
                        upper_extent=slice_.stop)

                # Enqueue E Beam
                kernel = subslvr.rime_e_beam
                new_refs = self._enqueue_array(
                    subslvr,
                    cpu_slice_map,
                    gpu_slice_map,
                    direction=ASYNC_HTOD,
                    dirty=dirty,
                    classifiers=[Classifier.E_BEAM_INPUT])
                cdata_ref = self._enqueue_const_data_htod(
                    subslvr, kernel.rime_const_data[0])
                _update_refs(pool_refs, new_refs)
                _update_refs(pool_refs, {'cdata_ebeam': [cdata_ref]})
                kernel.execute(subslvr, subslvr.stream)

                # Enqueue B Sqrt
                kernel = subslvr.rime_b_sqrt
                new_refs = self._enqueue_array(
                    subslvr,
                    cpu_slice_map,
                    gpu_slice_map,
                    direction=ASYNC_HTOD,
                    dirty=dirty,
                    classifiers=[Classifier.B_SQRT_INPUT])
                cdata_ref = self._enqueue_const_data_htod(
                    subslvr, kernel.rime_const_data[0])
                _update_refs(pool_refs, new_refs)
                _update_refs(pool_refs, {'cdata_bsqrt': [cdata_ref]})
                kernel.execute(subslvr, subslvr.stream)

                # Enqueue EKB Sqrt
                kernel = subslvr.rime_ekb_sqrt
                new_refs = self._enqueue_array(
                    subslvr,
                    cpu_slice_map,
                    gpu_slice_map,
                    direction=ASYNC_HTOD,
                    dirty=dirty,
                    classifiers=[Classifier.EKB_SQRT_INPUT])
                cdata_ref = self._enqueue_const_data_htod(
                    subslvr, kernel.rime_const_data[0])
                _update_refs(pool_refs, new_refs)
                _update_refs(pool_refs, {'cdata_ekb': [cdata_ref]})
                kernel.execute(subslvr, subslvr.stream)

                # Enqueue Sum Coherencies
                kernel = subslvr.rime_sum
                new_refs = self._enqueue_array(
                    subslvr,
                    cpu_slice_map,
                    gpu_slice_map,
                    direction=ASYNC_HTOD,
                    dirty=dirty,
                    classifiers=[Classifier.COHERENCIES_INPUT])
                cdata_ref = self._enqueue_const_data_htod(
                    subslvr, kernel.rime_const_data[0])
                _update_refs(pool_refs, new_refs)
                _update_refs(pool_refs, {'cdata_coherencies': [cdata_ref]})
                kernel.execute(subslvr, subslvr.stream)

            # Enqueue chi-squared term reduction and return the
            # GPU array allocated to it
            X2_gpu_ary = subslvr.rime_reduce.execute(subslvr, subslvr.stream)

            # Get pinned memory to hold the chi-squared result
            sub_X2 = subslvr.pinned_mem_pool.allocate(shape=X2_gpu_ary.shape,
                                                      dtype=X2_gpu_ary.dtype)

            # Enqueue chi-squared copy off the GPU onto the CPU
            X2_gpu_ary.get_async(ary=sub_X2, stream=subslvr.stream)

            # Enqueue transfer of simulator output (model visibilities) to the CPU
            sim_output_refs = self._enqueue_array(
                subslvr,
                cpu_slice_map,
                gpu_slice_map,
                direction=ASYNC_DTOH,
                dirty={},
                classifiers=[Classifier.SIMULATOR_OUTPUT])

        # Should only be model visibilities
        assert len(sim_output_refs) == 1, (
            'Expected one array (model visibilities), '
            'received {l} instead.'.format(l=len(sim_output_refs)))

        model_vis = sim_output_refs['model_vis'][0]

        # Create and record an event directly after the chi-squared copy
        # We'll synchronise on this thread in our synchronisation executor
        sync_event = cuda.Event(cuda.event_flags.DISABLE_TIMING
                                | cuda.event_flags.BLOCKING_SYNC)
        sync_event.record(subslvr.stream)

        # Retain references to CPU pinned and GPU device memory
        # until the above enqueued operations have been performed.
        pool_refs['X2_gpu'].append(X2_gpu_ary)
        pool_refs['X2_cpu'].append(sub_X2)
        pool_refs['model_vis_output'].append(model_vis)

        return (sync_event, sub_X2, model_vis, pool_refs, subslvr.pool_lock,
                cpu_slice_map.copy(), gpu_slice_map.copy())
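
A stripped-down, hypothetical illustration (PyCUDA only, none of the solver
classes above) of the stream-plus-event handshake this method relies on:
asynchronous copies are enqueued on a stream, a non-timing blocking-sync event
is recorded after them, and another thread can later wait on that event.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray

stream = cuda.Stream()
host_out = cuda.pagelocked_empty((1024,), dtype=np.float32)      # pinned memory
dev = gpuarray.to_gpu_async(np.ones(1024, dtype=np.float32), stream=stream)
dev.get_async(stream=stream, ary=host_out)                       # async D2H copy

sync_event = cuda.Event(cuda.event_flags.DISABLE_TIMING |
                        cuda.event_flags.BLOCKING_SYNC)
sync_event.record(stream)

# ... later, possibly in a different thread ...
sync_event.synchronize()          # blocks until the enqueued copies have finished
assert np.allclose(host_out, 1.0)
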
Code Example #22
    def radiation_loop(self, quant, write, rt_plot, Vmod):
        """ loops over the relevant kernels iteratively until the equilibrium TP - profile reached """

        condition = True
        quant.iter_value = np.int32(0)
        quant.p_lay = quant.dev_p_lay.get()

        # measures the runtime of a specified number of iterations
        start_loop = cuda.Event()
        end_loop = cuda.Event()
        start_total = cuda.Event()
        end_total = cuda.Event()

        # uncomment for time testing purposes
        # start_test = cuda.Event()
        # end_test = cuda.Event()

        start_total.record()

        while condition:

            if quant.iter_value % 10 == 0:
                start_loop.record()

            self.interpolate_temperatures(quant)
            self.interpolate_planck(quant)

            if quant.iter_value % 10 == 0:

                if Vmod.V_iter_nr == 0:
                    self.interpolate_opacities_and_scattering_cross_sections(quant)
                    self.interpolate_meanmolmass(quant)
                elif Vmod.V_iter_nr > 0:
                    Vmod.interpolate_molecular_and_mixed_opac(quant)
                    Vmod.combine_to_mixed_opacities(quant)
                self.interpolate_kappa(quant)
                self.calculate_c_p(quant)
                self.normalize_cloud_scattering(quant)
                self.calculate_transmission(quant)

                self.calculate_delta_z(quant)
                quant.delta_z_lay = quant.dev_delta_z_lay.get()
                hsfunc.calculate_height_z(quant)
                quant.dev_z_lay = gpuarray.to_gpu(quant.z_lay)
                self.calculate_direct_beamflux(quant)
            self.populate_spectral_flux(quant)
            self.integrate_flux(quant)

            # uncomment for time testing purposes
            # start_test.record()
            # end_test.record()
            # end_test.synchronize()
            # time_test = start_test.time_till(end_test)
            # print("\nTime for test [s]: {:g}".format(time_test * 1e-3))

            if quant.singlewalk == 0:

                abortsum = 0
                quant.marked_red = np.zeros(quant.nlayer)

                if quant.iter_value % 100 == 0:
                    print("\nWe are running \"" + quant.name + "\" at iteration step nr. : "+str(quant.iter_value))

                if quant.iter_value >= quant.foreplay:

                    # radiative temperature progression
                    self.rad_temp_iteration(quant)

                    quant.abort = quant.dev_abort.get()

                    for i in range(quant.nlayer):
                        abortsum += quant.abort[i]
                        if quant.abort[i] == 0:
                            quant.marked_red[i] = 1

                    if quant.iter_value % 10 == 0:
                        print("Layers converged: "+str(abortsum)+" out of "+str(quant.nlayer)+".")

                # checks whether to continue the loop
                condition = abortsum < quant.nlayer
                quant.iter_value += 1
                quant.iter_value = np.int32(quant.iter_value)

                if quant.iter_value % quant.n_plot == 0 or not condition:
                    write.write_restart_file(quant)
                    if quant.realtime_plot == 1:
                        rt_plot.plot_tp(quant)

                # records the time needed for 10 loops
                if (quant.iter_value-1) % 10 == 9:
                    end_loop.record()
                    end_loop.synchronize()
                    time_loop = start_loop.time_till(end_loop)
                    print("\nTime for the last 10 loops [s]: {:.2f}".format(time_loop * 1e-3))

                # time restriction for the run. It aborts automatically after the following time steps and prevents a hung job.
                if quant.iter_value > 2e4:
                    write.write_abort_file(quant)

                    print("\nRun exceeds allowed maximum allowed number of iteration steps. Aborting...")

                    raise SystemExit()

            elif quant.singlewalk == 1:
                condition = False

        end_total.record()
        end_total.synchronize()
        time_total = start_total.time_till(end_total)
        print("\nTime for radiative iteration [s]: {:.2f}".format(time_total * 1e-3))
        print("Total number of iterative steps: "+str(quant.iter_value))
Code Example #23
File: bilinear.py Project: Amabitur/CUDA
M1, N1, _ = image.shape
M2 = int(2 * M1)
N2 = int(2 * N1)

result = np.zeros((M2, N2), dtype=np.uint32)
block = (16, 16, 1)
grid = (int(np.ceil(M2 / block[0])), int(np.ceil(N2 / block[1])))

# assemble the kernel
mod = compiler.SourceModule(open("kernel.cu", "r").read())
bilinear_interpolation_kernel = mod.get_function("interpolate")

x_out = np.array([i for i in range(M2)] * N2)
y_out = np.array([i for i in range(N2)] * M2)

start = driver.Event()
stop = driver.Event()

# prepare the texture
print("Computing on the GPU...")
start.record()

prep_image = prepare_image(image)
tex = mod.get_texref("tex")
tex.set_filter_mode(driver.filter_mode.LINEAR)
tex.set_address_mode(0, driver.address_mode.CLAMP)
tex.set_address_mode(1, driver.address_mode.CLAMP)
driver.matrix_to_texref(prep_image, tex, order="C")

bilinear_interpolation_kernel(driver.Out(result),
                              driver.In(x_out),
Code Example #24
    def convection_loop(self, quant, write, rt_plot):
        """ loops interchangeably through the radiative and convection schemes """

        # kappa is required for the conv. instability check
        self.interpolate_kappa(quant)
        quant.T_lay = quant.dev_T_lay.get()
        quant.p_lay = quant.dev_p_lay.get()
        quant.p_int = quant.dev_p_int.get()
        quant.kappa_lay = quant.dev_kappa_lay.get()
        if quant.iso == 0:
            quant.kappa_int = quant.dev_kappa_int.get()
            hsfunc.conv_check(quant)
            hsfunc.mark_convective_layers(quant)

        # only starts the loop if convective adjustment is switched on
        if quant.singlewalk == 0 and quant.convection == 1:

            condition = sum(quant.conv_unstable) > 0

            start_total = cuda.Event()
            end_total = cuda.Event()
            start_total.record()

            quant.iter_value = np.int32(0)

            if condition:
                # measures time
                start_loop = cuda.Event()
                end_loop = cuda.Event()
                print("\nConvectively unstable layers found. Starting convective adjustment")
            else:
                print("All layers convectively stable. No convective adjustment necessary.\n")

            # quantities required on the host for the first convective adjustment
            quant.F_net = quant.dev_F_net.get()
            quant.F_up_tot = quant.dev_F_up_tot.get()

            while condition:

                if quant.iter_value % 10 == 0:
                    start_loop.record()

                if quant.iter_value % 100 == 0:
                    print("\nWe are running \"" + quant.name + "\" at iteration step nr. : "+str(quant.iter_value))

                # start with the convective adjustment and then recalculate the rad. fluxes, go back to conv. adjustment, then rad. fluxes, etc.
                self.interpolate_temperatures(quant)
                self.interpolate_meanmolmass(quant)
                self.interpolate_kappa(quant)
                self.calculate_c_p(quant)
                quant.kappa_lay = quant.dev_kappa_lay.get()
                quant.kappa_int = quant.dev_kappa_int.get()
                quant.c_p_lay = quant.dev_c_p_lay.get()
                quant.T_lay = quant.dev_T_lay.get()
                hsfunc.convective_adjustment(quant)
                quant.dev_T_lay = gpuarray.to_gpu(quant.T_lay)

                self.interpolate_temperatures(quant)
                self.interpolate_opacities_and_scattering_cross_sections(quant)
                self.interpolate_meanmolmass(quant)
                self.interpolate_kappa(quant)
                self.calculate_c_p(quant)
                self.normalize_cloud_scattering(quant)
                self.interpolate_planck(quant)
                self.calculate_transmission(quant)
                if quant.iter_value % 10 == 0:
                    self.calculate_delta_z(quant)
                    quant.delta_z_lay = quant.dev_delta_z_lay.get()
                    hsfunc.calculate_height_z(quant)
                    quant.dev_z_lay = gpuarray.to_gpu(quant.z_lay)
                    self.calculate_direct_beamflux(quant)
                self.populate_spectral_flux(quant)
                self.integrate_flux(quant)

                # copy back fluxes to determine convergence
                quant.F_net = quant.dev_F_net.get()
                quant.F_down_tot = quant.dev_F_down_tot.get()
                quant.F_up_tot = quant.dev_F_up_tot.get()
                quant.F_net_diff = quant.dev_F_net_diff.get()
                quant.kappa_lay = quant.dev_kappa_lay.get()
                quant.kappa_int = quant.dev_kappa_int.get()

                # copy back temperatures and mark the layers where convection is
                # dominant (used by the realtime net flux plotting)
                quant.T_lay = quant.dev_T_lay.get()
                hsfunc.mark_convective_layers(quant)
                # hsfunc.radiative_adjustment(quant)
                # if quant.iter_value % 50 == 0:
                #     print("kick")
                #     hsfunc.random_adjustment(quant)
                #     quant.dev_T_lay = gpuarray.to_gpu(quant.T_lay)

                # checks whether to continue the loop
                condition = not(hsfunc.check_for_global_local_equilibrium(quant)) or quant.iter_value < 100

                # the following only if loop continues
                if condition:

                    # realtime plotting every 10th step
                    if quant.iter_value % quant.n_plot == 0 or not condition:
                        write.write_restart_file(quant)
                        if quant.realtime_plot == 1:
                            rt_plot.plot_convective_feedback(quant)

                    # kernel that advances the temperature in a radiative way
                    self.conv_temp_iteration(quant)

                    # records the time needed for 10 loops
                    if quant.iter_value % 10 == 9:
                        end_loop.record()
                        end_loop.synchronize()
                        time_loop = start_loop.time_till(end_loop)
                        print("\nTime for the last 10 (total: {:d}) loops [s]: {:.2f}".format(quant.iter_value, time_loop * 1e-3))

                    quant.iter_value += 1
                    quant.iter_value = np.int32(quant.iter_value)

                # time restriction for the run. It aborts automatically after the following timesteps and thus prevents a hung job.
                if quant.iter_value > 2e4:

                    write.write_abort_file(quant)

                    print("\nRun exceeds allowed maximum allowed number of iteration steps. Aborting...")

                    raise SystemExit()

            end_total.record()
            end_total.synchronize()
            time_total = start_total.time_till(end_total)

            print("\nTime for rad.-conv. iteration [s]: {:.2f}".format(time_total * 1e-3))
            print("Total number of iterative steps: " + str(quant.iter_value))
Code Example #25
    def __init__(self,
                 device=0,
                 iterations=7,
                 compiler_options=None,
                 observers=None):
        """instantiate CudaFunctions object used for interacting with the CUDA device

        Instantiating this object will inspect and store certain device properties at
        runtime, which are used during compilation and/or execution of kernels by the
        kernel tuner. It also maintains a reference to the most recently compiled
        source module for copying data to constant memory before kernel launch.

        :param device: Number of CUDA device to use for this context
        :type device: int

        :param iterations: Number of iterations used while benchmarking a kernel, 7 by default.
        :type iterations: int

        :param compiler_options: Options passed to the compiler when building kernels, empty by default.
        :type compiler_options: list of strings

        :param observers: Observer instances that are registered with this device and notified during benchmarking, empty by default.
        :type observers: list
        """
        self.allocations = []
        self.texrefs = []
        if not pycuda_available and isinstance(
                drv, PyCudaPlaceHolder):  #and part to allow mocking
            raise ImportError(
                "Error: pycuda not installed, please install e.g. using 'pip install pycuda'."
            )

        drv.init()
        self.context = drv.Device(device).make_context()

        #inspect device properties
        devprops = {
            str(k): v
            for (k, v) in self.context.get_device().get_attributes().items()
        }
        self.max_threads = devprops['MAX_THREADS_PER_BLOCK']
        cc = str(devprops.get('COMPUTE_CAPABILITY_MAJOR', '0')) + str(
            devprops.get('COMPUTE_CAPABILITY_MINOR', '0'))
        if cc == "00":
            cc = self.context.get_device().compute_capability()
        self.cc = str(cc[0]) + str(cc[1])
        self.iterations = iterations
        self.current_module = None
        self.func = None
        self.compiler_options = compiler_options or []

        #select PyCUDA source module
        if int(self.cc) >= 35:
            self.source_mod = DynamicSourceModule
        else:
            self.source_mod = SourceModule
        if not self.source_mod:
            raise ImportError(
                "Error: pycuda not correctly installed, please ensure pycuda is installed on the same CUDA installation as you're using right now"
            )

        #create a stream and events
        self.stream = drv.Stream()
        self.start = drv.Event()
        self.end = drv.Event()

        #default dynamically allocated shared memory size, can be overwritten using smem_args
        self.smem_size = 0

        #setup observers
        self.observers = observers or []
        self.observers.append(CudaRuntimeObserver(self))
        for obs in self.observers:
            obs.register_device(self)

        #collect environment information
        env = dict()
        env["device_name"] = self.context.get_device().name()
        env["cuda_version"] = ".".join([str(i) for i in drv.get_version()])
        env["compute_capability"] = self.cc
        env["iterations"] = self.iterations
        env["compiler_options"] = compiler_options
        env["device_properties"] = devprops
        self.env = env
        self.name = env["device_name"]
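
A hedged usage sketch relying only on the constructor and the attributes it sets
above; the rest of the class's interface (compilation, benchmarking, cleanup) is
not part of this excerpt and may differ:

dev_iface = CudaFunctions(device=0, iterations=7)
print(dev_iface.name)                        # device name collected into env
print(dev_iface.max_threads, dev_iface.cc)   # max threads per block, compute capability
dev_iface.context.pop()                      # one way to release the context made in __init__
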
Code Example #26
File: P5B.py Project: adegirmenci/Harvard-CS205
    # Get image data
    height, width = np.int32(original_image.shape)
    print "Processing %d x %d image" % (width, height)
    size = width * height

    # Initialize the image region as empty
    im_region = np.int8(np.zeros([height, width]))

    # On the host, define the kernel parameters
    blocksize = (128, 4, 1)  #128,8 The number of threads per block (x,y,z)
    gridx = int(np.ceil(width / (1.0 * blocksize[0])))
    gridy = int(np.ceil(height / (1.0 * blocksize[1])))
    gridsize = (gridx, gridy)  # The number of thread blocks (x,y)

    # Initialize the GPU event trackers for timing
    start_gpu_time = cu.Event()
    end_gpu_time = cu.Event()
    gpu_transfer_time = 0.0
    gpu_comp_time = 0.0
    cpu_compute_time = 0.0

    # Allocate memory
    image = np.float32(np.array(original_image))
    region = np.int8(np.zeros([height, width]))
    queue = np.int32(np.linspace(0, size - 1, size))
    nextFront = np.int32(np.zeros([height, width]))
    scan = np.int32(np.zeros([height, width]))
    qLen = np.int32(queue.size)
    # Allocate device memory and copy host to device
    start_gpu_time.record()
    d_image = gpu.to_gpu(image.reshape(-1))
Code Example #27
def main():
    parser = argparse.ArgumentParser(description='BERT Inference Benchmark')
    parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine')
    parser.add_argument(
        '-b',
        '--batch-size',
        default=[],
        action="append",
        help=
        'Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.',
        type=int)
    parser.add_argument('-s',
                        '--sequence-length',
                        default=128,
                        help='Sequence length of the BERT model',
                        type=int)
    parser.add_argument(
        '-i',
        '--iterations',
        default=200,
        help='Number of iterations to run when benchmarking each batch size.',
        type=int)
    parser.add_argument(
        '-w',
        '--warm-up-runs',
        default=10,
        help='Number of iterations to run prior to benchmarking.',
        type=int)
    parser.add_argument('-r',
                        '--random-seed',
                        required=False,
                        default=12345,
                        help='Random seed.',
                        type=int)
    args, _ = parser.parse_known_args()
    args.batch_size = args.batch_size or [1]

    # Import necessary plugins for BERT TensorRT
    ctypes.CDLL("libnvinfer_plugin.so", mode=ctypes.RTLD_GLOBAL)

    with open(args.engine, 'rb') as f, trt.Runtime(
            TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read(
            )) as engine, engine.create_execution_context() as context:
        # Allocate buffers large enough to store the largest batch size
        max_input_shape = (args.sequence_length * max(args.batch_size), )
        max_output_shape = (args.sequence_length * max(args.batch_size), 2, 1,
                            1)
        buffers = [
            DeviceBuffer(max_input_shape),
            DeviceBuffer(max_input_shape),
            DeviceBuffer((max(args.batch_size) + 1, )),
            DeviceBuffer((args.sequence_length, )),
            DeviceBuffer(max_output_shape)
        ]

        # Prepare random input
        pseudo_vocab_size = 30522
        pseudo_type_vocab_size = 2
        np.random.seed(args.random_seed)
        test_word_ids = np.random.randint(
            0,
            pseudo_vocab_size, (args.sequence_length * max(args.batch_size)),
            dtype=np.int32)
        test_segment_ids = np.random.randint(
            0,
            pseudo_type_vocab_size,
            (args.sequence_length * max(args.batch_size)),
            dtype=np.int32)
        test_cu_seq_lens = np.arange(
            0,
            args.sequence_length * max(args.batch_size) + 1,
            args.sequence_length,
            dtype=np.int32)

        # Copy input h2d
        cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
        cuda.memcpy_htod(buffers[1].buf, test_segment_ids.ravel())
        cuda.memcpy_htod(buffers[2].buf, test_cu_seq_lens.ravel())

        bench_times = {}

        for idx, batch_size in enumerate(sorted(args.batch_size)):
            context.active_optimization_profile = 0

            # Each profile has unique bindings
            bindings = [buf.binding() for buf in buffers]

            shapes = {
                "input_ids": (args.sequence_length * batch_size, ),
                "segment_ids": (args.sequence_length * batch_size, ),
                "cu_seqlens": (batch_size + 1, ),
                "max_seqlen": (args.sequence_length, ),
            }

            for binding, shape in shapes.items():
                context.set_binding_shape(engine[binding], shape)
            assert context.all_binding_shapes_specified

            # Inference
            total_time = 0
            start = cuda.Event()
            end = cuda.Event()
            stream = cuda.Stream()

            # Warmup
            for _ in range(args.warm_up_runs):
                context.execute_async_v2(bindings=bindings,
                                         stream_handle=stream.handle)
                stream.synchronize()

            # Timing loop
            times = []
            for _ in range(args.iterations):
                start.record(stream)
                context.execute_async_v2(bindings=bindings,
                                         stream_handle=stream.handle)
                end.record(stream)
                stream.synchronize()
                times.append(end.time_since(start))

            # Record the raw times; the average, 95th and 99th percentile times
            # are computed after the loop.
            bench_times[batch_size] = times

        [b.free() for b in buffers]

        for batch_size, times in bench_times.items():
            total_time = sum(times)
            avg_time = total_time / float(len(times))
            times.sort()
            percentile95 = times[int(len(times) * 0.95)]
            percentile99 = times[int(len(times) * 0.99)]
            print(
                "Running {:} iterations with Batch Size: {:}\n\tTotal Time: {:} ms \tAverage Time: {:} ms\t95th Percentile Time: {:} ms\t99th Percentile Time: {:} ms"
                .format(args.iterations, batch_size, total_time, avg_time,
                        percentile95, percentile99))
Code Example #28
elif rank == 2: mpi_direction = 'f'
else: mpi_direction = 'fb'

if rank == 0:
	# prepare for plot
	from matplotlib.pyplot import *
	ion()
	imsh = imshow(np.ones((3*nx,ny),'f').T, cmap=cm.hot, origin='lower', vmin=0, vmax=0.005)
	colorbar()

	# measure kernel execution time
	from datetime import datetime
	t1 = datetime.now()
	flop = 3*(nx*ny*nz*30)*tgap
	flops = np.zeros(tmax/tgap+1)
	start, stop = cuda.Event(), cuda.Event()
	start.record()

# main loop
for tn in xrange(1, tmax+1):
	fdtd.update_h()
	fdtd.mpi_exchange_boundary_h(mpi_direction, comm)

	fdtd.update_e()
	fdtd.mpi_exchange_boundary_e(mpi_direction, comm)

	if rank == 1: fdtd.update_src(tn)

	if tn%tgap == 0 and rank == 0:
		stop.record()
		stop.synchronize()
Code Example #29
	update_h.prepare("PPPPPP", block=(tpb,1,1))
	update_e.prepare("PPPPPPPPP", block=(tpb,1,1))
	update_src.prepare("fP", block=(nz,1,1))

	'''
	# prepare for plot
	from matplotlib.pyplot import *
	ion()
	imsh = imshow(np.ones((nx,ny),'f').T, cmap=cm.hot, origin='lower', vmin=0, vmax=0.005)
	colorbar()
	'''

	# measure kernel execution time
	#from datetime import datetime
	#t1 = datetime.now()
	start = cuda.Event()
	stop = cuda.Event()
	start.record()

	# main loop
	for tn in xrange(1, 1000+1):
		update_h.prepared_call(
				(bpg,1), ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu)

		update_e.prepared_call(
				(bpg/2,1), ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu, 
				cex_gpu, cey_gpu, cez_gpu)

		update_src.prepared_call((1,1), np.float32(tn), ez_gpu)

		'''
Code Example #30
import pycuda.driver as drv
from pycuda import gpuarray
import libcudnn, ctypes
import numpy as np

# Create a cuDNN context
cudnn_context = libcudnn.cudnnCreate()

# Set some options and tensor dimensions
tensor_format = libcudnn.cudnnTensorFormat['CUDNN_TENSOR_NCHW']
data_type = libcudnn.cudnnDataType['CUDNN_DATA_FLOAT']
convolution_mode = libcudnn.cudnnConvolutionMode['CUDNN_CROSS_CORRELATION']
convolution_fwd_pref = libcudnn.cudnnConvolutionFwdPreference[
    'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST']

start, end = (drv.Event(), drv.Event())


def start_bench():
    start.record()


def end_bench(op):
    end.record()
    end.synchronize()
    msecs = end.time_since(start)
    print("%7.3f msecs" % (msecs))


n_input = 64
filters_in = 128