Exemple #1
0
def verify_single_dump(input_path, max_block_threads):
    print(input_path)
    kernel_name = path.basename(input_path).split("_", 1)[1]
    with open(path.join(input_path, "launch.txt"), "r") as launch_f:
        launch_lines = list(map(int, launch_f.readlines()))
    block = tuple(launch_lines[3:6])
    launch_block_size = block[0] * block[1] * block[2]
    if launch_block_size > max_block_threads:
        print(
            f"    Skipping, launch block size ({launch_block_size}) bigger than maximum block size ({max_block_threads})"
        )
        return
    module = drv.module_from_file(path.join(input_path, "module.ptx"))
    kernel = module.get_function(kernel_name)
    pre_args = append_debug_buffer(parse_arguments(input_path, "pre"))
    kernel_pre_args, host_pre_args = zip(*pre_args)
    kernel(*list(kernel_pre_args),
           grid=tuple(launch_lines[:3]),
           block=block,
           shared=launch_lines[6])
    post_args = parse_arguments(input_path, "post")
    _, host_post_args_args = zip(*post_args)
    for idx, (pre_arg,
              post_arg) in enumerate(zip(host_pre_args, host_post_args_args)):
        if pre_arg is None:
            continue
        try:
            assert_array_equal_override(kernel_name, idx, pre_arg, post_arg)
        except Exception as e:
            print(f"{idx}: {e}")
Exemple #2
0
    def __init__(self, _gpu_num=0):
        if GPUDev.__instance != None:
            raise Exception("The GPUDev class is a singleton!")
        else:
            GPUDev.__instance = self
        from pycuda import driver as drv
        drv.init()
        self.id = _gpu_num
        self.dev = drv.Device(self.id)
        self.ctx = self.dev.make_context()
        this_dir = os.path.dirname(os.path.realpath(__file__)) + "/"

        if precision.num == 1:
            self.mod = drv.module_from_file(os.path.join(
                this_dir, '../gpu/cuda_kernels/kernels_single.cubin'))
        else:
            self.mod = drv.module_from_file(os.path.join(
                this_dir, '../gpu/cuda_kernels/kernels_double.cubin'))
Exemple #3
0
	def gfx_init( self ) :
		try :
			print 'compiling'
			self.prog = sh.compile_program_vfg( 'shad/balls' )

			print 'compiled'

			self.loc_mmv = sh.get_loc(self.prog,'modelview' )
			self.loc_mp  = sh.get_loc(self.prog,'projection')
			self.l_color = sh.get_loc(self.prog,'color'     )
			self.l_size  = sh.get_loc(self.prog,'ballsize'  )

		except ValueError as ve :
			print "Shader compilation failed: " + str(ve)
			sys.exit(0)    

#        glUseProgram( self.prog )
#        glUniform1i( pointsid , 0 );
#        glUseProgram( 0 )

		#
		# cuda init
		#
		self.grid = (int(self.BOX),int(self.BOX))
		self.block = (1,1,int(self.BOX))

		print 'CUDA: block %s , grid %s' % (str(self.block),str(self.grid))
#        print cuda_driver.device_attribute.MAX_THREADS_PER_BLOCK
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_X
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_Y
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_Z

		floatbytes = np.dtype(np.float32).itemsize

		self.gpos = glGenBuffers(1)
		glBindBuffer( GL_ARRAY_BUFFER , self.gpos )
		glBufferData( GL_ARRAY_BUFFER , self.pos.nbytes, self.pos, GL_STREAM_DRAW )
		glBindBuffer( GL_ARRAY_BUFFER , 0 )

		self.df1 = cuda_driver.mem_alloc( self.f.nbytes )
		self.df2 = cuda_driver.mem_alloc( self.f.nbytes )

		cuda_driver.memcpy_htod( self.df1 , self.f )
		cuda_driver.memset_d32( self.df2 , 0 , self.NUM*self.Q )

		mod = cuda_driver.module_from_file( 'lbm_kernel.cubin' )

		self.collision = mod.get_function("collision_step")
		self.collision.prepare( "Piii" )

		self.streaming = mod.get_function("streaming_step")
		self.streaming.prepare( "PPiii" )

		self.colors = mod.get_function("colors")
		self.colors.prepare( "PPiii" )
Exemple #4
0
	def cuda_init( self ) :
		mod = cuda_driver.module_from_file( 'solid_kernel.cubin' )

		self.cerr = cuda_driver.mem_alloc( 4 )

		self.cut = mod.get_function("cut_x")
		self.cut.prepare( "PPPifiiiiiffP" )

		self.fill_v = mod.get_function("fill_v")
		self.fill_v.prepare( "Piifff")

		self.fill_n = mod.get_function("fill_n")
		self.fill_n.prepare( "Piifff")
Exemple #5
0
def load_module_new(module_name, module_file, nvcc_options, nvcc_include_dirs,
                    cubin_cache_enable):
    cu_hexhash = hashlib.md5(bytearray(module_file, 'utf-8')).hexdigest()
    cu_hexhash_from_file = ''

    if not (os.path.exists("cubin_cache/" + str(module_name) + ".txt")):
        cache_file = open("cubin_cache/" + str(module_name) + ".txt", 'w+')
        cache_file.write(cu_hexhash)
        cache_file.close()
    else:
        cache_file = open("cubin_cache/" + str(module_name) + ".txt", 'r')
        cu_hexhash_from_file = cache_file.read()
        cache_file.close()

    if (cu_hexhash_from_file == cu_hexhash) & (
            os.path.isfile("cubin/" + str(cu_hexhash_from_file) +
                           "_cubin.cubin")) & cubin_cache_enable:
        print("Load cached %s kernel !" % str(module_name))
        return drv.module_from_file("cubin/" + str(cu_hexhash) +
                                    "_cubin.cubin")
    else:
        if (os.path.isfile("cubin/" + str(cu_hexhash_from_file) +
                           "_cubin.cubin")):
            os.remove("cubin/" + str(cu_hexhash_from_file) + "_cubin.cubin")

    cache_file = open("cubin_cache/" + str(module_name) + ".txt", 'w')
    cache_file.write(cu_hexhash)
    cache_file.close()

    print("Caching %s kernel !" % str(module_name))

    cubin = pycuda.compiler.compile(module_file,
                                    options=nvcc_options,
                                    include_dirs=nvcc_include_dirs,
                                    cache_dir=None)
    save_cubin(cubin, "cubin/" + str(cu_hexhash) + "_cubin.cubin")

    return drv.module_from_file("cubin/" + str(cu_hexhash) + "_cubin.cubin")
def load_cuda():
    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            print(f"{ce}")

    return cuda.module_from_file(str(path))
Exemple #7
0
def get_kernel(kernel_name):

    kernel_spec = kernels[kernel_name]
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        if p[0:4] == "int ":
            sig += "I"
        elif p[0:6] == "float ":
            sig += "f"
        else:
            sig += "P"

    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func   = module.get_function(kernel_name)
    func.prepare(sig)
    # print("Loaded: " + kernel)
    return func
Exemple #8
0
def get_kernel(kernel_name):

    kernel_spec = kernels[kernel_name]
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        if p[0:4] == "int ":
            sig += "I"
        elif p[0:6] == "float ":
            sig += "f"
        else:
            sig += "P"

    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func   = module.get_function(kernel_name)
    func.prepare(sig)
    # print("Loaded: " + kernel)
    return func
def init_cubin():
    from pycuda import driver
    from pycuda import gpuarray
    from pycuda import autoinit

    global context,kComputeMatMult
    device=driver.Device(gpu_id)
    context=device.make_context(flags=driver.ctx_flags.SCHED_YIELD)        
    stream=driver.Stream()
    #print model_path
    mod=driver.module_from_file(os.path.join(os.path.dirname(os.path.abspath(__file__)),'mat_mult.cubin'))
    
    kComputeMatMult = mod.get_function("kComputeMatMult")
    kComputeMatMult.prepare([np.int32,np.int32,np.int32,'P','P','P'])
    
    if context:
        context.pop()
    print 'init ok'
Exemple #10
0
def load_cuda_code_individual():
    global _update_individuals_fn

    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            print(f"{ce}")

    _cuda_module = cuda.module_from_file(str(path))

    _update_individuals_fn = _cuda_module.get_function("update_individuals")
Exemple #11
0
    def __init__(self,
                 kernel_set="fgemm_int64_wide32",
                 locks=1024,
                 calc_partials=True,
                 bench=False):

        m = re.search(r'wide(\d+)', kernel_set)
        if m:
            self.width = int(m.group(1))
        else:
            raise ValueError("Invalid kernel_set")

        self.locks = locks
        self.module = drv.module_from_file("kernels/" + kernel_set + ".cubin")
        self.mode = 0 if calc_partials else 4
        self.fgemm = dict()
        for op in ("nt", "nn", "tn"):
            mod = self.module.get_function(kernel_set + "_" + op)
            mod.prepare("PPPIIIIIIHH")
            self.fgemm[op] = mod

        fprop_conv = self.module.get_function("fprop_conv_float32_K64N64T64")
        fprop_conv.prepare("PPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["fprop_conv"] = fprop_conv

        bprop_conv = self.module.get_function(
            "bprop_conv_float32_CRST64N64T64")
        bprop_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["bprop_conv"] = bprop_conv

        udpate_conv = self.module.get_function(
            "update_conv_float32_CRST64K64T64")
        udpate_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["update_conv"] = udpate_conv

        self.gpulock = drv.mem_alloc(locks * 4)
        drv.memset_d32(self.gpulock, 0, locks)

        self.bench = bench
        if bench:
            self.start = drv.Event()
            self.end = drv.Event()
Exemple #12
0
    def __init__(self, units, in_units, **kwargs):
        super(GPUMatMultLayer, self).__init__(**kwargs)
        gpu_id = 0
        device = driver.Device(gpu_id)
        self.context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD)
        self.stream = driver.Stream()
        mod = driver.module_from_file(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'mat_mult.cubin'))

        self.kComputeMatMult = mod.get_function("kComputeMatMult")
        self.kComputeMatMult.prepare(
            [np.int32, np.int32, np.int32, 'P', 'P', 'P'])
        self.feature_dim = in_units
        self.output_dim = units
        with self.name_scope():
            self.weight = self.params.get('weight', shape=(in_units, units))
            self.bias = self.params.get('bias', shape=(units, ))
        if self.context:
            self.context.pop()
Exemple #13
0
def get_kernel(kernel_name):

    #import ipdb; ipdb.set_trace()

    kernel_spec = kernels[kernel_name]
    params  = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        else:
            sig += "I"

    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func   = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    # print("Loaded: " + kernel_name)
    return func
Exemple #14
0
def get_kernel(kernel_name):

    #import ipdb; ipdb.set_trace()

    kernel_spec = kernels[kernel_name]
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        else:
            sig += "I"

    module = drv.module_from_file(
        os.path.join(cubin_dir, kernel_name + ".cubin"))
    func = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    # print("Loaded: " + kernel_name)
    return func
    def __init__(self, kernel_set="fgemm_int64_wide32", locks=1024, calc_partials=True, bench=False):

        m = re.search( r'wide(\d+)', kernel_set)
        if m:
            self.width  = int(m.group(1))
        else:
            raise ValueError("Invalid kernel_set")
        
        self.locks   = locks
        self.module  = drv.module_from_file("kernels/" + kernel_set + ".cubin")
        self.mode    = 0 if calc_partials else 4
        self.fgemm   = dict()
        for op in ("nt", "nn", "tn"):
            mod = self.module.get_function(kernel_set + "_" + op)
            mod.prepare("PPPIIIIIIHH")
            self.fgemm[op] = mod

        fprop_conv = self.module.get_function("fprop_conv_float32_K64N64T64")
        fprop_conv.prepare("PPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["fprop_conv"] = fprop_conv

        bprop_conv = self.module.get_function("bprop_conv_float32_CRST64N64T64")
        bprop_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["bprop_conv"] = bprop_conv

        udpate_conv = self.module.get_function("update_conv_float32_CRST64K64T64")
        udpate_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
        self.fgemm["update_conv"] = udpate_conv

        self.gpulock = drv.mem_alloc(locks*4)
        drv.memset_d32(self.gpulock, 0, locks)

        self.bench = bench
        if bench:
            self.start = drv.Event()
            self.end   = drv.Event()
Exemple #16
0
def loadKernel():
    module = drv.module_from_file("ssgls.ptx")
    glsKernel = module.get_function("doGLS_kernel")

    return module, glsKernel
Exemple #17
0
def get_kernel(base_name, options=None):

    major, minor = _get_compute_capability()
    if major < 5:
        raise RuntimeError(
            "sass kernels require Maxwell or greater class hardware")

    arch = "sm_%d%d" % (major, minor)

    libprefix = "PERL5LIB=%s" % maxas_dir
    maxas_i = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -i -w"]
    maxas_p = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -p"]

    kernel_spec = kernels[base_name]
    kernel_name = base_name

    # static options
    if "args" in kernel_spec:
        for pair in kernel_spec["args"].items():
            maxas_i.append("-D%s %s" % pair)
            maxas_p.append("-D%s %s" % pair)

    # dynamic options
    if options is not None:
        for opt in options:
            if type(opt) is tuple:
                maxas_i.append("-D%s %s" % opt)
                maxas_p.append("-D%s %s" % opt)
                kernel_name += "_%s%s" % opt
            else:
                maxas_i.append("-D%s 1" % opt)
                maxas_p.append("-D%s 1" % opt)
                kernel_name += "_%s" % opt

    maxas_i.insert(2, "-k " + kernel_name)

    sass_name = kernel_spec["sass"] + ".sass"
    cubin_name = kernel_name + ".cubin"
    cubin_dir = _get_cache_dir([arch, 'cubin'])

    ptx_version = "4.2" if major < 6 else "5.0"
    ptx_file = get_ptx_file(kernel_spec, kernel_name, arch, ptx_version)
    sass_file = os.path.join(sass_dir, sass_name)
    cubin_file = os.path.join(cubin_dir, cubin_name)

    if not os.path.exists(sass_file):
        raise RuntimeError("Missing: %s for kernel: %s" %
                           (sass_name, kernel_name))

    ptx_mtime = os.path.getmtime(ptx_file)
    cubin_mtime = os.path.getmtime(cubin_file) if os.path.exists(
        cubin_file) else 0

    build_cubin = False
    if ptx_mtime > cubin_mtime:
        build_cubin = True

    includes = extract_includes(sass_name)
    for include, include_mtime in includes:
        if include_mtime > cubin_mtime:
            build_cubin = True
            break

    if build_cubin:
        # build the cubin and run maxas in the same command
        # we don't want the chance of a generated cubin not processed by maxas (in case user hits ^C in between these steps)
        run_command(["ptxas -v -arch", arch, "-o", cubin_file, ptx_file, ";"] +
                    maxas_i + [sass_file, cubin_file])
        cubin_mtime = time.time()

    # output preprocessed and disassembled versions in debug mode
    if debug:
        pre_dir = _get_cache_dir([arch, 'pre'])
        dump_dir = _get_cache_dir([arch, 'dump'])

        pre_file = os.path.join(pre_dir, kernel_name + "_pre.sass")
        dump_file = os.path.join(dump_dir, kernel_name + "_dump.sass")
        pre_mtime = os.path.getmtime(pre_file) if os.path.exists(
            pre_file) else 0
        dump_mtime = os.path.getmtime(dump_file) if os.path.exists(
            dump_file) else 0

        for include, include_mtime in includes:
            if include_mtime > pre_mtime:
                run_command(maxas_p + [sass_file, pre_file])
                break

        if cubin_mtime > dump_mtime:
            run_command(["nvdisasm -c", cubin_file, ">", dump_file])

    # generate the function signature for pycuda
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        elif ptype == 'unsigned':
            sig += "I"
        else:
            sig += "i"

    module = drv.module_from_file(cubin_file)
    func = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    func.name = kernel_name
    func.static_shared = eval(kernel_spec["share"])

    return func
Exemple #18
0
def test_check_and_build_cu():
    '''
    check_and_build(): cuda
    '''
    import yaml
    import os
    import pycuda.driver as cuda
    import atexit
    from build import check_and_make_parameter_header, check_and_build, clean

    code_type = 'cu'

    dpath = join(current_dpath, 'src')
    src_dir = {'f90':'f90', 'c':'c', 'cu':'cuda', 'cl':'opencl'}[code_type]
    build_dpath = join(dpath, src_dir, 'build')
    with open(join(dpath, 'build.yaml'), 'r') as f: build_dict = yaml.load(f)

    #
    # Remove previous generated files
    #
    ret, out, err = capture(clean)(code_type, dpath)
    equal( len(os.listdir(build_dpath)), 0 )

    #
    # Make and compile header file
    # verify stdout and file existence
    #
    ret, out, err = capture(check_and_make_parameter_header)(code_type, dpath)
    ret, out, err = capture(check_and_build)(code_type, dpath)
    expect = '''
[compile] amb.cu using the PyCUDA build
[compile] apb.cu using the PyCUDA build
'''
    equal('\n'+out+'\n', expect)

    #
    # PyCUDA environment
    # Load modules
    #
    cuda.init()
    device = cuda.Device(0)
    context = device.make_context()
    atexit.register(context.pop)

    lib_apb = cuda.module_from_file( join(build_dpath, 'apb.cubin') )
    lib_amb = cuda.module_from_file( join(build_dpath, 'amb.cubin') )

    apb = lib_apb.get_function('apb')
    amb = lib_amb.get_function('amb')


    #
    # setup
    #
    nx = 1000000
    a = np.random.rand(nx)
    b = np.random.rand(nx)
    c = np.random.rand(nx)
    c2 = c.copy()

    a_dev = cuda.to_device(a)
    b_dev = cuda.to_device(b)
    c_dev = cuda.to_device(c)
    c2_dev = cuda.to_device(c2)

    with open(join(dpath, 'apb.yaml'), 'r') as f: apb_dict = yaml.load(f)
    with open(join(dpath, 'amb.yaml'), 'r') as f: amb_dict = yaml.load(f)
    kk = apb_dict['kk']
    lll = apb_dict['lll']
    mm = amb_dict['section']['mm']

    ref = kk*a + lll*b + mm*c


    #
    # verify results
    #
    apb(np.int32(0), np.int32(nx), a_dev, b_dev, c_dev, block=(512,1,1), grid=(nx//512+1,1))
    cuda.memcpy_dtoh(c, c_dev)
    aa_equal(ref, c, 14)

    amb(np.int32(0), np.int32(nx), a_dev, b_dev, c2_dev, block=(512,1,1), grid=(nx//512+1,1))
    cuda.memcpy_dtoh(c2, c2_dev)
    aa_equal(ref, c2, 14)


    #
    # verify stdout if revision
    #
    ret, out, err = capture(check_and_build)(code_type, dpath)
    expect = '''
./cuda/build/amb.cu is up to date.
./cuda/build/apb.cu is up to date.
'''
    equal('\n'+out.replace(dpath,'.')+'\n', expect)


    #
    # verify stdout if partial revision
    #
    os.remove(join(build_dpath, 'amb.cu'))
    ret, out, err = capture(check_and_build)(code_type, dpath)
    expect = '''
[compile] amb.cu using the PyCUDA build
./cuda/build/apb.cu is up to date.
'''
    equal('\n'+out.replace(dpath,'.')+'\n', expect)
Exemple #19
0
 def setup(self):
     self.mod = cuda.module_from_file('diffusion_kernel.cubin')
     self.func = self.mod.get_function("temperature_update16x16")
Exemple #20
0
#!/usr/bin/env python2.7
# gpu_trunc_norm.py
# Author: Nick Ulle

from __future__ import division
from math import ceil, sqrt

import numpy as np

import pycuda.autoinit
import pycuda.driver as cuda

# ----- PTX Module Setup
# Import PTX module and setup its functions.
_gpu_module = cuda.module_from_file('bin/gpu_trunc_norm.ptx')

_gpu_trunc_norm = _gpu_module.get_function('gpu_trunc_norm')
_gpu_curand_init = _gpu_module.get_function('gpu_curand_init')
_gpu_curand_deinit = _gpu_module.get_function('gpu_curand_deinit')

# ----- Globals
# A pointer to the curand RNG states.
_rng_state = None

# The total number of threads per block.
# This should match NUM_RNG in gpu_trunc_norm.cu
_NUM_THREADS = 128


# ----- Functions
def gpu_trunc_norm(n, mean, sd, a, b):
#!/usr/bin/env python2.7
# gpu_trunc_norm.py
# Author: Nick Ulle

from __future__ import division
from math import ceil, sqrt

import numpy as np

import pycuda.autoinit
import pycuda.driver as cuda

# ----- PTX Module Setup
# Import PTX module and setup its functions.
_gpu_module = cuda.module_from_file('bin/gpu_trunc_norm.ptx')

_gpu_trunc_norm = _gpu_module.get_function('gpu_trunc_norm')
_gpu_curand_init = _gpu_module.get_function('gpu_curand_init')
_gpu_curand_deinit = _gpu_module.get_function('gpu_curand_deinit')

# ----- Globals
# A pointer to the curand RNG states.
_rng_state = None

# The total number of threads per block.
# This should match NUM_RNG in gpu_trunc_norm.cu
_NUM_THREADS = 128

# ----- Functions
def gpu_trunc_norm(n, mean, sd, a, b):
    blocks = int(ceil(sqrt(n / _NUM_THREADS)))
def calc_holo(scatterer, schema):

  # Extract the necessary components from schema
  imsize1 = uint32(schema.shape[0])
  imsize2 = uint32(schema.shape[1])
  pxsize = float64(schema.spacing[0])
  wavevec = float64(schema.optics.wavevec)
  med_wavelen = float64(schema.optics.med_wavelen)
  einc = schema.optics.polarization

  start = time()
  # Extract the necessary components from scatterer
  # and calculate scattering coefficients
  # Single sphere case
  if isinstance(scatterer, Sphere):
    asbs = scat_coeffs(scatterer, schema.optics)
    num = uint32(1)
    sphLocation = scatterer.center
    x = float64(array([sphLocation[0]]))
    y = float64(array([sphLocation[1]]))
    z = float64(array([sphLocation[2]]))
    r = float64(array(scatterer.r))
    n = array(scatterer.n, dtype=complex)
  # Multi-sphere case
  else:
    asbs = scat_coeffs(scatterer.scatterers[0], schema.optics)
    num = uint32(len(scatterer.get_component_list()))
    sphLocation = scatterer.centers
    x = float64(array(sphLocation[:,0]))
    y = float64(array(sphLocation[:,1]))
    z = float64(array(sphLocation[:,2]))
    r = float64(array(scatterer.r))
    n = array(scatterer.n, dtype=complex)
  
  npoints = uint32(imsize1*imsize2)
  _, nstop = uint32(asbs.shape)

  # 2D array for storing final hologram
  holo = float64(zeros([imsize1,imsize2]))

  ##################################################### 
  #  Inputs to the kernel call:  
  #  Python var      Size        Py type    GPU type
  #  npoints           1         np.uint32  unsigned int
  #  asbs          2 x nstop     complex    dcmplx
  #  nstop             1         np.uint32  unsigned int
  #  imsize1            1         np.uint32  unsigned int
  #  imsize2            1         np.uint32  unsigned int
  #  pxsize            1         float64    double
  #  x,y,z,r,n      1 x num      float64    double
  #  wavevec           1         float64    double
  #  med_wavelen       1         float64    double
  #  num               1         np.uint32  unsigned int
  #  einc              2         float64    double
  #  holo       imsize x imsize  float64    double
  #####################################################

  ############################################################
  ## kernel_holo - Replaces asm_fullradial, calc_scat_field, 
  ## and fieldstocart
  ############################################################

  #uncomment to compile and cache the module
  #kernel_holo = cuda_compile(kernel_holo_source,"kernel_holo")

  #uncomment to load from cached file "precompiled.cubin"
  source_module = cu.module_from_file("precompiled.cubin")
  kernel_holo = source_module.get_function("kernel_holo")

  asbs_d = gpu.to_gpu(asbs.copy())
  einc_d = gpu.to_gpu(einc)
  holo_d = gpu.to_gpu(holo)

  x_d = gpu.to_gpu(x)
  y_d = gpu.to_gpu(y)
  z_d = gpu.to_gpu(z)
  r_d = gpu.to_gpu(r)
  n_d = gpu.to_gpu(n.copy())
  
  nblocks = 2**6
  blocksize = (nblocks,1,1)     
  gridsize  = (int((npoints/nblocks)+(npoints%nblocks)),1)

  start_gpu_time = cu.Event()
  end_gpu_time = cu.Event()
  
  start_gpu_time.record()
  start_test = time()

  # assume the spheres are the same size, just pass asbs once
  kernel_holo(npoints, asbs_d, nstop, imsize2, imsize1, pxsize,
    x_d, y_d, z_d, r_d, n_d, wavevec, med_wavelen, num,
    einc_d, holo_d, block=blocksize, grid=gridsize)

  end_gpu_time.record()
  end_gpu_time.synchronize()
  gpu_time = start_gpu_time.time_till(end_gpu_time) * 1e-3 
  holo = holo_d.get()

  stop = time()
  print "holo computation took %f sec. of which %f sec. were on the GPU" % (stop-start, gpu_time)

  holo = Image(holo,optics=schema.optics, spacing = schema.spacing)

  return holo
Exemple #23
0
# kernel compile and import
kernel = '''
__global__ void daxpy(int nx, double a, double *x, double *y) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < nx) y[idx] = a*x[idx] + y[idx];
}
'''
from pycuda.compiler import SourceModule, compile
#mod = SourceModule(kernel, cache_dir='./')

cubin = compile(kernel, cache_dir='./')
#mod = cuda.module_from_buffer(cubin)

with open('daxpy.cubin', 'wb') as f: f.write(cubin)
mod = cuda.module_from_file('daxpy.cubin')
daxpy = mod.get_function('daxpy')

dev = pycuda.autoinit.device
print(dev.compute_capability())
#cuda.device_attribute['COMPUTE_CAPABILITY_MAJOR']
#cuda.device_attribute['COMPUTE_CAPABILITY_MINOR']


# setup
nx = 2**20

# allocation
a = np.random.rand()
x = np.random.rand(nx)
y = np.random.rand(nx)
Exemple #24
0
# compiling cubin and loading it up
if master:
    pass
#    import subprocess
#    subprocess.check_call(['nvcc', '--cubin', '-arch', 'sm_20', 'kernel.cu'])
    for i in range(slaves):
        mpi.send(None, dest = i, tag = 0xdeadbee)
#    debug('cubin compiled, msg sent')
else:
    import pycuda.driver as cuda
    cuda.init()
    device = cuda.Device(gpu_id)
    device.make_context()
    mpi.recv(source = master_rank, tag = 0xdeadbee)
    mod = cuda.module_from_file('kernel.cubin')
    debug('msg received, cubin loaded')
    debug('# of devices: %d' % cuda.Device.count())
    debug('gpu_id: %d' % gpu_id)
mpi.Barrier()



def init_nucleus(out_ar):
    cos = np.cos
    def bcc(x, y, z):
        return 0.166666666666666666667 * (cos(x + y) + cos(x + z) + cos(y + z) + cos(x - y) + cos(x - z) + cos(y - z))
    # xvec = np.linspace(0, lx, nx, endpoint = False)
    # yvec = np.linspace(0, ly, ny, endpoint = False)
    # zvec = np.linspace(mpi.rank * lz, (mpi.rank + 1) * lz, nz, endpoint = False)
    # X, Y, Z = np.meshgrid(xvec, yvec, zvec)
Exemple #25
0
        
        cu.init()
        d = cu.Device(1)
        ctx = d.make_context()

        kernel_size = 3
        block_size = (16, 16)
        #grid_size = calculate_grid_size((height, width), block_size)
        grid_size = (32, 32)
        #print(I.shape)
        #print(grid_size)

        I_gpu = cu.to_device(I.astype('float32'))
        J_gpu = cu.mem_alloc(J.nbytes)

        source = cu.module_from_file("sobel.cubin")

        kernel_naive = source.get_function("sobel_filter")
        kernel_naive.prepare(['P', 'P', 'Q', 'Q', 'Q', 'Q'])
        time = kernel_naive.prepared_timed_call(grid_size, block_size, I_gpu, J_gpu, height, width, kernel_size, 4)
        J1 = cu.from_device(J_gpu, shape=J.shape, dtype="float32")
        print("Time spent in kernel1: {}s".format(time() * 1e-3))

        print("L1 norm: {}".format( np.sum(np.sum( np.abs(J - J1) )) ))

        plt.imshow(J1)
        plt.show()
        
    finally:
        ctx.pop()
        print('\ndone')
Exemple #26
0
import sys

from gpuDA.gpuDA import GpuDA
from set_boundary import set_boundary_values

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

npz = 3
npy = 3
npx = 3

assert (size == npx * npy * npz)

mod = cuda.module_from_file('diffusion_kernel.cubin')
func = mod.get_function('temperature_update16x16')

# local sizes:
nx = 510
ny = 510
nz = 512

# global lengths:
lx = 0.3
ly = 0.3
lz = 0.3

# material properties:
alpha = 1e-5
Exemple #27
0
    def __init__(self, NGRID=16):

        self.mod = cuda.module_from_file("./cuda/graphene.cubin")
 def setup(self):
     self.mod = cuda.module_from_file('diffusion_kernel.cubin')
     self.func = self.mod.get_function("temperature_update16x16")
Exemple #29
0
	def gfx_init( self ) :
		try :
			print 'compiling'
			self.prog = sh.compile_program_vfg( 'shad/balls' )

			print 'compiled'

			self.loc_mmv = sh.get_loc(self.prog,'modelview' )
			self.loc_mp  = sh.get_loc(self.prog,'projection')
			self.l_color = sh.get_loc(self.prog,'color'     )
			self.l_size  = sh.get_loc(self.prog,'ballsize'  )

		except ValueError as ve :
			print "Shader compilation failed: " + str(ve)
			sys.exit(0)    

#        glUseProgram( self.prog )
#        glUniform1i( pointsid , 0 );
#        glUseProgram( 0 )

		#
		# cuda init
		#
		self.grid = (int(self.NUM/256)+1,1)
		self.block = (256,1,1)

		print 'CUDA: block %s , grid %s' % (str(self.block),str(self.grid))

		self.gpts = glGenBuffers(1)
		glBindBuffer( GL_ARRAY_BUFFER , self.gpts )
		glBufferData( GL_ARRAY_BUFFER , self.pts.nbytes , self.pts , GL_STREAM_DRAW )
		glBindBuffer( GL_ARRAY_BUFFER , 0 )

#        self.dprv = cuda_driver.mem_alloc( self.prv.nbytes )
		self.dvel = cuda_driver.mem_alloc( self.vel.nbytes )
		self.dacc = cuda_driver.mem_alloc( self.acc.nbytes )
		self.dfrs = cuda_driver.mem_alloc( self.frs.nbytes )
		self.dmas = cuda_driver.mem_alloc( self.mas.nbytes )
		self.ddns = cuda_driver.mem_alloc( self.dns.nbytes )

#        cuda_driver.memcpy_htod( self.dprv , self.prv )
		cuda_driver.memcpy_htod( self.dvel , self.vel )
		cuda_driver.memcpy_htod( self.dacc , self.acc )
		cuda_driver.memcpy_htod( self.dfrs , self.frs )
		cuda_driver.memcpy_htod( self.dmas , self.mas )
		cuda_driver.memcpy_htod( self.ddns , self.dns )

		mod = cuda_driver.module_from_file( 'sph_kernel.cubin' )

		self.update_pts = mod.get_function("update_pts")
		self.update_pts.prepare( "PPPfi" )

		self.update_vel = mod.get_function("update_vel")
		self.update_vel.prepare( "PPPfi" )

		self.update_dns = mod.get_function("update_dns")
		self.update_dns.prepare( "PPPi" )

		self.update_frs = mod.get_function("update_frs")
		self.update_frs.prepare( "PPPPPi" )

		self.collisions = mod.get_function("collisions")
		self.collisions.prepare( "PPPfPfi" )
Exemple #30
0
def get_kernel(base_name, options=None):

    attributes = drv.Context.get_device().get_attributes()
    major = attributes[drv.device_attribute.COMPUTE_CAPABILITY_MAJOR]
    minor = attributes[drv.device_attribute.COMPUTE_CAPABILITY_MINOR]
    if major < 5:
        raise RuntimeError("sass kernels require Maxwell or greater class hardware")

    arch = "sm_%d%d" % (major, minor)

    libprefix = "PERL5LIB=%s" % (maxas_dir)
    maxas_i = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -i -w"]
    maxas_p = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -p"]

    kernel_spec = kernels[base_name]
    kernel_name = base_name

    if "args" in kernel_spec:
        for pair in kernel_spec["args"].items():
            maxas_i.append("-D%s %s" % pair)
            maxas_p.append("-D%s %s" % pair)

    if options is not None:
        for opt in options:
            if type(opt) is tuple:
                maxas_i.append("-D%s %s" % opt)
                maxas_p.append("-D%s %s" % opt)
                kernel_name += "_%s%s" % opt
            else:
                maxas_i.append("-D%s 1" % opt)
                maxas_p.append("-D%s 1" % opt)
                kernel_name += "_%s" % opt

    maxas_i.insert(2, "-k " + kernel_name)

    sass_name  = kernel_spec["sass"] + ".sass"
    cubin_name = kernel_name + ".cubin"

    ptx_version = "4.2" if major < 6 else "5.0"
    ptx_file   = get_ptx_file(kernel_spec, kernel_name, arch, ptx_version)
    sass_file  = os.path.join(sass_dir, sass_name)
    cubin_file = os.path.join(cubin_dir, cubin_name)

    if not os.path.exists(sass_file):
        raise RuntimeError("Missing: %s for kernel: %s" % (sass_name, kernel_name))

    ptx_age   = os.path.getmtime(ptx_file)
    cubin_age = os.path.getmtime(cubin_file) if os.path.exists(cubin_file) else 0

    if ptx_age > cubin_age:
        run_command([ "ptxas -v -arch", arch, "-o", cubin_file, ptx_file ])
        cubin_age = 0

    includes = extract_includes(sass_name)

    for include, include_age in includes:
        if include_age > cubin_age:
            run_command(maxas_i + [sass_file, cubin_file])
            cubin_age = include_age
            break

    if debug:
        pre_file  = os.path.join(pre_dir,  kernel_name + "_pre.sass")
        dump_file = os.path.join(dump_dir, kernel_name + "_dump.sass")
        pre_age   = os.path.getmtime(pre_file)  if os.path.exists(pre_file)  else 0
        dump_age  = os.path.getmtime(dump_file) if os.path.exists(dump_file) else 0

        for include, include_age in includes:
            if include_age > pre_age:
                run_command(maxas_p + [sass_file, pre_file])
                break

        if cubin_age > dump_age:
            run_command(["nvdisasm -raw", cubin_file, ">", dump_file])

    params  = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        elif ptype == 'unsigned':
            sig += "I"
        else:
            sig += "i"

    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func   = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    return func
Exemple #31
0
import pycuda.driver as cuda
import pycuda.autoinit
# from pycuda.compiler import SourceModule
from pycuda.driver import module_from_file
from pycuda.autoinit import context
import pycuda.gpuarray as gpuarray
import numpy as np

import time

### ------------------------------------------
### start timing the start of the end-to-end processing time
### ------------------------------------------

## load precompiled cubin file
mod = module_from_file("knn_kernels.cubin")

# link to the kernel function

lr_dist = mod.get_function('distKernel')
get_k = mod.get_function('getKLabels')
sort = mod.get_function('sort')

#------------------------------------------------------------------------------
# parameters
#------------------------------------------------------------------------------
neighbors = 3

# input data

#open file
Exemple #32
0
import pycuda.driver as cuda
import pycuda.autoinit
# from pycuda.compiler import SourceModule
from pycuda.driver import module_from_file

import pycuda.gpuarray as gpuarray
import numpy as np

import time

### ------------------------------------------
### start timing the start of the end-to-end processing time
### ------------------------------------------

## load precompiled cubin file
mod = module_from_file("lr_kernels.cubin")

# link to the kernel function

lr_MM = mod.get_function('MatrixMultiplyKernel')
lr_sig = mod.get_function('sub_sigKernel')
lr_dist = mod.get_function('distKernel')


#------------------------------------------------------------------------------
# parameters
#------------------------------------------------------------------------------
maxIter = 100
alpha = 0.001
tol = 0.0001
Exemple #33
0
# number of threads that one block can handle
THREADS_PER_BLOCK = device.get_attribute(cuda.device_attribute.MAX_THREADS_PER_BLOCK)
# number of blocks required to handle all atoms
N_BLOCKS = int(np.ceil(float(atoms.get_number_of_atoms())/\
               THREADS_PER_BLOCK))
# maximum number of threads per block
N_THREAD = int(np.ceil(float(atoms.get_number_of_atoms())/\
               N_BLOCKS))

path=os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe())))
# ------------------------
# precompiled kernel is preferrable
# ------------------------
kernel = path + '/Lennard-Jones_BpV.cubin'
if os.path.isfile(kernel):
    kernel = cuda.module_from_file(kernel)
else:
    # ------------------------
    # just-in-time compilation
    # ------------------------
    with open(path + '/Lennard-Jones_BpV.cu') as ifs:
        kernel_text = ifs.read()
    kernel = SourceModule(kernel_text)

energy = kernel.get_function('Lennard_Jones_BpV')

shared_mem = DIM * N_THREAD * np.float32().nbytes
cuda_grid = (int(ngrid), N_BLOCKS)
cuda_blocks = (N_THREAD, 1, 1)

energy(cuda.InOut(grid), ngrid,
Exemple #34
0
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
import numpy as np

from pycuda.compiler import SourceModule

my_mod = drv.module_from_file("Isoparametric.cubin")

NumElements = 1000
NodesPerElement = 8

Elements = np.zeros((NodesPerElement, NumElements), dtype=np.int32)

id = 0
for i in xrange(NodesPerElement):
    for j in xrange(NumElements):
        Elements[i, j] = id
        id = id + 1

print Elements
Exemple #35
0
N_BLOCKS = int(np.ceil(float(atoms.get_number_of_atoms())/THREADS_PER_BLOCK))
# maximum number of threads per block
N_THREAD = int(np.ceil(float(atoms.get_number_of_atoms())/N_BLOCKS))

path=os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe())))
# ------------------------
# just-in-time compilation
# ------------------------
#with open(path + '/distance_BpV.cu') as ifs:
    #kernel_text = ifs.read()
#kernel = SourceModule(kernel_text)
# ------------------------
# precompiled kernel
# ------------------------
kernel = path + '/distance_BpV.cubin'
kernel = cuda.module_from_file(path + '/distance_BpV.cubin')

distance = kernel.get_function('distance_BpV')

shared_mem = DIM * N_THREAD * np.float32().nbytes
cuda_grid = (int(ngrid), N_BLOCKS)
cuda_blocks = (N_THREAD, 1, 1)

distance(cuda.InOut(grid), ngrid,
         cuda.In(pv),
         cuda.In(ndiv),
         cuda.In(spos),
         cuda.In(radii),
         natoms,
         block=cuda_blocks, grid=cuda_grid, shared=shared_mem)
Exemple #36
0
def _get_module(path, clss, op, size=None):

    size = "" if size is None else "_" + size
    cubin = "{0}_{1}{2}.cubin".format(clss, op, size)
    return drv.module_from_file(os.path.join(path, cubin))
Exemple #37
0
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
import numpy as np

from pycuda.compiler import SourceModule


my_mod = drv.module_from_file("Isoparametric.cubin")

NumElements = 1000;
NodesPerElement = 8

Elements = np.zeros((NodesPerElement,NumElements), dtype=np.int32)

id = 0;
for i in xrange(NodesPerElement):
	for j in xrange(NumElements):
		Elements[i,j] = id
		id = id + 1


print Elements			
			
Exemple #38
0
import matplotlib.pyplot as plt
from time import time
import pycuda.driver as cuda
import pycuda.autoinit


# Setup
nx, ny = 1200, 1000
tmax, tgap = 500, 100
c = numpy.ones((nx,ny), order='F')*0.25
f = numpy.zeros_like(c, order='F')
c_gpu = cuda.to_device(c)
f_gpu = cuda.to_device(f)
g_gpu = cuda.to_device(f)

mod = cuda.module_from_file('core.cubin')
advance_src = mod.get_function('advance_src')
#advance = mod.get_function('advance')
advance = mod.get_function('advance_smem')


# Plot using the matplotlib
plt.ion()
imag = plt.imshow(c.T, origin='lower', vmin=-0.2, vmax=0.2)
plt.colorbar()


# Main loop for the time evolution
inx, iny = numpy.int32(nx), numpy.int32(ny)
bs, gs = (256,1,1), (nx*ny//256+1,1)
t0 = time()
Exemple #39
0
def get_kernel(base_name, options=None):

    attributes = drv.Context.get_device().get_attributes()
    major = attributes[drv.device_attribute.COMPUTE_CAPABILITY_MAJOR]
    minor = attributes[drv.device_attribute.COMPUTE_CAPABILITY_MINOR]
    if major < 5:
        raise RuntimeError("sass kernels require Maxwell or greater class hardware")

    arch = "sm_%d%d" % (major, minor)

    libprefix = "PERL5LIB=%s" % (maxas_dir)
    maxas_i = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -i -w"]
    maxas_p = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -p"]

    kernel_spec = kernels[base_name]
    kernel_name = base_name

    if "args" in kernel_spec:
        for pair in kernel_spec["args"].items():
            maxas_i.append("-D%s %s" % pair)
            maxas_p.append("-D%s %s" % pair)

    if options is not None:
        for opt in options:
            if type(opt) is tuple:
                maxas_i.append("-D%s %s" % opt)
                maxas_p.append("-D%s %s" % opt)
                kernel_name += "_%s%s" % opt
            else:
                maxas_i.append("-D%s 1" % opt)
                maxas_p.append("-D%s 1" % opt)
                kernel_name += "_%s" % opt

    maxas_i.insert(2, "-k " + kernel_name)

    sass_name  = kernel_spec["sass"] + ".sass"
    cubin_name = kernel_name + ".cubin"

    ptx_version = "4.2" if major < 6 else "5.0"
    ptx_file   = get_ptx_file(kernel_spec, kernel_name, arch, ptx_version)
    sass_file  = os.path.join(sass_dir, sass_name)
    cubin_file = os.path.join(cubin_dir, cubin_name)

    if not os.path.exists(sass_file):
        raise RuntimeError("Missing: %s for kernel: %s" % (sass_name, kernel_name))

    ptx_age   = os.path.getmtime(ptx_file)
    cubin_age = os.path.getmtime(cubin_file) if os.path.exists(cubin_file) else 0

    if ptx_age > cubin_age:
        run_command([ "ptxas -v -arch", arch, "-o", cubin_file, ptx_file ])
        cubin_age = 0

    includes = extract_includes(sass_name)

    for include, include_age in includes:
        if include_age > cubin_age:
            run_command(maxas_i + [sass_file, cubin_file])
            cubin_age = include_age
            break

    if debug:
        pre_file  = os.path.join(pre_dir,  kernel_name + "_pre.sass")
        dump_file = os.path.join(dump_dir, kernel_name + "_dump.sass")
        pre_age   = os.path.getmtime(pre_file)  if os.path.exists(pre_file)  else 0
        dump_age  = os.path.getmtime(dump_file) if os.path.exists(dump_file) else 0

        for include, include_age in includes:
            if include_age > pre_age:
                run_command(maxas_p + [sass_file, pre_file])
                break

        if cubin_age > dump_age:
            run_command(["nvdisasm -raw", cubin_file, ">", dump_file])

    params  = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        elif ptype == 'unsigned':
            sig += "I"
        else:
            sig += "i"

    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func   = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    return func
Exemple #40
0
    cuda.device_attribute.MAX_THREADS_PER_BLOCK)
# number of blocks required to handle all atoms
N_BLOCKS = int(np.ceil(float(atoms.get_number_of_atoms())/\
               THREADS_PER_BLOCK))
# maximum number of threads per block
N_THREAD = int(np.ceil(float(atoms.get_number_of_atoms())/\
               N_BLOCKS))

path = os.path.dirname(
    os.path.realpath(inspect.getfile(inspect.currentframe())))
# ------------------------
# precompiled kernel is preferrable
# ------------------------
kernel = path + '/Lennard-Jones_BpV.cubin'
if os.path.isfile(kernel):
    kernel = cuda.module_from_file(kernel)
else:
    # ------------------------
    # just-in-time compilation
    # ------------------------
    with open(path + '/Lennard-Jones_BpV.cu') as ifs:
        kernel_text = ifs.read()
    kernel = SourceModule(kernel_text)

energy = kernel.get_function('Lennard_Jones_BpV')

shared_mem = DIM * N_THREAD * np.float32().nbytes
cuda_grid = (int(ngrid), N_BLOCKS)
cuda_blocks = (N_THREAD, 1, 1)

energy(cuda.InOut(grid),
Exemple #41
0
import sys

from gpuDA.gpuDA import GpuDA
from set_boundary import set_boundary_values

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

npz = 3
npy = 3
npx = 3

assert(size == npx*npy*npz)

mod = cuda.module_from_file('diffusion_kernel.cubin')
func = mod.get_function('temperature_update16x16')

# local sizes:
nx = 510
ny = 510
nz = 512

# global lengths:
lx = 0.3
ly = 0.3
lz = 0.3

# material properties:
alpha = 1e-5