def __init__(
    self,
    context,
    device,
    nx,
    ny,
    nz,
    coeff_use="e",
    precision_float="single",
    local_work_size=256,
    global_work_size=0,
):
    """
    Set up OpenCL buffers and build the FDTD core program on one GPU device.

    Parameters:
        context: cl.Context used for all buffer allocations.
        device: cl.Device the command queue is bound to.
        nx, ny, nz: int grid dimensions.
        coeff_use: which coefficient arrays to allocate: '', 'e', 'h' or 'eh'.
        precision_float: 'single' or 'double' floating-point precision.
        local_work_size: OpenCL local work size (also substituted for the
            kernel's DX macro).
        global_work_size: OpenCL global work size; 0 selects an optimal value
            via common_gpu.get_optimal_gs().
    """
    common.check_type("context", context, cl.Context)
    common.check_type("device", device, cl.Device)
    common.check_type("nx", nx, int)
    common.check_type("ny", ny, int)
    common.check_type("nz", nz, int)
    common.check_type("global_work_size", global_work_size, int)
    common.check_type("local_work_size", local_work_size, int)
    common.check_value("coeff_use", coeff_use, ("", "e", "h", "eh"))
    common.check_value("precision_float", precision_float, ("single", "double"))

    self.context = context
    self.device = device
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ls = local_work_size
    self.gs = global_work_size
    self.coeff_use = coeff_use

    self.dtype = {"single": np.float32, "double": np.float64}[precision_float]
    self.dtype_str = {"single": "float", "double": "double"}[precision_float]
    self.dtype_str_list = {
        "single": ["float", ""],
        "double": ["double", "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"],
    }[precision_float]
    self.device_type = "gpu"

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    self.align_size = a_size = {"single": 16, "double": 8}[precision_float]  # 64 Bytes
    self.pad = pad = int(np.ceil(float(nz) / a_size) * a_size) - nz
    # Slice that strips the padded tail off the z axis of a pitched array.
    self.slz = slice(None, None) if pad == 0 else slice(None, -pad)
    self.nz_pitch = nz_pitch = nz + pad
    # PAD macro value: '' when unpadded, else '-<pad>' — presumably consumed
    # as a negative offset by core.cl; TODO confirm against the kernel source.
    self.dtype_str_list.append("" if pad == 0 else "-%s" % pad)

    # ns, queue, global_size
    self.ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    self.ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    self.ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    self.queue = cl.CommandQueue(self.context, self.device)
    if self.gs == 0:
        self.gs = common_gpu.get_optimal_gs(self.device)

    # on/off the coefficient arrays
    self.ce_on = "e" in self.coeff_use
    self.ch_on = "h" in self.coeff_use

    # allocations: six zero-filled E/H field buffers on the device
    f = np.zeros(self.ns_pitch, dtype=self.dtype)
    cf = np.ones_like(f) * 0.5
    mf = cl.mem_flags
    self.eh_bufs = [cl.Buffer(self.context, mf.READ_WRITE, f.nbytes) for i in range(6)]
    for eh_buf in self.eh_bufs:
        cl.enqueue_copy(self.queue, eh_buf, f)
    self.ex_buf, self.ey_buf, self.ez_buf = self.eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = self.eh_bufs[3:]

    # NOTE(review): the coefficient buffers are allocated but not written
    # here — cf only supplies nbytes; presumably they are filled later by a
    # setter. Confirm before relying on their initial contents.
    if self.ce_on:
        self.ce_bufs = [cl.Buffer(self.context, mf.READ_ONLY, cf.nbytes) for i in range(3)]
        self.cex_buf, self.cey_buf, self.cez_buf = self.ce_bufs
    if self.ch_on:
        self.ch_bufs = [cl.Buffer(self.context, mf.READ_ONLY, cf.nbytes) for i in range(3)]
        self.chx_buf, self.chy_buf, self.chz_buf = self.ch_bufs
    del f, cf

    # program: substitute the template macros and build the kernels
    macros = ["ARGS_CE", "CEX", "CEY", "CEZ", "ARGS_CH", "CHX", "CHY", "CHZ", "DX", "DTYPE", "PRAGMA_fp64", "PAD"]
    values = ["", "0.5", "0.5", "0.5", "", "0.5", "0.5", "0.5", str(self.ls)] + self.dtype_str_list

    self.e_args = self.ns_pitch + self.eh_bufs
    self.h_args = self.ns_pitch + self.eh_bufs
    if self.ce_on:
        values[:4] = [
            ", __global DTYPE *cex, __global DTYPE *cey, __global DTYPE *cez",
            "cex[idx]",
            "cey[idx]",
            "cez[idx]",
        ]
        self.e_args += self.ce_bufs
    if self.ch_on:
        values[4:8] = [
            ", __global DTYPE *chx, __global DTYPE *chy, __global DTYPE *chz",
            "chx[idx]",
            "chy[idx]",
            "chz[idx]",
        ]
        self.h_args += self.ch_bufs

    # BUG FIX: the original used open(...).read(), leaking the file handle;
    # a with-block closes it deterministically.
    with open(common_gpu.src_path + "core.cl") as kernel_file:
        ksrc = common.replace_template_code(kernel_file.read(), macros, values)
    self.program = cl.Program(self.context, ksrc).build()
def __init__(self, context, device,
             nx, ny, nz,
             coeff_use='',
             precision_float='single',
             local_work_size=256,
             global_work_size=0):
    """
    Set up the OpenCL queue and padded field/coefficient device buffers.

    Parameters:
        context: cl.Context used for all buffer allocations.
        device: cl.Device the command queue is bound to.
        nx, ny, nz: int grid dimensions.
        coeff_use: which coefficient arrays to allocate: '', 'e', 'h' or 'eh'.
        precision_float: 'single' or 'double'; silently downgraded to
            'single' (with a warning) when the device has no fp64 extension.
        local_work_size: OpenCL local work size.
        global_work_size: OpenCL global work size; 0 selects an optimal value.
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('global_work_size', global_work_size, int)
    common.check_type('local_work_size', local_work_size, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    queue = cl.CommandQueue(context, device)

    # Pick the fp64 pragma matching the device's extension, or fall back to
    # single precision when the device advertises no fp64 support at all.
    pragma_fp64 = ''
    if precision_float == 'double':
        extensions = device.get_info(cl.device_info.EXTENSIONS)
        if 'cl_khr_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
        elif 'cl_amd_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
        else:
            precision_float = 'single'
            # BUG FIX: the original wrote  print('...%s...') % name , which
            # applies '%' to print()'s return value (None) and raises
            # TypeError; format the message inside the call instead.
            print('Warning: The %s GPU device is not support the double-precision.'
                  % device.get_info(cl.device_info.NAME))
            print('The precision is changed to \'single\'.')

    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', pragma_fp64]}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    align_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
    pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
    # Slice that strips the padded tail off the z axis of a pitched array.
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations: six zero-filled E/H field buffers on the device
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5
    mflags = cl.mem_flags.READ_WRITE
    eh_bufs = [cl.Buffer(context, mflags, f.nbytes) for i in range(6)]
    for eh_buf in eh_bufs:
        cl.enqueue_copy(queue, eh_buf, f)

    # NOTE(review): coefficient buffers are allocated but not written here —
    # cf only supplies nbytes; presumably they are filled later by a setter.
    if ce_on:
        mflags = cl.mem_flags.READ_ONLY
        ce_bufs = [cl.Buffer(context, mflags, cf.nbytes) for i in range(3)]

    if ch_on:
        mflags = cl.mem_flags.READ_ONLY
        ch_bufs = [cl.Buffer(context, mflags, cf.nbytes) for i in range(3)]

    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.context = context
    self.device = device
    self.queue = queue

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]
    if ce_on:
        self.ce_bufs = ce_bufs
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.ch_bufs = ch_bufs
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    self.ls = local_work_size
    self.gs = global_work_size
    if self.gs == 0:
        self.gs = common_gpu.get_optimal_gs(device)

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, context, device,
             nx, ny, nz,
             coeff_use='',
             precision_float='single',
             local_work_size=256,
             global_work_size=0):
    """
    Set up the OpenCL queue and padded field/coefficient device buffers.

    Parameters:
        context: cl.Context used for all buffer allocations.
        device: cl.Device the command queue is bound to.
        nx, ny, nz: int grid dimensions.
        coeff_use: which coefficient arrays to allocate: '', 'e', 'h' or 'eh'.
        precision_float: 'single' or 'double'; silently downgraded to
            'single' (with a warning) when the device has no fp64 extension.
        local_work_size: OpenCL local work size.
        global_work_size: OpenCL global work size; 0 selects an optimal value.
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('global_work_size', global_work_size, int)
    common.check_type('local_work_size', local_work_size, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    queue = cl.CommandQueue(context, device)

    # Pick the fp64 pragma matching the device's extension, or fall back to
    # single precision when the device advertises no fp64 support at all.
    pragma_fp64 = ''
    if precision_float == 'double':
        extensions = device.get_info(cl.device_info.EXTENSIONS)
        if 'cl_khr_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
        elif 'cl_amd_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
        else:
            precision_float = 'single'
            # BUG FIX: the original wrote  print('...%s...') % name , which
            # applies '%' to print()'s return value (None) and raises
            # TypeError; format the message inside the call instead.
            print('Warning: The %s GPU device is not support the double-precision.'
                  % device.get_info(cl.device_info.NAME))
            print('The precision is changed to \'single\'.')

    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', pragma_fp64]}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    align_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
    pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
    # Slice that strips the padded tail off the z axis of a pitched array.
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations: six zero-filled E/H field buffers on the device
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5
    mflags = cl.mem_flags.READ_WRITE
    eh_bufs = [cl.Buffer(context, mflags, f.nbytes) for i in range(6)]
    for eh_buf in eh_bufs:
        cl.enqueue_copy(queue, eh_buf, f)

    # NOTE(review): coefficient buffers are allocated but not written here —
    # cf only supplies nbytes; presumably they are filled later by a setter.
    if ce_on:
        mflags = cl.mem_flags.READ_ONLY
        ce_bufs = [cl.Buffer(context, mflags, cf.nbytes) for i in range(3)]

    if ch_on:
        mflags = cl.mem_flags.READ_ONLY
        ch_bufs = [cl.Buffer(context, mflags, cf.nbytes) for i in range(3)]

    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.context = context
    self.device = device
    self.queue = queue

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]
    if ce_on:
        self.ce_bufs = ce_bufs
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.ch_bufs = ch_bufs
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    self.ls = local_work_size
    self.gs = global_work_size
    if self.gs == 0:
        self.gs = common_gpu.get_optimal_gs(device)

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, device_id, \
             nx, ny, nz, \
             coeff_use='', \
             precision_float='single', \
             block_size=256, \
             grid_size=0):
    """
    Create a CUDA context on one GPU and allocate padded field buffers.

    Parameters:
        device_id: int index of the CUDA device to use.
        nx, ny, nz: int grid dimensions.
        coeff_use: which coefficient arrays to allocate: '', 'e', 'h' or 'eh'.
        precision_float: 'single' or 'double' floating-point precision.
        block_size: CUDA thread-block size (x dimension).
        grid_size: CUDA grid size; 0 selects an optimal value.
    """
    common.check_type('device_id', device_id, int)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float, ('single', 'double'))
    common.check_type('block_size', block_size, int)
    common.check_type('grid_size', grid_size, int)

    # local variables
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float'],
        'double': ['double']}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    segment_nbytes = 64
    # BUG FIX: use floor division; under Python 3 the original '/' made
    # align_size a float, while the sibling OpenCL backends store an int.
    align_size = segment_nbytes // np.nbytes[dtype]
    pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
    # Slice that strips the padded tail off the z axis of a pitched array.
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # CUDA device and context
    cuda.init()
    device = cuda.Device(device_id)
    context = device.make_context()
    stream = cuda.Stream()

    # allocations: six zero-filled E/H field buffers; coefficient buffers
    # start at 0.5 (or None when not requested)
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5
    eh_bufs = [cuda.to_device(f) for i in range(6)]
    ce_bufs = [cuda.to_device(cf) for i in range(3)] if ce_on else None
    ch_bufs = [cuda.to_device(cf) for i in range(3)] if ch_on else None
    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.device = device
    self.context = context
    self.stream = stream

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]
    self.ce_bufs = ce_bufs
    self.ch_bufs = ch_bufs
    if ce_on:
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    self.bs = (block_size, 1, 1)
    self.gs = (grid_size, 1) if grid_size != 0 else (common_gpu.get_optimal_gs(device, block_size), 1)

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, context, device, \
             nx, ny, nz, \
             coeff_use='e', \
             precision_float='single', \
             local_work_size=256, \
             global_work_size=0):
    """
    Set up OpenCL buffers and build the FDTD core program on one GPU device.

    Parameters:
        context: cl.Context used for all buffer allocations.
        device: cl.Device the command queue is bound to.
        nx, ny, nz: int grid dimensions.
        coeff_use: which coefficient arrays to allocate: '', 'e', 'h' or 'eh'.
        precision_float: 'single' or 'double' floating-point precision.
        local_work_size: OpenCL local work size (also substituted for the
            kernel's DX macro).
        global_work_size: OpenCL global work size; 0 selects an optimal value
            via common_gpu.get_optimal_gs().
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('global_work_size', global_work_size, int)
    common.check_type('local_work_size', local_work_size, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float, ('single', 'double'))

    self.context = context
    self.device = device
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ls = local_work_size
    self.gs = global_work_size
    self.coeff_use = coeff_use

    self.dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    self.dtype_str = {'single': 'float', 'double': 'double'}[precision_float]
    self.dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', '#pragma OPENCL EXTENSION cl_khr_fp64 : enable']}[precision_float]
    self.device_type = 'gpu'

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    self.align_size = a_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
    self.pad = pad = int(np.ceil(float(nz) / a_size) * a_size) - nz
    # Slice that strips the padded tail off the z axis of a pitched array.
    self.slz = slice(None, None) if pad == 0 else slice(None, -pad)
    self.nz_pitch = nz_pitch = nz + pad
    # PAD macro value: '' when unpadded, else '-<pad>' — presumably consumed
    # as a negative offset by core.cl; TODO confirm against the kernel source.
    self.dtype_str_list.append('' if pad == 0 else '-%s' % pad)

    # ns, queue, global_size
    self.ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    self.ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    self.ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    self.queue = cl.CommandQueue(self.context, self.device)
    if self.gs == 0:
        self.gs = common_gpu.get_optimal_gs(self.device)

    # on/off the coefficient arrays
    self.ce_on = 'e' in self.coeff_use
    self.ch_on = 'h' in self.coeff_use

    # allocations: six zero-filled E/H field buffers on the device
    f = np.zeros(self.ns_pitch, dtype=self.dtype)
    cf = np.ones_like(f) * 0.5
    mf = cl.mem_flags
    self.eh_bufs = [cl.Buffer(self.context, mf.READ_WRITE, f.nbytes) \
                    for i in range(6)]
    for eh_buf in self.eh_bufs:
        cl.enqueue_copy(self.queue, eh_buf, f)
    self.ex_buf, self.ey_buf, self.ez_buf = self.eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = self.eh_bufs[3:]

    # NOTE(review): the coefficient buffers are allocated but not written
    # here — cf only supplies nbytes; presumably they are filled later by a
    # setter. Confirm before relying on their initial contents.
    if self.ce_on:
        self.ce_bufs = [cl.Buffer(self.context, mf.READ_ONLY, cf.nbytes) \
                        for i in range(3)]
        self.cex_buf, self.cey_buf, self.cez_buf = self.ce_bufs
    if self.ch_on:
        self.ch_bufs = [cl.Buffer(self.context, mf.READ_ONLY, cf.nbytes) \
                        for i in range(3)]
        self.chx_buf, self.chy_buf, self.chz_buf = self.ch_bufs
    del f, cf

    # program: substitute the template macros and build the kernels
    macros = ['ARGS_CE', 'CEX', 'CEY', 'CEZ', \
              'ARGS_CH', 'CHX', 'CHY', 'CHZ', \
              'DX', 'DTYPE', 'PRAGMA_fp64', 'PAD']
    values = ['', '0.5', '0.5', '0.5', \
              '', '0.5', '0.5', '0.5', \
              str(self.ls)] + self.dtype_str_list

    self.e_args = self.ns_pitch + self.eh_bufs
    self.h_args = self.ns_pitch + self.eh_bufs
    if self.ce_on:
        values[:4] = [ \
            ', __global DTYPE *cex, __global DTYPE *cey, __global DTYPE *cez', \
            'cex[idx]', 'cey[idx]', 'cez[idx]']
        self.e_args += self.ce_bufs
    if self.ch_on:
        values[4:8] = [ \
            ', __global DTYPE *chx, __global DTYPE *chy, __global DTYPE *chz', \
            'chx[idx]', 'chy[idx]', 'chz[idx]']
        self.h_args += self.ch_bufs

    # BUG FIX: the original used open(...).read(), leaking the file handle;
    # a with-block closes it deterministically.
    with open(common_gpu.src_path + 'core.cl') as kernel_file:
        ksrc = common.replace_template_code(kernel_file.read(), macros, values)
    self.program = cl.Program(self.context, ksrc).build()