def __init__(self, mainf_list, buffer_dict=None):
    """Bundle several main field objects (and optional buffers) into a node.

    Parameters
    ----------
    mainf_list : sequence of field objects
        Every item must expose ``nx``, ``ns`` and ``dtype``.
    buffer_dict : dict, optional
        Extra field objects; its values are appended to ``updatef_list``.
        When omitted, a fresh empty dict is created for this instance.
    """
    # A literal ``{}`` default would be shared by every instance, because
    # the very same dict object is stored on ``self`` below.
    if buffer_dict is None:
        buffer_dict = {}
    common.check_type('buffer_dict', buffer_dict, dict)

    # local variables
    nx_list = [f.nx for f in mainf_list]
    # One x-plane is subtracted per junction -- presumably adjacent fields
    # overlap by a single plane (TODO confirm against the exchange code).
    nx = int(sum(nx_list) - len(nx_list) + 1)
    ny, nz = [int(n) for n in mainf_list[0].ns[1:]]

    # Running x-offset at which each main field starts.
    # NOTE: the "+= 1" adjustment of the last offset is deliberately
    # disabled in this variant.
    accum_nx_list = np.add.accumulate([0] + [f.nx - 1 for f in mainf_list])
    accum_nx_list = [int(anx) for anx in accum_nx_list]

    # global variables
    self.mainf_list = mainf_list
    self.buffer_dict = buffer_dict
    # list(...) keeps the concatenation valid for tuple input and for
    # Python 3, where dict.values() is a view rather than a list.
    self.updatef_list = list(mainf_list) + list(buffer_dict.values())

    self.dtype = mainf_list[0].dtype
    self.nx = nx
    self.nx_list = nx_list
    self.accum_nx_list = accum_nx_list
    self.ns = (nx, ny, nz)

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, nx, ny, nz, precision_float='single', segment_nbytes=16):
    """Allocate the field and coefficient arrays, padding the z-axis.

    Parameters
    ----------
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded up to a multiple of the alignment size.
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64`` for every array.
    segment_nbytes : int
        Alignment segment size in bytes; the z-length is aligned to
        ``segment_nbytes // itemsize`` elements.
    """
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('segment_nbytes', segment_nbytes, int)
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]

    # padding for the nz which is multi of segment size
    # np.nbytes was removed in NumPy 2.0; dtype.itemsize is the portable
    # spelling.  Floor division keeps align_size an int on Python 3 too.
    align_size = segment_nbytes // np.dtype(dtype).itemsize
    pad = (-nz) % align_size        # elements needed to reach the next multiple
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [nx, ny, nz]
    ns_pitch = [nx, ny, nz_pitch]
    ns_pad = [nx, ny, pad]

    # allocations
    ehs = [np.zeros(ns_pitch, dtype) for i in range(6)]
    ces = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]
    chs = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]

    # global variables
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.dtype = dtype

    self.ehs = ehs
    self.ex, self.ey, self.ez = ehs[:3]
    self.hx, self.hy, self.hz = ehs[3:]

    self.ces = ces
    self.cex, self.cey, self.cez = ces

    self.chs = chs
    self.chx, self.chy, self.chz = chs

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, nx, ny, nz, precision_float='single', segment_nbytes=16):
    """Allocate z-padded field and coefficient arrays and set dx/dt.

    Parameters
    ----------
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded up to a multiple of the alignment size.
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64`` for every array.
    segment_nbytes : int
        Alignment segment size in bytes; the z-length is aligned to
        ``segment_nbytes // itemsize`` elements.
    """
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('segment_nbytes', segment_nbytes, int)
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]

    # padding for the nz which is multi of segment size
    # np.nbytes was removed in NumPy 2.0; dtype.itemsize is the portable
    # spelling.  Floor division keeps align_size an int on Python 3 too.
    align_size = segment_nbytes // np.dtype(dtype).itemsize
    pad = (-nz) % align_size        # elements needed to reach the next multiple
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [nx, ny, nz]
    ns_pitch = [nx, ny, nz_pitch]
    ns_pad = [nx, ny, pad]

    # allocations
    ehs = [np.zeros(ns_pitch, dtype) for i in range(6)]
    ces = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]
    chs = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]

    # global variables
    self.dx = 1.
    self.dt = 0.5
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.dtype = dtype

    self.ehs = ehs
    self.ex, self.ey, self.ez = ehs[:3]
    self.hx, self.hy, self.hz = ehs[3:]

    self.ces = ces
    self.cex, self.cey, self.cez = ces

    self.chs = chs
    self.chx, self.chy, self.chz = chs

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, mainf_list, buffer_dict=None):
    """Bundle main field objects and optional buffer fields into one node.

    Parameters
    ----------
    mainf_list : list or tuple
        Items must be ``cpu.Fields`` (or ``gpu.Fields`` when the GPU
        package can be imported) and expose ``device_type``, ``nx``,
        ``ns`` and ``dtype``.
    buffer_dict : dict, optional
        Extra field objects; its values are placed in front of the main
        fields in ``updatef_list``.  When omitted, a fresh empty dict is
        created for this instance.
    """
    # A literal ``{}`` default would be shared by every instance, because
    # the very same dict object is stored on ``self`` below.
    if buffer_dict is None:
        buffer_dict = {}

    # The GPU package is optional: any failure while loading it simply
    # restricts the accepted field type to the CPU one.  The type check is
    # kept outside the try block so a genuine type error is not swallowed
    # and re-reported with a misleading "cpu only" message.
    try:
        from kemp.fdtd3d import gpu
        allowed_fields = (gpu.Fields, cpu.Fields)
    except Exception:
        allowed_fields = cpu.Fields
    common.check_type('mainf_list', mainf_list, (list, tuple), allowed_fields)
    common.check_type('buffer_dict', buffer_dict, dict)

    # local variables
    device_type_list = [f.device_type for f in mainf_list]
    if 'cpu' in device_type_list:
        # first CPU field in the list
        cpuf = mainf_list[device_type_list.index('cpu')]
    else:
        cpuf = None

    nx_list = [f.nx for f in mainf_list]
    # One x-plane is subtracted per junction -- presumably adjacent fields
    # overlap by a single plane (TODO confirm against the exchange code).
    nx = int(sum(nx_list) - len(nx_list) + 1)
    ny, nz = [int(n) for n in mainf_list[0].ns[1:]]

    # Running x-offset of each main field; the final offset is bumped by
    # one (NOTE(review): looks like it makes the last range inclusive --
    # confirm with the code that consumes accum_nx_list).
    accum_nx_list = np.add.accumulate([0] + [f.nx - 1 for f in mainf_list])
    accum_nx_list[-1] += 1
    accum_nx_list = [int(anx) for anx in accum_nx_list]

    # global variables
    self.mainf_list = mainf_list
    self.buffer_dict = buffer_dict
    # list(...) keeps the concatenation valid for tuple input (allowed by
    # the type check above) and for Python 3 dict views.
    self.updatef_list = list(buffer_dict.values()) + list(mainf_list)
    self.cpuf = cpuf

    self.dtype = mainf_list[0].dtype
    self.nx = nx
    self.nx_list = nx_list
    self.accum_nx_list = accum_nx_list
    self.ns = (nx, ny, nz)

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)

    # append the ExchangeNode instance
    if len(self.updatef_list) > 1:
        from exchange import ExchangeNode
        ExchangeNode(self)
def __init__(self, mainf_list, buffer_dict=None):
    """Bundle main field objects and optional buffer fields into one node.

    Parameters
    ----------
    mainf_list : list or tuple
        Items must be ``cpu.Fields`` (or ``gpu.Fields`` when the GPU
        package can be imported) and expose ``device_type``, ``nx``,
        ``ns`` and ``dtype``.
    buffer_dict : dict, optional
        Extra field objects; its values are placed in front of the main
        fields in ``updatef_list``.  When omitted, a fresh empty dict is
        created for this instance.
    """
    # A literal ``{}`` default would be shared by every instance, because
    # the very same dict object is stored on ``self`` below.
    if buffer_dict is None:
        buffer_dict = {}

    # The GPU package is optional: any failure while loading it simply
    # restricts the accepted field type to the CPU one.  The type check is
    # kept outside the try block so a genuine type error is not swallowed
    # and re-reported with a misleading "cpu only" message.
    try:
        from kemp.fdtd3d import gpu
        allowed_fields = (gpu.Fields, cpu.Fields)
    except Exception:
        allowed_fields = cpu.Fields
    common.check_type('mainf_list', mainf_list, (list, tuple), allowed_fields)
    common.check_type('buffer_dict', buffer_dict, dict)

    # local variables
    device_type_list = [f.device_type for f in mainf_list]
    if 'cpu' in device_type_list:
        # first CPU field in the list
        cpuf = mainf_list[device_type_list.index('cpu')]
    else:
        cpuf = None

    nx_list = [f.nx for f in mainf_list]
    # One x-plane is subtracted per junction -- presumably adjacent fields
    # overlap by a single plane (TODO confirm against the exchange code).
    nx = int(sum(nx_list) - len(nx_list) + 1)
    ny, nz = [int(n) for n in mainf_list[0].ns[1:]]

    # Running x-offset of each main field; the final offset is bumped by
    # one (NOTE(review): looks like it makes the last range inclusive --
    # confirm with the code that consumes accum_nx_list).
    accum_nx_list = np.add.accumulate([0] + [f.nx - 1 for f in mainf_list])
    accum_nx_list[-1] += 1
    accum_nx_list = [int(anx) for anx in accum_nx_list]

    # global variables
    self.mainf_list = mainf_list
    self.buffer_dict = buffer_dict
    # list(...) keeps the concatenation valid for tuple input (allowed by
    # the type check above) and for Python 3 dict views.
    self.updatef_list = list(buffer_dict.values()) + list(mainf_list)
    self.cpuf = cpuf

    self.dtype = mainf_list[0].dtype
    self.nx = nx
    self.nx_list = nx_list
    self.accum_nx_list = accum_nx_list
    self.ns = (nx, ny, nz)

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)

    # append the ExchangeNode instance
    if len(self.updatef_list) > 1:
        from exchange import ExchangeNode
        ExchangeNode(self)
def __init__(self, nx, ny, nz, precision_float='single'):
    """Create six zero-filled field arrays of shape (nx, ny, nz).

    Coefficients start out as scalars (0.5 for ce/ch, 1.0 for erd/hrd) and
    the ``ce_on`` / ``ch_on`` / ``rd_on`` switches start out False.

    Parameters
    ----------
    nx, ny, nz : int
        Grid sizes.
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64``.
    """
    # argument validation
    for label, value in (('nx', nx), ('ny', ny), ('nz', nz)):
        common.check_type(label, value, int)
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    float_types = {'single': np.float32, 'double': np.float64}
    dtype = float_types[precision_float]
    shape = [nx, ny, nz]

    # allocations
    fields = [np.zeros(shape, dtype) for _ in range(6)]

    # global variables
    self.dx = 1.
    self.dt = 0.5
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = shape

    self.precision_float = precision_float
    self.dtype = dtype

    self.ehs = fields
    self.ex, self.ey, self.ez = fields[:3]
    self.hx, self.hy, self.hz = fields[3:]

    self.ce_on = False
    self.ch_on = False
    self.rd_on = False

    # scalar coefficients: the tuple first, then its three components
    self.ces = (0.5, 0.5, 0.5)
    self.cex, self.cey, self.cez = self.ces
    self.chs = (0.5, 0.5, 0.5)
    self.chx, self.chy, self.chz = self.chs
    self.erds = (1., 1., 1.)
    self.erdx, self.erdy, self.erdz = self.erds
    self.hrds = (1., 1., 1.)
    self.hrdx, self.hrdy, self.hrdz = self.hrds

    # update list
    self.instance_list = []

    def append_instance(instance):
        # instance_list is looked up at call time, as in a lambda closure
        return common.append_instance(self.instance_list, instance)

    self.append_instance = append_instance
def __init__(self, nx, ny, nz,
             coeff_use='',
             precision_float='single',
             use_cpu_core=0):
    """Allocate z-padded CPU field arrays with its own task queue.

    Parameters
    ----------
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a 16-byte multiple.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient array sets to allocate ('e' -> ces, 'h' -> chs).
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64``.
    use_cpu_core : int
        Stored unchanged on the instance.
    """
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('use_cpu_core', use_cpu_core, int)

    # local variables
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]

    # padding for the nz which is multiple of 4 (float32) or 2 (float64)
    # np.nbytes was removed in NumPy 2.0; dtype.itemsize is the portable
    # spelling.  Floor division keeps align_size an int on Python 3 too.
    segment_nbytes = 16
    align_size = segment_nbytes // np.dtype(dtype).itemsize
    pad = (-nz) % align_size        # elements needed to reach the next multiple
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [nx, ny, nz]
    ns_pitch = [nx, ny, nz_pitch]
    ns_pad = [nx, ny, pad]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations
    ehs = [np.zeros(ns_pitch, dtype) for i in range(6)]
    if ce_on:
        ces = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]
    if ch_on:
        chs = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]

    # global variables and functions
    self.device_type = 'cpu'
    self.qtask = QueueTask()
    self.enqueue = self.qtask.enqueue
    self.enqueue_barrier = self.qtask.enqueue_barrier

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.use_cpu_core = use_cpu_core
    self.dtype = dtype

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.ehs = ehs
    self.ex, self.ey, self.ez = ehs[:3]
    self.hx, self.hy, self.hz = ehs[3:]

    # coefficient attributes exist only when the matching flag is on
    if ce_on:
        self.ces = ces
        self.cex, self.cey, self.cez = ces
    if ch_on:
        self.chs = chs
        self.chx, self.chy, self.chz = chs

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, queue_task, nx, ny, nz,
             coeff_use='',
             precision_float='single',
             use_cpu_core=0):
    """Allocate z-padded CPU field arrays bound to an existing task queue.

    Parameters
    ----------
    queue_task : QueueTask
        Its ``enqueue`` and ``enqueue_barrier`` are re-exported on the
        instance.
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a 16-byte multiple.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient array sets to allocate ('e' -> ces, 'h' -> chs).
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64``.
    use_cpu_core : int
        Stored unchanged on the instance.
    """
    common.check_type('queue_task', queue_task, QueueTask)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('use_cpu_core', use_cpu_core, int)

    # local variables
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]

    # padding for the nz which is multiple of 4 (float32) or 2 (float64)
    # np.nbytes was removed in NumPy 2.0; dtype.itemsize is the portable
    # spelling.  Floor division keeps align_size an int on Python 3 too.
    segment_nbytes = 16
    align_size = segment_nbytes // np.dtype(dtype).itemsize
    pad = (-nz) % align_size        # elements needed to reach the next multiple
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [nx, ny, nz]
    ns_pitch = [nx, ny, nz_pitch]
    ns_pad = [nx, ny, pad]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations
    ehs = [np.zeros(ns_pitch, dtype) for i in range(6)]
    if ce_on:
        ces = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]
    if ch_on:
        chs = [np.ones(ns_pitch, dtype) * 0.5 for i in range(3)]

    # global variables and functions
    self.device_type = 'cpu'
    self.qtask = queue_task
    self.enqueue = queue_task.enqueue
    self.enqueue_barrier = queue_task.enqueue_barrier

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.use_cpu_core = use_cpu_core
    self.dtype = dtype

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.ehs = ehs
    self.ex, self.ey, self.ez = ehs[:3]
    self.hx, self.hy, self.hz = ehs[3:]

    # coefficient attributes exist only when the matching flag is on
    if ce_on:
        self.ces = ces
        self.cex, self.cey, self.cez = ces
    if ch_on:
        self.chs = chs
        self.chx, self.chy, self.chz = chs

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, context, device,
             nx, ny, nz,
             coeff_use='',
             precision_float='single',
             local_work_size=256,
             global_work_size=0):
    """Allocate z-padded OpenCL field buffers on ``device``.

    Parameters
    ----------
    context : cl.Context
    device : cl.Device
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a multiple of 16 (single) or
        8 (double) elements.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient buffer sets to allocate.
    precision_float : {'single', 'double'}
        Falls back to 'single' (with a printed warning) when the device
        advertises neither ``cl_khr_fp64`` nor ``cl_amd_fp64``.
    local_work_size : int
        Stored as ``self.ls``.
    global_work_size : int
        Stored as ``self.gs``; 0 means "ask common_gpu.get_optimal_gs".
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('global_work_size', global_work_size, int)
    common.check_type('local_work_size', local_work_size, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    queue = cl.CommandQueue(context, device)
    pragma_fp64 = ''
    if precision_float == 'double':
        extensions = device.get_info(cl.device_info.EXTENSIONS)
        if 'cl_khr_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
        elif 'cl_amd_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
        else:
            precision_float = 'single'
            # The % must be applied to the string, not to print()'s
            # return value (None on Python 3 -> TypeError).
            print('Warning: The %s GPU device is not support the double-precision.'
                  % device.get_info(cl.device_info.NAME))
            print('The precision is changed to \'single\'.')

    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', pragma_fp64]}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    align_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
    pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5

    rw_flags = cl.mem_flags.READ_WRITE
    eh_bufs = [cl.Buffer(context, rw_flags, f.nbytes) for i in range(6)]
    for eh_buf in eh_bufs:
        cl.enqueue_copy(queue, eh_buf, f)

    # Coefficient buffers are uploaded with the 0.5 default so that kernels
    # never read uninitialised device memory.
    ro_flags = cl.mem_flags.READ_ONLY
    if ce_on:
        ce_bufs = [cl.Buffer(context, ro_flags, cf.nbytes) for i in range(3)]
        for ce_buf in ce_bufs:
            cl.enqueue_copy(queue, ce_buf, cf)
    if ch_on:
        ch_bufs = [cl.Buffer(context, ro_flags, cf.nbytes) for i in range(3)]
        for ch_buf in ch_bufs:
            cl.enqueue_copy(queue, ch_buf, cf)

    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.context = context
    self.device = device
    self.queue = queue

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]

    # coefficient attributes exist only when the matching flag is on
    if ce_on:
        self.ce_bufs = ce_bufs
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.ch_bufs = ch_bufs
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    self.ls = local_work_size
    self.gs = global_work_size
    if self.gs == 0:
        self.gs = common_gpu.get_optimal_gs(device)

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, device_id,
             nx, ny, nz,
             coeff_use='',
             precision_float='single',
             block_size=256,
             grid_size=0):
    """Create a CUDA context on ``device_id`` and upload padded field arrays.

    Parameters
    ----------
    device_id : int
        Index passed to ``cuda.Device``.
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a 64-byte multiple.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient buffer sets to upload; unused sets are None.
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64``.
    block_size : int
        Stored as ``self.bs = (block_size, 1, 1)``.
    grid_size : int
        Stored as ``self.gs = (grid_size, 1)``; 0 means "ask
        common_gpu.get_optimal_gs".
    """
    common.check_type('device_id', device_id, int)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('block_size', block_size, int)
    common.check_type('grid_size', grid_size, int)

    # local variables
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float'],
        'double': ['double']}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    # np.nbytes was removed in NumPy 2.0; dtype.itemsize is the portable
    # spelling.  Floor division keeps align_size an int on Python 3 too.
    segment_nbytes = 64
    align_size = segment_nbytes // np.dtype(dtype).itemsize
    pad = (-nz) % align_size        # elements needed to reach the next multiple
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # CUDA device and context
    cuda.init()
    device = cuda.Device(device_id)
    context = device.make_context()
    stream = cuda.Stream()

    # allocations
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5

    eh_bufs = [cuda.to_device(f) for i in range(6)]
    ce_bufs = [cuda.to_device(cf) for i in range(3)] if ce_on else None
    ch_bufs = [cuda.to_device(cf) for i in range(3)] if ch_on else None

    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.device = device
    self.context = context
    self.stream = stream

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]

    # ce_bufs / ch_bufs always exist (possibly None); the per-component
    # names exist only when the matching flag is on
    self.ce_bufs = ce_bufs
    self.ch_bufs = ch_bufs
    if ce_on:
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    self.bs = (block_size, 1, 1)
    if grid_size != 0:
        self.gs = (grid_size, 1)
    else:
        self.gs = (common_gpu.get_optimal_gs(device, block_size), 1)

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, nx, ny, nz,
             coeff_use='',
             precision_float='single',
             use_cpu_core=0,
             mpi_type=''):
    """Allocate z-padded CPU field arrays and pick the update methods.

    Parameters
    ----------
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a multiple of 4 (single) or
        2 (double) elements.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient array sets to allocate ('e' -> ces, 'h' -> chs).
    precision_float : {'single', 'double'}
        Selects ``np.float32`` or ``np.float64``.
    use_cpu_core : int
        Stored unchanged on the instance.
    mpi_type : {'', 'x+', 'x-', 'y+', 'y-', 'z+', 'z-'}
        The sign selects which update is the split one: '+' splits the
        h update, '-' splits the e update, '' splits neither.
    """
    # argument validation
    for label, value in (('nx', nx), ('ny', ny), ('nz', nz)):
        common.check_type(label, value, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('use_cpu_core', use_cpu_core, int)
    common.check_value('mpi_type', mpi_type,
                       ('', 'x+', 'x-', 'y+', 'y-', 'z+', 'z-'))

    # local variables
    float_types = {'single': np.float32, 'double': np.float64}
    dtype = float_types[precision_float]

    # padding for the nz which is multiple of 4 (float32) or 2 (float64)
    elems_per_segment = {'single': 4, 'double': 2}     # 16 Bytes
    align_size = elems_per_segment[precision_float]
    padded_len = int(np.ceil(float(nz) / align_size) * align_size)
    pad = padded_len - nz
    if pad == 0:
        slice_z = slice(None, None)
    else:
        slice_z = slice(None, -pad)
    nz_pitch = nz + pad

    ns = [nx, ny, nz]
    ns_pitch = [nx, ny, nz_pitch]
    ns_pad = [nx, ny, pad]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations
    ehs = [np.zeros(ns_pitch, dtype) for _ in range(6)]
    if ce_on:
        ces = [np.ones(ns_pitch, dtype) * 0.5 for _ in range(3)]
    if ch_on:
        chs = [np.ones(ns_pitch, dtype) * 0.5 for _ in range(3)]

    # global variables and functions
    self.device_type = 'cpu'
    self.qtask = QueueTask()
    self.enqueue = self.qtask.enqueue
    self.enqueue_barrier = self.qtask.enqueue_barrier

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z

    self.precision_float = precision_float
    self.use_cpu_core = use_cpu_core
    self.dtype = dtype

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.ehs = ehs
    self.ex, self.ey, self.ez = ehs[:3]
    self.hx, self.hy, self.hz = ehs[3:]

    if ce_on:
        self.ces = ces
        self.cex, self.cey, self.cez = ces
    if ch_on:
        self.chs = chs
        self.chx, self.chy, self.chz = chs

    self.mpi_type = mpi_type

    # update list
    self.instance_list = []

    def append_instance(instance):
        # instance_list is looked up at call time, as in a lambda closure
        return common.append_instance(self.instance_list, instance)

    self.append_instance = append_instance

    # Sign of mpi_type -> names of the (e, h) update methods to expose.
    split = {'+': 'h', '-': 'e', '': ''}[mpi_type[1:]]
    e_name, h_name = {
        '': ('update_e_whole', 'update_h_whole'),
        'e': ('update_e_split', 'update_h_whole'),
        'h': ('update_e_whole', 'update_h_split'),
    }[split]
    self.update_e = getattr(self, e_name)
    self.update_h = getattr(self, h_name)
def __init__(self, context, device,
             nx, ny, nz,
             coeff_use='',
             precision_float='single',
             local_work_size=256,
             global_work_size=0):
    """Allocate z-padded OpenCL field buffers on ``device``.

    Parameters
    ----------
    context : cl.Context
    device : cl.Device
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a multiple of 16 (single) or
        8 (double) elements.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient buffer sets to allocate.
    precision_float : {'single', 'double'}
        Falls back to 'single' (with a printed warning) when the device
        advertises neither ``cl_khr_fp64`` nor ``cl_amd_fp64``.
    local_work_size : int
        Stored as ``self.ls``.
    global_work_size : int
        Stored as ``self.gs``; 0 means "ask common_gpu.get_optimal_gs".
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_type('global_work_size', global_work_size, int)
    common.check_type('local_work_size', local_work_size, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))

    # local variables
    queue = cl.CommandQueue(context, device)
    pragma_fp64 = ''
    if precision_float == 'double':
        extensions = device.get_info(cl.device_info.EXTENSIONS)
        if 'cl_khr_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
        elif 'cl_amd_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
        else:
            precision_float = 'single'
            # The % must be applied to the string, not to print()'s
            # return value (None on Python 3 -> TypeError).
            print('Warning: The %s GPU device is not support the double-precision.'
                  % device.get_info(cl.device_info.NAME))
            print('The precision is changed to \'single\'.')

    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', pragma_fp64]}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    align_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
    pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5

    rw_flags = cl.mem_flags.READ_WRITE
    eh_bufs = [cl.Buffer(context, rw_flags, f.nbytes) for i in range(6)]
    for eh_buf in eh_bufs:
        cl.enqueue_copy(queue, eh_buf, f)

    # Coefficient buffers are uploaded with the 0.5 default so that kernels
    # never read uninitialised device memory.
    ro_flags = cl.mem_flags.READ_ONLY
    if ce_on:
        ce_bufs = [cl.Buffer(context, ro_flags, cf.nbytes) for i in range(3)]
        for ce_buf in ce_bufs:
            cl.enqueue_copy(queue, ce_buf, cf)
    if ch_on:
        ch_bufs = [cl.Buffer(context, ro_flags, cf.nbytes) for i in range(3)]
        for ch_buf in ch_bufs:
            cl.enqueue_copy(queue, ch_buf, cf)

    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.context = context
    self.device = device
    self.queue = queue

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]

    # coefficient attributes exist only when the matching flag is on
    if ce_on:
        self.ce_bufs = ce_bufs
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.ch_bufs = ch_bufs
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    self.ls = local_work_size
    self.gs = global_work_size
    if self.gs == 0:
        self.gs = common_gpu.get_optimal_gs(device)

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, queue_task,
             nx, ny, nz,
             precision_float='single',
             use_cpu_core=0):
    """Allocate unpadded CPU field arrays and prepare C-template macros.

    Parameters
    ----------
    queue_task : QueueTask
        Its ``enqueue`` and ``enqueue_barrier`` are re-exported on the
        instance.
    nx, ny, nz : int
        Grid sizes.
    precision_float : {'single', 'double'}
        Selects ``np.float32`` / ``np.float64`` and the matching C type
        name substituted for the ``DTYPE`` macro.
    use_cpu_core : int
        0 -> both OpenMP macros are replaced by empty strings;
        1 -> the ``'OMP '`` macro is replaced by ``'// '``;
        otherwise -> ``SET_NUM_THREADS`` becomes an
        ``omp_set_num_threads(n);`` call.
    """
    common.check_type('queue_task', queue_task, QueueTask)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('use_cpu_core', use_cpu_core, int)

    # local variables
    ns = [nx, ny, nz]
    dtype = {'single': np.float32, 'double': np.float64}[precision_float]

    # allocations
    ehs = [np.zeros(ns, dtype) for i in range(6)]

    # common macros for C templates
    dtype_macros = ['DTYPE']
    # The key must be exactly 'double': a stray leading space here made
    # precision_float='double' fail with KeyError after passing validation.
    dtype_values = {
        'single': ['float'],
        'double': ['double']}[precision_float]

    omp_macros = ['OMP ', 'SET_NUM_THREADS']
    if use_cpu_core == 0:
        omp_values = ['', '']
    elif use_cpu_core == 1:
        # presumably turns the OpenMP pragma lines into C comments
        omp_values = ['// ', '']
    else:
        omp_values = ['', 'omp_set_num_threads(%d);' % use_cpu_core]

    # global variables and functions
    self.device_type = 'cpu'
    self.qtask = queue_task
    self.enqueue = queue_task.enqueue
    self.enqueue_barrier = queue_task.enqueue_barrier

    self.dx = 1.
    self.dt = 0.5
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns

    self.dtype = dtype
    # parallel lists: macro name at index i is replaced by value at index i
    self.dtype_omp_macros = dtype_macros + omp_macros
    self.dtype_omp_values = dtype_values + omp_values

    self.ehs = ehs
    self.ex, self.ey, self.ez = ehs[:3]
    self.hx, self.hy, self.hz = ehs[3:]

    self.ce_on, self.ch_on = False, False
    self.rd_on = False

    # update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, context, device, queue_task,
             nx, ny, nz,
             coeff_use='',
             precision_float='single',
             local_work_size=256):
    """Allocate z-padded OpenCL field buffers bound to a task queue.

    Parameters
    ----------
    context : cl.Context
    device : cl.Device
    queue_task : QueueTask
        Its ``enqueue`` and ``enqueue_barrier`` are re-exported on the
        instance.
    nx, ny, nz : int
        Grid sizes; ``nz`` is padded to a 64-byte multiple.
    coeff_use : {'', 'e', 'h', 'eh'}
        Which coefficient buffer sets to allocate (initialised to 0.5).
    precision_float : {'single', 'double'}
        Falls back to 'single' (with a printed warning) when the device
        advertises neither ``cl_khr_fp64`` nor ``cl_amd_fp64``.
    local_work_size : int
        Stored as ``self.ls``; ``self.gs`` is the padded cell count
        rounded up to a multiple of it.
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('queue_task', queue_task, QueueTask)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('local_work_size', local_work_size, int)

    # local variables
    queue = cl.CommandQueue(context, device)
    pragma_fp64 = ''
    if precision_float == 'double':
        extensions = device.get_info(cl.device_info.EXTENSIONS)
        if 'cl_khr_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
        elif 'cl_amd_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
        else:
            precision_float = 'single'
            # The % must be applied to the string, not to print()'s
            # return value (None on Python 3 -> TypeError).
            print('Warning: The %s GPU device is not support the double-precision.'
                  % device.get_info(cl.device_info.NAME))
            print('The precision is changed to \'single\'.')

    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', pragma_fp64]}[precision_float]

    # padding for the nz which is multiple of 16 (float32) or 8 (float64)
    # np.nbytes was removed in NumPy 2.0; dtype.itemsize is the portable
    # spelling.  Floor division keeps align_size an int on Python 3 too.
    segment_nbytes = 64
    align_size = segment_nbytes // np.dtype(dtype).itemsize
    pad = (-nz) % align_size        # elements needed to reach the next multiple
    slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
    nz_pitch = nz + pad

    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
    ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
    ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

    # on/off the coefficient arrays
    ce_on = 'e' in coeff_use
    ch_on = 'h' in coeff_use

    # allocations (COPY_HOST_PTR initialises each buffer from the host array)
    f = np.zeros(ns_pitch, dtype)
    cf = np.ones_like(f) * 0.5

    mflags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
    eh_bufs = [cl.Buffer(context, mflags, hostbuf=f) for i in range(6)]

    c_mflags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    if ce_on:
        ce_bufs = [cl.Buffer(context, c_mflags, hostbuf=cf) for i in range(3)]
    if ch_on:
        ch_bufs = [cl.Buffer(context, c_mflags, hostbuf=cf) for i in range(3)]

    del f, cf

    # global variables
    self.device_type = 'gpu'
    self.context = context
    self.device = device
    self.queue = queue
    self.qtask = queue_task
    self.enqueue = queue_task.enqueue
    self.enqueue_barrier = queue_task.enqueue_barrier

    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns
    self.ns_pitch = ns_pitch
    self.ns_pad = ns_pad

    self.align_size = align_size
    self.pad = pad
    self.slice_z = slice_z      # strips the z-padding from an array view

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.coeff_use = coeff_use
    self.ce_on = ce_on
    self.ch_on = ch_on

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]

    # coefficient attributes exist only when the matching flag is on
    if ce_on:
        self.ce_bufs = ce_bufs
        self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
    if ch_on:
        self.ch_bufs = ch_bufs
        self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

    # global size = padded cell count rounded up to a multiple of ls
    self.ls = ls = local_work_size
    nmax = nx * ny * nz_pitch
    remainder = nmax % ls
    self.gs = nmax if remainder == 0 else nmax - remainder + ls

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)
def __init__(self, context, device,
             nx, ny, nz,
             precision_float='single',
             local_work_size=256):
    """Allocate six unpadded, zero-filled OpenCL field buffers.

    Parameters
    ----------
    context : cl.Context
    device : cl.Device
    nx, ny, nz : int
        Grid sizes.
    precision_float : {'single', 'double'}
        Falls back to 'single' (with a printed warning) when the device
        advertises neither ``cl_khr_fp64`` nor ``cl_amd_fp64``.
    local_work_size : int
        Stored as ``self.ls``.
    """
    common.check_type('context', context, cl.Context)
    common.check_type('device', device, cl.Device)
    common.check_type('nx', nx, int)
    common.check_type('ny', ny, int)
    common.check_type('nz', nz, int)
    common.check_value('precision_float', precision_float,
                       ('single', 'double'))
    common.check_type('local_work_size', local_work_size, int)

    # local variables
    ns = [np.int32(nx), np.int32(ny), np.int32(nz)]

    queue = cl.CommandQueue(context, device)
    pragma_fp64 = ''
    if precision_float == 'double':
        extensions = device.get_info(cl.device_info.EXTENSIONS)
        if 'cl_khr_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
        elif 'cl_amd_fp64' in extensions:
            pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
        else:
            precision_float = 'single'
            # The % must be applied to the string, not to print()'s
            # return value (None on Python 3 -> TypeError).
            print('Warning: The %s GPU device is not support the double-precision.'
                  % device.get_info(cl.device_info.NAME))
            print('The precision is changed to \'single\'.')

    dtype = {'single': np.float32, 'double': np.float64}[precision_float]
    dtype_str_list = {
        'single': ['float', ''],
        'double': ['double', pragma_fp64]}[precision_float]

    # allocations: create the device buffers, then zero them from the host
    f = np.zeros(ns, dtype)
    eh_bufs = [cl.Buffer(context, cl.mem_flags.READ_WRITE, f.nbytes)
               for i in range(6)]
    for eh_buf in eh_bufs:
        cl.enqueue_copy(queue, eh_buf, f)

    # global variables
    self.device_type = 'gpu'
    self.context = context
    self.device = device
    self.queue = queue

    self.dx = 1.
    self.dt = 0.5
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.ns = ns

    self.precision_float = precision_float
    self.dtype = dtype
    self.dtype_str_list = dtype_str_list

    self.eh_bufs = eh_bufs
    self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
    self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]

    self.ce_on, self.ch_on = False, False
    self.rd_on = False

    self.ls = local_work_size

    # create update list
    self.instance_list = []
    self.append_instance = lambda instance: \
        common.append_instance(self.instance_list, instance)