Exemple #1
0
    def __init__(
        self,
        context,
        device,
        nx,
        ny,
        nz,
        coeff_use="e",
        precision_float="single",
        local_work_size=256,
        global_work_size=0,
    ):
        """Set up an FDTD field solver on a single OpenCL GPU device.

        Creates a command queue, allocates the six E/H field buffers
        (plus optional coefficient buffers), pads ``nz`` to the device
        alignment, and builds the ``core.cl`` kernel program with the
        appropriate template macros substituted.

        Parameters:
            context: cl.Context the buffers and program are created in.
            device: cl.Device used for the command queue.
            nx, ny, nz: int grid dimensions; nz gets padded.
            coeff_use: '', 'e', 'h' or 'eh' -- which per-cell coefficient
                arrays (ce*/ch*) to allocate and pass to the kernels.
            precision_float: 'single' or 'double'.
            local_work_size: OpenCL work-group size (also the DX macro).
            global_work_size: total work size; 0 means "pick an optimal
                value for this device" via common_gpu.get_optimal_gs.
        """

        # --- argument validation (types and allowed values) ---
        common.check_type("context", context, cl.Context)
        common.check_type("device", device, cl.Device)
        common.check_type("nx", nx, int)
        common.check_type("ny", ny, int)
        common.check_type("nz", nz, int)
        common.check_type("global_work_size", global_work_size, int)
        common.check_type("local_work_size", local_work_size, int)

        common.check_value("coeff_use", coeff_use, ("", "e", "h", "eh"))
        common.check_value("precision_float", precision_float, ("single", "double"))

        self.context = context
        self.device = device
        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.ls = local_work_size
        self.gs = global_work_size
        self.coeff_use = coeff_use
        # dtype_str_list feeds the DTYPE / PRAGMA_fp64 / PAD template macros
        # below; double precision needs the cl_khr_fp64 pragma in the kernel.
        # NOTE(review): unlike the sibling constructors, this one does not
        # check the device's EXTENSIONS for fp64 support -- confirm intended.
        self.dtype = {"single": np.float32, "double": np.float64}[precision_float]
        self.dtype_str = {"single": "float", "double": "double"}[precision_float]
        self.dtype_str_list = {
            "single": ["float", ""],
            "double": ["double", "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"],
        }[precision_float]

        self.device_type = "gpu"

        # padding for the nz which is multiple of 16 (float32) or 8 (float64)
        # -- i.e. a 64-byte segment; slz slices the pad back off host views.
        self.align_size = a_size = {"single": 16, "double": 8}[precision_float]  # 64 Bytes
        self.pad = pad = int(np.ceil(float(nz) / a_size) * a_size) - nz
        self.slz = slice(None, None) if pad == 0 else slice(None, -pad)
        self.nz_pitch = nz_pitch = nz + pad

        # third entry of dtype_str_list becomes the PAD macro value
        self.dtype_str_list.append("" if pad == 0 else "-%s" % pad)

        # ns, queue, global_size
        self.ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
        self.ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
        self.ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]
        self.queue = cl.CommandQueue(self.context, self.device)
        if self.gs == 0:
            self.gs = common_gpu.get_optimal_gs(self.device)

        # on/off the coefficient arrays
        self.ce_on = True if "e" in self.coeff_use else False
        self.ch_on = True if "h" in self.coeff_use else False

        # allocations: six zero-initialized field buffers (ex, ey, ez, hx,
        # hy, hz); cf is only used for its nbytes when sizing coeff buffers.
        f = np.zeros(self.ns_pitch, dtype=self.dtype)
        cf = np.ones_like(f) * 0.5
        mf = cl.mem_flags

        self.eh_bufs = [cl.Buffer(self.context, mf.READ_WRITE, f.nbytes) for i in range(6)]
        for eh_buf in self.eh_bufs:
            cl.enqueue_copy(self.queue, eh_buf, f)
        self.ex_buf, self.ey_buf, self.ez_buf = self.eh_bufs[:3]
        self.hx_buf, self.hy_buf, self.hz_buf = self.eh_bufs[3:]

        if self.ce_on:
            # NOTE(review): ce/ch buffers are allocated but never initialized
            # with cf's contents here -- presumably filled later; confirm.
            self.ce_bufs = [cl.Buffer(self.context, mf.READ_ONLY, cf.nbytes) for i in range(3)]
            self.cex_buf, self.cey_buf, self.cez_buf = self.ce_bufs

        if self.ch_on:
            self.ch_bufs = [cl.Buffer(self.context, mf.READ_ONLY, cf.nbytes) for i in range(3)]
            self.chx_buf, self.chy_buf, self.chz_buf = self.ch_bufs

        # release the host-side staging arrays
        del f, cf

        # program: substitute the template macros in core.cl, then build.
        # When coefficients are off, the kernel uses the constant 0.5.
        macros = ["ARGS_CE", "CEX", "CEY", "CEZ", "ARGS_CH", "CHX", "CHY", "CHZ", "DX", "DTYPE", "PRAGMA_fp64", "PAD"]

        values = ["", "0.5", "0.5", "0.5", "", "0.5", "0.5", "0.5", str(self.ls)] + self.dtype_str_list

        # kernel argument lists: grid sizes followed by the field buffers
        self.e_args = self.ns_pitch + self.eh_bufs
        self.h_args = self.ns_pitch + self.eh_bufs

        if self.ce_on:
            values[:4] = [
                ", __global DTYPE *cex, __global DTYPE *cey, __global DTYPE *cez",
                "cex[idx]",
                "cey[idx]",
                "cez[idx]",
            ]
            self.e_args += self.ce_bufs

        if self.ch_on:
            values[4:8] = [
                ", __global DTYPE *chx, __global DTYPE *chy, __global DTYPE *chz",
                "chx[idx]",
                "chy[idx]",
                "chz[idx]",
            ]
            self.h_args += self.ch_bufs

        ksrc = common.replace_template_code(open(common_gpu.src_path + "core.cl").read(), macros, values)
        self.program = cl.Program(self.context, ksrc).build()
    def __init__(self, context, device,
                 nx, ny, nz,
                 coeff_use='',
                 precision_float='single',
                 local_work_size=256,
                 global_work_size=0):
        """Allocate E/H field buffers for an FDTD run on one OpenCL GPU.

        Validates the arguments, picks the floating-point precision
        (falling back to single when the device lacks an fp64 extension),
        pads nz to a 64-byte-aligned pitch, and allocates the six field
        buffers plus optional coefficient buffers.

        Parameters:
            context: cl.Context the buffers are created in.
            device: cl.Device; a command queue is created on it.
            nx, ny, nz: int grid dimensions (nz is padded).
            coeff_use: '', 'e', 'h' or 'eh' -- coefficient arrays to allocate.
            precision_float: 'single' or 'double'.
            local_work_size: OpenCL work-group size.
            global_work_size: total work size; 0 selects an optimal value.
        """

        common.check_type('context', context, cl.Context)
        common.check_type('device', device, cl.Device)
        common.check_type('nx', nx, int)
        common.check_type('ny', ny, int)
        common.check_type('nz', nz, int)
        common.check_type('global_work_size', global_work_size, int)
        common.check_type('local_work_size', local_work_size, int)
        common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
        common.check_value('precision_float', precision_float,
                           ('single', 'double'))

        # local variables
        queue = cl.CommandQueue(context, device)

        # Pick the fp64 pragma for the kernel source; if the device exposes
        # neither the khr nor the amd fp64 extension, degrade to single.
        pragma_fp64 = ''
        if precision_float == 'double':
            extensions = device.get_info(cl.device_info.EXTENSIONS)
            if 'cl_khr_fp64' in extensions:
                pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
            elif 'cl_amd_fp64' in extensions:
                pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
            else:
                precision_float = 'single'
                # Bug fix: '%' was previously applied to print()'s return
                # value (None), raising TypeError on Python 3.  Format the
                # message before printing.
                print('Warning: The %s GPU device does not support '
                      'double precision.'
                      % device.get_info(cl.device_info.NAME))
                print("The precision is changed to 'single'.")

        dtype = {'single': np.float32, 'double': np.float64}[precision_float]
        dtype_str_list = {
            'single': ['float', ''],
            'double': ['double', pragma_fp64]}[precision_float]

        # padding for the nz which is multiple of 16 (float32) or 8 (float64)
        align_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
        pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
        slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
        nz_pitch = nz + pad

        ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
        ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
        ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

        # on/off the coefficient arrays ('in' already yields a bool)
        ce_on = 'e' in coeff_use
        ch_on = 'h' in coeff_use

        # allocations: six zero-initialized field buffers (ex..hz); cf is
        # only used for its nbytes when sizing the coefficient buffers
        f = np.zeros(ns_pitch, dtype)
        cf = np.ones_like(f) * 0.5

        eh_bufs = [cl.Buffer(context, cl.mem_flags.READ_WRITE, f.nbytes)
                   for i in range(6)]
        for eh_buf in eh_bufs:
            cl.enqueue_copy(queue, eh_buf, f)

        if ce_on:
            ce_bufs = [cl.Buffer(context, cl.mem_flags.READ_ONLY, cf.nbytes)
                       for i in range(3)]

        if ch_on:
            ch_bufs = [cl.Buffer(context, cl.mem_flags.READ_ONLY, cf.nbytes)
                       for i in range(3)]

        # release the host-side staging arrays
        del f, cf

        # global variables
        self.device_type = 'gpu'
        self.context = context
        self.device = device
        self.queue = queue

        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.ns = ns
        self.ns_pitch = ns_pitch
        self.ns_pad = ns_pad

        self.align_size = align_size
        self.pad = pad
        self.slice_z = slice_z

        self.precision_float = precision_float
        self.dtype = dtype
        self.dtype_str_list = dtype_str_list

        self.coeff_use = coeff_use
        self.ce_on = ce_on
        self.ch_on = ch_on

        self.eh_bufs = eh_bufs
        self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
        self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]
        if ce_on:
            self.ce_bufs = ce_bufs
            self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
        if ch_on:
            self.ch_bufs = ch_bufs
            self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

        self.ls = local_work_size
        self.gs = global_work_size
        if self.gs == 0:
            self.gs = common_gpu.get_optimal_gs(device)

        # create update list
        self.instance_list = []
        self.append_instance = lambda instance: \
            common.append_instance(self.instance_list, instance)
Exemple #3
0
    def __init__(self, context, device,
                 nx, ny, nz,
                 coeff_use='',
                 precision_float='single',
                 local_work_size=256,
                 global_work_size=0):
        """Allocate E/H field buffers for an FDTD run on one OpenCL GPU.

        Validates the arguments, picks the floating-point precision
        (falling back to single when the device lacks an fp64 extension),
        pads nz to a 64-byte-aligned pitch, and allocates the six field
        buffers plus optional coefficient buffers.

        Parameters:
            context: cl.Context the buffers are created in.
            device: cl.Device; a command queue is created on it.
            nx, ny, nz: int grid dimensions (nz is padded).
            coeff_use: '', 'e', 'h' or 'eh' -- coefficient arrays to allocate.
            precision_float: 'single' or 'double'.
            local_work_size: OpenCL work-group size.
            global_work_size: total work size; 0 selects an optimal value.
        """

        common.check_type('context', context, cl.Context)
        common.check_type('device', device, cl.Device)
        common.check_type('nx', nx, int)
        common.check_type('ny', ny, int)
        common.check_type('nz', nz, int)
        common.check_type('global_work_size', global_work_size, int)
        common.check_type('local_work_size', local_work_size, int)
        common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
        common.check_value('precision_float', precision_float,
                           ('single', 'double'))

        # local variables
        queue = cl.CommandQueue(context, device)

        # Pick the fp64 pragma for the kernel source; if the device exposes
        # neither the khr nor the amd fp64 extension, degrade to single.
        pragma_fp64 = ''
        if precision_float == 'double':
            extensions = device.get_info(cl.device_info.EXTENSIONS)
            if 'cl_khr_fp64' in extensions:
                pragma_fp64 = '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'
            elif 'cl_amd_fp64' in extensions:
                pragma_fp64 = '#pragma OPENCL EXTENSION cl_amd_fp64 : enable'
            else:
                precision_float = 'single'
                # Bug fix: '%' was previously applied to print()'s return
                # value (None), raising TypeError on Python 3.  Format the
                # message before printing.
                print('Warning: The %s GPU device does not support '
                      'double precision.'
                      % device.get_info(cl.device_info.NAME))
                print("The precision is changed to 'single'.")

        dtype = {'single': np.float32, 'double': np.float64}[precision_float]
        dtype_str_list = {
            'single': ['float', ''],
            'double': ['double', pragma_fp64]}[precision_float]

        # padding for the nz which is multiple of 16 (float32) or 8 (float64)
        align_size = {'single': 16, 'double': 8}[precision_float]  # 64 Bytes
        pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
        slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
        nz_pitch = nz + pad

        ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
        ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
        ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

        # on/off the coefficient arrays ('in' already yields a bool)
        ce_on = 'e' in coeff_use
        ch_on = 'h' in coeff_use

        # allocations: six zero-initialized field buffers (ex..hz); cf is
        # only used for its nbytes when sizing the coefficient buffers
        f = np.zeros(ns_pitch, dtype)
        cf = np.ones_like(f) * 0.5

        eh_bufs = [cl.Buffer(context, cl.mem_flags.READ_WRITE, f.nbytes)
                   for i in range(6)]
        for eh_buf in eh_bufs:
            cl.enqueue_copy(queue, eh_buf, f)

        if ce_on:
            ce_bufs = [cl.Buffer(context, cl.mem_flags.READ_ONLY, cf.nbytes)
                       for i in range(3)]

        if ch_on:
            ch_bufs = [cl.Buffer(context, cl.mem_flags.READ_ONLY, cf.nbytes)
                       for i in range(3)]

        # release the host-side staging arrays
        del f, cf

        # global variables
        self.device_type = 'gpu'
        self.context = context
        self.device = device
        self.queue = queue

        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.ns = ns
        self.ns_pitch = ns_pitch
        self.ns_pad = ns_pad

        self.align_size = align_size
        self.pad = pad
        self.slice_z = slice_z

        self.precision_float = precision_float
        self.dtype = dtype
        self.dtype_str_list = dtype_str_list

        self.coeff_use = coeff_use
        self.ce_on = ce_on
        self.ch_on = ch_on

        self.eh_bufs = eh_bufs
        self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
        self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]
        if ce_on:
            self.ce_bufs = ce_bufs
            self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
        if ch_on:
            self.ch_bufs = ch_bufs
            self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

        self.ls = local_work_size
        self.gs = global_work_size
        if self.gs == 0:
            self.gs = common_gpu.get_optimal_gs(device)

        # create update list
        self.instance_list = []
        self.append_instance = lambda instance: \
            common.append_instance(self.instance_list, instance)
Exemple #4
0
    def __init__(self, device_id,
                 nx, ny, nz,
                 coeff_use='',
                 precision_float='single',
                 block_size=256,
                 grid_size=0):
        """Allocate E/H field arrays on a CUDA device for an FDTD run.

        Initializes CUDA, creates a context and stream on the selected
        device, pads nz to a 64-byte-aligned pitch, and copies the
        zero-initialized field arrays (and optional 0.5-filled coefficient
        arrays) to device memory.

        Parameters:
            device_id: int index of the CUDA device.
            nx, ny, nz: int grid dimensions (nz is padded).
            coeff_use: '', 'e', 'h' or 'eh' -- coefficient arrays to allocate.
            precision_float: 'single' or 'double'.
            block_size: CUDA thread-block size (x dimension).
            grid_size: CUDA grid size; 0 selects an optimal value.
        """

        common.check_type('device_id', device_id, int)
        common.check_type('nx', nx, int)
        common.check_type('ny', ny, int)
        common.check_type('nz', nz, int)
        common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
        common.check_value('precision_float', precision_float, ('single', 'double'))
        common.check_type('block_size', block_size, int)
        common.check_type('grid_size', grid_size, int)

        # local variables
        dtype = {'single': np.float32, 'double': np.float64}[precision_float]
        dtype_str_list = {
            'single': ['float'],
            'double': ['double']}[precision_float]

        # padding for the nz which is multiple of 16 (float32) or 8 (float64)
        segment_nbytes = 64
        # Bug fix: use floor division -- on Python 3 '/' yields a float,
        # which propagates a float align_size/pad through the padding math.
        align_size = segment_nbytes // np.nbytes[dtype]
        pad = int(np.ceil(float(nz) / align_size) * align_size) - nz
        slice_z = slice(None, None) if pad == 0 else slice(None, -pad)
        nz_pitch = nz + pad

        ns = [np.int32(nx), np.int32(ny), np.int32(nz)]
        ns_pitch = [np.int32(nx), np.int32(ny), np.int32(nz_pitch)]
        ns_pad = [np.int32(nx), np.int32(ny), np.int32(pad)]

        # on/off the coefficient arrays ('in' already yields a bool)
        ce_on = 'e' in coeff_use
        ch_on = 'h' in coeff_use

        # CUDA device and context
        cuda.init()
        device = cuda.Device(device_id)
        context = device.make_context()
        stream = cuda.Stream()

        # allocations: six zero-initialized field arrays (ex..hz) and the
        # optional coefficient arrays, all copied to device memory
        f = np.zeros(ns_pitch, dtype)
        cf = np.ones_like(f) * 0.5

        eh_bufs = [cuda.to_device(f) for i in range(6)]
        ce_bufs = [cuda.to_device(cf) for i in range(3)] if ce_on else None
        ch_bufs = [cuda.to_device(cf) for i in range(3)] if ch_on else None
        # release the host-side staging arrays
        del f, cf

        # global variables
        self.device_type = 'gpu'
        self.device = device
        self.context = context
        self.stream = stream

        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.ns = ns
        self.ns_pitch = ns_pitch
        self.ns_pad = ns_pad

        self.align_size = align_size
        self.pad = pad
        self.slice_z = slice_z

        self.precision_float = precision_float
        self.dtype = dtype
        self.dtype_str_list = dtype_str_list

        self.coeff_use = coeff_use
        self.ce_on = ce_on
        self.ch_on = ch_on

        self.eh_bufs = eh_bufs
        self.ex_buf, self.ey_buf, self.ez_buf = eh_bufs[:3]
        self.hx_buf, self.hy_buf, self.hz_buf = eh_bufs[3:]
        self.ce_bufs = ce_bufs
        self.ch_bufs = ch_bufs
        if ce_on:
            self.cex_buf, self.cey_buf, self.cez_buf = ce_bufs
        if ch_on:
            self.chx_buf, self.chy_buf, self.chz_buf = ch_bufs

        self.bs = (block_size, 1, 1)
        self.gs = (grid_size, 1) if grid_size != 0 \
            else (common_gpu.get_optimal_gs(device, block_size), 1)

        # create update list
        self.instance_list = []
        self.append_instance = lambda instance: \
            common.append_instance(self.instance_list, instance)
Exemple #5
0
    def __init__(self, context, device,
                 nx, ny, nz,
                 coeff_use='e',
                 precision_float='single',
                 local_work_size=256,
                 global_work_size=0):
        """Prepare an FDTD solver instance on a single OpenCL GPU.

        Validates the arguments, pads nz to the 64-byte device alignment,
        allocates the six E/H field buffers (plus optional coefficient
        buffers), and builds the 'core.cl' kernel program with the proper
        template macros substituted.
        """
        # --- argument validation (same order as the checks below rely on) ---
        for arg_name, arg_val, arg_type in (
                ('context', context, cl.Context),
                ('device', device, cl.Device),
                ('nx', nx, int),
                ('ny', ny, int),
                ('nz', nz, int),
                ('global_work_size', global_work_size, int),
                ('local_work_size', local_work_size, int)):
            common.check_type(arg_name, arg_val, arg_type)

        common.check_value('coeff_use', coeff_use, ('', 'e', 'h', 'eh'))
        common.check_value('precision_float', precision_float,
                           ('single', 'double'))

        self.context = context
        self.device = device
        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.ls = local_work_size
        self.gs = global_work_size
        self.coeff_use = coeff_use

        # precision-dependent settings (precision_float is validated above)
        is_single = (precision_float == 'single')
        self.dtype = np.float32 if is_single else np.float64
        self.dtype_str = 'float' if is_single else 'double'
        self.dtype_str_list = (
            ['float', ''] if is_single else
            ['double', '#pragma OPENCL EXTENSION cl_khr_fp64 : enable'])

        self.device_type = 'gpu'

        # pad nz up to a multiple of 16 (float32) or 8 (float64) = 64 bytes
        self.align_size = align = 16 if is_single else 8
        self.pad = padding = int(np.ceil(float(nz) / align) * align) - nz
        self.slz = slice(None, -padding) if padding else slice(None, None)
        self.nz_pitch = pitched_nz = nz + padding

        # third entry becomes the PAD macro value for the kernel template
        self.dtype_str_list.append('-%s' % padding if padding else '')

        # grid-size triples, command queue, global work size
        self.ns = [np.int32(n) for n in (nx, ny, nz)]
        self.ns_pitch = [np.int32(n) for n in (nx, ny, pitched_nz)]
        self.ns_pad = [np.int32(n) for n in (nx, ny, padding)]
        self.queue = cl.CommandQueue(self.context, self.device)
        if self.gs == 0:
            self.gs = common_gpu.get_optimal_gs(self.device)

        # which coefficient arrays are enabled
        self.ce_on = 'e' in self.coeff_use
        self.ch_on = 'h' in self.coeff_use

        # device allocations: six zero-filled field buffers (ex..hz);
        # 'halves' is only used for its byte size when sizing coeff buffers
        zeros = np.zeros(self.ns_pitch, dtype=self.dtype)
        halves = np.ones_like(zeros) * 0.5
        flags = cl.mem_flags

        self.eh_bufs = [cl.Buffer(self.context, flags.READ_WRITE, zeros.nbytes)
                        for _ in range(6)]
        for field_buf in self.eh_bufs:
            cl.enqueue_copy(self.queue, field_buf, zeros)
        self.ex_buf, self.ey_buf, self.ez_buf = self.eh_bufs[:3]
        self.hx_buf, self.hy_buf, self.hz_buf = self.eh_bufs[3:]

        if self.ce_on:
            self.ce_bufs = [cl.Buffer(self.context, flags.READ_ONLY, halves.nbytes)
                            for _ in range(3)]
            self.cex_buf, self.cey_buf, self.cez_buf = self.ce_bufs

        if self.ch_on:
            self.ch_bufs = [cl.Buffer(self.context, flags.READ_ONLY, halves.nbytes)
                            for _ in range(3)]
            self.chx_buf, self.chy_buf, self.chz_buf = self.ch_bufs

        # drop the host staging arrays
        del zeros, halves

        # kernel program: substitute template macros in core.cl, then build;
        # the constant 0.5 stands in when a coefficient array is disabled
        macros = ['ARGS_CE', 'CEX', 'CEY', 'CEZ',
                  'ARGS_CH', 'CHX', 'CHY', 'CHZ',
                  'DX', 'DTYPE', 'PRAGMA_fp64', 'PAD']

        values = ['', '0.5', '0.5', '0.5',
                  '', '0.5', '0.5', '0.5',
                  str(self.ls)] + self.dtype_str_list

        # kernel argument lists: pitched grid sizes then the field buffers
        self.e_args = self.ns_pitch + self.eh_bufs
        self.h_args = self.ns_pitch + self.eh_bufs

        if self.ce_on:
            values[0:4] = [
                ', __global DTYPE *cex, __global DTYPE *cey, __global DTYPE *cez',
                'cex[idx]', 'cey[idx]', 'cez[idx]']
            self.e_args += self.ce_bufs

        if self.ch_on:
            values[4:8] = [
                ', __global DTYPE *chx, __global DTYPE *chy, __global DTYPE *chz',
                'chx[idx]', 'chy[idx]', 'chz[idx]']
            self.h_args += self.ch_bufs

        ksrc = common.replace_template_code(
            open(common_gpu.src_path + 'core.cl').read(), macros, values)
        self.program = cl.Program(self.context, ksrc).build()