def __init__(self, ctx=None, queue=None):

        self._PlanckConstantReduced = 1.0545717e-34
        # wavelength of cooling laser
        lam = 313.0e-9
        # wave vector
        self.k0 = numpy.array([0, 0, 2.0 * numpy.pi / lam],
                              dtype=numpy.float32)
        self.x0 = numpy.array([0, 0, 0], dtype=numpy.float32)
        # 1/e radius of cooling laser
        self.sigma = 1.0e-3

        # line width (unsaturated)
        self.gamma = 2.0 * numpy.pi * 19.0e6
        # Detuning at zero velocity
        self.delta0 = -0.5 * self.gamma
        # Saturation parameter
        self.S = 0.1

        self.ctx = ctx
        self.queue = queue
        if self.ctx == None:
            self.ctx = cl.create_some_context()
        if self.queue == None:
            self.queue = cl.CommandQueue(
                self.ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)
        absolutePathToKernels = os.path.dirname(os.path.realpath(__file__))
        src = open(absolutePathToKernels + '/cooling_laser_advance.cl',
                   'r').read()
        self.program = cl.Program(self.ctx, src)
        try:
            self.program.build()
        except:
            print("Error:")
            print(
                self.program.get_build_info(self.ctx.devices[0],
                                            cl.program_build_info.LOG))
            raise
        self.program.compute_mean_scattered_photons_homogeneous_beam.set_scalar_arg_dtypes(
            [
                None, None, None, None, None, None, numpy.float32,
                numpy.float32, numpy.float32, numpy.float32, numpy.float32,
                numpy.float32, numpy.float32, numpy.int32, None
            ])
        self.program.compute_mean_scattered_photons_gaussian_beam.set_scalar_arg_dtypes(
            [
                None, None, None, None, None, None, numpy.float32,
                numpy.float32, numpy.float32, numpy.float32, numpy.float32,
                numpy.float32, numpy.float32, numpy.float32, numpy.float32,
                numpy.float32, numpy.float32, numpy.int32, None
            ])
        self.program.countEmissions.set_scalar_arg_dtypes(
            [None, None, numpy.int32, None, numpy.int32])
        self.program.computeKicks.set_scalar_arg_dtypes([
            None, None, numpy.int32, None, numpy.float32, numpy.float32,
            numpy.float32, numpy.float32, numpy.float32, None, None, None,
            numpy.int32
        ])

        self.generator = cl_random.RanluxGenerator(self.queue,
                                                   num_work_items=128,
                                                   luxury=1,
                                                   seed=None,
                                                   no_warmup=False,
                                                   use_legacy_init=False,
                                                   max_work_items=None)
Esempio n. 2
0
 def loadProgram(self):
     src = reduce(
         lambda accum, filename: accum + open(filename, "r").read(),
         ["gpu_md5lib.cl", "gpu_brute.cl"], "")
     self.program = cl.Program(self.ctx, src).build()
Esempio n. 3
0
    def loadKernel(self, device):
        """Load the kernel and initialize the device."""
        self.context = cl.Context([device], None, None)

        # These definitions are required for the kernel to function.
        self.defines += (' -DOUTPUT_SIZE=' + str(self.OUTPUT_SIZE))
        self.defines += (' -DOUTPUT_MASK=' + str(self.OUTPUT_SIZE - 1))

        # If the user wants to mine with vectors, enable the appropriate code
        # in the kernel source.
        if self.VECTORS:
            self.defines += ' -DVECTORS'

        # Some AMD devices support a special "bitalign" instruction that makes
        # bitwise rotation (required for SHA-256) much faster.
        if (device.extensions.find('cl_amd_media_ops') != -1):
            self.defines += ' -DBITALIGN'
            #enable the expierimental BFI_INT instruction optimization
            if self.BFI_INT:
                self.defines += ' -DBFI_INT'
        else:
            #since BFI_INT requires cl_amd_media_ops, disable it
            if self.BFI_INT:
                self.BFI_INT = False

        # Locate and read the OpenCL source code in the kernel's directory.
        kernelFileDir, pyfile = os.path.split(__file__)
        kernelFilePath = os.path.join(kernelFileDir, 'kernel.cl')
        kernelFile = open(kernelFilePath, 'r')
        kernel = kernelFile.read()
        kernelFile.close()

        # For fast startup, we cache the compiled OpenCL code. The name of the
        # cache is determined as the hash of a few important,
        # compilation-specific pieces of information.
        m = md5()
        m.update(device.platform.name)
        m.update(device.platform.version)
        m.update(device.name)
        m.update(self.defines)
        m.update(kernel)
        cacheName = '%s.elf' % m.hexdigest()

        fileName = os.path.join(kernelFileDir, cacheName)

        # Finally, the actual work of loading the kernel...
        try:
            binary = open(fileName, 'rb')
        except IOError:
            binary = None

        try:
            if binary is None:
                self.kernel = cl.Program(self.context,
                                         kernel).build(self.defines)

                #apply BFI_INT if enabled
                if self.BFI_INT:
                    #patch the binary output from the compiler
                    patcher = BFIPatcher(self.interface)
                    binaryData = patcher.patch(self.kernel.binaries[0])

                    self.interface.debug("Applied BFI_INT patch")

                    #reload the kernel with the patched binary
                    self.kernel = cl.Program(self.context, [device],
                                             [binaryData]).build(self.defines)

                #write the kernel binaries to file
                binaryW = open(fileName, 'wb')
                binaryW.write(self.kernel.binaries[0])
                binaryW.close()
            else:
                binaryData = binary.read()
                self.kernel = cl.Program(self.context, [device],
                                         [binaryData]).build(self.defines)

        except cl.LogicError:
            self.interface.fatal("Failed to compile OpenCL kernel!")
            return
        except PatchError:
            self.interface.fatal('Failed to apply BFI_INT patch to kernel! '
                                 'Is BFI_INT supported on this hardware?')
            return
        finally:
            if binary: binary.close()

        cl.unload_compiler()

        # If the user didn't specify their own worksize, use the maxium
        # supported by the device.
        maxSize = self.kernel.search.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

        if self.WORKSIZE is None:
            self.WORKSIZE = maxSize
        else:
            if self.WORKSIZE > maxSize:
                self.interface.log(
                    'Warning: Worksize exceeds the maximum of ' +
                    str(maxSize) + ', using default.')
            if self.WORKSIZE < 1:
                self.interface.log('Warning: Invalid worksize, using default.')

            self.WORKSIZE = min(self.WORKSIZE, maxSize)
            self.WORKSIZE = max(self.WORKSIZE, 1)
            #if the worksize is not a power of 2, round down to the nearest one
            if (self.WORKSIZE & (self.WORKSIZE - 1)) != 0:
                self.WORKSIZE = 1 << int(math.floor(math.log(X) / math.log(2)))

        self.interface.setWorkFactor(self.WORKSIZE)
Esempio n. 4
0
my_gpu_devices = [platform[0].get_devices(device_type=cl.device_type.GPU)[1]]
context = cl.Context(devices=my_gpu_devices)
#context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Create Opencl Buffers
buffer_a = cl.Buffer(context,
                     cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                     hostbuf=mat_a)
buffer_b = cl.Buffer(context,
                     cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                     hostbuf=mat_b)
buffer_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, mat_c.nbytes)

# Program
program = cl.Program(context, c_dot_product_kernel).build()
program.dotProduct.set_scalar_arg_dtypes([np.int32, None, None, None])

start_time = time()

program.dotProduct(queue, (1024, ), (1024 / 16, ), widthA, buffer_a, buffer_b,
                   buffer_c)

queue.finish()

run_time = time()

## Move the kernel's output data to host memory.
#cl.enqueue_copy(queue, mat_c, buffer_c)
cl.enqueue_copy(queue, mat_c, buffer_c)
Esempio n. 5
0
 def build_kernel(self, src):
     self.program = cl.Program(self.context, src).build(self.compiler_args)
aux_h = np.complex64(1 + 1j * 1)
RES_h = np.empty_like(X1_h)

dados_h = []
for i in range(3):
    dados_h.append(
        np.array([X1_h[i], X2_h[i], X3_h[i], Y1_h[i], Y2_h[i],
                  Y3_h[i]]).astype(np.complex64))
dados_h = np.array(dados_h).astype(np.complex64)

print dados_h

aux_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=aux_h)
dados_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados_h)
RES_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=RES_h)

Source = """
__kernel void soma( __global float2 *dados, __global float2 *res, int rowWidth){
	const int gid_x = get_global_id(0);
	res[gid_x] = dados[gid_x*rowWidth+3];
}
"""
prg = cl.Program(ctx, Source).build()

completeEvent = prg.soma(queue, (M, ), None, dados_d, RES_d, np.int32(6))
completeEvent.wait()

cl.enqueue_copy(queue, RES_h, RES_d)
print "GPU RES"
print RES_h
Esempio n. 7
0
    def predict_opencl_atom(self, X, predict_class = False, single_cpu = conf.SINGLE_CPU, opencl_config = conf.OPENCL_CONFIG):
        ''' PyOpenCL implementation of the iPSM approach
            return: a vector of predictions, eacn for a row in X
        '''
        print 'predict_opencl_atom() was called'
        try:
            t0 = time.time()
            c_evs = np.int32(self.__tileRasterReader.nbands)

            # standard deviation of each variable (over the whole study area)
            Std_evs = self.__tileRasterReader.statistics[:,3]
            SD_evs = Std_evs.reshape(c_evs).astype(np.float32)

            r, c = np.shape(X)
            nrows_X = np.int32(r)
            ncols_X = np.int32(c)

            X = X.reshape(nrows_X*ncols_X).astype(np.float32)

            MSRLEVES = self.__tileRasterReader.measurement_level_ints.reshape(c_evs).astype(np.int32)

            if not self.__samples_stats_collected:
                samples_X = self.__soilsamples.covariates_at_points.T

                nrows_samples = np.int32(samples_X.shape[1])
                self.__nrows_samples = nrows_samples

                samples_SD_evs = np.zeros((nrows_samples, c_evs))
                AVG_evs = self.__tileRasterReader.statistics[:,2]

                for i in range(nrows_samples):
                    delta = samples_X[:,i].T - AVG_evs
                    tmp = Std_evs**2 + delta**2
                    samples_SD_evs[i] = np.sqrt(tmp)

                self.__samples_SD_evs = np.array(samples_SD_evs).reshape(nrows_samples*c_evs).astype(np.float32)
                self.__samples_X = np.array(samples_X).T.reshape(nrows_samples*c_evs).astype(np.float32)

                # sample weights
                self.__sample_weights = self.__soilsamples.weights.reshape(nrows_samples).astype(np.float32)

                # sample attributes
                self.__sample_attributes = self.__soilsamples.attributes.reshape(nrows_samples).astype(np.float32)
                self.__samples_stats_collected = True

            # hold predictions for instances in X
            X_predictions = np.zeros(nrows_X).astype(np.float32)
            # hold prediction uncertainties for instances in X
            X_uncertainties = np.zeros(nrows_X).astype(np.float32)
            print 'preparation on HOST took', time.time() - t0, 's'

            ##### config computing platform and device
            for platform in cl.get_platforms():
                #print platform.name
                if platform.name == conf.OPENCL_CONFIG['Platform']:
                    PLATFORM = platform
                    # Print each device per-platform
                    for device in platform.get_devices():
                        #print device.name
                        if device.name == conf.OPENCL_CONFIG['Device']:
                            DEVICE = device
                            break

            # opencl context
            ctx = cl.Context([DEVICE])
            # opencl command queue
            queue = cl.CommandQueue(ctx)

            ##### allocate memory space on device
            mf = cl.mem_flags
            t0 = time.time()
            #evs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=evs)
            SD_evs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=SD_evs)
            X_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=X)
            MSRLEVES_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=MSRLEVES)
            sample_X_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__samples_X)

            ## added 09/06/2017
            samples_SD_evs_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__samples_SD_evs)

            sample_weights_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__sample_weights)
            sample_attributes_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.__sample_attributes)
            X_predictions_g = cl.Buffer(ctx, mf.WRITE_ONLY, X_predictions.nbytes)
            X_uncertainties_g = cl.Buffer(ctx, mf.WRITE_ONLY, X_uncertainties.nbytes)
            queue.finish()
            t1 = time.time()-t0
            conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(t1)
            print 'allocate and copy from HOST to DEVICE took', t1, 's'
            X = None

            ##### build opencl kernel from code in the file
            f = open(conf.iPSM_KERNEL_FN, 'r')
            fstr = "".join(f.readlines())
            fstr = fstr.replace("#define N_SAMPLES 100", "#define N_SAMPLES " + str(self.__nrows_samples))
            prg = cl.Program(ctx, fstr).build()

            ##### opencl computation
            threshold = np.float32(self.__uncthreshold)

            if predict_class:
                mode = np.int32(1)
            else:
                mode = np.int32(0)

            print X_predictions.shape

            ## improved version, 09/06/2017
            if not single_cpu:
                t0 = time.time()
                completeEvent = \
                prg.iPSM_Predict(queue, X_predictions.shape, None, nrows_X, ncols_X, self.__nrows_samples, mode, \
                                 threshold, MSRLEVES_g, samples_SD_evs_g, SD_evs_g, X_g, sample_X_g, sample_weights_g, sample_attributes_g, \
                                 X_predictions_g, X_uncertainties_g)
                queue.finish()
                t1 = time.time() - t0
                conf.TIME_KEEPING_DICT['parts']['compute'].append(t1)
                print 'kernel took', t1, 's'
                #print queue.finish()


            ## added on Oct. 7, 2018 [sequential version - CPU]
            else:
                print 'SINGLE_CPU iPSM.predict_opencl() called'
                t0 = time.time()
                completeEvent = \
                prg.iPSM_Predict_Sequential(queue, (1,), (1,), nrows_X, ncols_X, self.__nrows_samples, mode, \
                                 threshold, MSRLEVES_g, samples_SD_evs_g, SD_evs_g, X_g, sample_X_g, sample_weights_g, sample_attributes_g, \
                                 X_predictions_g, X_uncertainties_g)
                queue.finish()
                t1 = time.time() - t0
                conf.TIME_KEEPING_DICT['parts']['compute'].append(t1)
                print 'kernel took', t1, 's'
                #print queue.finish()

            #### wait until completions
            events = [completeEvent]
            queue.finish()
            print 'up to events finished kernel took', time.time() - t0, 's'
            #print queue.finish()

            ##### copy result data
            t0 = time.time()
            cl.enqueue_copy(queue, X_predictions, X_predictions_g, wait_for = events)#.wait()
            #print queue.finish()
            cl.enqueue_copy(queue, X_uncertainties, X_uncertainties_g)
            queue.finish()
            t1 = time.time() - t0
            conf.TIME_KEEPING_DICT['parts']['data_transfer'].append(t1)
            print 'copy from DEVICE to HOST took', t1, 's'
            y = np.vstack((X_predictions, X_uncertainties)).T
            #print y
            return y

        except Exception as e:
            raise
Esempio n. 8
0
program = False

try:
    import numpy
    import pyopencl as cl
    hash_dt = numpy.dtype([('target', numpy.uint64), ('v', numpy.str_, 73)])
    gpus = []
    for platform in cl.get_platforms():
	gpus.extend(platform.get_devices(device_type=cl.device_type.GPU))
    if (len(gpus) > 0):
        ctx = cl.Context(devices=gpus)
        queue = cl.CommandQueue(ctx)
        full_path = os.path.dirname(os.path.realpath(__file__))
        f = open(os.path.join(full_path, "bitmsghash", 'bitmsghash.cl'), 'r')
        fstr = ''.join(f.readlines())
        program = cl.Program(ctx, fstr).build(options="")
    else:
        print "No OpenCL GPUs found"
        ctx = False
except Exception as e:
    print "opencl fail: " + str(e)
    ctx = False

def has_opencl():
	return (ctx != False)

def do_opencl_pow(hash, target):
	output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
	if (ctx == False):
		return output[0][0]
	
Esempio n. 9
0
    def compile(self,
                bufferStructsObj,
                library_file,
                footer_file=None,
                N=15,
                invMemoryDensity=2):
        assert type(N) == int
        assert N < 20, "N >= 20 won't fit in a single buffer, so is unsupported. " + \
                       "Nothing sane should use 20, is this wickr?"
        self.N = N
        assert bufferStructsObj is not None, "need to supply a bufferStructsObj : set all to 0 if necessary"
        assert bufferStructsObj.code is not None, "bufferStructsObj should be initialised"
        bufStructs = bufferStructsObj
        self.wordSize = bufStructs.wordSize

        # set the np word type, for use in .run
        npType = {
            4: np.uint32,
            8: np.uint64,
        }
        self.wordType = npType[self.wordSize]

        if footer_file != None:
            src = bufStructs.code
        else:
            src = ""
        if library_file:
            with open(
                    os.path.join(current_dir, "worker", "generic",
                                 library_file), "r") as rf:
                src += rf.read()

        if footer_file:
            with open(
                    os.path.join(current_dir, "worker", "generic",
                                 footer_file), "r") as rf:
                src += rf.read()

        # Standardise to using no \r's, move to bytes to stop trickery
        src = src.encode("ascii")
        src = src.replace(b"\r\n", b"\n")

        # Debugging
        if self.write_combined_file:
            with open("combined_" + library_file, "wb") as wf:
                wf.write(src)

        # Convert back to text!
        src = src.decode("ascii")

        # Check that it starts with 2 newlines, for adding our defines
        if src.startswith("\n\n"):
            src = "\n\n" + src
            src = src[len("\n\n"):]
            # Prepend define N and invMemoryDensity
            defines = "#define N {}\n#define invMemoryDensity {}\n".format(
                N, invMemoryDensity)
            src = defines + src

        # Kernel function instantiation. Build returns self.
        prg = cl.Program(self.ctx, src).build()
        return prg
Esempio n. 10
0
def initialise_opencl_object(self,
                             program_src='',
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_threshold=None,
                             transpose_block_dim=16,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
    self.queue = cl.CommandQueue(self.ctx)
    self.device = self.ctx.get_info(cl.context_info.DEVICES)[0]
    # XXX: Assuming just a single device here.
    self.platform = self.ctx.get_info(cl.context_info.DEVICES)[0].platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
    device_type = self.device.type

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0
    self.free_list = {}

    default_sizes = apply_size_heuristics(
        self, size_heuristics, {
            'group_size': default_group_size,
            'tile_size': default_tile_size,
            'num_groups': default_num_groups,
            'lockstep_width': None,
            'threshold': default_threshold
        })
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        sys.stderr.write(
            'Note: Device limits group size to {} (down from {})\n'.format(
                max_tile_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        sys.stderr.write(
            'Note: Device limits tile size to {} (down from {})\n'.format(
                max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}'.format(k))

    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'] == 'threshold':
            max_value = None
            default_value = default_threshold
        else:
            raise Exception('Unknown size class for size \'{}\': {}'.format(
                k, v['class']))
        if v['value'] == None:
            self.sizes[k] = default_value
        elif max_value != None and v['value'] > max_value:
            sys.stderr.write(
                'Note: Device limits {} to {} (down from {}\n'.format(
                    k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    if (len(program_src) >= 0):
        return cl.Program(self.ctx, program_src).build([
            "-DFUT_BLOCK_DIM={}".format(transpose_block_dim),
            "-DLOCKSTEP_WIDTH={}".format(lockstep_width)
        ] + ["-D{}={}".format(s, v) for (s, v) in self.sizes.items()])
Esempio n. 11
0
#print(s)
#exit()
hs = np.empty(nsamp,
              dtype=np.uint)  #Distribution of sotred indexes to new genome
hs.fill(0)
for x in range(0, len(s) - 1):
    sx = np.arange(s[x], s[x + 1]).astype(np.uint)
    for sxi in sx:
        if sxi < len(hs): hs[sxi] = x

print("hs == ", hs)
defines = \
"#define nvarsd "+str(nvarsd)+"\n"+\
"#define nvarsg "+str(nvarsg)+"\n"+\
"#define ninpt "+str(ninpt)+"\n\n"
kernels = genn.genkern2(tosumr, topology, lambda x: cl.Program(ctx, x).build())
print(kernels)

#uint hs["""+str(len(hs))+"""] = {"""+", ".join([str(hh) for hh in hs])+"""}; //Indexes for allocate cutted population to full
prsrc = """
__kernel void copy_inp(__global float *inpt, __global float *dnr){
    uint gid = get_global_id(0);
    dnr[gid] = inpt[gid];
}

__kernel void replicate_mutate(__global float *_gms, __global float *_tmpgms,\
                               __global uint *srt_idxs, __global float *res_g,\
                               __global float *_rnd, __global uint *_nvarsg, 
                               __global uint *_shiftsg, __constant uint *hs) {
  uint gid = get_global_id(0);
  uint h = hs[gid];                           
Esempio n. 12
0
    def __init__(self,
                 queue,
                 num_work_items=None,
                 luxury=None,
                 seed=None,
                 no_warmup=False,
                 use_legacy_init=False,
                 max_work_items=None):
        """
        :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
        :param luxury: the "luxury value" of the generator, and should be 0-4,
            where 0 is fastest and 4 produces the best numbers. It can also be
            >=24, in which case it directly sets the p-value of RANLUXCL.
        :param num_work_items: is the number of generators to initialize,
            usually corresponding to the number of work-items in the NDRange
            RANLUXCL will be used with.  May be `None`, in which case a default
            value is used.
        :param max_work_items: should reflect the maximum number of work-items
            that will be used on any parallel instance of RANLUXCL. So for
            instance if we are launching 5120 work-items on GPU1 and 10240
            work-items on GPU2, GPU1's RANLUXCLTab would be generated by
            calling ranluxcl_intialization with numWorkitems = 5120 while
            GPU2's RANLUXCLTab would use numWorkitems = 10240. However
            maxWorkitems must be at least 10240 for both GPU1 and GPU2, and it
            must be set to the same value for both. (may be `None`)

        .. versionchanged:: 2013.1
            Added default value for `num_work_items`.
        """

        from warnings import warn
        warn(
            "Ranlux random number generation is deprecated and will go away "
            "in 2022.",
            DeprecationWarning,
            stacklevel=2)

        if luxury is None:
            luxury = 4

        if num_work_items is None:
            if queue.device.type & cl.device_type.CPU:
                num_work_items = 8 * queue.device.max_compute_units
            else:
                num_work_items = 64 * queue.device.max_compute_units

        if seed is None:
            from time import time
            seed = int(time() * 1e6) % 2 << 30

        self.context = queue.context
        self.luxury = luxury
        self.num_work_items = num_work_items

        from pyopencl.characterize import has_double_support
        self.support_double = has_double_support(queue.device)

        self.no_warmup = no_warmup
        self.use_legacy_init = use_legacy_init
        self.max_work_items = max_work_items

        src = """
            %(defines)s

            #include <pyopencl-ranluxcl.cl>

            kernel void init_ranlux(unsigned seeds,
                global ranluxcl_state_t *ranluxcltab)
            {
              if (get_global_id(0) < %(num_work_items)d)
                ranluxcl_initialization(seeds, ranluxcltab);
            }
            """ % {
            "defines": self.generate_settings_defines(),
            "num_work_items": num_work_items
        }
        prg = cl.Program(queue.context, src).build()

        # {{{ compute work group size

        wg_size = None

        import sys
        import platform
        if ("darwin" in sys.platform
                and "Apple" in queue.device.platform.vendor
                and platform.mac_ver()[0].startswith("10.7")
                and queue.device.type & cl.device_type.CPU):
            wg_size = (1, )

        self.wg_size = wg_size

        # }}}

        self.state = cl_array.empty(queue, (num_work_items, 112),
                                    dtype=np.uint8)
        self.state.fill(17)

        prg.init_ranlux(queue, (num_work_items, ), self.wg_size,
                        np.uint32(seed), self.state.data)
Esempio n. 13
0
    def get_gen_kernel(self, dtype, distribution):
        size_multiplier = 1
        arg_dtype = dtype

        rng_key = (distribution, dtype)

        if rng_key in [("uniform", np.float64), ("normal", np.float64)]:
            c_type = "double"
            scale1_const = "((double) %r)" % (1 / 2**32)
            scale2_const = "((double) %r)" % (1 / 2**64)
            if distribution == "normal":
                transform = "box_muller"
            else:
                transform = ""

            rng_expr = ("shift + scale * "
                        "%s( %s * convert_double4(gen)"
                        "+ %s * convert_double4(gen))" %
                        (transform, scale1_const, scale2_const))

            counter_multiplier = 2

        elif rng_key in [(dist, cmp_dtype) for dist in ["normal", "uniform"]
                         for cmp_dtype in [
                             np.float32,
                             cltypes.float2,
                             cltypes.float3,
                             cltypes.float4,
                         ]]:
            c_type = "float"
            scale_const = "((float) %r)" % (1 / 2**32)

            if distribution == "normal":
                transform = "box_muller"
            else:
                transform = ""

            rng_expr = ("shift + scale * %s(%s * convert_float4(gen))" %
                        (transform, scale_const))
            counter_multiplier = 1
            arg_dtype = np.float32
            try:
                _, size_multiplier = cltypes.vec_type_to_scalar_and_count[
                    dtype]
            except KeyError:
                pass

        elif rng_key == ("uniform", np.int32):
            c_type = "int"
            rng_expr = (
                "shift + convert_int4((convert_long4(gen) * scale) / %s)" %
                (str(2**32) + "l"))
            counter_multiplier = 1

        elif rng_key == ("uniform", np.int64):
            c_type = "long"
            rng_expr = ("shift"
                        "+ convert_long4(gen) * (scale/two32) "
                        "+ ((convert_long4(gen) * scale) / two32)".replace(
                            "two32", (str(2**32) + "l")))
            counter_multiplier = 2

        else:
            raise TypeError(
                "unsupported RNG distribution/data type combination '%s/%s'" %
                rng_key)

        kernel_name = f"rng_gen_{self.generator_name}_{distribution}"
        src = """//CL//
            #include <{header_name}>

            #ifndef M_PI
            #ifdef M_PI_F
            #define M_PI M_PI_F
            #else
            #define M_PI 3.14159265359f
            #endif
            #endif

            typedef {output_t} output_t;
            typedef {output_t}4 output_vec_t;
            typedef {gen_name}_ctr_t ctr_t;
            typedef {gen_name}_key_t key_t;

            uint4 gen_bits(key_t *key, ctr_t *ctr)
            {{
                union {{
                    ctr_t ctr_el;
                    uint4 vec_el;
                }} u;

                u.ctr_el = {gen_name}(*ctr, *key);
                if (++ctr->v[0] == 0)
                    if (++ctr->v[1] == 0)
                        ++ctr->v[2];

                return u.vec_el;
            }}

            #if {include_box_muller}
            output_vec_t box_muller(output_vec_t x)
            {{
                #define BOX_MULLER(I, COMPA, COMPB) \
                    output_t r##I = sqrt(-2*log(x.COMPA)); \
                    output_t c##I; \
                    output_t s##I = sincos((output_t) (2*M_PI) * x.COMPB, &c##I);

                BOX_MULLER(0, x, y);
                BOX_MULLER(1, z, w);
                return (output_vec_t) (r0*c0, r0*s0, r1*c1, r1*s1);
            }}
            #endif

            #define GET_RANDOM_NUM(gen) {rng_expr}

            kernel void {kernel_name}(
                int k1,
                #if {key_length} > 2
                int k2, int k3,
                #endif
                int c0, int c1, int c2, int c3,
                global output_t *output,
                long out_size,
                output_t scale,
                output_t shift)
            {{
                #if {key_length} == 2
                key_t k = {{{{get_global_id(0), k1}}}};
                #else
                key_t k = {{{{get_global_id(0), k1, k2, k3}}}};
                #endif

                ctr_t c = {{{{c0, c1, c2, c3}}}};

                // output bulk
                unsigned long idx = get_global_id(0)*4;
                while (idx + 4 < out_size)
                {{
                    output_vec_t ran = GET_RANDOM_NUM(gen_bits(&k, &c));
                    vstore4(ran, 0, &output[idx]);
                    idx += 4*get_global_size(0);
                }}

                // output tail
                output_vec_t tail_ran = GET_RANDOM_NUM(gen_bits(&k, &c));
                if (idx < out_size)
                  output[idx] = tail_ran.x;
                if (idx+1 < out_size)
                  output[idx+1] = tail_ran.y;
                if (idx+2 < out_size)
                  output[idx+2] = tail_ran.z;
                if (idx+3 < out_size)
                  output[idx+3] = tail_ran.w;
            }}
            """.format(kernel_name=kernel_name,
                       gen_name=self.generator_name,
                       header_name=self.header_name,
                       output_t=c_type,
                       key_length=self.key_length,
                       include_box_muller=int(distribution == "normal"),
                       rng_expr=rng_expr)

        prg = cl.Program(self.context, src).build()
        knl = getattr(prg, kernel_name)
        knl.set_scalar_arg_dtypes([np.int32] * (self.key_length - 1 + 4) +
                                  [None, np.int64, arg_dtype, arg_dtype])

        return knl, counter_multiplier, size_multiplier
Esempio n. 14
0
    def get_gen_kernel(self, dtype, distribution="uniform"):
        size_multiplier = 1
        arg_dtype = dtype

        if dtype == np.float64:
            bits = 64
            c_type = "double"
            rng_expr = "(shift + scale * gen)"
        elif dtype == np.float32:
            bits = 32
            c_type = "float"
            rng_expr = "(shift + scale * gen)"
        elif dtype == cltypes.float2:
            bits = 32
            c_type = "float"
            rng_expr = "(shift + scale * gen)"
            size_multiplier = 2
            arg_dtype = np.float32
        elif dtype in [cltypes.float3, cltypes.float4]:
            bits = 32
            c_type = "float"
            rng_expr = "(shift + scale * gen)"
            size_multiplier = 4
            arg_dtype = np.float32
        elif dtype == np.int32:
            assert distribution == "uniform"
            bits = 32
            c_type = "int"
            rng_expr = ("(shift "
                        "+ convert_int4((float) scale * gen) "
                        "+ convert_int4(((float) scale / (1<<24)) * gen))")

        elif dtype == np.int64:
            assert distribution == "uniform"
            if self.support_double:
                bits = 64
            else:
                bits = 32
            c_type = "long"
            rng_expr = ("(shift "
                        "+ convert_long4((float) scale * gen) "
                        "+ convert_long4(((float) scale / (1l<<24)) * gen)"
                        "+ convert_long4(((float) scale / (1l<<48)) * gen)"
                        ")")

        else:
            raise TypeError("unsupported RNG data type '%s'" % dtype)

        rl_flavor = "%d%s" % (bits, {
            "uniform": "",
            "normal": "norm"
        }[distribution])

        src = """//CL//
            %(defines)s

            #include <pyopencl-ranluxcl.cl>

            typedef %(output_t)s output_t;
            typedef %(output_t)s4 output_vec_t;
            #define NUM_WORKITEMS %(num_work_items)d
            #define RANLUX_FUNC ranluxcl%(rlflavor)s
            #define GET_RANDOM_NUM(gen) %(rng_expr)s

            kernel void generate(
                global ranluxcl_state_t *ranluxcltab,
                global output_t *output,
                unsigned long out_size,
                output_t scale,
                output_t shift)
            {

              ranluxcl_state_t ranluxclstate;
              ranluxcl_download_seed(&ranluxclstate, ranluxcltab);

              // output bulk
              unsigned long idx = get_global_id(0)*4;
              while (idx + 4 < out_size)
              {
                  output_vec_t ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate));
                  vstore4(ran, 0, &output[idx]);
                  idx += 4*NUM_WORKITEMS;
              }

              // output tail
              output_vec_t tail_ran = GET_RANDOM_NUM(RANLUX_FUNC(&ranluxclstate));
              if (idx < out_size)
                output[idx] = tail_ran.x;
              if (idx+1 < out_size)
                output[idx+1] = tail_ran.y;
              if (idx+2 < out_size)
                output[idx+2] = tail_ran.z;
              if (idx+3 < out_size)
                output[idx+3] = tail_ran.w;

              ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
            }
            """ % {
            "defines": self.generate_settings_defines(),
            "rlflavor": rl_flavor,
            "output_t": c_type,
            "num_work_items": self.num_work_items,
            "rng_expr": rng_expr
        }

        prg = cl.Program(self.context, src).build()
        knl = prg.generate
        knl.set_scalar_arg_dtypes(
            [None, None, np.uint64, arg_dtype, arg_dtype])

        return knl, size_multiplier
Esempio n. 15
0
class GLCharacter:
    '''GLCharacter is version of GLMeshes that supports blend skinning. TODO it hasn't been properly integrated yet.'''
    def __init__(self,
                 names,
                 verts,
                 faces,
                 bones=None,
                 transforms=None,
                 drawStyle='smooth',
                 colour=[0.9, 0.9, 0.9, 1.0],
                 vts=None,
                 fts=None,
                 visible=True):
        self.selectedIndex = -1
        self.numGeos = len(names)
        self.visible = visible
        self.gvs = None
        self.boneIndices = None
        self.pose = None
        assert self.numGeos == len(verts), 'Non-matching parameter lists.'
        assert self.numGeos == len(faces), 'Non-matching parameter lists.'
        if transforms is None: transforms = [None] * self.numGeos
        if bones is None: bones = [None] * self.numGeos
        if vts is None: vts = [None] * self.numGeos
        if fts is None: fts = [None] * self.numGeos
        self.transforms = np.zeros((self.numGeos, 4, 4), dtype=np.float32)
        vs,VTs,es,bs,tris,vtis,vs_mapping,vts_mapping = [],[],[],[],[],[],[],[]
        vsplits, esplits, tsplits, bsplits = [0], [0], [0], [0]
        vs_in_total = 0
        for i, (v, f, b, t, vt,
                ft) in enumerate(zip(verts, faces, bones, transforms, vts,
                                     fts)):
            #print i,names[i]
            voffset = len(vs)
            vt_indices = v_indices = np.arange(len(v), dtype=np.int32)
            if ft is not None:
                f_flat = [x for y in f for x in y]
                ft_flat = [x for y in ft for x in y]
                s = list(set(zip(f_flat, ft_flat)))
                d = dict(zip(s, range(len(s))))
                v_indices, vt_indices = np.array(zip(*s), dtype=np.int32)
                f = [
                    np.array([d[x] for x in zip(*y)], dtype=np.int32)
                    for y in zip(f, ft)
                ]
            vs_mapping.extend(v_indices + vs_in_total)
            vts_mapping.extend(vt_indices + vs_in_total)
            vs_in_total += len(v)
            v = np.array(v, dtype=np.float32).reshape(-1, 3)[v_indices]
            if vt is None:
                vt = np.zeros((len(vt_indices), 2),
                              dtype=np.float32)  # TODO missing verts
            vt = np.array(vt, dtype=np.float32).reshape(-1, 2)[vt_indices]
            vs.extend(v)  # TODO is this slow? faster to use np.concatenate?
            VTs.extend(vt)
            vtis.extend([i] * len(v))
            if b is not None:
                bs.extend(np.array(b, dtype=np.int32).reshape(-1, 2) + voffset)
            self.transforms[i] = np.eye(4)
            if t is not None: self.transforms[i, :, :3] = t.T
            if len(f) == 2 and f[1][0] == 0:  # assume this is faces and splits
                f0 = np.array(f[0], dtype=np.int32) + voffset
                for c0, c1 in zip(f[1][:-1], f[1][1:]):
                    fc = f0[c0:c1]
                    es.append((fc[-1], fc[0]))
                    es.append((fc[0], fc[1]))
                    for fi in xrange(2, len(fc)):
                        tris.append((fc[0], fc[fi - 1], fc[fi]))
                        es.append((fc[fi - 1], fc[fi]))
            else:
                try:
                    # see if the mesh is regular
                    fr = np.array(
                        f, dtype=np.int32
                    ) + voffset  # will fail if not rectangular ints
                    numFaces, faceSize = fr.shape  # will fail if not size 2
                    e = np.zeros((numFaces, faceSize, 2), dtype=np.int32)
                    t = np.zeros((numFaces, faceSize - 2, 3), dtype=np.int32)
                    e[:, 0, 0] = fr[:, -1]
                    e[:, 0, 1] = fr[:, 0]
                    e[:, 1, 0] = fr[:, 0]
                    e[:, 1, 1] = fr[:, 1]
                    t[:, :, 0] = fr[:, 0].reshape(-1, 1)
                    for fi in xrange(2, faceSize):
                        e[:, fi, 0] = fr[:, fi - 1]
                        e[:, fi, 1] = fr[:, fi]
                        t[:, fi - 2, 1] = fr[:, fi - 1]
                        t[:, fi - 2, 2] = fr[:, fi]
                    e = e.reshape(-1, 2)
                    t = t.reshape(-1, 3)
                    es.extend(e)
                    tris.extend(t)
                except Exception, e:
                    for fc in f:
                        fc = np.array(fc, dtype=np.int32) + voffset
                        es.append((fc[-1], fc[0]))
                        es.append((fc[0], fc[1]))
                        for fi in xrange(2, len(fc)):
                            tris.append((fc[0], fc[fi - 1], fc[fi]))
                            es.append((fc[fi - 1], fc[fi]))
            vsplits.append(len(vs))
            esplits.append(len(es))
            bsplits.append(len(bs))
            tsplits.append(len(tris))
        self.vsplits = np.array(vsplits, dtype=np.int32)
        self.esplits = np.array(esplits, dtype=np.int32)
        self.bsplits = np.array(bsplits, dtype=np.int32)
        self.tsplits = np.array(tsplits, dtype=np.int32)
        self.vs_mapping = np.array(vs_mapping, dtype=np.int32)
        self.vts_mapping = np.array(vts_mapping, dtype=np.int32)
        self.names = names
        vs = np.array(vs, dtype=np.float32).reshape(-1, 3)
        tris = np.array(tris, dtype=np.int32).reshape(-1, 3)
        edges = np.array(es, dtype=np.int32).reshape(-1, 2)
        bones = np.array(bs, dtype=np.int32).reshape(-1, 2)
        VTs = np.array(VTs, dtype=np.float32).reshape(-1, 2)
        #print 'lens',len(vs), len(tris), len(edges), (np.min(tris),np.max(tris)) if len(tris) else 'None', (np.min(edges), np.max(edges)) if len(edges) else 'None', (np.min(bones), np.max(bones)) if len(bones) else 'None'
        self.num_in_verts = vs_in_total
        self.vs = vbo.VBO(vs, usage='GL_STATIC_DRAW_ARB')
        self.tris = vbo.VBO(tris,
                            target=GL.GL_ELEMENT_ARRAY_BUFFER,
                            usage='GL_STATIC_DRAW_ARB')
        self.edges = vbo.VBO(edges,
                             target=GL.GL_ELEMENT_ARRAY_BUFFER,
                             usage='GL_STATIC_DRAW_ARB')
        self.bones = vbo.VBO(bones,
                             target=GL.GL_ELEMENT_ARRAY_BUFFER,
                             usage='GL_STATIC_DRAW_ARB')
        self.vtis = vbo.VBO(np.array(vtis, dtype=np.int32),
                            usage='GL_STATIC_DRAW_ARB')
        assert len(vtis) == len(vs)
        self.vts, self.vns = None, None
        # TODO, deal with input textures and normals
        if vts is not None:
            self.vts = vbo.VBO(VTs, usage='GL_STATIC_DRAW_ARB')
        #if vns is not None: self.vns = vbo.VBO(np.array(vns,dtype=np.float32), usage='GL_STATIC_DRAW_ARB')
        self.drawStyle = drawStyle  # 'wire','smooth','wire_over_smooth'
        self.colour = colour
        self.image, self.bindImage, self.bindId = None, None, None
        self.GL_is_initialised = False
        global CL_ctx, CL_queue
        if CL_ctx is None:
            CL_ctx = cl.create_some_context(False)
            CL_queue = cl.CommandQueue(CL_ctx)
        self.cl_prg = cl.Program(
            CL_ctx, '''
		__kernel void compute_normals(__global const float *xs_g, __global const int *edgeList_g, __global float *res_g) {
			const int gid = get_global_id(0);
			const int g10 = gid*10;
			const int g3 = gid*3;
			float sx=0,sy=0,sz=0;
			const float x=xs_g[g3],y=xs_g[g3+1],z=xs_g[g3+2];
			int e3 = edgeList_g[g10]*3;
			float ex0 = xs_g[e3]-x, ey0 = xs_g[e3+1]-y, ez0 = xs_g[e3+2]-z;
			for (int i = 1; i < 10; ++i) {
				e3 = edgeList_g[g10+i]*3;
				if (xs_g[e3] > 1e10) continue;
				float ex1 = xs_g[e3]-x, ey1 = xs_g[e3+1]-y, ez1 = xs_g[e3+2]-z;
				sx += ey0*ez1-ey1*ez0;
				sy += ez0*ex1-ez1*ex0;
				sz += ex0*ey1-ex1*ey0;
				ex0=ex1; ey0=ey1; ez0=ez1;
			}
			const float sum = sx*sx+sy*sy+sz*sz;
			if (sum < 1e-8) { sx = 0; sy = 0; sz = 0; }
			else {
				const float sc = rsqrt(sum);
				sx *= sc;
				sy *= sc;
				sz *= sc;
			}
			res_g[g3] = sx;
			res_g[g3+1] = sy;
			res_g[g3+2] = sz;
		}
		''').build()
        self.edgeList = self.trianglesToEdgeList(tris, len(vs))
        self.edgeList_g = cl.Buffer(CL_ctx,
                                    cl.mem_flags.READ_ONLY
                                    | cl.mem_flags.COPY_HOST_PTR,
                                    hostbuf=self.edgeList)
        if self.vns is None:
            vns = self.computeNormalsFromEdgeList(vs)
            self.vns = vbo.VBO(np.array(vns, dtype=np.float32),
                               usage='GL_STATIC_DRAW_ARB')
Esempio n. 16
0
        mf = cl.mem_flags
        a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
        b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
        dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

        prg = cl.Program(
            ctx, """
			__kernel void sum(__global const float *a,
			__global const float *b, __global float *c)
			{
						int gid = get_global_id(0);
						float a_temp;
						float b_temp;
						float c_temp;

						a_temp = a[gid]; // my a element (by global ref)
						b_temp = b[gid]; // my b element (by global ref)
						
						c_temp = a_temp+b_temp; // sum of my elements
						c_temp = c_temp * c_temp; // product of sums
						c_temp = c_temp * (a_temp/2.0); // times 1/2 my a

						c[gid] = c_temp; // store result in global memory
				}
				""").build()

        global_size = (data_points, )
        local_size = (workers, )
        preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \
         device)
Esempio n. 17
0
import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as pycl_array  # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object)
import numpy as np  # Import Numpy number tools

context = cl.create_some_context()  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue
a1=np.random.rand(50000).astype(np.float64)
b1=np.random.rand(50000).astype(np.float64)
a = pycl_array.to_device(queue, a1)
b = pycl_array.to_device(queue, b1)
# Create two random pyopencl arrays
c = pycl_array.empty_like(a)  # Create an empty pyopencl destination array

program = cl.Program(context, """
__kernel void sum(__global const float *a, __global const float *b, __global float *c)
{
  int i = get_global_id(0);
  c[i] = a[i] + b[i];
}""").build()  # Create the OpenCL program
time1 = time()
program.sum(queue, a.shape, None, a.data, b.data, c.data)  # Enqueue the program for execution and store the result in c

print("a: {}".format(a))
print("b: {}".format(b))
print("c: {}".format(c))  
# Print all three arrays, to show sum() worked  OpenCL:  0.0075032711029052734 s

time2 = time()

print("OpenCL: ", time2 - time1, "s")
Esempio n. 18
0
"""

#
# OpenCL setup.
#
kernel_code = pyOpenCLNCS.loadNCSKernel() + kernel_code

# Create context and command queue
platform = cl.get_platforms()[0]
devices = platform.get_devices()
context = cl.Context(devices)
queue = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)

# Open program file and build
program = cl.Program(context, kernel_code)
try:
    program.build()
except:
    print("Build log:")
    print(program.get_build_info(devices[0], cl.program_build_info.LOG))
    raise

n_pts = 256


def test_veccopy():
    v1 = numpy.zeros(n_pts, dtype=numpy.float32)
    v2 = numpy.random.uniform(low=1.0, high=10.0,
                              size=n_pts).astype(dtype=numpy.float32)
prg = cl.Program(ctx, """
inline uint popcnt(const uint i) {
  uint n;
  asm("popc.b32 %0, %1;" : "=r"(n) : "r" (i));
  return n;
}

inline uint ballot(const uint i) {
  uint n;
  asm(
    "{\\n\\t"
    ".reg .pred %%p<1>;\\n\\t"
    "setp.ne.u32 %%p1, %1, 0;\\n\\t"
    "vote.ballot.b32 %0, %%p1;\\n\\t"
    "}"
     : "=r"(n)
     : "r" (i)
  );
  return n;
}

__kernel void sum(__global float *a_g, __global unsigned int *b_g) {
  uint res = 0;
  asm("mov.u32 %0, %%laneid;" : "=r"(res));
  unsigned int res2;
//  uint comp;
  res = a_g[0];
 // res += 23;
//  res = res > 37 ? 5 : 99;
  asm(
  //".reg .pred %%p<2>;"
  "mov.u32 %0, %1;" 
  //"add.u32 %0, %0, 7;"
  //"mov.u32 %0, %%laneid;"
//  "setp.gt.u32 %%p1, %0, 12;"
 // "@%%p1 mov.u32 %0, 33;"
    : "=r"(res2)
    : "r"(res)
  );
  res2 = a_g[get_global_id(0)] > 0 ? 1 : 0;
  res = ballot(res2) & 0xffffffff;
  //res2 = popcnt(res2);
  b_g[get_global_id(0)] = get_global_id(0) == 31 ? res : res2;
//  if(get_global_id(0) == 0) {
 //    a_g[31] = res;
  //}
}
""").build()
Esempio n. 20
0
            print '---------------------------'

    # Create a context with all the devices
    devices = platforms[0].get_devices()
    context = cl.Context(devices)
    print 'This context is associated with ', len(context.devices), 'devices'

    # Create a queue for transferring data and launching computations.
    # Turn on profiling to allow us to check event times.
    queue = cl.CommandQueue(
        context,
        context.devices[0],
        properties=cl.command_queue_properties.PROFILING_ENABLE)
    print 'The queue is using the device:', queue.device.name

    program = cl.Program(context,
                         open('bilateral.cl').read()).build(options='')

    input_image = np.load('image.npz')['image'].astype(np.float32)
    #input_image = im.imread('img/cat.png').astype(np.float32)
    print "Input image size:", input_image.shape

    # use this input to check correctness of index trick
    '''
    input_image = np.array([[1,1,1,1,1,1,1,1],
                            [2,2,2,2,2,2,2,2], 
                            [3,3,3,3,3,3,3,3],
                            [4,4,4,4,4,4,4,4],
                            [5,5,5,5,5,5,5,5],
                            [6,6,6,6,6,6,6,6],
                            [7,7,7,7,7,7,7,7],
                            [8,8,8,8,8,8,8,8],
####################################
ctx = cl.create_some_context()
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
mf = cl.mem_flags

######################
#CREATING I/O BUFFERS#
######################
inp_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_img)
out_buf = cl.Buffer(ctx, mf.WRITE_ONLY, out_cl.nbytes)

##################
#BUILDING PROGRAM#
##################
prg = cl.Program(ctx, kernel).build()

######################
#CALLING THE FUNCTION#
######################
prg.makeCodeBlocks(queue, out_cl.shape, None, inp_buf, out_buf, input_size)

#########################################
#RETRIEVING THE CODEBOOK FROM THE DEVICE#
#########################################
cl.enqueue_copy(queue, out_cl, out_buf)

##################################
#INITIALIZING OUTPUT FOR ENCODING#
##################################
final_scales = np.zeros((input_size / 4, input_size / 4), dtype=np.float32)
Esempio n. 22
0
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and
    sizes match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print c_decl
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print dtype
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    fields = sorted(six.iteritems(dtype.fields),
                    key=lambda name_dtype_offset: name_dtype_offset[1][1])

    c_fields = []
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % ("\n".join(c_fields), name)

    cdl = _CDeclList(device)
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join("result[%d] = pycl_offsetof(%s, %s);" %
                            (i + 1, name, field_name)
                            for i, (field_name, _) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((uint) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global uint *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(pre_decls=pre_decls,
               my_decl=c_decl,
               my_type=name,
               offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1 + len(fields), np.uint32)
    knl(queue, (1, ), (1, ), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [
                dtype_and_offset[1] for field_name, dtype_and_offset in fields
            ]
        else:
            raise RuntimeError(
                "OpenCL compiler reported offsetof() past sizeof() "
                "for struct layout on '%s'. "
                "This makes no sense, and it's usually indicates a "
                "compiler bug. "
                "Refusing to discover struct layout." % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
            'names':
            [field_name for field_name, (field_dtype, offset) in fields],
            'formats':
            [field_dtype for field_name, (field_dtype, offset) in fields],
            'offsets': [int(x) for x in offsets],
            'itemsize':
            int(size_and_offsets[0]),
        }
        dtype = np.dtype(dtype_arg_dict)
        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1)
            dtype = np.dtype(dtype_arg_dict)
    except NotImplementedError:

        def calc_field_type():
            total_size = 0
            padding_count = 0
            for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                if offset > total_size:
                    padding_count += 1
                    yield ('__pycl_padding%d' % padding_count,
                           'V%d' % offset - total_size)
                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
Esempio n. 23
0
def clbuild(cl_ctx, prg):
    return cl.Program(cl_ctx, prg).build()
Esempio n. 24
0
    def initialize(cls):
        '''
            Compile kernels
        '''
        cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()
        cls.longitudinal_sort_kernel = RadixSort(cl_ctx,
                                                 [VectorArg(cl_ftype, "x"), 
                                                  VectorArg(cl_ftype, "px"),
                                                  VectorArg(cl_ftype, "y"),
                                                  VectorArg(cl_ftype, "py"),
                                                  VectorArg(cl_ftype, "theta"),
                                                  VectorArg(cl_ftype, "gamma"),
                                                  ScalarArg(cl_ftype, "inv_slice_len")],
                                                 key_expr="(int) floor(theta[i]*inv_slice_len)",
                                                 sort_arg_names=["x", "px", "y", "py", "theta", "gamma"],
                                                 key_dtype=np.int32)

        class LongitudinalTraverseScanKernel(GenericScanKernel):
            '''
                Adds a preamble method for the longitudinal traverse sort
            '''
            def __init__(self, *argl, **argd):
                '''
                    Patch argd['preamble']
                '''

                sort_fun = '''
                            int sort_fun(FLOAT_TYPE x, 
                                         FLOAT_TYPE y, 
                                         FLOAT_TYPE theta, 
                                         FLOAT_TYPE inv_slice_len, 
                                         FLOAT_TYPE inv_traverse_len,
                                         int bins) {
                                         
                                         FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
                                         FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
                                         int xbin = (int) floor(xnorm * inv_traverse_len);
                                         int ybin = (int) floor(ynorm * inv_traverse_len);
                                         int zbin = (int) floor(theta*inv_slice_len);

                                         if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins)) {
                                            xbin = 0;
                                            ybin = 0;

                                         }

                                         return xbin+bins*(ybin+bins*zbin);
                            }
                           '''
                
                new_argd = dict(argd)
                new_argd['preamble'] = F(sort_fun + new_argd['preamble'])
                super().__init__(*argl, **new_argd)
        
        cls.longitudinal_traverse_sort_kernel = RadixSort(cl_ctx,
                                                          [VectorArg(cl_ftype, "x"), 
                                                           VectorArg(cl_ftype, "px"),
                                                           VectorArg(cl_ftype, "y"),
                                                           VectorArg(cl_ftype, "py"),
                                                           VectorArg(cl_ftype, "theta"),
                                                           VectorArg(cl_ftype, "gamma"),
                                                           ScalarArg(cl_ftype, "inv_slice_len"),
                                                           ScalarArg(cl_ftype, "inv_traverse_len"),
                                                           ScalarArg(np.int32, "bins")],
                                                           key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
                                                           sort_arg_names=["x", "px", "y", "py", "theta", "gamma"],
                                                           scan_kernel = LongitudinalTraverseScanKernel,
                                                           key_dtype=np.int32)
Esempio n. 25
0
def mercatorToEquirectangular(src, dest, north, south):
    sh, sw = src.shape
    dh, dw = dest.shape

    src = src.reshape(-1)
    dest = dest.reshape(-1)

    northY = math.log(math.tan(math.pi / 4.0 + math.radians(north) / 2.0))
    southY = math.log(math.tan(math.pi / 4.0 + math.radians(south) / 2.0))

    # the kernel function
    srcCode = """
    static float lerp(float a, float b, float mu) {
        return (b - a) * mu + a;
    }

    static float norm(float value, float a, float b) {
        float n = (value - a) / (b - a);
        if (n > 1.0) {
            n = 1.0;
        }
        if (n < 0.0) {
            n = 0.0;
        }
        return n;
    }

    __kernel void doProjection(__global uchar *source, __global uchar *dest){
        int sw = %d;
        int sh = %d;
        int dw = %d;
        int dh = %d;
        float north = %f;
        float south = %f;
        float northY = %f;
        float southY = %f;
        float piq = %f;

        // get dest position
        int x = get_global_id(1);
        int y = get_global_id(0);
        int i = y * dw + x;

        // get normalized position
        float nx = (float) x / (float) (dw-1);
        float ny = (float) y / (float) (dh-1);

        // get lat
        float lat = lerp(north, south, ny);

        // convert lon lat from mercator to equirectangular
        float nmy = ny;
        float my = (float) tan(piq + (float) radians(lat) / (float) 2.0);
        if (my > 0) {
            my = log(my);
            nmy = norm(my, northY, southY);
        }

        // get source position
        int sx = (int) round(nx * (float) (sw-1));
        int sy = (int) round(nmy * (float) (sh-1));
        int j = sy * sw + sx;

        // assign pixel
        dest[i] = source[j];
    }
    """ % (sw, sh, dw, dh, north, south, northY, southY, math.pi / 4.0)

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()
    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        ctx = cl.Context(devices=GPUs)
    else:
        print "Warning: using CPU instead of GPU"
        ctx = cl.Context(CPU)
    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    # Kernel function instantiation
    prg = cl.Program(ctx, srcCode).build()

    bufIn =  cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=src)
    bufOut = cl.Buffer(ctx, mf.WRITE_ONLY, dest.nbytes)
    prg.doProjection(queue, [dh, dw], None , bufIn, bufOut)

    # Copy result
    cl.enqueue_copy(queue, dest, bufOut)

    dest = dest.reshape(dh, dw)
    return dest
Esempio n. 26
0
    nSmp = 1000
    M = np.tile(M, (1, nSmp, 1, 1))
    v = np.ones((nSmp, m*3))
    y = np.zeros((nSmp, m*3))

    # Setup the OpenCL environment.
    platform = cl.get_platforms()[0]

    device = platform.get_devices()[0]

    context = cl.Context([device])


    # Start with the most original one without any optimization.
    kernelsource = open("spMV0.cl").read()
    program = cl.Program(context, kernelsource).build()
    # mmul = program.mmul
    # mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None, None])

    queue = cl.CommandQueue(context)

    # localWorkSize = 256
    localWorkSize = 64
    num_compute_units = device.max_compute_units
    globalWorkSize = 8 * num_compute_units * localWorkSize
    print('num of computing unites {}'.format(num_compute_units))

    mem_flags = cl.mem_flags
    indptr_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf = indptr)
    indices_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf = indices)
    matrix_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf = M)
Esempio n. 27
0
mf = cl.mem_flags

d_pos = cl.array.to_device(queue, pos)
d_preresult = cl.array.empty(queue, (4 * workgroup_size, ),
                             dtype=numpy.float32)
d_minmax = cl.array.empty(queue, (4, ), dtype=numpy.float32)

with open("../openCL/ocl_lut_pixelsplit.cl", "r") as kernelFile:
    kernel_src = kernelFile.read()

compile_options = "-D BINS=%i  -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%e" % \
                (bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)

print(compile_options)

program = cl.Program(ctx, kernel_src).build(options=compile_options)

program.reduce1(queue, (workgroup_size * workgroup_size, ), (workgroup_size, ),
                d_pos.data, numpy.uint32(pos_size), d_preresult.data)

program.reduce2(queue, (workgroup_size, ), (workgroup_size, ),
                d_preresult.data, d_minmax.data)

min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)

print(minmax)
print(d_minmax)
Esempio n. 28
0
   /* Set float data */
   float f = global_id_0 * 10.0f + global_id_1 * 1.0f;
   f += local_id_0 * 0.1f + local_id_1 * 0.01f;

   output[index] = f;
}
'''

# Get device and context, create command queue and program
dev = utility.get_default_device()
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)

# Build program in the specified context using the kernel source code
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=['-Werror'], devices=[dev])
except:
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Create output buffer
out = np.zeros(shape=(4, 6), dtype=np.float32)
buffer_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=out.nbytes)

# Enqueue kernel (with argument specified directly)
global_offset = (3, 5)
global_size = (6, 4)
local_size = (3, 2)
Esempio n. 29
0
def calc_errs(data, mask, W, O, pixel_map, n0, m0, dij_n, ss, fs):
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)
    assert (ss.dtype == np.int)
    assert (fs.dtype == np.int)

    import os
    import pyopencl as cl
    ## Step #1. Obtain an OpenCL platform.
    # with a cpu device
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            platform = p
            device = devices[0]
            break

    ## Step #3. Create a context for the selected device.
    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    kernelsource = os.path.join(here, 'update_pixel_map.cl')
    kernelsource = open(kernelsource).read()
    program = cl.Program(context, kernelsource).build()
    translations_err_cl = program.translations_err

    translations_err_cl.set_scalar_arg_dtypes(8 * [None] + 2 * [np.float32] +
                                              6 * [np.int32])

    # Get the max work group size for the kernel test on our device
    max_comp = device.max_compute_units
    max_size = translations_err_cl.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    #print('maximum workgroup size:', max_size)
    #print('maximum compute units :', max_comp)

    # allocate local memory and dtype conversion
    ############################################
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    ns = np.arange(data.shape[0]).astype(np.int32)

    # outputs:
    dij_nout = dij_n.copy()
    errs = np.empty((len(ss), data.shape[0]), dtype=np.float32)
    out = np.zeros(data.shape[0]).astype(np.float32)

    step = max_comp
    for i in range(len(ss)):
        #for n in tqdm.tqdm(np.arange(ns.shape[0])[::step], desc='updating sample translations'):
        for n in np.arange(ns.shape[0])[::step]:
            nsi = ns[n:n + step:]
            translations_err_cl(queue, (nsi.shape[0], 1), (1, 1), cl.SVM(Win),
                                cl.SVM(data), cl.SVM(Oin), cl.SVM(pixel_mapin),
                                cl.SVM(dij_nin), cl.SVM(maskin), cl.SVM(nsi),
                                cl.SVM(out), n0, m0, data.shape[1],
                                data.shape[2], O.shape[0], O.shape[1], ss[i],
                                fs[i])
            queue.finish()

            errs[i] = out

    return errs
Esempio n. 30
0
def PolHealpixMapper(dx, nside, ext, obspos, nH, Snu, Bx, By, Bz, GPU=0, y_shear=0.0, \
                     maxlos=1e30, minlos=0., p0=0.2, polred=0):
    """
    Usage:
        I, Q, U =  PolHealpixMapper(dx, nside, ext, obspos, nH, Snu, Bx, By, Bz)
    Input:
        dx      =  cell size [pc]
        nside   =  parameter of the resulting Healpix map (with 12*nside*nside pixels)
        ext     =  dust extinction [1/pc/H]
        obspos  =  position of the observer [x,y,z], relative to the centre of the model [pc]
        nH      =  density values [H], grid of [Nx, Ny, Nz] values
        Snu     =  emission/emissivity [MJy/sr/H/pc]
        Bx ...  =  magnetic field values [arbitrary units], [Nx, Ny, Nz] values each
        GPU     =  if ==1, try to use a GPU instead of a CPU (default=0)
        y_shear =  shear in y direction [cells]
        maxlos  =  maximum integration length along the LOS [pc]
        p0      =  maximum polarisation fraction, default value 0.2
        polred  =  (int) if >0, interpret |B| as polarisation fraction; default=0  => (Q,U) calculated for p=100%
    Return:
        I, Q, U, NH =  vectors of Healpix pixel values, for the requested nside, in RING order.
    Note:
        If y_shear==0.0, integration extends to the distance maxlos or to the model boundary, 
        whichever is smaller. If y_shear!=0, integration does not stop at X and Y boundaries but only
        when either MAXLOS or +/- Z boundary is reached.
    """
    NZ, NY, NX = nH.shape
    NPIX = 12 * nside * nside
    platform, device, context, queue, mf = InitCL(GPU)
    LOCAL = [8, 32][GPU > 0]
    GLOBAL = NPIX
    if (GLOBAL % LOCAL != 0): GLOBAL = ((GLOBAL / 32) + 1) * 32
    source = open("kernel_HP_map.c").read()
    OPT         =  \
    " -D NZ=%d -D NY=%d -D NX=%d -D NSIDE=%d -D DX=%.5ef -D MAXLOS=%.4ef -D MINLOS=%.4ef -D POLRED=%d -D p0=%.4ef" % \
    (NZ, NY, NX, nside, dx, maxlos/dx, minlos/dx, polred, p0)  # note -- in kernel [maxlos]=GL, not pc
    program_map = cl.Program(context, source).build(OPT)
    kernel_map = program_map.PolHealpixMapping
    kernel_map.set_scalar_arg_dtypes([
        np.float32, clarray.cltypes.float3, None, None, None, None, None, None,
        np.float32
    ])
    DENS_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    EMIT_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    Bx_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    By_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    Bz_buf = cl.Buffer(context, mf.READ_ONLY, 4 * NX * NY * NZ)
    MAP_buf = cl.Buffer(context, mf.WRITE_ONLY,
                        4 * 4 * NPIX)  # space for (I, Q, U, NH)
    #
    cl.enqueue_copy(queue, DENS_buf, np.asarray(nH, np.float32))
    cl.enqueue_copy(queue, EMIT_buf, np.asarray(Snu, np.float32))
    cl.enqueue_copy(queue, Bx_buf, np.asarray(Bx, np.float32))
    cl.enqueue_copy(queue, By_buf, np.asarray(By, np.float32))
    cl.enqueue_copy(queue, Bz_buf, np.asarray(Bz, np.float32))
    opos = clarray.vec.make_float3(obspos[0], obspos[1], obspos[2])
    extGL = ext * dx  # extinction per grid unit instead of per pc
    kernel_map(queue, [
        GLOBAL,
    ], [
        LOCAL,
    ], extGL, opos, DENS_buf, EMIT_buf, Bx_buf, By_buf, Bz_buf, MAP_buf,
               y_shear)
    MAP = np.zeros(4 * NPIX, np.float32)
    cl.enqueue_copy(queue, MAP, MAP_buf)
    MAP.shape = (NPIX, 4)
    return MAP[:, 0], MAP[:, 1], MAP[:, 2], MAP[:, 3]  # return I, Q, U, NH