Esempio n. 1
0
    for (int j=0; j<4; j++){
        g_grad[i+j] = grad[i+j];
    }
}

"""

#
# OpenCL setup.
#
kernel_code = pyOpenCLNCS.loadNCSKernel() + kernel_code

# Create context and command queue
platform = cl.get_platforms()[0]
devices = platform.get_devices()
context = cl.Context(devices)
queue = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)

# Open program file and build
program = cl.Program(context, kernel_code)
try:
    program.build()
except:
    print("Build log:")
    print(program.get_build_info(devices[0], cl.program_build_info.LOG))
    raise


def test_calc_ll():
    n_pts = 16
def testOpenCLFunction(file_kernel, T, time_step, params, data, platform_id=0):
    file_kernel = file_kernel

    abs_path = os.path.dirname(os.path.realpath(__file__))
    file_test = abs_path + "/prob_test.cl"

    #file_test = "../../HMMLikelihood/prob_test.cl"

    n_kernel_param = np.array(params).shape[0]
    n_data_dim = np.array(data).shape[0]

    defines = "-D T={} -D n_kernel_param={} -D n_data_dim={} ".format(
        T, n_kernel_param, n_data_dim) + "-cl-std=CL1.2"

    # Generate Code
    code_kernel = open(file_kernel, "r").read()
    code_test = open(file_test, "r").read()
    code = code_kernel + "\n" + code_test

    # OpenCL Initialisation
    platform = cl.get_platforms()[platform_id]
    device = platform.get_devices()[0]
    context = cl.Context([device])
    program = cl.Program(context, code).build(defines)
    queue = cl.CommandQueue(context)

    timestep_in = np.array([time_step]).astype(cl.cltypes.int)
    params_in = np.array(params).astype(cl.cltypes.float)
    data_in = np.array(data).astype(cl.cltypes.float)
    res_out = np.zeros(1, np.float32)

    # Buffer creation
    mem_flags = cl.mem_flags
    timestep_buf = cl.Buffer(context,
                             mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
                             hostbuf=timestep_in)
    param_buf = cl.Buffer(context,
                          mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
                          hostbuf=params_in)
    data_buf = cl.Buffer(context,
                         mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
                         hostbuf=data_in)
    res_buf = cl.Buffer(context, mem_flags.READ_WRITE, res_out.nbytes)

    # Assign function execution
    kernel = program.prob_test

    # Set program arguments
    globalItems = (1, )
    localItems = None

    kernel.set_arg(0, timestep_buf)
    kernel.set_arg(1, param_buf)
    kernel.set_arg(2, data_buf)
    kernel.set_arg(3, res_buf)

    completeEvent = cl.enqueue_nd_range_kernel(queue, kernel, globalItems,
                                               localItems)
    completeEvent.wait()

    cl.enqueue_copy(queue, res_out, res_buf, is_blocking=True)
    return res_out
Esempio n. 3
0
############################
############################

## Import packages
import carbontax.optimalcontrol
import carbontax.scenarios
import carbontax.plot

import pyopencl as cl
import numpy as np

## Create OpenCL kernel

platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue

## Results

scenario = carbontax.scenarios.createScenarioFunctions(
    context,
    queue,
    inertia='no',
    evolution='default',
    #extraparams={'maxReduct': 0.05, 'progressRatio':0.7},
    minEmissions=-10)

output = carbontax.optimalcontrol.findOptimalCarbonPath(
    scenarioFunctions=scenario,
    T=86,
Esempio n. 4
0
def main():
    platform_ID = None
    xclbin = None
    globalbuffersize = 1024 * 1024 * 16  #16 MB
    typesize = 512
    threshold = 40000
    expected = np.array([
        [300, 240, 450, 250, 250, 250],  # 32 bits
        [600, 500, 1000, 500, 500, 500],  # 64 bits
        [1100, 900, 1500, 1100, 1100, 1100],  #128 bits
        [1500, 1500, 1900, 2200, 2200, 2200],  #256 bits
        [1900, 2000, 2300, 3800, 3800, 3800]  #512 bits
    ])

    # Process cmd line args
    parser = OptionParser()
    parser.add_option("-k", "--kernel", help="xclbin path")
    parser.add_option("-d", "--device", help="device index")

    (options, args) = parser.parse_args()
    xclbin = options.kernel
    index = options.device

    if xclbin is None:
        print("No xclbin specified\nUsage: -k <path to xclbin>")
        sys.exit(1)

    if index is None:
        index = 0  #get default device

    platforms = cl.get_platforms()
    # get Xilinx platform
    for i in platforms:
        if i.name == "Xilinx":
            platform_ID = platforms.index(i)
            print("\nPlatform Information:")
            print("Platform name:       %s" % platforms[platform_ID].name)
            print("Platform version:    %s" % platforms[platform_ID].version)
            print("Platform profile:    %s" % platforms[platform_ID].profile)
            print("Platform extensions: %s" %
                  platforms[platform_ID].extensions)
            break

    if platform_ID is None:
        #make sure xrt is sourced
        #run clinfo to make sure Xilinx platform is discoverable
        print("ERROR: Plaform not found")
        sys.exit(1)

    # choose device
    devices = platforms[platform_ID].get_devices()
    if int(index) > len(devices) - 1:
        print("\nERROR: Index out of range. %d devices were found" %
              len(devices))
        sys.exit(1)
    else:
        dev = devices[int(index)]
    if "qdma" in str(dev) or "qep" in str(dev):
        threshold = 30000

    if "u2x4" in str(dev) or "U2x4" in str(dev):
        threshold = 10000

    if "gen3x4" in str(dev):
        threshold = 20000

    if "_u25_" in str(dev):  # so that it doesn't set theshold for u250
        threshold = 9000

    ctx = cl.Context(devices=[dev])
    if not ctx:
        print("ERROR: Failed to create context")
        sys.exit(1)

    commands = cl.CommandQueue(
        ctx,
        dev,
        properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    if not commands:
        print("ERROR: Failed to create command queue")
        sys.exit(1)

    print("Loading xclbin")
    with open(xclbin, "rb") as f:
        src = f.read()
    prg = cl.Program(ctx, [dev], [src])

    try:
        prg.build()
    except:
        print("ERROR:")
        print(prg.get_build_info(ctx, cl.program_build_info.LOG))
        raise

    knl1 = prg.bandwidth1
    knl2 = prg.bandwidth2

    #input host and buffer
    lst = [i % 256 for i in range(globalbuffersize)]
    input_host1 = np.array(lst).astype(np.uint8)
    input_host2 = np.array(lst).astype(np.uint8)

    input_buf1 = cl.Buffer(ctx,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=input_host1)
    input_buf2 = cl.Buffer(ctx,
                           cl.mem_flags.READ_WRITE
                           | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=input_host2)

    if input_buf1.int_ptr is None or input_buf2.int_ptr is None:
        print("ERROR: Failed to allocate source buffer")
        sys.exit(1)

    #output host and buffer
    output_host1 = np.empty_like(input_host1, dtype=np.uint8)
    output_host2 = np.empty_like(input_host2, dtype=np.uint8)

    output_buf1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host1.nbytes)
    output_buf2 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host2.nbytes)

    if output_buf1.int_ptr is None or output_buf2.int_ptr is None:
        print("ERROR: Failed to allocate destination buffer")
        sys.exit(1)

    #copy dataset to OpenCL buffer
    globalbuffersizeinbeats = globalbuffersize / (typesize / 8)
    tests = int(math.log(globalbuffersizeinbeats, 2.0)) + 1

    #lists
    dnsduration = []
    dsduration = []
    dbytes = []
    dmbytes = []
    bpersec = []
    mbpersec = []

    #run tests with burst length 1 beat to globalbuffersize
    #double burst length each test
    test = 0
    beats = 16
    throughput = []
    while beats <= 1024:
        print("LOOP PIPELINE %d beats" % beats)

        usduration = 0
        fiveseconds = 5 * 1000000
        reps = 64
        while usduration < fiveseconds:

            start = current_micro_time()
            knl1(commands, (1, ), (1, ), output_buf1, input_buf1,
                 np.uint32(beats), np.uint32(reps))
            knl2(commands, (1, ), (1, ), output_buf2, input_buf2,
                 np.uint32(beats), np.uint32(reps))
            commands.finish()
            end = current_micro_time()

            usduration = end - start

            cl.enqueue_copy(commands, output_host1, output_buf1).wait()
            cl.enqueue_copy(commands, output_host2, output_buf2).wait()

            # need to check, currently fails
            limit = int(beats * (typesize / 8))
            if not np.array_equal(output_host1[:limit], input_host1[:limit]):
                print("ERROR: Failed to copy entries")
                input_buf1.release()
                input_buf2.release()
                output_buf1.release()
                output_buf2.release()
                sys.exit(1)

            if not np.array_equal(output_host2[:limit], input_host2[:limit]):
                print("ERROR: Failed to copy entries")
                input_buf1.release()
                input_buf2.release()
                output_buf1.release()
                output_buf2.release()
                sys.exit(1)

            # print("Reps = %d, Beats = %d, Duration = %lf us" %(reps, beats, usduration)) # for debug

            if usduration < fiveseconds:
                reps = reps * 2

        dnsduration.append(usduration)
        dsduration.append(dnsduration[test] / 1000000)
        dbytes.append(reps * beats * (typesize / 8))
        dmbytes.append(dbytes[test] / (1024 * 1024))
        bpersec.append(2 * dbytes[test] / dsduration[test])
        mbpersec.append(2 * bpersec[test] / (1024 * 1024))
        throughput.append(mbpersec[test])
        print("Test %d, Throughput: %d MB/s" % (test, throughput[test]))
        beats = beats * 4
        test += 1

    #cleanup
    input_buf1.release()
    input_buf2.release()
    output_buf1.release()
    output_buf2.release()
    del ctx

    print("TTTT: %d" % throughput[0])
    print("Maximum throughput: %d MB/s" % max(throughput))
    if max(throughput) < threshold:
        print("ERROR: Throughput is less than expected value of %d GB/sec" %
              (threshold / 1000))
        sys.exit(1)

    print("PASSED")
Esempio n. 5
0
#!/usr/bin/env python

import sys
import pyopencl as cl
import numpy as np
from my_cl_utils import print_device_info, get_optimal_global_work_size

# Platform, Device, Context and Queue
devices = []
platforms = cl.get_platforms()
for platform in platforms:
    devices.extend(platform.get_devices())
#print_device_info(platforms, devices)

device = devices[0]
context = cl.Context((device, ))
queue = cl.CommandQueue(context, device)

# Parameter setup
nx, ny, nz = 240, 256, 256  # 540 MB
#nx, ny, nz = 512, 480, 480		# 3.96 GB
#nx, ny, nz = 256, 480, 960
tmax, tgap = 200, 10

print '(%d, %d, %d)' % (nx, ny, nz),
total_bytes = nx * ny * nz * 4 * 9
if total_bytes / (1024**3) == 0:
    print '%d MB' % (total_bytes / (1024**2))
else:
    print '%1.2f GB' % (float(total_bytes) / (1024**3))
Esempio n. 6
0
sequence_name = seq['name']
frame_num = seq['frame_num']
frame_resolution = seq['resolution']

in_path = './tests/' + sequence_name + '/input/in00%04d.jpg'
gt_path = './tests/' + sequence_name + '/groundtruth/gt00%04d.png'
out_path = './tests/' + sequence_name + '/output/out00%04d.jpg'

# Kernel function
kernel_src = 'mixture-of-gaussian.cl'

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Choose graphic device and create context for it
    ctx = cl.Context([device_choose(pref_platform, pref_device)])
    mf = cl.mem_flags

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)

    #Kernel function instantiation
    kernel = str()
    with open(kernel_src, 'r') as content_file:
        kernel = content_file.read()
    prg = cl.Program(ctx, kernel).build()

    mixture_data_buff = np.zeros(3 * nmixtures * frame_resolution,
                                 dtype=np.float32)
    mixture_data_buff[0:frame_resolution * nmixtures] = 1.0 / nmixtures
    mixture_data_buff[frame_resolution * nmixtures + 1:2 * frame_resolution *
Esempio n. 7
0
def calc_fractal_opencl(chunks, maxiter):
    # List all the stuff in this computer
    platforms = cl.get_platforms()

    for platform in platforms:
        print("Found a device: {}".format(str(platform)))

    # Let's just go with platform zero
    ctx = cl.Context(dev_type=cl.device_type.ALL,
                     properties=[(cl.context_properties.PLATFORM, platforms[0])])

    # Create a command queue on the platform (device = None means OpenCL picks a device for us)
    queue = cl.CommandQueue(ctx, device = None)

    mf = cl.mem_flags

    # This is our OpenCL kernel. It does a single point (OpenCL is responsible for mapping it across the points in a chunk)
    # __kernel decorator just specifies its an opencl kernel
    # Can define multiple kernels in single cl.Program and call them with cl.methodName
    # float2 is a struct containing 2 floats (works for 2,3,4 dims (can use .x, .y, .w, .\))
    # floatN can also be used
    prg = cl.Program(ctx, """
    #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
    __kernel void mandelbrot(__global float2 *q, __global ushort *output, ushort const maxiter)
    {
      int gid = get_global_id(0);

      float cx = q->x;
      float cy = q->y;

      float x = 0.0f;
      float y = 0.0f;
      int its = 0;

      while (((x*x + y*y) < 4.0f) && (its < maxiter)) {
        float xtemp = x*x - y*y + cx;
        y = 2*x*y + cy;
        x = xtemp;
        its++;
      }
    
        // Assume point is not in set if reach maxiter
      if (its == maxiter) {
        output[gid] = 0;
      } else {
        output[gid] = its;
      }
    }
    """).build()

    output_chunks = []
    output_chunks_on_device = []

    chunk_shape = None

    for chunk_input in chunks:
        # Record the shape of input chunks
        chunk_shape = chunk_input.shape

        # These are our buffers to hold data on the device (on the device specified in ctx)
        chunk_input_on_device = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=chunk_input)

        chunk_output_on_device = cl.Buffer(ctx, mf.WRITE_ONLY, int(chunk_input.nbytes / 4))
        # divided by 4 because our inputs are 64 bits but outputs are 16 bits

        # Call the kernel on this chunk
        # Notice we defined our function for a single point, but are passing a chunk
        # OpenCL handles parallelizing (partitioning) depending on context device
        # After none, we're passing params to our kernel
        prg.mandelbrot(queue, chunk_shape, None, chunk_input_on_device, chunk_output_on_device, np.uint16(maxiter))

        # Add the output chunk to our list to keep track of it
        output_chunks_on_device.append(chunk_output_on_device)

    # Wait for all the chunks to be computed
    # In default single treaded mode for queue: chunks run serially (but work inside queue is parallelized)
    # Can use unordered queue to dump as much of the work onto the devices for even more parallelization
    queue.finish()

    for chunk_output_on_device in output_chunks_on_device:
        chunk_output = np.zeros(chunk_shape, dtype=np.uint16)

        # Wait until it is done and pull the data back
        cl.enqueue_copy(queue, chunk_output, chunk_output_on_device).wait()

        # Insert the chunk in our overall output
        output_chunks.append(chunk_output)

    return np.concatenate(output_chunks)
    def __init__(self, kernel_file):

        if self.layers + 1 != len(self.layer_height):
            print("Bad network config.")
            exit()

        print("Running with {} hidden layers and {} layers total.".format(
            self.hidden_layers, self.layers))

        print("OpenCL Version v{}".format(".".join(
            [str(i) for i in cl.get_cl_header_version()])))
        print("Finding platform....")
        platform = self.findPlatform(VENDOR_NAME)
        if not platform:
            print("ERROR: Platform not found for name {0}".format(VENDOR_NAME))
            exit(1)

        print("Getting devices...")
        devices = platform.get_devices(device_type=DEVICE_TYPE)
        if len(devices) < 1:
            print("ERROR: No device found for type {0}.".format(DEVICE_TYPE))
            exit(1)

        devices = [devices[1]]

        self.ctx = cl.Context(devices=devices)

        if DEVICE_TYPE == cl.device_type.ACCELERATOR:
            print("Reading binary...")
            binary = kernel_file.read()

            binaries = [binary] * len(devices)

            print("Building...")
            program = cl.Program(self.ctx, devices, binaries)
        else:
            print("Reading program...")
            binary = kernel_file.read()

            program = cl.Program(self.ctx, binary.decode('utf-8')).build()

        self.kForward = program.forward
        self.kForwardSoftMax = program.forward_softmax
        # self.kBackwardFirstDelta = program.backward_first_delta
        # self.kBackward = program.backward

        self.kForward.set_scalar_arg_dtypes(
            [None, None, None, None, np.int32, np.int32, np.int32, np.int32])
        self.kForwardSoftMax.set_scalar_arg_dtypes([None, np.int32, np.int32])
        # self.kBackwardFirstDelta.set_scalar_arg_dtypes([None, None, None, np.int32, np.int32])
        # self.kBackward.set_scalar_arg_dtypes(
        #    [None, None, None, None, NN_T, NN_T, np.int32, np.int32, np.int32])

        self.queue = cl.CommandQueue(self.ctx)

        print("Loading data...")
        _, (self.x_test, self.y_test) = input_data.load_data()

        self.y_test = self.y_test.reshape((10000, ))
        self.x_test = self.x_test.reshape(10000, self.layer_height[0])
        self.x_test = self.x_test.astype('float32')
        self.x_test /= 255

        self.correct_pred_fpga = 0
        self.wrong_pred_fpga = 0
        self.correct_pred_cpu = 0
        self.wrong_pred_cpu = 0
Esempio n. 9
0
def update_pixel_map_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n, subpixel, subsample, search_window, ss, fs):
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)
    
    ##################################################################
    # OpenCL crap
    ##################################################################
    import os
    import pyopencl as cl
    ## Step #1. Obtain an OpenCL platform.
    # with a cpu device
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            platform = p
            device   = devices[0]
            break
    
    ## Step #3. Create a context for the selected device.
    context = cl.Context([device])
    queue   = cl.CommandQueue(context)
    
    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    kernelsource = os.path.join(here, 'update_pixel_map.cl')
    kernelsource = open(kernelsource).read()
    program     = cl.Program(context, kernelsource).build()
    
    if subpixel:
        update_pixel_map_cl = program.update_pixel_map_subpixel
    else :
        update_pixel_map_cl = program.update_pixel_map
    
    update_pixel_map_cl.set_scalar_arg_dtypes(
            [None, None, None, None, None, None, None, None, None, None,
             np.float32, np.float32, np.float32, np.int32, np.int32, 
             np.int32, np.int32, np.int32, np.int32, np.int32,
             np.int32, np.int32])
    
    # Get the max work group size for the kernel test on our device
    max_comp = device.max_compute_units
    max_size = update_pixel_map_cl.get_work_group_info(
                       cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    #print('maximum workgroup size:', max_size)
    #print('maximum compute units :', max_comp)
    
    # allocate local memory and dtype conversion
    ############################################
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])
    
    # inputs:
    Win         = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin         = O.astype(np.float32)
    dij_nin     = dij_n.astype(np.float32)
    maskin      = mask.astype(np.int32)
    ss          = ss.ravel().astype(np.int32)
    fs          = fs.ravel().astype(np.int32)
    
    ss_min, ss_max = (-(search_window[0]-1)//2, (search_window[0]+1)//2) 
    fs_min, fs_max = (-(search_window[1]-1)//2, (search_window[1]+1)//2) 
    
    print(ss_min, ss_max)
    print(fs_min, fs_max)
    # outputs:
    err_map      = np.zeros(W.shape, dtype=np.float32)
    pixel_mapout = pixel_map.astype(np.float32)
    ##################################################################
    # End crap
    ##################################################################

    # evaluate err_map0
    ssi = ss
    fsi = fs
    update_pixel_map_cl(queue, (1, fsi.shape[0]), (1, 1), cl.SVM(Win), 
                        cl.SVM(data), localmem, cl.SVM(err_map), cl.SVM(Oin), 
                        cl.SVM(pixel_mapout), cl.SVM(dij_nin), cl.SVM(maskin),
                        cl.SVM(ssi), cl.SVM(fsi), n0, m0, subsample, 
                        data.shape[0], data.shape[1], data.shape[2], 
                        O.shape[0], O.shape[1], 0, 1, 0, 1)
    queue.finish()

    pixel_mapout = pixel_map.astype(np.float32)
    err_map0     = err_map.copy()
    
    step = min(100, ss.shape[0])
    it = tqdm.tqdm(np.arange(ss.shape[0])[::step], desc='updating pixel map')
    for i in it:
        ssi = ss[i:i+step:]
        fsi = fs[i:i+step:]
        update_pixel_map_cl(queue, (1, fsi.shape[0]), (1, 1), 
              cl.SVM(Win), 
              cl.SVM(data), 
              localmem, 
              cl.SVM(err_map), 
              cl.SVM(Oin), 
              cl.SVM(pixel_mapout), 
              cl.SVM(dij_nin), 
              cl.SVM(maskin),
              cl.SVM(ssi),
              cl.SVM(fsi),
              n0, m0, subsample,
              data.shape[0], data.shape[1], data.shape[2], 
              O.shape[0], O.shape[1], ss_min, ss_max, fs_min, fs_max)
        queue.finish()
        
        er = np.mean(err_map[err_map>0])
        it.set_description("updating pixel map: {:.2e}".format(er))
        
        #it.set_description("updating pixel map: {:.2e}".format(np.sum(err_map) \
                #                   / np.sum(err_map>0)))
    
    # only return filled values
    out = np.zeros((2,) + ss.shape, dtype=pixel_map.dtype)
    out[0] = pixel_mapout[0][ss, fs]
    out[1] = pixel_mapout[1][ss, fs]
    return out, {'error_map': err_map, 'error': np.sum(err_map)}
Esempio n. 10
0
import numpy, sys, os, pyopencl, random
platform = pyopencl.get_platforms()[0]
mygpu = platform.get_devices(pyopencl.device_type.GPU)[0]
context = pyopencl.Context(devices=[mygpu])
queue = pyopencl.CommandQueue(context)
prog = pyopencl.Program(context, open("func.cl").read()).build(devices=[mygpu])
Esempio n. 11
0
def try_kernel(platform_idx, device_idx):
    platforms = cl.get_platforms()
    device = platforms[platform_idx].get_devices()[device_idx]
    worksize = device.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
    context = cl.Context([device], None, None)
    output_size = 256
    host_out_buffer = bytearray((output_size + 1) * 4)
    cl_out_buffer = cl.Buffer(context,
                              cl.mem_flags.WRITE_ONLY,
                              size=len(host_out_buffer)
                              # hostbuf=host_out_buffer
                              )

    defines = (
        f'-D OUTPUT_SIZE={output_size} -D OUTPUT_MASK={output_size - 1} '
        f'-D WORK_GROUP_SIZE={worksize}')
    if device.extensions.find('cl_amd_media_ops') != -1:
        print('AMD bitalign defined!')
        defines += ' -DBITALIGN'

    kernel_code = pkgutil.get_data('apoclypsebm',
                                   'apoclypse-0.cl').decode('ascii')
    print(f'Building with {defines}')
    program = cl.Program(context, kernel_code).build(defines)

    kernel_bin_file_name = file_safe_sanitize(
        f'{device.platform.name}-{device.platform.version}-{device.name}.elf')
    with open(kernel_bin_file_name, 'wb') as binary:
        binary.write(program.binaries[0])
    print(f'Wrote kernel to {kernel_bin_file_name}')

    kernel = program.search

    frame = 1.0 / DEFAULT_FRAMES
    unit = worksize * 256
    global_threads = unit * 10

    print(
        f'worksize: {worksize},  unit: {unit},  global_threads: {global_threads}'
    )

    kernel.set_arg(20, cl_out_buffer)

    base = 0

    work = None  # TODO
    rate_divisor, hashspace = 1000, 0xFFFFFFFF  # assumes not vectorized
    nonces_left = hashspace

    with open('../example_blochheader_block.txt') as block_file:
        binary_data = unhexlify(block_file.read())[:76]
        # Swap every uint32 for sha256…
    swapped_bin = bytearray()
    for i in range(-1, len(binary_data) - 1, 4):
        new_word = binary_data[i + 4:None if i == -1 else i:-1]
        swapped_bin += new_word
        # print(f'slicing {i + 3}:{i or None}:{-1} gives {new_word.hex()}')
    print(f'Initial: {binary_data.hex()}')
    print(f'SHA2 chunked: {swapped_bin.hex()}')

    midstate_input = list(unpack('<16I', swapped_bin[:64])) + ([0] * 48)
    midstate = sha256(STATE, midstate_input)
    merkle_end = uint32(unpack('<I', swapped_bin[64:68])[0])
    time = uint32(unpack('<I', swapped_bin[68:72])[0])
    difficulty = uint32(unpack('<I', swapped_bin[72:76])[0])

    base = 316141196 - 0  # -10 from the winner

    print(f'')

    state = list(midstate)
    f = [0] * 8
    state2 = partial(state, merkle_end, time, difficulty, f)
    print(f'state2 and f: {state2}, {f}')
    calculateF(state, merkle_end, time, difficulty, f, state2)
    print(f'state and f after fcalc: {state}, {f}')
    kernel.set_arg(0, uint32_as_bytes(state[0]))
    kernel.set_arg(1, uint32_as_bytes(state[1]))
    kernel.set_arg(2, uint32_as_bytes(state[2]))
    kernel.set_arg(3, uint32_as_bytes(state[3]))
    kernel.set_arg(4, uint32_as_bytes(state[4]))
    kernel.set_arg(5, uint32_as_bytes(state[5]))
    kernel.set_arg(6, uint32_as_bytes(state[6]))
    kernel.set_arg(7, uint32_as_bytes(state[7]))

    kernel.set_arg(8, uint32_as_bytes(state2[1]))
    kernel.set_arg(9, uint32_as_bytes(state2[2]))
    kernel.set_arg(10, uint32_as_bytes(state2[3]))
    kernel.set_arg(11, uint32_as_bytes(state2[5]))
    kernel.set_arg(12, uint32_as_bytes(state2[6]))
    kernel.set_arg(13, uint32_as_bytes(state2[7]))

    kernel.set_arg(15, uint32_as_bytes(f[0]))
    kernel.set_arg(16, uint32_as_bytes(f[1]))
    kernel.set_arg(17, uint32_as_bytes(f[2]))
    kernel.set_arg(18, uint32_as_bytes(f[3]))
    kernel.set_arg(19, uint32_as_bytes(f[4]))

    # This part usually done after temperature check:
    print(f'Starting with base {base}')
    kernel.set_arg(14, uint32_as_bytes(base)[::-1])
    cmd_queue = cl.CommandQueue(context)
    cl.enqueue_copy(cmd_queue, cl_out_buffer, host_out_buffer)
    cl.enqueue_nd_range_kernel(cmd_queue, kernel, (global_threads, ),
                               (worksize, ))

    cl.enqueue_copy(cmd_queue, host_out_buffer, cl_out_buffer)
    cmd_queue.finish()
    print(f'Got {len(host_out_buffer)} outputs:')
    print(' '.join([f'{nonce:02x}' for nonce in host_out_buffer]))

    nonces_left -= global_threads
    # threads_run_pace += global_threads
    # threads_run += global_threads
    base = uint32(base + global_threads)
Esempio n. 12
0
    def test_y_pbc_x_exchange(self):
        # instance
        nx, ny, nz = 40, 50, 60
        #nx, ny, nz = 3, 4, 5
        gpu_devices = common_gpu.gpu_device_list(print_info=False)
        context = cl.Context(gpu_devices)

        gpuf = gpu.Fields(context, gpu_devices[0], nx, ny, nz)
        cpuf = cpu.Fields(nx, ny, nz)
        mainf_list = [gpuf, cpuf]
        nodef = NodeFields(mainf_list)
        core = NodeCore(nodef)
        pbc = NodePbc(nodef, 'y')
        exchange = NodeExchange(nodef)

        # generate random source
        ehs_gpu = common_update.generate_random_ehs(nx, ny, nz, nodef.dtype)
        gpuf.set_eh_bufs(*ehs_gpu)
        ehs_gpu_dict = dict(zip(['ex', 'ey', 'ez', 'hx', 'hy', 'hz'], ehs_gpu))

        ehs_cpu = common_update.generate_random_ehs(nx, ny, nz, nodef.dtype)
        cpuf.set_ehs(*ehs_cpu)
        ehs_cpu_dict = dict(zip(['ex', 'ey', 'ez', 'hx', 'hy', 'hz'], ehs_cpu))

        # verify
        for mainf in mainf_list:
            mainf.update_e()
        pbc.update_e()
        exchange.update_e()

        for mainf in mainf_list:
            mainf.update_h()
        pbc.update_h()
        exchange.update_h()

        mainf_list[-1].enqueue_barrier()
        getf0, getf1 = {}, {}

        # x-axis exchange
        getf0['e'] = gpu.GetFields(gpuf, ['ey', 'ez'], (nx - 1, 0, 0),
                                   (nx - 1, ny - 2, nz - 2))
        getf1['e'] = cpu.GetFields(cpuf, ['ey', 'ez'], (0, 0, 0),
                                   (0, ny - 2, nz - 2))

        getf0['h'] = gpu.GetFields(gpuf, ['hy', 'hz'], (nx - 1, 1, 1),
                                   (nx - 1, ny - 1, nz - 1))
        getf1['h'] = cpu.GetFields(cpuf, ['hy', 'hz'], (0, 1, 1),
                                   (0, ny - 1, nz - 1))

        for getf in getf0.values() + getf1.values():
            getf.get_event().wait()

        for eh in ['e', 'h']:
            g0 = getf0[eh].get_fields()
            g1 = getf1[eh].get_fields()
            norm = np.linalg.norm(g0 - g1)
            self.assertEqual(norm, 0,
                             '%g, %s, %s' % (norm, 'x-axis exchange', eh))

        # y-axis pbc gpu
        getf0['e'] = gpu.GetFields(gpuf, ['ex', 'ez'], (0, ny - 1, 0),
                                   (nx - 2, ny - 1, nz - 2))
        getf1['e'] = gpu.GetFields(gpuf, ['ex', 'ez'], (0, 0, 0),
                                   (nx - 2, 0, nz - 2))

        getf0['h'] = gpu.GetFields(gpuf, ['hx', 'hz'], (1, ny - 1, 1),
                                   (nx - 1, ny - 1, nz - 1))
        getf1['h'] = gpu.GetFields(gpuf, ['hx', 'hz'], (1, 0, 1),
                                   (nx - 1, 0, nz - 1))

        for getf in getf0.values() + getf1.values():
            getf.get_event().wait()

        for eh in ['e', 'h']:
            g0 = getf0[eh].get_fields()
            g1 = getf1[eh].get_fields()
            norm = np.linalg.norm(g0 - g1)
            self.assertEqual(norm, 0,
                             '%g, %s, %s' % (norm, 'y-axis pbc gpu', eh))

        # y-axis pbc cpu
        getf0['e'] = cpu.GetFields(cpuf, ['ex', 'ez'], (0, ny - 1, 0),
                                   (nx - 2, ny - 1, nz - 2))
        getf1['e'] = cpu.GetFields(cpuf, ['ex', 'ez'], (0, 0, 0),
                                   (nx - 2, 0, nz - 2))

        getf0['h'] = cpu.GetFields(cpuf, ['hx', 'hz'], (1, ny - 1, 1),
                                   (nx - 1, ny - 1, nz - 1))
        getf1['h'] = cpu.GetFields(cpuf, ['hx', 'hz'], (1, 0, 1),
                                   (nx - 1, 0, nz - 1))

        for getf in getf0.values() + getf1.values():
            getf.get_event().wait()

        for eh in ['e', 'h']:
            g0 = getf0[eh].get_fields()
            g1 = getf1[eh].get_fields()
            norm = np.linalg.norm(g0 - g1)
            self.assertEqual(norm, 0,
                             '%g, %s, %s' % (norm, 'y-axis pbc cpu', eh))
Esempio n. 13
0
import numpy as np
import pyopencl as cl
import numpy.linalg as la

vector_dimension = 100

#vector_a = np.random.rand(vector_dimension).astype(np.float32)
#vector_b = np.random.rand(vector_dimension).astype(np.float32)
vector_a = np.random.randint(100, size=100)
vector_b = np.random.randint(100, size=100)

platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
context = cl.Context([device])
queue = cl.CommandQueue(context)

mf = cl.mem_flags
a_g = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vector_a)
b_g = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=vector_b)

program = cl.Program(context, """
__kernel void vectorSum(__global const int *a_g, __global const int *b_g, __global int *res_g) {
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()

res_g = cl.Buffer(context, mf.WRITE_ONLY, vector_a.nbytes)
program.vectorSum(queue, vector_a.shape, None, a_g, b_g, res_g)

Esempio n. 14
0
    def runTest(self):
        nx, ny, nz = self.args

        # instances
        buffer_dict = {}
        buffer_dict['x+'] = cpu.BufferFields('x+', ny, nz, '', 'single')
        buffer_dict['x-'] = cpu.BufferFields('x-', ny, nz, '', 'single')

        import pyopencl as cl
        from kemp.fdtd3d.util import common_gpu
        from kemp.fdtd3d import gpu
        gpu_devices = common_gpu.gpu_device_list(print_info=False)
        context = cl.Context(gpu_devices)
        mainf_list = [ gpu.Fields(context, gpu_devices[0], nx, ny, nz) ]
        #mainf_list = [ cpu.Fields(nx, ny, nz) ]
        nodef = node.Fields(mainf_list, buffer_dict)

        # generate random source
        dtype = nodef.dtype
        ehs = common_random.generate_ehs(nx, ny, nz, dtype)
        buf_ehs_p = common_random.generate_ehs(3, ny, nz, dtype)
        buf_ehs_m = common_random.generate_ehs(3, ny, nz, dtype)
        nodef.mainf_list[0].set_eh_bufs(*ehs)
        #nodef.mainf_list[0].set_ehs(*ehs)
        nodef.buffer_dict['x+'].set_ehs(*buf_ehs_p)
        nodef.buffer_dict['x-'].set_ehs(*buf_ehs_m)
        node.Core(nodef)

        # allocations for verify
        getf_dict = {'x+': {}, 'x-': {}}
        getf_buf_dict = {'x+': {}, 'x-': {}}

        getf_dict['x+']['e'] = gpu.GetFields(nodef.mainf_list[0], ['ey', 'ez'], (nx-1, 0, 0), (nx-1, ny-1, nz-1))
        getf_dict['x+']['h'] = gpu.GetFields(nodef.mainf_list[0], ['hy', 'hz'], (nx-2, 0, 0), (nx-2, ny-1, nz-1))

        getf_buf_dict['x+']['e'] = cpu.GetFields(nodef.buffer_dict['x+'], ['ey', 'ez'], (1, 0, 0), (1, ny-1, nz-1))
        getf_buf_dict['x+']['h'] = cpu.GetFields(nodef.buffer_dict['x+'], ['hy', 'hz'], (0, 0, 0), (0, ny-1, nz-1))

        getf_dict['x-']['e'] = gpu.GetFields(nodef.mainf_list[0], ['ey', 'ez'], (1, 0, 0), (1, ny-1, nz-1))
        getf_dict['x-']['h'] = gpu.GetFields(nodef.mainf_list[0], ['hy', 'hz'], (0, 0, 0), (0, ny-1, nz-1))

        getf_buf_dict['x-']['e'] = cpu.GetFields(nodef.buffer_dict['x-'], ['ey', 'ez'], (2, 0, 0), (2, ny-1, nz-1))
        getf_buf_dict['x-']['h'] = cpu.GetFields(nodef.buffer_dict['x-'], ['hy', 'hz'], (1, 0, 0), (1, ny-1, nz-1))

        # verify
        nodef.update_e()
        nodef.update_h()
        print 'nodef, instance_list', nodef.instance_list
        print 'mainf_list[0], instance_list', nodef.mainf_list[0].instance_list

        for direction in ['x+', 'x-']:
            for e_or_h in ['e', 'h']:
                getf = getf_dict[direction][e_or_h]
                getf_buf = getf_buf_dict[direction][e_or_h]

                getf.get_event().wait()
                getf_buf.get_event().wait()

                original = getf.get_fields()
                copy = getf_buf.get_fields()
                norm = np.linalg.norm(original - copy)
                self.assertEqual(norm, 0, '%s, %g, %s, %s' % (self.args, norm, direction, e_or_h))
Esempio n. 15
0
def getParticleData(data, p):
    h = p["particles"]
    w = p["points_per_particle"]
    dim = 4  # four points: x, y, alpha, width

    offset = 1.0 - p["animationProgress"]
    tw = p["width"]
    th = p["height"]
    dh = len(data)
    dw = len(data[0])

    result = np.zeros(tw * th, dtype=np.float32)

    # print "%s x %s x %s = %s" % (w, h, dim, len(result))

    fData = np.array(data)
    fData = fData.astype(np.float32)
    fData = fData.reshape(-1)

    # print "%s x %s x 3 = %s" % (dw, dh, len(fData))

    pData = np.array(p["particleProperties"])
    pData = pData.astype(np.float32)
    pData = pData.reshape(-1)

    # print "%s x 3 = %s" % (h, len(pData))

    # the kernel function
    src = """

    static float lerp(float a, float b, float mu) {
        return (b - a) * mu + a;
    }

    static float det(float a0, float a1, float b0, float b1) {
        return a0 * b1 - a1 * b0;
    }

    static float2 lineIntersection(float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3) {
        float xd0 = x0 - x1;
        float xd1 = x2 - x3;
        float yd0 = y0 - y1;
        float yd1 = y2 - y3;

        float div = det(xd0, xd1, yd0, yd1);

        float2 intersection;
        intersection.x = -1.0;
        intersection.y = -1.0;

        if (div != 0.0) {
            float d1 = det(x0, y0, x1, y1);
            float d2 = det(x2, y2, x3, y3);
            intersection.x = det(d1, d2, xd0, xd1) / div;
            intersection.y = det(d1, d2, yd0, yd1) / div;
        }

        return intersection;
    }


    static float norm(float value, float a, float b) {
        float n = (value - a) / (b - a);
        if (n > 1.0) {
            n = 1.0;
        }
        if (n < 0.0) {
            n = 0.0;
        }
        return n;
    }

    static float wrap(float value, float a, float b) {
        if (value < a) {
            value = b - (a - value);
        } else if (value > b) {
            value = a + (value - b);
        }
        return value;
    }

    void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness);
    void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha);

    void drawLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha, int thickness) {
        int dx = abs(x1-x0);
        int dy = abs(y1-y0);

        if (dx==0 && dy==0) {
            return;
        }

        // draw the first line
        drawSingleLine(p, x0, y0, x1, y1, w, h, alpha);

        thickness--;
        if (thickness < 1) return;

        int stepX = 0;
        int stepY = 0;
        if (dx > dy) stepY = 1;
        else stepX = 1;

        // loop through thickness
        int offset = 1;
        for (int i=0; i<thickness; i++) {
            int xd = stepX * offset;
            int yd = stepY * offset;

            drawSingleLine(p, x0+xd, y0+yd, x1+xd, y1+yd, w, h, alpha);

            // alternate above and below
            offset *= -1;
            if (offset > 0) {
                offset++;
            }
        }


    }

    void drawSingleLine(__global float *p, int x0, int y0, int x1, int y1, int w, int h, float alpha) {
        // clamp
        x0 = clamp(x0, 0, w-1);
        x1 = clamp(x1, 0, w-1);
        y0 = clamp(y0, 0, h-1);
        y1 = clamp(y1, 0, h-1);

        int dx = abs(x1-x0);
        int dy = abs(y1-y0);

        if (dx==0 && dy==0) {
            return;
        }

        int sy = 1;
        int sx = 1;
        if (y0>=y1) {
            sy = -1;
        }
        if (x0>=x1) {
            sx = -1;
        }
        int err = dx/2;
        if (dx<=dy) {
            err = -dy/2;
        }
        int e2 = err;

        int x = x0;
        int y = y0;
        for(int i=0; i<w; i++){
            p[y*w+x] = alpha;
            if (x==x1 && y==y1) {
                break;
            }
            e2 = err;
            if (e2 >-dx) {
                err -= dy;
                x += sx;
            }
            if (e2 < dy) {
                err += dx;
                y += sy;
            }
        }
    }

    __kernel void getParticles(__global float *data, __global float *pData, __global float *result){
        int points = %d;
        int dw = %d;
        int dh = %d;
        float tw = %f;
        float th = %f;
        float offset = %f;
        float magMin = %f;
        float magMax = %f;
        float alphaMin = %f;
        float alphaMax = %f;
        float velocityMult = %f;
        float lineWidthMin = %f;
        float lineWidthMax = %f;
        float lineWidthLatMin = %f;
        float lineWidthLatMax = %f;

        // get current position
        int i = get_global_id(0);
        float dx = pData[i*3];
        float dy = pData[i*3+1];
        float doffset = pData[i*3+2];

        // set starting position
        float x = dx * (tw-1);
        float y = dy * (th-1);

        for(int j=0; j<points; j++) {
            // get UV value
            int lon = (int) round(dx * (dw-1));
            int lat = (int) round(dy * (dh-1));
            int dindex = lat * dw * 3 + lon * 3;
            float u = data[dindex+1];
            float v = data[dindex+2];

            // check for invalid values
            if (u >= 999.0 || u <= -999.0) {
                u = 0.0;
            }
            if (v >= 999.0 || v <= -999.0) {
                v = 0.0;
            }

            // calc magnitude
            float mag = sqrt(u * u + v * v);
            mag = norm(mag, magMin, magMax);

            // determine alpha transparency/thickness based on magnitude and offset
            float jp = (float) j / (float) (points-1);
            float progressMultiplier = (jp + offset + doffset) - floor(jp + offset + doffset);
            float alpha = lerp(alphaMin, alphaMax, mag * progressMultiplier);
            float thickness = lerp(lineWidthMin, lineWidthMax, mag * progressMultiplier);

            // adjust thickness based on latitude
            float latMultiplier = (float) abs(lat - (dh/2)) / (float) (dh/2);
            float thicknessMultiplier = lerp(lineWidthLatMin, lineWidthLatMax, latMultiplier);
            thickness *= thicknessMultiplier;
            if (thickness < 1.0) thickness = 1.0;

            float x1 = x + u * velocityMult;
            float y1 = y + (-v) * velocityMult;

            // clamp y
            if (y1 < 0.0) {
                y1 = 0.0;
            }
            if (y1 > (th-1.0)) {
                y1 = th-1.0;
            }

            // check for no movement
            if (x==x1 && y==y1) {
                break;

            // check for invisible line
            } else if (alpha < 1.0) {
                // continue

            // wrap from left to right
            } else if (x1 < 0) {
                float2 intersection = lineIntersection(x, y, x1, y1, (float) 0.0, (float) 0.0, (float) 0.0, th);
                if (intersection.y > 0.0) {
                    drawLine(result, (int) round(x), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                    drawLine(result, (int) round((float) (tw-1.0) + x1), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                }

            // wrap from right to left
            } else if (x1 > tw-1.0) {
                float2 intersection = lineIntersection(x, y, x1, y1, (float) (tw-1.0), (float) 0.0, (float) (tw-1.0), th);
                if (intersection.y > 0.0) {
                    drawLine(result, (int) round(x), (int) round(y), (int) (tw-1.0), (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                    drawLine(result, (int) round((float) x1 - (float)(tw-1.0)), (int) round(y), 0, (int) intersection.y, (int) tw, (int) th, round(alpha), (int) thickness);
                }

            // draw it normally
            } else {
                drawLine(result, (int) round(x), (int) round(y), (int) round(x1), (int) round(y1), (int) tw, (int) th, round(alpha), (int) thickness);
            }

            // wrap x
            x1 = wrap(x1, 0.0, tw-1);
            dx = x1 / tw;
            dy = y1 / th;

            x = x1;
            y = y1;
        }
    }
    """ % (w, dw, dh, tw, th, offset, p["mag_range"][0], p["mag_range"][1],
           p["alpha_range"][0], p["alpha_range"][1], p["velocity_multiplier"],
           p["linewidth_range"][0], p["linewidth_range"][1],
           p["linewidth_lat_range"][0], p["linewidth_lat_range"][1])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        # print "Using GPU"
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=fData)
    inPData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pData)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    prg.getParticles(queue, (h, ), None, inData, inPData, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape((th, tw))
    result = result.astype(np.uint8)

    return result
Esempio n. 16
0
def quadratic_refinement_1d_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n):
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)
    
    import os
    import pyopencl as cl
    ## Step #1. Obtain an OpenCL platform.
    # with a cpu device
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            platform = p
            device   = devices[0]
            break
    
    ## Step #3. Create a context for the selected device.
    context = cl.Context([device])
    queue   = cl.CommandQueue(context)
    
    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    kernelsource = os.path.join(here, 'update_pixel_map.cl')
    kernelsource = open(kernelsource).read()
    program     = cl.Program(context, kernelsource).build()
    update_pixel_map_cl = program.pixel_map_err
    
    update_pixel_map_cl.set_scalar_arg_dtypes(
                        8*[None] + 2*[np.float32] + 7*[np.int32])
    
    # Get the max work group size for the kernel test on our device
    max_comp = device.max_compute_units
    max_size = update_pixel_map_cl.get_work_group_info(
                       cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    #print('maximum workgroup size:', max_size)
    #print('maximum compute units :', max_comp)
    
    # allocate local memory and dtype conversion
    ############################################
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])
    
    # inputs:
    Win         = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin         = O.astype(np.float32)
    dij_nin     = dij_n.astype(np.float32)
    maskin      = mask.astype(np.int32)
    
    # outputs:
    err_map      = np.empty(W.shape, dtype=np.float32)
    pixel_shift  = np.zeros(pixel_map.shape, dtype=np.float32)
    err_quad     = np.empty((3,) + W.shape, dtype=np.float32)
    out          = pixel_map.copy()
    
    import time
    d0 = time.time()
    
    # qudratic fit refinement
    pixel_shift.fill(0.)
    
    A = []
    if data.shape[1] == 1:
        ss_shifts = [0]
    else :
        ss_shifts = [-1, 0, 1]

    if data.shape[2] == 1:
        fs_shifts = [0]
    else :
        fs_shifts = [-1, 0, 1]
    
    for ss_shift in ss_shifts:
        for fs_shift in fs_shifts:
            err_map.fill(9999)
            update_pixel_map_cl( queue, W.shape, (1, 1), 
                  cl.SVM(Win), 
                  cl.SVM(data), 
                  localmem, 
                  cl.SVM(err_map), 
                  cl.SVM(Oin), 
                  cl.SVM(pixel_mapin), 
                  cl.SVM(dij_nin), 
                  cl.SVM(maskin),
                  n0, m0, 
                  data.shape[0], data.shape[1], data.shape[2], 
                  O.shape[0], O.shape[1], ss_shift, fs_shift)
            queue.finish()
            
            if data.shape[1] == 1 :
                err_quad[fs_shift+1, :, :] = err_map
                A.append([fs_shift**2, fs_shift, 1])
            else :
                err_quad[ss_shift+1, :, :] = err_map
                A.append([ss_shift**2, ss_shift, 1])
    
    # now we have 3 equations and 3 unknowns
    # a x^2 + b x + c = err_i
    B = np.linalg.pinv(A)
    C = np.dot(B, np.transpose(err_quad, (1, 0, 2)))
    
    # minima is defined by
    # 2 a x + b = 0
    # x = -b / 2a
    # where C = [a, b, c]
    #           [0, 1, 2]
    det = 2*C[0]

    # make sure all sampled shifts have a valid error
    m    = np.all(err_quad!=9999, axis=0)
    # make sure the determinant is non zero
    m    = m * (det != 0)

    if data.shape[1] == 1 :
        pixel_shift[1][m] = (-C[1])[m] / det[m]
        #print(pixel_shift[1][m])
    elif data.shape[2] == 1 :
        pixel_shift[0][m] = (-C[1])[m] / det[m]
        #print(pixel_shift[0][m])

    # now only update pixels for which x**2 < 3**2
    m = m * (np.sum(pixel_shift**2, axis=0) < 9)
    
    out[0][m] = out[0][m] + pixel_shift[0][m]
    out[1][m] = out[1][m] + pixel_shift[1][m]
      
    error = np.sum(np.min(err_quad, axis=0))
    return out, {'pixel_shift': pixel_shift, 'error': error, 'err_quad': err_quad}
Esempio n. 17
0
def getTemperatureImage(data, p):
    tRange = p["temperature_range"]
    gradient = p["gradient"]

    dataG = np.array(gradient)
    dataG = dataG.astype(np.float32)

    shape = data.shape
    h, w, dim = shape

    data = data.reshape(-1)
    dataG = dataG.reshape(-1)

    # the kernel function
    src = """
    __kernel void lerpImage(__global float *d, __global float *grad, __global uchar *result){
        int w = %d;
        int dim = %d;
        int gradLen = %d;
        float minValue = %f;
        float maxValue = %f;

        // get current position
        int posx = get_global_id(1);
        int posy = get_global_id(0);

        // get index
        int i = posy * w * dim + posx * dim;
        float temperature = d[i];
        int r = 45;
        int g = 50;
        int b = 55;

        // assume large values are invalid
        if (temperature > -99.0 && temperature < 99.0) {
            // normalize the temperature
            float norm = (temperature - minValue) / (maxValue - minValue);
            // clamp
            if (norm > 1.0) {
                norm = 1.0;
            }
            if (norm < 0.0) {
                norm = 0.0;
            }
            // get color from gradient
            int gradientIndex = (int) round(norm * (gradLen-1));
            gradientIndex = gradientIndex * 3;
            r = (int) round(grad[gradientIndex] * 255);
            g = (int) round(grad[gradientIndex+1] * 255);
            b = (int) round(grad[gradientIndex+2] * 255);
        }

        // set the color
        result[i] = r;
        result[i+1] = g;
        result[i+2] = b;
    }
    """ % (w, dim, len(gradient), tRange[0], tRange[1])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        # print "Using GPU"
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inData = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    inG = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataG)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, (data.astype(np.uint8)).nbytes)

    prg.lerpImage(queue, [h, w], None, inData, inG, outResult)

    # Copy result
    result = np.empty_like(data)
    result = result.astype(np.uint8)
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape(shape)
    imOut = Image.fromarray(result, mode="RGB")
    return imOut
    def __init__(self, features, reference, nr_features, nr_nodes):
        self.nr_points = len(features)
        self.features_np = np.asarray(features, dtype=np.float32)
        self.reference_np = np.asarray(reference, dtype=np.float32)
        self.result_np = np.empty(self.nr_points, dtype=np.float32)

        #self.ctx = cl.create_some_context(interactive=True, answers=None, cache_dir=None)
        platform = cl.get_platforms()[0]    # Select the first platform [0]
        device = platform.get_devices()[0]  # Select the first device on this platform [0]
        self.ctx = cl.Context([device])      # Create a context with your device
        
        self.queue = cl.CommandQueue(self.ctx)
        self.mf = cl.mem_flags

        self.features_g = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.features_np)
        self.reference_g = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.reference_np)
        self.result_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, self.result_np.nbytes)

        self.program_fitness = cl.Program(self.ctx, """
        __kernel void calculate_fitness(
            __global const float *r_g, __global const float *f_g, __global float *res_g, __global int *program)
        {
          int nr_features = """ + str(nr_features) + """;
          int nr_nodes = """ + str(nr_nodes) + """;
          int gid = get_global_id(0);
          int offset = gid * """ + str(nr_features) + """;
          float inputs[""" + str(nr_features + nr_nodes) + """] ;
          for (int i = 0; i < nr_features; i++) {
              inputs[i] = f_g[offset + i];
          }
          for (int i = 0; i < nr_nodes; i++) {
              int id1 = program[i * 3];
              int id2 = program[i * 3 + 1];
              int op  = program[i * 3 + 2];
              float i1 = inputs[id1];
              float i2 = inputs[id2];
              if(op == 0) {
                  inputs[i + nr_features] = i1 + i2;
              } else if(op == 1) {
                  inputs[i + nr_features] = i1 - i2;
              } else if(op == 2) {
                  inputs[i + nr_features] = i1 * i2;
              } else if(op == 3) {
                float safe_offset = 0;//(i2 > 0) ? FLT_EPSILON : -FLT_EPSILON;
                inputs[i + nr_features] = i1 / (i2 + safe_offset);    
              } else if (op == 4) {
               inputs[i + nr_features] = tanh(i1);
              } else if(op == 5) {
               inputs[i + nr_features] = cos(i1);
              } else if (op == 6) {
               inputs[i + nr_features] = tan(i1);
              } else if(op == 7) {
               inputs[i + nr_features] = cosh(i1);
              } else if(op == 8) {
               inputs[i + nr_features] = M_PI;
              } else if(op == 9) {
               inputs[i + nr_features] = M_E;
              } else if(op == 10) {
               inputs[i + nr_features] = pow(i1,i2);
              } else if (op == 11) {
               inputs[i + nr_features] = acos(i1);
              } else if(op == 12) {
               inputs[i + nr_features] = atan(i1);
              } else if (op == 13) {
               inputs[i + nr_features] = acosh(i1);
              } else if(op == 14) {
               inputs[i + nr_features] = atanh(i1);
              } else if(op == 15) {
               inputs[i + nr_features] = sqrt(i1);
              } else if(op == 16) {
               inputs[i + nr_features] = 1;
              } else if(op == 17) {
               inputs[i + nr_features] = log(i1);
              }
                           
              
              
          }
          float result1= inputs[nr_features + nr_nodes -1] - r_g[2 * gid];
          float result2= inputs[nr_features + nr_nodes -2] - r_g[2 * gid + 1];
       //   result1 /= sqrt(r_g[2 * gid] * r_g[2 * gid] + 0.00001);
          float result = sqrt(result1 * result1 + result2 * result2);
       //   result2 /= sqrt(r_g[2 * gid + 1] * r_g[2 * gid + 1] + 0.00001);
         //     res_g[gid]  = 1 / (result + 0.2) - result;
          res_g[gid] = -result;
        }
        """).build()

        self.program_predict = cl.Program(self.ctx, """
        __kernel void predict(
           __global const float *f_g, __global float *result_predict_g, __global int *program)
        {
          int nr_features = """ + str(nr_features) + """;
          int nr_nodes = """ + str(nr_nodes) + """;
          int gid = get_global_id(0);
          int offset = gid * """ + str(nr_features) + """;
          float inputs[""" + str(nr_features + nr_nodes) + """] ;
          for (int i = 0; i < nr_features; i++) {
              inputs[i] = f_g[offset + i];
          }
          
         // if (gid == 0) {

          for (int i = 0; i < nr_nodes; i++) {
         // if (i == 0) {
              int id1 = program[i * 3];
              int id2 = program[i * 3 + 1];
              int op  = program[i * 3 + 2];
              //          result_predict_g[0] = id1;
              //        result_predict_g[1] = id2;
              //         result_predict_g[2] = op;
              float i1 = inputs[id1];
              float i2 = inputs[id2];
             // result_predict_g[3] = i1;
             // result_predict_g[4] = i2;
                           if(op == 0) {
                  inputs[i + nr_features] = i1 + i2;
              } else if(op == 1) {
                  inputs[i + nr_features] = i1 - i2;
              } else if(op == 2) {
                  inputs[i + nr_features] = i1 * i2;
              } else if(op == 3) {
                float safe_offset = 0;//(i2 > 0) ? FLT_EPSILON : -FLT_EPSILON;
                inputs[i + nr_features] = i1 / (i2 + safe_offset);    
              } else if (op == 4) {
               inputs[i + nr_features] = tanh(i1);
              } else if(op == 5) {
               inputs[i + nr_features] = cos(i1);
              } else if (op == 6) {
               inputs[i + nr_features] = tan(i1);
              } else if(op == 7) {
               inputs[i + nr_features] = cosh(i1);
              } else if(op == 8) {
               inputs[i + nr_features] = M_PI;
              } else if(op == 9) {
               inputs[i + nr_features] = M_E;
              } else if(op == 10) {
               inputs[i + nr_features] = pow(i1,i2);
              } else if (op == 11) {
               inputs[i + nr_features] = acos(i1);
              } else if(op == 12) {
               inputs[i + nr_features] = atan(i1);
              } else if (op == 13) {
               inputs[i + nr_features] = acosh(i1);
              } else if(op == 14) {
               inputs[i + nr_features] = atanh(i1);
              } else if(op == 15) {
               inputs[i + nr_features] = sqrt(i1);
              } else if(op == 16) {
               inputs[i + nr_features] = 1;
              } else if(op == 17) {
               inputs[i + nr_features] = log(i1);
              }
              
           //   }
          }
          result_predict_g[gid * 2] =  inputs[nr_features + nr_nodes -1];
          result_predict_g[gid * 2 + 1] = inputs[nr_features + nr_nodes -2];
         // }
        }
        """).build()
Esempio n. 19
0
def select_device():
    global context, selected_device,selected_platform,selected_device_max_size,AMD_WARNING_SHOWN
    if context is not None:
        return
    log_version_info()
    log_platforms_info()
    #try to choose a platform and device using *our* heuristics / env options:
    options = {}
    log("select_device() environment preferred DEVICE_NAME=%s, DEVICE_TYPE=%s, DEVICE_PLATFORM=%s", PREFERRED_DEVICE_NAME, PREFERRED_DEVICE_TYPE, PREFERRED_DEVICE_PLATFORM)
    for platform in opencl_platforms:
        log("evaluating platform=%s", platform.name)
        if platform.name.startswith("AMD") and not AMD_WARNING_SHOWN:
            log.warn("Warning: the AMD OpenCL is loaded, it is known to interfere with signal delivery!")
            log.warn(" please consider disabling OpenCL or removing the AMD icd")
            AMD_WARNING_SHOWN = True
        devices = platform.get_devices()
        is_cuda = platform.name.find("CUDA")>=0
        for d in devices:
            if d.available and d.compiler_available and d.get_info(pyopencl.device_info.IMAGE_SUPPORT):
                if not is_supported(platform.name) and (len(PREFERRED_DEVICE_PLATFORM)==0 or str(platform.name).find(PREFERRED_DEVICE_PLATFORM)<0):
                    log("ignoring unsupported platform/device: %s / %s", platform.name, d.name)
                    continue
                dtype = device_type(d)
                log("evaluating device type=%s, name=%s", dtype, d.name)
                if is_cuda:
                    score = 0
                elif dtype==PREFERRED_DEVICE_TYPE:
                    score = 40
                else:
                    score = 10

                if len(PREFERRED_DEVICE_NAME)>0 and d.name.find(PREFERRED_DEVICE_NAME)>=0:
                    score += 50
                if len(PREFERRED_DEVICE_PLATFORM)>0 and str(platform.name).find(PREFERRED_DEVICE_PLATFORM)>=0:
                    score += 50

                #Intel SDK does not work (well?) on AMD CPUs
                #and CUDA has problems doing YUV to RGB..
                if platform.name.startswith("Intel"):
                    if d.name.find("AMD")>=0 or is_cuda:
                        score = max(0, score - 20)
                    elif d.name.find("Intel")>=0:
                        score += 10

                options.setdefault(score, []).append((d, platform))
    log("best device/platform option%s: %s", engs(options), options)
    for score in reversed(sorted(options.keys())):
        for d, p in options.get(score):
            try:
                log("trying platform: %s", platform_info(p))
                log("with %s device: %s", device_type(d), device_info(d))
                context = pyopencl.Context([d])
                selected_device_max_size = d.get_info(pyopencl.device_info.IMAGE2D_MAX_WIDTH), d.get_info(pyopencl.device_info.IMAGE2D_MAX_HEIGHT)
                selected_platform = p
                selected_device = d
                log.info(" using platform: %s", platform_info(selected_platform))
                log_device_info(selected_device)
                #save device costs:
                global selected_device_cpu_cost, selected_device_gpu_cost, selected_device_setup_cost
                if device_type(d)=="GPU":
                    selected_device_cpu_cost = 0
                    selected_device_gpu_cost = 50
                    selected_device_setup_cost = 40
                else:
                    selected_device_cpu_cost = 100
                    selected_device_gpu_cost = 0
                    selected_device_setup_cost = 20
                log("device is a %s, using CPU cost=%s, GPU cost=%s", device_type(d), selected_device_cpu_cost, selected_device_gpu_cost)
                log(" max image 2d size: %s", selected_device_max_size)
                return
            except Exception as e:
                log.warn(" failed to use %s", platform_info(p))
                log.warn(" with %s device %s", device_type(d), device_info(d))
                log.warn(" Error: %s", e)
    #fallback to pyopencl auto mode:
    log.warn("OpenCL Error: failed to find a working platform and device combination... trying with pyopencl's 'create_some_context'")
    context = pyopencl.create_some_context(interactive=False)
    devices = context.get_info(pyopencl.context_info.DEVICES)
    log.info("chosen context has %s device%s:", len(devices), engs(devices))
    for d in devices:
        log_device_info(d)
    assert len(devices)==1, "we only handle a single device at a time, sorry!"
    selected_device = devices[0]
    assert context is not None and selected_device is not None
Esempio n. 20
0
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and
    sizes match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print c_decl
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print dtype
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    fields = sorted(six.iteritems(dtype.fields),
                    key=lambda name_dtype_offset: name_dtype_offset[1][1])

    c_fields = []
    for field_name, (field_dtype, offset) in fields:
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % ("\n".join(c_fields), name)

    cdl = _CDeclList(device)
    for field_name, (field_dtype, offset) in fields:
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join(
        "result[%d] = pycl_offsetof(%s, %s);" % (i + 1, name, field_name)
        for i, (field_name, (field_dtype, offset)) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((size_t) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global size_t *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(pre_decls=pre_decls,
               my_decl=c_decl,
               my_type=name,
               offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1 + len(fields), np.uintp)
    knl(queue, (1, ), (1, ), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [offset for field_name, (field_dtype, offset) in fields]
        else:
            raise RuntimeError("cannot discover struct layout on '%s'" %
                               device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
            'names':
            [field_name for field_name, (field_dtype, offset) in fields],
            'formats':
            [field_dtype for field_name, (field_dtype, offset) in fields],
            'offsets': [int(x) for x in offsets],
            'itemsize':
            int(size_and_offsets[0]),
        }
        dtype = np.dtype(dtype_arg_dict)
        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1)
            dtype = np.dtype(dtype_arg_dict)
    except NotImplementedError:

        def calc_field_type():
            total_size = 0
            padding_count = 0
            for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                if offset > total_size:
                    padding_count += 1
                    yield ('__pycl_padding%d' % padding_count,
                           'V%d' % offset - total_size)
                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
Esempio n. 21
0
import pyopencl as cl
import numpy as np
import logging
from scipy.misc import *

# Get platforms, both CPU and GPU
plat = cl.get_platforms()
CPU = plat[0].get_devices()
try:
    GPU = plat[1].get_devices()
except IndexError:
    GPU = "none"

#Create context for GPU/CPU
if GPU != "none":
    ctx = cl.Context(GPU)
else:
    ctx = cl.Context(CPU)

# Create queue for each kernel execution
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags

#Test sequence constants
frame_num = 1700
rel_path = './input/in00%04d.jpg'
dest_path = './output/out00%04d.jpg'

# Kernel function
kernel_src = 'median.cl'
Esempio n. 22
0
def make_pixel_map_err(data, mask, W, O, pixel_map, n0, m0, dij_n, roi, search_window=20, grid=[20, 20]):
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)
    
    import time
    t0 = time.time()
    ##################################################################
    # OpenCL crap
    ##################################################################
    import os
    import pyopencl as cl
    ## Step #1. Obtain an OpenCL platform.
    # with a cpu device
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            platform = p
            device   = devices[0]
            break
    
    ## Step #3. Create a context for the selected device.
    context = cl.Context([device])
    queue   = cl.CommandQueue(context)
    
    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    kernelsource = os.path.join(here, 'update_pixel_map.cl')
    kernelsource = open(kernelsource).read()
    program     = cl.Program(context, kernelsource).build()
    make_error_map_subpixel = program.make_error_map_subpixel

    make_error_map_subpixel.set_scalar_arg_dtypes(
            [None, None, None, None, None, None, None, None, None,
             np.float32, np.float32, 
             np.int32, np.int32, np.int32, np.int32, 
             np.int32, np.int32, np.int32, np.int32, 
             np.int32, np.int32])
    
    # Get the max work group size for the kernel test on our device
    max_comp = device.max_compute_units
    max_size = make_error_map_subpixel.get_work_group_info(
                       cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    #print('maximum workgroup size:', max_size)
    #print('maximum compute units :', max_comp)
    
    # allocate local memory and dtype conversion
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])
    
    # inputs:
    Win         = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin         = O.astype(np.float32)
    dij_nin     = dij_n.astype(np.float32)
    maskin      = mask.astype(np.int32)
    
    # outputs:
    err_map      = np.zeros((grid[0]*grid[1], search_window**2), dtype=np.float32)
    pixel_mapout = pixel_map.astype(np.float32)
    ##################################################################
    
    if type(search_window) is int :
        s_ss = search_window
        s_fs = search_window
    else :
        s_ss, s_fs = search_window
    
    ss_min, ss_max = (-(s_ss-1)//2, (s_ss+1)//2) 
    fs_min, fs_max = (-(s_fs-1)//2, (s_fs+1)//2) 
    
    # list the pixels for which to calculate the error grid
    ijs = []
    for i in np.linspace(roi[0], roi[1]-1, grid[0]):
        for j in np.linspace(roi[2], roi[3]-1, grid[1]):
            ijs.append([round(i), round(j)])
               
    ijs = np.array(ijs).astype(np.int32)
    
    for i in tqdm.trange(1, desc='calculating pixel map shift errors'):
        make_error_map_subpixel(queue, (1, ijs.shape[0]), (1, 1), 
              cl.SVM(Win), 
              cl.SVM(data), 
              localmem, 
              cl.SVM(err_map), 
              cl.SVM(Oin), 
              cl.SVM(pixel_mapout), 
              cl.SVM(dij_nin), 
              cl.SVM(maskin),
              cl.SVM(ijs),
              n0, m0, ijs.shape[0],
              data.shape[0], data.shape[1], data.shape[2], 
              O.shape[0], O.shape[1], ss_min, ss_max, fs_min, fs_max)
         
        queue.finish()
    t1 = time.time()
    t = t1-t0
    
    res = make_pixel_map_err_report(ijs, err_map, mask, search_window, roi, t)
    
    return ijs, err_map, res
Esempio n. 23
0
import pyopencl as cl
import numpy as np
import sys

platforms = cl.get_platforms()
platform = platforms[0]
devs = platform.get_devices(cl.device_type.GPU)
dev = devs[0]
mf = cl.mem_flags
ctx = cl.Context([dev])
queue = cl.CommandQueue(ctx, dev)

a = np.arange(24).astype(np.int32).reshape(3, 4, 2)
b = np.zeros(2).astype(np.int32)
b1 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
b2 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

prog = cl.Program(
    ctx, """
#define GL_ID (int4)(get_global_id(1), get_global_id(0), get_global_id(2), 0)

__kernel void Buffer(
    __global int *ary)
{

    int4 id = GL_ID;
    int i = id.z * 12 + id.y * 4 + id.x;
    ary[i] = ary[i] * 10;
    printf("%d %d %d: %d\\n", id.x, id.y, id.z, ary[i]);

}
Esempio n. 24
0
def addParticlesToImage(base, temperature, particles, p):
    basePx = np.array(base)
    basePx = basePx.astype(np.uint8)

    tempPx = np.array(temperature)
    tempPx = tempPx.astype(np.uint8)

    shape = basePx.shape
    h, w, dim = shape

    basePx = basePx.reshape(-1)
    tempPx = tempPx.reshape(-1)
    particles = particles.reshape(-1)

    # the kernel function
    src = """
    __kernel void addParticles(__global uchar *base, __global uchar *colors, __global uchar *particles, __global uchar *result){
        int w = %d;
        int dim = %d;
        float power = 1.0 - %f; // lower number = more visible lines

        int posx = get_global_id(1);
        int posy = get_global_id(0);
        int i = posy * w * dim + posx * dim;
        int j = posy * w + posx;

        float alpha = (float) particles[j] / 255.0;
        int r = colors[i];
        int g = colors[i+1];
        int b = colors[i+2];
        int baseR = base[i];
        int baseG = base[i+1];
        int baseB = base[i+2];

        // temp hack, convert to grayscale
        //float count = 3.0;
        //int baseAvg = (int) round((float) ((float) baseR + (float) baseG + (float) baseB) / count);
        //baseR = baseAvg;
        //baseG = baseAvg;
        //baseB = baseAvg;

        if (alpha > 0) {
            alpha = pow(alpha*alpha + alpha*alpha, power);
            if (alpha > 1.0) {
                alpha = 1.0;
            }
            //r = (int) round((r * alpha));
            //g = (int) round((g * alpha));
            //b = (int) round((b * alpha));
            float inv = 1.0 - alpha;
            r = (int) round(((float) r * alpha) + ((float) baseR * inv));
            g = (int) round(((float) g * alpha) + ((float) baseG * inv));
            b = (int) round(((float) b * alpha) + ((float) baseB * inv));
        } else {
            r = baseR;
            g = baseG;
            b = baseB;
        }

        result[i] = r;
        result[i+1] = g;
        result[i+2] = b;
    }
    """ % (w, dim, p["line_visibility"])

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=basePx)
    inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=tempPx)
    inC = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=particles)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, basePx.nbytes)

    prg.addParticles(queue, [h, w], None, inA, inB, inC, outResult)

    # Copy result
    result = np.empty_like(basePx)
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape(shape)
    return result
Esempio n. 25
0
'''
Listing 3.3: Copying and mapping buffer objects
'''

import pyopencl as cl
import numpy as np
import utility

kernel_src = '''
__kernel void blank(__global float *a, __global float *b) {
}
'''

# Get device and context, create command queue and program
dev = utility.get_default_device()
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)
prog = cl.Program(context, kernel_src)

try:
    prog.build(options=['-Werror'], devices=[dev])
except:
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))

data_one = np.arange(start=0, stop=100, step=1, dtype=np.float32)
data_two = -np.arange(start=0, stop=100, step=1, dtype=np.float32)

# Create buffers
flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
buffer_one = cl.Buffer(context, flags, hostbuf=data_one)
Esempio n. 26
0
def lerpData(dataA, dataB, mu, offset=0):
    dataLen = len(dataA)
    if dataLen != len(dataB):
        print("Warning: data length mismatch")
    shape = (len(dataA[0]), len(dataA[0][0]), 3)
    h, w, dim = shape
    result = np.empty(h * w * dim, dtype=np.float32)

    # read data as floats
    dataA = np.array(dataA)
    dataA = dataA.astype(np.float32)
    dataB = np.array(dataB)
    dataB = dataB.astype(np.float32)

    # convert to 1-dimension
    dataA = dataA.reshape(-1)
    dataB = dataB.reshape(-1)

    # the kernel function
    src = """

    static bool isValid(float value) {
        return (value > -999.0 && value < 999.0);
    }

    __kernel void lerpData(__global float *a, __global float *b, __global float *result){
        int dlen = %d;
        int h = %d;
        int w = %d;
        int dim = %d;
        float mu = %f;
        int offsetX = %d;

        // get current position
        int posx = get_global_id(1);
        int posy = get_global_id(0);

        // convert position from 0,360 to -180,180
        int posxOffset = posx;
        if (offsetX > 0 || offsetX < 0) {
            if (posx < offsetX) {
                posxOffset = posxOffset + offsetX;
            } else {
                posxOffset = posxOffset - offsetX;
            }
        }

        // get indices
        int j = posy * w * dim + posx * dim;

        // get the mean values for a and b datasets
        float a1 = 0;
        float a2 = 0;
        float a3 = 0;
        float b1 = 0;
        float b2 = 0;
        float b3 = 0;
        float a1count = 0;
        float a2count = 0;
        float a3count = 0;
        float b1count = 0;
        float b2count = 0;
        float b3count = 0;
        for(int k=0; k<dlen; k++) {
            int i = k * h * w * dim + posy * w * dim + posxOffset * dim;
            if (isValid(a[i])) {
                a1 = a1 + a[i];
                a1count = a1count + 1.0;
            }
            if (isValid(a[i+1])) {
                a2 = a2 + a[i+1];
                a2count = a2count + 1.0;
            }
            if (isValid(a[i+2])) {
                a3 = a3 + a[i+2];
                a3count = a3count + 1.0;
            }
            if (isValid(b[i])) {
                b1 = b1 + b[i];
                b1count = b1count + 1.0;
            }
            if (isValid(b[i+1])) {
                b2 = b2 + b[i+1];
                b2count = b2count + 1.0;
            }
            if (isValid(b[i+2])) {
                b3 = b3 + b[i+2];
                b3count = b3count + 1.0;
            }
        }
        if (a1count > 0) { a1 = a1 / a1count; }
        // else { a1 = -9999.0; }
        if (a2count > 0) { a2 = a2 / a2count; }
        if (a3count > 0) { a3 = a3 / a3count; }
        if (b1count > 0) { b1 = b1 / b1count; }
        // else { b1 = -9999.0; }
        if (b2count > 0) { b2 = b2 / b2count; }
        if (b3count > 0) { b3 = b3 / b3count; }

        // set result
        float t = a1 + mu * (b1-a1);
        float u = a2 + mu * (b2-a2);
        float v = a3 + mu * (b3-a3);
        // if (a1 <= -9999.0 || b1 <= -9999.0) t = -9999.0;
        result[j] = t;
        result[j+1] = u;
        result[j+2] = v;
    }
    """ % (dataLen, h, w, dim, mu, offset)

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataA)
    inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=dataB)
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)

    prg.lerpData(queue, [h, w], None, inA, inB, outResult)

    # Copy result
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape(shape)
    return result
    def gpu_chain(self, silent=False):

        # Calculate binomarl distribution
        self.defines_dic0 = {
            "T": str(self.T),
            "dim": str(self.dim),
            "computeUnits": str(self.computeUnits),
            "wrkUnit": str(self.wrkUnit),
            "n_elem":
            "np.ceil((dim*dim+wrkUnit)/wrkUnit).astype(np.int)",  # For temporary result matrix
            "n_wrkGroups": "computeUnits",
            "matSize": "dim*dim",
            "n_kernel_param": str(self.n_kernel_param),
            "n_data_dim": str(self.n_data_dim),
            "minVal": str(0.000001)
        }
        definesToLocals(self.defines_dic0)
        self.defines0 = definesFromDict(self.defines_dic0) + " -cl-std=CL1.2"

        diag0 = np.zeros((dim, n_kernel_param)).astype(cl.cltypes.float)
        diag0 = self.kernels.astype(cl.cltypes.float)
        #diag0[:,4] = 0
        #data points
        mat0 = np.zeros((T, n_data_dim)).astype(cl.cltypes.float)
        #mat0 = self.data.astype(cl.cltypes.float)

        self.res0 = np.zeros((T, dim), np.float32)

        self.code_kernel = open(self.file_diag_kernel, "r").read()
        self.code_diag = open(self.file_diag_normal, "r").read()
        self.code0 = self.code_kernel + "\n" + self.code_diag

        self.platform = cl.get_platforms()[self.platform_id]
        self.device = self.platform.get_devices()[0]
        if (not silent):
            print("Using Device : ", self.device.name)
            print("Compute Units: ", self.computeUnits)
        self.context = cl.Context([self.device])
        self.program0 = cl.Program(self.context,
                                   self.code0).build(self.defines0)
        self.queue = cl.CommandQueue(self.context)

        # Buffer creation
        mem_flags = cl.mem_flags
        self.mat_buf0 = cl.Buffer(self.context,
                                  mem_flags.READ_WRITE
                                  | mem_flags.COPY_HOST_PTR,
                                  hostbuf=mat0)
        self.diag_buf0 = cl.Buffer(self.context,
                                   mem_flags.READ_WRITE
                                   | mem_flags.COPY_HOST_PTR,
                                   hostbuf=diag0)
        self.res_buf0 = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                  self.res0.nbytes)

        self.kernel0 = self.program0.diag_normal

        # Set program arguments
        self.globalItems0 = (T, )
        self.localItems0 = None  # (32, )

        self.kernel0.set_arg(0, self.diag_buf0)
        self.kernel0.set_arg(1, self.mat_buf0)
        self.kernel0.set_arg(2, self.res_buf0)

        # Diagonal Matrix multiplying

        defines_dic1 = {
            "T": str(self.T),
            "dim": str(self.dim),
            "computeUnits": str(self.computeUnits),
            "wrkUnit": str(self.wrkUnit),
            "n_elem":
            "np.ceil((dim*dim+wrkUnit)/wrkUnit).astype(np.int)",  # For temporary result matrix
            "n_wrkGroups": "computeUnits",
            "matSize": "dim*dim",
            "n_kernel_param": str(self.n_kernel_param),
            "n_data_dim": str(self.n_data_dim),
            "minVal": str(0.000001)
        }
        definesToLocals(defines_dic1)
        self.defines1 = definesFromDict(defines_dic1) + " -cl-std=CL1.2"

        # For use with diag_mat_mulB
        #n_dat_mat1 = np.zeros(wrkUnit*computeUnits + 1).astype(np.int)
        #n_dat_mat1[1:] = int(T / (wrkUnit*computeUnits))
        #n_dat_mat1[1:T%(wrkUnit*computeUnits) + 1] += 1
        #n_dat_mat1 = np.cumsum(n_dat_mat1)
        #n_dat_mat1 = n_dat_mat1.astype(cl.cltypes.int)
        #self.n_dat_mat1 = n_dat_mat1

        n_dat_mat1 = np.zeros(n_wrkGroups + 1).astype(np.int)
        n_dat_mat1[1:] = int((T) / n_wrkGroups)
        n_dat_mat1[1:(T) % n_wrkGroups + 1] += 1
        n_dat_mat1 = np.cumsum(n_dat_mat1)
        n_dat_mat1 = n_dat_mat1.astype(cl.cltypes.int)
        self.n_dat_mat1 = n_dat_mat1

        # transistion matrix
        mat1 = np.zeros((dim, dim)).astype(cl.cltypes.float)
        mat1 = self.transistion_matrix.astype(cl.cltypes.float)
        # Calculated from data and kernels - taken from previous kernel
        #diag1 = np.random.random((T, dim)).astype(cl.cltypes.float)
        # Result
        self.res1 = np.zeros((T, dim, dim), np.float32)

        self.code1 = open(self.file_diag_mat_mul, "r").read()

        self.program1 = cl.Program(self.context,
                                   self.code1).build(self.defines1)

        # Buffer creation
        mem_flags = cl.mem_flags
        self.n_dat_mat_buf1 = cl.Buffer(self.context,
                                        mem_flags.READ_ONLY
                                        | mem_flags.COPY_HOST_PTR,
                                        hostbuf=n_dat_mat1)
        self.mat_buf1 = cl.Buffer(self.context,
                                  mem_flags.READ_WRITE
                                  | mem_flags.COPY_HOST_PTR,
                                  hostbuf=mat1)
        #self.diag_buf1 = cl.Buffer(self.context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=diag1)
        self.res_buf1 = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                  self.res1.nbytes)

        self.kernel1 = self.program1.diag_mat_mul

        # Set program arguments
        # For use with diag_mat_mulB
        #self.globalItems1 = ( computeUnits*wrkUnit, )
        #self.localItems1 = None # (32, )
        self.globalItems1 = (computeUnits * dim, )
        self.localItems1 = (dim, )
        #print(self.globalItems1)

        self.kernel1.set_arg(0, self.n_dat_mat_buf1)
        self.kernel1.set_arg(1, self.mat_buf1)  # Transistion matrix
        self.kernel1.set_arg(
            2, self.res_buf0)  # Diagonal matrix values from previous step
        self.kernel1.set_arg(3, self.res_buf1)

        # MatrixMatrix Multiplying

        self.defines_dic2 = {
            "T": str(self.T),
            "dim": str(self.dim),
            "computeUnits": str(self.computeUnits),
            "wrkUnit": str(self.wrkUnit),
            "n_elem":
            "np.ceil((dim*dim+wrkUnit)/wrkUnit).astype(np.int)",  # For temporary result matrix
            "n_wrkGroups": "computeUnits",
            "matSize": "dim*dim",
            "n_kernel_param": str(self.n_kernel_param),
            "n_data_dim": str(self.n_data_dim),
            "minVal": str(0.000001)
        }
        definesToLocals(self.defines_dic2)
        self.defines2 = definesFromDict(self.defines_dic2) + " -cl-std=CL1.2"

        n_element_mat2 = np.zeros(wrkUnit + 1).astype(np.int)
        n_element_mat2[1:] = int(dim * dim / wrkUnit)
        n_element_mat2[1:(dim * dim + 1) % wrkUnit] += 1
        n_element_mat2 = np.cumsum(n_element_mat2)
        n_element_mat2 = n_element_mat2.astype(cl.cltypes.int)
        self.n_element_mat2 = n_element_mat2

        n_mat_mat2 = np.zeros(n_wrkGroups + 1).astype(np.int)
        n_mat_mat2[1:] = int((T) / n_wrkGroups)
        n_mat_mat2[1:(T) % n_wrkGroups + 1] += 1
        n_mat_mat2 = np.cumsum(n_mat_mat2)
        n_mat_mat2 = n_mat_mat2.astype(cl.cltypes.int)
        self.n_mat_mat2 = n_mat_mat2

        # Matrix input - already made as output from previous step
        #mat2 = np.random.random((T, dim, dim)).astype(cl.cltypes.float) / T
        # Matrix outputs
        self.res2 = np.zeros((computeUnits, dim, dim), np.float32)
        # Scaling coefficients used in matrix multiplication
        self.resCoef2 = np.zeros((computeUnits), np.int32)

        self.code2 = open(self.file_matrixmul, "r").read()
        self.program2 = cl.Program(self.context,
                                   self.code2).build(self.defines2)

        # Buffer creation
        self.n_element_mat_buf2 = cl.Buffer(self.context,
                                            mem_flags.READ_ONLY
                                            | mem_flags.COPY_HOST_PTR,
                                            hostbuf=self.n_element_mat2)
        self.n_mat_mat_buf2 = cl.Buffer(self.context,
                                        mem_flags.READ_ONLY
                                        | mem_flags.COPY_HOST_PTR,
                                        hostbuf=self.n_mat_mat2)
        #mat_buf2 = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=mat2)
        self.res_buf2 = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                  self.res2.nbytes)
        self.resCoef_buf2 = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                      self.resCoef2.nbytes)

        self.kernel2 = self.program2.matrixmul

        # Set program arguments
        self.globalItems2 = (computeUnits * wrkUnit, )
        self.localItems2 = (wrkUnit, )

        self.kernel2.set_arg(0, self.n_element_mat_buf2)
        self.kernel2.set_arg(1, self.n_mat_mat_buf2)
        #self.kernel2.set_arg(2, self.mat_buf2 )
        self.kernel2.set_arg(2, self.res_buf1)  # input argument for MM
        self.kernel2.set_arg(3, self.res_buf2)
        self.kernel2.set_arg(4, self.resCoef_buf2)

        cl.enqueue_copy(self.queue,
                        self.mat_buf0,
                        self.data.astype(cl.cltypes.float),
                        is_blocking=True)
        cl.enqueue_copy(self.queue,
                        self.mat_buf1,
                        self.transistion_matrix.astype(cl.cltypes.float),
                        is_blocking=True)
        cl.enqueue_copy(self.queue,
                        self.diag_buf0,
                        self.kernels.astype(cl.cltypes.float),
                        is_blocking=True)
Esempio n. 28
0
def lerpImage(imA, imB, mu):
    # read pixels and floats
    pxA = np.array(imA)
    pxA = pxA.astype(np.uint8)
    pxB = np.array(imB)
    pxB = pxB.astype(np.uint8)

    shape = pxA.shape
    h, w, dim = shape

    pxA = pxA.reshape(-1)
    pxB = pxB.reshape(-1)

    # the kernel function
    src = """
    __kernel void lerpImage(__global uchar *a, __global uchar *b, __global float *mu, __global uchar *result){
        int w = %d;
        int dim = %d;
        float m = *mu;
        int posx = get_global_id(1);
        int posy = get_global_id(0);
        int i = posy * w * dim + posx * dim;
        result[i] = a[i] + m * (b[i]-a[i]);
        result[i+1] = a[i+1] + m * (b[i+1]-a[i+1]);
        result[i+2] = a[i+2] + m * (b[i+2]-a[i+2]);
    }
    """ % (w, dim)

    # Get platforms, both CPU and GPU
    plat = cl.get_platforms()
    GPUs = plat[0].get_devices(device_type=cl.device_type.GPU)
    CPU = plat[0].get_devices()

    # prefer GPUs
    if GPUs and len(GPUs) > 0:
        ctx = cl.Context(devices=GPUs)
    else:
        print("Warning: using CPU")
        ctx = cl.Context(CPU)

    # Create queue for each kernel execution
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Kernel function instantiation
    prg = cl.Program(ctx, src).build()

    inA = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pxA)
    inB = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pxB)
    inMu = cl.Buffer(ctx,
                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                     hostbuf=np.float32(mu))
    outResult = cl.Buffer(ctx, mf.WRITE_ONLY, pxA.nbytes)

    prg.lerpImage(queue, shape, None, inA, inB, inMu, outResult)

    # Copy result
    result = np.empty_like(pxA)
    cl.enqueue_copy(queue, result, outResult)

    result = result.reshape(shape)
    imOut = Image.fromarray(result, mode="RGB")
    return imOut
VECTOR_SIZE = 50000  # Elements of vector

# Create two random vectors a & b
a_host = np.random.rand(VECTOR_SIZE).astype(np.float32)
b_host = np.random.rand(VECTOR_SIZE).astype(np.float32)
# Create a empty vector for the result
res_host = np.zeros(VECTOR_SIZE).astype(np.float32)

# Create CL context
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]  #get first gpu available

print "Running: ", platform
print "On GPU: ", device

ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)

# Transfer host (CPU) memory to device (GPU) memory
mf = cl.mem_flags
a_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_host)
b_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_host)

# Kernel code
prg = cl.Program(
    ctx, """
__kernel void sum(__global const float *a_gpu, __global const float *b_gpu, __global float *res_gpu) {
  int gid = get_global_id(0);
  res_gpu[gid] = a_gpu[gid] + b_gpu[gid];
}
""").build()
Esempio n. 30
0
import pyopencl as CL
from pyopencl import array
import numpy

CL.tools.clear_first_arg_caches()

c = CL.Context([CL.get_platforms()[0].get_devices()[0]])

k = CL.Program(
    c, """
    #include \"lambda.cl\"

    kernel void test(global const ulong *in,
                     global       ulong *out) {
        uint i = get_global_id(0);
        out[i] = lex(in[i], 0);
    }""").build("-I./src/cl")
q = CL.CommandQueue(c)

flags = CL.mem_flags

# 290 = i i
# 1323270 = (k i) k
# 659718 = (k* i) k
# 72218 = Ω
b_in = numpy.zeros((1, 1), CL.array.vec.ulong8)
#b_in[0, 0] = (290, 1323270, 659718, 72218)
b_in[0, 0] = (4, 48, 16, 88, 90466, 0, 0, 0)
b_out = numpy.zeros(5, numpy.uint64)

mem_in = CL.Buffer(c, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=b_in)