Example #1
0
      uint bot = clamp(yg+1, (uint)0, h);
      if(yl==hl) c_loc[xl+wm*(hl+1)] = c[xg+w*bot];
      barrier(CLK_LOCAL_MEM_FENCE);
      uchar4 blr = c_loc[xl+wm*(yl-1)]/(uchar)5 +
                  c_loc[xl-1+wm*yl]/(uchar)5 +
                  c_loc[xl+wm*yl]/(uchar)5 +
                  c_loc[xl+1+wm*yl]/(uchar)5 +
                  c_loc[xl+wm*(yl+1)]/(uchar)5;
      res[xg+w*yg] = blr;
    }
    """).build()

# Pixel count of the source image (product of the two entries of cat.size).
n_pix = cat.size[0] * cat.size[1]
# Host array that will hold the blurred pixels; same shape and dtype as `pix`.
result = np.empty_like(pix)
mf = cl.mem_flags
# Device copy of the input pixels (kernel reads only).
pix_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=pix)
# Device output buffer. COPY_HOST_PTR initialises it from `result`, whose
# contents are still uninitialised here (np.empty_like).
pixb_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=result)

# Maximum work-group size of the 'blur' kernel on the context's first device.
wgs = cl.Kernel(prg, 'blur').get_work_group_info(
    cl.kernel_work_group_info.WORK_GROUP_SIZE,
    ctx.get_info(cl.context_info.DEVICES)[0])
n_local = (16, 12)
# Only warns; the launch below is still attempted with the oversized group.
if n_local[0] * n_local[1] > wgs:
    print "Reduce the n_local variable size please!"

# Local scratch memory: the work-group tile grown by 2 in each dimension,
# 4 bytes per element.
nn_buf = cl.LocalMemory(4 * (n_local[0] + 2) * (n_local[1] + 2))
# One work-item per pixel.
# NOTE(review): the global size is not rounded up to a multiple of n_local;
# confirm the image dimensions are divisible by (16, 12).
n_workers = (cat.size[0], cat.size[1])

prg.blur(queue, n_workers, n_local, pix_buf, pixb_buf, nn_buf,
         np.uint32(cat.size[0]), np.uint32(cat.size[1]))
    # Timestamps are taken after each setup stage so they can be compared later.
    start_time = time.time()
    # CITIES random coordinates per axis, float32 values in [0, 100).
    city_x = numpy.random.random(CITIES).astype(numpy.float32) * 100
    city_y = numpy.random.random(CITIES).astype(numpy.float32) * 100
    # prepare memory for final answer from OpenCL
    final = numpy.zeros(MAP_SIZE, dtype=numpy.float32)
    time_hostdata_loaded = time.time()

    print('create context')
    ctx = cl.create_some_context()
    print('create command queue')
    # Profiling is enabled so the kernel event exposes start/end counters below.
    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    time_ctx_queue_creation = time.time()

    # prepare device memory for OpenCL
    print('prepare device memory for input / output')
    dev_x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=city_x)
    dev_y = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=city_y)
    # Output buffer sized like `final` ("fianl" is a typo kept in the name).
    dev_fianl = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, final.nbytes)
    time_devicedata_loaded = time.time()

    print('compile kernel code')
    prg = cl.Program(ctx, kernels).build()
    time_kernel_compilation = time.time()

    print('execute kernel programs')
    # MAP_SIZE work-items with a local size of 1 (one work-item per work-group).
    evt = prg.calc_distance(queue, (MAP_SIZE, ), (1, ), numpy.int32(CITIES), dev_x, dev_y, dev_fianl)
    print('wait for kernel executions')
    evt.wait()
    # Event profiling counters are in nanoseconds; scale to seconds.
    elapsed = 1e-9 * (evt.profile.end - evt.profile.start)

    print('elapsed time: {}'.format(elapsed))
Example #3
0
        #mb_wg_markup = numpy.array(wg_markup, dtype=numpy.float32)
        #mb_wg_stop_loss = numpy.array(wg_stop_loss, dtype=numpy.float32)
        #mb_wg_stop_age = numpy.array(wg_stop_age, dtype=numpy.float32)
        #mb_wg_macd_buy_trip = numpy.array(wg_macd_buy_trip, dtype=numpy.float32)
        #mb_wg_buy_wait_after_stop_loss = numpy.array(wg_buy_wait_after_stop_loss, dtype=numpy.uint32)
        #mb_wg_quartile = numpy.array(wg_quartile, dtype=numpy.uint32)
        #mb_wg_market_classification = numpy.array(wg_market_classification, dtype=numpy.uint32)
        mb_wg_input = numpy.array(wg_input, dtype=numpy.float32)
        #mb_wg_score = numpy.array(range(work_group_size), dtype=numpy.float32)
        #mb_wg_orders = numpy.array(range(work_group_size * max_open_orders * order_array_size), dtype=numpy.float32)
        #create OpenCL buffers

        #mapped - makes sure the data is completely loaded before processing begins
        #ocl_mb_wg_market_classification = cl.Buffer(ctx, mf.READ_ONLY | mf.ALLOC_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=mb_wg_market_classification)
        ocl_mb_wg_input = cl.Buffer(ctx,
                                    mf.READ_ONLY | mf.ALLOC_HOST_PTR
                                    | mf.COPY_HOST_PTR,
                                    hostbuf=mb_wg_input)
        #ocl_mb_wg_orders = cl.Buffer(ctx, mf.READ_WRITE | mf.ALLOC_HOST_PTR | mf.COPY_HOST_PTR, hostbuf=mb_wg_orders)#mb_wg_orders.nbytes

        #unmapped - can be transferred on demand
        #ocl_mb_wg_quartile = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_quartile)
        #ocl_mb_wg_score = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=mb_wg_score)
        #ocl_mb_wg_shares = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_shares)
        ocl_mb_wg_wll = cl.Buffer(ctx,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=mb_wg_wll)
        ocl_mb_wg_wls = cl.Buffer(ctx,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=mb_wg_wls)
        #ocl_mb_wg_buy_wait = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_buy_wait)
        #ocl_mb_wg_markup = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mb_wg_markup)
Example #4
0
def testConvolution():
    """Run the ``conv2d2`` OpenCL kernel on a small all-ones input and print it.

    Reads the kernel source from ``PATH_TO_KERNEL``, builds it, convolves a
    10x10 tensor of ones with the kernel returned by ``getKernel(4)`` using a
    stride of (1, 1), and prints the resulting 8x8 output array together with
    diagnostic information about every intermediate array.

    Returns:
        None. All results are reported through ``print``.
    """
    # Read the kernel file; the context manager closes the handle even if
    # reading fails.
    with open(PATH_TO_KERNEL, 'r', encoding='utf-8') as f:
        kernels = f.read()

    # create context, queue and compile kernels
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    prg = cl.Program(ctx, kernels).build()

    # init parameters for kernel-call
    cX = torch.ones(10, 10)
    print("Input x shape - " + str(cX.shape))
    print(cX)
    cKernel = getKernel(4)
    print("Input kernel shape - " + str(cKernel.shape))
    print(cKernel)
    cOutput = torch.zeros(8, 8)
    print("Output shape - " + str(cOutput.shape))

    # convert tensors into usable numpy arrays
    np_cX = cX.numpy()
    np_cKernel = cKernel.numpy()
    np_cOutput = cOutput.numpy()
    print("Numpy cX - ")
    print(str(np_cX))
    print("Numpy cX dType - " + str(np_cX.dtype))
    print("Numpy cKernel - ")
    print(str(np_cKernel))
    print("Numpy cKernel dType - " + str(np_cKernel.dtype))

    # Shapes are passed to the device as int32 arrays.
    np_dim_cX = np.array(np_cX.shape, dtype=np.int32)
    print("Dimensions of np_cX - " + str(np_dim_cX))
    print("Dtype of np_cX - " + str(type(np_dim_cX[0])))
    np_dim_cKernel = np.array(np_cKernel.shape, dtype=np.int32)
    print("Dimensions of np_cKernel - " + str(np_dim_cKernel))
    print(np_cKernel.shape)
    print(type(np_dim_cKernel[0]))

    # copy the numpy arrays into read-only device buffers
    read_copy = mf.READ_ONLY | mf.COPY_HOST_PTR
    buf_cX = cl.Buffer(ctx, read_copy, hostbuf=np_cX)
    buf_dim_cX = cl.Buffer(ctx, read_copy, hostbuf=np_dim_cX)
    buf_cKernel = cl.Buffer(ctx, read_copy, hostbuf=np_cKernel)
    buf_dim_cKernel = cl.Buffer(ctx, read_copy, hostbuf=np_dim_cKernel)
    # output buffer, sized like the host output array
    buf_cOutput = cl.Buffer(ctx, mf.WRITE_ONLY, np_cOutput.nbytes)

    # options: stride
    np_stride = np.array([1, 1], dtype=np.int32)
    buf_stride = cl.Buffer(ctx, read_copy, hostbuf=np_stride)

    print("Calling Kernel with global_work_size: " +
          str(np_cOutput.shape[0] * np_cOutput.shape[1]))

    # One work-item per output element; the local size is left to the runtime.
    convKernel = prg.conv2d2
    convKernel.set_args(buf_cKernel, buf_dim_cKernel, buf_cX, buf_dim_cX,
                        buf_cOutput, buf_stride)
    cl.enqueue_nd_range_kernel(queue, convKernel, np_cOutput.shape, None)

    # Read the result back into the host array.
    cl.enqueue_copy(queue, np_cOutput, buf_cOutput)
    print(type(np_cOutput))
    print(np_cOutput.dtype)
    print(np_cOutput)
Example #5
0
def clFindKnn(h_bf_indexes, h_bf_distances, h_pointset, h_query, kth, thelier,
              nchunks, pointdim, signallength, gpuid):
    """Run the ``kernelKNNshared`` OpenCL kernel and download its results.

    Args:
        h_bf_indexes: host array that receives the index results (overwritten).
        h_bf_distances: host array that receives the distance results
            (overwritten).
        h_pointset: host array uploaded as the read-only point set.
        h_query: host array uploaded as the read-only query set.
        kth: passed to the kernel as int32; also scales the local memory
            (``kth * workitems_x`` floats and ints).
        thelier: passed to the kernel as int32.
        nchunks: number of chunks; ``signallength / nchunks`` is passed to the
            kernel as the trial length.
        pointdim: passed to the kernel as int32.
        signallength: total number of points; determines the global work size.
        gpuid: index into the device list returned by ``_get_device``.

    Returns:
        Always 1.
    """
    triallength = int(signallength / nchunks)

    # Set up OpenCL
    my_gpu_devices, context, queue = _get_device(gpuid)
    device = my_gpu_devices[gpuid]

    # Check memory resources (sizes in MB).
    usedmem = int((h_query.nbytes + h_pointset.nbytes + h_bf_distances.nbytes +
                   h_bf_indexes.nbytes) // 1024 // 1024)
    totalmem = int(device.global_mem_size // 1024 // 1024)

    if (totalmem * 0.90) < usedmem:
        # Print the words as a message rather than as a tuple repr.
        print("WARNING:", usedmem, "Mb used out of", totalmem,
              "Mb. The GPU could run out of memory.")

    # Create OpenCL buffers
    d_bf_query = cl.Buffer(context,
                           cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=h_query)
    d_bf_pointset = cl.Buffer(context,
                              cl.mem_flags.READ_ONLY
                              | cl.mem_flags.COPY_HOST_PTR,
                              hostbuf=h_pointset)
    d_bf_distances = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                               h_bf_distances.nbytes)
    d_bf_indexes = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                             h_bf_indexes.nbytes)

    try:
        # Build the kernel; the context manager closes the source file.
        kernelLocation = resource_filename(__name__, 'gpuKnnBF_kernel.cl')
        with open(kernelLocation) as kernel_file:
            kernelsource = kernel_file.read()
        program = cl.Program(context, kernelsource).build()
        kernelKNNshared = program.kernelKNNshared
        # Four buffers, five int32 scalars, two local-memory arguments.
        kernelKNNshared.set_scalar_arg_dtypes([
            None, None, None, None, np.int32, np.int32, np.int32, np.int32,
            np.int32, None, None
        ])

        # Size of workitems and NDRange
        if signallength / nchunks < device.max_work_group_size:
            workitems_x = 8
        elif device.max_work_group_size < 256:
            workitems_x = device.max_work_group_size
        else:
            workitems_x = 256

        # Round the global size up to the next multiple of the work-group
        # size with an exact ceiling division.
        n_groups = int(-(-signallength // workitems_x))
        NDRange_x = workitems_x * n_groups

        # Local memory for distances and indexes (size check in kB).
        dist_bytes = np.dtype(np.float32).itemsize * kth * workitems_x
        idx_bytes = np.dtype(np.int32).itemsize * kth * workitems_x
        localmem = (dist_bytes + idx_bytes) / 1024
        if localmem > device.local_mem_size / 1024:
            print('Localmem alocation will fail. {0} kb available, and it needs '
                  '{1} kb.'.format(device.local_mem_size / 1024, localmem))
        localmem1 = cl.LocalMemory(dist_bytes)
        localmem2 = cl.LocalMemory(idx_bytes)

        kernelKNNshared(queue, (NDRange_x, ), (workitems_x, ), d_bf_query,
                        d_bf_pointset, d_bf_indexes, d_bf_distances, pointdim,
                        triallength, signallength, kth, thelier, localmem1,
                        localmem2)
        queue.finish()

        # Download results
        cl.enqueue_copy(queue, h_bf_distances, d_bf_distances)
        cl.enqueue_copy(queue, h_bf_indexes, d_bf_indexes)
    finally:
        # Free buffers, also when the build, launch or download failed.
        d_bf_distances.release()
        d_bf_indexes.release()
        d_bf_query.release()
        d_bf_pointset.release()

    return 1
Example #6
0
def test_image_2d(ctx_factory):
    """Copy a float32 2D image into a buffer on the device and compare.

    The test is skipped when the device lacks image support, is one of the
    known-crashy Intel/pocl versions, or rejects the required image format.
    """
    def _skip(reason):
        # pytest is imported lazily, only when a skip is actually needed.
        from pytest import skip
        skip(reason)

    context = ctx_factory()
    device, = context.devices

    if not device.image_support:
        _skip("images not supported on %s" % device)

    if "Intel" in device.vendor and "31360.31426" in device.version:
        _skip("images crashy on %s" % device)

    plat = device.platform
    if "pocl" in plat.vendor and (
            "0.8" in plat.version or "0.9" in plat.version):
        _skip("images crashy on %s" % device)

    kernel_src = """
        __kernel void copy_image(
          __global float *dest,
          __read_only image2d_t src,
          sampler_t samp,
          int stride0)
        {
          int d0 = get_global_id(0);
          int d1 = get_global_id(1);
          /*
          const sampler_t samp =
            CLK_NORMALIZED_COORDS_FALSE
            | CLK_ADDRESS_CLAMP
            | CLK_FILTER_NEAREST;
            */
          dest[d0*stride0 + d1] = read_imagef(src, samp, (float2)(d1, d0)).x;
        }
        """
    prg = cl.Program(context, kernel_src).build()

    # Single-channel data is passed as a plain 2D array.
    num_channels = 1
    a = np.random.rand(1024, 512, num_channels).astype(np.float32)
    if num_channels == 1:
        a = a[:, :, 0]

    queue = cl.CommandQueue(context)
    try:
        a_img = cl.image_from_array(context, a, num_channels)
    except cl.RuntimeError as exc:
        if exc.code != cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
            raise
        _skip("required image format not supported on %s" % device.name)

    a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)

    # Unnormalised coordinates, clamped addressing, nearest-neighbour filter.
    samp = cl.Sampler(context, False,
            cl.addressing_mode.CLAMP,
            cl.filter_mode.NEAREST)
    row_stride = np.int32(a.strides[0]/a.dtype.itemsize)
    prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, row_stride)

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_dest)

    good = la.norm(a_result - a) == 0
    if good:
        return

    # CPU platforms get a more descriptive failure message.
    if queue.device.type & cl.device_type.CPU:
        assert good, ("The image implementation on your CPU CL platform '%s' "
                "returned bad values. This is bad, but common."
                % queue.device.platform)
    assert good
Example #7
0
def test_get_info(ctx_factory):
    """Query every ``*_info`` value on each kind of OpenCL object.

    For each object (platform, device, context, queue, program, buffer,
    kernel, event, and, where supported, sampler and image) every public
    constant of the matching info class is queried through ``get_info`` and,
    where applicable, through the lower-case attribute form. Queries listed
    in ``CRASH_QUIRKS`` for the current platform are not executed. Failures
    are printed and counted in ``failure_count``.
    """
    ctx = ctx_factory()
    device, = ctx.devices
    platform = device.platform

    # Single-element list so the nested helper can mutate the counter.
    failure_count = [0]

    pocl_quirks = [
            (cl.Buffer, cl.mem_info.OFFSET),
            (cl.Program, cl.program_info.KERNEL_NAMES),
            (cl.Program, cl.program_info.NUM_KERNELS),
            ]
    # (vendor, name, version) -> queries that must not be executed at all.
    CRASH_QUIRKS = [
            (("NVIDIA Corporation", "NVIDIA CUDA",
                "OpenCL 1.0 CUDA 3.0.1"),
                [
                    (cl.Event, cl.event_info.COMMAND_QUEUE),
                    ]),
            (("The pocl project", "Portable Computing Language",
                "OpenCL 1.2 pocl 0.8-pre"),
                    pocl_quirks),
            (("The pocl project", "Portable Computing Language",
                "OpenCL 1.2 pocl 0.8"),
                pocl_quirks),
            (("The pocl project", "Portable Computing Language",
                "OpenCL 1.2 pocl 0.9-pre"),
                pocl_quirks),
            (("Apple", "Apple",
                "OpenCL 1.2 (Apr 25 2013 18:32:06)"),
                [
                    (cl.Program, cl.program_info.SOURCE),
                    ]),
            ]
    # Queries that may fail without being counted as failures.
    QUIRKS = []

    plat_quirk_key = (
            platform.vendor,
            platform.name,
            platform.version)

    def find_quirk(quirk_list, cl_obj, info):
        """Return True if (type of cl_obj, info) is listed for this platform."""
        for entry_plat_key, quirks in quirk_list:
            if entry_plat_key == plat_quirk_key:
                for quirk_cls, quirk_info in quirks:
                    if (isinstance(cl_obj, quirk_cls)
                            and quirk_info == info):
                        return True

        return False

    def do_test(cl_obj, info_cls, func=None, try_attr_form=True):
        """Query every public constant of info_cls on cl_obj."""
        if func is None:
            def func(info):
                cl_obj.get_info(info)

        for info_name in dir(info_cls):
            if not info_name.startswith("_") and info_name != "to_string":
                print(info_cls, info_name)
                info = getattr(info_cls, info_name)

                if find_quirk(CRASH_QUIRKS, cl_obj, info):
                    print("not executing get_info", type(cl_obj), info_name)
                    print("(known crash quirk for %s)" % platform.name)
                    continue

                try:
                    func(info)
                except Exception:
                    # Report the failure the same way as the attribute form
                    # below (the tuple-plus-string message could not be
                    # built and was never shown).
                    print("failed get_info", type(cl_obj), info_name)

                    if find_quirk(QUIRKS, cl_obj, info):
                        print("(known quirk for %s)" % platform.name)
                    else:
                        failure_count[0] += 1

                if try_attr_form:
                    try:
                        getattr(cl_obj, info_name.lower())
                    except Exception:
                        print("failed attr-based get_info", type(cl_obj), info_name)

                        if find_quirk(QUIRKS, cl_obj, info):
                            print("(known quirk for %s)" % platform.name)
                        else:
                            failure_count[0] += 1

    do_test(platform, cl.platform_info)
    do_test(device, cl.device_info)
    do_test(ctx, cl.context_info)

    # Default to no profiling so the later check is defined on devices
    # without profiling support.
    profiling = False
    props = 0
    if (device.queue_properties
            & cl.command_queue_properties.PROFILING_ENABLE):
        profiling = True
        props = cl.command_queue_properties.PROFILING_ENABLE
    queue = cl.CommandQueue(ctx,
            properties=props)
    do_test(queue, cl.command_queue_info)

    prg = cl.Program(ctx, """
        __kernel void sum(__global float *a)
        { a[get_global_id(0)] *= 2; }
        """).build()
    do_test(prg, cl.program_info)
    do_test(prg, cl.program_build_info,
            lambda info: prg.get_build_info(device, info),
            try_attr_form=False)

    n = 2000
    a_buf = cl.Buffer(ctx, 0, n*4)

    do_test(a_buf, cl.mem_info)

    kernel = prg.sum
    do_test(kernel, cl.kernel_info)

    evt = kernel(queue, (n,), None, a_buf)
    do_test(evt, cl.event_info)

    if profiling:
        evt.wait()
        do_test(evt, cl.profiling_info,
                lambda info: evt.get_profiling_info(info),
                try_attr_form=False)

    # crashes on intel...
    if device.image_support and platform.vendor not in [
            "Intel(R) Corporation",
            "The pocl project",
            ]:
        smp = cl.Sampler(ctx, False,
                cl.addressing_mode.CLAMP,
                cl.filter_mode.NEAREST)
        do_test(smp, cl.sampler_info)

        img_format = cl.get_supported_image_formats(
                ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0]

        img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256))
        assert img.shape == (128, 256)

        # Bare attribute accesses: they only need to not raise.
        img.depth
        img.image.depth
        do_test(img, cl.image_info,
                lambda info: img.get_image_info(info))
Example #8
0
 def __init__(self, queue, block_size):
     """Keep the queue and allocate a host/device buffer pair.

     The host buffer is an uninitialised uint8 array of ``block_size``
     elements; the device buffer is a read-write allocation of
     ``block_size`` bytes in the queue's context.
     """
     self.queue = queue
     host_block = np.empty(block_size, dtype=np.uint8)
     self.host_buf = host_block
     device_block = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE,
                              block_size)
     self.dev_buf = device_block
Example #9
0
# Build program in the specified context using the kernel source code
prog = cl.Program(context, kernel_src)
try:
    # '-Werror' turns compiler warnings into build failures.
    prog.build(options=['-Werror'], devices=[dev])
except:
    # Show the compiler output, then re-raise the original error.
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Data
v = np.arange(4, dtype=np.float32)
print('Input: ' + str(v))

# Create a read-write device buffer initialised with a copy of v
v_buff = cl.Buffer(context, flags=cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=v)

# Create user event
user_event = cl.UserEvent(context)


def read_complete(status, data):
    """Print `data`; `status` is accepted but unused."""
    print('Output: ' + str(data))

# Enqueue kernel that waits for user event before executing
global_size = (1,)
local_size = None

# __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
kernel_event = prog.user_event(queue, global_size, local_size, v_buff, wait_for=[user_event])
Example #10
0
def eikonal(graph, signal, hp):
    """Apply the ``laplacian_filter`` OpenCL kernel ``hp.it`` times.

    Args:
        graph: object providing ``ngbrs`` (cast to int32), ``wgts`` (cast to
            float32) and the scalar ``k``; all three are passed to the kernel.
        signal: 2-D array of shape (n, chnl), e.g. an initial distance field
            with one column per seed. It is flattened in column-major order
            and cast to float32 before upload.
        hp: hyperparameters; only ``hp.it`` (number of iterations) is read.

    Returns:
        float32 array of shape (n, chnl) holding the signal after ``hp.it``
        kernel applications (the float32 input itself when ``hp.it`` is 0).
    """
    ngbrs = graph.ngbrs.astype('int32')
    wgts = graph.wgts.astype('float32')
    k = graph.k
    n, chnl = signal.shape

    signal = np.reshape(signal, (n * chnl), order='F')
    signal = signal.astype('float32')
    print("signal", signal.shape) if bool_1 else print()
    print("n", n) if bool_1 else print()
    it = hp.it
    print("sucess till loading") if bool_1 else print()

    # create the opencl context
    platform = cl.get_platforms()[0]
    print(platform)
    device = platform.get_devices()[0]
    print(device)
    context = cl.Context([device])
    print(context)
    # Close the kernel source file as soon as it has been read.
    with open(mywf) as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()
    queue = cl.CommandQueue(context)
    print(queue)

    # create the buffers now.
    mem_flags = cl.mem_flags
    ngbrs_buf = cl.Buffer(context,
                          mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                          hostbuf=ngbrs)
    weight_buf = cl.Buffer(context,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=wgts)
    # The two signal buffers swap roles every iteration, so the kernel both
    # reads and writes each of them: they must be READ_WRITE.
    signal_buf = cl.Buffer(context,
                           mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR,
                           hostbuf=signal)
    new_signal = np.ndarray(shape=(n * chnl, ), dtype=np.float32)
    new_signal_buf = cl.Buffer(context, mem_flags.READ_WRITE,
                               new_signal.nbytes)

    # run the kernel in a loop, ping-ponging between the two buffers
    for _ in range(it):
        program.laplacian_filter(queue, (n * chnl, ), None, signal_buf,
                                 new_signal_buf, ngbrs_buf, weight_buf,
                                 np.int32(k), np.int32(chnl))
        signal_buf, new_signal_buf = new_signal_buf, signal_buf

    # After the swap the most recent output is in signal_buf (for it == 0
    # that is the untouched input), so read the result from there.
    cl.enqueue_copy(queue, new_signal, signal_buf)
    print("finish") if bool_1 else print()
    return np.reshape(new_signal, (n, chnl), order="F")
Example #11
0
 def __init__(self, queue, block_size):
     """Keep the queue and allocate two read-write device buffers.

     Both buffers are ``block_size`` bytes and live in the queue's context.
     """
     self.queue = queue
     # Generator yields the allocations in order: dev_buf_1 first, then 2.
     self.dev_buf_1, self.dev_buf_2 = (
         cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
         for _ in range(2))
Example #12
0
def mandel(ctx,
           x,
           y,
           zoom,
           max_iter=1000,
           iter_steps=1,
           width=500,
           height=500,
           use_double=False):
    """Render a Mandelbrot escape-iteration image with OpenCL.

    The iteration budget is split into ``iter_steps`` rounds. After each
    round, pixels that escaped are written to the result and only the
    remaining pixels are sent back to the device, continuing from their last
    (x, y) value.

    Args:
        ctx: OpenCL context used for the queue, program and buffers.
        x, y: centre of the viewport. They are combined with ``Decimal``
            arithmetic, so pass ``Decimal`` (or int) values.
        zoom: viewport scale; the view spans ``3 * zoom`` horizontally and
            ``2 * zoom`` vertically. Same type requirement as ``x``/``y``.
        max_iter: maximum number of iterations per pixel.
        iter_steps: number of rounds the iterations are split into (>= 1).
        width, height: image size in pixels.
        use_double: compute in float64 on the device instead of float32.

    Returns:
        uint8 array of shape (height, width) holding ``iterations % 255``;
        pixels that never escaped keep the value 0.

    Raises:
        ValueError: if ``iter_steps`` is smaller than 1.
    """
    if iter_steps < 1:
        raise ValueError("iter_steps must be >= 1")

    mf = cl.mem_flags
    cl_queue = cl.CommandQueue(ctx)
    # build program
    # The fp64 pragma is guarded by a dedicated macro: a preprocessor test
    # such as "#if real_t == double" compares two non-macro identifiers,
    # which both evaluate to 0, so it would be true for float builds too.
    code = """
    #ifdef USE_DOUBLE
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
    #endif
    kernel void mandel(
        __global real_t *coords,
        __global uint *output,
        __global real_t *output_coord,
        const uint max_iter,
        const uint start_iter
    ){
        uint id = get_global_id(0);
        real_t2 my_coords = vload2(id, coords);
        real_t2 my_value_coords = vload2(id, output_coord);
        real_t x = my_value_coords.x;
        real_t y = my_value_coords.y;
        uint iter = 0;
        for(iter=start_iter; iter<max_iter; ++iter){
            if(x*x + y*y > 4.0f){
                break;
            }
            real_t xtemp = x*x - y*y + my_coords.x;
            y = 2*x*y + my_coords.y;
            x = xtemp;
        }
        // copy the current x,y pair back
        real_t2 val = (real_t2){x, y};
        vstore2(val, id, output_coord);
        output[id] = iter;
    }
    """
    _cltype, _nptype = ("double", np.float64) if use_double else ("float",
                                                                  np.float32)
    build_options = ("-cl-opt-disable -D real_t=%s -D real_t2=%s2" %
                     (_cltype, _cltype))
    if use_double:
        build_options += " -D USE_DOUBLE"
    prg = cl.Program(ctx, code).build(build_options)

    # Calculate the "viewport".
    x0 = x - ((Decimal(3) * zoom) / Decimal(2.))
    y0 = y - ((Decimal(2) * zoom) / Decimal(2.))
    x1 = x + ((Decimal(3) * zoom) / Decimal(2.))
    y1 = y + ((Decimal(2) * zoom) / Decimal(2.))

    # Create index map in x,y pairs
    xx = np.arange(0, width, 1, dtype=np.uint32)
    yy = np.arange(0, height, 1, dtype=np.uint32)
    index_map = np.dstack(np.meshgrid(xx, yy))
    # and local "coordinates" (real, imaginary parts)
    coord_map = np.ndarray(index_map.shape, dtype=_nptype)
    coord_map[:] = index_map
    coord_map[:] *= (_nptype(
        (x1 - x0) / Decimal(width)), _nptype((y1 - y0) / Decimal(height)))
    coord_map[:] += (_nptype(x0), _nptype(y0))
    coord_map = coord_map.flatten()
    index_map = index_map.flatten().astype(dtype=np.uint32)
    # Create input and output buffer
    buffer_in_cl = cl.Buffer(ctx, mf.READ_ONLY, size=coord_map.nbytes)
    buffer_out = np.zeros(
        width * height,
        dtype=np.uint32)  # This will contain the iteration values of that run
    buffer_out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, size=buffer_out.nbytes)
    buffer_out_coords = np.zeros(width * height * 2,
                                 dtype=_nptype)  # This the last x,y values
    buffer_out_coords_cl = cl.Buffer(ctx,
                                     mf.READ_WRITE,
                                     size=buffer_out_coords.nbytes)
    # 2D Buffer to collect the iterations needed per pixel
    iter_map = np.zeros(width * height, dtype=np.uint32).reshape(
        (height, width))

    start_max_iter = 0
    to_do = coord_map.size // 2
    # At least one iteration per round; a step of 0 (iter_steps > max_iter)
    # would never advance start_max_iter and the loop would not terminate.
    steps_size = max(1, int(max_iter / float(iter_steps)))
    while to_do > 0 and start_max_iter < max_iter:
        end_max_iter = min(max_iter, start_max_iter + steps_size)
        print(("Iterations from iteration %i to %i for %i numbers" %
               (start_max_iter, end_max_iter, to_do)))

        # copy x/y pairs to device
        cl.enqueue_copy(cl_queue, buffer_in_cl, coord_map[:to_do * 2]).wait()
        cl.enqueue_copy(cl_queue, buffer_out_coords_cl,
                        buffer_out_coords[:to_do * 2]).wait()
        # and finally call the ocl function
        prg.mandel(cl_queue, (to_do, ), None, buffer_in_cl, buffer_out_cl,
                   buffer_out_coords_cl, np.uint32(end_max_iter),
                   np.uint32(start_max_iter)).wait()
        # Copy the output back
        cl.enqueue_copy(cl_queue, buffer_out_coords,
                        buffer_out_coords_cl).wait()
        cl.enqueue_copy(cl_queue, buffer_out, buffer_out_cl).wait()

        # Get indices of "found" escapes
        done = np.where(buffer_out[:to_do] < end_max_iter)[0]
        # and write the iterations to the corresponding cell
        index_reshaped = index_map[:to_do * 2].reshape((to_do, 2))
        tmp = index_reshaped[done]
        iter_map[tmp[:, 1], tmp[:, 0]] = buffer_out[done]

        # Get the indices of non escapes
        undone = np.where(buffer_out[:to_do] == end_max_iter)[0]
        # and compact them to the front of the "job" maps for the next loop
        # (fancy indexing copies, so the in-place overwrite is safe)
        tmp = buffer_out_coords[:to_do * 2].reshape((to_do, 2))
        buffer_out_coords[:undone.size * 2] = tmp[undone].flatten()
        tmp = coord_map[:to_do * 2].reshape((to_do, 2))
        coord_map[:undone.size * 2] = tmp[undone].flatten()
        index_map[:undone.size * 2] = index_reshaped[undone].flatten()

        to_do = undone.size
        start_max_iter = end_max_iter
        print(("%i done. %i unknown" % (done.size, undone.size)))

    # simple coloring by modulo 255 on the iter_map
    return (iter_map % 255).astype(np.uint8).reshape((height, width))
Example #13
0
    # simulation parameters
    # NOTE(review): presumably the time step and a softening term used by the
    # kernel; their use is outside this fragment.
    deltatime = 0.001
    eps = 0.001

    # init particle's position and velocity: `size` rows of 4 float32 values,
    # positions random in [0, 1), velocities zero
    particles = np.random.rand(size, 4).astype(np.float32)  #* (size / 1024)
    velocity = np.zeros((size, 4), dtype=np.float32)

    # create an OpenCL context on all devices of the first platform and a
    # command queue with profiling enabled
    ctx = cl.Context(cl.get_platforms()[0].get_devices())  # quick fix
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # make device buffers initialised from the host arrays (no access flag is
    # given, only COPY_HOST_PTR)
    mf = cl.mem_flags
    particles_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=particles)
    velocity_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=velocity)

    # --- just for checking
    #print particles
    #print " --- "

    # define OpenCL local memory size (blocksize * vectorsize * itemsize)
    local_buf = cl.LocalMemory(block_size * 4 * particles.itemsize)

    # build program
    prg = cl.Program(ctx, kernel).build()

    # execute kernel
    exec_evt = prg.nbody_simulation(
        queue,
Example #14
0
} // execute over n "work items"
"""

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# create some data array to give as input to Kernel and get output
SIZE = 4
# a_np holds 0 .. 3*SIZE-1 and b_np the next 3*SIZE values, both as
# (SIZE, 3) float32 arrays
a_np = np.arange(SIZE * 3).reshape(SIZE, 3).astype(np.float32)
b_np = np.arange(SIZE * 3, SIZE * 3 + SIZE * 3).reshape(SIZE,
                                                        3).astype(np.float32)
# host-side output array, (SIZE*SIZE, 3) float32 zeros; only its byte size is
# used below
c_np = np.zeros((SIZE * SIZE, 3)).astype(np.float32)

# create the buffers to hold the values of the input
a_buf = cl.Buffer(ctx,
                  cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                  hostbuf=a_np)
b_buf = cl.Buffer(ctx,
                  cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                  hostbuf=b_np)
# create output buffer
c_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, c_np.nbytes)

#Compilation
prg = cl.Program(ctx, source).build()

# Kernel is now launched over a 3-D global range of (3, SIZE, SIZE) work-items
launch = prg.gpu_mul(queue, (3, SIZE, SIZE), None, a_buf, b_buf, c_buf)
# wait till the process completes
# NOTE(review): c_buf is not copied back into c_np in this fragment.
launch.wait()
Example #15
0
    context = cl.Context([device])
    program = cl.Program(
        context, """
        __kernel void matrix_dot_vector(__global const int *matrix,
        __global const int *vector, __global int *result)
        {
          int gid = get_global_id(0);
          result[gid] = dot(matrix[gid], vector[0]);
        }
        """).build()

    queue = cl.CommandQueue(context)

    mem_flags = cl.mem_flags
    matrix_buf = cl.Buffer(context,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=matrix)
    vector_buf = cl.Buffer(context,
                           mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                           hostbuf=vector)
    matrix_dot_vector = numpy.zeros(10, numpy.float32)
    destination_buf = cl.Buffer(context, mem_flags.WRITE_ONLY,
                                matrix_dot_vector.nbytes)

    program.matrix_dot_vector(queue, matrix_dot_vector.shape, None, matrix_buf,
                              vector_buf, destination_buf)

    cl.enqueue_copy(queue, matrix_dot_vector, destination_buf)

    print vector
    print matrix
Example #16
0
    def InitializeSolver(self):
        """Set up host/device memory and compute u_{-1} to start the time loop.

        Steps, in the order they happen below:

        1. Allocate the host arrays ``LM`` and ``LHS`` and upload the mesh
           data (nodes, per-color-group element tables, thickness, material
           values, initial displacement) into OpenCL buffers.
        2. Run the ``assemble_K_M_P`` kernel once per color group (each launch
           waits for the previous one) to fill ``Ku_buf``, ``LM_buf`` and
           ``P_buf``, then copy ``LM_buf`` back to the host.
        3. Copy ``LM`` into ``LHS``, synchronize it with ``SyncCommNodes`` and
           upload it as ``LHS_buf``.
        4. Run ``calc_ddu`` to obtain the initial acceleration ``ddu``,
           synchronize it and add the applied-traction contribution.
        5. Allocate the device buffers and mapped host arrays used for
           displacements and stress, then evaluate

               u_-1 = u_0 - dt*du_0 + 0.5*dt**2*ddu_0

           and enqueue the uploads of ``u_-1`` and ``u_0``.

        The two final upload events are not waited on inside this method.
        """
        # Host arrays of shape (lclNDof, nSmp); np.zeros defaults to float64.
        self.LM = np.zeros((self.lclNDof, self.nSmp))  # not synchronized
        self.LHS = np.zeros((self.lclNDof, self.nSmp))  # synchronized

        # Allocate the OpenCL source and result buffer memory objects on GPU device GMEM.
        mem_flags = cl.mem_flags

        # Coordinates of the nodes selected by lclNodeIds (read-only copy).
        self.nodes_buf = cl.Buffer(self.context,
                                   mem_flags.READ_ONLY
                                   | mem_flags.COPY_HOST_PTR,
                                   hostbuf=self.mesh.nodes[self.lclNodeIds])
        # self.elmNodeIds_buf = cl.Buffer(self.context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf = self.mesh.elementNodeIds)
        # Mesh coloring: one buffer per color group, holding the rows of
        # lclElmNodeIds that belong to the elements of that group.
        self.colorGps_buf = [
            cl.Buffer(
                self.context,
                mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                hostbuf=self.mesh.lclElmNodeIds[self.mesh.colorGroups[i]])
            for i in range(len(self.mesh.colorGroups))
        ]
        # One buffer per color group holding the group's element ids.
        self.colorGps_elmIds_buf = [
            cl.Buffer(self.context,
                      mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                      hostbuf=self.mesh.colorGroups[i])
            for i in range(len(self.mesh.colorGroups))
        ]

        # for calculating M (mass) matrix, do not need to always exist in GPU memory
        # (kept as a local, not stored on self)
        thickness_buf = cl.Buffer(
            self.context,
            mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
            hostbuf=self.mesh.vthickness[self.lclNodeIds])

        # for calculating K (stiffness) matrix, thicknessE (nElms, nSmp)
        # -- Young's Modulus: gather the per-vertex values of each element,
        #    swap the last two axes and average over the new last axis.
        elmVerE = self.mesh.vE[self.mesh.elementNodeIds, :]
        elmVerE = elmVerE.swapaxes(1, 2)
        elmAveE = np.mean(elmVerE, axis=2)
        # -- thickness, gathered and laid out the same way
        elmVerThick = self.mesh.vthickness[self.mesh.elementNodeIds, :]
        elmVerThick = elmVerThick.swapaxes(1, 2)
        # elmAveThick = np.mean(elmVerThick, axis=2)
        # - thickness x E: mean of the element-wise product over the last axis
        elmTE = np.mean(elmVerE * elmVerThick, axis=2)
        # Per-color-group slices of elmTE; passed to assemble_K_M_P below.
        self.elmTE_buf = [
            cl.Buffer(self.context,
                      mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                      hostbuf=elmTE[self.mesh.colorGroups[i]])
            for i in range(len(self.mesh.colorGroups))
        ]
        # Per-color-group slices of elmAveE; not passed to any kernel within
        # this method.
        self.elmE_buf = [
            cl.Buffer(self.context,
                      mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                      hostbuf=elmAveE[self.mesh.colorGroups[i]])
            for i in range(len(self.mesh.colorGroups))
        ]

        # for calculating K (stiffness) matrix, D needs
        # pVals = [density, v, (1-v)/2, k*(1-v)/2, 1-v^2] with k = 5/6.
        k = 5.0 / 6.0
        v = self.mesh.v
        pVals = np.array([
            self.mesh.density, v, 0.5 * (1.0 - v), 0.5 * k * (1.0 - v),
            (1.0 - v * v)
        ])
        self.pVals_buf = cl.Buffer(self.context,
                                   mem_flags.READ_ONLY
                                   | mem_flags.COPY_HOST_PTR,
                                   hostbuf=pVals)

        # The initial displacement b.c. (nNodes*3,)
        # (local buffer, only used by the initial assembly below)
        u_buf = cl.Buffer(self.context,
                          mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                          hostbuf=self.u)

        # Device-side work vectors, each with the same byte size as self.LM.
        # They are not explicitly zero-filled here (see the disabled
        # enqueue_fill_buffer calls below).
        self.LM_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                self.LM.nbytes)
        self.Ku_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                self.LM.nbytes)
        self.P_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                               self.LM.nbytes)

        # cl.enqueue_fill_buffer(self.queue, self.LM_buf, np.float64(0.0), 0, self.LM.nbytes)
        # cl.enqueue_fill_buffer(self.queue, self.Ku_buf, np.float64(0.0), 0, self.LM.nbytes)
        # cl.enqueue_fill_buffer(self.queue, self.P_buf, np.float64(0.0), 0, self.LM.nbytes)

        # Applied traction: a device buffer plus a host-allocated buffer that
        # is mapped into a (lclNNodes, 3) array of LM's dtype.
        # 24 bytes per node = 3 components x 8-byte float64.
        map_flags = cl.map_flags
        self.appTrac_buf = cl.Buffer(self.context, mem_flags.READ_ONLY,
                                     int(self.lclNNodes * 24))
        self.pinned_appTrac = cl.Buffer(
            self.context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
            int(self.lclNNodes * 24))
        self.appTrac, _eventAppTrac = cl.enqueue_map_buffer(
            self.queue, self.pinned_appTrac, map_flags.WRITE, 0,
            (self.lclNNodes, 3), self.LM.dtype)
        # Start from zero traction.
        self.appTrac[:, :] = 0.0
        # prep_appTrac_event = cl.enqueue_copy(self.queue, self.appTrac_buf, self.appTrac)

        # 'Assemble' the inital M (mass) and Ku (stiffness) 'matrices'.
        # Kernel: one launch per color group with one work-item per element of
        # the group and a local size of 1. Each launch waits for the previous
        # group's event, so the groups run strictly one after another.
        initial_assemble_events = []
        for iColorGroup in range(len(self.colorGps_buf)):
            initial_assemble_event = \
            self.program.assemble_K_M_P(self.queue, (len(self.mesh.colorGroups[iColorGroup]),), (1,),
                                        np.int64(self.nSmp), np.float64(self.pressure),
                                        self.pVals_buf, self.nodes_buf, self.colorGps_buf[iColorGroup], thickness_buf,
                                        self.elmTE_buf[iColorGroup], u_buf, self.Ku_buf, self.LM_buf, self.P_buf,
                                        wait_for=initial_assemble_events)
            initial_assemble_events = [initial_assemble_event]

        # Read LM back to the host once the last color group has finished.
        initial_assemble_copy_event = \
        cl.enqueue_copy(self.queue, self.LM, self.LM_buf, wait_for=initial_assemble_events)

        initial_assemble_copy_event.wait()

        # Synchronize the left-hand-side of each equation, which is LM.
        # Copy the LM first to LHS.
        self.LHS[:, :] = self.LM
        # Synchronize.
        self.SyncCommNodes(self.LHS)
        # Copy into GPU device and prepared.
        self.LHS_buf = cl.Buffer(self.context,
                                 mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
                                 hostbuf=self.LHS)

        # Calculate acceleration u''.
        # ddu = (F0 - C*du - Ku)/M
        self.ddu = np.zeros((self.lclNDof, self.nSmp))
        self.ddu_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                 self.LM.nbytes)
        initial_calc_ddu_event = \
        self.program.calc_ddu(self.queue, (self.globalWorkSize,), (self.localWorkSize,),
                              np.int64(self.nSmp), np.int64(self.lclNDof),
                              self.P_buf, self.Ku_buf, self.LHS_buf, self.ddu_buf)
        # Copy the result to the host after the kernel and wait for the copy.
        initial_ddu_copy_event = \
        cl.enqueue_copy(self.queue, self.ddu, self.ddu_buf, wait_for=[initial_calc_ddu_event])
        initial_ddu_copy_event.wait()
        # Synchronize the acceleration on common nodes.
        self.SyncCommNodes(self.ddu)
        # Add on the global force: the traction is reshaped to a
        # (lclNDof, 1) column and divided by LHS.
        self.ddu += self.appTrac.reshape(self.lclNDof, 1) / self.LHS

        # Prepare the memories.
        # Memory on GPU devices.
        map_flags = cl.map_flags
        self.ures_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                  self.LM.nbytes)
        self.u_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                               self.LM.nbytes)
        self.up_buf = cl.Buffer(self.context, mem_flags.READ_WRITE,
                                self.LM.nbytes)
        # 40 bytes per (element, sample) = 5 values x 8-byte float64, matching
        # the (nElms, nSmp, 5) mapping of pinned_stress below.
        self.stress_buf = cl.Buffer(self.context, mem_flags.WRITE_ONLY,
                                    int(self.nElms * self.nSmp * 40))
        # Pinned memory on CPU.
        self.pinned_ures = cl.Buffer(
            self.context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
            self.LM.nbytes)
        self.pinned_u = cl.Buffer(
            self.context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
            self.LM.nbytes)
        self.pinned_up = cl.Buffer(
            self.context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
            self.LM.nbytes)
        self.pinned_stress = cl.Buffer(
            self.context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR,
            int(self.nElms * self.nSmp * 40))
        # Map to CPU: each mapping yields an array view of the corresponding
        # host-allocated buffer (the returned events are not used).
        self.srcURes, _eventSrcURes = cl.enqueue_map_buffer(
            self.queue, self.pinned_ures, map_flags.WRITE | map_flags.READ, 0,
            self.LM.shape, self.LM.dtype)
        self.srcU, _eventSrcU = cl.enqueue_map_buffer(
            self.queue, self.pinned_u, map_flags.WRITE | map_flags.READ, 0,
            self.LM.shape, self.LM.dtype)
        self.srcUP, _eventSrcUP = cl.enqueue_map_buffer(
            self.queue, self.pinned_up, map_flags.WRITE | map_flags.READ, 0,
            self.LM.shape, self.LM.dtype)
        self.stress, _eventStress = cl.enqueue_map_buffer(
            self.queue, self.pinned_stress, map_flags.READ, 0,
            (self.nElms, self.nSmp, 5), self.LM.dtype)

        # Use Taylor Expansion to get u_-1.
        # self.u and self.du are turned into column vectors so they broadcast
        # across the nSmp columns.
        self.srcU[:, :] = self.u[np.newaxis].transpose()
        self.srcUP[:, :] = self.srcU - self.dt * self.du[
            np.newaxis].transpose() + self.dt**2 * self.ddu / 2.0
        # copy up first to device
        # NOTE(review): neither returned event is waited on in this method.
        prep_up_event = cl.enqueue_copy(self.queue, self.up_buf, self.srcUP)
        prep_u_event = cl.enqueue_copy(self.queue, self.u_buf, self.srcU)
Example #17
0
}
'''

# Get device and context, create command queue and program
dev = utility.get_default_device()
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)

# Build program in the specified context using the kernel source code.
# '-Werror' is passed to the OpenCL compiler as a build option.
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=['-Werror'], devices=[dev])
except Exception:
    # Print the compiler's build log before propagating the failure.
    # Catching Exception (rather than a bare except) leaves KeyboardInterrupt
    # and SystemExit untouched.
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Create output buffer: one int4 vector on the host and a write-only device
# buffer of the same byte size.
out = cl.array.vec.zeros_int4()
buffer_out = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=out.itemsize)

# Enqueue kernel (with argument specified directly) as a single work-item
n_globals = (1, )
n_locals = None
prog.op_test(queue, n_globals, n_locals, buffer_out)

# Enqueue command to copy from buffer_out to host memory; the call blocks
# until the copy is complete.
cl.enqueue_copy(queue, dest=out, src=buffer_out, is_blocking=True)

print('Output: ' + str(out))
Example #18
0
    def __preexecute_kernels(self):
        """Create the host arrays and device buffers needed before running kernels.

        Allocates zeroed fitness and chromosome arrays on the host, uploads
        per-individual random seeds and the chromosome array to the device,
        prepares the fitness arguments, optionally sets up the elitism
        buffers, creates the best/worst index buffers, uploads the fitness
        array, and finally lets the sample chromosome prepare its own data and
        dumps kernel resource information.
        """
        mf = cl.mem_flags
        rw_copy = mf.READ_WRITE | mf.COPY_HOST_PTR

        def to_device(host_array):
            # Read/write device buffer initialised with a copy of host_array.
            return cl.Buffer(self.__ctx, rw_copy, hostbuf=host_array)

        # Host-side storage: one float32 fitness per individual and one int32
        # slot per gene of every individual.
        total_dna_size = self.__population * self.__sample_chromosome.dna_total_length
        self.__fitnesses = numpy.zeros(self.__population, dtype=numpy.float32)
        self.__np_chromosomes = numpy.zeros(total_dna_size, dtype=numpy.int32)

        # The random values are drawn on the host: one integer in
        # [0, 4294967295] per individual, stored as uint32.
        seeds = numpy.array(
            [random.randint(0, 4294967295) for _ in range(self.__population)],
            dtype=numpy.uint32)
        self.__dev_rnum = to_device(seeds)

        self.__dev_chromosomes = to_device(self.__np_chromosomes)
        self.__dev_fitnesses = cl.Buffer(self.__ctx, mf.WRITE_ONLY,
                                         self.__fitnesses.nbytes)
        self.__prepare_fitness_args()

        if self.__is_elitism_mode:
            self.__elites_updated = False

            # Chromosome data of the current elites.
            self.__current_elites = numpy.zeros(
                self.__sample_chromosome.dna_total_length * self.__elitism_top,
                dtype=numpy.int32)
            self.__dev_current_elites = to_device(self.__current_elites)

            # Chromosome data of the updated elites.
            self.__updated_elites = numpy.zeros(
                self.__sample_chromosome.dna_total_length * self.__elitism_top,
                dtype=numpy.int32)
            self.__dev_updated_elites = to_device(self.__updated_elites)

            # One float32 fitness per updated elite.
            self.__updated_elite_fitnesses = numpy.zeros(self.__elitism_top,
                                                         dtype=numpy.float32)
            self.__dev_updated_elite_fitnesses = to_device(
                self.__updated_elite_fitnesses)

        # For statistics
        self.__dev_best_indices = to_device(self.__best_indices)
        self.__dev_worst_indices = to_device(self.__worst_indices)

        # Upload the (zeroed) host fitness array into the fitness buffer.
        cl.enqueue_copy(self.__queue, self.__dev_fitnesses, self.__fitnesses)

        # Let the sample chromosome prepare its internal data structures.
        self.__sample_chromosome.preexecute_kernels(self.__ctx, self.__queue,
                                                    self.__population)

        # Dump information on kernel resource usage.
        self.__dump_kernel_info(self.__prg, self.__ctx,
                                self.__sample_chromosome)
Example #19
0
def test_image_3d(ctx_factory):
    """Check that a 3D float2 image built by image_from_array reads back intact.

    A random (3, 4, 2) float32 array with two channels is turned into an
    image, every texel is read with ``read_imagef`` by a kernel and written to
    a plain buffer, and the buffer is compared with the original array.

    The test is skipped when the device has no image support, when the
    platform vendor is "Intel(R) Corporation", or when the required image
    format is not supported.
    """
    def _skip(reason):
        # pytest is imported lazily, only when a skip is actually triggered.
        from pytest import skip
        skip(reason)

    context = ctx_factory()
    device, = context.devices

    if not device.image_support:
        _skip("images not supported on %s" % device)

    if device.platform.vendor == "Intel(R) Corporation":
        _skip("images crashy on %s" % device)

    kernel_source = """
        __kernel void copy_image_plane(
          __global float2 *dest,
          __read_only image3d_t src,
          sampler_t samp,
          int stride0,
          int stride1)
        {
          int d0 = get_global_id(0);
          int d1 = get_global_id(1);
          int d2 = get_global_id(2);
          /*
          const sampler_t samp =
            CLK_NORMALIZED_COORDS_FALSE
            | CLK_ADDRESS_CLAMP
            | CLK_FILTER_NEAREST;
            */
          dest[d0*stride0 + d1*stride1 + d2] = read_imagef(
                src, samp, (float4)(d2, d1, d0, 0)).xy;
        }
        """
    prg = cl.Program(context, kernel_source).build()

    # Reference data: the trailing axis holds the image channels.
    num_channels = 2
    shape = (3, 4, 2)
    a = np.random.random(shape + (num_channels,)).astype(np.float32)

    queue = cl.CommandQueue(context)
    try:
        a_img = cl.image_from_array(context, a, num_channels)
    except cl.RuntimeError as exc:
        # Only an unsupported image format turns into a skip; anything else
        # is a real failure.
        if exc.code != cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
            raise
        _skip("required image format not supported on %s" % device.name)

    a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)

    # Unnormalized coordinates, clamped addressing, nearest-texel filtering.
    samp = cl.Sampler(context, False,
            cl.addressing_mode.CLAMP,
            cl.filter_mode.NEAREST)

    # Strides of the two leading axes, expressed in float2 elements.
    stride0, stride1 = [
        np.int32(byte_stride/a.itemsize/num_channels)
        for byte_stride in a.strides[:2]]
    prg.copy_image_plane(queue, shape, None, a_dest, a_img, samp,
                         stride0, stride1)

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_dest)

    good = la.norm(a_result - a) == 0
    if good:
        return

    # Failure path: CPU devices get a more explanatory message.
    if queue.device.type & cl.device_type.CPU:
        assert good, ("The image implementation on your CPU CL platform '%s' "
                "returned bad values. This is bad, but common."
                % queue.device.platform)
    else:
        assert good
Example #20
0
import numpy as np
import sys

# Use the first GPU device of the first available OpenCL platform.
platforms = cl.get_platforms()
platform = platforms[0]
devs = platform.get_devices(cl.device_type.GPU)
dev = devs[0]
mf = cl.mem_flags
ctx = cl.Context([dev])
queue = cl.CommandQueue(ctx, dev)

# Host arrays: two int32 inputs holding 0..9 and a zeroed int32 output.
n1 = np.arange(10, dtype=np.int32)
n2 = np.arange(10, dtype=np.int32)
out = np.zeros(10, dtype=np.int32)

# Device buffers: the inputs are read-only copies of the host arrays, the
# output is write-only and sized like `out`.
b_n1, b_n2 = [
    cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_array)
    for host_array in (n1, n2)
]
b_out = cl.Buffer(ctx, mf.WRITE_ONLY, size=out.nbytes)

prog = cl.Program(
    ctx, """
__kernel void prog(
    __global int *n1,
    __global int *n2,
    __global int *out)
{
    int i = get_local_id(0);
    __local int a;
    a = i;
    barrier(CLK_LOCAL_MEM_FENCE);
    printf("%d:%d\\n", get_global_id(0), get_group_id(1));
    for name in img_names[1:]:
        img1 = Image.open(name)
        # img1 = img.convert("YCbCr")
        img_arr = numpy.asarray(img1).astype(numpy.uint8)

        host_arr = numpy.concatenate((host_arr, img_arr.reshape(-1)))

    host_arr = host_arr.astype(numpy.uint8)
    print dim
    new_dim = (len(img_names), dim[0], dim[1], dim[2])
    print "new dimensions are", new_dim

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_arr)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, host_arr.nbytes)

    print "[%d] Takes " % len(img_names), naturalsize(host_arr.nbytes)

    kernel_code = open("embed_1.cl").read() % (new_dim[1], new_dim[2],
                                               new_dim[3])
    prg1 = cl.Program(ctx, kernel_code).build()

    stime = time.time()
    prg1.embed_one(queue, (new_dim[0], new_dim[1], new_dim[2]), None, a_buf,
                   dest_buf)
    etime = time.time()

    print "[%d] GPU takes " % len(img_names), naturaltime(etime - stime)
Example #22
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl

# OpenCL context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Element-wise addition kernel: res_g[i] = a_g[i] + b_g[i].
prg = cl.Program(
    ctx, """
__kernel void sum(__global const float *a_g, __global const float *b_g, __global float *res_g) {
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()

# Host data: two random float32 vectors and an uninitialised result array of
# the same shape and dtype.
a_np = np.random.rand(50000).astype(np.float32)
b_np = np.random.rand(50000).astype(np.float32)
res_np = np.empty_like(a_np)

# Device buffers: read-only copies of the inputs, write-only output with the
# same byte size as one input.
mf = cl.mem_flags
a_g, b_g = [
    cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_array)
    for host_array in (a_np, b_np)
]
res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)

# One work-item per element, then copy the result back to the host.
prg.sum(queue, a_np.shape, None, a_g, b_g, res_g)
cl.enqueue_copy(queue, res_np, res_g)
Example #23
0
    def miningThread(self):
        """Run the mining loop until ``self.stop`` is set.

        Each iteration fetches work from ``self.workQueue`` when needed,
        launches the ``search`` kernel over ``globalThreads`` work-items
        starting at nonce offset ``base``, reads the output buffer back, adapts
        ``globalThreads`` to the measured rate, reports the hash rate through
        ``self.hashrate`` and pushes any hit to ``self.resultQueue``.

        NOTE(review): ``partial``, ``uint32``, ``bytereverse``, ``OUTPUT_SIZE``
        and ``TIMEOUT`` are defined outside this block; the comments below
        describe only how they are used here.
        NOTE(review): ``str.decode('hex')`` exists only on Python 2.
        """
        self.loadKernel()
        # Target duration of one kernel launch, in seconds.
        frame = 1.0 / self.frames
        # globalThreads is always kept a multiple of `unit`.
        unit = self.worksize * 256
        globalThreads = unit * 10

        queue = cl.CommandQueue(self.context)

        # Timestamps for rate pacing, rate reporting and ntime rolling.
        lastRatedPace = lastRated = lastNTime = time()
        base = lastHashRate = threadsRunPace = threadsRun = 0
        f = np.zeros(8, np.uint32)
        # The extra last slot, output[OUTPUT_SIZE], is tested below as the
        # "something was found" flag.
        output = np.zeros(OUTPUT_SIZE + 1, np.uint32)
        # USE_HOST_PTR: the buffer is backed by the `output` array itself.
        output_buf = cl.Buffer(self.context,
                               cl.mem_flags.WRITE_ONLY
                               | cl.mem_flags.USE_HOST_PTR,
                               hostbuf=output)

        work = None
        while True:
            if self.stop: return
            # Fetch new work when there is none or when more has been queued.
            if (not work) or (not self.workQueue.empty()):
                try:
                    # Block for at most one second waiting for work.
                    work = self.workQueue.get(True, 1)
                except Empty:
                    continue
                else:
                    if not work: continue

                    noncesLeft = self.hashspace
                    # Hex-decode the work fields and unpack them as uint32
                    # words: 16 from data[128:], 8 from midstate, 8 from target.
                    data = np.array(unpack('IIIIIIIIIIIIIIII',
                                           work['data'][128:].decode('hex')),
                                    dtype=np.uint32)
                    state = np.array(unpack('IIIIIIII',
                                            work['midstate'].decode('hex')),
                                     dtype=np.uint32)
                    target = np.array(unpack('IIIIIIII',
                                             work['target'].decode('hex')),
                                      dtype=np.uint32)
                    # partial() also receives `f`; its result is indexed below.
                    state2 = partial(state, data, f)

            # Launch the search kernel; `base` is passed as packed bytes.
            self.miner.search(queue, (globalThreads, ), (self.worksize, ),
                              state[0], state[1], state[2], state[3], state[4],
                              state[5], state[6], state[7], state2[1],
                              state2[2],
                              state2[3], state2[5], state2[6], state2[7],
                              pack('I', base), f[0], f[1], f[2], f[3], f[4],
                              f[5], f[6], f[7], output_buf)
            # The read is enqueued here; queue.finish() further down waits
            # for it before `output` is inspected.
            cl.enqueue_read_buffer(queue, output_buf, output)

            # Account for the nonces covered by this launch.
            noncesLeft -= globalThreads
            threadsRunPace += globalThreads
            threadsRun += globalThreads
            base = uint32(base + globalThreads)

            now = time()
            # Pacing: at most once per second, re-estimate the rate and, if it
            # moved by more than 10% relative to the last recorded rate, resize
            # globalThreads (a multiple of `unit`, at least one unit).
            # lastHashRate starts at 0, so the first check always resizes.
            t = now - lastRatedPace
            if (t > 1):
                rate = (threadsRunPace / t) / self.rateDivisor
                lastRatedPace = now
                threadsRunPace = 0
                r = lastHashRate / rate
                if r < 0.9 or r > 1.1:
                    globalThreads = max(
                        unit * int((rate * frame * self.rateDivisor) / unit),
                        unit)
                    lastHashRate = rate
            # Reporting: publish the hash rate every self.rate seconds.
            t = now - lastRated
            if (t > self.rate):
                self.hashrate(int((threadsRun / t) / self.rateDivisor))
                lastRated = now
                threadsRun = 0

            if self.updateTime == '':
                # Few nonces left: request an update and add a large offset to
                # noncesLeft so that this branch is not taken again for the
                # same work.
                if noncesLeft < TIMEOUT * globalThreads * self.frames:
                    self.update = True
                    noncesLeft += 0xFFFFFFFFFFFF
                # Offset already applied and the original range used up: drop
                # the work so the loop fetches new work.
                elif 0xFFFFFFFFFFF < noncesLeft < 0xFFFFFFFFFFFF:
                    self.sayLine('warning: job finished, miner is idle')
                    work = None

            # Wait for the kernel and the read-back to complete.
            queue.finish()

            if output[OUTPUT_SIZE]:
                # Hand copies of the relevant arrays to the result queue, then
                # clear the output array and write it back to the device.
                result = {}
                result['work'] = work
                result['data'] = np.array(data)
                result['state'] = np.array(state)
                result['target'] = target
                result['output'] = np.array(output)
                self.resultQueue.put(result)
                output.fill(0)
                cl.enqueue_write_buffer(queue, output_buf, output)

            # With updateTime set: at most once per second, increment data[1]
            # (via bytereverse before and after the increment) and recompute
            # state2.
            if self.updateTime != '' and now - lastNTime > 1:
                data[1] = bytereverse(bytereverse(data[1]) + 1)
                state2 = partial(state, data, f)
                lastNTime = now
Example #24
0
 def __call__(self, size):
     """Create and return a new ``cl.Buffer`` of ``size`` bytes.

     The buffer is created in ``self.context`` with ``self.flags``.
     """
     context, flags = self.context, self.flags
     return cl.Buffer(context, flags, size)
Example #25
0
def clFindRSAll(h_bf_npointsrange, h_pointset, h_query, h_vecradius, thelier,
                nchunks, pointdim, signallength, gpuid):
    """Run the ``kernelBFRSAllshared`` OpenCL kernel and fetch its result.

    The host arrays ``h_query``, ``h_pointset`` and ``h_vecradius`` are
    uploaded as read-only device buffers, the kernel from
    ``gpuKnnBF_kernel.cl`` is built and launched, and the kernel's output is
    copied into ``h_bf_npointsrange`` in place.

    Args:
        h_bf_npointsrange: host array that receives the kernel output; its
            ``nbytes`` also determines the size of the output buffer.
        h_pointset: host array uploaded as the kernel's second argument.
        h_query: host array uploaded as the kernel's first argument.
        h_vecradius: host array uploaded as the kernel's third argument.
        thelier: passed to the kernel as an int32 scalar.
        nchunks: number of chunks; ``signallength / nchunks`` (truncated to
            int) is passed to the kernel as the trial length.
        pointdim: passed to the kernel as an int32 scalar.
        signallength: passed to the kernel as an int32 scalar; also
            determines the global work size.
        gpuid: device index handed to ``_get_device`` and used to select the
            device from the returned list.

    Returns:
        Always 1.
    """
    triallength = int(signallength / nchunks)

    # Set up OpenCL
    my_gpu_devices, context, queue = _get_device(gpuid)
    device = my_gpu_devices[gpuid]

    # Check memory resources (both figures in whole megabytes).
    usedmem = int((h_query.nbytes + h_pointset.nbytes + h_vecradius.nbytes +
                   h_bf_npointsrange.nbytes) // 1024 // 1024)
    totalmem = int(device.global_mem_size // 1024 // 1024)

    if (totalmem * 0.90) < usedmem:
        print('WARNING: {0} Mb used from a total of {1} Mb. GPU could get '
              'without memory.'.format(usedmem, totalmem))

    # Every device buffer is tracked here so that all of them are released
    # even when building or launching the kernel fails.
    device_buffers = []
    try:
        # Create OpenCL buffers
        read_only_copy = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
        for host_array in (h_query, h_pointset, h_vecradius):
            device_buffers.append(
                cl.Buffer(context, read_only_copy, hostbuf=host_array))
        device_buffers.append(
            cl.Buffer(context, cl.mem_flags.READ_WRITE,
                      h_bf_npointsrange.nbytes))
        (d_bf_query, d_bf_pointset, d_bf_vecradius,
         d_bf_npointsrange) = device_buffers

        # Load and build the kernel; the source file is closed right after
        # it has been read.
        kernelLocation = resource_filename(__name__, 'gpuKnnBF_kernel.cl')
        with open(kernelLocation) as kernel_file:
            kernelsource = kernel_file.read()
        program = cl.Program(context, kernelsource).build()
        kernelBFRSAllshared = program.kernelBFRSAllshared
        # Arguments 5-8 are int32 scalars; the others are buffers or local
        # memory.
        kernelBFRSAllshared.set_scalar_arg_dtypes(
            [None, None, None, None, np.int32, np.int32, np.int32, np.int32,
             None])

        # Size of workitems and NDRange
        if signallength / nchunks < device.max_work_group_size:
            workitems_x = 8
        elif device.max_work_group_size < 256:
            workitems_x = device.max_work_group_size
        else:
            workitems_x = 256

        # Global size: the smallest multiple of workitems_x that covers
        # signallength (ceiling division). The previous round(...) + 1 form
        # could add one whole surplus work-group under true division.
        n_groups = int(-(-signallength // workitems_x))
        NDRange_x = workitems_x * n_groups

        # Local memory for rangesearch. Actually not used, better results
        # with private memory
        localmem = cl.LocalMemory(np.dtype(np.int32).itemsize * workitems_x)

        kernelBFRSAllshared(queue, (NDRange_x, ), (workitems_x, ), d_bf_query,
                            d_bf_pointset, d_bf_vecradius, d_bf_npointsrange,
                            pointdim, triallength, signallength, thelier,
                            localmem)
        queue.finish()

        # Download results
        cl.enqueue_copy(queue, h_bf_npointsrange, d_bf_npointsrange)
    finally:
        # Free buffers
        for device_buffer in reversed(device_buffers):
            device_buffer.release()

    return 1
def compress_image(img, num_centroids, iters):
    """Compress an image with K-Means colour clustering.

    Every iteration runs the ``FindClosestCentroid`` OpenCL kernel, which
    writes one centroid index per pixel, and then recomputes each centroid on
    the host as the mean colour of the pixels assigned to it. Centroids with
    no assigned pixels keep their previous value.

    Args:
        img: image.Image to be compressed.
        num_centroids: number of clusters; also passed to the kernel build as
            ``NUM_CENTROIDS``.
        iters: number of assign/update iterations to run.

    Returns:
        image.Image built from an array of shape ``img.shape()`` whose first
        three channels hold the colour of each pixel's centroid.
    """
    # Get OpenCL context and queue
    context, queue = setup_opencl()
    mf = cl.mem_flags

    # Gather image data
    img_data = img.raw_data(image.ImageDataFormat.FLATTENED_NORMALIZED)
    img_dims = img.shape()
    height, width = img_dims[0], img_dims[1]
    num_pixels = height * width

    # Create buffers
    imgBuffer = cl.Buffer(context,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=img_data)

    # NOTE(review): four floats are allocated per centroid, but the host code
    # below reads and writes centroids with a stride of three. Confirm which
    # layout kernels/image_kmeans.cl expects.
    centroids = np.random.random_sample((num_centroids * 4)).astype(np.float32)
    centroidsBuffer = cl.Buffer(context,
                                mf.READ_WRITE | mf.USE_HOST_PTR,
                                hostbuf=centroids)

    indices = np.zeros((num_pixels, )).astype(np.int32)
    indicesBuffer = cl.Buffer(context, mf.READ_WRITE, size=indices.nbytes)

    # Load and compile the kernel
    build_ops = [
        "-D NUM_CENTROIDS={0}".format(num_centroids),
        "-D IMG_WIDTH={0}".format(width)
    ]
    # Read the source inside a ``with`` block so the file handle is closed.
    with open('kernels/image_kmeans.cl') as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build(options=build_ops)

    # Get the kernel and set the arguments
    kernel = cl.Kernel(program, 'FindClosestCentroid')
    kernel.set_arg(0, imgBuffer)
    kernel.set_arg(1, centroidsBuffer)
    kernel.set_arg(2, indicesBuffer)

    # The image data is consumed as consecutive colour triples, one per pixel
    # (pixel i occupies entries 3*i .. 3*i + 2).
    pixels = np.asarray(img_data)[:3 * num_pixels].reshape(num_pixels, 3)
    # View (not a copy) of the stride-three part of ``centroids``: writing to
    # it updates the host array that is uploaded to the device below.
    centroid_rgb = centroids[:3 * num_centroids].reshape(num_centroids, 3)

    for _ in range(iters):
        # Assignment step on the device: one work-item per pixel.
        cl.enqueue_nd_range_kernel(queue, kernel, (height, width), None)
        cl.enqueue_copy(queue, indices, indicesBuffer, is_blocking=True)

        # Update step on the host: per-centroid pixel counts and colour sums.
        counts = np.bincount(indices, minlength=num_centroids)
        sums = np.column_stack([
            np.bincount(indices,
                        weights=pixels[:, channel],
                        minlength=num_centroids) for channel in range(3)
        ])
        # Empty clusters are skipped so they keep their previous centroid.
        occupied = counts > 0
        centroid_rgb[occupied] = (sums[occupied] /
                                  counts[occupied][:, np.newaxis])
        cl.enqueue_copy(queue, centroidsBuffer, centroids, is_blocking=True)

    # Replace every pixel with the (truncated) colour of its centroid.
    # NOTE(review): scaling by 256 maps a normalised value of 1.0 to 256,
    # which is outside the 8-bit range; 255 may have been intended.
    palette = (centroid_rgb * 256).astype(np.int64)
    compressed_img = np.zeros(img_dims)
    compressed_img[:, :, :3] = palette[indices].reshape(height, width, 3)
    return image.Image(image_data=compressed_img)
Example #27
0
    def mining_thread(self):
        """Run the OpenCL mining loop until ``self.should_stop`` is set.

        Each pass of the loop fetches work from ``self.work_queue`` when
        needed, loads the work-dependent kernel arguments, enqueues the kernel
        over ``global_threads`` work-items and reads back ``output``. When the
        last byte of ``output`` is non-zero, a result object describing the
        current work, with a copy of ``output`` as ``nonces``, is put on
        ``self.switch`` and the buffer is cleared again.

        The loop also:
          * skips kernel launches and sleeps ``self.cutoff_interval`` while
            the last temperature reading is at or above ``self.cutoff_temp``;
          * re-sizes ``global_threads`` when, after more than one second, the
            ratio of the previous rate to the new one leaves [0.9, 1.1];
          * reports through ``self.update_rate`` every ``self.options.rate``
            seconds.

        Returns:
            None, once ``self.should_stop`` is seen at the top of the loop.
        """

        def set_uint_args(first_index, values):
            # Scalars are handed to the kernel packed with the '<I' format
            # (little-endian unsigned 32-bit, assuming ``pack`` is
            # ``struct.pack``), in consecutive argument slots.
            for offset, value in enumerate(values):
                self.kernel.set_arg(first_index + offset, pack('<I', value))

        def set_derived_args(state2, f):
            # Arguments 8-13 and 15-19 come from partial()/calculateF() and
            # are reloaded whenever the work or its time field changes.
            set_uint_args(8, [state2[i] for i in (1, 2, 3, 5, 6, 7)])
            set_uint_args(15, [f[i] for i in range(5)])

        say_line('started OpenCL miner on platform %d, device %d (%s)',
                 (self.options.platform, self.device_index, self.device_name))

        # The vector build uses its own defines, rate divisor and hashspace.
        if self.vectors:
            self.defines, rate_divisor, hashspace = (vectors_definition(), 500,
                                                     0x7FFFFFFF)
        else:
            self.defines, rate_divisor, hashspace = ('', 1000, 0xFFFFFFFF)
        self.defines += (' -DOUTPUT_SIZE=' + str(self.output_size))
        self.defines += (' -DOUTPUT_MASK=' + str(self.output_size - 1))

        self.load_kernel()
        frame = 1.0 / max(self.frames, 3)
        unit = self.worksize * 256
        global_threads = unit * 10

        queue = cl.CommandQueue(self.context)

        last_rated_pace = last_rated = last_n_time = last_temperature = time()
        base = last_hash_rate = threads_run_pace = threads_run = 0
        # Host-side result buffer of (output_size + 1) four-byte slots; its
        # last byte is tested below to detect that the kernel wrote something.
        output = bytearray((self.output_size + 1) * 4)
        output_buffer = cl.Buffer(self.context,
                                  cl.mem_flags.WRITE_ONLY
                                  | cl.mem_flags.USE_HOST_PTR,
                                  hostbuf=output)
        self.kernel.set_arg(20, output_buffer)

        work = None
        temperature = 0
        while True:

            if self.should_stop: return

            sleep(self.frameSleep)

            # Fetch work when there is none yet, or when newer work is queued.
            if (not work) or (not self.work_queue.empty()):
                try:
                    work = self.work_queue.get(True, 1)

                except Empty:
                    continue
                else:
                    if not work: continue
                    self.nonces_left = hashspace

                    state = work.state
                    f = [0] * 8
                    state2 = partial(state, work.merkle_end, work.time,
                                     work.difficulty, f)
                    calculateF(state, work.merkle_end, work.time,
                               work.difficulty, f, state2)

                    set_uint_args(0, [state[i] for i in range(8)])
                    set_derived_args(state2, f)

            if temperature < self.cutoff_temp:
                # Argument 14 is the starting offset of this launch.
                self.kernel.set_arg(14, pack('<I', base))
                cl.enqueue_nd_range_kernel(queue, self.kernel,
                                           (global_threads, ),
                                           (self.worksize, ))
                self.nonces_left -= global_threads
                threads_run_pace += global_threads
                threads_run += global_threads
                base = uint32(base + global_threads)
            else:
                # Too hot: launch nothing, restart the pacing window and wait.
                threads_run_pace = 0
                last_rated_pace = time()
                sleep(self.cutoff_interval)

            now = time()
            if self.adapterIndex is not None:
                t = now - last_temperature
                # Re-read the temperature at most about once a second, or on
                # every pass while the cutoff is active.
                if temperature >= self.cutoff_temp or t > 1:
                    last_temperature = now
                    with adl_lock:
                        temperature = self.get_temperature()

            t = now - last_rated_pace
            if t > 1:
                rate = (threads_run_pace / t) / rate_divisor
                last_rated_pace = now
                threads_run_pace = 0
                # If no threads ran in this window (for example after a
                # thermal pause longer than one second) the rate is zero:
                # there is nothing to scale by, and dividing by it would raise
                # ZeroDivisionError, so the current launch size is kept.
                if rate > 0:
                    r = last_hash_rate / rate
                    if r < 0.9 or r > 1.1:
                        global_threads = max(
                            unit * int((rate * frame * rate_divisor) / unit),
                            unit)
                        last_hash_rate = rate

            t = now - last_rated
            if t > self.options.rate:
                self.update_rate(now, threads_run, t, work.targetQ,
                                 rate_divisor)
                last_rated = now
                threads_run = 0

            queue.finish()
            cl.enqueue_read_buffer(queue, output_buffer, output)
            queue.finish()

            if output[-1]:
                result = Object()
                result.header = work.header
                result.merkle_end = work.merkle_end
                result.time = work.time
                result.difficulty = work.difficulty
                result.target = work.target
                result.state = list(state)
                result.nonces = output[:]
                result.job_id = work.job_id
                result.extranonce2 = work.extranonce2
                result.server = work.server
                result.miner = self
                self.switch.put(result)
                # Clear the host copy and push the zeros back to the device.
                output[:] = b'\x00' * len(output)
                cl.enqueue_write_buffer(queue, output_buffer, output)
            if self.switch.should_stop:
                self.stop()
            if not self.switch.update_time:
                if self.nonces_left < 3 * global_threads * self.frames:
                    self.update = True
                    # Adding this large constant keeps the branch above from
                    # firing again for the same work (presumably so that new
                    # work is requested only once - confirm with the code
                    # that reads ``self.update``).
                    self.nonces_left += 0xFFFFFFFFFFFF

                elif 0xFFFFFFFFFFF < self.nonces_left < 0xFFFFFFFFFFFF:
                    say_line('warning: job finished, %s is idle', self.id())
                    work = None
            elif now - last_n_time > 1:
                # Advance the time field of the current work by one and
                # reload the kernel arguments that depend on it.
                work.time = bytereverse(bytereverse(work.time) + 1)
                state2 = partial(state, work.merkle_end, work.time,
                                 work.difficulty, f)
                calculateF(state, work.merkle_end, work.time, work.difficulty,
                           f, state2)
                set_derived_args(state2, f)
                last_n_time = now
                self.update_time_counter += 1
                if self.update_time_counter >= self.switch.max_update_time:
                    self.update = True
                    self.update_time_counter = 1
Example #28
0
class Runner:
    """Random Voronoi setup: seed points, region colours and an OpenCL program.

    The constructor draws ``REGIONS`` random seed points inside the given
    dimensions and one random RGB colour per region, then tries to prepare
    OpenCL through ``init_gpu``. ``use_cl`` tells whether that succeeded.
    """

    def __init__(self, dims):
        """Draw the random points and colours, then try to initialise OpenCL.

        Args:
            dims: sequence whose first two items are the width and the height.
        """
        import numpy as np
        self.np = np

        self.dims = dims
        self.width = dims[0]
        self.height = dims[1]
        self.regions = REGIONS
        # One (x, y) seed point per region, as rows of ``self.points``.
        nx = np.random.randint(0,
                               self.width,
                               size=self.regions,
                               dtype=np.int16)
        ny = np.random.randint(0,
                               self.height,
                               size=self.regions,
                               dtype=np.int16)
        self.points = np.dstack((nx, ny))[0]
        # One RGB colour per region.
        self.cols = np.random.randint(0,
                                      256,
                                      size=(self.regions, 3),
                                      dtype=np.uint8)
        self.use_cl = False
        self.init_gpu()

    def init_gpu(self):
        """Create the OpenCL context, queue, colour table and voronoi program.

        If pyopencl cannot be imported, the reason is printed together with
        ``LD_LIBRARY_PATH`` and the method returns with ``self.use_cl`` left
        unchanged. Otherwise ``use_cl`` is set to True and ``cl``, ``ctx``,
        ``queue``, ``lut``, ``lut_opencl`` and ``prg`` are stored on the
        instance.
        """
        try:
            import pyopencl as cl
            # Importing the submodule is what makes ``cl.array`` usable below.
            from pyopencl import array
        except Exception as e:
            # Best effort: report why OpenCL is unavailable and carry on
            # without it. Written so that it parses on Python 2.6+ and 3.
            import os
            print(os.getenv('LD_LIBRARY_PATH'))
            print(e)
            return
        self.use_cl = True
        self.cl = cl
        # The first device of the first platform is used.
        device = cl.get_platforms()[0].get_devices()[0]
        self.ctx = cl.Context([device])
        self.queue = cl.CommandQueue(self.ctx)
        print(device)
        # Colour look-up table with one spare entry after the region colours;
        # the spare entry keeps the zeros it was created with.
        # NOTE(review): the host table uses the signed ``char3`` type while
        # the kernel declares ``uchar4 *lut`` - confirm the layouts match.
        self.lut = self.np.zeros(self.regions + 1, cl.array.vec.char3)
        for idx, colour in enumerate(self.cols):
            self.lut[idx][0] = colour[0]
            self.lut[idx][1] = colour[1]
            self.lut[idx][2] = colour[2]
        self.lut_opencl = cl.Buffer(self.ctx,
                                    cl.mem_flags.READ_ONLY
                                    | cl.mem_flags.COPY_HOST_PTR,
                                    hostbuf=self.lut)

        # NOTE(review): in the kernel below the pixel index is computed as
        # ``y * height + x`` and ``h`` stays -1 when no point is closer than
        # the initial ``dmin``, in which case ``lut[-1]`` is read - confirm
        # both against the launch dimensions used by the caller.
        self.prg = cl.Program(
            self.ctx, """

               __kernel void voronoi(__global uchar4 *img,
                                     const __global ushort2 *points,
                                     __constant uchar4 *lut,
                                     ushort const height,
                                     ushort const width,
                                     ushort const regions)
               {
                   int x = get_global_id(0);
                   int y = get_global_id(1);
                   // int grid_width = get_num_groups(0) * get_local_size(0);
                    int index = y * height + x;
                   int h = -1;
                   float dmin = hypot((float)width -1, (float)height -1);
                   for(int i = 0; i < regions; i++) {
                      float d = hypot((float)points[i].x - y, (float)points[i].y - x);
                      if (d < dmin) {
                        dmin = d;
                        h = i;
                      }
                   }
                   img[index] = lut[h];
               }
           """).build()
Example #29
0
        kernel = content_file.read()
    prg = cl.Program(ctx, kernel).build()

    mixture_data_buff = np.zeros(3 * nmixtures * resolution, dtype=np.float32)
    mixture_data_buff[0:resolution * nmixtures] = 1.0 / nmixtures / 10
    mixture_data_buff[resolution * nmixtures + 1:2 * resolution *
                      nmixtures] = init_var

    params_list = [k, T, init_var, min_var]
    mog_params = np.array(params_list, dtype=np.float32)

    f = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8)

    #Allocate memory for variables on the device
    mixture_data_g = cl.Buffer(ctx,
                               mf.COPY_HOST_PTR,
                               hostbuf=mixture_data_buff)
    mog_params_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=mog_params)

    cap = cv2.VideoCapture(camera)

    time_begin = time.time()
    cnt = 0

    while (True):
        #Read in image
        ret, frame = cap.read()
        if ret:
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)
            img_g = cl.image_from_array(ctx, img, 4, mode='r', norm_int=True)
            img_shape = (img.shape[1], img.shape[0])
Example #30
0
    dlst = np.array([d1, d2, d2, d1, 0], dtype=np.float32)

    print 'dim (%d, %d, %d)' % (nx, ny, nz)
    total_bytes = nx * ny * nz * 4 * 12
    if total_bytes / (1024**3) == 0:
        print 'mem %d MB' % (total_bytes / (1024**2))
    else:
        print 'mem %1.2f GB' % (float(total_bytes) / (1024**3))

    # memory allocate
    f = np.zeros((nx, ny, nz), 'f', order='F')
    #f = np.random.randn(nx*ny*nz).astype(np.float32).reshape((nx,ny,nz),order='F')
    cf = np.ones_like(f) * (S / 24)

    mf = cl.mem_flags
    ex_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f)
    ey_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f)
    ez_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f)
    hx_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f)
    hy_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f)
    hz_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=f)

    cex_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf)
    cey_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf)
    cez_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf)
    chx_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf)
    chy_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf)
    chz_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=cf)

    # prepare kernels
    prg = cl.Program(ctx, kernels).build()