コード例 #1
0
ファイル: server.py プロジェクト: karray/n_body
def start_websocket(websocket, path):
    try:
        # initialize OpenCL
        ctx = cl.create_some_context()
        queue = cl.CommandQueue(ctx)
        # build kernal from file
        prg = cl.Program(ctx, open('kernal.cl').read()).build()

        # load data into numpy arrays on the host
        pos_host, vel_host = load_data()
        # create and initialize arrays on the device
        # copy initial postiton to device
        pos_dev = cl_array.to_device(queue, pos_host)
        # copy initial velocity to device
        vel_dev = cl_array.to_device(queue, vel_host)
        # allocate memory for new data on the devie
        pos_new_dev = cl_array.empty_like(pos_dev)
        vel_new_dev = cl_array.empty_like(vel_dev)

        # main loop
        while True:
            # run kernal with the work goup size of a number of bodies
            prg.nbody_simple(queue, (pos_host.shape[0],), None, pos_dev.data, vel_dev.data, pos_new_dev.data, vel_new_dev.data)
            # copy new position into host np.array (device -> host)
            pos_host = pos_new_dev.get()
            # update position and velocity on the device (device -> device)
            cl.enqueue_copy(queue, pos_dev.data, pos_new_dev.data)
            cl.enqueue_copy(queue, vel_dev.data, vel_new_dev.data)
            #send data to client
            yield from websocket.send(','.join(map(str,pos_host[:, :3].flatten())))
    finally:
      yield from websocket.close()
コード例 #2
0
def gs_mod_gpu(idata, itera=10, osize=256):

    cut = osize // 2

    pl = cl.get_platforms()[0]
    devices = pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,
                dtype=complex128)  #no funciona con "complex128"

    src = str(
        Template(KERNEL).render(
            double_support=all(has_double_support(dev) for dev in devices),
            amd_double_support=all(
                has_amd_double_support(dev) for dev in devices)))
    prg = cl.Program(ctx, src).build()

    idata_gpu = cl_array.to_device(queue,
                                   ifftshift(idata).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data, fdata_gpu.data)

    mask = exp(2.j * pi * random(idata.shape))
    mask[512 - cut:512 + cut, 512 - cut:512 + cut] = 0

    idata_gpu = cl_array.to_device(
        queue,
        ifftshift(idata + mask).astype("complex128"))
    fdata_gpu = cl_array.empty_like(idata_gpu)
    rdata_gpu = cl_array.empty_like(idata_gpu)
    error_gpu = cl_array.to_device(ctx, queue,
                                   zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data, fdata_gpu.data)

    e = 1000
    ea = 1000
    for i in range(itera):
        prg.norm(queue, fdata_gpu.shape, None, fdata_gpu.data)
        plan.execute(fdata_gpu.data, rdata_gpu.data, inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        norm1 = prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape, None, rdata_gpu.data, idata_gpu.data,
              error_gpu.data, int32(cut))

        e = sqrt(cl_array.sum(error_gpu).get()) / (2 * cut)

        #~ if e>ea:
        #~
        #~ break
        #~ ea=e
        plan.execute(rdata_gpu.data, fdata_gpu.data)

    fdata = fdata_gpu.get()
    fdata = ifftshift(fdata)
    fdata = exp(1.j * angle(fdata))
    return fdata
コード例 #3
0
ファイル: gs.py プロジェクト: ramezquitao/pyoptools
def gs_mod_gpu(idata,itera=10,osize=256):
    
    
    cut=osize//2
    
    pl=cl.get_platforms()[0]
    devices=pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,dtype=complex128) #no funciona con "complex128"
    
    src = str(Template(KERNEL).render(
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))
    prg = cl.Program(ctx,src).build() 
    

    idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    mask=exp(2.j*pi*random(idata.shape))
    mask[512-cut:512+cut,512-cut:512+cut]=0
    
    
    idata_gpu=cl_array.to_device(queue, ifftshift(idata+mask).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    error_gpu=cl_array.to_device(ctx, queue, zeros(idata_gpu.shape).astype("double"))
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    e=1000
    ea=1000
    for i in range (itera):
        prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
        plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True)
        #~ prg.norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        norm1=prg.norm1
        norm1.set_scalar_arg_dtypes([None, None, None, int32])
        norm1(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data,error_gpu.data, int32(cut))
        
        e= sqrt(cl_array.sum(error_gpu).get())/(2*cut)

        #~ if e>ea: 
           #~ 
            #~ break
        #~ ea=e
        plan.execute(rdata_gpu.data,fdata_gpu.data)
    
    fdata=fdata_gpu.get()
    fdata=ifftshift(fdata)
    fdata=exp(1.j*angle(fdata))
    return fdata
コード例 #4
0
    def __init__(self,
                 sino_shape,
                 slice_shape=None,
                 axis_position=None,
                 angles=None,
                 ctx=None,
                 devicetype="all",
                 platformid=None,
                 deviceid=None,
                 profile=False):
        OpenclProcessing.__init__(self,
                                  ctx=ctx,
                                  devicetype=devicetype,
                                  platformid=platformid,
                                  deviceid=deviceid,
                                  profile=profile)

        # Create a backprojector
        self.backprojector = Backprojection(sino_shape,
                                            slice_shape=slice_shape,
                                            axis_position=axis_position,
                                            angles=angles,
                                            ctx=self.ctx,
                                            profile=profile)
        # Create a projector
        self.projector = Projection(self.backprojector.slice_shape,
                                    self.backprojector.angles,
                                    axis_position=axis_position,
                                    detector_width=self.backprojector.num_bins,
                                    normalize=False,
                                    ctx=self.ctx,
                                    profile=profile)
        self.sino_shape = sino_shape
        self.is_cpu = self.backprojector.is_cpu
        # Arrays
        self.d_data = parray.empty(self.queue, sino_shape, dtype=np.float32)
        self.d_data.fill(0.0)
        self.d_sino = parray.empty_like(self.d_data)
        self.d_sino.fill(0.0)
        self.d_x = parray.empty(self.queue,
                                self.backprojector.slice_shape,
                                dtype=np.float32)
        self.d_x.fill(0.0)
        self.d_x_old = parray.empty_like(self.d_x)
        self.d_x_old.fill(0.0)

        self.add_to_cl_mem({
            "d_data": self.d_data,
            "d_sino": self.d_sino,
            "d_x": self.d_x,
            "d_x_old": self.d_x_old,
        })
コード例 #5
0
def cl_test_sobel(im):
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    sobel = Sobel(ctx, queue)

    im_buf = cl_array.to_device(queue, im)
    mag_buf = cl_array.empty_like(im_buf)
    imgx_buf = cl_array.empty_like(im_buf)
    imgy_buf = cl_array.empty_like(im_buf)

    sobel(im_buf, imgx_buf, imgy_buf, mag_buf)
    return (mag_buf.get(), imgx_buf.get(), imgy_buf.get())
コード例 #6
0
def cl_test_sobel(im):
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    sobel = Sobel(ctx, queue)

    im_buf = cl_array.to_device(queue, im)
    mag_buf = cl_array.empty_like(im_buf)
    imgx_buf = cl_array.empty_like(im_buf)
    imgy_buf = cl_array.empty_like(im_buf)

    sobel(im_buf, imgx_buf, imgy_buf, mag_buf)
    return (mag_buf.get(), imgx_buf.get(), imgy_buf.get())
コード例 #7
0
    def __init__(self,
                 sino_shape,
                 slice_shape=None,
                 axis_position=None,
                 angles=None,
                 ctx=None,
                 devicetype="all",
                 platformid=None,
                 deviceid=None,
                 profile=False):
        ReconstructionAlgorithm.__init__(self,
                                         sino_shape,
                                         slice_shape=slice_shape,
                                         axis_position=axis_position,
                                         angles=angles,
                                         ctx=ctx,
                                         devicetype=devicetype,
                                         platformid=platformid,
                                         deviceid=deviceid,
                                         profile=profile)
        self.compute_preconditioners()

        # Create a LinAlg instance
        self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)
        # Positivity constraint
        self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a",
                                              "a[i] = max(a[i], 0.0f);")
        # Projection onto the L-infinity ball of radius Lambda
        self.elwise_proj_linf = ElementwiseKernel(
            self.ctx, "float2* a, float Lambda",
            "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);",
            "elwise_proj_linf")
        # Additional arrays
        self.linalg.gradient(self.d_x)
        self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"])
        self.d_q = parray.empty_like(self.d_data)
        self.d_g = self.linalg.d_image
        self.d_tmp = parray.empty_like(self.d_x)
        self.d_p.fill(0)
        self.d_q.fill(0)
        self.d_tmp.fill(0)
        self.add_to_cl_mem({
            "d_p": self.d_p,
            "d_q": self.d_q,
            "d_tmp": self.d_tmp,
        })

        self.theta = 1.0
コード例 #8
0
ファイル: test_algorithm.py プロジェクト: shinsec/pyopencl
def test_elwise_kernel_with_options(ctx_factory):
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    in_gpu = clrand(queue, (50,), np.float32)

    options = ['-D', 'ADD_ONE']
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=options,
        )

    out_gpu = cl_array.empty_like(in_gpu)
    add_one(out_gpu, in_gpu)

    gt = in_gpu.get() + 1
    gv = out_gpu.get()
    assert la.norm(gv - gt) < 1e-5
コード例 #9
0
def get_fluid_source(params, G, P, D, out=None):
    """Calculate a small fluid source term, added to conserved variables for stability"""
    s = G.slices
    sh = G.shapes

    # T the old fashioned way: TODO Tmhd_full...
    T = cl_array.empty(params['queue'], sh.grid_tensor, dtype=np.float64)
    for mu in range(4):
        Tmhd_vec(params, G, P, D, mu, out=T[mu])

    if out is None:
        out = cl_array.empty_like(P)

    global gcon1_d, gcon2_d, gcon3_d
    if gcon1_d is None:
        gcon1_d = cl_array.to_device(params['queue'],
                                     (G.conn[:, :, 1, :, :] *
                                      G.gdet[Loci.CENT.value]).copy())
        gcon2_d = cl_array.to_device(params['queue'],
                                     (G.conn[:, :, 2, :, :] *
                                      G.gdet[Loci.CENT.value]).copy())
        gcon3_d = cl_array.to_device(params['queue'],
                                     (G.conn[:, :, 3, :, :] *
                                      G.gdet[Loci.CENT.value]).copy())

    # Contract mhd stress tensor with connection
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon1_d, out=out[s.U1])
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon2_d, out=out[s.U2])
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon3_d, out=out[s.U2])

    if 'profile' in params and params['profile']:
        evt.wait()

    return out
コード例 #10
0
def test_spirv(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (ctx._get_cl_version() < (2, 1) or cl.get_cl_header_version() < (2, 1)):
        pytest.skip("SPIR-V program creation only available "
                    "in OpenCL 2.1 and higher")

    n = 50000

    a_dev = cl.clrandom.rand(queue, n, np.float32)
    b_dev = cl.clrandom.rand(queue, n, np.float32)
    dest_dev = cl_array.empty_like(a_dev)

    with open("add-vectors-%d.spv" % queue.device.address_bits,
              "rb") as spv_file:
        spv = spv_file.read()

    prg = cl.Program(ctx, spv).build()
    if (not prg.all_kernels()
            and queue.device.platform.name.startswith("AMD Accelerated")):
        pytest.skip(
            "SPIR-V program creation on AMD did not result in any kernels")

    prg.sum(queue, a_dev.shape, None, a_dev.data, b_dev.data, dest_dev.data)

    assert la.norm((dest_dev - (a_dev + b_dev)).get()) < 1e-7
コード例 #11
0
 def _test_desparsification(self, input_on_device, output_on_device, dtype):
     current_config = "input on device: %s, output on device: %s, dtype: %s" % (
         str(input_on_device), str(output_on_device), str(dtype))
     logger.debug("CSR: %s" % current_config)
     # Generate data and reference CSR
     array = generate_sparse_random_data(shape=(512, 511), dtype=dtype)
     ref_sparse = self.compute_ref_sparsification(array)
     # De-sparsify on device
     csr = CSR(array.shape, dtype=dtype, max_nnz=ref_sparse.nnz)
     if input_on_device:
         data = parray.to_device(csr.queue, ref_sparse.data)
         indices = parray.to_device(csr.queue, ref_sparse.indices)
         indptr = parray.to_device(csr.queue, ref_sparse.indptr)
     else:
         data = ref_sparse.data
         indices = ref_sparse.indices
         indptr = ref_sparse.indptr
     if output_on_device:
         d_arr = parray.empty_like(csr.array)
         d_arr.fill(0)
         output = d_arr
     else:
         output = None
     arr = csr.densify(data, indices, indptr, output=output)
     if output_on_device:
         arr = arr.get()
     # Compare
     self.assertTrue(
         np.allclose(arr.reshape(array.shape), array),
         "something wrong with densified data (%s)" % current_config)
コード例 #12
0
ファイル: bridge.py プロジェクト: kadupitiya/openpilot
    def _cam_callback(self, image, frame_id, pub_type, rgb_type, yuv_type):
        img = np.frombuffer(image.raw_data, dtype=np.dtype("uint8"))
        img = np.reshape(img, (H, W, 4))
        img = img[:, :, [0, 1, 2]].copy()

        # convert RGB frame to YUV
        rgb = np.reshape(img, (H, W * 3))
        rgb_cl = cl_array.to_device(self.queue, rgb)
        yuv_cl = cl_array.empty_like(rgb_cl)
        self.krnl(self.queue, (np.int32(self.Wdiv4), np.int32(self.Hdiv4)),
                  None, rgb_cl.data, yuv_cl.data).wait()
        yuv = np.resize(yuv_cl.get(), np.int32(rgb.size / 2))
        eof = int(frame_id * 0.05 * 1e9)

        # TODO: remove RGB send once the last RGB vipc subscriber is removed
        self.vipc_server.send(rgb_type, img.tobytes(), frame_id, eof, eof)
        self.vipc_server.send(yuv_type, yuv.data.tobytes(), frame_id, eof, eof)

        dat = messaging.new_message(pub_type)
        msg = {
            "frameId": image.frame,
            "transform": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]
        }
        setattr(dat, pub_type, msg)
        pm.send(pub_type, dat)
コード例 #13
0
    def setup_arrays(self, nrays, nsamples, cutoff):

        prog_params = (nrays, nsamples, cutoff)

        if prog_params in self.array_cache:
            return self.array_cache[prog_params]

        else:
            arrays = ArraySet()
            arrays.scratch = cla.empty(self.queue,
                                 (nsamples, nrays),
                                 dtype=np.float32,
                                 allocator=self.memory_pool)

            arrays.result = cla.empty(self.queue,
                                (nrays,),
                                dtype=np.int32,
                                allocator=self.memory_pool)

            arrays.pre_cutoff = cla.empty(self.queue,
                                    (nrays, cutoff),
                                    dtype=np.float32,
                                    allocator=self.memory_pool)

            arrays.pre_cutoff_squared = cla.empty_like(arrays.pre_cutoff)

            arrays.idx = cla.arange(self.queue, 0, cutoff * nrays, 1,
                                    dtype=np.int32,
                                    allocator=self.memory_pool)

            self.array_cache[prog_params] = arrays
            return arrays
コード例 #14
0
def mul(a_, b_):
    a = np.array(a_).astype(np.int32)
    b = np.array(b_).astype(np.int32)
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx)
    a_dev = cl_array.to_device(queue, a)
    b_dev = cl_array.to_device(queue, b)
    sizea_dev = cl_array.to_device(queue, np.array([a.size], dtype=np.int32))
    sizeb_dev = cl_array.to_device(queue, np.array([b.size], dtype=np.int32))
    dest_dev = cl_array.empty_like(cl_array.to_device(queue, np.zeros(a.size + b.size - 1, dtype=np.int32)))
    prg = cl.Program(ctx, """
        __kernel void mul(__global const int *a, __global const int *b, __global const int *sizea, __global const int *sizeb,  __global int *c)
        {
            int size_a = sizea[0];
            int size_b = sizeb[0];
            for(int i=0; i<size_a; i++) {
                for(int j=0; j<size_b; j++) {
                    c[i+j] += a[i] * b[j];
                }
            }
        }
        """).build()
    prg.mul(queue, a.shape, None, a_dev.data, b_dev.data, sizea_dev.data, sizeb_dev.data, dest_dev.data)

    #print(dest_dev, sub)
    return np.trim_zeros(dest_dev.get(), 'b').tolist()
コード例 #15
0
def get_deviders(num):
    a = np.array([num]).astype(np.int32)
    platform = cl.get_platforms()[0]
    device = platform.get_devices()[0]
    ctx = cl.Context([device])
    queue = cl.CommandQueue(ctx)
    a_dev = cl_array.to_device(queue, a)
    dest_dev = cl_array.empty_like(cl_array.to_device(queue, np.zeros((2 * a[0]), dtype=np.int32)))
    prg = cl.Program(ctx, """
        __kernel void sum(__global const int *a, __global int *c)
        {
            int i = 1;
            int n = a[0];
            int j = 0;
            while(i <= sqrt((float) n))
            {
                if(n%i==0) {
                    c[j] = i;
                    j++;
                    c[j] = -i;
                    j++;
                    if (i != (n / i)) {
                        c[j] = n/i;
                        j++;
                        c[j] = -n/i;
                        j++;
                    }
                } 
                i++;
            }
        }
        """).build()
    prg.sum(queue, a.shape, None, a_dev.data, dest_dev.data)
    return np.trim_zeros(dest_dev.get(), 'b').tolist()
コード例 #16
0
ファイル: test_algorithm.py プロジェクト: zeta1999/pyopencl
def test_elwise_kernel_with_options(ctx_factory):
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    in_gpu = clrand(queue, (50,), np.float32)

    options = ["-D", "ADD_ONE"]
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=options,
        )

    out_gpu = cl_array.empty_like(in_gpu)
    add_one(out_gpu, in_gpu)

    gt = in_gpu.get() + 1
    gv = out_gpu.get()
    assert la.norm(gv - gt) < 1e-5
コード例 #17
0
    def __call__(self,
                 input_buf,
                 row_buf,
                 col_buf,
                 output_buf,
                 intermed_buf=None):

        (h, w) = input_buf.shape
        r = row_buf.shape[0]
        c = col_buf.shape[0]

        if intermed_buf is None:
            intermed_buf = cl_array.empty_like(input_buf)

        self.program.separable_correlation_row(self.queue,
                                               (h, w),
                                               None,
                                               intermed_buf.data,
                                               input_buf.data,
                                               np.int32(w), np.int32(h),
                                               row_buf.data,
                                               np.int32(r))

        self.program.separable_correlation_col(self.queue,
                                               (h, w),
                                               None,
                                               output_buf.data,
                                               intermed_buf.data,
                                               np.int32(w), np.int32(h),
                                               col_buf.data,
                                               np.int32(c))
コード例 #18
0
 def rev_grad(self, valuation):
     cache = {}
     res = self._evaluate(valuation, cache)
     adjoint = clarray.empty_like(res).fill(1.0)
     grad = {key: 0 for key in valuation}
     self._rev_grad(valuation, adjoint, grad, cache)
     return grad
コード例 #19
0
def test_RungeKutta(initials, t0, t1, derived_function, expected,
                    delta_absolute_error, absolute_error, relative_error,
                    expected_error_runge_kutta):
    sut_derivedFn = f'''double4 derivedFn(double4* Y, double t)
    {{
        return (double4)({derived_function});
    }}'''

    sut = cl.elementwise.ElementwiseKernel(
        context,
        'double4 *y, double4 *y0, double t, double dt, double* error_runge_kutta',
        'double4 temp_y = y[0]; double4 temp_y0 = y0[0]; *error_runge_kutta = RungeKutta(&temp_y, &temp_y0, t, dt); y[0] = temp_y',
        name='sut',
        preamble=f'{sut_derivedFn}{rk_pd_4d.rungeKutta}')

    y0 = cl_array.to_device(queue, initials)
    y = cl_array.empty_like(y0)
    error_runge_kutta = cl_array.to_device(queue,
                                           numpy.array([numpy.double(0.0)]))

    sut(y, y0, t0, t1, error_runge_kutta)

    assert error_runge_kutta.get() == pytest.approx(expected_error_runge_kutta,
                                                    abs=delta_absolute_error)

    numpy.testing.assert_allclose(numpy.array(y.get()[0].tolist()),
                                  numpy.array(expected[0].tolist()),
                                  rtol=relative_error,
                                  atol=absolute_error)
コード例 #20
0
ファイル: fixup.py プロジェクト: bprather/clHARM
def fixup_ceiling(params, fflag, G, P):
    s = G.slices

    # First apply ceilings:
    # 1. Limit gamma with respect to normal observer

    # TODO is there a softer touch here?
    gamma = mhd_gamma_calc(params['queue'], G, P, Loci.CENT)
    f = cl_array.if_positive(gamma - params['gamma_max'],
                             ((params['gamma_max']**2 - 1.) / (gamma**2 - 1.))**(1/2),
                             cl_array.empty_like(gamma).fill(1))
    P[s.U1] *= f
    P[s.U2] *= f
    P[s.U3] *= f

    # 2. Limit KTOT
    if params['electrons']:
        # Keep to KTOTMAX by controlling u, to avoid anomalous cooling from funnel wall
        # TODO This operates on last iteration's KTOT, meaning the effective value can escape the ceiling. Rethink
        u_max_ent = params['entropy_max'] * (P[s.RHO] ** params['gam'])/(params['gam']-1.)
        P[s.UU] = cl_array.if_positive(P[s.UU] - u_max_ent, u_max_ent, P[s.UU])
        P[s.KTOT] = cl_array.if_positive(P[s.KTOT] - params['entropy_max'], params['entropy_max'], P[s.KTOT])
        pass

    # TODO keep track of hits
    #fflag |= cl_array.if_positive(gamma - params['gamma_max'], temp.fill(HIT_FLOOR_GAMMA), zero)

    return P, fflag
コード例 #21
0
def run(x, approx):
    if with_pyopencl:
        ctx = cl.create_some_context()
        queue = cl.CommandQueue(ctx)

        queue.finish()
        tree = approx.tree_1d

        # initialize variables
        x_dev = cl_array.to_device(queue, x)
        tree_dev = cl_array.to_device(queue, tree)
        y_dev = cl_array.empty_like(x_dev)

        # build the code to run from given string
        dt = approx.dtype
        declaration = "__kernel void sum(__global " + dt + "  *tree, "
        declaration += "__global " + dt + " *x, __global " + dt + " *y) "
        code = declaration + '{' + approx.code + '}'

        # compile code and then execute it
        prg = cl.Program(ctx, code).build()

        prg.sum(queue, x_dev.shape, None, tree_dev.data, x_dev.data,
                y_dev.data)

        queue.finish()

        return y_dev.get()
    else:
        raise ValueError("Function requires pyopencl installation.")
コード例 #22
0
ファイル: step.py プロジェクト: mfkiwl/pystella
    def get_tmp_arrays_like(self, **kwargs):
        """
        Allocates required temporary arrays matching those passed via keyword.

        :returns: A :class:`dict` of named arrays, suitable for passing via
            dictionary expansion.

        .. versionadded:: 2020.2
        """

        tmp_arrays = {}
        for name in self.dof_names:
            f = kwargs[name]
            tmp_name = gen_tmp_name(name)
            import pyopencl.array as cla
            if isinstance(f, cla.Array):
                tmp_arrays[tmp_name] = cla.empty_like(f)
            elif isinstance(f, np.ndarray):
                tmp_arrays[tmp_name] = np.empty_like(f)
            else:
                raise ValueError(f"Could not generate tmp array for {f}"
                                 f"of type {type(f)}")
            tmp_arrays[tmp_name][...] = 0.

        return tmp_arrays
コード例 #23
0
ファイル: clidt.py プロジェクト: KermMartian/TopoMC
    def genindices(self, arrayin):
        """Generate indices for splitting array."""
        retval = dict()
        # run the 'trim' program
        # need to split if it's too long!
        splitlist = tuple([x for x in xrange(CLIDT.indexmaxsize, arrayin.shape[0], CLIDT.indexmaxsize)])
        indexinc = 0
        for chunk in np.vsplit(arrayin, splitlist):
            chunkarr = cla.to_device(self.queue, np.asarray(chunk, dtype=np.int32))
            template = cla.empty_like(chunkarr)
            event = self.program.trim(
                self.queue, chunkarr.shape, None, chunkarr.data, template.data, np.int32(self.split)
            )
            try:
                event.wait()
            except cl.RuntimeError, inst:
                errstr = inst.__str__()
                if errstr == "clWaitForEvents failed: out of resources":
                    print "OpenCL timed out, probably due to the display manager."
                    print "Disable your display manager and try again!"
                    print "If that does not work, rerun with OpenCL disabled."
                else:
                    raise cl.RuntimeError, inst
                sys.exit(1)

            for index, elem in enumerate(template.get()):
                splitkey = tuple([x for x in elem])
                try:
                    retval[splitkey]
                except KeyError:
                    retval[splitkey] = []
                retval[splitkey].append(index + indexinc)
            indexinc += CLIDT.indexmaxsize
コード例 #24
0
    def setup_arrays(self, nrays, nsamples, cutoff):

        prog_params = (nrays, nsamples, cutoff)

        if prog_params in self.array_cache:
            return self.array_cache[prog_params]

        else:
            arrays = ArraySet()
            arrays.scratch = cla.empty(self.queue, (nsamples, nrays),
                                       dtype=np.float32,
                                       allocator=self.memory_pool)

            arrays.result = cla.empty(self.queue, (nrays, ),
                                      dtype=np.int32,
                                      allocator=self.memory_pool)

            arrays.pre_cutoff = cla.empty(self.queue, (nrays, cutoff),
                                          dtype=np.float32,
                                          allocator=self.memory_pool)

            arrays.pre_cutoff_squared = cla.empty_like(arrays.pre_cutoff)

            arrays.idx = cla.arange(self.queue,
                                    0,
                                    cutoff * nrays,
                                    1,
                                    dtype=np.int32,
                                    allocator=self.memory_pool)

            self.array_cache[prog_params] = arrays
            return arrays
コード例 #25
0
ファイル: _linalg.py プロジェクト: dtbinh/T1_python-exercises
def opencl_cross(a, b):
    a     = asarray(a, dtype=float32)
    b     = asarray(b, dtype=float32)
    ctx   = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    a_dev = cl_array.to_device(queue, a)
    b_dev = cl_array.to_device(queue, b)
    c_dev = cl_array.empty_like(a_dev)
    code  = """
    __kernel void crossproduct(__global const float *a, __global const float *b, __global float *c)
    {
        int i = get_global_id(0);

        __global const float * a_ = &a[i * 3];
        __global const float * b_ = &b[i * 3];
        __global float * c_ = &c[i * 3];

        c_[0] = a_[1] * b_[2] - a_[2] * b_[1];
        c_[1] = a_[2] * b_[0] - a_[0] * b_[2];
        c_[2] = a_[0] * b_[1] - a_[1] * b_[0];
    }
    """
    prg = cl.Program(ctx, code).build()
    prg.crossproduct(queue, a.shape, None, a_dev.data, b_dev.data, c_dev.data)
    return c_dev.get()
コード例 #26
0
ファイル: bridge.py プロジェクト: ntoenis/openpilot
    def cam_callback(self, image):
        img = np.frombuffer(image.raw_data, dtype=np.dtype("uint8"))
        img = np.reshape(img, (H, W, 4))
        img = img[:, :, [0, 1, 2]].copy()

        # convert RGB frame to YUV
        rgb = np.reshape(img, (H, W * 3))
        rgb_cl = cl_array.to_device(self.queue, rgb)
        yuv_cl = cl_array.empty_like(rgb_cl)
        self.krnl(self.queue, (np.int32(self.Wdiv4), np.int32(self.Hdiv4)),
                  None, rgb_cl.data, yuv_cl.data).wait()
        yuv = np.resize(yuv_cl.get(), np.int32((rgb.size / 2)))
        eof = self.frame_id * 0.05

        # TODO: remove RGB send once the last RGB vipc subscriber is removed
        self.vipc_server.send(VisionStreamType.VISION_STREAM_RGB_BACK,
                              img.tobytes(), self.frame_id, eof, eof)
        self.vipc_server.send(VisionStreamType.VISION_STREAM_ROAD,
                              yuv.data.tobytes(), self.frame_id, eof, eof)

        dat = messaging.new_message('roadCameraState')
        dat.roadCameraState = {
            "frameId": image.frame,
            "transform": [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]
        }
        pm.send('roadCameraState', dat)
        self.frame_id += 1
コード例 #27
0
ファイル: _nlinvns_3D.py プロジェクト: philippgualdi/PyQMRI
def _nsIfft(M, fft):
    # K = np.fft.ifftn(M, axes=(-3,-2,-1), norm="ortho")
    tmp = cla.to_device(fft.queue, M)
    K = cla.empty_like(tmp)
    fft.FFTH(K, tmp)
    # K = fft(M)
    return K.get()
コード例 #28
0
def square():
  if not slicer.util.getNode('moving'):
    load_default_volume()

  a = slicer.util.array('moving').flatten()

  ctx = cl.create_some_context()
  queue = cl.CommandQueue(ctx)

  mf = cl.mem_flags
  a_dev = cl_array.to_device(queue, a)
  dest_dev = cl_array.empty_like(a_dev)

  prg = cl.Program(ctx, """
      __kernel void square(__global const short *a, __global short *c)
      {
        int gid = get_global_id(0);
        c[gid] = a[gid] * a[gid];
      }
      """).build()

  prg.square(queue, a.shape, None, a_dev.data, dest_dev.data)

  diff = ( dest_dev - (a_dev*a_dev) ).get()
  norm = la.norm(diff)
  print(norm)
コード例 #29
0
    def __call__(self,
                 input_buf,
                 row_buf,
                 col_buf,
                 output_buf,
                 intermed_buf=None):

        (h, w) = input_buf.shape
        r = row_buf.shape[0]
        c = col_buf.shape[0]

        if intermed_buf is None:
            intermed_buf = cl_array.empty_like(input_buf)

        self.program.separable_correlation_row(self.queue,
                                               (h, w),
                                               None,
                                               intermed_buf.data,
                                               input_buf.data,
                                               np.int32(w), np.int32(h),
                                               row_buf.data,
                                               np.int32(r))

        self.program.separable_correlation_col(self.queue,
                                               (h, w),
                                               None,
                                               output_buf.data,
                                               intermed_buf.data,
                                               np.int32(w), np.int32(h),
                                               col_buf.data,
                                               np.int32(c))
コード例 #30
0
 def _evaluate(self, valuation, cache):
     if id(self) not in cache:
         op = self.ops[0]._evaluate(valuation, cache)
         val = clarray.empty_like(op)
         ev = self.eval_kernel(op, val)
         ev.wait()
         cache[id(self)] = val
     return cache[id(self)]
コード例 #31
0
ファイル: scan.py プロジェクト: lichinka/cai
        def __call__(self, input_ary, output_ary=None, allocator=None,
                stream=None):
            allocator = allocator or input_ary.allocator

            if output_ary is None:
                output_ary = input_ary

            if isinstance(output_ary, (str, unicode)) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary, allocator=allocator)

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                        "deal with non-contiguous arrays")

            n, = input_ary.shape

            if not n:
                return output_ary

            unit_size  = self.scan_wg_size * self.scan_wg_seq_batches
            dev = driver.Context.get_device()
            max_groups = 3*dev.get_attribute(
                    driver.device_attribute.MULTIPROCESSOR_COUNT)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                    n, unit_size, max_groups);

            block_results = allocator(self.dtype.itemsize*num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl.prepared_async_call(
                    (num_groups, 1), (self.scan_wg_size, 1, 1), stream,
                    input_ary.gpudata,
                    n, interval_size,
                    output_ary.gpudata,
                    block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl.prepared_async_call(
                    (1,1), (self.scan_wg_size, 1, 1), stream,
                    block_results,
                    num_groups, interval_size,
                    block_results,
                    dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl.prepared_async_call(
                    (num_groups, 1,), (self.update_wg_size, 1, 1), stream,
                    output_ary.gpudata,
                    n, interval_size,
                    block_results)

            return output_ary
コード例 #32
0
        def __call__(self,
                     input_ary,
                     output_ary=None,
                     allocator=None,
                     stream=None):
            allocator = allocator or input_ary.allocator

            if output_ary is None:
                output_ary = input_ary

            if isinstance(output_ary, (str, unicode)) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary,
                                                 allocator=allocator)

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                                   "deal with non-contiguous arrays")

            n, = input_ary.shape

            if not n:
                return output_ary

            unit_size = self.scan_wg_size * self.scan_wg_seq_batches
            dev = driver.Context.get_device()
            max_groups = 3 * dev.get_attribute(
                driver.device_attribute.MULTIPROCESSOR_COUNT)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                n, unit_size, max_groups)

            block_results = allocator(self.dtype.itemsize * num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl.prepared_async_call(
                (num_groups, 1), (self.scan_wg_size, 1, 1), stream,
                input_ary.gpudata, n, interval_size, output_ary.gpudata,
                block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl.prepared_async_call(
                (1, 1), (self.scan_wg_size, 1, 1), stream, block_results,
                num_groups, interval_size, block_results, dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl.prepared_async_call((
                num_groups,
                1,
            ), (self.update_wg_size, 1, 1), stream, output_ary.gpudata, n,
                                                      interval_size,
                                                      block_results)

            return output_ary
コード例 #33
0
def get_state(params, G, P, loc=Loci.CENT, out=None):
    """Calculate ucon, ucov, bcon, bcov from primitive variables
    Returns a dict of state variables
    """
    # TODO make this a fusion of kernels?  Components are needed in places
    if out is None:
        out = {}
        out['ucon'] = cl_array.empty(params['queue'],
                                     G.shapes.grid_vector,
                                     dtype=np.float64)
        out['ucov'] = cl_array.empty_like(out['ucon'])
        out['bcon'] = cl_array.empty_like(out['ucon'])
        out['bcov'] = cl_array.empty_like(out['ucon'])
    ucon_calc(params, G, P, loc, out=out['ucon'])
    G.lower_grid(out['ucon'], loc, out=out['ucov'])
    bcon_calc(params, G, P, out['ucon'], out['ucov'], out=out['bcon'])
    G.lower_grid(out['bcon'], loc, out=out['bcov'])
    return out
コード例 #34
0
ファイル: scan.py プロジェクト: lichinka/cai
        def __call__(self, input_ary, output_ary=None, allocator=None,
                queue=None):
            allocator = allocator or input_ary.allocator
            queue = queue or input_ary.queue or output_ary.queue

            if output_ary is None:
                output_ary = input_ary

            if isinstance(output_ary, (str, unicode)) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary, allocator=allocator)

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                        "deal with non-contiguous arrays")
 
            n, = input_ary.shape

            if not n:
                return output_ary

            unit_size  = self.scan_wg_size * self.scan_wg_seq_batches
            max_groups = 3*max(dev.max_compute_units for dev in self.devices)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                    n, unit_size, max_groups);

            block_results = allocator(self.dtype.itemsize*num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl(
                    queue, (num_groups*self.scan_wg_size,), (self.scan_wg_size,),
                    input_ary.data,
                    n, interval_size,
                    output_ary.data,
                    block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl(
                    queue, (self.scan_wg_size,), (self.scan_wg_size,),
                    block_results,
                    num_groups, interval_size,
                    block_results,
                    dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl(
                    queue, (num_groups*self.update_wg_size,), (self.update_wg_size,),
                    output_ary.data,
                    n, interval_size,
                    block_results)

            return output_ary
コード例 #35
0
    def setup_device(self, imshape):

        print('Setting up with imshape = %s' % (str(imshape)))

        self.cached_shape = imshape

        self.clIm = cla.Array(self.q, imshape, np.float32)
        self.clm = cla.empty_like(self.clIm)
        self.clx = cla.empty_like(self.clIm)
        self.cly = cla.empty_like(self.clIm)
        self.clO = cla.zeros_like(self.clIm)
        self.clM = cla.zeros_like(self.clIm)
        self.clF = cla.empty_like(self.clIm)
        self.clS = cla.empty_like(self.clIm)
        self.clThisS = cla.empty_like(self.clIm)
        self.clScratch = cla.empty_like(self.clIm)

        self.radial_prg = pyopencl.Program(self.ctx, RADIAL_PROGRAM).build()

        self.sobel = Sobel(self.ctx, self.q)

        #self.sepcorr2d = NaiveSeparableCorrelation(self.ctx, self.q)
        self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q)

        self.accum = ElementwiseKernel(self.ctx,
                                       'float *a, float *b',
                                       'a[i] += b[i]')

        self.norm_s = ElementwiseKernel(self.ctx,
                                        'float *s, const float nRadii',
                                        's[i] = -1 * s[i] / nRadii',
                                        'norm_s')

        self.accum_s = ElementwiseKernel(self.ctx,
                                         'float *a, float *b, const float nr',
                                         'a[i] -= b[i] / nr')

        self.gaussians = {}
        self.gaussian_prgs = {}

        self.minmax = MinMaxKernel(self.ctx, self.q)

        # starburst storage

        clImageFormat = cl.ImageFormat(cl.channel_order.R,
                                       cl.channel_type.FLOAT)

        self.clIm2D = cl.Image(self.ctx,
                               mf.READ_ONLY,
                               clImageFormat,
                               imshape)

        # Create sampler for sampling image object
        self.imSampler = cl.Sampler(self.ctx,
                                    False,  # Non-normalized coordinates
                                    cl.addressing_mode.CLAMP_TO_EDGE,
                                    cl.filter_mode.LINEAR)

        self.cl_find_ray_boundaries = FindRayBoundaries(self.ctx, self.q)
コード例 #36
0
ファイル: Main.py プロジェクト: favilo/4space-nbody
    def test(self):
        a = numpy.random.randn(4, 4).astype(numpy.float32)
        b = numpy.random.randn(4, 4).astype(numpy.float32)
        c = numpy.random.randn(4, 4).astype(numpy.float32)

        a_gpu = cl_array.to_device(self.ctx, queue, a)
        b_gpu = cl_array.to_device(self.ctx, queue, b)
        c_gpu = cl_array.to_device(self.ctx, queue, c)

        dest_gpu = cl_array.empty_like(a_gpu)
コード例 #37
0
    def __call__(self, input_buf, imgx_buf, imgy_buf, mag_buf):

        if self.scratch is None or self.scratch.shape != input_buf.shape:
            self.scratch = cl_array.empty_like(input_buf)

        self.sepconv_cr(input_buf, self.sobel_c, self.sobel_r, imgx_buf,
                        self.scratch)
        self.sepconv_rc(input_buf, self.sobel_r, self.sobel_c, imgy_buf,
                        self.scratch)
        self.mag(mag_buf, imgx_buf, imgy_buf)
def test_derivedFnCoulomb(initials, expected):
    sut = cl.elementwise.ElementwiseKernel(
        context, 'double4 *k, double4 *y, double t', 'double4 temp_y = y[0]; k[0] = derivedFn(&temp_y, t)', name='sut', preamble=rk_pd_4d.derivedFnCoulomb)

    y = cl_array.to_device(queue, initials)
    k = cl_array.empty_like(y)

    sut(k, y, 0.0)

    assert expected == k.get()
コード例 #39
0
def prim_to_flux(params, G, P, D=None, dir=0, loc=Loci.CENT, out=None):
    """Calculate fluxes of conserved varibles in direction dir, or if dir=0 the variables themselves"""
    sh = G.shapes

    if out is None:
        out = cl_array.empty_like(P)
    if D is None:
        D = get_state(params, G, P, loc)

    global knl_prim_to_flux
    if knl_prim_to_flux is None:
        code = replace_prim_names("""
        out[RHO,i,j,k] = P[RHO,i,j,k] * ucon[dir,i,j,k] * gdet[i,j]
        out[UU,i,j,k]  = (T[0,i,j,k] + P[RHO,i,j,k] * ucon[dir,i,j,k]) * gdet[i,j]
        out[U1,i,j,k]  = T[1,i,j,k] * gdet[i,j]
        out[U2,i,j,k]  = T[2,i,j,k] * gdet[i,j]
        out[U3,i,j,k]  = T[3,i,j,k] * gdet[i,j]
        out[B1,i,j,k]  = (bcon[1,i,j,k] * ucon[dir,i,j,k] - bcon[dir,i,j,k] * ucon[1,i,j,k]) * gdet[i,j]
        out[B2,i,j,k]  = (bcon[2,i,j,k] * ucon[dir,i,j,k] - bcon[dir,i,j,k] * ucon[2,i,j,k]) * gdet[i,j]
        out[B3,i,j,k]  = (bcon[3,i,j,k] * ucon[dir,i,j,k] - bcon[dir,i,j,k] * ucon[3,i,j,k]) * gdet[i,j]
        """)
        if 'electrons' in params and params['electrons']:
            code += replace_prim_names("""
            out[KEL,i,j,k] = P[RHO,i,j,k] * ucon[dir,i,j,k] * gdet[i,j] * P[KEL,i,j,k]
            out[KTOT,i,j,k] = P[RHO,i,j,k] * ucon[dir,i,j,k] * gdet[i,j] * P[KTOT,i,j,k]
            """)
        # TODO also passives
        knl_prim_to_flux = lp.make_kernel(
            "[dir, ndim, n1, n2, n3] -> " + sh.isl_grid_scalar,
            code, [
                *primsArrayArgs("P", "out", ghosts=False),
                *vecArrayArgs("T", "ucon", "bcon", ghosts=False),
                *gscalarArrayArgs("gdet", ghosts=False), ...
            ],
            assumptions=sh.assume_grid + "and 0 <= dir < ndim",
            default_offset=lp.auto)
        knl_prim_to_flux = lp.fix_parameters(knl_prim_to_flux,
                                             nprim=params['n_prim'],
                                             ndim=4)
        # TODO keep k because of the geom argument
        knl_prim_to_flux = tune_grid_kernel(knl_prim_to_flux, sh.grid_scalar)
        print("Compiled prim_to_flux")

    evt, _ = knl_prim_to_flux(params['queue'],
                              P=P,
                              T=Tmhd_vec(params, G, P, D, dir),
                              gdet=G.gdet_d[loc.value],
                              ucon=D['ucon'],
                              bcon=D['bcon'],
                              dir=dir,
                              out=out)
    if 'profile' in params and params['profile']:
        evt.wait()

    return out
コード例 #40
0
        def __call__(self,
                     input_ary,
                     output_ary=None,
                     allocator=None,
                     queue=None):
            allocator = allocator or input_ary.allocator
            queue = queue or input_ary.queue or output_ary.queue

            if output_ary is None:
                output_ary = input_ary

            if isinstance(output_ary, (str, unicode)) and output_ary == "new":
                output_ary = cl_array.empty_like(input_ary,
                                                 allocator=allocator)

            if input_ary.shape != output_ary.shape:
                raise ValueError("input and output must have the same shape")

            if not input_ary.flags.forc:
                raise RuntimeError("ScanKernel cannot "
                                   "deal with non-contiguous arrays")

            n, = input_ary.shape

            if not n:
                return output_ary

            unit_size = self.scan_wg_size * self.scan_wg_seq_batches
            max_groups = 3 * max(dev.max_compute_units for dev in self.devices)

            from pytools import uniform_interval_splitting
            interval_size, num_groups = uniform_interval_splitting(
                n, unit_size, max_groups)

            block_results = allocator(self.dtype.itemsize * num_groups)
            dummy_results = allocator(self.dtype.itemsize)

            # first level scan of interval (one interval per block)
            self.scan_intervals_knl(queue, (num_groups * self.scan_wg_size, ),
                                    (self.scan_wg_size, ), input_ary.data, n,
                                    interval_size, output_ary.data,
                                    block_results)

            # second level inclusive scan of per-block results
            self.scan_intervals_knl(queue, (self.scan_wg_size, ),
                                    (self.scan_wg_size, ), block_results,
                                    num_groups, interval_size, block_results,
                                    dummy_results)

            # update intervals with result of second level scan
            self.final_update_knl(queue, (num_groups * self.update_wg_size, ),
                                  (self.update_wg_size, ), output_ary.data, n,
                                  interval_size, block_results)

            return output_ary
コード例 #41
0
    def __call__(self,
                 input_buf,
                 imgx_buf,
                 imgy_buf,
                 mag_buf):

        if self.scratch is None or self.scratch.shape != input_buf.shape:
            self.scratch = cl_array.empty_like(input_buf)

        self.sepconv_cr(input_buf, self.sobel_c, self.sobel_r, imgx_buf, self.scratch)
        self.sepconv_rc(input_buf, self.sobel_r, self.sobel_c, imgy_buf, self.scratch)
        self.mag(mag_buf, imgx_buf, imgy_buf)
コード例 #42
0
ファイル: test_array.py プロジェクト: initcrash/pyopencl
def test_elwise_kernel(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(context, queue, (50,), numpy.float32)
    b_gpu = clrand(context, queue, (50,), numpy.float32)

    from pyopencl.elementwise import ElementwiseKernel
    lin_comb = ElementwiseKernel(context,
            "float a, float *x, float b, float *y, float *z",
            "z[i] = a*x[i] + b*y[i]",
            "linear_combination")

    c_gpu = cl_array.empty_like(a_gpu)
    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
コード例 #43
0
ファイル: test_callback.py プロジェクト: geggo/gpyfft
    def callback_post(self, context):
        print("context:", context)
        queue = cl.CommandQueue(context)

        nd_data = np.array([[1, 2, 3, 4],
                            [5, 6, 5, 2]],
                           dtype=np.complex64)
        nd_user_data = np.array([[2, 2, 2, 2],
                                 [3, 4, 5, 6]],
                                dtype=np.float32)

        cl_data = cla.to_device(queue, nd_data)
        cl_user_data = cla.to_device(queue, nd_user_data)
        cl_data_transformed = cla.empty_like(cl_data)

        G = GpyFFT(debug=False)
        plan = G.create_plan(context, cl_data.shape)
        plan.strides_in  = tuple(x // cl_data.dtype.itemsize for x in cl_data.strides)
        plan.strides_out = tuple(x // cl_data.dtype.itemsize for x in cl_data_transformed.strides)
        plan.inplace = False
        plan.precision = CLFFT_SINGLE
        plan.set_callback(b'postset',
                          self.callback_kernel_src_postset,
                          'post',
                          user_data=cl_user_data.data)

        plan.bake(queue)
        plan.enqueue_transform((queue,),
                               (cl_data.data,),
                               (cl_data_transformed.data,)
                               )
        queue.finish()

        print('cl_data_transformed:')
        print(cl_data_transformed)

        print('fft(nd_data) * nd_user_data')
        print(np.fft.fftn(nd_data))

        assert np.allclose(cl_data_transformed.get(),
                           np.fft.fftn(nd_data) * nd_user_data)
        
        del plan
コード例 #44
0
ファイル: test_wrapper.py プロジェクト: hrfuller/pyopencl
def test_spirv(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (ctx._get_cl_version() < (2, 1) or
            cl.get_cl_header_version() < (2, 1)):
        from pytest import skip
        skip("SPIR-V program creation only available in OpenCL 2.1 and higher")

    n = 50000

    a_dev = cl.clrandom.rand(queue, n, np.float32)
    b_dev = cl.clrandom.rand(queue, n, np.float32)
    dest_dev = cl_array.empty_like(a_dev)

    with open("add-vectors-%d.spv" % queue.device.address_bits, "rb") as spv_file:
        spv = spv_file.read()

    prg = cl.Program(ctx, spv)

    prg.sum(queue, a_dev.shape, None, a_dev.data, b_dev.data, dest_dev.data)

    assert la.norm((dest_dev - (a_dev+b_dev)).get()) < 1e-7
コード例 #45
0
# Use OpenCL To Add Two Random Arrays (Using PyOpenCL Arrays and Elementwise)

import pyopencl as cl  # Import the OpenCL GPU computing API
import pyopencl.array as cl_array  # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object)
import numpy  # Import Numpy number tools

context = cl.create_some_context()  # Initialize the Context
queue = cl.CommandQueue(context)  # Instantiate a Queue

a = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32))  # Create a random pyopencl array
b = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32))  # Create a random pyopencl array
c = cl_array.empty_like(a)  # Create an empty pyopencl destination array

sum = cl.elementwise.ElementwiseKernel(context, "float *a, float *b, float *c", "c[i] = a[i] + b[i]", "sum")
# Create an elementwise kernel object
#  - Arguments: a string formatted as a C argument list
#  - Operation: a snippet of C that carries out the desired map operatino
#  - Name: the fuction name as which the kernel is compiled

sum(a, b, c)  # Call the elementwise kernel

print("a: {}".format(a))
print("b: {}".format(b))
print("c: {}".format(c))
# Print all three arrays, to show sum() worked
コード例 #46
0
ファイル: gs.py プロジェクト: ramezquitao/pyoptools
def gs_gpu(idata,itera=100):
    """Gerchberg-Saxton algorithm to calculate DOEs using the GPU
    
    Calculates the phase distribution in a object plane to obtain an 
    specific amplitude distribution in the target plane. It uses a 
    FFT to calculate the field propagation.
    The wavefront at the DOE plane is assumed as a plane wave.
    
    **ARGUMENTS:**
	
		========== ======================================================
		idata      numpy array containing the target amplitude distribution 
        itera      Maximum number of iterations
		========== ======================================================
    """ 
    
    pl=cl.get_platforms()[0]
    devices=pl.get_devices(device_type=cl.device_type.GPU)
    ctx = cl.Context(devices=[devices[0]])
    queue = cl.CommandQueue(ctx)

    plan = Plan(idata.shape, queue=queue,dtype=complex128) #no funciona con "complex128"
    
    src = str(Template(KERNEL).render(
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))
    prg = cl.Program(ctx,src).build() 
    
    idata_gpu=cl_array.to_device(queue, ifftshift(idata).astype("complex128"))
    fdata_gpu=cl_array.empty_like(idata_gpu)
    rdata_gpu=cl_array.empty_like(idata_gpu)
    plan.execute(idata_gpu.data,fdata_gpu.data)
    
    e=1000
    ea=1000
    for i in range (itera):
        prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
        plan.execute(fdata_gpu.data,rdata_gpu.data,inverse=True)
        tr=rdata_gpu.get()
        rdata=ifftshift(tr)
        
        
        #TODO: This calculation should be done in the GPU
        e= (abs(rdata)-idata).std()
        if e>ea: 
            break
        ea=e
        
        prg.norm2(queue, rdata_gpu.shape,None,rdata_gpu.data,idata_gpu.data)
        
        plan.execute(rdata_gpu.data,fdata_gpu.data)
    
    fdata=fdata_gpu.get()
    
    #~ prg.norm(queue, fdata_gpu.shape, None,fdata_gpu.data)
    fdata=ifftshift(fdata)
    fdata=exp(1.j*angle(fdata))
    
    #~ fdata=fdata_gpu.get()
    return fdata
コード例 #47
0
ファイル: integration.py プロジェクト: vincentericsson/NHQM
# Kernel for reduce-code
krnlRed=ReductionKernel(ctx,numpy.float32,neutral="0",
    reduce_expr="a+b",map_expr="get_val(x[i])*%10.3f" % length,
    arguments="__global float *x",
    preamble="""
    float get_val(float x)
    {
        return x*x;
    }
    """)

# Generation of an array where each element is an evaluated integral.
tonum=1000000 # Number of elements.
# Array to send to the GPU.
p_gpu=cl_array.to_device(ctx,queue,sp.linspace(0,tonum,tonum+1).astype(numpy.float32))
res=cl_array.empty_like(p_gpu) # The resultating array

# Elementwise (mapping) kernel.
krnlMap=ElementwiseKernel(ctx,"float *param, float *res", "res[i]=integrate(param[i])",preamble="""
    float integrate(float param)
    {
        float sum=0;
        for (float f=0.0;f<10.0;f+=0.001)
        {
            sum+=(f*f-10*f-param);
        }
        return sum/1000.0;
    }
""")
integrand=krnlRed(vals).get() # Calculate the first integral.
krnlMap(p_gpu,res) # Generate the large array.
コード例 #48
0
ファイル: accelerator.py プロジェクト: oseiskar/raytracer
 def empty_like(self, a):
     arr = cl_array.empty_like(a)
     self._cl_arrays.append(arr)
     return arr
コード例 #49
0
import pyopencl as cl
import pyopencl.array as cl_array
import numpy

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = 10
a_gpu = cl_array.to_device(
        ctx, queue, numpy.random.randn(n).astype(numpy.float32))
b_gpu = cl_array.to_device(
        ctx, queue, numpy.random.randn(n).astype(numpy.float32))

from pyopencl.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(ctx,
        "float a, float *x, "
        "float b, float *y, "
        "float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

c_gpu = cl_array.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

import numpy.linalg as la
assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
コード例 #50
0
ファイル: test_algorithm.py プロジェクト: shinsec/pyopencl
def test_segmented_scan(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.tools import dtype_to_ctype
    dtype = np.int32
    ctype = dtype_to_ctype(dtype)

    #for is_exclusive in [False, True]:
    for is_exclusive in [True, False]:
        if is_exclusive:
            output_statement = "out[i] = prev_item"
        else:
            output_statement = "out[i] = item"

        from pyopencl.scan import GenericScanKernel
        knl = GenericScanKernel(context, dtype,
                arguments="__global %s *ary, __global char *segflags, "
                    "__global %s *out" % (ctype, ctype),
                input_expr="ary[i]",
                scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
                is_segment_start_expr="segflags[i]",
                output_statement=output_statement,
                options=[])

        np.set_printoptions(threshold=2000)
        from random import randrange
        from pyopencl.clrandom import rand as clrand
        for n in scan_test_counts:
            a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10)
            a = a_dev.get()

            if 10 <= n < 20:
                seg_boundaries_values = [
                        [0, 9],
                        [0, 3],
                        [4, 6],
                        ]
            else:
                seg_boundaries_values = []
                for i in range(10):
                    seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n))))
                    seg_boundaries = [
                            randrange(n) for i in range(seg_boundary_count)]
                    if n >= 1029:
                        seg_boundaries.insert(0, 1028)
                    seg_boundaries.sort()
                    seg_boundaries_values.append(seg_boundaries)

            for seg_boundaries in seg_boundaries_values:
                #print "BOUNDARIES", seg_boundaries
                #print a

                seg_boundary_flags = np.zeros(n, dtype=np.uint8)
                seg_boundary_flags[seg_boundaries] = 1
                seg_boundary_flags_dev = cl_array.to_device(
                        queue, seg_boundary_flags)

                seg_boundaries.insert(0, 0)

                result_host = a.copy()
                for i, seg_start in enumerate(seg_boundaries):
                    if i+1 < len(seg_boundaries):
                        seg_end = seg_boundaries[i+1]
                    else:
                        seg_end = None

                    if is_exclusive:
                        result_host[seg_start+1:seg_end] = np.cumsum(
                                a[seg_start:seg_end][:-1])
                        result_host[seg_start] = 0
                    else:
                        result_host[seg_start:seg_end] = np.cumsum(
                                a[seg_start:seg_end])

                #print "REF", result_host

                result_dev = cl_array.empty_like(a_dev)
                knl(a_dev, seg_boundary_flags_dev, result_dev)

                #print "RES", result_dev
                is_correct = (result_dev.get() == result_host).all()
                if not is_correct:
                    diff = result_dev.get() - result_host
                    print("RES-REF", diff)
                    print("ERRWHERE", np.where(diff))
                    print(n, list(seg_boundaries))

                assert is_correct
                from gc import collect
                collect()

            print("%d excl:%s done" % (n, is_exclusive))
コード例 #51
0
ファイル: demo_array.py プロジェクト: AI42/pyopencl
from __future__ import absolute_import
from __future__ import print_function
import pyopencl as cl
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la

a = numpy.random.rand(50000).astype(numpy.float32)
b = numpy.random.rand(50000).astype(numpy.float32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

a_dev = cl_array.to_device(queue, a)
b_dev = cl_array.to_device(queue, b)
dest_dev = cl_array.empty_like(a_dev)

prg = cl.Program(ctx, """
    __kernel void sum(__global const float *a,
    __global const float *b, __global float *c)
    {
      int gid = get_global_id(0);
      c[gid] = a[gid] + b[gid];
    }
    """).build()

prg.sum(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data)

print(la.norm((dest_dev - (a_dev+b_dev)).get()))
コード例 #52
0
ファイル: test_callback.py プロジェクト: geggo/gpyfft
    def callback_pre(self, context):
        print("context:", context)
        queue = cl.CommandQueue(context)

        nd_data = np.array([[1, 2, 3, 4],
                            [5, 6, 5, 2]],
                           dtype=np.complex64)
        cl_data = cla.to_device(queue, nd_data)
        cl_data_transformed = cla.empty_like(cl_data)

        print("cl_data:")
        print(cl_data)
        print('nd_data.shape/strides:', nd_data.shape, nd_data.strides)
        print('cl_data.shape/strides:', cl_data.shape, cl_data.strides)
        print('cl_data_transformed.shape/strides:', cl_data_transformed.shape, cl_data_transformed.strides)

        G = GpyFFT(debug=False)
        plan = G.create_plan(context, cl_data.shape)
        
        plan.strides_in  = tuple(x // cl_data.dtype.itemsize for x in cl_data.strides)
        plan.strides_out = tuple(x // cl_data.dtype.itemsize for x in cl_data_transformed.strides)
        print('plan.strides_in', plan.strides_in)
        print('plan.strides_out', plan.strides_out)
        print('plan.distances', plan.distances)
        print('plan.batch_size', plan.batch_size)

        plan.inplace = False
        
        plan.precision = CLFFT_SINGLE
        print('plan.precision:', plan.precision)

        plan.scale_forward = 1.
        print('plan.scale_forward:', plan.scale_forward)
        
        #print('plan.transpose_result:', plan.transpose_result)

        nd_user_data = np.array([[2, 2, 2, 2],
                                 [3, 4, 5, 6]],
                                dtype=np.float32)
        cl_user_data = cla.to_device(queue, nd_user_data)
        print('cl_user_data')
        print(cl_user_data)

        plan.set_callback(b'premul',
                          self.callback_kernel_src_premul,
                          'pre',
                          user_data=cl_user_data.data)

        plan.bake(queue)
        print('plan.temp_array_size:', plan.temp_array_size)

        plan.enqueue_transform((queue,),
                               (cl_data.data,),
                               (cl_data_transformed.data,)
                               )
        queue.finish()

        print('cl_data_transformed:')
        print(cl_data_transformed)

        print('fft(nd_data * nd_user_data):')
        print(np.fft.fftn(nd_data * nd_user_data))

        assert np.allclose(cl_data_transformed.get(),
                           np.fft.fftn(nd_data * nd_user_data))
        
        del plan
コード例 #53
0
ファイル: test_simple.py プロジェクト: jakebolewski/gpyfft
import numpy as np
import gpyfft

G = gpyfft.GpyFFT(debug=True)

print "clAmdFft Version: %d.%d.%d"%(G.get_version())

context = cl.create_some_context()
queue = cl.CommandQueue(context)

print "context:", hex(context.obj_ptr)
print "queue:", hex(queue.obj_ptr)

nd_data = np.array([[1,2,3,4], [5,6,7,8]], dtype = np.complex64)
cl_data = cla.to_device(queue, nd_data)
cl_data_transformed = cla.empty_like(cl_data)

print "cl_data:"
print cl_data

print 'nd_data.shape/strides', nd_data.shape, nd_data.strides
print 'cl_data.shape/strides', cl_data.shape, cl_data.strides
print 'cl_data_transformed.shape/strides', cl_data_transformed.shape, cl_data_transformed.strides

plan = G.create_plan(context, cl_data.shape)
plan.strides_in = tuple(x//cl_data.dtype.itemsize for x in cl_data.strides)
plan.strides_out = tuple(x//cl_data.dtype.itemsize for x in cl_data_transformed.strides)


print 'plan.strides_in', plan.strides_in
print 'plan.strides_out', plan.strides_out
コード例 #54
0
        a = np.array(range(10, 1, -1), dtype=np.float32)
        test_im = np.outer(a, a)
        row_k = np.array([1, 2, 3]).astype(np.float32)
        col_k = np.array([5, 6, 7]).astype(np.float32)
    else:
        test_im = np.ones([10, 10]).astype(np.float32)
        row_k = np.array([1, 2, 3]).astype(np.float32)
        col_k = np.array([2, 4, 5]).astype(np.float32)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    in_buf = cl_array.to_device(queue, test_im)
    row_buf = cl_array.to_device(queue, row_k)
    col_buf = cl_array.to_device(queue, col_k)
    out_buf = cl_array.empty_like(in_buf)
    imgx_buf = cl_array.empty_like(in_buf)
    imgy_buf = cl_array.empty_like(in_buf)
    mag_buf = cl_array.empty_like(in_buf)

    # Test the Sobel
    sobel = Sobel(ctx, queue)
    sobel(in_buf, imgx_buf, imgy_buf, mag_buf)

    print(imgx_buf.get())
    print(mag_buf.get())

    # Test the conv
    #conv = NaiveSeparableCorrelation(ctx, queue)
    conv = LocalMemorySeparableCorrelation(ctx, queue)
コード例 #55
0
ファイル: slicercl.py プロジェクト: ikolesov/SlicerCL
def dilate():
  #headURI = 'http://www.slicer.org/slicerWiki/images/4/43/MR-head.nrrd'
  #labelURI = 'http://boggs.bwh.harvard.edu/tmp/MRHead-label.nrrd'
  base = '/tmp/hoot/'
  headURI = base + 'MR-head.nrrd'
  labelURI = base + 'MR-head-label.nrrd'

  print("Starting...")
  if not slicer.util.getNode('MR-head*'):
    print("Downloading...")
    vl = slicer.modules.volumes.logic()
    name = 'MR-head'
    volumeNode = vl.AddArchetypeVolume(headURI, name, 0)
    name = 'MR-head-label'
    labelNode = vl.AddArchetypeVolume(labelURI, name, 1)
    if volumeNode:
      storageNode = volumeNode.GetStorageNode()
      if storageNode:
        # Automatically select the volume to display
        appLogic = slicer.app.applicationLogic()
        selNode = appLogic.GetSelectionNode()
        selNode.SetReferenceActiveVolumeID(volumeNode.GetID())
        selNode.SetReferenceActiveLabelVolumeID(labelNode.GetID())
        appLogic.PropagateVolumeSelection(1)

  node = slicer.util.getNode('MR-head')
  volume = slicer.util.array('MR-head')
  oneOverVolumeMax = 1. / volume.max()
  labelNode = slicer.util.getNode('MR-head-label')
  labelVolume = slicer.util.array('MR-head-label')

  print("Creating Context...")
  ctx = None
  for platform in cl.get_platforms():
      for device in platform.get_devices():
          print(cl.device_type.to_string(device.type))
          if cl.device_type.to_string(device.type) == "GPU":
             ctx = cl.Context([device])
             break;

  if not ctx:
    print ("no GPU context available")
    ctx = cl.create_some_context()
  print("Creating Queue...")
  queue = cl.CommandQueue(ctx)

  print("Copying volumes...")
  mf = cl.mem_flags
  volume_dev = cl_array.to_device(queue, volume)
  volume_image_dev = cl.image_from_array(ctx, volume,1)
  label_dev = cl.array.to_device(queue, labelVolume)
  theta = numpy.zeros_like(volume)
  theta_dev = cl.array.to_device(queue,theta)
  thetaNext = numpy.zeros_like(volume)
  thetaNext_dev = cl.array.to_device(queue,thetaNext)
  dest_dev = cl_array.empty_like(volume_dev)

  sampler = cl.Sampler(ctx,False,cl.addressing_mode.REPEAT,cl.filter_mode.LINEAR)

  print("Building program...")
  slices,rows,columns = volume.shape
  prg = cl.Program(ctx, """
      #pragma OPENCL EXTENSION cl_khr_fp64: enable

      __kernel void copy(
          __global short source[{slices}][{rows}][{columns}],
          __global short destination[{slices}][{rows}][{columns}])
      {{
        size_t slice = get_global_id(0);
        size_t column = get_global_id(1);
        size_t row = get_global_id(2);

        if (slice < {slices} && row < {rows} && column < {columns})
        {{
          destination[slice][row][column] = source [slice][row][column];
        }}
      }}

      __kernel void dilate(
          __read_only image3d_t volume,
          __global short label[{slices}][{rows}][{columns}],
          sampler_t volumeSampler,
          __global short dest[{slices}][{rows}][{columns}])
      {{
        size_t slice = get_global_id(0);
        size_t column = get_global_id(1);
        size_t row = get_global_id(2);

        if (slice >= {slices} || row >= {rows} || column >= {columns})
        {{
          return;
        }}

        int size = 1;

        int sliceOff, rowOff, columnOff;
        unsigned int sampleSlice, sampleRow, sampleColumn;

        short samples = 0;
        float4 samplePosition;
        for (sliceOff = -size; sliceOff <= size; sliceOff++)
        {{
          sampleSlice = slice + sliceOff;
          if (sampleSlice < 0 || sampleSlice >= {slices}) continue;
          for (rowOff = -size; rowOff <= size; rowOff++)
          {{
          sampleRow = row + rowOff;
          if (sampleRow < 0 || sampleRow >= {rows}) continue;
            for (columnOff = -size; columnOff <= size; columnOff++)
            {{
              sampleColumn = column + columnOff;
              if (sampleColumn < 0 || sampleColumn >= {columns}) continue;
              if (label[sampleSlice][sampleRow][sampleColumn] != 0)
              {{
                samples++;
              }}
            }}
          }}
        }}
        dest[slice][row][column] = samples;
      }}
      """.format(slices=slices,rows=rows,columns=columns)).build()

  def iterate(iterations=10):
    print("Running!")
    for iteration in xrange(iterations):
      prg.dilate(queue, volume.shape, None, volume_image_dev, label_dev.data, sampler, dest_dev.data)
      prg.copy(queue, volume.shape, None, dest_dev.data, label_dev.data)

    print("Getting data...")
    labelVolume[:] = dest_dev.get()
    print("Rendering...")
    labelNode.GetImageData().Modified()
    node.GetImageData().Modified()

    print("Done!")

  def grow(iterations=10):
    for iteration in xrange(iterations):
      iterate(1)
      slicer.app.processEvents()
コード例 #56
0
ファイル: slicercl.py プロジェクト: ikolesov/SlicerCL
def memoryBlur():
	print("Starting...")
	if not slicer.util.getNode('MRHead*'):
	  print("Downloading...")
	  vl = slicer.modules.volumes.logic()
	  uri = 'http://www.slicer.org/slicerWiki/images/4/43/MR-head.nrrd'
	  name = 'MRHead'
	  volumeNode = vl.AddArchetypeVolume(uri, name, 0)
	  if volumeNode:
	    storageNode = volumeNode.GetStorageNode()
	    if storageNode:
	      # Automatically select the volume to display
	      appLogic = slicer.app.applicationLogic()
	      selNode = appLogic.GetSelectionNode()
	      selNode.SetReferenceActiveVolumeID(volumeNode.GetID())
	      appLogic.PropagateVolumeSelection(1)

	node = slicer.util.getNode('MRHead*')
	volume = slicer.util.array('MRHead*')

	print("Creating Context...")
	ctx = None
	for platform in cl.get_platforms():
	    for device in platform.get_devices():
		print(cl.device_type.to_string(device.type))
		if cl.device_type.to_string(device.type) == "GPU":
		   ctx = cl.Context([device])

	if not ctx:
	  print ("no GPU context available")
	  ctx = cl.create_some_context()
	print("Creating Queue...")
	queue = cl.CommandQueue(ctx)

	print("Copying volume...")
	mf = cl.mem_flags
	volume_dev = cl_array.to_device(queue, volume)
	dest_dev = cl_array.empty_like(volume_dev)

	print("Building program...")
	slices,rows,columns = volume.shape
	prg = cl.Program(ctx, """
	    #pragma OPENCL EXTENSION cl_khr_fp64: enable
	    __kernel void blur(
		__global const short volume[{slices}][{rows}][{columns}],
		__global short dest[{slices}][{rows}][{columns}])
	    {{
	      size_t slice = get_global_id(0);
	      size_t column = get_global_id(1);
	      size_t row = get_global_id(2);

	      int size = 3;

	      int sliceOff, rowOff, columnOff;
	      unsigned int sampleSlice, sampleRow, sampleColumn;

	      double sum = 0;
	      unsigned int samples = 0;
	      for (sliceOff = -size; sliceOff <= size; sliceOff++)
	      {{
		sampleSlice = slice + sliceOff;
		if (sampleSlice < 0 || sampleSlice >= {slices}) continue;
		for (rowOff = -size; rowOff <= size; rowOff++)
		{{
		sampleRow = row + rowOff;
		if (sampleRow < 0 || sampleRow >= {rows}) continue;
		  for (columnOff = -size; columnOff <= size; columnOff++)
		  {{
		  sampleColumn = column + columnOff;
		  if (sampleColumn < 0 || sampleColumn >= {columns}) continue;
		  sum += volume[sampleSlice][sampleRow][sampleColumn];
		  samples++;
		  }}
		}}
	      }}
	      dest[slice][row][column] = (short) (sum / samples);
	    }}
	    """.format(slices=slices,rows=rows,columns=columns)).build()

	print("Running!")
	prg.blur(queue, volume.shape, None, volume_dev.data, dest_dev.data)

	print("Getting data...")
	volume[:] = dest_dev.get()
	print("Rendering...")
	node.GetImageData().Modified()

	print("Done!")
コード例 #57
0
end_val=6.0
start_val=0.0
side_length=2000

print "Solving mom-space with matrix-dimensions:", side_length,"x",side_length

# Timing, to see how fast the code is now.
t1=time.time()  

# Create all the necessary arrays.
x_vector=numpy.array([i%side_length for i in range(side_length**2)])
y_vector=numpy.array([(i-i%side_length)/side_length for i in range(side_length**2)])
gpu_matrix_x=cl_array.to_device(ctx,queue,(x_vector).astype(numpy.float32))
gpu_matrix_y=cl_array.to_device(ctx,queue,(y_vector).astype(numpy.float32))
gpu_matrix_res=cl_array.empty_like(gpu_matrix_x)

# Kernel to generate an identity matrix.
krnl_identity_matrix=ElementwiseKernel(ctx,"int *x, int *y, float *res",
    "res[i]=get_element(x[i],y[i])",preamble="""
    float get_element(int x, int y)
    {
        return x==y;
    }
    """)
# Kernel to generate a matrix through integrals depending on row and column. 
# The integrand is given in the C-function f().
krnl_gaussian_matrix=ElementwiseKernel(ctx,"float *x, float *y, float start, float end, float step, float *res",
    "res[i]=get_element(x[i],y[i],start,end,step)",
    preamble="#define PI 3.14159265f\n"+
    "".join(open("bessel.cl",'r').readlines())+
コード例 #58
0
ファイル: slicercl.py プロジェクト: ikolesov/SlicerCL
if not ctx:
  print ("preferred context not available")
  ctx = cl.create_some_context()
print("Creating Queue...")
queue = cl.CommandQueue(ctx)

print("Copying volumes...")
mf = cl.mem_flags
volume_dev = cl_array.to_device(queue, volume)
label_dev = cl.array.to_device(queue, labelVolume)
binaryLabels = numpy.logical_not(numpy.logical_not(labelVolume))
theta = float(2**15) * numpy.array(binaryLabels,dtype=numpy.dtype('float32'))
theta_dev = cl.array.to_device(queue,theta)
thetaNext = numpy.copy(numpy.array(theta, dtype=numpy.dtype('float32')))
thetaNext_dev = cl.array.to_device(queue,thetaNext)
labelNext_dev = cl_array.empty_like(label_dev)
candidates = labelVolume.copy()
candidates_dev = cl.array.to_device(queue,candidates)
candidatesNext = candidates.copy()
candidatesNext_dev = cl.array.to_device(queue,candidatesNext)
candidatesInitialized = False

print("label mean ", labelVolume.mean())
print("label_dev mean ", label_dev.get().mean())
print("labelNext_dev mean ", labelNext_dev.get().mean())
print("candidates_dev mean ", candidates_dev.get().mean())
print("theta mean ", theta.mean())
print("thetaNext_dev mean ", thetaNext_dev.get().mean())
print("candidatesNext_dev mean ", candidatesNext_dev.get().mean())

print("Building program...")
コード例 #59
0
    def __call__(self,
                 input_dev,
                 row_dev,
                 col_dev,
                 result_dev,
                 scratch_dev=None):

        if scratch_dev is None:
            scratch_dev = cla.empty_like(input_dev)

        row_tile_width = 128
        #row_tile_width = 64

        col_tile_width = 8
        #col_tile_height = 16
        col_tile_height = 8

        #col_tile_width = 16
        #col_tile_height = 48

        col_hstride = 8
        assert (np.mod(row_dev.shape[0], 2) == 1,
                "Kernels must be of odd width")
        row_kernel_radius = row_dev.shape[0] / 2

        coallescing_quantum = 16
        row_kernel_radius_aligned = ((row_kernel_radius /
                                      coallescing_quantum) *
                                     coallescing_quantum)

        if row_kernel_radius_aligned == 0:
            row_kernel_radius_aligned = coallescing_quantum

        assert (np.mod(col_dev.shape[0], 2) == 1,
                "Kernels must be of odd width")
        col_kernel_radius = col_dev.shape[0] / 2

        prg = self.build_program(input_dev.dtype,
                                 input_dev.shape,
                                 row_kernel_radius,
                                 row_kernel_radius_aligned,
                                 row_tile_width,
                                 col_kernel_radius,
                                 col_tile_width,
                                 col_tile_height,
                                 col_hstride)

        # Row kernel launch parameters
        row_local_size = (row_kernel_radius_aligned +
                          row_tile_width +
                          row_kernel_radius, 1)

        row_group_size = (int_div_up(input_dev.shape[1], row_tile_width),
                          input_dev.shape[0])

        row_global_size = (row_local_size[0] * row_group_size[0],
                           row_local_size[1] * row_group_size[1])

        # Column kernel launch parameters
        col_local_size = (min(input_dev.shape[1], col_tile_width),
                          min(input_dev.shape[0], col_hstride))

        col_group_size = (int_div_up(input_dev.shape[1], col_tile_width),
                          int_div_up(input_dev.shape[0], col_tile_height))

        col_global_size = (col_local_size[0] * col_group_size[0],
                           col_local_size[1] * col_group_size[1])

        row_global_size = tuple(int(e) for e in row_global_size)
        row_local_size = tuple(int(e) for e in row_local_size)
        col_global_size = tuple(int(e) for e in col_global_size)
        col_local_size = tuple(int(e) for e in col_local_size)

        try:
            prg.separable_convolution_row(self.queue,
                                          row_global_size,
                                          row_local_size,
                                          scratch_dev.data,
                                          input_dev.data,
                                          row_dev.data)
        except Exception as ex:
            print(input_dev.shape)
            print(row_dev.shape)
            print(row_global_size)
            print(row_local_size)
            raise ex

        try:
            prg.separable_convolution_col(self.queue,
                                         col_global_size,
                                         col_local_size,
                                         result_dev.data,
                                         scratch_dev.data,
                                         col_dev.data)
        except Exception as e:
            print(input_dev.shape)
            print(result_dev)
            print(scratch_dev)
            print(col_dev)
            print(col_dev.shape)
            raise e