Beispiel #1
0
        def __init__(self, ctx, values):
            """Create the samplers, elementwise kernels and program for ``ctx``.

            Args:
                ctx: OpenCL context handed to every sampler, kernel and
                    program created here.
                values: Mapping expanded into keyword arguments of the
                    ``Template.substitute`` call applied to ``kernels.cl``.
            """
            # The two samplers share every setting except the filter mode.
            repeat = cl.addressing_mode.REPEAT
            self.sampler_nearest = cl.Sampler(ctx, True, repeat,
                                              cl.filter_mode.NEAREST)
            self.sampler_linear = cl.Sampler(ctx, True, repeat,
                                             cl.filter_mode.LINEAR)

            # z = x * y, element by element.
            real_args = "float *x, float *y, float *z"
            self.multiply = ElementwiseKernel(ctx, real_args,
                                              "z[i] = x[i] * y[i];")

            # z = conj(x) * y on cfloat_t data.
            complex_args = "cfloat_t *x, cfloat_t *y, cfloat_t *z"
            self.conj_multiply = ElementwiseKernel(
                ctx, complex_args,
                "z[i] = cfloat_mul(cfloat_conj(x[i]), y[i]);")

            # Where mask > 0, derive a candidate value from gcc/ave/ave2 and,
            # when it exceeds the stored lcc, store it together with nrot.
            lcc_args = """float *gcc, float *ave, float *ave2, int *mask,
                       float norm_factor, int nrot, float *lcc, int *grot"""
            lcc_body = """float _lcc;
                       if (mask[i] > 0) {
                           _lcc = gcc[i] / sqrt(ave2[i] * norm_factor - ave[i] * ave[i]);
                           if (_lcc > lcc[i]) {
                               lcc[i] = _lcc;
                               grot[i] = nrot;
                           };
                       };
                    """
            self.calc_lcc_and_take_best = ElementwiseKernel(
                ctx, lcc_args, lcc_body)

            # kernels.cl lives next to this module and is a string template
            # that must be filled in before it can be compiled.
            kernel_dir = os.path.dirname(__file__)
            with open(os.path.join(kernel_dir, 'kernels.cl')) as handle:
                program_source = Template(handle.read()).substitute(**values)

            self._program = cl.Program(ctx, program_source).build()
            self._gws_rotate_grid3d = (96, 64, 1)
Beispiel #2
0
    def _generate(self, declarations=None):
        """Build the elementwise callable for the configured backend.

        ``self.func`` is first registered with ``self.tp``.  Depending on
        ``self.backend`` the method then returns:

        * ``'cython'``: the compiled ``py_<name>`` attribute of ``self.tp.mod``;
        * ``'opencl'``: a pyopencl ``ElementwiseKernel`` (``self.queue`` is
          set as a side effect);
        * ``'cuda'``: a pycuda ``ElementwiseKernel``.

        Any other backend value falls through and returns ``None``.

        Args:
            declarations: Forwarded unchanged to ``self.tp.add``.
        """
        self.tp.add(self.func, declarations=declarations)

        def _gpu_kernel_parts(c_data, cluda_source):
            # Shared by the OpenCL and CUDA branches: the call expression,
            # the argument string and the combined preamble.
            call_expr = '{func}({args})'.format(func=self.func.__name__,
                                                args=', '.join(c_data[1]))
            arg_string = convert_to_float_if_needed(
                ', '.join(c_data[0][1:]))
            user_preamble = convert_to_float_if_needed(self.tp.get_code())
            cluda = Template(text=cluda_source).render(double_support=True)
            return arg_string, call_expr, "\n".join([cluda, user_preamble])

        if self.backend == 'cython':
            py_data, c_data = self.cython_gen.get_func_signature(self.func)
            # The first entry of each signature list is replaced by an
            # explicit SIZE argument.
            size_decl = ['long SIZE']
            py_signature = size_decl + py_data[0][1:]
            c_signature = size_decl + c_data[0][1:]
            py_call_args = ['SIZE'] + py_data[1][1:]
            source = Template(text=elementwise_cy_template).render(
                name=self.name,
                c_arg_sig=', '.join(c_signature),
                c_args=', '.join(c_data[1]),
                py_arg_sig=', '.join(py_signature),
                py_args=', '.join(py_call_args),
                openmp=self._config.use_openmp,
                get_parallel_range=get_parallel_range)
            self.tp.add_code(source)
            self.tp.compile()
            return getattr(self.tp.mod, 'py_' + self.name)

        if self.backend == 'opencl':
            py_data, c_data = self.cython_gen.get_func_signature(self.func)
            self._correct_opencl_address_space(c_data)

            from .opencl import get_context, get_queue
            from pyopencl.elementwise import ElementwiseKernel
            from pyopencl._cluda import CLUDA_PREAMBLE
            ctx = get_context()
            self.queue = get_queue()
            arg_string, call_expr, full_preamble = _gpu_kernel_parts(
                c_data, CLUDA_PREAMBLE)
            return ElementwiseKernel(ctx,
                                     arguments=arg_string,
                                     operation=call_expr,
                                     preamble=full_preamble)

        if self.backend == 'cuda':
            py_data, c_data = self.cython_gen.get_func_signature(self.func)
            self._correct_opencl_address_space(c_data)

            from .cuda import set_context
            set_context()
            from pycuda.elementwise import ElementwiseKernel
            from pycuda._cluda import CLUDA_PREAMBLE
            arg_string, call_expr, full_preamble = _gpu_kernel_parts(
                c_data, CLUDA_PREAMBLE)
            return ElementwiseKernel(arguments=arg_string,
                                     operation=call_expr,
                                     preamble=full_preamble)

        return None
	def __init__(self, context, queue):
		"""Store the OpenCL handles, load ``lsqr.cl`` and build helper kernels.

		@param context OpenCL context used for the program and all kernels.
		@param queue OpenCL command queue, stored on the instance.
		"""
		self.context = context
		self.queue   = queue
		self.program = clUtils.loadProgram(context, clUtils.path() + "/lsqr.cl")
		# Device-side buffers start out empty; they are expected to be
		# created during the first iteration.
		for buffer_name in ("A", "B", "X0", "X", "R"):
			setattr(self, buffer_name, None)
		# Dot product: sum over i of x[i]*y[i]
		dot_args = "__global float *x, __global float *y"
		self.dot = ReductionKernel(context, np.float32, neutral="0",
		                           reduce_expr="a+b", map_expr="x[i]*y[i]",
		                           arguments=dot_args)
		# In-place scaling of a vector by a constant
		self.dot_c_vec = ElementwiseKernel(context, "float c, float *v",
		                                   "v[i] *= c")
		# Element by element copy
		self.copy_vec = ElementwiseKernel(context, "float* out, float *in",
		                                  "out[i] = in[i]")
		# z = a*x + b*y
		self.linear_comb = ElementwiseKernel(
			context,
			"float* z,float a, float *x, float b, float *y",
			"z[i] = a*x[i] + b*y[i]")
		# Element by element product
		self.prod = ElementwiseKernel(context,
		                              "float* z,float *x, float *y",
		                              "z[i] = x[i]*y[i]")
Beispiel #4
0
    def __init__(self, context, queue):
        """Store the OpenCL handles, load ``lsqr.cl`` and build helper kernels.

        @param context OpenCL context used for the program and all kernels.
        @param queue OpenCL command queue, stored on the instance.
        """
        self.context = context
        self.queue = queue
        program_path = clUtils.path() + "/lsqr.cl"
        self.program = clUtils.loadProgram(context, program_path)
        # Device-side buffers start out empty; they are expected to be
        # created during the first iteration.
        for buffer_name in ("A", "b", "x0", "x", "r"):
            setattr(self, buffer_name, None)
        # In-place scaling of a vector by a constant
        scale_args = "float c, float *v"
        self.dot_c_vec = ElementwiseKernel(context, scale_args, "v[i] *= c")
        # Element by element copy
        copy_args = "float* out, float *in"
        self.copy_vec = ElementwiseKernel(context, copy_args,
                                          "out[i] = in[i]")
        # z = a*x + b*y
        comb_args = "float* z,float a, float *x, float b, float *y"
        self.linear_comb = ElementwiseKernel(context, comb_args,
                                             "z[i] = a*x[i] + b*y[i]")
        # Element by element product
        prod_args = "float *z,float *x, float *y"
        self.prod = ElementwiseKernel(context, prod_args, "z[i] = x[i]*y[i]")
Beispiel #5
0
 def __init__(self, op, parents=None):
     """Set up the ReLU node and its forward/backward kernels.

     Args:
         op: Stored as the single entry of ``self.ops``.
         parents: Passed on to the base-class constructor.
     """
     super(ReLU, self).__init__(parents)
     self.ops = [op]

     # Forward pass: out = max(0, arr).
     forward_args = 'float *arr, float *out'
     forward_body = 'out[i] = fmax(0.0f, arr[i])'
     self.eval_kernel = ElementwiseKernel(pl.ctx, forward_args,
                                          forward_body, 'relu_fwd')

     # Backward pass: gy is let through only where res is positive.
     backward_args = 'float *res, float *gy, float *gx'
     backward_body = 'gx[i] = res[i]>0 ? gy[i] : 0'
     self.backprop_kernel = ElementwiseKernel(pl.ctx, backward_args,
                                              backward_body, 'relu_bwd')
Beispiel #6
0
def sim_tilt():
    """Time the (nir - red) / (nir + red) computation on CPU and GPU.

    The red (B4) and near-infrared (B5) bands are read once, then tiled 10
    and 20 times.  For each tiling factor the index is computed with NumPy
    and with a pyopencl ``ElementwiseKernel``, and both wall-clock times are
    printed.  The GPU timing deliberately includes context creation, the
    host-to-device transfer, kernel construction and the transfer back.

    Returns:
        None.  Results are only printed.
    """
    # Context managers close the raster datasets as soon as the bands are
    # read; previously both files were left open.
    with rasterio.open('/project2/macs30123/landsat8/LC08_B4.tif') as band4:
        red = band4.read(1).astype('float64')  # red
    with rasterio.open('/project2/macs30123/landsat8/LC08_B5.tif') as band5:
        nir = band5.read(1).astype('float64')  # nir

    # One loop replaces the two copy-pasted benchmark sections.
    for factor in (10, 20):
        red_tiled = np.tile(red, factor)
        nir_tiled = np.tile(nir, factor)

        # cpu -- the result is computed only so that it can be timed
        t0 = time.time()
        nvdi_cpu = (nir_tiled - red_tiled) / (nir_tiled + red_tiled)
        time_cpu = time.time() - t0

        # gpu
        t1 = time.time()
        ctx = cl.create_some_context()
        queue = cl.CommandQueue(ctx)
        red_gpu = cl_array.to_device(queue, red_tiled)
        nir_gpu = cl_array.to_device(queue, nir_tiled)
        nvdi_formula = ElementwiseKernel(
            ctx, "double *x, double *y, double *nvdi",
            "nvdi[i] = (x[i] - y[i]) / (x[i] + y[i])")
        nvdi_gpu = cl_array.empty_like(nir_gpu)
        nvdi_formula(nir_gpu, red_gpu, nvdi_gpu)
        # Copying back to the host is part of the measured GPU time.
        nvdi_gpu_new = nvdi_gpu.get()
        time_gpu = time.time() - t1

        print("The time of CPU computation for %d times scene is" % factor,
              time_cpu)
        print('The time of GPU computation for %d times scene is' % factor,
              time_gpu)
    def __init__(self,
                 sino_shape,
                 slice_shape=None,
                 axis_position=None,
                 angles=None,
                 ctx=None,
                 devicetype="all",
                 platformid=None,
                 deviceid=None,
                 profile=False):
        """Initialise the base algorithm, helper kernels and work arrays.

        All arguments are forwarded unchanged to
        ``ReconstructionAlgorithm.__init__``.  Afterwards the preconditioners
        are computed, a ``LinAlg`` helper and two elementwise kernels are
        created, and the zero-filled arrays ``d_p``, ``d_q`` and ``d_tmp`` are
        allocated and registered through ``add_to_cl_mem``.
        """
        base_options = dict(slice_shape=slice_shape,
                            axis_position=axis_position,
                            angles=angles,
                            ctx=ctx,
                            devicetype=devicetype,
                            platformid=platformid,
                            deviceid=deviceid,
                            profile=profile)
        ReconstructionAlgorithm.__init__(self, sino_shape, **base_options)
        self.compute_preconditioners()

        # Linear-algebra helper working on the backprojector slice shape
        self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)

        # Positivity constraint: negative entries are replaced by zero
        self.elwise_clamp = ElementwiseKernel(self.ctx, "float *a",
                                              "a[i] = max(a[i], 0.0f);")

        # Projection onto the L-infinity ball of radius Lambda, applied to
        # each component of the float2 entries
        proj_linf_body = (
            "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); "
            "a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);")
        self.elwise_proj_linf = ElementwiseKernel(
            self.ctx, "float2* a, float Lambda", proj_linf_body,
            "elwise_proj_linf")

        # Additional arrays; the gradient call comes first because d_p is
        # shaped after the "d_gradient" buffer of the LinAlg helper
        self.linalg.gradient(self.d_x)
        self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"])
        self.d_q = parray.empty_like(self.d_data)
        self.d_g = self.linalg.d_image
        self.d_tmp = parray.empty_like(self.d_x)
        for work_array in (self.d_p, self.d_q, self.d_tmp):
            work_array.fill(0)
        self.add_to_cl_mem({
            "d_p": self.d_p,
            "d_q": self.d_q,
            "d_tmp": self.d_tmp,
        })

        self.theta = 1.0
Beispiel #8
0
    def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
                    have_src_indices, have_dst_indices, map_values):
        """Build the ``gappy_copy_and_map`` elementwise kernel.

        Args:
            dtype: Element type of ``input_ary``, ``output_ary`` and, when
                present, ``value_map``.
            src_index_dtype: Element type of the ``from_indices`` argument.
            dst_index_dtype: Element type of the ``to_indices`` argument.
            have_src_indices: When true, the kernel takes ``from_indices``.
            have_dst_indices: When true, the kernel takes ``to_indices``.
            map_values: When true, the kernel takes ``value_map``.

        Returns:
            A pyopencl ``ElementwiseKernel`` rendered from ``GAPPY_COPY_TPL``.
        """
        from boxtree.tools import VectorArg
        from pyopencl.tools import dtype_to_ctype
        from pyopencl.elementwise import ElementwiseKernel

        # The two arrays are always present; the rest depend on the flags.
        optional_args = (
            (have_src_indices, src_index_dtype, "from_indices"),
            (have_dst_indices, dst_index_dtype, "to_indices"),
            (map_values, dtype, "value_map"),
        )
        kernel_args = [VectorArg(dtype, "input_ary"),
                       VectorArg(dtype, "output_ary")]
        kernel_args.extend(
            VectorArg(arg_dtype, arg_name)
            for wanted, arg_dtype, arg_name in optional_args if wanted)

        rendered = GAPPY_COPY_TPL.render(
            dtype=dtype,
            dtype_to_ctype=dtype_to_ctype,
            from_dtype=src_index_dtype,
            to_dtype=dst_index_dtype,
            from_indices=have_src_indices,
            to_indices=have_dst_indices,
            map_values=map_values)

        # The struct declaration is generated for the first context device.
        struct_preamble = dtype_to_c_struct(self.context.devices[0], dtype)
        return ElementwiseKernel(self.context,
                                 kernel_args,
                                 str(rendered),
                                 preamble=struct_preamble,
                                 name="gappy_copy_and_map")
Beispiel #9
0
def get_elwise_kernel(kernel_name, args, src, preamble=""):
    """Create an elementwise kernel and wrap it with ``profile_kernel``.

    Args:
        kernel_name: Name given to the kernel and to the profiling wrapper.
        args: Argument specification passed to ``ElementwiseKernel``.
        src: Operation source passed to ``ElementwiseKernel``.
        preamble: Optional preamble source (defaults to an empty string).

    Returns:
        Whatever ``profile_kernel`` returns for the new kernel.
    """
    kernel = ElementwiseKernel(get_context(), args, src, kernel_name,
                               preamble=preamble)
    return profile_kernel(kernel, kernel_name)
Beispiel #10
0
    def count_global_qbx_centers_knl(self, context, box_id_dtype,
                                     particle_id_dtype):
        """Build the ``count_global_qbx_centers`` elementwise kernel.

        For each entry ``i`` the kernel sums ``global_qbx_center_weight`` over
        the particle range ``[start, start + count)`` of box
        ``target_boxes[i]`` and stores the sum in
        ``nqbx_centers_itgt_box[i]``.

        Args:
            context: OpenCL context the kernel is created in.
            box_id_dtype: dtype rendered as ``${box_id_t}``.
            particle_id_dtype: dtype rendered as ``${particle_id_t}``.

        Returns:
            A pyopencl ``ElementwiseKernel``.
        """
        # Both templates are rendered with the same pair of C type names.
        ctype_names = dict(box_id_t=dtype_to_ctype(box_id_dtype),
                           particle_id_t=dtype_to_ctype(particle_id_dtype))

        arguments = Template(r"""
                ${particle_id_t} *nqbx_centers_itgt_box,
                ${particle_id_t} *global_qbx_center_weight,
                ${box_id_t} *target_boxes,
                ${particle_id_t} *box_target_starts,
                ${particle_id_t} *box_target_counts_nonchild
            """).render(**ctype_names)

        operation = Template(r"""
                ${box_id_t} global_box_id = target_boxes[i];
                ${particle_id_t} start = box_target_starts[global_box_id];
                ${particle_id_t} end = start + box_target_counts_nonchild[
                    global_box_id
                ];

                ${particle_id_t} nqbx_centers = 0;
                for(${particle_id_t} iparticle = start; iparticle < end; iparticle++)
                    nqbx_centers += global_qbx_center_weight[iparticle];

                nqbx_centers_itgt_box[i] = nqbx_centers;
            """).render(**ctype_names)

        return ElementwiseKernel(context, arguments, operation,
                                 name="count_global_qbx_centers")
Beispiel #11
0
def tree_bottom_up(ctx,
                   args,
                   setup,
                   leaf_operation,
                   node_operation,
                   output_expr,
                   preamble=""):
    """Create a callable that runs a kernel over a tree, deepest level first.

    The kernel source is ``NODE_KERNEL_TEMPLATE`` filled with the given code
    fragments.  ``int *offsets, uint2 *pbounds`` is prepended to the
    user-supplied argument string.

    Args:
        ctx: OpenCL context for the kernel.
        args: C argument string for the extra kernel arguments.
        setup: Code fragment substituted for ``setup``.
        leaf_operation: Code fragment substituted for ``leaf_operation``.
        node_operation: Code fragment substituted for ``node_operation``.
        output_expr: Code fragment substituted for ``output_expr``.
        preamble: Optional preamble source for the kernel.

    Returns:
        A function ``callable(tree, *args)`` that invokes the kernel once per
        level, from ``tree.depth`` down to 0, and returns the result of the
        last invocation (``None`` if no level was processed).
    """
    fragments = dict(setup=setup,
                     leaf_operation=leaf_operation,
                     node_operation=node_operation,
                     output_expr=output_expr)
    kernel_source = NODE_KERNEL_TEMPLATE % fragments

    full_args = ', '.join(["int *offsets, uint2 *pbounds", args])

    level_kernel = ElementwiseKernel(ctx,
                                     full_args,
                                     operation=kernel_source,
                                     preamble=preamble)

    def callable(tree, *kernel_args):
        # Each level occupies a contiguous node range; walk those ranges
        # from the end of the node array towards its start.
        upper = tree.total_nodes
        result = None
        for level in reversed(range(tree.depth + 1)):
            lower = upper - tree.num_nodes[level]
            result = level_kernel(tree.offsets.dev,
                                  tree.pbounds.dev,
                                  *kernel_args,
                                  slice=slice(lower, upper))
            upper = lower
        return result

    return callable
    def __init__(self, ctx, queue, dtype=np.float32):
        """Prepare the Sobel filter taps, correlation helpers and ``mag`` kernel.

        Args:
            ctx: OpenCL context.
            queue: OpenCL command queue.
            dtype: Element type of the filter taps; it also selects the C type
                of the ``imgx``/``imgy`` arguments of the ``mag`` kernel
                (float32, uint8 and uint16 are mapped; anything else leaves
                the C type name empty).
        """
        self.ctx = ctx
        self.queue = queue

        # The two 1-D tap sets of the separable filter, on host and device.
        sobel_c = np.array([1., 0., -1.]).astype(dtype)
        sobel_r = np.array([1., 2., 1.]).astype(dtype)
        self.sobel_c = cl_array.to_device(self.queue, sobel_c)
        self.sobel_r = cl_array.to_device(self.queue, sobel_r)

        self.scratch = None

        self.sepconv_rc = LocalMemorySeparableCorrelation(
            self.ctx, self.queue, sobel_r, sobel_c)
        self.sepconv_cr = LocalMemorySeparableCorrelation(
            self.ctx, self.queue, sobel_c, sobel_r)

        # Map the numpy element type to the matching C type name.
        c_type = ""
        known_types = ((np.float32, "float"),
                       (np.uint8, "unsigned char"),
                       (np.uint16, "unsigned short"))
        for numpy_type, c_name in known_types:
            if dtype == numpy_type:
                c_type = c_name
                break

        # result = sqrt(imgx^2 + imgy^2), computed in float.
        mag_args = "float *result, %s *imgx, %s *imgy" % (c_type, c_type)
        mag_body = "result[i] = sqrt((float)imgx[i]*imgx[i] + (float)imgy[i]*imgy[i])"
        self.mag = ElementwiseKernel(ctx, mag_args, mag_body, "mag")
Beispiel #13
0
def get_neg_indx(rho, ran):
    """Simulate the recurrence on the GPU and score the first negative periods.

    ``S = 1000`` series of ``T = 4160`` periods are simulated with
    ``z_t = rho * z_{t-1} + (1 - rho) * mu + eps_t`` (``mu = 3``, start
    value 3, standard-normal shocks).  For every series the index of the
    first period with a negative value is taken (0 if the series never goes
    negative), and the negated sum of these indices is returned.

    Relies on the module-level ``ctx`` and ``queue`` OpenCL objects.

    Args:
        rho: Persistence coefficient of the recurrence.  The previous version
            overwrote this argument with 0.5, so the caller's value was
            never used.
        ran: Unused; kept so existing callers keep working.

    Returns:
        Minus the sum of the per-series first-negative indices (a float).
    """
    mu = 3
    S = 1000
    T = 4160
    eps_mat = sts.norm.rvs(loc=0, scale=1, size=(S * T)).astype(np.float32)
    initial = np.zeros(S).astype(np.float32) + 3
    eps_mat = cl.array.to_device(queue, eps_mat)
    initial = cl.array.to_device(queue, initial)
    mknl = ElementwiseKernel(
        ctx, "float *a, float *b, float rho, float mu, float *rslt",
        "rslt[i] = rho * a[i] +(1-rho)*mu+b[i]")

    output = cl.array.empty_like(eps_mat)
    mknl(initial, eps_mat[:S], rho, mu, output[:S])
    for i in range(1, T):
        # Period i uses the S shocks eps_mat[S*i : S*(i+1)].  The previous
        # slice end `S * i + 1` selected a single element, so the kernel
        # read past the end of its `b` argument.
        mknl(output[S * (i - 1):S * i], eps_mat[S * i:S * (i + 1)], rho, mu,
             output[S * i:S * (i + 1)])
    z_mat = output.get().reshape(T, S)

    # argmax over the time axis returns the first True (first negative
    # period) per series and 0 when a series has no negative value -- the
    # same default the former per-series Python loop used.
    fst_neg_indx = (z_mat < 0).argmax(axis=0).astype(np.float64)
    return -np.sum(fst_neg_indx)
Beispiel #14
0
def point_tree_traverse(ctx,
                        k,
                        args,
                        setup,
                        node_operation,
                        leaf_operation,
                        output_expr,
                        common_operation,
                        preamble=""):
    """Create a callable that runs a ``POINT_DFS_TEMPLATE`` kernel.

    ``int *cids, int *offsets`` is prepended to the user-supplied argument
    string, and the template is filled with the given code fragments.

    Args:
        ctx: OpenCL context for the kernel.
        k: Value substituted for ``k`` in the template.
        args: C argument string for the extra kernel arguments.
        setup: Code fragment substituted for ``setup``.
        node_operation: Code fragment substituted for ``node_operation``.
        leaf_operation: Code fragment substituted for ``leaf_operation``.
        output_expr: Code fragment substituted for ``output_expr``.
        common_operation: Code fragment substituted for ``common_operation``.
        preamble: Optional preamble source for the kernel.

    Returns:
        A function ``callable(tree_src, tree_dst, *args)`` that calls the
        kernel with ``tree_dst.cids.dev``, ``tree_src.offsets.dev`` and the
        extra arguments, returning the kernel's result.
    """
    # FIXME: variable max_depth
    substitutions = dict(setup=setup,
                         leaf_operation=leaf_operation,
                         node_operation=node_operation,
                         common_operation=common_operation,
                         output_expr=output_expr,
                         max_depth=21,
                         k=k)
    kernel_source = POINT_DFS_TEMPLATE % substitutions

    full_args = ', '.join(["int *cids, int *offsets", args])

    traverse_kernel = ElementwiseKernel(ctx,
                                        full_args,
                                        operation=kernel_source,
                                        preamble=preamble)

    def callable(tree_src, tree_dst, *kernel_args):
        return traverse_kernel(tree_dst.cids.dev, tree_src.offsets.dev,
                               *kernel_args)

    return callable
Beispiel #15
0
def test_elwise_kernel_with_options(ctx_factory):
    """Check that build options reach an ``ElementwiseKernel``.

    The kernel adds one only when ``ADD_ONE`` is defined, and the macro is
    supplied through ``options``; the output must therefore equal the
    input plus one.
    """
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    source_dev = clrand(queue, (50,), np.float32)

    kernel_body = """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        kernel_body,
        options=["-D", "ADD_ONE"],
        )

    result_dev = cl_array.empty_like(source_dev)
    add_one(result_dev, source_dev)

    expected = source_dev.get() + 1
    actual = result_dev.get()
    assert la.norm(actual - expected) < 1e-5
Beispiel #16
0
 def process_m2qbxl_knl(self, context, box_id_dtype, particle_id_dtype):
     """Build the ``process_m2qbxl`` elementwise kernel.

     For each entry ``i`` the kernel looks up ``itgt_box =
     idx_to_itgt_box[i]`` and adds ``nqbx_centers_itgt_box[itgt_box] *
     (ssn_starts[i + 1] - ssn_starts[i]) * m2qbxl_cost`` to
     ``nm2qbxl[itgt_box]``.

     Args:
         context: OpenCL context the kernel is created in.
         box_id_dtype: dtype rendered as ``${box_id_t}``.
         particle_id_dtype: dtype rendered as ``${particle_id_t}``.

     Returns:
         A pyopencl ``ElementwiseKernel``.
     """
     # Both templates are rendered with the same pair of C type names.
     ctype_names = dict(box_id_t=dtype_to_ctype(box_id_dtype),
                        particle_id_t=dtype_to_ctype(particle_id_dtype))

     arguments = Template(r"""
             ${box_id_t} *idx_to_itgt_box,
             ${particle_id_t} *nqbx_centers_itgt_box,
             ${box_id_t} *ssn_starts,
             double *nm2qbxl,
             double m2qbxl_cost
         """).render(**ctype_names)

     operation = Template(r"""
             // get the index of current box in target_boxes
             ${box_id_t} itgt_box = idx_to_itgt_box[i];
             // get the number of expansion centers in current box
             ${particle_id_t} nqbx_centers = nqbx_centers_itgt_box[itgt_box];
             // get the number of list 3 boxes of the current box in a particular
             // level
             ${box_id_t} nlist3_boxes = ssn_starts[i + 1] - ssn_starts[i];
             // calculate the cost
             nm2qbxl[itgt_box] += (nqbx_centers * nlist3_boxes * m2qbxl_cost);
         """).render(**ctype_names)

     return ElementwiseKernel(context, arguments, operation,
                              name="process_m2qbxl")
Beispiel #17
0
def OCL_NORMALIZE(X1, Y1):
    """Min-max normalise two sequences with OpenCL elementwise kernels.

    Each input is converted to float32, copied to the device and rescaled
    as ``(v - min) / (max - min)``.  The results are stored in the module
    globals ``arrX`` and ``arrY``, printed together with the elapsed time,
    and wrapped in ``DataSets(arrX, arrY)`` stored in the global ``DataSet``.

    Args:
        X1: Sequence of numbers for the first axis.
        Y1: Sequence of numbers for the second axis.

    Returns:
        None.  Results are published through module-level globals.
    """
    global arrX, arrY, DataSet

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    x = np.asarray(X1, dtype=np.float32)
    y = np.asarray(Y1, dtype=np.float32)
    XX = cl_array.to_device(queue, x)
    YY = cl_array.to_device(queue, y)

    V = cl_array.empty_like(XX)
    V1 = cl_array.empty_like(YY)

    # `np.float` was only an alias of the builtin float and no longer exists
    # in NumPy >= 1.24, so the builtin is used directly.
    Xmax = float(x.max())
    Ymax = float(y.max())
    Xmin = float(x.min())
    Ymin = float(y.min())

    miniMaxX = ElementwiseKernel(ctx,
                                 "float *x,float Xmax,float Xmin,float *v",
                                 "v[i] = (x[i]-Xmin)/(Xmax-Xmin);", "sum")

    miniMaxY = ElementwiseKernel(ctx,
                                 "float *y,float Ymax,float Ymin,float *v",
                                 "v[i] = (y[i]-Ymin)/(Ymax-Ymin);", "sum")

    start_timer = time.time()
    print('Timer: on')
    miniMaxX(XX, Xmax, Xmin, V)
    miniMaxY(YY, Ymax, Ymin, V1)

    arrX = V.T.get()
    arrY = V1.T.get()

    print(arrX)
    print(arrY)

    time_working = time.time() - start_timer
    print('\nTimer: stop; time: {} seconds'.format(round(time_working, 3)))
    DataSet = DataSets(arrX, arrY)
Beispiel #18
0
 def get_kernel(self, kernel_name, **kwargs):
     """Build the elementwise kernel called ``kernel_name``.

     Args:
         kernel_name: Name of the kernel; also passed to ``_get_code``.
         **kwargs: Forwarded unchanged to ``_get_code``.

     Returns:
         An ``ElementwiseKernel`` built from the arguments and source that
         ``_get_code`` returns, using ``self.preamble``.
     """
     arguments, source = self._get_code(kernel_name, **kwargs)
     return ElementwiseKernel(self.ctx, arguments, source, kernel_name,
                              preamble=self.preamble)
    def __init__(self, cq, shape, prefer_add=False, n_iter=10):
        """Set up shapes, the crop function and the elementwise kernels.

        Args:
            cq: Passed to ``parse_cq``, which yields the OpenCL context and
                command queue stored on the instance.
            shape: Input shape; stored as a tuple in ``_in_shape``.
            prefer_add: Forwarded to ``find_optimal_size`` for every axis.
            n_iter: Stored as ``self.n_iter``.
        """
        self.context, self.queue = parse_cq(cq)

        self._in_shape = tuple(shape)
        self._conv_shape = None
        self._out_shape = tuple(
            find_optimal_size(n, prefer_add=prefer_add)
            for n in self._in_shape)
        logger.info("shape: in=%s, out=%s", self._in_shape, self._out_shape)

        self.n_iter = n_iter

        # Determine, per axis, which slice of the input maps to which slice
        # of the output so that the smaller extent is centred in the larger.
        in_roi, out_roi = [], []
        for n_in, n_out in zip(self._in_shape, self._out_shape):
            dn = n_out - n_in
            if dn < 0:
                # smaller output: take the central part of the input
                offset = (-dn) // 2
                in_roi.append(slice(offset, offset + n_out))
                out_roi.append(slice(0, n_out))
            elif dn > 0:
                # smaller input: place it in the centre of the output.
                # This branch used the undefined name `d` and raised
                # NameError; the size difference is `dn`.
                offset = dn // 2
                in_roi.append(slice(0, n_in))
                out_roi.append(slice(offset, offset + n_in))
            else:
                in_roi.append(slice(0, n_in))
                out_roi.append(slice(0, n_out))
        in_roi, out_roi = tuple(in_roi), tuple(out_roi)

        def _crop_func(dst, src):
            dst[out_roi] = src[in_roi]

        self._crop_func = _crop_func

        # out = ref / max(out, eps) + eps, with the division guarded by eps
        self._estimate_func = ElementwiseKernel(
            self.context, "float *out, float *ref, float eps",
            "out[i] = ref[i] / ((out[i] > eps) ? out[i] : eps) + eps",
            "estimate_func")
        # Replace non-positive entries with zero
        self._pos_clip_func = ElementwiseKernel(
            self.context, "float *im", "im[i] = (im[i] > 0) ? im[i] : 0",
            "_pos_clip_func")
Beispiel #20
0
def get_copy_kernel(ctx, dtype1, dtype2, varnames):
    """Build a kernel that copies ``<v>1`` arrays into ``<v>2`` with a cast.

    For every name ``v`` in ``varnames`` the kernel takes a ``dtype1 *v1``
    source and a ``dtype2 *v2`` destination and executes
    ``v2[i] = (dtype2)v1[i];``.

    Args:
        ctx: OpenCL context for the kernel.
        dtype1: C type name of the source arrays.
        dtype2: C type name of the destination arrays.
        varnames: Iterable of variable base names.

    Returns:
        The resulting ``ElementwiseKernel``.
    """
    # All source pointers come first, then all destination pointers.
    source_args = ['{0!s} *{1!s}1'.format(dtype1, name) for name in varnames]
    dest_args = ['{0!s} *{1!s}2'.format(dtype2, name) for name in varnames]
    args = ', '.join(source_args + dest_args)

    statements = ('{0!s}2[i] = ({1!s}){0!s}1[i];'.format(name, dtype2)
                  for name in varnames)
    operation = '; '.join(statements)
    return ElementwiseKernel(ctx, args, operation=operation)
def sim_health_index(r):
    """Simulate the recurrence on the GPU and average the first negative period.

    ``S = 1000`` series of ``T = 4160`` periods follow
    ``z_t = rho * z_{t-1} + (1 - rho) * mu + eps_t`` with ``mu = 3.0``, a
    start value of ``mu`` and standard-normal shocks drawn after seeding
    NumPy with 25.  For each series the 1-based index of the first period
    with a negative value is recorded (``T + 1`` if there is none).

    Args:
        r: Persistence coefficient ``rho`` of the recurrence.

    Returns:
        The mean of the recorded first-negative period indices.
    """
    # Set up context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Set model parameters
    S = 1000  # number of series to simulate
    T = 4160  # number of periods per series
    rho = r
    mu = 3.0
    np.random.seed(25)

    # Start values and the flat (T * S) array of random shocks
    init_np = np.zeros(S).astype(np.float32) + mu
    epsm_np = sts.norm.rvs(loc=0, scale=1.0, size=T * S).astype(np.float32)

    init_g = cl.array.to_device(queue, init_np)
    epsm_g = cl.array.to_device(queue, epsm_np)

    # One kernel call advances all S series by one period
    prefix_sum = ElementwiseKernel(
        ctx, "float *a_g, float *b_g, float *res_g, float rho, float mu",
        "res_g[i] = rho * a_g[i]+(1-rho)*mu + b_g[i]")

    # Allocate space for result of kernel on device
    dev_result = cl_array.empty_like(epsm_g)

    # Period 0 starts from the initial values; each later period reads the
    # previous period's S-element slice.  A plain loop replaces the list
    # comprehension that was used only for its side effects.
    prefix_sum(init_g, epsm_g[:S], dev_result[:S], rho, mu)
    for i in range(1, T):
        prefix_sum(dev_result[S * (i - 1):S * i], epsm_g[S * i:S * (i + 1)],
                   dev_result[S * i:S * (i + 1)], rho, mu)

    # Get results back on CPU: one row per period, one column per series
    z_all = dev_result.get().reshape(T, S)

    # Vectorised replacement for the S*T Python loop: argmax finds the
    # first negative period per column; columns without any negative value
    # keep the sentinel T + 1.
    is_negative = z_all < 0
    first_negative = is_negative.argmax(axis=0) + 1
    neg_index = np.where(is_negative.any(axis=0), first_negative, T + 1)
    return np.mean(neg_index)
Beispiel #22
0
 def _fill_array_with_index_knl(self, context, idx_dtype, array_dtype):
     """Build the ``fill_array_with_index`` elementwise kernel.

     The kernel executes ``array[index[i]] = val`` for every ``i``.

     Args:
         context: OpenCL context the kernel is created in.
         idx_dtype: dtype rendered as ``${idx_t}`` (type of ``index``).
         array_dtype: dtype rendered as ``${array_t}`` (type of ``array``
             and ``val``).

     Returns:
         A pyopencl ``ElementwiseKernel``.
     """
     arguments = Template(r"""
             ${idx_t} *index,
             ${array_t} *array,
             ${array_t} val
         """).render(idx_t=dtype_to_ctype(idx_dtype),
                     array_t=dtype_to_ctype(array_dtype))

     # The operation has no placeholders but is still rendered as a template.
     operation = Template(r"""
             array[index[i]] = val;
         """).render()

     return ElementwiseKernel(context, arguments, operation,
                              name="fill_array_with_index")
Beispiel #23
0
 def get_kernel(self, kernel_name, **kwargs):
     """Return the profiled elementwise kernel ``kernel_name``, building it once.

     Kernels are cached in ``self.cache`` under the kernel name and its
     keyword arguments; a cached kernel is re-wrapped with
     ``profile_kernel`` on every call.

     Args:
         kernel_name: Name of the kernel; also passed to ``_get_code``.
         **kwargs: Forwarded to ``_get_code``; the values must be hashable
             because they are part of the cache key.

     Returns:
         Whatever ``profile_kernel`` returns for the kernel.
     """
     # Sort the keyword items so the key does not depend on the order in
     # which the caller wrote the keywords; otherwise get_kernel(n, a=1, b=2)
     # and get_kernel(n, b=2, a=1) would build the same kernel twice.
     # Keyword names are unique strings, so the values are never compared.
     key = kernel_name, tuple(sorted(kwargs.items()))
     try:
         knl = self.cache[key]
     except KeyError:
         args, src = self._get_code(kernel_name, **kwargs)
         knl = ElementwiseKernel(self.ctx,
                                 args,
                                 src,
                                 kernel_name,
                                 preamble=self.preamble)
         self.cache[key] = knl
     return profile_kernel(knl, kernel_name)
    def setup_device(self, imshape):
        """Allocate device arrays and build the kernels for ``imshape``.

        Args:
            imshape: Shape of the float32 image array ``clIm``; every other
                work array is created with the same shape and type.
        """
        print('Setting up with imshape = %s' % (str(imshape)))

        self.imshape = imshape

        # Reference array; every other buffer is created like it.
        self.clIm = cla.Array(self.q, imshape, numpy.float32)
        # (attribute, factory) pairs in allocation order: clO and clM start
        # out zeroed, the remaining buffers are left uninitialised.
        buffer_plan = (
            ("clm", cla.empty_like),
            ("clx", cla.empty_like),
            ("cly", cla.empty_like),
            ("clO", cla.zeros_like),
            ("clM", cla.zeros_like),
            ("clF", cla.empty_like),
            ("clS", cla.empty_like),
            ("clThisS", cla.empty_like),
            ("clScratch", cla.empty_like),
        )
        for attr, make_like in buffer_plan:
            setattr(self, attr, make_like(self.clIm))

        self.radial_prg = pyopencl.Program(self.ctx, PROGRAM).build()

        self.sobel = Sobel(self.ctx, self.q)

        #self.sepcorr2d = NaiveSeparableCorrelation(self.ctx, self.q)
        self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q)

        # a += b
        self.accum = ElementwiseKernel(self.ctx, 'float *a, float *b',
                                       'a[i] += b[i]')

        # s = -s / nRadii
        norm_args = 'float *s, const float nRadii'
        self.norm_s = ElementwiseKernel(self.ctx, norm_args,
                                        's[i] = -1 * s[i] / nRadii', 'norm_s')

        # a -= b / nr
        accum_s_args = 'float *a, float *b, const float nr'
        self.accum_s = ElementwiseKernel(self.ctx, accum_s_args,
                                         'a[i] -= b[i] / nr')

        self.gaussians = {}
        self.gaussian_prgs = {}

        self.minmax = MinMaxKernel(self.ctx, self.q)
Beispiel #25
0
 def test_fast_sum2(self):
     """Check ``fast_fp_plus_fp`` on ``self.ah`` + ``self.bl``.

     The high word must equal the sum as computed on the host from the
     unconverted arrays, must differ from the float64 sum, and high + low
     together must reproduce the float64 sum exactly.
     """
     kernel_body = "float2 tmp = fast_fp_plus_fp(a[i], b[i]); res_h[i] = tmp.s0; res_l[i] = tmp.s1"
     test_kernel = ElementwiseKernel(
         self.ctx,
         "float *a, float *b, float *res_h, float *res_l",
         kernel_body,
         preamble=self.doubleword)
     a_g = pyopencl.array.to_device(self.queue, self.ah)
     b_g = pyopencl.array.to_device(self.queue, self.bl)
     res_l = pyopencl.array.empty_like(a_g)
     res_h = pyopencl.array.empty_like(a_g)
     test_kernel(a_g, b_g, res_h, res_l)

     high = res_h.get()
     low = res_l.get()
     # Reference sum with the first operand promoted to float64.
     exact = self.ah.astype(numpy.float64) + self.bl

     self.assertEqual(abs(self.ah + self.bl - high).max(), 0, "Major matches")
     self.assertGreater(abs(exact - high).max(), 0, "Exact mismatches")
     self.assertEqual(abs(exact - (high.astype(numpy.float64) + low)).max(), 0, "Exact matches")
Beispiel #26
0
    def __init__(self, ctx, queue):
        """Store the OpenCL handles and create the allocator, caches and kernel.

        Args:
            ctx: OpenCL context.
            queue: OpenCL command queue; also used by the immediate
                allocator that backs the memory pool.
        """
        self.ctx = ctx
        self.queue = queue

        # Memory pool fed by an allocator bound to this queue.
        self.allocator = clt.ImmediateAllocator(self.queue)
        self.memory_pool = clt.MemoryPool(self.allocator)

        # Caches start out empty.
        self.program_cache = {}
        self.array_cache = {}
        self.arrays = None

        # out = in * in, element by element.
        square_args = "float *in, float *out"
        self.square_array = ElementwiseKernel(self.ctx, square_args,
                                              "out[i] = in[i]*in[i]", "square")
Beispiel #27
0
    def init_kernels(self):
        """Build the OpenCL program and the helper kernels.

        ``CLBacterium.cl`` is loaded as a package resource and built with
        ``cache_dir=False``.  The elementwise helpers and the ``vdot``
        reduction are then created as instance attributes.
        """
        from pkg_resources import resource_string
        kernel_src = resource_string(__name__, 'CLBacterium.cl')

        self.program = cl.Program(self.context,
                                  kernel_src).build(cache_dir=False)

        binary_args = "float8 *res, const float8 *in1, const float8 *in2"
        scaled_args = ("float8 *res, const float k, "
                       "const float8 *in1, const float8 *in2")
        geometry_args = "float* res, float* r, float* l"
        # (attribute, arguments, operation, kernel name), in creation order.
        elementwise_specs = (
            # Some kernels that seem like they should be built into
            # pyopencl...
            ("vclearf", "float8 *v", "v[i]=0.0", "vecclearf"),
            ("vcleari", "int *v", "v[i]=0", "veccleari"),
            ("vadd", binary_args, "res[i] = in1[i] + in2[i]", "vecadd"),
            ("vsub", binary_args, "res[i] = in1[i] - in2[i]", "vecsub"),
            ("vaddkx", scaled_args, "res[i] = in1[i] + k*in2[i]",
             "vecaddkx"),
            ("vsubkx", scaled_args, "res[i] = in1[i] - k*in2[i]",
             "vecsubkx"),
            # cell geometry kernels
            ("calc_cell_area", geometry_args,
             "res[i] = 2.f*3.1415927f*r[i]*(2.f*r[i]+l[i])",
             "cell_area_kern"),
            ("calc_cell_vol", geometry_args,
             "res[i] = 3.1415927f*r[i]*r[i]*(2.f*r[i]+l[i])",
             "cell_vol_kern"),
        )
        for attr, arguments, operation, kernel_name in elementwise_specs:
            setattr(self, attr,
                    ElementwiseKernel(self.context, arguments, operation,
                                      kernel_name))

        # A dot product as sum of float4 dot products -
        # i.e. like flattening vectors of float8s into big float vectors
        # then computing dot
        # NB. Some openCLs seem not to implement dot(float8,float8) so split
        # into float4's
        split_dot = "dot(x[i].s0123,y[i].s0123)+dot(x[i].s4567,y[i].s4567)"
        self.vdot = ReductionKernel(
            self.context,
            numpy.float32,
            neutral="0",
            reduce_expr="a+b",
            map_expr=split_dot,
            arguments="__global float8 *x, __global float8 *y")
Beispiel #28
0
def integrate_in_time_cl(context,
                         dtype,
                         state,
                         rhs_func,
                         dt,
                         final_time,
                         vis_hook=None):
    """Step ``state`` forward from time 0 until ``final_time`` is reached.

    Every time step walks over the coefficient pairs ``(a, b)`` taken from
    the ``rk4a`` and ``rk4b`` sequences (defined outside this function;
    presumably low-storage Runge-Kutta coefficients -- confirm) and applies
    two fused updates per pair::

        residual = a * residual + dt * rhs_func(time, state)
        state    = state + b * residual

    In both kernel calls the output argument is one of the input arrays, so
    ``residual`` and the ``state`` passed in are overwritten in place.
    ``rhs_func`` is evaluated with the same ``time`` for every pair of a step.

    Args:
        context: OpenCL context the elementwise kernel is created for.
        dtype: Element type used for all scalar and vector kernel arguments.
        state: Initial solution. Presumably a pyopencl array or an object
            array of them, since the kernel is wrapped with
            ``as_oarray_func_n_args`` -- confirm against callers.
        rhs_func: Callable ``rhs_func(time, state)`` returning the right-hand
            side.
        dt: Step size; shortened whenever ``time + dt`` would exceed
            ``final_time``.
        final_time: Time at which the loop stops.
        vis_hook: Optional callable ``vis_hook(step, time, state)`` invoked
            once per step. It receives the already-updated state together
            with the time at which that step started.

    Returns:
        Tuple ``(time, state)``: the time reached and the final state.
    """
    step, time = 0, 0

    # Zero-valued accumulator with the same structure as the state.
    residual = 0 * state

    from pyopencl.elementwise import ElementwiseKernel, VectorArg, ScalarArg
    from pytools.obj_array import as_oarray_func_n_args

    # z = a*x + b*y with scalars a, b and vectors x, y, z.
    kernel_args = [
        ScalarArg(dtype, "a"),
        VectorArg(dtype, "x"),
        ScalarArg(dtype, "b"),
        VectorArg(dtype, "y"),
        VectorArg(dtype, "z"),
    ]
    axpby_kernel = ElementwiseKernel(context, kernel_args,
                                     "z[i] = a*x[i] + b*y[i]")

    # The decorator machinery needs a plain function rather than a callable
    # object, hence this thin forwarding wrapper around the kernel.
    def call_axpby(*args):
        return axpby_kernel(*args)

    axpby = as_oarray_func_n_args(call_axpby)

    while time < final_time:
        # Clip the last step so that it does not run past final_time.
        if time + dt > final_time:
            dt = final_time - time

        for coeff_a, coeff_b in zip(rk4a, rk4b):
            rhs = rhs_func(time, state)
            # residual <- coeff_a * residual + dt * rhs
            axpby(coeff_a, residual, dt, rhs, residual)
            # state <- state + coeff_b * residual
            axpby(1, state, coeff_b, residual, state)

        if vis_hook is not None:
            vis_hook(step, time, state)

        time = time + dt
        step += 1

    return time, state
Beispiel #29
0
 def test_dw_div_fp(self):
     """Exercise ``dw_div_fp`` on the device and compare with the host result.

     The kernel packs ``ah[i]`` and ``al[i]`` into a ``float2``, passes it
     with ``b[i]`` to ``dw_div_fp`` (supplied by the ``self.doubleword``
     preamble) and stores the two components of the result in separate
     arrays. The reference value is ``self.a / self.bh`` computed on the host.
     """
     kernel = ElementwiseKernel(
         self.ctx,
         "float *ah, float *al, float *b, float *res_h, float *res_l",
         "float2 tmp = dw_div_fp((float2)(ah[i], al[i]),b[i]); res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
         preamble=self.doubleword)

     # Upload the operands and allocate the two output arrays.
     ah_g = pyopencl.array.to_device(self.queue, self.ah)
     al_g = pyopencl.array.to_device(self.queue, self.al)
     b_g = pyopencl.array.to_device(self.queue, self.bh)
     res_l = pyopencl.array.empty_like(b_g)
     res_h = pyopencl.array.empty_like(b_g)

     kernel(ah_g, al_g, b_g, res_h, res_l)

     # High word on its own, and high + low recombined in float64.
     res_m = res_h.get()
     res = res_m.astype(numpy.float64) + res_l.get()

     expected = self.a / self.bh
     major_error = abs(expected - res_m).max()
     exact_error = abs(expected - res).max()

     # The high word alone is accurate to EPS32 but not to EPS64;
     # adding the low word brings the error below 3 * EPS32 ** 2.
     self.assertLess(major_error, EPS32, "Major matches")
     self.assertGreater(major_error, EPS64, "Exact mismatches")
     self.assertLess(exact_error, 3 * EPS32 ** 2, "Exact matches")
def sim_gpu_nvdi():
    """Compute (nir - red) / (nir + red) with NumPy and with OpenCL, timing both.

    Reads band 1 of ``LC08_B4.tif`` (red) and ``LC08_B5.tif`` (near infrared),
    tiles each array 20 times, evaluates the normalized difference once on
    the host and once with an OpenCL elementwise kernel, saves both results
    as images and prints the two wall-clock times.

    The GPU timing starts before the OpenCL context is created, so it also
    covers context/queue setup, the host-to-device copies, kernel creation
    and the read-back of the result.

    Returns:
        None. Results are written to ``ps_q3_serial_tile20.png`` and
        ``ps_q3_gpu_tile20.png`` and the timings are printed.
    """
    # Open both rasters, read their first band as float64, and let the
    # context manager close the datasets instead of leaking the handles.
    with rasterio.open('LC08_B4.tif') as band4, \
            rasterio.open('LC08_B5.tif') as band5:
        red = band4.read(1).astype('float64')  # red
        nir = band5.read(1).astype('float64')  # nir

    # Tile the arrays 20 times
    red = np.tile(red, 20)
    nir = np.tile(nir, 20)

    # Time the original serial (NumPy) solution
    t0_s = time.time()
    nvdi_s = (nir - red) / (nir + red)
    final_time_s = time.time()
    serial_time = final_time_s - t0_s

    # Save the image produced by the serial solution
    plt.imsave('ps_q3_serial_tile20.png', nvdi_s)

    # Time the GPU solution, including OpenCL context and queue setup
    t0_g = time.time()
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    red_dev = cl_array.to_device(queue, red)
    nir_dev = cl_array.to_device(queue, nir)

    nvdi_comb = ElementwiseKernel(ctx, "double *x, double *y, double *res",
                                  "res[i] = (x[i] - y[i]) / (x[i] + y[i])")

    # Use the same cl_array alias as the uploads above for the output buffer.
    res_gpu = cl_array.empty_like(nir_dev)
    nvdi_comb(nir_dev, red_dev, res_gpu)
    nvdi_gpu = res_gpu.get()

    final_time_g = time.time()
    gpu_time = final_time_g - t0_g

    # Save the image produced by the GPU solution (to show that they match)
    plt.imsave('ps_q3_gpu_tile20.png', nvdi_gpu)

    # Report the time for the serial solution and the GPU solution
    print('The time using serial solution (20 times): {0:.4f} seconds'.format(
        serial_time))
    print('The time using gpu (20 times): {0:.4f} seconds'.format(gpu_time))