def __init__(self, ctx, values):
    """Create the samplers, element-wise kernels and compiled program.

    Parameters
    ----------
    ctx
        OpenCL context handed to every sampler, kernel and program
        built here.
    values
        Mapping whose entries are substituted into the ``kernels.cl``
        template (located next to this module) before compilation.
    """
    # The two samplers differ only in their filter mode.
    repeat_mode = cl.addressing_mode.REPEAT
    self.sampler_nearest = cl.Sampler(
        ctx, True, repeat_mode, cl.filter_mode.NEAREST)
    self.sampler_linear = cl.Sampler(
        ctx, True, repeat_mode, cl.filter_mode.LINEAR)

    self.multiply = ElementwiseKernel(
        ctx,
        "float *x, float *y, float *z",
        "z[i] = x[i] * y[i];")
    self.conj_multiply = ElementwiseKernel(
        ctx,
        "cfloat_t *x, cfloat_t *y, cfloat_t *z",
        "z[i] = cfloat_mul(cfloat_conj(x[i]), y[i]);")

    # Where the mask is set, compute the local value from gcc/ave/ave2 and
    # keep it (together with the rotation index) if it beats the stored one.
    lcc_arguments = (
        "float *gcc, float *ave, float *ave2, int *mask, "
        "float norm_factor, int nrot, float *lcc, int *grot"
    )
    lcc_operation = (
        "float _lcc; "
        "if (mask[i] > 0) { "
        "_lcc = gcc[i] / sqrt(ave2[i] * norm_factor - ave[i] * ave[i]); "
        "if (_lcc > lcc[i]) { "
        "lcc[i] = _lcc; "
        "grot[i] = nrot; "
        "}; "
        "}; "
    )
    self.calc_lcc_and_take_best = ElementwiseKernel(
        ctx, lcc_arguments, lcc_operation)

    # Fill in the template placeholders, then compile the program.
    source_path = os.path.join(os.path.dirname(__file__), 'kernels.cl')
    with open(source_path) as handle:
        program_source = Template(handle.read()).substitute(**values)
    self._program = cl.Program(ctx, program_source).build()
    self._gws_rotate_grid3d = (96, 64, 1)
def _generate(self, declarations=None):
    """Build the element-wise callable for the configured backend.

    The wrapped function is first registered with ``self.tp``.  Depending
    on ``self.backend`` this then returns

    * ``'cython'``: the compiled ``py_<name>`` function of the generated
      module,
    * ``'opencl'``: a PyOpenCL ``ElementwiseKernel`` (``self.queue`` is set
      as a side effect),
    * ``'cuda'``: a PyCUDA ``ElementwiseKernel``.

    Any other backend value yields ``None``.
    """
    self.tp.add(self.func, declarations=declarations)
    backend = self.backend

    if backend == 'cython':
        py_data, c_data = self.cython_gen.get_func_signature(self.func)
        # The first entry of each signature is replaced by the SIZE argument.
        py_signature = ['long SIZE'] + py_data[0][1:]
        c_signature = ['long SIZE'] + c_data[0][1:]
        py_call_args = ['SIZE'] + py_data[1][1:]
        source = Template(text=elementwise_cy_template).render(
            name=self.name,
            c_arg_sig=', '.join(c_signature),
            c_args=', '.join(c_data[1]),
            py_arg_sig=', '.join(py_signature),
            py_args=', '.join(py_call_args),
            openmp=self._config.use_openmp,
            get_parallel_range=get_parallel_range)
        self.tp.add_code(source)
        self.tp.compile()
        return getattr(self.tp.mod, 'py_' + self.name)

    if backend == 'opencl':
        _, c_data = self.cython_gen.get_func_signature(self.func)
        self._correct_opencl_address_space(c_data)

        from .opencl import get_context, get_queue
        from pyopencl.elementwise import ElementwiseKernel
        from pyopencl._cluda import CLUDA_PREAMBLE

        ctx = get_context()
        self.queue = get_queue()
        call_expr = '{func}({args})'.format(
            func=self.func.__name__, args=', '.join(c_data[1]))
        arguments = convert_to_float_if_needed(', '.join(c_data[0][1:]))
        preamble = convert_to_float_if_needed(self.tp.get_code())
        cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
            double_support=True)
        return ElementwiseKernel(
            ctx,
            arguments=arguments,
            operation=call_expr,
            preamble="\n".join([cluda_preamble, preamble]))

    if backend == 'cuda':
        _, c_data = self.cython_gen.get_func_signature(self.func)
        self._correct_opencl_address_space(c_data)

        # set_context() runs before the PyCUDA imports, as in the original.
        from .cuda import set_context
        set_context()
        from pycuda.elementwise import ElementwiseKernel
        from pycuda._cluda import CLUDA_PREAMBLE

        call_expr = '{func}({args})'.format(
            func=self.func.__name__, args=', '.join(c_data[1]))
        arguments = convert_to_float_if_needed(', '.join(c_data[0][1:]))
        preamble = convert_to_float_if_needed(self.tp.get_code())
        cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
            double_support=True)
        return ElementwiseKernel(
            arguments=arguments,
            operation=call_expr,
            preamble="\n".join([cluda_preamble, preamble]))

    return None
def __init__(self, context, queue):
    """
    Set up the solver: load the program and build the helper kernels.

    @param context OpenCL context the program and kernels are built for.
    @param queue OpenCL command queue.
    """
    self.context = context
    self.queue = queue
    self.program = clUtils.loadProgram(context,
                                       clUtils.path() + "/lsqr.cl")
    # The OpenCL objects start out empty; they are generated at the
    # first iteration.
    self.A = self.B = self.X0 = self.X = self.R = None

    # Dot product: sum over i of x[i]*y[i].
    self.dot = ReductionKernel(
        context,
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="x[i]*y[i]",
        arguments="__global float *x, __global float *y")

    # Small vector helpers: scale in place, copy, linear combination and
    # element-wise product.
    self.dot_c_vec = ElementwiseKernel(
        context, "float c, float *v", "v[i] *= c")
    self.copy_vec = ElementwiseKernel(
        context, "float* out, float *in", "out[i] = in[i]")
    self.linear_comb = ElementwiseKernel(
        context,
        "float* z,float a, float *x, float b, float *y",
        "z[i] = a*x[i] + b*y[i]")
    self.prod = ElementwiseKernel(
        context,
        "float* z,float *x, float *y",
        "z[i] = x[i]*y[i]")
def __init__(self, context, queue):
    """
    Set up the solver: load the program and build the helper kernels.

    @param context OpenCL context the program and kernels are built for.
    @param queue OpenCL command queue.
    """
    self.context = context
    self.queue = queue
    self.program = clUtils.loadProgram(context,
                                       clUtils.path() + "/lsqr.cl")
    # The OpenCL objects start out empty; they are generated at the
    # first iteration.
    self.A = self.b = self.x0 = self.x = self.r = None

    # Small vector helpers: scale in place, copy, linear combination and
    # element-wise product.
    self.dot_c_vec = ElementwiseKernel(
        context, "float c, float *v", "v[i] *= c")
    self.copy_vec = ElementwiseKernel(
        context, "float* out, float *in", "out[i] = in[i]")
    self.linear_comb = ElementwiseKernel(
        context,
        "float* z,float a, float *x, float b, float *y",
        "z[i] = a*x[i] + b*y[i]")
    self.prod = ElementwiseKernel(
        context,
        "float *z,float *x, float *y",
        "z[i] = x[i]*y[i]")
def __init__(self, op, parents=None):
    """Create the node for ``op`` and build its forward/backward kernels.

    ``parents`` is forwarded unchanged to the base-class initializer.
    """
    super(ReLU, self).__init__(parents)
    self.ops = [op]

    # Forward: out = max(0, arr).
    forward_kernel = ElementwiseKernel(
        pl.ctx,
        'float *arr, float *out',
        'out[i] = fmax(0.0f, arr[i])',
        'relu_fwd')
    # Backward: pass gy through only where the forward result was positive.
    backward_kernel = ElementwiseKernel(
        pl.ctx,
        'float *res, float *gy, float *gx',
        'gx[i] = res[i]>0 ? gy[i] : 0',
        'relu_bwd')

    self.eval_kernel = forward_kernel
    self.backprop_kernel = backward_kernel
def sim_tilt():
    """Time the index computation on CPU and GPU for 10x and 20x tiled scenes.

    Reads the red (B4) and near-infrared (B5) bands, tiles them 10 and 20
    times, evaluates ``(nir - red) / (nir + red)`` once with NumPy and once
    with a PyOpenCL element-wise kernel, and prints both wall-clock times
    for each tiling factor.  Nothing is returned.
    """
    # Context managers make sure both raster datasets are closed again once
    # the bands have been read.
    with rasterio.open('/project2/macs30123/landsat8/LC08_B4.tif') as band4, \
            rasterio.open('/project2/macs30123/landsat8/LC08_B5.tif') as band5:
        red = band4.read(1).astype('float64')
        nir = band5.read(1).astype('float64')

    for n_tiles in (10, 20):
        red_tiled = np.tile(red, n_tiles)
        nir_tiled = np.tile(nir, n_tiles)

        # cpu (the result is only computed to be timed)
        t0 = time.time()
        nvdi_cpu = (nir_tiled - red_tiled) / (nir_tiled + red_tiled)
        time_cpu = time.time() - t0

        # gpu: the timed span covers context creation, host-to-device
        # transfer, kernel build/launch and the copy back to the host.
        t1 = time.time()
        ctx = cl.create_some_context()
        queue = cl.CommandQueue(ctx)
        red_gpu = cl_array.to_device(queue, red_tiled)
        nir_gpu = cl_array.to_device(queue, nir_tiled)
        nvdi_formula = ElementwiseKernel(
            ctx,
            "double *x, double *y, double *nvdi",
            "nvdi[i] = (x[i] - y[i]) / (x[i] + y[i])")
        nvdi_gpu = cl_array.empty_like(nir_gpu)
        nvdi_formula(nir_gpu, red_gpu, nvdi_gpu)
        nvdi_gpu.get()
        time_gpu = time.time() - t1

        print("The time of CPU computation for %d times scene is" % n_tiles,
              time_cpu)
        print('The time of GPU computation for %d times scene is' % n_tiles,
              time_gpu)
def __init__(self, sino_shape, slice_shape=None, axis_position=None,
             angles=None, ctx=None, devicetype="all", platformid=None,
             deviceid=None, profile=False):
    """Initialise the base algorithm, then the kernels and work arrays.

    All parameters are forwarded unchanged to
    ``ReconstructionAlgorithm.__init__``.
    """
    base_options = dict(
        slice_shape=slice_shape,
        axis_position=axis_position,
        angles=angles,
        ctx=ctx,
        devicetype=devicetype,
        platformid=platformid,
        deviceid=deviceid,
        profile=profile,
    )
    ReconstructionAlgorithm.__init__(self, sino_shape, **base_options)
    self.compute_preconditioners()

    # Linear-algebra helper working on the backprojector's slice shape.
    self.linalg = LinAlg(self.backprojector.slice_shape, ctx=self.ctx)

    # Positivity constraint: clamp negative values to zero.
    self.elwise_clamp = ElementwiseKernel(
        self.ctx, "float *a", "a[i] = max(a[i], 0.0f);")

    # Projection onto the L-infinity ball of radius Lambda, applied to
    # both components of each float2.
    proj_linf_operation = (
        "a[i].x = copysign(min(fabs(a[i].x), Lambda), a[i].x); "
        "a[i].y = copysign(min(fabs(a[i].y), Lambda), a[i].y);"
    )
    self.elwise_proj_linf = ElementwiseKernel(
        self.ctx,
        "float2* a, float Lambda",
        proj_linf_operation,
        "elwise_proj_linf")

    # Additional arrays.  The gradient is evaluated once before
    # linalg.cl_mem["d_gradient"] is used as the template for d_p.
    self.linalg.gradient(self.d_x)
    self.d_p = parray.empty_like(self.linalg.cl_mem["d_gradient"])
    self.d_q = parray.empty_like(self.d_data)
    self.d_g = self.linalg.d_image
    self.d_tmp = parray.empty_like(self.d_x)
    for work_array in (self.d_p, self.d_q, self.d_tmp):
        work_array.fill(0)
    self.add_to_cl_mem({
        "d_p": self.d_p,
        "d_q": self.d_q,
        "d_tmp": self.d_tmp,
    })

    self.theta = 1.0
def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
                have_src_indices, have_dst_indices, map_values):
    """Build the ``gappy_copy_and_map`` element-wise kernel.

    The kernel always takes ``input_ary`` and ``output_ary`` of ``dtype``.
    ``from_indices``, ``to_indices`` and ``value_map`` are appended, in
    that order, only when the matching flag is truthy.  The kernel body is
    rendered from ``GAPPY_COPY_TPL`` with the same flags.
    """
    from boxtree.tools import VectorArg
    from pyopencl.tools import dtype_to_ctype
    from pyopencl.elementwise import ElementwiseKernel

    optional_args = (
        (have_src_indices, src_index_dtype, "from_indices"),
        (have_dst_indices, dst_index_dtype, "to_indices"),
        (map_values, dtype, "value_map"),
    )
    args = [
        VectorArg(dtype, "input_ary"),
        VectorArg(dtype, "output_ary"),
    ]
    args.extend(
        VectorArg(arg_dtype, arg_name)
        for wanted, arg_dtype, arg_name in optional_args
        if wanted
    )

    rendered = GAPPY_COPY_TPL.render(
        dtype=dtype,
        dtype_to_ctype=dtype_to_ctype,
        from_dtype=src_index_dtype,
        to_dtype=dst_index_dtype,
        from_indices=have_src_indices,
        to_indices=have_dst_indices,
        map_values=map_values)

    # The preamble carries the C struct declaration for dtype, generated
    # for the context's first device.
    struct_preamble = dtype_to_c_struct(self.context.devices[0], dtype)
    return ElementwiseKernel(
        self.context,
        args,
        str(rendered),
        preamble=struct_preamble,
        name="gappy_copy_and_map")
def get_elwise_kernel(kernel_name, args, src, preamble=""):
    """Create an element-wise kernel on the current context, wrapped for profiling.

    ``args``, ``src``, ``kernel_name`` and ``preamble`` are passed straight
    to ``ElementwiseKernel``; the result of ``profile_kernel`` is returned.
    """
    kernel = ElementwiseKernel(
        get_context(), args, src, kernel_name, preamble=preamble)
    return profile_kernel(kernel, kernel_name)
def count_global_qbx_centers_knl(self, context, box_id_dtype,
                                 particle_id_dtype):
    """Build the ``count_global_qbx_centers`` element-wise kernel.

    For work item ``i`` the kernel looks up box ``target_boxes[i]``, sums
    ``global_qbx_center_weight`` over that box's particle range
    (``box_target_starts`` / ``box_target_counts_nonchild``) and stores the
    sum in ``nqbx_centers_itgt_box[i]``.  The C types are derived from the
    two dtypes given.
    """
    c_types = dict(
        box_id_t=dtype_to_ctype(box_id_dtype),
        particle_id_t=dtype_to_ctype(particle_id_dtype),
    )

    arguments_tpl = Template(
        r" ${particle_id_t} *nqbx_centers_itgt_box,"
        r" ${particle_id_t} *global_qbx_center_weight,"
        r" ${box_id_t} *target_boxes,"
        r" ${particle_id_t} *box_target_starts,"
        r" ${particle_id_t} *box_target_counts_nonchild "
    )
    operation_tpl = Template(
        r" ${box_id_t} global_box_id = target_boxes[i];"
        r" ${particle_id_t} start = box_target_starts[global_box_id];"
        r" ${particle_id_t} end = start + box_target_counts_nonchild["
        r" global_box_id ];"
        r" ${particle_id_t} nqbx_centers = 0;"
        r" for(${particle_id_t} iparticle = start; iparticle < end;"
        r" iparticle++)"
        r" nqbx_centers += global_qbx_center_weight[iparticle];"
        r" nqbx_centers_itgt_box[i] = nqbx_centers; "
    )

    return ElementwiseKernel(
        context,
        arguments_tpl.render(**c_types),
        operation_tpl.render(**c_types),
        name="count_global_qbx_centers")
def tree_bottom_up(ctx, args, setup, leaf_operation, node_operation,
                   output_expr, preamble=""):
    """Return a callable that runs a kernel over a tree, deepest level first.

    The kernel body is ``NODE_KERNEL_TEMPLATE`` filled with the given code
    fragments; its argument list is ``args`` prefixed with the tree's
    ``offsets`` and ``pbounds`` arrays.

    The returned function takes ``(tree, *kernel_args)``.  It launches the
    kernel once per level from ``tree.depth`` down to 0, each time
    restricted to that level's slice of nodes, and returns the result of
    the last launch (``None`` if there was none).
    """
    operation = NODE_KERNEL_TEMPLATE % dict(
        setup=setup,
        leaf_operation=leaf_operation,
        node_operation=node_operation,
        output_expr=output_expr)
    full_args = "int *offsets, uint2 *pbounds" + ', ' + args
    kernel = ElementwiseKernel(
        ctx, full_args, operation=operation, preamble=preamble)

    def run_bottom_up(tree, *kernel_args):
        # lower/upper delimit the current level's nodes; walking the levels
        # backwards peels each level's count off the running total.
        lower = tree.total_nodes
        result = None
        for level in reversed(range(tree.depth + 1)):
            upper = lower
            lower -= tree.num_nodes[level]
            result = kernel(
                tree.offsets.dev,
                tree.pbounds.dev,
                *kernel_args,
                slice=slice(lower, upper))
        return result

    return run_bottom_up
def __init__(self, ctx, queue, dtype=np.float32):
    """Upload the two 1-D filter taps and build the correlation helpers.

    ``dtype`` selects the element type of the taps and the C type of the
    ``imgx``/``imgy`` arguments of the magnitude kernel.  Only float32,
    uint8 and uint16 are mapped to a C type; any other value leaves the
    type name empty.
    """
    self.ctx = ctx
    self.queue = queue

    taps_c = np.array([1., 0., -1.]).astype(dtype)
    taps_r = np.array([1., 2., 1.]).astype(dtype)
    self.sobel_c = cl_array.to_device(self.queue, taps_c)
    self.sobel_r = cl_array.to_device(self.queue, taps_r)
    self.scratch = None

    # The two separable correlations use the taps in opposite order.
    self.sepconv_rc = LocalMemorySeparableCorrelation(
        self.ctx, self.queue, taps_r, taps_c)
    self.sepconv_cr = LocalMemorySeparableCorrelation(
        self.ctx, self.queue, taps_c, taps_r)

    # The first dtype that compares equal wins; no match leaves "".
    c_type_table = (
        (np.float32, "float"),
        (np.uint8, "unsigned char"),
        (np.uint16, "unsigned short"),
    )
    c_type = ""
    for candidate, c_name in c_type_table:
        if dtype == candidate:
            c_type = c_name
            break

    self.mag = ElementwiseKernel(
        ctx,
        "float *result, %s *imgx, %s *imgy" % (c_type, c_type),
        "result[i] = sqrt((float)imgx[i]*imgx[i] + (float)imgy[i]*imgy[i])",
        "mag")
def get_neg_indx(rho, ran):
    """Simulate S series of T periods and return minus the summed first-negative indices.

    Each series follows ``z_t = rho * z_{t-1} + (1 - rho) * mu + eps_t``,
    starting from the value 3 and evaluated one period at a time by an
    element-wise kernel.  For every series the 0-based index of the first
    period with a negative value is taken (0 if it never goes negative);
    the function returns the negated sum of these indices.

    Relies on the module-level ``queue`` and ``ctx``.  ``ran`` is not used.
    """
    # NOTE(review): the rho argument is overwritten right away, so the
    # caller's value has no effect — confirm whether this is a leftover.
    rho = 0.5
    mu = 3
    S = 1000
    T = 4160

    eps_host = sts.norm.rvs(loc=0, scale=1, size=(S * T)).astype(np.float32)
    initial_host = np.zeros(S).astype(np.float32) + 3
    eps_dev = cl.array.to_device(queue, eps_host)
    initial_dev = cl.array.to_device(queue, initial_host)

    mknl = ElementwiseKernel(
        ctx,
        "float *a, float *b, float rho, float mu, float *rslt",
        "rslt[i] = rho * a[i] +(1-rho)*mu+b[i]")

    output = cl.array.empty_like(eps_dev)
    mknl(initial_dev, eps_dev[:S], rho, mu, output[:S])
    for i in range(1, T):
        # Every operand covers one full period of S entries.  The shock
        # slice previously ended at S * i + 1, i.e. it had length 1.
        mknl(output[S * (i - 1):S * i],
             eps_dev[S * i:S * (i + 1)],
             rho,
             mu,
             output[S * i:S * (i + 1)])

    z_mat = output.get().reshape(T, S)

    # argmax on the boolean mask gives the first negative period of every
    # series; series that never go negative keep index 0.
    negative = z_mat < 0
    fst_neg_indx = np.where(
        negative.any(axis=0), negative.argmax(axis=0), 0).astype(np.float64)
    return -np.sum(fst_neg_indx)
def point_tree_traverse(ctx, k, args, setup, node_operation, leaf_operation,
                        output_expr, common_operation, preamble=""):
    """Return a callable that launches a traversal kernel built from a template.

    The kernel body is ``POINT_DFS_TEMPLATE`` filled with the given code
    fragments, ``k`` and a fixed ``max_depth`` of 21; its argument list is
    ``args`` prefixed with ``cids`` and ``offsets``.

    The returned function takes ``(tree_src, tree_dst, *kernel_args)`` and
    passes ``tree_dst.cids.dev`` and ``tree_src.offsets.dev`` ahead of the
    remaining arguments.
    """
    # FIXME: variable max_depth
    substitutions = dict(
        setup=setup,
        leaf_operation=leaf_operation,
        node_operation=node_operation,
        common_operation=common_operation,
        output_expr=output_expr,
        max_depth=21,
        k=k)
    operation = POINT_DFS_TEMPLATE % substitutions
    full_args = "int *cids, int *offsets" + ', ' + args
    kernel = ElementwiseKernel(
        ctx, full_args, operation=operation, preamble=preamble)

    def run_traversal(tree_src, tree_dst, *kernel_args):
        return kernel(tree_dst.cids.dev, tree_src.offsets.dev, *kernel_args)

    return run_traversal
def test_elwise_kernel_with_options(ctx_factory):
    """Check that build options reach the element-wise kernel.

    The kernel only adds one when ``ADD_ONE`` is defined, and the macro is
    supplied solely through ``options``; the output must therefore equal
    the input plus one.
    """
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    in_gpu = clrand(queue, (50,), np.float32)

    options = ["-D", "ADD_ONE"]
    # Preprocessor directives have to start on their own line, so the
    # kernel text is kept as a multi-line literal.
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=options,
    )

    out_gpu = cl_array.empty_like(in_gpu)
    add_one(out_gpu, in_gpu)

    gt = in_gpu.get() + 1
    gv = out_gpu.get()
    assert la.norm(gv - gt) < 1e-5
def process_m2qbxl_knl(self, context, box_id_dtype, particle_id_dtype):
    """Build the ``process_m2qbxl`` element-wise kernel.

    For work item ``i`` the kernel reads the target-box index
    ``idx_to_itgt_box[i]``, multiplies that box's entry of
    ``nqbx_centers_itgt_box`` by ``ssn_starts[i + 1] - ssn_starts[i]`` and
    by ``m2qbxl_cost``, and adds the product to ``nm2qbxl`` at the
    target-box index.  The C types are derived from the two dtypes given.
    """
    c_types = dict(
        box_id_t=dtype_to_ctype(box_id_dtype),
        particle_id_t=dtype_to_ctype(particle_id_dtype),
    )

    arguments_tpl = Template(
        r" ${box_id_t} *idx_to_itgt_box,"
        r" ${particle_id_t} *nqbx_centers_itgt_box,"
        r" ${box_id_t} *ssn_starts,"
        r" double *nm2qbxl,"
        r" double m2qbxl_cost "
    )
    # The body contains // line comments, so every comment and statement
    # must sit on its own line; otherwise a comment would swallow the code
    # that follows it.
    # NOTE(review): the += assumes no two work items share an itgt_box
    # within one launch — confirm against the caller.
    operation_tpl = Template(r"""
        // get the index of current box in target_boxes
        ${box_id_t} itgt_box = idx_to_itgt_box[i];
        // get the number of expansion centers in current box
        ${particle_id_t} nqbx_centers = nqbx_centers_itgt_box[itgt_box];
        // get the number of list 3 boxes of the current box in a particular
        // level
        ${box_id_t} nlist3_boxes = ssn_starts[i + 1] - ssn_starts[i];
        // calculate the cost
        nm2qbxl[itgt_box] += (nqbx_centers * nlist3_boxes * m2qbxl_cost);
        """)

    return ElementwiseKernel(
        context,
        arguments_tpl.render(**c_types),
        operation_tpl.render(**c_types),
        name="process_m2qbxl"
    )
def OCL_NORMALIZE(X1, Y1):
    """Min-max normalise ``X1`` and ``Y1`` on an OpenCL device.

    Both inputs are converted to float32, scaled to
    ``(v - min) / (max - min)`` by element-wise kernels and copied back to
    the host.  The results are stored in the module globals ``arrX`` and
    ``arrY``, and ``DataSet`` is rebuilt from them.  The arrays and the
    kernel run time are printed.  Returns ``None``.

    NOTE(review): a constant input (max == min) divides by zero inside the
    kernel — confirm whether callers can pass such data.
    """
    global arrX
    global arrY
    global DataSet

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    x = np.asarray(X1, dtype=np.float32)
    y = np.asarray(Y1, dtype=np.float32)
    XX = cl_array.to_device(queue, x)
    YY = cl_array.to_device(queue, y)
    V = cl_array.empty_like(XX)
    V1 = cl_array.empty_like(YY)

    # np.float was only an alias of the builtin float and has been removed
    # from NumPy, so the builtin is used directly.
    Xmax = float(x.max())
    Ymax = float(y.max())
    Xmin = float(x.min())
    Ymin = float(y.min())

    miniMaxX = ElementwiseKernel(
        ctx,
        "float *x,float Xmax,float Xmin,float *v",
        "v[i] = (x[i]-Xmin)/(Xmax-Xmin);",
        "sum")
    miniMaxY = ElementwiseKernel(
        ctx,
        "float *y,float Ymax,float Ymin,float *v",
        "v[i] = (y[i]-Ymin)/(Ymax-Ymin);",
        "sum")

    start_timer = time.time()
    print('Timer: on')
    miniMaxX(XX, Xmax, Xmin, V)
    miniMaxY(YY, Ymax, Ymin, V1)
    arrX = V.T.get()
    arrY = V1.T.get()
    print(arrX)
    print(arrY)
    time_working = time.time() - start_timer
    print('\nTimer: stop; time: {} seconds'.format(round(time_working, 3)))

    DataSet = DataSets(arrX, arrY)
    return
def get_kernel(self, kernel_name, **kwargs):
    """Build the element-wise kernel called ``kernel_name``.

    The argument list and source come from ``self._get_code``, which
    receives ``kwargs`` unchanged; ``self.preamble`` is used as preamble.
    """
    arguments, source = self._get_code(kernel_name, **kwargs)
    return ElementwiseKernel(
        self.ctx, arguments, source, kernel_name, preamble=self.preamble)
def __init__(self, cq, shape, prefer_add=False, n_iter=10):
    """Prepare shapes, crop regions and the element-wise kernels.

    Parameters
    ----------
    cq
        Passed to ``parse_cq`` to obtain the OpenCL context and queue.
    shape
        Input shape, one entry per axis.
    prefer_add
        Forwarded to ``find_optimal_size`` for every axis.
    n_iter
        Stored as ``self.n_iter``.
    """
    self.context, self.queue = parse_cq(cq)

    self._in_shape = tuple(shape)
    self._conv_shape = None
    self._out_shape = tuple(
        find_optimal_size(n, prefer_add=prefer_add) for n in self._in_shape
    )
    logger.info("shape: in=%s, out=%s", self._in_shape, self._out_shape)

    self.n_iter = n_iter

    # Determine, per axis, which part of the source is copied to which
    # part of the destination; the smaller extent is centred in the larger.
    in_roi, out_roi = [], []
    for n_in, n_out in zip(self._in_shape, self._out_shape):
        dn = n_out - n_in
        if dn < 0:
            # smaller output: take the central n_out samples of the input
            start = (-dn) // 2
            in_roi.append(slice(start, start + n_out))
            out_roi.append(slice(0, n_out))
        elif dn > 0:
            # smaller input: place it in the centre of the output
            # (this branch previously used the undefined name ``d``)
            start = dn // 2
            in_roi.append(slice(0, n_in))
            out_roi.append(slice(start, start + n_in))
        else:
            in_roi.append(slice(0, n_in))
            out_roi.append(slice(0, n_out))
    in_roi, out_roi = tuple(in_roi), tuple(out_roi)

    def _crop_func(dst, src):
        dst[out_roi] = src[in_roi]

    self._crop_func = _crop_func

    # out <- ref / max(out, eps) + eps
    self._estimate_func = ElementwiseKernel(
        self.context,
        "float *out, float *ref, float eps",
        "out[i] = ref[i] / ((out[i] > eps) ? out[i] : eps) + eps",
        "estimate_func")
    # Clip negative values to zero in place.
    self._pos_clip_func = ElementwiseKernel(
        self.context,
        "float *im",
        "im[i] = (im[i] > 0) ? im[i] : 0",
        "_pos_clip_func")
def get_copy_kernel(ctx, dtype1, dtype2, varnames):
    """Build a kernel that copies ``<v>1`` into ``<v>2`` with a cast to ``dtype2``.

    For every name ``v`` in ``varnames`` the kernel takes a ``dtype1``
    pointer ``v1`` and a ``dtype2`` pointer ``v2``; all source pointers
    come first, then all destination pointers.
    """
    source_args = ['%s *%s1' % (dtype1, v) for v in varnames]
    target_args = ['%s *%s2' % (dtype2, v) for v in varnames]
    args = ', '.join(source_args + target_args)

    statements = (
        '%s2[i] = (%s)%s1[i];' % (v, dtype2, v) for v in varnames
    )
    operation = '; '.join(statements)

    return ElementwiseKernel(ctx, args, operation=operation)
def sim_health_index(r):
    """Simulate S health series on the GPU and return the mean first-negative period.

    Each series follows ``z_t = rho * z_{t-1} + (1 - rho) * mu + eps_t``
    with ``rho = r`` and ``mu = 3.0``, starting from ``mu`` and advanced
    one period at a time by an element-wise kernel.  For every series the
    1-based index of the first period with a negative value is recorded
    (``T + 1`` if it never goes negative); the mean over all series is
    returned.
    """
    # Set up context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Model parameters
    S = 1000  # number of lives to simulate
    T = 4160  # number of periods per simulation
    rho = r
    mu = 3.0
    np.random.seed(25)

    # Starting values and random shocks
    init_np = np.zeros(S).astype(np.float32) + mu
    epsm_np = sts.norm.rvs(loc=0, scale=1.0, size=T * S).astype(np.float32)
    init_g = cl.array.to_device(queue, init_np)
    epsm_g = cl.array.to_device(queue, epsm_np)

    prefix_sum = ElementwiseKernel(
        ctx,
        "float *a_g, float *b_g, float *res_g, float rho, float mu",
        "res_g[i] = rho * a_g[i]+(1-rho)*mu + b_g[i]")

    # Result buffer on the device: T consecutive blocks of S values.
    dev_result = cl_array.empty_like(epsm_g)

    # Period 0 starts from the initial values; every later period reads
    # the block written for the period before it.
    prefix_sum(init_g, epsm_g[:S], dev_result[:S], rho, mu)
    for i in range(1, T):
        prefix_sum(dev_result[S * (i - 1):S * i],
                   epsm_g[S * i:S * (i + 1)],
                   dev_result[S * i:S * (i + 1)],
                   rho,
                   mu)

    # Get results back on CPU
    z_all = dev_result.get().reshape(T, S)

    # argmax on the boolean mask gives the first negative period of every
    # series; series that never go negative get T + 1.
    negative = z_all < 0
    neg_index = np.where(
        negative.any(axis=0), negative.argmax(axis=0) + 1, T + 1)
    return np.mean(neg_index)
def _fill_array_with_index_knl(self, context, idx_dtype, array_dtype):
    """Build the ``fill_array_with_index`` element-wise kernel.

    The kernel stores ``val`` at ``array[index[i]]`` for every work item
    ``i``.  The C types of the arguments are derived from the two dtypes.
    """
    arguments = Template(
        r" ${idx_t} *index,"
        r" ${array_t} *array,"
        r" ${array_t} val "
    ).render(
        idx_t=dtype_to_ctype(idx_dtype),
        array_t=dtype_to_ctype(array_dtype))
    operation = Template(r""" array[index[i]] = val; """).render()

    return ElementwiseKernel(
        context, arguments, operation, name="fill_array_with_index")
def get_kernel(self, kernel_name, **kwargs):
    """Return the profiled kernel for ``kernel_name``, building it on first use.

    Kernels are cached in ``self.cache`` under the kernel name together
    with the keyword items, so the same name with different keyword
    arguments yields a separate kernel.
    """
    cache_key = (kernel_name, tuple(kwargs.items()))
    if cache_key not in self.cache:
        arguments, source = self._get_code(kernel_name, **kwargs)
        self.cache[cache_key] = ElementwiseKernel(
            self.ctx, arguments, source, kernel_name,
            preamble=self.preamble)
    return profile_kernel(self.cache[cache_key], kernel_name)
def setup_device(self, imshape):
    """Allocate the device buffers and build the kernels for ``imshape``.

    All work buffers take their shape and dtype from ``self.clIm``, a
    float32 array of shape ``imshape`` created on ``self.q``.
    """
    print('Setting up with imshape = %s' % str(imshape))

    self.imshape = imshape
    self.clIm = cla.Array(self.q, imshape, numpy.float32)

    # Attribute name -> allocator, in allocation order.  clO and clM start
    # zero-filled; the remaining buffers are left uninitialised.
    buffer_plan = (
        ('clm', cla.empty_like),
        ('clx', cla.empty_like),
        ('cly', cla.empty_like),
        ('clO', cla.zeros_like),
        ('clM', cla.zeros_like),
        ('clF', cla.empty_like),
        ('clS', cla.empty_like),
        ('clThisS', cla.empty_like),
        ('clScratch', cla.empty_like),
    )
    for attr_name, allocate in buffer_plan:
        setattr(self, attr_name, allocate(self.clIm))

    self.radial_prg = pyopencl.Program(self.ctx, PROGRAM).build()

    self.sobel = Sobel(self.ctx, self.q)

    # Alternative implementation: NaiveSeparableCorrelation(self.ctx, self.q)
    self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q)

    # a += b
    self.accum = ElementwiseKernel(
        self.ctx, 'float *a, float *b', 'a[i] += b[i]')
    # s = -s / nRadii
    self.norm_s = ElementwiseKernel(
        self.ctx,
        'float *s, const float nRadii',
        's[i] = -1 * s[i] / nRadii',
        'norm_s')
    # a -= b / nr
    self.accum_s = ElementwiseKernel(
        self.ctx,
        'float *a, float *b, const float nr',
        'a[i] -= b[i] / nr')

    self.gaussians = {}
    self.gaussian_prgs = {}

    self.minmax = MinMaxKernel(self.ctx, self.q)
def test_fast_sum2(self):
    """Check ``fast_fp_plus_fp`` on ``self.ah + self.bl``.

    The high word must equal the float32 sum, must differ from the
    float64 sum, and high + low together must reproduce the float64 sum
    exactly.
    """
    test_kernel = ElementwiseKernel(
        self.ctx,
        "float *a, float *b, float *res_h, float *res_l",
        "float2 tmp = fast_fp_plus_fp(a[i], b[i]); "
        "res_h[i] = tmp.s0; res_l[i] = tmp.s1",
        preamble=self.doubleword)

    a_g = pyopencl.array.to_device(self.queue, self.ah)
    b_g = pyopencl.array.to_device(self.queue, self.bl)
    res_l = pyopencl.array.empty_like(a_g)
    res_h = pyopencl.array.empty_like(a_g)
    test_kernel(a_g, b_g, res_h, res_l)

    high = res_h.get()
    low = res_l.get()
    exact_sum = self.ah.astype(numpy.float64) + self.bl

    self.assertEqual(abs(self.ah + self.bl - high).max(), 0,
                     "Major matches")
    self.assertGreater(abs(exact_sum - high).max(), 0,
                       "Exact mismatches")
    self.assertEqual(
        abs(exact_sum - (high.astype(numpy.float64) + low)).max(), 0,
        "Exact matches")
def __init__(self, ctx, queue):
    """Store the context and queue and set up the allocator, caches and kernel."""
    self.ctx = ctx
    self.queue = queue

    # Device memory is served from a pool backed by an immediate allocator
    # bound to the queue.
    immediate_allocator = clt.ImmediateAllocator(self.queue)
    self.allocator = immediate_allocator
    self.memory_pool = clt.MemoryPool(immediate_allocator)

    self.program_cache = {}
    self.array_cache = {}
    self.arrays = None

    # out = in * in
    self.square_array = ElementwiseKernel(
        self.ctx,
        "float *in, float *out",
        "out[i] = in[i]*in[i]",
        "square")
def init_kernels(self):
    """Set up the OpenCL kernels."""
    from pkg_resources import resource_string

    program_source = resource_string(__name__, 'CLBacterium.cl')
    self.program = cl.Program(self.context,
                              program_source).build(cache_dir=False)

    ctx = self.context

    # Some kernels that seem like they should be built into pyopencl...
    self.vclearf = ElementwiseKernel(
        ctx, "float8 *v", "v[i]=0.0", "vecclearf")
    self.vcleari = ElementwiseKernel(
        ctx, "int *v", "v[i]=0", "veccleari")

    binary_args = "float8 *res, const float8 *in1, const float8 *in2"
    self.vadd = ElementwiseKernel(
        ctx, binary_args, "res[i] = in1[i] + in2[i]", "vecadd")
    self.vsub = ElementwiseKernel(
        ctx, binary_args, "res[i] = in1[i] - in2[i]", "vecsub")

    scaled_args = (
        "float8 *res, const float k, const float8 *in1, const float8 *in2"
    )
    self.vaddkx = ElementwiseKernel(
        ctx, scaled_args, "res[i] = in1[i] + k*in2[i]", "vecaddkx")
    self.vsubkx = ElementwiseKernel(
        ctx, scaled_args, "res[i] = in1[i] - k*in2[i]", "vecsubkx")

    # cell geometry kernels
    geometry_args = "float* res, float* r, float* l"
    self.calc_cell_area = ElementwiseKernel(
        ctx,
        geometry_args,
        "res[i] = 2.f*3.1415927f*r[i]*(2.f*r[i]+l[i])",
        "cell_area_kern")
    self.calc_cell_vol = ElementwiseKernel(
        ctx,
        geometry_args,
        "res[i] = 3.1415927f*r[i]*r[i]*(2.f*r[i]+l[i])",
        "cell_vol_kern")

    # A dot product as sum of float4 dot products -
    # i.e. like flattening vectors of float8s into big float vectors
    # then computing dot.
    # NB. Some OpenCLs seem not to implement dot(float8,float8), so each
    # float8 is split into two float4s.
    self.vdot = ReductionKernel(
        ctx,
        numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="dot(x[i].s0123,y[i].s0123)+dot(x[i].s4567,y[i].s4567)",
        arguments="__global float8 *x, __global float8 *y")
def integrate_in_time_cl(context, dtype, state, rhs_func, dt, final_time,
                         vis_hook=None):
    """Advance ``state`` from time 0 to ``final_time`` with a staged update.

    Each step loops over the coefficient pairs of the module-level
    ``rk4a``/``rk4b``, updating a residual and the state in place through
    an ``a*x + b*y`` element-wise kernel.  The last step is shortened so
    the integration ends at ``final_time``.  ``vis_hook(step, time, state)``
    is called before each step when given.

    Returns the final time and the state.
    """
    from pyopencl.elementwise import ElementwiseKernel, VectorArg, ScalarArg
    from pytools.obj_array import as_oarray_func_n_args

    t = 0
    n_steps = 0
    residual = 0 * state

    kernel_args = [
        ScalarArg(dtype, "a"),
        VectorArg(dtype, "x"),
        ScalarArg(dtype, "b"),
        VectorArg(dtype, "y"),
        VectorArg(dtype, "z"),
    ]
    axpby_knl = ElementwiseKernel(
        context, kernel_args, "z[i] = a*x[i] + b*y[i]")

    # The decorator module won't work on callable objects, hence the
    # plain-function wrapper.
    def axpby_wrapper(*args):
        return axpby_knl(*args)

    axpby = as_oarray_func_n_args(axpby_wrapper)

    # outer time step loop
    while t < final_time:
        if t + dt > final_time:
            dt = final_time - t

        for coeff_a, coeff_b in zip(rk4a, rk4b):
            rhs = rhs_func(t, state)
            # residual = a*residual + dt*rhs
            axpby(coeff_a, residual, dt, rhs, residual)
            # state = state + b*residual
            axpby(1, state, coeff_b, residual, state)

        if vis_hook is not None:
            vis_hook(n_steps, t, state)

        # Increment time
        t = t + dt
        n_steps += 1

    return t, state
def test_dw_div_fp(self):
    """Check ``dw_div_fp`` on ``(self.ah, self.al) / self.bh``.

    The high word must match ``self.a / self.bh`` to within EPS32 but not
    to within EPS64, and high + low together must match to within
    ``3 * EPS32 ** 2``.
    """
    test_kernel = ElementwiseKernel(
        self.ctx,
        "float *ah, float *al, float *b, float *res_h, float *res_l",
        "float2 tmp = dw_div_fp((float2)(ah[i], al[i]),b[i]); "
        "res_h[i]=tmp.s0; res_l[i]=tmp.s1;",
        preamble=self.doubleword)

    ah_g = pyopencl.array.to_device(self.queue, self.ah)
    al_g = pyopencl.array.to_device(self.queue, self.al)
    b_g = pyopencl.array.to_device(self.queue, self.bh)
    res_l = pyopencl.array.empty_like(b_g)
    res_h = pyopencl.array.empty_like(b_g)
    test_kernel(ah_g, al_g, b_g, res_h, res_l)

    major = res_h.get()
    combined = major.astype(numpy.float64) + res_l.get()
    reference = self.a / self.bh

    self.assertLess(abs(reference - major).max(), EPS32,
                    "Major matches")
    self.assertGreater(abs(reference - major).max(), EPS64,
                       "Exact mismatches")
    self.assertLess(abs(reference - combined).max(), 3 * EPS32 ** 2,
                    "Exact matches")
def sim_gpu_nvdi():
    """Compare serial and GPU timings of the index on a 20x tiled scene.

    Reads the red (B4) and near-infrared (B5) bands, tiles both 20 times,
    evaluates ``(nir - red) / (nir + red)`` with NumPy and with a PyOpenCL
    element-wise kernel, saves both results as PNG images and prints the
    two wall-clock times.  Nothing is returned.
    """
    # Context managers make sure both raster datasets are closed again once
    # the bands have been read.
    with rasterio.open('LC08_B4.tif') as band4, \
            rasterio.open('LC08_B5.tif') as band5:
        # Convert the red and nir bands to float64 arrays
        red = band4.read(1).astype('float64')
        nir = band5.read(1).astype('float64')

    # Tile the arrays 20 times
    red = np.tile(red, 20)
    nir = np.tile(nir, 20)

    # Computation time of the serial solution
    t0_s = time.time()
    nvdi_s = (nir - red) / (nir + red)
    serial_time = time.time() - t0_s

    # Plot the graph using the serial solution
    plt.imsave('ps_q3_serial_tile20.png', nvdi_s)

    # Computation time on the GPU; the timed span covers context creation,
    # transfers in both directions and the kernel build/launch.
    t0_g = time.time()
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    red_dev = cl_array.to_device(queue, red)
    nir_dev = cl_array.to_device(queue, nir)
    nvdi_comb = ElementwiseKernel(
        ctx,
        "double *x, double *y, double *res",
        "res[i] = (x[i] - y[i]) / (x[i] + y[i])")
    res_gpu = cl_array.empty_like(nir_dev)
    nvdi_comb(nir_dev, red_dev, res_gpu)
    nvdi_gpu = res_gpu.get()
    gpu_time = time.time() - t0_g

    # Plot the graph using the GPU solution (to show that they are the same)
    plt.imsave('ps_q3_gpu_tile20.png', nvdi_gpu)

    # Report the time for the serial solution and the GPU
    print('The time using serial solution (20 times): {0:.4f} seconds'.format(
        serial_time))
    print('The time using gpu (20 times): {0:.4f} seconds'.format(gpu_time))