def test_dtype_support(thr, dtype): # Test passes if either thread correctly reports that it does not support given dtype, # or it successfully compiles kernel that operates with this dtype. N = 256 if not thr.device_params.supports_dtype(dtype): pytest.skip() mul = functions.mul(dtype, dtype) div = functions.div(dtype, dtype) program = thr.compile( """ KERNEL void test( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b) { const SIZE_T i = get_global_id(0); ${ctype} temp = ${mul}(a[i], b[i]); dest[i] = ${div}(temp, b[i]); } """, render_kwds=dict(ctype=dtypes.ctype(dtype), dtype=dtype, mul=mul, div=div)) test = program.test # we need results to fit even in unsigned char a = get_test_array(N, dtype, high=8) b = get_test_array(N, dtype, no_zeros=True, high=8) a_dev = thr.to_device(a) b_dev = thr.to_device(b) dest_dev = thr.empty_like(a_dev) test(dest_dev, a_dev, b_dev, global_size=N) assert diff_is_negligible(thr.from_device(dest_dev), a)
def get_nonlinear3(state_arr, scalar_dtype, nonlinear_module, dt): # k4 = N(D(psi_4), t + dt) # output = D(psi_k) + k4 / 6 return PureParallel( [ Parameter('output', Annotation(state_arr, 'o')), Parameter('kprop_psi_k', Annotation(state_arr, 'i')), Parameter('kprop_psi_4', Annotation(state_arr, 'i')), Parameter('t', Annotation(scalar_dtype))], """ <% all_indices = ', '.join(idxs) %> ${output.ctype} psi4_0 = ${kprop_psi_4.load_idx}(0, ${all_indices}); ${output.ctype} psi4_1 = ${kprop_psi_4.load_idx}(1, ${all_indices}); ${output.ctype} psik_0 = ${kprop_psi_k.load_idx}(0, ${all_indices}); ${output.ctype} psik_1 = ${kprop_psi_k.load_idx}(1, ${all_indices}); ${output.ctype} k4_0 = ${nonlinear}0(psi4_0, psi4_1, ${t} + ${dt}); ${output.ctype} k4_1 = ${nonlinear}1(psi4_0, psi4_1, ${t} + ${dt}); ${output.store_idx}(0, ${all_indices}, psik_0 + ${div}(k4_0, 6)); ${output.store_idx}(1, ${all_indices}, psik_1 + ${div}(k4_1, 6)); """, guiding_array=state_arr.shape[1:], render_kwds=dict( nonlinear=nonlinear_module, dt=dtypes.c_constant(dt, scalar_dtype), div=functions.div(state_arr.dtype, numpy.int32, out_dtype=state_arr.dtype)))
def get_nonlinear3(state_arr, scalar_dtype, nonlinear_module, dt): # k4 = N(D(psi_4), t + dt) # output = D(psi_k) + k4 / 6 return PureParallel([ Parameter('output', Annotation(state_arr, 'o')), Parameter('kprop_psi_k', Annotation(state_arr, 'i')), Parameter('kprop_psi_4', Annotation(state_arr, 'i')), Parameter('t', Annotation(scalar_dtype)) ], """ <% all_indices = ', '.join(idxs) %> ${output.ctype} psi4_0 = ${kprop_psi_4.load_idx}(0, ${all_indices}); ${output.ctype} psi4_1 = ${kprop_psi_4.load_idx}(1, ${all_indices}); ${output.ctype} psik_0 = ${kprop_psi_k.load_idx}(0, ${all_indices}); ${output.ctype} psik_1 = ${kprop_psi_k.load_idx}(1, ${all_indices}); ${output.ctype} k4_0 = ${nonlinear}0(psi4_0, psi4_1, ${t} + ${dt}); ${output.ctype} k4_1 = ${nonlinear}1(psi4_0, psi4_1, ${t} + ${dt}); ${output.store_idx}(0, ${all_indices}, psik_0 + ${div}(k4_0, 6)); ${output.store_idx}(1, ${all_indices}, psik_1 + ${div}(k4_1, 6)); """, guiding_array=state_arr.shape[1:], render_kwds=dict( nonlinear=nonlinear_module, dt=dtypes.c_constant(dt, scalar_dtype), div=functions.div(state_arr.dtype, numpy.int32, out_dtype=state_arr.dtype)))
def get_nonlinear3(state_arr, potential_arr, scalar_dtype, nonlinear_module, dt): # k4 = N(D(psi_4), t + dt) # output = D(psi_k) + k4 / 6 return PureParallel( [ Parameter('output', Annotation(state_arr, 'o')), Parameter('kprop_psi_k', Annotation(state_arr, 'i')), Parameter('kprop_psi_4', Annotation(state_arr, 'i')), Parameter('potential_next', Annotation(potential_arr, 'i')), Parameter('t', Annotation(scalar_dtype))], """ %for comp in range(components): ${output.ctype} psi4_${comp} = ${kprop_psi_4.load_idx}(${comp}, ${idxs.all()}); ${output.ctype} psik_${comp} = ${kprop_psi_k.load_idx}(${comp}, ${idxs.all()}); %endfor ${potential_next.ctype} V = ${potential_next.load_idx}(${', '.join(idxs[1:])}); %for comp in range(components): ${output.ctype} k4_${comp} = ${nonlinear}${comp}( %for pcomp in range(components): psi4_${pcomp}, %endfor V, ${t} + ${dt}); ${output.store_idx}(${comp}, ${idxs.all()}, psik_${comp} + ${div}(k4_${comp}, 6)); %endfor """, guiding_array=state_arr.shape[1:], render_kwds=dict( components=state_arr.shape[0], nonlinear=nonlinear_module, dt=dtypes.c_constant(dt, scalar_dtype), div=functions.div(state_arr.dtype, numpy.int32, out_dtype=state_arr.dtype)))
def get_nonlinear2(state_arr, scalar_dtype, nonlinear_module, dt): # k2 = N(psi_I + k1 / 2, t + dt / 2) # k3 = N(psi_I + k2 / 2, t + dt / 2) # psi_4 = psi_I + k3 (argument for the 4-th step k-propagation) # psi_k = psi_I + (k1 + 2(k2 + k3)) / 6 (argument for the final k-propagation) return PureParallel([ Parameter('psi_k', Annotation(state_arr, 'o')), Parameter('psi_4', Annotation(state_arr, 'o')), Parameter('psi_I', Annotation(state_arr, 'i')), Parameter('k1', Annotation(state_arr, 'i')), Parameter('t', Annotation(scalar_dtype)) ], """ <% all_indices = ', '.join(idxs) %> ${psi_k.ctype} psi_I_0 = ${psi_I.load_idx}(0, ${all_indices}); ${psi_k.ctype} psi_I_1 = ${psi_I.load_idx}(1, ${all_indices}); ${psi_k.ctype} k1_0 = ${k1.load_idx}(0, ${all_indices}); ${psi_k.ctype} k1_1 = ${k1.load_idx}(1, ${all_indices}); ${psi_k.ctype} k2_0 = ${nonlinear}0( psi_I_0 + ${div}(k1_0, 2), psi_I_1 + ${div}(k1_1, 2), ${t} + ${dt} / 2); ${psi_k.ctype} k2_1 = ${nonlinear}1( psi_I_0 + ${div}(k1_0, 2), psi_I_1 + ${div}(k1_1, 2), ${t} + ${dt} / 2); ${psi_k.ctype} k3_0 = ${nonlinear}0( psi_I_0 + ${div}(k2_0, 2), psi_I_1 + ${div}(k2_1, 2), ${t} + ${dt} / 2); ${psi_k.ctype} k3_1 = ${nonlinear}1( psi_I_0 + ${div}(k2_0, 2), psi_I_1 + ${div}(k2_1, 2), ${t} + ${dt} / 2); ${psi_4.store_idx}(0, ${all_indices}, psi_I_0 + k3_0); ${psi_4.store_idx}(1, ${all_indices}, psi_I_1 + k3_1); ${psi_k.store_idx}( 0, ${all_indices}, psi_I_0 + ${div}(k1_0, 6) + ${div}(k2_0, 3) + ${div}(k3_0, 3)); ${psi_k.store_idx}( 1, ${all_indices}, psi_I_1 + ${div}(k1_1, 6) + ${div}(k2_1, 3) + ${div}(k3_1, 3)); """, guiding_array=state_arr.shape[1:], render_kwds=dict( nonlinear=nonlinear_module, dt=dtypes.c_constant(dt, scalar_dtype), div=functions.div(state_arr.dtype, numpy.int32, out_dtype=state_arr.dtype)))
def get_nonlinear2(state_arr, scalar_dtype, nonlinear_module, dt): # k2 = N(psi_I + k1 / 2, t + dt / 2) # k3 = N(psi_I + k2 / 2, t + dt / 2) # psi_4 = psi_I + k3 (argument for the 4-th step k-propagation) # psi_k = psi_I + (k1 + 2(k2 + k3)) / 6 (argument for the final k-propagation) return PureParallel( [ Parameter('psi_k', Annotation(state_arr, 'o')), Parameter('psi_4', Annotation(state_arr, 'o')), Parameter('psi_I', Annotation(state_arr, 'i')), Parameter('k1', Annotation(state_arr, 'i')), Parameter('t', Annotation(scalar_dtype))], """ <% all_indices = ', '.join(idxs) %> ${psi_k.ctype} psi_I_0 = ${psi_I.load_idx}(0, ${all_indices}); ${psi_k.ctype} psi_I_1 = ${psi_I.load_idx}(1, ${all_indices}); ${psi_k.ctype} k1_0 = ${k1.load_idx}(0, ${all_indices}); ${psi_k.ctype} k1_1 = ${k1.load_idx}(1, ${all_indices}); ${psi_k.ctype} k2_0 = ${nonlinear}0( psi_I_0 + ${div}(k1_0, 2), psi_I_1 + ${div}(k1_1, 2), ${t} + ${dt} / 2); ${psi_k.ctype} k2_1 = ${nonlinear}1( psi_I_0 + ${div}(k1_0, 2), psi_I_1 + ${div}(k1_1, 2), ${t} + ${dt} / 2); ${psi_k.ctype} k3_0 = ${nonlinear}0( psi_I_0 + ${div}(k2_0, 2), psi_I_1 + ${div}(k2_1, 2), ${t} + ${dt} / 2); ${psi_k.ctype} k3_1 = ${nonlinear}1( psi_I_0 + ${div}(k2_0, 2), psi_I_1 + ${div}(k2_1, 2), ${t} + ${dt} / 2); ${psi_4.store_idx}(0, ${all_indices}, psi_I_0 + k3_0); ${psi_4.store_idx}(1, ${all_indices}, psi_I_1 + k3_1); ${psi_k.store_idx}( 0, ${all_indices}, psi_I_0 + ${div}(k1_0, 6) + ${div}(k2_0, 3) + ${div}(k3_0, 3)); ${psi_k.store_idx}( 1, ${all_indices}, psi_I_1 + ${div}(k1_1, 6) + ${div}(k2_1, 3) + ${div}(k3_1, 3)); """, guiding_array=state_arr.shape[1:], render_kwds=dict( nonlinear=nonlinear_module, dt=dtypes.c_constant(dt, scalar_dtype), div=functions.div(state_arr.dtype, numpy.int32, out_dtype=state_arr.dtype)))
def div_param(arr_t, param_dtype): """ Returns a scaling transformation with a dynamic parameter (1 output, 1 input, 1 scalar): ``output = input / param``. """ return Transformation( [Parameter('output', Annotation(arr_t, 'o')), Parameter('input', Annotation(arr_t, 'i')), Parameter('param', Annotation(param_dtype))], "${output.store_same}(${div}(${input.load_same}, ${param}));", render_kwds=dict(div=functions.div(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype)))
def get_common_kwds(dtype, device_params): return dict( dtype=dtype, min_mem_coalesce_width=device_params.min_mem_coalesce_width[dtype.itemsize], local_mem_banks=device_params.local_mem_banks, get_padding=get_padding, wrap_const=lambda x: dtypes.c_constant(x, dtypes.real_for(dtype)), min_blocks=helpers.min_blocks, mul=functions.mul(dtype, dtype), polar_unit=functions.polar_unit(dtypes.real_for(dtype)), cdivs=functions.div(dtype, numpy.uint32, out_dtype=dtype))
def get_nonlinear3(state_type, nonlinear_wrapper, components, diffusion=None, noise_type=None): real_dtype = dtypes.real_for(state_type.dtype) # k4 = N(D(psi_4), t + dt) # output = D(psi_k) + k4 / 6 return PureParallel( [ Parameter('output', Annotation(state_type, 'o')), Parameter('kprop_psi_k', Annotation(state_type, 'i')), Parameter('kprop_psi_4', Annotation(state_type, 'i'))] + ([Parameter('dW', Annotation(noise_type, 'i'))] if diffusion is not None else []) + [Parameter('t', Annotation(real_dtype)), Parameter('dt', Annotation(real_dtype))], """ <% if diffusion is None: dW = None coords = ", ".join(idxs[1:]) trajectory = idxs[0] args = lambda prefix, num: list(map(lambda i: prefix + str(i), range(num))) dW_args = args('dW_', diffusion.noise_sources) if diffusion is not None else [] k4_args = ", ".join(idxs[1:] + args('psi4_', components) + dW_args) %> %for comp in range(components): ${output.ctype} psi4_${comp} = ${kprop_psi_4.load_idx}(${trajectory}, ${comp}, ${coords}); ${output.ctype} psik_${comp} = ${kprop_psi_k.load_idx}(${trajectory}, ${comp}, ${coords}); %endfor %if diffusion is not None: %for ncomp in range(diffusion.noise_sources): ${dW.ctype} dW_${ncomp} = ${dW.load_idx}(${trajectory}, ${ncomp}, ${coords}); %endfor %endif %for comp in range(components): ${output.ctype} k4_${comp} = ${nonlinear}${comp}(${k4_args}, ${t} + ${dt}, ${dt}); %endfor %for comp in range(components): ${output.store_idx}( ${trajectory}, ${comp}, ${coords}, psik_${comp} + ${div}(k4_${comp}, 6)); %endfor """, guiding_array=(state_type.shape[0],) + state_type.shape[2:], render_kwds=dict( components=components, nonlinear=nonlinear_wrapper, diffusion=diffusion, div=functions.div(state_type.dtype, numpy.int32, out_dtype=state_type.dtype)))
def div_const(arr_t, param): """ Returns a scaling transformation with a fixed parameter (1 output, 1 input): ``output = input / param``. """ param_dtype = dtypes.detect_type(param) return Transformation( [Parameter('output', Annotation(arr_t, 'o')), Parameter('input', Annotation(arr_t, 'i'))], "${output.store_same}(${div}(${input.load_same}, ${param}));", render_kwds=dict( div=functions.div(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype), param=dtypes.c_constant(param, dtype=param_dtype)))
def get_nonlinear2(state_arr, potential_arr, scalar_dtype, nonlinear_module, dt): # k2 = N(psi_I + k1 / 2, t + dt / 2) # k3 = N(psi_I + k2 / 2, t + dt / 2) # psi_4 = psi_I + k3 (argument for the 4-th step k-propagation) # psi_k = psi_I + (k1 + 2(k2 + k3)) / 6 (argument for the final k-propagation) return PureParallel( [ Parameter('psi_k', Annotation(state_arr, 'o')), Parameter('psi_4', Annotation(state_arr, 'o')), Parameter('psi_I', Annotation(state_arr, 'i')), Parameter('k1', Annotation(state_arr, 'i')), Parameter('potential_half', Annotation(potential_arr, 'i')), Parameter('t', Annotation(scalar_dtype))], """ %for comp in range(components): ${psi_k.ctype} psi_I_${comp} = ${psi_I.load_idx}(${comp}, ${idxs.all()}); ${psi_k.ctype} k1_${comp} = ${k1.load_idx}(${comp}, ${idxs.all()}); %endfor ${potential_half.ctype} V = ${potential_half.load_idx}(${', '.join(idxs[1:])}); %for comp in range(components): ${psi_k.ctype} k2_${comp} = ${nonlinear}${comp}( %for pcomp in range(components): psi_I_${pcomp} + ${div}(k1_${pcomp}, 2), %endfor V, ${t} + ${dt} / 2); %endfor %for comp in range(components): ${psi_k.ctype} k3_${comp} = ${nonlinear}${comp}( %for pcomp in range(components): psi_I_${pcomp} + ${div}(k2_${pcomp}, 2), %endfor V, ${t} + ${dt} / 2); %endfor %for comp in range(components): ${psi_4.store_idx}(${comp}, ${idxs.all()}, psi_I_${comp} + k3_${comp}); ${psi_k.store_idx}( ${comp}, ${idxs.all()}, psi_I_${comp} + ${div}(k1_${comp}, 6) + ${div}(k2_${comp}, 3) + ${div}(k3_${comp}, 3)); %endfor """, guiding_array=state_arr.shape[1:], render_kwds=dict( components=state_arr.shape[0], nonlinear=nonlinear_module, dt=dtypes.c_constant(dt, scalar_dtype), div=functions.div(state_arr.dtype, numpy.int32, out_dtype=state_arr.dtype)))
def _build_plan(self, plan_factory, device_params, C, D, coeff1, coeff2): plan = plan_factory() nested = Dummy(C, D, coeff1, same_A_B=True) C_temp = plan.temp_array_like(C) D_temp = plan.temp_array_like(D) # Testing a computation call which uses the same argument for two parameters. plan.computation_call(nested, C_temp, D, C, C, coeff1) arr_dtype = C.dtype coeff_dtype = coeff2.dtype mul = functions.mul(arr_dtype, coeff_dtype) div = functions.div(arr_dtype, coeff_dtype) template = template_from( """ <%def name="dummy(kernel_declaration, CC, C, D, coeff)"> ${kernel_declaration} { VIRTUAL_SKIP_THREADS; VSIZE_T idx0 = virtual_global_id(0); VSIZE_T idx1 = virtual_global_id(1); ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1) + ${mul}(${D.load_idx}(idx0, idx1), ${coeff})); } </%def> """ ) # Testing a kernel call which uses the same argument for two parameters. plan.kernel_call( template.get_def("dummy"), [C, C_temp, C_temp, coeff2], global_size=C.shape, render_kwds=dict(mul=mul) ) return plan
def _build_plan(self, plan_factory, device_params, C, D, coeff1, coeff2): plan = plan_factory() nested = Dummy(C, D, coeff1, same_A_B=True) C_temp = plan.temp_array_like(C) D_temp = plan.temp_array_like(D) # Testing a computation call which uses the same argument for two parameters. plan.computation_call(nested, C_temp, D, C, C, coeff1) arr_dtype = C.dtype coeff_dtype = coeff2.dtype mul = functions.mul(arr_dtype, coeff_dtype) div = functions.div(arr_dtype, coeff_dtype) template = template_from(""" <%def name="dummy(kernel_declaration, CC, C, D, coeff)"> ${kernel_declaration} { VIRTUAL_SKIP_THREADS; VSIZE_T idx0 = virtual_global_id(0); VSIZE_T idx1 = virtual_global_id(1); ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1) + ${mul}(${D.load_idx}(idx0, idx1), ${coeff})); } </%def> """) # Testing a kernel call which uses the same argument for two parameters. plan.kernel_call(template.get_def('dummy'), [C, C_temp, C_temp, coeff2], global_size=C.shape, render_kwds=dict(mul=mul)) return plan
def get_nonlinear2(state_type, nonlinear_wrapper, components, diffusion=None, noise_type=None): real_dtype = dtypes.real_for(state_type.dtype) # k2 = N(psi_I + k1 / 2, t + dt / 2) # k3 = N(psi_I + k2 / 2, t + dt / 2) # psi_4 = psi_I + k3 (argument for the 4-th step k-propagation) # psi_k = psi_I + (k1 + 2(k2 + k3)) / 6 (argument for the final k-propagation) return PureParallel( [ Parameter('psi_k', Annotation(state_type, 'o')), Parameter('psi_4', Annotation(state_type, 'o')), Parameter('psi_I', Annotation(state_type, 'i')), Parameter('k1', Annotation(state_type, 'i'))] + ([Parameter('dW', Annotation(noise_type, 'i'))] if diffusion is not None else []) + [Parameter('t', Annotation(real_dtype)), Parameter('dt', Annotation(real_dtype))], """ <% if diffusion is None: dW = None coords = ", ".join(idxs[1:]) trajectory = idxs[0] args = lambda prefix, num: ", ".join(map(lambda i: prefix + str(i), range(num))) dW_args = (args('dW_', diffusion.noise_sources) + ",") if diffusion is not None else "" %> %for comp in range(components): ${psi_k.ctype} psi_I_${comp} = ${psi_I.load_idx}(${trajectory}, ${comp}, ${coords}); ${psi_k.ctype} k1_${comp} = ${k1.load_idx}(${trajectory}, ${comp}, ${coords}); %endfor %if diffusion is not None: %for ncomp in range(diffusion.noise_sources): ${dW.ctype} dW_${ncomp} = ${dW.load_idx}(${trajectory}, ${ncomp}, ${coords}); %endfor %endif %for comp in range(components): ${psi_k.ctype} k2_${comp} = ${nonlinear}${comp}( ${coords}, %for c in range(components): psi_I_${c} + ${div}(k1_${c}, 2), %endfor ${dW_args} ${t} + ${dt} / 2, ${dt}); %endfor %for comp in range(components): ${psi_k.ctype} k3_${comp} = ${nonlinear}${comp}( ${coords}, %for c in range(components): psi_I_${c} + ${div}(k2_${c}, 2), %endfor ${dW_args} ${t} + ${dt} / 2, ${dt}); %endfor %for comp in range(components): ${psi_4.store_idx}(${trajectory}, ${comp}, ${coords}, psi_I_${comp} + k3_${comp}); %endfor %for comp in range(components): ${psi_k.store_idx}( ${trajectory}, ${comp}, ${coords}, psi_I_${comp} + ${div}(k1_${comp}, 6) + ${div}(k2_${comp}, 3) + ${div}(k3_${comp}, 3)); %endfor """, guiding_array=(state_type.shape[0],) + state_type.shape[2:], render_kwds=dict( components=components, nonlinear=nonlinear_wrapper, diffusion=diffusion, div=functions.div(state_type.dtype, numpy.int32, out_dtype=state_type.dtype)))
def test_div(thr, out_code, in_codes): out_dtype, in_dtypes = generate_dtypes(out_code, in_codes) check_func( thr, functions.div(*in_dtypes, out_dtype=out_dtype), lambda x, y: dtypes.cast(out_dtype)(x / y), out_dtype, in_dtypes)
def test_div(thr, out_code, in_codes): out_dtype, in_dtypes = generate_dtypes(out_code, in_codes) check_func(thr, functions.div(*in_dtypes, out_dtype=out_dtype), lambda x, y: dtypes.cast(out_dtype)(x / y), out_dtype, in_dtypes)
def _build_plan(self, plan_factory, device_params, C, D, A, B, coeff): plan = plan_factory() arr_dtype = C.dtype coeff_dtype = coeff.dtype mul = functions.mul(arr_dtype, coeff_dtype) div = functions.div(arr_dtype, coeff_dtype) template = template_from( """ <%def name="dummy(kernel_declaration, C, D, A, B, coeff)"> ${kernel_declaration} { VIRTUAL_SKIP_THREADS; VSIZE_T idx0 = virtual_global_id(0); VSIZE_T idx1 = virtual_global_id(1); ${A.ctype} a = ${A.load_idx}(idx0, idx1); ${C.ctype} c = ${mul}(a, ${coeff}); ${C.store_idx}(idx1, idx0, c); %if same_A_B: ${B.ctype} b = ${B.load_idx}(idx0, idx1); ${D.ctype} d = ${div}(b, ${coeff}); ${D.store_idx}(idx0, idx1, d); %else: if (idx1 == 0) { ${B.ctype} b = ${B.load_idx}(idx0); ${D.ctype} d = ${div}(b, ${coeff}); ${D.store_idx}(idx0, d); } %endif } </%def> <%def name="dummy2(kernel_declaration, CC, DD, C, D, pers_arr, const_coeff)"> ${kernel_declaration} { VIRTUAL_SKIP_THREADS; VSIZE_T idx0 = virtual_global_id(0); VSIZE_T idx1 = virtual_global_id(1); ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1)); %if same_A_B: ${DD.store_idx}( idx0, idx1, ${mul}(${D.load_idx}(idx0, idx1), ${const_coeff}) + ${pers_arr.load_idx}(idx0, idx1)); %else: if (idx1 == 0) { ${DD.store_idx}( idx0, ${mul}(${D.load_idx}(idx0), ${const_coeff}) + ${pers_arr.load_idx}(idx0)); } %endif } </%def> """ ) block_size = 8 C_temp = plan.temp_array_like(C) D_temp = plan.temp_array_like(D) arr = plan.persistent_array(self._persistent_array) plan.kernel_call( template.get_def("dummy"), [C_temp, D_temp, A, B, coeff], global_size=A.shape, local_size=(block_size, block_size), render_kwds=dict(mul=mul, div=div, same_A_B=self._same_A_B), ) plan.kernel_call( template.get_def("dummy2"), [ C, D, C_temp, D_temp, (self._persistent_array if self._test_kernel_adhoc_array else arr), (10 if self._test_untyped_scalar else numpy.float32(10)), ], global_size=A.shape, local_size=(block_size, block_size), render_kwds=dict(mul=mul, same_A_B=self._same_A_B), ) return plan
def _build_plan(self, plan_factory, device_params, C, D, A, B, coeff): plan = plan_factory() arr_dtype = C.dtype coeff_dtype = coeff.dtype mul = functions.mul(arr_dtype, coeff_dtype) div = functions.div(arr_dtype, coeff_dtype) template = template_from(""" <%def name="dummy(kernel_declaration, C, D, A, B, coeff)"> ${kernel_declaration} { VIRTUAL_SKIP_THREADS; VSIZE_T idx0 = virtual_global_id(0); VSIZE_T idx1 = virtual_global_id(1); ${A.ctype} a = ${A.load_idx}(idx0, idx1); ${C.ctype} c = ${mul}(a, ${coeff}); ${C.store_idx}(idx1, idx0, c); %if same_A_B: ${B.ctype} b = ${B.load_idx}(idx0, idx1); ${D.ctype} d = ${div}(b, ${coeff}); ${D.store_idx}(idx0, idx1, d); %else: if (idx1 == 0) { ${B.ctype} b = ${B.load_idx}(idx0); ${D.ctype} d = ${div}(b, ${coeff}); ${D.store_idx}(idx0, d); } %endif } </%def> <%def name="dummy2(kernel_declaration, CC, DD, C, D, pers_arr, const_coeff)"> ${kernel_declaration} { VIRTUAL_SKIP_THREADS; VSIZE_T idx0 = virtual_global_id(0); VSIZE_T idx1 = virtual_global_id(1); ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1)); %if same_A_B: ${DD.store_idx}( idx0, idx1, ${mul}(${D.load_idx}(idx0, idx1), ${const_coeff}) + ${pers_arr.load_idx}(idx0, idx1)); %else: if (idx1 == 0) { ${DD.store_idx}( idx0, ${mul}(${D.load_idx}(idx0), ${const_coeff}) + ${pers_arr.load_idx}(idx0)); } %endif } </%def> """) block_size = 8 C_temp = plan.temp_array_like(C) D_temp = plan.temp_array_like(D) arr = plan.persistent_array(self._persistent_array) plan.kernel_call(template.get_def('dummy'), [C_temp, D_temp, A, B, coeff], global_size=A.shape, local_size=(block_size, block_size), render_kwds=dict(mul=mul, div=div, same_A_B=self._same_A_B)) plan.kernel_call(template.get_def('dummy2'), [ C, D, C_temp, D_temp, (self._persistent_array if self._test_kernel_adhoc_array else arr), (10 if self._test_untyped_scalar else numpy.float32(10)) ], global_size=A.shape, local_size=(block_size, block_size), render_kwds=dict(mul=mul, same_A_B=self._same_A_B)) return plan