def get_kprop_trf(state_arr, ksquared_arr, coeffs, exp=False):
    """
    Returns a transformation that scales the state by a k-dependent coefficient.

    For every ``(pwr, values)`` pair in ``coeffs.values`` the kernel picks
    ``values[comp]`` according to the second index of the state and adds
    ``value * dt * (-ksquared) ** (pwr // 2)`` to a compound coefficient.
    The stored result is ``input * compound_coeff``, or
    ``input * exp(compound_coeff)`` when ``exp`` is true.

    ``state_arr``: array type of the ``output`` and ``input`` parameters.
    ``ksquared_arr``: array type of the ``ksquared`` parameter (indexed with the
    state indices starting from the third one); its dtype is also the dtype
    of the scalar ``dt`` parameter.
    ``coeffs``: object exposing ``dtype`` and a ``values`` mapping
    from powers to per-component coefficients.
    """
    compound_dtype = dtypes.result_type(coeffs.dtype, ksquared_arr.dtype)

    signature = [
        Parameter('output', Annotation(state_arr, 'o')),
        Parameter('input', Annotation(state_arr, 'i')),
        Parameter('ksquared', Annotation(ksquared_arr, 'i')),
        Parameter('dt', Annotation(ksquared_arr.dtype)),
    ]

    # ``ksquared`` is only loaded when the largest power in ``coeffs.values`` is positive.
    snippet = """
        %if max(coeffs.values) > 0:
        ${ksquared.ctype} ksquared = ${ksquared.load_idx}(${', '.join(idxs[2:])});
        %endif

        ${dtypes.ctype(compound_dtype)} compound_coeff = ${dtypes.c_constant(0, compound_dtype)};

        %for pwr, values in coeffs.values.items():
        {
            ${dtypes.ctype(coeffs.dtype)} value;

            %for comp in range(output.shape[1]):
            ${'if' if comp == 0 else 'else if'} (${idxs[1]} == ${comp})
            {
                value = ${dtypes.c_constant(values[comp], coeffs.dtype)};
            }
            %endfor

            compound_coeff =
                compound_coeff
                + ${mul_kc}(
                    %if pwr == 0:
                    ${dt}
                    %elif pwr == 2:
                    -ksquared * ${dt}
                    %else:
                    pow(-ksquared, ${pwr // 2}) * ${dt}
                    %endif
                    ,
                    value
                    );
        }
        %endfor

        ${output.store_same}(${mul_ic}(
            ${input.load_same},
            %if exp is not None:
            ${exp}(compound_coeff)
            %else:
            compound_coeff
            %endif
            ));
        """

    kwds = {
        'coeffs': coeffs,
        'compound_dtype': compound_dtype,
        'mul_ic': functions.mul(state_arr.dtype, compound_dtype, out_dtype=state_arr.dtype),
        'mul_kc': functions.mul(ksquared_arr.dtype, coeffs.dtype, out_dtype=compound_dtype),
    }
    # The template tests ``exp is not None``, so a falsy flag must map to ``None``.
    if exp:
        kwds['exp'] = functions.exp(compound_dtype)
    else:
        kwds['exp'] = None

    return Transformation(signature, snippet, render_kwds=kwds)
def get_nonlinear_wrapper(state_dtype, grid_dims, drift, diffusion=None):
    """
    Returns a module with one function per drift component that combines
    the drift and (optionally) the diffusion terms.

    Function number ``comp`` takes ``grid_dims`` integer indices,
    ``drift.components`` state values, ``diffusion.noise_sources`` noise increments
    (only if ``diffusion`` is given), ``t`` and ``dt``, and returns
    ``drift_comp(...) * dt + sum_n(diffusion_comp_n(...) * dW_n)``.
    """
    real_dtype = dtypes.real_for(state_dtype)
    # Without a diffusion term the noise ctype falls back to the real type of the state.
    noise_dtype = real_dtype if diffusion is None else diffusion.dtype

    source = """
        <%
            components = drift.components
            idx_args = ["idx_" + str(dim) for dim in range(grid_dims)]
            psi_args = ["psi_" + str(comp) for comp in range(components)]
            if diffusion is not None:
                dW_args = ["dW_" + str(ncomp) for ncomp in range(diffusion.noise_sources)]
        %>
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${s_ctype} ${prefix}${comp}(
            %for idx in idx_args:
            const int ${idx},
            %endfor
            %for psi in psi_args:
            const ${s_ctype} ${psi},
            %endfor
            %if diffusion is not None:
            %for dW in dW_args:
            const ${n_ctype} ${dW},
            %endfor
            %endif
            const ${r_ctype} t,
            const ${r_ctype} dt)
        {
            return
                ${mul_sr}(${drift.module}${comp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), dt)
                %if diffusion is not None:
                %for ncomp in range(diffusion.noise_sources):
                + ${mul_sn}(${diffusion.module}${comp}_${ncomp}(
                    ${", ".join(idx_args)}, ${", ".join(psi_args)}, t), ${dW_args[ncomp]})
                %endfor
                %endif
                ;
        }
        %endfor
        """

    kwds = {
        'grid_dims': grid_dims,
        's_ctype': dtypes.ctype(state_dtype),
        'r_ctype': dtypes.ctype(real_dtype),
        'n_ctype': dtypes.ctype(noise_dtype),
        'mul_sr': functions.mul(state_dtype, real_dtype),
        'mul_sn': functions.mul(state_dtype, noise_dtype),
        'drift': drift,
        'diffusion': diffusion,
    }
    return Module.create(source, render_kwds=kwds)
def hanning_window(arr, NFFT):
    """
    Returns a transformation applying the von Hann window to the rows of a 2D array.

    ``NFFT`` is passed separately from the array shape so that zero padding
    (which should not be windowed) is multiplied by 1 instead of the window coefficient.
    For complex arrays the coefficient has the corresponding real dtype.
    """
    # NOTE(review): the padding branch is guarded by ``output.shape[0]`` and ``idxs[1]``
    # while the window itself uses ``idxs[-1]`` -- confirm the intended axis.
    is_complex = dtypes.is_complex(arr.dtype)
    coeff_dtype = dtypes.real_for(arr.dtype) if is_complex else arr.dtype

    signature = [
        Parameter('output', Annotation(arr, 'o')),
        Parameter('input', Annotation(arr, 'i')),
    ]
    snippet = """
        ${dtypes.ctype(coeff_dtype)} coeff;
        %if NFFT != output.shape[0]:
        if (${idxs[1]} >= ${NFFT})
        {
            coeff = 1;
        }
        else
        %endif
        {
            coeff = 0.5 * (1 - cos(2 * ${numpy.pi} * ${idxs[-1]} / (${NFFT} - 1)));
        }
        ${output.store_same}(${mul}(${input.load_same}, coeff));
        """
    kwds = {
        'coeff_dtype': coeff_dtype,
        'NFFT': NFFT,
        'mul': functions.mul(arr.dtype, coeff_dtype),
    }
    return Transformation(signature, snippet, render_kwds=kwds)
def _build_plan(self, plan_factory, device_params, alpha, beta, seed):
    """
    Builds a plan with a single ``generate_input_state`` kernel that fills
    ``alpha`` and ``beta`` using the system's squeezing and decoherence arrays
    and the random ``seed`` passed at call time.
    """
    plan = plan_factory()

    rng_bijection = philox(64, 2)

    # The key generator is created with a fixed zero seed so the rendered kernel
    # stays the same and can be cached; the actual seed is a computation parameter.
    rng_keygen = KeyGenerator.create(rng_bijection, seed=numpy.int32(0))
    normal_sampler = normal_bm(rng_bijection, numpy.float64)

    squeezing_arr = plan.persistent_array(self._system.squeezing)
    decoherence_arr = plan.persistent_array(self._system.decoherence)

    kernel_def = TEMPLATE.get_def("generate_input_state")
    kernel_args = [alpha, beta, squeezing_arr, decoherence_arr, seed]
    kwds = {
        'system': self._system,
        'representation': self._representation,
        'Representation': Representation,
        'bijection': rng_bijection,
        'keygen': rng_keygen,
        'sampler': normal_sampler,
        'ordering': ordering,
        'exp': functions.exp(numpy.float64),
        'mul_cr': functions.mul(numpy.complex128, numpy.float64),
        'add_cc': functions.add(numpy.complex128, numpy.complex128),
    }
    plan.kernel_call(
        kernel_def,
        kernel_args,
        kernel_name="generate",
        global_size=alpha.shape,
        render_kwds=kwds)

    return plan
def test_dtype_support(thr, dtype):
    """
    Compiles and runs a kernel computing ``(a * b) / b`` for the given dtype
    and checks the result against ``a``.

    The test is skipped when the thread reports that the dtype is not supported.
    """
    size = 256

    if not thr.device_params.supports_dtype(dtype):
        pytest.skip()

    mul_func = functions.mul(dtype, dtype)
    div_func = functions.div(dtype, dtype)

    source = """
        KERNEL void test(
            GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)
        {
          const SIZE_T i = get_global_id(0);
          ${ctype} temp = ${mul}(a[i], b[i]);
          dest[i] = ${div}(temp, b[i]);
        }
        """
    kwds = {
        'ctype': dtypes.ctype(dtype),
        'dtype': dtype,
        'mul': mul_func,
        'div': div_func,
    }
    program = thr.compile(source, render_kwds=kwds)
    kernel = program.test

    # Values are capped so that the products fit even in unsigned char.
    lhs = get_test_array(size, dtype, high=8)
    rhs = get_test_array(size, dtype, no_zeros=True, high=8)

    lhs_dev = thr.to_device(lhs)
    rhs_dev = thr.to_device(rhs)
    result_dev = thr.empty_like(lhs_dev)

    kernel(result_dev, lhs_dev, rhs_dev, global_size=size)
    assert diff_is_negligible(thr.from_device(result_dev), lhs)
def _build_plan(self, plan_factory, device_params, output, matrix, vector):
    """
    Builds a plan that multiplies ``matrix`` by ``vector`` along the last axis
    and sums the products over that axis into ``output``.

    The multiplication is attached as an input transformation of the reduction,
    so the nested computation takes ``(output, matrix, vector)``.
    """
    plan = plan_factory()

    last_axis = len(matrix.shape) - 1
    summation = Reduce(matrix, predicate_sum(matrix.dtype), axes=(last_axis,))

    trf_signature = [
        Parameter('output', Annotation(matrix, 'o')),
        Parameter('matrix', Annotation(matrix, 'i')),
        Parameter('vector', Annotation(vector, 'i')),
    ]
    trf_code = """
        ${output.store_same}(${mul}(${matrix.load_same}, ${vector.load_idx}(${idxs[-1]})));
        """
    mul_vec = Transformation(
        trf_signature,
        trf_code,
        render_kwds={'mul': functions.mul(matrix.dtype, vector.dtype)},
        connectors=['output', 'matrix'])

    summation.parameter.input.connect(
        mul_vec, mul_vec.output,
        matrix=mul_vec.matrix, vector=mul_vec.vector)

    plan.computation_call(summation, output, matrix, vector)
    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Builds a plan that computes ``Re(1 - exp(-alpha * beta))`` elementwise
    and sums it over the first axis into ``output``.
    """
    plan = plan_factory()

    # The transformation output is always float64, whatever the dtype of ``alpha``.
    reduced_type = Type(numpy.float64, alpha.shape)

    trf_signature = [
        Parameter('output', Annotation(reduced_type, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = """
        ${alpha.ctype} alpha = ${alpha.load_same};
        ${beta.ctype} beta = ${beta.load_same};
        ${alpha.ctype} t = ${mul_cc}(alpha, beta);
        ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));
        ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
        ${output.store_same}(cp.x);
        """
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'exp_c': functions.exp(alpha.dtype),
    }
    meter_trf = Transformation(trf_signature, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(reduced_type, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def hanning_window(arr, NFFT):
    """
    Returns a transformation applying the von Hann window to the rows of a 2D array.

    ``NFFT`` is passed separately from the array shape so that zero padding
    (which should not be windowed) is multiplied by 1 instead of the window coefficient.
    For complex arrays the coefficient has the corresponding real dtype.
    """
    # NOTE(review): the padding branch is guarded by ``output.shape[0]`` and ``idxs[1]``
    # while the window itself uses ``idxs[-1]`` -- confirm the intended axis.
    coeff_dtype = arr.dtype
    if dtypes.is_complex(arr.dtype):
        coeff_dtype = dtypes.real_for(arr.dtype)

    window_params = [
        Parameter('output', Annotation(arr, 'o')),
        Parameter('input', Annotation(arr, 'i')),
    ]
    window_code = """
        ${dtypes.ctype(coeff_dtype)} coeff;
        %if NFFT != output.shape[0]:
        if (${idxs[1]} >= ${NFFT})
        {
            coeff = 1;
        }
        else
        %endif
        {
            coeff = 0.5 * (1 - cos(2 * ${numpy.pi} * ${idxs[-1]} / (${NFFT} - 1)));
        }
        ${output.store_same}(${mul}(${input.load_same}, coeff));
        """
    window_kwds = {
        'coeff_dtype': coeff_dtype,
        'NFFT': NFFT,
        'mul': functions.mul(arr.dtype, coeff_dtype),
    }
    return Transformation(window_params, window_code, render_kwds=window_kwds)
def get_drift(state_dtype, U, gamma, dx, wigner=False):
    """
    Returns a single-component ``Drift`` whose module evaluates
    ``(-gamma - i * U * (|psi|^2 - correction)) * psi``.

    ``correction`` is ``1 / dx`` when ``wigner`` is true and 0 otherwise.
    """
    if wigner:
        correction = 1. / dx
    else:
        correction = 0

    source = """
        <%
            r_dtype = dtypes.real_for(s_dtype)
            s_ctype = dtypes.ctype(s_dtype)
            r_ctype = dtypes.ctype(r_dtype)
        %>
        INLINE WITHIN_KERNEL ${s_ctype} ${prefix}0(
            const int idx_x,
            const ${s_ctype} psi,
            ${r_ctype} t)
        {
            return ${mul_cc}(
                COMPLEX_CTR(${s_ctype})(
                    -${gamma},
                    -(${U} * (${norm}(psi) - ${correction}))),
                psi
                );
        }
        """
    kwds = {
        's_dtype': state_dtype,
        'U': U,
        'gamma': gamma,
        'mul_cc': functions.mul(state_dtype, state_dtype),
        'norm': functions.norm(state_dtype),
        'correction': correction,
    }
    drift_module = Module.create(source, render_kwds=kwds)
    return Drift(drift_module, state_dtype, components=1)
def nonlinear_no_potential(dtype, U, nu):
    """
    Returns a module with two functions (components 0 and 1) evaluating
    ``psi_j * (U[j, 0] * |psi_0|^2 + U[j, 1] * |psi_1|^2) - nu * psi_{1 - j}``.

    ``dtype`` is the complex state dtype; ``nu`` is rendered as a constant
    of the corresponding real dtype.
    """
    complex_dtype = dtype
    complex_ctype = dtypes.ctype(complex_dtype)
    real_dtype = dtypes.real_for(dtype)
    real_ctype = dtypes.ctype(real_dtype)

    source = """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    ${dtypes.c_constant(U[comp, 0])} * ${norm}(psi0) +
                    ${dtypes.c_constant(U[comp, 1])} * ${norm}(psi1)
                    ))
                -
                ${mul}(psi${1 - comp}, ${nu})
                );
        }
        %endfor
        """
    kwds = {
        'mul': functions.mul(complex_dtype, real_dtype),
        'norm': functions.norm(complex_dtype),
        'U': U,
        'nu': dtypes.c_constant(nu, real_dtype),
        'c_ctype': complex_ctype,
        's_ctype': real_ctype,
    }
    return Module.create(source, render_kwds=kwds)
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Builds a plan that computes ``Re(alpha * beta) - ordering`` elementwise
    and sums it over the first axis into ``output``.

    The ``ordering`` constant is obtained from ``ordering(self._representation)``.
    """
    plan = plan_factory()

    # The transformation output is always float64, whatever the dtype of ``alpha``.
    reduced_type = Type(numpy.float64, alpha.shape)

    trf_signature = [
        Parameter('output', Annotation(reduced_type, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = """
        ${alpha.ctype} alpha = ${alpha.load_same};
        ${beta.ctype} beta = ${beta.load_same};
        ${alpha.ctype} t = ${mul_cc}(alpha, beta);
        ${output.store_same}(t.x - ${ordering});
        """
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'ordering': ordering(self._representation),
    }
    meter_trf = Transformation(trf_signature, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(reduced_type, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def get_nonlinear_wrapper(components, c_dtype, nonlinear_module, dt):
    """
    Returns a module wrapping ``nonlinear_module``: function number ``comp``
    takes ``components`` state values, ``V`` and ``t``, calls the corresponding
    function of ``nonlinear_module`` and multiplies its result by ``-i * dt``.
    """
    real_dtype = dtypes.real_for(c_dtype)

    source = """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(
                %for pcomp in range(components):
                psi${pcomp},
                %endfor
                V, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """
    kwds = {
        'components': components,
        'c_ctype': dtypes.ctype(c_dtype),
        's_ctype': dtypes.ctype(real_dtype),
        'mul': functions.mul(c_dtype, c_dtype),
        'dt': dtypes.c_constant(dt, real_dtype),
        'nonlinear': nonlinear_module,
    }
    return Module.create(source, render_kwds=kwds)
def tr_scale(arr, coeff_t):
    """
    Returns a transformation ``o1 = i1 * s1``, where ``o1`` and ``i1`` have the type
    of ``arr`` and ``s1`` is a scalar of type ``coeff_t``.
    The product is converted to the dtype of ``arr``.
    """
    signature = [
        Parameter('o1', Annotation(arr, 'o')),
        Parameter('i1', Annotation(arr, 'i')),
        Parameter('s1', Annotation(coeff_t)),
    ]
    scale = functions.mul(arr.dtype, coeff_t, out_dtype=arr.dtype)
    return Transformation(
        signature,
        "${o1.store_same}(${mul}(${i1.load_same}, ${s1}));",
        render_kwds={'mul': scale})
def tr_scale(arr, coeff_t):
    """
    Returns a transformation ``o1 = i1 * s1``, where ``o1`` and ``i1`` have the type
    of ``arr`` and ``s1`` is a scalar of type ``coeff_t``.
    The product is converted to the dtype of ``arr``.
    """
    trf_params = [
        Parameter("o1", Annotation(arr, "o")),
        Parameter("i1", Annotation(arr, "i")),
        Parameter("s1", Annotation(coeff_t)),
    ]
    trf_code = "${o1.store_same}(${mul}(${i1.load_same}, ${s1}));"
    trf_kwds = {"mul": functions.mul(arr.dtype, coeff_t, out_dtype=arr.dtype)}
    return Transformation(trf_params, trf_code, render_kwds=trf_kwds)
def fft512(use_constant_memory=False):
    """
    Returns an ``FFT512`` object wrapping the ``fft512`` template module,
    rendered for complex128 elements and float64 temporaries.

    ``use_constant_memory`` is forwarded both to the template and to ``FFT512``.
    """
    kwds = {
        'elem_ctype': dtypes.ctype(numpy.complex128),
        'temp_ctype': dtypes.ctype(numpy.float64),
        'cdata_ctype': dtypes.ctype(numpy.complex128),
        'polar_unit': functions.polar_unit(numpy.float64),
        'mul': functions.mul(numpy.complex128, numpy.complex128),
        'use_constant_memory': use_constant_memory,
    }
    fft_module = Module(TEMPLATE.get_def('fft512'), render_kwds=kwds)
    return FFT512(fft_module, use_constant_memory)
def mul_param(arr_t, param_dtype):
    """
    Returns a scaling transformation with a dynamic parameter
    (1 output, 1 input, 1 scalar): ``output = input * param``.

    The product is converted to the dtype of ``arr_t``.
    """
    signature = [
        Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
        Parameter('param', Annotation(param_dtype)),
    ]
    scale = functions.mul(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype)
    return Transformation(
        signature,
        "${output.store_same}(${mul}(${input.load_same}, ${param}));",
        render_kwds={'mul': scale})
def get_common_kwds(dtype, device_params):
    """
    Returns the dictionary of render keywords shared by the kernels
    operating on ``dtype`` for the device described by ``device_params``.
    """
    def wrap_const(x):
        # Renders ``x`` as a C constant of the real dtype corresponding to ``dtype``.
        return dtypes.c_constant(x, dtypes.real_for(dtype))

    return {
        'dtype': dtype,
        'min_mem_coalesce_width': device_params.min_mem_coalesce_width[dtype.itemsize],
        'local_mem_banks': device_params.local_mem_banks,
        'get_padding': get_padding,
        'wrap_const': wrap_const,
        'min_blocks': helpers.min_blocks,
        'mul': functions.mul(dtype, dtype),
        'polar_unit': functions.polar_unit(dtypes.real_for(dtype)),
        'cdivs': functions.div(dtype, numpy.uint32, out_dtype=dtype),
    }
def get_multiply_trf(arr):
    """
    Returns a transformation ``output = input1 * input2`` for three arrays
    of the same type ``arr``, with ``output`` and ``input1`` as connectors.
    """
    signature = [
        Parameter('output', Annotation(arr, 'o')),
        Parameter('input1', Annotation(arr, 'i')),
        Parameter('input2', Annotation(arr, 'i')),
    ]
    code = "${output.store_same}(${mul}(${input1.load_same}, ${input2.load_same}));"
    return Transformation(
        signature,
        code,
        connectors=['output', 'input1'],
        render_kwds={'mul': functions.mul(arr.dtype, arr.dtype, out_dtype=arr.dtype)})
def _build_plan(self, plan_factory, device_params, output, matrix_a, matrix_b):
    """
    Builds a plan with a single ``matrixmul`` kernel computing ``output``
    from ``matrix_a`` and ``matrix_b``.

    Block widths are tried from the largest to the smallest (or only the
    overridden one, if set); the first width for which the kernel call
    is accepted wins. Raises ``ValueError`` if none is suitable.
    """
    override = self._block_width_override
    if override is None:
        max_power = helpers.log2(device_params.local_mem_banks)
        candidate_widths = [2**power for power in range(max_power, -1, -1)]
    else:
        candidate_widths = [override]

    a_batch = helpers.product(matrix_a.shape[:-2])
    b_batch = helpers.product(matrix_b.shape[:-2])
    batch = max(a_batch, b_batch)

    for width in candidate_widths:
        # A fresh plan for every attempt, since a failed kernel call cannot be reused.
        plan = plan_factory()

        if width**2 > device_params.max_work_group_size:
            continue

        num_steps = helpers.min_blocks(self._convolution_size, width)
        a_blocks = helpers.min_blocks(self._a_outer_size, width)
        b_blocks = helpers.min_blocks(self._b_outer_size, width)

        kwds = {
            'batched_a': (a_batch != 1),
            'batched_b': (b_batch != 1),
            'transposed_a': self._transposed_a,
            'transposed_b': self._transposed_b,
            'num_steps': num_steps,
            'a_slices': (len(matrix_a.shape) - 2, 1, 1),
            'b_slices': (len(matrix_b.shape) - 2, 1, 1),
            'output_slices': (len(output.shape) - 2, 1, 1),
            'block_width': width,
            'mul': functions.mul(matrix_a.dtype, matrix_b.dtype, out_dtype=output.dtype),
        }

        try:
            plan.kernel_call(
                TEMPLATE.get_def('matrixmul'),
                [output, matrix_a, matrix_b],
                kernel_name="kernel_matrixmul",
                global_size=(batch, a_blocks * width, b_blocks * width),
                local_size=(1, width, width),
                render_kwds=kwds)
        except OutOfResourcesError:
            continue
        else:
            return plan

    raise ValueError(
        "Could not find suitable call parameters for the kernel")
def _build_plan(self, plan_factory, device_params, output, matrix_a, matrix_b):
    """
    Builds a plan with a single ``matrixmul`` kernel computing ``output``
    from ``matrix_a`` and ``matrix_b``.

    Block widths are tried from the largest to the smallest (or only the
    overridden one, if set); the first width for which the kernel call
    is accepted wins. Raises ``ValueError`` if none is suitable.
    """
    override = self._block_width_override
    if override is None:
        top_power = helpers.log2(device_params.local_mem_banks)
        widths_to_try = [2 ** power for power in range(top_power, -1, -1)]
    else:
        widths_to_try = [override]

    a_batch = helpers.product(matrix_a.shape[:-2])
    b_batch = helpers.product(matrix_b.shape[:-2])
    batch = max(a_batch, b_batch)

    # Slice descriptors do not depend on the block width.
    a_slices = (len(matrix_a.shape) - 2, 1, 1)
    b_slices = (len(matrix_b.shape) - 2, 1, 1)
    output_slices = (len(output.shape) - 2, 1, 1)

    for width in widths_to_try:
        # A fresh plan for every attempt, since a failed kernel call cannot be reused.
        plan = plan_factory()

        if width ** 2 > device_params.max_work_group_size:
            continue

        num_steps = helpers.min_blocks(self._convolution_size, width)
        a_blocks = helpers.min_blocks(self._a_outer_size, width)
        b_blocks = helpers.min_blocks(self._b_outer_size, width)

        kwds = dict(
            batched_a=(a_batch != 1),
            batched_b=(b_batch != 1),
            transposed_a=self._transposed_a,
            transposed_b=self._transposed_b,
            num_steps=num_steps,
            a_slices=a_slices,
            b_slices=b_slices,
            output_slices=output_slices,
            block_width=width,
            mul=functions.mul(matrix_a.dtype, matrix_b.dtype, out_dtype=output.dtype))

        global_size = (batch, a_blocks * width, b_blocks * width)
        local_size = (1, width, width)

        try:
            plan.kernel_call(
                TEMPLATE.get_def('matrixmul'),
                [output, matrix_a, matrix_b],
                kernel_name="kernel_matrixmul",
                global_size=global_size,
                local_size=local_size,
                render_kwds=kwds)
        except OutOfResourcesError:
            continue

        return plan

    raise ValueError("Could not find suitable call parameters for the kernel")
def tr_1_to_2(arr):
    """
    Returns a transformation with one input and two outputs:
    ``o1 = o2 = i1 * 0.5``. All three parameters have the type of ``arr``.
    """
    signature = [
        Parameter('o1', Annotation(arr, 'o')),
        Parameter('o2', Annotation(arr, 'o')),
        Parameter('i1', Annotation(arr, 'i')),
    ]
    code = """
        ${o1.ctype} t = ${mul}(${i1.load_same}, 0.5);
        ${o1.store_same}(t);
        ${o2.store_same}(t);
        """
    # The literal 0.5 is treated as a float32 value by the multiplication function.
    halve = functions.mul(arr.dtype, numpy.float32)
    return Transformation(signature, code, render_kwds={'mul': halve})
def __init__(self, state_arr, dt, box=None, kinetic_coeff=1, nonlinear_module=None):
    """
    Sets up the computation with parameters ``output``, ``input`` and scalar ``t``.

    Prepares the momentum-space propagator ``exp(-i * kinetic_coeff * dt / 2 * k^2)``,
    two FFTs over the grid axes (one of them with the propagator attached
    to its output), and the three nonlinear stages built from ``nonlinear_module``.

    ``state_arr`` is indexed as ``(components, ensembles, *grid_shape)``.
    """
    real_dtype = dtypes.real_for(state_arr.dtype)

    signature = [
        Parameter('output', Annotation(state_arr, 'o')),
        Parameter('input', Annotation(state_arr, 'i')),
        Parameter('t', Annotation(real_dtype)),
    ]
    Computation.__init__(self, signature)

    self._box = box
    self._kinetic_coeff = kinetic_coeff
    self._nonlinear_module = nonlinear_module

    self._components = state_arr.shape[0]
    self._ensembles = state_arr.shape[1]
    self._grid_shape = state_arr.shape[2:]

    # Half-step propagator in momentum space.
    k_sq = get_ksquared(self._grid_shape, self._box)
    half_step_phase = -1j * kinetic_coeff * dt / 2
    self._kprop = numpy.exp(k_sq * half_step_phase).astype(state_arr.dtype)

    kprop_signature = [
        Parameter('output', Annotation(state_arr, 'o')),
        Parameter('input', Annotation(state_arr, 'i')),
        Parameter('kprop', Annotation(self._kprop, 'i')),
    ]
    # The propagator depends only on the grid indices (third index onwards).
    kprop_code = """
        ${kprop.ctype} kprop_coeff = ${kprop.load_idx}(${', '.join(idxs[2:])});
        ${output.store_same}(${mul}(${input.load_same}, kprop_coeff));
        """
    self._kprop_trf = Transformation(
        kprop_signature,
        kprop_code,
        render_kwds={'mul': functions.mul(state_arr.dtype, self._kprop.dtype)})

    ndim = len(state_arr.shape)
    self._fft = FFT(state_arr, axes=range(2, ndim))
    self._fft_with_kprop = FFT(state_arr, axes=range(2, ndim))
    self._fft_with_kprop.parameter.output.connect(
        self._kprop_trf, self._kprop_trf.input,
        output_prime=self._kprop_trf.output, kprop=self._kprop_trf.kprop)

    wrapper = get_nonlinear_wrapper(state_arr.dtype, nonlinear_module, dt)
    self._N1 = get_nonlinear1(state_arr, real_dtype, wrapper)
    self._N2 = get_nonlinear2(state_arr, real_dtype, wrapper, dt)
    self._N3 = get_nonlinear3(state_arr, real_dtype, wrapper, dt)
def get_nonlinear(dtype, interaction, tunneling):
    r"""
    Returns the nonlinear module, with one function per component:

    .. math::

        N_j(\psi_1, ... \psi_C) = \left( \sum_{n=1}^{C} U_{jn} |\psi_n|^2 + V \right) \psi_j
            - \nu_j \psi_{m_j}

    ``interaction``: a symmetrical ``components x components`` array
    with interaction strengths :math:`U`.
    ``tunneling``: a list of ``(other_comp, coeff)`` pairs of tunnelling strengths
    (:math:`m_j` and :math:`\nu_j` respectively), one per component.
    """
    complex_dtype = dtype
    complex_ctype = dtypes.ctype(complex_dtype)
    real_dtype = dtypes.real_for(dtype)
    real_ctype = dtypes.ctype(real_dtype)

    source = """
        %for comp in range(components):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            %for pcomp in range(components):
            ${c_ctype} psi${pcomp},
            %endfor
            ${s_ctype} V, ${s_ctype} t)
        {
            return (
                ${mul}(psi${comp}, (
                    %for other_comp in range(components):
                    + ${dtypes.c_constant(interaction[comp, other_comp], s_dtype)} *
                        ${norm}(psi${other_comp})
                    %endfor
                    + V
                    ))
                -
                ${mul}(
                    psi${tunneling[comp][0]},
                    ${dtypes.c_constant(tunneling[comp][1], s_dtype)})
                );
        }
        %endfor
        """
    kwds = {
        'components': interaction.shape[0],
        'mul': functions.mul(complex_dtype, real_dtype),
        'norm': functions.norm(complex_dtype),
        'interaction': interaction,
        'tunneling': tunneling,
        's_dtype': real_dtype,
        'c_ctype': complex_ctype,
        's_ctype': real_ctype,
    }
    return Module.create(source, render_kwds=kwds)
def get_multiply(output):
    """
    Returns a transformation ``output = a * b``, where ``a`` has the type
    of ``output`` and ``b`` is a 1D array of length ``output.shape[-1]``
    indexed by the last index. ``output`` and ``a`` are the connectors.
    """
    vector_type = Type(output.dtype, (output.shape[-1],))
    signature = [
        Parameter('output', Annotation(output, 'o')),
        Parameter('a', Annotation(output, 'i')),
        Parameter('b', Annotation(vector_type, 'i')),
    ]
    code = """
        ${output.store_same}(${mul}(${a.load_same}, ${b.load_idx}(${idxs[-1]})));
        """
    return Transformation(
        signature,
        code,
        connectors=['output', 'a'],
        render_kwds={'mul': functions.mul(output.dtype, output.dtype)})
def mul_const(arr_t, param):
    """
    Returns a scaling transformation with a fixed parameter
    (1 output, 1 input): ``output = input * param``.

    The dtype of ``param`` is detected automatically, the value is rendered
    into the kernel as a constant, and the product is converted
    to the dtype of ``arr_t``.
    """
    param_dtype = dtypes.detect_type(param)

    signature = [
        Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
    ]
    kwds = {
        'mul': functions.mul(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype),
        'param': dtypes.c_constant(param, dtype=param_dtype),
    }
    return Transformation(
        signature,
        "${output.store_same}(${mul}(${input.load_same}, ${param}));",
        render_kwds=kwds)
def tr_2_to_1(arr, scalar):
    """
    Returns a transformation with two inputs, one scalar and one output:
    ``o1 = cast(s1) * i1 + i2``, where ``s1`` is cast to the dtype of ``arr``.
    """
    signature = [
        Parameter('o1', Annotation(arr, 'o')),
        Parameter('i1', Annotation(arr, 'i')),
        Parameter('i2', Annotation(arr, 'i')),
        Parameter('s1', Annotation(scalar)),
    ]
    code = """
        ${o1.ctype} t = ${mul}(${cast}(${s1}), ${i1.load_same});
        ${o1.store_same}(t + ${i2.load_same});
        """
    kwds = {
        'mul': functions.mul(arr.dtype, arr.dtype),
        'cast': functions.cast(arr.dtype, scalar.dtype),
    }
    return Transformation(signature, code, render_kwds=kwds)
def get_multiply(output):
    """
    Returns a transformation ``output = a * b``, where ``a`` has the type
    of ``output`` and ``b`` is a 1D array of length ``output.shape[-1]``
    indexed by the last index. ``output`` and ``a`` are the connectors.
    """
    row_type = Type(output.dtype, (output.shape[-1], ))
    trf_params = [
        Parameter('output', Annotation(output, 'o')),
        Parameter('a', Annotation(output, 'i')),
        Parameter('b', Annotation(row_type, 'i')),
    ]
    trf_code = """
        ${output.store_same}(${mul}(${a.load_same}, ${b.load_idx}(${idxs[-1]})));
        """
    trf_kwds = {'mul': functions.mul(output.dtype, output.dtype)}
    return Transformation(
        trf_params, trf_code, connectors=['output', 'a'], render_kwds=trf_kwds)
def __init__(self, state_arr, dt, box=None, kinetic_coeff=1, nonlinear_module=None):
    """
    Sets up the computation with parameters ``output``, ``input``, two potential
    arrays with their times (``potential1``/``t_potential1`` and
    ``potential2``/``t_potential2``), and the scalar ``t``.

    Prepares the momentum-space propagator ``exp(-i * kinetic_coeff * dt / 2 * k^2)``,
    two FFTs over the grid axes (one of them with the propagator attached
    to its output), the three nonlinear stages built from ``nonlinear_module``,
    and the potential interpolator.

    ``state_arr`` is indexed as ``(components, ensembles, *grid_shape)``;
    the potentials have the grid shape and the real dtype of the state.
    """
    real_dtype = dtypes.real_for(state_arr.dtype)
    potential_arr = Type(real_dtype, shape=state_arr.shape[2:])

    signature = [
        Parameter('output', Annotation(state_arr, 'o')),
        Parameter('input', Annotation(state_arr, 'i')),
        Parameter('potential1', Annotation(potential_arr, 'i')),
        Parameter('potential2', Annotation(potential_arr, 'i')),
        Parameter('t_potential1', Annotation(real_dtype)),
        Parameter('t_potential2', Annotation(real_dtype)),
        Parameter('t', Annotation(real_dtype)),
    ]
    Computation.__init__(self, signature)

    self._box = box
    self._kinetic_coeff = kinetic_coeff
    self._nonlinear_module = nonlinear_module

    self._components = state_arr.shape[0]
    self._ensembles = state_arr.shape[1]
    self._grid_shape = state_arr.shape[2:]

    # Half-step propagator in momentum space.
    k_sq = get_ksquared(self._grid_shape, self._box)
    half_step_phase = -1j * kinetic_coeff * dt / 2
    self._kprop = numpy.exp(k_sq * half_step_phase).astype(state_arr.dtype)

    kprop_signature = [
        Parameter('output', Annotation(state_arr, 'o')),
        Parameter('input', Annotation(state_arr, 'i')),
        Parameter('kprop', Annotation(self._kprop, 'i')),
    ]
    # The propagator depends only on the grid indices (third index onwards).
    kprop_code = """
        ${kprop.ctype} kprop_coeff = ${kprop.load_idx}(${', '.join(idxs[2:])});
        ${output.store_same}(${mul}(${input.load_same}, kprop_coeff));
        """
    self._kprop_trf = Transformation(
        kprop_signature,
        kprop_code,
        render_kwds={'mul': functions.mul(state_arr.dtype, self._kprop.dtype)})

    ndim = len(state_arr.shape)
    self._fft = FFT(state_arr, axes=range(2, ndim))
    self._fft_with_kprop = FFT(state_arr, axes=range(2, ndim))
    self._fft_with_kprop.parameter.output.connect(
        self._kprop_trf, self._kprop_trf.input,
        output_prime=self._kprop_trf.output, kprop=self._kprop_trf.kprop)

    wrapper = get_nonlinear_wrapper(
        state_arr.shape[0], state_arr.dtype, nonlinear_module, dt)
    self._N1 = get_nonlinear1(state_arr, potential_arr, real_dtype, wrapper)
    self._N2 = get_nonlinear2(state_arr, potential_arr, real_dtype, wrapper, dt)
    self._N3 = get_nonlinear3(state_arr, potential_arr, real_dtype, wrapper, dt)

    self._potential_interpolator = get_potential_interpolator(potential_arr, dt)
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Builds a plan that, for every sample and every order from 1 to
    ``self._max_click_order``, multiplies over all modes either
    ``1 - exp(-alpha * beta)`` (modes with index below the order) or
    ``exp(-alpha * beta)`` (the remaining modes), takes the real part,
    and sums the result over the samples (axis 0) into ``output``.
    """
    plan = plan_factory()

    # One float64 value per (sample, order) pair.
    reduced_type = Type(numpy.float64, (alpha.shape[0], self._max_click_order))

    trf_signature = [
        Parameter('output', Annotation(reduced_type, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = """
        VSIZE_T sample_idx = ${idxs[0]};
        VSIZE_T order = ${idxs[1]} + 1;

        ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
        for (VSIZE_T i = 0; i < ${modes}; i++)
        {
            ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
            ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
            ${alpha.ctype} t = ${mul_cc}(alpha, beta);
            ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));

            if (i >= order)
            {
                result = ${mul_cc}(result, np);
            }
            else
            {
                ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                result = ${mul_cc}(result, cp);
            }
        }
        ${output.store_same}(result.x);
        """
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'exp_c': functions.exp(alpha.dtype),
        'modes': self._system.modes,
    }
    meter_trf = Transformation(trf_signature, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(reduced_type, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def get_nonlinear_wrapper(c_dtype, nonlinear_module, dt):
    """
    Returns a module wrapping the two-component ``nonlinear_module``:
    each of its two functions takes ``psi0``, ``psi1`` and ``t``, calls the
    corresponding function of ``nonlinear_module`` and multiplies
    its result by ``-i * dt``.
    """
    real_dtype = dtypes.real_for(c_dtype)

    source = """
        %for comp in (0, 1):
        INLINE WITHIN_KERNEL ${c_ctype} ${prefix}${comp}(
            ${c_ctype} psi0, ${c_ctype} psi1, ${s_ctype} t)
        {
            ${c_ctype} nonlinear = ${nonlinear}${comp}(psi0, psi1, t);
            return ${mul}(
                COMPLEX_CTR(${c_ctype})(0, -${dt}),
                nonlinear);
        }
        %endfor
        """
    kwds = {
        'c_ctype': dtypes.ctype(c_dtype),
        's_ctype': dtypes.ctype(real_dtype),
        'mul': functions.mul(c_dtype, c_dtype),
        'dt': dtypes.c_constant(dt, real_dtype),
        'nonlinear': nonlinear_module,
    }
    return Module.create(source, render_kwds=kwds)
def test_multiarg_mul(thr, out_code, in_codes):
    """
    Checks multi-argument mul() with a variety of data types.

    The reference implementation multiplies all the arguments, drops the
    imaginary part if the output dtype is real, and casts to the output dtype.
    """
    out_dtype, in_dtypes = generate_dtypes(out_code, in_codes)

    def reference_mul(*args):
        res = product(args)
        if not dtypes.is_complex(out_dtype) and dtypes.is_complex(res.dtype):
            res = res.real
        return res.astype(out_dtype)

    # ``ComplexWarning`` is exposed by ``numpy.exceptions`` in newer NumPy,
    # and the top-level ``numpy.ComplexWarning`` alias was removed in NumPy 2.0.
    # Resolve it from whichever location is available so the test runs on both.
    exceptions_module = getattr(numpy, "exceptions", None)
    complex_warning = getattr(exceptions_module, "ComplexWarning", None)
    if complex_warning is None:
        complex_warning = numpy.ComplexWarning

    # Temporarily catching imaginary part truncation warnings
    with catch_warnings():
        filterwarnings("ignore", "", complex_warning)
        mul = functions.mul(*in_dtypes, out_dtype=out_dtype)

    check_func(thr, mul, reference_mul, out_dtype, in_dtypes)
def get_diffusion(state_dtype, gamma):
    """
    Returns a ``Diffusion`` with one component and one noise source
    whose module returns the constant ``sqrt(gamma)`` (as a complex value
    with zero imaginary part), independent of the position, state and time.
    """
    source = """
        <%
            r_dtype = dtypes.real_for(s_dtype)
            s_ctype = dtypes.ctype(s_dtype)
            r_ctype = dtypes.ctype(r_dtype)
        %>
        INLINE WITHIN_KERNEL ${s_ctype} ${prefix}0_0(
            const int idx_x,
            const ${s_ctype} psi,
            ${r_ctype} t)
        {
            return COMPLEX_CTR(${s_ctype})(${numpy.sqrt(gamma)}, 0);
        }
        """
    kwds = {
        # Passed to the renderer although the template above does not reference it.
        'mul_cr': functions.mul(state_dtype, dtypes.real_for(state_dtype)),
        's_dtype': state_dtype,
        'gamma': gamma,
    }
    diffusion_module = Module.create(source, render_kwds=kwds)
    return Diffusion(diffusion_module, state_dtype, components=1, noise_sources=1)
def _build_plan(self, plan_factory, device_params, C, D, coeff1, coeff2):
    """
    Builds a test plan consisting of a nested ``Dummy`` computation call
    followed by a kernel call, both of which receive the same array
    for two different parameters.
    """
    plan = plan_factory()

    inner = Dummy(C, D, coeff1, same_A_B=True)

    c_scratch = plan.temp_array_like(C)
    # Requested from the plan but not referenced by any of the calls below.
    d_scratch = plan.temp_array_like(D)

    # Testing a computation call which uses the same argument for two parameters.
    plan.computation_call(inner, c_scratch, D, C, C, coeff1)

    array_dtype = C.dtype
    scalar_dtype = coeff2.dtype

    mul_func = functions.mul(array_dtype, scalar_dtype)
    # Created alongside ``mul_func``; the kernel below only uses the multiplication.
    div_func = functions.div(array_dtype, scalar_dtype)

    kernels = template_from(
        """
        <%def name="dummy(kernel_declaration, CC, C, D, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1,
                ${C.load_idx}(idx0, idx1) +
                ${mul}(${D.load_idx}(idx0, idx1), ${coeff}));
        }
        </%def>
        """
    )

    # Testing a kernel call which uses the same argument for two parameters.
    kernel_args = [C, c_scratch, c_scratch, coeff2]
    plan.kernel_call(
        kernels.get_def("dummy"),
        kernel_args,
        global_size=C.shape,
        render_kwds={"mul": mul_func},
    )

    return plan
def _build_plan(self, plan_factory, device_params, C, D, coeff1, coeff2):
    """
    Builds a test plan consisting of a nested ``Dummy`` computation call
    followed by a kernel call, both of which receive the same array
    for two different parameters.
    """
    plan = plan_factory()

    nested_comp = Dummy(C, D, coeff1, same_A_B=True)

    tmp_c = plan.temp_array_like(C)
    # Requested from the plan but not referenced by any of the calls below.
    tmp_d = plan.temp_array_like(D)

    # Testing a computation call which uses the same argument for two parameters.
    plan.computation_call(nested_comp, tmp_c, D, C, C, coeff1)

    array_dtype = C.dtype
    scalar_dtype = coeff2.dtype

    multiply = functions.mul(array_dtype, scalar_dtype)
    # Created alongside ``multiply``; the kernel below only uses the multiplication.
    divide = functions.div(array_dtype, scalar_dtype)

    source = """
        <%def name="dummy(kernel_declaration, CC, C, D, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1,
                ${C.load_idx}(idx0, idx1) +
                ${mul}(${D.load_idx}(idx0, idx1), ${coeff}));
        }
        </%def>
        """
    kernels = template_from(source)

    # Testing a kernel call which uses the same argument for two parameters.
    plan.kernel_call(
        kernels.get_def('dummy'),
        [C, tmp_c, tmp_c, coeff2],
        global_size=C.shape,
        render_kwds={'mul': multiply})

    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """
    Builds a plan for the inverse real FFT through a half-length complex FFT.

    With ``N = 2 * (input_.shape[-1] - 1)``, the ``prepare_irfft_input`` kernel
    combines the input with the precomputed (conjugated) ``A`` and ``B`` tables
    into a temporary array of length ``N // 2`` along the last axis, and an
    inverse complex FFT with an output transformation writes the real result.
    """
    plan = plan_factory()

    N = (input_.shape[-1] - 1) * 2
    half_n = N // 2

    twiddles = numpy.exp(-2j * numpy.pi * numpy.arange(half_n) / N)
    A = 0.5 * (1 - 1j * twiddles)
    B = 0.5 * (1 + 1j * twiddles)

    # The kernel receives the complex conjugates of both tables.
    A_arr = plan.persistent_array(A.conj())
    B_arr = plan.persistent_array(B.conj())

    last_axis = len(input_.shape) - 1
    cfft_arr = Type(input_.dtype, input_.shape[:-1] + (half_n, ))
    cfft = FFT(cfft_arr, axes=(last_axis, ))

    to_real = prepare_irfft_output(cfft.parameter.output)
    cfft.parameter.output.connect(to_real, to_real.input, real_output=to_real.output)

    temp = plan.temp_array_like(cfft.parameter.input)

    batch_size = helpers.product(output.shape[:-1])

    kwds = {
        'slices': (last_axis, 1),
        'N': N,
        'mul': functions.mul(input_.dtype, input_.dtype),
        'conj': functions.conj(input_.dtype),
    }
    plan.kernel_call(
        TEMPLATE.get_def('prepare_irfft_input'),
        [temp, input_, A_arr, B_arr],
        global_size=(batch_size, half_n),
        render_kwds=kwds)

    plan.computation_call(cfft, output, temp, inverse=True)

    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """
    Builds a plan for the inverse real FFT through a half-length complex FFT.

    With ``N = 2 * (input_.shape[-1] - 1)``, the ``prepare_irfft_input`` kernel
    combines the input with the precomputed (conjugated) ``A`` and ``B`` tables
    into a temporary array of length ``N // 2`` along the last axis, and an
    inverse complex FFT with an output transformation writes the real result.
    """
    plan = plan_factory()

    N = (input_.shape[-1] - 1) * 2
    half_length = N // 2
    fft_axis = len(input_.shape) - 1

    phase_factors = numpy.exp(-2j * numpy.pi * numpy.arange(half_length) / N)
    A = 0.5 * (1 - 1j * phase_factors)
    B = 0.5 * (1 + 1j * phase_factors)

    # The kernel receives the complex conjugates of both tables.
    A_arr = plan.persistent_array(A.conj())
    B_arr = plan.persistent_array(B.conj())

    cfft_arr = Type(input_.dtype, input_.shape[:-1] + (half_length,))
    cfft = FFT(cfft_arr, axes=(fft_axis,))

    output_trf = prepare_irfft_output(cfft.parameter.output)
    cfft.parameter.output.connect(
        output_trf, output_trf.input, real_output=output_trf.output)

    temp = plan.temp_array_like(cfft.parameter.input)

    batch_size = helpers.product(output.shape[:-1])

    kernel_def = TEMPLATE.get_def('prepare_irfft_input')
    kernel_args = [temp, input_, A_arr, B_arr]
    kernel_kwds = dict(
        slices=(fft_axis, 1),
        N=N,
        mul=functions.mul(input_.dtype, input_.dtype),
        conj=functions.conj(input_.dtype))
    plan.kernel_call(
        kernel_def,
        kernel_args,
        global_size=(batch_size, half_length),
        render_kwds=kernel_kwds)

    plan.computation_call(cfft, output, temp, inverse=True)

    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Builds a plan that, for every sample and every order index from 0 to
    ``self._max_moment - 1``, multiplies ``alpha * beta - ordering`` over
    the modes ``0..order`` (inclusive), takes the real part, and sums
    the result over the samples (axis 0) into ``output``.

    The ``ordering`` constant is obtained from ``ordering(self._representation)``.
    """
    plan = plan_factory()

    # One float64 value per (sample, order) pair.
    reduced_type = Type(numpy.float64, (alpha.shape[0], self._max_moment))

    trf_signature = [
        Parameter('output', Annotation(reduced_type, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = """
        VSIZE_T sample_idx = ${idxs[0]};
        VSIZE_T order = ${idxs[1]};

        ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
        for (VSIZE_T i = 0; i <= order; i++)
        {
            ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
            ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
            ${alpha.ctype} t = ${mul_cc}(alpha, beta);
            t.x -= ${ordering};
            result = ${mul_cc}(result, t);
        }
        ${output.store_same}(result.x);
        """
    trf_kwds = {
        'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
        'ordering': ordering(self._representation),
    }
    meter_trf = Transformation(trf_signature, trf_code, render_kwds=trf_kwds)

    reduction = Reduce(reduced_type, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(self, plan_factory, device_params, C, D, A, B, coeff):
    """
    Builds a two-kernel test plan.

    The ``dummy`` kernel writes ``A * coeff`` (with swapped indices) and
    ``B / coeff`` into temporary arrays; the ``dummy2`` kernel copies the first
    temporary into ``C`` and writes ``D_temp * 10 + persistent_array`` into ``D``.
    When ``self._same_A_B`` is false, ``B`` and ``D`` are addressed
    with a single index and processed only by the threads with ``idx1 == 0``.
    """
    plan = plan_factory()

    array_dtype = C.dtype
    scalar_dtype = coeff.dtype

    mul_func = functions.mul(array_dtype, scalar_dtype)
    div_func = functions.div(array_dtype, scalar_dtype)

    kernels = template_from("""
        <%def name="dummy(kernel_declaration, C, D, A, B, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${A.ctype} a = ${A.load_idx}(idx0, idx1);
            ${C.ctype} c = ${mul}(a, ${coeff});
            ${C.store_idx}(idx1, idx0, c);

            %if same_A_B:
                ${B.ctype} b = ${B.load_idx}(idx0, idx1);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, idx1, d);
            %else:
            if (idx1 == 0)
            {
                ${B.ctype} b = ${B.load_idx}(idx0);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, d);
            }
            %endif
        }
        </%def>

        <%def name="dummy2(kernel_declaration, CC, DD, C, D, pers_arr, const_coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1));

            %if same_A_B:
                ${DD.store_idx}(
                    idx0, idx1,
                    ${mul}(${D.load_idx}(idx0, idx1), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0, idx1));
            %else:
            if (idx1 == 0)
            {
                ${DD.store_idx}(
                    idx0,
                    ${mul}(${D.load_idx}(idx0), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0));
            }
            %endif
        }
        </%def>
        """)

    block_size = 8
    work_group = (block_size, block_size)

    C_temp = plan.temp_array_like(C)
    D_temp = plan.temp_array_like(D)
    registered_arr = plan.persistent_array(self._persistent_array)

    plan.kernel_call(
        kernels.get_def('dummy'),
        [C_temp, D_temp, A, B, coeff],
        global_size=A.shape,
        local_size=work_group,
        render_kwds={'mul': mul_func, 'div': div_func, 'same_A_B': self._same_A_B})

    # Optionally pass the raw array instead of the one registered with the plan.
    if self._test_kernel_adhoc_array:
        pers_arg = self._persistent_array
    else:
        pers_arg = registered_arr

    # Optionally pass a plain Python number instead of a typed scalar.
    if self._test_untyped_scalar:
        const_arg = 10
    else:
        const_arg = numpy.float32(10)

    plan.kernel_call(
        kernels.get_def('dummy2'),
        [C, D, C_temp, D_temp, pers_arg, const_arg],
        global_size=A.shape,
        local_size=work_group,
        render_kwds={'mul': mul_func, 'same_A_B': self._same_A_B})

    return plan
def _build_plan(self, plan_factory, device_params, C, D, A, B, coeff):
    """Assemble the plan: kernel ``dummy`` fills temporaries, ``dummy2`` fills outputs.

    * ``dummy`` stores ``A * coeff`` into ``C_temp`` at swapped indices
      ``(idx1, idx0)`` and ``B / coeff`` into ``D_temp``.
    * ``dummy2`` copies ``C_temp`` to ``C`` and stores
      ``D_temp * const_coeff + pers_arr`` into ``D``.

    If ``self._same_A_B`` is false, the ``B``/``D`` part uses a single index
    and is executed only where ``idx1 == 0``.

    ``device_params`` is not used by this method.
    """
    plan = plan_factory()

    dtype_pair = (C.dtype, coeff.dtype)
    mul = functions.mul(*dtype_pair)
    div = functions.div(*dtype_pair)

    template = template_from(
        """
        <%def name="dummy(kernel_declaration, C, D, A, B, coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${A.ctype} a = ${A.load_idx}(idx0, idx1);
            ${C.ctype} c = ${mul}(a, ${coeff});
            ${C.store_idx}(idx1, idx0, c);

            %if same_A_B:
                ${B.ctype} b = ${B.load_idx}(idx0, idx1);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, idx1, d);
            %else:
            if (idx1 == 0)
            {
                ${B.ctype} b = ${B.load_idx}(idx0);
                ${D.ctype} d = ${div}(b, ${coeff});
                ${D.store_idx}(idx0, d);
            }
            %endif
        }
        </%def>

        <%def name="dummy2(kernel_declaration, CC, DD, C, D, pers_arr, const_coeff)">
        ${kernel_declaration}
        {
            VIRTUAL_SKIP_THREADS;
            VSIZE_T idx0 = virtual_global_id(0);
            VSIZE_T idx1 = virtual_global_id(1);

            ${CC.store_idx}(idx0, idx1, ${C.load_idx}(idx0, idx1));

            %if same_A_B:
                ${DD.store_idx}(
                    idx0, idx1,
                    ${mul}(${D.load_idx}(idx0, idx1), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0, idx1));
            %else:
            if (idx1 == 0)
            {
                ${DD.store_idx}(
                    idx0,
                    ${mul}(${D.load_idx}(idx0), ${const_coeff}) +
                        ${pers_arr.load_idx}(idx0));
            }
            %endif
        }
        </%def>
        """
    )

    block_size = 8
    # Launch geometry shared by both kernels.
    launch = dict(global_size=A.shape, local_size=(block_size, block_size))
    same_A_B = self._same_A_B

    C_temp = plan.temp_array_like(C)
    D_temp = plan.temp_array_like(D)
    arr = plan.persistent_array(self._persistent_array)

    first_stage_args = [C_temp, D_temp, A, B, coeff]
    plan.kernel_call(
        template.get_def("dummy"),
        first_stage_args,
        render_kwds=dict(mul=mul, div=div, same_A_B=same_A_B),
        **launch
    )

    # The test flags select an ad hoc (unregistered) array and/or an
    # untyped Python scalar as kernel arguments.
    persistent = self._persistent_array if self._test_kernel_adhoc_array else arr
    const_coeff = 10 if self._test_untyped_scalar else numpy.float32(10)
    second_stage_args = [C, D, C_temp, D_temp, persistent, const_coeff]
    plan.kernel_call(
        template.get_def("dummy2"),
        second_stage_args,
        render_kwds=dict(mul=mul, same_A_B=same_A_B),
        **launch
    )

    return plan
def get_procs(thr, N):
    """Create and compile four length-``N`` computations on thread ``thr``.

    Returns a tuple ``(fft_unimod, mag_square, apply_mask, combine_mag_phi)``:

    * ``fft_unimod`` -- FFT whose output passes through a transformation that
      writes zero for indices above ``N/2`` and ``polar_unit(atan2(y, x))``
      otherwise;
    * ``mag_square`` -- stores ``x*x + y*y`` in the real part, zero in the
      imaginary part;
    * ``apply_mask`` -- multiplies a complex array by a real mask;
    * ``combine_mag_phi`` -- builds ``polar(r, angle)`` where ``r`` is the
      square root of the real part of ``mag_square`` (zero if negative) and
      ``angle`` is the argument of ``phase``.
    """
    complex_vec = Type(np.complex128, N)
    real_vec = Type(np.double, N)

    # --- FFT followed by the unit-modulus output transformation ---
    fft = FFTFactory.create(thr, (N,), compile_=False)
    unimod_trans = Transformation(
        [
            Parameter('output', Annotation(complex_vec, 'o')),
            Parameter('input', Annotation(complex_vec, 'i')),
        ],
        """
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_same};
        if (idx>${N}/2){
            val.x = 0.0;
            val.y = 0.0;
            ${output.store_same}(val);
        }else
            ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64), N=N),
    )
    fft.parameter.output.connect(unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    # --- squared magnitude ---
    mag_square_comp = PureParallel(
        [
            Parameter('output', Annotation(complex_vec, 'o')),
            Parameter('input', Annotation(complex_vec, 'i')),
        ],
        '''
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_idx}(idx);
        val.x = val.x*val.x + val.y*val.y;
        val.y = 0;
        ${output.store_idx}(idx, val);
        ''',
    )
    mag_square = mag_square_comp.compile(thr)

    # --- element-wise mask ---
    apply_mask_comp = PureParallel(
        [
            Parameter('output', Annotation(complex_vec, 'o')),
            Parameter('origin', Annotation(complex_vec, 'i')),
            Parameter('mask', Annotation(real_vec, 'i')),
        ],
        '''
        VSIZE_T idx = ${idxs[0]};
        ${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));
        ''',
        render_kwds=dict(mul=functions.mul(np.complex128, np.double)),
    )
    apply_mask = apply_mask_comp.compile(thr)

    # --- recombine magnitude and phase ---
    combine_comp = PureParallel(
        [
            Parameter('output', Annotation(complex_vec, 'o')),
            Parameter('mag_square', Annotation(complex_vec, 'i')),
            Parameter('phase', Annotation(complex_vec, 'i')),
        ],
        '''
        VSIZE_T idx = ${idxs[0]};
        double r = ${mag_square.load_idx}(idx).x;
        r = r<0.0 ? 0.0 : ${pow}(r, 0.5);
        double2 v = ${phase.load_idx}(idx);
        double angle = atan2(v.y, v.x);
        ${output.store_idx}(idx, ${polar}(r, angle));
        ''',
        render_kwds=dict(pow=functions.pow(np.double), polar=functions.polar(np.double)),
    )
    combine_mag_phi = combine_comp.compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """Build the plan: prepare, aggregate, reduce over samples, then FFT.

    Steps added to the plan, in order:

    1. ``compound_click_probability_prepare`` kernel over ``alpha.shape``,
       writing into a temporary shaped like ``alpha``.
    2. ``compound_click_probability_aggregate`` kernel writing into a
       ``(samples, max_total_clicks + 1)`` temporary.  Its work-group read
       size starts at ``min(block_size, max_work_group_size)`` and is halved
       each time the kernel call raises ``OutOfResourcesError``.
    3. ``Reduce`` with ``predicate_sum`` over axis 0.
    4. ``FFT`` whose output goes through a transformation storing only the
       real part (``val.x``) into ``output``.
    """
    plan = plan_factory()

    samples, modes = alpha.shape
    output_size = self._max_total_clicks + 1
    for_reduction = Type(alpha.dtype, (samples, output_size))

    prepared_state = plan.temp_array_like(alpha)

    plan.kernel_call(
        TEMPLATE.get_def("compound_click_probability_prepare"),
        [prepared_state, alpha, beta],
        kernel_name="compound_click_probability_prepare",
        global_size=alpha.shape,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
        ))

    # Block size is limited by the amount of available local memory.
    # In some OpenCL implementations the number reported cannot actually be fully used
    # (because it's used by kernel arguments), so we're padding it a little.
    local_mem_size = device_params.local_mem_size
    max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
    block_size = 2**helpers.log2(max_elems)

    # No reason to have block size larger than the number of modes
    block_size = min(block_size, helpers.bounding_power_of_2(modes))

    products_gsize = (samples, helpers.min_blocks(output_size, block_size) * block_size)
    products = plan.temp_array_like(for_reduction)

    # These depend only on ``modes`` and ``block_size``, so they are
    # computed once instead of on every retry.
    full_steps = modes // block_size
    remainder_size = modes % block_size

    read_size = min(block_size, device_params.max_work_group_size)
    while read_size > 1:
        try:
            plan.kernel_call(
                TEMPLATE.get_def("compound_click_probability_aggregate"),
                [products, prepared_state],
                kernel_name="compound_click_probability_aggregate",
                global_size=products_gsize,
                local_size=(1, read_size,),
                render_kwds=dict(
                    block_size=block_size,
                    read_size=read_size,
                    full_steps=full_steps,
                    remainder_size=remainder_size,
                    output_size=output_size,
                    mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                    add_cc=functions.add(alpha.dtype, alpha.dtype),
                    polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                    modes=self._system.modes,
                    max_total_clicks=self._max_total_clicks,
                ))
        except OutOfResourcesError:
            # The kernel does not fit with this work-group size: retry with
            # half of it.  Previously the loop was left right after halving,
            # so the smaller size was never actually tried.
            read_size //= 2
            continue
        # The kernel was accepted; stop searching.
        break
    # NOTE(review): as before, if no read size > 1 is accepted (or the initial
    # one is already <= 1), no aggregate kernel is added and ``products`` is
    # reduced as is -- confirm whether this case should raise instead.

    reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

    temp = plan.temp_array_like(reduction.parameter.output)

    plan.computation_call(reduction, temp, products)

    fft = FFT(temp)
    # Output transformation that keeps only the real part of the FFT result.
    real_trf = Transformation(
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
        ],
        """
        ${input.ctype} val = ${input.load_same};
        ${output.store_same}(val.x);
        """)
    fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

    # The trailing ``True`` is passed through to the FFT computation unchanged
    # (presumably its "inverse" flag -- TODO confirm).
    plan.computation_call(fft, output, temp, True)

    return plan
def _build_plan(self, plan_factory, device_params, alpha, beta, alpha_i, beta_i, seed):
    """Build the plan applying the system's unitary, with optional matrix noise.

    Two paths, chosen by ``needs_noise_matrix`` (true only when the
    representation is not ``POSITIVE_P`` and ``system.needs_noise_matrix()``
    returns true):

    * without noise: ``alpha`` and ``beta`` are each obtained by a
      ``MatrixMul`` with the unitary (transposed); for ``beta`` the matrix
      goes through the transformation from ``self._make_trf_conj()``;
    * with noise: the product for ``alpha`` is stored in a temporary, the
      ``generate_apply_matrix_noise`` kernel fills ``w``, ``w`` is multiplied
      by the noise matrix, and the ``add_noise`` kernel writes ``alpha`` and
      ``beta`` from the temporary and the noise.

    ``device_params`` is accepted but not used here.
    """
    plan = plan_factory()

    system = self._system
    unitary = plan.persistent_array(self._system.unitary)

    needs_noise_matrix = (
        self._representation != Representation.POSITIVE_P
        and system.needs_noise_matrix())

    mmul = MatrixMul(alpha, unitary, transposed_b=True)

    if needs_noise_matrix:
        noise_matrix_dev = plan.persistent_array(system.noise_matrix())

        # If we're here, it's not positive-P, and alpha == conj(beta).
        # This means we can just calculate alpha, and then build beta from it.
        w = plan.temp_array_like(alpha)
        temp_alpha = plan.temp_array_like(alpha)

        plan.computation_call(mmul, temp_alpha, alpha_i, unitary)

        bijection = philox(64, 2)

        # Keeping the kernel the same so it can be cached.
        # The seed will be passed as the computation parameter instead.
        keygen = KeyGenerator.create(bijection, seed=numpy.int32(0))
        sampler = normal_bm(bijection, numpy.float64)

        noise_render_kwds = dict(
            bijection=bijection,
            keygen=keygen,
            sampler=sampler,
            mul_cr=functions.mul(numpy.complex128, numpy.float64),
            add_cc=functions.add(numpy.complex128, numpy.complex128),
        )
        plan.kernel_call(
            TEMPLATE.get_def("generate_apply_matrix_noise"),
            [w, seed],
            kernel_name="generate_apply_matrix_noise",
            global_size=alpha.shape,
            render_kwds=noise_render_kwds)

        noise = plan.temp_array_like(alpha)
        plan.computation_call(mmul, noise, w, noise_matrix_dev)

        plan.kernel_call(
            TEMPLATE.get_def("add_noise"),
            [alpha, beta, temp_alpha, noise],
            kernel_name="add_noise",
            global_size=alpha.shape,
            render_kwds=dict(
                add=functions.add(numpy.complex128, numpy.complex128),
                conj=functions.conj(numpy.complex128)))

        return plan

    # TODO: this could be sped up for repr != POSITIVE_P,
    # since in that case alpha == conj(beta), and we don't need to do two multiplications.
    mmul_beta = MatrixMul(beta, unitary, transposed_b=True)
    trf_conj = self._make_trf_conj()
    mmul_beta.parameter.matrix_b.connect(
        trf_conj, trf_conj.output, matrix_b_p=trf_conj.input)

    plan.computation_call(mmul, alpha, alpha_i, unitary)
    plan.computation_call(mmul_beta, beta, beta_i, unitary)

    return plan
def multiplier(dtype, num=1):
    """Return a ``Module`` rendered from the ``multiplier`` template def.

    The template receives ``dtype``, ``num`` and ``mul``, where ``mul`` is
    ``functions.mul`` for two ``dtype`` operands with a ``dtype`` result.
    """
    render_kwds = {
        'dtype': dtype,
        'num': num,
        'mul': functions.mul(dtype, dtype, out_dtype=dtype),
    }
    return Module(TEMPLATE.get_def('multiplier'), render_kwds=render_kwds)
def transformed_mul(perf_params):
    """Return ``functions.mul`` for two operands of ``transformed_dtype()``.

    ``perf_params`` is accepted but not used.
    """
    lhs_dtype = transformed_dtype()
    rhs_dtype = transformed_dtype()
    return functions.mul(lhs_dtype, rhs_dtype)
def multiplier(dtype, num=1):
    """Create the ``multiplier`` module for the given ``dtype``.

    The ``multiplier`` def of ``TEMPLATE`` is rendered with ``dtype``,
    ``num`` (default 1) and a ``dtype * dtype -> dtype`` multiplication
    function under the name ``mul``.
    """
    same_type_mul = functions.mul(dtype, dtype, out_dtype=dtype)
    template_def = TEMPLATE.get_def('multiplier')
    return Module(template_def, render_kwds=dict(dtype=dtype, num=num, mul=same_type_mul))
def get_procs(thr, N):
    """Compile the four GPU procedures operating on arrays of length ``N``.

    The returned tuple is ``(fft_unimod, mag_square, apply_mask,
    combine_mag_phi)``, all compiled for ``thr``:

    * ``fft_unimod``: FFT with an output transformation that zeroes elements
      whose index exceeds ``N/2`` and replaces the rest by
      ``polar_unit(atan2(val.y, val.x))``;
    * ``mag_square``: ``x*x + y*y`` in the real part, zero imaginary part;
    * ``apply_mask``: complex ``origin`` times real ``mask``;
    * ``combine_mag_phi``: ``polar(r, angle)`` with ``r`` the square root of
      the real part of ``mag_square`` (clamped to zero when negative) and
      ``angle`` the argument of ``phase``.
    """

    def complex_param(name, role):
        # Parameter annotated as a complex128 array of length N.
        return Parameter(name, Annotation(Type(np.complex128, N), role))

    def real_param(name, role):
        # Parameter annotated as a double array of length N.
        return Parameter(name, Annotation(Type(np.double, N), role))

    fft = FFTFactory.create(thr, (N, ), compile_=False)
    unimod_code = """
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_same};
        if (idx>${N}/2){
            val.x = 0.0;
            val.y = 0.0;
            ${output.store_same}(val);
        }else
            ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """
    unimod_trans = Transformation(
        [complex_param('output', 'o'), complex_param('input', 'i')],
        unimod_code,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64), N=N))
    fft.parameter.output.connect(unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    mag_square_code = '''
        VSIZE_T idx = ${idxs[0]};
        ${input.ctype} val = ${input.load_idx}(idx);
        val.x = val.x*val.x + val.y*val.y;
        val.y = 0;
        ${output.store_idx}(idx, val);
        '''
    mag_square = PureParallel(
        [complex_param('output', 'o'), complex_param('input', 'i')],
        mag_square_code).compile(thr)

    apply_mask_code = '''
        VSIZE_T idx = ${idxs[0]};
        ${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));
        '''
    apply_mask = PureParallel(
        [
            complex_param('output', 'o'),
            complex_param('origin', 'i'),
            real_param('mask', 'i'),
        ],
        apply_mask_code,
        render_kwds=dict(mul=functions.mul(np.complex128, np.double))).compile(thr)

    combine_code = '''
        VSIZE_T idx = ${idxs[0]};
        double r = ${mag_square.load_idx}(idx).x;
        r = r<0.0 ? 0.0 : ${pow}(r, 0.5);
        double2 v = ${phase.load_idx}(idx);
        double angle = atan2(v.y, v.x);
        ${output.store_idx}(idx, ${polar}(r, angle));
        '''
    combine_mag_phi = PureParallel(
        [
            complex_param('output', 'o'),
            complex_param('mag_square', 'i'),
            complex_param('phase', 'i'),
        ],
        combine_code,
        render_kwds=dict(
            pow=functions.pow(np.double),
            polar=functions.polar(np.double))).compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi
def get_xpropagate(state_type, drift, diffusion=None, noise_type=None):
    """Create the ``PureParallel`` computation for one x-space propagation stage.

    For every component the generated kernel computes

        dpsi      = kinput + drift(psi, t + dt * ci) * dt [+ sum_n diffusion_n * dW_n]
        new_omega = omega * ai + dpsi      (omega is taken as zero when stage == 0)
        new_u     = psi + new_omega * bi

    ``new_u`` is always stored to ``output``; ``new_omega`` is stored back to
    ``omega`` only while ``stage < 5``.

    Parameters of the computation: ``output``, ``omega``, ``input``,
    ``kinput``, optionally ``dW`` (only when ``diffusion`` is given), then the
    scalars ``ai``, ``bi``, ``ci``, ``t``, ``dt`` (real dtype) and ``stage``
    (``int32``).  The guiding array is the state shape without its second
    (component) axis.

    ``noise_type`` is read only when ``diffusion`` is not ``None``.
    """
    state_dtype = state_type.dtype
    real_dtype = dtypes.real_for(state_dtype)
    noise_dtype = noise_type.dtype if diffusion is not None else real_dtype

    parameters = [
        Parameter('output', Annotation(state_type, 'o')),
        Parameter('omega', Annotation(state_type, 'io')),
        Parameter('input', Annotation(state_type, 'i')),
        Parameter('kinput', Annotation(state_type, 'i')),
    ]
    if diffusion is not None:
        parameters.append(Parameter('dW', Annotation(noise_type, 'i')))
    for scalar_name in ('ai', 'bi', 'ci', 't', 'dt'):
        parameters.append(Parameter(scalar_name, Annotation(real_dtype)))
    parameters.append(Parameter('stage', Annotation(numpy.int32)))

    kernel_code = """
        <%
            coords = ", ".join(idxs[1:])
            trajectory = idxs[0]
            components = drift.components
            if diffusion is not None:
                noise_sources = diffusion.noise_sources
            psi_args = ", ".join("psi_" + str(c) for c in range(components))

            if diffusion is None:
                dW = None
        %>

        %for comp in range(components):
        ${output.ctype} omega_${comp};
        if (${stage} == 0)
        {
            omega_${comp} = ${dtypes.c_constant(0, output.dtype)};
        }
        else
        {
            omega_${comp} = ${omega.load_idx}(${trajectory}, ${comp}, ${coords});
        }
        ${output.ctype} psi_${comp} = ${input.load_idx}(${trajectory}, ${comp}, ${coords});
        ${output.ctype} kpsi_${comp} = ${kinput.load_idx}(${trajectory}, ${comp}, ${coords});
        ${output.ctype} dpsi_${comp};
        %endfor

        %if diffusion is not None:
        %for ncomp in range(noise_sources):
        ${dW.ctype} dW_${ncomp} = ${dW.load_idx}(${trajectory}, ${ncomp}, ${coords});
        %endfor
        %endif

        %for comp in range(components):
        dpsi_${comp} =
            kpsi_${comp}
            + ${mul_cr}(
                + ${drift.module}${comp}(${coords}, ${psi_args}, ${t} + ${dt} * ${ci}),
                ${dt})
            %if diffusion is not None:
            %for ncomp in range(noise_sources):
            + ${mul_cn}(${diffusion.module}${comp}_${ncomp}(
                ${coords}, ${psi_args}, ${t} + ${dt} * ${ci}), dW_${ncomp})
            %endfor
            %endif
            ;
        %endfor

        ${output.ctype} new_omega, new_u;
        %for comp in range(components):
        new_omega = ${mul_cr}(omega_${comp}, ${ai}) + dpsi_${comp};
        new_u = psi_${comp} + ${mul_cr}(new_omega, ${bi});
        if (${stage} < 5)
        {
            ${omega.store_idx}(${trajectory}, ${comp}, ${coords}, new_omega);
        }
        ${output.store_idx}(${trajectory}, ${comp}, ${coords}, new_u);
        %endfor
        """

    # One work item per (trajectory, grid point); components are unrolled
    # inside the kernel, so the component axis is dropped.
    guiding_shape = (state_type.shape[0],) + state_type.shape[2:]

    return PureParallel(
        parameters,
        kernel_code,
        guiding_array=guiding_shape,
        render_kwds=dict(
            drift=drift,
            diffusion=diffusion,
            mul_cr=functions.mul(state_dtype, real_dtype),
            mul_cn=functions.mul(state_dtype, noise_dtype)))
def get_prop_iter(state_type, drift, iterations, diffusion=None, noise_type=None):
    """Create the ``PureParallel`` computation for the iterated half-step update.

    The generated kernel repeats ``iterations`` times, for every component:

        dpsi    = 0.5 * (drift(psi_tmp, t + dt / 2) * dt [+ sum_n diffusion_n * dW_n])
        psi_tmp = psi + dpsi

    and finally stores ``psi_tmp + dpsi`` to ``output``.  The loop over
    iterations is unrolled at template-rendering time.

    Parameters of the computation: ``output``, ``input``, optionally ``dW``
    (only when ``diffusion`` is given), then the scalars ``t`` and ``dt``.
    The scalar dtype is the real counterpart of the state dtype when the
    state is complex, and the state dtype itself otherwise.  The guiding
    array is the state shape without its second (component) axis.

    ``noise_type`` is read only when ``diffusion`` is not ``None``.
    """
    state_dtype = state_type.dtype
    real_dtype = dtypes.real_for(state_dtype) if dtypes.is_complex(state_dtype) else state_dtype
    noise_dtype = real_dtype if diffusion is None else noise_type.dtype

    parameters = [
        Parameter('output', Annotation(state_type, 'o')),
        Parameter('input', Annotation(state_type, 'i')),
    ]
    if diffusion is not None:
        parameters.append(Parameter('dW', Annotation(noise_type, 'i')))
    parameters.append(Parameter('t', Annotation(real_dtype)))
    parameters.append(Parameter('dt', Annotation(real_dtype)))

    kernel_code = """
        <%
            coords = ", ".join(idxs[1:])
            trajectory = idxs[0]
            components = drift.components
            if diffusion is not None:
                noise_sources = diffusion.noise_sources
            psi_args = ", ".join("psi_" + str(c) + "_tmp" for c in range(components))

            if diffusion is None:
                dW = None
        %>

        %for comp in range(components):
        ${output.ctype} psi_${comp} = ${input.load_idx}(${trajectory}, ${comp}, ${coords});
        ${output.ctype} psi_${comp}_tmp = psi_${comp};
        ${output.ctype} dpsi_${comp};
        %endfor

        %if diffusion is not None:
        %for ncomp in range(noise_sources):
        ${dW.ctype} dW_${ncomp} = ${dW.load_idx}(${trajectory}, ${ncomp}, ${coords});
        %endfor
        %endif

        %for i in range(iterations):

            %for comp in range(components):
            dpsi_${comp} = ${mul_cr}(
                ${mul_cr}(${drift.module}${comp}(
                    ${coords}, ${psi_args}, ${t} + ${dt} / 2), ${dt})
                %if diffusion is not None:
                %for ncomp in range(noise_sources):
                + ${mul_cn}(${diffusion.module}${comp}_${ncomp}(
                    ${coords}, ${psi_args}, ${t} + ${dt} / 2), dW_${ncomp})
                %endfor
                %endif
                , 0.5);
            %endfor

            %for comp in range(components):
            psi_${comp}_tmp = psi_${comp} + dpsi_${comp};
            %endfor

        %endfor

        %for comp in range(components):
        ${output.store_idx}(${trajectory}, ${comp}, ${coords}, psi_${comp}_tmp + dpsi_${comp});
        %endfor
        """

    # One work item per (trajectory, grid point); the component axis is
    # handled inside the kernel and therefore dropped here.
    guiding_shape = (state_type.shape[0],) + state_type.shape[2:]

    return PureParallel(
        parameters,
        kernel_code,
        guiding_array=guiding_shape,
        render_kwds=dict(
            drift=drift,
            diffusion=diffusion,
            iterations=iterations,
            mul_cr=functions.mul(state_dtype, real_dtype),
            mul_cn=functions.mul(state_dtype, noise_dtype)))