def get_common_kwds(dtype, device_params):
    """
    Build the dictionary of keyword values derived from ``dtype`` and ``device_params``.

    :param dtype: element data type; its ``itemsize`` selects the entry of
        ``device_params.min_mem_coalesce_width`` to use.
    :param device_params: object providing ``min_mem_coalesce_width`` (indexable
        by item size) and ``local_mem_banks``.
    :returns: a ``dict`` with the data type, the device limits, several helper
        callables and the ``mul``/``polar_unit``/``cdivs`` function modules.
        (Presumably consumed as template render keywords; verify against callers.)
    """

    def wrap_const(x):
        # Constants are rendered using the real counterpart of ``dtype``.
        return dtypes.c_constant(x, dtypes.real_for(dtype))

    return {
        'dtype': dtype,
        'min_mem_coalesce_width': device_params.min_mem_coalesce_width[dtype.itemsize],
        'local_mem_banks': device_params.local_mem_banks,
        'get_padding': get_padding,
        'wrap_const': wrap_const,
        'min_blocks': helpers.min_blocks,
        'mul': functions.mul(dtype, dtype),
        'polar_unit': functions.polar_unit(dtypes.real_for(dtype)),
        # Division of a ``dtype`` value by an unsigned 32-bit integer, result kept in ``dtype``.
        'cdivs': functions.div(dtype, numpy.uint32, out_dtype=dtype),
    }
def fft512(use_constant_memory=False):
    """
    Create an ``FFT512`` object backed by the ``fft512`` template definition.

    The module is rendered for ``complex128`` elements and constant data,
    with ``float64`` as the temporary type.

    :param use_constant_memory: flag forwarded both to the template
        (as a render keyword) and to the ``FFT512`` constructor.
    :returns: ``FFT512(module, use_constant_memory)``.
    """
    template_def = TEMPLATE.get_def('fft512')

    render_kwds = {
        'elem_ctype': dtypes.ctype(numpy.complex128),
        'temp_ctype': dtypes.ctype(numpy.float64),
        'cdata_ctype': dtypes.ctype(numpy.complex128),
        'polar_unit': functions.polar_unit(numpy.float64),
        'mul': functions.mul(numpy.complex128, numpy.complex128),
        'use_constant_memory': use_constant_memory,
    }

    return FFT512(Module(template_def, render_kwds=render_kwds), use_constant_memory)
def unimod_gen(size, single=True):
    """
    Create a transformation that replaces every element by a unit-modulus
    complex number with the same phase.

    The generated code stores ``polar_unit(atan2(val.y, val.x))`` for each
    loaded value ``val``.

    :param size: shape passed to ``Type`` for both the input and the output.
    :param single: if true, use ``complex64``/``float32``;
        otherwise ``complex128``/``double``.
    :returns: the ``Transformation`` object.
    """
    complex_dtype = np.complex64 if single else np.complex128

    parameters = [
        Parameter('output', Annotation(Type(complex_dtype, size), 'o')),
        Parameter('input', Annotation(Type(complex_dtype, size), 'i')),
    ]

    code = ''' ${input.ctype} val = ${input.load_same}; ${output.store_same}(${polar_unit}(atan2(val.y, val.x))); '''

    # ``polar_unit`` takes the real type matching the chosen complex type.
    real_dtype = np.float32 if single else np.double
    polar_unit = functions.polar_unit(dtype=real_dtype)

    return Transformation(parameters, code, render_kwds=dict(polar_unit=polar_unit))
def unimod_gen(size, single=True):
    """
    Build the "unimodular" transformation: each input element is mapped to
    ``polar_unit(atan2(val.y, val.x))``, i.e. its phase on the unit circle.

    :param size: shape used for the ``Type`` of both parameters.
    :param single: selects ``complex64`` with ``float32`` phases when true,
        ``complex128`` with ``double`` phases when false.
    :returns: the constructed ``Transformation``.
    """
    if single:
        complex_dtype, real_dtype = np.complex64, np.float32
    else:
        complex_dtype, real_dtype = np.complex128, np.double

    output_param = Parameter('output', Annotation(Type(complex_dtype, size), 'o'))
    input_param = Parameter('input', Annotation(Type(complex_dtype, size), 'i'))

    return Transformation(
        [output_param, input_param],
        ''' ${input.ctype} val = ${input.load_same}; ${output.store_same}(${polar_unit}(atan2(val.y, val.x))); ''',
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=real_dtype)),
    )
def normal_bm(bijection, dtype, mean=0, std=1):
    """
    Generates normally distributed random numbers with the mean ``mean``
    and the standard deviation ``std`` using Box-Muller transform.
    Supported dtypes: ``float(32/64)``, ``complex(64/128)``.
    Produces two random numbers per call for real types and one number for complex types.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.

    .. note::

        In case of a complex ``dtype``, ``std`` refers to the standard deviation
        of the complex numbers (same as ``numpy.std()`` returns),
        not real and imaginary components
        (which will be normally distributed with the standard deviation ``std / sqrt(2)``).
        Consequently, while ``mean`` is of type ``dtype``, ``std`` must be real.
    """
    complex_res = dtypes.is_complex(dtype)

    # Both the real and the complex flavour of the type are handed to the template.
    if complex_res:
        c_dtype = dtype
        r_dtype = dtypes.real_for(dtype)
    else:
        r_dtype = dtype
        c_dtype = dtypes.complex_for(dtype)

    # Uniform sampler over [low=0, high=1] that the template builds upon.
    uf = uniform_float(bijection, r_dtype, low=0, high=1)

    template_def = TEMPLATE.get_def("normal_bm")
    render_kwds = {
        'complex_res': complex_res,
        'r_dtype': r_dtype,
        'r_ctype': dtypes.ctype(r_dtype),
        'c_dtype': c_dtype,
        'c_ctype': dtypes.ctype(c_dtype),
        'polar_unit': functions.polar_unit(r_dtype),
        'bijection': bijection,
        'mean': mean,
        'std': std,
        'uf': uf,
    }
    module = Module(template_def, render_kwds=render_kwds)

    randoms_per_call = 1 if complex_res else 2
    return Sampler(
        bijection, module, dtype,
        deterministic=uf.deterministic,
        randoms_per_call=randoms_per_call)
def normal_bm(bijection, dtype, mean=0, std=1):
    """
    Generates normally distributed random numbers with the mean ``mean``
    and the standard deviation ``std`` using Box-Muller transform.
    Supported dtypes: ``float(32/64)``, ``complex(64/128)``.
    Produces two random numbers per call for real types and one number for complex types.
    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.

    .. note::

        In case of a complex ``dtype``, ``std`` refers to the standard deviation
        of the complex numbers (same as ``numpy.std()`` returns),
        not real and imaginary components
        (which will be normally distributed with the standard deviation ``std / sqrt(2)``).
        Consequently, while ``mean`` is of type ``dtype``, ``std`` must be real.
    """
    is_complex_dtype = dtypes.is_complex(dtype)

    # Derive the missing half of the (real, complex) type pair from ``dtype``.
    r_dtype = dtypes.real_for(dtype) if is_complex_dtype else dtype
    c_dtype = dtype if is_complex_dtype else dtypes.complex_for(dtype)

    uf = uniform_float(bijection, r_dtype, low=0, high=1)

    module = Module(
        TEMPLATE.get_def("normal_bm"),
        render_kwds={
            'complex_res': is_complex_dtype,
            'r_dtype': r_dtype,
            'r_ctype': dtypes.ctype(r_dtype),
            'c_dtype': c_dtype,
            'c_ctype': dtypes.ctype(c_dtype),
            'polar_unit': functions.polar_unit(r_dtype),
            'bijection': bijection,
            'mean': mean,
            'std': std,
            'uf': uf,
        })

    # One complex number or two real numbers are produced per call.
    if is_complex_dtype:
        randoms_per_call = 1
    else:
        randoms_per_call = 2

    return Sampler(
        bijection, module, dtype,
        deterministic=uf.deterministic, randoms_per_call=randoms_per_call)
def get_procs(thr, N):
    """
    Build and compile the four computations operating on ``complex128`` arrays of shape ``N``.

    :param thr: object passed to ``FFTFactory.create`` and to every ``compile`` call.
    :param N: array size; also rendered into the first kernel as the cut-off ``N/2``.
    :returns: tuple ``(fft_unimod, mag_square, apply_mask, combine_mag_phi)`` where

        * ``fft_unimod`` is the FFT with its output passed through a transformation
          that zeroes elements with index above ``N/2`` and replaces the remaining
          ones with ``polar_unit`` of their phase;
        * ``mag_square`` stores ``x*x + y*y`` in the real part and 0 in the imaginary part;
        * ``apply_mask`` multiplies a complex array by a ``double`` mask;
        * ``combine_mag_phi`` combines the square root of a (clamped at zero)
          squared magnitude with the phase of a second array.
    """

    def complex_array(name, role):
        # Every complex parameter below has the same type: complex128 of shape N.
        return Parameter(name, Annotation(Type(np.complex128, N), role))

    fft = FFTFactory.create(thr, (N,), compile_=False)

    unimod_code = """ VSIZE_T idx = ${idxs[0]}; ${input.ctype} val = ${input.load_same}; if (idx>${N}/2){ val.x = 0.0; val.y = 0.0; ${output.store_same}(val); }else ${output.store_same}(${polar_unit}(atan2(val.y, val.x))); """
    unimod_trans = Transformation(
        [complex_array('output', 'o'), complex_array('input', 'i')],
        unimod_code,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64), N=N))

    # Attach the transformation to the FFT output; the new output parameter is named ``uni``.
    fft.parameter.output.connect(unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    mag_square_code = ''' VSIZE_T idx = ${idxs[0]}; ${input.ctype} val = ${input.load_idx}(idx); val.x = val.x*val.x + val.y*val.y; val.y = 0; ${output.store_idx}(idx, val); '''
    mag_square = PureParallel(
        [complex_array('output', 'o'), complex_array('input', 'i')],
        mag_square_code).compile(thr)

    apply_mask_code = ''' VSIZE_T idx = ${idxs[0]}; ${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx))); '''
    apply_mask = PureParallel(
        [
            complex_array('output', 'o'),
            complex_array('origin', 'i'),
            Parameter('mask', Annotation(Type(np.double, N), 'i')),
        ],
        apply_mask_code,
        render_kwds=dict(mul=functions.mul(np.complex128, np.double))).compile(thr)

    # The line break after ``?`` is part of the original kernel source and is kept as is.
    combine_code = (
        ''' VSIZE_T idx = ${idxs[0]}; double r = ${mag_square.load_idx}(idx).x; r = r<0.0 ? \n'''
        '''0.0 : ${pow}(r, 0.5); double2 v = ${phase.load_idx}(idx); double angle = atan2(v.y, v.x); ${output.store_idx}(idx, ${polar}(r, angle)); '''
    )
    combine_mag_phi = PureParallel(
        [
            complex_array('output', 'o'),
            complex_array('mag_square', 'i'),
            complex_array('phase', 'i'),
        ],
        combine_code,
        render_kwds=dict(
            pow=functions.pow(np.double),
            polar=functions.polar(np.double))).compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi
def test_polar_unit(thr, out_code, in_codes):
    """
    Check the ``polar_unit`` function module against ``numpy.exp(1j * theta)``.

    The data types are produced by ``generate_dtypes`` from the given codes;
    the module is built for the first input type.
    """
    out_dtype, in_dtypes = generate_dtypes(out_code, in_codes)
    tested_module = functions.polar_unit(in_dtypes[0])

    def reference(theta):
        return numpy.exp(1j * theta)

    check_func(thr, tested_module, reference, out_dtype, in_dtypes)
def test_polar_unit(thr, out_code, in_codes):
    """
    Compare ``functions.polar_unit`` (built for the first input dtype)
    with the reference ``theta -> exp(1j * theta)`` using ``check_func``.
    """
    dtype_pair = generate_dtypes(out_code, in_codes)
    out_dtype, in_dtypes = dtype_pair

    def exp_i_theta(theta):
        return numpy.exp(1j * theta)

    check_func(
        thr,
        functions.polar_unit(in_dtypes[0]),
        exp_i_theta,
        out_dtype,
        in_dtypes)
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Assemble the computation plan: prepare, aggregate, reduce over samples, FFT.

    Steps, in order:

    1. ``compound_click_probability_prepare`` kernel fills a temporary array
       shaped like ``alpha`` from ``alpha`` and ``beta``.
    2. ``compound_click_probability_aggregate`` kernel writes the ``products``
       temporary array of shape ``(samples, max_total_clicks + 1)``.
    3. ``Reduce`` with ``predicate_sum`` over axis 0 of ``products``.
    4. ``FFT`` of the reduced array, whose output is passed through a
       transformation storing only the ``.x`` component into ``output``.

    :param plan_factory: callable returning a fresh plan object.
    :param device_params: provides ``local_mem_size`` and ``max_work_group_size``.
    :param output: output array descriptor.
    :param alpha: input array descriptor with a two-element shape ``(samples, modes)``.
    :param beta: second input array descriptor, only used by the prepare kernel.
    :returns: the filled plan.
    :raises ValueError: if the aggregation kernel cannot be scheduled
        with any work-group size greater than 1.
    """
    plan = plan_factory()

    samples, modes = alpha.shape
    output_size = self._max_total_clicks + 1

    for_reduction = Type(alpha.dtype, (samples, output_size))

    prepared_state = plan.temp_array_like(alpha)

    plan.kernel_call(
        TEMPLATE.get_def("compound_click_probability_prepare"),
        [prepared_state, alpha, beta],
        kernel_name="compound_click_probability_prepare",
        global_size=alpha.shape,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
            ))

    # Block size is limited by the amount of available local memory.
    # In some OpenCL implementations the number reported cannot actually be fully used
    # (because it's used by kernel arguments), so we're padding it a little.
    local_mem_size = device_params.local_mem_size
    max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
    block_size = 2**helpers.log2(max_elems)

    # No reason to have block size larger than the number of modes
    block_size = min(block_size, helpers.bounding_power_of_2(modes))

    products_gsize = (
        samples,
        helpers.min_blocks(output_size, block_size) * block_size)
    products = plan.temp_array_like(for_reduction)

    # These depend only on ``block_size``, not on the work-group size being tried.
    full_steps = modes // block_size
    remainder_size = modes % block_size

    # Try progressively smaller work-group sizes until the kernel fits.
    # (Previously the loop was left unconditionally after the first attempt,
    # so a failed attempt silently produced a plan without the aggregation kernel.)
    read_size = min(block_size, device_params.max_work_group_size)
    while read_size > 1:
        try:
            plan.kernel_call(
                TEMPLATE.get_def("compound_click_probability_aggregate"),
                [products, prepared_state],
                kernel_name="compound_click_probability_aggregate",
                global_size=products_gsize,
                local_size=(1, read_size,),
                render_kwds=dict(
                    block_size=block_size,
                    read_size=read_size,
                    full_steps=full_steps,
                    remainder_size=remainder_size,
                    output_size=output_size,
                    mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                    add_cc=functions.add(alpha.dtype, alpha.dtype),
                    polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                    modes=self._system.modes,
                    max_total_clicks=self._max_total_clicks,
                    ))
        except OutOfResourcesError:
            read_size //= 2
        else:
            break
    else:
        # Without the aggregation kernel ``products`` would never be written,
        # and everything computed after this point would be meaningless.
        raise ValueError(
            "Could not find suitable call parameters "
            "for the compound_click_probability_aggregate kernel")

    reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

    temp = plan.temp_array_like(reduction.parameter.output)

    plan.computation_call(reduction, temp, products)

    fft = FFT(temp)
    # Keep only the ``.x`` component of the FFT result.
    real_trf = Transformation(
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
        ],
        """ ${input.ctype} val = ${input.load_same}; ${output.store_same}(val.x); """)
    fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

    plan.computation_call(fft, output, temp, True)

    return plan
def get_procs(thr, N):
    """
    Create the compiled computations working on ``complex128`` arrays of shape ``N``.

    :param thr: object handed to ``FFTFactory.create`` and to each ``compile`` call.
    :param N: array size; it is also substituted into the first kernel,
        which zeroes every element whose index is greater than ``N/2``.
    :returns: ``(fft_unimod, mag_square, apply_mask, combine_mag_phi)``:
        the FFT followed by the zeroing/``polar_unit`` transformation,
        the squared-magnitude kernel, the complex-by-``double`` mask multiplication,
        and the kernel combining a square-rooted squared magnitude with a phase.
    """
    complex_n = (np.complex128, N)

    def c_param(name, role):
        # A fresh ``Type`` is created for every parameter, as each one is annotated separately.
        return Parameter(name, Annotation(Type(*complex_n), role))

    fft = FFTFactory.create(thr, (N, ), compile_=False)

    unimod_trans = Transformation(
        [c_param('output', 'o'), c_param('input', 'i')],
        """ VSIZE_T idx = ${idxs[0]}; ${input.ctype} val = ${input.load_same}; if (idx>${N}/2){ val.x = 0.0; val.y = 0.0; ${output.store_same}(val); }else ${output.store_same}(${polar_unit}(atan2(val.y, val.x))); """,
        render_kwds={
            'polar_unit': functions.polar_unit(dtype=np.float64),
            'N': N,
        })
    fft.parameter.output.connect(unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)

    mag_square_comp = PureParallel(
        [c_param('output', 'o'), c_param('input', 'i')],
        ''' VSIZE_T idx = ${idxs[0]}; ${input.ctype} val = ${input.load_idx}(idx); val.x = val.x*val.x + val.y*val.y; val.y = 0; ${output.store_idx}(idx, val); ''')
    mag_square = mag_square_comp.compile(thr)

    mask_param = Parameter('mask', Annotation(Type(np.double, N), 'i'))
    apply_mask_comp = PureParallel(
        [c_param('output', 'o'), c_param('origin', 'i'), mask_param],
        ''' VSIZE_T idx = ${idxs[0]}; ${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx))); ''',
        render_kwds={'mul': functions.mul(np.complex128, np.double)})
    apply_mask = apply_mask_comp.compile(thr)

    # The kernel source contains a line break right after ``?``; it is reproduced unchanged.
    combine_code = (
        ''' VSIZE_T idx = ${idxs[0]}; double r = ${mag_square.load_idx}(idx).x; r = r<0.0 ? \n'''
        '''0.0 : ${pow}(r, 0.5); double2 v = ${phase.load_idx}(idx); double angle = atan2(v.y, v.x); ${output.store_idx}(idx, ${polar}(r, angle)); '''
    )
    combine_comp = PureParallel(
        [c_param('output', 'o'), c_param('mag_square', 'i'), c_param('phase', 'i')],
        combine_code,
        render_kwds={
            'pow': functions.pow(np.double),
            'polar': functions.polar(np.double),
        })
    combine_mag_phi = combine_comp.compile(thr)

    return fft_unimod, mag_square, apply_mask, combine_mag_phi