def _build_plan(self, plan_factory, device_params, alpha, beta, seed):
    """
    Builds a plan consisting of a single ``generate`` kernel rendered from
    the ``generate_input_state`` template definition.

    The kernel receives ``alpha``, ``beta``, persistent copies of the
    system's ``squeezing`` and ``decoherence`` arrays, and ``seed``;
    its global size is ``alpha.shape``.
    ``device_params`` is accepted but not used here.
    """
    plan = plan_factory()

    rng_bijection = philox(64, 2)
    # The key generator always gets a zero seed, so the rendered kernel
    # stays the same and can be cached. The actual seed is supplied
    # as a computation parameter instead.
    rng_keygen = KeyGenerator.create(rng_bijection, seed=numpy.int32(0))
    normal_sampler = normal_bm(rng_bijection, numpy.float64)

    squeezing_dev = plan.persistent_array(self._system.squeezing)
    decoherence_dev = plan.persistent_array(self._system.decoherence)

    template_def = TEMPLATE.get_def("generate_input_state")
    kernel_args = [alpha, beta, squeezing_dev, decoherence_dev, seed]
    render_kwds = {
        'system': self._system,
        'representation': self._representation,
        'Representation': Representation,
        'bijection': rng_bijection,
        'keygen': rng_keygen,
        'sampler': normal_sampler,
        'ordering': ordering,
        'exp': functions.exp(numpy.float64),
        'mul_cr': functions.mul(numpy.complex128, numpy.float64),
        'add_cc': functions.add(numpy.complex128, numpy.complex128),
    }

    plan.kernel_call(
        template_def,
        kernel_args,
        kernel_name="generate",
        global_size=alpha.shape,
        render_kwds=render_kwds)

    return plan
def add_param(arr_t, param_dtype):
    """
    Creates a transformation that adds a scalar given at call time
    (1 output, 1 input, 1 scalar): ``output = input + param``.

    The sum is computed with ``functions.add`` for the operand types
    ``arr_t.dtype`` and ``param_dtype``, with the result type ``arr_t.dtype``.
    """
    signature = [
        Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
        Parameter('param', Annotation(param_dtype)),
    ]
    adder = functions.add(arr_t.dtype, param_dtype, out_dtype=arr_t.dtype)
    snippet = "${output.store_same}(${add}(${input.load_same}, ${param}));"
    return Transformation(signature, snippet, render_kwds={'add': adder})
def add_const(arr_t, param):
    """
    Creates a transformation that adds a value fixed at creation time
    (1 output, 1 input): ``output = input + param``.

    The type of ``param`` is obtained with ``dtypes.detect_type``, and the
    value itself is rendered into the code via ``dtypes.c_constant``.
    """
    const_dtype = dtypes.detect_type(param)
    signature = [
        Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
    ]
    adder = functions.add(arr_t.dtype, const_dtype, out_dtype=arr_t.dtype)
    const_literal = dtypes.c_constant(param, dtype=const_dtype)
    snippet = "${output.store_same}(${add}(${input.load_same}, ${param}));"
    return Transformation(
        signature,
        snippet,
        render_kwds={'add': adder, 'param': const_literal})
def test_multiarg_add(thr, out_code, in_codes):
    """
    Checks multi-argument add() with a variety of data types.

    The data types are produced by ``generate_dtypes(out_code, in_codes)``;
    the module created by ``functions.add`` is compared against a
    ``sum()``-based reference by ``check_func``.
    """
    out_dtype, in_dtypes = generate_dtypes(out_code, in_codes)

    def reference_add(*args):
        # Mirror the truncation of the imaginary part
        # when the output type is not complex.
        res = sum(args)
        if not dtypes.is_complex(out_dtype) and dtypes.is_complex(res.dtype):
            res = res.real
        return res.astype(out_dtype)

    # ``ComplexWarning`` is not available in the top-level ``numpy`` namespace
    # in NumPy 2.0 and lives in ``numpy.exceptions`` instead;
    # older NumPy versions do not have ``numpy.exceptions`` at all.
    try:
        complex_warning = numpy.exceptions.ComplexWarning
    except AttributeError:
        complex_warning = numpy.ComplexWarning

    # Temporarily catching imaginary part truncation warnings
    with catch_warnings():
        filterwarnings("ignore", "", complex_warning)
        add = functions.add(*in_dtypes, out_dtype=out_dtype)

    check_func(thr, add, reference_add, out_dtype, in_dtypes)
def _build_plan(self, plan_factory, device_params, alpha, beta, alpha_i, beta_i, seed):
    """
    Builds a plan that multiplies the input state by the system's unitary
    (``MatrixMul`` with ``transposed_b=True``) and, if required, adds noise.

    A noise matrix is used only when the representation is not
    ``Representation.POSITIVE_P`` and ``system.needs_noise_matrix()`` is true.
    Without it, ``alpha`` and ``beta`` are obtained from ``alpha_i`` and
    ``beta_i`` by two separate multiplications. With it, only the ``alpha``
    product is computed; the ``add_noise`` kernel then writes both ``alpha``
    and ``beta`` from that product and the generated noise.
    ``device_params`` is accepted but not used here.
    """
    plan = plan_factory()

    system = self._system
    representation = self._representation

    unitary_dev = plan.persistent_array(self._system.unitary)

    # The short-circuit matters: `needs_noise_matrix()` is only consulted
    # for representations other than positive-P.
    with_noise = (
        representation != Representation.POSITIVE_P
        and system.needs_noise_matrix())

    mmul_alpha = MatrixMul(alpha, unitary_dev, transposed_b=True)

    if not with_noise:
        # TODO: this could be sped up for repr != POSITIVE_P,
        # since in that case alpha == conj(beta), and we don't need to do
        # two multiplications.
        mmul_beta = MatrixMul(beta, unitary_dev, transposed_b=True)

        # The second matrix of the beta product is routed through
        # the transformation returned by `_make_trf_conj()`.
        conj_trf = self._make_trf_conj()
        mmul_beta.parameter.matrix_b.connect(
            conj_trf, conj_trf.output, matrix_b_p=conj_trf.input)

        plan.computation_call(mmul_alpha, alpha, alpha_i, unitary_dev)
        plan.computation_call(mmul_beta, beta, beta_i, unitary_dev)
        return plan

    noise_matrix_host = system.noise_matrix()
    noise_matrix_dev = plan.persistent_array(noise_matrix_host)

    # If we're here, it's not positive-P, and alpha == conj(beta).
    # This means we can just calculate alpha, and then build beta from it.
    random_values = plan.temp_array_like(alpha)
    alpha_product = plan.temp_array_like(alpha)

    plan.computation_call(mmul_alpha, alpha_product, alpha_i, unitary_dev)

    rng_bijection = philox(64, 2)
    # The key generator always gets a zero seed, so the rendered kernel
    # stays the same and can be cached. The actual seed is supplied
    # as a computation parameter instead.
    rng_keygen = KeyGenerator.create(rng_bijection, seed=numpy.int32(0))
    normal_sampler = normal_bm(rng_bijection, numpy.float64)

    plan.kernel_call(
        TEMPLATE.get_def("generate_apply_matrix_noise"),
        [random_values, seed],
        kernel_name="generate_apply_matrix_noise",
        global_size=alpha.shape,
        render_kwds={
            'bijection': rng_bijection,
            'keygen': rng_keygen,
            'sampler': normal_sampler,
            'mul_cr': functions.mul(numpy.complex128, numpy.float64),
            'add_cc': functions.add(numpy.complex128, numpy.complex128),
        })

    # The same multiplication computation is reused
    # to apply the noise matrix to the generated values.
    noise_dev = plan.temp_array_like(alpha)
    plan.computation_call(mmul_alpha, noise_dev, random_values, noise_matrix_dev)

    plan.kernel_call(
        TEMPLATE.get_def("add_noise"),
        [alpha, beta, alpha_product, noise_dev],
        kernel_name="add_noise",
        global_size=alpha.shape,
        render_kwds={
            'add': functions.add(numpy.complex128, numpy.complex128),
            'conj': functions.conj(numpy.complex128),
        })

    return plan
def transformed_add(perf_params):
    """
    Returns the ``functions.add`` module for two operands whose types are
    both given by ``transformed_dtype()``.

    ``perf_params`` is accepted but not used.
    """
    first_dtype = transformed_dtype()
    second_dtype = transformed_dtype()
    return functions.add(first_dtype, second_dtype)
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Builds the plan for the compound click probability computation.

    The plan consists of the following steps:

    1. ``compound_click_probability_prepare`` fills a temporary array
       shaped like ``alpha`` from ``alpha`` and ``beta``.
    2. ``compound_click_probability_aggregate`` turns it into an array of
       shape ``(samples, max_total_clicks + 1)``.
    3. ``Reduce`` with ``predicate_sum`` sums that array over axis 0.
    4. ``FFT`` is applied to the reduction result, and ``output`` receives
       the ``x`` component of every element (the real part, assuming the
       usual layout of complex values).

    The aggregate kernel is first attempted with the largest allowed
    local size; every time ``plan.kernel_call`` raises
    ``OutOfResourcesError`` the size is halved and the call is retried.

    :raises OutOfResourcesError: if the aggregate kernel was attempted
        and none of the attempted local sizes could be used.
    """
    plan = plan_factory()

    samples, modes = alpha.shape
    output_size = self._max_total_clicks + 1

    for_reduction = Type(alpha.dtype, (samples, output_size))

    prepared_state = plan.temp_array_like(alpha)

    plan.kernel_call(
        TEMPLATE.get_def("compound_click_probability_prepare"),
        [prepared_state, alpha, beta],
        kernel_name="compound_click_probability_prepare",
        global_size=alpha.shape,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
            ))

    # Block size is limited by the amount of available local memory.
    # In some OpenCL implementations the number reported cannot actually be
    # fully used (because it's used by kernel arguments),
    # so we're padding it a little.
    local_mem_size = device_params.local_mem_size
    max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
    block_size = 2**helpers.log2(max_elems)

    # No reason to have block size larger than the number of modes
    block_size = min(block_size, helpers.bounding_power_of_2(modes))

    products_gsize = (
        samples,
        helpers.min_blocks(output_size, block_size) * block_size)
    products = plan.temp_array_like(for_reduction)

    # None of these depend on the local size being attempted,
    # so they are prepared once, outside of the retry loop.
    aggregate_def = TEMPLATE.get_def("compound_click_probability_aggregate")
    aggregate_kwds = dict(
        block_size=block_size,
        full_steps=modes // block_size,
        remainder_size=modes % block_size,
        output_size=output_size,
        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
        add_cc=functions.add(alpha.dtype, alpha.dtype),
        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
        modes=self._system.modes,
        max_total_clicks=self._max_total_clicks,
        )

    read_size = min(block_size, device_params.max_work_group_size)
    last_error = None
    while read_size > 1:
        try:
            plan.kernel_call(
                aggregate_def,
                [products, prepared_state],
                kernel_name="compound_click_probability_aggregate",
                global_size=products_gsize,
                local_size=(1, read_size,),
                render_kwds=dict(aggregate_kwds, read_size=read_size))
        except OutOfResourcesError as exc:
            # This local size cannot be used; try again with half of it.
            last_error = exc
            read_size //= 2
        else:
            # The kernel was added to the plan, no more attempts are needed.
            last_error = None
            break

    # Without the aggregate kernel `products` would never be filled,
    # so a failure of every attempted size must not pass silently.
    if last_error is not None:
        raise last_error

    reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

    temp = plan.temp_array_like(reduction.parameter.output)

    plan.computation_call(reduction, temp, products)

    fft = FFT(temp)
    # Output transformation storing only the `x` component of each value.
    real_trf = Transformation([
        Parameter('output', Annotation(output, 'o')),
        Parameter('input', Annotation(temp, 'i')),
        ],
        """
        ${input.ctype} val = ${input.load_same};
        ${output.store_same}(val.x);
        """)
    fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

    # NOTE(review): the last argument is presumably the `inverse` flag
    # of the FFT computation - confirm against the FFT signature.
    plan.computation_call(fft, output, temp, True)

    return plan