def test_summation(thr):
    """
    Time a sum-reduction of an ``int64`` test array created with
    ``get_test_array(perf_size, dtype)``, where ``perf_size = 2**22``.

    The device result is checked against ``a.sum()`` computed by numpy.

    :returns: a tuple of the shortest of the 10 measured run times
        and ``perf_size * dtype.itemsize``.
    """
    attempts = 10
    perf_size = 2**22
    dtype = dtypes.normalize_type(numpy.int64)

    host_data = get_test_array(perf_size, dtype)
    device_data = thr.to_device(host_data)

    reduction = Reduce(host_data, predicate_sum(dtype))
    device_result = thr.empty_like(reduction.parameter.output)
    expected = numpy.array([host_data.sum()], dtype)
    compiled = reduction.compile(thr)

    def timed_run():
        # The clock is stopped only after ``thr.synchronize()`` returns.
        started = time.time()
        compiled(device_result, device_data)
        thr.synchronize()
        return time.time() - started

    durations = [timed_run() for _ in range(attempts)]

    assert diff_is_negligible(device_result.get(), expected)

    return min(durations), perf_size * dtype.itemsize
def test_summation(thr):
    """
    Performance test: run a compiled sum ``Reduce`` 10 times over
    an ``int64`` test array and verify the result against numpy.

    :returns: ``(best_time, perf_size * dtype.itemsize)``, where ``best_time``
        is the smallest wall-clock time of a single run (including
        the following ``thr.synchronize()`` call).
    """
    dtype = dtypes.normalize_type(numpy.int64)
    perf_size = 2 ** 22
    attempts = 10

    src = get_test_array(perf_size, dtype)
    src_dev = thr.to_device(src)

    summation = Reduce(src, predicate_sum(dtype))
    dest_dev = thr.empty_like(summation.parameter.output)
    reference = numpy.array([src.sum()], dtype)
    summation_c = summation.compile(thr)

    # Track the running minimum instead of collecting all the timings.
    best_time = None
    for _ in range(attempts):
        t_start = time.time()
        summation_c(dest_dev, src_dev)
        thr.synchronize()
        elapsed = time.time() - t_start
        if best_time is None or elapsed < best_time:
            best_time = elapsed

    assert diff_is_negligible(dest_dev.get(), reference)

    return best_time, perf_size * dtype.itemsize
def test_structure_type(thr):
    """
    Reduce a ``(100, 100)`` array of an aligned nested struct dtype over axis 0
    with a custom predicate that adds the structures field by field.

    After the comparison with the reference the test always ends with
    ``pytest.xfail()`` or ``pytest.fail()``, depending on whether the dtype
    returned by ``get()`` differs from the dtype of the device array.
    """
    shape = (100, 100)
    struct_dtype = dtypes.align(numpy.dtype([
        ('i1', numpy.uint32),
        ('nested', numpy.dtype([('v', numpy.uint64)])),
        ('i2', numpy.uint32),
    ]))

    data = get_test_array(shape, struct_dtype)
    data_dev = thr.to_device(data)

    # numpy cannot reduce arrays with struct dtypes,
    # so the reference result is assembled field by field.
    expected = numpy.empty(100, struct_dtype)
    expected['i1'] = data['i1'].sum(0)
    expected['nested']['v'] = data['nested']['v'].sum(0)
    expected['i2'] = data['i2'].sum(0)

    add_structs = Snippet.create(
        lambda v1, v2: (
            " ${ctype} result = ${v1};"
            " result.i1 += ${v2}.i1;"
            " result.nested.v += ${v2}.nested.v;"
            " result.i2 += ${v2}.i2;"
            " return result; "
        ),
        render_kwds=dict(ctype=dtypes.ctype_module(struct_dtype)))
    empty_struct = numpy.zeros(1, struct_dtype)[0]
    predicate = Predicate(add_structs, empty_struct)

    reduction = Reduce(data_dev, predicate, axes=(0,))
    result_dev = thr.empty_like(reduction.parameter.output)

    compiled = reduction.compile(thr)
    compiled(result_dev, data_dev)
    result = result_dev.get()

    # Array.get() runs numpy.lib.stride_tricks.as_strided() on the array,
    # which adds dummy variables instead of custom offsets (and our dtype has them),
    # so the dtype of the result differs and a direct comparison would fail.
    # For now the difference is only recorded and the result is converted
    # back to the original dtype (the two are still compatible).
    # Once the behavior changes, this test starts to fail and we will notice.
    # See inducer/compyte issue #26.
    dtype_corrupted = result.dtype != result_dev.dtype
    result = result.astype(struct_dtype)

    assert diff_is_negligible(result, expected)

    if not dtype_corrupted:
        pytest.fail(
            "as_strided() does not corrupt the datatype anymore, we can remove the `astype()` now")
    pytest.xfail("as_strided() still corrupts the datatype")
def test_normal(thr, shape, axis):
    """
    Check a sum ``Reduce`` of an ``int64`` test array of the given ``shape``
    against ``a.sum(axis)``.

    :param axis: the axis to reduce over; ``None`` is forwarded to ``Reduce``
        as ``axes=None``.
    """
    if axis is None:
        axes = None
    else:
        axes = (axis,)

    data = get_test_array(shape, numpy.int64)
    data_dev = thr.to_device(data)

    reduction = Reduce(data, predicate_sum(numpy.int64), axes=axes)
    result_dev = thr.empty_like(reduction.parameter.output)
    expected = data.sum(axis)

    compiled = reduction.compile(thr)
    compiled(result_dev, data_dev)

    assert diff_is_negligible(result_dev.get(), expected)
def test_nonsequential_axes(thr):
    """
    Check a sum ``Reduce`` over the non-adjacent axes 0 and 2
    of a ``(50, 40, 30, 20)`` ``int64`` test array.
    """
    shape = (50, 40, 30, 20)
    data = get_test_array(shape, numpy.int64)
    data_dev = thr.to_device(data)

    # After axis 0 is summed out, the initial axis 2 becomes axis 1.
    without_axis0 = data.sum(0)
    expected = without_axis0.sum(1)

    reduction = Reduce(data_dev, predicate_sum(numpy.int64), axes=(0, 2))
    result_dev = thr.empty_like(reduction.parameter.output)

    compiled = reduction.compile(thr)
    compiled(result_dev, data_dev)

    assert diff_is_negligible(result_dev.get(), expected)
def test_nonsequential_axes(thr):
    """
    Reduce a 4D ``int64`` test array over ``axes=(0, 2)`` with ``predicate_sum``
    and compare the result with two consecutive numpy sums.
    """
    src = get_test_array((50, 40, 30, 20), numpy.int64)
    src_dev = thr.to_device(src)

    # Sum over axes 0 and 2 of the initial array
    # (the latter is axis 1 once axis 0 is gone).
    reference = src.sum(axis=0).sum(axis=1)

    summation = Reduce(src_dev, predicate_sum(numpy.int64), axes=(0, 2))
    dest_dev = thr.empty_like(summation.parameter.output)

    summation.compile(thr)(dest_dev, src_dev)

    assert diff_is_negligible(dest_dev.get(), reference)
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Build a plan with a single sum ``Reduce`` over axis 0.

    The input of the reduction is produced by a transformation that,
    for each pair of ``alpha`` and ``beta`` elements, takes ``t = mul(alpha, beta)``,
    ``np = exp(-t)`` and stores the real part of ``1 - np`` as ``float64``.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    for_reduction = Type(numpy.float64, alpha.shape)

    trf_parameters = [
        Parameter('output', Annotation(for_reduction, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = (
        " ${alpha.ctype} alpha = ${alpha.load_same};"
        " ${beta.ctype} beta = ${beta.load_same};"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));"
        " ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);"
        " ${output.store_same}(cp.x); "
    )
    trf_render_kwds = dict(
        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
        exp_c=functions.exp(alpha.dtype),
    )
    meter_trf = Transformation(trf_parameters, trf_code, render_kwds=trf_render_kwds)

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    # After the connection the reduction takes ``alpha`` and ``beta``
    # (exposed as ``alpha_p`` and ``beta_p``) instead of its original input.
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(self, plan_factory, device_params, output, matrix, vector):
    """
    Build a plan with a single sum ``Reduce`` over the last axis of ``matrix``.

    The input of the reduction is produced by a transformation that multiplies
    every element of ``matrix`` by the element of ``vector`` selected by
    the last index (``idxs[-1]``).

    :returns: the assembled plan.
    """
    plan = plan_factory()

    last_axis = len(matrix.shape) - 1
    summation = Reduce(matrix, predicate_sum(matrix.dtype), axes=(last_axis,))

    trf_parameters = [
        Parameter('output', Annotation(matrix, 'o')),
        Parameter('matrix', Annotation(matrix, 'i')),
        Parameter('vector', Annotation(vector, 'i')),
    ]
    trf_code = (
        " ${output.store_same}(${mul}(${matrix.load_same},"
        " ${vector.load_idx}(${idxs[-1]}))); "
    )
    mul_vec = Transformation(
        trf_parameters, trf_code,
        render_kwds=dict(mul=functions.mul(matrix.dtype, vector.dtype)),
        connectors=['output', 'matrix'])

    summation.parameter.input.connect(
        mul_vec, mul_vec.output,
        matrix=mul_vec.matrix, vector=mul_vec.vector)

    plan.computation_call(summation, output, matrix, vector)
    return plan
def test_normal(thr, shape, axis):
    """
    Compare the output of a compiled sum ``Reduce`` with ``a.sum(axis)``
    for an ``int64`` test array of the given ``shape``.

    ``axis`` may be ``None``, in which case ``axes=None`` is passed to ``Reduce``.
    """
    src = get_test_array(shape, numpy.int64)
    src_dev = thr.to_device(src)

    reduce_axes = None if axis is None else (axis,)
    summation = Reduce(src, predicate_sum(numpy.int64), axes=reduce_axes)

    dest_dev = thr.empty_like(summation.parameter.output)
    reference = src.sum(axis)

    summation_c = summation.compile(thr)
    summation_c(dest_dev, src_dev)

    assert diff_is_negligible(dest_dev.get(), reference)
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Build a plan with a single sum ``Reduce`` over axis 0.

    The input of the reduction is produced by a transformation that stores,
    as ``float64``, the real part of ``mul(alpha, beta)`` minus the value
    of ``ordering(self._representation)``.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    for_reduction = Type(numpy.float64, alpha.shape)

    meter_trf = Transformation(
        [
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
        ],
        (
            " ${alpha.ctype} alpha = ${alpha.load_same};"
            " ${beta.ctype} beta = ${beta.load_same};"
            " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
            " ${output.store_same}(t.x - ${ordering}); "
        ),
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            ordering=ordering(self._representation),
        ))

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    reduction_input = reduction.parameter.input
    reduction_input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """
    Build the plan out of three computation calls:

    1. a sum ``Reduce`` over the last axis of ``input_`` multiplied by ``coeffs2``;
    2. an ``RFFT`` (created with ``dont_store_last=True``) of ``input_``
       multiplied by ``coeffs1``;
    3. an inclusive sum ``Scan`` over the last axis, whose input is prepared from
       the results of the previous two steps by ``get_prepare_prfft_scan()``.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    N = input_.shape[-1] * 2

    coeffs1 = 4 * numpy.sin(2 * numpy.pi * numpy.arange(N // 2) / N)
    coeffs2 = 2 * numpy.cos(2 * numpy.pi * numpy.arange(N // 2) / N)

    c1_arr = plan.persistent_array(coeffs1)
    c2_arr = plan.persistent_array(coeffs2)

    # The same multiplication transformation is attached to the inputs
    # of both the reduction and the RFFT below.
    multiply = get_multiply(input_)

    # re_X_0 = sum(x * coeffs2)

    t = plan.temp_array_like(input_)
    rd = Reduce(t, predicate_sum(input_.dtype), axes=(len(input_.shape) - 1, ))

    rd.parameter.input.connect(multiply, multiply.output, x=multiply.a, c2=multiply.b)

    re_X_0 = plan.temp_array_like(rd.parameter.output)
    plan.computation_call(rd, re_X_0, input_, c2_arr)

    # Y = numpy.fft.rfft(x * coeffs1)

    rfft = RFFT(input_, dont_store_last=True)
    rfft.parameter.input.connect(multiply, multiply.output, x=multiply.a, c1=multiply.b)

    Y = plan.temp_array_like(rfft.parameter.output)
    plan.computation_call(rfft, Y, input_, c1_arr)

    # Y *= -1j
    # Y[0] /= 2
    # Y[0] += re_X_0
    # res = numpy.cumsum(Y[:-1])

    prepare_prfft_scan = get_prepare_prfft_scan(Y)

    sc = Scan(Y, predicate_sum(Y.dtype), axes=(-1, ), exclusive=False)
    sc.parameter.input.connect(
        prepare_prfft_scan, prepare_prfft_scan.output,
        Y=prepare_prfft_scan.Y, re_X_0=prepare_prfft_scan.re_X_0)

    plan.computation_call(sc, output, Y, re_X_0)

    return plan
def test_nondefault_function(thr):
    """
    Check ``Reduce`` over axis 0 with a hand-written addition predicate
    (instead of ``predicate_sum``) against ``a.sum(0)``.
    """
    shape = (100, 100)
    data = get_test_array(shape, numpy.int64)
    data_dev = thr.to_device(data)
    expected = data.sum(0)

    addition = Snippet.create(lambda v1, v2: "return ${v1} + ${v2};")
    # The second argument of ``Predicate`` is 0.
    predicate = Predicate(addition, 0)

    reduction = Reduce(data_dev, predicate, axes=(0,))
    result_dev = thr.empty_like(reduction.parameter.output)

    compiled = reduction.compile(thr)
    compiled(result_dev, data_dev)

    assert diff_is_negligible(result_dev.get(), expected)
def test_nondefault_function(thr):
    """
    Reduce a ``(100, 100)`` ``int64`` test array over axis 0 using a custom
    ``Predicate`` built from a two-argument snippet, and compare with numpy.
    """
    src = get_test_array((100, 100), numpy.int64)
    src_dev = thr.to_device(src)
    reference = src.sum(0)

    custom_sum = Predicate(
        Snippet.create(lambda v1, v2: "return ${v1} + ${v2};"),
        0)

    summation = Reduce(src_dev, custom_sum, axes=(0,))
    dest_dev = thr.empty_like(summation.parameter.output)

    summation.compile(thr)(dest_dev, src_dev)

    assert diff_is_negligible(dest_dev.get(), reference)
def test_structure_type(thr):
    """
    Reduce a ``(100, 100)`` array of an aligned nested struct dtype over axis 0
    with a custom predicate that adds the structures field by field,
    and compare with a reference assembled from per-field numpy sums.
    """
    nested_dtype = numpy.dtype([('v', numpy.uint64)])
    struct_dtype = dtypes.align(numpy.dtype([
        ('i1', numpy.uint32),
        ('nested', nested_dtype),
        ('i2', numpy.uint32),
    ]))

    shape = (100, 100)
    data = get_test_array(shape, struct_dtype)
    data_dev = thr.to_device(data)

    # numpy cannot reduce arrays with struct dtypes,
    # so the reference result is assembled field by field.
    expected = numpy.empty(100, struct_dtype)
    expected['i1'] = data['i1'].sum(0)
    expected['nested']['v'] = data['nested']['v'].sum(0)
    expected['i2'] = data['i2'].sum(0)

    add_structs = Snippet.create(
        lambda v1, v2: (
            " ${ctype} result = ${v1};"
            " result.i1 += ${v2}.i1;"
            " result.nested.v += ${v2}.nested.v;"
            " result.i2 += ${v2}.i2;"
            " return result; "
        ),
        render_kwds=dict(ctype=dtypes.ctype_module(struct_dtype)))
    predicate = Predicate(add_structs, numpy.zeros(1, struct_dtype)[0])

    reduction = Reduce(data_dev, predicate, axes=(0,))
    result_dev = thr.empty_like(reduction.parameter.output)

    compiled = reduction.compile(thr)
    compiled(result_dev, data_dev)
    result = result_dev.get()

    assert diff_is_negligible(result, expected)
def test_structure_type(thr):
    """
    Check that ``Reduce`` works for arrays with a struct dtype: the fields
    ``i1``, ``nested.v`` and ``i2`` are summed over axis 0 by a custom predicate.
    """
    dtype = dtypes.align(
        numpy.dtype([
            ('i1', numpy.uint32),
            ('nested', numpy.dtype([('v', numpy.uint64)])),
            ('i2', numpy.uint32),
        ]))

    src = get_test_array((100, 100), dtype)
    src_dev = thr.to_device(src)

    # The reference has to be constructed manually,
    # since numpy cannot reduce arrays with struct dtypes.
    reference = numpy.empty(100, dtype)
    reference['i1'] = src['i1'].sum(0)
    reference['nested']['v'] = src['nested']['v'].sum(0)
    reference['i2'] = src['i2'].sum(0)

    operation = Snippet.create(
        lambda v1, v2: (
            " ${ctype} result = ${v1};"
            " result.i1 += ${v2}.i1;"
            " result.nested.v += ${v2}.nested.v;"
            " result.i2 += ${v2}.i2;"
            " return result; "
        ),
        render_kwds=dict(ctype=dtypes.ctype_module(dtype)))
    zero_element = numpy.zeros(1, dtype)[0]

    summation = Reduce(src_dev, Predicate(operation, zero_element), axes=(0,))
    dest_dev = thr.empty_like(summation.parameter.output)

    summation.compile(thr)(dest_dev, src_dev)

    assert diff_is_negligible(dest_dev.get(), reference)
def _build_plan(self, plan_factory, device_params, output, input_):
    """
    Build the plan out of two computation calls:

    1. a sum ``Reduce`` over the last axis of the real part of ``input_``,
       stored in the temporary array ``x0``;
    2. an ``IRFFT`` whose input is prepared from ``input_`` by
       ``get_prepare_iprfft_input()`` and whose output is post-processed
       (using ``x0`` and the precomputed coefficients) by
       ``get_prepare_iprfft_output()``.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    N = input_.shape[-1] * 4

    # The first element is unused
    coeffs = numpy.concatenate(
        [[0], 1 / (4 * numpy.sin(2 * numpy.pi * numpy.arange(1, N // 2) / N))])
    coeffs_arr = plan.persistent_array(coeffs)

    prepare_iprfft_input = get_prepare_iprfft_input(input_)
    prepare_iprfft_output = get_prepare_iprfft_output(output)

    irfft = IRFFT(prepare_iprfft_input.Y)
    irfft.parameter.input.connect(
        prepare_iprfft_input, prepare_iprfft_input.Y,
        X=prepare_iprfft_input.X)
    irfft.parameter.output.connect(
        prepare_iprfft_output, prepare_iprfft_output.y,
        x=prepare_iprfft_output.x,
        x0=prepare_iprfft_output.x0,
        coeffs=prepare_iprfft_output.coeffs)

    # Takes the ``.x`` (real) component of every input element.
    real = Transformation(
        [
            Parameter(
                'output',
                Annotation(Type(dtypes.real_for(input_.dtype), input_.shape), 'o')),
            Parameter('input', Annotation(input_, 'i')),
        ],
        """ ${output.store_same}((${input.load_same}).x); """,
        connectors=['output'])

    rd_t = Type(output.dtype, input_.shape)
    rd = Reduce(rd_t, predicate_sum(rd_t.dtype), axes=(len(input_.shape) - 1, ))
    rd.parameter.input.connect(real, real.output, X=real.input)

    x0 = plan.temp_array_like(rd.parameter.output)

    plan.computation_call(rd, x0, input_)
    plan.computation_call(irfft, output, x0, coeffs_arr, input_)

    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Build a plan with a single sum ``Reduce`` over axis 0 of an intermediate
    ``float64`` array of shape ``(alpha.shape[0], self._max_click_order)``.

    The element ``(sample_idx, idxs[1])`` of that array is produced by
    a transformation which multiplies, over all ``self._system.modes`` modes ``i``,
    either ``np = exp(-mul(alpha, beta))`` (for ``i >= order``) or ``1 - np``
    (otherwise), where ``order = idxs[1] + 1``, and stores the real part
    of the product.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    samples = alpha.shape[0]
    for_reduction = Type(numpy.float64, (samples, self._max_click_order))

    trf_parameters = [
        Parameter('output', Annotation(for_reduction, 'o')),
        Parameter('alpha', Annotation(alpha, 'i')),
        Parameter('beta', Annotation(beta, 'i')),
    ]
    trf_code = (
        " VSIZE_T sample_idx = ${idxs[0]};"
        " VSIZE_T order = ${idxs[1]} + 1;"
        " ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);"
        " for (VSIZE_T i = 0; i < ${modes}; i++) {"
        " ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);"
        " ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));"
        " if (i >= order) {"
        " result = ${mul_cc}(result, np);"
        " } else {"
        " ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);"
        " result = ${mul_cc}(result, cp);"
        " }"
        " }"
        " ${output.store_same}(result.x); "
    )
    meter_trf = Transformation(
        trf_parameters, trf_code,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
            modes=self._system.modes,
        ))

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
def __init__(self, arr_t, order=2, axes=None):
    """
    Prepare the nested ``Reduce`` computation (kept in ``self._rd``)
    and declare the ``output`` and ``input`` parameters.

    A ``norm_const(arr_t, order)`` transformation is attached to the input
    of the sum reduction and a ``norm_const(res_t, 1. / order)`` transformation
    to its output.

    :param arr_t: array-like object describing the input.
    :param order: passed to ``norm_const()`` (and as ``1. / order``
        for the transformation of the result).
    :param axes: forwarded to ``Reduce``.
    """
    elementwise_trf = norm_const(arr_t, order)
    out_dtype = elementwise_trf.output.dtype

    reduction = Reduce(
        Type(out_dtype, arr_t.shape), predicate_sum(out_dtype), axes=axes)

    # The output annotation has to be taken before the transformation
    # is connected to the output of the reduction below.
    res_t = reduction.parameter.output
    result_trf = norm_const(res_t, 1. / order)

    reduction.parameter.input.connect(
        elementwise_trf, elementwise_trf.output,
        input_prime=elementwise_trf.input)
    reduction.parameter.output.connect(
        result_trf, result_trf.input,
        output_prime=result_trf.output)

    self._rd = reduction

    parameters = [
        Parameter('output', Annotation(res_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i')),
    ]
    Computation.__init__(self, parameters)
def _build_plan(
        self, plan_factory, device_params,
        ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):
    """
    Build the plan out of three computation calls:

    1. a full sum ``Reduce`` of ``noises_b`` divided by the number of its elements
       (via ``transformations.div_const``), stored in ``noises_b_mean``;
    2. ``MatrixMulVector`` of ``noises_a`` and ``out_key``, stored in ``b_term``;
    3. a ``PureParallel`` computation rendered from the ``make_lwe_keyswitch_key``
       template, which writes ``ks_a``, ``ks_b`` and ``ks_cv``.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    # The values are not used; the unpacking requires ``ks_a`` to have 4 dimensions.
    extracted_n, t, base, inner_n = ks_a.shape

    # Mean of ``noises_b``: a sum followed by the division by the total size.
    # The divided output is exposed as the ``mean`` parameter.
    mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
    total_size = numpy.prod(noises_b.shape)
    norm = transformations.div_const(mean.parameter.output, total_size)
    mean.parameter.output.connect(norm, norm.input, mean=norm.output)
    noises_b_mean = plan.temp_array_like(mean.parameter.mean)

    mul_key = MatrixMulVector(noises_a)
    b_term = plan.temp_array_like(mul_key.parameter.output)

    keyswitch_parameters = [
        Parameter('ks_a', Annotation(ks_a, 'o')),
        Parameter('ks_b', Annotation(ks_b, 'o')),
        Parameter('ks_cv', Annotation(ks_cv, 'o')),
        Parameter('in_key', Annotation(in_key, 'i')),
        Parameter('b_term', Annotation(b_term, 'i')),
        Parameter('noises_a', Annotation(noises_a, 'i')),
        Parameter('noises_b', Annotation(noises_b, 'i')),
        Parameter('noises_b_mean', Annotation(noises_b_mean, 'i')),
    ]
    keyswitch_snippet = Snippet(
        TEMPLATE.get_def("make_lwe_keyswitch_key"),
        render_kwds=dict(
            log2_base=self._log2_base,
            output_size=self._output_size,
            double_to_t32=double_to_t32_module,
            noise=self._noise))
    build_keyswitch = PureParallel(
        keyswitch_parameters, keyswitch_snippet, guiding_array="ks_b")

    plan.computation_call(mean, noises_b_mean, noises_b)
    plan.computation_call(mul_key, b_term, noises_a, out_key)
    plan.computation_call(
        build_keyswitch,
        ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b, noises_b_mean)

    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Build a plan with a single sum ``Reduce`` over axis 0 of an intermediate
    ``float64`` array of shape ``(alpha.shape[0], self._max_moment)``.

    The element ``(sample_idx, order)`` of that array is produced by
    a transformation which multiplies ``mul(alpha, beta)`` (with the value of
    ``ordering(self._representation)`` subtracted from its real part)
    over the modes ``0..order`` inclusive, and stores the real part of the product.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    samples = alpha.shape[0]
    for_reduction = Type(numpy.float64, (samples, self._max_moment))

    trf_code = (
        " VSIZE_T sample_idx = ${idxs[0]};"
        " VSIZE_T order = ${idxs[1]};"
        " ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);"
        " for (VSIZE_T i = 0; i <= order; i++) {"
        " ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);"
        " ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);"
        " ${alpha.ctype} t = ${mul_cc}(alpha, beta);"
        " t.x -= ${ordering};"
        " result = ${mul_cc}(result, t);"
        " }"
        " ${output.store_same}(result.x); "
    )
    meter_trf = Transformation(
        [
            Parameter('output', Annotation(for_reduction, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
        ],
        trf_code,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            ordering=ordering(self._representation),
        ))

    reduction = Reduce(for_reduction, predicate_sum(output.dtype), axes=(0,))
    reduction.parameter.input.connect(
        meter_trf, meter_trf.output,
        alpha_p=meter_trf.alpha, beta_p=meter_trf.beta)

    plan.computation_call(reduction, output, alpha, beta)
    return plan
# Test array arr = numpy.random.randint(0, 10**6, 20000) # A transformation that creates initial minmax structures for the given array of integers to_mmc = Transformation( [Parameter('output', Annotation(Type(mmc_dtype, arr.shape), 'o')), Parameter('input', Annotation(arr, 'i'))], """ ${output.ctype} res; res.cur_min = ${input.load_same}; res.cur_max = ${input.load_same}; ${output.store_same}(res); """) # Create the reduction computation and attach the transformation above to its input. reduction = Reduce(to_mmc.output, predicate) reduction.parameter.input.connect(to_mmc, to_mmc.output, new_input=to_mmc.input) creduction = reduction.compile(thr) # Run the computation arr_dev = thr.to_device(arr) res_dev = thr.empty_like(reduction.parameter.output) creduction(res_dev, arr_dev) minmax = res_dev.get() assert minmax["cur_min"] == arr.min() assert minmax["cur_max"] == arr.max()
# Test array arr = numpy.random.randint(0, 10**6, 20000) # A transformation that creates initial minmax structures for the given array of integers to_mmc = Transformation([ Parameter('output', Annotation(Type(mmc_dtype, arr.shape), 'o')), Parameter('input', Annotation(arr, 'i')) ], """ ${output.ctype} res; res.cur_min = ${input.load_same}; res.cur_max = ${input.load_same}; ${output.store_same}(res); """) # Create the reduction computation and attach the transformation above to its input. reduction = Reduce(to_mmc.output, predicate) reduction.parameter.input.connect(to_mmc, to_mmc.output, new_input=to_mmc.input) creduction = reduction.compile(thr) # Run the computation arr_dev = thr.to_device(arr) res_dev = thr.empty_like(reduction.parameter.output) creduction(res_dev, arr_dev) minmax = res_dev.get() assert minmax["cur_min"] == arr.min() assert minmax["cur_max"] == arr.max()
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Build the plan out of the following steps:

    1. the ``compound_click_probability_prepare`` kernel fills ``prepared_state``
       from ``alpha`` and ``beta``;
    2. the ``compound_click_probability_aggregate`` kernel fills ``products``
       (shape ``(samples, self._max_total_clicks + 1)``) from ``prepared_state``;
    3. a sum ``Reduce`` over axis 0 of ``products``;
    4. an ``FFT`` of the reduced array (called with ``True`` as the last
       argument), with the real part of its result stored in ``output``.

    :returns: the assembled plan.
    """
    plan = plan_factory()

    samples, modes = alpha.shape

    for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

    prepared_state = plan.temp_array_like(alpha)

    plan.kernel_call(
        TEMPLATE.get_def("compound_click_probability_prepare"),
        [prepared_state, alpha, beta],
        kernel_name="compound_click_probability_prepare",
        global_size=alpha.shape,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
            ))

    # Block size is limited by the amount of available local memory.
    # In some OpenCL implementations the number reported cannot actually be fully used
    # (because it's used by kernel arguments), so we're padding it a little.
    local_mem_size = device_params.local_mem_size
    max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
    block_size = 2**helpers.log2(max_elems)

    # No reason to have block size larger than the number of modes
    block_size = min(block_size, helpers.bounding_power_of_2(modes))

    products_gsize = (
        samples,
        helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
    products = plan.temp_array_like(for_reduction)

    # These do not depend on ``read_size``, so they are computed once.
    full_steps = modes // block_size
    remainder_size = modes % block_size

    read_size = min(block_size, device_params.max_work_group_size)

    # Try progressively smaller work groups until the kernel fits the device.
    # NOTE(review): if no attempt succeeds (or ``read_size`` starts at 1),
    # the aggregation kernel is not added to the plan at all and ``products``
    # stays unfilled - same as before; confirm whether this should raise.
    while read_size > 1:
        try:
            plan.kernel_call(
                TEMPLATE.get_def("compound_click_probability_aggregate"),
                [products, prepared_state],
                kernel_name="compound_click_probability_aggregate",
                global_size=products_gsize,
                local_size=(1, read_size,),
                render_kwds=dict(
                    block_size=block_size,
                    read_size=read_size,
                    full_steps=full_steps,
                    remainder_size=remainder_size,
                    output_size=self._max_total_clicks + 1,
                    mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                    add_cc=functions.add(alpha.dtype, alpha.dtype),
                    polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                    modes=self._system.modes,
                    max_total_clicks=self._max_total_clicks,
                    ))
        except OutOfResourcesError:
            # Retry with a halved work group instead of giving up.
            read_size //= 2
            continue

        break

    reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

    temp = plan.temp_array_like(reduction.parameter.output)

    plan.computation_call(reduction, temp, products)

    fft = FFT(temp)
    # Stores only the ``.x`` (real) component of the FFT result.
    real_trf = Transformation([
        Parameter('output', Annotation(output, 'o')),
        Parameter('input', Annotation(temp, 'i')),
        ],
        """ ${input.ctype} val = ${input.load_same}; ${output.store_same}(val.x); """)
    fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

    plan.computation_call(fft, output, temp, True)

    return plan
def solve(nufft, gy, solver=None, maxiter=30, *args, **kwargs):
    """
    The solve function of NUFFT_hsa.

    Dispatches on ``solver``:

    - ``'L1TVLAD'`` and ``'L1TVOLS'`` are forwarded to the functions
      of the same names;
    - ``'cg'`` runs ``maxiter`` conjugate gradient iterations on the device;
    - ``'dc'`` (density compensation) is not available in this version.

    Any other value of ``solver`` (including the default ``None``)
    matches no branch, and ``None`` is returned.

    :param nufft: NUFFT_hsa object
    :param gy: (M,) or (M, batch) array, non-uniform data.
        If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
    :type gy: numpy.complex64 reikna array
    :param solver: name of the solver (see above).
    :param maxiter: number of 'cg' iterations; forwarded as ``maxiter``
        to the 'L1TVLAD' and 'L1TVOLS' solvers.
    :return: x: Nd or Nd + (batch, ) image.
        L1TVOLS always returns Nd.
        'cg' returns Nd + (batch, ) in batch mode.
    :rtype: x: reikna array, complex64.
    :raises NotImplementedError: if ``solver`` is ``'dc'``.
    """
    if solver == 'L1TVLAD':
        return L1TVLAD(nufft, gy, maxiter=maxiter, *args, **kwargs)

    elif solver == 'L1TVOLS':
        return L1TVOLS(nufft, gy, maxiter=maxiter, *args, **kwargs)

    elif solver == 'dc':
        # Density compensation method.
        # The computation of the result was disabled in this version, and
        # the branch used to fail with an UnboundLocalError on ``return x2``.
        # Fail with an explicit error instead.
        print(solver, ":density compensation method. I won't recommend it as the GPU version is not needed! Try the CPU version")
        raise NotImplementedError(
            "The 'dc' solver is not implemented in the device version; "
            "use the CPU version instead")

    elif solver == 'cg':
        from reikna.algorithms import Reduce, Predicate, predicate_sum

        # Define the reduction (summation) kernel on the device.
        # NOTE(review): `dtype` is not defined inside this function;
        # presumably it is a module-level name - confirm.
        nufft.reduce_sum = Reduce(
            numpy.zeros(nufft.Kd, dtype=nufft.dtype),
            predicate_sum(dtype)).compile(nufft.thr)

        # All the elementwise kernels below run over the same number of elements.
        k_size = int(nufft.batch * nufft.Kdprod)

        # update: b = spH * gy
        b = nufft._y2k_device(gy)

        # Initialize x = b
        x = nufft.thr.copy_array(b)

        # NOTE(review): the results of the next two reductions are overwritten
        # before being used (they look like leftovers of debug output);
        # kept to preserve the sequence of device calls.
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
        nufft.reduce_sum(rsold, x)

        # initialize r = b - A * x
        r = nufft.thr.empty_like(b)

        y_tmp = nufft._k2y_device(x)
        Ax = nufft._y2k_device(y_tmp)
        del y_tmp

        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
        nufft.reduce_sum(rsold, Ax)

        nufft.prg.cAddVec(b, - Ax, r, local_size=None, global_size=k_size)

        # p = r
        p = nufft.thr.copy_array(r)

        # rsold = r' * r
        tmp_array = nufft.thr.empty_like(r)
        nufft.prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=k_size)
        rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
        nufft.reduce_sum(rsold, tmp_array)

        rsnew = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
        tmp_sum = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)

        for _ in range(maxiter):

            tmp_p = nufft._k2y_device(p)
            Ap = nufft._y2k_device(tmp_p)
            del tmp_p

            # alpha = rs_old/(p'*Ap)
            nufft.prg.cMultiplyConjVec(p, Ap, tmp_array, local_size=None, global_size=k_size)
            nufft.reduce_sum(tmp_sum, tmp_array)
            alpha = rsold / tmp_sum
            # Transfer the scalar to the host once; it is used twice below.
            alpha_host = alpha.get()

            # x = x + alpha*p
            p2 = nufft.thr.copy_array(p)
            nufft.prg.cMultiplyScalar(alpha_host, p2, local_size=None, global_size=k_size)
            x += p2

            # r = r - alpha * Ap
            p2 = nufft.thr.copy_array(Ap)
            nufft.prg.cMultiplyScalar(alpha_host, p2, local_size=None, global_size=k_size)
            r -= p2

            # rs_new = r'*r
            nufft.prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=k_size)
            nufft.reduce_sum(rsnew, tmp_array)

            # p = r + (rs_new/rs_old)*p
            beta = rsnew / rsold
            p2 = nufft.thr.copy_array(p)
            nufft.prg.cMultiplyScalar(beta.get(), p2, local_size=None, global_size=k_size)
            nufft.prg.cAddVec(r, p2, p, local_size=None, global_size=k_size)
            p = r + p2

            rsold = nufft.thr.copy_array(rsnew)
        # end of iteration

        # inverse FFT: x (the solved k space) -> image
        x2 = nufft._k2xx_device(x)

        # Rescale: divide by the scaling array if it is available,
        # otherwise fall back to the tensor form of the scaling factor.
        try:
            x2 /= nufft.volume['SnGPUArray']
        except Exception:
            nufft.prg.cTensorMultiply(
                numpy.uint32(nufft.batch),
                numpy.uint32(nufft.tSN['Tdims']),
                nufft.tSN['Td'],
                nufft.tSN['Td_elements'],
                nufft.tSN['invTd_elements'],
                nufft.tSN['tensor_sn'],
                x2,
                numpy.uint32(1),  # division, 1 is true
                local_size=None, global_size=int(nufft.batch * nufft.Ndprod))

        return x2
# Create LUT and stringify into preamble of map kernel.
# LUT[i] is the number of set bits in the byte value ``i``.
LUT = np.zeros(256, np.int32)
for b in range(8):
    LUT[(np.arange(256) & (1 << b)) != 0] += 1
strLUT = "constant int LUT[256] = {" + ",".join(map(str, LUT)) + "};\n"

# Maps every input byte to the number of its set bits using the table above.
byte_to_count = Transformation([
    Parameter('output', Annotation(Type(np.int32, (1, )), 'o')),
    Parameter('input', Annotation(Type(np.uint8, (1, )), 'i'))
    ],
    strLUT + """ ${output.store_same}(LUT[${input.load_same}]); """)

# The snippet is a C statement, so it has to be terminated with a semicolon.
predicate = Predicate(
    Snippet.create(lambda v1, v2: """return ${v1} + ${v2};"""),
    np.int32(0))

# Sum reduction with the bit-counting transformation attached to its input.
sum_bits_reduction = Reduce(byte_to_count.output, predicate)
sum_bits_reduction.parameter.input.connect(
    byte_to_count, byte_to_count.output, new_input=byte_to_count.input)

sum_bits = sum_bits_reduction.compile(thr)

#sum_byte_count = ReductionKernel(cx, np.int32, neutral="0",
#    reduce_expr="a+b", map_expr="LUT[bytes[i]]",
#    arguments="__global unsigned char *bytes",
#    preamble=strLUT)
#def count_bits(img):
#    return sum_byte_count(img).get().item()
#
#pixel_inds = GenericScanKernel(cx, np.int32,
#    arguments="__global unsigned char *bytes, "
#    "int image_w, "
#    "__global int *pixels",
# NOTE(review): the statement below reads the loop variable `b`; the enclosing
# loop header lies outside of this fragment.
LUT[(np.arange(256) & (1 << b)) != 0] += 1
strLUT = "constant int LUT[256] = {" + ",".join(map(str, LUT)) + "};\n"

# Maps every input byte to the value stored for it in the LUT.
byte_to_count = Transformation(
    [
        Parameter("output", Annotation(Type(np.int32, (1,)), "o")),
        Parameter("input", Annotation(Type(np.uint8, (1,)), "i")),
    ],
    strLUT + """ ${output.store_same}(LUT[${input.load_same}]); """,
)

# The snippet is a C statement, so it has to be terminated with a semicolon.
predicate = Predicate(Snippet.create(lambda v1, v2: """return ${v1} + ${v2};"""), np.int32(0))

# Sum reduction with the LUT transformation attached to its input.
sum_bits_reduction = Reduce(byte_to_count.output, predicate)
sum_bits_reduction.parameter.input.connect(
    byte_to_count, byte_to_count.output, new_input=byte_to_count.input
)

sum_bits = sum_bits_reduction.compile(thr)

# sum_byte_count = ReductionKernel(cx, np.int32, neutral="0",
#    reduce_expr="a+b", map_expr="LUT[bytes[i]]",
#    arguments="__global unsigned char *bytes",
#    preamble=strLUT)
# def count_bits(img):
#    return sum_byte_count(img).get().item()
#
# pixel_inds = GenericScanKernel(cx, np.int32,
#    arguments="__global unsigned char *bytes, "
#    "int image_w, "
#    "__global int *pixels",
#    # Keep count of pixels we have stored so far
#    input_expr="LUT[bytes[i]]",