def __init__(
        self, result_shape_info,
        input_size: int, output_size: int, decomp_length: int, log2_base: int):

    base = 2**log2_base

    a = result_shape_info.a
    b = result_shape_info.b
    cv = result_shape_info.current_variances

    ks_a = Type(Torus32, (input_size, decomp_length, base, output_size))
    ks_b = Type(Torus32, (input_size, decomp_length, base))
    ks_cv = Type(Float, (input_size, decomp_length, base))

    source_a = Type(Torus32, result_shape_info.shape + (input_size,))
    source_b = Type(Torus32, result_shape_info.shape)

    self._decomp_length = decomp_length
    self._input_size = input_size
    self._output_size = output_size
    self._log2_base = log2_base

    Computation.__init__(self, [
        Parameter('result_a', Annotation(a, 'io')),
        Parameter('result_b', Annotation(b, 'io')),
        Parameter('result_cv', Annotation(cv, 'io')),
        Parameter('ks_a', Annotation(ks_a, 'i')),
        Parameter('ks_b', Annotation(ks_b, 'i')),
        Parameter('ks_cv', Annotation(ks_cv, 'i')),
        Parameter('source_a', Annotation(source_a, 'i')),
        Parameter('source_b', Annotation(source_b, 'i'))])
def test_guiding_output(thr):
    N = 1000
    dtype = numpy.float32

    p = PureParallel(
        [
            Parameter('output', Annotation(Type(dtype, shape=N), 'o')),
            Parameter('input', Annotation(Type(dtype, shape=(2, N)), 'i'))],
        """
        float t1 = ${input.load_idx}(0, ${idxs[0]});
        float t2 = ${input.load_idx}(1, ${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """,
        guiding_array='output')

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = a[0] + a[1]

    assert diff_is_negligible(res_dev.get(), res_ref)
def hough_paths(segments, line_dist=40):
    # View segments as a 1D structured array
    seg_struct = segments.ravel().astype(np.int32).view(int4).reshape(-1)
    segments, _ = sort_segments(thr.to_device(seg_struct))
    segments = segments[0].view(np.int32).reshape((seg_struct.shape[0], 2, 2))

    can_join = thr.empty_like(Type(np.int32, segments.shape[0]))
    can_join.fill(0)
    prg.can_join_segments(
        segments, can_join, np.int32(line_dist),
        global_size=segments.shape[:1], local_size=(1,))

    labels = cumsum(can_join)
    num_labels = int(labels[labels.shape[0] - 1].get().item()) + 1

    longest_seg_inds = thr.empty_like(Type(np.int32, num_labels))
    longest_seg_inds.fill(-1)
    prg.assign_segments(
        segments, labels, longest_seg_inds,
        global_size=(segments.shape[0],), local_size=(1,))

    longest_segs = thr.empty_like(Type(np.int32, (num_labels, 2, 2)))
    prg.copy_chosen_segments(
        segments, longest_seg_inds, longest_segs,
        global_size=(num_labels,), local_size=(1,))

    return longest_segs.get()
def get_prepare_iprfft_output(y):
    # Input: size N//4
    # Output: size N//4
    N = y.shape[-1] * 2

    return Transformation(
        [
            Parameter('x', Annotation(y, 'o')),
            Parameter('y', Annotation(y, 'i')),
            Parameter('x0', Annotation(Type(y.dtype, y.shape[:-1]), 'i')),
            Parameter('coeffs', Annotation(Type(y.dtype, (N // 2,)), 'i')),
        ],
        """
        ${y.ctype} y = ${y.load_same};
        ${coeffs.ctype} coeff = ${coeffs.load_idx}(${idxs[-1]});

        ${x.ctype} x;
        if (${idxs[-1]} == 0)
        {
            ${x0.ctype} x0 = ${x0.load_idx}(${", ".join(idxs[:-1])});
            x = x0 / ${N // 2};
        }
        else
        {
            x = y * coeff;
        }

        ${x.store_same}(x);
        """,
        connectors=['y'],
        render_kwds=dict(N=N))
def __init__(
        self, transform, batch_shape, inverse=False, i32_conversion=False,
        transforms_per_block=4, kernel_repetitions=1):

    self._inverse = inverse
    self._transform = transform
    self._transforms_per_block = transforms_per_block
    self._kernel_repetitions = kernel_repetitions
    self._i32_conversion = i32_conversion

    tr_arr = Type(self._transform.elem_dtype, batch_shape + (transform.transform_length,))

    if i32_conversion:
        arr = Type(numpy.int32, batch_shape + (transform.polynomial_length,))
        if inverse:
            oarr = arr
            iarr = tr_arr
        else:
            oarr = tr_arr
            iarr = arr
    else:
        oarr = tr_arr
        iarr = tr_arr

    Computation.__init__(self, [
        Parameter('output', Annotation(oarr, 'o')),
        Parameter('input', Annotation(iarr, 'i'))])
def __init__(
        self, params: TGswParams, in_out_params: LweParams, shape,
        perf_params: PerformanceParameters):

    tlwe_params = params.tlwe_params
    decomp_length = params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree
    input_size = params.tlwe_params.extracted_lweparams.size
    output_size = in_out_params.size

    assert mask_size == 1 and decomp_length == 2

    transform_type = params.tlwe_params.transform_type
    transform = get_transform(transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    out_a = Type(Torus32, shape + (input_size,))
    out_b = Type(Torus32, shape)
    accum_a = Type(Torus32, shape + (mask_size + 1, polynomial_degree))
    gsw = Type(tdtype, (output_size, mask_size + 1, decomp_length, mask_size + 1, tlength))
    bara = Type(Torus32, shape + (output_size,))

    self._params = params
    self._in_out_params = in_out_params
    self._perf_params = perf_params

    Computation.__init__(self, [
        Parameter('lwe_a', Annotation(out_a, 'io')),
        Parameter('lwe_b', Annotation(out_b, 'io')),
        Parameter('accum_a', Annotation(accum_a, 'io')),
        Parameter('gsw', Annotation(gsw, 'i')),
        Parameter('bara', Annotation(bara, 'i'))])
def __init__(
        self, polynomial_degree, shape, powers_shape,
        powers_view=False, minus_one=False, invert_powers=False):

    self._batch_shape = powers_shape[:-1] if powers_view else powers_shape
    assert self._batch_shape == shape[:len(self._batch_shape)]

    self._powers_view = powers_view
    self._minus_one = minus_one
    self._invert_powers = invert_powers

    polynomials = Type(Torus32, shape + (polynomial_degree,))
    powers = Type(Int32, powers_shape)

    Computation.__init__(self, [
        Parameter('result', Annotation(polynomials, 'o')),
        Parameter('source', Annotation(polynomials, 'i')),
        Parameter('powers', Annotation(powers, 'i')),
        # unused if powers_view == False
        Parameter('powers_idx', Annotation(Type(Int32)))])
def __init__(self, params: 'TGswParams', shape, bk_len, perf_params: PerformanceParameters):

    mask_size = params.tlwe_params.mask_size
    polynomial_degree = params.tlwe_params.polynomial_degree
    decomp_length = params.decomp_length

    transform = get_transform(params.tlwe_params.transform_type)
    tdtype = transform.transformed_dtype()
    tlength = transform.transformed_length(polynomial_degree)

    accum = Type(Torus32, shape + (mask_size + 1, polynomial_degree))
    bootstrap_key = Type(
        tdtype, (bk_len, mask_size + 1, decomp_length, mask_size + 1, tlength))

    self._params = params
    self._perf_params = perf_params
    self._shape = shape
    self._bk_len = bk_len

    Computation.__init__(self, [
        Parameter('accum', Annotation(accum, 'io')),
        Parameter('bootstrap_key', Annotation(bootstrap_key, 'i')),
        Parameter('bk_row_idx', Annotation(numpy.int32))])
def __init__(
        self, params: 'TLweParams', shape, noise: float,
        perf_params: PerformanceParametersForDevice):

    polynomial_degree = params.polynomial_degree
    mask_size = params.mask_size

    result_a = Type(Torus32, shape + (mask_size + 1, polynomial_degree))
    result_cv = Type(ErrorFloat, shape)
    key = Type(Int32, (mask_size, polynomial_degree))
    noises1 = Type(Torus32, shape + (mask_size, polynomial_degree))
    noises2 = Type(Torus32, shape + (polynomial_degree,))

    self._transform_type = params.transform_type
    self._noise = noise
    self._mask_size = mask_size
    self._polynomial_degree = polynomial_degree
    self._perf_params = perf_params

    Computation.__init__(self, [
        Parameter('result_a', Annotation(result_a, 'o')),
        Parameter('result_cv', Annotation(result_cv, 'o')),
        Parameter('key', Annotation(key, 'i')),
        Parameter('noises1', Annotation(noises1, 'i')),
        Parameter('noises2', Annotation(noises2, 'i'))])
def get_method(array):
    temp = array.thread.array(array.shape, array.dtype)
    comp = array.thread.get_cached_computation(
        setitem_computation, Type.from_value(temp), Type.from_value(array), True)
    comp(temp, array)
    return temp.get()
def get_tgsw_polynomial_decomp_trf(params: 'TGswParams', shape):
    tlwe_params = params.tlwe_params
    decomp_length = params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree

    result = Type(Int32, shape + (mask_size + 1, decomp_length, polynomial_degree))
    sample = Type(Torus32, shape + (mask_size + 1, polynomial_degree))

    return Transformation(
        [
            Parameter('result', Annotation(result, 'o')),
            Parameter('sample', Annotation(sample, 'i'))],
        """
        <%
            mask = 2**params.bs_log2_base - 1
            half_base = 2**(params.bs_log2_base - 1)
        %>
        ${sample.ctype} sample = ${sample.load_idx}(${", ".join(idxs[:-2])}, ${idxs[-1]});
        int decomp_shift = 32 - (${idxs[-2]} + 1) * ${params.bs_log2_base};
        ${result.store_same}(
            (((sample + (${params.offset})) >> decomp_shift) & ${mask}) - ${half_base}
            );
        """,
        connectors=['result'],
        render_kwds=dict(params=params))
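# A CPU reference of the signed digit decomposition performed by the template
# above, convenient for checking the kernel (a sketch; ``params.offset``,
# ``params.bs_log2_base`` and ``params.decomp_length`` are the same attributes
# the template reads):
import numpy

def tgsw_polynomial_decomp_reference(sample, params):
    # ``sample``: int32 (Torus32) array of shape (..., mask_size + 1, degree);
    # the result gains a decomposition axis before the polynomial axis.
    mask = 2**params.bs_log2_base - 1
    half_base = 2**(params.bs_log2_base - 1)
    # The cast to int32 wraps mod 2**32, matching the GPU addition.
    shifted = (sample.astype(numpy.int64) + params.offset).astype(numpy.int32)
    return numpy.stack(
        [(((shifted >> (32 - (i + 1) * params.bs_log2_base)) & mask) - half_base)
         for i in range(params.decomp_length)],
        axis=-2)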
def __init__(self, matrix_t):
    Computation.__init__(self, [
        Parameter('output', Annotation(Type(matrix_t.dtype, matrix_t.shape[:-1]), 'o')),
        Parameter('matrix', Annotation(matrix_t, 'i')),
        Parameter('vector', Annotation(Type(matrix_t.dtype, matrix_t.shape[-1]), 'i'))])
def __init__(self, x, NFFT=256, noverlap=128, pad_to=None, window=hanning_window):
    assert dtypes.is_real(x.dtype)
    assert x.ndim == 1

    rolling_frame_trf = rolling_frame(x, NFFT, noverlap, pad_to)

    complex_dtype = dtypes.complex_for(x.dtype)
    fft_arr = Type(complex_dtype, rolling_frame_trf.output.shape)
    real_fft_arr = Type(x.dtype, rolling_frame_trf.output.shape)

    window_trf = window(real_fft_arr, NFFT)
    broadcast_zero_trf = transformations.broadcast_const(real_fft_arr, 0)
    to_complex_trf = transformations.combine_complex(fft_arr)
    amplitude_trf = transformations.norm_const(fft_arr, 1)
    crop_trf = crop_frequencies(amplitude_trf.output)

    fft = FFT(fft_arr, axes=(1,))
    fft.parameter.input.connect(
        to_complex_trf, to_complex_trf.output,
        input_real=to_complex_trf.real, input_imag=to_complex_trf.imag)
    fft.parameter.input_imag.connect(
        broadcast_zero_trf, broadcast_zero_trf.output)
    fft.parameter.input_real.connect(
        window_trf, window_trf.output, unwindowed_input=window_trf.input)
    fft.parameter.unwindowed_input.connect(
        rolling_frame_trf, rolling_frame_trf.output, flat_input=rolling_frame_trf.input)
    fft.parameter.output.connect(
        amplitude_trf, amplitude_trf.input, amplitude=amplitude_trf.output)
    fft.parameter.amplitude.connect(
        crop_trf, crop_trf.input, cropped_amplitude=crop_trf.output)

    self._fft = fft
    self._transpose = Transpose(fft.parameter.cropped_amplitude)

    Computation.__init__(self, [
        Parameter('output', Annotation(self._transpose.parameter.output, 'o')),
        Parameter('input', Annotation(fft.parameter.flat_input, 'i'))])
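# For reference, the transformation chain assembled above computes a
# windowed-FFT spectrogram. A rough CPU sketch of the same pipeline (a Hann
# window and cropping to the positive frequencies are assumptions based on
# the helper names ``hanning_window`` and ``crop_frequencies``):
import numpy

def spectrogram_reference(x, NFFT=256, noverlap=128):
    step = NFFT - noverlap
    frame_num = (x.size - noverlap) // step
    frames = numpy.stack([x[i * step:i * step + NFFT] for i in range(frame_num)])
    spectrum = numpy.fft.fft(frames * numpy.hanning(NFFT), axis=1)
    # norm_const(..., 1) takes magnitudes; the trailing Transpose swaps the axes
    return numpy.abs(spectrum)[:, :NFFT // 2 + 1].T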
def __init__(self, params: 'TLweParams', shape):
    a_type = Type(Torus32, shape + (params.mask_size + 1, params.polynomial_degree))
    cv_type = Type(ErrorFloat, shape + (params.mask_size + 1,))
    mu_type = Type(Torus32, shape + (params.polynomial_degree,))

    self._mask_size = params.mask_size

    Computation.__init__(self, [
        Parameter('a', Annotation(a_type, 'o')),
        Parameter('current_variances', Annotation(cv_type, 'o')),
        Parameter('mu', Annotation(mu_type, 'i'))])
def __init__(self, shape, mspace_size):
    self._mspace_size = mspace_size
    messages = Type(Torus32, shape)
    result = Type(Int32, shape)
    Computation.__init__(self, [
        Parameter('result', Annotation(result, 'o')),
        Parameter('messages', Annotation(messages, 'i'))])
def __init__(self, shape, lwe_size):
    a = Type(Torus32, shape + (lwe_size,))
    b = Type(Torus32, shape)
    key = Type(Int32, (lwe_size,))
    Computation.__init__(self, [
        Parameter('result', Annotation(b, 'o')),
        Parameter('lwe_a', Annotation(a, 'i')),
        Parameter('lwe_b', Annotation(b, 'i')),
        Parameter('key', Annotation(key, 'i'))])
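# Judging by the parameter layout (``result`` has the shape of ``lwe_b``), this
# computation evaluates the LWE phase b - <a, key>. A CPU sketch under that
# assumption (int32 arithmetic wraps mod 2**32, matching Torus32 semantics):
import numpy

def lwe_phase_reference(lwe_a, lwe_b, key):
    return lwe_b - numpy.sum(lwe_a * key, axis=-1, dtype=numpy.int32)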
def get_prepare_for_mul_trf(shape):
    # Preparation for FFT is just an identity
    dtype = transformed_dtype()
    return Transformation(
        [
            Parameter('output', Annotation(Type(dtype, shape), 'o')),
            Parameter('input', Annotation(Type(dtype, shape), 'i'))],
        """
        ${output.store_same}(${input.load_same});
        """,
        connectors=['input', 'output'])
def setitem_method(array, index, value):
    # We need it both in ``cuda.Array`` and ``ocl.Array``, hence a standalone function.
    # PyOpenCL and PyCUDA support __setitem__() for some restricted cases,
    # but it is too complicated to determine when it will work,
    # and it is easier to just call our own implementation every time.
    view = array[index]
    value = normalize_value(array.thread, type(array), value)
    comp = array.thread.get_cached_computation(
        setitem_computation, Type.from_value(view), Type.from_value(value))
    comp(view, value)
def get_tlwe_transformed_add_mul_to_trf(
        params: 'TGswParams', shape, bk_len: int, perf_params: PerformanceParameters):

    tlwe_params = params.tlwe_params
    decomp_length = params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree

    transform = get_transform(params.tlwe_params.transform_type)
    tdtype = transform.transformed_dtype()
    tlength = transform.transformed_length(polynomial_degree)
    tr_ctype = transform.transformed_internal_ctype()

    result = Type(tdtype, shape + (mask_size + 1, tlength))
    sample = Type(tdtype, shape + (mask_size + 1, decomp_length, tlength))
    bootstrap_key = Type(
        tdtype, (bk_len, mask_size + 1, decomp_length, mask_size + 1, tlength))

    return Transformation(
        [
            Parameter('result', Annotation(result, 'o')),
            Parameter('sample', Annotation(sample, 'i')),
            Parameter('bootstrap_key', Annotation(bootstrap_key, 'i')),
            Parameter('bk_row_idx', Annotation(numpy.int32))],
        """
        ${tr_ctype} result = ${tr_ctype}pack(${dtypes.c_constant(0, result.dtype)});

        %for mask_idx in range(mask_size + 1):
        %for decomp_idx in range(decomp_length):
        {
            ${tr_ctype} a = ${tr_ctype}pack(
                ${sample.load_idx}(
                    ${", ".join(idxs[:-2])}, ${mask_idx}, ${decomp_idx}, ${idxs[-1]})
                );
            ${tr_ctype} b = ${tr_ctype}pack(
                ${bootstrap_key.load_idx}(
                    ${bk_row_idx}, ${mask_idx}, ${decomp_idx}, ${idxs[-2]}, ${idxs[-1]})
                );
            result = ${add}(result, ${mul}(a, b));
        }
        %endfor
        %endfor

        ${result.store_same}(${tr_ctype}unpack(result));
        """,
        connectors=['result'],
        render_kwds=dict(
            mask_size=mask_size,
            decomp_length=decomp_length,
            add=transform.transformed_add(perf_params),
            mul=transform.transformed_mul(perf_params),
            tr_ctype=tr_ctype))
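# In numpy terms, the Mako loop above accumulates, per batch index and per
# transformed coefficient k, a dot product over the mask (m) and decomposition
# (l) axes against one row of the bootstrap key (a sketch that ignores the
# packed internal ctype used on the GPU):
import numpy

def transformed_add_mul_reference(sample, bootstrap_key, bk_row_idx):
    return numpy.einsum('...mlk,mlnk->...nk', sample, bootstrap_key[bk_row_idx])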
def __init__(self, a, b, current_variances):
    if (not (len(a.shape) - 1 == len(b.shape) == len(current_variances.shape))
            or not (a.shape[:-1] == b.shape == current_variances.shape)):
        raise ValueError(
            "Inconsistent shapes: {a}, {b}, {cv}".format(
                a=a.shape, b=b.shape, cv=current_variances.shape))

    self.a = Type.from_value(a)
    self.b = Type.from_value(b)
    self.current_variances = Type.from_value(current_variances)
    self.shape = b.shape
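# For example, a batch of 10 LWE samples of size 500 satisfies the check above:
# ``a`` carries one extra trailing axis, and the leading shapes coincide
# (hypothetical sizes; the exact dtype of the variances is an assumption):
import numpy

a = numpy.empty((10, 500), numpy.int32)
b = numpy.empty((10,), numpy.int32)
cv = numpy.empty((10,), numpy.float64)
assert a.shape[:-1] == b.shape == cv.shape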
def __init__(self, params: 'TLweParams', shape):
    self._mask_size = params.mask_size
    self._polynomial_degree = params.polynomial_degree

    result_a = Type(Torus32, shape + (params.extracted_lweparams.size,))
    result_b = Type(Torus32, shape)
    tlwe_a = Type(Torus32, shape + (params.mask_size + 1, params.polynomial_degree))

    Computation.__init__(self, [
        Parameter('result_a', Annotation(result_a, 'o')),
        Parameter('result_b', Annotation(result_b, 'o')),
        Parameter('tlwe_a', Annotation(tlwe_a, 'i'))])
def _build_plan(self, plan_factory, device_params, output, input_):
    plan = plan_factory()

    N = input_.shape[-1] * 4
    batch_shape = input_.shape[:-1]
    batch_size = helpers.product(batch_shape)

    # The first element is unused
    coeffs = numpy.concatenate(
        [[0], 1 / (4 * numpy.sin(2 * numpy.pi * numpy.arange(1, N // 2) / N))])
    coeffs_arr = plan.persistent_array(coeffs)

    prepare_iprfft_input = get_prepare_iprfft_input(input_)
    prepare_iprfft_output = get_prepare_iprfft_output(output)

    irfft = IRFFT(prepare_iprfft_input.Y)
    irfft.parameter.input.connect(
        prepare_iprfft_input, prepare_iprfft_input.Y,
        X=prepare_iprfft_input.X)
    irfft.parameter.output.connect(
        prepare_iprfft_output, prepare_iprfft_output.y,
        x=prepare_iprfft_output.x,
        x0=prepare_iprfft_output.x0,
        coeffs=prepare_iprfft_output.coeffs)

    real = Transformation(
        [
            Parameter(
                'output',
                Annotation(Type(dtypes.real_for(input_.dtype), input_.shape), 'o')),
            Parameter('input', Annotation(input_, 'i')),
        ],
        """
        ${output.store_same}((${input.load_same}).x);
        """,
        connectors=['output'])

    rd_t = Type(output.dtype, input_.shape)
    rd = Reduce(rd_t, predicate_sum(rd_t.dtype), axes=(len(input_.shape) - 1,))
    rd.parameter.input.connect(real, real.output, X=real.input)

    x0 = plan.temp_array_like(rd.parameter.output)

    plan.computation_call(rd, x0, input_)
    plan.computation_call(irfft, output, x0, coeffs_arr, input_)

    return plan
def __init__(self, click_probability_meter, system, representation, samples):
    assert representation == Representation.POSITIVE_P
    self._system = system

    state = Type(numpy.complex128, (samples, system.modes))
    output = Type(numpy.float64, (system.modes,))

    Computation.__init__(
        self,
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('alpha', Annotation(state, 'i')),
            Parameter('beta', Annotation(state, 'i')),
        ])
def setitem_method(array, index, value):
    # We need it both in ``cuda.Array`` and ``ocl.Array``, hence a standalone function.
    # PyOpenCL and PyCUDA support __setitem__() for some restricted cases,
    # but it is too complicated to determine when it will work,
    # and it is easier to just call our own implementation every time.
    view = array[index]
    value, is_array = normalize_value(array.thread, type(array), value)
    comp = array.thread.get_cached_computation(
        setitem_computation, Type.from_value(view), Type.from_value(value), is_array)
    comp(view, value)
def __init__(self, meter, system, representation, samples):
    self._system = system
    self._representation = representation

    state = Type(numpy.complex128, (samples, system.modes))
    output = Type(numpy.float64, (system.modes,))

    Computation.__init__(
        self,
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('alpha', Annotation(state, 'i')),
            Parameter('beta', Annotation(state, 'i')),
        ])
def __init__(self, params: 'TGswParams', shape):
    self._params = params

    decomp_length = params.decomp_length
    mask_size = params.tlwe_params.mask_size
    polynomial_degree = params.tlwe_params.polynomial_degree

    result_a = Type(
        Torus32,
        shape + (mask_size + 1, decomp_length, mask_size + 1, polynomial_degree))
    messages = Type(Torus32, shape)

    Computation.__init__(self, [
        Parameter('result_a', Annotation(result_a, 'o')),
        Parameter('messages', Annotation(messages, 'i'))])
def test_computation_performance(thr_and_double, fast_math, test_sampler_float):
    thr, double = thr_and_double

    size = 2**15
    batch = 2**6

    sampler = test_sampler_float.get_sampler(double)
    rng = CBRNG(Type(sampler.dtype, shape=(batch, size)), 1, sampler)

    dest_dev = thr.empty_like(rng.parameter.randoms)
    counters = rng.create_counters()
    counters_dev = thr.to_device(counters)
    rngc = rng.compile(thr, fast_math=fast_math)

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        rngc(counters_dev, dest_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    byte_size = size * batch * sampler.dtype.itemsize
    return min(times), byte_size
def transpose(img):
    assert img.shape[0] % 8 == 0
    img_T = thr.empty_like(Type(np.uint8, (img.shape[1] * 8, img.shape[0] // 8)))
    img_T.fill(0)
    prg.transpose(img, img_T, global_size=img_T.shape[::-1], local_size=(1, 8))
    return img_T
def rolling_frame(arr, NFFT, noverlap, pad_to):
    """
    Transforms a 1D array to a 2D array whose rows are
    partially overlapped parts of the initial array.
    """

    frame_step = NFFT - noverlap
    frame_num = (arr.size - noverlap) // frame_step
    frame_size = NFFT if pad_to is None else pad_to

    result_arr = Type(arr.dtype, (frame_num, frame_size))

    return Transformation(
        [
            Parameter('output', Annotation(result_arr, 'o')),
            Parameter('input', Annotation(arr, 'i')),
        ],
        """
        %if NFFT != output.shape[1]:
        if (${idxs[1]} >= ${NFFT})
        {
            ${output.store_same}(0);
        }
        else
        %endif
        {
            ${output.store_same}(${input.load_idx}(${idxs[0]} * ${frame_step} + ${idxs[1]}));
        }
        """,
        render_kwds=dict(frame_step=frame_step, NFFT=NFFT),
        # note that only the "store_same"-using argument can serve as a connector!
        connectors=['output'])
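# A CPU reference of the transformation above, convenient for testing the GPU
# version (a sketch, assuming ``pad_to`` is either None or >= NFFT):
import numpy

def rolling_frame_reference(x, NFFT, noverlap, pad_to):
    frame_step = NFFT - noverlap
    frame_num = (x.size - noverlap) // frame_step
    frame_size = NFFT if pad_to is None else pad_to
    result = numpy.zeros((frame_num, frame_size), x.dtype)
    for i in range(frame_num):
        result[i, :NFFT] = x[i * frame_step:i * frame_step + NFFT]
    return result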
def __init__(self, mode_arr, add_points=None, inverse=False, order=1, axes=None):

    if axes is None:
        axes = tuple(range(len(mode_arr.shape)))
    else:
        axes = tuple(axes)
    self._axes = list(sorted(axes))

    if add_points is None:
        add_points = [0] * len(mode_arr.shape)
    else:
        add_points = list(add_points)
    self._add_points = add_points

    coord_shape = list(mode_arr.shape)
    for axis in range(len(mode_arr.shape)):
        if axis in axes:
            coord_shape[axis] = get_spatial_points(
                mode_arr.shape[axis], order, add_points=add_points[axis])
    coord_arr = Type(mode_arr.dtype, shape=coord_shape)

    self._inverse = inverse
    self._order = order

    if not inverse:
        parameters = [
            Parameter('modes', Annotation(mode_arr, 'o')),
            Parameter('coords', Annotation(coord_arr, 'i'))]
    else:
        parameters = [
            Parameter('coords', Annotation(coord_arr, 'o')),
            Parameter('modes', Annotation(mode_arr, 'i'))]

    Computation.__init__(self, parameters)
def __init__(
        self, thr, shape, dtype, box, tmax, steps, samples,
        kinetic_coeff=1, nonlinear_module=None):

    state_arr = Type(dtype, shape)

    self.tmax = tmax
    self.steps = steps
    self.samples = samples
    self.dt = float(tmax) / steps
    self.dt_half = self.dt / 2

    self.thr = thr

    self.stepper = RK4IPStepper(
        state_arr, self.dt,
        box=box, kinetic_coeff=kinetic_coeff,
        nonlinear_module=nonlinear_module).compile(thr)
    self.stepper_half = RK4IPStepper(
        state_arr, self.dt_half,
        box=box, kinetic_coeff=kinetic_coeff,
        nonlinear_module=nonlinear_module).compile(thr)
def cast(arr_t, dtype):
    """
    Returns a typecast transformation of ``arr_t`` to ``dtype``
    (1 output, 1 input): ``output = cast[dtype](input)``.
    """
    dest = Type.from_value(arr_t).with_dtype(dtype)
    return Transformation(
        [Parameter('output', Annotation(dest, 'o')),
         Parameter('input', Annotation(arr_t, 'i'))],
        "${output.store_same}(${cast}(${input.load_same}));",
        render_kwds=dict(cast=functions.cast(dtype, arr_t.dtype)))
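# A minimal usage sketch: wrap the transformation in a PureParallel computation
# to run it standalone (assumes a reikna Thread ``thr`` is already created):
import numpy
from reikna.algorithms import PureParallel

src = numpy.arange(16, dtype=numpy.int32)
p = PureParallel.from_trf(cast(src, numpy.float32))
pc = p.compile(thr)

src_dev = thr.to_device(src)
res_dev = thr.empty_like(p.parameter.output)
pc(res_dev, src_dev)
assert res_dev.get().dtype == numpy.float32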
def copy_broadcasted(arr_t, out_arr_t=None):
    """
    Returns an identity transformation (1 output, 1 input): ``output = input``,
    where ``input`` may be broadcasted (with the same semantics as ``numpy.broadcast_to()``).
    Output array type ``out_arr_t`` may have different strides,
    but must have the same shape and data type.

    .. note::

        This is an input-only transformation.
    """
    if out_arr_t is None:
        out_arr_t = arr_t

    if out_arr_t.dtype != arr_t.dtype:
        raise ValueError("Input and output arrays must have the same data type")

    in_tp = Type.from_value(arr_t)
    out_tp = Type.from_value(out_arr_t)
    if not in_tp.broadcastable_to(out_tp):
        raise ValueError("Input is not broadcastable to output")

    return Transformation(
        [Parameter('output', Annotation(out_arr_t, 'o')),
         Parameter('input', Annotation(arr_t, 'i'))],
        """
        ${output.store_same}(${input.load_idx}(
        %for i in range(len(input.shape)):
            %if input.shape[i] == 1:
            0
            %else:
            ${idxs[i + len(output.shape) - len(input.shape)]}
            %endif
            %if i != len(input.shape) - 1:
            ,
            %endif
        %endfor
        ));
        """,
        connectors=['output'])
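# The broadcasting rule matches numpy: axes of size 1 in the input are
# repeated, and missing leading axes are added. For example, an input of
# shape (3, 1, 5) can feed an output of shape (4, 3, 2, 5):
import numpy

assert numpy.broadcast_to(numpy.zeros((3, 1, 5)), (4, 3, 2, 5)).shape == (4, 3, 2, 5)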
def roll(array, shift, axis=-1):
    """
    Cyclically shifts elements of ``array`` by ``shift`` positions to the right
    along ``axis``. ``shift`` can be negative (in which case the elements are
    shifted to the left). Elements that are shifted beyond the last position
    are re-introduced at the first (and vice versa).

    Works equivalently to ``numpy.roll`` (except ``axis=None`` is not supported).
    """
    temp = array.thread.array(array.shape, array.dtype)
    axis = axis % len(array.shape)
    comp = array.thread.get_cached_computation(
        roll_computation, Type.from_value(array), axis)
    comp(temp, array, shift)
    return temp
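# Expected behaviour, matching numpy.roll along a single axis (a sketch,
# assuming a reikna Thread ``thr``):
import numpy

a_dev = thr.to_device(numpy.array([1, 2, 3, 4, 5], numpy.int32))
assert (roll(a_dev, 2).get() == [4, 5, 1, 2, 3]).all()
assert (roll(a_dev, -1).get() == [2, 3, 4, 5, 1]).all()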
def setitem_computation(dest, source):
    """
    Returns a computation that broadcasts ``source`` to ``dest``,
    where ``dest`` is a GPU array, and ``source`` is either a GPU array or a scalar.
    """
    if len(source.shape) == 0:
        trf = transformations.broadcast_param(dest)
        return PureParallel.from_trf(trf, guiding_array=trf.output)
    else:
        source_dt = Type.from_value(source).with_dtype(dest.dtype)
        trf = transformations.copy(source_dt, dest)
        comp = PureParallel.from_trf(trf, guiding_array=trf.output)
        cast_trf = transformations.cast(source, dest.dtype)
        comp.parameter.input.connect(cast_trf, cast_trf.output, src_input=cast_trf.input)
        return comp
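# Usage sketch: this computation backs ``array[index] = value`` in
# ``setitem_method`` above (assumes a reikna Thread ``thr``):
import numpy

a_dev = thr.to_device(numpy.zeros(8, numpy.float32))
a_dev[2:5] = 1                               # scalar branch: broadcast_param
a_dev[:3] = thr.to_device(numpy.arange(3))   # array branch: cast + copy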
def roll_method(array, shift, axis=-1):
    axis = axis % len(array.shape)
    comp = array.thread.get_cached_computation(
        RollInplace, Type.from_value(array), axis)
    comp(array, shift)
def get_method(array):
    temp = array.thread.array(array.shape, array.dtype)
    comp = array.thread.get_cached_computation(
        setitem_computation, Type.from_value(temp), Type.from_value(array))
    comp(temp, array)
    return temp.get()