Beispiel #1
0
    def _build_plan(self, plan_factory, device_params, output, input_):
        """Build the plan as a reduction, an RFFT and an inclusive scan.

        The reduction and the RFFT both read ``input_`` through the
        transformation returned by ``get_multiply``, each paired with its own
        persistent coefficient array.  The scan reads the RFFT result and the
        reduction result through the transformation returned by
        ``get_prepare_prfft_scan`` and writes into ``output``.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the scan result.
        :param input_: input array; the length ``N`` used for the coefficient
            tables is twice the size of its last axis.
        :returns: the populated plan.
        """
        plan = plan_factory()

        N = input_.shape[-1] * 2

        # Both coefficient tables are sampled on the same grid 2*pi*k/N, k < N/2.
        angles = 2 * numpy.pi * numpy.arange(N // 2) / N
        c1_arr = plan.persistent_array(4 * numpy.sin(angles))
        c2_arr = plan.persistent_array(2 * numpy.cos(angles))

        multiply = get_multiply(input_)

        # re_X_0 = sum(x * coeffs2), taken along the last axis

        t = plan.temp_array_like(input_)
        rd = Reduce(t, predicate_sum(input_.dtype), axes=(len(input_.shape) - 1,))
        rd.parameter.input.connect(
            multiply, multiply.output, x=multiply.a, c2=multiply.b)

        re_X_0 = plan.temp_array_like(rd.parameter.output)
        plan.computation_call(rd, re_X_0, input_, c2_arr)

        # Y = numpy.fft.rfft(x * coeffs1)

        rfft = RFFT(input_, dont_store_last=True)
        rfft.parameter.input.connect(
            multiply, multiply.output, x=multiply.a, c1=multiply.b)

        Y = plan.temp_array_like(rfft.parameter.output)
        plan.computation_call(rfft, Y, input_, c1_arr)

        # Author's outline of the last stage (presumably what the
        # ``prepare_prfft_scan`` transformation implements -- confirm there):
        #   Y *= -1j
        #   Y[0] /= 2
        #   Y[0] += re_X_0
        #   res = numpy.cumsum(Y[:-1])

        prepare_prfft_scan = get_prepare_prfft_scan(Y)

        sc = Scan(Y, predicate_sum(Y.dtype), axes=(-1,), exclusive=False)
        sc.parameter.input.connect(
            prepare_prfft_scan, prepare_prfft_scan.output,
            Y=prepare_prfft_scan.Y, re_X_0=prepare_prfft_scan.re_X_0)

        plan.computation_call(sc, output, Y, re_X_0)

        return plan
Beispiel #2
0
    def _build_plan(self, plan_factory, device_params, output, input_):
        """Populate a plan with three chained computations.

        1. A sum reduction along the last axis of ``input_`` multiplied by a
           cosine table (result kept in the temporary ``re_X_0``).
        2. An ``RFFT`` of ``input_`` multiplied by a sine table (result kept
           in the temporary ``Y``).
        3. An inclusive ``Scan`` along the last axis that reads ``Y`` and
           ``re_X_0`` through ``get_prepare_prfft_scan`` and fills ``output``.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the scan result.
        :param input_: input array; ``N`` is twice the size of its last axis.
        :returns: the populated plan.
        """
        plan = plan_factory()

        N = input_.shape[-1] * 2
        last_axis = len(input_.shape) - 1

        # Shared phase grid for both tables: 2*pi*k/N with k in [0, N/2).
        phase = 2 * numpy.pi * numpy.arange(N // 2) / N
        coeffs1 = 4 * numpy.sin(phase)
        coeffs2 = 2 * numpy.cos(phase)

        c1_arr = plan.persistent_array(coeffs1)
        c2_arr = plan.persistent_array(coeffs2)

        multiply = get_multiply(input_)

        # re_X_0 = sum(x * coeffs2)

        t = plan.temp_array_like(input_)
        rd = Reduce(t, predicate_sum(input_.dtype), axes=(last_axis,))
        rd.parameter.input.connect(
            multiply, multiply.output, x=multiply.a, c2=multiply.b)

        re_X_0 = plan.temp_array_like(rd.parameter.output)
        plan.computation_call(rd, re_X_0, input_, c2_arr)

        # Y = numpy.fft.rfft(x * coeffs1)

        rfft = RFFT(input_, dont_store_last=True)
        rfft.parameter.input.connect(
            multiply, multiply.output, x=multiply.a, c1=multiply.b)

        Y = plan.temp_array_like(rfft.parameter.output)
        plan.computation_call(rfft, Y, input_, c1_arr)

        # Outline left by the author for the final stage (presumably realised
        # by the ``prepare_prfft_scan`` transformation plus the scan -- confirm):
        #   Y *= -1j
        #   Y[0] /= 2
        #   Y[0] += re_X_0
        #   res = numpy.cumsum(Y[:-1])

        prepare_prfft_scan = get_prepare_prfft_scan(Y)

        sc = Scan(Y, predicate_sum(Y.dtype), axes=(-1,), exclusive=False)
        sc.parameter.input.connect(
            prepare_prfft_scan,
            prepare_prfft_scan.output,
            Y=prepare_prfft_scan.Y,
            re_X_0=prepare_prfft_scan.re_X_0)

        plan.computation_call(sc, output, Y, re_X_0)

        return plan
Beispiel #3
0
def test_summation(thr):
    """Time a full sum reduction of 2**22 int64 elements on ``thr``.

    The reduction is run ten times, each run followed by a thread
    synchronization, and the result of the last run is checked against
    ``numpy``'s sum.

    :param thr: thread object used to upload data, compile and synchronize.
    :returns: tuple ``(fastest run in seconds, size of the input in bytes)``.
    """
    element_count = 2**22
    dtype = dtypes.normalize_type(numpy.int64)

    host_data = get_test_array(element_count, dtype)
    device_data = thr.to_device(host_data)

    reduction = Reduce(host_data, predicate_sum(dtype))

    device_result = thr.empty_like(reduction.parameter.output)
    expected = numpy.array([host_data.sum()], dtype)

    compiled = reduction.compile(thr)

    def timed_run():
        # Wall-clock duration of one reduction including synchronization.
        started = time.time()
        compiled(device_result, device_data)
        thr.synchronize()
        return time.time() - started

    durations = [timed_run() for _ in range(10)]

    assert diff_is_negligible(device_result.get(), expected)

    return min(durations), element_count * dtype.itemsize
Beispiel #4
0
def test_summation(thr):
    """Benchmark summing 2**22 int64 values with ``Reduce``.

    Ten timed launches are performed (each followed by ``thr.synchronize()``);
    afterwards the device result is compared with the host-side sum.

    :param thr: thread object providing ``to_device``, ``empty_like`` and
        ``synchronize``.
    :returns: ``(best_time, bytes_read)`` where ``best_time`` is the smallest
        measured duration and ``bytes_read`` the input size in bytes.
    """
    n_items = 2 ** 22
    n_attempts = 10
    item_type = dtypes.normalize_type(numpy.int64)

    source = get_test_array(n_items, item_type)
    source_dev = thr.to_device(source)

    summation = Reduce(source, predicate_sum(item_type))

    total_dev = thr.empty_like(summation.parameter.output)
    total_ref = numpy.array([source.sum()], item_type)

    run = summation.compile(thr)

    # Track the fastest launch directly instead of collecting all timings.
    best_time = None
    for _ in range(n_attempts):
        tick = time.time()
        run(total_dev, source_dev)
        thr.synchronize()
        elapsed = time.time() - tick
        if best_time is None or elapsed < best_time:
            best_time = elapsed

    assert diff_is_negligible(total_dev.get(), total_ref)

    bytes_read = n_items * item_type.itemsize
    return best_time, bytes_read
Beispiel #5
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):
        """Plan a sum over axis 0 of a per-element transformation of the inputs.

        The attached transformation multiplies matching elements of ``alpha``
        and ``beta``, takes the ``x`` component of the product, and subtracts
        the value rendered from ``ordering(self._representation)``.  The
        intermediate values are typed as ``float64`` with ``alpha``'s shape.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the reduced values.
        :param alpha: first input array.
        :param beta: second input array.
        :returns: the populated plan.
        """
        plan = plan_factory()

        per_element_t = Type(numpy.float64, alpha.shape)

        trf_signature = [
            Parameter('output', Annotation(per_element_t, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
        ]
        trf_code = """
                ${alpha.ctype} alpha = ${alpha.load_same};
                ${beta.ctype} beta = ${beta.load_same};
                ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                ${output.store_same}(t.x - ${ordering});
                """
        trf_render_kwds = {
            'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
            'ordering': ordering(self._representation),
        }
        meter_trf = Transformation(
            trf_signature, trf_code, render_kwds=trf_render_kwds)

        # After the connection the reduction takes ``alpha_p``/``beta_p``
        # in place of its original input.
        reduction = Reduce(per_element_t, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf,
            meter_trf.output,
            alpha_p=meter_trf.alpha,
            beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)
        return plan
Beispiel #6
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):
        """Plan a sum over axis 0 of ``(1 - exp(-alpha * beta)).x``.

        For every element the attached transformation forms the product of
        ``alpha`` and ``beta``, exponentiates its negation, subtracts the
        result from one, and stores the ``x`` component.  The intermediate
        values are typed as ``float64`` with ``alpha``'s shape.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the reduced values.
        :param alpha: first input array.
        :param beta: second input array.
        :returns: the populated plan.
        """
        plan = plan_factory()

        per_element_t = Type(numpy.float64, alpha.shape)

        trf_signature = [
            Parameter('output', Annotation(per_element_t, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
        ]
        trf_code = """
                ${alpha.ctype} alpha = ${alpha.load_same};
                ${beta.ctype} beta = ${beta.load_same};
                ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));
                ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                ${output.store_same}(cp.x);
                """
        trf_render_kwds = {
            'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
            'exp_c': functions.exp(alpha.dtype),
        }
        meter_trf = Transformation(
            trf_signature, trf_code, render_kwds=trf_render_kwds)

        # After the connection the reduction takes ``alpha_p``/``beta_p``
        # in place of its original input.
        reduction = Reduce(per_element_t, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf,
            meter_trf.output,
            alpha_p=meter_trf.alpha,
            beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)
        return plan
Beispiel #7
0
    def _build_plan(self, plan_factory, device_params, output, matrix, vector):
        """Plan a sum along the last axis of ``matrix`` scaled by ``vector``.

        Each matrix element is multiplied by the vector element selected by
        the element's last index; the products are then summed along the last
        axis of ``matrix`` into ``output``.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the sums.
        :param matrix: array whose last axis is reduced.
        :param vector: array indexed by the last index of ``matrix``.
        :returns: the populated plan.
        """
        last_axis = len(matrix.shape) - 1

        scale_params = [
            Parameter('output', Annotation(matrix, 'o')),
            Parameter('matrix', Annotation(matrix, 'i')),
            Parameter('vector', Annotation(vector, 'i')),
        ]
        scale_code = """
            ${output.store_same}(${mul}(${matrix.load_same}, ${vector.load_idx}(${idxs[-1]})));
            """
        mul_vec = Transformation(
            scale_params,
            scale_code,
            render_kwds={'mul': functions.mul(matrix.dtype, vector.dtype)},
            connectors=['output', 'matrix'])

        summation = Reduce(matrix, predicate_sum(matrix.dtype), axes=(last_axis,))
        summation.parameter.input.connect(
            mul_vec, mul_vec.output,
            matrix=mul_vec.matrix, vector=mul_vec.vector)

        plan = plan_factory()
        plan.computation_call(summation, output, matrix, vector)
        return plan
Beispiel #8
0
    def _build_plan(self, plan_factory, device_params, output, input_):
        """Build the plan: a last-axis reduction followed by an ``IRFFT``.

        The reduction sums the ``x`` component of ``input_`` along its last
        axis into the temporary ``x0``.  The ``IRFFT`` reads ``input_`` through
        the transformation from ``get_prepare_iprfft_input`` and writes through
        the one from ``get_prepare_iprfft_output``, which additionally takes
        ``x0`` and a persistent coefficient table.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the final result.
        :param input_: input array; ``N`` is four times the size of its last
            axis.
        :returns: the populated plan.
        """
        plan = plan_factory()

        N = input_.shape[-1] * 4
        last_axis = len(input_.shape) - 1

        # The first element is unused
        inverse_sines = 1 / (4 * numpy.sin(2 * numpy.pi * numpy.arange(1, N // 2) / N))
        coeffs = numpy.concatenate([[0], inverse_sines])
        coeffs_arr = plan.persistent_array(coeffs)

        prepare_iprfft_input = get_prepare_iprfft_input(input_)
        prepare_iprfft_output = get_prepare_iprfft_output(output)

        irfft = IRFFT(prepare_iprfft_input.Y)
        irfft.parameter.input.connect(
            prepare_iprfft_input, prepare_iprfft_input.Y,
            X=prepare_iprfft_input.X)
        irfft.parameter.output.connect(
            prepare_iprfft_output, prepare_iprfft_output.y,
            x=prepare_iprfft_output.x,
            x0=prepare_iprfft_output.x0,
            coeffs=prepare_iprfft_output.coeffs)

        # Feeds only the ``x`` component of each input element to the reduction.
        real = Transformation(
            [
                Parameter(
                    'output',
                    Annotation(Type(dtypes.real_for(input_.dtype), input_.shape), 'o')),
                Parameter('input', Annotation(input_, 'i')),
            ],
            """
            ${output.store_same}((${input.load_same}).x);
            """,
            connectors=['output'])

        rd_t = Type(output.dtype, input_.shape)
        rd = Reduce(rd_t, predicate_sum(rd_t.dtype), axes=(last_axis,))
        rd.parameter.input.connect(real, real.output, X=real.input)

        x0 = plan.temp_array_like(rd.parameter.output)

        plan.computation_call(rd, x0, input_)
        plan.computation_call(irfft, output, x0, coeffs_arr, input_)

        return plan
Beispiel #9
0
def test_normal(thr, shape, axis):
    """Check an int64 sum ``Reduce`` against ``numpy``'s ``sum``.

    :param thr: thread object used to upload data and compile.
    :param shape: shape of the generated test array.
    :param axis: axis to reduce over, or ``None`` to pass ``axes=None``.
    """
    reduce_axes = None if axis is None else (axis,)

    host_data = get_test_array(shape, numpy.int64)
    device_data = thr.to_device(host_data)

    reduction = Reduce(host_data, predicate_sum(numpy.int64), axes=reduce_axes)

    device_result = thr.empty_like(reduction.parameter.output)
    expected = host_data.sum(axis)

    compiled = reduction.compile(thr)
    compiled(device_result, device_data)

    actual = device_result.get()
    assert diff_is_negligible(actual, expected)
Beispiel #10
0
def test_nonsequential_axes(thr):
    """Check a sum ``Reduce`` over the non-adjacent axes 0 and 2.

    :param thr: thread object used to upload data and compile.
    """
    reduced_axes = (0, 2)

    host_data = get_test_array((50, 40, 30, 20), numpy.int64)
    device_data = thr.to_device(host_data)

    # Summing axis 0 first shifts the original axis 2 down to position 1.
    after_first_axis = host_data.sum(0)
    expected = after_first_axis.sum(1)

    reduction = Reduce(device_data, predicate_sum(numpy.int64), axes=reduced_axes)

    device_result = thr.empty_like(reduction.parameter.output)

    compiled = reduction.compile(thr)
    compiled(device_result, device_data)

    assert diff_is_negligible(device_result.get(), expected)
Beispiel #11
0
def test_nonsequential_axes(thr):
    """Verify that ``Reduce`` handles ``axes=(0, 2)`` on a 4-D int64 array.

    :param thr: thread object used to upload data and compile.
    """
    dims = (50, 40, 30, 20)
    source = get_test_array(dims, numpy.int64)
    source_dev = thr.to_device(source)

    # Reference: sum over axes 0 and 2 of the initial array (axis 2 becomes
    # axis 1 once axis 0 is gone).
    reference = source.sum(0).sum(1)

    summation = Reduce(source_dev, predicate_sum(numpy.int64), axes=(0, 2))
    target_dev = thr.empty_like(summation.parameter.output)

    run = summation.compile(thr)
    run(target_dev, source_dev)

    computed = target_dev.get()
    assert diff_is_negligible(computed, reference)
Beispiel #12
0
def test_normal(thr, shape, axis):
    """Compare a compiled int64 sum reduction with ``ndarray.sum``.

    :param thr: thread object used to upload data and compile.
    :param shape: shape of the generated test array.
    :param axis: single axis to reduce, or ``None`` (forwarded as
        ``axes=None``).
    """
    source = get_test_array(shape, numpy.int64)
    source_dev = thr.to_device(source)

    if axis is not None:
        axes = (axis,)
    else:
        axes = None

    summation = Reduce(source, predicate_sum(numpy.int64), axes=axes)

    target_dev = thr.empty_like(summation.parameter.output)
    reference = source.sum(axis)

    run = summation.compile(thr)
    run(target_dev, source_dev)

    assert diff_is_negligible(target_dev.get(), reference)
Beispiel #13
0
def check_scan(
        thr, shape, axes, exclusive=False, measure_time=False,
        dtype=numpy.int64, max_work_group_size=None, predicate=None,
        seq_size=None):
    """Run ``Scan`` on a test array and compare it with ``ref_scan``.

    The comparison only works if a custom ``predicate`` is functionally
    equivalent to ``predicate_sum``.

    :param thr: thread object used to upload data, compile and synchronize.
    :param shape: shape of the generated test array.
    :param axes: axes forwarded to ``Scan`` and ``ref_scan``.
    :param exclusive: forwarded to ``Scan`` and ``ref_scan``.
    :param measure_time: if true, run the scan ten times and time each run.
    :param dtype: element type of the test array.
    :param max_work_group_size: forwarded to ``Scan``.
    :param predicate: predicate to scan with; defaults to
        ``predicate_sum(dtype)``.
    :param seq_size: forwarded to ``Scan``.
    :returns: the fastest measured time, or ``None`` when ``measure_time``
        is false.
    """
    scan_predicate = predicate_sum(dtype) if predicate is None else predicate

    host_data = get_test_array(shape, dtype)

    scan_kwds = dict(
        axes=axes,
        exclusive=exclusive,
        max_work_group_size=max_work_group_size,
        seq_size=seq_size)
    compiled = Scan(host_data, scan_predicate, **scan_kwds).compile(thr)

    device_data = thr.to_device(host_data)
    # The output buffer is pre-filled with -1 rather than left uninitialized.
    device_result = thr.to_device(numpy.ones_like(host_data) * (-1))

    if not measure_time:
        compiled(device_result, device_data)
        min_time = None
    else:
        durations = []
        for _ in range(10):
            started = time.time()
            compiled(device_result, device_data)
            thr.synchronize()
            durations.append(time.time() - started)
        min_time = min(durations)

    actual = device_result.get()
    expected = ref_scan(host_data, axes=axes, exclusive=exclusive)

    assert diff_is_negligible(expected, actual)

    return min_time
Beispiel #14
0
    def __init__(self, arr_t, order=2, axes=None):
        """Set up the internal reduction and declare the public signature.

        A sum ``Reduce`` is created whose input goes through
        ``norm_const(arr_t, order)`` and whose output goes through
        ``norm_const(<reduction output type>, 1. / order)``.  The connected
        reduction is stored in ``self._rd``.

        :param arr_t: array-like describing the input.
        :param order: value passed to the element-wise ``norm_const``; its
            reciprocal is passed to the ``norm_const`` applied to the result.
        :param axes: axes forwarded to ``Reduce``.
        """
        elementwise = norm_const(arr_t, order)
        reduced_dtype = elementwise.output.dtype

        reduction = Reduce(
            Type(reduced_dtype, arr_t.shape),
            predicate_sum(reduced_dtype),
            axes=axes)

        # Capture the result type before the output gets a transformation attached.
        result_t = reduction.parameter.output
        finalize = norm_const(result_t, 1. / order)

        reduction.parameter.input.connect(
            elementwise, elementwise.output, input_prime=elementwise.input)
        reduction.parameter.output.connect(
            finalize, finalize.input, output_prime=finalize.output)

        self._rd = reduction

        signature = [
            Parameter('output', Annotation(result_t, 'o')),
            Parameter('input', Annotation(arr_t, 'i')),
        ]
        Computation.__init__(self, signature)
Beispiel #15
0
    def _build_plan(self, plan_factory, device_params, output, input_):
        """Populate a plan with a reduction and a transformed ``IRFFT``.

        First the ``x`` component of ``input_`` is summed along the last axis
        into the temporary ``x0``.  Then an ``IRFFT`` is called whose input is
        produced by ``get_prepare_iprfft_input`` and whose output is consumed
        by ``get_prepare_iprfft_output`` together with ``x0`` and a persistent
        coefficient table.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the final result.
        :param input_: input array; ``N`` is four times the size of its last
            axis.
        :returns: the populated plan.
        """
        plan = plan_factory()

        N = input_.shape[-1] * 4

        # The first element is unused
        k = numpy.arange(1, N // 2)
        coeffs = numpy.concatenate(
            [[0], 1 / (4 * numpy.sin(2 * numpy.pi * k / N))])
        coeffs_arr = plan.persistent_array(coeffs)

        prepare_iprfft_input = get_prepare_iprfft_input(input_)
        prepare_iprfft_output = get_prepare_iprfft_output(output)

        irfft = IRFFT(prepare_iprfft_input.Y)
        irfft.parameter.input.connect(
            prepare_iprfft_input,
            prepare_iprfft_input.Y,
            X=prepare_iprfft_input.X)
        irfft.parameter.output.connect(
            prepare_iprfft_output,
            prepare_iprfft_output.y,
            x=prepare_iprfft_output.x,
            x0=prepare_iprfft_output.x0,
            coeffs=prepare_iprfft_output.coeffs)

        # Passes only the ``x`` component of each input element on to the sum.
        real_t = Type(dtypes.real_for(input_.dtype), input_.shape)
        real = Transformation(
            [
                Parameter('output', Annotation(real_t, 'o')),
                Parameter('input', Annotation(input_, 'i')),
            ],
            """
            ${output.store_same}((${input.load_same}).x);
            """,
            connectors=['output']
            )

        rd_t = Type(output.dtype, input_.shape)
        rd = Reduce(rd_t, predicate_sum(rd_t.dtype), axes=(len(input_.shape) - 1,))
        rd.parameter.input.connect(real, real.output, X=real.input)

        x0 = plan.temp_array_like(rd.parameter.output)

        plan.computation_call(rd, x0, input_)
        plan.computation_call(irfft, output, x0, coeffs_arr, input_)

        return plan
Beispiel #16
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):
        """Plan a sum over axis 0 of a per-(sample, order) product over modes.

        The attached transformation produces a ``float64`` array of shape
        ``(alpha.shape[0], self._max_click_order)``.  For the element at
        ``(sample, k)`` it sets ``order = k + 1`` and multiplies, over all
        ``self._system.modes`` modes ``i``, either ``exp(-alpha*beta)``
        (when ``i >= order``) or ``1 - exp(-alpha*beta)`` (otherwise); the
        ``x`` component of that product is stored.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the reduced values.
        :param alpha: first input, indexed as ``(sample, mode)``.
        :param beta: second input, indexed as ``(sample, mode)``.
        :returns: the populated plan.
        """
        plan = plan_factory()

        sample_count = alpha.shape[0]
        per_order_t = Type(numpy.float64, (sample_count, self._max_click_order))

        trf_signature = [
            Parameter('output', Annotation(per_order_t, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
        ]
        trf_code = """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]} + 1;

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i < ${modes}; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    ${alpha.ctype} np = ${exp_c}(COMPLEX_CTR(${alpha.ctype})(-t.x, -t.y));

                    if (i >= order) {
                        result = ${mul_cc}(result, np);
                    }
                    else {
                        ${alpha.ctype} cp = COMPLEX_CTR(${alpha.ctype})(1 - np.x, -np.y);
                        result = ${mul_cc}(result, cp);
                    }
                }

                ${output.store_same}(result.x);
                """
        trf_render_kwds = {
            'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
            'exp_c': functions.exp(alpha.dtype),
            'modes': self._system.modes,
        }
        meter_trf = Transformation(
            trf_signature, trf_code, render_kwds=trf_render_kwds)

        reduction = Reduce(per_order_t, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf,
            meter_trf.output,
            alpha_p=meter_trf.alpha,
            beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)
        return plan
Beispiel #17
0
    def _build_plan(
            self, plan_factory, device_params,
            ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):
        """Plan three calls: a mean, a matrix-vector product, and the key kernel.

        1. ``noises_b`` is summed and divided by its element count, giving
           the temporary ``noises_b_mean``.
        2. ``MatrixMulVector`` is applied to ``noises_a`` and ``out_key``,
           giving the temporary ``b_term``.
        3. A ``PureParallel`` computation rendered from the
           ``make_lwe_keyswitch_key`` template, guided by ``ks_b``, writes
           ``ks_a``, ``ks_b`` and ``ks_cv``.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param ks_a, ks_b, ks_cv: output arrays of the final kernel.
        :param in_key: input of the final kernel.
        :param out_key: second operand of the matrix-vector product.
        :param noises_a, noises_b: noise inputs.
        :returns: the populated plan.
        """
        plan = plan_factory()

        # The unpacked names are not used below; the statement is kept because
        # it fails unless ``ks_a`` has exactly four dimensions.
        extracted_n, t, base, inner_n = ks_a.shape

        # Sum of noises_b with a division by the element count attached to
        # the output, which is exposed under the name ``mean``.
        mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
        element_count = numpy.prod(noises_b.shape)
        norm = transformations.div_const(mean.parameter.output, element_count)
        mean.parameter.output.connect(norm, norm.input, mean=norm.output)

        noises_b_mean = plan.temp_array_like(mean.parameter.mean)

        mul_key = MatrixMulVector(noises_a)
        b_term = plan.temp_array_like(mul_key.parameter.output)

        written = [('ks_a', ks_a), ('ks_b', ks_b), ('ks_cv', ks_cv)]
        read = [
            ('in_key', in_key),
            ('b_term', b_term),
            ('noises_a', noises_a),
            ('noises_b', noises_b),
            ('noises_b_mean', noises_b_mean),
        ]
        kernel_params = (
            [Parameter(name, Annotation(arr, 'o')) for name, arr in written]
            + [Parameter(name, Annotation(arr, 'i')) for name, arr in read])

        kernel_code = Snippet(
            TEMPLATE.get_def("make_lwe_keyswitch_key"),
            render_kwds=dict(
                log2_base=self._log2_base,
                output_size=self._output_size,
                double_to_t32=double_to_t32_module,
                noise=self._noise))

        build_keyswitch = PureParallel(
            kernel_params, kernel_code, guiding_array="ks_b")

        plan.computation_call(mean, noises_b_mean, noises_b)
        plan.computation_call(mul_key, b_term, noises_a, out_key)
        plan.computation_call(
            build_keyswitch,
            ks_a, ks_b, ks_cv,
            in_key, b_term, noises_a, noises_b, noises_b_mean)

        return plan
Beispiel #18
0
def check_scan(
        thr, shape, axes, exclusive=False,
        measure_time=False, dtype=numpy.int64, max_work_group_size=None, predicate=None,
        seq_size=None):
    """Execute a compiled ``Scan`` and assert it matches ``ref_scan``.

    Note: the comparison will only work if a custom predicate is
    functionally equivalent to ``predicate_sum``.

    :param thr: thread object used to upload data, compile and synchronize.
    :param shape: shape of the generated test array.
    :param axes: axes forwarded to ``Scan`` and ``ref_scan``.
    :param exclusive: forwarded to ``Scan`` and ``ref_scan``.
    :param measure_time: when true the scan is launched ten times and timed.
    :param dtype: element type of the test array.
    :param max_work_group_size: forwarded to ``Scan``.
    :param predicate: scan predicate; ``predicate_sum(dtype)`` if ``None``.
    :param seq_size: forwarded to ``Scan``.
    :returns: smallest measured duration, or ``None`` if nothing was timed.
    """
    if predicate is None:
        predicate = predicate_sum(dtype)

    source = get_test_array(shape, dtype)

    scan_computation = Scan(
        source,
        predicate,
        axes=axes,
        exclusive=exclusive,
        max_work_group_size=max_work_group_size,
        seq_size=seq_size)
    run = scan_computation.compile(thr)

    source_dev = thr.to_device(source)
    # Start from a buffer of -1 values instead of an uninitialized one.
    filler = numpy.ones_like(source) * (-1)
    target_dev = thr.to_device(filler)

    min_time = None
    if measure_time:
        n_attempts = 10
        for _ in range(n_attempts):
            tick = time.time()
            run(target_dev, source_dev)
            thr.synchronize()
            elapsed = time.time() - tick
            if min_time is None or elapsed < min_time:
                min_time = elapsed
    else:
        run(target_dev, source_dev)

    computed = target_dev.get()
    reference = ref_scan(source, axes=axes, exclusive=exclusive)

    assert diff_is_negligible(reference, computed)

    return min_time
Beispiel #19
0
def sample(data, t, samplers):
    """Evaluate every sampler at time ``t`` and collect per-sampler statistics.

    Each sampler is called as ``sampler(data, t)``.  If it raises
    ``StopIntegration``, the exception's first argument is used as the
    sampler's result and the returned stop flag is set.

    For every sampler the output dictionary holds ``trajectories`` (size of
    the result's first axis) and ``time``; ``values``, ``mean`` and ``stderr``
    are added unless the sampler's ``no_values`` / ``no_mean`` / ``no_stderr``
    attribute is true.  ``numpy`` results are processed on the host; any other
    result is treated as a device array with ``thread`` and ``get()``.

    NOTE(review): on the device path ``stderr`` is derived from the order-2
    ``EntrywiseNorm`` along axis 0, with no visible mean subtraction, while the
    host path uses ``ndarray.std`` -- confirm the two are meant to agree.

    :param data: passed through to every sampler.
    :param t: time value passed to every sampler and stored in the output.
    :param samplers: mapping from key to sampler callable.
    :returns: ``(sample_dict, stop_integration)``.
    """
    collected = {}
    stop_requested = False

    for name, sampler in samplers.items():

        try:
            drawn = sampler(data, t)
        except StopIntegration as exc:
            drawn = exc.args[0]
            stop_requested = True

        entry = dict(trajectories=drawn.shape[0], time=t)
        collected[name] = entry

        if not isinstance(drawn, numpy.ndarray):
            # Device array: statistics are computed with compiled reductions.
            thr = drawn.thread

            if not sampler.no_values:
                entry['values'] = drawn.get()
            if not sampler.no_mean:
                summation = Reduce(
                    drawn, predicate_sum(drawn.dtype), axes=(0,)).compile(thr)
                total_dev = thr.empty_like(summation.parameter.output)
                summation(total_dev, drawn)
                entry['mean'] = total_dev.get() / drawn.shape[0]
            if not sampler.no_stderr:
                norm2 = EntrywiseNorm(drawn, order=2, axes=(0,)).compile(thr)
                norm_dev = thr.empty_like(norm2.parameter.output)
                norm2(norm_dev, drawn)
                spread = norm_dev.get() / numpy.sqrt(drawn.shape[0])
                entry['stderr'] = spread / numpy.sqrt(drawn.shape[0])
            continue

        # Host array: plain numpy statistics along the first axis.
        if not sampler.no_values:
            entry['values'] = drawn.copy()
        if not sampler.no_mean:
            entry['mean'] = drawn.mean(0)
        if not sampler.no_stderr:
            entry['stderr'] = drawn.std(0) / numpy.sqrt(drawn.shape[0])

    return collected, stop_requested
Beispiel #20
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):
        """Plan a sum over axis 0 of a per-(sample, order) running product.

        The attached transformation produces a ``float64`` array of shape
        ``(alpha.shape[0], self._max_moment)``.  For the element at
        ``(sample, order)`` it multiplies, for ``i`` from 0 to ``order``
        inclusive, the product ``alpha * beta`` at ``(sample, i)`` with the
        value rendered from ``ordering(self._representation)`` subtracted from
        its ``x`` component; the ``x`` component of the result is stored.

        :param plan_factory: callable producing an empty plan.
        :param device_params: not used by this method.
        :param output: array receiving the reduced values.
        :param alpha: first input, indexed as ``(sample, i)``.
        :param beta: second input, indexed as ``(sample, i)``.
        :returns: the populated plan.
        """
        plan = plan_factory()

        sample_count = alpha.shape[0]
        per_moment_t = Type(numpy.float64, (sample_count, self._max_moment))

        trf_signature = [
            Parameter('output', Annotation(per_moment_t, 'o')),
            Parameter('alpha', Annotation(alpha, 'i')),
            Parameter('beta', Annotation(beta, 'i')),
        ]
        trf_code = """
                VSIZE_T sample_idx = ${idxs[0]};
                VSIZE_T order = ${idxs[1]};

                ${alpha.ctype} result = COMPLEX_CTR(${alpha.ctype})(1, 0);
                for (VSIZE_T i = 0; i <= order; i++) {
                    ${alpha.ctype} alpha = ${alpha.load_idx}(sample_idx, i);
                    ${beta.ctype} beta = ${beta.load_idx}(sample_idx, i);
                    ${alpha.ctype} t = ${mul_cc}(alpha, beta);
                    t.x -= ${ordering};
                    result = ${mul_cc}(result, t);
                }
                ${output.store_same}(result.x);
                """
        trf_render_kwds = {
            'mul_cc': functions.mul(alpha.dtype, alpha.dtype),
            'ordering': ordering(self._representation),
        }
        meter_trf = Transformation(
            trf_signature, trf_code, render_kwds=trf_render_kwds)

        reduction = Reduce(per_moment_t, predicate_sum(output.dtype), axes=(0,))
        reduction.parameter.input.connect(
            meter_trf,
            meter_trf.output,
            alpha_p=meter_trf.alpha,
            beta_p=meter_trf.beta)

        plan.computation_call(reduction, output, alpha, beta)
        return plan
Beispiel #21
0
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):
        """Plan the prepare/aggregate kernels, a reduction, and an FFT.

        Steps added to the plan:

        1. ``compound_click_probability_prepare`` fills the temporary
           ``prepared_state`` from ``alpha`` and ``beta``.
        2. ``compound_click_probability_aggregate`` fills the temporary
           ``products`` of shape ``(samples, self._max_total_clicks + 1)``.
           If the kernel cannot be built with the current ``read_size``
           (``OutOfResourcesError``), ``read_size`` is halved and the call is
           retried.
        3. ``products`` is summed over axis 0 into ``temp``.
        4. An ``FFT`` of ``temp`` is called with an output transformation that
           stores only the ``x`` component into ``output``.

        :param plan_factory: callable producing an empty plan.
        :param device_params: supplies ``local_mem_size`` and
            ``max_work_group_size``.
        :param output: array receiving the final values.
        :param alpha: first input, shape ``(samples, modes)``.
        :param beta: second input.
        :returns: the populated plan.
        """
        plan = plan_factory()

        samples, modes = alpha.shape
        output_size = self._max_total_clicks + 1

        for_reduction = Type(alpha.dtype, (samples, output_size))

        prepared_state = plan.temp_array_like(alpha)

        plan.kernel_call(
            TEMPLATE.get_def("compound_click_probability_prepare"),
            [prepared_state, alpha, beta],
            kernel_name="compound_click_probability_prepare",
            global_size=alpha.shape,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        # Block size is limited by the amount of available local memory.
        # In some OpenCL implementations the number reported cannot actually be fully used
        # (because it's used by kernel arguments), so we're padding it a little.
        local_mem_size = device_params.local_mem_size
        max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
        block_size = 2**helpers.log2(max_elems)

        # No reason to have block size larger than the number of modes
        block_size = min(block_size, helpers.bounding_power_of_2(modes))

        products_gsize = (
            samples,
            helpers.min_blocks(output_size, block_size) * block_size)
        products = plan.temp_array_like(for_reduction)

        read_size = min(block_size, device_params.max_work_group_size)

        # These depend only on block_size, so they are computed once.
        full_steps = modes // block_size
        remainder_size = modes % block_size

        # Retry with a smaller work group until the kernel fits.  The `break`
        # must only be reached after a successful call; on failure we halve
        # `read_size` and go around again.
        # NOTE(review): if `read_size` starts at (or drops to) 1 the aggregate
        # kernel is never added and `products` stays unfilled -- confirm whether
        # that case should raise instead.
        while read_size > 1:
            try:
                plan.kernel_call(
                    TEMPLATE.get_def("compound_click_probability_aggregate"),
                    [products, prepared_state],
                    kernel_name="compound_click_probability_aggregate",
                    global_size=products_gsize,
                    local_size=(1, read_size,),
                    render_kwds=dict(
                        block_size=block_size,
                        read_size=read_size,
                        full_steps=full_steps,
                        remainder_size=remainder_size,
                        output_size=output_size,
                        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                        add_cc=functions.add(alpha.dtype, alpha.dtype),
                        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                        modes=self._system.modes,
                        max_total_clicks=self._max_total_clicks,
                        ))
            except OutOfResourcesError:
                read_size //= 2
                continue

            break

        reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

        temp = plan.temp_array_like(reduction.parameter.output)

        plan.computation_call(reduction, temp, products)

        fft = FFT(temp)
        real_trf = Transformation([
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
            ],
            """
                ${input.ctype} val = ${input.load_same};
                ${output.store_same}(val.x);
                """)
        fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

        # The trailing True is the FFT's third argument (presumably the
        # inverse-transform flag -- confirm against the FFT signature).
        plan.computation_call(fft, output, temp, True)

        return plan
Beispiel #22
0
def solve(nufft, gy, solver=None, maxiter=30, *args, **kwargs):
    """
    The solve function of NUFFT_hsa.

    Supported solvers are 'cg', 'L1TVOLS' and 'L1TVLAD'.  'dc' (density
    compensation) is recognised but not implemented on the device.  Any other
    value, including the default ``None``, matches no branch and the function
    returns ``None``.

    :param nufft: NUFFT_hsa object
    :param gy: (M,) or (M, batch) array, non-uniform data. If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
    :type gy: numpy.complex64 reikna array
    :param solver: name of the solver to run (see above).
    :param maxiter: number of iterations; for 'cg' the loop always runs
        exactly ``maxiter`` times.
    :param args: extra positional arguments forwarded to the L1TV solvers.
    :param kwargs: extra keyword arguments forwarded to the L1TV solvers.
    :return: x: Nd or Nd + (batch, ) image. L1TVOLS always returns Nd. 'cg' returns Nd + (batch, ) in batch mode.
    :rtype: x: reikna array, complex64.
    :raises NotImplementedError: if ``solver == 'dc'``.
    """
    if 'L1TVLAD' == solver:
        return L1TVLAD(nufft, gy, maxiter=maxiter, *args, **kwargs)

    if 'L1TVOLS' == solver:
        return L1TVOLS(nufft, gy, maxiter=maxiter, *args, **kwargs)

    if 'dc' == solver:
        # Density compensation would compute x2 = adjoint(W * gy), but that
        # code path is disabled here.  Previously this branch fell through to
        # `return x2` with x2 never assigned (UnboundLocalError).
        print(solver, ":density compensation method. I won't recommend it as the GPU version is not needed! Try the CPU version")
        raise NotImplementedError(
            "solver 'dc' is not implemented for the device version; "
            "use the CPU version instead")

    if 'cg' != solver:
        # Unknown (or missing) solver: no branch applies.
        return None

    from reikna.algorithms import Reduce, predicate_sum

    thr = nufft.thr
    prg = nufft.prg
    # Every element-wise kernel below runs over the same number of work items.
    n_items = int(nufft.batch * nufft.Kdprod)

    # Define the reduction kernel on the device.  The predicate uses the same
    # dtype as the array the reduction is built for.
    nufft.reduce_sum = Reduce(
        numpy.zeros(nufft.Kd, dtype=nufft.dtype),
        predicate_sum(nufft.dtype)).compile(thr)

    def new_scalar():
        # Uninitialised device buffer shaped like the reduction output.
        return thr.empty_like(nufft.reduce_sum.parameter.output)

    # b = spH * gy
    b = nufft._y2k_device(gy)

    # Initialize x = b
    x = thr.copy_array(b)

    # Initialize r = b - A * x
    r = thr.empty_like(b)
    Ax = nufft._y2k_device(nufft._k2y_device(x))
    prg.cAddVec(b, -Ax, r, local_size=None, global_size=n_items)

    # p = r
    p = thr.copy_array(r)

    # rsold = r' * r
    tmp_array = thr.empty_like(r)
    prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=n_items)
    rsold = new_scalar()
    nufft.reduce_sum(rsold, tmp_array)

    rsnew = new_scalar()
    tmp_sum = new_scalar()

    for pp in range(0, maxiter):

        Ap = nufft._y2k_device(nufft._k2y_device(p))

        # alpha = rsold / (p' * Ap); fetched to the host once per iteration
        prg.cMultiplyConjVec(p, Ap, tmp_array, local_size=None, global_size=n_items)
        nufft.reduce_sum(tmp_sum, tmp_array)
        alpha = (rsold / tmp_sum).get()

        # x = x + alpha * p
        p2 = thr.copy_array(p)
        prg.cMultiplyScalar(alpha, p2, local_size=None, global_size=n_items)
        x += p2

        # r = r - alpha * Ap
        p2 = thr.copy_array(Ap)
        prg.cMultiplyScalar(alpha, p2, local_size=None, global_size=n_items)
        r -= p2

        # rsnew = r' * r
        prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=n_items)
        nufft.reduce_sum(rsnew, tmp_array)

        # p = r + (rsnew / rsold) * p
        beta = rsnew / rsold
        p2 = thr.copy_array(p)
        prg.cMultiplyScalar(beta.get(), p2, local_size=None, global_size=n_items)
        # NOTE(review): the kernel call and the array expression below both
        # appear to compute r + p2; kept as in the original -- one of them is
        # presumably redundant.
        prg.cAddVec(r, p2, p, local_size=None, global_size=n_items)
        p = r + p2

        rsold = thr.copy_array(rsnew)
    # end of iteration

    # inverse FFT: x is the solved k space
    x2 = nufft._k2xx_device(x)

    # Rescale.  If the in-place division fails (presumably because
    # nufft.volume has no 'SnGPUArray' entry -- confirm), fall back to the
    # tensor-form kernel.
    try:
        x2 /= nufft.volume['SnGPUArray']
    except Exception:
        prg.cTensorMultiply(
            numpy.uint32(nufft.batch),
            numpy.uint32(nufft.tSN['Tdims']),
            nufft.tSN['Td'],
            nufft.tSN['Td_elements'],
            nufft.tSN['invTd_elements'],
            nufft.tSN['tensor_sn'],
            x2,
            numpy.uint32(1),  # division, 1 is true
            local_size=None,
            global_size=int(nufft.batch * nufft.Ndprod))

    return x2