Example #1
def logistic(context, activations, bias, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = activations

    key = (logistic, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]

        kernel = PureParallel([
            Parameter('activations', Annotation(activations, 'i')),
            Parameter('bias', Annotation(bias, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ],
                              """
        ${activations.ctype} a = ${activations.load_same};
        ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});

        a += b;
        a = min(max(-45.0f, a), 45.0f);
        a = 1.0f / (1.0f + exp(-a));

        ${dest.store_same}(a);
        """,
                              guiding_array='activations')

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)

    return dest
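
For reference, a minimal NumPy sketch of what this kernel computes element-wise
(the function name and shapes are illustrative, not part of the library; bias is
broadcast along the second axis of activations, matching the ${bias.load_idx}(${idxs[1]})
call above):

import numpy

def logistic_reference(activations, bias):
    # activations: (batch, units), bias: (units,); clamp to [-45, 45] before
    # exponentiating, exactly as the kernel does to avoid overflow in exp().
    a = numpy.clip(activations + bias, -45.0, 45.0)
    return 1.0 / (1.0 + numpy.exp(-a))
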
Example #2
def classification_delta_kernel(ctx, outputs, targets, deltas):
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert outputs.shape[0] == targets.shape[0] == deltas.shape[0]
    assert len(targets.shape) == 1
    assert targets.dtype == numpy.int32
    assert outputs.shape[1] == deltas.shape[1]

    key = (classification_delta_kernel, outputs.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('outputs', Annotation(outputs, 'i')),
            Parameter('targets', Annotation(targets, 'i')),
            Parameter('deltas', Annotation(deltas, 'o'))
        ],
                              """
        ${outputs.ctype} out = ${outputs.load_same};
        SIZE_T t = ${targets.load_idx}(${idxs[0]});
        SIZE_T idx = ${idxs[1]};
        ${deltas.ctype} d;
        if (t == idx) {
            d = 1.0f - out;
        } else {
            d = -out;
        }
        ${deltas.store_same}(d);
        """,
                              guiding_array='deltas')

        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](outputs, targets, deltas)
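
The kernel above writes (1 - out) at the target class and -out everywhere else,
i.e. a one-hot target minus the network output. A hedged NumPy equivalent (names
and shapes illustrative):

import numpy

def classification_delta_reference(outputs, targets):
    # outputs: (batch, classes) float32, targets: (batch,) int32 class indices
    one_hot = numpy.zeros_like(outputs)
    one_hot[numpy.arange(outputs.shape[0]), targets] = 1.0
    return one_hot - outputs
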
Example #3
def test_guiding_output(thr):

    N = 1000
    dtype = numpy.float32

    p = PureParallel([
        Parameter('output', Annotation(Type(dtype, shape=N), 'o')),
        Parameter('input', Annotation(Type(dtype, shape=(2, N)), 'i'))
    ],
                     """
        float t1 = ${input.load_idx}(0, ${idxs[0]});
        float t2 = ${input.load_idx}(1, ${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """,
                     guiding_array='output')

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = a[0] + a[1]

    assert diff_is_negligible(res_dev.get(), res_ref)
Example #4
def test_guiding_output(thr):

    N = 1000
    dtype = numpy.float32

    p = PureParallel(
        [
            Parameter('output', Annotation(Type(dtype, shape=N), 'o')),
            Parameter('input', Annotation(Type(dtype, shape=(2, N)), 'i'))],
        """
        float t1 = ${input.load_idx}(0, ${idxs[0]});
        float t2 = ${input.load_idx}(1, ${idxs[0]});
        ${output.store_idx}(${idxs[0]}, t1 + t2);
        """,
        guiding_array='output')

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = a[0] + a[1]

    assert diff_is_negligible(res_dev.get(), res_ref)
Example #5
def logistic(context, activations, bias, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = activations

    key = (logistic, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        assert activations.shape[1] == bias.shape[0]

        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
        ${activations.ctype} a = ${activations.load_same};
        ${bias.ctype} b = ${bias.load_idx}(${idxs[1]});

        a += b;
        a = min(max(-45.0f, a), 45.0f);
        a = 1.0f / (1.0f + exp(-a));

        ${dest.store_same}(a);
        """, guiding_array='activations')

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, bias, dest)

    return dest
Example #6
def classification_delta_kernel(ctx, outputs, targets, deltas):
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    assert outputs.shape[0] == targets.shape[0] == deltas.shape[0]
    assert len(targets.shape) == 1
    assert targets.dtype == numpy.int32
    assert outputs.shape[1] == deltas.shape[1]


    key = (classification_delta_kernel, outputs.shape)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('outputs', Annotation(outputs, 'i')),
                Parameter('targets', Annotation(targets, 'i')),
                Parameter('deltas', Annotation(deltas, 'o'))
            ],
        """
        ${outputs.ctype} out = ${outputs.load_same};
        SIZE_T t = ${targets.load_idx}(${idxs[0]});
        SIZE_T idx = ${idxs[1]};
        ${deltas.ctype} d;
        if (t == idx) {
            d = 1.0f - out;
        } else {
            d = -out;
        }
        ${deltas.store_same}(d);
        """, guiding_array='deltas')

        kernel_cache[key] = kernel.compile(thread)

    # Run kernel
    kernel_cache[key](outputs, targets, deltas)
Example #7
def logistic_derivative(context, activations, delta, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = delta

    key = (logistic_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel([
            Parameter('activations', Annotation(activations, 'i')),
            Parameter('delta', Annotation(activations, 'i')),
            Parameter('dest', Annotation(dest, 'o')),
        ],
                              """
        ${activations.ctype} a = ${activations.load_same};
        ${delta.ctype} d = ${delta.load_same};

        d = d*a*(1.0f - a);

        ${dest.store_same}(d);
        """,
                              guiding_array='activations')

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
Example #8
def logistic_derivative(context, activations, delta, dest=None):
    kernel_cache, thread = context.kernel_cache, context.thread

    if dest is None:
        dest = delta

    key = (logistic_derivative, activations.shape, thread)
    if key not in kernel_cache:
        log.info("compiling " + str(key))
        kernel = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('delta', Annotation(activations, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
        ${activations.ctype} a = ${activations.load_same};
        ${delta.ctype} d = ${delta.load_same};

        d = d*a*(1.0f - a);

        ${dest.store_same}(d);
        """, guiding_array='activations')

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # Run kernel
    kernel_cache[key](activations, delta, dest)
Example #9
def class_errors(ctx, expected, actual, errors):
    """ expected int32, actual float, errors int32 """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (class_errors, expected.shape)

    if key not in kernel_cache:
        # target should be an integer
        logging.info("compiling " + str(key))
        assert expected.shape == errors.shape  # one neuron per class
        assert expected.shape == (actual.shape[0], )  # index of the class
        assert actual.dtype == numpy.float32
        assert expected.dtype == numpy.int32
        assert errors.dtype == numpy.int32
        kernel = PureParallel(
            [
                Parameter('expected', Annotation(expected, 'i')),
                Parameter('actual', Annotation(actual, 'i')),
                Parameter('errors', Annotation(errors, 'o'))
            ],
            """
            SIZE_T expected = ${expected.load_idx}(${idxs[0]});
            float maximum=0.0f;
            float value;
            SIZE_T maxindex = 0;

            SIZE_T tl = ${target_length};

            // calculate argmax
            for(SIZE_T j=0; j < tl; j++) {
                value = ${actual.load_idx}(${idxs[0]}, j);

                if (value > maximum) {
                    maximum = value;
                    maxindex = j;
                }
            }

            // If the confidence is too low, return an error
            if (maximum < (1.0f / ${target_length}.0f + 0.001f)) {
                ${errors.store_same}(1);
                return;
            };

            // compare argmax
            if (maxindex != expected) {
                ${errors.store_same}(1);
            } else {
                ${errors.store_same}(0);
            }

        """,
            guiding_array='expected',
            render_kwds={'target_length': numpy.int32(actual.shape[1])})

        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](expected, actual, errors)
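
A hedged NumPy sketch of the same check (it assumes non-negative scores such as
softmax outputs, since the kernel's running maximum starts at 0.0f; names are
illustrative):

import numpy

def class_errors_reference(expected, actual):
    # expected: (batch,) int32 class indices, actual: (batch, classes) float32
    n_classes = actual.shape[1]
    confidence = actual.max(axis=1)
    predicted = actual.argmax(axis=1)
    low_confidence = confidence < (1.0 / n_classes + 0.001)
    wrong = predicted != expected
    return (low_confidence | wrong).astype(numpy.int32)
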
Example #10
def convolve2d_propagation(ctx, array, weights, dest):
    """ The output is the valid discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_propagation, weights.shape, array.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        channels, filters, owidth, oheight = (
            weights.shape[0], weights.shape[1], dest.shape[1], dest.shape[2])

        render_kwds = {
            'w0': weights.shape[2],
            'w1': weights.shape[3],
            'a0': array.shape[2],
            'a1': array.shape[3],
            'off0': int(weights.shape[2] - 1),
            'off1': int(weights.shape[3] - 1)
        }

        kernel_conv = PureParallel([
            Parameter('array', Annotation(array, 'i')),
            Parameter('weights', Annotation(weights, 'i')),
            Parameter('dest', Annotation(dest, 'o'))
        ],
                                   """
        // Array dimensions:
        // array : (channels, width, height)
        // weights: (channels, filters, fwidth, fheight)
        // dest (channels, filters, owidth, oheight)

        float a = 0.0f;
        SIZE_T x, y, i, j;
        const SIZE_T number = ${idxs[0]};
        const SIZE_T channel = ${idxs[1]};
        const SIZE_T filter = ${idxs[2]};
        const SIZE_T xout = ${idxs[3]};
        const SIZE_T yout = ${idxs[4]};
        for (i=0; i < ${w0}; i++){
            for (j=0; j < ${w1}; j++){
                x = xout - i  + ${off0};
                y = yout - j  + ${off1};
                a += ${array.load_idx}(number, channel, x, y)
                   * ${weights.load_idx}(channel, filter, i, j); // channel, filter, i, j
            }
        }

        ${dest.store_same}(a);

        """,
                                   guiding_array='dest',
                                   render_kwds=render_kwds)
        kernel_cache[key] = kernel_conv.compile(thread, fast_math=True)

    # run convolution
    kernel_cache[key](array, weights, dest)

    return dest
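
A hedged NumPy sketch of the same "valid" convolution. The shapes are inferred
from the five kernel indices, so dest is assumed to be
(n, channels, filters, owidth, oheight); the function is illustrative only:

import numpy

def convolve2d_propagation_reference(array, weights):
    # array: (n, channels, a0, a1), weights: (channels, filters, w0, w1)
    n, channels, a0, a1 = array.shape
    _, filters, w0, w1 = weights.shape
    dest = numpy.zeros((n, channels, filters, a0 - w0 + 1, a1 - w1 + 1), dtype=array.dtype)
    for i in range(w0):
        for j in range(w1):
            # x = xout - i + (w0 - 1), y = yout - j + (w1 - 1), as in the kernel above
            patch = array[:, :, w0 - 1 - i:a0 - i, w1 - 1 - j:a1 - j]
            dest += patch[:, :, None, :, :] * weights[None, :, :, i, j, None, None]
    return dest
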
Example #11
def convolve2d_propagation(ctx, array, weights, dest):
    """ The output is the valid discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_propagation, weights.shape, array.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        channels, filters, owidth, oheight = weights.shape[0], weights.shape[1], dest.shape[1], dest.shape[2]

        render_kwds = {
            'w0': weights.shape[2],
            'w1': weights.shape[3],
            'a0': array.shape[2],
            'a1': array.shape[3],
            'off0': int(weights.shape[2] - 1),
            'off1': int(weights.shape[3] - 1)
        }

        kernel_conv = PureParallel(
            [
                Parameter('array', Annotation(array, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('dest', Annotation(dest, 'o'))
            ],
            """
        // Array dimensions:
        // array : (channels, width, height)
        // weights: (channels, filters, fwidth, fheight)
        // dest (channels, filters, owidth, oheight)

        float a = 0.0f;
        SIZE_T x, y, i, j;
        const SIZE_T number = ${idxs[0]};
        const SIZE_T channel = ${idxs[1]};
        const SIZE_T filter = ${idxs[2]};
        const SIZE_T xout = ${idxs[3]};
        const SIZE_T yout = ${idxs[4]};
        for (i=0; i < ${w0}; i++){
            for (j=0; j < ${w1}; j++){
                x = xout - i  + ${off0};
                y = yout - j  + ${off1};
                a += ${array.load_idx}(number, channel, x, y)
                   * ${weights.load_idx}(channel, filter, i, j); // channel, filter, i, j
            }
        }

        ${dest.store_same}(a);

        """, guiding_array='dest', render_kwds=render_kwds)
        kernel_cache[key] = kernel_conv.compile(
            thread, fast_math=True)

    # run convolution
    kernel_cache[key](array, weights, dest)

    return dest
Example #12
def class_errors(ctx, expected, actual, errors):
    """ expected int32, actual float, errors int32 """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (class_errors, expected.shape)

    if key not in kernel_cache:
        # target should be an integer
        logging.info("compiling " + str(key))
        assert expected.shape == errors.shape # one neuron per class
        assert expected.shape == (actual.shape[0],) # index of the class
        assert actual.dtype == numpy.float32
        assert expected.dtype == numpy.int32
        assert errors.dtype == numpy.int32
        kernel = PureParallel(
            [
                Parameter('expected', Annotation(expected, 'i')),
                Parameter('actual', Annotation(actual, 'i')),
                Parameter('errors', Annotation(errors, 'o'))
            ],
            """
            SIZE_T expected = ${expected.load_idx}(${idxs[0]});
            float maximum=0.0f;
            float value;
            SIZE_T maxindex = 0;

            SIZE_T tl = ${target_length};

            // calculate argmax
            for(SIZE_T j=0; j < tl; j++) {
                value = ${actual.load_idx}(${idxs[0]}, j);

                if (value > maximum) {
                    maximum = value;
                    maxindex = j;
                }
            }

            // If the confidence is too low, return an error
            if (maximum < (1.0f / ${target_length}.0f + 0.001f)) {
                ${errors.store_same}(1);
                return;
            };

            // compare argmax
            if (maxindex != expected) {
                ${errors.store_same}(1);
            } else {
                ${errors.store_same}(0);
            }

        """, guiding_array='expected', render_kwds={'target_length' : numpy.int32(actual.shape[1])})

        kernel_cache[key] = kernel.compile(thread)

    kernel_cache[key](expected, actual, errors)
Example #13
def setitem_computation(dest, source):
    """
    Returns a compiled computation that broadcasts ``source`` to ``dest``,
    where ``dest`` is a GPU array, and ``source`` is either a GPU array or a scalar.
    """
    if len(source.shape) == 0:
        trf = transformations.broadcast_param(dest)
        return PureParallel.from_trf(trf, guiding_array=trf.output)
    else:
        source_dt = Type.from_value(source).with_dtype(dest.dtype)
        trf = transformations.copy(source_dt, dest)
        comp = PureParallel.from_trf(trf, guiding_array=trf.output)
        cast_trf = transformations.cast(source, dest.dtype)
        comp.parameter.input.connect(cast_trf, cast_trf.output, src_input=cast_trf.input)
        return comp
Example #14
    def _build_plan(self, plan_factory, device_params, a, current_variances, mu):
        plan = plan_factory()

        fill = PureParallel([
            Parameter('a', Annotation(a, 'o')),
            Parameter('current_variances', Annotation(current_variances, 'o')),
            Parameter('mu', Annotation(mu, 'i'))],
            """
            ${a.ctype} a;
            if (${idxs[-2]} == ${mask_size})
            {
                a = ${mu.load_idx}(${", ".join(idxs[:-2])}, ${idxs[-1]});
            }
            else
            {
                a = 0;
            }
            ${a.store_same}(a);

            if (${idxs[-1]} == 0)
            {
                ${current_variances.store_idx}(${", ".join(idxs[:-1])}, 0);
            }
            """,
            render_kwds=dict(mask_size=self._mask_size))

        plan.computation_call(fill, a, current_variances, mu)

        return plan
Example #15
def test_tgsw_polynomial_decomp_trf(thread):

    shape = (2, 3)
    params = NuFHEParameters()
    tgsw_params = params.tgsw_params
    decomp_length = tgsw_params.decomp_length
    mask_size = tgsw_params.tlwe_params.mask_size
    polynomial_degree = tgsw_params.tlwe_params.polynomial_degree

    sample = get_test_array(shape + (mask_size + 1, polynomial_degree), Torus32, (0, 1000))
    result = numpy.empty(shape + (mask_size + 1, decomp_length, polynomial_degree), dtype=Int32)

    sample_dev = thread.to_device(sample)
    result_dev = thread.empty_like(result)

    trf = get_tgsw_polynomial_decomp_trf(tgsw_params, shape)
    test = PureParallel.from_trf(trf, guiding_array='result').compile(thread)

    ref = tgsw_polynomial_decomp_trf_reference(tgsw_params, shape)

    test(result_dev, sample_dev)
    result_test = result_dev.get()

    ref(result, sample)

    assert (result == result_test).all()
Example #16
def test_trf_with_guiding_output(thr):
    """
    Test the creation of ``PureParallel`` out of a transformation,
    with an output parameter as a guiding array.
    """

    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)
    p = PureParallel.from_trf(trf, trf.output)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    assert diff_is_negligible(res_dev.get(), a * 3)
Example #17
def roll_computation(array, axis):
    return PureParallel([
        Parameter('output', Annotation(array, 'o')),
        Parameter('input', Annotation(array, 'i')),
        Parameter('shift', Annotation(Type(numpy.int32)))
    ],
                        """
        <%
            shape = input.shape
        %>
        %for i in range(len(shape)):
            VSIZE_T output_${idxs[i]} =
                %if i == axis:
                ${shift} == 0 ?
                    ${idxs[i]} :
                    ## Since ``shift`` can be negative, and its absolute value greater than
                    ## ``shape[i]``, a double modulo division is necessary
                    ## (the ``%`` operator preserves the sign of the dividend in C).
                    (${idxs[i]} + (${shape[i]} + ${shift} % ${shape[i]})) % ${shape[i]};
                %else:
                ${idxs[i]};
                %endif
        %endfor
        ${output.store_idx}(
            ${", ".join("output_" + name for name in idxs)},
            ${input.load_idx}(${", ".join(idxs)}));
        """,
                        guiding_array='input',
                        render_kwds=dict(axis=axis))
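
The double modulo in the template exists because C's % operator keeps the sign of
the dividend; on the host the whole index mapping is just numpy.roll. A minimal
sanity-check sketch (illustrative):

import numpy

def roll_reference(array, shift, axis):
    # The template writes input[i] to output[(i + shift) mod n] along ``axis``,
    # which is exactly what numpy.roll produces.
    return numpy.roll(array, shift, axis=axis)
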
Example #18
def test_trf_with_guiding_output(thr):
    """
    Test the creation of ``PureParallel`` out of a transformation,
    with an output parameter as a guiding array.
    """

    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)
    p = PureParallel.from_trf(trf, trf.output)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(
        trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    assert diff_is_negligible(res_dev.get(), a * 3)
Example #19
def identity(type):
    return PureParallel([
        Parameter('output', Annotation(type, 'o')),
        Parameter('input', Annotation(type, 'i'))
    ], """
        ${output.store_same}(${input.load_same});
        """)
Example #20
def test_from_trf(thr, guiding_array):
    """
    Test the creation of ``PureParallel`` out of a transformation
    with various values of the guiding array.
    """

    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)

    if guiding_array == 'input':
        arr = trf.input
    elif guiding_array == 'output':
        arr = trf.output
    elif guiding_array == 'none':
        arr = None

    p = PureParallel.from_trf(trf, guiding_array=arr)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    assert diff_is_negligible(res_dev.get(), a * 3)
Example #21
    def _build_plan(self, plan_factory, device_params, output, input_,
                    inverse):

        if helpers.product([input_.shape[i] for i in self._axes]) == 1:
            return self._build_trivial_plan(plan_factory, output, input_)

        plan = plan_factory()

        axes = tuple(sorted(self._axes))
        shape = list(input_.shape)

        if all(shape[axis] % 2 == 0 for axis in axes):
            # If all shift axes have even length, it is possible to perform the shift inplace
            # (by swapping pairs of elements).
            # Note that the inplace fftshift is its own inverse.
            shape[axes[0]] //= 2
            plan.kernel_call(TEMPLATE.get_def('fftshift_inplace'),
                             [output, input_],
                             kernel_name="kernel_fftshift_inplace",
                             global_size=shape,
                             render_kwds=dict(axes=axes))
        else:
            # Resort to an out-of-place shift to a temporary array and then copy.
            temp = plan.temp_array_like(output)
            plan.kernel_call(TEMPLATE.get_def('fftshift_outplace'),
                             [temp, input_, inverse],
                             kernel_name="kernel_fftshift_outplace",
                             global_size=shape,
                             render_kwds=dict(axes=axes))

            copy_trf = copy(input_, out_arr_t=output)
            copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
            plan.computation_call(copy_comp, output, temp)

        return plan
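
The even-length branch relies on fftshift being a swap of two half-axes (and
therefore its own inverse). A hedged single-axis NumPy sketch of that idea, not
of the actual kernel:

import numpy

def fftshift_even_inplace_reference(a, axis=0):
    # For an even-length axis, fftshift swaps the two halves, so it can be done
    # in place by exchanging element pairs and is its own inverse.
    n = a.shape[axis]
    assert n % 2 == 0
    lo = [slice(None)] * a.ndim
    hi = [slice(None)] * a.ndim
    lo[axis] = slice(0, n // 2)
    hi[axis] = slice(n // 2, n)
    a[tuple(lo)], a[tuple(hi)] = a[tuple(hi)].copy(), a[tuple(lo)].copy()
    return a

For even axis lengths this agrees with numpy.fft.fftshift along that axis.
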
Example #22
    def _build_plan(self, plan_factory, device_params, output, input_, inverse):

        if helpers.product([input_.shape[i] for i in self._axes]) == 1:
            return self._build_trivial_plan(plan_factory, output, input_)

        plan = plan_factory()

        axes = tuple(sorted(self._axes))
        shape = list(input_.shape)

        if all(shape[axis] % 2 == 0 for axis in axes):
            # If all shift axes have even length, it is possible to perform the shift inplace
            # (by swapping pairs of elements).
            # Note that the inplace fftshift is its own inverse.
            shape[axes[0]] //= 2
            plan.kernel_call(
                TEMPLATE.get_def('fftshift_inplace'), [output, input_],
                kernel_name="kernel_fftshift_inplace",
                global_size=shape,
                render_kwds=dict(axes=axes))
        else:
            # Resort to an out-of-place shift to a temporary array and then copy.
            temp = plan.temp_array_like(output)
            plan.kernel_call(
                TEMPLATE.get_def('fftshift_outplace'), [temp, input_, inverse],
                kernel_name="kernel_fftshift_outplace",
                global_size=shape,
                render_kwds=dict(axes=axes))

            copy_trf = copy(input_, out_arr_t=output)
            copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
            plan.computation_call(copy_comp, output, temp)

        return plan
Example #23
    def _build_plan(
            self, plan_factory, device_params,
            result_a, result_b, result_cv, messages, key, noises_a, noises_b):

        plan = plan_factory()

        mul_key = MatrixMulVector(noises_a)

        fill_b_cv = Transformation([
            Parameter('result_b', Annotation(result_b, 'o')),
            Parameter('result_cv', Annotation(result_cv, 'o')),
            Parameter('messages', Annotation(messages, 'i')),
            Parameter('noises_a_times_key', Annotation(noises_b, 'i')),
            Parameter('noises_b', Annotation(noises_b, 'i'))],
            """
            ${result_b.store_same}(
                ${noises_b.load_same}
                + ${messages.load_same}
                + ${noises_a_times_key.load_same});
            ${result_cv.store_same}(${noise**2});
            """,
            connectors=['noises_a_times_key'],
            render_kwds=dict(noise=self._noise))

        mul_key.parameter.output.connect(
            fill_b_cv, fill_b_cv.noises_a_times_key,
            b=fill_b_cv.result_b, cv=fill_b_cv.result_cv, messages=fill_b_cv.messages,
            noises_b=fill_b_cv.noises_b)

        plan.computation_call(mul_key, result_b, result_cv, messages, noises_b, noises_a, key)
        plan.computation_call(
            PureParallel.from_trf(transformations.copy(noises_a)),
            result_a, noises_a)

        return plan
Example #24
    def _build_plan(
            self, plan_factory, device_params,
            ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):

        plan = plan_factory()

        extracted_n, t, base, inner_n = ks_a.shape

        mul_key = MatrixMulVector(noises_a)
        b_term = plan.temp_array_like(mul_key.parameter.output)

        build_keyswitch = PureParallel([
            Parameter('ks_a', Annotation(ks_a, 'o')),
            Parameter('ks_b', Annotation(ks_b, 'o')),
            Parameter('ks_cv', Annotation(ks_cv, 'o')),
            Parameter('in_key', Annotation(in_key, 'i')),
            Parameter('b_term', Annotation(b_term, 'i')),
            Parameter('noises_a', Annotation(noises_a, 'i')),
            Parameter('noises_b', Annotation(noises_b, 'i'))],
            Snippet(
                TEMPLATE.get_def("make_lwe_keyswitch_key"),
                render_kwds=dict(
                    log2_base=self._log2_base, output_size=self._output_size,
                    noise=self._noise)),
            guiding_array="ks_b")

        plan.computation_call(mul_key, b_term, noises_a, out_key)
        plan.computation_call(
            build_keyswitch,
            ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b)

        return plan
Example #25
def test_from_trf(thr, guiding_array):
    """
    Test the creation of ``PureParallel`` out of a transformation
    with various values of the guiding array.
    """

    N = 1000
    coeff = 3
    dtype = numpy.float32

    arr_t = Type(dtype, shape=N)
    trf = mul_param(arr_t, dtype)

    if guiding_array == 'input':
        arr = trf.input
    elif guiding_array == 'output':
        arr = trf.output
    elif guiding_array == 'none':
        arr = None

    p = PureParallel.from_trf(trf, guiding_array=arr)

    # The new PureParallel has to preserve the parameter list of the original transformation.
    assert list(p.signature.parameters.values()) == list(
        trf.signature.parameters.values())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev, coeff)

    assert diff_is_negligible(res_dev.get(), a * 3)
Example #26
def setitem_computation(dest, source, is_array):
    """
    Returns a compiled computation that broadcasts ``source`` to ``dest``,
    where ``dest`` is a GPU array, and ``source`` is either a GPU array or a scalar.
    """
    if is_array:
        source_dt = Type.from_value(source).with_dtype(dest.dtype)
        trf = transformations.copy(source_dt, dest)
        comp = PureParallel.from_trf(trf, guiding_array=trf.output)
        cast_trf = transformations.cast(source, dest.dtype)
        comp.parameter.input.connect(cast_trf,
                                     cast_trf.output,
                                     src_input=cast_trf.input)
        return comp
    else:
        trf = transformations.broadcast_param(dest)
        return PureParallel.from_trf(trf, guiding_array=trf.output)
Example #27
    def __init__(self, arr):

        copy_trf = copy(arr, out_arr_t=arr)
        self._copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)

        Computation.__init__(self, [
            Parameter('outer_output', Annotation(arr, 'o')),
            Parameter('outer_input', Annotation(arr, 'i'))])
Example #28
def softmax(ctx, activations, bias, dest=None):
    """ Softmax Activation Function """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    if dest is None:
        dest = activations

    key = (softmax, activations.shape)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        # Regression hidden layer
        kernel_softmax = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            float x;
            float b;
            float s = 0.0f;
            SIZE_T tl = ${target_length};
            for(SIZE_T j=0; j < tl; j++) {
                x = ${activations.load_idx}(${idxs[0]}, j);
                b = ${bias.load_idx}(j);
                x += b;
                x = exp(min(max(x, -45.0f), 45.0f));
                ${dest.store_idx}(${idxs[0]}, j, x);

                s += x;
            }

            // divide by sum
            for(SIZE_T j=0; j < tl; j++) {
                x = ${dest.load_idx}(${idxs[0]}, j);
                x /= s;
                ${dest.store_idx}(${idxs[0]}, j, x);
            }
        """,
            guiding_array=(activations.shape[0], ),
            render_kwds={'target_length': numpy.int32(activations.shape[1])})

        kernel_cache[key] = kernel_softmax.compile(thread)

    kernel_cache[key](activations, bias, dest)
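
A hedged NumPy sketch of the row-wise softmax the kernel computes, including the
same clamp to [-45, 45] before exponentiating (names illustrative):

import numpy

def softmax_reference(activations, bias):
    # activations: (batch, classes), bias: (classes,)
    x = numpy.clip(activations + bias, -45.0, 45.0)
    e = numpy.exp(x)
    return e / e.sum(axis=1, keepdims=True)
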
Example #29
    def __init__(self, arr):

        copy_trf = copy(arr, out_arr_t=arr)
        self._copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)

        Computation.__init__(self, [
            Parameter('outer_output', Annotation(arr, 'o')),
            Parameter('outer_input', Annotation(arr, 'i'))
        ])
Example #30
def softmax(ctx, activations, bias, dest=None):
    """ Softmax Activation Function """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    if dest is None:
        dest = activations

    key = (softmax, activations.shape)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))
        # Regression hidden layer
        kernel_softmax = PureParallel(
            [
                Parameter('activations', Annotation(activations, 'i')),
                Parameter('bias', Annotation(bias, 'i')),
                Parameter('dest', Annotation(dest, 'o')),
            ],
            """
            float x;
            float b;
            float s = 0.0f;
            SIZE_T tl = ${target_length};
            for(SIZE_T j=0; j < tl; j++) {
                x = ${activations.load_idx}(${idxs[0]}, j);
                b = ${bias.load_idx}(j);
                x += b;
                x = exp(min(max(x, -45.0f), 45.0f));
                ${dest.store_idx}(${idxs[0]}, j, x);

                s += x;
            }

            // divide by sum
            for(SIZE_T j=0; j < tl; j++) {
                x = ${dest.load_idx}(${idxs[0]}, j);
                x /= s;
                ${dest.store_idx}(${idxs[0]}, j, x);
            }
        """, guiding_array=(activations.shape[0],), render_kwds={'target_length' : numpy.int32(activations.shape[1])})

        kernel_cache[key] = kernel_softmax.compile(thread)

    kernel_cache[key](activations, bias, dest)
Example #31
def get_test_computation(arr_t):
    return PureParallel([
        Parameter('output', Annotation(arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i'))
    ], """
        <%
            all_idxs = ", ".join(idxs)
        %>
        ${output.store_idx}(${all_idxs}, ${input.load_idx}(${all_idxs}));
        """)
Example #32
File: fft.py Project: ringw/reikna
    def _build_trivial_plan(self, plan_factory, output, input_):
        # Trivial problem. Need to add a dummy kernel
        # because we still have to run transformations.

        plan = plan_factory()

        copy_trf = copy(input_, out_arr_t=output)
        copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
        plan.computation_call(copy_comp, output, input_)

        return plan
Example #33
    def _build_plan(self, plan_factory, device_params, array, shift):
        plan = plan_factory()

        temp = plan.temp_array_like(array)
        plan.computation_call(roll_computation(array, self._axis), temp, array, shift)

        tr = transformations.copy(temp, out_arr_t=array)
        copy_comp = PureParallel.from_trf(tr, guiding_array=tr.output)
        plan.computation_call(copy_comp, array, temp)

        return plan
Example #34
    def _build_trivial_plan(self, plan_factory, output, input_):
        # Trivial problem. Need to add a dummy kernel
        # because we still have to run transformations.

        plan = plan_factory()

        copy_trf = copy(input_, out_arr_t=output)
        copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
        plan.computation_call(copy_comp, output, input_)

        return plan
Example #35
    def _build_plan(self, plan_factory, device_params, array, shift):
        plan = plan_factory()

        temp = plan.temp_array_like(array)
        plan.computation_call(roll_computation(array, self._axis), temp, array,
                              shift)

        tr = transformations.copy(temp, out_arr_t=array)
        copy_comp = PureParallel.from_trf(tr, guiding_array=tr.output)
        plan.computation_call(copy_comp, array, temp)

        return plan
Example #36
def test_zero_length_shape(thr):

    dtype = numpy.float32

    p = PureParallel(
        [
            Parameter('output', Annotation(Type(dtype, shape=tuple()), 'o')),
            Parameter('input', Annotation(Type(dtype, shape=tuple()), 'i'))],
        """
        float t = ${input.load_idx}();
        ${output.store_idx}(t * 2);
        """,
        guiding_array=tuple())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = (a * 2).astype(dtype)

    assert diff_is_negligible(res_dev.get(), res_ref)
Example #37
def test_zero_length_shape(thr):

    dtype = numpy.float32

    p = PureParallel([
        Parameter('output', Annotation(Type(dtype, shape=tuple()), 'o')),
        Parameter('input', Annotation(Type(dtype, shape=tuple()), 'i'))
    ],
                     """
        float t = ${input.load_idx}();
        ${output.store_idx}(t * 2);
        """,
                     guiding_array=tuple())

    a = get_test_array_like(p.parameter.input)
    a_dev = thr.to_device(a)
    res_dev = thr.empty_like(p.parameter.output)

    pc = p.compile(thr)
    pc(res_dev, a_dev)

    res_ref = (a * 2).astype(dtype)

    assert diff_is_negligible(res_dev.get(), res_ref)
Example #38
def test_tlwe_transformed_add_mul_to_trf(thread):

    shape = (2, 3)
    params = NuFHEParameters(transform_type='NTT')
    perf_params = PerformanceParameters(params).for_device(
        thread.device_params)
    tgsw_params = params.tgsw_params

    decomp_length = tgsw_params.decomp_length
    mask_size = tgsw_params.tlwe_params.mask_size
    polynomial_degree = tgsw_params.tlwe_params.polynomial_degree

    transform_type = tgsw_params.tlwe_params.transform_type
    transform = get_transform(transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    result_shape = shape + (mask_size + 1, tlength)
    sample_shape = shape + (mask_size + 1, decomp_length, tlength)
    bk_len = 10
    bootstrap_key_shape = (bk_len, mask_size + 1, decomp_length, mask_size + 1,
                           tlength)
    bk_row_idx = 2

    result = numpy.empty(result_shape, tdtype)

    sample = get_test_array(sample_shape, 'ff_number')
    bootstrap_key = get_test_array(bootstrap_key_shape, 'ff_number')

    result_dev = thread.empty_like(result)
    sample_dev = thread.to_device(sample)
    bootstrap_key_dev = thread.to_device(bootstrap_key)

    trf = get_tlwe_transformed_add_mul_to_trf(tgsw_params, shape, bk_len,
                                              perf_params)
    test = PureParallel.from_trf(trf, guiding_array='result').compile(thread)
    ref = tlwe_transformed_add_mul_to_trf_reference(tgsw_params, shape, bk_len,
                                                    perf_params)

    test(result_dev, sample_dev, bootstrap_key_dev, bk_row_idx)
    result_test = result_dev.get()

    ref(result, sample, bootstrap_key, bk_row_idx)

    if numpy.issubdtype(tdtype, numpy.integer):
        assert (result == result_test).all()
    else:
        assert numpy.allclose(result, result_test)
Example #39
    def __init__(self, size, dtype):

        Computation.__init__(self, [
            Parameter('output', Annotation(Type(dtype, shape=size), 'o')),
            Parameter('input', Annotation(Type(dtype, shape=size), 'i'))
        ])

        self._p = PureParallel([
            Parameter('output', Annotation(Type(dtype, shape=size), 'o')),
            Parameter('i1', Annotation(Type(dtype, shape=size), 'i')),
            Parameter('i2', Annotation(Type(dtype, shape=size), 'i'))
        ], """
            ${i1.ctype} t1 = ${i1.load_idx}(${idxs[0]});
            ${i2.ctype} t2 = ${i2.load_idx}(${idxs[0]});
            ${output.store_idx}(${idxs[0]}, t1 + t2);
            """)
Example #40
def Multiply(type):
    return PureParallel([
        Parameter('output', Annotation(type, 'o')),
        Parameter('in1', Annotation(type, 'i')),
        Parameter('in2', Annotation(type, 'i'))
    ],
                        """
        ${ctype} f1 = ${in1.load_same}, f2 = ${in2.load_same};
        #if ${complex}
        ${output.store_same}((${ctype})(f1.x*f2.x - f1.y*f2.y, f1.x*f2.y + f1.y*f2.x));
        #else
        ${output.store_same}(f1*f2);
        #endif
        """,
                        render_kwds=dict(ctype=type.ctype,
                                         complex=int(dtypes.is_complex(type))))
Example #41
def test_array_offset(thr):

    dtype = numpy.uint32
    itemsize = dtypes.normalize_type(dtype).itemsize
    offset_len = 10
    arr_len = 16

    # internal creation of the base array
    a1 = thr.array((arr_len,), dtype, offset=itemsize * offset_len)

    # providing base
    a2_base = thr.array((arr_len + offset_len,), dtype)
    a2 = thr.array((arr_len,), dtype, offset=itemsize * offset_len, base=a2_base)

    # providing base_data
    a3_base = thr.array((arr_len + offset_len,), dtype)
    a3_data = a3_base.base_data
    a3 = thr.array((arr_len,), dtype, offset=itemsize * offset_len, base_data=a3_data)

    fill = PureParallel(
        [
            Parameter('output1', Annotation(a1, 'o')),
            Parameter('output2', Annotation(a2, 'o')),
            Parameter('output3', Annotation(a3, 'o')),
        ],
        """
        ${output1.store_idx}((int)${idxs[0]} - ${offset_len}, ${idxs[0]});
        ${output2.store_idx}((int)${idxs[0]} - ${offset_len}, ${idxs[0]});
        ${output3.store_idx}((int)${idxs[0]} - ${offset_len}, ${idxs[0]});
        """,
        render_kwds=dict(offset_len=offset_len),
        guiding_array=(arr_len + offset_len,)
        ).compile(thr)

    fill(a1, a2, a3)

    offset_range = numpy.arange(offset_len, arr_len + offset_len).astype(dtype)
    full_range = numpy.arange(arr_len + offset_len).astype(dtype)

    assert diff_is_negligible(a1.get(), offset_range)

    assert diff_is_negligible(a2_base.get(), full_range)
    assert diff_is_negligible(a2.get(), offset_range)

    assert diff_is_negligible(a3_base.get(), full_range)
    assert diff_is_negligible(a3.get(), offset_range)
Example #42
def test_array_views(thr):

    a = get_test_array((6, 8, 10), numpy.int32)

    a_dev = thr.to_device(a)
    b_dev = thr.empty_like(a)

    in_view = a_dev[2:4, ::2, ::-1]
    out_view = b_dev[4:, 1:5, :]

    move = PureParallel.from_trf(
        transformations.copy(in_view, out_arr_t=out_view),
        guiding_array='output').compile(thr)

    move(out_view, in_view)
    b_res = b_dev.get()[4:, 1:5, :]
    b_ref = a[2:4, ::2, ::-1]

    assert diff_is_negligible(b_res, b_ref)
Example #43
    def _build_plan(
            self, plan_factory, device_params,
            ks_a, ks_b, ks_cv, in_key, out_key, noises_a, noises_b):

        plan = plan_factory()

        extracted_n, t, base, inner_n = ks_a.shape

        mean = Reduce(noises_b, predicate_sum(noises_b.dtype))
        norm = transformations.div_const(mean.parameter.output, numpy.prod(noises_b.shape))
        mean.parameter.output.connect(norm, norm.input, mean=norm.output)

        noises_b_mean = plan.temp_array_like(mean.parameter.mean)

        mul_key = MatrixMulVector(noises_a)
        b_term = plan.temp_array_like(mul_key.parameter.output)

        build_keyswitch = PureParallel([
            Parameter('ks_a', Annotation(ks_a, 'o')),
            Parameter('ks_b', Annotation(ks_b, 'o')),
            Parameter('ks_cv', Annotation(ks_cv, 'o')),
            Parameter('in_key', Annotation(in_key, 'i')),
            Parameter('b_term', Annotation(b_term, 'i')),
            Parameter('noises_a', Annotation(noises_a, 'i')),
            Parameter('noises_b', Annotation(noises_b, 'i')),
            Parameter('noises_b_mean', Annotation(noises_b_mean, 'i'))],
            Snippet(
                TEMPLATE.get_def("make_lwe_keyswitch_key"),
                render_kwds=dict(
                    log2_base=self._log2_base, output_size=self._output_size,
                    double_to_t32=double_to_t32_module, noise=self._noise)),
            guiding_array="ks_b")

        plan.computation_call(mean, noises_b_mean, noises_b)
        plan.computation_call(mul_key, b_term, noises_a, out_key)
        plan.computation_call(
            build_keyswitch,
            ks_a, ks_b, ks_cv, in_key, b_term, noises_a, noises_b, noises_b_mean)

        return plan
Example #44
    def _build_plan(self, plan_factory, device_params, result, phase):
        plan = plan_factory()

        tr = Transformation([
            Parameter('result', Annotation(result, 'o')),
            Parameter('phase', Annotation(phase, 'i')),
        ],
                            """
            <%
                interv = 2**32 // mspace_size
                half_interv = interv // 2
            %>
            ${phase.ctype} phase = ${phase.load_same};
            ${result.store_same}(((unsigned int)phase + ${half_interv}) / ${interv});
            """,
                            render_kwds=dict(mspace_size=self._mspace_size,
                                             uint64=dtypes.ctype(
                                                 numpy.uint64)),
                            connectors=['result', 'phase'])

        plan.computation_call(
            PureParallel.from_trf(tr, guiding_array='result'), result, phase)

        return plan
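
A hedged NumPy sketch of the rounding performed by the transformation above: a
Torus32 phase is mapped to the nearest of mspace_size equally spaced points, with
the same unsigned wrap-around as the C expression (names illustrative, result
dtype assumed int32):

import numpy

def phase_to_message_reference(phase, mspace_size):
    # phase: int32 Torus32 values; interv and half_interv match the Mako block above
    interv = 2**32 // mspace_size
    half_interv = interv // 2
    shifted = phase.astype(numpy.uint32) + numpy.uint32(half_interv)  # wraps mod 2**32
    return (shifted // numpy.uint32(interv)).astype(numpy.int32)
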
Example #45
def get_procs(thr, N):
    fft = FFTFactory.create(thr, (N,), compile_=False)
    unimod_trans = Transformation(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
        Parameter('input', Annotation(Type(np.complex128, N), 'i'))],
        """
VSIZE_T idx = ${idxs[0]};
${input.ctype} val = ${input.load_same};
if (idx>${N}/2){
    val.x = 0.0;
    val.y = 0.0;
    ${output.store_same}(val);
}else
    ${output.store_same}(${polar_unit}(atan2(val.y, val.x)));
        """,
        render_kwds=dict(polar_unit=functions.polar_unit(dtype=np.float64), N=N)
    )
    fft.parameter.output.connect(unimod_trans, unimod_trans.input, uni=unimod_trans.output)
    fft_unimod = fft.compile(thr)
    
    mag_square = PureParallel(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
         Parameter('input', Annotation(Type(np.complex128, N), 'i'))],
        '''
VSIZE_T idx = ${idxs[0]};
${input.ctype} val = ${input.load_idx}(idx);  
val.x = val.x*val.x + val.y*val.y;
val.y = 0;
${output.store_idx}(idx, val);
        '''
    )
    mag_square = mag_square.compile(thr)
    
    apply_mask = PureParallel(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
         Parameter('origin', Annotation(Type(np.complex128, N), 'i')),
         Parameter('mask', Annotation(Type(np.double, N), 'i'))],
        '''
VSIZE_T idx = ${idxs[0]};
${output.store_idx}(idx, ${mul}(${origin.load_idx}(idx), ${mask.load_idx}(idx)));        
        ''',
        render_kwds=dict(mul=functions.mul(np.complex128, np.double))
    )
    apply_mask = apply_mask.compile(thr)
    
    combine_mag_phi = PureParallel(
        [Parameter('output', Annotation(Type(np.complex128, N), 'o')),
         Parameter('mag_square', Annotation(Type(np.complex128, N), 'i')),
         Parameter('phase', Annotation(Type(np.complex128, N), 'i'))],
        '''
VSIZE_T idx = ${idxs[0]};
double r = ${mag_square.load_idx}(idx).x;  
r = r<0.0 ? 0.0 : ${pow}(r, 0.5);
double2 v = ${phase.load_idx}(idx);
double angle = atan2(v.y, v.x);
${output.store_idx}(idx, ${polar}(r, angle));
        ''',
        render_kwds=dict(pow=functions.pow(np.double), polar=functions.polar(np.double))
    )
    combine_mag_phi = combine_mag_phi.compile(thr)
   
    return fft_unimod, mag_square, apply_mask, combine_mag_phi
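
For reference, a hedged NumPy sketch of the unimod_trans step above: keep only the
phase (unit modulus) of each spectral bin and zero the bins above N/2 (illustrative
only):

import numpy as np

def unimodular_reference(spectrum):
    # spectrum: complex array of length N, e.g. an FFT output
    N = spectrum.shape[0]
    out = np.exp(1j * np.angle(spectrum))
    out[np.arange(N) > N // 2] = 0.0
    return out
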
Example #46
def convolve2d_gradient(ctx, prev_deltas, deltas, gradient_intermediate):
    """ The output is the full discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_gradient, prev_deltas.shape, deltas.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        n, channels, p_width, p_height = prev_deltas.shape
        n_1, filters, d_width, d_height = deltas.shape
        n, d_width_1, d_height_1, channels_1, filters_1, f_width, f_height = gradient_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(prev_deltas, deltas, 'gradient')
        assert expected_shape == gradient_intermediate.shape
        assert d_width_1 == d_width
        assert d_height_1 == d_height

        # Render keywords
        render_kwds = {
            'n': n,
            'filters': filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel([
            Parameter('prev_deltas', Annotation(prev_deltas, 'i')),
            Parameter('deltas', Annotation(deltas, 'i')),
            Parameter('gradient_intermediate',
                      Annotation(gradient_intermediate, 'o'))
        ],
                              """

        const SIZE_T number = ${idxs[0]};
        const SIZE_T dx = ${idxs[1]};
        const SIZE_T dy = ${idxs[2]};
        const SIZE_T channel = ${idxs[3]};
        const SIZE_T filter = ${idxs[4]};
        const SIZE_T fx = ${idxs[5]};
        const SIZE_T fy = ${idxs[6]};


        // weight gradient at the weight position fx, fy is defined by the sum
        //
        //       (deltas * prev_deltas[fx:d_width+fx, fy:fy+d_height]).sum()
        //
        // alternatively we can store all delta positions and sum in a separate kernel - this is what we do now.

        float g = ${deltas.load_idx}(number, filter, dx, dy) * ${prev_deltas.load_idx}(number, channel, dx+fx, dy+fy);

        ${gradient_intermediate.store_same}(g);

        """,
                              guiding_array='gradient_intermediate',
                              render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](prev_deltas, deltas, gradient_intermediate)

    return gradient_intermediate
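
A hedged NumPy sketch of the gradient this kernel prepares, with the sum over
delta positions (which the code above defers to a separate summation kernel)
already carried out; shapes follow the assertions above, with
f_width = p_width - d_width + 1:

import numpy

def convolve2d_gradient_reference(prev_deltas, deltas):
    # prev_deltas: (n, channels, p_width, p_height), deltas: (n, filters, d_width, d_height)
    n, channels, p_width, p_height = prev_deltas.shape
    _, filters, d_width, d_height = deltas.shape
    f_width, f_height = p_width - d_width + 1, p_height - d_height + 1
    grad = numpy.zeros((n, channels, filters, f_width, f_height), dtype=prev_deltas.dtype)
    for fx in range(f_width):
        for fy in range(f_height):
            patch = prev_deltas[:, :, fx:fx + d_width, fy:fy + d_height]
            # sum over all delta positions (dx, dy) for this weight offset (fx, fy)
            grad[:, :, :, fx, fy] = numpy.einsum('ncxy,nfxy->ncf', patch, deltas)
    return grad
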
Example #47
def convolve2d_gradient(ctx, prev_deltas, deltas, gradient_intermediate):
    """ The output is the full discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_gradient, prev_deltas.shape, deltas.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        n, channels, p_width, p_height = prev_deltas.shape
        n_1, filters, d_width, d_height = deltas.shape
        n, d_width_1, d_height_1, channels_1, filters_1, f_width, f_height = gradient_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(prev_deltas, deltas, 'gradient')
        assert expected_shape == gradient_intermediate.shape
        assert d_width_1 == d_width
        assert d_height_1 == d_height

        # Render keywords
        render_kwds = {
            'n':n,
            'filters':filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('prev_deltas', Annotation(prev_deltas, 'i')),
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('gradient_intermediate', Annotation(gradient_intermediate, 'o'))
            ],
            """

        const SIZE_T number = ${idxs[0]};
        const SIZE_T dx = ${idxs[1]};
        const SIZE_T dy = ${idxs[2]};
        const SIZE_T channel = ${idxs[3]};
        const SIZE_T filter = ${idxs[4]};
        const SIZE_T fx = ${idxs[5]};
        const SIZE_T fy = ${idxs[6]};


        // weight gradient at the weight position fx, fy is defined by the sum
        //
        //       (deltas * prev_deltas[fx:d_width+fx, fy:fy+d_height]).sum()
        //
        // alternatively we can store all delta positions and sum in a separate kernel - this is what we do now.

        float g = ${deltas.load_idx}(number, filter, dx, dy) * ${prev_deltas.load_idx}(number, channel, dx+fx, dy+fy);

        ${gradient_intermediate.store_same}(g);

        """, guiding_array='gradient_intermediate', render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(
            thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](prev_deltas, deltas, gradient_intermediate)

    return gradient_intermediate
Example #48
def convolve2d_backprop(ctx, deltas, weights, deltas_intermediate):
    """ The output is the full discrete linear convolution of the inputs. """
    kernel_cache, thread = ctx.kernel_cache, ctx.thread

    key = (convolve2d_backprop, deltas.shape, weights.shape, thread)
    if key not in kernel_cache:
        logging.info("compiling " + str(key))

        # Extract shapes from the arrays
        channels, filters, f_width, f_height = weights.shape
        n_1, filters_1, d_width, d_height = deltas.shape
        n, channels_1, filters_2, p_width, p_height = deltas_intermediate.shape

        # Some assertions to be sure everything is correct
        assert n_1 == n
        assert filters_2 == filters_1 == filters
        assert channels_1 == channels
        expected_shape = get_output_shape(deltas, weights, 'backprop')
        assert expected_shape == deltas_intermediate.shape

        # Render keywords
        render_kwds = {
            'n':n,
            'filters':filters,
            'channels': channels,
            'f_width': f_width,
            'f_height': f_height,
            'd_width': d_width,
            'd_height': d_height,
            'p_width': p_width,
            'p_height': p_height,
        }

        # The kernel
        kernel = PureParallel(
            [
                Parameter('deltas', Annotation(deltas, 'i')),
                Parameter('weights', Annotation(weights, 'i')),
                Parameter('deltas_intermediate', Annotation(deltas_intermediate, 'o'))
            ],
            """
        float d = 0.0f;
        SIZE_T x, y, i, j, fi, fj;
        const SIZE_T number = ${idxs[0]};
        const SIZE_T channel = ${idxs[1]};
        const SIZE_T filter = ${idxs[2]};
        const SIZE_T xout = ${idxs[3]};
        const SIZE_T yout = ${idxs[4]};
        for (i=0; i < ${f_width}; i++){
            for (j=0; j < ${f_height}; j++){
                x = xout - i;
                if (x < 0) continue;
                if (x >= ${d_width}) continue;
                y = yout - j;
                if (y < 0) continue;
                if (y >= ${d_height}) continue;
                // access weights in flipped order!
                fi = ${f_width} - i - 1;
                fj = ${f_height} - j - 1;
                d += ${deltas.load_idx}(number, channel, x, y)
                   * ${weights.load_idx}(channel, filter, fi, fj);
            }
        }

        ${deltas_intermediate.store_same}(d);

        """, guiding_array='deltas_intermediate', render_kwds=render_kwds)

        kernel_cache[key] = kernel.compile(
            thread, fast_math=True)

    # run convolution -> intermediate
    kernel_cache[key](deltas, weights, deltas_intermediate)

    return deltas_intermediate
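
A hedged NumPy sketch of the full convolution described by the docstring, with
p_width = d_width + f_width - 1. Note that the sketch indexes deltas by the filter
dimension, matching the declared shape (n, filters, d_width, d_height), whereas the
kernel above loads it with the channel index; the two coincide only when
channels == filters:

import numpy

def convolve2d_backprop_reference(deltas, weights):
    # deltas: (n, filters, d_width, d_height), weights: (channels, filters, f_width, f_height)
    n, filters, d_width, d_height = deltas.shape
    channels, _, f_width, f_height = weights.shape
    p_width, p_height = d_width + f_width - 1, d_height + f_height - 1
    out = numpy.zeros((n, channels, filters, p_width, p_height), dtype=deltas.dtype)
    flipped = weights[:, :, ::-1, ::-1]  # the kernel reads the weights in flipped order
    for i in range(f_width):
        for j in range(f_height):
            # a delta at position (x, y) contributes to output position (x + i, y + j)
            out[:, :, :, i:i + d_width, j:j + d_height] += (
                deltas[:, None, :, :, :] * flipped[None, :, :, i, j, None, None])
    return out
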