Ejemplo n.º 1
0
    def _macros(self, node, name):
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"
        define_macros = []
        undef_macros = []

        rdim = get_scalar_constant_value(node.inputs[2])
        vdim = get_scalar_constant_value(node.inputs[3])

        define_macros.append(define_template %
                             ("DIM_SPECIFIC(str)", "str##_%d_%d" %
                              (rdim, vdim)))
        undef_macros.append(undef_template % "DIM_SPECIFIC")

        consts = {
            "REF_DIM": str(rdim),
            "VAL_DIM": str(vdim),
            "KEY_DIM": str(rdim),
            "MIN_QUAD_PROBES": str(MIN_QUAD_PROBES),
            "GID_0": "filt_fakegpu_GID_0",
            "LID_0": "filt_fakegpu_LID_0",
            "LDIM_0": "filt_fakegpu_LDIM_0",
            "KERNEL": "",
            "GLOBAL_MEM": ""
        }

        for k, v in consts.items():
            define_macros.append(define_template % (k, v))
            undef_macros.append(undef_template % k)

        return ''.join(define_macros), ''.join(undef_macros)
Ejemplo n.º 2
0
    def make_node(self, ref, values, ref_dim, val_dim, *_hash):
        assert (values.ndim == 3)
        ref = as_tensor_variable(ref.astype("float32"))
        values = as_tensor_variable(values.astype("float32"))

        ref_dim = get_scalar_constant_value(ref_dim)
        val_dim = get_scalar_constant_value(val_dim)
        if "int" not in str(ref_dim.dtype) or "int" not in str(val_dim.dtype):
            raise ValueError("ref_dim and val_dim must be integers.")

        scaled_ref = ref * float(np.sqrt(2 / 3) * (ref_dim + 1))

        if len(_hash) == 0:
            hash_struct = PermutohedralHashTable()(scaled_ref, ref_dim)
        else:
            assert (len(_hash) == 6)
            hash_struct = [as_tensor_variable(v) for v in _hash]

        # Should we not do this?
        bcast = [False for _ in range(3)]
        if val_dim == 1:
            bcast[0] = True

        out_type = values.type.clone(broadcastable=bcast)

        ref_dim = constant(ref_dim, dtype="int32", name="ref_dim")
        val_dim = constant(val_dim, dtype="int32", name="val_dim")

        inputs = [ref, values, ref_dim, val_dim] + hash_struct
        return Apply(self, inputs, [out_type()])
Ejemplo n.º 3
0
    def grad(self, inputs, ograds):
        ref, values, ref_dim, val_dim = inputs[:4]
        hash_struct = inputs[4:]
        ograd = ograds[0]

        ref_dim = get_scalar_constant_value(ref_dim)
        val_dim = get_scalar_constant_value(val_dim)

        def _conv(x):
            return GaussianFilter()(ref, x, ref_dim, val_dim, *hash_struct)

        # Since the kernels are separable and symmetric, the gradient w.r.t.
        # input is just the same filtering applied to the output grads.
        grad_i = _conv(ograd)

        def _gradr(r_i, vals, og, *args):
            return (og * (_conv(vals * r_i) - r_i * _conv(vals)) + vals *
                    (_conv(og * r_i) - r_i * _conv(og)))

        grad_r, _ = theano.scan(fn=_gradr,
                                sequences=[ref],
                                non_sequences=[values, ograd] + hash_struct,
                                outputs_info=None)

        grad_r = grad_r.sum(axis=1, acc_dtype="float32")

        grads = [DisconnectedType()() for i in range(len(inputs))]
        grads[0] = grad_r
        grads[1] = grad_i
        return grads
Ejemplo n.º 4
0
    def gpu_kernels(self, node, name):
        dim = get_scalar_constant_value(node.inputs[1])
        flags = Kernel.get_flags(node.inputs[0].dtype)

        def_macros, undef_macros = self._macros(node, name)
        hsup = (self._hash_support_code() + "\n" + self._lookup_code())

        knames = ["build_hash", "dedup", "find_valid"]
        kcodes = [
            "".join(
                open("%s%s%s.cu" %
                     (os.path.dirname(__file__), os.path.sep, kn)).readlines())
            for kn in knames
        ]
        kcodes = [
            "\n".join([def_macros, hsup, code, undef_macros])
            for code in kcodes
        ]
        kcodes = ["#include \"cluda.h\"\n" + code for code in kcodes]

        kparams = ([
            GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
            GpuArray, SIZE, SIZE, SIZE
        ], [GpuArray, SIZE, GpuArray, SIZE,
            SIZE], [GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, SIZE])

        return [
            Kernel(code=kcode,
                   name="%s_%d" % (kname, dim),
                   params=kparams,
                   flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)
        ]
Ejemplo n.º 5
0
    def _macros(self, node, name):
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"
        define_macros = []
        undef_macros = []

        dim = get_scalar_constant_value(node.inputs[1])
        define_macros.append(define_template %
                             ("DIM_SPECIFIC(str)", "str##_%d" % dim))
        undef_macros.append(undef_template % "DIM_SPECIFIC")

        consts = {
            "REF_DIM": str(dim),
            "KEY_DIM": str(dim),
            "DR": "%s.f" % str(dim),
            "INV_DR1": "(1.f / (%s.f+1.f))" % str(dim),
            "MIN_QUAD_PROBES": str(MIN_QUAD_PROBES),
            "GID_0": "hash_fakegpu_GID_0",
            "LID_0": "hash_fakegpu_LID_0",
            "LDIM_0": "hash_fakegpu_LDIM_0",
            "KERNEL": "",
            "GLOBAL_MEM": ""
        }

        for k, v in consts.items():
            define_macros.append(define_template % (k, v))
            undef_macros.append(undef_template % k)

        return ''.join(define_macros), ''.join(undef_macros)
Ejemplo n.º 6
0
    def compute_bcast(self, dist_params, size):
        """Compute the broadcast array for this distribution's `TensorType`.

        Parameters
        ----------
        dist_params: list
            Distribution parameters.
        size: int or Sequence (optional)
            Numpy-like size of the output (i.e. replications).

        """
        shape = self._infer_shape(size, dist_params)

        # Let's try to do a better job than `_infer_ndim_bcast` when
        # dimension sizes are symbolic.
        bcast = []
        for s in shape:
            s_owner = getattr(s, "owner", None)

            # Get rid of the `Assert`s added by `broadcast_shape`
            if s_owner and isinstance(s_owner.op, theano.tensor.opt.Assert):
                s = s_owner.inputs[0]

            try:
                s_val = get_scalar_constant_value(s)
            except NotScalarConstantError:
                s_val = False

            bcast += [s_val == 1]
        return bcast
Ejemplo n.º 7
0
    def infer_shape(self, node, in_shapes):
        dim = get_scalar_constant_value(node.inputs[1])
        point_shp = in_shapes[0]
        h, w = point_shp[:2]
        N = h*w
        cap = N*(dim+1)

        return [(cap,), (cap, dim), (dim+1, h, w), (dim+1, h, w), (cap,), (1,)]
Ejemplo n.º 8
0
def gaussian_filter(ref_img,
                    values,
                    kern_std,
                    ref_dim=None,
                    val_dim=None,
                    *_hash):
    """Applies a high-dimensional Gaussian filter to 'values' with pairwise
    Gaussian weights based on features in 'ref_img'.

    Parameters
    ----------

    ref_img : array_like, shape (ref_dim, H, W)
        The reference image from which to derive the pairwise Gaussian weights
        (the locations for each image pixel in a high-dimensional space).

    values : array_like, shape (val_dim, H, W)
        The image we are going to filter.

    kern_std : array_like, shape (ref_dim, )
        Standard deviation of the Gaussian filter in each dimension.

    ref_dim : int or None
        The reference image dimensionality. Must be a known scalar constant.
        For a color bilateral filter, this is 5: x, y, r, g, b.

        If None, attempt to infer the dimensionality from the shape of
        'ref_img'.

    val_dim : int or None
        The image dimensionality (color channels, usually). Must be a known
        scalar constant.

        If None, attempt to infer the dimensionality from the shape of
        'values'.
"""

    if ref_dim is None:
        ref_dim = get_scalar_constant_value(ref_img.shape[0])

    if val_dim is None:
        val_dim = get_scalar_constant_value(values.shape[0])

    scaled_ref = ref_img / kern_std[:, np.newaxis, np.newaxis]
    return GaussianFilter()(scaled_ref, values, ref_dim, val_dim, *_hash)
Ejemplo n.º 9
0
def local_concatenateGrad_mkl(node):
    if not mkl_available():
        return

    if not isinstance(node.op, Split):
        return

    if node.inputs[0].type.ndim != 4:
        return

    try:
        gz, axis, splits, = node.inputs
        if not isinstance(axis, integer_types):
            try:
                axis = int(get_scalar_constant_value(axis))
            except NotScalarConstantError:
                return

        if isinstance(axis, integer_types):
            # MKL Concatenate only supports axis=1
            if axis != 1:
                return

        # Retrieve the inputs to Join op
        #                         inp_0             inp_1    inp
        #                         |                 |        |
        # Splits <- MakeVector <- [Subtensor...] <- Shape <- inputs
        if not isinstance(splits.owner.op, theano.tensor.opt.MakeVector):
            return

        tensors = []
        for inp_0 in splits.owner.inputs:
            if not isinstance(inp_0.owner.op, theano.tensor.subtensor.Subtensor):
                return

            inp_1 = inp_0.owner.inputs[0]
            if not isinstance(inp_1.owner.op, theano.compile.ops.Shape):
                return

            inp = inp_1.owner.inputs[0]
            tensors.append(inp)

        tensors_internal = [U2IConcatenate()(x) for x in tensors]
        new_inputs = [axis] + tensors_internal
        z_internal = mkl_concatenate.Concatenate()(*new_inputs)
        gz_internal = I2UGrad()(z_internal, gz)

        concatenateGradOut = mkl_concatenate.ConcatenateGrad()(gz_internal, axis, *tensors_internal)
        gx_user = [U2IGrad()(_x, _gz) for _x, _gz in zip(tensors, concatenateGradOut)]

        rval = gx_user
        return rval
    except Exception as e:
        msg = ('Failed to apply local opt to Op %s. '
               'Exception message: %s\n') % (node.op, str(e))
        _logger.warning(msg)
        return
Ejemplo n.º 10
0
def local_0_dot_x(node):
    if not isinstance(node.op, T.Dot):
        return False

    x = node.inputs[0]
    y = node.inputs[1]
    replace = False
    try:
        if get_scalar_constant_value(x) == 0:
            replace = True
    except NotScalarConstantError:
        pass

    try:
        if get_scalar_constant_value(y) == 0:
            replace = True
    except NotScalarConstantError:
        pass

    if replace:
        constant_zero = T.constant(0, dtype=node.outputs[0].type.dtype)
        if x.ndim == 2 and y.ndim == 2:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[1], y.shape[0]))
            return [T.alloc(constant_zero, x.shape[0], y.shape[1])]
        elif x.ndim == 1 and y.ndim == 2:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[0], y.shape[0]))
            return [T.alloc(constant_zero, y.shape[1])]
        elif x.ndim == 2 and y.ndim == 1:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[1], y.shape[0]))
            return [T.alloc(constant_zero, x.shape[0])]
        elif x.ndim == 1 and y.ndim == 1:
            constant_zero = assert_(constant_zero,
                                    T.eq(x.shape[0], y.shape[0]))
            return [constant_zero]
        else:
            _logger.warning("Optimization Warning: "
                            "Optimization theano/opt.py:local_0_dot_x Found "
                            "that it could apply, but was not implemented "
                            "for dot product with these input types:\n"
                            "(%s, %s)",
                            x.type, y.type)
Ejemplo n.º 11
0
    def make_node(self, points, dim):
        assert (points.ndim == 3)
        points = as_tensor_variable(points.astype("float32"))

        dim = get_scalar_constant_value(dim)
        if "int" not in str(dim.dtype):
            raise ValueError("dim must be an integer.")

        dim = constant(dim, dtype="int32", name="dim")

        entries_type = TensorType("int32", broadcastable=(False, ))
        keys_type = TensorType("int16", broadcastable=(False, False))
        neib_ent_type = TensorType("int32",
                                   broadcastable=(False, False, False))
        bary_type = TensorType("float32",
                               broadcastable=points.type.broadcastable)

        valid_entries_type = TensorType("int32", broadcastable=(False, ))
        n_valid_type = TensorType("int32", broadcastable=(False, ))

        out_vars = [
            entries_type(name="hash_entries"),
            keys_type(name="hash_keys"),
            neib_ent_type(name="neighbor_entries"),
            bary_type(name="barycentric_coords"),
            valid_entries_type(name="valid_entries"),
            n_valid_type(name="n_valid")
        ]

        # Two sets of entries can't be meaningfully compared without also
        # having the corresponding keys. Since we can only define per-output
        # comparisons, we have to hope that any time someone compares two
        # tables for equality, they will check all outputs.
        out_vars[0].tag.values_eq_approx = lambda e1, e2: True
        out_vars[2].tag.values_eq_approx = lambda e1, e2: True

        # The number of valid entries between two equivalent tables may be
        # different since it includes duplicates.
        out_vars[5].tag.values_eq_approx = lambda n1, n2: True

        def keys_comparison(k1, k2):
            k1 = [tuple(k) for k in np.asarray(k1)]
            k2 = [tuple(k) for k in np.asarray(k2)]
            return set(k1) == set(k2)

        out_vars[1].tag.values_eq_approx = keys_comparison

        def valid_entries_comparison(e1, e2):
            e1 = np.asarray(e1)
            e2 = np.asarray(e2)
            return len(np.unique(e1)) == len(np.unique(e2))

        out_vars[4].tag.values_eq_approx = valid_entries_comparison

        return Apply(self, [points, dim], out_vars)
Ejemplo n.º 12
0
    def gpu_kernels(self, node, name):
        rdim = get_scalar_constant_value(node.inputs[2])
        vdim = get_scalar_constant_value(node.inputs[3])

        flags = Kernel.get_flags(node.inputs[0].dtype, node.inputs[1].dtype)

        def_macros, undef_macros = self._macros(node, name)
        hsup = (GpuHashTable._hash_support_code() + "\n" +
                GpuHashTable._lookup_code())

        knames = ["splat", "blur", "slice"]
        kcodes = [
            "".join(
                open("%s%s%s.cu" %
                     (os.path.dirname(__file__), os.path.sep, kn)).readlines())
            for kn in knames
        ]
        kcodes = [
            "\n".join([def_macros, hsup, code, undef_macros])
            for code in kcodes
        ]
        kparams = ([
            GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
            GpuArray, SIZE
        ], [
            GpuArray, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray,
            SIZE, SIZE, SIZE
        ], [
            GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
            GpuArray, SIZE
        ])

        return [
            Kernel(code=kcode,
                   name="%s_%d_%d" % (kname, rdim, vdim),
                   params=kparams,
                   flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)
        ]
Ejemplo n.º 13
0
def scalarconsts_rest(inputs):
    """Partition a list of variables into two kinds:
    scalar constants, and the rest."""
    consts = []
    origconsts = []
    nonconsts = []
    for i in inputs:
        try:
            v = get_scalar_constant_value(i)
            consts.append(v)
            origconsts.append(i)
        except NotScalarConstantError:
            nonconsts.append(i)
    return consts, origconsts, nonconsts
Ejemplo n.º 14
0
def local_max_and_argmax(node):
    """
    If we don't use the argmax, change it to a max only.
    """
    if node.op == T._max_and_argmax:
        if len(node.outputs[1].clients) == 0:
            #MaxAndArgmax support variable axis,
            #but CAReduce support only constant axis.
            try:
                axis = get_scalar_constant_value(node.inputs[1])
            except NotScalarConstantError:
                return False

            new = CAReduce(scal.maximum, axis)(node.inputs[0])
            return [new, None]
Ejemplo n.º 15
0
def local_max_and_argmax(node):
    """
    If we don't use the argmax, change it to a max only.
    """
    if node.op == T._max_and_argmax:
        if len(node.outputs[1].clients) == 0:
            #MaxAndArgmax support variable axis,
            #but CAReduce support only constant axis.
            try:
                axis = get_scalar_constant_value(node.inputs[1])
            except NotScalarConstantError:
                return False

            new = CAReduce(scal.maximum, axis)(node.inputs[0])
            return [new, None]
Ejemplo n.º 16
0
 def shape_dim_i(x, i):
     #print 'shape keys', shape_of.keys()
     #print 'args (x, i):', x, i
     try:
         return x.data.shape[i]
     except AttributeError:
         pass
     try:
         return int(get_scalar_constant_value(shape_of[x][i]))
     except NotScalarConstantError:
         pass
     try:
         return shape_of[x][i].eval()
     except:
         return -1 # an unsatisfiable shape
Ejemplo n.º 17
0
    def apply(self, fgraph):
        did_something = True
        while did_something:
            nodelist = fgraph.toposort()
            did_something = False
            for node in nodelist:
                if node.op == T._max_and_argmax:
                    if len(node.outputs[1].clients) == 0:
                        try:
                            axis = get_scalar_constant_value(node.inputs[1])
                        except NotScalarConstantError:
                            return False

                        new = CAReduce(scal.maximum, axis)(node.inputs[0])
                        try:
                            fgraph.replace_all_validate(((node.outputs[0], new),), reason=self.__class__.__name__)
                            did_something = True
                            break
                        except InconsistencyError, e:
                            pass
Ejemplo n.º 18
0
    def make_node(self, x, repeats):
        x = basic.as_tensor_variable(x)
        repeats = basic.as_tensor_variable(repeats)

        if repeats.dtype not in tensor.integer_dtypes:
            raise TypeError("repeats.dtype must be an integer.")

        # Some dtypes are not supported by numpy's implementation of repeat.
        # Until another one is available, we should fail at graph construction
        # time, not wait for execution.
        ptr_bitwidth = theano.configdefaults.local_bitwidth()
        if ptr_bitwidth == 64:
            numpy_unsupported_dtypes = ("uint64",)
        if ptr_bitwidth == 32:
            numpy_unsupported_dtypes = ("uint32", "int64", "uint64")

        if repeats.dtype in numpy_unsupported_dtypes:
            raise TypeError(
                (
                    "dtypes %s are not supported by numpy.repeat "
                    "for the 'repeats' parameter, " % str(numpy_unsupported_dtypes)
                ),
                repeats.dtype,
            )

        if self.axis is None:
            broadcastable = [False]
        else:
            try:
                const_reps = basic.get_scalar_constant_value(repeats)
            except basic.NotScalarConstantError:
                const_reps = None
            if const_reps == 1:
                broadcastable = x.broadcastable
            else:
                broadcastable = list(x.broadcastable)
                broadcastable[self.axis] = False

        out_type = theano.tensor.TensorType(x.dtype, broadcastable)

        return theano.Apply(self, [x, repeats], [out_type()])
Ejemplo n.º 19
0
    def make_node(self, x, repeats):
        x = basic.as_tensor_variable(x)
        repeats = basic.as_tensor_variable(repeats)

        if repeats.dtype not in tensor.integer_dtypes:
            raise TypeError("repeats.dtype must be an integer.")

        # Some dtypes are not supported by numpy's implementation of repeat.
        # Until another one is available, we should fail at graph construction
        # time, not wait for execution.
        ptr_bitwidth = theano.configdefaults.local_bitwidth()
        if ptr_bitwidth == 64:
            numpy_unsupported_dtypes = ("uint64",)
        if ptr_bitwidth == 32:
            numpy_unsupported_dtypes = ("uint32", "int64", "uint64")

        if repeats.dtype in numpy_unsupported_dtypes:
            raise TypeError(
                (
                    "dtypes %s are not supported by numpy.repeat "
                    "for the 'repeats' parameter, " % str(numpy_unsupported_dtypes)
                ),
                repeats.dtype,
            )

        if self.axis is None:
            broadcastable = [False]
        else:
            try:
                const_reps = basic.get_scalar_constant_value(repeats)
            except basic.NotScalarConstantError:
                const_reps = None
            if const_reps == 1:
                broadcastable = x.broadcastable
            else:
                broadcastable = list(x.broadcastable)
                broadcastable[self.axis] = False

        out_type = theano.tensor.TensorType(x.dtype, broadcastable)

        return theano.Apply(self, [x, repeats], [out_type()])
Ejemplo n.º 20
0
    def apply(self, fgraph):
        did_something = True
        while did_something:
            nodelist = fgraph.toposort()
            did_something = False
            for node in nodelist:
                if node.op == T._max_and_argmax:
                    if len(node.outputs[1].clients) == 0:
                        try:
                            axis = get_scalar_constant_value(node.inputs[1])
                        except NotScalarConstantError:
                            return False

                        new = CAReduce(scal.maximum, axis)(node.inputs[0])
                        try:
                            fgraph.replace_all_validate(
                                ((node.outputs[0], new),),
                                reason=self.__class__.__name__)
                            did_something = True
                            break
                        except InconsistencyError, e:
                            pass
Ejemplo n.º 21
0
def isNaN_or_Inf_or_None(x):
    isNone = x is None
    try:
        isNaN = numpy.isnan(x)
        isInf = numpy.isinf(x)
        isStr = isinstance(x, string_types)
    except Exception:
        isNaN = False
        isInf = False
        isStr = False
    if not isNaN and not isInf:
        try:
            val = get_scalar_constant_value(x)
            isInf = numpy.isinf(val)
            isNaN = numpy.isnan(val)
        except Exception:
            isNaN = False
            isInf = False
    if isinstance(x, gof.Constant) and isinstance(x.data, string_types):
        isStr = True
    else:
        isStr = False
    return isNone or isNaN or isInf or isStr
Ejemplo n.º 22
0
def isNaN_or_Inf_or_None(x):
    isNone = x is None
    try:
        isNaN = numpy.isnan(x)
        isInf = numpy.isinf(x)
        isStr = isinstance(x, string_types)
    except Exception:
        isNaN = False
        isInf = False
        isStr = False
    if not isNaN and not isInf:
        try:
            val = get_scalar_constant_value(x)
            isInf = numpy.isinf(val)
            isNaN = numpy.isnan(val)
        except Exception:
            isNaN = False
            isInf = False
    if isinstance(x, gof.Constant) and isinstance(x.data, string_types):
        isStr = True
    else:
        isStr = False
    return isNone or isNaN or isInf or isStr
Ejemplo n.º 23
0
def local_max_and_argmax(node):
    """
    If we don't use the argmax, change it to a max only.
    """
    if node.op == T._max_and_argmax:
        if len(node.outputs[1].clients) == 0:
            # MaxAndArgmax support variable axis,
            # but CAReduce support only constant axis.
            if node.inputs[1].data is None:
                axis = None
            else:
                try:
                    axis = get_scalar_constant_value(node.inputs[1])
                except NotScalarConstantError:
                    axis = node.inputs[1]
                    if not isinstance(axis, T.TensorConstant):
                        return False
                    axis = axis.data

            new = CAReduce(scal.maximum, axis)(node.inputs[0])
            return [new, None]

        if len(node.outputs[0].clients) == 0:
            return [None, T._argmax(node.inputs[0], node.inputs[1])]
Ejemplo n.º 24
0
def local_concatenate_mkl(node):
    if not mkl_available():
        return

    if not isinstance(node.op, Join):
        return

    if node.inputs[1].type.ndim != 4:
        return

    try:
        axis, tensors = node.inputs[0], node.inputs[1:]

        if not isinstance(axis, integer_types):
            try:
                axis = int(get_scalar_constant_value(axis))
            except NotScalarConstantError:
                return

        if isinstance(axis, integer_types):
            # MKL Concatenate only supports axis=1
            if axis != 1:
                return

        tensors_internal = [U2IConcatenate()(x) for x in tensors]
        new_inputs = [axis] + tensors_internal
        concatenateOut = mkl_concatenate.Concatenate()(*new_inputs)
        z_user = I2U()(concatenateOut)
        rval = z_user

        return [rval]
    except Exception as e:
        msg = ('Failed to apply local opt to Op %s. '
               'Exception message: %s\n') % (node.op, str(e))
        _logger.warning(msg)
        return
Ejemplo n.º 25
0
def local_max_and_argmax(node):
    """
    If we don't use the argmax, change it to a max only.
    """
    if node.op == T._max_and_argmax:
        if len(node.outputs[1].clients) == 0:
            # MaxAndArgmax support variable axis,
            # but CAReduce support only constant axis.
            if node.inputs[1].data is None:
                axis = None
            else:
                try:
                    axis = get_scalar_constant_value(node.inputs[1])
                except NotScalarConstantError:
                    axis = node.inputs[1]
                    if not isinstance(axis, T.TensorConstant):
                        return False
                    axis = axis.data

            new = CAReduce(scal.maximum, axis)(node.inputs[0])
            return [new, None]

        if len(node.outputs[0].clients) == 0:
            return [None, T._argmax(node.inputs[0], node.inputs[1])]
Ejemplo n.º 26
0
    def c_code(self, node, name, inputs, outputs, sub):
        points = inputs[0]
        entries, keys, neib_ents, barycentric, valid_entries, n_valid = outputs
        dim = get_scalar_constant_value(node.inputs[1])
        fail = sub["fail"]

        code = """
npy_intp point_dims[3];
npy_intp entries_dim[1];
npy_intp keys_dims[2];
npy_intp neib_ents_dims[3];
npy_intp barycentric_dims[3];
npy_intp valid_entries_dim[1];

npy_intp n_valid_dim[1];
n_valid_dim[0] = 1;

point_dims[0] = PyArray_DIMS(%(points)s)[0];
point_dims[1] = PyArray_DIMS(%(points)s)[1];
point_dims[2] = PyArray_DIMS(%(points)s)[2];

npy_intp N = point_dims[1] * point_dims[2];
npy_intp cap = N*(point_dims[0]+1);

PyArrayObject* pcontig = NULL;
bool should_decref_pcontig = false;

if(point_dims[0] != %(dim)s) {
    PyErr_Format(PyExc_ValueError,
        "hashtable error: incorrect input dim 0.\\nExpected %(dim)s got %%d",
        point_dims[0]);
    %(fail)s;
}

if(PyArray_TYPE(%(points)s) != NPY_FLOAT) {
    PyErr_Format(PyExc_ValueError,
        "hashtable error: incorrect dtype for points.");
    %(fail)s;
}

entries_dim[0] = cap;

keys_dims[0] = cap;
keys_dims[1] = point_dims[0];

neib_ents_dims[0] = point_dims[0]+1;
neib_ents_dims[1] = point_dims[1];
neib_ents_dims[2] = point_dims[2];

barycentric_dims[0] = point_dims[0]+1;
barycentric_dims[1] = point_dims[1];
barycentric_dims[2] = point_dims[2];

valid_entries_dim[0] = cap;

if(!valid_output_ptr(%(entries)s, NPY_INT, 1, entries_dim)) {
    Py_XDECREF(%(entries)s);
    %(entries)s = (PyArrayObject*)PyArray_EMPTY(1, entries_dim, NPY_INT, 0);
}
if(!valid_output_ptr(%(keys)s, NPY_SHORT, 2, keys_dims)) {
    Py_XDECREF(%(keys)s);
    %(keys)s = (PyArrayObject*)PyArray_ZEROS(2, keys_dims, NPY_SHORT, 0);
}
if(!valid_output_ptr(%(neib_ents)s, NPY_INT, 3, neib_ents_dims)) {
    Py_XDECREF(%(neib_ents)s);
    %(neib_ents)s = (PyArrayObject*)PyArray_ZEROS(3, neib_ents_dims, NPY_INT, 0);
}
if(!valid_output_ptr(%(barycentric)s, NPY_FLOAT, 3, barycentric_dims)) {
    Py_XDECREF(%(barycentric)s);
    %(barycentric)s = (PyArrayObject*)PyArray_ZEROS(3, barycentric_dims, NPY_FLOAT, 0);
}
if(!valid_output_ptr(%(valid_entries)s, NPY_INT, 1, valid_entries_dim)) {
    Py_XDECREF(%(valid_entries)s);
    %(valid_entries)s = (PyArrayObject*)PyArray_ZEROS(1, valid_entries_dim, NPY_INT, 0);
}
if(!valid_output_ptr(%(n_valid)s, NPY_INT, 1, n_valid_dim)) {
    Py_XDECREF(%(n_valid)s);
    %(n_valid)s = (PyArrayObject*)PyArray_ZEROS(1, n_valid_dim, NPY_INT, 0);
} else {
    PyArray_FillWithScalar(%(n_valid)s, PyLong_FromLong(0));
}

if (!(%(entries)s && %(keys)s && %(neib_ents)s && %(barycentric)s &&
    %(valid_entries)s)) {

    PyErr_Format(PyExc_MemoryError,
            "error building hash table: failed to allocate output storage.");
    %(fail)s;
}

if (!PyArray_IS_C_CONTIGUOUS(%(points)s)) {
    should_decref_pcontig = true;
}
pcontig = PyArray_GETCONTIGUOUS(%(points)s);

PyArray_FillWithScalar(%(entries)s, PyLong_FromLong(-1));

#pragma omp parallel for
for(int i=0; i<N; ++i) {
    hash_fakegpu_GID_0 = i;
    build_hash_%(dim)s(
        (float*)PyArray_DATA(%(points)s), 0,
        (int*)PyArray_DATA(%(entries)s), 0,
        (short*)PyArray_DATA(%(keys)s), 0,
        (int*)PyArray_DATA(%(neib_ents)s), 0,
        (float*)PyArray_DATA(%(barycentric)s), 0,
        cap, N);
}

#pragma omp parallel for
for(int i=0; i<cap; ++i) {
    hash_fakegpu_GID_0 = i;
    dedup_%(dim)s(
        (int*)PyArray_DATA(%(entries)s), 0,
        (short*)PyArray_DATA(%(keys)s), 0,
        cap);
}

#pragma omp parallel for
for(int i=0; i<cap; ++i) {
    hash_fakegpu_GID_0 = i;
    find_valid_%(dim)s(
        (int*)PyArray_DATA(%(entries)s), 0,
        (int*)PyArray_DATA(%(valid_entries)s), 0,
        (int*)PyArray_DATA(%(n_valid)s), 0,
        cap);
}

if (should_decref_pcontig) {
    Py_DECREF(pcontig);
}
"""
        return code % locals()
Ejemplo n.º 27
0
    def c_code(self, node, name, inputs, outputs, sub):
        values = inputs[1]
        entries, keys, neib_ents, barycentric, valid_entries, nv = inputs[4:]
        output = outputs[0]

        rdim = get_scalar_constant_value(node.inputs[2])
        vdim = get_scalar_constant_value(node.inputs[3])

        fail = sub["fail"]
        inplace = "1" if self.inplace else "0"

        code = """
npy_intp val_dims[3];
npy_intp tmp_val_dims[2];
npy_intp output_dims[3];

val_dims[0] = PyArray_DIMS(%(values)s)[0];
val_dims[1] = PyArray_DIMS(%(values)s)[1];
val_dims[2] = PyArray_DIMS(%(values)s)[2];

size_t N = val_dims[1] * val_dims[2];
size_t cap = N*(%(rdim)s+1);

size_t ls_N, gs_N, ls_valid, gs_valid;
int nv = *((int*)PyArray_DATA(%(nv)s));

PyArrayObject* tmp_vals_1 = NULL;
PyArrayObject* tmp_vals_2 = NULL;
PyArrayObject* tmp_vptr_1 = NULL;
PyArrayObject* tmp_vptr_2 = NULL;
PyArrayObject* tmp_swap = NULL;

PyArrayObject* vcontig = NULL;
bool should_decref_vcontig = false;

if(val_dims[0] != %(vdim)s) {
    PyErr_Format(PyExc_ValueError,
        "blur error: bad input shape 0.\\nExpected %(vdim)s, got %%d",
        val_dims[0]);
    %(fail)s;
}

if(val_dims[1] != PyArray_DIMS(%(barycentric)s)[1] ||
   val_dims[2] != PyArray_DIMS(%(barycentric)s)[2]) {
    PyErr_Format(PyExc_ValueError,
            "blur error: bad input h/w.\\nExpected (%%d, %%d), got (%%d, %%d)",
            val_dims[1], val_dims[2]);
    %(fail)s;
}

tmp_val_dims[0] = cap;
tmp_val_dims[1] = val_dims[0];

output_dims[0] = val_dims[0];
output_dims[1] = val_dims[1];
output_dims[2] = val_dims[2];

tmp_vals_1 = (PyArrayObject*)PyArray_ZEROS(2, tmp_val_dims, NPY_FLOAT, 0);
tmp_vals_2 = (PyArrayObject*)PyArray_ZEROS(2, tmp_val_dims, NPY_FLOAT, 0);
if (!tmp_vals_1 || !tmp_vals_2) {
    PyErr_Format(PyExc_RuntimeError,
                 "error allocating temporary filtering storage.");
    %(fail)s;
}

tmp_vptr_1 = tmp_vals_1;
tmp_vptr_2 = tmp_vals_2;

if(%(inplace)s) {
    Py_XDECREF(%(output)s);
    %(output)s = %(values)s;
    Py_INCREF(%(output)s);
} else if(!valid_output_ptr(%(output)s, NPY_FLOAT, 3, output_dims)) {
    Py_XDECREF(%(output)s);
    %(output)s = (PyArrayObject*)PyArray_ZEROS(3, output_dims, NPY_FLOAT, 0);
}

if (!%(output)s) {
    PyErr_Format(PyExc_MemoryError,
        "error performing gaussian blur: failed to allocate output storage.");
    %(fail)s;
}

if (!PyArray_IS_C_CONTIGUOUS(%(values)s)) {
    should_decref_vcontig = true;
}
vcontig = PyArray_GETCONTIGUOUS(%(values)s);

#pragma omp parallel for
for(int i=0; i<N; ++i) {
    filt_fakegpu_GID_0 = i;
    splat_%(rdim)s_%(vdim)s(
        (float*)PyArray_DATA(vcontig), 0,
        (float*)PyArray_DATA(%(barycentric)s), 0,
        (int*)PyArray_DATA(%(entries)s), 0,
        (int*)PyArray_DATA(%(neib_ents)s), 0,
        (float*)PyArray_DATA(tmp_vals_1),
        N);
}

for(int ax=0; ax<%(rdim)s+1; ++ax) {
    #pragma omp parallel for
    for(int i=0; i<nv; ++i) {
        filt_fakegpu_GID_0 = i;
        blur_%(rdim)s_%(vdim)s(
            (float*)PyArray_DATA(tmp_vptr_2),
            (int*)PyArray_DATA(%(entries)s), 0,
            (int*)PyArray_DATA(%(valid_entries)s), 0,
            (short*)PyArray_DATA(%(keys)s), 0,
            (float*)PyArray_DATA(tmp_vptr_1),
            cap, nv, ax);
    }

    tmp_swap = tmp_vptr_1;
    tmp_vptr_1 = tmp_vptr_2;
    tmp_vptr_2 = tmp_swap;
}

#pragma omp parallel for
for(int i=0; i<N; ++i) {
    filt_fakegpu_GID_0 = i;
    slice_%(rdim)s_%(vdim)s(
        (float*)PyArray_DATA(%(output)s), 0,
        (float*)PyArray_DATA(%(barycentric)s), 0,
        (int*)PyArray_DATA(%(entries)s), 0,
        (int*)PyArray_DATA(%(neib_ents)s), 0,
        (float*)PyArray_DATA(tmp_vptr_2),
        N);
}

if (should_decref_vcontig) {
    Py_DECREF(vcontig);
}
"""
        return code % locals()
Ejemplo n.º 28
0
    def c_code(self, node, name, inputs, outputs, sub):
        points = inputs[0]
        entries, keys, neib_ents, barycentric, valid_entries, n_valid = outputs

        dim = get_scalar_constant_value(node.inputs[1])

        fail = sub["fail"]
        ctx = sub["params"]
        sync = bool(theano.config.gpuarray.sync)
        kname_build = "k_build_hash_%d" % dim
        kname_dedup = "k_dedup_%d" % dim
        kname_fve = "k_find_valid_%d" % dim

        code = """
int err = GA_NO_ERROR;

size_t point_dims[3];
size_t entries_dim[1];
size_t keys_dims[2];
size_t neib_ents_dims[3];
size_t barycentric_dims[3];
size_t valid_entries_dim[1];

size_t n_valid_dim[1];
n_valid_dim[0] = 1;

point_dims[0] = PyGpuArray_DIMS(%(points)s)[0];
point_dims[1] = PyGpuArray_DIMS(%(points)s)[1];
point_dims[2] = PyGpuArray_DIMS(%(points)s)[2];

size_t N = point_dims[1] * point_dims[2];
size_t cap = N*(point_dims[0]+1);

size_t ls_N, gs_N, ls_cap, gs_cap;

if(point_dims[0] != %(dim)s) {
    PyErr_Format(PyExc_ValueError,
        "hashtable error: incorrect input dim 0.\\nExpected %(dim)s got %%d",
        point_dims[0]);
    %(fail)s;
}

entries_dim[0] = cap;

keys_dims[0] = cap;
keys_dims[1] = point_dims[0];

neib_ents_dims[0] = point_dims[0]+1;
neib_ents_dims[1] = point_dims[1];
neib_ents_dims[2] = point_dims[2];

barycentric_dims[0] = point_dims[0]+1;
barycentric_dims[1] = point_dims[1];
barycentric_dims[2] = point_dims[2];

valid_entries_dim[0] = cap;

if(!valid_output_ptr(%(entries)s, GA_INT, 1, entries_dim)) {
    Py_XDECREF(%(entries)s);
    %(entries)s = pygpu_empty(1, entries_dim, GA_INT, GA_C_ORDER, %(ctx)s,
        Py_None);
}
if(!valid_output_ptr(%(keys)s, GA_SHORT, 2, keys_dims)) {
    Py_XDECREF(%(keys)s);
    %(keys)s = pygpu_zeros(2, keys_dims, GA_SHORT, GA_C_ORDER, %(ctx)s,
        Py_None);
}
if(!valid_output_ptr(%(neib_ents)s, GA_INT, 3, neib_ents_dims)) {
    Py_XDECREF(%(neib_ents)s);
    %(neib_ents)s = pygpu_zeros(3, neib_ents_dims, GA_INT, GA_C_ORDER, %(ctx)s,
        Py_None);
}
if(!valid_output_ptr(%(barycentric)s, GA_FLOAT, 3, barycentric_dims)) {
    Py_XDECREF(%(barycentric)s);
    %(barycentric)s = pygpu_zeros(3, barycentric_dims, GA_FLOAT, GA_C_ORDER,
        %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(valid_entries)s, GA_INT, 1, valid_entries_dim)) {
    Py_XDECREF(%(valid_entries)s);
    %(valid_entries)s = pygpu_zeros(1, valid_entries_dim, GA_INT, GA_C_ORDER,
        %(ctx)s, Py_None);
}
if(!valid_output_ptr(%(n_valid)s, GA_INT, 1, n_valid_dim)) {
    Py_XDECREF(%(n_valid)s);
    %(n_valid)s = pygpu_zeros(1, n_valid_dim, GA_INT, GA_C_ORDER,
        %(ctx)s, Py_None);
} else {
    GpuArray_memset(&%(n_valid)s->ga, 0);
}

if (!(%(entries)s && %(keys)s && %(neib_ents)s && %(barycentric)s &&
    %(valid_entries)s)) {

    PyErr_Format(PyExc_MemoryError,
            "error building hash table: failed to allocate output storage.");
    %(fail)s;
}

GpuArray_memset(&%(entries)s->ga, -1);

gs_N = ls_N = 0;
GpuKernel_sched(&%(kname_build)s, N, &gs_N, &ls_N);
gs_N = N / ls_N;
if (ls_N*gs_N < N) { ++gs_N; }

err = build_hash_%(dim)s_call(1, &gs_N, &ls_N, 0,
    %(points)s->ga.data, %(points)s->ga.offset / sizeof(float),
    %(entries)s->ga.data, %(entries)s->ga.offset / sizeof(int),
    %(keys)s->ga.data, %(keys)s->ga.offset / sizeof(short),
    %(neib_ents)s->ga.data, %(neib_ents)s->ga.offset / sizeof(int),
    %(barycentric)s->ga.data, %(barycentric)s->ga.offset / sizeof(float),
    cap, N);

if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
        "gpuarray error building hash table:\\n%%s.\\n",
        GpuKernel_error(&%(kname_build)s, err));
    %(fail)s;
}

GpuArray_sync(&%(entries)s->ga);
GpuArray_sync(&%(keys)s->ga);

gs_cap = ls_cap = 0;
GpuKernel_sched(&%(kname_dedup)s, cap, &gs_cap, &ls_cap);
gs_cap = cap / ls_cap;
if (ls_cap*gs_cap < cap) { ++gs_cap; }

err = dedup_%(dim)s_call(1, &gs_cap, &ls_cap, 0,
    %(entries)s->ga.data, %(entries)s->ga.offset / sizeof(int),
    %(keys)s->ga.data, %(keys)s->ga.offset / sizeof(short),
    cap);

if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
        "gpuarray error cleaning hash table:\\n%%s.\\n",
        GpuKernel_error(&%(kname_dedup)s, err));
    %(fail)s;
}

GpuArray_sync(&%(entries)s->ga);
GpuArray_sync(&%(keys)s->ga);

err = find_valid_%(dim)s_call(1, &gs_cap, &ls_cap, 0,
    %(entries)s->ga.data, %(entries)s->ga.offset / sizeof(int),
    %(valid_entries)s->ga.data, %(valid_entries)s->ga.offset / sizeof(int),
    %(n_valid)s->ga.data, %(n_valid)s->ga.offset / sizeof(int),
    cap);

if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
        "gpuarray error counting valid hash entries:\\n%%s.\\n",
        GpuKernel_error(&%(kname_fve)s, err));
    %(fail)s;
}

GpuArray_sync(&%(entries)s->ga);
GpuArray_sync(&%(keys)s->ga);
GpuArray_sync(&%(neib_ents)s->ga);
GpuArray_sync(&%(barycentric)s->ga);
GpuArray_sync(&%(n_valid)s->ga);
"""
        return code % locals()
Ejemplo n.º 29
0
    def c_code(self, node, name, inputs, outputs, sub):
        values = inputs[1]
        entries, keys, neib_ents, barycentric, valid_entries, nv = inputs[4:]
        output = outputs[0]

        rdim = get_scalar_constant_value(node.inputs[2])
        vdim = get_scalar_constant_value(node.inputs[3])

        fail = sub["fail"]
        ctx = sub["params"]
        kname_splat = "k_splat_%d_%d" % (rdim, vdim)
        kname_blur = "k_blur_%d_%d" % (rdim, vdim)
        kname_slice = "k_slice_%d_%d" % (rdim, vdim)
        inplace = "1" if self.inplace else "0"

        code = """
int err = GA_NO_ERROR;

size_t val_dims[3];
size_t tmp_val_dims[2];
size_t output_dims[3];

val_dims[0] = PyGpuArray_DIMS(%(values)s)[0];
val_dims[1] = PyGpuArray_DIMS(%(values)s)[1];
val_dims[2] = PyGpuArray_DIMS(%(values)s)[2];

size_t N = val_dims[1] * val_dims[2];
size_t cap = N*(%(rdim)s+1);

size_t ls_N, gs_N, ls_valid, gs_valid;
int nv;
GpuArray_read((void*)(&nv), sizeof(int), &%(nv)s->ga);

GpuArray tmp_vals_1, tmp_vals_2;
GpuArray* tmp_vptr_1 = &tmp_vals_1;
GpuArray* tmp_vptr_2 = &tmp_vals_2;
GpuArray* tmp_swap = NULL;

if(val_dims[0] != %(vdim)s) {
    PyErr_Format(PyExc_ValueError,
        "blur error: bad input shape 0.\\nExpected %(vdim)s, got %%d",
        val_dims[0]);
    %(fail)s;
}

if(val_dims[1] != PyGpuArray_DIMS(%(barycentric)s)[1] ||
   val_dims[2] != PyGpuArray_DIMS(%(barycentric)s)[2]) {
    PyErr_Format(PyExc_ValueError,
            "blur error: bad input h/w.\\nExpected (%%d, %%d), got (%%d, %%d)",
            val_dims[1], val_dims[2]);
    %(fail)s;
}

tmp_val_dims[0] = cap;
tmp_val_dims[1] = val_dims[0];

output_dims[0] = val_dims[0];
output_dims[1] = val_dims[1];
output_dims[2] = val_dims[2];

if(%(inplace)s) {
    Py_XDECREF(%(output)s);
    %(output)s = %(values)s;
    Py_INCREF(%(output)s);
} else if(!valid_output_ptr(%(output)s, GA_FLOAT, 3, output_dims)) {
    Py_XDECREF(%(output)s);
    %(output)s = pygpu_zeros(3, output_dims, GA_FLOAT, GA_C_ORDER,
                             %(ctx)s, Py_None);
}

err = GpuArray_zeros(&tmp_vals_1, %(ctx)s->ctx, GA_FLOAT, 2, tmp_val_dims,
                     GA_C_ORDER);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error allocating memory:\\n%%s.\\n",
                 GpuArray_error(&tmp_vals_1, err));
    %(fail)s;
}

err = GpuArray_zeros(&tmp_vals_2, %(ctx)s->ctx, GA_FLOAT, 2, tmp_val_dims,
                     GA_C_ORDER);
if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error allocating memory:\\n%%s.\\n",
                 GpuArray_error(&tmp_vals_2, err));
    %(fail)s;
}

if (!%(output)s) {
    PyErr_Format(PyExc_MemoryError,
        "error performing gaussian blur: failed to allocate output storage.");
    %(fail)s;
}

gs_N = ls_N = 0;
GpuKernel_sched(&%(kname_splat)s, N, &gs_N, &ls_N);
gs_N = N / ls_N;
if (ls_N*gs_N < N) { ++gs_N; }

err = splat_%(rdim)s_%(vdim)s_call(1, &gs_N, &ls_N, 0,
    %(values)s->ga.data, %(values)s->ga.offset / sizeof(float),
    %(barycentric)s->ga.data, %(barycentric)s->ga.offset / sizeof(float),
    %(entries)s->ga.data, %(entries)s->ga.offset / sizeof(int),
    %(neib_ents)s->ga.data, %(neib_ents)s->ga.offset / sizeof(int),
    tmp_vals_1.data,
    N);

if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "gpuarray error splatting:\\n%%s.\\n",
        GpuKernel_error(&%(kname_splat)s, err));
    %(fail)s;
}
GpuArray_sync(&tmp_vals_1);

gs_valid = ls_valid = 0;
GpuKernel_sched(&%(kname_blur)s, nv, &gs_valid, &ls_valid);
gs_valid = nv / ls_valid;
if (ls_valid*gs_valid < nv) { ++gs_valid; }

for(int ax=0; ax<%(rdim)s+1; ++ax) {
    err = blur_%(rdim)s_%(vdim)s_call(1, &gs_valid, &ls_valid, 0,
        tmp_vptr_2->data,
        %(entries)s->ga.data, %(entries)s->ga.offset / sizeof(int),
        %(valid_entries)s->ga.data, %(valid_entries)s->ga.offset / sizeof(int),
        %(keys)s->ga.data, %(keys)s->ga.offset / sizeof(short),
        tmp_vptr_1->data,
        cap, nv, ax);

    if(err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "gpuarray error blurring:\\n%%s.\\n",
            GpuKernel_error(&%(kname_blur)s, err));
        %(fail)s;
    }

    GpuArray_sync(tmp_vptr_2);

    tmp_swap = tmp_vptr_1;
    tmp_vptr_1 = tmp_vptr_2;
    tmp_vptr_2 = tmp_swap;
}

err = slice_%(rdim)s_%(vdim)s_call(1, &gs_N, &ls_N, 0,
    %(output)s->ga.data, %(output)s->ga.offset / sizeof(float),
    %(barycentric)s->ga.data, %(barycentric)s->ga.offset / sizeof(float),
    %(entries)s->ga.data, %(entries)s->ga.offset / sizeof(int),
    %(neib_ents)s->ga.data, %(neib_ents)s->ga.offset / sizeof(int),
    tmp_vptr_2->data,
    N);

if(err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "gpuarray error slicing:\\n%%s.\\n",
        GpuKernel_error(&%(kname_slice)s, err));
    %(fail)s;
}

GpuArray_sync(&%(output)s->ga);
GpuArray_clear(&tmp_vals_1);
GpuArray_clear(&tmp_vals_2);
"""
        return code % locals()