Example 1
    def test_params_type_with_enums(self):
        # Test that we fail if we create a params type with common enum names inside different enum types.
        try:
            ParamsType(enum1=EnumList('A', 'B', 'C'), enum2=EnumList('A', 'B', 'F'))
        except AttributeError:
            pass
        else:
            raise Exception('ParamsType should fail with common enum names inside different enum types.')

        # Test that we fail if we create a params type with common names in both aliases and constants.
        try:
            ParamsType(enum1=EnumList(('A', 'a'), ('B', 'b')), enum2=EnumList(('ONE', 'a'), ('TWO', 'two')))
        except AttributeError:
            # With distinct alias and constant names, the same definition must succeed.
            ParamsType(enum1=EnumList(('A', 'a'), ('B', 'b')), enum2=EnumList(('ONE', 'one'), ('TWO', 'two')))
        else:
            raise Exception('ParamsType should fail when there are aliases with same names as some constants.')

        # Test that we can access enum values through wrapper directly.
        w = ParamsType(enum1=EnumList('A', ('B', 'beta'), 'C'), enum2=EnumList(('D', 'delta'), 'E', 'F'))
        assert w.A == 0 and w.B == 1 and w.C == 2
        assert w.D == 0 and w.E == 1 and w.F == 2
        # Test constants access through aliases.
        assert w.enum_from_alias('beta') == w.B
        assert w.enum_from_alias('delta') == w.D
        assert w.enum_from_alias('C') == w.C  # C is not an alias, so it should return a constant named C.
        # Test that other regular wrapper attributes are still available.
        assert len(w.fields) == len(w.types) == w.length
        assert w.name
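
A minimal usage sketch of the same API (assuming ParamsType, EnumList and int64 are importable from the module paths below, as the test above suggests): enum constants are numbered from 0 in declaration order and exposed as attributes of the type, and enum_from_alias() maps aliases (or plain constant names) back to them.

from theano.gof.params_type import ParamsType
from theano.gof.type import EnumList
from theano.scalar import int64

# One enum field plus one ordinary scalar field.
params_type = ParamsType(mode=EnumList(('MODE_ADD', 'add'), ('MODE_MUL', 'mul')),
                         axis=int64)
assert params_type.MODE_ADD == 0 and params_type.MODE_MUL == 1
# Aliases resolve to the numeric constants; non-alias constant names also work.
assert params_type.enum_from_alias('mul') == params_type.MODE_MUL
assert params_type.enum_from_alias('MODE_ADD') == params_type.MODE_ADD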
Example 2
class GpuAdvancedIncSubtensor1(Op):
    """
    Implement AdvancedIncSubtensor1 on the gpu.

    """
    _f16_ok = True
    __props__ = ('inplace', 'set_instead_of_inc')
    params_type = ParamsType(
        inplace=bool_t,
        set_instead_of_inc=bool_t,
        context=gpu_context_type,
        # The following params are used in c_init_code_struct(),
        # as inputs are not available in that function.
        ndim_input_0=size_t,
        ndim_input_1=size_t,
        typecode_input_0=int_t,
        typecode_input_1=int_t)

    def __init__(self, inplace=False, set_instead_of_inc=False):
        self.inplace = inplace
        self.set_instead_of_inc = set_instead_of_inc
        if inplace:
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        return self.__class__(inplace=True,
                              set_instead_of_inc=self.set_instead_of_inc)

    def make_node(self, x, y, ilist):
        ctx_name = infer_context_name(x, y)
        x_ = as_gpuarray_variable(x, ctx_name)
        y_ = as_gpuarray_variable(y, ctx_name)
        ilist_ = tensor.as_tensor_variable(ilist)

        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype not in tensor.integer_dtypes:
            raise TypeError('index must be integers')
        if ilist_.type.ndim != 1:
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if y_.type.ndim > x_.type.ndim:
            if self.set_instead_of_inc:
                opname = 'set'
            else:
                opname = 'increment'
            raise TypeError('cannot %s x subtensor with ndim=%s'
                            ' by y with ndim=%s' %
                            (opname, x_.type.ndim, y_.type.ndim))

        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])

    def get_params(self, node):
        return self.params_type.get_params(
            self,
            context=node.outputs[0].type.context,
            # The following params are used in c_init_code_struct().
            ndim_input_0=node.inputs[0].ndim,
            ndim_input_1=node.inputs[1].ndim,
            typecode_input_0=node.inputs[0].type.typecode,
            typecode_input_1=node.inputs[1].type.typecode)

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, params=None):
        # TODO opt to make this inplace
        x, y, idx = inp
        out, = out_

        if not self.inplace:
            x = x.copy()

        out[0] = x

        if len(idx) == 0:
            return

        # Make sure idx is not a GpuArray otherwise we cannot use its
        # content to index x and y (This is because we serve as
        # fallback for _dev20).
        if isinstance(idx, gpuarray.GpuArray):
            idx = np.asarray(idx)

        # If `y` has as many dimensions as `x`, then we want to iterate
        # jointly on `x` and `y`. Otherwise, it means `y` should be
        # broadcasted to fill all relevant rows of `x`.
        if y.ndim == x.ndim and y.shape[0] != 1:
            assert len(y) == len(idx)
            if self.set_instead_of_inc:
                for (j, i) in enumerate(idx):
                    x[i] = y[j]
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for (j, i) in enumerate(idx):
                    k(x[i], y[j], broadcast=True)
        else:
            if y.ndim == x.ndim:
                # First dim is always 1 in this case.
                reshaped_y = y.reshape(y.shape[1:])
            else:
                nb_dims_to_add = (x.ndim - 1) - y.ndim
                reshaped_y = y.reshape((1, ) * nb_dims_to_add + y.shape)

            if self.set_instead_of_inc:
                for i in idx:
                    x[i] = reshaped_y
            else:
                k = get_iadd(node.inputs[0], node.inputs[1])
                for i in idx:
                    k(x[i], reshaped_y, broadcast=True)

    def c_headers(self):
        return [
            '<numpy_compat.h>', '<gpuarray/error.h>', '<gpuarray/array.h>',
            '<gpuarray/elemwise.h>', 'gpuarray_helper.h'
        ]

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_support_code_struct(self, node, nodename):
        return "\nGpuElemwise *iadd;\n"

    def c_init_code_struct(self, node, name, sub):
        return """
        gpuelemwise_arg args[2] = {{0}};
        args[0].name = "a";
        args[0].typecode = %(params)s->typecode_input_0;
        args[0].flags = GE_READ|GE_WRITE;
        args[1].name = "b";
        args[1].typecode = %(params)s->typecode_input_1;
        args[1].flags = GE_READ;
        iadd = GpuElemwise_new(%(params)s->context->ctx, "", "a += b",
                               2, args, %(params)s->ndim_input_1, GE_CONVERT_F16);
        if (iadd == NULL) {
          PyErr_SetString(PyExc_RuntimeError, "Could not intialize inplace add support");
          %(fail)s
        }
        """ % dict(params=sub['params'], fail=sub['fail'])

    def c_code(self, node, name, inputs, outputs, sub):
        if (node.inputs[0].ndim != node.inputs[1].ndim):
            raise NotImplementedError("This case does not have C code yet.")

        return """
        PyGpuArrayObject *row_x, *row_y;
        size_t nd = %(params)s->ndim_input_0;
        ssize_t *start = NULL, *step = NULL;
        size_t num_indices, j;
        int ret;
        int broadcast_y;

        start = (ssize_t*)malloc(nd * sizeof(ssize_t));
        step = (ssize_t*)malloc(nd * sizeof(ssize_t));
        if (start == NULL || step == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }

        for (j = 0; j < nd; ++j) {
          start[j] = 0;
          step[j] = 1;
        }
        step[0] = 0;
        num_indices = PyArray_SIZE(%(ind)s);
        if (!%(params)s->inplace) {
          %(out)s = theano_try_copy(%(out)s, %(x)s);
          if (%(out)s == NULL) {
            // Exception already set
            %(fail)s
            }
        } else {
          Py_XDECREF(%(out)s);
          %(out)s = %(x)s;
          Py_INCREF(%(out)s);
        }
        if (num_indices != 0) {
          if ((num_indices - 1) > LONG_MAX) {
            PyErr_Format(PyExc_AssertionError,
                         "num_indices %%lld exceeds LONG_MAX + 1", (long long)num_indices);
            %(fail)s
          }
          broadcast_y = PyGpuArray_DIM(%(y)s, 0) == 1;
          for (j = 0; j < num_indices; j++) {
            start[0] = *(dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
            if (start[0] < 0)
              start[0] += PyGpuArray_DIM(%(out)s, 0);
            if (start[0] < 0 || start[0] >= PyGpuArray_DIM(%(out)s, 0)) {
               PyErr_SetString(PyExc_IndexError, "index out of bounds");
               %(fail)s;
            }
            row_x = pygpu_index(%(out)s, start, (ssize_t *)PyGpuArray_DIMS(%(out)s), step);
            if (row_x == NULL)
              %(fail)s;

            if (broadcast_y)
              start[0] = 0;
            else
              start[0] = j;

            row_y = pygpu_index(%(y)s, start, (ssize_t *)PyGpuArray_DIMS(%(y)s), step);
            if (row_y == NULL) {
              Py_DECREF(row_x);
              %(fail)s;
            }

            if (%(params)s->set_instead_of_inc) {
              ret = GpuArray_setarray(&row_x->ga, &row_y->ga);
            } else {
              void *args[2];
              args[0] = (void *)&row_x->ga;
              args[1] = (void *)&row_y->ga;
              ret = GpuElemwise_call(iadd, args, GE_BROADCAST);
            }
            Py_DECREF(row_x);
            Py_DECREF(row_y);
            if (ret != GA_NO_ERROR)
              PyErr_SetString(PyExc_RuntimeError, "Failed to set/inc elements");
          }
        }

        free(start);
        free(step);
        """ % dict(x=inputs[0],
                   y=inputs[1],
                   ind=inputs[2],
                   out=outputs[0],
                   params=sub['params'],
                   fail="""
                   {
                        free(start);
                        free(step);
                        %(fail)s
                   }
                   """ % dict(fail=sub['fail']))

    def c_code_cache_version(self):
        return (4, )
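
For reference, perform() above implements set/increment of the rows of x selected by an index vector, with y broadcast over those rows when it has fewer rows than idx. A rough CPU-side sketch of the same semantics (inc_subtensor_1d_ref is a hypothetical helper, not part of Theano):

import numpy as np

def inc_subtensor_1d_ref(x, y, idx, set_instead_of_inc=False):
    out = x.copy()
    iterate_jointly = (y.ndim == x.ndim and y.shape[0] != 1)
    for j, i in enumerate(idx):
        if iterate_jointly:
            row = y[j]              # one row of y per index
        elif y.ndim == x.ndim:
            row = y[0]              # leading dim is 1: broadcast that single row
        else:
            row = y                 # y has fewer dims: broadcast y itself
        if set_instead_of_inc:
            out[i] = row
        else:
            out[i] = out[i] + row   # repeated indices accumulate
    return out

x = np.zeros((4, 3))
y = np.ones((2, 3))
print(inc_subtensor_1d_ref(x, y, [1, 1]))  # row 1 is incremented twice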
Example 3
 def test_hash_and_eq_params(self):
     wp1 = ParamsType(
         a=Generic(),
         array=TensorType("int64", (False, )),
         floatting=Scalar("float64"),
         npy_scalar=TensorType("float64", tuple()),
     )
     wp2 = ParamsType(
         a=Generic(),
         array=TensorType("int64", (False, )),
         floatting=Scalar("float64"),
         npy_scalar=TensorType("float64", tuple()),
     )
     w1 = Params(
         wp1,
         a=1,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     w2 = Params(
         wp2,
         a=1,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 == w2
     assert not (w1 != w2)
     assert hash(w1) == hash(w2)
     # Changing attributes names only (a -> other_name).
     wp2_other = ParamsType(
         other_name=Generic(),
         array=TensorType("int64", (False, )),
         floatting=Scalar("float64"),
         npy_scalar=TensorType("float64", tuple()),
     )
     w2 = Params(
         wp2_other,
         other_name=1,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 != w2
     # Changing attributes values only (now a=2).
     w2 = Params(
         wp2,
         a=2,
         array=np.asarray([1, 2, 4, 5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 != w2
     # Changing NumPy array values (5 -> -5).
     w2 = Params(
         wp2,
         a=1,
         array=np.asarray([1, 2, 4, -5, 7]),
         floatting=-4.5,
         npy_scalar=np.asarray(12),
     )
     assert w1 != w2
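
The same machinery is what the Ops above use in their get_params(node) methods: ParamsType.get_params() is called with the Op itself plus extra keyword fields, and the remaining fields are read from same-named attributes of the object. A minimal sketch under that assumption (FakeOp, alpha and nsteps are illustrative names only):

from theano.gof.params_type import ParamsType
from theano.scalar import Scalar

class FakeOp(object):
    # Attributes named like the ParamsType fields.
    alpha = 0.5
    nsteps = 3

pt = ParamsType(alpha=Scalar('float64'), nsteps=Scalar('int64'))
p = pt.get_params(FakeOp())
# The resulting Params exposes the fields as attributes, which is what the
# generated C code reads as params->alpha, params->nsteps.
assert p.alpha == 0.5 and p.nsteps == 3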
Example 4
class GpuCumOp(GpuKernelBase, Op):
    """
    Parameters
    ----------
    axis
        Cannot be None. If you want the array flattened, do it before.
    """

    SUPPORTED_NDIMS = 3
    __props__ = ("axis", "mode")
    params_type = ParamsType(axis=scalar.int32, context=gpu_context_type)

    def __init__(self, axis, mode="add"):
        assert axis is not None
        self.axis = int(axis)
        self.mode = mode

    def __eq__(self, other):
        if type(other) != type(self):
            return False
        return self.axis == other.axis and self.mode == other.mode

    def __hash__(self):
        return hash(self.axis) ^ hash(self.mode)

    def c_code_cache_version(self):
        return (7, )

    def c_headers(self):
        return [
            "<numpy_compat.h>", "<gpuarray/types.h>", "<gpuarray_helper.h>"
        ]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def make_node(self, x):
        assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp"

        context_name = infer_context_name(x)

        x = as_gpuarray_variable(x, context_name)

        if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
            raise NotImplementedError("Only cum op on 1D, 2D and\
                                       3D arrays are supported right now!")

        if self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError("axis(={0}) out of bounds".format(self.axis))
        return Apply(self, [x], [x.type()])

    def gpu_kernels(self, node, nodename):
        kernels = []
        # cumadd
        kname = "k_cumadd"
        op = {"mul": "*", "add": "+"}[self.mode]
        k_var = "k_cumadd_" + nodename
        dtype_x = node.inputs[0].dtype
        flags = Kernel.get_flags(dtype_x)
        code = ("""#include "cluda.h"

        KERNEL void %(kname)s(float* input, ga_size input_offset,
                              float* output, ga_size output_offset,
                              ga_ssize inputStrides_x, ga_ssize inputStrides_y, ga_ssize inputStrides_z,
                              ga_ssize outputStrides_x, ga_ssize outputStrides_y, ga_ssize outputStrides_z,
                              const int offsetY, const int offsetZ,
                              const int beforeLastElementIdx, const int lastElementIdx){
            input = (float *)(((char *)input) + input_offset);
            output = (float *)(((char *)output) + output_offset);
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;

            int dataOffsetY_input = idY * inputStrides_y + idZ * inputStrides_z;
            int dataOffsetY_output = idY * outputStrides_y + idZ * outputStrides_z;
            int idx_last_input = lastElementIdx*inputStrides_x + dataOffsetY_input;
            int idx_last_output = lastElementIdx*outputStrides_x + dataOffsetY_output;
            int idx_beforelast = beforeLastElementIdx*outputStrides_x + dataOffsetY_output;
            output[idx_last_output] = input[idx_last_input] %(op)s output[idx_beforelast];
            }
        """ % locals())
        params = [
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            "intc",
            "intc",
            "intc",
            "intc",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        # blockCumOp
        kname = "k_blockCumOp"
        k_var = "k_blockCumOp_" + nodename
        params = [
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            "int32",
            "int32",
            gpuarray.GpuArray,
            gpuarray.SIZE,
        ]
        code = ("""#include "cluda.h"

        // helper functions
        WITHIN_KERNEL
        void k_reductionPhase(float* partialCumOp) {
            // Traverse down from leaves to root building partial sums at internal nodes in the tree.
            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                local_barrier();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if (index < blockDim.x*2) {
                    partialCumOp[index] %(op)s= partialCumOp[index - stride];
                }
            }
        }

        WITHIN_KERNEL
        void k_fetchData(float* partialCumOp, float* input, int globalThreadID,
                         ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                         int offsetY, int offsetZ) {
            // blockIdx.y and blockIdx.z represents the current independent cum op
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ; int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
            partialCumOp[threadIdx.x*2]     = input[idx_even];
            partialCumOp[threadIdx.x*2 + 1] = input[idx_odd];
        }

        WITHIN_KERNEL
        void k_reversePhase(float* partialCumOp) {
            // Traverse back up the tree building the scan from the partial sums
            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                local_barrier();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if (index + stride < blockDim.x*2) {
                    partialCumOp[index + stride] %(op)s= partialCumOp[index];
                }
            }
        }

        WITHIN_KERNEL
        void k_pushData(float* partialCumOp, float* output, int globalThreadID,
                        ga_ssize dataStrides_x, ga_ssize dataStrides_y, ga_ssize dataStrides_z,
                        int offsetY, int offsetZ) {
            local_barrier();
            // blockIdx.y and blockIdx.z represents the current independent cum op
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
            output[idx_even] = partialCumOp[threadIdx.x*2];
            output[idx_odd]  = partialCumOp[threadIdx.x*2 + 1];
        }

        KERNEL void k_blockCumOp(float* input, ga_size input_offset,
                                 float* output, ga_size output_offset,
                                 size_t nbElementsPerCumOp, ga_ssize inputStrides_x,
                                 ga_ssize inputStrides_y,  ga_ssize inputStrides_z,
                                 ga_ssize outputStrides_x, ga_ssize outputStrides_y,
                                 ga_ssize outputStrides_z, int offsetY,
                                 int offsetZ, float* blockSum, ga_size blockSum_offset) {
            input = (float *)(((char *)input) + input_offset);
            output = (float *)(((char *)output) + output_offset);
            blockSum = (float *)(((char *)blockSum) + blockSum_offset);

            // Regarding blockIdx and threadIdx, 'CumOp' is always performed along the X axis.
            // The Y and Z axis of the grid will contain all independent cumops of the 2D/3D case.

            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;

            // Check if current thread has data to process.
            if (globalThreadID >= (nbElementsPerCumOp+1)/2) {
                return;
            }

            extern __shared__ float partialCumOp[];

            // Load data in shared memory
            k_fetchData(partialCumOp, input, globalThreadID, inputStrides_x, inputStrides_y, inputStrides_z, offsetY, offsetZ);

            // Use a dichotomy approach to compute the cum op (i.e. balanced binary tree).
            // The tree is sweeped from the leaves to the root and from the root to the leaves.
            // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
            k_reductionPhase(partialCumOp);
            k_reversePhase(partialCumOp);

            // Write the final output to global memory
            k_pushData(partialCumOp, output, globalThreadID, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);

            if (blockSum != NULL){
                if (threadIdx.x == blockDim.x - 1) {
                    blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumOp[threadIdx.x*2 + 1];
                }
            }
        }
        """ % locals())
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        # k_finalCumOp
        kname = "k_finalCumOp"
        k_var = "k_finalCumOp_" + nodename
        code = ("""#include "cluda.h"

        KERNEL void k_finalCumOp(float* output, ga_size output_offset,
                                 float* blockSum, ga_size blockSum_offset,
                                 size_t nbElementsPerCumOp,
                                 ga_ssize dataStrides_x,  ga_ssize dataStrides_y,  ga_ssize dataStrides_z,
                                 int offsetY, int offsetZ) {

            output = (float *)(((char *)output) + output_offset);
            blockSum = (float *)(((char *)blockSum) + blockSum_offset);

            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

            // Check if current thread has data to process.
            if (globalThreadID >= (nbElementsPerCumOp+1)/2)
                return;

            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;

            const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];

            int offset = idY * dataStrides_y + idZ * dataStrides_z;
            int idx_even = (globalThreadID*2    ) * dataStrides_x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides_x + offset;
            output[idx_even] %(op)s= currentBlockSum;
            output[idx_odd] %(op)s= currentBlockSum;
        }
        """ % locals())
        params = [
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.GpuArray,
            gpuarray.SIZE,
            gpuarray.SIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            gpuarray.SSIZE,
            "int32",
            "int32",
        ]
        kernels.append(
            Kernel(code=code,
                   name=kname,
                   params=params,
                   flags=flags,
                   objvar=k_var))
        return kernels

    def c_code(self, node, nodename, inp, out, sub):
        if node.inputs[0].type.context.kind != b"cuda":
            raise NotImplementedError("cuda only")
        return """
            const size_t* shape = PyGpuArray_DIMS(%(x)s);
            bool needAllocation = !%(z)s || PyGpuArray_NDIM(%(x)s) != PyGpuArray_NDIM(%(z)s);

            int axis = %(params)s->axis;
            if (axis < 0) {
                // Convert negative axis to positive axis.
                axis += PyGpuArray_NDIM(%(x)s);
            }

            if (theano_prep_output(&%(z)s, PyGpuArray_NDIM(%(x)s), PyGpuArray_DIMS(%(x)s),
                                   %(x)s->ga.typecode, GA_C_ORDER, %(params)s->context) != 0) {
                %(fail)s;
            }

            { // Namespace for kernel calls //
                size_t max_threads_dim0;
                size_t max_grid_size1;
                size_t max_grid_size2;
                int err;
                err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE0, &max_threads_dim0);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims0");
                    %(fail)s;
                }
                err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE1, &max_grid_size1);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size1");
                    %(fail)s;
                }
                err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXGSIZE2, &max_grid_size2);
                if (err != GA_NO_ERROR){
                    PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_grid_size2");
                    %(fail)s;
                }
                if (cumOp_%(nodename)s(%(x)s, %(z)s, axis, max_threads_dim0, max_grid_size1, max_grid_size2) == -1){
                    %(fail)s;
                }
            }
        """ % dict(
            x=inp[0],
            z=out[0],
            nodename=nodename,
            fail=sub["fail"],
            params=sub["params"],
        )

    def c_support_code_struct(self, node, nodename):
        code = ("""

        int cumOp_%(nodename)s(PyGpuArrayObject* input, PyGpuArrayObject* output, int axis, size_t maxThreads, size_t maxGridY, size_t maxGridZ) {
            size_t shape[3] = { 1, 1, 1 };
            ssize_t inputStrides_x;
            ssize_t inputStrides_y;
            ssize_t inputStrides_z;
            ssize_t outputStrides_x;
            ssize_t outputStrides_y;
            ssize_t outputStrides_z;
            switch (PyGpuArray_NDIM(input))
            {
            case 1:
                shape[0] = PyGpuArray_DIMS(input)[0];
                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                break;
            case 2:
                shape[0] = PyGpuArray_DIMS(input)[0];
                shape[1] = PyGpuArray_DIMS(input)[1];
                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
                inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
                break;
            case 3:
                shape[0] = PyGpuArray_DIMS(input)[0];
                shape[1] = PyGpuArray_DIMS(input)[1];
                shape[2] = PyGpuArray_DIMS(input)[2];
                inputStrides_x = PyGpuArray_STRIDES(input)[0] / sizeof(float);
                inputStrides_y = PyGpuArray_STRIDES(input)[1] / sizeof(float);
                inputStrides_z = PyGpuArray_STRIDES(input)[2] / sizeof(float);
                outputStrides_x = PyGpuArray_STRIDES(output)[0] / sizeof(float);
                outputStrides_y = PyGpuArray_STRIDES(output)[1] / sizeof(float);
                outputStrides_z = PyGpuArray_STRIDES(output)[2] / sizeof(float);
                break;
            default:
                PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
                return -1;
            }
            if (shape[axis] <= 1) {
                int err = pygpu_move(output, input);
                return err;
            }
            // Perform cum op on array of even size.
            size_t nbElementsPerCumOp = shape[axis] - (shape[axis] %% 2);
            // Determine how many elements can be processed in one block.
            size_t dimBlockX = ((nbElementsPerCumOp > 2*maxThreads ? 2*maxThreads : nbElementsPerCumOp)+1)/2;
            // Determine how many blocks are needed in total.
            size_t dimGridX = (nbElementsPerCumOp+2*dimBlockX-1) / (2*dimBlockX);  // Nb. of blocks needed per cum op.
            size_t dimGridY;  // Nb. of independent cum ops (width).
            size_t dimGridZ;  // Nb. of independent cum ops (height).
            ssize_t tmp;
            switch (axis)
            {
            case 0:
                dimGridY = shape[1];
                dimGridZ = shape[2];
                break;
            case 1:
                dimGridY = shape[0];
                dimGridZ = shape[2];
                tmp = inputStrides_x;
                inputStrides_x = inputStrides_y;
                inputStrides_y = tmp;
                tmp = outputStrides_x;
                outputStrides_x = outputStrides_y;
                outputStrides_y = tmp;
                break;
            case 2:
                dimGridY = shape[1];
                dimGridZ = shape[0];

                tmp = inputStrides_x;
                inputStrides_x = inputStrides_z;
                inputStrides_z = tmp;

                tmp = outputStrides_x;
                outputStrides_x = outputStrides_z;
                outputStrides_z = tmp;

                break;
            default:
                PyErr_SetString(PyExc_RuntimeError, "Unsupported Axis");
                return -1;
            }

            const size_t shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
            PyGpuArrayObject* deviceBlockSum = pygpu_empty(2, shapeBlockSum, output->ga.typecode,
                                                           GA_C_ORDER, input->context, Py_None);
            if (deviceBlockSum == NULL){
                return -1;
            }
            // Perform `maxGridY`*`maxGridZ` cum ops in parallel.
            for (size_t offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
                size_t localDimGridY = (dimGridY - offsetY < maxGridY) ? (dimGridY - offsetY) : (maxGridY);

                for (size_t offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
                    size_t localDimGridZ = (dimGridZ - offsetZ < maxGridZ) ? (dimGridZ - offsetZ) : (maxGridZ);
                    size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                    size_t dimBlock[3] = {dimBlockX, 1, 1};  // One cum op per block.
                    size_t sharedBytes = (2*dimBlockX) * sizeof(float);

                    int err = k_blockCumOp_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, nbElementsPerCumOp, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, deviceBlockSum->ga.data, deviceBlockSum->ga.offset);
                    if (err != GA_NO_ERROR){
                        PyErr_SetString(PyExc_RuntimeError, "blockCumOp call failed");
                        return -1;
                    }

                    if (dimGridX > 1) {
                        // Do a cum op over the blockSum (recursive).
                        if (cumOp_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                            Py_DECREF(deviceBlockSum);
                            return -1;
                        }
                        // Since there is more than one block (i.e. `dimGridX > 1`),
                        // add the partial cum ops of previous blocks to subsequent ones.
                        size_t dimGrid[3] = {dimGridX, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {dimBlockX, 1, 1};

                        int err = k_finalCumOp_call(3, dimGrid, dimBlock, sharedBytes, output->ga.data, output->ga.offset, deviceBlockSum->ga.data, deviceBlockSum->ga.offset, nbElementsPerCumOp, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ);
                        if (err != GA_NO_ERROR){
                            PyErr_SetString(PyExc_RuntimeError, "finalCumOp call failed");
                            return -1;
                        }
                    }
                    // If shape[axis] is odd, the last element is computed manually
                    if (shape[axis] != nbElementsPerCumOp){
                        size_t dimGrid[3] = {1, localDimGridY, localDimGridZ};
                        size_t dimBlock[3] = {1, 1, 1};

                        int err = k_cumadd_call(3, dimGrid, dimBlock, sharedBytes, input->ga.data, input->ga.offset, output->ga.data, output->ga.offset, inputStrides_x, inputStrides_y, inputStrides_z, outputStrides_x, outputStrides_y, outputStrides_z, offsetY, offsetZ, shape[axis] - 2, shape[axis] - 1);
                        if (err != GA_NO_ERROR){
                            PyErr_SetString(PyExc_RuntimeError, "cumadd call failed");
                            return -1;
                        }

                    }
                }
            }
            Py_XDECREF(deviceBlockSum);
            return 0;
        }
        """ % locals())
        return super(GpuCumOp, self).c_support_code_struct(node,
                                                           nodename) + code
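
For reference, the three kernels above implement a standard blocked (two-level) scan. A NumPy sketch of the same idea (illustration only, not the GPU code path; blocked_cumsum_ref is a hypothetical helper):

import numpy as np

def blocked_cumsum_ref(v, block=4):
    # Scan each block independently (k_blockCumOp), scan the per-block totals
    # (the recursive call on deviceBlockSum), then add the running total of the
    # preceding blocks back in (k_finalCumOp). The separate odd-tail handling
    # done by k_cumadd is not modelled here.
    out = np.concatenate([np.cumsum(v[i:i + block]) for i in range(0, len(v), block)])
    totals = np.cumsum(out[block - 1::block])
    for b in range(1, (len(v) + block - 1) // block):
        out[b * block:(b + 1) * block] += totals[b - 1]
    return out

v = np.arange(1, 11, dtype="float32")
assert np.allclose(blocked_cumsum_ref(v), np.cumsum(v))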
Example 5
class BaseCorr3dMM(gof.OpenMPOp):
    """
    Base class for `Corr3dMM`, `Corr3dMM_gradWeights` and
    `Corr3dMM_gradInputs`. Cannot be used directly.

    Every sub-class must define the internal attribute ``_direction`` outside of ``__init__()``.
    ``_direction`` must take one of the following values:

     - "forward" to correlate bottom with weights and store results in top.
     - "backprop weights" to do a valid convolution of bottom with top
       (swapping the first two dimensions) and store results in weights.
     - "backprop inputs" to do a full convolution of top with weights
       (swapping the first two dimensions) and store results in bottom.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size can be specified directly as an integer
        or a tuple of three integers.
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1, 1, 1))
    num_groups
        Perform grouped convolutions (default: 1)
    """

    check_broadcast = False
    __props__ = ("border_mode", "subsample", "filter_dilation", "num_groups")

    _direction = None

    params_type = ParamsType(
        direction=EnumList(
            ("DIRECTION_FORWARD", "forward"),  # 0
            ("DIRECTION_BACKPROP_WEIGHTS", "backprop weights"),  # 1
            ("DIRECTION_BACKPROP_INPUTS", "backprop inputs"),
        ),  # 2
        dH=int64,
        dW=int64,
        dD=int64,
        dilH=int64,
        dilW=int64,
        dilD=int64,
        padH=int64,
        padW=int64,
        padD=int64,
        num_groups=int64,
    )

    def __init__(
            self,
            border_mode="valid",
            subsample=(1, 1, 1),
            filter_dilation=(1, 1, 1),
            openmp=None,
            num_groups=1,
    ):
        super(BaseCorr3dMM, self).__init__(openmp=openmp)
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
                raise ValueError("invalid border_mode {}, which must be a "
                                 "non-negative integer".format(border_mode))
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            if len(border_mode) != 3 or min(border_mode) < 0:
                raise ValueError(
                    "invalid border_mode {}, which must be a tuple of "
                    "three non-negative integers".format(border_mode))
            pad_h, pad_w, pad_d = map(int, border_mode)
            border_mode = (pad_h, pad_w, pad_d)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0)
                or border_mode in ("valid", "full", "half")):
            raise ValueError(
                "invalid border_mode {}, which must be either "
                '"valid", "full", "half", an integer or a tuple of three'
                " integers".format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 3:
            raise ValueError("subsample must have three elements")
        if len(filter_dilation) != 3:
            raise ValueError("filter_dilation must have three elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups

        if not theano.config.blas.ldflags:
            # Theano will use a NumPy C implementation of [sd]gemm_ instead.
            self.blas_type = ""
        else:
            if "openblas" in theano.config.blas.ldflags:
                self.blas_type = "openblas"
            elif "mkl" in theano.config.blas.ldflags:
                self.blas_type = "mkl"
            else:
                self.blas_type = ""

        if self._direction not in [
                "forward", "backprop weights", "backprop inputs"
        ]:
            raise ValueError("_direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")

    @property
    def pad(self):
        if self.border_mode == "half":
            return (-1, -1, -1)
        elif self.border_mode == "full":
            return (-2, -2, -2)
        elif isinstance(self.border_mode, tuple):
            return self.border_mode
        else:
            assert self.border_mode == "valid"
            return (0, 0, 0)

    # Direction should be converted to the real enum value,
    # as it is compared to an integer later in c_code_helper().
    direction = property(
        lambda self: self.params_type.enum_from_alias(self._direction))

    dH = property(lambda self: self.subsample[0])
    dW = property(lambda self: self.subsample[1])
    dD = property(lambda self: self.subsample[2])

    dilH = property(lambda self: self.filter_dilation[0])
    dilW = property(lambda self: self.filter_dilation[1])
    dilD = property(lambda self: self.filter_dilation[2])

    padH = property(lambda self: self.pad[0])
    padW = property(lambda self: self.pad[1])
    padD = property(lambda self: self.pad[2])

    def __str__(self):
        return "%s{%s, %s, %s, %s}" % (
            self.__class__.__name__,
            self.border_mode,
            str(self.subsample),
            str(self.filter_dilation),
            str(self.num_groups),
        )

    @staticmethod
    def as_common_dtype(in1, in2):
        """
        Upcast input variables if necessary.
        """
        dtype = theano.scalar.upcast(in1.dtype, in2.dtype)
        return in1.astype(dtype), in2.astype(dtype)

    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, "num_groups"):
            self.num_groups = 1

    def c_support_code(self):
        ccodes = blas_headers.blas_header_text()
        if self.blas_type == "openblas":
            ccodes += blas_headers.openblas_threads_text()
        elif self.blas_type == "mkl":
            ccodes += blas_headers.mkl_threads_text()
        return ccodes

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        compile_args = ldflags(libs=False, flags=True)
        compile_args += super(BaseCorr3dMM, self).c_compile_args()
        return compile_args

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_headers(self):
        headers = ["<stdio.h>"]
        headers += super(BaseCorr3dMM, self).c_headers()
        return headers

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (8, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        dtype = str(node.__dict__["inputs"][0].dtype)
        assert dtype in ("float32", "float64")
        if dtype == "float32":
            sub["gemm"] = "sgemm_"
            sub["float_type"] = "npy_float"
            sub["float_typenum"] = "NPY_FLOAT"
            sub["n_bytes"] = 4
            sub["c_float_type"] = "float"
        else:
            sub["gemm"] = "dgemm_"
            sub["float_type"] = "npy_double"
            sub["float_typenum"] = "NPY_DOUBLE"
            sub["n_bytes"] = 8
            sub["c_float_type"] = "double"

        if self.openmp:
            sub["omp_flags"] = "#pragma omp parallel for schedule(static)"
            sub["omp_get_max_threads"] = "omp_get_max_threads()"
            sub["omp_get_thread_num"] = "omp_get_thread_num()"

            if self.blas_type == "openblas":
                sub["blas_set_num_threads"] = "openblas_set_num_threads"
                sub["blas_get_num_threads"] = "openblas_get_num_threads()"
            elif self.blas_type == "mkl":
                sub["blas_set_num_threads"] = "mkl_set_num_threads"
                sub["blas_get_num_threads"] = "mkl_get_max_threads()"
            else:
                sub["blas_set_num_threads"] = ""
                sub["blas_get_num_threads"] = "0"
        else:
            sub["omp_flags"] = ""
            sub["omp_get_max_threads"] = "1"
            sub["omp_get_thread_num"] = "0"
            sub["blas_set_num_threads"] = ""
            sub["blas_get_num_threads"] = "0"

        files = [os.path.join("c_code", "corr3d_gemm.c")]
        codes = [
            open(os.path.join(os.path.split(__file__)[0], f)).read()
            for f in files
        ]
        final_code = ""
        for code in codes:
            final_code += code
        return final_code % sub

    def c_code_helper(self,
                      bottom,
                      weights,
                      top,
                      sub,
                      height=None,
                      width=None,
                      depth=None):
        """
        This generates the C code for Corr3dMM (direction="forward"),
        Corr3dMM_gradWeights (direction="backprop weights"), and
        Corr3dMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param sub: Dictionary of substitutions useable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".  Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".  Ignored otherwise.
        :param depth: If self.subsample[2] != 1, a variable giving the depth
            of the filters for direction="backprop weights" or the depth of the
            input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the depth of the
            filters for direction="backprop weights".  Ignored otherwise.
        """

        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        if height:
            height = "(*(npy_int64 *)(PyArray_DATA(%s)))" % height
        else:
            if ((self.direction != 0) and
                (self.dH != 1)) or ((self.direction == 1) and
                                    (self.padH == -1)):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or border_mode='half'"
                )
            height = "-1"
        if width:
            width = "(*(npy_int64 *)(PyArray_DATA(%s)))" % width
        else:
            if ((self.direction != 0) and
                (self.dW != 1)) or ((self.direction == 1) and
                                    (self.padW == -1)):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or border_mode='half'"
                )
            width = "-1"
        if depth:
            depth = "(*(npy_int64 *)(PyArray_DATA(%s)))" % depth
        else:
            if ((self.direction != 0) and
                (self.dD != 1)) or ((self.direction == 1) and
                                    (self.padD == -1)):
                raise ValueError(
                    "depth must be given for backprop with depth sampling or border_mode='half'"
                )
            depth = "-1"

        return """
    // Mandatory args
    int direction = %(params)s->direction;  // forward, bprop weights, bprop inputs

    // Optional args
    int dH = %(params)s->dH;
    int dW = %(params)s->dW;
    int dD = %(params)s->dD;
    int dilH = %(params)s->dilH;
    int dilW = %(params)s->dilW;
    int dilD = %(params)s->dilD;
    int padH = %(params)s->padH;
    int padW = %(params)s->padW;
    int padD = %(params)s->padD;
    int numgroups = %(params)s->num_groups;

    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
    PyArrayObject * top = %(top)s;
    PyArrayObject * out2 = NULL;
    PyArrayObject **out = NULL;

    switch(%(params)s->direction) {
        case DIRECTION_FORWARD:
            out = &%(top)s;
            break;
        case DIRECTION_BACKPROP_WEIGHTS:
            out = &%(weights)s;
            break;
        case DIRECTION_BACKPROP_INPUTS:
            out = &%(bottom)s;
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "CPU Corr3dMM: Invalid direction.");
            {%(fail)s}
            break;
    }

    // Obtain or infer kernel width, height and depth
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, kD, dil_kH, dil_kW, dil_kD;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[2];
        kW = PyArray_DIMS(weights)[3];
        kD = PyArray_DIMS(weights)[4];
    }
    else {
        if (%(height)s != -1) {
            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + 2*padH - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
        if (%(width)s != -1) {
            kW = %(width)s;
        }
        else if (padW == -2) {
            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyArray_DIMS(bottom)[3] + 2*padW - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        if (%(depth)s != -1) {
            kD = %(depth)s;
        }
        else if (padD == -2) {
            kD = (2 - PyArray_DIMS(bottom)[4] + (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
        else {
            kD = (PyArray_DIMS(bottom)[4] + 2*padD - (PyArray_DIMS(top)[4] - 1) * dD - 1) / dilD + 1;
        }
    }

    // Implicit dilated kernel size
    dil_kH = (kH - 1) * dilH + 1;
    dil_kW = (kW - 1) * dilW + 1;
    dil_kD = (kD - 1) * dilD + 1;

    // Auto-padding if requested
    if (padH == -1) {  // vertical half padding
        padH = dil_kH / 2;
    }
    else if (padH == -2) {  // vertical full padding
        padH = dil_kH - 1;
    }
    else if (padH < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padH must be >= -2");
        %(fail)s
    }
    if (padW == -1) {  // horizontal half padding
        padW = dil_kW / 2;
    }
    else if (padW == -2) {  // horizontal full padding
        padW = dil_kW - 1;
    }
    else if (padW < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padW must be >= -2");
        %(fail)s
    }
    if (padD == -1) {  // depth half padding
        padD = dil_kD / 2;
    }
    else if (padD == -2) {  // depth full padding
        padD = dil_kD - 1;
    }
    else if (padD < 0) {
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: padD must be >= -2");
        %(fail)s
    }

    // Infer output shape
    npy_intp out_dim[5];
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width, depth)
        // height and width: top = (bottom + 2*pad - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + 2*padH - ((PyArray_DIMS(weights)[2]-1)*dilH + 1)) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + 2*padW - ((PyArray_DIMS(weights)[3]-1)*dilW + 1)) / dW + 1);
        out_dim[4] = (npy_intp)((PyArray_DIMS(bottom)[4] + 2*padD - ((PyArray_DIMS(weights)[4]-1)*dilD + 1)) / dD + 1);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
        {
            PyErr_Format(PyExc_ValueError,
                         "Corr3dMM: impossible output shape\\n"
                         "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                         "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                         "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                         (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                         (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                         (long int)PyArray_DIMS(bottom)[4],
                         (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                         (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                         (long int)PyArray_DIMS(weights)[4],
                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                         (long int)out_dim[3], (long int)out_dim[4]);
            %(fail)s
        }
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width, depth)
        // height and width: weights = (bottom + 2*pad - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        out_dim[1] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
        out_dim[2] = (npy_intp)kH;  // already inferred further above
        out_dim[3] = (npy_intp)kW;  // how convenient
        out_dim[4] = (npy_intp)kD;
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
        {
            PyErr_Format(PyExc_ValueError,
                         "Corr3dMM backprop wrt. weights: impossible output shape\\n"
                         "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                         "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                         "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                         (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                         (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                         (long int)PyArray_DIMS(bottom)[4],
                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                         (long int)out_dim[3], (long int)out_dim[4],
                         (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                         (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3],
                         (long int)PyArray_DIMS(top)[4]);
            %(fail)s
        }
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width, depth)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[1] * numgroups;
        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[2]-1)*dilH + 1 - 2*padH);
        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[3]-1)*dilW + 1 - 2*padW);
        out_dim[4] = (npy_intp)((%(depth)s != -1) ? %(depth)s : (PyArray_DIMS(top)[4] - 1) * dD + (PyArray_DIMS(weights)[4]-1)*dilD + 1 - 2*padD);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0 || out_dim[4] <= 0)
        {
            PyErr_Format(PyExc_ValueError,
                         "Corr3dMM backprop wrt. inputs: impossible output shape\\n"
                         "  bottom shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                         "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                         "  top shape: %%ld x %%ld x %%ld x %%ld x %%ld\\n",
                         (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                         (long int)out_dim[3], (long int)out_dim[4],
                         (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                         (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                         (long int)PyArray_DIMS(weights)[4],
                         (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                         (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3],
                         (long int)PyArray_DIMS(top)[4]);
            %(fail)s
        }
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorr3dMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    int typenum;
    if ( !(*out
           && PyArray_NDIM(*out)==4
           && PyArray_IS_C_CONTIGUOUS(*out)
           && PyArray_DIMS(*out)[0]==out_dim[0]
           && PyArray_DIMS(*out)[1]==out_dim[1]
           && PyArray_DIMS(*out)[2]==out_dim[2]
           && PyArray_DIMS(*out)[3]==out_dim[3]
           && PyArray_DIMS(*out)[4]==out_dim[4]))
    {
        Py_XDECREF(*out);
        if (direction != 1) {
          typenum = PyArray_TYPE(weights);
        }
        else {
          typenum = PyArray_TYPE(bottom);
        }
        //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
        *out = (PyArrayObject*)PyArray_ZEROS(5,
                                          out_dim,
                                          typenum,
                                          0);
        if (NULL == *out)
        {
            PyErr_Format(PyExc_RuntimeError,
                    "BaseCorr3dMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld x %%lld",
                    (long long)out_dim[0], (long long)out_dim[1],
                    (long long)out_dim[2], (long long)out_dim[3], (long long)out_dim[4]);
            %(fail)s
        }
    }

    // Call corr3dMM code
    out2 = corr3dMM(%(bottom)s, %(weights)s, %(top)s, direction,
                    dH, dW, dD, dilH, dilW, dilD, padH, padW, padD,
                    numgroups);
    if (out2==NULL){
       %(fail)s
    }
    assert (out2 == *out);

""" % dict(
            bottom=bottom,
            weights=weights,
            top=top,
            height=height,
            width=width,
            depth=depth,
            fail=sub["fail"],
            params=sub["params"],
        )
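
The output-shape arithmetic in the forward branch above (case 0) is the usual correlation formula applied per spatial dimension. A small sketch (conv3d_out_len is a hypothetical helper, not part of Theano):

def conv3d_out_len(bottom, kernel, pad, sub=1, dil=1):
    # top = (bottom + 2*pad - ((kernel - 1)*dil + 1)) // sub + 1, as in out_dim[2..4] above.
    return (bottom + 2 * pad - ((kernel - 1) * dil + 1)) // sub + 1

# e.g. a 16-long axis, a 3-wide filter, half padding (pad = dilated_kernel // 2), stride 2:
assert conv3d_out_len(16, 3, pad=1, sub=2) == 8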
Example 6
class GpuExtractDiag2D(GpuKernelBase, Op):
    """
    Extracting diagonal of a 2D matrix on the GPU.

    """
    __props__ = ('context_name', 'keepdims')
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, keepdims=bool_t)

    def __init__(self, context_name=None, keepdims=False):
        self.context_name = context_name
        self.keepdims = keepdims

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=get_context(
                                               self.context_name),
                                           keepdims=self.keepdims)

    def make_node(self, x, k=0):  #TODO: dtype check
        x = as_gpuarray_variable(x, context_name=self.context_name)
        k = tensor.as_tensor_variable(k)
        assert x.ndim == 2
        assert k.ndim == 0
        broadcastable = (False, True) if self.keepdims else (False, )
        otype = GpuArrayType(dtype=x.type.dtype,
                             broadcastable=broadcastable,
                             context_name=self.context_name)
        return gof.Apply(self, [x, k], [otype()])

    def infer_shape(self, node, in_shapes):
        in_shape, _ = in_shapes
        dim1 = in_shape[0]
        dim2 = in_shape[1]
        k = node.inputs[1]
        diag_size = T.switch(T.ge(k, 0), T.clip(dim2 - k, 0, dim1),
                             T.clip(dim1 + k, 0, dim2))
        if self.keepdims:
            diag_size = (diag_size, 1)
        else:
            diag_size = (diag_size, )
        return [diag_size]

    def grad(self, inp, grads):
        return [
            GpuAllocDiag2D()(grads[0], inp[1], *(inp[0].shape)),
            grad_not_implemented(self, 1, inp[1])
        ]

    def gpu_kernels(self, node, name):
        dtype_x = node.inputs[0].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        dtype_y = node.outputs[0].dtype
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
        load_x = load_w(dtype_x)
        write_y = write_w(dtype_y)
        code = """
        #include "cluda.h"
        KERNEL void extract(const ga_ssize stridesX0, const ga_ssize stridesX1, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_ssize k, ga_size l) {
            x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
            y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
            ga_ssize coff = max(k, (ga_ssize) 0);
            ga_ssize roff = -min(k, (ga_ssize) 0);
            ga_size index = GID_0 * LDIM_0 + LID_0;
            if (index < l) {
                %(work_x)s t = %(load_x)s(x[(index + roff) * stridesX0 + (index + coff) * stridesX1]);
                y[index * stridesY0] = %(write_y)s(t);
            }
        }""" % dict(type_x=type_x,
                    type_y=type_y,
                    work_x=work_x,
                    load_x=load_x,
                    write_y=write_y,
                    name=name)
        return [
            Kernel(code=code,
                   name="extract",
                   params=[
                       gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray,
                       gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray,
                       gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SIZE
                   ],
                   flags=Kernel.get_flags(dtype_x, dtype_y),
                   objvar='k_extract_' + name)
        ]

    def c_headers(self):
        return [
            '<numpy_compat.h>', '<gpuarray_helper.h>', '<gpuarray/types.h>'
        ]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inp, out, sub):  #TODO: fix error msg
        x, k = inp
        y, = out
        fail = sub['fail']
        params = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        int err;
        size_t* dims = (size_t*)PyGpuArray_DIMS((PyGpuArrayObject*)%(x)s);
        ssize_t k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
        size_t col_off = (size_t) (k > 0?k:0);
        size_t row_off = (size_t) (k < 0?-k:0);
        size_t diag_size = (size_t) std::max((ssize_t) std::min((ssize_t)dims[0] - (ssize_t)row_off, (ssize_t)dims[1] - (ssize_t)col_off), (ssize_t) 0);
        size_t ls = std::min(diag_size, (size_t) 1024);
        size_t gs = (diag_size + ls - 1) / ls;
        size_t ndims = %(params)s->keepdims ? 2 : 1;
        size_t out_dims[ndims];
        out_dims[0] = diag_size;
        if (ndims == 2) {
            out_dims[1] = 1;
        }

        size_t itemsize_x = 1;
        size_t itemsize_y = 1;
        ssize_t stridesX0 = 1;
        ssize_t stridesX1 = 1;
        ssize_t stridesY0 = 1;

        if (%(y)s == NULL || %(y)s->ga.nd != ndims || %(y)s->ga.dimensions[0] != diag_size || (ndims > 1 && %(y)s->ga.dimensions[1] != 1)) {
            Py_CLEAR(%(y)s);
            %(y)s = pygpu_empty(ndims, out_dims, %(typecode)s, GA_C_ORDER, %(params)s->context, Py_None);
        }
        if (%(y)s == NULL) {
            %(fail)s
        }

        itemsize_x = GpuArray_ITEMSIZE(&%(x)s->ga);
        itemsize_y = GpuArray_ITEMSIZE(&%(y)s->ga);
        stridesX0 = PyGpuArray_STRIDES(%(x)s)[0] / itemsize_x;
        stridesX1 = PyGpuArray_STRIDES(%(x)s)[1] / itemsize_x;
        stridesY0 = PyGpuArray_STRIDES(%(y)s)[0] / itemsize_y;

        if (row_off < dims[0] && col_off < dims[1]) {
            err = extract_call(1, &gs, &ls, 0, stridesX0, stridesX1, %(x)s->ga.data, %(x)s->ga.offset, stridesY0, %(y)s->ga.data, %(y)s->ga.offset, k, diag_size);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError, "gpuarray error: kExtract: %%s. n%%lu, m=%%lu.", GpuKernel_error(&%(kname)s, err), (unsigned long)dims[0], (unsigned long)dims[1]);
                %(fail)s;
            }
        } else {
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        return (1, )
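
For reference, the kernel above reads element (index + roff, index + coff) with coff = max(k, 0) and roff = -min(k, 0), which matches NumPy's np.diagonal with offset k, and the clipped expression in infer_shape gives the matching length. A small CPU-side check (illustrative only, not part of the op):

import numpy as np

def diag_size(dim1, dim2, k):
    # Same switch as infer_shape above, written with plain Python/NumPy.
    return int(np.clip(dim2 - k, 0, dim1)) if k >= 0 else int(np.clip(dim1 + k, 0, dim2))

x = np.arange(12).reshape(3, 4)
for k in (-2, 0, 1, 3):
    assert np.diagonal(x, offset=k).shape[0] == diag_size(3, 4, k)
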
Example n. 7
    def test_params_type_filtering(self):
        shape_tensor5 = (1, 2, 2, 3, 2)
        size_tensor5 = shape_tensor5[0] * shape_tensor5[1] * shape_tensor5[2] * shape_tensor5[3] * shape_tensor5[4]
        random_tensor = np.random.normal(size=size_tensor5).reshape(shape_tensor5)

        w = ParamsType(a1=TensorType('int32', (False, False)),
                       a2=TensorType('float64', (False, False, False, False, False)),
                       a3=Generic())

        # With a value that does not match the params type.
        o = Params(w,
                   a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int64'),
                   a2=random_tensor.astype('float32'),
                   a3=2000)
        # should fail (o.a1 is not int32, o.a2 is not float64)
        self.assertRaises(TypeError, w.filter, o, True)
        # should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed)
        self.assertRaises(TypeError, w.filter, o, False, False)
        # Should pass.
        w.filter(o, strict=False, allow_downcast=True)

        # With a value that matches the params type.
        o1 = Params(w,
                    a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'),
                    a2=random_tensor.astype('float64'),
                    a3=2000)
        # All should pass.
        w.filter(o1, strict=True)
        w.filter(o1, strict=False, allow_downcast=False)
        w.filter(o1, strict=False, allow_downcast=True)

        # Check values_eq and values_eq_approx.
        o2 = Params(w,
                    a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'),
                    a2=random_tensor.astype('float64'),
                    a3=2000)
        assert w.values_eq(o1, o2)
        assert w.values_eq_approx(o1, o2)

        # Check values_eq_approx.
        # NB: I don't know exactly which kinds of differences are rejected by values_eq but accepted by values_eq_approx,
        # so I just perturb the float values a little.
        o3 = Params(w,
                    a1=np.asarray([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]).astype('int32'),
                    a2=(random_tensor.astype('float32') * 10 / 2.2 * 2.19999999999 / 10).astype('float64'),
                    a3=2000.0 - 0.00000000000000001)
        assert w.values_eq_approx(o1, o3)
Example n. 8
class CumOp(Op):
    # See function cumsum/cumprod for docstring

    __props__ = ("axis", "mode")
    check_input = False
    params_type = ParamsType(c_axis=int_t,
                             mode=EnumList(("MODE_ADD", "add"),
                                           ("MODE_MUL", "mul")))

    def __init__(self, axis=None, mode="add"):
        if mode not in ("add", "mul"):
            raise ValueError('{}: Unknown mode "{}"'.format(
                type(self).__name__, mode))
        self.axis = axis
        self.mode = mode

    c_axis = property(lambda self: np.MAXDIMS
                      if self.axis is None else self.axis)

    def make_node(self, x):
        x = basic.as_tensor_variable(x)
        out_type = x.type()

        if self.axis is None:
            out_type = theano.tensor.vector(dtype=x.dtype)  # Flatten
        elif self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError("axis(={}) out of bounds".format(self.axis))

        return theano.Apply(self, [x], [out_type])

    def perform(self, node, inputs, output_storage, params):
        x = inputs[0]
        z = output_storage[0]
        if self.mode == "add":
            z[0] = np.cumsum(x, axis=self.axis)
        else:
            z[0] = np.cumprod(x, axis=self.axis)

    def grad(self, inputs, output_gradients):
        (x, ) = inputs
        (gi, ) = output_gradients

        if self.axis is None:
            if self.mode == "add":
                return [cumsum(gi[::-1])[::-1].reshape(x.shape)]
            elif self.mode == "mul":
                fx = cumprod(x, axis=self.axis)
                return [cumsum((fx * gi)[::-1])[::-1].reshape(x.shape) / x]
            else:
                raise NotImplementedError(
                    '%s: unknown gradient for mode "%s"' %
                    (type(self).__name__, self.mode))

        reverse_slicing = [slice(None, None, None)] * gi.ndim
        reverse_slicing[self.axis] = slice(None, None, -1)
        reverse_slicing = tuple(reverse_slicing)
        # We need to reverse the gradients along ``self.axis``,
        #  compute cumsum, then reverse again
        if self.mode == "add":
            return [cumsum(gi[reverse_slicing], self.axis)[reverse_slicing]]
        elif self.mode == "mul":
            fx = cumprod(x, axis=self.axis)
            return [
                cumsum(
                    (fx * gi)[reverse_slicing], self.axis)[reverse_slicing] / x
            ]
        else:
            raise NotImplementedError(
                '{}: unknown gradient for mode "{}"'.format(
                    type(self).__name__, self.mode))

    def infer_shape(self, node, shapes):
        if self.axis is None:
            return [(basic.prod(shapes[0]), )]  # Flatten

        return shapes

    def c_code(self, node, name, inames, onames, sub):
        (x, ) = inames
        (z, ) = onames
        axis = self.axis
        fail = sub["fail"]
        params = sub["params"]

        code = ("""
                int axis = %(params)s->c_axis;
                if (axis == 0 && PyArray_NDIM(%(x)s) == 1)
                    axis = NPY_MAXDIMS;
                npy_intp shape[1] = { PyArray_SIZE(%(x)s) };
                if(axis == NPY_MAXDIMS && !(%(z)s && PyArray_DIMS(%(z)s)[0] == shape[0]))
                {
                    Py_XDECREF(%(z)s);
                    %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, PyArray_TYPE((PyArrayObject*) py_%(x)s));
                }

                else if(axis != NPY_MAXDIMS && !(%(z)s && PyArray_CompareLists(PyArray_DIMS(%(z)s), PyArray_DIMS(%(x)s), PyArray_NDIM(%(x)s))))
                {
                    Py_XDECREF(%(z)s);
                    %(z)s = (PyArrayObject*) PyArray_SimpleNew(PyArray_NDIM(%(x)s), PyArray_DIMS(%(x)s), PyArray_TYPE(%(x)s));
                }

                if (!%(z)s)
                    %(fail)s;
                {

                    PyObject * t = NULL;
                    if(%(params)s->mode == MODE_ADD)
                        t = PyArray_CumSum(
                            %(x)s, axis,
                            PyArray_TYPE(%(x)s), %(z)s);
                    else if(%(params)s->mode == MODE_MUL)
                        t = PyArray_CumProd(
                            %(x)s, axis,
                            PyArray_TYPE(%(x)s), %(z)s);

                    if (!t){
                       %(fail)s;
                    }
                    // Because PyArray_CumSum/CumProd returns a newly created reference on t.
                    Py_XDECREF(t);
                }
            """ % locals())

        return code

    def c_code_cache_version(self):
        return (8, )

    def __str__(self):
        return "{}{{{}, {}}}".format(self.__class__.__name__, self.axis,
                                     self.mode)
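
The reversed-cumsum trick used in grad() above can be checked numerically: the Jacobian of cumsum is a lower-triangular matrix of ones, so the vector-Jacobian product of an upstream gradient g is cumsum(g[::-1])[::-1]. A short NumPy check (illustrative only):

import numpy as np

g = np.random.randn(5)
J = np.tril(np.ones((5, 5)))            # d cumsum(x)_i / d x_j = 1 for j <= i
vjp_explicit = J.T.dot(g)               # sum of g_i over i >= j
vjp_trick = np.cumsum(g[::-1])[::-1]    # what CumOp.grad computes for mode "add"
assert np.allclose(vjp_explicit, vjp_trick)
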
Example n. 9
class GpuMaxPoolRop(CGpuKernelBase):
    """
    Implements the R-operator for the downsample operation.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_max_rop.c"], "APPLY_SPECIFIC(max_pool_rop)"
        )
        assert mode == "max"
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, eval_point, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        eval_point = as_gpuarray_variable(eval_point, ctx_name)
        assert eval_point.ndim == nd + 2

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = theano.tensor.cast(ws, "int64")
        stride = theano.tensor.cast(stride, "int64")
        pad = theano.tensor.cast(pad, "int64")

        return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]
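
infer_shape above delegates to Pool.out_shape, which is not shown in this snippet. Assuming the usual pooling arithmetic with ignore_border=True and symmetric padding, each pooled dimension is floor((in + 2*pad - ws) / stride) + 1; the sketch below states that assumption and is not a copy of the Theano helper:

def pooled_dim(in_dim, ws, stride, pad):
    # Assumed formula for ignore_border=True: floor((in + 2*pad - ws) / stride) + 1.
    return (in_dim + 2 * pad - ws) // stride + 1

# e.g. a 28x28 feature map with a 2x2 window, stride 2 and no padding pools to 14x14.
assert pooled_dim(28, 2, 2, 0) == 14
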
Example n. 10
class GpuPool(CGpuKernelBase):
    """
    Implement the max and average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(
        ignore_border=bool_t, mode=PoolingMode_t, context=gpu_context_type
    )

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(self, ["c_code/pool.c"], "APPLY_SPECIFIC(pool)")
        assert PoolingMode_t.has_alias(self.mode)
        assert self.ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2
        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = theano.tensor.cast(ws, "int64")
        stride = theano.tensor.cast(stride, "int64")
        pad = theano.tensor.cast(pad, "int64")

        return Apply(self, [inp, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
        shp = Pool.out_shape(
            in_shapes[0], ws, self.ignore_border, stride, pad, self.ndim
        )
        return [shp]

    def grad(self, inp, grads):
        img, ws, stride, pad = inp
        (grad,) = grads

        grad = gpu_contiguous(grad)

        disc = [theano.gradient.DisconnectedType()() for i in inp[1:]]
        if self.mode == "max":
            out = self(img, ws, stride, pad)
            g_out = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(
                img, out, grad, ws, stride, pad
            )
            return [g_out] + disc
        else:
            g_out = GpuAveragePoolGrad(
                ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode
            )(img, grad, ws, stride, pad)
            return [g_out] + disc

    def connection_pattern(self, node):
        return [[1], [0], [0], [0]]

    def R_op(self, inputs, eval_points):
        if self.mode != "max":
            # Rop for average or sum is simply pooling evaluated at eval point
            eval_inputs = [eval_points[0]] + inputs[1:]
            return [self(*eval_inputs)]

        # R_op can receive None as eval_points.
        # That means there is no differentiable path through that input.
        # If this implies that you cannot compute some outputs,
        # return None for those.
        if eval_points[0] is None:
            return [None]
        z = self(*inputs)
        x, ws, stride, pad = inputs
        return [
            GpuDownsampleFactorMaxGradGrad(self.ignore_border, self.mode, self.ndim)(
                x, z, eval_points[0], ws, stride, pad
            )
        ]
Example n. 11
class GpuAveragePoolGrad(CGpuKernelBase):
    """
    Implement the grad of average pooling on the gpu.

    """

    __props__ = ("ignore_border", "mode", "ndim")
    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode="max", ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == "average":
            mode = "average_inc_pad"
        self.mode = mode
        CGpuKernelBase.__init__(
            self, ["c_code/pool_ave_grad.c"], "APPLY_SPECIFIC(ave_pool_grad)"
        )
        assert mode in ("sum", "average_inc_pad", "average_exc_pad")
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def c_headers(self):
        return ["gpuarray_api.h", "gpuarray_helper.h", "numpy_compat.h"]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir(), pygpu.get_include()]

    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0,) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.mode == "average_exc_pad":
                raise ValueError("Padding must be zero for average_exc_pad")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in theano.tensor.int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = theano.tensor.cast(ws, "int64")
        stride = theano.tensor.cast(stride, "int64")
        pad = theano.tensor.cast(pad, "int64")

        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, fgraph, node, in_shapes):
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, gz, ws, stride, pad = inp
        (ggx,) = grads
        return [
            theano.tensor.zeros_like(x),
            GpuPool(ignore_border=self.ignore_border, ndim=self.ndim, mode=self.mode)(
                ggx, ws, stride, pad
            ),
        ] + [theano.gradient.DisconnectedType()() for i in inp[2:]]

    def connection_pattern(self, node):
        return [[1], [1], [0], [0], [0]]
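
The grad() above relies on average pooling being a linear map A: this op computes A^T gz, so its derivative with respect to gz, applied to ggx, is simply A ggx, i.e. the forward pooling of ggx (hence the GpuPool call). A tiny 1-D NumPy illustration of that identity (helper names are made up for this sketch):

import numpy as np

def avg_pool_matrix(n, ws):
    # Non-overlapping 1-D average pooling written as an explicit matrix A.
    A = np.zeros((n // ws, n))
    for i in range(n // ws):
        A[i, i * ws:(i + 1) * ws] = 1.0 / ws
    return A

n, ws, eps = 8, 2, 1e-6
A = avg_pool_matrix(n, ws)
gz = np.random.randn(n // ws)           # upstream gradient of the pooled output
ggx = np.random.randn(n)                # perturbation of this op's output

# Numerical Jacobian of gx = A^T gz with respect to gz.
J = np.zeros((n, n // ws))
for j in range(n // ws):
    e = np.zeros(n // ws)
    e[j] = eps
    J[:, j] = (A.T.dot(gz + e) - A.T.dot(gz)) / eps
assert np.allclose(J.T.dot(ggx), A.dot(ggx), atol=1e-5)   # grad wrt gz == pooling of ggx
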
Example n. 12
    def params_type(self):
        return ParamsType(i=theano.scalar.basic.int64)
Example n. 13
class GpuMagmaEigh(GpuMagmaBase):
    """Computes the eigen decomposition of a symmetric matrix :math:`A` using magma
    library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
           part of matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
                default). If `False`, computes only eigenvalues of matrix.
    """

    __props__ = ("lower", "compute_v")
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t,
                             compute_v=bool_t,
                             context=gpu_context_type)

    def __init__(self, UPLO="L", compute_v=True):
        assert UPLO in ["L", "U"]
        self.lower = UPLO == "L"
        self.compute_v = compute_v
        COp.__init__(self, ["c_code/magma_eigh.c"],
                     "APPLY_SPECIFIC(magma_eigh)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_v:
            return theano.Apply(
                self,
                [A],
                # return D, V
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)(),
                    A.type(),
                ],
            )
        else:
            return theano.Apply(
                self,
                [A],
                # return D
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)()
                ],
            )

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)
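
As the docstring notes, only float32 input is accepted and the op returns (D, V) or just D depending on compute_v. A hedged usage sketch, assuming a configured gpuarray/MAGMA backend and the class above in scope (variable names are illustrative):

import numpy as np
import theano
import theano.tensor as T

A = T.fmatrix("A")                                  # float32 is the only supported dtype
d, v = GpuMagmaEigh(UPLO="L", compute_v=True)(A)    # eigenvalues and eigenvectors
f = theano.function([A], [d, v])

a = np.random.randn(4, 4).astype("float32")
eigvals, eigvecs = f(a + a.T)                       # pass a symmetric matrix
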
Example n. 14
class BaseCorrMM(gof.OpenMPOp):
    """
    Base class for `CorrMM`, `CorrMM_gradWeights` and
    `CorrMM_gradInputs`. Cannot be used directly.

    Every sub-class must define the internal attribute ``_direction`` outside of ``__init__()``.
    ``_direction`` must take one of the following values:

     - "forward" to correlate bottom with weights and store results in top.
     - "backprop weights" to do a valid convolution of bottom with top
       (swapping the first two dimensions) and store results in weights.
     - "backprop inputs" to do a full convolution of top with weights
       (swapping the first two dimensions) and store results in bottom.

    Parameters
    ----------
    border_mode : {'valid', 'full', 'half'}
        Additionally, the padding size could be directly specified by an integer,
        a pair of integers, or two pairs of integers.
    subsample
        Perform subsampling of the output (default: (1, 1)).
    filter_dilation
        Perform dilated correlation (default: (1,1))
    num_groups
        Perform grouped convolutions (default: 1)
    unshared
        Perform unshared correlation (default: False)
    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation', 'num_groups',
                 'unshared')

    _direction = None

    params_type = ParamsType(
        direction=EnumList(
            ('DIRECTION_FORWARD', 'forward'),  # 0
            ('DIRECTION_BACKPROP_WEIGHTS', 'backprop weights'),  # 1
            ('DIRECTION_BACKPROP_INPUTS', 'backprop inputs')),  # 2
        dH=int64,
        dW=int64,
        dilH=int64,
        dilW=int64,
        padH_l=int64,
        padH_r=int64,
        padW_l=int64,
        padW_r=int64,
        num_groups=int64,
        unshared=int8)

    def __init__(self,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_dilation=(1, 1),
                 num_groups=1,
                 unshared=False,
                 openmp=None):
        super(BaseCorrMM, self).__init__(openmp=openmp)
        if isinstance(border_mode, integer_types):
            if border_mode < 0:
                raise ValueError('invalid border_mode {}, which must be a '
                                 'non-negative integer'.format(border_mode))
            border_mode = ((border_mode, border_mode), ) * 2
        elif isinstance(border_mode, tuple):
            if len(border_mode) != 2:
                raise ValueError('invalid border_mode {} which must be a '
                                 'tuple of length 2'.format(border_mode))
            border = ()
            for mode in border_mode:
                if isinstance(mode, tuple) and len(mode) == 2 and \
                        min(mode) >= 0:
                    border += ((int(mode[0]), int(mode[1])), )
                elif mode >= 0:
                    border += ((int(mode), int(mode)), )
                else:
                    raise ValueError(
                        'invalid border mode {}. The tuple can only contain '
                        'integers or tuples of length 2'.format(border_mode))
            border_mode = border
        elif border_mode not in ('valid', 'full', 'half'):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer, a pair of '
                'integers, or two pairs of integers'.format(border_mode))
        self.border_mode = border_mode
        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        if len(filter_dilation) != 2:
            raise ValueError("filter_dilation must have two elements")
        self.subsample = tuple(subsample)
        self.filter_dilation = tuple(filter_dilation)
        self.unshared = unshared

        if not theano.config.blas.ldflags:
            # Theano will use a NumPy C implementation of [sd]gemm_ instead.
            self.blas_type = ''
        else:
            if 'openblas' in theano.config.blas.ldflags:
                self.blas_type = 'openblas'
            elif 'mkl' in theano.config.blas.ldflags:
                self.blas_type = 'mkl'
            else:
                self.blas_type = ''

        if self._direction not in [
                "forward", "backprop weights", "backprop inputs"
        ]:
            raise ValueError("_direction must be one of 'forward', "
                             "'backprop weights', 'backprop inputs'")
        if num_groups < 1:
            raise ValueError("Number of groups should be greater than 0")
        self.num_groups = num_groups

    @property
    def pad(self):
        if self.border_mode == "half":
            return ((-1, -1), ) * 2
        elif self.border_mode == "full":
            return ((-2, -2), ) * 2
        elif isinstance(self.border_mode, tuple):
            return self.border_mode
        else:
            assert self.border_mode == "valid"
            return ((0, 0), ) * 2

    # Direction should be converted to real enum value,
    # as it is compared to integer later in c_code_helper().
    direction = property(
        lambda self: self.params_type.enum_from_alias(self._direction))

    dH = property(lambda self: self.subsample[0])
    dW = property(lambda self: self.subsample[1])

    dilH = property(lambda self: self.filter_dilation[0])
    dilW = property(lambda self: self.filter_dilation[1])

    padH_l = property(lambda self: self.pad[0][0])
    padH_r = property(lambda self: self.pad[0][1])
    padW_l = property(lambda self: self.pad[1][0])
    padW_r = property(lambda self: self.pad[1][1])

    def __str__(self):
        return '%s{%s, %s, %s, %s, %s}' % (
            self.__class__.__name__, self.border_mode, str(self.subsample),
            str(self.filter_dilation), str(self.num_groups), str(self.unshared))

    @staticmethod
    def as_common_dtype(in1, in2):
        """
        Upcast input variables if necessary.
        """
        dtype = theano.scalar.upcast(in1.dtype, in2.dtype)
        return in1.astype(dtype), in2.astype(dtype)

    def __setstate__(self, d):
        self.__dict__.update(d)
        if not hasattr(self, 'num_groups'):
            self.num_groups = 1

    def c_support_code(self):
        ccodes = blas_headers.blas_header_text()
        if self.blas_type == 'openblas':
            ccodes += blas_headers.openblas_threads_text()
        elif self.blas_type == 'mkl':
            ccodes += blas_headers.mkl_threads_text()
        return ccodes

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        compile_args = ldflags(libs=False, flags=True)
        compile_args += super(BaseCorrMM, self).c_compile_args()
        return compile_args

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_headers(self):
        headers = ['<stdio.h>']
        headers += super(BaseCorrMM, self).c_headers()
        return headers

    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
        return (10, self.openmp, blas_header_version())

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of
        # these files
        sub = {}
        dtype = str(node.inputs[0].dtype)
        assert dtype in ('float32', 'float64')
        if dtype == 'float32':
            sub['gemm'] = 'sgemm_'
            sub['gemv'] = 'sgemv_'
            sub['float_type'] = 'npy_float'
            sub['float_typenum'] = 'NPY_FLOAT'
            sub['n_bytes'] = 4
            sub['c_float_type'] = 'float'
        else:
            sub['gemm'] = 'dgemm_'
            sub['gemv'] = 'dgemv_'
            sub['float_type'] = 'npy_double'
            sub['float_typenum'] = 'NPY_DOUBLE'
            sub['n_bytes'] = 8
            sub['c_float_type'] = 'double'

        if self.openmp:
            sub['omp_flags'] = '#pragma omp parallel for schedule(static)'
            sub['omp_get_max_threads'] = 'omp_get_max_threads()'
            sub['omp_get_thread_num'] = 'omp_get_thread_num()'

            if self.blas_type == 'openblas':
                sub['blas_set_num_threads'] = 'openblas_set_num_threads'
                sub['blas_get_num_threads'] = 'openblas_get_num_threads()'
            elif self.blas_type == 'mkl':
                sub['blas_set_num_threads'] = 'mkl_set_num_threads'
                sub['blas_get_num_threads'] = 'mkl_get_max_threads()'
            else:
                sub['blas_set_num_threads'] = ''
                sub['blas_get_num_threads'] = '0'
        else:
            sub['omp_flags'] = ''
            sub['omp_get_max_threads'] = '1'
            sub['omp_get_thread_num'] = '0'
            sub['blas_set_num_threads'] = ''
            sub['blas_get_num_threads'] = '0'

        files = [os.path.join('c_code', 'corr_gemm.c')]
        codes = [
            open(os.path.join(os.path.split(__file__)[0], f)).read()
            for f in files
        ]
        final_code = ''
        for code in codes:
            final_code += code
        return final_code % sub

    def c_code_helper(self,
                      bottom,
                      weights,
                      top,
                      sub,
                      height=None,
                      width=None):
        """
        This generates the C code for CorrMM (direction="forward"),
        CorrMM_gradWeights (direction="backprop weights"), and
        CorrMM_gradInputs (direction="backprop inputs").
        Depending on the direction, one of bottom, weights, top will
        receive the output, while the other two serve as inputs.

        :param bottom: Variable name of the input images in the forward pass,
            or the gradient of the input images in backprop wrt. inputs
        :param weights: Variable name of the filters in the forward pass,
            or the gradient of the filters in backprop wrt. weights
        :param top: Variable name of the output images / feature maps in the
            forward pass, or the gradient of the outputs in the backprop passes
        :param sub: Dictionary of substitutions useable to help generating the
            C code.
        :param height: If self.subsample[0] != 1, a variable giving the height
            of the filters for direction="backprop weights" or the height of
            the input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the height of the
            filters for direction="backprop weights".  Ignored otherwise.
        :param width: If self.subsample[1] != 1, a variable giving the width
            of the filters for direction="backprop weights" or the width of the
            input images for direction="backprop inputs".

            If self.border_mode == 'half', a variable giving the width of the
            filters for direction="backprop weights".  Ignored otherwise.
        """

        # When subsampling, we cannot unambiguously infer the height and width
        # of bottom and weights from top, so we require them to be given.
        # Similarly, when border_mode="half", we cannot infer the weight size.
        if height:
            height = '(*(npy_int64 *)(PyArray_DATA(%s)))' % height
        else:
            if ((self.direction != 0) and
                (self.dH != 1)) or ((self.direction == 1) and
                                    (self.padH_l == -1 or self.padH_r == -1)):
                raise ValueError(
                    "height must be given for backprop with vertical sampling or border_mode='half'"
                )
            height = '-1'
        if width:
            width = '(*(npy_int64 *)(PyArray_DATA(%s)))' % width
        else:
            if ((self.direction != 0) and
                (self.dW != 1)) or ((self.direction == 1) and
                                    (self.padW_l == -1 or self.padW_r == -1)):
                raise ValueError(
                    "width must be given for backprop with horizontal sampling or border_mode='half'"
                )
            width = '-1'

        return """
    // Mandatory args
    int direction = %(params)s->direction;  // forward, bprop weights, bprop inputs

    // Optional args
    int dH = %(params)s->dH;
    int dW = %(params)s->dW;
    int dilH = %(params)s->dilH;
    int dilW = %(params)s->dilW;
    int padH_l = %(params)s->padH_l;
    int padH_r = %(params)s->padH_r;
    int padW_l = %(params)s->padW_l;
    int padW_r = %(params)s->padW_r;
    int numgroups = %(params)s->num_groups;
    int unshared = %(params)s->unshared;

    PyArrayObject * bottom = %(bottom)s;
    PyArrayObject * weights = %(weights)s;
    PyArrayObject * top = %(top)s;
    PyArrayObject * out2 = NULL;
    PyArrayObject **out = NULL;

    switch(%(params)s->direction) {
        case DIRECTION_FORWARD:
            out = &%(top)s;
            break;
        case DIRECTION_BACKPROP_WEIGHTS:
            out = &%(weights)s;
            break;
        case DIRECTION_BACKPROP_INPUTS:
            out = &%(bottom)s;
            break;
        default:
            PyErr_SetString(PyExc_ValueError, "CPU CorrMM: Invalid direction.");
            {%(fail)s}
            break;
    }

    int wdim, odim;
    wdim = unshared ? 6 : 4;
    odim = 4; //Can be set to 6 later for unshared backprop wrt weights

    // Obtain or infer kernel width and height
    // (we need to know it early to be able to handle auto-padding)
    int kH, kW, dil_kH, dil_kW;
    if (direction != 1) {
        // weight is an input variable, we can just read its shape
        kH = PyArray_DIMS(weights)[wdim-2];
        kW = PyArray_DIMS(weights)[wdim-1];
    }
    else {
        if (%(height)s != -1) {
            // kernel height is specified (perhaps vertical subsampling or half padding)
            kH = %(height)s;
        }
        else if (padH_l == -2 || padH_r == -2) {
            // vertical full padding, we can infer the kernel height
            kH = (2 - PyArray_DIMS(bottom)[2] + (PyArray_DIMS(top)[2] - 1) * dH - 1)/ dilH + 1;
        }
        else {
            // explicit padding, we can infer the kernel height
            kH = (PyArray_DIMS(bottom)[2] + padH_l + padH_r - (PyArray_DIMS(top)[2] - 1) * dH - 1) / dilH +1;
        }
        if (%(width)s != -1) {
            // kernel width is specified (perhaps horizontal subsampling or half padding)
            kW = %(width)s;
        }
        else if (padW_l == -2 || padW_r == -2) {
            kW = (2 - PyArray_DIMS(bottom)[3] + (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
        else {
            kW = (PyArray_DIMS(bottom)[3] + padW_l + padW_r - (PyArray_DIMS(top)[3] - 1) * dW - 1) / dilW + 1;
        }
    }

    // Implicit dilated kernel size
    dil_kH = (kH - 1) * dilH + 1;
    dil_kW = (kW - 1) * dilW + 1;

    // Auto-padding if requested
    if (padH_l == -1 || padH_r == -1) {  // vertical half padding
        padH_l = padH_r = dil_kH / 2;
    }
    else if (padH_l == -2 || padH_r == -2) {  // vertical full padding
        padH_l = padH_r = dil_kH - 1;
    }
    else if (padH_l < -2 || padH_r < -2) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padH_l and padH_r must be >= -2");
        %(fail)s
    }
    if (padW_l == -1 || padW_r == -1) {  // horizontal half padding
        padW_l = padW_r = dil_kW / 2;
    }
    else if (padW_l == -2 || padW_r == -2) {  // horizontal full padding
        padW_l = padW_r = dil_kW - 1;
    }
    else if (padW_l < -2 || padW_r < -2) {
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: padW_l and padW_r must be >= -2");
        %(fail)s
    }

    // Infer output shape
    npy_intp out_dim[6];
    out_dim[4] = out_dim[5] = 0; //Only used for unshared backprop wrt weights
    switch(direction) {
    case 0:  // forward pass
        // output is top: (batchsize, num_filters, height, width)
        // height and width: top = (bottom + pad_l + pad_r - ((weight-1)*dil + 1)) / sample + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(bottom)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[0];
        out_dim[2] = (npy_intp)((PyArray_DIMS(bottom)[2] + padH_l + padH_r - ((PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1)) / dH + 1);
        out_dim[3] = (npy_intp)((PyArray_DIMS(bottom)[3] + padW_l + padW_r - ((PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1)) / dW + 1);
        if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
        {
            if (unshared) {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3]);
            }
            else {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3]);
            }
            %(fail)s
        }
        break;
    case 1:  // backprop wrt. weights
        // output is weights: (num_filters, num_channels, height, width)
        // height and width: weights = (bottom + pad_l + pad_r - (top - 1) * sample - 1) / dil + 1
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[1];
        if (unshared){
            odim = 6;
            out_dim[1] = (npy_intp)PyArray_DIMS(top)[2];
            out_dim[2] = (npy_intp)PyArray_DIMS(top)[3];
        }
        out_dim[wdim-3] = (npy_intp)PyArray_DIMS(bottom)[1] / numgroups;
        out_dim[wdim-2] = (npy_intp)kH;  // already inferred further above
        out_dim[wdim-1] = (npy_intp)kW;  // how convenient
        if (unshared) {
            if (out_dim[0] < 0 || out_dim[1] <= 0 || out_dim[2] <= 0 || out_dim[3] < 0
                    || out_dim[4] <= 0 || out_dim[5] <= 0){
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. weights: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3], (long int)out_dim[4], (long int)out_dim[5],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        else {
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. weights: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)PyArray_DIMS(bottom)[0], (long int)PyArray_DIMS(bottom)[1],
                             (long int)PyArray_DIMS(bottom)[2], (long int)PyArray_DIMS(bottom)[3],
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        break;
    case 2:  // backprop wrt. inputs
        // output is bottom: (batchsize, num_channels, height, width)
        // height and width: bottom = (top - 1) * sample + (weights-1)*dil + 1 - 2*pad
        out_dim[0] = (npy_intp)PyArray_DIMS(top)[0];
        out_dim[1] = (npy_intp)PyArray_DIMS(weights)[wdim-3] * numgroups;
        out_dim[2] = (npy_intp)((%(height)s != -1) ? %(height)s : (PyArray_DIMS(top)[2] - 1) * dH + (PyArray_DIMS(weights)[wdim-2]-1)*dilH + 1 - padH_l - padH_r);
        out_dim[3] = (npy_intp)((%(width)s != -1) ? %(width)s : (PyArray_DIMS(top)[3] - 1) * dW + (PyArray_DIMS(weights)[wdim-1]-1)*dilW + 1 - padW_l - padW_r);
        if (unshared) {
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(weights)[4], (long int)PyArray_DIMS(weights)[5],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        else {
            if (out_dim[0] < 0 || out_dim[1] < 0 || out_dim[2] <= 0 || out_dim[3] <= 0)
            {
                PyErr_Format(PyExc_ValueError,
                             "CorrMM backprop wrt. inputs: impossible output shape\\n"
                             "  bottom shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  weights shape: %%ld x %%ld x %%ld x %%ld\\n"
                             "  top shape: %%ld x %%ld x %%ld x %%ld\\n",
                             (long int)out_dim[0], (long int)out_dim[1], (long int)out_dim[2],
                             (long int)out_dim[3],
                             (long int)PyArray_DIMS(weights)[0], (long int)PyArray_DIMS(weights)[1],
                             (long int)PyArray_DIMS(weights)[2], (long int)PyArray_DIMS(weights)[3],
                             (long int)PyArray_DIMS(top)[0], (long int)PyArray_DIMS(top)[1],
                             (long int)PyArray_DIMS(top)[2], (long int)PyArray_DIMS(top)[3]);
                %(fail)s
            }
        }
        break;
    default:
        PyErr_SetString(PyExc_ValueError, "BaseCorrMM: direction must be 0, 1, or 2\\n");
        %(fail)s
    }

    // Prepare output array
    int typenum;
    int failure;
    failure = !(*out
           && PyArray_NDIM(*out)==odim
           && PyArray_IS_C_CONTIGUOUS(*out)
           && PyArray_DIMS(*out)[0]==out_dim[0]
           && PyArray_DIMS(*out)[1]==out_dim[1]
           && PyArray_DIMS(*out)[2]==out_dim[2]
           && PyArray_DIMS(*out)[3]==out_dim[3]);
    if (odim == 6){
        failure = failure || !(PyArray_DIMS(*out)[4]==out_dim[4]
                && PyArray_DIMS(*out)[5]==out_dim[5]);
    }
    if ( failure )
    {
        Py_XDECREF(*out);
        if (direction != 1) {
          typenum = PyArray_TYPE(weights);
        }
        else {
          typenum = PyArray_TYPE(bottom);
        }
        //Change to PyArray_ZEROS which is faster than PyArray_EMPTY.
        *out = (PyArrayObject*)PyArray_ZEROS(odim,
                                          out_dim,
                                          typenum,
                                          0);
        if (NULL == *out)
        {
            if (odim == 4) {
                PyErr_Format(PyExc_RuntimeError,
                        "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld",
                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3]);
            }
            if (odim == 6) {
                PyErr_Format(PyExc_RuntimeError,
                        "BaseCorrMM: Failed to allocate output of %%lld x %%lld x %%lld x %%lld %%lld %%lld",
                        (long long)out_dim[0], (long long)out_dim[1], (long long)out_dim[2], (long long)out_dim[3],
                        (long long)out_dim[4], (long long)out_dim[5]);
            }
            %(fail)s
        }
    }

    // Call corrMM code
    out2 = corrMM(%(bottom)s, %(weights)s, %(top)s, direction, dH, dW, dilH, dilW,
                padH_l, padH_r, padW_l, padW_r, numgroups, unshared);
    if (out2==NULL){
       %(fail)s
    }
    assert (out2 == *out);

""" % dict(bottom=bottom,
           weights=weights,
           top=top,
           height=height,
           width=width,
           fail=sub['fail'],
           params=sub['params'])
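
The pad property above encodes border_mode='half' and 'full' as the sentinel pairs (-1, -1) and (-2, -2), which the generated C code later replaces with dil_k/2 and dil_k - 1, and the direction property resolves the string alias to its enum value. A small sketch of those mappings, using a hypothetical subclass (BaseCorrMM itself cannot be instantiated because _direction is None):

class CorrMMForward(BaseCorrMM):          # hypothetical subclass, for illustration only
    _direction = "forward"

op = CorrMMForward(border_mode="half", subsample=(2, 2), filter_dilation=(1, 1))
assert op.pad == ((-1, -1), (-1, -1))     # 'half' stays -1 until kH/kW are known in C
assert op.direction == 0                  # enum_from_alias("forward") -> DIRECTION_FORWARD
assert (op.dH, op.dW) == (2, 2)           # subsample feeds the dH/dW params
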
Example n. 15
class GpuBinarySearchSorted(GpuKernelBase, Op):
    """
    Searchsorted on GPU

    """
    __props__ = ('context_name', 'dtype_int64')
    _f16_ok = True
    params_type = ParamsType(context=gpu_context_type, dtype_int64=bool_t)

    def __init__(self, context_name=None, dtype_int64=False):
        self.context_name = context_name
        self.dtype_int64 = dtype_int64

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=get_context(
                                               self.context_name),
                                           dtype_int64=self.dtype_int64)

    def make_node(self, d, x):
        d = as_gpuarray_variable(d, context_name=self.context_name)
        x = as_gpuarray_variable(x, context_name=self.context_name)
        assert d.ndim == 1
        assert x.ndim == 1
        broadcastable = (False, )
        otype = GpuArrayType(dtype='int64' if self.dtype_int64 else 'int32',
                             broadcastable=broadcastable,
                             context_name=self.context_name)
        return gof.Apply(self, [d, x], [otype()])

    def infer_shape(self, node, in_shapes):
        _, x_shape = in_shapes
        return [x_shape]

    def grad(self, inp, grads):
        return [grad_not_implemented(self, i, inp[i]) for i in range(2)]

    def gpu_kernels(self, node, name):
        dtype_d = node.inputs[0].dtype
        type_d = gpuarray.dtype_to_ctype(dtype_d)
        dtype_x = node.inputs[1].dtype
        type_x = gpuarray.dtype_to_ctype(dtype_x)
        dtype_y = node.outputs[0].dtype
        type_y = gpuarray.dtype_to_ctype(dtype_y)
        work_d = gpuarray.dtype_to_ctype(work_dtype(dtype_d))
        load_d = load_w(dtype_d)
        work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
        load_x = load_w(dtype_x)
        code = """
        #include "cluda.h"
        KERNEL void binsearchsorted(const ga_ssize stridesD0, GLOBAL_MEM %(type_d)s *d, ga_size d_off, const ga_ssize stridesX0, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_size lx, ga_ssize ld) {
            d = (GLOBAL_MEM %(type_d)s *)(((GLOBAL_MEM char *)d) + d_off);
            x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
            y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
            ga_size index = GID_0 * LDIM_0 + LID_0;
            if (index < lx) {
                ga_long a = 0;
                ga_long b = (ga_long)(ld - 1);
                %(work_d)s minval = %(load_d)s(d[a]);
                %(work_d)s maxval = %(load_d)s(d[b * stridesD0]);
                %(work_x)s val = %(load_x)s(x[index * stridesX0]);
                if (val > maxval) {
                    a = (ga_long)ld;
                    b = (ga_long)ld;
                } else if (val <= minval) {
                    a = 0;
                    b = 0;
                }
                while (b - a > 0) {
                    ga_long h = (b + a) / 2;
                    %(work_d)s t = %(load_d)s(d[h * stridesD0]);
                    if (val < t) {
                        b = h;
                    } else {
                        a = h + 1;
                    }
                }
                y[index * stridesY0] = b;
            }
        }""" % dict(type_d=type_d,
                    type_x=type_x,
                    type_y=type_y,
                    work_d=work_d,
                    load_d=load_d,
                    work_x=work_x,
                    load_x=load_x,
                    name=name)
        return [
            Kernel(code=code,
                   name="binsearchsorted",
                   params=[
                       gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                       gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                       gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                       gpuarray.SIZE, gpuarray.SSIZE
                   ],
                   flags=Kernel.get_flags(dtype_d, dtype_x, dtype_y),
                   objvar='k_binsearchsorted_' + name)
        ]

    def c_headers(self):
        return [
            '<numpy_compat.h>', '<gpuarray_helper.h>', '<gpuarray/types.h>'
        ]

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_code(self, node, name, inp, out, sub):  #TODO: fix error msg
        d, x = inp
        y, = out
        fail = sub['fail']
        params = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        int err;
        size_t dimd = ((size_t*)PyGpuArray_DIMS((PyGpuArrayObject*)%(d)s))[0];
        size_t dimx = ((size_t*)PyGpuArray_DIMS((PyGpuArrayObject*)%(x)s))[0];
        size_t ls = 1024;
        size_t gs = (dimx / 1024) + 1;
        size_t out_dims[1] = {dimx};

        size_t itemsize_d = 1;
        size_t itemsize_x = 1;
        size_t itemsize_y = 1;
        ssize_t stridesD0 = 1;
        ssize_t stridesX0 = 1;
        ssize_t stridesY0 = 1;

        if (%(y)s == NULL || %(y)s->ga.nd != 1 || %(y)s->ga.dimensions[0] != dimx) {
            Py_CLEAR(%(y)s);
            %(y)s = pygpu_zeros(1, out_dims, %(typecode)s, GA_C_ORDER, %(params)s->context, Py_None);
        }
        if (%(y)s == NULL) {
            %(fail)s
        }

        itemsize_d = GpuArray_ITEMSIZE(&%(d)s->ga);
        itemsize_x = GpuArray_ITEMSIZE(&%(x)s->ga);
        itemsize_y = GpuArray_ITEMSIZE(&%(y)s->ga);
        stridesD0 = PyGpuArray_STRIDES(%(d)s)[0] / itemsize_d;
        stridesX0 = PyGpuArray_STRIDES(%(x)s)[0] / itemsize_x;
        stridesY0 = PyGpuArray_STRIDES(%(y)s)[0] / itemsize_y;
        err = binsearchsorted_call(1, &gs, &ls, 0, stridesD0, %(d)s->ga.data, %(d)s->ga.offset, stridesX0, %(x)s->ga.data, %(x)s->ga.offset, stridesY0, %(y)s->ga.data, %(y)s->ga.offset, dimx, (ssize_t)dimd);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError, "gpuarray error: binsearchsorted: %%s. n=%%lu, m=%%lu.", GpuKernel_error(&%(kname)s, err), (unsigned long)dimx, (unsigned long)dimd);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        return (1, )
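
For reference, the kernel above performs an upper-bound binary search over the sorted vector d and writes, for each element of x, the index at which the search terminates. A minimal pure-NumPy sketch of the same logic (illustrative only; it assumes a contiguous 1-D d and ignores the stride arguments):

import numpy as np

def binsearchsorted_reference(d, x):
    # Mirrors the kernel: a/b bracket the search and y receives the final b.
    y = np.empty(len(x), dtype=np.int64)
    for i, val in enumerate(x):
        a, b = 0, len(d) - 1
        if val > d[-1]:
            a = b = len(d)
        elif val <= d[0]:
            a = b = 0
        while b - a > 0:
            h = (a + b) // 2
            if val < d[h]:
                b = h
            else:
                a = h + 1
        y[i] = b
    return y

# Close to np.searchsorted(d, x, side='right'), except that values equal to
# d[0] map to 0 and values equal to d[-1] map to len(d) - 1.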


class QuadraticOpFunc(Op):
    __props__ = ('a', 'b', 'c')
    params_type = ParamsType(a=tensor_type_0d, b=scalar_type, c=generic_type)

    def __init__(self, a, b, c):
        self.a = a
        self.b = b
        self.c = c

    def make_node(self, x):
        x = tensor.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage, coefficients):
        x = inputs[0]
        y = output_storage[0]
        y[0] = coefficients.a * (x**2) + coefficients.b * x + coefficients.c

    def c_code_cache_version(self):
        return (1, 5)

    def c_support_code_apply(self, node, name):
        float_type = node.inputs[0].type.dtype_specs()[1]
        return """
        /* Computes: x = a*x*x + b*x + c for x in tensor. */
        int quadratic_%(name)s(PyArrayObject* tensor, %(float_type)s a, %(float_type)s b, %(float_type)s c) {
            NpyIter* iterator = NpyIter_New(tensor,
                NPY_ITER_READWRITE | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK,
                NPY_KEEPORDER, NPY_NO_CASTING, NULL);
            if(iterator == NULL) {
                PyErr_SetString(PyExc_RuntimeError, "Unable to iterate over a tensor for an elemwise operation.");
                return -1;
            }
            NpyIter_IterNextFunc* get_next = NpyIter_GetIterNext(iterator, NULL);
            char** data_ptr = NpyIter_GetDataPtrArray(iterator);
            npy_intp* stride_ptr = NpyIter_GetInnerStrideArray(iterator);
            npy_intp* innersize_ptr = NpyIter_GetInnerLoopSizePtr(iterator);
            do {
                char* data = *data_ptr;
                npy_intp stride = *stride_ptr;
                npy_intp count = *innersize_ptr;
                while(count) {
                    %(float_type)s x = *((%(float_type)s*)data);
                    *((%(float_type)s*)data) = a*x*x + b*x + c;
                    data += stride;
                    --count;
                }
            } while(get_next(iterator));
            NpyIter_Deallocate(iterator);
            return 0;
        }
        """ % {
            'name': name,
            'float_type': float_type
        }

    def c_code(self, node, name, inputs, outputs, sub):
        return """
        %(float_type)s a = (%(float_type)s) (*(npy_float64*) PyArray_GETPTR1(%(coeff)s->a, 0)); // 0-D TensorType.
        %(float_type)s b =                                                   %(coeff)s->b;      // Scalar.
        %(float_type)s c =                 (%(float_type)s) PyFloat_AsDouble(%(coeff)s->c);     // Generic.
        Py_XDECREF(%(Y)s);
        %(Y)s = (PyArrayObject*)PyArray_EMPTY(PyArray_NDIM(%(X)s), PyArray_DIMS(%(X)s), PyArray_TYPE(%(X)s), PyArray_IS_F_CONTIGUOUS(%(X)s));
        if (PyArray_CopyInto(%(Y)s, %(X)s) != 0) {
            PyErr_SetString(PyExc_RuntimeError, "Unable to copy input into output.");
            %(fail)s
        };
        if (quadratic_%(name)s(%(Y)s, a, b, c) != 0) {
            PyErr_SetString(PyExc_RuntimeError, "Unable to compute quadratic function.");
            %(fail)s
        }
        """ % dict(name=name,
                   coeff=sub['params'],
                   fail=sub['fail'],
                   X=inputs[0],
                   Y=outputs[0],
                   float_type=node.inputs[0].type.c_element_type())
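
A hedged usage sketch for the Op above, assuming theano is importable and that tensor_type_0d, scalar_type and generic_type are the float64 0-d tensor, float64 scalar and Generic type instances the snippet expects; the coefficients travel through the ParamsType, so only x is a symbolic input:

import numpy
import theano
import theano.tensor as tensor

x = tensor.matrix('x')
f = theano.function([x], QuadraticOpFunc(2.0, 3.0, 1.0)(x))
data = numpy.array([[0.0, 1.0], [2.0, 3.0]])
# Expected result: 2*x**2 + 3*x + 1, computed elementwise.
assert numpy.allclose(f(data), 2.0 * data ** 2 + 3.0 * data + 1.0)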

    def test_params_type_filtering(self):
        shape_tensor5 = (1, 2, 2, 3, 2)
        size_tensor5 = (shape_tensor5[0] * shape_tensor5[1] * shape_tensor5[2] *
                        shape_tensor5[3] * shape_tensor5[4])
        random_tensor = numpy.random.normal(size=size_tensor5).reshape(shape_tensor5)

        w = ParamsType(a1=TensorType('int32', (False, False)),
                       a2=TensorType('float64',
                                     (False, False, False, False, False)),
                       a3=Generic())

        # With a value that does not match the params type.
        o = Params(w,
                   a1=numpy.asarray([[1, 2, 3, 4, 5, 6],
                                     [7, 8, 9, 10, 11, 12]]).astype('int64'),
                   a2=random_tensor.astype('float32'),
                   a3=2000)
        # should fail (o.a1 is not int32, o.a2 is not float64)
        self.assertRaises(TypeError, w.filter, o, True)
        # should fail (o.a1 is not int32, o.a2 is not float64, and downcast is disallowed)
        self.assertRaises(TypeError, w.filter, o, False, False)
        # Should pass.
        w.filter(o, strict=False, allow_downcast=True)

        # With a value that matches the params type.
        o1 = Params(w,
                    a1=numpy.asarray([[1, 2, 3, 4, 5, 6],
                                      [7, 8, 9, 10, 11, 12]]).astype('int32'),
                    a2=random_tensor.astype('float64'),
                    a3=2000)
        # All should pass.
        w.filter(o1, strict=True)
        w.filter(o1, strict=False, allow_downcast=False)
        w.filter(o1, strict=False, allow_downcast=True)

        # Check values_eq and values_eq_approx.
        o2 = Params(w,
                    a1=numpy.asarray([[1, 2, 3, 4, 5, 6],
                                      [7, 8, 9, 10, 11, 12]]).astype('int32'),
                    a2=random_tensor.astype('float64'),
                    a3=2000)
        assert w.values_eq(o1, o2)
        assert w.values_eq_approx(o1, o2)

        # Check values_eq_approx.
        # NB: I don't know exactly which kinds of differences values_eq rejects
        # but values_eq_approx accepts, so I just perturb the float values a little.
        o3 = Params(w,
                    a1=numpy.asarray([[1, 2, 3, 4, 5, 6],
                                      [7, 8, 9, 10, 11, 12]]).astype('int32'),
                    a2=(random_tensor.astype('float32') * 10 / 2.2 *
                        2.19999999999 / 10).astype('float64'),
                    a3=2000.0 - 0.00000000000000001)
        assert w.values_eq_approx(o1, o3)
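
On the comment in the test above: a plausible reading (not verified against the ParamsType internals) is that values_eq demands exact equality of every field while values_eq_approx tolerates small floating-point noise, much like numpy.allclose does:

import numpy

random_tensor = numpy.random.normal(size=24).reshape((1, 2, 2, 3, 2))
a2_exact = random_tensor.astype('float64')
a2_noisy = (random_tensor.astype('float32') * 10 / 2.2 * 2.19999999999 / 10).astype('float64')

# The round trip through float32 introduces tiny differences, so an exact
# comparison (what values_eq presumably needs) virtually always fails...
print(numpy.array_equal(a2_exact, a2_noisy))  # typically False
# ...while an allclose-style comparison (what values_eq_approx is relied
# upon for above) still accepts the perturbed tensor.
print(numpy.allclose(a2_exact, a2_noisy))     # True
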
Example no. 18
class GpuSparseBlockGemv(COp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be called directly since the interface is subject
    to change without notice.  Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """
    __props__ = ('inplace', )
    params_type = ParamsType(inplace=bool_t, context=gpu_context_type)

    # NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.

    def __init__(self, inplace=False):
        COp.__init__(self, "c_code/blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def c_header_dirs(self):
        return [gpuarray_helper_inc_dir()]

    def c_headers(self):
        return [
            '<gpuarray/buffer_blas.h>', '<gpuarray/buffer.h>',
            '<gpuarray_helper.h>'
        ]

    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])

    def infer_shape(self, node, input_shapes):
        return [input_shapes[0]]

    def grad(self, inputs, grads):
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        Wgrad = gpu_sparse_block_outer(W.zeros_like(), h, go, inputIdx,
                                       outputIdx)
        hgrad = gpu_sparse_block_gemv(h.zeros_like(), W.dimshuffle(
            (1, 0, 3, 2)), go, outputIdx, inputIdx)
        return [
            go, Wgrad, hgrad,
            grad_undefined(self, 3, inputIdx,
                           "grad of inputIdx makes no sense"),
            grad_undefined(self, 4, outputIdx,
                           "grad of outputIdx makes no sense")
        ]
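
A hedged shape sketch consistent with the assertions in make_node above; the semantic names (batch, window and block sizes) follow the usual SparseBlockGemv conventions and are illustrative rather than taken from this snippet:

import numpy

batch, i_win, o_win = 2, 3, 4                      # assumed batch/window sizes
i_blocks, o_blocks, i_size, o_size = 5, 6, 7, 8    # assumed block counts/sizes

o = numpy.zeros((batch, o_win, o_size), dtype='float32')                # ndim == 3
W = numpy.zeros((i_blocks, o_blocks, i_size, o_size), dtype='float32')  # ndim == 4
h = numpy.zeros((batch, i_win, i_size), dtype='float32')                # ndim == 3
inputIdx = numpy.zeros((batch, i_win), dtype='int32')                   # ndim == 2, integer dtype
outputIdx = numpy.zeros((batch, o_win), dtype='int32')                  # ndim == 2, integer dtype

# make_node returns an output of o's type and infer_shape returns
# input_shapes[0], so the result has o's shape: (batch, o_win, o_size).
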
Example no. 19
class GpuMagmaSVD(GpuMagmaBase):
    """Computes the svd of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """

    __props__ = ("full_matrices", "compute_uv")
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        ExternalCOp.__init__(self, ["c_code/magma_svd.c"], "APPLY_SPECIFIC(magma_svd)")

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return theano.Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(
                        A.dtype, broadcastable=[False], context_name=ctx_name
                    )(),
                    A.type(),
                    A.type(),
                ],
            )
        else:
            return theano.Apply(
                self,
                [A],
                # return only S
                [GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)()],
            )

    def prepare_node(self, node, storage_map, compute_map, impl):
        super().prepare_node(node, storage_map, compute_map, impl)
        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, (
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) "
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
            )

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, fgraph, node, shapes):
        (x_shape,) = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K,)
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
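
A hedged sketch of the shape bookkeeping in infer_shape above; the Op itself yields (S, U, VT), and callers wanting the NumPy-like (U, S, VT) order would reorder, or use theano.gpuarray.linalg.gpu_svd as the docstring advises:

def magma_svd_output_shapes(M, N, full_matrices=True, compute_uv=True):
    # Mirrors infer_shape above.
    K = min(M, N)
    s_shape = (K,)
    if not compute_uv:
        return [s_shape]
    u_shape = (M, M) if full_matrices else (M, K)
    vt_shape = (N, N) if full_matrices else (K, N)
    return [s_shape, u_shape, vt_shape]

assert magma_svd_output_shapes(4, 6) == [(4,), (4, 4), (6, 6)]
assert magma_svd_output_shapes(4, 6, full_matrices=False) == [(4,), (4, 4), (4, 6)]
assert magma_svd_output_shapes(4, 6, compute_uv=False) == [(4,)]
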
Example no. 20
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
    """
    Images2Neibs for the GPU.

    """
    params_type = ParamsType(mode=Images2Neibs.BORDER_MODE, context=gpu_context_type)

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def make_node(self, ten4, neib_shape, neib_step=None):
        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = T.as_tensor_variable(neib_shape)
        if neib_step is None:
            neib_step = neib_shape
        else:
            neib_step = T.as_tensor_variable(neib_step)

        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in T.integer_dtypes
        assert neib_step.dtype in T.integer_dtypes

        return Apply(self, [ten4, neib_shape, neib_step],
                     [GpuArrayType(broadcastable=(False, False),
                                   dtype=ten4.type.dtype,
                                   context_name=ten4.type.context_name)()])

    def c_code_cache_version(self):
        return (13,)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

    def gpu_kernels(self, node, nodename):
        dtype_ten4 = node.inputs[0].dtype
        dtype_z = node.outputs[0].dtype
        flags = Kernel.get_flags(dtype_ten4, dtype_z)
        type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
        type_z = gpuarray.dtype_to_ctype(dtype_z)
        # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here.
        mode_constants = self.BORDER_MODE.c_support_code()
        kernels = []
        kname = "k_multi_warp_less"
        k_var = "k_multi_warp_less_" + nodename
        code = """
        // A version that uses fewer registers but does not work in all cases.
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            ga_int i = LID_1;     // loop over c
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                ga_int j = LID_0;  // loop over d
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED){
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }""" % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
        params = [
            'intc',
            'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
            'intc', 'intc', 'intc', 'intc',
            'uintp', 'uintp', 'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))

        kname = "k_multi_warp"
        k_var = "k_multi_warp_" + nodename
        code = """
        %(mode_constants)s
        KERNEL void %(kname)s(
            const ga_int mode,
            const ga_int nb_batch,
            const ga_int nb_stack,
            const ga_int height,
            const ga_int width,
            const ga_int c,
            const ga_int d,
            const ga_int step_x,
            const ga_int step_y,
            const ga_int grid_c,
            const ga_int grid_d,
            const ga_size stride0, const ga_size stride1,
            const ga_size stride2, const ga_size stride3,
            GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
            const ga_size out_s0, const ga_size out_s1,
            GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
        )
        {
            const ga_int wrap_centered_half_idx_shift_x = c/2;
            const ga_int wrap_centered_half_idx_shift_y = d/2;
            global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
            global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

            for(ga_int tblock = GID_0*LDIM_2+LID_2;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=GDIM_0*LDIM_2){
                const ga_int b = tblock%%grid_d;
                ga_int left = tblock/grid_d;
                const ga_int a = left%%grid_c;
                left = left/grid_c;
                const ga_int s = left%%nb_stack;
                left = left/nb_stack;
                const ga_int n = left;

                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            ga_int z_row = b + grid_d*(a + grid_c*
                                                    (s + nb_stack*n));
                            // loop over c
                            for (ga_int i = LID_1; i < c; i+=LDIM_1)
                            {
                                ga_int ten4_2 = i + a * step_x;
                                if(mode == MODE_WRAP_CENTERED) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                    if ( ten4_2 < 0 )
                                        ten4_2 += height;
                                    else if (ten4_2 >= height)
                                        ten4_2 -= height;
                                } else if (mode == MODE_HALF) {
                                    ten4_2 -= wrap_centered_half_idx_shift_x;
                                } else if (mode == MODE_FULL) {
                                    ten4_2 -= c - 1;
                                }
                                // loop over d
                                for (ga_int j = LID_0; j < d; j+=LDIM_0)
                                {
                                    ga_int ten4_3 = j + b * step_y;
                                    if(mode == MODE_WRAP_CENTERED) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                        if ( ten4_3 < 0 )
                                            ten4_3 += width;
                                        else if (ten4_3 >= width)
                                            ten4_3 -= width;
                                    } else if (mode == MODE_HALF) {
                                        ten4_3 -= wrap_centered_half_idx_shift_y;
                                    } else if (mode == MODE_FULL) {
                                        ten4_3 -= d - 1;
                                    }

                                    ga_int z_col = j + d * i;
                                    ga_int z_idx = z_col * out_s1 +
                                                z_row * out_s0;
                                    if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){
                                        global_out[z_idx] = 0;
                                    } else {
                                        ga_int ten4_idx = stride3*ten4_3 +
                                                       stride2*ten4_2 +
                                                       stride1*s + stride0*n;
                                        global_out[z_idx] = global_ten4[ten4_idx];
                                    }
                                }
                            }
            }
        }
        """ % dict(kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants)
        params = [
            'intc',
            'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
            'intc', 'intc', 'intc', 'intc',
            'uintp', 'uintp', 'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            'uintp', 'uintp',
            gpuarray.GpuArray, 'uintp',
            ]
        kernels.append(Kernel(code=code, name=kname, params=params,
                              flags=flags, objvar=k_var))
        return kernels

    def c_support_code(self):
        return """
        template <typename T>
        static T ceil_intdiv(T a, T b)
        {
            return (a/b) + ((a % b) ? 1: 0);
        }
        """

    def c_code(self, node, name, inp, out, sub):
        err_check = """
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "gpuarray error: *fptr: %%s.",
                             GpuKernel_error(fptr, err));
                %(fail)s;
            }
        """ % dict(fail=sub['fail'])
        sync = ""
        if config.gpuarray.sync:
            sync = """
            err = GpuArray_sync(&%(z)s->ga);
            %(err_check)s
            """ % dict(z=out[0], err_check=err_check)
        # NB: To reduce C code variability:
        # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize.
        # For itemsize_z, I reuse itemsize_ten4, as ten4 and z have the same type properties (deduced from make_node).
        # For typecode_z, I use ten4->ga.typecode (for the same reason as above).
        return """
        int grid_c = -1;
        int grid_d = -1;
        size_t itemsize_ten4 = GpuArray_ITEMSIZE(&%(ten4)s->ga);
        size_t itemsize_z = itemsize_ten4;
        int typecode_z = %(ten4)s->ga.typecode;

        {
            if (PyGpuArray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: pvals wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs: unis wrong rank");
                %(fail)s;
            }

            if (PyArray_DIMS(%(neib_shape)s)[0] != 2)
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuImages2Neibs: neib_shape has to contain two"
                             " elements");
                %(fail)s;
            }

            const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            if (step_x <=0 || step_y <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_step wrong step ; values <= 0. Got %%lld %%lld.",
                             (long long) step_x, (long long) step_y);
                %(fail)s;
            }

            if (c <=0 || d <=0)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape values <= 0. Got %%lld %%lld.",
                             (long long)c, (long long)d);
                %(fail)s;
            }

            if (%(params)s->mode == MODE_WRAP_CENTERED) {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in mode wrap_centered need patch with odd shapes");
                    %(fail)s;
                }
                if ( PyGpuArray_DIMS(%(ten4)s)[2] < c ||
                     PyGpuArray_DIMS(%(ten4)s)[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                                 "GpuImages2Neibs: in wrap_centered mode,"
                                 " don't support image shapes smaller then"
                                 " the patch shapes: neib_shape=(%%d,%%d),"
                                 " ten4[2:]=[%%d,%%d]",
                                 c, d, PyGpuArray_DIMS(%(ten4)s)[2],
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                grid_c = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[2]),
                                     (size_t)step_x);
                grid_d = ceil_intdiv(((PyGpuArray_DIMS(%(ten4)s))[3]),
                                     (size_t)step_y);


            } else if (%(params)s->mode == MODE_VALID) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_IGNORE_BORDERS) {
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-d)/step_y);
            } else if (%(params)s->mode == MODE_HALF) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2)) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 PyGpuArray_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ((((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2)) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError, "GpuImages2Neibs:"
                                 " neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 PyGpuArray_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]-(c%%2))/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]-(d%%2))/step_y);
            } else if (%(params)s->mode == MODE_FULL) {
                if ( ((PyGpuArray_DIMS(%(ten4)s))[2] < c) ||
                 ( (((PyGpuArray_DIMS(%(ten4)s))[2]+c-2) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%ld, neib_step[0]=%%ld and"
                                 " ten4.shape[2]=%%ld not consistent",
                                 (long int)c, (long int)step_x,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[2]));
                    %(fail)s;
                }
                if ( ((PyGpuArray_DIMS(%(ten4)s))[3] < d) ||
                     ( (((PyGpuArray_DIMS(%(ten4)s))[3]+d-2) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%ld, neib_step[1]=%%ld and"
                                 " ten4.shape[3]=%%ld not consistent",
                                 (long int)d, (long int)step_y,
                                 (long int)(PyGpuArray_DIMS(%(ten4)s)[3]));
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((PyGpuArray_DIMS(%(ten4)s))[2]+c-2)/step_x);
                //number of patch in width
                grid_d = 1+(((PyGpuArray_DIMS(%(ten4)s))[3]+d-2)/step_y);
            } else {
                PyErr_Format(PyExc_TypeError,
                             "GpuImages2Neibs:: unknown mode %%d", %(params)s->mode);
                 %(fail)s;
            }

            // new dimensions for z
            const int z_dim1 = c * d;
            const int z_dim0 =  grid_c
                                * grid_d
                                * PyGpuArray_DIMS(%(ten4)s)[1]
                                * PyGpuArray_DIMS(%(ten4)s)[0];

            if ((NULL == %(z)s)
                || (PyGpuArray_DIMS(%(z)s)[0] != z_dim0)
                || (PyGpuArray_DIMS(%(z)s)[1] != z_dim1))
            {
                Py_XDECREF(%(z)s);
                size_t dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;
                %(z)s = pygpu_empty(2, dims, typecode_z,
                                    GA_C_ORDER, %(params)s->context, Py_None);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
                                    " failed to alloc z output");
                    %(fail)s;
                }
            }

        }

        { // NESTED SCOPE

            const int mode = %(params)s->mode;
            const int nb_batch = PyGpuArray_DIMS(%(ten4)s)[0];
            const int nb_stack = PyGpuArray_DIMS(%(ten4)s)[1];
            const int height = PyGpuArray_DIMS(%(ten4)s)[2];
            const int width = PyGpuArray_DIMS(%(ten4)s)[3];

            const int c = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(npy_%(dtype_neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(npy_%(dtype_neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            size_t threads_per_block[3] = {d, c, 1};
            //get the max threads per blocks
            size_t max_threads_dim;
            int err = gpucontext_property(%(params)s->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
            if (err != GA_NO_ERROR){
                PyErr_SetString(PyExc_RuntimeError, "Could not fetch max_threads_dims");
                %(fail)s;
            }
            while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[1]>1)threads_per_block[1]--;
            while(threads_per_block[0]*threads_per_block[1]>max_threads_dim && threads_per_block[0]>1)threads_per_block[0]--;

            //Make the blocks bigger to get a better memory access pattern and
            //higher core utilisation for smaller patch sizes.

            while(c*d*(threads_per_block[2]+1) < 128 && threads_per_block[2]<64 &&
                  threads_per_block[2]<PyGpuArray_DIMS(%(z)s)[0]){
                threads_per_block[2]++;
            }
            int nb_block;
            if (PyGpuArray_DIMS(%(z)s)[0] %% threads_per_block[2] == 0)
                nb_block = PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2];
            else
                nb_block = (PyGpuArray_DIMS(%(z)s)[0] / threads_per_block[2]) + 1;
            size_t n_blocks[3] = {std::min(32*1024,nb_block), 1, 1};

            GpuKernel *fptr;
            if(threads_per_block[0]==d && threads_per_block[1]==c){
                fptr = &k_multi_warp_less_%(name)s;
            }else{
                fptr = &k_multi_warp_%(name)s;
            }
            /*
            printf("%%zu %%zu %%zu %%zu %%zu %%zu %%zu\\n",
                   max_threads_dim, threads_per_block[0], threads_per_block[1], threads_per_block[2],
                   n_blocks[0], n_blocks[1], n_blocks[2]);
            */
            size_t stride_A0 = PyGpuArray_STRIDES(%(ten4)s)[0] / itemsize_ten4;
            size_t stride_A1 = PyGpuArray_STRIDES(%(ten4)s)[1] / itemsize_ten4;
            size_t stride_A2 = PyGpuArray_STRIDES(%(ten4)s)[2] / itemsize_ten4;
            size_t stride_A3 = PyGpuArray_STRIDES(%(ten4)s)[3] / itemsize_ten4;
            size_t stride_Z0 = PyGpuArray_STRIDES(%(z)s)[0] / itemsize_z;
            size_t stride_Z1 = PyGpuArray_STRIDES(%(z)s)[1] / itemsize_z;
            void *kernel_params[] = {(void *)&mode,
                                     (void *)&nb_batch,
                                     (void *)&nb_stack,
                                     (void *)&height, (void *)&width,
                                     (void *)&c, (void *)&d,
                                     (void *)&step_x, (void *)&step_y,
                                     (void *)&grid_c, (void *)&grid_d,
                                     (void *)&stride_A0,
                                     (void *)&stride_A1,
                                     (void *)&stride_A2,
                                     (void *)&stride_A3,
                                     (void *)%(ten4)s->ga.data,
                                     (void *)&%(ten4)s->ga.offset,
                                     (void *)&stride_Z0,
                                     (void *)&stride_Z1,
                                     (void *)%(z)s->ga.data,
                                     (void *)&%(z)s->ga.offset};
            err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
            %(err_check)s
            %(sync)s
        } // END NESTED SCOPE
        """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
                   dtype_neib_shape=node.inputs[1].dtype,
                   dtype_neib_step=node.inputs[2].dtype,
                   err_check=err_check,
                   sync=sync,
                   name=name,
                   params=sub['params'],
                   fail=sub['fail'])

    def perform(self, node, inp, out, params):
        # Disable the perform method from the CPU version
        Op.perform(self, node, inp, out, params)
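
A hedged Python sketch of the output-shape arithmetic the C code above performs: each border mode yields a patch grid (grid_c, grid_d), and the output z is a 2-D array of shape (z_dim0, z_dim1); validation of the inputs is omitted here:

def images2neibs_out_shape(ten4_shape, c, d, step_x, step_y, mode='valid'):
    # Mirrors the grid_c/grid_d and z_dim0/z_dim1 computations in c_code above.
    nb_batch, nb_stack, height, width = ten4_shape
    if mode == 'wrap_centered':
        grid_c = -(-height // step_x)   # ceil_intdiv from c_support_code
        grid_d = -(-width // step_y)
    elif mode in ('valid', 'ignore_borders'):
        grid_c = 1 + (height - c) // step_x
        grid_d = 1 + (width - d) // step_y
    elif mode == 'half':
        grid_c = 1 + (height - (c % 2)) // step_x
        grid_d = 1 + (width - (d % 2)) // step_y
    elif mode == 'full':
        grid_c = 1 + (height + c - 2) // step_x
        grid_d = 1 + (width + d - 2) // step_y
    else:
        raise ValueError(mode)
    z_dim0 = grid_c * grid_d * nb_stack * nb_batch
    z_dim1 = c * d
    return (z_dim0, z_dim1)

# e.g. a (2, 3, 8, 8) input with 4x4 patches and stride 4 in 'valid' mode:
assert images2neibs_out_shape((2, 3, 8, 8), 4, 4, 4, 4) == (2 * 3 * 2 * 2, 16)
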
Example no. 21
class GpuMagmaSVD(COp):
    """Computes the svd of a matrix :math:`A` using magma library.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
        to get them in expected order ``U, S, VT``.

    """
    __props__ = ('full_matrices', 'compute_uv')
    _cop_num_inputs = 1
    _cop_num_outputs = 3
    check_input = False
    params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def c_headers(self):
        return [
            'gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
            'gpuarray_helper.h', 'magma.h'
        ]

    def c_header_dirs(self):
        dirs = [os.path.dirname(__file__), pygpu.get_include()]
        if config.magma.include_path:
            dirs.append(config.magma.include_path)
        return dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        if config.magma.library_path:
            return [config.magma.library_path]
        return []

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        assert A.dtype == 'float32'
        if self.compute_uv:
            return theano.Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)(),
                    A.type(),
                    A.type()
                ])
        else:
            return theano.Apply(
                self,
                [A],
                # return only S
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)()
                ])

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Check node to prevent eventual errors with old pickled nodes.
        if self.compute_uv:
            A, B, C = node.outputs
            # We expect order: S (vector), U (matrix), VT (matrix)
            assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
                "Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
                "instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        x_shape, = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [s_shape, u_shape, vt_shape]
        else:
            return [s_shape]
Example no. 22
class GpuAveragePoolGrad(CGpuKernelBase):
    """
    Implement the grad of average pooling on the gpu.

    """
    __props__ = ('ignore_border', 'mode', 'ndim')
    params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)

    def __init__(self, ignore_border, mode='max', ndim=2):
        self.ndim = ndim
        self.ignore_border = ignore_border
        if mode == 'average':
            mode = 'average_inc_pad'
        self.mode = mode
        CGpuKernelBase.__init__(self, ['pool_ave_grad.c'],
                                'APPLY_SPECIFIC(ave_pool_grad)')
        assert mode in ('sum', 'average_inc_pad', 'average_exc_pad')
        assert ndim in [2, 3]

    def get_params(self, node):
        return self.params_type.get_params(self,
                                           context=node.inputs[0].type.context)

    def c_headers(self):
        return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert (inp.ndim == nd + 2)
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert (out_grad.ndim == nd + 2)

        assert (out_grad.ndim == inp.ndim)

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.mode == 'average_exc_pad':
                raise ValueError('Padding must be zero for average_exc_pad')
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in theano.tensor.int_dtypes:
            raise TypeError('Window shape parameters must be ints.')
        if stride.dtype not in theano.tensor.int_dtypes:
            raise TypeError('Stride parameters must be ints.')
        if pad.dtype not in theano.tensor.int_dtypes:
            raise TypeError('Padding parameters must be ints.')

        ws = theano.tensor.cast(ws, 'int64')
        stride = theano.tensor.cast(stride, 'int64')
        pad = theano.tensor.cast(pad, 'int64')

        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])

    def infer_shape(self, node, in_shapes):
        return [in_shapes[0]]

    def grad(self, inp, grads):
        x, gz, ws, stride, pad = inp
        ggx, = grads
        return ([
            theano.tensor.zeros_like(x),
            GpuPool(ignore_border=self.ignore_border,
                    ndim=self.ndim,
                    mode=self.mode)(ggx, ws, stride, pad)
        ] + [theano.gradient.DisconnectedType()() for i in inp[2:]])

    def connection_pattern(self, node):
        return [[1], [1], [0], [0], [0]]
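
A hedged sketch of the parameter normalization done in make_node above: stride defaults to the window shape, padding defaults to zeros, and all three become length-ndim int64 vectors (the symbolic casts are replaced by NumPy here):

import numpy

def normalize_pool_grad_params(ws, stride=None, pad=None, ndim=2):
    # Mirrors the defaulting logic of make_node above; validation omitted.
    if stride is None:
        stride = ws
    if pad is None:
        pad = (0,) * ndim
    ws = numpy.asarray(ws, dtype='int64')
    stride = numpy.asarray(stride, dtype='int64')
    pad = numpy.asarray(pad, dtype='int64')
    assert ws.ndim == stride.ndim == pad.ndim == 1
    return ws, stride, pad

ws, stride, pad = normalize_pool_grad_params((2, 2))
assert stride.tolist() == [2, 2] and pad.tolist() == [0, 0]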