Exemple #1
0
    def c_code(self, node, name, inp, out, sub):
        ndim = len(inp)
        zz = out[0]
        fail = sub['fail']

        code = ["""
int i;
size_t shape[%(ndim)s];
""" % dict(ndim=ndim)]

        for i, shp_i in enumerate(inp):
            code.append("""
shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
""" % dict(i=i, shp_i=shp_i))

        code.append("""
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
                       pygpu_default_context())) {
  %(fail)s
}
""" % dict(zz=zz,
           ndim=ndim,
           type=gpuarray.dtype_to_typecode(self.dtype),
           fail=fail))

        return ''.join(code)
Exemple #2
0
    def c_code(self, node, name, inp, out, sub):
        ndim = len(inp)
        zz = out[0]
        fail = sub["fail"]

        code = [
            """
int i;
size_t shape[%(ndim)s];
"""
            % dict(ndim=ndim)
        ]

        for i, shp_i in enumerate(inp):
            code.append(
                """
shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
"""
                % dict(i=i, shp_i=shp_i)
            )

        code.append(
            """
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
                       %(ctx)s)) {
  %(fail)s
}
"""
            % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype), fail=fail, ctx=sub["params"])
        )

        return "".join(code)
Exemple #3
0
 def __init__(self, dtype, broadcastable, name=None):
     # In case this was not provided and no global value is available
     self.dtype = str(dtype)
     self.broadcastable = tuple(bool(b) for b in broadcastable)
     self.ndim = len(self.broadcastable)
     self.name = name
     try:
         self.typecode = gpuarray.dtype_to_typecode(self.dtype)
     except gpuarray.GpuArrayException:
         raise TypeError("Unsupported dtype for %s: %s" %
                         (self.__class__.__name__, self.dtype))
Exemple #4
0
 def __init__(self, dtype, broadcastable, name=None):
     # In case this was not provided and no global value is available
     self.dtype = str(dtype)
     self.broadcastable = tuple(bool(b) for b in broadcastable)
     self.ndim = len(self.broadcastable)
     self.name = name
     try:
         self.typecode = gpuarray.dtype_to_typecode(self.dtype)
     except gpuarray.GpuArrayException:
         raise TypeError("Unsupported dtype for %s: %s" %
                         (self.__class__.__name__, self.dtype))
Exemple #5
0
 def __init__(self, dtype, broadcastable, context_name=None, name=None):
     # In case this was not provided and no global value is available
     self.dtype = str(dtype)
     self.broadcastable = tuple(bool(b) for b in broadcastable)
     self.ndim = len(self.broadcastable)
     self.name = name
     self.context_name = context_name
     # This will check that the passed context name is valid and registered.
     get_context(self.context_name)
     try:
         self.typecode = gpuarray.dtype_to_typecode(self.dtype)
     except gpuarray.GpuArrayException:
         raise TypeError("Unsupported dtype for %s: %s" % (self.__class__.__name__, self.dtype))
Exemple #6
0
 def __init__(self, dtype, broadcastable, context_name=None, name=None):
     # In case this was not provided and no global value is available
     self.dtype = str(dtype)
     self.broadcastable = tuple(bool(b) for b in broadcastable)
     self.ndim = len(self.broadcastable)
     self.name = name
     self.context_name = context_name
     # This will check that the passed context name is valid and registered.
     get_context(self.context_name)
     try:
         self.typecode = gpuarray.dtype_to_typecode(self.dtype)
     except gpuarray.GpuArrayException:
         raise TypeError("Unsupported dtype for %s: %s" %
                         (self.__class__.__name__, self.dtype))
Exemple #7
0
 def m(t):
     if t == gpuarray.GpuArray:
         return "GA_BUFFER"
     else:
         return str(gpuarray.dtype_to_typecode(t))
 def get_op_params(self):
     return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]
Exemple #9
0
    def c_code(self, node, name, inputs, outputs, sub):
        nd = node.outputs[0].ndim
        fail = sub["fail"]
        initial_dims = ','.join('1' for i in xrange(nd))
        opname = str(self.scalar_op)

        #check that all inputs have valid dimensions
        emitted_inames = {}
        code = """
        int n_blocks = 0;
        int threads_per_block = 0;
        size_t numEls = 0;
        """
        if nd > 0:
            code += """
            size_t dims[%(nd)s] = {%(initial_dims)s};
            """ % locals()
        else:
            code += """
            size_t *dims = NULL;
            """
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is node.inputs[idx]
                continue

            broadcasts = map(int, node.inputs[idx].broadcastable)
            broadcasts = ', '.join(map(str, broadcasts))
            nd = node.inputs[idx].ndim
            if nd > 0:
                code += """
                int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
                """ % locals()
            else:
                code += """
                int *broadcasts_%(iname)s = NULL;
                """ % locals()
            emitted_inames[iname] = node.inputs[idx]

        #check that all inputs have valid dimensions
        emitted_inames = {}
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            code += """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True
        #check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            typecode = dtype_to_typecode(node.outputs[idx].dtype)
            if idx not in self.inplace_pattern.keys():
                code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                            %(typecode)s, GA_C_ORDER,
                            pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                        //TODO, this check don't seam good.
                        //TODO, set exception?
                            %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        z = outputs[0]
        code += """numEls = PyGpuArray_SIZE(%(z)s);

        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE

        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                           (size_t)30);

        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

                //std::cerr << "calling callkernel returned\\n";
        """ % locals()

        code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
        param = []
        for i in range(nd):
            param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
                                                              i=i))
        for n, (name, var) in enumerate(zip(inputs + outputs,
                                       node.inputs + node.outputs)):
            if (n - len(inputs)) in self.inplace_pattern:
                continue
            dtype = dtype_to_ctype(var.dtype)
            param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
            param.append("%(name)s->ga.offset" % locals())
            for i in range(nd):
                param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
        code += ',\n'.join(param) + ");\n"
        if config.gpuarray.sync:
            code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
        return str(code)
Exemple #10
0
 def m(t):
     if t == gpuarray.GpuArray:
         return "GA_BUFFER"
     else:
         return str(gpuarray.dtype_to_typecode(t))
Exemple #11
0
    def c_code(self, node, nodename, inps, outs, sub):
        context = node.inputs[0].type.context
        if context.kind != b'cuda':
            raise NotImplementedError('%s: We only have CUDA '
                                      'implementation so far.' %
                                      self.__class__.__name__)
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        if not self.return_indices:
            yv, = outs
        elif self.return_values:
            yv, yi = outs
        else:
            yi, = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub['fail']
        ctx = sub['params']
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32

        ndim = node.inputs[0].ndim
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del (reordered_axes[axis])
        reordered_axes = [axis] + reordered_axes
        dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
        prep_output = ''
        if self.return_values:
            def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
            params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (
                yv, yv)
            params_dv += ''.join('(void*)(dvstrides+%d), ' % i
                                 for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_dvstrides = params_dv = ''

        if self.return_indices:
            def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
            params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % yi
            params_di += ''.join('(void*)(distrides+%d), ' % i
                                 for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_distrides = params_di = ''
        sstrides = ', '.join('(void*)(sstrides+%d)' % i
                             for i in reordered_axes)
        code = '''
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];

    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
    void* args[] = {
        %(dims)s
        %(params_dv)s
        %(params_di)s
        (void*)(&k_),
        (void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
        %(sstrides)s,
        (void*)(dims+%(axis)d),
    };

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_xlarge%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_large%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else {
        err = GpuKernel_call(
            &k_topk_dense%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
        '''
        return code % locals()
Exemple #12
0
    def c_code(self, node, nodename, inps, outs, sub):
        context = node.inputs[0].type.context
        if context.kind != b'cuda':
            raise NotImplementedError(
                '%s: We only have CUDA '
                'implementation so far.' % self.__class__.__name__)
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        if not self.return_indices:
            yv, = outs
        elif self.return_values:
            yv, yi = outs
        else:
            yi, = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub['fail']
        ctx = sub['params']
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32

        ndim = node.inputs[0].ndim
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del(reordered_axes[axis])
        reordered_axes = [axis] + reordered_axes
        dims = ''.join('(void*)(dims+%d), ' % i for i in reordered_axes[1:])
        prep_output = ''
        if self.return_values:
            def_dvstrides = 'const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)' % yv
            params_dv = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % (yv, yv)
            params_dv += ''.join('(void*)(dvstrides+%d), ' % i for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_dvstrides = params_dv = ''

        if self.return_indices:
            def_distrides = 'const ssize_t *distrides = PyGpuArray_STRIDES(%s)' % yi
            params_di = '(void*)((char*)(%s->ga.data) + (%s->ga.offset)),\n' % yi
            params_di += ''.join('(void*)(distrides+%d), ' % i for i in reordered_axes)
            prep_output += '''
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n''' % locals()
        else:
            def_distrides = params_di = ''
        sstrides = ', '.join('(void*)(sstrides+%d)' % i for i in reordered_axes)
        code = '''
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];

    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);
    void* args[] = {
        %(dims)s
        %(params_dv)s
        %(params_di)s
        (void*)(&k_),
        (void*)((char*)(%(x)s->ga.data) + (%(x)s->ga.offset)),
        %(sstrides)s,
        (void*)(dims+%(axis)d),
    };

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_xlarge%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = GpuKernel_call(
            &k_topk_dense_large%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    } else {
        err = GpuKernel_call(
            &k_topk_dense%(nodename)s, 1,
            &grid_size, &block_size, 0, args);
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
        '''
        return code % locals()
Exemple #13
0
    def c_code(self, node, nodename, inps, outs, sub):
        context = node.inputs[0].type.context
        if context.kind != b"cuda":
            raise NotImplementedError("%s: We only have CUDA "
                                      "implementation so far." %
                                      self.__class__.__name__)
        x, k = inps
        inp_dtc = ga.dtype_to_typecode(node.inputs[0].dtype)
        if not self.return_indices:
            (yv, ) = outs
        elif self.return_values:
            yv, yi = outs
        else:
            (yi, ) = outs
        out_dtype_s = self.idx_dtype
        out_dtc = ga.dtype_to_typecode(out_dtype_s)
        fail = sub["fail"]
        ctx = sub["params"]
        k_dtype = node.inputs[1].type.dtype_specs()[1]
        # max threads per block
        MAX_TPB = context.maxlsize0
        # max blocks per grid
        MAX_BPG = context.maxgsize0
        WARP_SIZE = 32

        ndim = node.inputs[0].ndim
        reordered_axes = list(range(ndim))
        axis = self.axis % ndim
        del reordered_axes[axis]
        reordered_axes = [axis] + reordered_axes
        dims = "".join("dims[%d], " % i for i in reordered_axes[1:])
        prep_output = ""
        if self.return_values:
            def_dvstrides = "const ssize_t *dvstrides = PyGpuArray_STRIDES(%s)" % yv
            params_dv = "%s->ga.data, %s->ga.offset,\n" % (yv, yv)
            params_dv += "".join("dvstrides[%d], " % i for i in reordered_axes)
            prep_output += ("""
    if (0 != theano_prep_output(
        &%(yv)s, %(ndim)d, odims,
        %(inp_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n""" % locals())
        else:
            def_dvstrides = params_dv = ""

        if self.return_indices:
            def_distrides = "const ssize_t *distrides = PyGpuArray_STRIDES(%s)" % yi
            params_di = "%s->ga.data, %s->ga.offset,\n" % (yi, yi)
            params_di += "".join("distrides[%d], " % i for i in reordered_axes)
            prep_output += ("""
    if (0 != theano_prep_output(
        &%(yi)s, %(ndim)d, odims,
        %(out_dtc)s, GA_C_ORDER, %(ctx)s)) {
        %(fail)s;
    }\n""" % locals())
        else:
            def_distrides = params_di = ""
        sstrides = ", ".join("sstrides[%d]" % i for i in reordered_axes)
        code = """
{
    const ssize_t k_ = ((%(k_dtype)s*)(PyArray_DATA(%(k)s)))[0];
    const size_t *dims = PyGpuArray_DIMS(%(x)s);
    size_t odims[%(ndim)d];
    for (int i=0; i<%(ndim)d; i++)
        odims[i] = dims[i];

    odims[%(axis)d] = k_>=0 ? k_ : -k_;

    if (0 == odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth must not be zero");
        %(fail)s;
    } else if (dims[%(axis)d] < odims[%(axis)d]) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: kth cannot be larger than the size of specified axis %(axis)d");
        %(fail)s;
    }
    %(prep_output)s

    size_t grid_size=1, block_size=1;
    for (int i=0; i<%(ndim)d; ++i) {
        if (i!=%(axis)d)
            grid_size *= dims[i];
        else
            block_size = dims[i];
    }
    // round up to multiples of warp size
    block_size = ((block_size + %(WARP_SIZE)d - 1) / %(WARP_SIZE)d) * %(WARP_SIZE)d;

    if (grid_size > %(MAX_BPG)d) {
        PyErr_SetString(
            PyExc_ValueError,
            "topk: too many slices to work with, expected <= %(MAX_BPG)d");
        %(fail)s;
    }

    %(def_dvstrides)s;
    %(def_distrides)s;
    const ssize_t *sstrides = PyGpuArray_STRIDES(%(x)s);

    int err;
    if (dims[%(axis)d] > (1u << 31)) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_xlarge_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    } else if (block_size > %(MAX_TPB)d) {
        block_size = %(MAX_TPB)d;
        err = k_topk_dense_large_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    } else {
        err = k_topk_dense_call(
                1, &grid_size, &block_size, 0,
                %(dims)s
                %(params_dv)s
                %(params_di)s
                k_,
                %(x)s->ga.data,
                %(x)s->ga.offset,
                %(sstrides)s,
                dims[%(axis)d]
        );
    }
    if (err != GA_NO_ERROR) {
        PyErr_SetString(
            PyExc_RuntimeError,
            "topk: gpu kernel failed to execute");
        %(fail)s;
    }
}
        """
        return code % locals()
Exemple #14
0
    def c_code(self, node, name, inp, out, sub):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
            # We special case the no-reduction case since the gpu
            # kernel has trouble handling it.
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            %(fail)s
        }

        if (%(sync)d)
            GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
           sync=bool(config.gpuarray.sync))
        k = self.get_kernel_cache(node)
        _, src, _, ls = k._get_basic_kernel(k.init_local_size,
                                           node.inputs[0].ndim)
        if self.axis is None:
            redux = [True] * node.inputs[0].ndim
        else:
            redux = self.redux
        acc_dtype = getattr(self, 'acc_dtype', None)
        if acc_dtype is None:
            acc_dtype = node.outputs[0].type.dtype
        input = inp[0]
        output = out[0]
        nd_out = node.outputs[0].ndim
        code = """
        size_t gs = 1;
        unsigned int n = 1;
        unsigned int proxy_dim[%(nd_in)s];
        unsigned int proxy_off;
        int proxy_str[%(nd_in)s];
        void *args[%(n_args)s];
        PyGpuArrayObject *tmp;
        int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)

        if nd_out != 0:
            code += """
        size_t out_dims[%(nd_out)s];
        int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
            j = 0
            for i in range(node.inputs[0].ndim):
                if not self.redux[i]:
                    code += """
         out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
         if (!need_out)
             need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
                    j += 1
            code += """
         if (need_out) {
             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
             if (!%(output)s) {
                 %(fail)s
             }
         }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
           out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        else:
            code += """
        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
            Py_XDECREF(%(output)s);
            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
                                     pygpu_default_context(), Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
""" % dict(output=output, fail=sub['fail'],
           out_type=dtype_to_typecode(node.outputs[0].type.dtype))

        if acc_dtype != node.outputs[0].type.dtype:
            code += """
        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
                          %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
                          Py_None);
        if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
        else:
            code += """
        tmp = %(output)s;
        Py_INCREF(tmp);
""" % dict(output=output)

        # We need the proxies since we are passing a pointer to the
        # data into the call and therefore we need a real copy of the
        # data in the proper type.
        code += """
        args[0] = &n;
        args[1] = &tmp->ga;
""" % dict(output=output)

        p = 2
        for i in range(node.inputs[0].ndim):
            code += """
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
        args[%(p)s] = &proxy_dim[%(i)s];
        n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
            p += 1
            if not redux[i]:
                code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)

        code += """
        args[%(p)s] = &%(input)s->ga;
        proxy_off = %(input)s->ga.offset;
        args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
        p += 2

        for i in range(node.inputs[0].ndim):
            code += """
        proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
        args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
            p += 1

        code += """
        if (gs == 0) gs = 1;
        n /= gs;
        err = GpuKernel_call(&%(k_var)s, 0, %(ls)s, gs, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "compyte error: GpuCAReduce: %%s.",
                         GpuKernel_error(&%(k_var)s, err));
            %(fail)s
        }

        if (%(cast_out)d) {
            err = GpuArray_move(&%(output)s->ga, &tmp->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "compyte error: GpuCAReduce [cast]: %%s.",
                             GpuArray_error(&tmp->ga, err));
                %(fail)s
            }
        } else {
            Py_XDECREF(%(output)s);
            %(output)s = tmp;
        }

        if (%(sync)d)
            GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
           ls=ls, fail=sub['fail'], output=output, input=input,
           cast_out=bool(acc_dtype != node.outputs[0].type.dtype))

        return code
Exemple #15
0
    def c_code(self, node, name, inp, out, sub):
        if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
            # We special case the no-reduction case since the gpu
            # kernel has trouble handling it.
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) {
            %(fail)s
        }

        if (%(sync)d)
            GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
           sync=bool(config.gpuarray.sync))
        k = self.get_kernel_cache(node)
        _, src, _, ls = k._get_basic_kernel(k.init_local_size,
                                           node.inputs[0].ndim)
        if self.axis is None:
            redux = [True] * node.inputs[0].ndim
        else:
            redux = self.redux
        acc_dtype = getattr(self, 'acc_dtype', None)
        if acc_dtype is None:
            acc_dtype = node.outputs[0].type.dtype
        input = inp[0]
        output = out[0]
        nd_out = node.outputs[0].ndim
        code = """
        size_t gs = 1;
        unsigned int n = 1;
        unsigned int proxy_dim[%(nd_in)s];
        unsigned int proxy_off;
        int proxy_str[%(nd_in)s];
        void *args[%(n_args)s];
        PyGpuArrayObject *tmp;
        int err;
""" % dict(n_args=4 + (node.inputs[0].ndim * 2), nd_in=node.inputs[0].ndim)

        if nd_out != 0:
            code += """
        size_t out_dims[%(nd_out)s];
        int need_out = %(output)s == NULL || %(output)s->ga.nd != %(nd_out)s;
""" % dict(nd_out=nd_out, output=output)
            j = 0
            for i in range(node.inputs[0].ndim):
                if not self.redux[i]:
                    code += """
         out_dims[%(j)s] = %(input)s->ga.dimensions[%(i)s];
         if (!need_out)
             need_out |= %(output)s->ga.dimensions[%(j)s] != out_dims[%(j)s];
""" % dict(j=j, i=i, input=input, output=output)
                    j += 1
            code += """
         if (need_out) {
             %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None);
             if (!%(output)s) {
                 %(fail)s
             }
         }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
           out_type=dtype_to_typecode(node.outputs[0].type.dtype))
        else:
            code += """
        if (%(output)s == NULL || %(output)s->ga.nd != 0) {
            Py_XDECREF(%(output)s);
            %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
                                     pygpu_default_context(), Py_None);
            if (!%(output)s) {
                %(fail)s
            }
        }
""" % dict(output=output, fail=sub['fail'],
           out_type=dtype_to_typecode(node.outputs[0].type.dtype))

        if acc_dtype != node.outputs[0].type.dtype:
            code += """
        tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
                          %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
                          Py_None);
        if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
        else:
            code += """
        tmp = %(output)s;
        Py_INCREF(tmp);
""" % dict(output=output)

        # We need the proxies since we are passing a pointer to the
        # data into the call and therefore we need a real copy of the
        # data in the proper type.
        code += """
        args[0] = &n;
        args[1] = &tmp->ga;
""" % dict(output=output)

        p = 2
        for i in range(node.inputs[0].ndim):
            code += """
        proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
        args[%(p)s] = &proxy_dim[%(i)s];
        n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
            p += 1
            if not redux[i]:
                code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)

        code += """
        args[%(p)s] = &%(input)s->ga;
        proxy_off = %(input)s->ga.offset;
        args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
        p += 2

        for i in range(node.inputs[0].ndim):
            code += """
        proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
        args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
            p += 1

        code += """
        if (gs == 0) gs = 1;
        n /= gs;
        err = GpuKernel_call(&%(k_var)s, 0, %(ls)s, gs, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "compyte error: GpuCAReduce: %%s.",
                         GpuKernel_error(&%(k_var)s, err));
            %(fail)s
        }

        if (%(cast_out)d) {
            err = GpuArray_move(&%(output)s->ga, &tmp->ga);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "compyte error: GpuCAReduce [cast]: %%s.",
                             GpuArray_error(&tmp->ga, err));
                %(fail)s
            }
        } else {
            Py_XDECREF(%(output)s);
            %(output)s = tmp;
        }

        if (%(sync)d)
            GpuArray_sync(&%(output)s->ga);
""" % dict(k_var=self.c_kernel_obj(name), sync=bool(config.gpuarray.sync),
           ls=ls, fail=sub['fail'], output=output, input=input,
           cast_out=bool(acc_dtype != node.outputs[0].type.dtype))

        return code
 def get_params(self, node):
     from pygpu.gpuarray import dtype_to_typecode
     return self.params_type.get_params(typecode=dtype_to_typecode(self.dtype),
                                        context=get_context(self.context_name))
Exemple #17
0
    def get_op_params(self):
        from pygpu.gpuarray import dtype_to_typecode

        return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]
    def get_op_params(self):
        from pygpu.gpuarray import dtype_to_typecode

        return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]
Exemple #19
0
 def get_params(self, node):
     from pygpu.gpuarray import dtype_to_typecode
     return self.params_type.get_params(
         typecode=dtype_to_typecode(self.dtype),
         context=get_context(self.context_name))
 def get_op_params(self):
     return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]
Exemple #21
0
    def c_code(self, node, name, inputs, outputs, sub):
        nd = node.outputs[0].ndim
        fail = sub["fail"]
        initial_dims = ','.join('1' for i in xrange(nd))
        opname = str(self.scalar_op)

        #check that all inputs have valid dimensions
        emitted_inames = {}
        code = """
        int n_blocks = 0;
        int threads_per_block = 0;
        size_t numEls = 0;
        """
        if nd > 0:
            code += """
            size_t dims[%(nd)s] = {%(initial_dims)s};
            """ % locals()
        else:
            code += """
            size_t *dims = NULL;
            """
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is node.inputs[idx]
                continue

            broadcasts = map(int, node.inputs[idx].broadcastable)
            broadcasts = ', '.join(map(str, broadcasts))
            nd = node.inputs[idx].ndim
            if nd > 0:
                code += """
                int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
                """ % locals()
            else:
                code += """
                int *broadcasts_%(iname)s = NULL;
                """ % locals()
            emitted_inames[iname] = node.inputs[idx]

        #check that all inputs have valid dimensions
        emitted_inames = {}
        for idx, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            code += """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i",
                         PyGpuArray_NDIM(%(iname)s));
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(idx)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True
        #check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            typecode = dtype_to_typecode(node.outputs[idx].dtype)
            if idx not in self.inplace_pattern.keys():
                code += """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = pygpu_empty(%(nd)d, dims,
                            %(typecode)s, GA_C_ORDER,
                            pygpu_default_context(), Py_None);
            if (!%(oname)s) {
                        //TODO, this check don't seam good.
                        //TODO, set exception?
                            %(fail)s
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                code += """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             PyGpuArray_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        z = outputs[0]
        code += """numEls = PyGpuArray_SIZE(%(z)s);

        //first use at least a full warp
        threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE

        //next start adding multiprocessors
        // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now.
        n_blocks = std::min(numEls/threads_per_block +
                               (numEls %% threads_per_block?1:0),
                           (size_t)30);

        // next start adding more warps per multiprocessor
        if (threads_per_block * n_blocks < numEls)
            threads_per_block = std::min(numEls/n_blocks, (size_t) 256);

                //std::cerr << "calling callkernel returned\\n";
        """ % locals()

        code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals()
        param = []
        for i in range(nd):
            param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
                                                              i=i))
        for n, (name, var) in enumerate(zip(inputs + outputs,
                                       node.inputs + node.outputs)):
            if (n - len(inputs)) in self.inplace_pattern:
                continue
            dtype = var.dtype
            param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
            param.append("%(name)s->ga.offset" % locals())
            for i in range(nd):
                param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
        code += ',\n'.join(param) + ");\n"
        if config.gpuarray.sync:
            code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
        return str(code)