Example #1
            # Launch the compiled PyCUDA kernel on the CudaNdarray inputs,
            # writing into the output storage z[0].
            pycuda_fct(inputs[0][0],
                       inputs[1][0],
                       z[0],
                       numpy.intc(inputs[1][0].size),
                       block=block,
                       grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
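The thunk above is only the launch step; it assumes pycuda_fct, block, and grid were prepared earlier in make_thunk. As a rough standalone sketch of the same calling convention in plain PyCUDA (the kernel source, names, and sizes below are hypothetical stand-ins, not the op's generated code):

import numpy
import pycuda.autoinit          # creates a CUDA context on import
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void pycuda_elemwise_add(float *i0, float *i1, float *o0, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        o0[i] = i0[i] + i1[i];
}
""")
pycuda_fct = mod.get_function("pycuda_elemwise_add")

n = 1024
a = numpy.random.rand(n).astype(numpy.float32)
b = numpy.random.rand(n).astype(numpy.float32)
out = numpy.zeros_like(a)

block = (512, 1, 1)
grid = (int(numpy.ceil(float(n) / block[0])), 1)

# Same argument order as the thunk: inputs, output, element count,
# then the launch configuration.
pycuda_fct(drv.In(a), drv.In(b), drv.Out(out), numpy.intc(n),
           block=block, grid=grid)
assert numpy.allclose(out, a + b)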


        # (truncated) tail of make_node: the PyCUDA elementwise kernel for
        # this scalar_op is compiled here and stored as self.pycuda_fct,
        # which perform() below launches.
            c_code,
            "pycuda_elemwise_kernel_%s" % str(self.scalar_op),
            preamble="""#include <Python.h>
#include <numpy/arrayobject.h>""")
        return out_node

    def perform(self, node, inputs, out):
        # TODO: assert that all inputs have the same shape
        z, = out
        if z[0] is None or z[0].shape != inputs[0].shape:
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
        i = inputs + z
        self.pycuda_fct(*i)
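The truncated call higher up (c_code, a "pycuda_elemwise_kernel_…" name, a preamble) looks like PyCUDA's ElementwiseKernel constructor. A minimal standalone sketch of that API, with a hypothetical argument list and operation string in place of the generated c_code:

import numpy
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

# Hypothetical add kernel; the op derives the real strings from scalar_op.
add_kernel = ElementwiseKernel(
    "float *i0, float *i1, float *o0",
    "o0[i] = i0[i] + i1[i]",
    "pycuda_elemwise_kernel_add")

a = gpuarray.to_gpu(numpy.random.rand(100).astype(numpy.float32))
b = gpuarray.to_gpu(numpy.random.rand(100).astype(numpy.float32))
out = gpuarray.empty_like(a)
add_kernel(a, b, out)        # ElementwiseKernel supplies the index variable i
assert numpy.allclose(out.get(), a.get() + b.get())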

pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")

@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
       GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    if isinstance(node.op, GpuElemwise):
        if (not any([any(i.type.broadcastable) for i in node.inputs])
                and all([i.ndim <= 2 for i in node.inputs])):
            new_op = PycudaElemwiseSourceModuleOp(
                node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
            return [new_op]

pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise)
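Once the EquilibriumDB is registered in gpu_seqopt under the fast_run tag, nothing has to be called explicitly at compile time; compiling an ordinary elementwise graph should trigger the rewrite. A hedged usage sketch, assuming the old theano.sandbox.cuda backend with device=gpu and floatX=float32 (variable names are illustrative):

import theano
import theano.tensor as T

x = T.fmatrix("x")
y = T.fmatrix("y")
# On the GPU, x + y becomes a GpuElemwise node, which the optimizer above
# should replace with a PycudaElemwiseSourceModuleOp node.
f = theano.function([x, y], x + y)
theano.printing.debugprint(f)    # inspect the optimized graph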

@local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node):
Example #3
            ins = gpu_contiguous(ins)
            out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
            out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
            return [out]
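The dimshuffle calls do the shape bookkeeping: cuDNN's softmax works on 4D 'bc01' tensors, so a 2D (batch, channel) input is padded with two broadcastable axes before the GpuDnnSoftmax call and squeezed back to 2D afterwards. A small sketch of just that trick on plain Theano variables (names are illustrative):

import theano.tensor as T

x = T.fmatrix("x")                  # (batch, channel)
x4 = x.dimshuffle(0, 1, "x", "x")   # (batch, channel, 1, 1): the 'bc01' layout
x2 = x4.dimshuffle(0, 1)            # drop the two broadcastable axes again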

    class NoCuDNNRaise(Optimizer):
        def apply(self, fgraph):
            """ Raise a RuntimeError if cudnn can't be used"""
            if not dnn_available():
                # Make an assert error as we want Theano to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Theano was not able"
                    " to use it. We got this error: \n" +
                    dnn_available.msg)
    gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')

    @register_opt('cudnn')
    @local_optimizer([SoftmaxGrad])
    def local_softmax_dnn_grad(node):
        if (
            isinstance(node.op, SoftmaxGrad)
            and (isinstance(node.inputs[0].owner.op, HostFromGpu)
                 or isinstance(node.inputs[1].owner.op, HostFromGpu))
        ):
            if not dnn_available():
                return
            ins = []
            for n in node.inputs:
                if isinstance(n.owner.op, HostFromGpu):
                    n = n.owner.inputs[0]
Example #4
            ins = gpu_contiguous(ins)
            out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
            out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
            return [out]

    class NoCuDNNRaise(Optimizer):
        def apply(self, fgraph):
            """ Raise a RuntimeError if cudnn can't be used"""
            if not dnn_available():
                # Make an assert error as we want Theano to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Theano was not able"
                    " to use it. We got this error: \n" + dnn_available.msg)

    gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')

    @register_opt('cudnn')
    @local_optimizer([SoftmaxGrad])
    def local_softmax_dnn_grad(node):
        if (isinstance(node.op, SoftmaxGrad)
                and (isinstance(node.inputs[0].owner.op, HostFromGpu)
                     or isinstance(node.inputs[1].owner.op, HostFromGpu))):
            ins = []
            for n in node.inputs:
                if isinstance(n.owner.op, HostFromGpu):
                    n = n.owner.inputs[0]
                ins.append(n.dimshuffle(0, 1, 'x', 'x'))

            out = GpuDnnSoftmaxGrad('bc01', 'accurate',
                                    'channel')(gpu_contiguous(ins[0]),