            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       numpy.intc(inputs[1][0].size),
                       block=block, grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk
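# A minimal standalone sketch (assuming a working PyCUDA/CUDA setup) of the
# two PyCUDA patterns these ops rely on: the thunk above launches a
# SourceModule-compiled kernel with explicit block/grid sizes, while the
# perform() method further below calls an ElementwiseKernel, which generates
# its own indexing loop and launch configuration. The kernel names and bodies
# here are illustrative, not the code Theano generates.
import numpy
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel

# Pattern 1: raw kernel with explicit launch geometry (as in make_thunk above).
mod = SourceModule("""
__global__ void kernel_add(float *x, float *y, float *z, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        z[i] = x[i] + y[i];
}
""")
kernel_add = mod.get_function("kernel_add")

x = gpuarray.to_gpu(numpy.random.rand(400).astype(numpy.float32))
y = gpuarray.to_gpu(numpy.random.rand(400).astype(numpy.float32))
z = gpuarray.empty_like(x)
kernel_add(x, y, z, numpy.intc(x.size), block=(512, 1, 1), grid=(1, 1))

# Pattern 2: ElementwiseKernel, where PyCUDA generates the loop and launch
# configuration (as in the perform() method below).
add_kernel = ElementwiseKernel(
    "float *x, float *y, float *z",
    "z[i] = x[i] + y[i]",
    "pycuda_elemwise_kernel_add")
add_kernel(x, y, z)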
c_code, "pycuda_elemwise_kernel_%s"%str(self.scalar_op), preamble="""#include<Python.h> #include <numpy/arrayobject.h>""") return out_node def perform(self, node, inputs, out): #TODO assert all input have the same shape z, = out if z[0] is None or z[0].shape!=inputs[0].shape: z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) i = inputs + z self.pycuda_fct(*i) pycuda_optimizer = EquilibriumDB() gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run") @local_optimizer([]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if not any([ any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim<=2 for i in node.inputs]): new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) return [new_op] pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise) @local_optimizer([]) def local_pycuda_gpu_elemwise_kernel(node):
        ins = gpu_contiguous(ins)
        out = GpuDnnSoftmax('bc01', 'accurate', 'channel')(ins)
        out = as_cuda_ndarray_variable(out.dimshuffle(0, 1))
        return [out]


class NoCuDNNRaise(Optimizer):
    def apply(self, fgraph):
        """Raise an AssertionError if cuDNN can't be used."""
        if not dnn_available():
            # Raise an assertion error, as we want Theano to fail,
            # not just skip this optimization.
            raise AssertionError(
                "cuDNN optimization was enabled, but Theano was not able"
                " to use it. We got this error: \n" + dnn_available.msg)


gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
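# The dimshuffle calls in the softmax optimizations above and below implement
# a layout trick: the cuDNN softmax ops only accept 4D tensors in 'bc01'
# layout, so a 2D (batch, features) input is lifted to (batch, features, 1, 1)
# before the op and squeezed back to 2D afterwards. A minimal illustration of
# just the shape manipulation (no GPU or cuDNN needed; names are illustrative):
import theano.tensor as T

x2d = T.fmatrix('x')                   # (batch, features)
x4d = x2d.dimshuffle(0, 1, 'x', 'x')   # lift to 'bc01': (batch, features, 1, 1)
assert x4d.broadcastable == (False, False, True, True)
back = x4d.dimshuffle(0, 1)            # squeeze back to (batch, features)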
@register_opt('cudnn')
@local_optimizer([SoftmaxGrad])
def local_softmax_dnn_grad(node):
    if (isinstance(node.op, SoftmaxGrad) and
            (isinstance(node.inputs[0].owner.op, HostFromGpu) or
             isinstance(node.inputs[1].owner.op, HostFromGpu))):
        if not dnn_available():
            return
        ins = []
        for n in node.inputs:
            if isinstance(n.owner.op, HostFromGpu):
                n = n.owner.inputs[0]
            ins.append(n.dimshuffle(0, 1, 'x', 'x'))

        out = GpuDnnSoftmaxGrad('bc01', 'accurate', 'channel')(
            gpu_contiguous(ins[0]),