""" % dict(img=img, kerns=kern, out=out, bmode=bmode, fail=sub['fail'], id=sub['struct_id'], name=name) def c_code_cache_version(self): return (4, ) from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, gpu_optimizer) @local_optimizer([GpuConv]) def local_conv_dnn(node): if isinstance(node.op, GpuConv): if (node.op.subsample != (1, 1) or node.op.border_mode not in ['full', 'valid']): return img, kern = node.inputs border_mode = node.op.border_mode return [ GpuDnnConv(border_mode)(gpu_contiguous(img), gpu_contiguous(kern)) ] gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
          output%(id)d,
          CudaNdarray_DEV_DATA(%(out)s),
          CUDNN_RESULT_NO_ACCUMULATE
        );
        if (err%(name)s != CUDNN_STATUS_SUCCESS) {
          PyErr_Format(PyExc_RuntimeError, "error doing operation: %%s",
                       cudnnGetErrorString(err%(name)s));
          %(fail)s
        }
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
           fail=sub['fail'], id=sub['struct_id'], name=name)

    def c_code_cache_version(self):
        return (4,)


from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
                                     gpu_optimizer)


@local_optimizer([GpuConv])
def local_conv_dnn(node):
    if isinstance(node.op, GpuConv):
        if (node.op.subsample != (1, 1) or
                node.op.border_mode not in ['full', 'valid']):
            return
        img, kern = node.inputs
        border_mode = node.op.border_mode
        return [GpuDnnConv(border_mode)(gpu_contiguous(img),
                                        gpu_contiguous(kern))]

gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
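# Minimal usage sketch, not part of this module: once local_conv_dnn is
# registered under the 'cudnn' tag, it runs during the normal GPU
# optimization phase when a function is compiled with device=gpu, and it can
# be switched off again by excluding that tag.  The graph below is an
# assumed typical call site, not code from this file.
import theano
import theano.tensor as T
from theano.tensor.nnet import conv2d

images = T.tensor4('images')
filters = T.tensor4('filters')
conv_out = conv2d(images, filters, border_mode='valid')

# Default compilation: the GPU optimizer first lifts the convolution to
# GpuConv, then local_conv_dnn swaps in the cuDNN Op when the border mode
# and subsampling are supported.
f_dnn = theano.function([images, filters], conv_out)

# Opting out of the cuDNN rewrites, either per function ...
f_ref = theano.function(
    [images, filters], conv_out,
    mode=theano.compile.get_default_mode().excluding('cudnn'))
# ... or globally with THEANO_FLAGS="optimizer_excluding=cudnn".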
    return codeSource


gpu_conv_transpd = GpuConvTransp3D()


@local_optimizer([ConvTransp3D])
def local_gpu_conv_transp3d(node):
    if isinstance(node.op, ConvTransp3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_transp3d", local_gpu_conv_transp3d)


# If the input size wasn't a multiple of d, some automatic padding may be
# needed to get the right size of reconstruction.
def computeR(W, b, d, H, Rshape=None):
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, inputChannels, filterHeight, filterWidth, filterDur = \
        W.shape
    batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = \
        H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
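# Illustration only, not part of ConvTransp3D: the shape relation behind the
# padding comment above.  For a forward pass with strides d = (dr, dc, dt),
# several input sizes map to the same output size, so a reconstruction can
# only default to the smallest consistent size, and a caller who wants a
# larger original size has to pass RShape explicitly.  The helper name below
# is hypothetical.
def minimal_reconstruction_shape(output_shape, filter_shape, d):
    """output_shape, filter_shape and d are (row, col, time) triples."""
    return tuple((o - 1) * stride + f
                 for o, f, stride in zip(output_shape, filter_shape, d))

# Example: a 3x3x3 filter applied with stride 2 to a 7x7x7 video (or to an
# 8x8x8 one) yields 3x3x3 outputs; the minimal reconstruction is
# (3 - 1) * 2 + 3 = 7 along each dimension.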
} """ return codeSource gpu_convd = GpuConv3D() @local_optimizer([Conv3D]) def local_gpu_conv3d(node): if isinstance(node.op, Conv3D): if numpy.any([ i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs ]): if numpy.all([o.type.dtype == 'float32' for o in node.outputs]): V, W, b, d = node.inputs return [ host_from_gpu( gpu_convd(as_cuda_ndarray_variable(V), as_cuda_ndarray_variable(W), as_cuda_ndarray_variable(b), d)) ] # Not enabled by default as we don't want people to use it. gpu_optimizer.register("local_gpu_conv3d", local_gpu_conv3d)
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
                                     gpu_optimizer)


@local_optimizer([GpuConv])
def local_conv_dnn(node):
    raise_no_dnn()
    if isinstance(node.op, GpuConv):
        if node.op.border_mode not in ['full', 'valid']:
            return
        img, kern = node.inputs
        border_mode = node.op.border_mode
        subsample = node.op.subsample
        return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern),
                         border_mode=border_mode, subsample=subsample)]

gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')


@local_optimizer([GpuDownsampleFactorMax])
def local_pool_dnn(node):
    if isinstance(node.op, GpuDownsampleFactorMax):
        if node.op.ignore_border:
            return
        img, = node.inputs
        ds = node.op.ds
        return [dnn_pool(gpu_contiguous(img), ds, ds)]

gpu_optimizer.register("pool_cudnn", local_pool_dnn, 'cudnn')


@local_optimizer([GpuDownsampleFactorMaxGrad])
            out[batch_id * nkern * out_len * out_wid * out_dur +  // the good batch
                out_frame * nkern +                    // the output frame
                out_row * out_wid * out_dur * nkern +  // the output row
                out_col * out_dur * nkern +            // the output col
                kern_id                                // the output image (channel)
                ] = sum + bias[kern_id];
        }
    }
    """
    return codeSource


gpu_convd = GpuConv3D()


@local_optimizer([Conv3D])
def local_gpu_conv3d(node):
    if isinstance(node.op, Conv3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
                                                as_cuda_ndarray_variable(W),
                                                as_cuda_ndarray_variable(b),
                                                d))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv3d", local_gpu_conv3d)
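# Minimal sketch, an assumption about a typical caller rather than code from
# this file, of the host-side conv3D graph that local_gpu_conv3d targets.
# The rewrite only replaces a Conv3D node when at least one of its inputs is
# already the result of a HostFromGpu transfer (i.e. something upstream was
# moved to the GPU by an earlier rewrite) and every output is float32;
# otherwise the Op stays on the host.
import theano
import theano.tensor as T
from theano.tensor.nnet.Conv3D import conv3D

ftensor5 = T.TensorType('float32', (False,) * 5)
V = ftensor5('V')                    # 5-d video tensor
W = ftensor5('W')                    # 5-d filter tensor
b = T.vector('b', dtype='float32')   # one bias per output channel
d = (1, 1, 1)                        # strides (dr, dc, dt)

H = conv3D(V, W, b, d)
f = theano.function([V, W, b], H)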