def local_cudnn_maxandargmax(node):
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(node.op.axis)) != node.op.axis:
        return

    max, arg = GpuDnnReduction(
        "maximum", node.op.axis, node.outputs[0].dtype, node.outputs[0].dtype, True
    )(node.inputs[0])

    # cudnn can only return int32 indices
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"), node.outputs[1].type.context_name),
    )
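# Illustrative sketch (hypothetical helper, not part of Theano): why the
# axis-order check above matters. When reducing over several axes, the argmax
# is an index into those axes flattened in a given order, so reducing over
# (0, 1) versus (1, 0) can yield different flat indices for the same maximum.
def _demo_axis_order_matters():
    import numpy as np

    x = np.array([[0, 1, 9],
                  [3, 4, 5]])
    # The maximum, 9, sits at coordinates (0, 2).
    assert x.reshape(-1).argmax() == 2                  # axes flattened as (0, 1)
    assert x.transpose(1, 0).reshape(-1).argmax() == 4  # axes flattened as (1, 0)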
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if version(raises=False) < 6000:
        return
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return

    max, arg = GpuDnnReduction(
        "maximum", op.axis, inputs[0].dtype, inputs[0].dtype, True
    )(*inputs)

    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
def local_abstractconv_cudnn_alt(node):
    if not isinstance(
        node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)
    ):
        return

    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
        isinstance(p, tuple) for p in node.op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None

    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if not dnn_available(inp1.type.context_name):
        return

    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])

    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"

    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None

        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        if (
            border_mode == "valid"
            and subsample == (1, 1)
            and filter_dilation == (1, 1)
            and num_groups == 1
        ):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)

            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                conv = conv[:, :, ::-1, ::-1]

            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)

            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns, out, desc)
        else:
            return None

    return [rval]
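# Illustrative sketch (hypothetical helper, not Theano's get_conv_output_shape):
# the standard shape arithmetic behind the output shapes computed above before
# the result is pre-allocated with GpuAllocEmpty.
def _conv_out_dim(in_dim, k_dim, stride=1, pad=0, dilation=1):
    """
    Output length of a correlation along one spatial dimension.

    >>> _conv_out_dim(5, 3)         # "valid": 5x5 image, 3x3 filter
    3
    >>> _conv_out_dim(5, 3, pad=2)  # "full": pad = k - 1
    7
    """
    effective_k = dilation * (k_dim - 1) + 1
    return (in_dim + 2 * pad - effective_k) // stride + 1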
def local_dnn_reduction(node):
    if not isinstance(node.op, GpuCAReduceCuda):
        return
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    if node.inputs[0].ndim > 8:
        return

    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)

    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return

    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return
    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return
    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        return a

    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity

    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and isinstance(
            node.op.pre_scalar_op, theano.scalar.basic.Abs
        ):
            scal = "absmax"
        else:
            return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype, False)(
            node.inputs[0]
        )
        return [post(ret)]
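# Illustrative sketch (hypothetical helper, not part of Theano): the algebraic
# identities behind the scalar_op / pre_scalar_op mapping above. A sum of
# squares is the squared 2-norm (hence the _square post-processing), a sum of
# absolute values is the 1-norm, and a max of absolute values is "absmax".
def _demo_reduction_identities():
    import numpy as np

    x = np.array([3.0, -4.0])
    assert np.sum(np.abs(x)) == np.linalg.norm(x, 1)        # norm1
    assert np.sum(x ** 2) == np.linalg.norm(x, 2) ** 2      # norm2, then square
    assert np.max(np.abs(x)) == np.linalg.norm(x, np.inf)   # absmax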
# We use FWD 2D to check it.
# Based on documentation, algo small (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)
# should support all configurations, for both v5.1, v6 and v7.
inputs = theano.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
filters = theano.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
conv = dnn_conv(inputs, filters, precision=precision, algo="small")
f = theano.function([], conv, mode=mode_with_gpu)
try:
    f()
except RuntimeError as e:
    assert "CUDNN_STATUS_ARCH_MISMATCH" in str(e)
    return False
return True


cudnn = cudnn_defs.get_definitions(version(raises=False))


class ConvCase:
    """
    Helper class to describe a special test case quickly.

    This handles only 2D and 3D cases.
    """

    FWD, GRADINPUT, GRADWEIGHT = 0, 1, 2

    def __init__(
        self,
        type,
        inputs_shape,
        filters_shape,
        algo=None,
        dtype=None,
        precision=None,
        subsample=None,
        dilation=None,
        border_mode="valid",
        conv_mode="conv",
        alpha=1,
        beta=0,