def check_dtype_config_support(dtype, precision):
    """Probe whether a cuDNN convolution runs for ``dtype`` / ``precision``.

    We use FWD 2D to check it: a convolution over two zero-filled shared
    variables of shape ``(1, 1, 2, 2)`` is compiled with ``mode_with_gpu``
    and executed once.

    Based on documentation, algo small
    (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) should support all
    configurations, for both v5.1, v6 and v7.

    Returns
    -------
    bool
        ``True`` when the compiled function runs without raising
        ``RuntimeError``; ``False`` when it raises a ``RuntimeError`` whose
        message contains ``CUDNN_STATUS_ARCH_MISMATCH``.

    Raises
    ------
    AssertionError
        When a ``RuntimeError`` is raised whose message does not mention
        ``CUDNN_STATUS_ARCH_MISMATCH``.
    """
    probe_shape = (1, 1, 2, 2)
    inputs = aesara.shared(np.zeros(probe_shape, dtype=dtype))
    filters = aesara.shared(np.zeros(probe_shape, dtype=dtype))

    probe = aesara.function(
        [],
        dnn_conv(inputs, filters, precision=precision, algo="small"),
        mode=mode_with_gpu,
    )

    try:
        probe()
    except RuntimeError as err:
        # Only an architecture mismatch counts as "unsupported"; anything
        # else is unexpected and makes the assertion fail.
        assert "CUDNN_STATUS_ARCH_MISMATCH" in str(err)
        return False
    else:
        return True
def run_gradweight_runtime_algorithm(algo):
    """Compare the cuDNN filter gradient against a CPU reference for ``algo``.

    NOTE(review): ``dtype``, ``_broadcastable``, ``unit_shape``, ``self`` and
    ``ref_cast`` are not defined in this block; presumably they come from an
    enclosing scope -- confirm against the surrounding code.

    The gradient of ``dnn_conv(...).sum()`` with respect to the filters is
    compiled with ``mode_with_gpu`` while the config flag
    ``dnn__conv__algo_bwd_filter`` is set to ``algo``. The compiled graph must
    contain exactly one ``GpuDnnConvGradW`` node and no ``GpuDnnConv`` or
    ``GpuDnnConvGradI`` node. Results are then checked against the gradient of
    ``self.cpu_conv_class`` applied to flipped filters, for every entry of
    ``self.runtime_shapes``.
    """
    with aesara.config.change_flags(dnn__conv__algo_bwd_filter=algo):
        inputs = TensorType(dtype, _broadcastable)()
        filters = TensorType(dtype, _broadcastable)()

        # GPU implementation: gradient of the summed convolution w.r.t. filters.
        gpu_conv = dnn_conv(
            img=inputs,
            kerns=filters,
            algo=algo,
            precision=dtype,
            subsample=unit_shape,
            dilation=unit_shape,
        )
        grad_w = aesara.gradient.grad(gpu_conv.sum(), [filters])
        f = aesara.function([inputs, filters], grad_w, mode=mode_with_gpu)

        # Only the weight-gradient cuDNN op may appear in the compiled graph.
        grad_w_nodes = [
            apply
            for apply in f.maker.fgraph.apply_nodes
            if isinstance(apply.op, GpuDnnConvGradW)
        ]
        assert 1 == len(grad_w_nodes)
        for forbidden_op in (GpuDnnConv, GpuDnnConvGradI):
            assert not any(
                isinstance(apply.op, forbidden_op)
                for apply in f.maker.fgraph.apply_nodes
            )

        # CPU reference: same gradient, computed on filters reversed along
        # every trailing (non batch/channel) axis.
        keep = slice(None)
        reverse = slice(None, None, -1)
        n_reversed = 3 if self.ndim == 3 else 2
        flipped_filters = filters[(keep, keep) + (reverse,) * n_reversed]
        conv_ref = self.cpu_conv_class(subsample=unit_shape)(
            ref_cast(inputs), flipped_filters
        )
        grad_w_ref = aesara.gradient.grad(conv_ref.sum(), [filters])
        f_ref = aesara.function([inputs, filters], grad_w_ref, mode="FAST_RUN")

        runtime_shapes = self.runtime_shapes
        if algo in ("time_once", "guess_once"):
            # For the "*_once" algos only the first shape entry is used, with
            # its repeat count forced to 5 (on a copy, so the shared list
            # stays untouched).
            first_entry = list(runtime_shapes[0])
            first_entry[0] = 5
            runtime_shapes = [first_entry]

        for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
            print("Shapes:", inputs_shape, filters_shape)
            for _ in range(ntimes):
                inputs_val = np.random.random(inputs_shape).astype(dtype)
                filters_val = np.random.random(filters_shape).astype(dtype)
                gpu_res = f(inputs_val, filters_val)
                cpu_res = f_ref(inputs_val, filters_val)
                utt.assert_allclose(cpu_res, np.asarray(gpu_res))
def run_fwd_runtime_algorithm(algo):
    """Compare the cuDNN forward convolution against a CPU reference for ``algo``.

    NOTE(review): ``dtype``, ``_broadcastable``, ``unit_shape``, ``self`` and
    ``ref_cast`` are not defined in this block; presumably they come from an
    enclosing scope -- confirm against the surrounding code.

    Both implementations are compiled once on symbolic inputs and then
    evaluated on random values for every entry of ``self.runtime_shapes``
    (each entry is ``(ntimes, (inputs_shape, filters_shape))``).
    """
    inputs = TensorType(dtype, _broadcastable)()
    filters = TensorType(dtype, _broadcastable)()

    # Scale down the input values to prevent very large absolute errors
    # due to float rounding
    lower_inputs = inputs / 10
    lower_filters = filters / 10

    # GPU implementation.
    gpu_conv = dnn_conv(
        img=lower_inputs,
        kerns=lower_filters,
        algo=algo,
        precision=dtype,
        subsample=unit_shape,
        dilation=unit_shape,
    )
    f = aesara.function([inputs, filters], gpu_conv, mode=mode_with_gpu)

    # CPU reference, fed with filters reversed along every trailing
    # (non batch/channel) axis.
    keep = slice(None)
    reverse = slice(None, None, -1)
    n_reversed = 3 if self.ndim == 3 else 2
    flipped_filters = lower_filters[(keep, keep) + (reverse,) * n_reversed]
    conv_ref = self.cpu_conv_class(subsample=unit_shape)(
        ref_cast(lower_inputs), flipped_filters
    )
    f_ref = aesara.function([inputs, filters], conv_ref, mode="FAST_RUN")

    runtime_shapes = self.runtime_shapes
    if algo in ("time_once", "guess_once"):
        # For the "*_once" algos only the first shape entry is used, with its
        # repeat count forced to 5 (on a copy, so the shared list stays
        # untouched).
        first_entry = list(runtime_shapes[0])
        first_entry[0] = 5
        runtime_shapes = [first_entry]

    for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
        print("Shapes:", inputs_shape, filters_shape)
        for _ in range(ntimes):
            inputs_val = np.random.random(inputs_shape).astype(dtype)
            filters_val = np.random.random(filters_shape).astype(dtype)
            gpu_res = np.asarray(f(inputs_val, filters_val))
            cpu_res = f_ref(inputs_val, filters_val)
            self.scale_numpy_arrays_inplace(cpu_res, gpu_res, 1)
            utt.assert_allclose(cpu_res, gpu_res)
def local_abstractconv_cudnn_alt(fgraph, node):
    """Build an alternative cuDNN graph for an abstract 2D convolution node.

    Parameters
    ----------
    fgraph
        Not used by this function.
    node
        Apply node whose ``op`` is inspected; its first two inputs are the
        operands of the convolution.

    Returns
    -------
    list or None
        A one-element list holding the replacement variable, or ``None`` when
        the node is not handled. ``None`` is returned when:

        * ``node.op`` is not ``AbstractConv2d``, ``AbstractConv2d_gradWeights``
          or ``AbstractConv2d_gradInputs``;
        * ``version(raises=False)`` is below 6000 and the filter dilation is
          not ``(1, 1)``;
        * the op is unshared, or its border mode is a tuple containing tuples
          (asymmetric padding);
        * ``dnn_available`` is false for the first input's context;
        * the op's border mode / subsample / dilation / group settings do not
          match the combination accepted by the relevant branch below.

    Notes
    -----
    * ``AbstractConv2d`` is forwarded to ``dnn_conv`` with a
      ``direction_hint`` of ``"bprop inputs"`` (full border) or
      ``"bprop weights"`` (valid border, unit dilation).
    * The two gradient ops are expressed with ``GpuDnnConv`` applied to
      dimshuffled operands.
    """
    op = node.op
    handled_ops = (
        AbstractConv2d,
        AbstractConv2d_gradWeights,
        AbstractConv2d_gradInputs,
    )
    if not isinstance(op, handled_ops):
        return

    unit_2d = (1, 1)

    if version(raises=False) < 6000 and op.filter_dilation != unit_2d:
        return None

    if op.unshared:
        return None

    if isinstance(op.border_mode, tuple) and any(
        isinstance(pad, tuple) for pad in op.border_mode
    ):
        # Asymmetric padding not yet supported
        return None

    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if not dnn_available(inp1.type.context_name):
        return

    border_mode = op.border_mode
    subsample = op.subsample
    filter_dilation = op.filter_dilation
    num_groups = op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])
    conv_mode = "conv" if op.filter_flip else "cross"

    # Axis order that swaps the first two dimensions of a 4D variable.
    swap_first_two = (1, 0, 2, 3)

    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != unit_2d or num_groups != 1:
            return None

        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == unit_2d:
            direction_hint = "bprop weights"
        else:
            return None

        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )

    elif isinstance(op, AbstractConv2d_gradWeights):
        supported = (
            border_mode == "valid"
            and subsample == unit_2d
            and filter_dilation == unit_2d
            and num_groups == 1
        )
        if not supported:
            return None

        img = gpu_contiguous(inp1)
        topgrad = gpu_contiguous(inp2)
        ctx_name = infer_context_name(img, topgrad)

        # Both operands get their first two axes swapped before the
        # convolution; the result is swapped back below.
        img = gpu_contiguous(img.dimshuffle(*swap_first_two))
        topgrad = gpu_contiguous(topgrad.dimshuffle(*swap_first_two))

        ishape = [shape_i_op(axis)(img) for axis in range(img.ndim)]
        tshape = [shape_i_op(axis)(topgrad) for axis in range(topgrad.ndim)]
        out_shp = assert_conv_shape(
            get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
        )

        out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
        desc = GpuDnnConvDesc(
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            conv_mode="cross",
            precision=precision,
        )(out.shape)
        conv = GpuDnnConv(algo=None, num_groups=num_groups)(
            img, topgrad, out, desc
        )
        if conv_mode == "conv":
            # The descriptor always uses "cross"; for "conv" the result is
            # reversed along its last two axes instead.
            conv = conv[:, :, ::-1, ::-1]
        rval = as_gpuarray_variable(
            conv.dimshuffle(*swap_first_two), ctx_name
        )

    elif isinstance(op, AbstractConv2d_gradInputs):
        supported = (
            border_mode == "valid" and subsample == unit_2d and num_groups == 1
        )
        if not supported:
            return None

        kerns = gpu_contiguous(inp1.dimshuffle(*swap_first_two))
        topgrad = gpu_contiguous(inp2)
        ctx_name = infer_context_name(kerns, topgrad)

        # The descriptor is built with the opposite convolution mode.
        conv_mode = "cross" if conv_mode == "conv" else "conv"
        desc = GpuDnnConvDesc(
            border_mode="full",
            subsample=subsample,
            dilation=filter_dilation,
            conv_mode=conv_mode,
            precision=precision,
        )(kerns.shape)

        tshape = [shape_i_op(axis)(topgrad) for axis in range(topgrad.ndim)]
        kshape = [shape_i_op(axis)(kerns) for axis in range(kerns.ndim)]
        shape = assert_conv_shape(
            get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
        )

        out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
        rval = GpuDnnConv(algo=None, num_groups=num_groups)(
            topgrad, kerns, out, desc
        )

    return [rval]
def run_conv_fwd(self, algo, dtype, precision, parameters):
    """Check one cuDNN forward convolution against the CPU reference.

    Parameters
    ----------
    algo
        Algorithm passed to ``dnn_conv``.
    dtype
        dtype of the random input, filter and (optional) output arrays.
    precision
        Precision passed to ``dnn_conv``.
    parameters
        8-tuple ``(inputs_shape, filters_shape, subsample, dilation,
        border_mode, conv_mode, alpha, beta)``.

    The expected value is ``alpha * reference`` when ``beta == 0`` and
    ``alpha * reference + beta * out`` otherwise, where ``out`` comes from
    ``self.array_like_conv_output``. Tolerances come from
    ``self.get_atol_rtol``.
    """
    (
        inputs_shape,
        filters_shape,
        subsample,
        dilation,
        border_mode,
        conv_mode,
        alpha,
        beta,
    ) = parameters

    inputs_val = np.random.random(inputs_shape).astype(dtype)
    filters_val = np.random.random(filters_shape).astype(dtype)

    # Scale down the input values to prevent very large absolute errors
    # due to float rounding
    inputs_val /= 10
    filters_val /= 10

    inputs = aesara.shared(inputs_val)
    filters = aesara.shared(filters_val)

    # An initial output buffer is only needed when beta is non-zero.
    out = None
    if beta != 0:
        out = self.array_like_conv_output(
            inputs_shape, filters_shape, border_mode, subsample, dilation, dtype
        )
        out /= 10

    # Compile an Aesara function for the cuDNN implementation
    gpu_conv = dnn_conv(
        img=inputs,
        kerns=filters,
        alpha=alpha,
        beta=beta,
        out=out,
        border_mode=border_mode,
        subsample=subsample,
        dilation=dilation,
        conv_mode=conv_mode,
        algo=algo,
        precision=precision,
    )
    f = aesara.function([], gpu_conv, mode=mode_with_gpu)

    # If conv_mode is 'conv' the reference implementation should use
    # filters flipped according to the width, height and time axis
    if conv_mode != "conv":
        flipped_filters = filters
    else:
        keep = slice(None)
        reverse = slice(None, None, -1)
        n_reversed = 3 if inputs.ndim == 5 else 2
        flipped_filters = filters[(keep, keep) + (reverse,) * n_reversed]

    # Compile an Aesara function for the reference implementation
    cpu_conv = self.cpu_conv_class(
        border_mode=border_mode,
        subsample=subsample,
        filter_dilation=dilation,
    )
    conv_ref = cpu_conv(ref_cast(inputs), flipped_filters)
    f_ref = aesara.function([], conv_ref, mode="FAST_RUN")

    # Compare the results of the two implementations
    res_ref = f_ref()
    res = np.asarray(f())
    if algo in cudnn.deterministic_fwd_algorithms:
        # A deterministic algorithm must reproduce its own result.
        utt.assert_allclose(res, np.asarray(f()))

    atol, rtol = self.get_atol_rtol(algo, dtype, precision)

    cpu_res = alpha * res_ref
    if beta != 0:
        cpu_res = cpu_res + beta * out

    self.scale_numpy_arrays_inplace(cpu_res, res, alpha)
    utt.assert_allclose(cpu_res, res, rtol=rtol, atol=atol)