Esempio n. 1
0
def local_cudnn_maxandargmax(fgraph, node):
    if not isinstance(node.op, GpuMaxAndArgmax):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    if node.inputs[0].dtype != node.outputs[0].dtype:
        return

    if node.inputs[0].dtype not in ("float16", "float32", "float64"):
        return

    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(
            node.op.axis)) != node.op.axis:
        return

    max, arg = GpuDnnReduction("maximum", node.op.axis, node.outputs[0].dtype,
                               node.outputs[0].dtype, True)(node.inputs[0])

    # cudnn can only return int32 indices
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"),
                             node.outputs[1].type.context_name),
    )
Esempio n. 2
0
def local_gpua_avg_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride,
                                         pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, out_grad_padded,
                                               out_grad_padded, ws, stride,
                                               pad)
        return unpad_dims(ret_padded, inp, 2, nd)
Esempio n. 3
0
def local_gpua_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return GpuDnnPoolGrad(mode=mode)(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, out_padded,
                                               out_grad_padded, ws, stride,
                                               pad)
        return unpad_dims(ret_padded, inp, 2, nd)
Esempio n. 4
0
def local_softmax_dnn(fgraph, node):
    if isinstance(node.op, GpuSoftmax):
        if not dnn_available(node.outputs[0].type.context_name):
            return
        ins = node.inputs[0].dimshuffle(0, 1, "x", "x")
        ins = gpu_contiguous(ins)
        out = GpuDnnSoftmax("accurate", "channel")(ins)
        out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
        return [out]
Esempio n. 5
0
    def apply(self, fgraph):
        """
        Raise a error if cudnn can't be used.

        """
        for c in list_contexts():
            if not dnn_available(c):
                # Make an assert error as we want Aesara to fail, not
                # just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Aesara was not able "
                    "to use it for context " + str(c) +
                    ". We got this error: \n" + dnn_available.msg)
Esempio n. 6
0
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    ins = []
    for n in inputs:
        n = as_gpuarray_variable(n, ctx_name)
        if n.ndim != 2:
            return
        ins.append(n.dimshuffle(0, "x", 1, "x"))

    out = GpuDnnSoftmaxGrad("accurate", "instance")(gpu_contiguous(ins[0]),
                                                    gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 2)]
Esempio n. 7
0
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    # Transform the input in the format expected by GpuDnnSoftmax
    inp = inputs[0]
    if inp.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return

    inp = inp.dimshuffle(0, 1, "x", "x")
    inp.tag.context_name = ctx_name

    # Apply GpuDnnSoftmax and return the result
    out = GpuDnnSoftmax("log", "channel")(gpu_contiguous(inp))
    return [out.dimshuffle(0, 1)]
Esempio n. 8
0
    def run_test_case_gi(self,
                         i,
                         f,
                         o,
                         s,
                         b,
                         flip,
                         provide_shape,
                         fd=(1, 1),
                         expect_error=False):
        if not dnn_available(test_ctx_name):
            pytest.skip(dnn_available.msg)

        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")

        mode = mode_with_gpu

        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
Esempio n. 9
0
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        if not dnn_available(test_ctx_name):
            pytest.skip(dnn_available.msg)

        mode = mode_with_gpu

        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")

        o = self.get_output_shape(i, f, s, b, fd)

        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )
Esempio n. 10
0
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return

    if version(raises=False) < 6000:
        return

    if inputs[0].ndim > 8:
        return

    if inputs[0].dtype not in ("float16", "float32", "float64"):
        return

    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return

    max, arg = GpuDnnReduction("maximum", op.axis, inputs[0].dtype,
                               inputs[0].dtype, True)(*inputs)

    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
Esempio n. 11
0
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        ret_padded = dnn_pool(img_padded,
                              ws,
                              stride=stride,
                              pad=pad,
                              mode=mode)
        return unpad_dims(ret_padded, img, 2, nd)
Esempio n. 12
0
class TestDnnConv2d(BaseTestConv2d):
    @classmethod
    def setup_class(cls):
        super().setup_class()
        cls.shared = staticmethod(gpuarray_shared_constructor)
        # provide_shape is not used by the cuDNN impementation
        cls.provide_shape = [False]

    @pytest.mark.skipif(dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        mode = mode_with_gpu

        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")

        o = self.get_output_shape(i, f, s, b, fd)

        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )

    @pytest.mark.skipif(dnn_available(test_ctx_name), reason=dnn_available.msg)
    def run_test_case_gi(
        self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False
    ):
        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")

        mode = mode_with_gpu

        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
Esempio n. 13
0
def local_abstractconv_cudnn_alt(fgraph, node):
    if not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return

    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
            isinstance(p, tuple) for p in node.op.border_mode):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if not dnn_available(inp1.type.context_name):
        return

    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])

    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"

    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None

        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )

    elif isinstance(op, AbstractConv2d_gradWeights):
        if (border_mode == "valid" and subsample == (1, 1)
                and filter_dilation == (1, 1) and num_groups == 1):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype,
                                context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)

            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad,
                                                                out, desc)
            if conv_mode == "conv":
                conv = conv[:, :, ::-1, ::-1]

            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None

    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)

            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype,
                                context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns,
                                                                out, desc)
        else:
            return None

    return [rval]
Esempio n. 14
0
def local_dnn_reduction(fgraph, node):
    if not isinstance(node.op, GpuCAReduceCuda):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)

    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[
                0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return

    if node.inputs[0].dtype not in ("float16", "float32", "float64"):
        return

    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return

    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return

    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        return a

    def _square(a):
        return GpuElemwise(aesara.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity

    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, aesara.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, aesara.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op,
                        aesara.scalar.basic.ScalarMaximum) and isinstance(
                            node.op.pre_scalar_op, aesara.scalar.basic.Abs):
            scal = "absmax"
        else:
            return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype,
                              False)(node.inputs[0])
        return [post(ret)]