Example #1
def local_gpua_avg_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride,
                                         pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, out_grad_padded,
                                               out_grad_padded, ws, stride,
                                               pad)
        return unpad_dims(ret_padded, inp, 2, nd)
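
The `pad_dims`/`unpad_dims` pair used above reshapes the input so the cuDNN kernel always sees exactly two non-pooling dimensions and restores the original shape afterwards. Below is a minimal NumPy sketch of that reshape idea with hypothetical helper names; it is not the library implementation, and it assumes extra leading dimensions are simply collapsed into the first axis:

import numpy as np

def pad_dims_sketch(x, leftdims, rightdims):
    # Give `x` exactly leftdims + rightdims dimensions so that a kernel
    # expecting two non-pooling dimensions can be reused.
    if x.ndim == leftdims + rightdims:
        return x, x.shape
    img_shape = x.shape[-rightdims:]
    non_pool = x.shape[:-rightdims]
    if len(non_pool) < leftdims:
        # too few leading dimensions: pad with size-1 axes on the left
        new_lead = (1,) * (leftdims - len(non_pool)) + non_pool
    else:
        # too many leading dimensions: collapse the extras into the first axis
        split = len(non_pool) - (leftdims - 1)
        new_lead = (int(np.prod(non_pool[:split])),) + non_pool[split:]
    return x.reshape(new_lead + img_shape), x.shape

def unpad_dims_sketch(y, original_shape, rightdims):
    # Restore the original non-pooling dimensions, keeping the (possibly
    # pooled) trailing dimensions of `y`.
    return y.reshape(original_shape[:-rightdims] + y.shape[-rightdims:])

x = np.zeros((3, 4, 5, 8, 8))             # three non-pooling dims, 2-D pooling
x4, shp = pad_dims_sketch(x, 2, 2)        # shape (12, 5, 8, 8)
restored = unpad_dims_sketch(x4, shp, 2)  # shape (3, 4, 5, 8, 8) again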
Example #2
    def make_node(self, inp1, inp2):
        if not cusolver_available:
            raise RuntimeError("CUSOLVER is not available and "
                               "GpuCusolverSolve Op can not be constructed.")
        if skcuda.__version__ <= "0.5.1":
            warnings.warn(
                "The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8"
            )
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == inp2.dtype

        return Apply(
            self,
            [inp1, inp2],
            [
                GpuArrayType(
                    inp1.dtype,
                    broadcastable=inp1.broadcastable,
                    context_name=context_name,
                )()
            ],
        )
Example #3
def local_gpua_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return GpuDnnPoolGrad(mode=mode)(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, out_padded,
                                               out_grad_padded, ws, stride,
                                               pad)
        return unpad_dims(ret_padded, inp, 2, nd)
Example #4
    def make_node(self, inp, out, out_grad, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp, out, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out = as_gpuarray_variable(out, ctx_name)
        assert out_grad.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out.ndim == nd + 2

        assert out_grad.ndim == inp.ndim
        assert inp.ndim == out.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, out, out_grad, ws, stride, pad], [inp.type()])
Example #5
    def make_node(self, inp1, inp2):
        if not cublas_available:
            raise RuntimeError("CUBLAS is not available and "
                               "GpuCublasTriangularSolve Op "
                               "can not be constructed.")
        context_name = infer_context_name(inp1, inp2)

        inp1 = as_gpuarray_variable(inp1, context_name)
        inp2 = as_gpuarray_variable(inp2, context_name)

        inp1 = gpu_contiguous(inp1)
        inp2 = gpu_contiguous(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim in [1, 2]
        assert inp1.dtype == inp2.dtype

        return Apply(
            self,
            [inp1, inp2],
            [
                GpuArrayType(
                    inp1.dtype,
                    broadcastable=inp2.broadcastable,
                    context_name=context_name,
                )()
            ],
        )
Example #6
    def make_node(self, inp, out_grad, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp, out_grad)
        nd = self.ndim
        inp = as_gpuarray_variable(inp, ctx_name)
        assert inp.ndim == nd + 2
        out_grad = as_gpuarray_variable(out_grad, ctx_name)
        assert out_grad.ndim == nd + 2

        assert out_grad.ndim == inp.ndim

        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.mode == "average_exc_pad":
                raise ValueError("Padding must be zero for average_exc_pad")
        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])
Example #7
    def make_node(self, x, b, y_idx):
        ctx_name = infer_context_name(x, b, y_idx)
        x = as_gpuarray_variable(x, ctx_name)
        b = as_gpuarray_variable(b, ctx_name)
        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        nll = GpuArrayType(x.type.dtype,
                           y_idx.type.broadcastable,
                           context_name=ctx_name)()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])
Example #8
    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        ctx = infer_context_name(o, x, y)
        one = aet.constant(np.asarray(1.0, dtype="float32"))
        o = as_gpuarray_variable(o, ctx)
        x = as_gpuarray_variable(x, ctx)
        y = as_gpuarray_variable(y, ctx)
        xIdx = as_tensor_variable(xIdx)
        yIdx = as_tensor_variable(yIdx)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha], [o.type()])
Example #9
    def make_node(self, inp, ws, stride=None, pad=None):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        nd = self.ndim
        assert inp.ndim == nd + 2
        if stride is None:
            stride = ws
        if pad is None:
            pad = (0, ) * nd
        elif isinstance(pad, (tuple, list)):
            if max(pad) != 0 and not self.ignore_border:
                raise ValueError("Padding works only with ignore_border=True")
            if isinstance(ws, (tuple, list)):
                if any(pad[i] >= ws[i] for i in range(nd)):
                    raise ValueError("Padding must be smaller than strides")

        ws = as_tensor_variable(ws)
        stride = as_tensor_variable(stride)
        pad = as_tensor_variable(pad)
        assert ws.ndim == stride.ndim and ws.ndim == pad.ndim
        assert ws.ndim == 1
        if ws.dtype not in int_dtypes:
            raise TypeError("Window shape parameters must be ints.")
        if stride.dtype not in int_dtypes:
            raise TypeError("Stride parameters must be ints.")
        if pad.dtype not in int_dtypes:
            raise TypeError("Padding parameters must be ints.")

        ws = aesara.tensor.cast(ws, "int64")
        stride = aesara.tensor.cast(stride, "int64")
        pad = aesara.tensor.cast(pad, "int64")

        return Apply(self, [inp, ws, stride, pad], [inp.type()])
Example #10
    def make_node(self, inp, s=None):
        # A shape parameter s can be provided as an input. For now this is used to
        # manage odd transform sizes.
        # Later this could be extended to handle padding and truncation,
        # following numpy's interface. However, cuFFT expects arrays that match
        # the shape given to the plan, so padding will have to be done in the op.
        # The effect of padding on gradients has yet to be investigated.

        if not skcuda_available:
            raise RuntimeError("skcuda is needed for CuFFTOp")

        if not pygpu_available:
            raise RuntimeError("pygpu is needed for CuFFTOp")

        if not pycuda_available:
            raise RuntimeError("pycuda is needed for CuFFTOp")

        inp = gpu_contiguous(as_gpuarray_variable(inp, infer_context_name(inp)))

        # If no shape is provided as input, default to input data shape.
        if s is None:
            s = inp.shape[1:]
        s = as_tensor_variable(s)

        assert inp.dtype == "float32"
        assert s.ndim == 1
        assert s.dtype in integer_dtypes

        return Apply(self, [inp, s], [self.output_type(inp)()])
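
The comment above says the explicit shape `s` exists to manage odd transform sizes. One way to see why a real-to-complex transform needs the shape carried around is NumPy's analogous interface (this is only an illustration of the ambiguity, not the op's implementation): a last axis of length n maps to n // 2 + 1 complex bins, so even and odd lengths can collide.

import numpy as np

a_even = np.random.rand(4, 8).astype("float32")  # even last axis
a_odd = np.random.rand(4, 9).astype("float32")   # odd last axis

# both inputs produce 8 // 2 + 1 == 9 // 2 + 1 == 5 output bins ...
print(np.fft.rfftn(a_even).shape)  # (4, 5)
print(np.fft.rfftn(a_odd).shape)   # (4, 5)

# ... so the inverse transform needs an explicit shape to recover odd sizes.
print(np.fft.irfftn(np.fft.rfftn(a_odd), s=a_odd.shape).shape)  # (4, 9)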
Example #11
    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        if self.compute_uv:
            return Apply(
                self,
                [A],
                # return S, U, VT
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)(),
                    A.type(),
                    A.type(),
                ],
            )
        else:
            return Apply(
                self,
                [A],
                # return only S
                [
                    GpuArrayType(A.dtype,
                                 broadcastable=[False],
                                 context_name=ctx_name)()
                ],
            )
Example #12
    def make_node(self, ten4, neib_shape, neib_step=None):
        ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
        neib_shape = tt.as_tensor_variable(neib_shape)
        if neib_step is None:
            neib_step = neib_shape
        else:
            neib_step = tt.as_tensor_variable(neib_step)

        assert ten4.ndim == 4
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in tt.integer_dtypes
        assert neib_step.dtype in tt.integer_dtypes

        return Apply(
            self,
            [ten4, neib_shape, neib_step],
            [
                GpuArrayType(
                    broadcastable=(False, False),
                    dtype=ten4.type.dtype,
                    context_name=ten4.type.context_name,
                )()
            ],
        )
Example #13
def local_cudnn_maxandargmax(fgraph, node):
    if not isinstance(node.op, GpuMaxAndArgmax):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    if node.inputs[0].dtype != node.outputs[0].dtype:
        return

    if node.inputs[0].dtype not in ("float16", "float32", "float64"):
        return

    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(
            node.op.axis)) != node.op.axis:
        return

    max, arg = GpuDnnReduction("maximum", node.op.axis, node.outputs[0].dtype,
                               node.outputs[0].dtype, True)(node.inputs[0])

    # cuDNN can only return int32 indices, so cast them back to int64
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"),
                             node.outputs[1].type.context_name),
    )
Example #14
    def make_node(self, o, W, h, inputIdx, outputIdx):
        ctx = infer_context_name(o, W, h)
        o = as_gpuarray_variable(o, ctx)
        W = as_gpuarray_variable(W, ctx)
        h = as_gpuarray_variable(h, ctx)
        inputIdx = as_tensor_variable(inputIdx)
        outputIdx = as_tensor_variable(outputIdx)
        assert o.ndim == 3
        assert W.ndim == 4
        assert h.ndim == 3
        assert inputIdx.ndim == 2
        assert outputIdx.ndim == 2

        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        return Apply(self, [o, W, h, inputIdx, outputIdx], [o.type()])
Example #15
def local_softmax_dnn(fgraph, node):
    if isinstance(node.op, GpuSoftmax):
        if not dnn_available(node.outputs[0].type.context_name):
            return
        ins = node.inputs[0].dimshuffle(0, 1, "x", "x")
        ins = gpu_contiguous(ins)
        out = GpuDnnSoftmax("accurate", "channel")(ins)
        out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
        return [out]
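
The rewrite above works because cuDNN's softmax expects a 4-D tensor: the two broadcastable axes added by `dimshuffle(0, 1, "x", "x")` are dummies, so a channel-mode softmax on the 4-D view equals the ordinary row-wise softmax on the original 2-D input. A quick NumPy check of that equivalence (the `softmax` helper here is written only for the illustration):

import numpy as np

def softmax(x, axis):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

x = np.random.rand(3, 5).astype("float32")

# softmax over the channel axis of the (batch, channel, 1, 1) view ...
y4 = softmax(x[:, :, None, None], axis=1)

# ... matches the plain 2-D softmax once the dummy axes are dropped again
assert np.allclose(y4[:, :, 0, 0], softmax(x, axis=1))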
Example #16
    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        A = gpu_contiguous(A)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != "float32":
            raise TypeError("only `float32` is supported for now")
        return Apply(self, [A], [A.type()])
Example #17
    def make_node(self, pvals, unis):
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        assert pvals.dtype in ["float32", "float16", "float64"]
        assert unis.dtype in ["float32", "float16", "float64"]

        if pvals.ndim != 2:
            raise NotImplementedError("pvals ndim should be 2", pvals.ndim)
        if unis.ndim != 1:
            raise NotImplementedError("unis ndim should be 1", unis.ndim)
        if self.odtype == "auto":
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = GpuArrayType(broadcastable=br, dtype=odtype, context_name=ctx_name)()

        return Apply(self, [pvals, unis], [out])
Example #18
    def make_node(self, rstate, size):
        # error checking slightly redundant here, since
        # this op should not be called directly.
        #
        # call through MRG_RandomStream instead.
        broad = []
        for i in range(self.output_type.ndim):
            broad.append(aet.extract_constant(size[i]) == 1)
        output_type = self.output_type.clone(broadcastable=broad)()
        rstate = as_gpuarray_variable(rstate, infer_context_name(rstate))
        return Apply(self, [rstate, size], [rstate.type(), output_type])
Example #19
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    ins = []
    for n in inputs:
        n = as_gpuarray_variable(n, ctx_name)
        if n.ndim != 2:
            return
        ins.append(n.dimshuffle(0, "x", 1, "x"))

    out = GpuDnnSoftmaxGrad("accurate", "instance")(gpu_contiguous(ins[0]),
                                                    gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 2)]
Example #20
    def make_node(self, x):
        assert x.type.dtype == "float32", "Only float32 supported for GpuCumOp"

        context_name = infer_context_name(x)

        x = as_gpuarray_variable(x, context_name)

        if x.ndim > GpuCumOp.SUPPORTED_NDIMS:
            raise NotImplementedError("Only cum op on 1D, 2D and\
                                       3D arrays are supported right now!")

        if self.axis >= x.ndim or self.axis < -x.ndim:
            raise ValueError(f"axis(={self.axis}) out of bounds")
        return Apply(self, [x], [x.type()])
Example #21
    def make_node(self, activations, labels, input_lengths):
        context_name = infer_context_name(activations)
        t_activations = as_gpuarray_variable(activations,
                                             context_name=context_name)
        # Ensure activations array is C-contiguous
        t_activations = gpu_contiguous(t_activations)

        # Labels and input lengths are always on the CPU
        t_labels = as_tensor_variable(labels)
        t_input_lengths = as_tensor_variable(input_lengths)

        if t_activations.type.dtype != "float32":
            raise TypeError("activations must use the float32 type.")

        if t_activations.ndim != 3:
            raise ValueError("activations must have 3 dimensions.")

        if t_labels.type.dtype != "int32":
            raise TypeError("labels must use the int32 type.")

        if t_labels.ndim != 2:
            raise ValueError("labels must have 2 dimensions.")

        if t_input_lengths.type.dtype != "int32":
            raise TypeError("input_lengths must use the int32 type.")

        if t_input_lengths.ndim != 1:
            raise ValueError("input_lengths must have 1 dimension.")

        costs = GpuArrayType(dtype="float32",
                             broadcastable=(False, ),
                             context_name=context_name)()
        outputs = [costs]

        if self.compute_grad:
            gradients = GpuArrayType(
                dtype="float32",
                broadcastable=(
                    False,
                    False,
                    False,
                ),
                context_name=context_name,
            )()
            outputs += [gradients]

        return Apply(self,
                     inputs=[t_activations, t_labels, t_input_lengths],
                     outputs=outputs)
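
Taken together, the checks above pin down the expected input layout: a 3-D float32 activations array, a 2-D int32 labels array, and a 1-D int32 vector of sequence lengths. A small NumPy sketch of arrays that would pass these checks; the concrete sizes and the (time, batch, classes) ordering are assumptions for illustration, not something the checks themselves enforce:

import numpy as np

T, B, C = 50, 8, 29   # time steps, batch size, classes incl. blank (made up)
L = 15                # maximum label length (made up)

activations = np.zeros((T, B, C), dtype="float32")  # 3 dimensions, float32
labels = np.zeros((B, L), dtype="int32")            # 2 dimensions, int32
input_lengths = np.full((B,), T, dtype="int32")     # 1 dimension, int32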
Example #22
def local_gpua_cumop(op, ctx_name, inputs, outputs):
    if inputs[0].dtype != "float32":
        return False
    axis = op.axis
    x = inputs[0]
    if axis is not None and x.ndim > GpuCumOp.SUPPORTED_NDIMS:
        return False

    x = as_gpuarray_variable(x, ctx_name)

    if axis is None and x.ndim > 1:
        x = GpuReshape(1)(x, (-1, ))

    # ``gpu_cumop`` assumes the array has been flattened if needed.
    if axis is None:
        axis = 0

    return GpuCumOp(axis, op.mode)(x)
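
When `op.axis` is None the rewrite first flattens the input with `GpuReshape(1)` and then accumulates along axis 0, which matches NumPy's convention for a cumulative sum without an axis. A quick check of that equivalence:

import numpy as np

x = np.arange(12, dtype="float32").reshape(3, 4)

# cumsum with no axis operates on the flattened array ...
a = np.cumsum(x)
# ... which is the same as reshaping to 1-D and accumulating along axis 0
b = np.cumsum(x.reshape(-1), axis=0)

assert np.array_equal(a, b)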
Example #23
    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError("CUSOLVER is not available and "
                               "GpuCholesky Op can not be constructed.")
        if skcuda.__version__ <= "0.5.1":
            warnings.warn("The GpuCholesky op requires scikit-cuda > "
                          "0.5.1 to work with CUDA 8")
        if not pygpu_available:
            raise RuntimeError("Missing pygpu or triu/tril functions."
                               "Install or update libgpuarray.")
        context_name = infer_context_name(inp)

        inp = as_gpuarray_variable(inp, context_name)

        inp = gpu_contiguous(inp)

        assert inp.ndim == 2

        return Apply(self, [inp], [inp.type()])
Example #24
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return

    if version(raises=False) < 6000:
        return

    if inputs[0].ndim > 8:
        return

    if inputs[0].dtype not in ("float16", "float32", "float64"):
        return

    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return

    max, arg = GpuDnnReduction("maximum", op.axis, inputs[0].dtype,
                               inputs[0].dtype, True)(*inputs)

    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
Example #25
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        ret_padded = dnn_pool(img_padded,
                              ws,
                              stride=stride,
                              pad=pad,
                              mode=mode)
        return unpad_dims(ret_padded, img, 2, nd)
Example #26
    def make_node(self, dnll, sm, y_idx):
        ctx_name = infer_context_name(dnll, sm, y_idx)
        dnll = as_gpuarray_variable(dnll, ctx_name)
        sm = as_gpuarray_variable(sm, ctx_name)
        y_idx = as_gpuarray_variable(y_idx, ctx_name)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])
Example #27
    def make_node(self, x):
        x = as_gpuarray_variable(x, infer_context_name(x))
        return Apply(self, [x], [x.type()])
Example #28
    def make_node(self, x, b):
        ctx_name = infer_context_name(x, b)
        x = as_gpuarray_variable(x, ctx_name)
        b = as_gpuarray_variable(b, ctx_name)
        return Apply(self, [x, b], [x.type()])
Example #29
def local_abstractconv_cudnn_alt(fgraph, node):
    if not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return

    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
            isinstance(p, tuple) for p in node.op.border_mode):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if not dnn_available(inp1.type.context_name):
        return

    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])

    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"

    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None

        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )

    elif isinstance(op, AbstractConv2d_gradWeights):
        if (border_mode == "valid" and subsample == (1, 1)
                and filter_dilation == (1, 1) and num_groups == 1):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype,
                                context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)

            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad,
                                                                out, desc)
            if conv_mode == "conv":
                conv = conv[:, :, ::-1, ::-1]

            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None

    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)

            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype,
                                context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns,
                                                                out, desc)
        else:
            return None

    return [rval]
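
Both gradient branches above preallocate the output with `GpuAllocEmpty` using shapes computed by `get_conv_output_shape`. For one spatial dimension with symmetric integer padding, the relation being computed is out = (in + 2*pad - dilation*(kernel - 1) - 1) // stride + 1; the helper below is a small standalone sketch of that formula, not the library function:

def conv_out_len(in_len, kernel, stride=1, pad=0, dilation=1):
    # effective kernel extent once dilation is taken into account
    dilated = dilation * (kernel - 1) + 1
    return (in_len + 2 * pad - dilated) // stride + 1

# "valid" convolution: no padding
assert conv_out_len(32, 5) == 28
# "full" convolution: pad by kernel - 1 on each side, recovering the input size
assert conv_out_len(28, 5, pad=4) == 32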