def local_gpua_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    """Replace a max-pool gradient by ``GpuDnnPoolGrad`` when cuDNN applies.

    Returns ``None`` (no rewrite) when cuDNN is unavailable, the op does not
    ignore borders, or the pooling dimensionality is not 2D/3D.
    """
    if not dnn_available(ctx_name) or not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    def to_gpu(v):
        # Move to the target context and force C-contiguity for cuDNN.
        return gpu_contiguous(as_gpuarray_variable(v, ctx_name))

    inp, out, out_grad = to_gpu(inp), to_gpu(out), to_gpu(out_grad)
    pool_grad = GpuDnnPoolGrad(mode=op.mode)

    # The GPU ops expect exactly 2 non-pooling dimensions.
    if inp.ndim == nd + 2:
        return pool_grad(inp, out, out_grad, ws, stride, pad)

    # Otherwise pad up to 4D/5D with 2 non-pooling dimensions, run the
    # gradient, then strip the padding again.
    ret_padded = pool_grad(
        pad_dims(inp, 2, nd),
        pad_dims(out, 2, nd),
        pad_dims(out_grad, 2, nd),
        ws,
        stride,
        pad,
    )
    return unpad_dims(ret_padded, inp, 2, nd)
def local_gpua_avg_pool_dnn_grad_stride(fgraph, op, ctx_name, inputs, outputs):
    """Replace an average-pool gradient by ``GpuDnnPoolGrad`` when cuDNN applies."""
    if not dnn_available(ctx_name) or not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return

    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    pool_grad = GpuDnnPoolGrad(mode=op.mode)

    # The GPU ops expect exactly 2 non-pooling dimensions.
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return pool_grad(inp, out_grad, out_grad, ws, stride, pad)

    # Pad up to 4D/5D with 2 non-pooling dimensions, compute, then strip
    # the padding again.
    inp_padded = pad_dims(inp, 2, nd)
    og_padded = pad_dims(out_grad, 2, nd)
    ret_padded = pool_grad(inp_padded, og_padded, og_padded, ws, stride, pad)
    return unpad_dims(ret_padded, inp, 2, nd)
def make_node(self, inp1, inp2):
    """Build the Apply node for a triangular solve on the GPU.

    Raises
    ------
    RuntimeError
        If CUBLAS is not available.
    """
    if not cublas_available:
        raise RuntimeError(
            "CUBLAS is not available and GpuCublasTriangularSolve Op "
            "can not be constructed."
        )
    context_name = infer_context_name(inp1, inp2)

    # Move both operands to the GPU and force C-contiguity.
    inp1 = gpu_contiguous(as_gpuarray_variable(inp1, context_name))
    inp2 = gpu_contiguous(as_gpuarray_variable(inp2, context_name))

    # A must be a matrix; b may be a vector or matrix of the same dtype.
    assert inp1.ndim == 2
    assert inp2.ndim in [1, 2]
    assert inp1.dtype == inp2.dtype

    # The solution takes the broadcast pattern of the right-hand side.
    out_type = GpuArrayType(
        inp1.dtype,
        broadcastable=inp2.broadcastable,
        context_name=context_name,
    )
    return Apply(self, [inp1, inp2], [out_type()])
def make_node(self, inp1, inp2):
    """Build the Apply node for a dense linear solve via cuSOLVER.

    Raises
    ------
    RuntimeError
        If CUSOLVER is not available.
    """
    if not cusolver_available:
        raise RuntimeError(
            "CUSOLVER is not available and "
            "GpuCusolverSolve Op can not be constructed."
        )
    # NOTE(review): lexicographic string comparison — misorders versions
    # like "0.5.10"; acceptable for the range this guard targets.
    if skcuda.__version__ <= "0.5.1":
        warnings.warn(
            "The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8"
        )
    context_name = infer_context_name(inp1, inp2)

    # Move both operands to the GPU and force C-contiguity.
    inp1 = gpu_contiguous(as_gpuarray_variable(inp1, context_name))
    inp2 = gpu_contiguous(as_gpuarray_variable(inp2, context_name))

    # Both operands must be matrices of the same dtype.
    assert inp1.ndim == 2
    assert inp2.ndim == 2
    assert inp1.dtype == inp2.dtype

    out_type = GpuArrayType(
        inp1.dtype,
        broadcastable=inp1.broadcastable,
        context_name=context_name,
    )
    return Apply(self, [inp1, inp2], [out_type()])
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    """Lift a 2D softmax gradient to ``GpuDnnSoftmaxGrad`` when cuDNN applies."""
    if not dnn_available(ctx_name):
        return
    converted = []
    for var in inputs:
        var = as_gpuarray_variable(var, ctx_name)
        if var.ndim != 2:
            # Only the 2D (n, c) case is handled here.
            return
        # Insert broadcastable axes: (n, c) -> (n, 1, c, 1).
        converted.append(var.dimshuffle(0, "x", 1, "x"))
    grad_op = GpuDnnSoftmaxGrad("accurate", "instance")
    result = grad_op(gpu_contiguous(converted[0]), gpu_contiguous(converted[1]))
    # Drop the inserted axes again to recover the 2D result.
    return [result.dimshuffle(0, 2)]
def make_node(self, inp, s=None):
    """Build the Apply node for a cuFFT transform.

    A shape parameter ``s`` can be provided as an input.  For now this is
    used to manage odd transform sizes.  Later this could be extended to
    handle padding and truncation, following numpy's interface.  However,
    cuFFT expects arrays that match the shape given to the plan, so padding
    will have to be done in the op.  The effect of padding on gradients has
    yet to be investigated.
    """
    if not skcuda_available:
        raise RuntimeError("skcuda is needed for CuFFTOp")
    if not pygpu_available:
        raise RuntimeError("pygpu is needed for CuFFTOp")
    if not pycuda_available:
        raise RuntimeError("pycuda is needed for CuFFTOp")

    inp = gpu_contiguous(as_gpuarray_variable(inp, infer_context_name(inp)))

    # Default to the input's transform dimensions when no explicit shape
    # was provided.
    if s is None:
        s = inp.shape[1:]
    s = as_tensor_variable(s)

    # Only single precision input and an integer shape vector are accepted.
    assert inp.dtype == "float32"
    assert s.ndim == 1
    assert s.dtype in integer_dtypes

    return Apply(self, [inp, s], [self.output_type(inp)()])
def make_node(self, A):
    """Build the Apply node for a GPU SVD.

    Returns ``(S, U, VT)`` when ``self.compute_uv`` is true, otherwise
    only the singular values ``S``.
    """
    ctx_name = infer_context_name(A)
    A = gpu_contiguous(as_gpuarray_variable(A, ctx_name))
    if A.ndim != 2:
        raise LinAlgError("Matrix rank error")
    if A.dtype != "float32":
        raise TypeError("only `float32` is supported for now")

    # Singular values come back as a 1D vector.
    s_type = GpuArrayType(A.dtype, broadcastable=[False], context_name=ctx_name)
    if self.compute_uv:
        # return S, U, VT
        outs = [s_type(), A.type(), A.type()]
    else:
        # return only S
        outs = [s_type()]
    return Apply(self, [A], outs)
def local_softmax_dnn(fgraph, node):
    """Rewrite ``GpuSoftmax`` into cuDNN's softmax when available."""
    if not isinstance(node.op, GpuSoftmax):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    # cuDNN softmax wants a 4D tensor: (n, c) -> (n, c, 1, 1).
    expanded = gpu_contiguous(node.inputs[0].dimshuffle(0, 1, "x", "x"))
    result = GpuDnnSoftmax("accurate", "channel")(expanded)
    # Drop the two singleton axes to recover the 2D output.
    result = as_gpuarray_variable(result.dimshuffle(0, 1), result.type.context_name)
    return [result]
def make_node(self, A):
    """Build the Apply node; the output has the same type as the input matrix."""
    ctx_name = infer_context_name(A)
    A = gpu_contiguous(as_gpuarray_variable(A, ctx_name))
    # Only float32 matrices are accepted.
    if A.ndim != 2:
        raise LinAlgError("Matrix rank error")
    if A.dtype != "float32":
        raise TypeError("only `float32` is supported for now")
    return Apply(self, [A], [A.type()])
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    """Lift a 2D log-softmax to ``GpuDnnSoftmax("log", "channel")``."""
    inp = inputs[0]
    # cuDNN softmax needs 4D input; only handle the 2D (n, c) case, and
    # only when cuDNN is usable in this context.
    if inp.ndim != 2 or not dnn_available(ctx_name):
        return
    # Transform the input into the format expected by GpuDnnSoftmax.
    inp = inp.dimshuffle(0, 1, "x", "x")
    inp.tag.context_name = ctx_name
    # Apply GpuDnnSoftmax and drop the singleton axes again.
    result = GpuDnnSoftmax("log", "channel")(gpu_contiguous(inp))
    return [result.dimshuffle(0, 1)]
def make_node(self, activations, labels, input_lengths):
    """Build the Apply node for the GPU CTC cost.

    Outputs the per-sequence costs, plus the gradient w.r.t. the
    activations when ``self.compute_grad`` is enabled.
    """
    context_name = infer_context_name(activations)
    # Activations live on the GPU and must be C-contiguous.
    t_activations = gpu_contiguous(
        as_gpuarray_variable(activations, context_name=context_name)
    )
    # Labels and input lengths are always on the CPU.
    t_labels = as_tensor_variable(labels)
    t_input_lengths = as_tensor_variable(input_lengths)

    if t_activations.type.dtype != "float32":
        raise TypeError("activations must use the float32 type.")
    if t_activations.ndim != 3:
        raise ValueError("activations must have 3 dimensions.")
    if t_labels.type.dtype != "int32":
        raise TypeError("labels must use the int32 type.")
    if t_labels.ndim != 2:
        raise ValueError("labels must have 2 dimensions.")
    if t_input_lengths.type.dtype != "int32":
        raise TypeError("input_lengths must use the int32 type.")
    if t_input_lengths.ndim != 1:
        raise ValueError("input_lengths must have 1 dimension.")

    # One cost per sequence in the minibatch.
    costs = GpuArrayType(
        dtype="float32",
        broadcastable=(False,),
        context_name=context_name,
    )()
    node_outputs = [costs]
    if self.compute_grad:
        gradients = GpuArrayType(
            dtype="float32",
            broadcastable=(False, False, False),
            context_name=context_name,
        )()
        node_outputs.append(gradients)
    return Apply(
        self,
        inputs=[t_activations, t_labels, t_input_lengths],
        outputs=node_outputs,
    )
def test_gpu_contiguous():
    """GpuContiguous must appear in the graph and yield C-contiguous output."""
    a = tt.fmatrix("a")
    i = tt.iscalar("i")
    a_val = np.asarray(np.random.rand(4, 5), dtype="float32")
    # The reshape is needed otherwise we make the subtensor on the CPU
    # to transfer less data.
    f = aesara.function(
        [a, i], gpu_contiguous(a.reshape((5, 4))[::i]), mode=mode_with_gpu
    )
    topo = f.maker.fgraph.toposort()
    # Generator expressions avoid materializing a throwaway list for any().
    assert any(isinstance(node.op, GpuSubtensor) for node in topo)
    assert any(isinstance(node.op, GpuContiguous) for node in topo)
    # Both a unit-step slice and a strided slice must come back
    # C-contiguous.  (A duplicated step-2 assertion was removed.)
    assert f(a_val, 1).flags.c_contiguous
    assert f(a_val, 2).flags.c_contiguous
def make_node(self, inp):
    """Build the Apply node for a GPU Cholesky factorization.

    Raises
    ------
    RuntimeError
        If cuSOLVER is unavailable, or pygpu (with triu/tril) is missing.
    """
    if not cusolver_available:
        raise RuntimeError(
            "CUSOLVER is not available and "
            "GpuCholesky Op can not be constructed."
        )
    # NOTE(review): lexicographic string comparison — misorders versions
    # like "0.5.10"; acceptable for the range this guard targets.
    if skcuda.__version__ <= "0.5.1":
        warnings.warn(
            "The GpuCholesky op requires scikit-cuda > 0.5.1 to work with CUDA 8"
        )
    if not pygpu_available:
        # Fixed message: the original adjacent string literals concatenated
        # without a space ("functions.Install").
        raise RuntimeError(
            "Missing pygpu or triu/tril functions. "
            "Install or update libgpuarray."
        )
    context_name = infer_context_name(inp)
    inp = gpu_contiguous(as_gpuarray_variable(inp, context_name))
    # Only 2D inputs are supported; output keeps the input's type.
    assert inp.ndim == 2
    return Apply(self, [inp], [inp.type()])
def grad(self, inp, grads):
    """Gradient of the pooling op w.r.t. its image input.

    The window shape, stride and padding inputs are integer parameters,
    so they receive disconnected gradients.
    """
    img, ws, stride, pad = inp
    (g_pooled,) = grads
    g_pooled = gpu_contiguous(g_pooled)
    # ws/stride/pad carry no gradient.
    disconnected = [aesara.gradient.DisconnectedType()() for _ in inp[1:]]
    if self.mode == "max":
        # Max pooling needs the forward output to locate the selected entries.
        pooled = self(img, ws, stride, pad)
        g_img = GpuMaxPoolGrad(ndim=self.ndim, ignore_border=self.ignore_border)(
            img, pooled, g_pooled, ws, stride, pad
        )
    else:
        g_img = GpuAveragePoolGrad(
            ndim=self.ndim, ignore_border=self.ignore_border, mode=self.mode
        )(img, g_pooled, ws, stride, pad)
    return [g_img] + disconnected
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    """Lift a pooling op to ``dnn_pool`` when cuDNN is usable."""
    if not dnn_available(ctx_name) or not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    # dnn_pool expects exactly 2 non-pooling dimensions.
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=op.mode)
    # Pad up to 4D/5D with 2 non-pooling dimensions, pool, then strip
    # the added dimensions again.
    pooled = dnn_pool(
        pad_dims(img, 2, nd), ws, stride=stride, pad=pad, mode=op.mode
    )
    return unpad_dims(pooled, img, 2, nd)
def local_dnn_convi_output_merge(node, *inputs):
    """Output-merge rewrite for ``GpuDnnConvGradI``.

    Only the third input (the preallocated output buffer) needs to be made
    C-contiguous before being handed back to the cuDNN gradient op.
    """
    merged = list(inputs[:2]) + [gpu_contiguous(inputs[2])] + list(inputs[3:])
    grad_i = GpuDnnConvGradI(algo=node.op.algo, num_groups=node.op.num_groups)
    return [grad_i(*merged)]
def local_abstractconv_cudnn_alt(fgraph, node):
    """Alternative cuDNN lowering for abstract 2D convolutions.

    Rewrites AbstractConv2d / AbstractConv2d_gradWeights /
    AbstractConv2d_gradInputs nodes into cuDNN convolutions, expressing each
    one through a *different* cuDNN direction than the straightforward
    lowering would use.  Returns ``None`` whenever the node's configuration
    is not supported by this alternative path.
    """
    if not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs)):
        return
    # Dilation support requires cuDNN v6+.
    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    # Unshared (locally connected) convolutions are not handled here.
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
            isinstance(p, tuple) for p in node.op.border_mode):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if not dnn_available(inp1.type.context_name):
        return
    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])

    # filter_flip corresponds to a true convolution; otherwise cuDNN's
    # cross-correlation mode is used.
    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"

    if isinstance(op, AbstractConv2d):
        # Forward conv expressed through a backward pass, via dnn_conv's
        # direction_hint.  Only plain (no half padding, unit subsample,
        # single group) configurations qualify.
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None
        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )
    elif isinstance(op, AbstractConv2d_gradWeights):
        # Weight gradient expressed as a forward cross-correlation over
        # batch-transposed inputs.
        if (border_mode == "valid" and subsample == (1, 1)
                and filter_dilation == (1, 1) and num_groups == 1):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            # Swap the batch and channel axes so that the forward conv
            # computes the weight gradient.
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)
            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad, out, desc)
            if conv_mode == "conv":
                # Undo the implicit filter flip of true convolution.
                conv = conv[:, :, ::-1, ::-1]
            # Swap the axes back to the usual weight layout.
            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None
    elif isinstance(op, AbstractConv2d_gradInputs):
        # Input gradient expressed as a full-mode forward convolution with
        # channel-transposed kernels and the opposite conv mode.
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            # Flip the mode: the transpose of conv is cross and vice versa.
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )
            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype, context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns, out, desc)
        else:
            return None
    return [rval]