def check_cudnn():
    result = {}
    try:
        result['available'] = dnn.dnn_available()
        # dnn.version() returns -1 for cuDNN v1 and a tuple (compile-time
        # version, runtime version) for later releases, so only index into
        # it when it actually is a tuple.
        ver = dnn.version()
        if isinstance(ver, tuple) and len(ver) > 0:
            result['version'] = str(ver[0])
    except Exception:
        result['available'] = False
    return result
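# A minimal usage sketch for check_cudnn (an assumption for illustration,
# not part of the original snippet): it presumes `dnn` is
# `theano.sandbox.cuda.dnn`, as in the surrounding functions.
if __name__ == '__main__':
    info = check_cudnn()
    print('cuDNN available:', info.get('available'))
    print('cuDNN version:', info.get('version', 'unknown'))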
def test_dnn_conv_grad():
    if not cuda.dnn.dnn_available() or dnn.version() == -1:
        raise SkipTest("alpha != 1.0 not supported in cudnn v1")

    b = 1
    c = 4
    f = 3
    ih = 2
    iw = 8
    kh = 2
    kw = 2
    img_val = numpy.random.random((b, c, ih, iw)).astype("float32")
    kern_val = numpy.random.random((f, c, kh, kw)).astype("float32")
    # Output shape of a 'valid' convolution: (ih - kh + 1, iw - kw + 1)
    out_val = numpy.random.random((b, f, ih - kh + 1,
                                   iw - kw + 1)).astype("float32")

    def dconv(img, kern, out):
        desc = dnn.GpuDnnConvDesc(border_mode="valid", subsample=(1, 1),
                                  conv_mode="conv")(img.shape, kern.shape)
        return dnn.GpuDnnConv()(img, kern, out, desc, alpha=0.5, beta=0.75)

    def dconvi(img, kern, out):
        desc = dnn.GpuDnnConvDesc(border_mode="valid", subsample=(1, 1),
                                  conv_mode="conv")(img.shape, kern.shape)
        return dnn.GpuDnnConvGradI()(kern, out, img, desc, alpha=-1.0,
                                     beta=0.0)

    def dconvw(img, kern, out):
        desc = dnn.GpuDnnConvDesc(border_mode="valid", subsample=(1, 1),
                                  conv_mode="conv")(img.shape, kern.shape)
        return dnn.GpuDnnConvGradW()(img, out, kern, desc, alpha=0.75,
                                     beta=-1.0)

    utt.verify_grad(dconv, [img_val, kern_val, out_val])
    utt.verify_grad(dconvi, [img_val, kern_val, out_val])
    utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_conv3d_gradi(self):
    if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
        raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
    ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
    img = ftensor5('img')
    kerns = ftensor5('kerns')
    out = ftensor5('out')
    img_val = numpy.asarray(numpy.random.rand(8, 4, 6, 7, 5),
                            dtype='float32')
    kern_vals = numpy.asarray(numpy.random.rand(9, 4, 5, 1, 2),
                              dtype='float32')

    for params in product(['valid', 'full'], [(1, 1, 1), (2, 2, 2)],
                          ['conv', 'cross']):
        out_vals = numpy.zeros(
            dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
                                           border_mode=params[0],
                                           subsample=params[1]),
            dtype='float32')
        desc = dnn.GpuDnnConvDesc(border_mode=params[0],
                                  subsample=params[1],
                                  conv_mode=params[2])(img.shape,
                                                       kerns.shape)
        conv_grad_i = dnn.GpuDnnConv3dGradI()(kerns, out, img, desc)
        self._compile_and_check([kerns, out, img], [conv_grad_i],
                                [kern_vals, out_vals, img_val],
                                dnn.GpuDnnConv3dGradI)
def __init__(self, layers, size=(2, 2), stride=None, pad=(0, 0),
             mode="max", ignore_border=True, json_param={}):
    super().__init__(layer_index=len(layers))

    self.input = layers[-1].output
    self.input_shape = layers[-1].output_shape

    self.size = json_param.get("size", size)
    self.pad = json_param.get("pad", pad)
    self.ignore_border = json_param.get("ignoreBorder", ignore_border)
    self.mode = json_param.get("mode", mode)
    self.stride = json_param.get("stride", stride)
    if self.stride is None:
        self.stride = self.size

    # output dim
    if self.ignore_border:
        h = int(math.floor((self.input_shape[2] + 2 * self.pad[0] -
                            self.size[0]) / self.stride[0])) + 1
        w = int(math.floor((self.input_shape[3] + 2 * self.pad[1] -
                            self.size[1]) / self.stride[1])) + 1
    else:
        h = int(math.ceil((self.input_shape[2] + 2 * self.pad[0]) /
                          self.stride[0]))
        w = int(math.ceil((self.input_shape[3] + 2 * self.pad[1]) /
                          self.stride[1]))

    # The theano optimizer sometimes fails to use cuDNN pooling, so select
    # it explicitly when it is available.
    use_cudnn = (dnn.dnn_available() and dnn.version() >= (4000, 4000) and
                 self.ignore_border)
    if use_cudnn:
        self.output = dnn.dnn_pool(self.input, ws=self.size, pad=self.pad,
                                   stride=self.stride, mode=self.mode)
    else:
        self.output = tensor.signal.pool.pool_2d(
            self.input, ds=self.size, padding=self.pad,
            ignore_border=self.ignore_border, st=self.stride,
            mode=self.mode)

    self.output_shape = (self.input_shape[0], self.input_shape[1], h, w)
    logging.verbose("Adding", self)
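# A minimal sketch of the pooled-output-size arithmetic used in __init__
# above (pooled_dim is a hypothetical helper, not part of the layer): with
# ignore_border=True the size is floor((in + 2*pad - window) / stride) + 1,
# otherwise ceil((in + 2*pad) / stride).
import math

def pooled_dim(in_size, window, stride, pad, ignore_border=True):
    if ignore_border:
        return int(math.floor((in_size + 2 * pad - window) / stride)) + 1
    return int(math.ceil((in_size + 2 * pad) / stride))

# Example: a 2x2 max pool with stride 2 and no padding halves a 28-pixel
# dimension: floor((28 - 2) / 2) + 1 == 14.
assert pooled_dim(28, 2, 2, 0) == 14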
def get_op_params(self):
    if self.inplace:
        inpl_def = [('CONV_INPLACE', '1')]
    else:
        inpl_def = []
    if version() == -1:
        alg_def = ('CONV_ALGO', '0')
    else:
        # it seems only this works for nd convolutions?
        alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM')
    return [alg_def] + inpl_def
def test_conv3d_gradw(self):
    if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
        raise SkipTest('CuDNN 3D convolution requires CuDNN v2')
    ftensor5 = T.TensorType(dtype="float32", broadcastable=(False,) * 5)
    img = ftensor5('img')
    kerns = ftensor5('kerns')
    out = ftensor5('out')
    img_val = numpy.asarray(numpy.random.rand(9, 2, 4, 8, 7),
                            dtype='float32')
    kern_vals = numpy.asarray(numpy.random.rand(11, 2, 3, 1, 4),
                              dtype='float32')

    for params in product(['valid', 'full'], [(1, 1, 1), (2, 2, 2)],
                          ['conv', 'cross']):
        out_vals = numpy.zeros(
            dnn.GpuDnnConv3d.get_out_shape(img_val.shape, kern_vals.shape,
                                           border_mode=params[0],
                                           subsample=params[1]),
            dtype='float32')
        # The descriptor describes the forward convolution, so it takes
        # the image and kernel shapes (not the output shape).
        desc = dnn.GpuDnnConvDesc(border_mode=params[0],
                                  subsample=params[1],
                                  conv_mode=params[2])(img.shape,
                                                       kerns.shape)
        conv_grad_w = dnn.GpuDnnConv3dGradW()(img, out, kerns, desc)
        self._compile_and_check([img, out, kerns], [conv_grad_w],
                                [img_val, out_vals, kern_vals],
                                dnn.GpuDnnConv3dGradW)
def get_conv3d_test_cases():
    # Every element of test_shapes follows the format
    # [input_shape, filter_shape, subsample]
    test_shapes = [
        [(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), (1, 1, 1)],
        [(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)],
        [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 3, 3)],
        [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
        [(8, 1, 20, 12, 15), (5, 1, 6, 12, 4), (3, 2, 1)],
        # Test with 1x1x1 filters
        [(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), (1, 1, 1)],
        # Test with dimensions larger than 1024 (thread block dim)
        [(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), (1, 1, 1)],
        [(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), (1, 1, 1)],
        [(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), (1, 1, 1)],
        [(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), (1, 1, 1)],
        [(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), (1, 1, 1)],
        [(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), (1, 1, 1)],
        # The equivalent of this caused a crash with conv2d
        [(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), (1, 1, 1)],
    ]

    # With border mode 'full', test with kernel bigger than image in
    # some/all dimensions
    test_shapes_full = [
        [(6, 2, 2, 2, 2), (4, 2, 3, 1, 1), (1, 1, 1)],
        [(6, 2, 2, 2, 2), (4, 2, 1, 3, 1), (1, 1, 1)],
        [(6, 2, 2, 2, 2), (4, 2, 1, 1, 3), (1, 1, 1)],
        [(6, 2, 2, 2, 2), (4, 2, 5, 5, 5), (1, 1, 1)],
    ]
    border_modes = ["valid", "full", (1, 2, 3), (3, 2, 1), 1, 2]
    conv_modes = ["conv", "cross"]

    if cuda.dnn.dnn_available() and dnn.version() >= (3000, 3000):
        itt = chain(product(test_shapes, border_modes, conv_modes),
                    product(test_shapes_full, ["full"], conv_modes))
    else:
        # CuDNN, before v3, did not support kernels larger than the inputs,
        # even if the original inputs were padded so they would be larger
        # than the kernels. If using a version older than v3, don't run the
        # tests with kernels larger than the unpadded inputs.
        itt = product(test_shapes, border_modes, conv_modes)

    return itt
def test_conv3d_bwd():
    if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
        raise SkipTest('CuDNN 3D convolution requires CuDNN v2')

    def run_conv3d_bwd(inputs_shape, filters_shape, subsample,
                       border_mode, conv_mode):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Compile a theano function for the CuDNN implementation
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode=border_mode, subsample=subsample,
                              conv_mode=conv_mode)
        grad_i, grad_w = theano.tensor.grad(conv.sum(), [inputs, filters])
        f = theano.function([], [grad_i, grad_w], mode=mode_with_gpu)

        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == 'conv':
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters

        # If border mode is anything but 'valid', the reference
        # implementation should operate on padded inputs
        if border_mode == 'valid':
            padded_inputs = inputs
        else:
            if border_mode == 'full':
                pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
            else:
                if isinstance(border_mode, int):
                    pad_per_dim = [border_mode] * 3
                else:
                    pad_per_dim = border_mode

            pad_before_after = ([(0, 0), (0, 0)] +
                                [(p, p) for p in pad_per_dim])
            padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
                                          'constant')
            padded_inputs = shared(padded_inputs_val)

        # Compile a theano function for the reference implementation
        conv_ref = theano.tensor.nnet.conv3D(
            V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
            W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)
        (grad_padded_i_ref,
         grad_w_ref) = theano.tensor.grad(conv_ref.sum(),
                                          [padded_inputs, filters])

        # Recover grad_i_ref from grad_padded_i_ref
        if border_mode == 'valid':
            grad_i_ref = grad_padded_i_ref
        else:
            shp = grad_padded_i_ref.shape
            grad_i_ref = grad_padded_i_ref[
                :, :,
                pad_per_dim[0]:shp[2] - pad_per_dim[0],
                pad_per_dim[1]:shp[3] - pad_per_dim[1],
                pad_per_dim[2]:shp[4] - pad_per_dim[2]]

        f_ref = theano.function([], [grad_i_ref, grad_w_ref])

        # Compare the results of the two implementations
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref[0], res[0])
        utt.assert_allclose(res_ref[1], res[1])

    test_cases = get_conv3d_test_cases()
    for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
        yield (run_conv3d_bwd, i_shape, f_shape, subsample, border_mode,
               conv_mode)
def test_conv3d_fwd():
    if not (cuda.dnn.dnn_available() and dnn.version() >= (2000, 2000)):
        raise SkipTest('CuDNN 3D convolution requires CuDNN v2')

    def run_conv3d_fwd(inputs_shape, filters_shape, subsample,
                       border_mode, conv_mode):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        # Scale down the input values to prevent very large absolute
        # errors due to float rounding
        inputs_val /= 10
        filters_val /= 10

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Compile a theano function for the CuDNN implementation
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode=border_mode, subsample=subsample,
                              conv_mode=conv_mode)
        f = theano.function([], conv, mode=mode_with_gpu)

        # If conv_mode is 'conv' the reference implementation should use
        # filters flipped according to the width, height and time axis
        if conv_mode == 'conv':
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters

        # If border mode is anything but 'valid', the reference
        # implementation should operate on padded inputs
        if border_mode == 'valid':
            padded_inputs = inputs
        else:
            if border_mode == 'full':
                pad_per_dim = [filters_shape[i] - 1 for i in range(2, 5)]
            else:
                if isinstance(border_mode, int):
                    pad_per_dim = [border_mode] * 3
                else:
                    pad_per_dim = border_mode

            pad_before_after = ([(0, 0), (0, 0)] +
                                [(p, p) for p in pad_per_dim])
            padded_inputs_val = numpy.pad(inputs_val, pad_before_after,
                                          'constant')
            padded_inputs = shared(padded_inputs_val)

        # Compile a theano function for the reference implementation
        conv_ref = theano.tensor.nnet.conv3D(
            V=padded_inputs.dimshuffle(0, 2, 3, 4, 1),
            W=flipped_filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)
        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))

        # Compare the results of the two implementations
        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    test_cases = get_conv3d_test_cases()
    for (i_shape, f_shape, subsample), border_mode, conv_mode in test_cases:
        yield (run_conv3d_fwd, i_shape, f_shape, subsample, border_mode,
               conv_mode)
def local_dnn3d_convi_alpha_merge(node, *inputs):
    if not dnn_available() or version() == -1:
        return None
    return [GpuDnn3dConvGradI()(*inputs)]
def c_code_cache_version(self):
    return (2, version())