def test_reshape():
    a = tcn.CudaNdarrayType((False,))()
    b = tcn.CudaNdarrayType((False, False))()
    c = T.reshape(a, [2, 3])

    # basic
    f = theano.function([a], c, mode=mode_without_gpu)
    fv = f(cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
                                                    dtype='float32')))
    assert numpy.all(fv == numpy.asarray([[0, 1, 2], [3, 4, 5]]))

    # test that it works without inplace operations
    a_val = cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
                                                     dtype='float32'))
    a_val_copy = cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
                                                          dtype='float32'))
    b_val = cuda_ndarray.CudaNdarray(theano._asarray([[0, 1, 2], [3, 4, 5]],
                                                     dtype='float32'))

    f_sub = theano.function([a, b], c - b, mode=mode_without_gpu)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # test that it works with inplace operations
    a_val = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')
    a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')
    b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')

    f_sub = theano.function([a, b], c - b, mode=mode_without_gpu)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # verify gradient
    def just_vals(v):
        return T.Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
    utt.verify_grad(just_vals, [a_val])
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    # try to compile reference implementation without shape,
    # so we don't have to compile hundreds of versions
    conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    try:
        conv_op_di = theano.grad(conv_op.sum(), i)
        conv_op_dk = theano.grad(conv_op.sum(), k)
    except Exception:
        # compile with shape information only when needed
        conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                     ishape, kshape, mode, subsample)
        conv_op_di = theano.grad(conv_op.sum(), i)
        conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [corr_op, conv_op,
               corr_op_di, conv_op_di,
               corr_op_dk, conv_op_dk]
    try:
        conv_op_dik = theano.grad(conv_op_di.sum(), k)
        conv_op_dki = theano.grad(conv_op_dk.sum(), i)
        corr_op_dik = theano.grad(corr_op_di.sum(), k)
        corr_op_dki = theano.grad(corr_op_dk.sum(), i)
        outputs.extend([corr_op_dik, conv_op_dik,
                        corr_op_dki, conv_op_dki])
    except Exception:
        # skip if the reference implementation can't do it
        pass

    f = theano.function([i, k], outputs,
                        mode=theano_mode.excluding('conv_dnn', 'conv_gemm'))

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
                               outputs[::2], outputs[1::2],
                               ('top', 'dtop/dbottom', 'dtop/dweight',
                                'dtop/dbottom/dweight',
                                'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
        assert_allclose(a, b, rtol=1e-4)
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    conv_op_di = theano.grad(conv_op.sum(), i)
    conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [corr_op, conv_op,
               corr_op_di, conv_op_di,
               corr_op_dk, conv_op_dk]

    conv_op_dik = theano.grad(conv_op_di.sum(), k)
    conv_op_dki = theano.grad(conv_op_dk.sum(), i)
    corr_op_dik = theano.grad(corr_op_di.sum(), k)
    corr_op_dki = theano.grad(corr_op_dk.sum(), i)
    outputs.extend([corr_op_dik, conv_op_dik,
                    corr_op_dki, conv_op_dki])

    if not theano.config.blas.ldflags:
        # Some of the operations are not transferred to the GPU,
        # and without BLAS, the abstract Op will not be optimized
        # to CorrMM either, so we have to accept the use of the
        # slow Python convolution in that case.
        mode = theano_mode.excluding('AbstractConvCheck')
    else:
        mode = theano_mode

    f = theano.function([i, k], outputs, mode=mode)

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
                               outputs[::2], outputs[1::2],
                               ('top', 'dtop/dbottom', 'dtop/dweight',
                                'dtop/dbottom/dweight',
                                'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
        assert_allclose(a, b, rtol=1e-4)
def test_transfer_cuda_gpu():
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available is False:
        raise SkipTest("Can't test interaction with cuda if cuda not present")
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    c = cuda_ndarray.CudaNdarrayType((False, False))('c')

    av = theano._asarray(rng.rand(5, 4), dtype='float32')
    gv = gpuarray.array(av)
    cv = cuda_ndarray.CudaNdarray(av)
    gvs = gv[:, ::-2]
    cvs = cv[:, ::-2]

    f = theano.function([c], gpu_from_cuda(c))
    fv = f(cv)
    assert GpuArrayType.values_eq_approx(fv, gv)

    fvs = f(cvs)
    assert GpuArrayType.values_eq_approx(fvs, gvs)

    f = theano.function([g], cuda_from_gpu(g))
    fv = f(gv)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)

    fvs = f(gvs)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    conv_op_di = theano.grad(conv_op.sum(), i)
    conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [corr_op, conv_op,
               corr_op_di, conv_op_di,
               corr_op_dk, conv_op_dk]

    conv_op_dik = theano.grad(conv_op_di.sum(), k)
    conv_op_dki = theano.grad(conv_op_dk.sum(), i)
    corr_op_dik = theano.grad(corr_op_di.sum(), k)
    corr_op_dki = theano.grad(corr_op_dk.sum(), i)
    outputs.extend([corr_op_dik, conv_op_dik,
                    corr_op_dki, conv_op_dki])

    # TODO: fix when the abstractconv tests can pass debug mode.
    mode = theano_mode
    if theano.config.mode == 'DEBUG_MODE':
        mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')

    f = theano.function([i, k], outputs, mode=mode)

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
                               outputs[::2], outputs[1::2],
                               ('top', 'dtop/dbottom', 'dtop/dweight',
                                'dtop/dbottom/dweight',
                                'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]
        assert_allclose(a, b, rtol=1e-4)
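# The three conv_grad variants above are successive revisions of the same
# gradient-consistency driver. A hypothetical invocation is sketched below;
# the shapes and the choice of GpuCorrMM as the Op under test are
# illustrative assumptions, not taken from the surrounding test file.
def _example_conv_grad_call():
    conv_grad('valid', bs=2, ch=3, nf=4, rImg1=7, rImg2=5,
              rFlt1=3, rFlt2=3, subsample=(1, 1),
              op=theano.sandbox.cuda.blas.GpuCorrMM)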
def cpu_var_to_gpu_var(x):
    from theano.sandbox import cuda
    type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    name = 'gpu_%s' % x.name
    gpu_var = cuda.CudaNdarrayVariable(type=type, name=name)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
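# Hypothetical usage sketch for cpu_var_to_gpu_var (not part of the original
# module): the graph is written against the host-side proxy while the
# compiled function is fed CudaNdarray inputs through the GPU-side variable.
# Assumes a working CUDA setup and the module-level theano import.
def _example_cpu_var_to_gpu_var():
    import theano
    import theano.tensor as T
    x = T.fmatrix('x')
    gpu_x, cpu_x = cpu_var_to_gpu_var(x)
    y = cpu_x.sum()                    # build the graph on the host proxy
    f = theano.function([gpu_x], y)    # the function takes a CudaNdarray
    return f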
def SharedThunkLSTMFunc(rst, inpshape, oupshape, noot, backwards,
                        MOMENTUM, LEARN_RATE):
    print "MAKE INNER FUNCTION", inpshape, oupshape, noot, backwards
    cuda2d = cuda.CudaNdarrayType((False, False))  # T.fmatrix
    isym = SymbolLayer(cuda2d(), (100, inpshape))
    oval, l1f = BlockLSTMUnrollArrayToArray(rst, isym, oupshape,
                                            noot=noot, backwards=backwards)
    oflag = cuda2d()
    oupfunc = theano.function(
        [isym.output],
        cuda.basic_ops.as_cuda_ndarray_variable(oval.output))
    infshape = lambda x0: (x0[0][0], oupshape)

    # GRADFUNC
    g = T.sum(oval.output * oflag)
    iglist = T.grad(g, [isym.output] + l1f.params)
    olist = iglist[0]
    glist = iglist[1:]

    # Generate MOMENTUM
    mom = []
    for i in l1f.params:
        init = np.zeros_like(i.get_value())
        mom.append(theano.shared(init, name=i.name + '_momentum_ct'))

    # Additive update
    updates = []
    for i, j in zip(glist, mom):
        updates.append((j, j - i * LEARN_RATE))
    momup = []
    for i in mom:
        momup.append((i, i * MOMENTUM))
    print "MAIN UPDATES", updates

    resetmom = theano.function([], [], updates=momup)
    getgrad = theano.function(
        [isym.output, oflag],
        cuda.basic_ops.as_cuda_ndarray_variable(olist),
        updates=updates)
    sharedop = GPUSharedThunkOp(oupfunc, infshape, getgrad)

    # Make sharedlayer
    class SharedLayer(Layer, Param, CMomentum):
        def get_momentums(self):
            return []

        def __init__(self, inp, paramroot=False):
            if paramroot:
                self.params = l1f.params
                self.get_momentums = lambda: mom
            else:
                self.params = []
            self.output = sharedop(inp.output)
            self.output_shape = oval.output_shape

    return SharedLayer, resetmom
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    if direction == 'fprop':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_img, npy_kern[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid',
            subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    elif direction == 'bprop kern':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid',
            subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = numpy.array(f(
            npy_img.transpose(1, 0, 2, 3),
            npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])).transpose(
                1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)
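# Hypothetical driver call (parameter values chosen only for illustration):
# batch size 2, 3 input channels, 4 filters, 8x8 images, 3x3 filters,
# unit subsampling, checking the forward pass of GpuCorrMM against py_conv.
def _example_gemm_directly_call():
    gemm_directly(bs=2, ch=3, nf=4, rImg1=8, rImg2=8,
                  rFlt1=3, rFlt2=3, subsx=1, subsy=1, direction='fprop')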
def make_node(self, img):
    assert hasattr(self, '_props'), "Your version of theano is too old " \
        "to support __props__."
    # Theano's CudaNdarray supports strides, but using them would require
    # writing C code that calls the functions of
    # sandbox/cuda/cuda_ndarray.cuh and passes all the strides to the
    # kernel to do the correct computation. Instead, enforce contiguous
    # arrays.
    cu_img = cuda.basic_ops.gpu_contiguous(
        cuda.basic_ops.as_cuda_ndarray_variable(img))
    assert cu_img.dtype == 'float32'
    # N x nchannels x nbins
    output = cuda.CudaNdarrayType(dtype='float32',
                                  broadcastable=[False, False, False])()
    return theano.Apply(self, [cu_img], [output])
def make_node(self, img, kern):
    img = cuda.basic_ops.gpu_contiguous(
        cuda.basic_ops.as_cuda_ndarray_variable(img))
    kern = cuda.basic_ops.gpu_contiguous(
        cuda.basic_ops.as_cuda_ndarray_variable(kern))
    if img.type.ndim != 5:
        raise TypeError('img must be 5D tensor')
    if kern.type.ndim != 5:
        raise TypeError('kern must be 5D tensor')
    broadcastable = [kern.type.broadcastable[-1], False, False, False,
                     img.type.broadcastable[-1]]
    return theano.Apply(self, [img, kern],
                        [cuda.CudaNdarrayType(broadcastable)()])
def make_node(self, img, topgrad, shape):
    img = cuda.basic_ops.as_cuda_ndarray_variable(img)
    topgrad = cuda.basic_ops.as_cuda_ndarray_variable(topgrad)
    if img.type.ndim != 5:
        raise TypeError('img must be 5D tensor')
    if topgrad.type.ndim != 5:
        raise TypeError('topgrad must be 5D tensor')
    depth_height_width = [shape[0], shape[1], shape[2]]
    broadcastable = [img.type.broadcastable[0], False, False, False,
                     topgrad.type.broadcastable[0]]
    return theano.Apply(self, [img, topgrad] + depth_height_width,
                        [cuda.CudaNdarrayType(broadcastable)()])
def test_elemwise_collapse6():
    """ Test when all inputs have two broadcastable dimensions at the
    beginning """

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((True, True, False, False))()
    f = pfunc([b], [a3 + b], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(1, 1, shape[0], shape[1]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v)
def test_elemwise_collapse2():
    """ Test when only one input has one broadcastable dimension """

    shape = (4, 5, 9)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 5, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
def test_elemwise_collapse4():
    """ Test when only one input has two broadcastable dimensions, one at
    each end, and we add a scalar """

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 0, 1, 'x')
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v + 2)
def speed_elemwise_collapse():
    """ Used to time whether the collapse of c-contiguous dims is useful """

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, ::2, :, :]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, ::2, :, :]
    v = cuda_ndarray.CudaNdarray(v)
    t1 = time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
    t2 = time.time()
def test_elemwise_collapse():
    """ Test when all inputs have one (and the same) broadcastable
    dimension """

    shape = (4, 5, 60)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, True, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 1, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    if False:
        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
    print "Expected collapse of all dimensions"
def test_elemwise_collapse5():
    """ Test when only one input has two broadcastable dimensions at the
    beginning and we add a scalar """

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(5, 4, shape[0], shape[1]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    if False:
        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v + 2)
    print "Expected collapse to 2 dimensions"
def speed_elemwise_collapse2():
    """ Used to test the speedup of the generalised collapse of
    c-contiguous dims """

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, :, :, ::2]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, :, :, ::2]
    v = cuda_ndarray.CudaNdarray(v)
    for id, n in enumerate(f.maker.env.toposort()):
        print id, n
    t1 = time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
    t2 = time.time()
# test with broadcast
for shape, pattern in [((5,), [0]),
                       ((5, 4), [0, 1]), ((5, 4), [0]),
                       ((5, 4, 3), [0]), ((5, 4, 3), [0, 1]),
                       ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]),
                       ((5, 4, 3, 2), [0, 1, 2, 3]),
                       ((5, 4, 3, 2), [0, 2, 3])]:
    op = careduce_op(scalar_op, axis=pattern)
    pat = tensor_pattern_to_gpu_pattern(shape, pattern)
    # GpuCAReduce{maximum} supports only those patterns
    if scalar_op is theano.scalar.maximum and pat not in [
            (0, 1), (0, 1, 1), (0, 1, 1, 1)]:
        continue

    shape = numpy.asarray(shape) * 2
    a = tensor.TensorType('float32', (False,) * len(shape))()
    a2 = tcn.CudaNdarrayType((False,) * len(shape))()
    b = op(a)
    b2 = op(a2)
    val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
    # val = numpy.ones(shape)
    # val = numpy.arange(numpy.prod(shape)).reshape(shape)
    val = theano._asarray(val, dtype='float32')
    val2 = cuda.CudaNdarray(val)
    if len(shape) == 1:
        val = val[::2]
        val2 = val2[::2]
    elif len(shape) == 2:
        val = val[::2, ::2]
        val2 = val2[::2, ::2]
    elif len(shape) == 3:
        val = val[::2, ::2, ::2]
def _params_allgood(ishape, kshape, mode, subsample=(1, 1),
                    img_stride=(1, 1), kern_stride=(1, 1),
                    version=-1, verbose=0, random=True, print_=None,
                    id=None, rtol=1e-5, atol=1e-8, nb_iter=0, ones=False,
                    compile_kshp=None, theano_mode=None, cls=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) the shape of the kernel at run time.
    # :param compile_kshp: (2d) hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(
            numpy.arange(numpy.prod(kshape)).reshape(kshape),
            dtype='float32') + 1)

    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()
    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                          subsample=subsample,
                                          version=version,
                                          verbose=verbose,
                                          kshp=compile_kshp)(i, k)
    f = theano.function([i, k], op, mode=theano_mode)
    if cls is not None:
        assert any([isinstance(node.op, cls)
                    for node in f.maker.fgraph.toposort()]), \
            "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
    t2 = time.time()
    gpuval = f(img, kern)
    t3 = time.time()
    for i in range(nb_iter):
        gpuval2 = f(img, kern)
        assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
    gpuval = numpy.asarray(gpuval)

    # CPU val computed after GPU val to get the GPU errors.
    t0 = time.time()
    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()

    assert gpuval.shape == cpuval.shape, ("shape mismatch",
                                          gpuval.shape, cpuval.shape)
    assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
    assert numpy.all(numpy.isfinite(gpuval)), gpuval
    assert [(sh == 1) is br
            for sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2])]

    if (t2 is not None):
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t3 - t2)
        if verbose > 0:
            print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
                  file=sys.stdout)
            print('%12.5f %7.2f %7.2f %7.1f' % (
                approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t3 - t2)),
                file=sys.stdout)
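# Hypothetical direct call to _params_allgood (the shapes and stride are
# chosen only for illustration, not taken from any existing driver): a
# 'full'-mode convolution with a strided image, checked against py_conv.
def _example_params_allgood_call():
    _params_allgood(ishape=(2, 3, 8, 8), kshape=(4, 3, 3, 3), mode='full',
                    subsample=(1, 1), img_stride=(2, 1), kern_stride=(1, 1),
                    theano_mode=theano_mode)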
def output_type(self, inp):
    return cuda.CudaNdarrayType(broadcastable=[False] * (inp.type.ndim))
def output_type(self, inp):
    return cuda.CudaNdarrayType(broadcastable=[False, False])
def cpu_to_gpu_var(x):
    type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    name = gpu_name(x.name)
    gpu_var = cuda.CudaNdarrayVariable(type=type, name=name)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
from theano import tensor
from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng

# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

# needed as the gpu conv ops don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')

cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda.use("gpu",
             force=False,
             default_to_move_computation_to_gpu=False,
             move_shared_float32_to_gpu=False,
             enable_cuda=False,
             test_driver=True)
    device_id = theano.sandbox.cuda.use.device_number

cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
def output_type(self, inp):
    # add one extra dim for real/imag
    return cuda.CudaNdarrayType(
        broadcastable=[False] * (inp.type.ndim + 1))
def output_type(self, inp):
    # remove extra real/imag dim
    return cuda.CudaNdarrayType(
        broadcastable=[False] * (inp.type.ndim - 1))
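# The output_type helpers above (with and without the trailing real/imag
# dimension) are typically consumed from an Op's make_node. A minimal,
# hypothetical sketch of that pattern follows; the class name and body are
# illustrative, do not reproduce any specific Op from this file, and omit
# perform/c_code. It assumes the module-level theano and cuda imports.
class _ExampleGpuOp(theano.Op):
    def output_type(self, inp):
        return cuda.CudaNdarrayType(
            broadcastable=[False] * inp.type.ndim)

    def make_node(self, inp):
        inp = cuda.basic_ops.as_cuda_ndarray_variable(inp)
        return theano.Apply(self, [inp], [self.output_type(inp)()])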
def test_sum():
    """
    test sum pattern 1, 11, 10, 01, 100, 110, 011, 001, 111,
    0011, 0101, 0111, 1011, 1111

    test sum pattern implemented with reshape:
    1000, 0100, 0010, 0001, 11111

    others implemented by reshape that are not tested
    0011, 0101, 0110, 1001, 1010, 1100
    1110, 1101, 1011

    TODO: test with broadcast
    """
    for shape, pattern in [
            ((100,3,1300),[1]),
            ((0,),[0]), ((5,),[0]),
            ((0,0),[0,1]), ((1,0),[0,1]), ((5,4),[0,1]), ((33,31),[0,1]), ((5,4),[1]), ((5,4),[0]),  # need something bigger than 32 for some opt test.
            ((5,4,3),[0]), ((5,4,3),[1]), ((5,4,3),[0,1]), ((5,4,3),[2]), ((5,4,3),[1,2]), ((5,4,3),[0,1,2]),
            ((0,0,0,0),[0,1,2,3]),
            ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]), ((5,4,3,2),[1,2,3]),
            ((5,4,3,10,11),[1,2]),
            ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]), ((5,4,3,2),[1,2,3]),
            # test shapes bigger than 4096 on each dimension to make sure
            # that we work correctly when we don't have enough
            # threads/blocks in each dimension
            ((4100,3),[0]), ((3,4101),[0]),  # 10
            ((1024,33),[0]), ((33,1024),[0]),  # 10
            ((1025,33),[0]), ((33,1025),[0]),  # 10
            ((4100,3),[1]), ((3,4101),[1]),  # 01
            ((1024,33),[1]), ((33,1024),[1]),  # 01
            ((1025,33),[1]), ((33,1025),[1]),  # 01
            ((4100,3),[0,1]), ((3,4101),[0,1]),  # 11
            ((1024,33),[0,1]), ((33,1024),[0,1]),  # 01
            ((1025,33),[0,1]), ((33,1025),[0,1]),  # 01
            ((4100,4,3),[0]), ((5,4100,3),[0]), ((5,4,4100),[0]),  # 100
            ((4100,4,3),[1]), ((5,4100,3),[1]), ((5,4,4100),[1]),  # 010
            ((4100,4,3),[2]), ((5,4100,3),[2]), ((5,4,4100),[2]),  # 001
            ((4100,4,3),[0,1]), ((5,4100,3),[0,1]), ((5,4,4100),[0,1]),  # 110
            ((4100,4,3),[1,2]), ((5,4100,3),[1,2]), ((5,4,4100),[1,2]),  # 011
            # ((4100,4,3),[0,2]), ((5,4100,3),[0,2]), ((5,4,4100),[0,2]),  # 101  ##not implemented
            ((4100,4,3),[0,1,2]), ((5,4100,3),[0,1,2]), ((5,4,4100),[0,1,2]),  # 111
            ((4100,4,3,2),[2,3]), ((4,4100,3,2),[2,3]), ((4,3,4100,2),[2,3]), ((4,3,2,4100),[2,3]),  # 0011
            ((4100,4,3,2),[1,3]), ((4,4100,3,2),[1,3]), ((4,3,4100,2),[1,3]), ((4,3,2,4100),[1,3]),  # 0101
            ((4100,4,3,2),[0,2,3]), ((4,4100,3,2),[0,2,3]), ((4,3,4100,2),[0,2,3]),  # ((4,3,2,4100),[0,2,3]),  # 1011
            ((4100,4,3,2),[1,2,3]), ((4,4100,3,2),[1,2,3]), ((4,3,4100,2),[1,2,3]), ((4,3,2,4100),[1,2,3]),  # 0111
            ((4100,2,3,4),[0,1,2,3]), ((2,4100,3,4),[0,1,2,3]), ((2,3,4100,4),[0,1,2,3]), ((2,3,4,4100),[0,1,2,3]),  # 1111
            # test patterns implemented by reshape
            ((4100,4,3,2),[0]), ((4,4100,3,2),[0]), ((4,3,4100,2),[0]), ((4,3,2,4100),[0]),  # 1000
            ((4100,4,3,2),[1]), ((4,4100,3,2),[1]), ((4,3,4100,2),[1]), ((4,3,2,4100),[1]),  # 0100
            ((4100,4,3,2),[2]), ((4,4100,3,2),[2]), ((4,3,4100,2),[2]), ((4,3,2,4100),[2]),  # 0010
            ((4100,4,3,2),[3]), ((4,4100,3,2),[3]), ((4,3,4100,2),[3]), ((4,3,2,4100),[3]),  # 0001
            ((1100,2,3,4,5),[0,1,2,3,4]), ((2,1100,3,4,5),[0,1,2,3,4]), ((2,3,1100,4,5),[0,1,2,3,4]), ((2,3,4,1100,5),[0,1,2,3,4]), ((2,3,4,5,1100),[0,1,2,3,4]),  # 11111
            ]:
        a = tensor.TensorType('float32', (False,) * len(shape))()
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        f = theano.function([a], b, mode=mode_with_gpu)
        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__
                              for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        if val.size == 0:
            assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
        else:
            try:
                # We raise the error threshold as we sum big matrices and
                # this causes small rounding differences with some seeds
                # (for example in debug mode with unittests.rseed=9275).
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
                assert _allclose(f2(val), f(val)), (
                    'shape', shape, 'pattern', pattern,
                    sum([shape[i] for i in pattern]))
            finally:
                theano.tensor.basic.float32_rtol = orig_rtol

    # test with dimshuffle
    # we shuffle the 2 outer dims.
    for shape, pattern in [  # ((5,),[0]),
            ((5,4),[0,1]), ((5,4),[0]),
            ((5,4,3),[0]), ((5,4,3),[0,1]), ((5,4,3),[2]), ((5,4,3),[0,1,2]),
            ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        a = tensor.TensorType('float32', (False,) * len(shape))()
        dim_pattern = range(len(shape))
        dim_pattern[0] = 1
        dim_pattern[1] = 0
        a = a.dimshuffle(dim_pattern)
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        f = theano.function([a], b, mode=mode_with_gpu)
        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__
                              for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert _allclose(f2(val), f(val)), (
            'shape', shape, 'pattern', pattern,
            sum([shape[i] for i in pattern]))

    # test with broadcast
    for shape, pattern in [((5,),[0]),
            ((5,4),[0,1]), ((5,4),[0]),
            ((5,4,3),[0]), ((5,4,3),[0,1]), ((5,4,3),[2]), ((5,4,3),[0,1,2]),
            ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        shape = numpy.asarray(shape) * 2
        a = tensor.TensorType('float32', (False,) * len(shape))()
        a2 = tcn.CudaNdarrayType((False,) * len(shape))()
        b = T.Sum(pattern)(a)
        b2 = T.Sum(pattern)(a2)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        val2 = cuda.CudaNdarray(val)
        if len(shape) == 1:
            val = val[::2]
            val2 = val2[::2]
        elif len(shape) == 2:
            val = val[::2, ::2]
            val2 = val2[::2, ::2]
        elif len(shape) == 3:
            val = val[::2, ::2, ::2]
            val2 = val2[::2, ::2, ::2]
        elif len(shape) == 4:
            val = val[::2, ::2, ::2, ::2]
            val2 = val2[::2, ::2, ::2, ::2]
        f = theano.function([a], b, mode=mode_without_gpu)
        f2 = theano.function([a2], b2, mode=mode_with_gpu)
        assert tcn.GpuSum in [x.op.__class__
                              for x in f2.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert _allclose(f2(val2), f(val)), (
            'shape', shape, 'pattern', pattern,
            sum([shape[i] for i in pattern]))