def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    # The block must match the 100-element arrays: the kernel indexes with
    # threadIdx.x and has no bounds check.
    multiply_them(dest, ga, gb,
                  block=(100, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all()
def test_copy_subtensor0():
    sizeof_float = 4
    a = theano._asarray(numpy.random.rand(30, 20, 5, 5), dtype='float32')
    cuda_a = cuda_ndarray.CudaNdarray(a)
    a_view = cuda_a.view()
    a_view_strides = a_view._strides
    a_view._set_stride(2, -a_view_strides[2])
    a_view._set_stride(3, -a_view_strides[3])
    a_view._dev_data += 24 * sizeof_float

    a_view_copy = copy.deepcopy(a_view)

    assert numpy.all(a[:, :, ::-1, ::-1] == numpy.asarray(a_view_copy))
def test_leak2():
    import theano.sandbox.cuda as cuda
    for i in xrange(1000000):
        n = numpy.asarray([2.3, 4.5], dtype='f')
        c = sys.getrefcount(n)
        a = cuda.CudaNdarray(n)
        assert c == sys.getrefcount(n)
        del a
        if not i % 1000:
            print('.', end=' ')
            print(gc.collect(), end=' ')
            print(gc.collect())
        sys.stdout.flush()
def test_elemwise_collapse5():
    """ Test when only one input has two broadcastable dimensions at the
    beginning and we add a scalar"""

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(
        theano._asarray(numpy.random.rand(*shape), dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(5, 4, shape[0], shape[1]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    if False:
        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v + 2)
    print "Expected collapse to 2 dimensions"
def test_elemwise_collapse2():
    """ Test when only one input has one broadcastable dimension """

    shape = (4, 5, 9)
    a = cuda_ndarray.CudaNdarray(
        theano._asarray(numpy.random.rand(*shape), dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 5, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    if False:
        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
    print "Expected collapse to 3 dimensions"
def thunk():
    N, nchans, winw, winh = inputs[0][0].shape
    nbins = self.nbins
    x0 = np.array([br[0] for br in self.binranges], dtype=np.float32)
    x0 = cuda.CudaNdarray(x0)
    x1 = np.array([br[1] for br in self.binranges], dtype=np.float32)
    x1 = cuda.CudaNdarray(x1)
    cx = np.array([nbins / (br[1] - br[0]) for br in self.binranges],
                  dtype=np.float32)
    cx = cuda.CudaNdarray(cx)
    z = outputs[0]
    if z[0] is None:
        # Need to allocate
        z[0] = cuda.CudaNdarray.zeros((N, nchans, nbins))
    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#thread-hierarchy
    # https://github.com/inducer/pycuda/blob/master/pycuda/gpuarray.py#L81
    grid, thread_blocks = pycuda.gpuarray.splay(N)
    pycuda_fct(inputs[0][0], z[0],
               np.intc(N), np.intc(nchans), np.intc(winw), np.intc(winh),
               np.intc(nbins), x0, x1, cx,
               block=thread_blocks, grid=grid)
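# Illustrative aside (not from the original code): pycuda.gpuarray.splay, used
# in the thunk above, heuristically picks the grid and block sizes for a 1-D
# kernel launch. The minimal sketch below shows the same idea done by hand
# with ceiling division, assuming a cap of 1024 threads per block.
def _simple_launch_config(n, max_threads_per_block=1024):
    threads = max(1, min(n, max_threads_per_block))
    blocks = (n + threads - 1) // threads  # ceiling division
    # returned in the same (grid, block) order as splay
    return (blocks, 1), (threads, 1, 1)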
def speed_elemwise_collapse2():
    """ Used to test the speed-up of the generalised collapse of
    c-contiguous dims"""

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(
        theano._asarray(numpy.random.rand(*shape), dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, :, :, ::2]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, :, :, ::2]
    v = cuda_ndarray.CudaNdarray(v)
    for id, n in enumerate(f.maker.env.toposort()):
        print id, n
    t1 = time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
    t2 = time.time()
def test_sum():
    shape = (2, 3)
    a0 = theano._asarray(numpy.arange(shape[0] * shape[1]).reshape(shape),
                         dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)

    assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1, 1])))

    a0sum = a0.sum(axis=0)
    b0sum = b0.reduce_sum([1, 0])
    # print 'asum\n', a0sum
    # print 'bsum\n', numpy.asarray(b0sum)

    assert numpy.allclose(a0.sum(axis=0),
                          numpy.asarray(b0.reduce_sum([1, 0])))
    assert numpy.allclose(a0.sum(axis=1),
                          numpy.asarray(b0.reduce_sum([0, 1])))
    assert numpy.allclose(a0, numpy.asarray(b0.reduce_sum([0, 0])))

    shape = (3, 4, 5, 6, 7, 8)
    a0 = theano._asarray(numpy.arange(3 * 4 * 5 * 6 * 7 * 8).reshape(shape),
                         dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert numpy.allclose(
        a0.sum(axis=5).sum(axis=3).sum(axis=0),
        numpy.asarray(b0.reduce_sum([1, 0, 0, 1, 0, 1])))

    shape = (16, 2048)
    a0 = theano._asarray(numpy.arange(16 * 2048).reshape(shape),
                         dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert numpy.allclose(a0.sum(axis=0),
                          numpy.asarray(b0.reduce_sum([1, 0])))

    shape = (16, 10)
    a0 = theano._asarray(numpy.arange(160).reshape(shape), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)
    assert numpy.allclose(a0.sum(), numpy.asarray(b0.reduce_sum([1, 1])))
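# Numpy-only sketch (not part of the original suite) of the reduce_sum mask
# convention exercised above: a 1 at position i means "sum over axis i", a 0
# means "keep axis i", and an all-zero mask is just a copy. For example,
# _reduce_sum_reference(a0, [1, 0]) matches a0.sum(axis=0) in the test.
def _reduce_sum_reference(x, mask):
    import numpy
    axes = tuple(i for i, m in enumerate(mask) if m)
    return x.sum(axis=axes) if axes else x.copy()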
def _test_dummy():
    ishape = (1, 1, 5, 5)
    kshape = (1, 1, 3, 3)
    mode = 'valid'
    subsample = (1, 1)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # print >> sys.stdout, '_params_allgood trying ', ishape, kshape, mode
    t2 = None
    rval = True
    t0 = time.time()
    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()
    gpuval = cuda_ndarray.conv(img, kern, mode, subsample)
    t2 = time.time()
    gpuval = numpy.asarray(gpuval)
    print gpuval
    print cpuval
def test_setitem_assign_to_slice():
    a = numpy.arange(27)
    a.resize((3, 3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([7, 8, 9], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)

    # first get a slice of a
    _c = _a[:, :, 1]

    # set middle row through cube to 7,8,9
    # (this corresponds to middle row of matrix _c)
    _c[:, 1] = _b
    a[:, :, 1][:, 1] = b
    assert numpy.allclose(a, numpy.asarray(_a))

    # test direct transfer from numpy
    _d = _a[1, :, :]
    _d[1, :] = b * 10
    a[1, :, :][1, :] = b * 10
    assert numpy.allclose(a, numpy.asarray(_a))
def test_setitem_matrix_tensor3():
    a = numpy.arange(27)
    a.resize((3, 3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([7, 8, 9], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)

    # set middle row through cube to 7,8,9
    _a[:, 1, 1] = _b
    a[:, 1, 1] = b
    assert numpy.allclose(a, numpy.asarray(_a))

    # test direct transfer from numpy
    _a[:, 1, 1] = b * 100
    a[:, 1, 1] = b * 100
    assert numpy.allclose(a, numpy.asarray(_a))

    row = theano._asarray([777, 888, 999], dtype='float32')
    _a[1, 1, :] = row
    a[1, 1, :] = row
    assert numpy.allclose(a, numpy.asarray(_a))
def cmp(a_shp, b_shp):
    a = numpy.random.randn(*a_shp).astype(numpy.float32)
    b = numpy.random.randn(*b_shp).astype(numpy.float32)

    x = tensor.ftensor3()
    y = tensor.ftensor3()
    f = theano.function([x, y], batched_dot(x, y), mode=mode_with_gpu)

    z0 = numpy.asarray(f(a, b))

    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    z1 = numpy.asarray(f(ga, gb))

    z_test = numpy.sum(a[:, :, :, None] * b[:, None, :, :], axis=-2)

    unittest_tools.assert_allclose(z0, z_test)
    unittest_tools.assert_allclose(z1, z_test)
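# Illustrative aside (not from the original file): the reference expression
# z_test used in cmp() is a batched matrix product written with broadcasting;
# in plain numpy it agrees with einsum over the shared inner axis.
def _batched_dot_reference_check():
    import numpy
    a = numpy.random.randn(2, 3, 4).astype('float32')
    b = numpy.random.randn(2, 4, 5).astype('float32')
    ref = numpy.sum(a[:, :, :, None] * b[:, None, :, :], axis=-2)
    assert numpy.allclose(ref, numpy.einsum('bij,bjk->bik', a, b), atol=1e-5)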
def test_elemwise_collapse7(atol=1e-6):
    """ Test when one input has one broadcastable dimension and the other is
    a scalar"""

    shape = (5, 4, 1)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a.copy(), 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    f = pfunc([], [a3 + 2], mode=mode_with_gpu)

    # let debugmode catch errors
    out = f()[0]
    ans = (a + 2).reshape(shape[0], 1, shape[1], shape[2])
    assert numpy.allclose(out, ans, atol=atol)
def test_setitem_matrix_bad_type():
    a = numpy.arange(27)
    a.resize((3, 3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([7, 8], dtype='float64')

    # test direct transfer from numpy
    try:
        # attempt to assign the ndarray b with setitem
        _a[1, :, :] = b
        assert False
    except TypeError, e:
        # print e
        assert True
def test_host_to_device():
    # print >>sys.stdout, 'starting test_host_to_dev'
    for shape in ((), (3,), (2, 3), (3, 4, 5, 6)):
        a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
        b = cuda_ndarray.CudaNdarray(a)
        c = numpy.asarray(b)
        assert numpy.all(a == c)

        # test with float32 dtype
        d = numpy.asarray(b, dtype='float32')
        assert numpy.all(a == d)

        # test with not float32 dtype
        try:
            numpy.asarray(b, dtype='int8')
            assert False
        except TypeError:
            pass
def test_deepcopy():
    a = cuda.fmatrix()
    a_v = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))

    # We force the c code to check that we generate c code
    mode = theano.Mode("c", mode_with_gpu.optimizer)
    f = theano.function([a], a, mode=mode)
    theano.printing.debugprint(f)
    out = f(a_v)
    assert out is not a_v
    assert numpy.allclose(numpy.asarray(a_v), numpy.asarray(out))

    # We force the python linker as the default code should work for this op
    mode = theano.Mode("py", mode_with_gpu.optimizer)
    f = theano.function([a], a, mode=mode)
    theano.printing.debugprint(f)
    out = f(a_v)
    assert out is not a_v
    assert numpy.allclose(numpy.asarray(a_v), numpy.asarray(out))
def test_output_broadcast_cuda(self):
    from theano.sandbox import cuda
    if not cuda.cuda_available:
        raise SkipTest("Optional package Cuda disabled")
    if cuda.use.device_number is None:
        # We should normally set VecAsRowAndCol as a GPUOp, but we don't
        # want to do this here as it would disable other tests in this
        # file. So we manually init the GPU if needed to remove the
        # warning.
        cuda.use("gpu",
                 force=True,
                 default_to_move_computation_to_gpu=False,
                 move_shared_float32_to_gpu=False,
                 enable_cuda=False)
    v = cuda.fvector('v')
    c, r = VecAsRowAndCol()(v)
    f = theano.function([v], [c, r])

    v_val = cuda.CudaNdarray(self.rng.randn(5).astype('float32'))
    f(v_val)
def test_nvcc_bug():
    """
    The function k_elemwise_unary_rowmajor_copy (used by cuda.copy()) in
    cuda_ndarray.cu is not compiled correctly by nvcc 3.0 and 3.1 beta.
    We found a workaround, so it should work correctly. Without the
    workaround, this test fails.
    """
    shape = (5, 4)
    aa = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a = aa[::, ::-1]

    b = cuda_ndarray.CudaNdarray(aa)[::, ::-1]
    c = copy.copy(b)
    d = copy.deepcopy(b)

    assert numpy.allclose(a, numpy.asarray(b))
    assert numpy.allclose(a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
    b += b
    assert numpy.allclose(a + a, numpy.asarray(b))
    assert numpy.allclose(a + a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
def test_copy():
    print >>sys.stdout, 'starting test_copy'
    shape = (500, 499)
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')

    print >>sys.stdout, '.. creating device object'
    b = cuda_ndarray.CudaNdarray(a)

    print >>sys.stdout, '.. copy'
    c = copy.copy(b)
    print >>sys.stdout, '.. deepcopy'
    d = copy.deepcopy(b)

    print >>sys.stdout, '.. comparisons'
    assert numpy.allclose(a, numpy.asarray(b))
    assert numpy.allclose(a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
    b += b
    assert numpy.allclose(a + a, numpy.asarray(b))
    assert numpy.allclose(a + a, numpy.asarray(c))
    assert numpy.allclose(a, numpy.asarray(d))
def test_stride_manipulation():
    a = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    v = b.view()
    v._dev_data += 0
    c = numpy.asarray(v)
    assert numpy.all(a == c)

    sizeof_float = 4
    offset = 0

    b_strides = b._strides
    for i in xrange(len(b.shape)):
        offset += (b.shape[i] - 1) * b_strides[i]
        v._set_stride(i, -b_strides[i])

    v._dev_data += offset * sizeof_float
    c = numpy.asarray(v)

    assert numpy.all(c == [[5, 4, 3], [2, 1, 0]])
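# For comparison (illustrative, numpy only, not from the original file): the
# same reversal trick is what numpy does for a[::-1, ::-1] -- the view's data
# pointer is moved to the last element and every stride is negated.
def _numpy_negative_stride_check():
    import numpy
    a = numpy.asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')
    r = a[::-1, ::-1]
    assert r.strides == tuple(-s for s in a.strides)
    assert numpy.all(r == [[5, 4, 3], [2, 1, 0]])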
def test_hostfromgpu_shape_i():
    """
    Test that the shape is lifted over hostfromgpu
    """
    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar',
                                'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))()

    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = cuda.CudaNdarray(
        numpy.asarray(numpy.random.rand(5, 4), dtype='float32'))

    f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m)
    assert cuda.basic_ops.gpu_from_host in [
        x.op for x in f.maker.env.toposort()]
    f = theano.function([a], cuda.basic_ops.gpu_from_host(a).shape, mode=m)
    topo = f.maker.env.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca), mode=m)
    assert cuda.basic_ops.host_from_gpu in [
        x.op for x in f.maker.env.toposort()]
    f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca).shape, mode=m)
    topo = f.maker.env.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
def test_dot():
    print >>sys.stdout, 'starting test_dot'
    utt.seed_rng()
    rng = numpy.random.RandomState(utt.fetch_seed())
    a0 = theano._asarray(rng.randn(4, 7), dtype='float32')
    a1 = theano._asarray(rng.randn(7, 6), dtype='float32')

    b0 = cuda_ndarray.CudaNdarray(a0)
    b1 = cuda_ndarray.CudaNdarray(a1)

    assert _allclose(numpy.dot(a0, a1), cuda_ndarray.dot(b0, b1))

    a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
    b1 = cuda_ndarray.CudaNdarray(a1)

    numpy_version = numpy.dot(a0, a1.T)
    transposed = cuda_ndarray.dimshuffle(b1, (1, 0))
    cuda_version = cuda_ndarray.dot(b0, transposed)

    assert _allclose(numpy_version, cuda_version)

    a1 = theano._asarray(rng.randn(7, 6), dtype='float32')
    b1 = cuda_ndarray.CudaNdarray(a1)

    a0 = theano._asarray(rng.randn(7, 4), dtype='float32')
    b0 = cuda_ndarray.CudaNdarray(a0)

    assert _allclose(numpy.dot(a0.T, a1),
                     cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)),
                                      b1))

    a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
    b1 = cuda_ndarray.CudaNdarray(a1)

    assert _allclose(numpy.dot(a0.T, a1.T),
                     cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)),
                                      cuda_ndarray.dimshuffle(b1, (1, 0))))
def test_setitem_broadcast():
    # test scalar to vector without stride
    a = numpy.arange(3)
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray(9, dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)
    _a[:] = _b.reshape((1,))
    a[:] = b.reshape((1,))
    assert numpy.allclose(numpy.asarray(_a), a)

    # test vector to matrix without stride
    a = numpy.arange(9)
    a.resize((3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([7, 8, 9], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)
    _a[:, :] = _b.reshape((1, 3))
    a[:, :] = b.reshape((1, 3))
    assert numpy.allclose(numpy.asarray(_a), a)

    # test vector to matrix with stride
    a = numpy.arange(27)
    a.resize((3, 3, 3))
    a = theano._asarray(a, dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)

    b = theano._asarray([[7, 8, 9], [10, 11, 12]], dtype='float32')
    _b = cuda_ndarray.CudaNdarray(b)[0]
    b = b[0]
    _a[:, :, 1] = _b.reshape((1, 3))
    a[:, :, 1] = b.reshape((1, 3))
    assert numpy.allclose(numpy.asarray(_a), a)
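# Numpy-only aside (illustrative, not from the original file): the
# assignments above rely on ordinary broadcasting of the right-hand side
# against the indexed region; a length-1 leading axis is stretched to match.
def _setitem_broadcast_check():
    import numpy
    a = numpy.zeros((3, 3), dtype='float32')
    a[:, :] = numpy.asarray([7, 8, 9], dtype='float32').reshape((1, 3))
    assert numpy.all(a == [[7, 8, 9]] * 3)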
def test_mapping_getitem_w_int():
    def _cmp(x, y):
        assert x.shape == y.shape
        if not numpy.all(x == y):
            print x
            print y
        assert numpy.all(x == y)

    def _cmpf(x, *y):
        try:
            x.__getitem__(y)
        except IndexError:
            pass
        else:
            raise Exception("Did not generate out of bounds error")

    def _cmpfV(x, *y):
        try:
            if len(y) == 1:
                x.__getitem__(*y)
            else:
                x.__getitem__(y)
        except ValueError:
            pass
        else:
            raise Exception("Did not generate out of bounds error")

    dim = (2,)
    a = theano._asarray(numpy.random.rand(*dim), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    _cmp(numpy.asarray(_a[1]), a[1])
    _cmp(numpy.asarray(_a[-1]), a[-1])
    _cmp(numpy.asarray(_a[0]), a[0])
    _cmp(numpy.asarray(_a[::1]), a[::1])
    _cmp(numpy.asarray(_a[::-1]), a[::-1])
    _cmp(numpy.asarray(_a[...]), a[...])
    _cmpf(_a, 2)

    dim = ()
    a = theano._asarray(numpy.random.rand(*dim), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    _cmp(numpy.asarray(_a[...]), a[...])
    _cmpf(_a, 0)
    _cmpfV(_a, slice(1))

    dim = (5, 4, 3, 2)
    a = theano._asarray(numpy.random.rand(*dim), dtype='float32')
    _a = cuda_ndarray.CudaNdarray(a)
    _cmpf(_a, slice(-1), slice(-1), 10, -10)
    _cmpf(_a, slice(-1), slice(-1), -10, slice(-1))
    _cmpf(_a, 0, slice(0, -1, -20), -10)
    _cmpf(_a, 10)
    _cmpf(_a, (10, 0, 0, 0))
    _cmpf(_a, -10)

    # test with integer
    _cmp(numpy.asarray(_a[1]), a[1])
    _cmp(numpy.asarray(_a[-1]), a[-1])
    _cmp(numpy.asarray(_a[numpy.int64(1)]), a[numpy.int64(1)])
    _cmp(numpy.asarray(_a[numpy.int64(-1)]), a[numpy.int64(-1)])

    # test with slice
    _cmp(numpy.asarray(_a[1:]), a[1:])
    _cmp(numpy.asarray(_a[1:2]), a[1:2])
    _cmp(numpy.asarray(_a[-1:1]), a[-1:1])

    # test with tuple (mix slice, integer, numpy.int64)
    _cmp(numpy.asarray(_a[:, :, ::numpy.int64(-1), ::-1]), a[:, :, ::-1, ::-1])
    _cmp(numpy.asarray(_a[:, :, numpy.int64(1), -1]), a[:, :, 1, -1])
    _cmp(numpy.asarray(_a[:, :, ::-1, ::-1]), a[:, :, ::-1, ::-1])
    _cmp(numpy.asarray(_a[:, :, ::-10, ::-10]), a[:, :, ::-10, ::-10])
    _cmp(numpy.asarray(_a[:, :, 1, -1]), a[:, :, 1, -1])
    _cmp(numpy.asarray(_a[:, :, -1, :]), a[:, :, -1, :])
    _cmp(numpy.asarray(_a[:, ::-2, -1, :]), a[:, ::-2, -1, :])
    _cmp(numpy.asarray(_a[:, ::-20, -1, :]), a[:, ::-20, -1, :])
    _cmp(numpy.asarray(_a[:, ::-2, -1]), a[:, ::-2, -1])
    _cmp(numpy.asarray(_a[0, ::-2, -1]), a[0, ::-2, -1])
    _cmp(numpy.asarray(_a[-1, -1, -1, -2]), a[-1, -1, -1, -2])
    _cmp(numpy.asarray(_a[...]), a[...])
def subtest(shape):
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert b.shape == a.shape
def test_add_iadd_idiv():
    for shapes in (
            [(5, 5), (5, 1)],
            [(5, 5), (1, 5)],
            (), (0,), (3,), (2, 3),
            (1, 10000000), (10000, 1000), (1000000, 10),
            (4100, 33, 34), (33, 4100, 34), (33, 34, 4100),
            (4100, 33, 3, 6), (33, 4100, 3, 6), (33, 3, 4100, 6), (33, 3, 6, 4100),
            (4100, 3, 34, 6), (3, 4100, 34, 6), (3, 34, 4100, 6), (3, 34, 6, 4100),
            (4100, 3, 4, 36), (3, 4100, 4, 36), (3, 4, 4100, 36), (3, 4, 36, 4100),
            (0, 0, 0, 0, 0), (3, 34, 35, 36, 37), (33, 34, 3, 36, 37),
            (33, 34, 35, 36, 3),
            ):
        if isinstance(shapes, tuple):
            shape = shapes
            shape2 = shapes
            a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
            a0_orig = a0.copy()
            a1 = a0.copy()
            assert numpy.allclose(a0, a1)
        else:
            shape = shapes[0]
            shape2 = shapes[1]
            a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
            a0_orig = a0.copy()
            a1 = theano._asarray(numpy.random.rand(*shape2), dtype='float32')
        b0 = cuda_ndarray.CudaNdarray(a0)
        b1 = cuda_ndarray.CudaNdarray(a1)

        assert numpy.allclose(a0, numpy.asarray(b0))
        assert numpy.allclose(a1, numpy.asarray(b1))

        # add doesn't support strided inputs
        if shape == shape2:
            t0 = time.time()
            bsum = b0 + b1
            bsum = b0 + b1
            t1 = time.time()
            gpu_dt = t1 - t0
            t0 = time.time()
            asum = a0 + a1
            asum = a0 + a1
            t1 = time.time()
            cpu_dt = t1 - t0
            print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
            assert numpy.allclose(asum, numpy.asarray(bsum))

        # test not contiguous version.
        # should raise not implemented.
        a0 = a0_orig.copy()
        b0 = cuda_ndarray.CudaNdarray(a0)
        if len(shape) == 0:
            continue
        elif len(shape) == 1:
            _b = b1[::-1]
        elif len(shape) == 2:
            _b = b1[::, ::-1]
        elif len(shape) == 3:
            _b = b1[::, ::, ::-1]
        elif len(shape) == 4:
            _b = b1[::, ::, ::, ::-1]
        elif len(shape) == 5:
            _b = b1[::, ::, ::, ::, ::-1]
        # TODO: b0[..., ::-1] doesn't work
        if shape == shape2:
            t = False
            try:
                _c = _b + b1
            except TypeError:
                t = True
            assert t

        # test inplace version
        t0 = time.time()
        b0 += b1
        t1 = time.time()
        gpu_dt = t1 - t0
        t0 = time.time()
        a0 += a1
        t1 = time.time()
        cpu_dt = t1 - t0
        print shape, 'adding inplace', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
        assert numpy.allclose(a0, numpy.asarray(b0))
        assert numpy.allclose(a0, a0_orig + a1)

        b0 /= b1
        a0 /= a1
        assert numpy.allclose(a0, numpy.asarray(b0))
        assert numpy.allclose(a0, (a0_orig + a1) / a1)

        # test inplace version for not contiguous input
        b0 += _b
        a0 += a1[..., ::-1]
        assert numpy.allclose(a0, numpy.asarray(b0))
        assert numpy.allclose(a0, (a0_orig + a1) / a1 + a1[..., ::-1])

        b0 /= _b
        a0 /= a1[..., ::-1]
        assert numpy.allclose(a0, numpy.asarray(b0))
        assert numpy.allclose(
            a0, ((a0_orig + a1) / a1 + a1[..., ::-1]) / a1[..., ::-1])
def test_sum():
    """
    test sum pattern 1, 11, 10, 01, 100, 110, 011, 001, 111,
    0011, 0101, 0111, 1011, 1111

    test sum pattern implemented with reshape:
    1000, 0100, 0010, 0001, 11111

    others implemented by reshape that are not tested
    0011, 0101, 0110, 1001, 1010, 1100
    1110, 1101, 1011

    TODO: test with broadcast
    """
    for shape, pattern in [
            ((100,3,1300),[1]),
            ((0,),[0]),((5,),[0]),
            ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),
            ((5,4),[1]),((5,4),[0]),  # need something bigger than 32 for some opt tests.
            ((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),
            ((5,4,3),[1,2]),((5,4,3),[0,1,2]),
            ((0,0,0,0),[0,1,2,3]),
            ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]),
            ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
            ((5,4,3,10,11),[1,2]),
            ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]),
            ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
            # test shapes bigger than 4096 on each dimension to make sure
            # that we work correctly when we don't have enough
            # threads/blocks in each dimension
            ((4100,3),[0]),((3,4101),[0]),  # 10
            ((1024,33),[0]),((33,1024),[0]),  # 10
            ((1025,33),[0]),((33,1025),[0]),  # 10
            ((4100,3),[1]),((3,4101),[1]),  # 01
            ((1024,33),[1]),((33,1024),[1]),  # 01
            ((1025,33),[1]),((33,1025),[1]),  # 01
            ((4100,3),[0,1]),((3,4101),[0,1]),  # 11
            ((1024,33),[0,1]),((33,1024),[0,1]),  # 01
            ((1025,33),[0,1]),((33,1025),[0,1]),  # 01
            ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),  # 100
            ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),  # 010
            ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),  # 001
            ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),  # 110
            ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),  # 011
            #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),  # 101 ##not implemented
            ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),  # 111
            ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),  # 0011
            ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),  # 0101
            ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),  # ((4,3,2,4100),[0,2,3]),  # 1011
            ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),  # 0111
            ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),  # 1111
            # test pattern implemented by reshape
            ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),  # 1000
            ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),  # 0100
            ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),  # 0010
            ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),  # 0001
            ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),  # 11111
            ]:
        a = tensor.TensorType('float32', (False,) * len(shape))()
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        f = theano.function([a], b, mode=mode_with_gpu)
        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        if val.size == 0:
            assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
        else:
            try:
                # We raise the error threshold as we sum big matrices
                # and this causes small rounding differences with some
                # seeds; example in debug mode with unittests.rseed=9275
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
                assert _allclose(f2(val), f(val)), (
                    'shape', shape, 'pattern', pattern,
                    sum([shape[i] for i in pattern]))
            finally:
                theano.tensor.basic.float32_rtol = orig_rtol

    # test with dimshuffle
    # we shuffle the 2 outer dims.
    for shape, pattern in [  # ((5,),[0]),
            ((5,4),[0,1]),((5,4),[0]),
            ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
            ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        a = tensor.TensorType('float32', (False,) * len(shape))()
        dim_pattern = range(len(shape))
        dim_pattern[0] = 1
        dim_pattern[1] = 0
        a = a.dimshuffle(dim_pattern)
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        f = theano.function([a], b, mode=mode_with_gpu)
        f2 = theano.function([a], b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert _allclose(f2(val), f(val)), (
            'shape', shape, 'pattern', pattern,
            sum([shape[i] for i in pattern]))

    # test with broadcast
    for shape, pattern in [((5,),[0]),
            ((5,4),[0,1]),((5,4),[0]),
            ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
            ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        shape = numpy.asarray(shape) * 2
        a = tensor.TensorType('float32', (False,) * len(shape))()
        a2 = tcn.CudaNdarrayType((False,) * len(shape))()
        b = T.Sum(pattern)(a)
        b2 = T.Sum(pattern)(a2)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
        # val = numpy.ones(shape)
        # val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val, dtype='float32')
        val2 = cuda.CudaNdarray(val)
        if len(shape) == 1:
            val = val[::2]
            val2 = val2[::2]
        elif len(shape) == 2:
            val = val[::2, ::2]
            val2 = val2[::2, ::2]
        elif len(shape) == 3:
            val = val[::2, ::2, ::2]
            val2 = val2[::2, ::2, ::2]
        elif len(shape) == 4:
            val = val[::2, ::2, ::2, ::2]
            val2 = val2[::2, ::2, ::2, ::2]
        f = theano.function([a], b, mode=mode_without_gpu)
        f2 = theano.function([a2], b2, mode=mode_with_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert _allclose(f2(val2), f(val)), (
            'shape', shape, 'pattern', pattern,
            sum([shape[i] for i in pattern]))
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8, nb_iter=0,
                    ones=False, compile_kshp=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) hardcode the shape of the kernel in the
    #     generated code. This is supposed to be faster, but we need to
    #     check that we raise an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)

    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # we apply the strides after the transfer as we make the data
    # c-contiguous on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = cuda_tensor4()
        k = cuda_tensor4()
        op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                              subsample=subsample,
                                              version=version,
                                              verbose=verbose,
                                              kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(img, kern)
        t2 = time.time()
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print >> sys.stdout, "ERROR: shape mismatch",
            print >> sys.stdout, gpuval.shape, cpuval.shape
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError, e:
        print >> sys.stdout, '_params_allgood Failed allclose', e
        rval = False
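# py_conv above is the CPU reference used to check GpuConv. The sketch below
# is an assumption, not the actual py_conv from the test file: it shows what
# such a reference could look like with scipy, assuming a true convolution
# (kernel flipping) over 4-D (batch, channel, row, col) arrays followed by
# row/column subsampling.
def _py_conv_sketch(img, kern, mode, subsample):
    import numpy
    from scipy.signal import convolve2d
    batch, cin, _, _ = img.shape
    cout = kern.shape[0]
    out = None
    for b in range(batch):
        for o in range(cout):
            # accumulate the 2-D convolutions over the input channels
            acc = sum(convolve2d(img[b, c], kern[o, c], mode=mode)
                      for c in range(cin))
            if out is None:
                out = numpy.zeros((batch, cout) + acc.shape, dtype=img.dtype)
            out[b, o] = acc
    # apply the subsampling last
    return out[:, :, ::subsample[0], ::subsample[1]]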
def setUp(self):
    self.input = cuda.ftensor4()
    self.filters = cuda.ftensor4()
    self.topgrad = cuda.ftensor4()
    self.constant_tensor = cuda.CudaNdarray(
        numpy.zeros((3, 5, 7, 11), dtype='float32'))
def test_dimshuffle(self):
    utt.seed_rng()
    rng = numpy.random.RandomState(utt.fetch_seed())

    # 2d -> 0d
    a = theano._asarray(rng.randn(1, 1), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(numpy.transpose(a),
                          cuda_ndarray.dimshuffle(b, ()))

    # Test when we drop an axis that doesn't have shape 1
    a = theano._asarray(rng.randn(2, 1), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b, ())

    # Test that we can't take a dimension multiple times
    a = theano._asarray(rng.randn(2, 1), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    self.assertRaises(ValueError, cuda_ndarray.dimshuffle, b, (1, 1))

    # 1d
    a = theano._asarray(rng.randn(3,), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(numpy.transpose(a),
                          cuda_ndarray.dimshuffle(b, (0,)))
    assert numpy.allclose(a[None, :, None],
                          cuda_ndarray.dimshuffle(b, (-1, 0, -1)))

    # 2d
    a = theano._asarray(rng.randn(3, 11), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(numpy.transpose(a),
                          cuda_ndarray.dimshuffle(b, (1, 0)))
    assert numpy.allclose(
        numpy.transpose(a)[None, :, None, :, None],
        cuda_ndarray.dimshuffle(b, (-1, 1, -1, 0, -1)))

    # 2d -> 1d
    a = theano._asarray(rng.randn(1, 11), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(a[:], cuda_ndarray.dimshuffle(b, (1,)))
    a = theano._asarray(rng.randn(11, 1), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(a.reshape((11,)),
                          cuda_ndarray.dimshuffle(b, (0,)))

    # 3d
    a = theano._asarray(rng.randn(3, 4, 5), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(a, cuda_ndarray.dimshuffle(b, (0, 1, 2)))
    assert numpy.allclose(numpy.swapaxes(a, 0, 1),
                          cuda_ndarray.dimshuffle(b, (1, 0, 2)))
    assert numpy.allclose(numpy.swapaxes(a, 0, 2),
                          cuda_ndarray.dimshuffle(b, (2, 1, 0)))
    assert numpy.allclose(numpy.swapaxes(a, 1, 2),
                          cuda_ndarray.dimshuffle(b, (0, 2, 1)))
    assert numpy.allclose(
        numpy.swapaxes(a, 1, 2)[None, :, None, :, :, None],
        cuda_ndarray.dimshuffle(b, (-1, 0, -1, 2, 1, -1)))

    # 4d
    a = theano._asarray(rng.randn(3, 11, 4, 5), dtype='float32')
    b = cuda_ndarray.CudaNdarray(a)
    assert numpy.allclose(numpy.swapaxes(a, 0, 1),
                          cuda_ndarray.dimshuffle(b, (1, 0, 2, 3)))
    assert numpy.allclose(numpy.swapaxes(a, 0, 2),
                          cuda_ndarray.dimshuffle(b, (2, 1, 0, 3)))
    assert numpy.allclose(numpy.swapaxes(a, 0, 3),
                          cuda_ndarray.dimshuffle(b, (3, 1, 2, 0)))
    assert numpy.allclose(numpy.swapaxes(a, 0, 3),
                          cuda_ndarray.dimshuffle(b, (3, 1, 2, 0)))
    assert numpy.allclose(
        numpy.swapaxes(a, 0, 3)[None, :, None, :, :, :],
        cuda_ndarray.dimshuffle(b, (-1, 3, -1, 1, 2, 0)))
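# Hedged numpy illustration (not from the original file) of the dimshuffle
# pattern convention tested above: integers select source axes, so (1, 0) is
# a transpose, and -1 (or 'x' at the symbolic level) inserts a broadcastable
# axis of length 1.
def _dimshuffle_pattern_check():
    import numpy
    a = numpy.random.randn(3, 11).astype('float32')
    # pattern (1, 0) <-> numpy.transpose
    assert numpy.allclose(a.transpose(1, 0), numpy.transpose(a))
    # pattern (-1, 1, -1, 0, -1) <-> transpose plus new length-1 axes
    assert a.transpose(1, 0)[None, :, None, :, None].shape == (1, 11, 1, 3, 1)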