def test_opt_gpujoin_onlyajoin():
    # from a bug in normal sampling
    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float32')
    a = cuda.shared_constructor(_a)
    b = cuda.shared_constructor(_b)

    c = tensor.join(1, a, b)

    f = theano.function([], c, mode=mode_with_gpu)
    f()

    graph_nodes = f.maker.fgraph.toposort()
    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
    assert isinstance(graph_nodes[-2].op, cuda.GpuJoin)
    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))

    # test mixed dtype
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float64')
    b = theano.tensor.constant(_b)

    c = tensor.join(1, a, b)

    f = theano.function([], c, mode=mode_with_gpu)
    f()

    graph_nodes = f.maker.fgraph.toposort()
    assert isinstance(graph_nodes[-1].op, theano.tensor.Join)
    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
    # from a bug in gpu normal sampling
    _a = numpy.asarray([1, 2, 3, 4], dtype='float32')
    _b = numpy.asarray([5, 6, 7, 8], dtype='float32')
    a = cuda.shared_constructor(_a)
    b = cuda.shared_constructor(_b)

    a_prime = tensor.cos(a)
    b_prime = tensor.sin(b)

    c = tensor.join(0, a_prime, b_prime)

    d = c[:-1]

    f = theano.function([], d, mode=mode_with_gpu)

    graph_nodes = f.maker.fgraph.toposort()

    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
    assert isinstance(graph_nodes[-2].op, cuda.GpuSubtensor)
    assert isinstance(graph_nodes[-3].op, cuda.GpuJoin)

    concat = numpy.concatenate([numpy.cos(_a), numpy.sin(_b)], axis=0)
    concat = concat[:-1]

    assert numpy.allclose(numpy.asarray(f()), concat)
def test_elemwise_composite_support_code():
    """
    This was generating an error at compile time.
    Commit 3d1690fa346103594356ecaeceeb2c6757b45d2b fixed that.
    """
    X = tcn.shared_constructor(value=numpy.zeros((100, 10), dtype="float32"),
                               name='X')
    W = tcn.shared_constructor(value=numpy.zeros((10, 1), dtype="float32"),
                               name='W')
    U = T.dot(X, W)
    Y = tcn.shared_constructor(value=numpy.zeros((100, 1), dtype="float32"),
                               name='Y')
    P = T.exp(-(Y - U) ** 2)
    epsilon = numpy.asarray(0.001, dtype="float32")
    NLL = -T.mean(T.log(P + epsilon))  # SupportCodeError
    G = T.grad(NLL, wrt=[W])

    backup = theano.config.warn.identify_1pexp_bug
    theano.config.warn.identify_1pexp_bug = False
    try:
        f_grad = theano.function(inputs=[], outputs=G, mode=mode_with_gpu)
    finally:
        theano.config.warn.identify_1pexp_bug = backup
    f_grad()

    topo = f_grad.maker.env.toposort()
    assert sum([isinstance(node.op, T.Elemwise) for node in topo]) == 1
    assert sum([isinstance(node.op, tcn.GpuElemwise) for node in topo]) == 1
def test_elemwise2():
    """ Several kinds of elemwise expressions with dimension permutations """
    rng = numpy.random.RandomState(int(time.time()))
    shape = (3, 5)
    for pattern in [(0, 1), (1, 0)]:
        a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
                                                   dtype='float32'),
                                   name=None)
        b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
        f = pfunc([b], [], updates=[(a, (a + b).dimshuffle(pattern))],
                  mode=mode_with_gpu)
        has_elemwise = False
        for i, node in enumerate(f.maker.env.toposort()):
            has_elemwise = has_elemwise or isinstance(node.op,
                                                      tensor.Elemwise)
        assert not has_elemwise
        # let debugmode catch errors
        f(theano._asarray(rng.rand(*shape), dtype='float32') * .3)

    shape = (3, 4, 5, 6)
    a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
                                               dtype='float32'), 'a')
    b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
    f = pfunc([b], [],
              updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
                        tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))],
              mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    # let debugmode catch errors
    f(theano._asarray(rng.rand(*shape), dtype='float32'))
def test_pool():
    # (batch, channel, x, y)
    shps = [(1, 1, 2, 2), ]
    shps = [(channel, x, y, batch) for (batch, channel, x, y) in shps]
    # numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)
    warnings.warn("TODO: Razvan needs to finish this")
    for shp in shps:
        for ds in range(1, min(4, shp[2] + 1)):
            for start in [0]:
                for stride in range(1, min(shp[2], ds, 4) + 1):
                    # print 'test_pool shape=%s, ds=%d, stride=%d start=%d' % (
                    #     str(shp), ds, stride, start)

                    va = my_rand(*shp)
                    tva = va.flatten()
                    # print 'va', tva, tva.max(), tva.argmax()

                    vb = my_rand(*shp)
                    tvb = vb.flatten()
                    # print 'vb', tvb, tvb.max(), tvb.argmax(),\
                    #     tvb[tva.argmax()]

                    a = tcn.shared_constructor(va, 'a')
                    b = tcn.shared_constructor(vb, 'b')
                    op = MaxPool(ds=ds, stride=stride)
                    v = op(a)
                    rval = theano.tensor.Rop(v, a, b)
                    f = theano.function([], rval, mode=mode_with_gpu)
                    print f.maker.fgraph.toposort()
                    # assert any([isinstance(node.op, MaxPool)
                    #             for node in f.maker.fgraph.toposort()])
                    out = numpy.asarray(f())
def test_gpujoin_preserves_broadcasting():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)

    # [0, 0]: the two original dims were non-broadcastable
    # [1, x, 0]: new order and broadcastability
    gpu_dimshuffle = GpuDimShuffle([0, 0], [1, "x", 0])

    a_shuffled = gpu_dimshuffle(a)
    b_shuffled = gpu_dimshuffle(b)

    c = gpu_join(0, a_shuffled, b_shuffled)

    assert c.type.broadcastable == (False, True, False)

    f = theano.function([], c, mode=mode_with_gpu)

    res = f()

    a_reshaped = numpy.asarray([[[1, 3]], [[2, 4]]], dtype="float32")
    b_reshaped = numpy.asarray([[[5, 8]], [[6, 9]], [[7, 10]]],
                               dtype="float32")

    concat = numpy.concatenate([a_reshaped, b_reshaped], axis=0)

    assert numpy.all(res == concat)
def cmp(a_shp, b_shp):
    a0 = my_rand(*a_shp)
    a = tcn.shared_constructor(a0, 'a')
    cval = my_rand(a_shp[0], b_shp[1])
    c = tcn.shared_constructor(cval.copy(), 'c')

    b = tcn.fmatrix('b')
    b2 = tcn.fmatrix('b2')

    f = pfunc(
        [b, b2],
        [tensor.dot(a, b2) + c],
        updates=[(a, tensor.dot(a, b) + c)],
        mode=mode_with_gpu)

    assert any([node.op == tcn.blas.gpu_gemm_no_inplace
                for node in f.maker.fgraph.toposort()])
    bval = my_rand(*b_shp)
    bval2 = my_rand(*b_shp)
    rval = f(bval, bval2)

    assert numpy.allclose(numpy.dot(a0, bval) + cval, a.get_value())
    assert numpy.allclose(numpy.dot(a0, bval2) + cval, rval)

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    a.set_value(
        a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
        borrow=True)
    f(bval, bval2)
def test_nvidia_driver2():
    """ Test that the gpu device is initialized by theano when we
    manually make a shared variable on the gpu.

    The driver should always be tested during theano initialization
    of the gpu device
    """
    a = numpy.random.rand(10000).astype("float32")
    cuda.shared_constructor(a)
    assert theano.sandbox.cuda.use.device_number is not None
def test_gpujoin_twomatrices_joincolumns():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)

    c = gpu_join(1, a, b)

    f = theano.function([], c)

    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10,
             n_train=100):

    if config.mode == 'DEBUG_MODE':
        n_train = 1

    if use_gpu:
        w = tcn.shared_constructor(0.01 * (my_rand(n_in, n_hid) - 0.5), 'w')
        b = tcn.shared_constructor(my_zeros(n_hid), 'b')
        v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
        c = tcn.shared_constructor(my_zeros(n_out), 'c')
    else:
        w = shared(0.01 * (my_rand(n_in, n_hid) - 0.5), 'w')
        b = shared(my_zeros(n_hid), 'b')
        v = shared(my_zeros((n_hid, n_out)), 'c')
        c = shared(my_zeros(n_out), 'c')

    x = tensor.fmatrix('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    hid = tensor.tanh(tensor.dot(x, w) + b)
    out = tensor.tanh(tensor.dot(hid, v) + c)
    loss = tensor.sum(0.5 * (out - y) ** 2 * lr)
    if 0:
        print('loss type', loss.type)

    params = [w, b, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    # print 'building pfunc ...'
    train = pfunc([x, y, lr], [loss], mode=mode,
                  updates=[(p, p - g) for p, g in izip(params, gparams)])

    if 0:
        for i, n in enumerate(train.maker.fgraph.toposort()):
            print(i, n)

    xval = my_rand(n_batch, n_in)
    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    t0 = time.time()
    rval = []
    for i in xrange(n_train):
        rval.append(train(xval, yval, lr))
    dt = time.time() - t0

    print_mode(mode)
    return numpy.asarray(rval), dt
def test_gpujoin_twomatrices_badshapes():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)

    # try to join on dimension 0 where they don't agree (2 != 3)
    c = gpu_join(0, a, b)

    f = theano.function([], c)

    try:
        f()
        assert False
    except ValueError:
        assert True
def cmp(a_shp, b_shp):
    a0 = my_rand(*a_shp)
    a = tcn.shared_constructor(a0, 'a')

    b = tensor.fmatrix('b')
    c = tensor.fmatrix('c')

    f = pfunc([b, c], [],
              updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
              mode=mode_with_gpu)
    assert any([node.op == tcn.blas.gpu_gemm_inplace
                for node in f.maker.env.toposort()])

    bval = my_rand(*b_shp)
    cval = my_rand(a_shp[0], b_shp[1])
    f(bval, cval)

    assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval),
                          a.get_value())

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    a.set_value(
        a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
        borrow=True)
    f(bval, cval)
def test_downsample():
    shps = [(1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[2]:
                continue
            if ds[1] > shp[3]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns
            # in the output tensor.
            if float(shp[3]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                print "test_downsample", shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)
                a = tcn.shared_constructor(my_rand(*shp), "a")
                f = pfunc([], ds_op(tensor.as_tensor_variable(a)),
                          mode=mode_with_gpu)
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMax)
                            for node in f.maker.env.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMax)
                            for node in f2.maker.env.toposort()])
                assert numpy.allclose(f(), f2())

                g = pfunc([],
                          tensor.grad(
                              ds_op(tensor.as_tensor_variable(a)).sum(), a),
                          mode=mode_with_gpu)
                g2 = pfunc([],
                           tensor.grad(
                               ds_op(tensor.as_tensor_variable(a)).sum(), a),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMaxGrad)
                            for node in g.maker.env.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMaxGrad)
                            for node in g2.maker.env.toposort()])
                assert numpy.allclose(g(), g2())
def test_local_assert_no_cpu_op():
    numpy.random.seed(1)
    m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32")
    ms = cuda.shared_constructor(m, name="m_shared")
    out = theano.tensor.tanh(ms).dot(ms.T)

    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_0")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_1")

    old = config.assert_no_cpu_op
    old2 = config.on_opt_error
    # If the flag is raise
    try:
        config.assert_no_cpu_op = 'raise'
        config.on_opt_error = 'ignore'

        assert_raises(AssertionError, theano.function,
                      [], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
        config.on_opt_error = old2

    # If the flag is ignore
    try:
        config.assert_no_cpu_op = 'ignore'
        theano.function([], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
def test_elemwise1():
    """ Several kinds of elemwise expressions with no broadcasting,
    non power-of-two shape """

    shape = (3, 4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32') + 0.5, 'a')
    b = tensor.fmatrix()

    # let debugmode catch any mistakes
    print >> sys.stdout, "STARTING FUNCTION 1"
    f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu)
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    print >> sys.stdout, "STARTING FUNCTION 2"
    # let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu)
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    print >> sys.stdout, "STARTING FUNCTION 3"
    # let debugmode catch any mistakes
    f = pfunc([b], [],
              updates=[(a, a + b * tensor.exp(b ** a))],
              mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
def test_memory_lazy():
    """As test_memory, but with the ifelse op.

    We need to test it because, with the ifelse op, the [c]vm creates
    ops that are not executed in the graph, and this interacts with the
    [c]vm gc implementation.
    """
    shapes = (50, 100)
    # more_alloc1 is not the same for both dtypes.
    # When dtype is float32, the computation is done on the gpu.
    # This inserts constants on the gpu during compilation,
    # which raises the number of allocations.
    # When dtype is float64, only the shared variable is on the gpu and it is
    # transferred to the cpu for computation. So no extra alloc after
    # compilation.
    # more_alloc1 is the number of extra allocs after the first compilation.
    for dtype, more_alloc1 in [("float32", 1),
                               ("float64", 0)]:
        print(dtype)
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)

        some_vector = tensor.vector('some_vector', dtype=dtype)
        some_matrix = some_vector.reshape(shapes)
        branch_select = tensor.iscalar()

        mem1 = freemem()
        print("Before shared variable", mem1)
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
        derp = ifelse.IfElse(1)(branch_select,
                                derp, some_matrix[:shapes[0]].sum())
        derp += 1
        print("Shared took ",
              np.prod(variables.get_value(
                  borrow=True,
                  return_internal_type=True).shape) * 4 / 1024,
              "kB")

        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        obj = theano.function([some_vector, branch_select], derp,
                              mode=mode_with_gpu)
        # theano.printing.debugprint(obj, print_type=True)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3)

        for i in range(3):
            obj(test_params, 1)
            print("After function evaluation branch true", freemem())
            assert mem2_1 == freemem(), (mem2_1, freemem())
            obj(test_params, 0)
            print("After function evaluation branch false", freemem())
            assert mem2_1 == freemem(), (mem2_1, freemem())

        del obj
        print("After deleting function 1", freemem())
        assert mem2 == freemem(), (mem2, freemem())

        del derp, variables
        print("After deleting shared variable and ref to it", freemem())
        assert mem1 == freemem(), (mem1, freemem())
def shared(val):
    # If we don't put shared on the GPU, we won't be able to test
    # the no inplace version as the added transfer will make them inplace.
    try:
        return tcn.shared_constructor(val)
    except TypeError:
        return theano.shared(val)
def test_gpuspecifyshape():
    x = cuda.shared_constructor(numpy.ones(3, dtype='float32'), 'x')
    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
    f = theano.function([], updates=[(x, m * numpy.float32(2))],
                        mode=mode_with_gpu)
    l = f.maker.fgraph.toposort()
    assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l])
def test_opt_gpujoin_joinvectors_negativeaxes():
    """
    Test that negative axis concatenation works as expected.
    """
    # Test case for one-dimensional vectors
    rng = numpy.random.RandomState(22)
    x1 = rng.rand(5)
    x2 = rng.rand(10)
    t1 = cuda.shared_constructor(numpy.asarray(x1, "float32"))
    t2 = cuda.shared_constructor(numpy.asarray(x2, "float32"))
    t = tensor.concatenate([t1, t2], axis=-1)
    f = theano.function(inputs=[], outputs=t)
    assert(numpy.allclose(f(), numpy.concatenate([x1, x2], axis=-1)))

    # Test case for two-dimensional matrices
    x1 = rng.rand(5, 10)
    x2 = rng.rand(10, 10)
    t1 = cuda.shared_constructor(numpy.asarray(x1, "float32"))
    t2 = cuda.shared_constructor(numpy.asarray(x2, "float32"))
    t = tensor.concatenate([t1, t2], axis=-2)
    f = theano.function(inputs=[], outputs=t)
    assert(numpy.allclose(f(), numpy.concatenate([x1, x2], axis=-2)))

    # Now check that a ValueError is raised when the shapes don't match
    # along the negative concatenation axis
    try:
        t = tensor.concatenate([t1, t2], axis=-1)
        f = theano.function(inputs=[], outputs=t)
        f()
        assert(False)
    except ValueError:
        assert(True)

    # Finally check that an IndexError is raised when the negative
    # axis is larger in absolute value than the smallest number of dims
    try:
        t = tensor.concatenate([t1, t2], axis=-3)
        f = theano.function(inputs=[], outputs=t)
        f()
        assert(False)
    except IndexError:
        assert(True)
def cmp(a_shp, b_shp):
    a0 = numpy.random.uniform(-0.4, 0.4, a_shp).astype('float32')
    a = cuda.shared_constructor(a0, 'a')
    b0 = numpy.random.uniform(-0.4, 0.4, b_shp).astype('float32')
    b = cuda.shared_constructor(b0, 'b')

    f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
    assert isinstance(f.maker.fgraph.toposort()[1].inputs[0].owner.op,
                      cuda.cula.GpuSolve)

    assert cuda.opt.local_gpu_solve.transform(
        tensor.slinalg.solve(a, b).owner)
    out = f()
    assert numpy.allclose(numpy.dot(a0, out), b0)
def test_memory():
    """
    We test that we do not keep references to memory between Theano
    function calls and during Theano compilation.

    The origin of this code comes from Aaron Vandenoord and Sander
    Dieleman. I have their authorization to put this in Theano with
    the Theano license.

    note::
        This test can fail if there are other processes running on the gpu.
    """
    shapes = (6000, 5000)
    test_params = np.asarray(np.random.randn(np.prod(shapes)), 'float32')

    some_vector = tensor.vector('some_vector')
    some_matrix = some_vector.reshape(shapes)

    mem1 = freemem()
    print "Before shared variable", mem1
    variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                dtype='float32'))
    derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
    print "Shared took ", np.prod(variables.get_value(
        borrow=True,
        return_internal_type=True).shape) * 4 / 1024, "kB"

    mem2 = freemem()
    print "Before compilation", mem2
    obj = theano.function([some_vector], derp, mode=mode_with_gpu)
    mem3 = freemem()
    print "After function compilation 1", mem3
    assert mem2 == mem3, (mem2, mem3)

    grad_derp = tensor.grad(derp, some_vector)
    grad = theano.function([some_vector], grad_derp, mode=mode_with_gpu)
    mem4 = freemem()
    print "After function compilation 2", mem4
    assert mem2 == mem4, (mem2, mem4)

    for i in range(3):
        obj(test_params)
        print "After function evaluation 1", freemem()
        assert mem2 == freemem(), (mem2, freemem())
        grad(test_params)
        print "After function evaluation 2", freemem()
        assert mem2 == freemem(), (mem2, freemem())

    del obj
    print "After deleting function 1", freemem()
    assert mem2 == freemem(), (mem2, freemem())

    del grad
    print "After deleting function 2", freemem()
    assert mem2 == freemem(), (mem2, freemem())

    del derp, variables, grad_derp
    print "After deleting shared variable and ref to it", freemem()
    assert mem1 == freemem(), (mem1, freemem())
def cmp(a_shp, b_shp):
    a = tcn.shared_constructor(my_rand(*a_shp), 'a')
    cval = my_rand(a_shp[0], b_shp[1])
    c = tcn.shared_constructor(cval.copy(), 'c')

    b = tcn.fmatrix('b')
    b2 = tcn.fmatrix('b2')

    f = pfunc([b, b2],
              [tensor.dot(a, b2) + c],
              updates=[(a, tensor.dot(a, b) + c)],
              mode=mode_with_gpu)
    a0 = a.get_value() * 1.0

    assert any([node.op == tcn.blas.gpu_gemm_no_inplace
                for node in f.maker.env.toposort()])
    bval = my_rand(*b_shp)
    bval2 = my_rand(*b_shp)
    rval = f(bval, bval2)

    assert numpy.allclose(numpy.dot(a0, bval) + cval, a.get_value())
    assert numpy.allclose(numpy.dot(a0, bval2) + cval, rval)
def cmp(a_shp, b_shp):
    a0 = numpy.random.rand(*a_shp).astype('float32')
    a = cuda.shared_constructor(a0, 'a')
    b0 = numpy.random.rand(*b_shp).astype('float32')
    b = cuda.shared_constructor(b0, 'b')

    f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
    assert cuda.opt.local_gpu_dot_to_dot22.transform(
        tensor.dot(a, b).owner)
    out = f()

    assert numpy.allclose(numpy.dot(a0, b0), out)

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    a.set_value(
        a.get_value(borrow=True, return_internal_type=True)[::-1],
        borrow=True)
    f()
def cmp_sigmoids(shape):
    def numpy_sigmoid(input):
        rval = 1.0 / (1.0 + numpy.exp(-input))
        return rval
    sinput = tensor.Tensor(dtype='float32',
                           broadcastable=(0,) * len(shape))()
    shared_input = tcn.shared_constructor(numpy.random.rand(*shape),
                                          'shared_input')
    times = compare_fns(
        dict(numpy=numpy_sigmoid,
             theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))),
             theano_gpu_onboard=pfunc(
                 [sinput], [],
                 updates=[(shared_input,
                           1.0 / (1.0 + tensor.exp(-shared_input)))])),
        input=shared_input.value)
    showtimes(times)
def speed_adv_sub1():
    data = numpy.random.rand(50000, 21).astype("float32")
    var = tcn.shared_constructor(data)
    vec = tensor.lvector()
    for batch_size in [100, 1000, 10000, 100000]:
        idx = numpy.random.randint(0, 50000, batch_size)
        mode_with_gpu = theano.compile.ProfileMode().including('gpu')
        f = theano.function([vec], var[vec], mode=mode_with_gpu)
        for i in range(100):
            f(idx)
        print "ProfileMode with batch size", batch_size
        mode_with_gpu.print_summary()
def cmp(a_shp, b_shp):
    a = tcn.shared_constructor(my_rand(*a_shp), 'a')
    b = tensor.fmatrix()

    f = pfunc([b], [], updates=[(a, tensor.dot(a, b))], mode=mode_with_gpu)

    a0 = a.get_value() * 1.0
    bval = my_rand(*b_shp)
    f(bval)

    assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
def test_gpualloc_input_on_gpu():
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(a_val)

    b = T.fscalar()
    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
    f_gpu = theano.function([b], T.ones_like(a) + b, mode=mode_with_gpu)

    assert sum([node.op == T.alloc
                for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 1

    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
                          f_gpu(9))
    assert numpy.allclose(f(5), f_gpu(5))
def test_elemwise_empty():
    # test with 0 element
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0, 0),
                                               dtype='float32'), 'a')

    b = tensor.fmatrix()

    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
    f2 = pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu)

    a0 = a.get_value() * 1.0
    f(numpy.ones((0, 0), dtype='float32'))

    assert numpy.all(a0 + 1.0 == a.get_value())
def test_elemwise_fusion():
    """ Test that the GpuElemwise fusion works correctly. """
    shape = (3, 4)
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                                dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b, c], [a + b + c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    assert len(topo) == 4
    assert isinstance(topo[2].op.scalar_op, theano.scalar.basic.Composite)
    # let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'),
      theano._asarray(numpy.random.rand(*shape), dtype='float32'))
def test_elemwise4():
    """ Test that two vectors can be broadcast to form an outer
    product (by performing a rank-1 matrix update). """

    shape = (3, 4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32'), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
    f = pfunc([b, c], [],
              updates=[(a,
                        (a + b.dimshuffle('x', 0) * c.dimshuffle(0, 'x')))],
              mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    # let debugmode catch errors
    f(theano._asarray(numpy.random.rand(4), dtype='float32'),
      theano._asarray(numpy.random.rand(3), dtype='float32'))
def cmp(a_shp, b_shp):
    a0 = my_rand(*a_shp)
    a = tcn.shared_constructor(a0, 'a')
    b = tensor.fmatrix()

    f = pfunc([b], [], updates=[(a, tensor.dot(a, b))], mode=mode_with_gpu)

    bval = my_rand(*b_shp)
    f(bval)

    assert numpy.allclose(numpy.dot(a0, bval), a.get_value())

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    a.set_value(a.get_value(borrow=True,
                            return_internal_type=True)[::-1, ::-1],
                borrow=True)
    f(bval)
def test_nvidia_driver1():
    """ Some nvidia drivers give bad results for reductions.
    This executes some reduction tests to ensure they run correctly.
    """
    a = numpy.random.rand(10000).astype("float32")
    A = cuda.shared_constructor(a)
    f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu,
                        profile=False)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
        msg = '\n\t'.join(
            ['Expected exactly one occurrence of GpuCAReduce but got:'] +
            [str(app) for app in topo])
        raise AssertionError(msg)

    if not numpy.allclose(f(), a.sum()):
        raise Exception("The nvidia driver version installed with this OS "
                        "does not give good results for reduction. "
                        "Installing the nvidia driver available on the same "
                        "download page as the cuda package will fix the "
                        "problem: http://developer.nvidia.com/cuda-downloads")
def test_elemwise0():
    a = tcn.shared_constructor(
        theano._asarray(numpy.random.rand(4, 4), dtype='float32'), 'a')

    b = tensor.fmatrix()

    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)

    # check that we work inplace.
    assert f.maker.env.toposort()[1].op.destroy_map.items() == [(0, [0])]

    a0 = a.get_value() * 1.0
    print 'BEFORE ADD', a.get_value()
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
    f(numpy.ones((4, 4), dtype='float32'))
    print 'AFTER ADD', a.get_value()

    assert numpy.all(a0 + 1.0 == a.get_value())
def test_elemwise3():
    """ Several kinds of elemwise expressions with dimension
    permutations and broadcasting"""

    shape = (3, 4, 5, 6)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32'), 'a')
    b = tensor.fvector()
    print b.type
    print tensor.constant(1).type
    print (1 + b).type
    print (1 + b ** a).type
    print tensor.exp((1 + b ** a)).type
    f = pfunc([b], [],
              updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
                        tensor.exp(1 + b ** a).dimshuffle([2, 0, 3, 1]))],
              mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    # let debugmode catch errors
    f(theano._asarray(numpy.random.rand(6), dtype='float32'))
def test_elemwise_collapse4():
    """ Test when only one input has two broadcastable dimensions, one
    at each end, and we add a scalar"""

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 0, 1, 'x')
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v + 2)
def test_elemwise_collapse6():
    """ Test when all inputs have two broadcastable dimensions at the
    beginning"""

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((True, True, False, False))()
    f = pfunc([b], [a3 + b], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(1, 1, shape[0], shape[1]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    if False:
        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v)
    print "Expected collapse to c contiguous"
def test_elemwise_collapse():
    """ Test when all inputs have one (and the same) broadcastable dimension
    """

    shape = (4, 5, 60)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, True, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 1, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
def test_elemwise1():
    """ Several kinds of elemwise expressions with no broadcasting,
    non power-of-two shape """

    shape = (3, 4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32') + 0.5, 'a')
    b = tensor.fmatrix()

    # let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    # let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    # let debugmode catch any mistakes
    f = pfunc([b], [],
              updates=[(a, a + b * tensor.exp(b ** a))],
              mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
def cmp(a_shp, b_shp):
    a = tcn.shared_constructor(my_rand(*a_shp), 'a')
    b = tensor.fmatrix('b')
    c = tensor.fmatrix('c')

    f = pfunc([b, c], [],
              updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
              mode=mode_with_gpu)
    assert any([node.op == tcn.blas.gpu_gemm_inplace
                for node in f.maker.env.toposort()])

    a0 = a.get_value() * 1.0
    bval = my_rand(*b_shp)
    cval = my_rand(a_shp[0], b_shp[1])
    f(bval, cval)

    assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval),
                          a.get_value())
def speed_elemwise_collapse():
    """ used to time if the collapse of ccontiguous dims is useful """

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, ::2, :, :]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, ::2, :, :]
    v = cuda_ndarray.CudaNdarray(v)
    t1 = time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
    t2 = time.time()
def test_gpualloc_output_to_gpu():
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(a_val)

    b = T.fscalar()
    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
    f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a)) + b,
                            mode=mode_with_gpu)

    print f.maker.env.toposort()
    print f_gpu.maker.env.toposort()
    print f(2)
    print f_gpu(2)

    assert sum([node.op == T.alloc
                for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 1

    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
                          f_gpu(9))
    assert numpy.allclose(f(5), f_gpu(5))
def test_elemwise_collapse2():
    """ Test when only one input has one broadcastable dimension """

    shape = (4, 5, 9)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 5, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    if False:
        for id, n in enumerate(f.maker.env.toposort()):
            print id, n
    # let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
    print "Expected collapse to 3 dimensions"
def speed_elemwise_collapse2():
    """ used to test the speed up of the generalised collapse of
    ccontiguous dims"""

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, :, :, ::2]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, :, :, ::2]
    v = cuda_ndarray.CudaNdarray(v)
    for id, n in enumerate(f.maker.env.toposort()):
        print id, n
    t1 = time.time()
    for i in range(100):
        # let debugmode catch errors
        f(v)
    t2 = time.time()
# Skip test if cuda is not available.
from theano.sandbox import cuda
if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv

# needed as the gpu conv doesn't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')

device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda.use("gpu",
             force=False,
             default_to_move_computation_to_gpu=False,
             move_shared_float32_to_gpu=False,
             enable_cuda=False,
             test_driver=True)
    device_id = theano.sandbox.cuda.use.device_number

cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)


def py_conv_valid_numpy(img, kern):
def test_downsample():
    import random
    shps = [(1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[2]:
                continue
            if ds[1] > shp[3]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512
            # columns in the output tensor.
            if float(shp[3]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                print 'test_downsample', shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)
                a = tcn.shared_constructor(my_rand(*shp), 'a')
                f = pfunc([], ds_op(tensor.as_tensor_variable(a)),
                          mode=mode_with_gpu)
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMax)
                            for node in f.maker.env.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMax)
                            for node in f2.maker.env.toposort()])
                assert numpy.allclose(f(), f2())

                g = pfunc([],
                          tensor.grad(
                              ds_op(tensor.as_tensor_variable(a)).sum(), a),
                          mode=mode_with_gpu)
                g2 = pfunc([],
                           tensor.grad(
                               ds_op(tensor.as_tensor_variable(a)).sum(), a),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMaxGrad)
                            for node in g.maker.env.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMaxGrad)
                            for node in g2.maker.env.toposort()])
                assert numpy.allclose(g(), g2())
def test_pool():
    try:
        if hasattr(mode_with_gpu, 'check_isfinite'):
            mode_with_gpu_check_is_finite_prev = mode_with_gpu.check_isfinite
        if hasattr(mode_without_gpu, 'check_isfinite'):
            mode_without_gpu_check_is_finite_prev = \
                mode_without_gpu.check_isfinite
        mode_with_gpu.check_isfinite = False
        mode_without_gpu.check_isfinite = False
        # (batch, channel, x, y)
        shps = [(1, 1, 2, 2),
                (1, 1, 1, 1),
                (1, 1, 4, 4),
                (1, 2, 2, 2),
                (1, 1, 4, 4),
                (3, 1, 4, 4),
                (1, 5, 4, 4),
                (3, 5, 4, 4),
                (25, 1, 7, 7),
                (1, 1, 12, 12),
                (1, 1, 14, 14),
                (1, 1, 16, 16),
                (1, 1, 18, 18),
                (1, 1, 24, 24),
                (1, 6, 24, 24),
                (10, 1, 24, 24),
                (10, 6, 24, 24),
                (30, 6, 12, 12),
                (30, 2, 24, 24),
                (30, 6, 24, 24),
                (65536, 1, 10, 10),
                # (1, 65536, 10, 10),  # crashes as there are too many channels
                (30, 3, 40, 40),
                ]
        shps = [(channel, x, y, batch) for (batch, channel, x, y) in shps]
        # numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

        for shp in shps:
            for ds in range(1, min(4, shp[2] + 1)):
                # for start in range(shp[2] + 1):
                for start in [0]:
                    for stride in range(1, min(shp[2], ds, 4) + 1):
                        print('test_pool shape=%s, ds=%d, stride=%d start=%d' %
                              (str(shp), ds, stride, start))

                        a = tcn.shared_constructor(my_rand(*shp), 'a')
                        op = MaxPool(ds=ds, stride=stride)
                        f = theano.function([], op(a), mode=mode_with_gpu)
                        assert any([isinstance(node.op, MaxPool)
                                    for node in f.maker.fgraph.toposort()])
                        out = numpy.asarray(f())

                        # Compute the gold version with a Theano graph.
                        gold_out = gold_max_pool_c01b(a, (ds, ds),
                                                      (stride, stride),
                                                      shp[1:3])
                        f2 = theano.function([], gold_out,
                                             mode=mode_without_gpu)
                        assert not any([isinstance(node.op, MaxPool)
                                        for node in f2.maker.fgraph.toposort()])
                        out2 = f2()
                        numpy.testing.assert_allclose(out, out2,
                                                      err_msg=str(out - out2))

                        # grad testing
                        # The code supports the grad only in this case.
                        if shp[0] % 16 != 0:
                            shp2 = list(shp)
                            shp2[0] *= 16
                            # This makes it crash due to not enough memory
                            # on a GPU with 1279M of ram.
                            if numpy.prod(shp2) >= (16 * 10 * 10 * 65536):
                                continue
                            a.set_value(my_rand(*shp2))

                        g = theano.function([],
                                            grad(op(a).sum(), a),
                                            mode=mode_with_gpu)
                        g2 = theano.function([],
                                             grad(gold_out.sum(), a),
                                             mode=mode_without_gpu)
                        assert any([isinstance(node.op, MaxPoolGrad)
                                    for node in g.maker.fgraph.toposort()])
                        assert not any([isinstance(node.op, MaxPoolGrad)
                                        for node in g2.maker.fgraph.toposort()])
                        numpy.testing.assert_allclose(g(), g2(),
                                                      err_msg=str(shp))

                        # Don't call verify_grad. There was a problem with
                        # the test and we already assert that the two versions
                        # are equal. Also, it would be slower to verify that
                        # way than with the comparison.
                        continue
                        theano.tests.unittest_tools.verify_grad(
                            op, [a.get_value()])
    finally:
        if 'mode_with_gpu_check_is_finite_prev' in locals():
            mode_with_gpu.check_isfinite = mode_with_gpu_check_is_finite_prev
        if 'mode_without_gpu_check_is_finite_prev' in locals():
            mode_without_gpu.check_isfinite = \
                mode_without_gpu_check_is_finite_prev
def test_huge_elemwise_fusion():
    """ Test that the GpuElemwise fusion works correctly.

    We check that we fuse one node with part of its inputs in case there
    are too many inputs, which would make it bust the 256 bytes limit.
    """
    shape = (2, 3, 4, 5, 6)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # theano.printing.debugprint(f)
    # for i, node in enumerate(topo):
    #     print >> sys.stdout, i, node
    assert len(topo) == 10
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[7].op.scalar_op, theano.scalar.basic.Sub)
    assert isinstance(topo[8].op.scalar_op, theano.scalar.basic.Composite)
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    # Test the case where we can't put the computation on the gpu: there are
    # too many dimensions to the input to have 2 inputs to the op!
    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    # theano.printing.debugprint(f)
    assert len(topo) == 1
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    def gen(shape):
        return theano._asarray(numpy.random.rand(*shape), dtype='float32')

    max_var = 16  # excluded
    for shape in [(2,),
                  (2, 2),
                  (2, 2, 2),
                  (2, 2, 2, 2),
                  (2, 2, 2, 2, 2),  # 5d
                  (2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
                  ]:
        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
        for use_tan in [True, False]:
            if use_tan:
                vars = [tensor.tanh(x) for x in vals]
            else:
                vars = vals
            for nb_var in range(1, max_var):
                out = reduce(lambda x, y: x + y, vars[:nb_var])
                if not isinstance(out.type, CudaNdarrayType):
                    out = cuda.gpu_from_host(out)
                f = pfunc([], [out], mode=mode_with_gpu)
                topo = f.maker.fgraph.toposort()
                # print shape, nb_var, use_tan, len(topo)
                assert (sum([isinstance(node.op, cuda.GpuElemwise)
                             for node in topo]) == len(topo) or
                        (nb_var == 1 and use_tan == False))
                assert sum([isinstance(node.op, tensor.Elemwise)
                            for node in topo]) == 0
                # let debugmode catch errors
                f()
def test_shared_cudandarray():
    '''Test that we can create a CudaNdarraySharedVariable from a
    CudaNdarray'''
    a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2, 3)))
    assert isinstance(a.type, tcn.CudaNdarrayType)
def test_gpujoin_no_rebroadcast():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    a = tcn.shared_constructor(_a)
    f = theano.function([], T.join(1, a))
    l = f.maker.env.toposort()
    assert not any([isinstance(x.op, T.Rebroadcast) for x in l])
def test_shared(self):
    # NB: we also test higher order tensors at the same time.
    y = cuda.CudaNdarray.zeros((1, 2, 3, 4))
    x = cuda.shared_constructor(y)
    assert y.size == theano.function([], x.size)()
def test_memory():
    """
    We test that we do not keep references to memory between Theano
    function calls and during Theano compilation.

    The origin of this code comes from Aaron Vandenoord and Sander
    Dieleman. I have their authorization to put this in Theano with
    the Theano license.

    note::
        This test can fail if there are other processes running on the gpu.
    """
    shapes = (200, 100)
    # more_alloc1 was different for each dtype in the past.
    # more_alloc2 is still currently not the same for both dtypes.
    # When dtype is float32, the computation is done on the gpu.
    # This inserts constants on the gpu during compilation,
    # which raises the number of allocations.
    # When dtype is float64, only the shared variable is on the gpu and it is
    # transferred to the cpu for computation. So no extra alloc after
    # compilation.
    # more_alloc1 is after the first compilation, more_alloc2 after the
    # second.
    for dtype, more_alloc1, more_alloc2 in [("float32", 0, 3),
                                            ("float64", 0, 0)]:
        print(dtype)
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)

        some_vector = tensor.vector('some_vector', dtype=dtype)
        some_matrix = some_vector.reshape(shapes)

        mem1 = freemem()
        print("Before shared variable", mem1)
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
        print("Shared took ",
              np.prod(variables.get_value(
                  borrow=True,
                  return_internal_type=True).shape) * 4 / 1024,
              "kB")

        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        mem2_2 = freemem(extra_alloc=more_alloc2)
        obj = theano.function([some_vector], derp, mode=mode_with_gpu)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3, dtype)

        grad_derp = tensor.grad(derp, some_vector)
        grad = theano.function([some_vector], grad_derp, mode=mode_with_gpu)
        mem4 = freemem()
        print("After function compilation 2", mem4)
        assert mem2_2 == mem4, (mem2_2, mem4, dtype)

        for i in range(3):
            obj(test_params)
            print("After function evaluation 1", freemem())
            assert mem2_2 == freemem(), (mem2_2, freemem())
            grad(test_params)
            print("After function evaluation 2", freemem())
            assert mem2_2 == freemem(), (mem2_2, freemem())

        del obj
        # print "After deleting function 1", freemem()
        # assert mem2 == freemem(), (mem2, freemem())

        del grad
        print("After deleting function 2", freemem())
        assert mem2 == freemem(), (mem2, freemem())

        del derp, variables, grad_derp
        print("After deleting shared variable and ref to it", freemem())
        assert mem1 == freemem(), (mem1, freemem())
def test_downsample():
    shps = [(1, 12),
            (1, 1, 12),
            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            (65536, 1, 10, 10),
            (1, 65536, 10, 10),
            ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[-2]:
                continue
            if ds[1] > shp[-1]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns
            # in the output tensor.
            if float(shp[-1]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                # print 'test_downsample', shp, ds, ignore_border
                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)
                a = tcn.shared_constructor(my_rand(*shp), 'a')
                f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
                          mode=mode_with_gpu.excluding('cudnn'))
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMax)
                            for node in f.maker.fgraph.toposort()])
                assert any([isinstance(node.op, Pool)
                            for node in f2.maker.fgraph.toposort()])
                assert numpy.allclose(f(), f2())

                # The grad is too slow on GT220 GPU.
                # This causes the computer to freeze...
                # Remove this when it gets optimized enough.
                # This only bypasses the last 2 checks.
                # Those tests were passing in all modes on a GTX470.
                if shp[0] > 30000 or shp[1] > 30000:
                    continue

                g = pfunc(
                    [],
                    tensor.grad(
                        ds_op(tensor.as_tensor_variable(a), ds).sum(), a),
                    mode=mode_with_gpu.excluding('cudnn'))
                g2 = pfunc(
                    [],
                    tensor.grad(
                        ds_op(tensor.as_tensor_variable(a), ds).sum(), a),
                    mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMaxGrad)
                            for node in g.maker.fgraph.toposort()])
                assert any([isinstance(node.op, PoolGrad)
                            for node in g2.maker.fgraph.toposort()])
                assert numpy.allclose(g(), g2()), shp

                ggf = gradient.Lop(tensor.grad((ds_op(
                    tensor.as_tensor_variable(a), ds)**2).sum(), a), a, a)

                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False
                gg = pfunc([], ggf, mode=gpu_mode)
                gg2 = pfunc([], ggf, mode=ref_mode)

                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMaxGradGrad)
                            for node in gg.maker.fgraph.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMaxGradGrad)
                            for node in gg2.maker.fgraph.toposort()])
                assert numpy.allclose(gg(), gg2()), shp
def test_huge_elemwise_fusion():
    """ Test that the GpuElemwise fusion works correctly.

    We check that we fuse one node with part of its inputs in case there
    are too many inputs, which would make it bust the 256 bytes limit.
    """
    shape = (2, 3, 4, 5, 6)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shape))
    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()[
        'gpu_ptr_size']
    if gpu_ptr_size == 8:
        nb_in = 7
        len_topo = 10
    elif gpu_ptr_size == 4:
        nb_in = 8
        len_topo = 11
    else:
        raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)

    vars = [tensor.tanh(ttype) for x in range(nb_in)]
    f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == len_topo
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(*[gen() for i in range(nb_in)])

    # Test the case where we can't put the computation on the gpu: there are
    # too many dimensions to the input to have 2 inputs to the op!
    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False,) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    def gen(shape):
        return theano._asarray(numpy.random.rand(*shape), dtype='float32')

    max_var = 16  # excluded
    for shape in [(2,),
                  (2, 2),
                  (2, 2, 2),
                  (2, 2, 2, 2),
                  (2, 2, 2, 2, 2),  # 5d
                  (2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 2, 2, 2, 2, 2),
                  # (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
                  ]:
        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
        for use_tan in [True, False]:
            if use_tan:
                vars = [tensor.tanh(x) for x in vals]
            else:
                vars = vals
            for nb_var in range(1, max_var):
                out = reduce(lambda x, y: x + y, vars[:nb_var])
                if not isinstance(out.type, CudaNdarrayType):
                    out = cuda.gpu_from_host(out)
                f = pfunc([], [out], mode=mode_with_gpu)
                topo = f.maker.fgraph.toposort()
                # print shape, nb_var, use_tan, len(topo)
                assert (sum([isinstance(node.op, cuda.GpuElemwise)
                             for node in topo]) == len(topo) or
                        (nb_var == 1 and use_tan is False))
                assert sum([isinstance(node.op, tensor.Elemwise)
                            for node in topo]) == 0
                # let debugmode catch errors
                f()
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(c_v)
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def shared(val):
    try:
        return tcn.shared_constructor(val)
    except TypeError:
        return theano.shared(val)