def test_local_gpu_elemwise(): """ Test local_gpu_elemwise when there is a dtype upcastable to float32 """ a = tensor.bmatrix() b = tensor.fmatrix() c = tensor.fmatrix() a_v = (numpy.random.rand(4, 5) * 10).astype("int8") b_v = (numpy.random.rand(4, 5) * 10).astype("float32") c_v = (numpy.random.rand(4, 5) * 10).astype("float32") # Due to optimization order, this composite is created when all # the op are on the gpu. f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) # Now test with the composite already on the cpu before we move it # to the gpu a_s = theano.scalar.int8() b_s = theano.scalar.float32() c_s = theano.scalar.float32() out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s]) out_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) return # Not yet implemeted # Test multiple output a_s = theano.scalar.float32() a = tensor.fmatrix() from theano.scalar.basic import identity out_s = theano.scalar.Composite([a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)]) outs_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 out = f(a_v, b_v, c_v) utt.assert_allclose(out[0], a_v) utt.assert_allclose(out[1], c_v) utt.assert_allclose(out[2], b_v) # Test multiple output out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s]) outs_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 out = f(a_v, b_v, c_v) utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[1], a_v * c_v) # Test non-contiguous input c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype="float32")) f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu) out = f(a_v, b_v) utt.assert_allclose(out[0], a_v[::2] + b_v[::2]) utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def test_local_gpu_elemwise_0(): """ Test local_gpu_elemwise_0 when there is a dtype upcastable to float32 """ a = tensor.bmatrix() b = tensor.fmatrix() c = tensor.fmatrix() a_v = (numpy.random.rand(4, 5) * 10).astype("int8") b_v = (numpy.random.rand(4, 5) * 10).astype("float32") c_v = (numpy.random.rand(4, 5) * 10).astype("float32") # Due to optimization order, this composite is created when all # the op are on the gpu. f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1 utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) # Now test with the composite already on the cpu before we move it # to the gpu a_s = theano.scalar.int8() b_s = theano.scalar.float32() c_s = theano.scalar.float32() out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s]) out_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1 utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) # Test multiple output a_s = theano.scalar.float32() a = tensor.fmatrix() from theano.scalar.basic import identity out_s = theano.scalar.Composite( [a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)]) outs_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0 out = f(a_v, b_v, c_v) utt.assert_allclose(out[0], a_v) utt.assert_allclose(out[1], c_v) utt.assert_allclose(out[2], b_v) # Test multiple output out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s]) outs_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1 assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0 out = f(a_v, b_v, c_v) utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[1], a_v * c_v) # Test non-contiguous input c = cuda.shared_constructor(c_v) f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu) out = f(a_v, b_v) utt.assert_allclose(out[0], a_v[::2] + b_v[::2]) utt.assert_allclose(out[1], a_v[::2] * c_v[::2])