Example #1
def test_opt_gpujoin_onlyajoin():
    # from a bug in normal sampling
    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float32')
    a = cuda.shared_constructor(_a)
    b = cuda.shared_constructor(_b)

    c = tensor.join(1, a, b)

    f = theano.function([], c, mode=mode_with_gpu)

    f()

    graph_nodes = f.maker.fgraph.toposort()

    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
    assert isinstance(graph_nodes[-2].op, cuda.GpuJoin)

    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))

    # test mixed dtype
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float64')
    b = theano.tensor.constant(_b)

    c = tensor.join(1, a, b)

    f = theano.function([], c, mode=mode_with_gpu)

    f()

    graph_nodes = f.maker.fgraph.toposort()
    assert isinstance(graph_nodes[-1].op, theano.tensor.Join)

    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
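
The asserts above illustrate a pattern used throughout these examples: compile a function, then walk the optimized graph to check which ops it contains. A minimal helper sketch of that pattern (count_ops is a hypothetical name; it assumes f is a compiled theano.function):

def count_ops(f, op_class):
    # count nodes whose Op is an instance of op_class in the optimized graph
    return sum(isinstance(node.op, op_class)
               for node in f.maker.fgraph.toposort())

# usage sketch, mirroring the asserts above:
# assert count_ops(f, cuda.GpuJoin) == 1
# assert count_ops(f, cuda.HostFromGpu) == 1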
Example #2
def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
    # from a bug in gpu normal sampling
    _a = numpy.asarray([1, 2, 3, 4], dtype='float32')
    _b = numpy.asarray([5, 6, 7, 8], dtype='float32')
    a = cuda.shared_constructor(_a)
    b = cuda.shared_constructor(_b)

    a_prime = tensor.cos(a)
    b_prime = tensor.sin(b)

    c = tensor.join(0, a_prime, b_prime)

    d = c[:-1]

    f = theano.function([], d, mode=mode_with_gpu)

    graph_nodes = f.maker.fgraph.toposort()

    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
    assert isinstance(graph_nodes[-2].op, cuda.GpuSubtensor)
    assert isinstance(graph_nodes[-3].op, cuda.GpuJoin)

    concat = numpy.concatenate([numpy.cos(_a), numpy.sin(_b)], axis=0)
    concat = concat[:-1]

    assert numpy.allclose(numpy.asarray(f()), concat)
Example #3
def test_elemwise_composite_support_code():
    """
    This was generating an error at compile time.
    Commit 3d1690fa346103594356ecaeceeb2c6757b45d2b fixed that.
    """
    X = tcn.shared_constructor(value=numpy.zeros((100, 10), dtype="float32"),
                               name='X')
    W = tcn.shared_constructor(value=numpy.zeros((10, 1), dtype="float32"),
                               name='W')
    U = T.dot(X, W)
    Y = tcn.shared_constructor(value=numpy.zeros((100, 1), dtype="float32"),
                               name='Y')
    P = T.exp(-(Y - U) ** 2)
    epsilon = numpy.asarray(0.001, dtype="float32")
    NLL = -T.mean(T.log(P + epsilon))  # SupportCodeError
    G = T.grad(NLL, wrt=[W])

    backup = theano.config.warn.identify_1pexp_bug
    theano.config.warn.identify_1pexp_bug = False
    try:
        f_grad = theano.function(inputs=[], outputs=G, mode=mode_with_gpu)
    finally:
        theano.config.warn.identify_1pexp_bug = backup
    f_grad()

    topo = f_grad.maker.env.toposort()
    assert sum([isinstance(node.op, T.Elemwise) for node in topo]) == 1
    assert sum([isinstance(node.op, tcn.GpuElemwise) for node in topo]) == 1
Example #4
def test_elemwise2():
    """ Several kinds of elemwise expressions with dimension permutations """
    rng = numpy.random.RandomState(int(time.time()))
    shape = (3, 5)
    for pattern in [(0, 1), (1, 0)]:
        a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
                                                   dtype='float32'), name=None)
        b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
        f = pfunc([b], [], updates=[(a, (a + b).dimshuffle(pattern))],
                  mode=mode_with_gpu)
        has_elemwise = False
        for i, node in enumerate(f.maker.env.toposort()):
            has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
        assert not has_elemwise
        #let debugmode catch errors
        f(theano._asarray(rng.rand(*shape), dtype='float32') * .3)

    shape = (3, 4, 5, 6)
    a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
                                               dtype='float32'), 'a')
    b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
    f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
        tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    #let debugmode catch errors
    f(theano._asarray(rng.rand(*shape), dtype='float32'))
Example #5
def test_pool():
    #(batch, channel, x, y)
    shps = [(1, 1, 2, 2),
             ]
    shps = [(channel, x, y, batch) for (batch, channel, x, y) in shps]

    #numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)
    warnings.warn("TODO: Razvan needs to finish this")
    for shp in shps:
        for ds in range(1, min(4, shp[2] + 1)):
            for start in [0]:
                for stride in range(1, min(shp[2], ds, 4) + 1):
                    #print 'test_pool shape=%s, ds=%d, stride=%d start=%d' % (
                    #    str(shp), ds, stride, start)

                    va = my_rand(*shp)
                    tva = va.flatten()
                    #print 'va', tva, tva.max(), tva.argmax()

                    vb = my_rand(*shp)
                    tvb = vb.flatten()
                    #print 'vb', tvb, tvb.max(), tvb.argmax(),\
                    #                tvb[tva.argmax()]
                    a = tcn.shared_constructor(va, 'a')
                    b = tcn.shared_constructor(vb, 'b')
                    op = MaxPool(ds=ds, stride=stride)
                    v = op(a)
                    rval = theano.tensor.Rop(v, a, b)
                    f = theano.function([], rval,
                                        mode=mode_with_gpu)
                    print f.maker.fgraph.toposort()
                    #assert any([isinstance(node.op, MaxPool)
                    #   for node in f.maker.fgraph.toposort()])
                    out = numpy.asarray(f())
Example #6
def test_gpujoin_preserves_broadcasting():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)

    # [0,0] : the two original dims were non-broadcastable
    # [1,x,0]: new order and broadcastability
    gpu_dimshuffle = GpuDimShuffle([0, 0], [1, "x", 0])

    a_shuffled = gpu_dimshuffle(a)
    b_shuffled = gpu_dimshuffle(b)

    c = gpu_join(0, a_shuffled, b_shuffled)

    assert c.type.broadcastable == (False, True, False)

    f = theano.function([], c, mode=mode_with_gpu)

    res = f()

    a_reshaped = numpy.asarray([[[1, 3]], [[2, 4]]], dtype="float32")
    b_reshaped = numpy.asarray([[[5, 8]], [[6, 9]], [[7, 10]]], dtype="float32")

    concat = numpy.concatenate([a_reshaped, b_reshaped], axis=0)

    assert numpy.all(res == concat)
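
A small numpy-only sketch of what the (1, 'x', 0) dimshuffle above computes, matching a_reshaped in the test (plain numpy, no GPU required):

_a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
# swap the two axes, then insert a length-1 (broadcastable) axis in the middle
a_shuffled_np = _a.T[:, None, :]
assert a_shuffled_np.shape == (2, 1, 2)
assert numpy.all(a_shuffled_np ==
                 numpy.asarray([[[1, 3]], [[2, 4]]], dtype="float32"))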
Example #7
    def cmp(a_shp, b_shp):
        a0 = my_rand(*a_shp)
        a = tcn.shared_constructor(a0, 'a')
        cval = my_rand(a_shp[0], b_shp[1])
        c = tcn.shared_constructor(cval.copy(), 'c')

        b = tcn.fmatrix('b')
        b2 = tcn.fmatrix('b2')

        f = pfunc(
                [b, b2],
                [tensor.dot(a, b2) + c],
                updates=[(a, tensor.dot(a, b) + c)],
                mode=mode_with_gpu)

        assert any([node.op == tcn.blas.gpu_gemm_no_inplace
            for node in f.maker.fgraph.toposort()])
        bval = my_rand(*b_shp)
        bval2 = my_rand(*b_shp)
        rval = f(bval, bval2)

        assert numpy.allclose(numpy.dot(a0, bval) + cval, a.get_value())
        assert numpy.allclose(numpy.dot(a0, bval2) + cval, rval)

        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
                a.get_value(borrow=True,
                    return_internal_type=True)[::-1, ::-1],
                borrow=True)
        f(bval, bval2)
Example #8
def test_nvidia_driver2():
    """ Test that the gpu device is initialized by theano when
        we manually make a shared variable on the gpu.

        The driver should always be tested during theano initialization
        of the gpu device
    """
    a = numpy.random.rand(10000).astype("float32")
    cuda.shared_constructor(a)
    assert theano.sandbox.cuda.use.device_number is not None
Example #9
def test_gpujoin_twomatrices_joincolumns():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)

    c = gpu_join(1, a, b)

    f = theano.function([], c)

    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
Example #10
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10,
             n_train=100):

    if config.mode == 'DEBUG_MODE':
        n_train = 1

    if use_gpu:
        w = tcn.shared_constructor(0.01 * (my_rand(n_in, n_hid) - 0.5), 'w')
        b = tcn.shared_constructor(my_zeros(n_hid), 'b')
        v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'v')
        c = tcn.shared_constructor(my_zeros(n_out), 'c')
    else:
        w = shared(0.01 * (my_rand(n_in, n_hid) - 0.5), 'w')
        b = shared(my_zeros(n_hid), 'b')
        v = shared(my_zeros((n_hid, n_out)), 'v')
        c = shared(my_zeros(n_out), 'c')

    x = tensor.fmatrix('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    hid = tensor.tanh(tensor.dot(x, w) + b)
    out = tensor.tanh(tensor.dot(hid, v) + c)
    loss = tensor.sum(0.5 * (out - y) ** 2 * lr)
    if 0:
        print('loss type', loss.type)

    params = [w, b, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    # print 'building pfunc ...'
    train = pfunc([x, y, lr], [loss], mode=mode,
                  updates=[(p, p - g) for p, g in izip(params, gparams)])

    if 0:
        for i, n in enumerate(train.maker.fgraph.toposort()):
            print(i, n)

    xval = my_rand(n_batch, n_in)
    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    t0 = time.time()
    rval = []
    for i in xrange(n_train):
        rval.append(train(xval, yval, lr))
    dt = time.time() - t0

    print_mode(mode)
    return numpy.asarray(rval), dt
Example #11
def test_gpujoin_twomatrices_badshapes():
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)

    # try to join on dimension 0 where they don't agree (2!=3)
    c = gpu_join(0, a, b)

    f = theano.function([], c)

    try:
        f()
        assert False
    except ValueError:
        assert True
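
The same expected-failure check can be written with assert_raises, as Example #14 does; a hedged sketch, assuming assert_raises is importable (here taken from numpy.testing) and that the names used above are in scope:

from numpy.testing import assert_raises

def test_gpujoin_twomatrices_badshapes_v2():
    # hypothetical variant of the test above
    _a = numpy.asarray([[1, 2], [3, 4]], dtype="float32")
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype="float32")
    a = tcn.shared_constructor(_a)
    b = tcn.shared_constructor(_b)
    # joining on axis 0 must fail: the shapes disagree on axis 1 (2 != 3)
    f = theano.function([], gpu_join(0, a, b))
    assert_raises(ValueError, f)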
Example #12
    def cmp(a_shp, b_shp):
        a0 = my_rand(*a_shp)
        a = tcn.shared_constructor(a0, 'a')

        b = tensor.fmatrix('b')
        c = tensor.fmatrix('c')

        f = pfunc([b, c], [], updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
                mode=mode_with_gpu)
        assert any([node.op == tcn.blas.gpu_gemm_inplace
            for node in f.maker.env.toposort()])

        bval = my_rand(*b_shp)
        cval = my_rand(a_shp[0], b_shp[1])
        f(bval, cval)

        assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval),
                a.get_value())

        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
                a.get_value(borrow=True,
                    return_internal_type=True)[::-1, ::-1],
                borrow=True)
        f(bval, cval)
Example #13
def test_downsample():
    shps = [
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
    ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[2]:
                continue
            if ds[1] > shp[3]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns
            # in the output tensor.
            if float(shp[3]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                print "test_downsample", shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), "a")
                f = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_with_gpu)
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_without_gpu)
                assert any([isinstance(node.op, tcn.blas.GpuDownsampleFactorMax) for node in f.maker.env.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMax) for node in f2.maker.env.toposort()])
                assert numpy.allclose(f(), f2())

                g = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), a), mode=mode_with_gpu)
                g2 = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), a), mode=mode_without_gpu)
                assert any(
                    [isinstance(node.op, tcn.blas.GpuDownsampleFactorMaxGrad) for node in g.maker.env.toposort()]
                )
                assert any([isinstance(node.op, DownsampleFactorMaxGrad) for node in g2.maker.env.toposort()])
                assert numpy.allclose(g(), g2())
Example #14
def test_local_assert_no_cpu_op():
    numpy.random.seed(1)
    m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32")
    ms = cuda.shared_constructor(m, name="m_shared")
    out = theano.tensor.tanh(ms).dot(ms.T)

    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_0")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_1")

    old = config.assert_no_cpu_op
    old2 = config.on_opt_error
    # If the flag is raise
    try:
        config.assert_no_cpu_op = 'raise'
        config.on_opt_error = 'ignore'

        assert_raises(AssertionError, theano.function,
                        [], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
        config.on_opt_error = old2

    # If the flag is ignore
    try:
        config.assert_no_cpu_op = 'ignore'
        theano.function([], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
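
The save/override/restore-in-finally dance on theano.config above is common enough that it can be wrapped in a small context manager. A minimal sketch (temp_config is a hypothetical helper, not part of Theano's API):

from contextlib import contextmanager

@contextmanager
def temp_config(attr, value):
    # temporarily override one theano.config attribute, restoring the old
    # value even if the body raises
    old = getattr(config, attr)
    setattr(config, attr, value)
    try:
        yield
    finally:
        setattr(config, attr, old)

# usage sketch:
# with temp_config('assert_no_cpu_op', 'raise'):
#     assert_raises(AssertionError, theano.function,
#                   [], out, mode=mode_local_assert)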
Example #15
def test_elemwise1():
    """ Several kinds of elemwise expressions with no broadcasting,
    non power-of-two shape """

    shape = (3, 4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32') + 0.5, 'a')
    b = tensor.fmatrix()

    #let debugmode catch any mistakes
    print >> sys.stdout, "STARTING FUNCTION 1"
    f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu)
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    print >> sys.stdout, "STARTING FUNCTION 2"
    #let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu)
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    print >> sys.stdout, "STARTING FUNCTION 3"
    #let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, a + b * tensor.exp(b ** a))],
              mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
Example #16
def test_memory_lazy():
    """As test_memory, but with the ifelse op.

    We need to test it because, with the ifelse op, the [c]vm can leave some
    ops in the graph unexecuted. This messes with the [c]vm gc
    implementation.
    """
    shapes = (50, 100)
    # more_alloc1 is not the same for both dtypes.
    # When dtype is float32, the computation is done on the gpu.
    # This inserts constants on the gpu during compilation,
    # which raises the number of allocations.
    # When dtype is float64, only the shared variable is on the gpu and it is
    # transferred to the cpu for computation, so there is no extra allocation
    # after compilation.
    # more_alloc1 is the extra allocation count after the first compilation.
    for dtype, more_alloc1 in [("float32", 1),
                               ("float64", 0)]:
        print(dtype)
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)

        some_vector = tensor.vector('some_vector', dtype=dtype)
        some_matrix = some_vector.reshape(shapes)
        branch_select = tensor.iscalar()

        mem1 = freemem()
        print("Before shared variable", mem1)
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
        derp = ifelse.IfElse(1)(branch_select,
                                derp, some_matrix[:shapes[0]].sum())
        derp += 1
        print("Shared took ", np.prod(variables.get_value(
                borrow=True,
                return_internal_type=True).shape) * 4 / 1024, "kB")

        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        obj = theano.function([some_vector, branch_select], derp,
                              mode=mode_with_gpu)
        #theano.printing.debugprint(obj, print_type=True)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3)

        for i in range(3):
            obj(test_params, 1)
            print("After function evaluation branch true", freemem())
            assert mem2_1 == freemem(), (mem2_1, freemem())
            obj(test_params, 0)
            print("After function evaluation branch false", freemem())
            assert mem2_1 == freemem(), (mem2_1, freemem())

        del obj
        print("After deleting function 1", freemem())
        assert mem2 == freemem(), (mem2, freemem())

        del derp, variables
        print("After deleting shared variable and ref to it", freemem())
        assert mem1 == freemem(), (mem1, freemem())
Example #17
 def shared(val):
     # If we don't put shared on the GPU, we won't be able to test
     # the no inplace version as the added transfer will make them inplace.
     try:
         return tcn.shared_constructor(val)
     except TypeError:
         return theano.shared(val)
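
A usage sketch for the fallback above, assuming (as the try/except implies) that tcn.shared_constructor raises TypeError for values it cannot place on the GPU, such as float64 arrays:

s_gpu = shared(numpy.zeros(4, dtype='float32'))  # expected: GPU-backed shared variable
s_cpu = shared(numpy.zeros(4, dtype='float64'))  # expected: falls back to theano.shared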
Example #18
def test_gpuspecifyshape():
    x = cuda.shared_constructor(numpy.ones(3, dtype='float32'), 'x')
    m = theano.tensor.specify_shape(x + numpy.float32(1), (3,))
    f = theano.function([], updates=[(x, m * numpy.float32(2))],
                        mode=mode_with_gpu)
    l = f.maker.fgraph.toposort()
    assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l])
Example #19
def test_opt_gpujoin_joinvectors_negativeaxes():
    """
    Test that negative axis concatenation works as expected.
    """

    # Test case for one-dimensional vectors
    rng = numpy.random.RandomState(22)
    x1 = rng.rand(5)
    x2 = rng.rand(10)
    t1 = cuda.shared_constructor(numpy.asarray(x1, "float32"))
    t2 = cuda.shared_constructor(numpy.asarray(x2, "float32"))

    t = tensor.concatenate([t1, t2], axis=-1)
    f = theano.function(inputs=[], outputs=t)

    assert(numpy.allclose(f(), numpy.concatenate([x1, x2], axis=-1)))

    # Test case for two-dimensional vectors
    x1 = rng.rand(5, 10)
    x2 = rng.rand(10, 10)
    t1 = cuda.shared_constructor(numpy.asarray(x1, "float32"))
    t2 = cuda.shared_constructor(numpy.asarray(x2, "float32"))

    t = tensor.concatenate([t1, t2], axis=-2)
    f = theano.function(inputs=[], outputs=t)

    assert(numpy.allclose(f(), numpy.concatenate([x1, x2], axis=-2)))

    # Now check that a value error is raised when vectors don't match
    # along the negative concatenation axis
    try:
        t = tensor.concatenate([t1, t2], axis=-1)
        f = theano.function(inputs=[], outputs=t)
        f()
        assert(False)
    except ValueError:
        assert(True)

    # Finally check that an IndexError is raised when the negative
    # axis is larger in absolute value than the smallest number of dims
    try:
        t = tensor.concatenate([t1, t2], axis=-3)
        f = theano.function(inputs=[], outputs=t)
        f()
        assert(False)
    except IndexError:
        assert(True)
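
A numpy-only reminder of the axis mapping this test relies on (negative axes count from the end, so on 2-D inputs axis=-1 means axis=1 and axis=-2 means axis=0):

x1 = numpy.ones((5, 10), dtype="float32")
x2 = numpy.ones((10, 10), dtype="float32")
# axis=-2 on 2-D arrays is axis=0: row counts add up, column counts must match
assert numpy.concatenate([x1, x2], axis=-2).shape == (15, 10)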
Example #20
    def cmp(a_shp, b_shp):
        a0 = numpy.random.uniform(-0.4, 0.4,
                                  a_shp).astype('float32')
        a = cuda.shared_constructor(a0, 'a')

        b0 = numpy.random.uniform(-0.4, 0.4,
                                  b_shp).astype('float32')
        b = cuda.shared_constructor(b0, 'b')

        f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)

        assert isinstance(f.maker.fgraph.toposort()[1].inputs[0].owner.op,
                          cuda.cula.GpuSolve)

        assert cuda.opt.local_gpu_solve.transform(
            tensor.slinalg.solve(a, b).owner)
        out = f()
        assert numpy.allclose(numpy.dot(a0, out), b0)
Example #21
def test_memory():
    """
    We test that we do not keep links to memory between Theano function calls
    and during Theano compilation.

    The origin of this code comes from Aaron Vandenoord and Sander Dieleman.
    I have their authorisation to put this in Theano under the Theano license.

    note::
        This test can fail if there are other processes running on the gpu.
    """
    shapes = (6000, 5000)
    test_params = np.asarray(np.random.randn(np.prod(shapes)), 'float32')

    some_vector = tensor.vector('some_vector')
    some_matrix = some_vector.reshape(shapes)

    mem1 = freemem()
    print "Before shared variable", mem1
    variables = cuda.shared_constructor(np.ones((shapes[1],), dtype='float32'))
    derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
    print "Shared took ", np.prod(variables.get_value(
            borrow=True,
            return_internal_type=True).shape) * 4 / 1024, "kB"

    mem2 = freemem()
    print "Before compilation", mem2
    obj = theano.function([some_vector], derp, mode=mode_with_gpu)
    mem3 = freemem()
    print "After function compilation 1", mem3
    assert mem2 == mem3, (mem2, mem3)

    grad_derp = tensor.grad(derp, some_vector)
    grad = theano.function([some_vector], grad_derp, mode=mode_with_gpu)
    mem4 = freemem()
    print "After function compilation 2", mem4
    assert mem2 == mem4, (mem2, mem4)

    for i in range(3):
        obj(test_params)
        print "After function evaluation 1", freemem()
        assert mem2 == freemem(), (mem2, freemem())
        grad(test_params)
        print "After function evaluation 2", freemem()
        assert mem2 == freemem(), (mem2, freemem())

    del obj
    print "After deleting function 1", freemem()
    assert mem2 == freemem(), (mem2, freemem())

    del grad
    print "After deleting function 2", freemem()
    assert mem2 == freemem(), (mem2, freemem())

    del derp, variables, grad_derp
    print "After deleting shared variable and ref to it", freemem()
    assert mem1 == freemem(), (mem1, freemem())
Example #22
    def cmp(a_shp, b_shp):
        a = tcn.shared_constructor(my_rand(*a_shp), 'a')
        cval = my_rand(a_shp[0], b_shp[1])
        c = tcn.shared_constructor(cval.copy(), 'c')

        b = tcn.fmatrix('b')
        b2 = tcn.fmatrix('b2')

        f = pfunc([b,b2], [tensor.dot(a,b2) + c], updates=[(a, tensor.dot(a,b) + c)], mode=mode_with_gpu)

        a0 = a.get_value() * 1.0
        assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
        bval = my_rand(*b_shp)
        bval2 = my_rand(*b_shp)
        rval = f(bval,bval2)

        assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
        assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
Example #23
    def cmp(a_shp, b_shp):
        a0 = numpy.random.rand(*a_shp).astype('float32')
        a = cuda.shared_constructor(a0, 'a')
        b0 = numpy.random.rand(*b_shp).astype('float32')
        b = cuda.shared_constructor(b0, 'b')

        f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
        assert cuda.opt.local_gpu_dot_to_dot22.transform(
            tensor.dot(a, b).owner)
        out = f()

        assert numpy.allclose(numpy.dot(a0, b0), out)

        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
            a.get_value(borrow=True,
                        return_internal_type=True)[::-1],
            borrow=True)
        f()
Example #24
def cmp_sigmoids(shape):
    def numpy_sigmoid(input):
        rval = 1.0 / (1.0 + numpy.exp(-input))
        return rval
    sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
    shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
    times = compare_fns(
            dict( numpy=numpy_sigmoid
                , theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput)))
                , theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + tensor.exp(-shared_input)))])
                ),
            input=shared_input.value)
    showtimes(times)
Example #25
def speed_adv_sub1():
    data = numpy.random.rand(50000, 21).astype("float32")
    var = tcn.shared_constructor(data)
    vec = tensor.lvector()
    for batch_size in [100, 1000, 10000, 100000]:
        idx = numpy.random.randint(0, 50000, batch_size)
        mode_with_gpu = theano.compile.ProfileMode().including('gpu')
        f = theano.function([vec], var[vec], mode=mode_with_gpu)
        for i in range(100):
            f(idx)
        print "ProfileMode with batch size", batch_size
        mode_with_gpu.print_summary()
Example #26
    def cmp(a_shp, b_shp):
        a = tcn.shared_constructor(my_rand(*a_shp), 'a')

        b = tensor.fmatrix()

        f = pfunc([b], [], updates=[(a, tensor.dot(a,b))], mode=mode_with_gpu)

        a0 = a.get_value() * 1.0
        bval = my_rand(*b_shp)
        f(bval)

        assert numpy.allclose(numpy.dot(a0, bval), a.get_value())
Example #27
def test_gpualloc_input_on_gpu():
    a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32')
    a = tcn.shared_constructor(a_val)

    b = T.fscalar()
    f = theano.function([b], T.ones_like(a)+b, mode=mode_without_gpu)
    f_gpu = theano.function([b], T.ones_like(a)+b, mode=mode_with_gpu)

    assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==1
    assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==1

    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
    assert numpy.allclose(f(5),f_gpu(5))
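
Several of these examples compile the same graph with and without the GPU and compare the results. A minimal helper sketch of that pattern (hypothetical name; it assumes mode_with_gpu and mode_without_gpu are defined as in these tests):

def check_cpu_gpu_match(inputs, output, *values):
    # compile the same graph on CPU and GPU and check both give the same result
    f_cpu = theano.function(inputs, output, mode=mode_without_gpu)
    f_gpu = theano.function(inputs, output, mode=mode_with_gpu)
    assert numpy.allclose(f_cpu(*values), f_gpu(*values))

# usage sketch, mirroring the test above:
# check_cpu_gpu_match([b], T.ones_like(a) + b, 5)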
Example #28
def test_elemwise_empty():
    #test with 0 element
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0,0), dtype='float32'), 'a')

    b = tensor.fmatrix()

    f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu)
    f2 = pfunc([b], [], updates=[(a, a+b)], mode=mode_without_gpu)

    a0 = a.get_value() * 1.0
    f(numpy.ones((0,0), dtype='float32'))

    assert numpy.all(a0 + 1.0 == a.get_value())
Example #29
def test_elemwise_fusion():
    """ Test the the GpuElemwise fusion work correctly"""
    shape = (3,4)
    a = cuda.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fmatrix()
    c = tensor.fmatrix()
    f = pfunc([b,c], [a+b+c], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    for i, node in enumerate(topo):
        print >> sys.stdout, i, node
    assert len(topo)==4
    assert isinstance(topo[2].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
Example #30
def test_elemwise4():
    """ Test that two vectors can be broadcast to form an outer product (by performing rank-1 matrix update"""

    shape = (3,4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
    f = pfunc([b,c], [], updates=[(a, (a+b.dimshuffle('x', 0)*c.dimshuffle(0, 'x')))], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(4), dtype='float32'), theano._asarray(numpy.random.rand(3), dtype='float32'))
Example #31
    def cmp(a_shp, b_shp):
        a0 = my_rand(*a_shp)
        a = tcn.shared_constructor(a0, 'a')

        b = tensor.fmatrix()

        f = pfunc([b], [], updates=[(a, tensor.dot(a, b))], mode=mode_with_gpu)

        bval = my_rand(*b_shp)
        f(bval)

        assert numpy.allclose(numpy.dot(a0, bval), a.get_value())

        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(a.get_value(borrow=True,
                                return_internal_type=True)[::-1, ::-1],
                    borrow=True)
        f(bval)
Example #32
def test_nvidia_driver1():
    """ Some nvidia driver give bad result for reduction
        This execute some reduction test to ensure it run correctly
    """
    a = numpy.random.rand(10000).astype("float32")
    A = cuda.shared_constructor(a)
    f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu,
                        profile=False)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 2
    if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
        msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
            'but got:']+[str(app) for app in topo])
        raise AssertionError(msg)
    if not numpy.allclose(f(), a.sum()):
        raise Exception("The nvidia driver version installed with this OS "
                        "does not give good results for reduction."
                        "Installing the nvidia driver available on the same "
                        "download page as the cuda package will fix the "
                        "problem: http://developer.nvidia.com/cuda-downloads")
Example #33
def test_elemwise0():

    a = tcn.shared_constructor(
        theano._asarray(numpy.random.rand(4, 4), dtype='float32'), 'a')

    b = tensor.fmatrix()

    f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)

    #check that we work inplace.
    assert f.maker.env.toposort()[1].op.destroy_map.items() == [(0, [0])]

    a0 = a.get_value() * 1.0
    print 'BEFORE ADD', a.get_value()
    for i, node in enumerate(f.maker.env.toposort()):
        print i, node
    f(numpy.ones((4, 4), dtype='float32'))
    print 'AFTER ADD', a.get_value()

    assert numpy.all(a0 + 1.0 == a.get_value())
Example #34
def test_elemwise3():
    """ Several kinds of elemwise expressions with dimension permutations and broadcasting"""

    shape = (3,4,5,6)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fvector()
    print b.type
    print tensor.constant(1).type
    print (1 + b).type
    print (1 + b**a).type
    print tensor.exp((1 + b**a)).type
    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
        b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stdout, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    #let debugmode catch errors
    f(theano._asarray(numpy.random.rand(6), dtype='float32'))
Example #35
def test_elemwise_collapse4():
    """ Test when only one inputs have two broadcastable dimension at
    each ends and we add a scalar"""

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 0, 1, 'x')
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    #let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v + 2)
Example #36
def test_elemwise_collapse6():
    """ Test when all inputs have two broadcastable dimension at the beginning"""

    shape = (4,5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x','x',0,1)
    b = tcn.CudaNdarrayType((True, True, False, False))()
    f = pfunc([b], [a3+b], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
    if False:
        for id,n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
    out=f(v)[0]
    assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v)
    print "Expected collapse to c contiguous"
Example #37
def test_elemwise_collapse():
    """ Test when all inputs have one(and the same) broadcastable dimension """

    shape = (4, 5, 60)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, True, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 1, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)

    #let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
Example #38
def test_elemwise1():
    """ Several kinds of elemwise expressions with no broadcasting,
    non power-of-two shape """

    shape = (3, 4)
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
                                               dtype='float32') + 0.5, 'a')
    b = tensor.fmatrix()

    #let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    #let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)

    #let debugmode catch any mistakes
    f = pfunc([b], [], updates=[(a, a + b * tensor.exp(b ** a))],
              mode=mode_with_gpu)
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
Example #39
    def cmp(a_shp, b_shp):
        a = tcn.shared_constructor(my_rand(*a_shp), 'a')

        b = tensor.fmatrix('b')
        c = tensor.fmatrix('c')

        f = pfunc([b, c], [],
                  updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
                  mode=mode_with_gpu)
        assert any([
            node.op == tcn.blas.gpu_gemm_inplace
            for node in f.maker.env.toposort()
        ])

        a0 = a.get_value() * 1.0
        bval = my_rand(*b_shp)
        cval = my_rand(a_shp[0], b_shp[1])
        f(bval, cval)

        assert numpy.allclose(
            numpy.dot(a0, bval) + numpy.exp(cval), a.get_value())
Example #40
def speed_elemwise_collapse():
    """ used to time if the collapse of ccontiguous dims are useful """

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, ::2, :, :]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, ::2, :, :]
    v = cuda_ndarray.CudaNdarray(v)
    t1 = time.time()
    for i in range(100):
        #let debugmode catch errors
        f(v)
    t2 = time.time()
Example #41
def test_gpualloc_output_to_gpu():
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(a_val)

    b = T.fscalar()
    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
    f_gpu = theano.function([b],
                            B.gpu_from_host(T.ones_like(a)) + b,
                            mode=mode_with_gpu)

    print f.maker.env.toposort()
    print f_gpu.maker.env.toposort()
    print f(2)
    print f_gpu(2)

    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 1

    assert numpy.allclose(
        numpy.ones(a.get_value(borrow=True).shape) + 9, f_gpu(9))
    assert numpy.allclose(f(5), f_gpu(5))
Example #42
def test_elemwise_collapse2():
    """ Test when only one inputs have one broadcastable dimension """

    shape = (4,5,9)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0,'x',1,2)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3+b
    f = pfunc([b], [c], mode=mode_with_gpu)


    v = theano._asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
    if False:
        for id,n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
    out=f(v)[0]
    assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
    print "Expected collapse to 3 dimensions"
Example #43
def speed_elemwise_collapse2():
    """ used to test the speed up of the generalised collapse of ccontiguous dims"""

    shape = (30,40,50,600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:,:,:,::2]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3+b * tensor.exp(1 + b**a3)
    f = pfunc([b], [c], mode=mode_with_gpu)


    v = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    v = v[:,:,:,::2]
    v=cuda_ndarray.CudaNdarray(v)
    for id,n in enumerate(f.maker.env.toposort()):
        print id, n
    t1=time.time()
    for i in range(100):
        #let debugmode catch errors
        f(v)
    t2=time.time()
Example #44
# Skip test if cuda is not available.
from theano.sandbox import cuda
if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv

# needed as the gpu conv doesn't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')

device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda.use("gpu",
             force=False,
             default_to_move_computation_to_gpu=False,
             move_shared_float32_to_gpu=False,
             enable_cuda=False,
             test_driver=True)
    device_id = theano.sandbox.cuda.use.device_number

cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)


def py_conv_valid_numpy(img, kern):
Example #45
def test_downsample():
    import random
    shps = [
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
    ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[2]: continue
            if ds[1] > shp[3]: continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns in the output tensor
            if float(shp[3]) / ds[1] > 512: continue
            for ignore_border in (True, False):
                print 'test_downsample', shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), 'a')
                f = pfunc([],
                          ds_op(tensor.as_tensor_variable(a)),
                          mode=mode_with_gpu)
                f2 = pfunc([],
                           ds_op(tensor.as_tensor_variable(a)),
                           mode=mode_without_gpu)
                assert any([
                    isinstance(node.op, tcn.blas.GpuDownsampleFactorMax)
                    for node in f.maker.env.toposort()
                ])
                assert any([
                    isinstance(node.op, DownsampleFactorMax)
                    for node in f2.maker.env.toposort()
                ])
                assert numpy.allclose(f(), f2())

                g = pfunc([],
                          tensor.grad(
                              ds_op(tensor.as_tensor_variable(a)).sum(), a),
                          mode=mode_with_gpu)
                g2 = pfunc([],
                           tensor.grad(
                               ds_op(tensor.as_tensor_variable(a)).sum(), a),
                           mode=mode_without_gpu)
                assert any([
                    isinstance(node.op, tcn.blas.GpuDownsampleFactorMaxGrad)
                    for node in g.maker.env.toposort()
                ])
                assert any([
                    isinstance(node.op, DownsampleFactorMaxGrad)
                    for node in g2.maker.env.toposort()
                ])
                assert numpy.allclose(g(), g2())
Example #46
def test_pool():
    try:
        if hasattr(mode_with_gpu, 'check_isfinite'):
            mode_with_gpu_check_is_finite_prev = mode_with_gpu.check_isfinite
        if hasattr(mode_without_gpu, 'check_isfinite'):
            mode_without_gpu_check_is_finite_prev = mode_without_gpu.check_isfinite
        mode_with_gpu.check_isfinite = False
        mode_without_gpu.check_isfinite = False
        #(batch, channel, x, y)
        shps = [
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 2, 2, 2),
            (1, 1, 4, 4),
            (3, 1, 4, 4),
            (1, 5, 4, 4),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (65536, 1, 10, 10),
            #(1, 65536, 10, 10),  # crashes as there are too many channels
            (30, 3, 40, 40),
        ]
        shps = [(channel, x, y, batch) for (batch, channel, x, y) in shps]

        #numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

        for shp in shps:
            for ds in range(1, min(4, shp[2] + 1)):
                #            for start in range(shp[2] + 1):
                for start in [0]:
                    for stride in range(1, min(shp[2], ds, 4) + 1):
                        print('test_pool shape=%s, ds=%d, stride=%d start=%d' %
                              (str(shp), ds, stride, start))

                        a = tcn.shared_constructor(my_rand(*shp), 'a')
                        op = MaxPool(ds=ds, stride=stride)
                        f = theano.function([], op(a), mode=mode_with_gpu)
                        assert any([
                            isinstance(node.op, MaxPool)
                            for node in f.maker.fgraph.toposort()
                        ])
                        out = numpy.asarray(f())

                        #Compute the gold version with a Theano graph.
                        gold_out = gold_max_pool_c01b(a, (ds, ds),
                                                      (stride, stride),
                                                      shp[1:3])
                        f2 = theano.function([],
                                             gold_out,
                                             mode=mode_without_gpu)
                        assert not any([
                            isinstance(node.op, MaxPool)
                            for node in f2.maker.fgraph.toposort()
                        ])
                        out2 = f2()
                        numpy.testing.assert_allclose(out,
                                                      out2,
                                                      err_msg=str(out - out2))

                        # grad testing
                        # The code supports grad only in this case.
                        if shp[0] % 16 != 0:
                            shp2 = list(shp)
                            shp2[0] *= 16
                            # This makes it crash due to not enough memory
                            # on a GPU with 1279M of ram.
                            if numpy.prod(shp2) >= (16 * 10 * 10 * 65536):
                                continue
                            a.set_value(my_rand(*shp2))

                        g = theano.function([],
                                            grad(op(a).sum(), a),
                                            mode=mode_with_gpu)
                        g2 = theano.function([],
                                             grad(gold_out.sum(), a),
                                             mode=mode_without_gpu)
                        assert any([
                            isinstance(node.op, MaxPoolGrad)
                            for node in g.maker.fgraph.toposort()
                        ])
                        assert not any([
                            isinstance(node.op, MaxPoolGrad)
                            for node in g2.maker.fgraph.toposort()
                        ])
                        numpy.testing.assert_allclose(g(),
                                                      g2(),
                                                      err_msg=str(shp))

                        # Don't call verify_grad. There was a problem with
                        # the test and we already assert that the 2 versions
                        # are equal.  Also, it would be slower to verify
                        # that way than with the comparison above.
                        continue
                        theano.tests.unittest_tools.verify_grad(
                            op, [a.get_value()])
    finally:
        if 'mode_with_gpu_check_is_finite_prev' in locals():
            mode_with_gpu.check_isfinite = mode_with_gpu_check_is_finite_prev
        if 'mode_without_gpu_check_is_finite_prev' in locals():
            mode_without_gpu.check_isfinite = mode_without_gpu_check_is_finite_prev
Example #47
def test_huge_elemwise_fusion():
    """ Test the the GpuElemwise fusion work correctly
        We check that we fuse one node with part of its input
        in case their is too many inputs and that would make it bust the 256
        bytes limits.
    """
    shape = (2, 3, 4, 5, 6)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False, ) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    #theano.printing.debugprint(f)
    #for i, node in enumerate(topo):
    #    print >> sys.stdout, i, node
    assert len(topo) == 10
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[7].op.scalar_op, theano.scalar.basic.Sub)
    assert isinstance(topo[8].op.scalar_op, theano.scalar.basic.Composite)
    #let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    # Test the case where we can't put the computation on the gpu! There are
    # too many dimensions in the input to have 2 inputs to the op!

    shape = (
        1,
        2,
        3,
        4,
        5,
        6,
        7,
        2,
        2,
        3,
        2,
        1,
        2,
        2,
        2,
    )
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False, ) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    #theano.printing.debugprint(f)
    assert len(topo) == 1
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
    #let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    def gen(shape):
        return theano._asarray(numpy.random.rand(*shape), dtype='float32')

    max_var = 16  # excluded
    for shape in [
        (2, ),
        (2, 2),
        (2, 2, 2),
        (2, 2, 2, 2),
        (2, 2, 2, 2, 2),  # 5d
        (2, 2, 2, 2, 2, 2),
            #                  (2, 2, 2, 2, 2, 2, 2),
            #                  (2, 2, 2, 2, 2, 2, 2, 2),
            #                  (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
    ]:
        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
        for use_tan in [True, False]:
            if use_tan:
                vars = [tensor.tanh(x) for x in vals]
            else:
                vars = vals
            for nb_var in range(1, max_var):
                out = reduce(lambda x, y: x + y, vars[:nb_var])
                if not isinstance(out.type, CudaNdarrayType):
                    out = cuda.gpu_from_host(out)
                f = pfunc([], [out], mode=mode_with_gpu)
                topo = f.maker.fgraph.toposort()
                #print shape, nb_var, use_tan, len(topo)
                assert (sum(
                    [isinstance(node.op, cuda.GpuElemwise)
                     for node in topo]) == len(topo)
                        or (nb_var == 1 and use_tan == False))
                assert sum([
                    isinstance(node.op, tensor.Elemwise) for node in topo
                ]) == 0

                #let debugmode catch errors
                f()
Example #48
def test_shared_cudandarray():
    '''Test that we can create a CudaNdarraySharedVariable from a CudaNdarray'''
    a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2,3)))
    assert isinstance(a.type, tcn.CudaNdarrayType)
Example #49
def test_gpujoin_no_rebroadcast():
    _a = numpy.asarray([[1,2],[3,4]],dtype='float32')
    a = tcn.shared_constructor(_a)
    f = theano.function([],T.join(1,a))
    l = f.maker.env.toposort()
    assert not any([isinstance(x.op,T.Rebroadcast) for x in l])
Example #50
def test_memory_lazy():
    """As test_memory, but with the ifelse op.

    We need to test it because, with the ifelse op, the [c]vm can leave some
    ops in the graph unexecuted. This messes with the [c]vm gc
    implementation.
    """
    shapes = (50, 100)
    # more_alloc1 is not the same for both dtypes.
    # When dtype is float32, the computation is done on the gpu.
    # This inserts constants on the gpu during compilation,
    # which raises the number of allocations.
    # When dtype is float64, only the shared variable is on the gpu and it is
    # transferred to the cpu for computation, so there is no extra allocation
    # after compilation.
    # more_alloc1 is the extra allocation count after the first compilation.
    for dtype, more_alloc1 in [("float32", 1),
                               ("float64", 0)]:
        print(dtype)
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)

        some_vector = tensor.vector('some_vector', dtype=dtype)
        some_matrix = some_vector.reshape(shapes)
        branch_select = tensor.iscalar()

        mem1 = freemem()
        print("Before shared variable", mem1)
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
        derp = ifelse.IfElse(1)(branch_select,
                                derp, some_matrix[:shapes[0]].sum())
        derp += 1
        print("Shared took ",
              np.prod(variables.get_value(
                  borrow=True,
                  return_internal_type=True).shape) *
              4 / 1024,
              "kB")

        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        obj = theano.function([some_vector, branch_select], derp,
                              mode=mode_with_gpu)
        # theano.printing.debugprint(obj, print_type=True)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3)

        for i in range(3):
            obj(test_params, 1)
            print("After function evaluation branch true", freemem())
            assert mem2_1 == freemem(), (mem2_1, freemem())
            obj(test_params, 0)
            print("After function evaluation branch false", freemem())
            assert mem2_1 == freemem(), (mem2_1, freemem())

        del obj
        print("After deleting function 1", freemem())
        assert mem2 == freemem(), (mem2, freemem())

        del derp, variables
        print("After deleting shared variable and ref to it", freemem())
        assert mem1 == freemem(), (mem1, freemem())
Example #51
 def test_shared(self):
     # NB: we also test higher order tensors at the same time.
     y = cuda.CudaNdarray.zeros((1, 2, 3, 4))
     x = cuda.shared_constructor(y)
     assert y.size == theano.function([], x.size)()
Example #52
def test_memory():
    """
    We test that we do not keep links to memory between Theano function calls
    and during Theano compilation.

    The origin of this code comes from Aaron Vandenoord and Sander Dieleman.
    I have their authorisation to put this in Theano under the Theano license.

    note::
        This test can fail if there are other processes running on the gpu.
    """
    shapes = (200, 100)
    # more_alloc1 was different for each dtype in the past.
    # more_alloc2 is still currently not the same for both dtypes.
    # When dtype is float32, the computation is done on the gpu.
    # This inserts constants on the gpu during compilation,
    # which raises the number of allocations.
    # When dtype is float64, only the shared variable is on the gpu and it is
    # transferred to the cpu for computation, so there is no extra allocation
    # after compilation.
    # more_alloc1 is the extra allocation count after the first compilation,
    # more_alloc2 after the second.
    for dtype, more_alloc1, more_alloc2 in [("float32", 0, 3),
                                            ("float64", 0, 0)]:
        print(dtype)
        test_params = np.asarray(np.random.randn(np.prod(shapes)), dtype)

        some_vector = tensor.vector('some_vector', dtype=dtype)
        some_matrix = some_vector.reshape(shapes)

        mem1 = freemem()
        print("Before shared variable", mem1)
        variables = cuda.shared_constructor(np.ones((shapes[1],),
                                                    dtype='float32'))
        derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
        print("Shared took ",
              np.prod(variables.get_value(
                  borrow=True,
                  return_internal_type=True).shape) *
              4 / 1024,
              "kB")

        mem2 = freemem()
        print("Before compilation", mem2)
        mem2_1 = freemem(extra_alloc=more_alloc1)
        mem2_2 = freemem(extra_alloc=more_alloc2)
        obj = theano.function([some_vector], derp, mode=mode_with_gpu)
        mem3 = freemem()
        print("After function compilation 1", mem3)
        assert mem2_1 == mem3, (mem2_1, mem3, dtype)

        grad_derp = tensor.grad(derp, some_vector)
        grad = theano.function([some_vector], grad_derp, mode=mode_with_gpu)
        mem4 = freemem()
        print("After function compilation 2", mem4)
        assert mem2_2 == mem4, (mem2_2, mem4, dtype)

        for i in range(3):
            obj(test_params)
            print("After function evaluation 1", freemem())
            assert mem2_2 == freemem(), (mem2_2, freemem())
            grad(test_params)
            print("After function evaluation 2", freemem())
            assert mem2_2 == freemem(), (mem2_2, freemem())

        del obj
        # print "After deleting function 1", freemem()
        # assert mem2 == freemem(), (mem2, freemem())

        del grad
        print("After deleting function 2", freemem())
        assert mem2 == freemem(), (mem2, freemem())

        del derp, variables, grad_derp
        print("After deleting shared variable and ref to it", freemem())
        assert mem1 == freemem(), (mem1, freemem())
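
The freemem() helper these memory tests rely on is defined elsewhere in the
test module. A minimal sketch of what it has to do, assuming the
outstanding_mallocs() allocation counter exposed by the old CUDA backend (the
module path and counter name are assumptions; adapt them to your build):

import gc
from theano.sandbox import cuda

def freemem(extra_alloc=0):
    # Force Python to release dead CudaNdarrays before counting.
    gc.collect()
    gc.collect()
    gc.collect()
    # Count outstanding device allocations; `extra_alloc` lets the caller
    # account for GPU constants that a later compilation is expected to add.
    n_mallocs = cuda.cuda_ndarray.cuda_ndarray.outstanding_mallocs()
    return "n malloc(%d)" % (n_mallocs + extra_alloc)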
Example #53
0
def test_downsample():
    shps = [(1, 12),
            (1, 1, 12),
            (1, 1, 1, 12),
            (1, 1, 2, 2),
            (1, 1, 1, 1),
            (1, 1, 4, 4),
            (1, 1, 10, 11),
            (1, 2, 2, 2),
            (3, 5, 4, 4),
            (25, 1, 7, 7),
            (1, 1, 12, 12),
            (1, 1, 2, 14),
            (1, 1, 12, 14),
            (1, 1, 14, 14),
            (1, 1, 16, 16),
            (1, 1, 18, 18),
            (1, 1, 24, 24),
            (1, 6, 24, 24),
            (10, 1, 24, 24),
            (10, 6, 24, 24),
            (30, 6, 12, 12),
            (30, 2, 24, 24),
            (30, 6, 24, 24),
            (10, 10, 10, 11),
            (1, 1, 10, 1025),
            (1, 1, 10, 1023),
            (1, 1, 1025, 10),
            (1, 1, 1023, 10),
            (65536, 1, 10, 10),
            (1, 65536, 10, 10), ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[-2]:
                continue
            if ds[1] > shp[-1]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns
            # in the output tensor.
            if float(shp[-1]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                # print 'test_downsample', shp, ds, ignore_border
                ds_op = Pool(ndim=len(ds), ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), 'a')
                f = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
                          mode=mode_with_gpu.excluding('cudnn'))
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a), ds),
                           mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMax)
                            for node in f.maker.fgraph.toposort()])
                assert any([isinstance(node.op, Pool)
                            for node in f2.maker.fgraph.toposort()])
                assert numpy.allclose(f(), f2())

                # The grad is too slow on the GT220 GPU.
                # This causes the computer to freeze...
                # Remove this when it gets optimized enough.
                # This only bypasses the last 2 checks.
                # Those tests were passing in all modes on a GTX470.
                if shp[0] > 30000 or shp[1] > 30000:
                    continue

                g = pfunc(
                    [],
                    tensor.grad(ds_op(tensor.as_tensor_variable(a), ds).sum(),
                                a),
                    mode=mode_with_gpu.excluding('cudnn'))
                g2 = pfunc(
                    [],
                    tensor.grad(ds_op(tensor.as_tensor_variable(a), ds).sum(),
                                a),
                    mode=mode_without_gpu)
                assert any([isinstance(node.op,
                                       tcn.blas.GpuDownsampleFactorMaxGrad)
                            for node in g.maker.fgraph.toposort()])
                assert any([isinstance(node.op, PoolGrad)
                            for node in g2.maker.fgraph.toposort()])
                assert numpy.allclose(g(), g2()), shp

                ggf = gradient.Lop(tensor.grad((ds_op(
                    tensor.as_tensor_variable(a), ds)**2).sum(), a), a, a)

                ref_mode = copy.copy(mode_without_gpu)
                ref_mode.check_py_code = False
                gpu_mode = copy.copy(mode_with_gpu)
                gpu_mode.check_py_code = False
                gg = pfunc([], ggf, mode=gpu_mode)
                gg2 = pfunc([], ggf, mode=ref_mode)

                assert any([isinstance(
                    node.op, tcn.blas.GpuDownsampleFactorMaxGradGrad)
                    for node in gg.maker.fgraph.toposort()])
                assert any([isinstance(
                    node.op, DownsampleFactorMaxGradGrad)
                    for node in gg2.maker.fgraph.toposort()])
                assert numpy.allclose(gg(), gg2()), shp
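
A minimal usage sketch of the pooling op driven above, assuming Pool is the
op from theano.tensor.signal.pool (the CPU reference path; on the GPU the
optimizer swaps in GpuDownsampleFactorMax, as the test asserts):

import numpy
import theano
from theano import tensor
from theano.tensor.signal.pool import Pool

x = tensor.ftensor4('x')
# 2x2 max pooling over the last two dimensions, dropping partial windows.
pool_op = Pool(ndim=2, ignore_border=True)
f = theano.function([x], pool_op(x, (2, 2)))

inp = numpy.arange(16, dtype='float32').reshape(1, 1, 4, 4)
out = f(inp)
assert out.shape == (1, 1, 2, 2)
assert out[0, 0, 0, 0] == 5.0   # max of the top-left 2x2 window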
Example #54
0
def test_huge_elemwise_fusion():
    """ Test the the GpuElemwise fusion work correctly
        We check that we fuse one node with part of its input
        in case their is too many inputs and that would make it bust the 256
        bytes limits.
    """
    shape = (2, 3, 4, 5, 6)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False, ) * len(shape))
    gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes(
    )['gpu_ptr_size']
    if gpu_ptr_size == 8:
        nb_in = 7
        len_topo = 10
    elif gpu_ptr_size == 4:
        nb_in = 8
        len_topo = 11
    else:
        raise Exception("Unexpected value for gpu_ptr_size", gpu_ptr_size)
    vars = [tensor.tanh(ttype) for x in range(nb_in)]
    f = pfunc(vars, [reduce(operator.sub, vars)], mode=mode_with_gpu)

    topo = f.maker.fgraph.toposort()
    assert len(topo) == len_topo
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 2
    assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
    assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(*[gen() for i in range(nb_in)])

    # Test the case where we can't put the computation on the gpu: there are
    # too many dimensions in the input for the op to take 2 inputs.

    shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2)
    ttype = tensor.tensor(dtype='float32',
                          broadcastable=(False, ) * len(shape))
    vars = [tensor.tanh(ttype) for x in range(7)]
    f = pfunc(
        vars,
        [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - vars[5] - vars[6]],
        mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
    # let debugmode catch errors
    gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(), gen(), gen(), gen(), gen(), gen(), gen())

    def gen(shape):
        return theano._asarray(numpy.random.rand(*shape), dtype='float32')

    max_var = 16  # excluded
    for shape in [
        (2, ),
        (2, 2),
        (2, 2, 2),
        (2, 2, 2, 2),
        (2, 2, 2, 2, 2),  # 5d
        (2, 2, 2, 2, 2, 2),
        # (2, 2, 2, 2, 2, 2, 2),
        # (2, 2, 2, 2, 2, 2, 2, 2),
        # (2, 2, 2, 1, 1, 1, 1, 2, 2),  # 9d
    ]:
        vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
        for use_tan in [True, False]:
            if use_tan:
                vars = [tensor.tanh(x) for x in vals]
            else:
                vars = vals
            for nb_var in range(1, max_var):
                out = reduce(lambda x, y: x + y, vars[:nb_var])
                if not isinstance(out.type, CudaNdarrayType):
                    out = cuda.gpu_from_host(out)
                f = pfunc([], [out], mode=mode_with_gpu)
                topo = f.maker.fgraph.toposort()
                # print shape, nb_var, use_tan, len(topo)
                assert (sum(
                    [isinstance(node.op, cuda.GpuElemwise)
                     for node in topo]) == len(topo)
                        or (nb_var == 1 and use_tan is False))
                assert sum([
                    isinstance(node.op, tensor.Elemwise) for node in topo
                ]) == 0

                # let debugmode catch errors
                f()
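
All of these fusion tests verify placement the same way: compile, then count
op types in the optimized graph's toposort. A minimal sketch of that pattern
(count_ops is our own helper name; mode_with_gpu is rebuilt here the way the
test module is assumed to define it):

import theano
from theano import tensor
from theano.sandbox import cuda

mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')

def count_ops(fn, op_class):
    # Count optimized-graph nodes whose op is an instance of op_class.
    return sum(isinstance(node.op, op_class)
               for node in fn.maker.fgraph.toposort())

x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
f = theano.function([x, y], tensor.tanh(x) + tensor.tanh(y),
                    mode=mode_with_gpu)
# With the fusion optimizer, the whole expression should end up in a single
# GpuElemwise Composite node, with no CPU Elemwise left over.
assert count_ops(f, cuda.GpuElemwise) == 1
assert count_ops(f, tensor.Elemwise) == 0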
Example #55
0
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the ops are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s),
         identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(c_v)
    f = theano.function([a, b],
                        outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
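
A minimal standalone sketch of the hand-built multi-output Composite used
above, lifted to an elemwise op on the CPU (the GPU optimizer then only has
to replace the Elemwise with a GpuElemwise):

import numpy
import theano
from theano import scalar, tensor

a_s = scalar.float32()
b_s = scalar.float32()
# One scalar graph with two outputs, evaluated in a single elemwise pass.
comp = scalar.Composite([a_s, b_s], [a_s + b_s, a_s * b_s])
elemwise_op = tensor.Elemwise(comp)

x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
f = theano.function([x, y], elemwise_op(x, y))

x_v = numpy.ones((2, 3), dtype='float32')
y_v = numpy.full((2, 3), 2, dtype='float32')
s, p = f(x_v, y_v)
assert numpy.allclose(s, x_v + y_v)
assert numpy.allclose(p, x_v * y_v)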
Example #56
0
    def shared(val):
        try:
            return tcn.shared_constructor(val)
        except TypeError:
            return theano.shared(val)
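
This fallback lets the same test body run whether or not a value can live on
the GPU: float32 arrays become CudaNdarray shared variables, while values the
CUDA constructor rejects (such as float64 data) raise TypeError and fall back
to an ordinary host-side shared variable. A standalone sketch of the same
pattern (shared_or_host is a hypothetical name; tcn is the usual alias for
theano.sandbox.cuda):

import numpy
import theano
import theano.sandbox.cuda as tcn

def shared_or_host(val):
    # Hypothetical helper mirroring the fallback above: try the GPU shared
    # constructor first, fall back to a regular host shared variable.
    try:
        return tcn.shared_constructor(val)
    except TypeError:
        return theano.shared(val)

gpu_var = shared_or_host(numpy.ones((2, 2), dtype='float32'))  # on the GPU
cpu_var = shared_or_host(numpy.ones((2, 2), dtype='float64'))  # on the host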