def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda_tensor4()
    k = cuda_tensor4()

    if direction == 'fprop':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
    elif direction == 'bprop img':
        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    elif direction == 'bprop kern':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = numpy.array(f(
                npy_img.transpose(1, 0, 2, 3),
                npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(
            1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)
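GpuCorrMM implements cross-correlation rather than convolution, which is why the 'fprop' branch above flips the kernel along both spatial axes before calling the compiled function. A minimal NumPy/SciPy sketch of the identity being relied on (scipy.signal is an assumption of this sketch, not something the test itself uses):

import numpy as np
from scipy import signal

img = np.random.rand(5, 5).astype('float32')
kern = np.random.rand(3, 3).astype('float32')

# A 'valid' convolution equals a 'valid' correlation with a kernel
# flipped along both spatial axes, hence npy_kern[:, :, ::-1, ::-1] above.
conv = signal.convolve2d(img, kern, mode='valid')
corr_flipped = signal.correlate2d(img, kern[::-1, ::-1], mode='valid')
assert np.allclose(conv, corr_flipped, rtol=1e-4)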
Example #2
    def test_compare_1D_and_2D_upsampling_values(self):
        """Compare 1D and 2D upsampling

        This method verifies that the bilinear upsampling done using
        1D and 2D kernels generates the same result.

        """
        # checking upsampling with ratio 5
        input_x = np.random.rand(5, 4, 6, 7).astype(theano.config.floatX)
        mat_1D = bilinear_upsampling(input=input_x, ratio=5,
                                     batch_size=5, num_input_channels=4,
                                     use_1D_kernel=True)
        mat_2D = bilinear_upsampling(input=input_x, ratio=5,
                                     batch_size=5, num_input_channels=4,
                                     use_1D_kernel=False)
        f_1D = theano.function([], mat_1D, mode=self.compile_mode)
        f_2D = theano.function([], mat_2D, mode=self.compile_mode)
        utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)

        # checking upsampling with ratio 8
        input_x = np.random.rand(12, 11, 10, 7).astype(theano.config.floatX)
        mat_1D = bilinear_upsampling(input=input_x, ratio=8,
                                     batch_size=12, num_input_channels=11,
                                     use_1D_kernel=True)
        mat_2D = bilinear_upsampling(input=input_x, ratio=8,
                                     batch_size=12, num_input_channels=11,
                                     use_1D_kernel=False)
        f_1D = theano.function([], mat_1D, mode=self.compile_mode)
        f_2D = theano.function([], mat_2D, mode=self.compile_mode)
        utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)
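The two paths can agree exactly because a bilinear kernel is separable: the 2D kernel is the outer product of a 1D kernel with itself, so one 2D pass equals two 1D passes. A SciPy sketch of that property, assuming the ratio-2 triangle kernel [1, 2, 1] / 2 (the kernel choice is illustrative):

import numpy as np
from scipy import signal

k1d = np.array([1.0, 2.0, 1.0]) / 2.0  # assumed ratio-2 bilinear kernel
k2d = np.outer(k1d, k1d)               # separable: 2D kernel = outer product

x = np.random.rand(8, 8)
# Filtering once with the 2D kernel matches filtering rows then columns
# with the 1D kernel.
out_2d = signal.convolve2d(x, k2d, mode='same')
out_1d = signal.convolve2d(
    signal.convolve2d(x, k1d[:, None], mode='same'), k1d[None, :], mode='same')
assert np.allclose(out_2d, out_1d)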
Example #3
    def run_conv_valid(self, inputs_shape, filters_shape, pad=False):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Flip the filters, as conv3D computes a correlation
        filters_flip = filters[:, ::-1, ::-1, ::-1, :]
        # filters_flip = filters
        conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters_flip,
                                             b=bias, d=(1, 1, 1))

        conv_fft = theano.sandbox.cuda.fftconv.conv3d_fft(
            inputs.dimshuffle(0, 4, 1, 2, 3),
            filters.dimshuffle(0, 4, 1, 2, 3),
            border_mode="valid",
            pad_last_dim=pad)
        conv_fft = conv_fft.dimshuffle(0, 2, 3, 4, 1)

        f_ref = theano.function([], conv_ref, mode="FAST_RUN")
        mode = mode_with_gpu
        mode.check_py_code = False
        f_fft = theano.function([], conv_fft, mode=mode)

        res_ref = f_ref()
        res_fft = f_fft()
        utt.assert_allclose(res_ref, res_fft, rtol=1e-05, atol=1e-05)
Example #4
    def test_cast_float16(self):
        f16 = theano.tensor.vector(dtype='float16')
        f32 = theano.tensor.fvector()
        i8 = theano.tensor.bvector()
        f = theano.function([f16, f32, i8],
                            [f16.astype('float32'),
                             f32.astype('float16'),
                             f32.astype('float64'),
                             f16.astype('int8'),
                             f32.astype('int8'),
                             i8.astype('float16'),
                             i8.astype('float32')],
                            mode=mode_with_gpu)

        d1 = (np.random.rand(4) * 10).astype('float16')
        d2 = (np.random.rand(5) * 10).astype('float32')
        d3 = (np.random.rand(6) * 10).astype('int8')
        res = f(d1, d2, d3)

        for i, out in enumerate(f.outputs):
            dtype = out.variable.dtype
            assert res[i].dtype == dtype
            inp = out.variable.owner.inputs[0]
            if inp.dtype == 'float16':
                d = d1
            elif inp.dtype == 'float32':
                d = d2
            else:
                d = d3
            assert_allclose(d.astype(dtype), res[i])
Example #5
    def run_gradinput(self, inputs_shape, filters_shape,
                      subsample=(1, 1, 1)):

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
        conv = theano.tensor.nnet.convTransp3D(W=filters,
                                               b=bias,
                                               d=subsample,
                                               H=inputs)
        f_ref = theano.function([], conv)
        res_ref = f_ref()

        # Get bottom shape using convTransp3D
        bottom_shape = res_ref.shape
        bottom_val = numpy.random.random(bottom_shape).astype('float32')
        bottom = shared(bottom_val)

        weight = gpu_contiguous(filters.dimshuffle(0, 4, 1, 2, 3))
        top = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        if (subsample == (1, 1, 1)):
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=weight, topgrad=top)
        else:
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=weight, topgrad=top,
                shape=bottom.shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res = f()
        utt.assert_allclose(res_ref, res)
Example #6
def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
    dtypes = ["uint8", "uint16", "uint32", "uint64",
              "int8", "int16", "int32", "int64",
              "float16", "float32", "float64"]

    for dtype_base in dtypes:
        for dtype_exp in dtypes:

            # Compile a gpu function with the specified dtypes
            base_val = np.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = np.random.randint(0, 3, size=10).astype(dtype_exp)

            base = theano.tensor.vector(dtype=dtype_base)
            exp = gpuarray_shared_constructor(exp_val)
            assert exp.dtype == dtype_exp
            output = base ** exp
            f = theano.function([base], output, mode=mode_with_gpu)
            theano.printing.debugprint(f)
            # We don't transfer to the GPU when the output dtype is int*
            n = len([n for n in f.maker.fgraph.apply_nodes
                     if isinstance(n.op, GpuElemwise)])
            assert n == (output.dtype in tensor.float_dtypes)

            # Call the function to make sure the output is valid
            out = f(base_val)
            expected_out = base_val ** exp_val
            assert_allclose(out, expected_out)
Example #7
def test_hgemm_swap():
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        raise SkipTest("SgemmEx is only avaialble on cuda 7.5+")

    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')

    # test that we don't try to replace anything but matrix x matrix in float16
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0

    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0

    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 1

    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')

    of = f(v1, v2)
    on = numpy.dot(v1, v2)

    utt.assert_allclose(of, on)
Example #8
    def test_opt_conv3d_gemm(self):
        inputs_shape = (16, 20, 32, 16, 1)
        filters_shape = (10, 6, 12, 4, 1)

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        conv = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                         b=bias, d=(1, 1, 1))
        mode = mode_with_gpu.including('conv3d_gemm')
        mode.check_py_code = False

        f_ref = theano.function([], conv, mode="FAST_RUN")
        f_gemm = theano.function([], conv, mode=mode)

        # make sure we inserted the gemm trickery
        topo = f_gemm.maker.fgraph.toposort()
        assert sum(isinstance(n.op, GpuCorr3dMM) for n in topo) > 0

        res_ref = f_ref()
        res_gemm = f_gemm()
        utt.assert_allclose(res_ref, res_gemm)
Example #9
    def test_DownsampleFactorMax(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        # generate random images
        maxpoolshps = ((1, 1), (2, 2), (3, 3), (2, 3))
        imval = rng.rand(4, 2, 16, 16)
        images = tensor.dtensor4()
        for maxpoolshp, ignore_border, mode in product(maxpoolshps,
                                                       [True, False],
                                                       ['max',
                                                        'sum',
                                                        'average_inc_pad',
                                                        'average_exc_pad']):
            # print 'maxpoolshp =', maxpoolshp
            # print 'ignore_border =', ignore_border

            # Pure Numpy computation
            numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
                                                      ignore_border,
                                                      mode=mode)
            output = max_pool_2d(images, maxpoolshp, ignore_border,
                                 mode=mode)
            f = function([images], [output])
            output_val = f(imval)
            utt.assert_allclose(output_val, numpy_output_val)

            # DownsampleFactorMax op
            maxpool_op = DownsampleFactorMax(maxpoolshp,
                                             ignore_border=ignore_border,
                                             mode=mode)(images)
            f = function([images], maxpool_op)
            output_val = f(imval)
            utt.assert_allclose(output_val, numpy_output_val)
Example #10
def test_dnn_conv_merge():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    b = 1
    c = 4
    f = 3
    ih = 5
    iw = 8
    kh = 2
    kw = 6
    img_val = numpy.random.random((b, c, ih, iw)).astype("float32")
    kern_val = numpy.random.random((f, c, kh, kw)).astype("float32")
    out_val = numpy.random.random((b, f, ih - kh + 1, iw - kw + 1)).astype("float32")

    conv = dnn.dnn_conv(img, kern)
    gw = theano.grad(conv.sum(), kern)
    gi = theano.grad(conv.sum(), img)

    lr = numpy.asarray(0.05, dtype="float32")

    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1
        fr = conv + out
        wr = kern + gw
        ir = img + gi
    else:
        fr = lr * (conv + out)
        wr = kern + lr * gw
        ir = img + lr * gi

    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op, dnn.GpuDnnConv)
    assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op, dnn.GpuDnnConvGradW)
    assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op, dnn.GpuDnnConvGradI)

    mode = mode_with_gpu
    mode = mode.excluding("local_dnn_conv_alpha_merge")
    mode = mode.excluding("local_dnn_convw_alpha_merge")
    mode = mode.excluding("local_dnn_convi_alpha_merge")
    mode = mode.excluding("local_dnn_conv_output_merge")
    mode = mode.excluding("local_dnn_convw_output_merge")
    mode = mode.excluding("local_dnn_convi_output_merge")

    f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)

    assert not isinstance(f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op, dnn.GpuDnnConv)
    assert not isinstance(f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op, dnn.GpuDnnConvGradW)
    assert not isinstance(f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op, dnn.GpuDnnConvGradI)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)

    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
Example #11
    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                       subsample=(1, 1, 1)):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
        inputs = shared(inputs_val)
        dCdH = shared(dCdH_val)

        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                             WShape=filters_shape,
                                             d=subsample)
        img = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
        topgrad = gpu_contiguous(dCdH.dimshuffle(0, 4, 1, 2, 3))
        if (subsample == (1, 1, 1)):
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(img,
                                                                     topgrad)
        else:
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                img, topgrad, shape=filters_shape[1:4])
        conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
        f_ref = theano.function([], conv)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
Example #12
    def test_opt_convtransp3d_fft(self):
        inputs_shape = (2, 9, 16, 12, 10)
        filters_shape = (10, 3, 8, 4, 1)

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))

        inputs = shared(inputs_val)
        filters = shared(filters_val)

        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=(1, 1, 1),
                                               H=inputs)
        mode = mode_with_gpu.including('convtransp3d_fft')

        f_ref = theano.function([], conv)
        f_fft = theano.function([], conv, mode=mode)

        # make sure we inserted the fft trickery
        topo = f_fft.maker.fgraph.toposort()
        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
                   for n in topo) == 2

        res_ref = f_ref()
        res_fft = f_fft()

        utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)
Example #13
    def test_concatenate(self):
        def ref(*inputs):
            axis = inputs[0]
            tensors = inputs[1:]

            return numpy.concatenate(tensors, axis)

        seed = utt.fetch_seed()
        rng = numpy.random.RandomState(seed)

        imgsize_list = ((5, 5), (6, 6), (6, 6), (8, 8))
        n, c = 4, 2

        axis = 1

        image = T.dtensor4('image')
        image1 = T.dtensor4('image1')
        for imgsize in imgsize_list:
            imval = rng.rand(n, c, imgsize[0], imgsize[1])

            output_ref = ref(axis, imval, imval)

            Opout = self.mkl_concatenate_func(axis, image, image1)
            f = function([image, image1], [Opout, ])
            output_mkl = f(imval, imval)

            utt.assert_allclose(output_mkl, output_ref)
Example #14
    def test_relu_grad(self):
        seed = utt.fetch_seed()
        rng = numpy.random.RandomState(seed)

        imgsize_list = ((5, 5), (6, 6), (6, 6), (8, 8))
        n, c = 4, 2

        axis = 1

        image = T.dtensor4('image')
        image1 = T.dtensor4('image1')
        for imgsize in imgsize_list:
            imval = rng.rand(n, c, imgsize[0], imgsize[1])

            out = T.concatenate([image, image1], axis)
            sum_ref = T.sum(out)
            gx_ref = T.grad(sum_ref, [image, image1])
            f_ref = theano.function([image, image1], outputs=gx_ref, mode=mode_without_mkl)
            output_ref = f_ref(imval, imval)

            out_mkl = self.mkl_concatenate_func(axis, image, image1)
            sum_mkl = T.sum(out_mkl)
            gx_mkl = T.grad(sum_mkl, [image, image1])
            f_mkl = theano.function([image, image1], outputs=gx_mkl)
            output_mkl = f_mkl(imval, imval)

            utt.assert_allclose(output_mkl, output_ref)
Example #15
def test_small_star():
    from batman import _rsky
    m_star = 0.151
    r_star = 0.189
    period = 0.4626413
    t0 = 0.2
    b = 0.5
    ecc = 0.1
    omega = 0.1
    t = np.linspace(0, period, 500)

    orbit = KeplerianOrbit(
        r_star=r_star, m_star=m_star,
        period=period, t0=t0, b=b,
        ecc=ecc, omega=omega)
    a = orbit.a.eval()
    incl = orbit.incl.eval()

    r_batman = _rsky._rsky(t, t0, period, a, incl, ecc, omega, 1, 1)
    m = r_batman < 100.0
    assert m.sum() > 0

    func = theano.function([], orbit.get_relative_position(t))
    x, y, z = func()
    r = np.sqrt(x**2 + y**2)

    # Make sure that the in-transit impact parameter matches batman
    utt.assert_allclose(r_batman[m], r[m], atol=2e-5)
Example #16
    def test_sparseblockgemvF(self):
        """
            Test the Fortran order for W (which can happen in the grad for some
            graphs).
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0),
                         tensor.DimShuffle((False, False, False, False),
                                           (0, 1, 3, 2))
                         (tensor.as_tensor_variable(W)),
                         h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val,
                   oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)
Example #17
        def cmp(n, m, f, f_gpu):
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
            gdata = numpy.asarray(data)[:, :, None, None]

            out = f(data)
            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
            utt.assert_allclose(out, gout)
Example #18
    def cmp(n, m):
        data = numpy.random.uniform(1e-7, 1, (n, m)).astype(dtype=dtypeInput)
        b_data = numpy.random.uniform(1e-7, 1, (m,)).astype(dtype=dtypeBias)

        out = f(data, b_data)
        gout = f_gpu(data, b_data)
        utt.assert_allclose(out, gout)
Example #19
    def with_linker(self, linker, op, type, rand_val):
        for xsh, ysh in [((3, 5), (3, 5)),
                         ((3, 5), (1, 5)),
                         ((3, 5), (3, 1)),
                         ((1, 5), (5, 1)),
                         ((1, 1), (1, 1)),
                         ((self.openmp_minsize,), (self.openmp_minsize,)),
                         ((self.openmp_minsize_sqrt,
                           self.openmp_minsize_sqrt),
                          (self.openmp_minsize_sqrt,
                           self.openmp_minsize_sqrt)),
                         ((2, 3, 4, 5), (2, 3, 4, 5)),
                         ((2, 3, 4, 5), (1, 3, 1, 5)),
                         ((2, 3, 4, 5), (1, 1, 1, 1)),
                         ((), ())]:
            x = type('float64', [(entry == 1) for entry in xsh])('x')
            y = type('float64', [(entry == 1) for entry in ysh])('y')
            e = op(scalar.add)(x, y)
            f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
            xv = rand_val(xsh)
            yv = rand_val(ysh)
            zv = xv + yv

            unittest_tools.assert_allclose(f(xv, yv), zv)

            # test Elemwise.infer_shape
            # the Shape op doesn't implement c_code!
            if isinstance(linker, gof.PerformLinker):
                x = type('float64', [(entry == 1) for entry in xsh])('x')
                y = type('float64', [(entry == 1) for entry in ysh])('y')
                e = op(scalar.add)(x, y)
                f = copy(linker).accept(FunctionGraph(
                    [x, y], [e.shape])).make_function()
                assert tuple(f(xv, yv)) == tuple(zv.shape)
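The shape pairs above exercise Elemwise broadcasting, where a length-1 dimension is stretched to match the other operand; NumPy follows the same rule, which is what the `zv = xv + yv` reference relies on. A compact NumPy restatement of a few of the cases:

import numpy as np

for xsh, ysh in [((3, 5), (1, 5)), ((3, 5), (3, 1)), ((2, 3, 4, 5), (1, 3, 1, 5))]:
    x = np.random.rand(*xsh)
    y = np.random.rand(*ysh)
    z = x + y
    # Each length-1 dimension broadcasts against the matching dimension
    # of the other operand.
    assert z.shape == tuple(max(a, b) for a, b in zip(xsh, ysh))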
Example #20
    def test_opt_convgrad3d_fft(self):
        inputs_shape = (2, 17, 15, 16, 1)
        filters_shape = (10, 6, 7, 4, 1)
        dCdH_shape = (inputs_shape[0],
                      inputs_shape[1] - filters_shape[1] + 1,
                      inputs_shape[2] - filters_shape[2] + 1,
                      inputs_shape[3] - filters_shape[3] + 1,
                      filters_shape[0])

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

        inputs = shared(inputs_val)
        dCdH = shared(dCdH_val)

        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                             WShape=filters_shape,
                                             d=(1, 1, 1))
        mode = mode_with_gpu.including('convgrad3d_fft')
        mode.check_py_code = False

        f_ref = theano.function([], conv, mode="FAST_RUN")
        f_fft = theano.function([], conv, mode=mode)

        # make sure we inserted the fft trickery
        topo = f_fft.maker.fgraph.toposort()
        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
                   for n in topo) == 2

        res_ref = f_ref()
        res_fft = f_fft()

        utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)
Example #21
    def test3(self):
        a = tensor.dvector()
        w2 = sort(a)
        f = theano.function([a], w2)
        gv = f(self.v_val)
        gt = np.sort(self.v_val)
        utt.assert_allclose(gv, gt)
Example #22
    def test_opt_full(self):
        inputs_shape = (5, 3, 7, 6)
        filters_shape = (2, 3, 3, 3)

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)

        conv = theano.tensor.nnet.conv.conv2d(inputs, filters,
                                              border_mode='full')

        mode = mode_with_gpu.including('conv_fft_full')

        f_ref = theano.function([], conv)
        f_fft = theano.function([], conv, mode=mode)

        # make sure we inserted the fft trickery
        topo = f_fft.maker.fgraph.toposort()
        assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
                   for n in topo) == 2, topo

        res_ref = f_ref()
        res_fft = f_fft()

        utt.assert_allclose(res_ref, res_fft)
Example #23
    def test_None(self):
        a = tensor.dmatrix()
        l = sort(a, None)
        f = theano.function([a], l)
        gv = f(self.m_val)
        gt = np.sort(self.m_val, None)
        utt.assert_allclose(gv, gt)
Example #24
    def test_opt_convgrad3d_gemm(self):
        inputs_shape = (16, 10, 12, 16, 1)
        filters_shape = (10, 6, 12, 4, 1)
        dCdH_shape = (16, 5, 1, 13, 10)

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

        inputs = shared(inputs_val)
        dCdH = shared(dCdH_val)

        conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                             WShape=filters_shape,
                                             d=(1, 1, 1))
        mode = mode_with_gpu.including('convgrad3d_gemm')

        f_ref = theano.function([], conv)
        f_gemm = theano.function([], conv, mode=mode)

        # make sure we inserted the gemm trickery
        topo = f_gemm.maker.fgraph.toposort()
        assert sum(isinstance(n.op, GpuCorr3dMM_gradWeights) for n in topo) > 0

        res_ref = f_ref()
        res_gemm = f_gemm()
        utt.assert_allclose(res_ref, res_gemm)
Example #25
    def test_1Drfft(self):
        inputs_val = np.random.random((1, N)).astype(theano.config.floatX)

        x = T.matrix('x')
        rfft = fft.rfft(x)
        f_rfft = theano.function([x], rfft)
        res_rfft = f_rfft(inputs_val)
        res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
                         1j * np.asarray(res_rfft[:, :, 1]))

        rfft_ref = np.fft.rfft(inputs_val, axis=1)

        utt.assert_allclose(rfft_ref, res_rfft_comp)

        m = rfft.type()
        print(m.ndim)
        irfft = fft.irfft(m)
        f_irfft = theano.function([m], irfft)
        res_irfft = f_irfft(res_rfft)

        utt.assert_allclose(inputs_val, np.asarray(res_irfft))

        # The numerical gradient of the FFT is sensitive, must set large
        # enough epsilon to get good accuracy.
        eps = 1e-1

        def f_rfft(inp):
            return fft.rfft(inp)
        inputs_val = np.random.random((1, N)).astype(theano.config.floatX)
        utt.verify_grad(f_rfft, [inputs_val], eps=eps)

        def f_irfft(inp):
            return fft.irfft(inp)
        inputs_val = np.random.random((1, N // 2 + 1, 2)).astype(theano.config.floatX)
        utt.verify_grad(f_irfft, [inputs_val], eps=eps)
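The complex reconstruction above (`res_rfft[:, :, 0] + 1j * res_rfft[:, :, 1]`) reflects how `fft.rfft` packs its output: a real tensor whose last axis of length 2 carries the real and imaginary parts. A NumPy-only sketch of that layout (the shapes are illustrative):

import numpy as np

x = np.random.random((1, 16)).astype('float32')
c = np.fft.rfft(x, axis=1)                    # complex, shape (1, 16 // 2 + 1)
packed = np.stack([c.real, c.imag], axis=-1)  # real, shape (1, 9, 2)
recovered = packed[..., 0] + 1j * packed[..., 1]
assert np.allclose(recovered, c)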
Example #26
    def test_DownsampleFactorMaxPaddingStride(self):
        ignore_border = True  # padding does not support ignore_border=False
        rng = numpy.random.RandomState(utt.fetch_seed())
        maxpoolsizes = [(3, 3), (4, 4), (3, 4), (4, 3), (2, 2)]
        stridesizes = [(2, 2), (2, 2), (1, 1), (1, 2), (2, 2)]
        paddingsizes = [(2, 2), (1, 2), (2, 1), (0, 0), (1, 1)]
        imgsizes = [(5, 5), (5, 5), (5, 6), (6, 5), (5, 5)]
        m = 4  # minibatch
        c = 2  # channel size
        images = tensor.dtensor4()
        for indx, mode in product(
            numpy.arange(len(maxpoolsizes)), ["max", "sum", "average_inc_pad", "average_exc_pad"]
        ):
            imgsize = imgsizes[indx]
            imval = rng.rand(m, c, imgsize[0], imgsize[1]) - 0.5

            stridesize = stridesizes[indx]
            maxpoolsize = maxpoolsizes[indx]
            paddingsize = paddingsizes[indx]
            numpy_output_val = self.numpy_max_pool_2d_stride_padding(
                imval, maxpoolsize, ignore_border, stridesize, paddingsize, mode
            )
            maxpool_op = DownsampleFactorMax(
                maxpoolsize, ignore_border=ignore_border, st=stridesize, padding=paddingsize, mode=mode
            )(images)
            f = function([images], maxpool_op)
            output_val = f(imval)
            utt.assert_allclose(output_val, numpy_output_val)
Example #27
    def test_irfft(self):
        inputs_val = np.random.random((1, N, N)).astype(theano.config.floatX)
        inputs = theano.shared(inputs_val)

        rfft = fft.rfft(inputs)
        f_rfft = theano.function([], rfft)
        res_fft = f_rfft()

        m = rfft.type()
        irfft = fft.irfft(m)
        f_irfft = theano.function([m], irfft)
        res_irfft = f_irfft(res_fft)

        utt.assert_allclose(inputs_val, np.asarray(res_irfft))

        inputs_val = np.random.random((1, N, N, 2)).astype(theano.config.floatX)
        inputs = theano.shared(inputs_val)

        irfft = fft.irfft(inputs)
        f_irfft = theano.function([], irfft)
        res_irfft = f_irfft()
        inputs_ref = inputs_val[..., 0] + inputs_val[..., 1] * 1j

        irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))

        utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
Example #28
            def cmp(a_shp, b_shp):

                a = numpy.random.randn(* a_shp).astype(numpy.float32)
                b = numpy.random.randn(* b_shp).astype(numpy.float32)

                x = tensor.ftensor3()
                y = tensor.ftensor3()

                f = theano.function([x, y],
                                    batched_dot(x, y),
                                    mode=mode_with_gpu)

                z0 = numpy.asarray(f(a, b))

                ga = cuda_ndarray.CudaNdarray(a)
                gb = cuda_ndarray.CudaNdarray(b)

                z1 = numpy.asarray(f(ga, gb))

                z_test = numpy.sum(
                    a[:, :, :, None] * b[:, None, :, :], axis=-2)

                unittest_tools.assert_allclose(z0, z_test)
                unittest_tools.assert_allclose(z1, z_test)
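The reference value above spells out a batched matrix product by broadcasting the two operands and summing over the shared axis; `np.matmul` computes the same thing directly. A quick sketch of the equivalence (the shapes are illustrative):

import numpy as np

a = np.random.randn(4, 3, 5).astype(np.float32)   # (batch, i, j)
b = np.random.randn(4, 5, 2).astype(np.float32)   # (batch, j, k)

# Broadcast-and-sum over the shared axis j, as in the test above.
z_ref = np.sum(a[:, :, :, None] * b[:, None, :, :], axis=-2)  # (batch, i, k)
assert np.allclose(z_ref, np.matmul(a, b), rtol=1e-5)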
Example #29
    def test_opt_convtransp3d_gemm(self):
        inputs_shape = (16, 15, 12, 12, 10)
        filters_shape = (10, 6, 12, 4, 1)

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))

        inputs = shared(inputs_val)
        filters = shared(filters_val)

        conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=(1, 1, 1),
                                               H=inputs)
        mode = mode_with_gpu.including('convtransp3d_gemm')

        f_ref = theano.function([], conv)
        f_gemm = theano.function([], conv, mode=mode)

        # make sure we inserted the gemm trickery
        topo = f_gemm.maker.fgraph.toposort()
        assert sum(isinstance(n.op, GpuCorr3dMM_gradInputs) for n in topo) > 0

        res_ref = f_ref()
        res_gemm = f_gemm()
        utt.assert_allclose(res_ref, res_gemm)
Example #30
    def run_conv_full(self, inputs_shape, filters_shape, pad=False):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))

        conv_ref = theano.tensor.nnet.convTransp3D(
            W=filters, b=bias, d=(1, 1, 1),
            H=inputs)

        filters = filters.dimshuffle(4, 0, 1, 2, 3)
        inputs = inputs.dimshuffle(0, 4, 1, 2, 3)
        conv_fft = theano.sandbox.cuda.fftconv.conv3d_fft(inputs, filters,
                                                          border_mode="full",
                                                          pad_last_dim=pad)
        conv_fft = conv_fft.dimshuffle(0, 2, 3, 4, 1)

        f_ref = theano.function([], conv_ref)
        f_fft = theano.function([], conv_fft, mode=mode_with_gpu)

        res_ref = f_ref()
        res_fft = f_fft()
        utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)
Example #31
    def test_GpuCumsum2D(self):
        block_max_size = self.max_threads_dim0 * 2

        x = T.fmatrix('x')
        for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
            f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
            assert [
                n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)
            ]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.random(a_shape).astype("float32")
            slices = [slice(None), slice(None)]
            for i in xrange(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np.cumsum(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)

            # Use multiple GPU threadblocks
            a_shape = [5, 5]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

            # Use multiple GPU gridblocks
            a_shape = [4, 4]
            a_shape[1 - shape_axis] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)

            # Use recursive cumsum
            a_shape = [3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            a = np.sign(a - 0.5).astype(
                "float32")  # Avoid floating point error
            utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
Example #32
def test_multinomial_input_dtype():
    # This tests the MultinomialFromUniform Op directly, not going through the
    # multinomial() call in GPU random generation.

    for idtype in ['float32', 'float16', 'float64']:
        for odtype in ['float32', 'float16', 'float64', 'int32']:

            p = tensor.matrix('p', idtype)
            u = tensor.vector('u', idtype)
            # p = tensor.dmatrix('p')
            # u = tensor.dvector('u')
            m = theano.sandbox.multinomial.MultinomialFromUniform(odtype)(p, u)

            # the m*2 allows the multinomial to reuse output
            f = function([p, u],
                         m * 2,
                         allow_input_downcast=True,
                         mode=mode_with_gpu)

            assert any([
                type(node.op) is GPUAMultinomialFromUniform
                for node in f.maker.fgraph.toposort()
            ])

            # test that both first and second samples can be drawn
            utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
                                [[2, 0], [0, 2]])

            # test that both second labels can be drawn
            r = f([[.2, .8], [.3, .7]], [.31, .31])
            utt.assert_allclose(r, [[0, 2], [0, 2]])

            # test that both first labels can be drawn
            r = f([[.2, .8], [.3, .7]], [.21, .21])
            utt.assert_allclose(r, [[0, 2], [2, 0]])

            # change the size to make sure output gets reallocated ok
            # and also make sure that the GPU version doesn't screw up the
            # transposed-ness
            r = f([[.2, .8]], [.25])
            utt.assert_allclose(r, [[0, 2]])
Example #33
def test_float16():
    # gemv (gemm called)
    float16_data = [
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gemv(*float16_shared)
    f = theano.function([], o, mode=mode_with_gpu)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(n.op, GpuGemm) for n in topo])

    # gemm
    float16_data = [
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gpugemm_no_inplace(*float16_shared)
    f = theano.function([], o)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)

    # dot22
    float16_data = [rand(3, 3).astype("float16"), rand(3, 3).astype("float16")]

    float16_shared = [gpuarray_shared_constructor(val) for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = theano.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
Example #34
def test_boolean_mask():
    tensor = T.constant([0, 1, 2, 3], dtype=theano.config.floatX)
    mask = np.array([True, False, True, False])
    masked = nn.utils.boolean_mask(tensor, mask)
    utt.assert_allclose(masked.eval(), (0, 2))

    tensor = [[1, 2], [3, 4], [5, 6]]
    mask = np.array([True, False, True])
    masked = nn.utils.boolean_mask(tensor, mask)
    utt.assert_allclose(masked.eval(), [[1, 2], [5, 6]])

    tensor_np = np.random.rand(3, 4, 2).astype(theano.config.floatX)
    tensor = T.as_tensor(tensor_np)
    mask = T.all(tensor > .5, 2)
    masked = nn.utils.boolean_mask(tensor, mask)
    utt.assert_allclose(masked.eval(), tensor_np[np.all(tensor_np > .5, 2)])
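As the assertions above indicate, `boolean_mask` behaves like NumPy boolean indexing: entries (or whole leading-axis slices, when the mask has fewer dimensions) are kept where the mask is True. A NumPy-only sketch of the expected semantics:

import numpy as np

tensor_np = np.arange(4.0)
mask = np.array([True, False, True, False])
assert np.allclose(tensor_np[mask], [0.0, 2.0])

# With a lower-dimensional mask, whole leading-axis slices are kept.
tensor_np = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
mask = np.array([True, False, True])
assert np.allclose(tensor_np[mask], [[1.0, 2.0], [5.0, 6.0]])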
Example #35
    def test_GpuCumsum1D(self):
        block_max_size = self.max_threads_dim0 * 2

        x = T.fvector('x')
        f = theano.function([x], cumsum(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]

        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in xrange(a.shape[0]):
            utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))

        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2, )).astype("float32")
        utt.assert_allclose(np.cumsum(a), f(a))

        # Use recursive cumsum
        a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                    dtype="float32")
        utt.assert_allclose(np.cumsum(a), f(a))
Example #36
def test_velocity():
    t_tensor = tt.dvector()
    t = np.linspace(0, 100, 1000)
    m_planet = 0.1
    m_star = 1.3
    orbit = KeplerianOrbit(
        m_star=m_star,
        r_star=1.0,
        t0=0.5,
        period=100.0,
        ecc=0.1,
        omega=0.5,
        Omega=1.0,
        incl=0.25 * np.pi,
        m_planet=m_planet,
    )

    star_pos = orbit.get_star_position(t_tensor)
    star_vel = theano.function([], orbit.get_star_velocity(t))()
    star_vel_expect = np.empty_like(star_vel)
    for i in range(3):
        g = theano.grad(tt.sum(star_pos[i]), t_tensor)
        star_vel_expect[i] = theano.function([t_tensor], g)(t)
    utt.assert_allclose(star_vel, star_vel_expect)

    planet_pos = orbit.get_planet_position(t_tensor)
    planet_vel = theano.function([], orbit.get_planet_velocity(t))()
    planet_vel_expect = np.empty_like(planet_vel)
    for i in range(3):
        g = theano.grad(tt.sum(planet_pos[i]), t_tensor)
        planet_vel_expect[i] = theano.function([t_tensor], g)(t)
    utt.assert_allclose(planet_vel, planet_vel_expect)

    pos = orbit.get_relative_position(t_tensor)
    vel = np.array(theano.function([], orbit.get_relative_velocity(t))())
    vel_expect = np.empty_like(vel)
    for i in range(3):
        g = theano.grad(tt.sum(pos[i]), t_tensor)
        vel_expect[i] = theano.function([t_tensor], g)(t)
    utt.assert_allclose(vel, vel_expect)
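Each block above checks the closed-form velocity against `theano.grad` of the summed position, i.e. v(t) = dx/dt evaluated coordinate by coordinate. The same idea can be checked numerically with finite differences; a small sketch with a hypothetical `position` function standing in for one orbit coordinate:

import numpy as np

def position(t):
    # Hypothetical 1-D stand-in for one coordinate of the orbit position.
    return np.sin(0.1 * t)

t = np.linspace(0.0, 100.0, 1000)
vel_analytic = 0.1 * np.cos(0.1 * t)        # closed-form derivative
vel_numeric = np.gradient(position(t), t)   # finite-difference d/dt
# Interior points agree to finite-difference accuracy.
assert np.allclose(vel_analytic[1:-1], vel_numeric[1:-1], atol=1e-4)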
Example #37
    def test_DownsampleFactorMax(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        # generate random images
        maxpoolshps = ((1, 1), (2, 2), (3, 3), (2, 3))
        imval = rng.rand(4, 2, 16, 16)
        images = tensor.dtensor4()
        for maxpoolshp, ignore_border, mode in product(
                maxpoolshps, [True, False],
            ['max', 'sum', 'average_inc_pad', 'average_exc_pad']):
            # print 'maxpoolshp =', maxpoolshp
            # print 'ignore_border =', ignore_border

            # Pure Numpy computation
            numpy_output_val = self.numpy_max_pool_2d(imval,
                                                      maxpoolshp,
                                                      ignore_border,
                                                      mode=mode)
            output = pool_2d(images, maxpoolshp, ignore_border, mode=mode)
            f = function([images], [output])
            output_val = f(imval)
            utt.assert_allclose(output_val, numpy_output_val)

            # Pool op
            maxpool_op = Pool(maxpoolshp,
                              ignore_border=ignore_border,
                              mode=mode)(images)

            output_shape = Pool.out_shape(imval.shape,
                                          maxpoolshp,
                                          ignore_border=ignore_border)
            utt.assert_allclose(numpy.asarray(output_shape),
                                numpy_output_val.shape)
            f = function([images], maxpool_op)
            output_val = f(imval)
            utt.assert_allclose(output_val, numpy_output_val)
Example #38
    def test_GpuCumOp1D(self, mode):
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2

        x = T.fvector('x')
        f = theano.function([x], op_class(axis=0)(x), mode=self.mode)
        assert [
            n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)
        ]

        # Extensive testing for the first 1025 sizes
        a = np.random.random(1025).astype("float32")
        for i in xrange(a.shape[0]):
            utt.assert_allclose(np_func(a[:i]), f(a[:i]))

        # Use multiple GPU threadblocks
        a = np.random.random((block_max_size + 2, )).astype("float32")
        utt.assert_allclose(np_func(a), f(a))

        # Use recursive cumop
        a = np.ones((block_max_size * (block_max_size + 1) + 2, ),
                    dtype="float32")
        utt.assert_allclose(np_func(a), f(a))
Example #39
    def test_los(self):
        f, _, in_args = self.get_args()
        in_args[-1] = np.ones_like(in_args[-1])
        out = f(*in_args)
        utt.assert_allclose(0.0, out)
Example #40
    def test_basic(self):
        f, _, in_args = self.get_args()
        out = f(*in_args)
        utt.assert_allclose(0.0, out[0])
        utt.assert_allclose(0.0, out[-1])
Example #41
def _params_allgood(ishape,
                    kshape,
                    mode,
                    subsample=(1, 1),
                    img_stride=(1, 1),
                    kern_stride=(1, 1),
                    version=-1,
                    verbose=0,
                    random=True,
                    print_=None,
                    id=None,
                    rtol=1e-5,
                    atol=1e-8,
                    nb_iter=0,
                    ones=False,
                    compile_kshp=None,
                    theano_mode=None,
                    cls=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) Hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape),
                                  dtype='float32') + 1
        npy_kern = -(
            theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape),
                            dtype='float32') + 1)

    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # We take the stride after the transfer, as we make the data
    # c-contiguous on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    i = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(broadcastable=[sh == 1
                                            for sh in npy_kern.shape])()
    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                          subsample=subsample,
                                          version=version,
                                          verbose=verbose,
                                          kshp=compile_kshp)(i, k)
    f = theano.function([i, k], op, mode=theano_mode)
    if cls is not None:
        assert any([
            isinstance(node.op, cls) for node in f.maker.fgraph.toposort()
        ]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
    t2 = time.time()
    gpuval = f(img, kern)
    t3 = time.time()
    for i in range(nb_iter):
        gpuval2 = f(img, kern)
        assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
    gpuval = numpy.asarray(gpuval)

    # CPU val computed after GPU val to get the GPU errors.
    t0 = time.time()
    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()

    assert gpuval.shape == cpuval.shape, ("shape mismatch", gpuval.shape,
                                          cpuval.shape)
    assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
    assert numpy.all(numpy.isfinite(gpuval)), gpuval
    assert [(sh == 1) is br
            for sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2])]

    if (t2 is not None):
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t3 - t2)
        if verbose > 0:
            print('%15s' % str(ishape),
                  '%15s' % str(kshape),
                  end=' ',
                  file=sys.stdout)
            print('%12.5f  %7.2f %7.2f %7.1f' %
                  (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t3 - t2)),
                  file=sys.stdout)
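The FLOP estimate at the end counts one multiply and one add per (input channel, kernel tap) contribution to each output element, hence `cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2` in 'valid' mode. A worked instance with small, illustrative shapes:

ishape = (4, 3, 16, 16)   # (batch, channels, rows, cols)
kshape = (8, 3, 5, 5)     # (filters, channels, kr, kc)
out_rows, out_cols = 16 - 5 + 1, 16 - 5 + 1
out_size = ishape[0] * kshape[0] * out_rows * out_cols   # cpuval.size

# One multiply + one add per (channel, kernel tap) contribution.
approx_fp = out_size * ishape[1] * kshape[2] * kshape[3] * 2
print(approx_fp / 1e6, "MFLOP")   # 0.6912 MFLOP for these shapes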
Example #42
    def test_DownsampleFactorMax(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        # maxpool, input size
        examples = (
            ((2,), (16,)),
            ((2,), (4, 16)),
            ((2,), (4, 2, 16)),
            ((1, 1), (4, 2, 16, 16)),
            ((2, 2), (4, 2, 16, 16)),
            ((3, 3), (4, 2, 16, 16)),
            ((3, 2), (4, 2, 16, 16)),
            ((3, 2, 2), (3, 2, 16, 16, 16)),
            ((2, 3, 2), (3, 2, 16, 16, 16)),
            ((2, 2, 3), (3, 2, 16, 16, 16)),
            ((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
        )

        for example, ignore_border, mode in product(
                examples, [True, False],
            ['max', 'sum', 'average_inc_pad', 'average_exc_pad']):
            (maxpoolshp, inputsize) = example
            imval = rng.rand(*inputsize)
            images = theano.shared(imval)

            # Pure Numpy computation
            numpy_output_val = self.numpy_max_pool_nd(imval,
                                                      maxpoolshp,
                                                      ignore_border,
                                                      mode=mode)

            # The pool_2d or pool_3d helper methods
            if len(maxpoolshp) == 2:
                output = pool_2d(images, maxpoolshp, ignore_border, mode=mode)
                f = function([], [output])
                output_val = f()
                utt.assert_allclose(output_val, numpy_output_val)
            elif len(maxpoolshp) == 3:
                output = pool_3d(images, maxpoolshp, ignore_border, mode=mode)
                f = function([], [output])
                output_val = f()
                utt.assert_allclose(output_val, numpy_output_val)

            # Pool op
            maxpool_op = Pool(ndim=len(maxpoolshp),
                              ignore_border=ignore_border,
                              mode=mode)(images, maxpoolshp)

            output_shape = Pool.out_shape(imval.shape,
                                          maxpoolshp,
                                          ndim=len(maxpoolshp),
                                          ignore_border=ignore_border)
            utt.assert_allclose(numpy.asarray(output_shape),
                                numpy_output_val.shape)
            f = function([], maxpool_op)
            output_val = f()
            utt.assert_allclose(output_val, numpy_output_val)
Example #43
    def test_softmax_grad(self):
        def cmp(n, m, f, f_gpu):
            data = numpy.arange(n * m, dtype='float32').reshape(n, m)
            gdata = numpy.asarray(data)[:, :, None, None]

            out = f(data)
            gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
            utt.assert_allclose(out, gout)

        x = T.matrix('x', 'float32')
        x_gpu = T.tensor4('x_gpu', 'float32')
        f_z = T.nnet.softmax_op
        f_gpu = dnn.GpuDnnSoftmax('accurate', 'channel')

        # Verify the grad operation
        dims = (2, 3, 4, 5)
        gdata = numpy.arange(numpy.product(dims),
                             dtype='float32').reshape(dims)
        T.verify_grad(f_gpu, [gdata], rng=numpy.random, mode=mode_with_gpu)

        # Verify that the CPU and GPU implementations return the same results
        # up to a tolerance.

        self._test_softmax(x, x_gpu, f_z, f_gpu, cmp)

        self._test_softmax(x, x, f_z, f_z, self._cmp)

        # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
        # optimization is applied when cudnn is required
        y = T.fvector('y')
        f = theano.function([y],
                            T.grad(T.nnet.softmax(y).mean(), y),
                            mode=mode_with_gpu)
        sorted_f = f.maker.fgraph.toposort()
        val = numpy.random.rand(5).astype('float32')
        out_dnn = f(val)
        assert (len(
            [i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 1)
        assert (len([
            i for i in sorted_f
            if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)
        ]) == 0)

        # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
        # optimization is not applied when cudnn is excluded or not
        # available
        mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
        y = T.fvector('y')
        f = theano.function([y],
                            T.grad(T.nnet.softmax(y).mean(), y),
                            mode=mode_wo_cudnn)
        sorted_f = f.maker.fgraph.toposort()
        out_cpu = f(val)
        utt.assert_allclose(out_dnn, out_cpu)
        assert (len(
            [i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 0)
        assert (len([
            i for i in sorted_f
            if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)
        ]) == 1)

        # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad do not
        # crash with manual graph
        y = T.fvector('y')
        o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2)
        f = theano.function([y], o, mode=mode_with_gpu)
        sorted_f = f.maker.fgraph.toposort()
        assert (len(
            [i for i in sorted_f if isinstance(i.op, self.gpu_grad_op)]) == 1)
        assert (len([
            i for i in sorted_f
            if isinstance(i.op, theano.tensor.nnet.SoftmaxGrad)
        ]) == 0)
Example #44
def test_pooling():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    # 'average_exc_pad' is disabled for versions < 4004
    if dnn.version(raises=False) < 4004:
        modes = ('max', 'average_inc_pad')
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')

    x = T.ftensor4()
    for mode, pad in product(modes, ((0, 0), (1, 0), (0, 1), (2, 3), (3, 2))):
        if mode == 'max':
            func = T.max
        else:
            func = T.mean

        if pad != (0, 0) and func is T.mean:
            continue

        for ws in (4, 2, 5):
            for stride in (2, 3):
                if stride > ws:
                    continue
                if pad[0] > stride or pad[1] > stride:
                    # Not implemented
                    continue
                # We will check that the opt introduced it.
                out1 = pool_2d(x, (ws, ws),
                               st=(stride, stride),
                               ignore_border=True,
                               padding=pad,
                               mode=mode)
                out2 = pool_2d_i2n(x,
                                   ds=(ws, ws),
                                   strides=(stride, stride),
                                   pad=pad,
                                   pool_function=func)
                mode_without_gpu2 = mode_without_gpu.including()
                mode_without_gpu2.check_isfinite = False
                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([
                    isinstance(node.op, dnn.GpuDnnPool)
                    for node in f1.maker.fgraph.apply_nodes
                ])
                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any([
                    isinstance(node.op, dnn.GpuDnnPool)
                    for node in f2.maker.fgraph.apply_nodes
                ])
                for shp in [
                    (1, 10, 100, 100),
                    (1, 3, 99, 99),
                    (32, 1, 147, 197),
                ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
                    a = f1(data)
                    b = f2(data)

                    utt.assert_allclose(a, b)

        # Test the grad
        for shp in [(1, 1, 2, 2), (1, 1, 3, 3)]:
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10

            ws = 2
            stride = 2
            if pad[0] > stride or pad[1] > stride:
                # Not implemented
                continue

            # This tests the CPU grad + optimization + GPU implementation
            def fn(x):
                return pool_2d(x, (ws, ws),
                               ignore_border=True,
                               padding=pad,
                               mode=mode)

            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that the optimization inserted the GPU grad op.
            fg = theano.function([x],
                                 theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([
                isinstance(node.op, dnn.GpuDnnPoolGrad)
                for node in fg.maker.fgraph.toposort()
            ])

            # Test the GPU grad + GPU implementation
            def fn(x):
                dnn_op = dnn.dnn_pool(x,
                                      ws=(ws, ws),
                                      stride=(stride, stride),
                                      pad=pad,
                                      mode=mode)
                return dnn_op

            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that we get the expected op.
            fg = theano.function([x],
                                 theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([
                isinstance(node.op, dnn.GpuDnnPoolGrad)
                for node in fg.maker.fgraph.toposort()
            ])
            g_out = fg(data)

            # Compare against the CPU result
            out = pool_2d(x, (ws, ws),
                          padding=pad,
                          ignore_border=True,
                          mode=mode)
            fc = theano.function([x],
                                 theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            if mode == 'max':
                assert any([
                    isinstance(node.op, MaxPoolGrad)
                    for node in fc.maker.fgraph.toposort()
                ])
            else:
                assert any([
                    isinstance(node.op, AveragePoolGrad)
                    for node in fc.maker.fgraph.toposort()
                ])
            c_out = fc(data)
            utt.assert_allclose(c_out, g_out)
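For intuition about what pool_2d computes here, a minimal NumPy reference for unpadded pooling with ignore_border=True (an illustrative sketch, not Theano's implementation; pool2d_ref and its arguments are hypothetical names):

import numpy as np

def pool2d_ref(x, ws, stride, func=np.max):
    # x: (batch, channels, height, width); no padding, borders ignored.
    b, c, h, w = x.shape
    oh = (h - ws) // stride + 1
    ow = (w - ws) // stride + 1
    out = np.empty((b, c, oh, ow), dtype=x.dtype)
    for i in range(oh):
        for j in range(ow):
            window = x[:, :, i * stride:i * stride + ws,
                       j * stride:j * stride + ws]
            out[:, :, i, j] = func(window, axis=(2, 3))
    return out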
Example #45
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the ops are already on the GPU.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Test multiple outputs
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s),
         identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple outputs
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(c_v)
    f = theano.function([a, b],
                        outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
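The upcast this test relies on can be verified directly in NumPy: int8 combined with float32 promotes to float32, which is what lets the whole composite run as one float32 GpuElemwise (a standalone check, not part of the test):

import numpy as np

a = np.ones((2, 2), dtype='int8')
b = np.ones((2, 2), dtype='float32')
assert (a + b).dtype == np.dtype('float32')  # int8 upcasts to float32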
Example #46
def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
    if ndimage is None:
        raise SkipTest("conv3d2d tests need SciPy")

    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5

    signals = numpy.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs,
                                                          Ws).astype('float32')
    filters = numpy.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf,
                                                          Wf).astype('float32')

    t0 = time.time()
    pyres = pyconv3d(signals, filters)
    print(time.time() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(s_signals,
                 s_filters,
                 signals_shape=signals.shape,
                 filters_shape=filters.shape)

    newconv3d = theano.function([], [], updates={s_output: out}, mode=mode)

    check_diagonal_subtensor_view_traces(newconv3d)
    t0 = time.time()
    newconv3d()
    print(time.time() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = theano.function([], [],
                                 updates=[(s_filters, gfilters),
                                          (s_signals, gsignals)],
                                 mode=mode,
                                 name='grad')
    check_diagonal_subtensor_view_traces(gnewconv3d)

    t0 = time.time()
    gnewconv3d()
    print('grad', time.time() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2

    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
    utt.verify_grad(conv3d, [signals, filters], eps=1e-1, mode=mode)

    # Additional test covering the patched implementation for filters with Tf=1
    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 1, 3, 5, 5

    signals = numpy.arange(Ns * Ts * C * Hs * Ws).reshape(Ns, Ts, C, Hs,
                                                          Ws).astype('float32')
    filters = numpy.arange(Nf * Tf * C * Hf * Wf).reshape(Nf, Tf, C, Hf,
                                                          Wf).astype('float32')

    t0 = time.time()
    pyres = pyconv3d(signals, filters)
    print(time.time() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(s_signals,
                 s_filters,
                 signals_shape=signals.shape,
                 filters_shape=filters.shape)

    newconv3d = theano.function([], [], updates={s_output: out}, mode=mode)

    t0 = time.time()
    newconv3d()
    print(time.time() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = theano.function([], [],
                                 updates=[(s_filters, gfilters),
                                          (s_signals, gsignals)],
                                 mode=mode,
                                 name='grad')

    t0 = time.time()
    gnewconv3d()
    print('grad', time.time() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 1, 3, 2, 2

    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
    utt.verify_grad(conv3d, [signals, filters], eps=1e-1, mode=mode)
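pyconv3d is the ground truth above. A naive (and slow) NumPy sketch of a valid 3D convolution over (batch, time, channel, height, width) tensors may help when reading the test; the output layout and the filter flipping are assumptions about conv3d's convention, and conv3d_ref is a hypothetical name:

import numpy as np

def conv3d_ref(signals, filters):
    # signals: (Ns, Ts, C, Hs, Ws); filters: (Nf, Tf, C, Hf, Wf)
    Ns, Ts, C, Hs, Ws = signals.shape
    Nf, Tf, _, Hf, Wf = filters.shape
    Ot, Oh, Ow = Ts - Tf + 1, Hs - Hf + 1, Ws - Wf + 1
    # True convolution flips the filters along t, h and w.
    flipped = filters[:, ::-1, :, ::-1, ::-1]
    out = np.zeros((Ns, Ot, Nf, Oh, Ow), dtype=signals.dtype)
    for n in range(Ns):
        for f in range(Nf):
            for t in range(Ot):
                for i in range(Oh):
                    for j in range(Ow):
                        patch = signals[n, t:t + Tf, :, i:i + Hf, j:j + Wf]
                        out[n, t, f, i, j] = (patch * flipped[f]).sum()
    return out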
Example #47
def test_batch_normalization_test():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor3, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ("x", "scale", "bias",
                                                   "mean", "var"))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean, var, axes,
                                              eps)
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0, )
            elif axes == "spatial":
                axes2 = (0, ) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype("dy")
            grads = T.grad(None,
                           wrt=[x, scale, bias, mean, var],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None,
                            wrt=[x, scale, bias, mean, var],
                            known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([
                isinstance(
                    n.op,
                    (
                        bn.AbstractBatchNormTrain,
                        bn.AbstractBatchNormInference,
                        bn.AbstractBatchNormTrainGrad,
                    ),
                ) for n in f.maker.fgraph.toposort()
            ])
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1),
                               (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Scale = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Bias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Mean = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Var = np.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5],
                                    atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5],
                                    atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6],
                                    outputs[6 + 5],
                                    rtol=2e-3,
                                    atol=4e-5)  # dvar
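The reference expression out2 above is plain inference-mode batch normalization. In NumPy, with parameters already reshaped so they broadcast against x (bn_inference_ref is an illustrative name):

import numpy as np

def bn_inference_ref(x, scale, bias, mean, var, eps=5e-3):
    # scale/bias/mean/var have size-1 entries on the normalized axes.
    return (x - mean) * (scale / np.sqrt(var + eps)) + bias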
Example #48
def test_batch_normalization():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    np.random.seed(1234)
    X = 1 + np.random.random([10, 20]).astype("float32")
    B = 1 + np.random.random([20]).astype("float32")
    G = 1 + np.random.random([20]).astype("float32")
    M = 1 + np.random.random([20]).astype("float32")
    V = 1 + np.random.random([20]).astype("float32")

    x = theano.tensor.matrix("x")
    b = theano.tensor.vector("b")
    g = theano.tensor.vector("g")
    m = theano.tensor.vector("m")
    v = theano.tensor.vector("v")

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, g, b, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, g, b, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs,
                                          gamma,
                                          beta,
                                          mean,
                                          std,
                                          mode=mode)

        utt.verify_grad(bn_f, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True),
                       x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(
            x,
            g,
            b,
            x.mean(axis=0, keepdims=True),
            x.std(axis=0, keepdims=True),
            mode=mode,
        )
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs,
                                          gamma,
                                          beta,
                                          mean,
                                          std,
                                          mode=mode)

        utt.verify_grad(
            bn_f,
            [X, G, B,
             X.mean(axis=0)[np.newaxis],
             X.std(axis=0)[np.newaxis]])
Example #49
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor3, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n) for n in (
                "x", "scale", "bias", "running_mean", "running_var"))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            (
                out,
                x_mean,
                x_invstd,
                out_running_mean,
                out_running_var,
            ) = bn.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0, )
            elif axes == "spatial":
                axes2 = (0, ) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(
                T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = (running_mean * (1 - running_average_factor) +
                                 x_mean2 * running_average_factor)
            out_running_var2 = (running_var * (1 - running_average_factor) +
                                (m /
                                 (m - 1)) * x_var2 * running_average_factor)
            # backward pass
            dy = vartype("dy")
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})
            # second-order backward pass
            dx = vartype("dinputs")
            dscale = vartype("dscale")
            dbias = vartype("dbias")
            grad_grads = T.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict({
                    grads[0]: dx,
                    grads[1]: dscale,
                    grads[2]: dbias
                }),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean,
                    x_invstd,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # reference second-order backward pass
            grad_grads2 = T.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict({
                    grads2[0]: dx,
                    grads2[1]: dscale,
                    grads2[2]: dbias
                }),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean2,
                    x_var2,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # compile
            f = theano.function(
                [
                    x, scale, bias, running_mean, running_var, dy, dx, dscale,
                    dbias
                ],
                [
                    out,
                    x_mean,
                    x_invstd,
                    out_running_mean,
                    out_running_var,
                    out2,
                    x_mean2,
                    x_invstd2,
                    out_running_mean2,
                    out_running_var2,
                ] + grads + grads2 + grad_grads + grad_grads2,
            )
            # check if the abstract Ops have been replaced
            assert not any([
                isinstance(
                    n.op,
                    (
                        bn.AbstractBatchNormTrain,
                        bn.AbstractBatchNormInference,
                        bn.AbstractBatchNormTrainGrad,
                    ),
                ) for n in f.maker.fgraph.toposort()
            ])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1),
                               (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Scale = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Bias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Running_mean = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Running_var = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Dx = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dscale = -1 + 2 * np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Dbias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)

                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy, Dx,
                            Dscale, Dbias)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                utt.assert_allclose(
                    np.nan_to_num(outputs[4]),
                    np.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3],
                                    atol=1e-4)  # dx
                utt.assert_allclose(outputs[11],
                                    outputs[11 + 3],
                                    rtol=2e-4,
                                    atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
                # compare second-order gradients
                utt.assert_allclose(outputs[16], outputs[16 + 3],
                                    atol=1e-4)  # ddx
                utt.assert_allclose(outputs[17], outputs[17 + 3])  # ddy
                utt.assert_allclose(outputs[18],
                                    outputs[18 + 3],
                                    rtol=3e-4,
                                    atol=1e-4)  # ddscale
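The reference forward pass of this test condenses to: normalize with the batch statistics, then blend those statistics into the running estimates, applying the unbiased correction m / (m - 1) to the running variance. A NumPy sketch of those formulas (bn_train_ref is a hypothetical name; axes here are already resolved, like axes2 above):

import numpy as np

def bn_train_ref(x, scale, bias, running_mean, running_var,
                 axes=(0,), eps=5e-3, factor=0.3):
    mean = x.mean(axis=axes, keepdims=True)
    var = x.var(axis=axes, keepdims=True)
    out = (x - mean) * (scale / np.sqrt(var + eps)) + bias
    m = x.size / mean.size  # elements averaged per statistic
    new_mean = running_mean * (1 - factor) + mean * factor
    new_var = running_var * (1 - factor) + (m / (m - 1)) * var * factor
    return out, mean, 1.0 / np.sqrt(var + eps), new_mean, new_var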
Example #50
def test_conv_nnet1():
    utt.seed_rng()
    rval_cpu = run_conv_nnet1(False)
    utt.seed_rng()
    rval_gpu = run_conv_nnet1(True)
    utt.assert_allclose(rval_cpu, rval_gpu, rtol=1e-4, atol=1e-6)
Example #51
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
                               ignore_error=False,
                               n_train=10,
                               gpu_only=False,
                               cpu_only=False,
                               float_atol=1e-06,
                               check_isfinite=True,
                               pickle=False,
                               verbose=0,
                               version=-1):
    """Run the nnet2 function on 1 or 2 devices, and compares the results.

       float_atol: None mean use the default value.
       check_isfinite: the debug mode option. We forward this value to debug mode.
                       For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
    """
    if config.mode == 'DEBUG_MODE':
        n_train = 1

    # Change global tolerance, used in DebugMode for instance
    orig_float32_atol = theano.tensor.basic.float32_atol
    try:
        if float_atol:
            # print "float_atol", float_atol
            theano.tensor.basic.float32_atol = float_atol

        if gpu_only and cpu_only:
            raise ValueError("Please use only one of cpu_only and gpu_only")
        elif cpu_only:
            use_gpu = False
            compare = False
        elif gpu_only:
            use_gpu = True
            compare = False
        else:
            compare = True

        if not compare:
            return run_conv_nnet2_classif(
                use_gpu=use_gpu,
                seed=seed, isize=isize, ksize=ksize, bsize=bsize,
                n_train=n_train,
                check_isfinite=check_isfinite,
                pickle=pickle,
                verbose=verbose,
                version=version)

        utt.seed_rng(seed)  # Seeds numpy.random with seed
        train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \
            build_conv_nnet2_classif(
                use_gpu=False,
                isize=isize,
                ksize=ksize,
                n_batch=bsize,
                verbose=verbose,
                version=version,
                check_isfinite=check_isfinite)

        utt.seed_rng(seed)  # Seeds numpy.random with seed
        train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \
            build_conv_nnet2_classif(
                use_gpu=True,
                isize=isize,
                ksize=ksize,
                n_batch=bsize,
                verbose=verbose,
                version=version,
                check_isfinite=check_isfinite)

        assert x_shape == x_shape_gpu
        assert y_shape == y_shape_gpu

        xval = my_rand(*x_shape)
        yval = my_rand(*y_shape)
        lr = theano._asarray(0.01, dtype='float32')

        time_cpu = 0
        time_gpu = 0

        for i in range(n_train):
            # Train one batch on CPU
            t0 = time.time()
            rval_cpu = train_cpu(xval, yval, lr)[0]
            t1 = time.time()
            time_cpu += (t1 - t0)

            # Train one batch on GPU
            t0 = time.time()
            rval_gpu = train_gpu(xval, yval, lr)[0]
            t1 = time.time()
            time_gpu += (t1 - t0)

            # Compare results
            if (verbose or not
                    numpy.allclose(rval_cpu, rval_gpu, rtol=1e-5, atol=float_atol)):
                print("At batch:", i + 1)
                print("CPU:", rval_cpu)
                print("GPU:", rval_gpu)
                print("abs diff:", numpy.absolute(rval_gpu - rval_cpu))
                print("rel diff:", numpy.absolute((
                    rval_gpu - rval_cpu) / rval_gpu))

            if not ignore_error:
                utt.assert_allclose(rval_cpu, rval_gpu,
                                    rtol=1e-5, atol=float_atol)

            # Synchronize parameters to start from the same point next time
            if i < n_train - 1:
                for cpu_p, gpu_p in zip(params_cpu, params_gpu):
                    cpu_p.set_value(gpu_p.get_value(borrow=False), borrow=True)

    finally:
        theano.tensor.basic.float32_atol = orig_float32_atol
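An illustrative call (the concrete sizes below are made up for the example, not taken from the suite):

# Train three batches on CPU and GPU and compare the costs batch by batch.
cmp_run_conv_nnet2_classif(seed=123, isize=8, ksize=5, bsize=4,
                           n_train=3, verbose=1)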
Example #52
    def test_one_sequence_one_output_weights_gpu1(self):
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector('u')
        x0 = theano.tensor.fscalar('x0')
        W_in = theano.tensor.fscalar('win')
        W = theano.tensor.fscalar('w')

        mode = mode_with_gpu.excluding('InputToGpuOptimizer')
        output, updates = theano.scan(f_rnn,
                                      u,
                                      x0, [W_in, W],
                                      n_steps=None,
                                      truncate_gradient=-1,
                                      go_backwards=False,
                                      mode=mode)

        output = GpuFromHost(test_ctx_name)(output)
        f2 = theano.function([u, x0, W_in, W],
                             output,
                             updates=updates,
                             allow_input_downcast=True,
                             mode=mode)

        rng = numpy.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4, ), low=-5., high=5.)
        v_x0 = rng.uniform()
        W = rng.uniform()
        W_in = rng.uniform()

        v_u = numpy.asarray(v_u, dtype='float32')
        v_x0 = numpy.asarray(v_x0, dtype='float32')
        W = numpy.asarray(W, dtype='float32')
        W_in = numpy.asarray(W_in, dtype='float32')

        # compute the output in numpy
        v_out = numpy.zeros((4, ))
        v_out[0] = v_u[0] * W_in + v_x0 * W
        for step in range(1, 4):
            v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

        theano_values = f2(v_u, v_x0, W_in, W)
        utt.assert_allclose(theano_values, v_out)

        # TODO: remove; duplicated by the checks below
        topo = f2.maker.fgraph.toposort()
        scan_node = [
            node for node in topo
            if isinstance(node.op, theano.scan_module.scan_op.Scan)
        ]
        assert len(scan_node) == 1
        scan_node = scan_node[0]

        topo = f2.maker.fgraph.toposort()
        assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
        assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

        scan_node = [
            node for node in topo
            if isinstance(node.op, theano.scan_module.scan_op.Scan)
        ]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(
            [isinstance(node.op, GpuElemwise) for node in scan_node_topo])
        assert not any(
            [isinstance(node.op, HostFromGpu) for node in scan_node_topo])
        assert not any(
            [isinstance(node.op, GpuFromHost) for node in scan_node_topo])
Example #53
def test_dnn_conv_alpha_output_merge():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    b = 1
    c = 4
    f = 3
    ih = 5
    iw = 8
    kh = 2
    kw = 6
    img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
    kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
    out_val = numpy.random.random(
        (b, f, ih - kh + 1, iw - kw + 1)).astype('float32')

    conv = dnn.dnn_conv(img, kern)
    gw = theano.grad(conv.sum(), kern)
    gi = theano.grad(conv.sum(), img)

    lr = numpy.asarray(0.05, dtype='float32')

    fr = lr * (conv + out)
    wr = kern + lr * gw
    ir = img + lr * gi

    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
                      dnn.GpuDnnConv)
    assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
                      dnn.GpuDnnConvGradW)
    assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
                      dnn.GpuDnnConvGradI)

    mode = mode_with_gpu
    mode = mode.excluding('local_dnn_conv_alpha_merge')
    mode = mode.excluding('local_dnn_convw_alpha_merge')
    mode = mode.excluding('local_dnn_convi_alpha_merge')
    mode = mode.excluding('local_dnn_conv_output_merge')
    mode = mode.excluding('local_dnn_convw_output_merge')
    mode = mode.excluding('local_dnn_convi_output_merge')

    f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)

    assert not isinstance(f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
                          dnn.GpuDnnConv)
    assert not isinstance(f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
                          dnn.GpuDnnConvGradW)
    assert not isinstance(f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
                          dnn.GpuDnnConvGradI)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)

    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
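What the merge optimizations buy: cuDNN convolutions accept alpha/beta scaling, so an expression such as lr * (conv + out) can fold into a single kernel call instead of a convolution followed by elementwise work. The fused and unfused forms agree numerically, as this standalone NumPy illustration (not the cuDNN API) shows:

import numpy as np

conv_res = np.random.rand(2, 3).astype('float32')  # stand-in for a conv output
prev_out = np.random.rand(2, 3).astype('float32')
lr = np.float32(0.05)

unfused = lr * (conv_res + prev_out)
fused = lr * conv_res + lr * prev_out  # what alpha/beta folding computes
assert np.allclose(unfused, fused)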
Example #54
    def test1(self):
        a = tensor.dmatrix()
        w = sort(a)
        f = theano.function([a], w)
        utt.assert_allclose(f(self.m_val), np.sort(self.m_val))
Example #55
def check_equality_two_nd_array(a, b):
    utt.assert_allclose(a, b, atol=1e-5, rtol=1e-5)
    return True
Example #56
    def test_machine_translation(self):
        # This test case comes from https://github.com/rizar/scan-grad-speed and
        # is an example of actual computation done with scan in the context of
        # machine translation
        #
        # 'dim' has been reduced from 1000 to 5 to make the test run faster

        # Parameters from an actual machine translation run
        batch_size = 80
        seq_len = 50
        dim = 5

        # Weight matrices
        U = theano.shared(
            np.random.normal(size=(dim, dim),
                             scale=0.0001).astype(config.floatX))
        U.name = "U"
        V = theano.shared(U.get_value())
        V.name = "V"
        W = theano.shared(U.get_value())
        W.name = "W"

        # Variables and their values
        x = T.tensor3("x")
        x_value = np.random.normal(size=(seq_len, batch_size, dim),
                                   scale=0.0001).astype(config.floatX)

        ri = T.tensor3("ri")
        ri_value = x_value

        zi = T.tensor3("zi")
        zi_value = x_value

        init = T.alloc(np.cast[config.floatX](0), batch_size, dim)

        def rnn_step1(
            # sequences
            x,
            ri,
            zi,
            # outputs_info
            h,
        ):
            pre_r = ri + h.dot(U)
            pre_z = zi + h.dot(V)
            r = T.nnet.sigmoid(pre_r)
            z = T.nnet.sigmoid(pre_z)

            after_r = r * h
            pre_h = x + after_r.dot(W)
            new_h = T.tanh(pre_h)

            res_h = z * new_h + (1 - z) * h
            return res_h

        # Compile the function twice, once with the optimization and once
        # without
        opt_mode = mode.including("scan")
        h, _ = theano.scan(
            rnn_step1,
            sequences=[x, ri, zi],
            n_steps=seq_len,
            outputs_info=init,
            name="fpass1",
            mode=opt_mode,
        )
        cost = h[-1].sum()
        grad1 = T.grad(cost, [U, V, W])
        f_opt = theano.function(inputs=[x, ri, zi],
                                outputs=grad1,
                                mode=opt_mode)

        no_opt_mode = mode.excluding("scanOp_pushout_output")
        h, _ = theano.scan(
            rnn_step1,
            sequences=[x, ri, zi],
            n_steps=seq_len,
            outputs_info=init,
            name="fpass1",
            mode=no_opt_mode,
        )
        cost = h[-1].sum()
        grad1 = T.grad(cost, [U, V, W])
        f_no_opt = theano.function(inputs=[x, ri, zi],
                                   outputs=grad1,
                                   mode=no_opt_mode)

        # Validate that the optimization has been applied
        scan_node_grad = [
            node for node in f_opt.maker.fgraph.toposort()
            if isinstance(node.op, Scan)
        ][1]

        for output in scan_node_grad.op.outputs:
            assert not (
                isinstance(output.owner.op, T.elemwise.Elemwise)
                and any([isinstance(i, T.Dot) for i in output.owner.inputs]))

        # Compare the outputs of the two functions on the same input data.
        f_opt_output = f_opt(x_value, ri_value, zi_value)
        f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
        utt.assert_allclose(f_opt_output, f_no_opt_output)
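rnn_step1 is a GRU-style update. One step in plain NumPy, as a reading aid (names mirror the symbolic version; this is not the scan implementation):

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def gru_like_step(x, ri, zi, h, U, V, W):
    r = sigmoid(ri + h.dot(U))           # reset gate
    z = sigmoid(zi + h.dot(V))           # update gate
    new_h = np.tanh(x + (r * h).dot(W))  # candidate state
    return z * new_h + (1 - z) * h       # blend old and candidate state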
Example #57
    def test_DownsampleFactorMaxStride(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        # maxpool, stride, ignore_border, input, output sizes
        examples = (
            ((1, 1), (1, 1), True, (4, 10, 16, 16), (4, 10, 16, 16)),
            ((1, 1), (3, 3), True, (4, 10, 16, 16), (4, 10, 6, 6)),
            ((1, 1), (5, 7), True, (4, 10, 16, 16), (4, 10, 4, 3)),
            ((1, 1), (1, 1), False, (4, 10, 16, 16), (4, 10, 16, 16)),
            ((1, 1), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
            ((1, 1), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
            ((3, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 14, 14)),
            ((3, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 5, 5)),
            ((3, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
            ((3, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 14, 14)),
            ((3, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
            ((3, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
            ((5, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 12, 14)),
            ((5, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 4, 5)),
            ((5, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
            ((5, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 12, 14)),
            ((5, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 5, 6)),
            ((5, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
            ((16, 16), (1, 1), True, (4, 10, 16, 16), (4, 10, 1, 1)),
            ((16, 16), (3, 3), True, (4, 10, 16, 16), (4, 10, 1, 1)),
            ((16, 16), (5, 7), True, (4, 10, 16, 16), (4, 10, 1, 1)),
            ((16, 16), (1, 1), False, (4, 10, 16, 16), (4, 10, 1, 1)),
            ((16, 16), (3, 3), False, (4, 10, 16, 16), (4, 10, 1, 1)),
            ((16, 16), (5, 7), False, (4, 10, 16, 16), (4, 10, 1, 1)),
            ((3, ), (5, ), True, (16, ), (3, )),
            ((3, ), (5, ), True, (2, 16), (2, 3)),
            ((5, ), (3, ), True, (2, 3, 16), (2, 3, 4)),
            ((5, 1, 3), (3, 3, 3), True, (2, 16, 16, 16), (2, 4, 6, 5)),
            ((5, 1, 3), (3, 3, 3), True, (4, 2, 16, 16, 16), (4, 2, 4, 6, 5)),
        )

        for example, mode in product(
                examples,
                ['max', 'sum', 'average_inc_pad', 'average_exc_pad']):
            (maxpoolshp, stride, ignore_border, inputshp, outputshp) = example
            # generate random images
            imval = rng.rand(*inputshp)
            images = theano.shared(imval)
            # Pool op
            numpy_output_val = \
                self.numpy_max_pool_nd_stride(imval, maxpoolshp,
                                              ignore_border, stride,
                                              mode)
            assert numpy_output_val.shape == outputshp, (
                "outshape is %s, calculated shape is %s" %
                (outputshp, numpy_output_val.shape))
            maxpool_op = \
                Pool(ndim=len(maxpoolshp),
                     ignore_border=ignore_border,
                     mode=mode)(images, maxpoolshp, stride)
            f = function([], maxpool_op)
            output_val = f()
            utt.assert_allclose(output_val, numpy_output_val)
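The expected shapes in the examples table follow a simple per-dimension rule: with ignore_border=True the output length is floor((in - pool) / stride) + 1; with ignore_border=False, a final partially covered window also counts. A small helper stating that rule (pooled_len is a hypothetical name, checked against one table row):

def pooled_len(in_len, pool, stride, ignore_border):
    if ignore_border:
        return (in_len - pool) // stride + 1
    # Integer ceiling of (in_len - pool) / stride, plus one.
    return (in_len - pool + stride - 1) // stride + 1

# Row ((3, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)):
assert pooled_len(16, 3, 5, False) == 4
assert pooled_len(16, 3, 7, False) == 3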
Example #58
    def _cmp(self, n, m, f, f_gpu):
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        out = f(data)
        gout = f_gpu(data)
        utt.assert_allclose(out, gout)
Example #59
def test_zca_dataset():
    """
    Tests the ZCA_Dataset class.
    """
    # Preparation
    rng = np.random.RandomState([2014, 11, 4])
    start = 0
    stop = 990
    num_examples = 1000
    num_feat = 5
    num_classes = 2

    # random_dense_design_matrix has values that are centered and of
    # unit stdev, which is not useful to test the ZCA.
    # So we replace its values with uncentered uniform ones.
    raw = random_dense_design_matrix(rng, num_examples, num_feat, num_classes)
    x = rng.uniform(low=-0.5, high=2.0, size=(num_examples, num_feat))
    x = x.astype(np.float32)
    raw.X = x

    zca = ZCA(filter_bias=0.0)
    zca.apply(raw, can_fit=True)
    zca_dataset = ZCA_Dataset(raw, zca, start, stop)

    # Testing general behaviour
    mean = zca_dataset.X.mean(axis=0)
    var = zca_dataset.X.std(axis=0)
    assert_allclose(mean, np.zeros(num_feat), atol=1e-2)
    assert_allclose(var, np.ones(num_feat), atol=1e-2)

    # Testing mapback()
    y = zca_dataset.mapback(zca_dataset.X)
    assert_allclose(x[start:stop], y)

    # Testing mapback_for_viewer()
    y = zca_dataset.mapback_for_viewer(zca_dataset.X)
    z = x/np.abs(x).max(axis=0)
    assert_allclose(z[start:stop], y, rtol=1e-2)

    # Testing adjust_for_viewer()
    y = zca_dataset.adjust_for_viewer(x.T).T
    z = x/np.abs(x).max(axis=0)
    assert_allclose(z, y)

    # Testing adjust_to_be_viewed_with()
    y = zca_dataset.adjust_to_be_viewed_with(x, 2*x, True)
    z = zca_dataset.adjust_for_viewer(x)
    assert_allclose(z/2, y)
    y = zca_dataset.adjust_to_be_viewed_with(x, 2*x, False)
    z = x/np.abs(x).max()
    assert_allclose(z/2, y)

    # Testing has_targets()
    assert zca_dataset.has_targets()
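For reference, the ZCA transform itself can be written compactly in NumPy. This is a hedged sketch of the standard whitening recipe, not pylearn2's implementation (zca_whiten is a hypothetical name; eps plays the role of filter_bias):

import numpy as np

def zca_whiten(X, eps=0.0):
    # X: (examples, features). Whitened output has ~identity covariance.
    Xc = X - X.mean(axis=0)
    cov = np.cov(Xc, rowvar=False)
    eigval, eigvec = np.linalg.eigh(cov)
    W = eigvec.dot(np.diag(1.0 / np.sqrt(eigval + eps))).dot(eigvec.T)
    return Xc.dot(W)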
Example #60
    def validate(
            self,
            image_shape,
            filter_shape,
            border_mode="valid",
            subsample=(1, 1, 1),
            input=None,
            filters=None,
            verify_grad=True,
            non_contiguous=False,
            filter_dilation=(1, 1, 1),
    ):
        """
        :param image_shape: The constant shape info passed to corr3dMM.
        :param filter_shape: The constant shape info passed to corr3dMM.
        """
        if not theano.config.cxx:
            pytest.skip("Need cxx for this test")

        N_image_shape = [
            T.get_scalar_constant_value(T.as_tensor_variable(x))
            for x in image_shape
        ]
        N_filter_shape = [
            T.get_scalar_constant_value(T.as_tensor_variable(x))
            for x in filter_shape
        ]

        if input is None:
            input = self.input
        if filters is None:
            filters = self.filters

        # THEANO IMPLEMENTATION

        # we create a symbolic function so that verify_grad can work
        def sym_Corr3dMM(input, filters):
            # define theano graph and function
            input.name = "input"
            filters.name = "filters"
            rval = corr3d.Corr3dMM(border_mode, subsample,
                                   filter_dilation)(input, filters)
            rval.name = "corr_output"
            return rval

        output = sym_Corr3dMM(input, filters)
        output.name = "Corr3dMM()(%s,%s)" % (input.name, filters.name)
        theano_corr = theano.function([input, filters], output, mode=self.mode)

        # initialize input and compute result
        image_data = np.random.random(N_image_shape).astype(self.dtype)
        filter_data = np.random.random(N_filter_shape).astype(self.dtype)
        image_data /= 10
        filter_data /= 10
        if non_contiguous:
            image_data = np.transpose(image_data, axes=(0, 1, 4, 3, 2))
            image_data = image_data.copy()
            image_data = np.transpose(image_data, axes=(0, 1, 4, 3, 2))
            filter_data = np.transpose(filter_data, axes=(0, 1, 4, 3, 2))
            filter_data = filter_data.copy()
            filter_data = np.transpose(filter_data, axes=(0, 1, 4, 3, 2))
            assert not image_data.flags["CONTIGUOUS"]
            assert not filter_data.flags["CONTIGUOUS"]

        theano_output = theano_corr(image_data, filter_data)

        # REFERENCE IMPLEMENTATION
        # Testing correlation, not convolution. Reverse filters.
        filter_data_corr = np.array(filter_data[:, :, ::-1, ::-1, ::-1],
                                    copy=True,
                                    order="C")
        orig_image_data = image_data
        img_shape3d = np.array(N_image_shape[-3:])
        fil_shape3d = np.array(N_filter_shape[-3:])
        dil_shape3d = np.array(filter_dilation)
        dil_fil_shape3d = (fil_shape3d - 1) * dil_shape3d + 1
        subsample3d = np.array(subsample)
        if border_mode == "full":
            padHWD = dil_fil_shape3d - 1
        elif border_mode == "valid":
            padHWD = np.array([0, 0, 0])
        elif border_mode == "half":
            padHWD = np.floor(dil_fil_shape3d / 2).astype("int32")
        elif isinstance(border_mode, tuple):
            padHWD = np.array(border_mode)
        elif isinstance(border_mode, integer_types):
            padHWD = np.array([border_mode, border_mode, border_mode])
        else:
            raise NotImplementedError(
                "Unsupported border_mode {}".format(border_mode))
        out_shape3d = (np.floor(
            (img_shape3d + 2 * (padHWD) - dil_fil_shape3d) / subsample3d) + 1)
        # avoid numpy deprecation
        out_shape3d = out_shape3d.astype("int32")
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape3d)
        ref_output = np.zeros(out_shape)

        # loop over output feature maps
        ref_output.fill(0)
        image_data2 = np.zeros((
            N_image_shape[0],
            N_image_shape[1],
            N_image_shape[2] + 2 * padHWD[0],
            N_image_shape[3] + 2 * padHWD[1],
            N_image_shape[4] + 2 * padHWD[2],
        ))
        image_data2[:, :, padHWD[0]:padHWD[0] + N_image_shape[2],
                    padHWD[1]:padHWD[1] + N_image_shape[3],
                    padHWD[2]:padHWD[2] + N_image_shape[4], ] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape
        for bb in range(N_image_shape[0]):
            for nn in range(N_filter_shape[0]):
                for im0 in range(N_image_shape[1]):
                    filter3d = filter_data_corr[nn, im0, :, :, :]
                    image3d = image_data[bb, im0, :, :, :]
                    for row in range(ref_output.shape[2]):
                        irow = row * subsample[0]  # image row
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            for slc in range(ref_output.shape[4]):
                                islc = slc * subsample[2]  # image slice
                                ref_output[bb, nn, row, col, slc] += (
                                    image3d[
                                        irow:irow +
                                        dil_fil_shape3d[0]:filter_dilation[0],
                                        icol:icol +
                                        dil_fil_shape3d[1]:filter_dilation[1],
                                        islc:islc + dil_fil_shape3d[2]:
                                        filter_dilation[2], ] *
                                    filter3d[::-1, ::-1, ::-1]).sum()

        utt.assert_allclose(theano_output, ref_output)

        # TEST GRADIENT
        if verify_grad:
            utt.verify_grad(sym_Corr3dMM, [orig_image_data, filter_data],
                            mode=self.mode)
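The reference loop above depends on the usual dilated-correlation shape rule, computed inline at out_shape3d. Stated once as a function (a restatement of that arithmetic; corr_out_len is a hypothetical name):

def corr_out_len(in_len, k_len, pad, stride, dilation):
    # The effective kernel span after dilation is (k_len - 1) * dilation + 1.
    span = (k_len - 1) * dilation + 1
    return (in_len + 2 * pad - span) // stride + 1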