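Example: a GPU reduction test (it appears to come from the PyCUDA/compyte test suite). The function needs the imports below; dtypes_all, enable_double, gen_gpu_nd_array, and MyGpuNdArray are assumed to be provided by the surrounding test module.

import numpy
from itertools import product
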
def test_sum():
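    """Compare MyGpuNdArray.sum() with numpy over many shapes, dtypes,
    memory orders, offsets, and slicings, for full and per-axis
    reductions."""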
    to_cpu = numpy.asarray
    dtypes = list(dtypes_all)
    # Remove *int8: the output currently keeps the input dtype,
    # so the sum overflows.
    dtypes.remove("int8")
    dtypes.remove("uint8")
    # Need to determine how PyCUDA handles complex types in C;
    # an extra header is probably all that is required.
    dtypes.remove("complex64")
    # complex128 is present only when double precision is enabled.
    if enable_double:
        dtypes.remove("complex128")
    for shape in [
        # Need shapes bigger than 32, 1024, and 4096;
        # those are the corner cases.

        # 1d; takes only a few seconds on a GTX 470
        (0,), (5,), (31,), (32,), (33,),
        (1023,), (1024,), (1025,),
        (4095,), (4096,), (4097,),
        (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),

        # 2d; takes about 2 minutes on a GTX 470
        (0, 0), (1, 0), (0, 1), (5, 4),
        (31, 31), (31, 32), (31, 33),
        (32, 31), (32, 32), (32, 33),
        (33, 31), (33, 32), (33, 33),
        (1024, 32), (1025, 32),
        (1024, 33), (1025, 33),
        (4096, 32), (32, 4096), (4096, 33), (33, 4096),
        (4097, 32), (32, 4097), (4097, 33), (33, 4097),

        # 3d; takes about 2 minutes on a GTX 470
        (0, 0, 0), (0, 1, 0), (0, 0, 1),
        (5, 4, 3),
        (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
        (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
        (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
        (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),

        # 4d; takes about 1 minute on a GTX 470
        (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
        (0, 0, 1, 0), (0, 0, 0, 1),
        (5, 4, 3, 2),
        (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
        (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
        (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
        (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
        (4100, 4, 3, 2), (4, 4100, 3, 2),
        (4, 3, 4100, 2), (4, 3, 2, 4100),

        # 5d; handled only when the array is C or F contiguous
        # (see the contiguity check below)
        (5, 4, 3, 10, 11),
        ]:

        for dtype, off_o, off_i, sliced, order in product(
                dtypes, [False, True], [False, True],
                [-1, 2, -2, 1], ['f', 'c']):

            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
                                                off_i, sliced, order)

            if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or
                                       gpu_val.flags["F_CONTIGUOUS"]):
                continue
            gpu_val = MyGpuNdArray(gpu_val)
            cpu_sum = cpu_val.sum()
            # Uncomment to debug a failing configuration:
            # print(dtype, shape, off_o, off_i, sliced, order)
            # print(cpu_val.strides, cpu_val.flags["C_CONTIGUOUS"],
            #       cpu_val.flags["F_CONTIGUOUS"])
            # print(gpu_val.strides, gpu_val.flags["C_CONTIGUOUS"],
            #       gpu_val.flags["F_CONTIGUOUS"])
            gpu_sum = to_cpu(gpu_val.sum())

            def get_rtol(orig, after_reduction):
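                # Loosen the float32 tolerance as more input elements are
                # folded into each output value; other dtypes keep the
                # tight default below.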
                if after_reduction.size == 0:
                    return 0
                if orig.size // after_reduction.size > 500000:
                    rtols = {"float32": 4.3e-5}
                elif orig.size // after_reduction.size > 100000:
                    rtols = {"float32": 3e-5}
                elif orig.size // after_reduction.size > 50000:
                    rtols = {"float32": 2e-5}
                else:
                    rtols = {"float32": 1e-5}
                if dtype in rtols:
                    rtol = rtols[dtype]
                else:
                    rtol = 1e-8
                return rtol
            rtol = get_rtol(gpu_val, gpu_sum)
            cpu_sum = cpu_sum.astype(dtype)
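            # Skip (u)int16 when the sum would overflow the 16-bit output dtype.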
            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
                        cpu_sum == gpu_sum), (
                    dtype, shape, cpu_sum, gpu_sum,
                    (cpu_sum - gpu_sum) / cpu_sum)

            # Test the per-axis reduction patterns:
            # 10 and 01 in 2d; 100, 010, and 001 in 3d.
            if len(shape) in [2, 3]:
                for axis in range(len(shape)):
                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
                    cpu_sum = cpu_val.sum(axis=axis)
                    rtol = get_rtol(gpu_val, gpu_sum)
                    # Locate the worst-matching element for the error message.
                    cpu_max = gpu_max = None
                    if cpu_sum.size > 0:
                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
                        cpu_max = cpu_sum.flatten()[argmax]
                        gpu_max = gpu_sum.flatten()[argmax]
                    assert numpy.allclose(cpu_sum, gpu_sum, rtol=rtol), (
                        "axis=%d" % axis, dtype, shape, cpu_sum.shape,
                        cpu_sum, gpu_sum,
                        cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max)
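
For reference, a numpy-only sketch of the two numerical effects the test works around; the array sizes below are illustrative assumptions, not values taken from the test.

import numpy

# 1) Why the *int8 (and large int16) cases are skipped: when the sum keeps
#    the input dtype, the accumulator wraps around.
a = numpy.ones(300, dtype="int8")
print(a.sum(dtype="int8"))   # 44, i.e. 300 mod 256
print(a.sum())               # 300: numpy's default accumulator is wider

# 2) Why get_rtol() loosens the float32 tolerance as the reduction ratio
#    grows: rounding error accumulates with the number of summed elements,
#    so a float32 sum drifts away from a float64 reference.
b = numpy.random.rand(1_000_000).astype("float32")
exact = float(b.sum(dtype="float64"))
rel_err = abs(float(b.sum(dtype="float32")) - exact) / exact
print(rel_err)               # small but nonzero, and it grows with the size

These are the same effects that the int16 skip and the size-dependent tolerances in get_rtol() account for.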