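Example: a GPU reduction test (it appears to come from the PyCUDA/compyte test suite). The function needs the imports below; dtypes_all, enable_double, gen_gpu_nd_array, and MyGpuNdArray are assumed to be provided by the surrounding test module.

import numpy
from itertools import product
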
def test_sum():
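    """Compare MyGpuNdArray.sum() with numpy over many shapes, dtypes,
    memory orders, offsets, and slicings, for full and per-axis
    reductions."""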
    to_cpu = numpy.asarray
    dtypes = list(dtypes_all)
    # Remove *int8: the output currently keeps the input dtype,
    # so the sum overflows.
    dtypes.remove("int8")
    dtypes.remove("uint8")
    # Need to determine how PyCUDA handles complex types in C;
    # an extra header is probably all that is required.
    dtypes.remove("complex64")
    # complex128 is present only when double precision is enabled.
    if enable_double:
        dtypes.remove("complex128")
    for shape in [
        # Need shapes bigger than 32, 1024, and 4096;
        # those are the corner cases.

        # 1d; takes only a few seconds on a GTX 470
        (0,), (5,), (31,), (32,), (33,),
        (1023,), (1024,), (1025,),
        (4095,), (4096,), (4097,),
        (32 * 1024 - 1,), (32 * 1024,), (32 * 1024 + 1,),

        # 2d; takes about 2 minutes on a GTX 470
        (0, 0), (1, 0), (0, 1), (5, 4),
        (31, 31), (31, 32), (31, 33),
        (32, 31), (32, 32), (32, 33),
        (33, 31), (33, 32), (33, 33),
        (1024, 32), (1025, 32),
        (1024, 33), (1025, 33),
        (4096, 32), (32, 4096), (4096, 33), (33, 4096),
        (4097, 32), (32, 4097), (4097, 33), (33, 4097),

        # 3d; takes about 2 minutes on a GTX 470
        (0, 0, 0), (0, 1, 0), (0, 0, 1),
        (5, 4, 3),
        (4096, 2, 33), (2, 4096, 33), (33, 2, 4096),
        (4097, 2, 33), (2, 4097, 33), (33, 2, 4097),
        (4096, 33, 2), (33, 4096, 2), (2, 33, 4096),
        (4097, 33, 2), (33, 4097, 2), (2, 33, 4097),

        # 4d; takes about 1 minute on a GTX 470
        (0, 0, 0, 0), (1, 0, 0, 0), (0, 1, 0, 0),
        (0, 0, 1, 0), (0, 0, 0, 1),
        (5, 4, 3, 2),
        (1024, 32, 2, 3), (3, 1024, 32, 2), (2, 3, 1024, 32),
        (1024, 2, 32, 3), (3, 1024, 2, 32), (1024, 3, 2, 32),
        (1025, 33, 2, 3), (3, 1025, 33, 2), (2, 3, 1025, 33),
        (1025, 2, 33, 3), (3, 1025, 2, 33), (1025, 3, 2, 33),
        (4100, 4, 3, 2), (4, 4100, 3, 2),
        (4, 3, 4100, 2), (4, 3, 2, 4100),

        # 5d; handled only when the array is C or F contiguous
        # (see the contiguity check below)
        (5, 4, 3, 10, 11),
        ]:

        for dtype, off_o, off_i, sliced, order in product(
                dtypes, [False, True], [False, True],
                [-1, 2, -2, 1], ['f', 'c']):

            cpu_val, gpu_val = gen_gpu_nd_array(shape, dtype, off_o,
                                                off_i, sliced, order)

            if len(shape) > 4 and not (gpu_val.flags["C_CONTIGUOUS"] or
                                       gpu_val.flags["F_CONTIGUOUS"]):
                continue
            gpu_val = MyGpuNdArray(gpu_val)
            cpu_sum = cpu_val.sum()
            # Uncomment to debug a failing configuration:
            # print(dtype, shape, off_o, off_i, sliced, order)
            # print(cpu_val.strides, cpu_val.flags["C_CONTIGUOUS"],
            #       cpu_val.flags["F_CONTIGUOUS"])
            # print(gpu_val.strides, gpu_val.flags["C_CONTIGUOUS"],
            #       gpu_val.flags["F_CONTIGUOUS"])
            gpu_sum = to_cpu(gpu_val.sum())

            def get_rtol(orig, after_reduction):
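                # Loosen the float32 tolerance as more input elements are
                # folded into each output value; other dtypes keep the
                # tight default below.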
                if after_reduction.size == 0:
                    return 0
                if orig.size // after_reduction.size > 500000:
                    rtols = {"float32": 4.3e-5}
                elif orig.size // after_reduction.size > 100000:
                    rtols = {"float32": 3e-5}
                elif orig.size // after_reduction.size > 50000:
                    rtols = {"float32": 2e-5}
                else:
                    rtols = {"float32": 1e-5}
                if dtype in rtols:
                    rtol = rtols[dtype]
                else:
                    rtol = 1e-8
                return rtol
            rtol = get_rtol(gpu_val, gpu_sum)
            cpu_sum = cpu_sum.astype(dtype)
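            # Skip (u)int16 when the sum would overflow the 16-bit output dtype.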
            if not (dtype.endswith("int16") and numpy.prod(shape) > 20000):
                assert (numpy.allclose(cpu_sum, gpu_sum, rtol=rtol) or
                        cpu_sum == gpu_sum), (
                    dtype, shape, cpu_sum, gpu_sum,
                    (cpu_sum - gpu_sum) / cpu_sum)

            # Test the per-axis reduction patterns:
            # 10 and 01 in 2d; 100, 010, and 001 in 3d.
            if len(shape) in [2, 3]:
                for axis in range(len(shape)):
                    gpu_sum = to_cpu(gpu_val.sum(axis=[axis]))
                    cpu_sum = cpu_val.sum(axis=axis)
                    rtol = get_rtol(gpu_val, gpu_sum)
                    # Locate the worst-matching element for the error message.
                    cpu_max = gpu_max = None
                    if cpu_sum.size > 0:
                        argmax = numpy.absolute(cpu_sum - gpu_sum).argmax()
                        cpu_max = cpu_sum.flatten()[argmax]
                        gpu_max = gpu_sum.flatten()[argmax]
                    assert numpy.allclose(cpu_sum, gpu_sum, rtol=rtol), (
                        "axis=%d" % axis, dtype, shape, cpu_sum.shape,
                        cpu_sum, gpu_sum,
                        cpu_max, gpu_max, (cpu_max - gpu_max) / cpu_max)
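
For reference, a numpy-only sketch of the two numerical effects the test works around; the array sizes below are illustrative assumptions, not values taken from the test.

import numpy

# 1) Why the *int8 (and large int16) cases are skipped: when the sum keeps
#    the input dtype, the accumulator wraps around.
a = numpy.ones(300, dtype="int8")
print(a.sum(dtype="int8"))   # 44, i.e. 300 mod 256
print(a.sum())               # 300: numpy's default accumulator is wider

# 2) Why get_rtol() loosens the float32 tolerance as the reduction ratio
#    grows: rounding error accumulates with the number of summed elements,
#    so a float32 sum drifts away from a float64 reference.
b = numpy.random.rand(1_000_000).astype("float32")
exact = float(b.sum(dtype="float64"))
rel_err = abs(float(b.sum(dtype="float32")) - exact) / exact
print(rel_err)               # small but nonzero, and it grows with the size

These are the same effects that the int16 skip and the size-dependent tolerances in get_rtol() account for.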