コード例 #1
0
def test_broadcast_alias():
    """Check ``qpu_broadcast_alias`` for every rotation amount in [-15, 15].

    ``X`` holds 0..15 and ``Y`` has one 16-lane row per rotation amount.
    After execution, the row for rotation ``rot`` must have every lane equal
    to ``X[-rot % 16]``.
    """
    rotations = range(-15, 16)

    with Driver() as drv:

        code = drv.program(qpu_broadcast_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((len(rotations), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, rot in enumerate(rotations):
            # A single source lane is expected to be replicated to all lanes.
            assert (Y[ix] == X[(-rot % 16)].repeat(16)).all()
コード例 #2
0
def boilerplate_sfu_regs(sfu_regs, domain_limitter):
    """Run ``qpu_sfu_regs`` and compare each output row with ``ops``.

    Args:
        sfu_regs: Sequence of register names. Each one produces one output
            row and is used as a key into the module-level ``ops`` table to
            obtain the reference function.
        domain_limitter: Callable applied to the random float32 input before
            it is stored in ``X`` (presumably to keep the values inside the
            valid domain of the tested function -- confirm with callers).
    """
    with Driver() as drv:

        code = drv.program(lambda asm: qpu_sfu_regs(asm, sfu_regs))
        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((len(sfu_regs), 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = domain_limitter(np.random.randn(*X.shape).astype('float32'))
        Y[:] = 0.0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, reg in enumerate(sfu_regs):
            msg = 'mov({}, None)'.format(reg)
            assert np.allclose(Y[ix], ops[reg](X), rtol=1e-4), msg
コード例 #3
0
def test_parallel_16():
    """Run the same workload with a serial and a 16-thread kernel.

    Both kernels must reproduce ``X`` in their own output buffer, and the
    parallel run must take less than twice as long as the serial run.
    """
    with Driver() as drv:

        thread = 16

        serial_code = drv.program(qpu_serial)
        parallel_code = drv.program(qpu_parallel_16)
        X = drv.alloc((thread, 16), dtype='float32')
        Ys = drv.alloc((thread, 16), dtype='float32')
        Yp = drv.alloc((thread, 16), dtype='float32')
        unif = drv.alloc((thread, 4), dtype='uint32')

        X[:] = np.random.randn(*X.shape)
        # Sentinel values so that an untouched output row is detected.
        Ys[:] = -1
        Yp[:] = -1

        # One uniform row per thread: address of the row itself, row length,
        # input row address, output row address.
        unif[:, 0] = unif.addresses()[:, 0]
        unif[:, 1] = unif.shape[1]
        unif[:, 2] = X.addresses()[:, 0]
        unif[:, 3] = Ys.addresses()[:, 0]

        # Durations feed an assertion below, so use the monotonic clock:
        # time.time() may jump when the system clock is adjusted.
        start = time.monotonic()
        drv.execute(serial_code, unif.addresses()[0, 0])
        serial_cost = time.monotonic() - start

        # Redirect the output column to the buffer of the parallel run.
        unif[:, 3] = Yp.addresses()[:, 0]

        start = time.monotonic()
        drv.execute(parallel_code, unif.addresses()[0, 0], thread=thread)
        parallel_cost = time.monotonic() - start

        np.set_printoptions(threshold=np.inf)

        assert (X == Ys).all()
        assert (X == Yp).all()
        assert parallel_cost < serial_cost * 2
コード例 #4
0
def test_signal_ldtmu():
    """Check the three output rows written by ``qpu_signal_ldtmu``.

    Row 0 must equal the random input ``X``; rows 1 and 2 must be filled
    with the constants 2 and 4 respectively.
    """
    with Driver() as drv:

        code = drv.program(qpu_signal_ldtmu)
        X = drv.alloc((16, ), dtype='float32')
        Y = drv.alloc((3, 16), dtype='float32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.random.randn(*X.shape).astype('float32')
        Y[:] = 0.0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        assert (Y[0] == X).all()
        assert (Y[1] == 2).all()
        assert (Y[2] == 4).all()
コード例 #5
0
def test_rotate_alias():
    """Check ``qpu_rotate_alias`` for every rotation amount in [-15, 15].

    ``Y`` has shape (2, 31, 16): two result planes with one 16-lane row per
    rotation amount. For rotation ``rot`` both planes must equal ``X``
    (0..15) read cyclically starting at index ``-rot % 16``.
    """
    rotations = range(-15, 16)

    with Driver() as drv:

        code = drv.program(qpu_rotate_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((2, len(rotations), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        # Doubling X lets a plain slice express the cyclic window.
        expected = np.concatenate([X, X])
        for ix, rot in enumerate(rotations):
            offset = -rot % 16
            assert (Y[:, ix] == expected[offset:offset + 16]).all()
コード例 #6
0
ファイル: scopy.py プロジェクト: zeta1999/py-videocore6
def scopy(*, length, num_qpus=8, unroll_shift=0):
    """Copy a float32 vector with ``qpu_scopy`` and report the throughput.

    Args:
        length: Number of float32 elements; must be positive and a multiple
            of ``16 * 8 * num_qpus * 2**unroll_shift``.
        num_qpus: Forwarded to the kernel and used as the thread count.
        unroll_shift: Forwarded to the kernel (log2 of the unroll factor,
            judging by the divisibility check).
    """
    assert length > 0
    granularity = 16 * 8 * num_qpus * (1 << unroll_shift)
    assert length % granularity == 0

    print(f'==== scopy example ({length / 1024 / 1024} Mi elements) ====')

    # Room for source and destination plus 1024 spare 4-byte words.
    with Driver(data_area_size=(length * 2 + 1024) * 4) as drv:

        kernel = drv.program(qpu_scopy,
                             num_qpus=num_qpus,
                             unroll_shift=unroll_shift,
                             code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        src = drv.alloc(length, dtype='float32')
        dst = drv.alloc(length, dtype='float32')

        src[:] = np.arange(*src.shape, dtype=src.dtype)
        dst[:] = -src

        # The buffers must differ before the copy for the check to mean
        # anything.
        assert not np.array_equal(src, dst)

        # Uniform stream: element count, source address, destination address.
        uniforms = drv.alloc(3, dtype='uint32')
        uniforms[0] = length
        uniforms[1] = src.addresses()[0]
        uniforms[2] = dst.addresses()[0]

        print('Executing on QPU...')

        began = monotonic()
        drv.execute(kernel, uniforms.addresses()[0], thread=num_qpus)
        elapsed = monotonic() - began

        assert np.array_equal(src, dst)

        print(f'{elapsed} sec, {length * 4 / elapsed * 1e-6} MB/s')
コード例 #7
0
def summation(*, length, num_qpus=8, unroll_shift=5):
    """Sum 0..length-1 with ``qpu_summation`` and report the throughput.

    The kernel leaves ``16 * num_qpus`` partial sums in ``Y``; their total
    modulo 2**32 must equal ``length * (length - 1) / 2`` modulo 2**32.

    Args:
        length: Number of uint32 elements; must be positive and a multiple
            of ``16 * 8 * num_qpus * 2**unroll_shift``.
        num_qpus: Forwarded to the kernel and used as the thread count.
        unroll_shift: Forwarded to the kernel (log2 of the unroll factor,
            judging by the divisibility check).
    """
    assert length > 0
    assert length % (16 * 8 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== summation example ({length / 1024 / 1024} Mi elements) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:

        code = drv.program(qpu_summation,
                           num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')
        Y = drv.alloc(16 * num_qpus, dtype='uint32')

        X[:] = np.arange(length, dtype=X.dtype)
        Y.fill(0)

        assert not Y.any()

        # Uniform stream: element count, input address, output address.
        unif = drv.alloc(3, dtype='uint32')
        unif[0] = length
        unif[1] = X.addresses()[0]
        unif[2] = Y.addresses()[0]

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        # Accumulate in 64 bits explicitly: the builtin sum() over uint32
        # scalars picks its accumulator type differently across NumPy
        # versions and may wrap (with a warning) at 32 bits.
        total = int(Y.sum(dtype='uint64'))
        assert total % 2**32 == (length - 1) * length // 2 % 2**32

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')
コード例 #8
0
def test_quad_rotate_alias():
    """Check ``qpu_quad_rotate_alias`` for rotation amounts in [-15, 15].

    ``X`` (0..15) is viewed as four groups of four lanes. For rotation
    ``rot`` each of the four result planes in ``Y`` must hold every group
    read cyclically starting at index ``-rot % 4`` within the group.
    """
    rotations = range(-15, 16)

    with Driver() as drv:

        code = drv.program(qpu_quad_rotate_alias)
        X = drv.alloc((16, ), dtype='int32')
        Y = drv.alloc((4, len(rotations), 16), dtype='int32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0, 0]

        drv.execute(code, unif.addresses()[0])

        # Each 4-lane group is doubled so a plain slice gives the cyclic
        # window inside the group.
        expected = np.concatenate([X.reshape(4, 4)] * 2, axis=1)
        for ix, rot in enumerate(rotations):
            offset = -rot % 4
            assert (Y[:, ix] == expected[:, offset:offset + 4].ravel()).all()
コード例 #9
0
def test_branch_link_reg():
    """Check ``qpu_branch_link_reg`` for all four flag combinations.

    Row 0 of ``Y`` must equal the random input ``X``. Row 1 must be filled
    with 2 when ``set_subroutine_link`` is False and with 1 when it is True,
    regardless of ``use_link_reg_direct``.
    """
    for set_subroutine_link, expected in [(False, 2), (True, 1)]:
        for use_link_reg_direct in [False, True]:
            with Driver() as drv:

                code = drv.program(lambda asm: qpu_branch_link_reg(
                    asm, set_subroutine_link, use_link_reg_direct))
                X = drv.alloc(16, dtype='uint32')
                Y = drv.alloc((2, 16), dtype='uint32')
                unif = drv.alloc(2, dtype='uint32')

                # Go through int32 first: converting a negative float
                # directly to an unsigned integer is platform dependent,
                # whereas int32 -> uint32 wraps deterministically.
                noise = np.random.randn(16) * 1024
                X[:] = noise.astype('int32').astype('uint32')
                Y[:] = 0.0

                # Uniform stream: input address followed by output address.
                unif[0] = X.addresses()[0]
                unif[1] = Y.addresses()[0, 0]

                drv.execute(code, unif.addresses()[0])

                assert (Y[0] == X).all()
                assert (Y[1] == expected).all()
コード例 #10
0
def test_tmu_write():
    """Run ``qpu_tmu_write`` over ``n * 16`` words and report throughput.

    The buffer is pre-filled with a sentinel; after execution element ``k``
    must hold the value ``k``.
    """
    print()

    n = 256 * 1024

    # Data area: the n * 16 word buffer plus the two uniform words.
    with Driver(data_area_size=n * 16 * 4 + 2 * 4) as drv:

        code = drv.program(qpu_tmu_write)
        data = drv.alloc(n * 16, dtype='uint32')
        unif = drv.alloc(2, dtype='uint32')

        data[:] = 0xdeadbeaf
        unif[0] = n
        unif[1] = data.addresses()[0]

        start = time.time()
        drv.execute(code, unif.addresses()[0])
        end = time.time()

        # Reduce inside NumPy; the builtin all() would iterate the millions
        # of elements one by one in Python.
        assert (data == range(n * 16)).all()

        print(f'{end - start} sec')
        print(f'{data.nbytes / (end - start) / 1000 / 1000} MB/s')
コード例 #11
0
def boilerplate_unary_ops(uni_ops, dst, src):
    """Run ``qpu_unary_ops`` for every (op, dst modifier, src modifier) case.

    Args:
        uni_ops: Sequence of unary operation names (keys of ``ops``).
        dst: Pair ``(dtype, modifiers)`` describing the destination; each
            modifier is a key of ``ops`` applied to the output row.
        src: Pair ``(dtype, modifiers)`` describing the source; each
            modifier is a key of ``ops`` applied to the input.

    Raises:
        TypeError: If the destination dtype is neither floating point nor
            integer, because no comparison rule exists for it.
    """
    dst_dtype, dst_ops = dst
    src_dtype, src_ops = src

    # Decide the comparison rule once, and refuse unknown dtypes instead of
    # silently skipping every assertion.
    dst_kind = np.dtype(dst_dtype).kind
    if dst_kind not in 'fiu':
        raise TypeError(
            'unsupported destination dtype: {}'.format(np.dtype(dst_dtype)))

    with Driver() as drv:

        cases = list(itertools.product(uni_ops, dst_ops, src_ops))

        code = drv.program(
            lambda asm: qpu_unary_ops(asm, uni_ops, dst_ops, src_ops))
        # Both buffers span 16 * 4 bytes per row whatever the item size.
        X = drv.alloc((16 * 4 // np.dtype(src_dtype).itemsize, ),
                      dtype=src_dtype)
        Y = drv.alloc((len(cases), 16 * 4 // np.dtype(dst_dtype).itemsize),
                      dtype=dst_dtype)
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.random.uniform(-(2**15), 2**15, X.shape).astype(src_dtype)
        Y[:] = 0.0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0, 0]

        drv.execute(code, unif.addresses()[0])

        for ix, (uni_op, dst_op, src_op) in enumerate(cases):
            msg = '{}({}, {})'.format(uni_op, dst_op, src_op)
            actual = ops[dst_op](Y[ix])
            reference = ops[uni_op](ops[src_op](X))
            if dst_kind == 'f':
                # Floating-point results are compared with a tolerance.
                assert np.allclose(actual, reference, rtol=1e-2), msg
            else:
                # Integer results must match exactly.
                assert np.all(actual == reference), msg
コード例 #12
0
def test_branch_abs_imm():
    """Check ``qpu_branch_abs_imm``: the output must equal ``X + 2``.

    A one-instruction dummy program is assembled first. The absolute value
    handed to the kernel is that program's first address plus ``16 * 8``
    (the meaning of the offset is defined by the kernel, not shown here).
    """
    with Driver() as drv:

        @qpu
        def qpu_dummy(asm):
            nop()
        dummy = drv.program(qpu_dummy)
        target = int(dummy.addresses()[0] + 16 * 8)
        code = drv.program(lambda asm: qpu_branch_abs_imm(asm, target))
        X = drv.alloc((16, ), dtype='uint32')
        Y = drv.alloc((16, ), dtype='uint32')
        unif = drv.alloc(3, dtype='uint32')

        X[:] = np.arange(16)
        Y[:] = 0.0

        # Uniform stream: input address followed by output address.
        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]

        drv.execute(code, unif.addresses()[0])

        assert (Y == X + 2).all()
コード例 #13
0
ファイル: memset.py プロジェクト: zeta1999/py-videocore6
def memset(*, fill, length, num_qpus=8, unroll_shift=1):
    """Fill a uint32 buffer with ``qpu_memset`` and report the throughput.

    Args:
        fill: 32-bit value every element must hold afterwards.
        length: Number of uint32 elements; must be positive and a multiple
            of ``16 * num_qpus * 2**unroll_shift``.
        num_qpus: Forwarded to the kernel and used as the thread count.
        unroll_shift: Forwarded to the kernel (log2 of the unroll factor,
            judging by the divisibility check).
    """
    assert length > 0
    assert length % (16 * num_qpus * (1 << unroll_shift)) == 0

    print(f'==== memset example ({length * 4 / 1024 / 1024} MiB) ====')

    with Driver(data_area_size=(length + 1024) * 4) as drv:

        code = drv.program(qpu_memset,
                           num_qpus=num_qpus,
                           unroll_shift=unroll_shift,
                           code_offset=drv.code_pos // 8)

        print('Preparing for buffers...')

        X = drv.alloc(length, dtype='uint32')

        # Pre-fill with the bitwise complement so that no element already
        # holds the target value. ~fill of a Python int is negative, which
        # NumPy 2 refuses to store into an unsigned array, hence the mask.
        X.fill(~fill & 0xFFFFFFFF)

        assert not np.array_equiv(X, fill)

        # Uniform stream: buffer address, fill value, element count.
        unif = drv.alloc(3, dtype='uint32')
        unif[0] = X.addresses()[0]
        unif[1] = fill
        unif[2] = length

        print('Executing on QPU...')

        start = monotonic()
        drv.execute(code, unif.addresses()[0], thread=num_qpus)
        end = monotonic()

        assert np.array_equiv(X, fill)

        print(f'{end - start} sec, {length * 4 / (end - start) * 1e-6} MB/s')
コード例 #14
0
ファイル: test_branch.py プロジェクト: zeta1999/py-videocore6
def test_uniform_branch_reg():
    """Check ``qpu_uniform_branch_reg``: every output lane must become 5.

    ``X[1]`` carries the absolute address of ``unif[4]``, whose value is 5;
    the other uniforms hold different values (3, 4, 6), so the assertion
    only passes if the value at the branch point is the one written.
    """
    with Driver() as drv:

        code = drv.program(qpu_uniform_branch_reg)
        X = drv.alloc((16, ), dtype='uint32')
        Y = drv.alloc((16, ), dtype='uint32')
        unif = drv.alloc(6, dtype='uint32')

        X[1] = unif.addresses()[4]  # absolute address for uniform branch
        Y[:] = 0.0

        unif[0] = X.addresses()[0]
        unif[1] = Y.addresses()[0]
        unif[2] = 3
        unif[3] = 4
        unif[4] = 5  # uniform branch point here
        unif[5] = 6

        drv.execute(code, unif.addresses()[0])

        assert (Y == 5).all()
コード例 #15
0
ファイル: sgemm.py プロジェクト: zeta1999/py-videocore6
def sgemm_rnn_naive():
    """Benchmark ``C = alpha * A.dot(B) + beta * C`` on the QPU against NumPy.

    Uses fixed 1024 x 1024 float32 matrices and 8 threads, each thread
    receiving one tile of a 2 x 4 tiling of ``C``. Prints timings, Gflop/s
    and the absolute / relative error of the QPU result against NumPy.
    """
    thread = 8

    P = 1024
    Q = 1024
    R = 1024

    assert P % (16 * 2) == 0
    assert R % (16 * 4) == 0

    with Driver() as drv:

        code = drv.program(lambda asm: qpu_sgemm_rnn_naive(asm, thread))

        A = drv.alloc((P, Q), dtype='float32')
        B = drv.alloc((Q, R), dtype='float32')
        C = drv.alloc((P, R), dtype='float32')

        # Fixed seed; the draw order (alpha, beta, A, B, C) is kept so the
        # generated data stays the same.
        np.random.seed(0)
        alpha = np.random.randn()
        beta = np.random.randn()
        A_ref = np.random.randn(*A.shape).astype(A.dtype)
        B_ref = np.random.randn(*B.shape).astype(B.dtype)
        C_ref = np.random.randn(*C.shape).astype(C.dtype)

        A[:] = A_ref
        B[:] = B_ref
        C[:] = C_ref

        # Reference result and timing on the CPU.
        cpu_began = getsec()
        C_ref[:] = alpha * A_ref.dot(B_ref) + beta * C_ref
        time_ref = getsec() - cpu_began

        def block_2x4_params(i, j):
            """Return the uniform words for tile (i, j) of the 2 x 4 grid."""
            rows_per_tile = P // 2
            cols_per_tile = R // 4
            row0 = rows_per_tile * i
            col0 = cols_per_tile * j
            return [
                rows_per_tile,
                Q,
                cols_per_tile,
                A.addresses()[row0, 0],
                A.strides[0],
                B.addresses()[0, col0],
                B.strides[0],
                C.addresses()[row0, col0],
                C.strides[0],
                *pack_unpack('f', 'I', [alpha, beta]),
            ]

        words_per_thread = len(block_2x4_params(0, 0))
        unif_params = drv.alloc((thread, words_per_thread), dtype='uint32')
        for tid in range(thread):
            tile_row, tile_col = divmod(tid, 4)
            unif_params[tid] = block_2x4_params(tile_row, tile_col)

        # Top-level uniforms: address of the parameter table and the number
        # of words per thread.
        unif = drv.alloc(2, dtype='uint32')
        unif[0] = unif_params.addresses()[0, 0]
        unif[1] = unif_params.shape[1]

        gpu_began = getsec()
        drv.execute(code, unif.addresses()[0], thread=thread)
        time_gpu = getsec() - gpu_began

        np.set_printoptions(threshold=np.inf)

        def Gflops(sec):
            """Convert a duration into Gflop/s for this problem size."""
            flop = 2 * P * Q * R + 3 * P * R
            return flop / sec * 1e-9

        abs_err = np.abs(C - C_ref)
        rel_err = np.abs((C - C_ref) / C_ref)

        print(f'==== sgemm example ({P}x{Q} times {Q}x{R}) ====')
        print(f'numpy: {time_ref:.4} sec, {Gflops(time_ref):.4} Gflop/s')
        print(f'QPU:   {time_gpu:.4} sec, {Gflops(time_gpu):.4} Gflop/s')
        print(f'Minimum absolute error: {np.min(abs_err)}')
        print(f'Maximum absolute error: {np.max(abs_err)}')
        print(f'Minimum relative error: {np.min(rel_err)}')
        print(f'Maximum relative error: {np.max(rel_err)}')
コード例 #16
0
ファイル: comatcopy_t.py プロジェクト: Idein/qmkl6
def benchmark():
    """Time ``qpu_comatcopy_t`` on an 8192 x 8192 matrix of 8-byte elements.

    First compares 1 and 8 QPUs with a fixed tiling, then sweeps tile and
    subtile shapes with 8 QPUs. One result line is printed per run.
    """
    from time import monotonic

    import numpy as np

    from videocore6.driver import Driver

    def run(drv,
            unif,
            src,
            dst,
            num_qpus,
            rows,
            cols,
            tile_rows,
            tile_cols,
            subtile_rows,
            subtile_cols,
            code_offset=0):
        """Assemble, execute and time the kernel for one configuration."""
        kernel = drv.program(qpu_comatcopy_t,
                             num_qpus=num_qpus,
                             tile_rows=tile_rows,
                             tile_cols=tile_cols,
                             subtile_rows=subtile_rows,
                             subtile_cols=subtile_cols,
                             code_offset=code_offset)

        # Reset both buffers to a running index before every run.
        for buf in (src, dst):
            buf[:, :] = np.arange(buf.size, dtype=buf.dtype).reshape(buf.shape)

        # Uniform stream: rows, cols, two packed floats (1.0 and 0.0),
        # then address and byte pitch of the source and of the destination.
        uniform_words = (
            rows,
            cols,
            pack_unpack('f', 'I', 1.),
            pack_unpack('f', 'I', 0.),
            src.addresses()[0, 0],
            cols * 8,
            dst.addresses()[0, 0],
            rows * 8,
        )
        for slot, word in enumerate(uniform_words):
            unif[slot] = word

        began = monotonic()
        drv.execute(kernel, unif.addresses()[0], thread=num_qpus)
        elapsed = monotonic() - began

        print(f'{num_qpus} QPUs,', f'{rows} x {cols} matrix,',
              f'{tile_rows:2} x {tile_cols:2} tile,',
              f'{subtile_rows:2} x {subtile_cols:2} subtile:',
              f'{elapsed} seconds,',
              f'{rows * cols * 8 / elapsed * 1e-6} MB/s')

    rows = 8192
    cols = 8192

    with Driver(data_area_size=1100 * 1024 * 1024) as drv:

        unif = drv.alloc(8, dtype='uint32')
        src = drv.alloc((rows, cols), dtype='uint64')
        dst = drv.alloc((cols, rows), dtype='uint64')

        # Fixed tiling, varying the number of QPUs.
        for num_qpus in (1, 8):
            run(drv, unif, src, dst, num_qpus, rows, cols, 4, 8, 4, 4)

        # Fixed QPU count, sweeping tile shapes (32 elements per tile) and
        # subtile shapes (16 elements per subtile).
        for tile_rows in (2, 4, 8, 16):
            for subtile_rows in (2, 4, 8):
                run(drv, unif, src, dst, 8, rows, cols, tile_rows,
                    32 // tile_rows, subtile_rows, 16 // subtile_rows)
コード例 #17
0
def test_multiple_dispatch_delay():
    """Measure dispatch latency under three waiting strategies.

    Ten programs are assembled; program ``i`` is expected to fill row ``i``
    of ``data`` with the value ``i``. They are dispatched

    1. back to back, waiting only when the dispatcher context exits,
    2. one by one, polling the ``done`` word after each dispatch,
    3. as in 2, but sleeping one second before each dispatch.

    After each pass the contents of ``data`` are verified, and the timings
    are printed at the end.
    """
    print()

    bench = BenchHelper('benchmarks/libbench_helper.so')

    with Driver() as drv:

        data = drv.alloc((10, 16), dtype='uint32')
        count = data.shape[0]
        # Bind i as a default argument so each lambda keeps its own value
        # even if it were invoked after the comprehension has finished.
        code = [
            drv.program(lambda asm, i=i: qpu_write_N(asm, i))
            for i in range(count)
        ]
        unif = drv.alloc((count, 2), dtype='uint32')
        done = drv.alloc(1, dtype='uint32')

        # Per-program uniforms: address of its output row, address of the
        # completion flag.
        unif[:, 0] = data.addresses()[:, 0]
        unif[:, 1] = done.addresses()[0]

        expected = np.arange(count).reshape(count, 1)

        def polled_pass(pause):
            """Dispatch every program, polling `done`; return the timings."""
            data[:] = 0
            timings = np.zeros(count, dtype='float32')
            with drv.compute_shader_dispatcher() as csd:
                for i in range(count):
                    done[:] = 0
                    if pause:
                        time.sleep(pause)
                    start = time.time()
                    csd.dispatch(code[i], unif.addresses()[i, 0])
                    bench.wait_address(done)
                    end = time.time()
                    timings[i] = end - start
            assert (data == expected).all()
            return timings

        data[:] = 0
        ref_start = time.time()
        with drv.compute_shader_dispatcher() as csd:
            for i in range(count):
                csd.dispatch(code[i], unif.addresses()[i, 0])
        ref_end = time.time()
        assert (data == expected).all()

        naive_results = polled_pass(0)
        sleep_results = polled_pass(1)

        # The original bare `print` was a no-op expression; emit the blank
        # line it was meant to produce.
        print()
        print(
            f'API wait after {count} dispatch: {ref_end - ref_start:.6f} sec'
        )
        print(f'polling wait for each {count} dispatch:')
        print(f'    total: {np.sum(naive_results):.6f} sec')
        print(f'    details: {" ".join([f"{t:.6f}" for t in naive_results])}')
        print(
            f'polling wait for each {count} dispatch with between sleep:'
        )
        print(f'    total: {np.sum(sleep_results):.6f} sec + sleep...')
        print(f'    details: {" ".join([f"{t:.6f}" for t in sleep_results])}')
コード例 #18
0
def test_tmu_load_1_slot_1_qpu():
    """Benchmark ``qpu_tmu_load_1_slot_1_qpu`` against the number of nops.

    For both memory layouts of ``X`` (``trans`` False / True) and for 0..23
    nops, the kernel is dispatched ten times. Each run checks that ``Y``
    equals the sum of ``X`` along the loop axis and records the elapsed
    time. The timings are printed as averages and saved as a scatter plot
    under ``benchmarks/``.
    """
    bench = BenchHelper('benchmarks/libbench_helper.so')

    for trans in [False, True]:

        with Driver() as drv:

            loop = 2**15

            X = drv.alloc((16, loop) if trans else (loop, 16), dtype='float32')
            Y = drv.alloc(16, dtype='float32')
            unif = drv.alloc(6, dtype='uint32')
            done = drv.alloc(1, dtype='uint32')

            # Uniform stream: loop count, input address, the two strides of
            # X (ordered by `trans`), output address, completion flag.
            unif[0] = loop
            unif[1] = X.addresses()[0, 0]
            unif[2] = X.strides[int(trans)]
            unif[3] = X.strides[1 - int(trans)]
            unif[4] = Y.addresses()[0]
            unif[5] = done.addresses()[0]

            # One row per nop count, one column per repetition.
            results = np.zeros((24, 10), dtype='float32')

            fig = plt.figure()
            ax = fig.add_subplot(1, 1, 1)
            ax.set_title(
                f'TMU load latency (1 slot, 1 qpu, stride=({unif[2]},{unif[3]}))'
            )
            ax.set_xlabel('# of nop (between request and load signal)')
            ax.set_ylabel('sec')

            print()
            for nops in range(results.shape[0]):

                code = drv.program(
                    lambda asm: qpu_tmu_load_1_slot_1_qpu(asm, nops))

                for i in range(results.shape[1]):

                    with drv.compute_shader_dispatcher() as csd:

                        # Scale by the loop length so the sum stays small.
                        X[:] = np.random.randn(*X.shape) / X.shape[int(trans)]
                        Y[:] = 0.0
                        done[:] = 0

                        start = time.time()
                        csd.dispatch(code, unif.addresses()[0], thread=8)
                        bench.wait_address(done)
                        end = time.time()

                        results[nops, i] = end - start

                        assert np.allclose(Y,
                                           np.sum(X, axis=int(trans)),
                                           atol=1e-4)

                ax.scatter(np.zeros(results.shape[1]) + nops,
                           results[nops],
                           s=1,
                           c='blue')

                print('{:4}/{}\t{:.9f}'.format(
                    nops, results.shape[0],
                    np.sum(results[nops]) / results.shape[1]))

            ax.set_ylim(auto=True)
            ax.set_xlim(0, results.shape[0])
            fig.savefig(
                f'benchmarks/tmu_load_1_slot_1_qpu_{unif[2]}_{unif[3]}.png')
            # pyplot keeps every figure alive until it is closed; release it
            # so the second `trans` iteration does not accumulate figures.
            plt.close(fig)