Beispiel #1
0
def test_link_register():
    """Run the ``link_register`` kernel on one thread and check its output.

    Every element of the 16-word output buffer must equal the program's
    address plus 0x28.
    """
    with Driver() as driver:
        out = driver.alloc(16, 'uint32')
        program = driver.program(link_register)
        driver.execute(n_threads=1, program=program, uniforms=[out.address])

        expected = program.address + 0x28
        assert np.all(out == expected)
Beispiel #2
0
def run_code(code, nout):
    """Assemble ``code`` inside ``boilerplate`` and run it on one thread.

    Args:
        code: kernel body handed to ``boilerplate``.
        nout: number of 16-element int32 output rows to allocate; also
            forwarded to ``boilerplate``.

    Returns:
        A host-side copy of the ``(nout, 16)`` int32 output buffer.
    """
    with Driver() as driver:
        out = driver.alloc((nout, 16), 'int32')
        program = driver.program(boilerplate, code, nout)
        driver.execute(n_threads=1, program=program, uniforms=[out.address])
        # Copy while the driver is still open so the data outlives it.
        result = np.copy(out)
    return result
Beispiel #3
0
def test_absolute_jump():
    """Run the ``absolute_jump`` kernel and expect ASSERT_OK in every lane."""
    with Driver() as driver:
        out = driver.alloc(16, 'int32')
        program = driver.program(absolute_jump)
        driver.execute(n_threads=1, program=program, uniforms=[out.address])

        assert np.all(out == ASSERT_OK)
def test_unpack_R4():
    """Check the ``unpack_R4`` kernel against host-side reference values.

    Seven 16-word input rows are prepared from 16 random samples:
    row 0 comes from their float32 form, rows 1 and 2 from their float16
    form (row 2 shifted left by 16 bits), and rows 3-6 are random 32-bit
    words.  After the kernel runs, output rows 0-2 must reproduce the
    samples and output rows 3-6 must equal byte 0, 1, 2 and 3 of the
    matching input row divided by 255.
    """
    samples = np.random.randn(16)

    src = np.zeros((7, 16), dtype='uint32')
    src[0] = unpack('16L', samples.astype('float32'))
    src[1] = unpack('16H', samples.astype('float16'))
    src[2] = unpack('16H', samples.astype('float16'))
    src[2] <<= 16
    random_words = [getrandbits(32) for _ in range(4 * 16)]
    src[3:7] = np.array(random_words).reshape(4, 16)

    with Driver() as driver:
        gpu_src = driver.copy(src)
        gpu_dst = driver.alloc((7, 16), dtype='uint32')
        driver.execute(n_threads=1,
                       program=driver.program(unpack_R4),
                       uniforms=[gpu_src.address, gpu_dst.address])
        # Take host copies before the driver is closed.
        src = np.copy(gpu_src)
        dst = np.copy(gpu_dst)

    # Float rows: loose tolerance.
    for row in range(3):
        assert np.allclose(samples, unpack('16f', dst[row]), rtol=1e-3)

    # Byte rows: row 3 + k holds byte k of the input word scaled to [0, 1].
    for lane, shift in enumerate((0, 8, 16, 24)):
        row = 3 + lane
        expected = ((src[row] >> shift) & 0xff) / 255.0
        assert np.allclose(expected, unpack('16f', dst[row]), rtol=1e-7)
def run_code(code, X):
    """Run ``code`` (wrapped in ``boilerplate``) on one thread over ``X``.

    ``X`` is copied to GPU memory as the input; a second GPU copy of it
    serves as the output buffer, so the output starts out equal to the input.

    Args:
        code: kernel body handed to ``boilerplate``.
        X: input array; its first dimension is forwarded to ``boilerplate``.

    Returns:
        A host-side copy of the output buffer after execution.
    """
    with Driver() as driver:
        src = driver.copy(X)
        dst = driver.copy(src)
        program = driver.program(boilerplate, code, src.shape[0])
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[src.address, dst.address])
        result = np.copy(dst)
    return result
Beispiel #6
0
def test_with_namespace():
    """Run the ``with_namespace`` kernel and expect every output lane to be 4.

    The buffer is pre-filled with 1234 so an untouched lane is detected.
    """
    with Driver() as driver:
        out = driver.alloc((1, 16), 'int32')
        out[:] = 1234
        program = driver.program(with_namespace)
        driver.execute(n_threads=1, program=program, uniforms=[out.address])
        assert np.all(out == 4)
def run_code(code, X, output_shape, output_type):
    """Run ``code`` (wrapped in ``boilerplate``) on one thread.

    Args:
        code: kernel body handed to ``boilerplate``.
        X: input array, copied into GPU memory.
        output_shape: shape of the output buffer; its first entry is also
            forwarded to ``boilerplate``.
        output_type: dtype of the output buffer.

    Returns:
        A host-side copy of the output buffer after execution.
    """
    with Driver() as driver:
        src = driver.copy(X)
        dst = driver.alloc(output_shape, dtype=output_type)
        program = driver.program(boilerplate, code, output_shape[0])
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[src.address, dst.address])
        result = np.copy(dst)
    return result
Beispiel #8
0
def test_horizontal_32bit_stride_load():
    """The kernel must copy the first 16 columns of a 16x32 matrix.

    The source holds 0..511 in row-major order; the 16x16 destination must
    equal the left half of the source.
    """
    rows, cols = 16, 32
    with Driver() as driver:
        src = driver.alloc((rows, cols), dtype='uint32')
        src[:] = np.arange(rows * cols).reshape(rows, cols).astype('uint32')
        dst = driver.alloc((16, 16), dtype='uint32')
        program = driver.program(horizontal_32bit_stride_load)
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[src.address, dst.address])
        assert np.all(src[:, :16] == dst)
Beispiel #9
0
def test_horizontal_32bit_partial():
    """The kernel must place an 8x8 source into rows/cols 4..11 of a 16x16 buffer.

    The source holds 0..63 in row-major order.
    """
    side = 8
    with Driver() as driver:
        src = driver.alloc((side, side), dtype='uint32')
        src[:] = np.arange(side * side).reshape(side, side).astype('uint32')
        dst = driver.alloc((16, 16), dtype='uint32')

        program = driver.program(horizontal_32bit_partial)
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[src.address, dst.address])
        assert np.all(src == dst[4:12, 4:12])
Beispiel #10
0
def test_vertical_32bit_load():
    """The kernel must write the transpose of a 16x64 matrix.

    The source holds 0..1023 in row-major order; the 64x16 destination,
    transposed, must equal the source.
    """
    rows, cols = 16, 64
    with Driver() as driver:
        src = driver.alloc((rows, cols), dtype='uint32')
        src[:] = np.arange(rows * cols).reshape(rows, cols).astype('uint32')
        dst = driver.alloc((cols, rows), dtype='uint32')

        program = driver.program(vertical_32bit_load)
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[src.address, dst.address])

        assert np.all(src == dst.T)
Beispiel #11
0
def test_horizontal_32bit_load_calc_and_store_another_buffer():
    """The kernel must copy a 64x16 matrix with its first row incremented by 1.

    The source holds 0..1023 in row-major order.  After execution the
    expected result is formed on the host by adding 1 to the first source
    row, and the destination must match it everywhere.
    """
    rows, cols = 64, 16
    with Driver() as driver:
        src = driver.alloc((rows, cols), dtype='uint32')
        src[:] = np.arange(rows * cols).reshape(rows, cols).astype('uint32')
        dst = driver.alloc((rows, cols), dtype='uint32')

        program = driver.program(horizontal_32bit_load_calc_and_store)
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[src.address, dst.address])

        # Turn the source into the expected output before comparing.
        src[0] = src[0] + 1
        assert np.all(src == dst)
Beispiel #12
0
def test_rotate_r4():
    """Check the ``rotate_r4`` kernel against ``list_half_rotate``.

    Sixteen random 32-bit words are copied to GPU memory and the kernel is
    given the address of each word.  Output row 0 must equal the words
    unchanged, rows 1..15 must equal ``list_half_rotate(d, i)`` for
    i = 1..15, and rows 16..31 must equal it for i = 0..15.
    """
    d = np.array([random.getrandbits(32) for _ in range(16)]).astype(np.uint32)
    with Driver() as drv:
        # Keep the GPU buffer bound to a name while its address is in use.
        gpu_d = drv.copy(d)
        addr = gpu_d.address
        X = np.array([addr + 4 * i for i in range(16)], dtype=np.uint32)
        Y = run_code(rotate_r4, X, 1 + 15 + 16)

        # np.all replaces np.alltrue, which was removed in NumPy 2.0.
        assert np.all(Y[0] == d)
        for i in range(1, 16):
            assert np.all(Y[i] == list_half_rotate(d, i))
        for i in range(16):
            assert np.all(Y[16 + i] == list_half_rotate(d, i))
def test_semaphore():
    """Run ``increment_thread`` on 10 threads and check the final total.

    Each thread receives the addresses of the two 16-word buffers and its
    own index as uniforms.  Afterwards every element of the second buffer
    must equal ``nthreads * 10000``.
    """
    thread_count = 10
    with Driver() as driver:
        first = driver.alloc(16, dtype='uint32')
        second = driver.alloc(16, dtype='uint32')
        first[:] = 0

        # One uniform row per thread: [first address, second address, index].
        per_thread = np.zeros((thread_count, 3), dtype='uint32')
        per_thread[:, 0] = first.address
        per_thread[:, 1] = second.address
        per_thread[:, 2] = np.arange(thread_count)

        program = driver.program(increment_thread, thread_count)
        driver.execute(n_threads=thread_count,
                       program=program,
                       uniforms=per_thread)
        assert np.all(second == thread_count * 10000)
Beispiel #14
0
def test_given_jump():
    """Run ``given_jmp`` with a jump offset derived from its label positions.

    The offset passed as the first uniform is the distance from the
    ``entry`` label to the ``test`` label minus 32.  Either position
    defaults to 0 when the label is absent.  Every output lane must be 4
    afterwards (the buffer is pre-filled with 1234).
    """
    # Later duplicates of a label name override earlier ones.
    positions = {label.name: pc for label, pc in get_label_positions(given_jmp)}
    entry_pc = positions.get('entry', 0)
    test_pc = positions.get('test', 0)
    jump_offset = test_pc - entry_pc - 32

    with Driver() as driver:
        out = driver.alloc((1, 16), 'int32')
        out[:] = 1234
        program = driver.program(given_jmp)
        driver.execute(n_threads=1,
                       program=program,
                       uniforms=[jump_offset, out.address])
        assert np.all(out == 4)
Beispiel #15
0
def main():
    """Benchmark the multi-threaded SGEMM kernel against NumPy.

    Uses fixed sizes (p=96, q=363, r=3072) and ``p_div * r_div`` = 12
    threads.  C is split into a ``p_div`` x ``r_div`` grid of tiles and each
    thread gets one tile through its row of the uniforms array.  The NumPy
    reference ``alpha * A.dot(B) + beta * C`` is computed first, then the
    kernel is run, and timings, Gflops and absolute/relative errors between
    the reference and the contents of C are printed.
    """
    with Driver() as drv:
        p = 96
        q = 363
        r = 3072

        p_div = 2
        r_div = 6
        n_threads = p_div * r_div

        assert (p % 16 == 0 and p >= p_div * 16)
        assert (q >= 2)
        assert (r % 64 == 0 and r >= r_div * 64)

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference (must be taken before the kernel overwrites C).
        start = time.time()
        R = alpha * A.dot(B) + beta * C
        elapsed_ref = time.time() - start

        # Allocate uniforms: one row of 14 words per thread.
        uniforms = drv.alloc((n_threads, 14), 'uint32')
        uniforms[:, 0] = uniforms.addresses()[:, 0]

        # Tile height in 16-row units and tile width in 64-column units.
        h = (p + 16 * p_div - 1) // (16 * p_div)
        w = (r + 64 * r_div - 1) // (64 * r_div)

        # Address tables are loop-invariant; fetch them once.
        A_addr = A.addresses()
        B_addr = B.addresses()
        C_addr = C.addresses()

        th = 0
        for i in range(p_div):
            # The last tile in each direction takes whatever remains.
            rows = h if i != p_div - 1 else (p - i * h * 16) // 16
            for j in range(r_div):
                cols = w if j != r_div - 1 else (r - j * w * 64) // 64
                uniforms[th, 1] = rows
                uniforms[th, 2] = q
                uniforms[th, 3] = cols
                uniforms[th, 4] = A_addr[i * 16 * h, 0]
                uniforms[th, 5] = B_addr[0, j * 64 * w]
                uniforms[th, 6] = C_addr[i * 16 * h, j * 64 * w]
                th += 1
        uniforms[:, 7] = A.strides[0]
        uniforms[:, 8] = B.strides[0]
        uniforms[:, 9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  Standard-size
        # '<I' is used because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4-byte packed float.
        uniforms[:, 10] = struct.unpack('<I', struct.pack('<f', alpha))[0]
        uniforms[:, 11] = struct.unpack('<I', struct.pack('<f', beta))[0]
        uniforms[:, 12] = np.arange(n_threads)
        uniforms[:, 13] = n_threads

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        elapsed_gpu = time.time() - start

        def Gflops(sec):
            return (2 * p * q * r + 3 * p * r) / sec * 1e-9

        abs_err = np.abs(R - C)
        rel_err = np.abs((R - C) / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(p=p,
                                                                       q=q,
                                                                       r=r))
        print('threads: {}'.format(n_threads))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(elapsed_gpu,
                                                      Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
Beispiel #16
0
def main():
    """Benchmark the single-threaded SGEMM kernel against NumPy.

    Uses fixed sizes (p=96, q=363, r=3072).  The NumPy reference
    ``alpha * A.dot(B) + beta * C`` is computed first, then the kernel is
    run on one thread, and timings, Gflops and absolute/relative errors
    between the reference and the contents of C are printed.
    """
    with Driver() as drv:
        p = 96
        q = 363
        r = 3072

        assert (p % 16 == 0)
        assert (q >= 2)
        assert (r % 64 == 0)

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference (must be taken before the kernel overwrites C).
        start = time.time()
        R = alpha * A.dot(B) + beta * C
        elapsed_ref = time.time() - start

        # Allocate uniforms.
        uniforms = drv.alloc(12, 'uint32')
        uniforms[0] = uniforms.address
        # Integer division: the asserts above guarantee exact multiples, and
        # the uniforms are unsigned integers.
        uniforms[1] = p // 16
        uniforms[2] = q
        uniforms[3] = r // 64
        uniforms[4] = A.address
        uniforms[5] = B.address
        uniforms[6] = C.address
        uniforms[7] = A.strides[0]
        uniforms[8] = B.strides[0]
        uniforms[9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  Standard-size
        # '<I' is used because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4-byte packed float.
        uniforms[10] = struct.unpack('<I', struct.pack('<f', alpha))[0]
        uniforms[11] = struct.unpack('<I', struct.pack('<f', beta))[0]

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(n_threads=1, program=code, uniforms=uniforms)
        elapsed_gpu = time.time() - start

        def Gflops(sec):
            return (2 * p * q * r + 3 * p * r) / sec * 1e-9

        abs_err = np.abs(R - C)
        rel_err = np.abs((R - C) / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(p=p,
                                                                       q=q,
                                                                       r=r))
        print('threads: {}'.format(1))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(elapsed_gpu,
                                                      Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
Beispiel #17
0
def GPU_conv(x, w, b, Relu_flag=0):
    """Run the ``conv`` kernel on the GPU and compare it with a CPU reference.

    The input and filters are copied into zero-padded GPU buffers (sizes
    rounded up so the output height divides evenly among the 12 threads and
    the output width / filter count are multiples of 16), the kernel is
    executed, and timings plus the absolute error against an
    im2col + dot reference are printed.

    Args:
        x: input array of shape (N, C, H, W).
        w: filter array of shape (FN, C, FH, FW).
        b: bias values, assigned to the first FN entries of the bias buffer.
        Relu_flag: forwarded to the kernel as ``Relu_flag + 1``.  The CPU
            reference applies ReLU only when this is 0, matching how
            ``GPU_dot`` in this file interprets the flag.
            NOTE(review): confirm against the ``conv`` kernel itself.

    Returns:
        A float array of shape (C, FN, oH, oW) holding the un-padded part
        of the GPU output.
        NOTE(review): the output buffer has a leading dimension of 1 and is
        broadcast into this array, so the leading ``C`` looks like it was
        meant to be ``N``; kept as-is to preserve the returned shape.
    """
    with Driver() as drv:
        SIMD = 16
        n_threads = 12
        N, C, H, W = x.shape
        FN, C, FH, FW = w.shape
        calc_H = H
        calc_W = W
        calc_FN = FN
        # Border lost to the filter (stride 1, no padding).
        eH = int(FH / 2) * 2
        eW = int(FW / 2) * 2
        oH = H - eH
        oW = W - eW
        # Round the working sizes up to the next multiple.
        modH = oH % n_threads
        modW = oW % SIMD
        modFN = FN % SIMD
        if (modH != 0):
            calc_H += n_threads - modH
        if (modW != 0):
            calc_W += SIMD - modW
        if (modFN != 0):
            calc_FN += SIMD - modFN
        calc_oH = calc_H - eH
        calc_oW = calc_W - eW

        th_oH = int(calc_oH / n_threads)  # output rows per thread
        th_iter = int((th_oH * calc_oW) / (64 / calc_FN * 16))
        convX = drv.alloc((N, C, calc_H, calc_W), 'float32')
        convW = drv.alloc((C, FH, FW, calc_FN), 'float32')
        convout = drv.alloc((1, calc_oH, calc_oW, calc_FN), 'float32')
        cb = drv.alloc(calc_FN, 'float32')
        convout[:] = 0
        convX[:] = 0
        convW[:] = 0
        cb[:] = 0
        pad = 0
        stride = 1

        convX[:, :, :H, :W] = x[:]
        # Transpose the filters to (C, FH, FW, FN) while copying.
        convW[:, :, :, :FN] = w.transpose(1, 2, 3, 0)[:]
        cb[:FN] = b[:]

        # CPU calculation: im2col -> dot.
        start = time.time()
        out_h = 1 + int((H + 2 * pad - FH) / stride)
        out_w = 1 + int((W + 2 * pad - FW) / stride)
        col = im2col(x, FH, FW, stride, pad)
        col_W = w.reshape(FN, -1).T
        out = np.dot(col, col_W) + b
        if Relu_flag == 0:
            out = np.maximum(out, 0.0)
        CPU = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
        cetime = time.time() - start

        # One row of uniforms per thread.
        uniforms = drv.alloc((n_threads, 16), 'uint32')
        uniforms[:, 0] = convW.addresses()[0, 0, 0, 0]
        convX_addr = convX.addresses()
        convout_addr = convout.addresses()
        for th in range(n_threads):
            uniforms[th, 1] = convX_addr[0, 0, th * th_oH, 0]
            uniforms[th, 2] = convout_addr[0, th * th_oH, 0, 0]
        uniforms[:, 3] = cb.addresses()[0]
        uniforms[:, 4] = th_iter
        uniforms[:, 5] = th_oH
        uniforms[:, 6] = int(calc_W * 4)
        uniforms[:, 7] = C
        uniforms[:, 8] = np.arange(1, (n_threads + 1))
        uniforms[:, 9] = n_threads
        uniforms[:, 10] = Relu_flag + 1
        # The padded sizes are passed as arguments when building the program.
        code = drv.program(conv, calc_H, calc_W, FH, FW, calc_FN, calc_oH,
                           calc_oW)

        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        getime = time.time() - start

        # Back to (batch, FN, H, W) order, dropping the padding.
        GPU = np.zeros((C, FN, oH, oW))
        GPU[:] = convout.transpose(0, 3, 1, 2)[:, :FN, :oH, :oW]

        abs_err = np.abs(CPU[:] - GPU[:])
        print("===========Conv&Relu=============")
        print("x size:{0},w size:{1}".format(x.shape, w.shape))
        print("CPU time:{:.4f}".format(cetime * 1000), "[msec]")
        print("GPU time:{:.4f}".format(getime * 1000), "[msec]")
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        return GPU
Beispiel #18
0
def main():
    """Benchmark the multi-threaded SGEMM kernel on random matrix sizes.

    p and r are drawn from [768, 1024] and q from [2, 512].  C is split
    into a ``p_div`` x ``r_div`` grid of tiles (12 threads); tile heights
    are multiples of 16 rows and widths multiples of 64 columns, except that
    the last tile in each direction takes whatever remains.  The NumPy
    reference is computed on copies of the inputs, the kernel is run, and
    timings, Gflops and absolute/relative errors are printed.
    """
    with Driver() as drv:
        p = random.randint(64 * 12, 1024)
        q = random.randint(2, 512)
        r = random.randint(64 * 12, 1024)

        assert (q >= 2)

        p_div = 2
        r_div = 6
        n_threads = p_div * r_div

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference, computed on copies of the GPU arrays.
        RA = A.copy()
        RB = B.copy()
        RC = C.copy()
        start = time.time()
        R = alpha * RA.dot(RB) + beta * RC
        elapsed_ref = time.time() - start

        # Allocate uniforms: one row of 14 words per thread.
        uniforms = drv.alloc((n_threads, 14), 'uint32')
        uniforms[:, 0] = uniforms.addresses()[:, 0]

        # Distribute whole 16-row / 64-column units over the tiles: the
        # first h_len (w_len) tiles get h (w) units, the others one fewer.
        p_up = p // 16
        h = (p_up + p_div - 1) // p_div
        h_len = p_div - (h * p_div - p_up)
        r_up = r // 64
        w = (r_up + r_div - 1) // r_div
        w_len = r_div - (w * r_div - r_up)

        # Address tables are loop-invariant; fetch them once.
        A_addr = A.addresses()
        B_addr = B.addresses()
        C_addr = C.addresses()

        th = 0
        h_acc = 0
        for i in range(p_div):
            if i == p_div - 1:
                hi = p - h_acc  # last tile takes the remaining rows
            else:
                hi = 16 * h if i < h_len else 16 * (h - 1)
            w_acc = 0
            for j in range(r_div):
                if j == r_div - 1:
                    wj = r - w_acc  # last tile takes the remaining columns
                else:
                    wj = 64 * w if j < w_len else 64 * (w - 1)
                uniforms[th, 1] = hi
                uniforms[th, 2] = q
                uniforms[th, 3] = wj
                uniforms[th, 4] = A_addr[h_acc, 0]
                uniforms[th, 5] = B_addr[0, w_acc]
                uniforms[th, 6] = C_addr[h_acc, w_acc]
                th += 1
                w_acc += wj
            h_acc += hi
        uniforms[:, 7] = A.strides[0]
        uniforms[:, 8] = B.strides[0]
        uniforms[:, 9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  Standard-size
        # '<I' is used because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4-byte packed float.
        uniforms[:, 10] = struct.unpack('<I', struct.pack('<f', alpha))[0]
        uniforms[:, 11] = struct.unpack('<I', struct.pack('<f', beta))[0]
        uniforms[:, 12] = np.arange(n_threads)
        uniforms[:, 13] = n_threads

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        elapsed_gpu = time.time() - start

        # Kept from the original: changes NumPy's global print settings so
        # that arrays are printed in full.
        np.set_printoptions(threshold=np.inf)

        def Gflops(sec):
            return (2 * p * q * r + 3 * p * r) / sec * 1e-9

        abs_err = np.abs(R - C)
        rel_err = np.abs((R - C) / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(p=p,
                                                                       q=q,
                                                                       r=r))
        print('threads: {}'.format(n_threads))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
            elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(elapsed_gpu,
                                                      Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
def GPU_dot(x, w, b, Relu_flag=0):
    """Run the ``dot`` kernel (affine layer) and compare it with NumPy.

    ``x`` is flattened to a single row of length q, multiplied by ``w``
    and offset by ``b``.  q is rounded up to a multiple of the thread count
    (12) and the output width r to a multiple of 16; the padding is zero.
    Timings and the absolute error against the NumPy result are printed.

    Args:
        x: input of shape (N, C, H, W) or (H, W).  It is reshaped to
            ``(1, q)``, so it must hold exactly q = C*H*W elements.
        w: weight matrix; its second dimension is the output width r.
        b: bias, assigned to the first r output columns.
        Relu_flag: forwarded to the kernel as ``Relu_flag + 1``.  The CPU
            reference applies ReLU when this is 0 and skips it otherwise.

    Returns:
        A float array of shape (1, r) with the un-padded GPU output.
    """
    with Driver() as drv:
        SIMD = 16
        UNIFORM = 64
        n_threads = 12
        if (x.ndim == 4):
            _, C, H, W = x.shape
        else:
            C = 1
            H, W = x.shape
        p = 1
        q = C * H * W
        r = w.shape[1]

        # Round r up to a multiple of SIMD and q up to a multiple of the
        # thread count.
        cal_q = q
        cal_r = r
        rmod = r % SIMD
        if rmod != 0:
            cal_r += SIMD - rmod
        qmod = q % n_threads
        if qmod != 0:
            cal_q += n_threads - qmod

        q_th = cal_q // n_threads      # share of q handled by one thread
        q_uni_iter = q_th // UNIFORM   # number of full uniform iterations
        q_uni_mod = q_th % UNIFORM     # leftover after the full iterations
        r_simd_iter = cal_r // SIMD

        A = drv.alloc((p, cal_q), 'float32')
        B = drv.alloc((cal_q, cal_r), 'float32')
        bias = drv.alloc((p, cal_r), 'float32')
        out = drv.alloc((p, cal_r), 'float32')

        # Zero everything so the padding does not contribute.
        bias[:] = 0.0
        B[:] = 0.0
        A[:] = 0.0
        out[:] = 0.0
        A[:, :q] = x.reshape(1, q)[:]
        B[:q, :r] = w[:]
        bias[:, :r] = b[:]

        # CPU reference on the padded buffers.
        start = time.time()
        if (Relu_flag == 0):
            CPUout = np.maximum(np.dot(A, B) + bias, 0.0)
        else:
            CPUout = np.dot(A, B) + bias
        cetime = time.time() - start

        # One row of uniforms per thread; each thread starts at its own
        # slice of q in A and B.
        uniforms = drv.alloc((n_threads, 16), 'uint32')
        A_addr = A.addresses()
        B_addr = B.addresses()
        for th in range(n_threads):
            uniforms[th, 0] = A_addr[0, th * q_th]
            uniforms[th, 1] = B_addr[th * q_th, 0]
        uniforms[:, 2] = out.addresses()[0, 0]
        uniforms[:, 3] = bias.addresses()[0, 0]
        uniforms[:, 4] = q_uni_iter
        uniforms[:, 5] = q_uni_mod + 1
        uniforms[:, 6] = np.arange(1, (n_threads + 1))
        uniforms[:, 7] = n_threads
        uniforms[:, 8] = Relu_flag + 1
        code = drv.program(dot, r_simd_iter)

        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        getime = time.time() - start

        out_r = np.zeros((p, r))
        out_r[:] = out[:, :r]
        print("===========Affine&Relu=============")

        if Relu_flag == 1:
            print("x size:{0},w size:{1},Relu:×".format(x.shape, w.shape))
        else:
            print("x size:{0},w size:{1},Relu:〇".format(x.shape, w.shape))
        print("CPU time:{:.4f}[msec]".format(cetime * 1000))
        print("GPU time:{:.4f}[msec]".format(getime * 1000))
        abs_err = np.abs(CPUout[:, :r] - out_r[:, :r])
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        return out_r
Beispiel #20
0
def GPU_pool(x, stride, pad):
    """Run the ``pool`` kernel (2x2 max pooling) and compare it with NumPy.

    The input is copied into a channels-last GPU buffer whose channel count
    is rounded up to a multiple of 16 (zero padded), the kernel is executed
    on 12 threads, and timings plus the absolute error against an
    im2col + max reference are printed.

    Args:
        x: input array of shape (N, C, H, W).
        stride: pooling stride; forwarded to ``im2col`` and to the kernel.
        pad: padding forwarded to ``im2col`` for the CPU reference.

    Returns:
        A float array of shape (1, C, oH, oW) with the GPU result, where
        ``oH = H // 2`` and ``oW = W // 2``.
    """
    with Driver() as drv:
        SIMD = 16
        n_threads = 12
        N, C, H, W = x.shape

        # Round the channel count up to a multiple of SIMD.
        cal_C = C
        Cmod = C % SIMD
        if Cmod != 0:
            cal_C += SIMD - Cmod

        FH = 2
        FW = 2
        oH = int(H / FH)
        oW = int(W / FW)
        th_oH = int(oH / n_threads)  # output rows per thread

        X = drv.alloc((N, H, W, cal_C), 'float32')
        out = drv.alloc((1, oH, oW, cal_C), 'float32')
        X[:] = 0
        # Channels-last layout for the kernel.
        X[:, :, :, :C] = x.transpose(0, 2, 3, 1)[:]

        # CPU reference: im2col -> max over each window.
        start = time.time()
        out_h = int(1 + (H - FH) / stride)
        out_w = int(1 + (W - FW) / stride)
        col = im2col(x, FH, FW, stride, pad)
        col = col.reshape(-1, FH * FW)
        # NOTE(review): arg_max is not used afterwards; it is kept so the
        # timed CPU section does the same work as before.
        arg_max = np.argmax(col, axis=1)
        CPUout = np.max(col, axis=1)
        CPUout = CPUout.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        cetime = time.time() - start

        # One row of uniforms per thread.
        uniforms = drv.alloc((n_threads, 16), 'uint32')
        X_addr = X.addresses()
        out_addr = out.addresses()
        for th in range(n_threads):
            uniforms[th, 0] = X_addr[0, th * th_oH * stride, 0, 0]
            uniforms[th, 1] = out_addr[0, th * th_oH, 0, 0]
        uniforms[:, 2] = np.arange(1, (n_threads + 1))
        uniforms[:, 3] = n_threads
        code = drv.program(pool, H, W, cal_C, stride)

        start = time.time()
        drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
        getime = time.time() - start
        print("===========Pooling=============")
        print("x size:{0},stride:{1},pad:{2}".format(x.shape, stride, pad))
        print("CPU time:{:.4f}[msec]".format(cetime * 1000))
        print("GPU time:{:.4f}[msec]".format(getime * 1000))

        # Back to channels-first order, dropping the padded channels.
        out_r = np.zeros((1, C, oH, oW))
        out_r[:] = out.transpose(0, 3, 1, 2)[:, :C, :, :]

        abs_err = np.abs(CPUout[:] - out_r[:])
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        return out_r
Beispiel #21
0
def main():
    """Benchmark the single-threaded SGEMM kernel on random matrix sizes.

    p and r are drawn from [1, 1024] and q from [2, 512].  The NumPy
    reference ``alpha * A.dot(B) + beta * C`` is computed on copies of the
    inputs, the kernel is run on one thread, and timings, Gflops and
    absolute/relative errors are printed.
    """
    with Driver() as drv:
        p = random.randint(1, 1024)
        q = random.randint(2, 512)
        r = random.randint(1, 1024)

        assert(q >= 2)

        # Allocate matrices.
        C = drv.alloc((p, r), 'float32')
        A = drv.alloc((p, q), 'float32')
        B = drv.alloc((q, r), 'float32')

        # Initialize matrices.
        np.random.seed(0)
        alpha = 1.0
        beta = 1.0
        A[:] = np.random.randn(p, q)
        B[:] = np.random.randn(q, r)
        C[:] = np.random.randn(p, r)

        # Reference, computed on copies of the GPU arrays.
        RA = A.copy()
        RB = B.copy()
        RC = C.copy()
        start = time.time()
        R = alpha*RA.dot(RB) + beta*RC
        elapsed_ref = time.time() - start

        # Allocate uniforms.
        uniforms = drv.alloc(12, 'uint32')
        uniforms[0] = uniforms.address
        uniforms[1] = p
        uniforms[2] = q
        uniforms[3] = r
        uniforms[4] = A.address
        uniforms[5] = B.address
        uniforms[6] = C.address
        uniforms[7] = A.strides[0]
        uniforms[8] = B.strides[0]
        uniforms[9] = C.strides[0]
        # Reinterpret the float32 scalars as 32-bit words.  Standard-size
        # '<I' is used because native 'L' is 8 bytes on LP64 platforms and
        # would not match the 4-byte packed float.
        uniforms[10] = struct.unpack('<I', struct.pack('<f', alpha))[0]
        uniforms[11] = struct.unpack('<I', struct.pack('<f', beta))[0]

        # Allocate GPU program.
        code = drv.program(sgemm_gpu_code)

        # GPU
        start = time.time()
        drv.execute(
                n_threads=1,
                program=code,
                uniforms=uniforms
                )
        elapsed_gpu = time.time() - start

        def Gflops(sec):
            return (2*p*q*r + 3*p*r)/sec * 1e-9

        abs_err = np.abs(R - C)
        rel_err = np.abs((R - C) / R)

        print('==== sgemm example ({p}x{q} times {q}x{r}) ===='.format(
                p=p, q=q, r=r))
        print('threads: {}'.format(1))
        print('numpy: {:.4f} sec, {:.4f} Gflops'.format(
                elapsed_ref, Gflops(elapsed_ref)))
        print('GPU: {:.4f} sec, {:.4f} Gflops'.format(
                elapsed_gpu, Gflops(elapsed_gpu)))
        print('minimum absolute error: {:.4e}'.format(float(np.min(abs_err))))
        print('maximum absolute error: {:.4e}'.format(float(np.max(abs_err))))
        print('minimum relative error: {:.4e}'.format(float(np.min(rel_err))))
        print('maximum relative error: {:.4e}'.format(float(np.max(rel_err))))
    rotate(broadcast, r2, -THR_NM)
    iadd(r0, r5, -1, set_flags=True)
    L.sem_down
    jzc(L.sem_down)
    sema_down(COMPLETED)  # すべてのスレッドが終了するまで待つ
    nop()
    iadd(r0, r0, -1)

    interrupt()

    L.skip_fin

    exit(interrupt=False)


# Script body: sets up the image dimensions, the per-thread work split and
# the input/output GPU buffers.
with Driver() as drv:
    # Image size
    H = 360
    W = 320

    n_threads = 12
    SIMD = 16
    R = 60

    th_H = int(H / n_threads)  # rows handled by one thread
    th_ele = th_H * W  # elements handled by one thread
    io_iter = int(th_ele / (R * SIMD))  # how many transfers are performed

    IN = drv.alloc((H, W), 'float32')
    OUT = drv.alloc((H, W), 'float32')
    OUT[:] = 0.0
Beispiel #23
0
def main():
    """Repeatedly benchmark the ``conv`` kernel against a CPU reference.

    Random input (1x3x64x64), filters (16x3x5x5) and bias are generated
    once and copied into zero-padded GPU buffers.  Then, in an endless
    loop, the im2col + dot reference and the GPU kernel are both timed,
    the absolute error and the speed-up are printed, the output buffer is
    cleared and the loop sleeps for three seconds.  The function never
    returns normally.
    """
    with Driver() as drv:

        class Color:
            """ANSI escape sequences for coloured terminal output."""
            BLACK = '\033[30m'
            RED = '\033[31m'
            GREEN = '\033[32m'
            YELLOW = '\033[33m'
            BLUE = '\033[34m'
            PURPLE = '\033[35m'
            CYAN = '\033[36m'
            WHITE = '\033[37m'
            END = '\033[0m'
            # Was '\038[1m': '\03' followed by a literal '8', not ESC.
            BOLD = '\033[1m'
            UNDERLINE = '\033[4m'
            INVISIBLE = '\033[08m'
            REVERCE = '\033[07m'

        SIMD = 16
        n_threads = 12

        N = 1
        C = 3
        H = 64
        W = 64
        FN = 16
        FH = 5
        FW = 5
        # Forwarded to the kernel as Relu_flag + 1; the CPU reference below
        # applies no ReLU.
        Relu_flag = 1
        x = np.random.randn(N, C, H, W)
        w = np.random.randn(FN, C, FH, FW)
        b = np.random.randn(FN)
        N, C, H, W = x.shape
        FN, C, FH, FW = w.shape
        calc_H = H
        calc_W = W
        calc_FN = FN
        # Border lost to the filter (stride 1, no padding).
        eH = int(FH / 2) * 2
        eW = int(FW / 2) * 2
        oH = H - eH
        oW = W - eW
        # Round the working sizes up to the next multiple.
        modH = oH % n_threads
        modW = oW % SIMD
        modFN = FN % SIMD
        if (modH != 0):
            calc_H += n_threads - modH
        if (modW != 0):
            calc_W += SIMD - modW
        if (modFN != 0):
            calc_FN += SIMD - modFN
        calc_oH = calc_H - eH
        calc_oW = calc_W - eW

        th_oH = int(calc_oH / n_threads)  # output rows per thread
        th_iter = int((th_oH * calc_oW) / (64 / calc_FN * 16))
        convX = drv.alloc((N, C, calc_H, calc_W), 'float32')
        convW = drv.alloc((C, FH, FW, calc_FN), 'float32')
        convout = drv.alloc((1, calc_oH, calc_oW, calc_FN), 'float32')
        cb = drv.alloc(calc_FN, 'float32')
        convout[:] = 0
        convX[:] = 0
        convW[:] = 0
        cb[:] = 0
        pad = 0
        stride = 1

        convX[:, :, :H, :W] = x[:]
        # Transpose the filters to (C, FH, FW, FN) while copying.
        convW[:, :, :, :FN] = w.transpose(1, 2, 3, 0)[:]
        cb[:FN] = b[:]

        # One row of uniforms per thread.
        uniforms = drv.alloc((n_threads, 16), 'uint32')
        uniforms[:, 0] = convW.addresses()[0, 0, 0, 0]
        convX_addr = convX.addresses()
        convout_addr = convout.addresses()
        for th in range(n_threads):
            uniforms[th, 1] = convX_addr[0, 0, th * th_oH, 0]
            uniforms[th, 2] = convout_addr[0, th * th_oH, 0, 0]
        uniforms[:, 3] = cb.addresses()[0]
        uniforms[:, 4] = th_iter
        uniforms[:, 5] = th_oH
        uniforms[:, 6] = int(calc_W * 4)
        uniforms[:, 7] = C
        uniforms[:, 8] = np.arange(1, (n_threads + 1))
        uniforms[:, 9] = n_threads
        uniforms[:, 10] = Relu_flag + 1
        # The padded sizes are passed as arguments when building the program.
        code = drv.program(conv, calc_H, calc_W, FH, FW, calc_FN, calc_oH,
                           calc_oW)

        while True:
            # CPU calculation: im2col -> dot.
            start = time.time()
            out_h = 1 + int((H + 2 * pad - FH) / stride)
            out_w = 1 + int((W + 2 * pad - FW) / stride)
            col = im2col(x, FH, FW, stride, pad)
            col_W = w.reshape(FN, -1).T
            out = np.dot(col, col_W) + b
            CPU = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
            cetime = time.time() - start

            start = time.time()
            drv.execute(n_threads=n_threads, program=code, uniforms=uniforms)
            getime = time.time() - start

            # NOTE(review): the leading dimension is C although the output
            # buffer's is 1 (it is broadcast); looks like N was intended.
            GPU = np.zeros((C, FN, oH, oW))
            tranout = convout.transpose(0, 3, 1, 2)

            GPU[:] = tranout[:, :FN, :oH, :oW]
            abs_err = np.abs(CPU[:] - GPU[:])
            print("===========畳み込み層=============")
            print("x size:{0},w size:{1}".format(x.shape, w.shape))
            print("CPU time:{:.4f}".format(cetime * 1000), "[msec]")
            print("GPU time:{:.4f}".format(getime * 1000), "[msec]")
            print('minimum absolute error: {:.4e}'.format(
                float(np.min(abs_err))))
            print('maximum absolute error: {:.4e}'.format(
                float(np.max(abs_err))))
            print(Color.GREEN + "{:.2f}倍高速化!!!".format(cetime / getime) +
                  Color.END)
            # Clear the output so the next run starts from zeros.
            convout[:] = 0
            time.sleep(3)