Beispiel #1
0
def test_float16():
    """Check the GPU BLAS ops on float16 inputs against NumPy.

    Three cases run in sequence: ``gemv`` (which must compile to a graph
    containing a ``GpuGemm`` node), ``gpugemm_no_inplace`` and ``gpu_dot22``.
    In every case the inputs are stored as GPU shared variables on the test
    context and the compiled function's output is compared with the same
    expression evaluated by NumPy on the host data.
    """
    # gemv (gemm called)
    float16_data = [
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gemv(*float16_shared)
    f = aesara.function([], o, mode=mode_with_gpu)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    topo = f.maker.fgraph.toposort()
    assert any(isinstance(n.op, GpuGemm) for n in topo)

    # gemm
    float16_data = [
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gpugemm_no_inplace(*float16_shared)
    f = aesara.function([], o)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)

    # dot22
    float16_data = [rand(3, 3).astype("float16"), rand(3, 3).astype("float16")]
    # Use the same explicit target as the cases above so the inputs are
    # created on the test context rather than on whatever the default is.
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gpu_dot22(*float16_shared)
    f = aesara.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
Beispiel #2
0
def main(dev1, dev2):
    """Benchmark ``gpu_dot22`` across one and two GPU contexts.

    ``dev1`` and ``dev2`` are passed to ``init_dev`` and registered under the
    context names ``"ctx1"`` and ``"ctx2"``. Several functions computing
    matrix products on one or both contexts are compiled, executed once as a
    warm-up, and then timed in different configurations (single function,
    two functions back to back, two functions in separate threads). Each
    timing is printed to stdout; nothing is returned.
    """
    init_dev(dev1, "ctx1")
    init_dev(dev2, "ctx2")

    # 16384 x 16384 float32 values: 1 GiB per matrix.
    size = 1024 * 16
    data = np.random.randn(size, size).astype("float32")
    val1a = aesara.shared(data, target="ctx1")
    val1b = aesara.shared(data, target="ctx1")
    val1c = aesara.shared(data, target="ctx1")
    val1d = aesara.shared(data, target="ctx1")

    val2a = aesara.shared(data, target="ctx2")
    val2b = aesara.shared(data, target="ctx2")

    # f1: two products on ctx1; f2: one product per context;
    # f3/f4: a single product on ctx1/ctx2;
    # f5/f6: same as f3/f4 but only element [0, 0] is moved to the CPU.
    f1 = aesara.function(
        [], [gpu_dot22(val1a, val1b), gpu_dot22(val1c, val1d)]
    )
    f2 = aesara.function(
        [], [gpu_dot22(val1a, val1b), gpu_dot22(val2a, val2b)]
    )
    f3 = aesara.function([], [gpu_dot22(val1a, val1b)])
    f4 = aesara.function([], [gpu_dot22(val2a, val2b)])
    f5 = aesara.function([], [gpu_dot22(val1a, val1b)[0, 0].transfer("cpu")])
    f6 = aesara.function([], [gpu_dot22(val2a, val2b)[0, 0].transfer("cpu")])

    # pre-execute to load code to GPU.
    r = f1.fn()
    r[0].sync()
    r[1].sync()
    r = f2.fn()
    r[0].sync()
    r[1].sync()
    r = f3.fn()
    r[0].sync()
    r = f4.fn()
    r[0].sync()
    r = f5.fn()
    r = f6.fn()
    # Drop the last result so its memory is not held during the benchmarks.
    r = None

    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump if the system clock changes.
    t = time.perf_counter()
    r = f1.fn()
    r[0].sync()
    r[1].sync()
    t2 = time.perf_counter()
    r = None

    print(f"one ctx async {t2 - t:f}")

    t = time.perf_counter()
    r = f2.fn()
    r[0].sync()
    r[1].sync()
    t2 = time.perf_counter()
    r = None

    print(f"two ctx async {t2 - t:f}")

    t = time.perf_counter()
    r = f3.fn()
    r2 = f4.fn()
    r[0].sync()
    r2[0].sync()
    t2 = time.perf_counter()
    # Release both results: r2 would otherwise keep its output buffer
    # allocated on ctx2 while the next benchmark runs.
    r = r2 = None

    print(f"two ctx, 2 fct async {t2 - t:f}")

    # No explicit sync here: the outputs are CPU values (presumably the
    # transfer already waits for the computation to finish).
    t = time.perf_counter()
    r = f5.fn()
    r2 = f6.fn()
    t2 = time.perf_counter()
    r = r2 = None
    print(f"two ctx, 2 fct with transfer {t2 - t:f}")

    # Multi-thread version
    class BenchThread(threading.Thread):
        """Thread that calls a compiled function once and stores its result."""

        def __init__(self, name, f, sync):
            super().__init__(name=name)
            self.f = f
            self.sync = sync
            self.r = None

        def run(self):
            r = self.f()
            if self.sync:
                # Wait for the first output before the thread ends.
                r[0].sync()
            self.r = r

    thread1 = BenchThread("Thread-3", f3, True)
    thread2 = BenchThread("Thread-4", f4, True)
    t = time.perf_counter()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.perf_counter()

    print(f"two ctx, 2 fct async, 2 threads {t2 - t:f}")

    thread1 = BenchThread("Thread-5", f5, False)
    thread2 = BenchThread("Thread-6", f6, False)
    t = time.perf_counter()
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    t2 = time.perf_counter()

    print(f"two ctx, 2 fct with transfer, 2 threads {t2 - t:f}")