def test_basic():
    """Run ragged gather-gemv on a tiny hand-written problem and check it.

    Two operations are planned: output 0 pairs ``A[1]`` with ``X[0]`` and
    output 1 pairs ``A[0]`` with ``X[0]`` (see ``A_js`` / ``X_js``).  Each
    device result is compared with the host-side reference
    ``alpha * dot(A[aj], X[xj]) + beta * Y[i]``.

    ``ctx`` is not a parameter; it is read from the enclosing scope.
    """
    # -- host-side operands, written out as nested-list literals
    A = RA([[[0.1, .2], [.3, .4]], [[.5, .6]]])
    X = RA([[3, 5]])
    Y = RA([[0.0], [2, 3], ])
    A_js = RA([[1], [0]], dtype=np.int32)
    X_js = RA([[0], [0]], dtype=np.int32)
    # Unit scale factors (alternative values: alpha = 0.5, beta = 0.1).
    alpha, beta = 1.0, 1.0

    # -- mirror the operands on the device and confirm the copies match
    queue = cl.CommandQueue(ctx)
    clA, clX, clY = (CLRA(queue, host) for host in (A, X, Y))
    for host, device in ((A, clA), (X, clX), (Y, clY)):
        assert allclose(host, device)

    # -- run every plan of the program (no plan selection is done here)
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    for step in prog.plans:
        step()

    # -- compare each device output against the host reference
    n_ops = len(A_js)
    for row in range(n_ops):
        a_idx = int(A_js[row])
        x_idx = int(X_js[row])
        expected = alpha * np.dot(A[a_idx], X[x_idx]) + beta * Y[row]
        actual = clY[row]
        assert np.allclose(expected, actual)
Beispiel #2
0
    def test_basic(self):
        """Check ragged gather-gemv on a tiny hand-written problem.

        Output 0 pairs ``A[1]`` with ``X[0]`` and output 1 pairs ``A[0]``
        with ``X[0]`` (see ``A_js`` / ``X_js``).  After the plan has run,
        each device output is compared with the host-side reference
        ``alpha * dot(A[aj], X[xj]) + beta * Y[i]``.

        ``ctx`` is not a parameter; it is read from the enclosing scope.
        """
        # -- prepare initial conditions on host
        A = RA([[[0.1, .2], [.3, .4]], [[.5, .6]]])
        X = RA([[3, 5]])
        Y = RA([[0.0], [2, 3]])
        A_js = RA([[1], [0]])
        X_js = RA([[0], [0]])
        alpha = 0.5
        beta = 0.1

        # -- prepare initial conditions on device
        queue = cl.CommandQueue(ctx)
        clA = CLRA(queue, A)
        clX = CLRA(queue, X)
        clY = CLRA(queue, Y)
        clA_js = CLRA(queue, A_js)
        clX_js = CLRA(queue, X_js)
        # The device copies must round-trip to the host values.
        assert allclose(A, clA)
        assert allclose(X, clX)
        assert allclose(Y, clY)
        assert allclose(A_js, clA_js)
        assert allclose(X_js, clX_js)

        # -- run cl computation
        plan = plan_ragged_gather_gemv(
            queue, alpha, clA, clA_js, clX, clX_js, beta, clY)

        plan()

        # -- ensure they match
        # `range` rather than the Python 2-only `xrange`, so the loop also
        # runs on Python 3; the iteration itself is unchanged.
        for i in range(len(A_js)):
            aj, xj = int(A_js[i]), int(X_js[i])
            ref = alpha * np.dot(A[aj], X[xj]) + beta * Y[i]
            sim = clY[i]
            assert np.allclose(ref, sim)
Beispiel #3
0
def test_basic(ctx):
    """Verify ragged gather-gemv against a host reference on toy data.

    The index arrays select ``A[1]``/``X[0]`` for output 0 and
    ``A[0]``/``X[0]`` for output 1.  Every plan in ``prog.plans`` is run,
    then each device output is checked against
    ``alpha * dot(A[aj], X[xj]) + beta * Y[i]`` computed on the host.

    Parameters
    ----------
    ctx : passed to ``cl.CommandQueue`` to create the command queue.
    """
    # Unit scale factors (alternative values: alpha = 0.5, beta = 0.1).
    alpha = beta = 1.0

    # -- host operands
    A = RA([[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6]]])
    X = RA([[3, 5]])
    Y = RA([[0.0], [2, 3]])
    A_js = RA([[1], [0]], dtype=np.int32)
    X_js = RA([[0], [0]], dtype=np.int32)

    # -- device copies, each checked against its host original
    queue = cl.CommandQueue(ctx)
    clA = CLRA(queue, A)
    clX = CLRA(queue, X)
    clY = CLRA(queue, Y)
    host_device_pairs = [(A, clA), (X, clX), (Y, clY)]
    for host, device in host_device_pairs:
        assert ra_allclose(host, device)

    # -- run all plans of the program (no plan selection is done here)
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    for step in prog.plans:
        step()

    # -- host reference vs. device result, one operation at a time
    for op, _unused in enumerate(A_js):
        a_idx = int(A_js[op])
        x_idx = int(X_js[op])
        expected = alpha * np.dot(A[a_idx], X[x_idx]) + beta * Y[op]
        actual = clY[op]
        assert np.allclose(expected, actual)
Beispiel #4
0
    def _test_random(self, k=4, p=1, m=10, n=10):
        """Check ragged gather-gemv on randomly generated operands.

        Generates ``k`` random matrices, inputs and outputs from a fixed
        seed, runs the planned program on the device and compares every
        output with the host-side reference
        ``beta * Y[i] + sum(alpha * dot(A[aj], X[xj]))`` taken over the
        index pairs of operation ``i``.

        ``ctx`` is not a parameter; it is read from the enclosing scope.

        Parameters
        ----------
        k : number of operations (length of A_js)
        p : number of dots per operation (width of A_js)
        m : output dimensions
        n : input dimensions
        """

        # Fixed seed keeps the generated problem reproducible.
        rng = np.random.RandomState(3294)

        # `range` rather than the Python 2-only `xrange`; the order of the
        # rng calls is unchanged, so the generated data is the same.
        aa = [rng.normal(size=(m, n)) for _ in range(k)]
        xx = [rng.normal(size=n) for _ in range(k)]
        yy = [rng.normal(size=m) for _ in range(k)]
        ajs = [rng.randint(k, size=p) for _ in range(k)]
        xjs = [rng.randint(k, size=p) for _ in range(k)]

        A = RA(aa)
        X = RA(xx)
        Y = RA(yy)
        A_js = RA(ajs)
        X_js = RA(xjs)
        alpha = 0.5
        beta = 0.1

        # -- prepare initial conditions on device
        queue = cl.CommandQueue(ctx)
        clA = CLRA(queue, A)
        clX = CLRA(queue, X)
        clY = CLRA(queue, Y)
        clA_js = CLRA(queue, A_js)
        clX_js = CLRA(queue, X_js)
        assert allclose(A, clA)
        assert allclose(X, clX)
        assert allclose(Y, clY)
        assert allclose(A_js, clA_js)
        assert allclose(X_js, clX_js)

        # -- run cl computation
        prog = plan_ragged_gather_gemv(queue, alpha, clA, clA_js, clX, clX_js,
                                       beta, clY)

        # Single-argument, parenthesised print is valid on both Python 2
        # and Python 3 and produces the same output.
        print('-' * 5 + ' Plans ' + '-' * 45)
        for plan in prog.plans:
            print(plan)
        prog()

        # -- ensure they match
        for i in range(k):
            ref = beta * Y[i]
            for aj, xj in zip(A_js[i], X_js[i]):
                ref += alpha * np.dot(A[aj], X[xj])
            sim = clY[i]
            assert np.allclose(ref, sim, atol=1e-3, rtol=1e-3)
Beispiel #5
0
    def _test_random(self, k=4, p=1, m=10, n=10):
        """Compare device ragged gather-gemv with a host reference.

        Operands are drawn from a ``RandomState`` with a fixed seed.  For
        each operation ``i`` the expected value is ``beta * Y[i]`` plus
        ``alpha * dot(A[aj], X[xj])`` for every index pair in
        ``zip(A_js[i], X_js[i])``; the device result must match within
        ``atol = rtol = 1e-3``.

        ``ctx`` is not a parameter; it is read from the enclosing scope.

        Parameters
        ----------
        k : number of operations (length of A_js)
        p : number of dots per operation (width of A_js)
        m : output dimensions
        n : input dimensions
        """

        # Fixed seed keeps the generated problem reproducible.
        rng = np.random.RandomState(3294)

        # `xrange` exists only on Python 2; `range` works on 2 and 3 and
        # leaves the sequence of rng draws untouched.
        aa = [rng.normal(size=(m, n)) for _ in range(k)]
        xx = [rng.normal(size=n) for _ in range(k)]
        yy = [rng.normal(size=m) for _ in range(k)]
        ajs = [rng.randint(k, size=p) for _ in range(k)]
        xjs = [rng.randint(k, size=p) for _ in range(k)]

        A = RA(aa)
        X = RA(xx)
        Y = RA(yy)
        A_js = RA(ajs)
        X_js = RA(xjs)
        alpha = 0.5
        beta = 0.1

        # -- prepare initial conditions on device
        queue = cl.CommandQueue(ctx)
        clA = CLRA(queue, A)
        clX = CLRA(queue, X)
        clY = CLRA(queue, Y)
        clA_js = CLRA(queue, A_js)
        clX_js = CLRA(queue, X_js)
        assert allclose(A, clA)
        assert allclose(X, clX)
        assert allclose(Y, clY)
        assert allclose(A_js, clA_js)
        assert allclose(X_js, clX_js)

        # -- run cl computation
        prog = plan_ragged_gather_gemv(
            queue, alpha, clA, clA_js, clX, clX_js, beta, clY)

        # print(...) with one argument behaves the same on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print('-' * 5 + ' Plans ' + '-' * 45)
        for plan in prog.plans:
            print(plan)
        prog()

        # -- ensure they match
        for i in range(k):
            ref = beta * Y[i]
            for aj, xj in zip(A_js[i], X_js[i]):
                ref += alpha * np.dot(A[aj], X[xj])
            sim = clY[i]
            assert np.allclose(ref, sim, atol=1e-3, rtol=1e-3)
Beispiel #6
0
    def test_basic(self):
        """Run ragged gather-gemv on fixed toy operands and verify it.

        ``A_js`` / ``X_js`` select ``A[1]``/``X[0]`` for output 0 and
        ``A[0]``/``X[0]`` for output 1.  The plan is executed once and each
        device output is compared with
        ``alpha * dot(A[aj], X[xj]) + beta * Y[i]`` computed on the host.

        ``ctx`` is not a parameter; it is read from the enclosing scope.
        """
        # -- prepare initial conditions on host
        A = RA([[[0.1, .2], [.3, .4]], [[.5, .6]]])
        X = RA([[3, 5]])
        Y = RA([
            [0.0],
            [2, 3],
        ])
        A_js = RA([[1], [0]])
        X_js = RA([[0], [0]])
        alpha = 0.5
        beta = 0.1

        # -- prepare initial conditions on device
        queue = cl.CommandQueue(ctx)
        clA = CLRA(queue, A)
        clX = CLRA(queue, X)
        clY = CLRA(queue, Y)
        clA_js = CLRA(queue, A_js)
        clX_js = CLRA(queue, X_js)
        # The device copies must round-trip to the host values.
        assert allclose(A, clA)
        assert allclose(X, clX)
        assert allclose(Y, clY)
        assert allclose(A_js, clA_js)
        assert allclose(X_js, clX_js)

        # -- run cl computation
        plan = plan_ragged_gather_gemv(queue, alpha, clA, clA_js, clX, clX_js,
                                       beta, clY)

        plan()

        # -- ensure they match
        # `xrange` is undefined on Python 3; `range` iterates identically
        # and is available on both Python 2 and 3.
        for i in range(len(A_js)):
            aj, xj = int(A_js[i]), int(X_js[i])
            ref = alpha * np.dot(A[aj], X[xj]) + beta * Y[i]
            sim = clY[i]
            assert np.allclose(ref, sim)
def test_speed(rng):
    """Time ragged gather-gemv on many random matrices, optionally vs clBLAS.

    Builds ``300`` float32 matrix/vector pairs whose dimensions are drawn
    with ``rng.randint(100, 1000)``, runs the plans returned by
    ``prog.choose_plans()`` inside a ``Timer`` and prints the duration.
    When ``pyopencl_blas`` can be imported, the same products are also
    timed with ``pyopencl_blas.gemv``.  Nothing is asserted; the timings
    are only printed.

    Parameters
    ----------
    rng : random generator providing ``randint`` and ``uniform``.
        ``ctx`` is not a parameter; it is read from the enclosing scope.
    """
    try:
        import pyopencl_blas
    except ImportError:
        # The clBLAS comparison is optional and skipped without the module.
        pyopencl_blas = None

    def rand_f32(shape):
        # Uniform samples in [-1, 1) converted to float32.
        return rng.uniform(-1, 1, size=shape).astype('float32')

    n_ops = 300
    # All row counts are drawn before any column count, and the matrices
    # before the vectors, so the sequence of rng calls stays fixed.
    row_counts = [rng.randint(100, 1000) for _ in range(n_ops)]
    col_counts = [rng.randint(100, 1000) for _ in range(n_ops)]

    matrices = [rand_f32((m, n)) for m, n in zip(row_counts, col_counts)]
    inputs = [rand_f32(n) for n in col_counts]
    outputs = [rand_f32(m) for m in row_counts]
    # Operation i uses matrix i and input i.
    a_indices = [np.int32(i) for i in range(n_ops)]
    x_indices = [np.int32(i) for i in range(n_ops)]

    alpha = 1.0
    beta = 1.0

    # -- upload operands; the queue uses default properties
    queue = cl.CommandQueue(ctx)
    clA = CLRA.from_arrays(queue, matrices)
    clX = CLRA.from_arrays(queue, inputs)
    clY = CLRA.from_arrays(queue, outputs)
    A_js = RA(a_indices, dtype=np.int32)
    X_js = RA(x_indices, dtype=np.int32)

    # -- plan, report and time the computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    chosen = prog.choose_plans()

    print('')
    banner = '-' * 5 + ' Plans ' + '-' * 45
    print(banner)
    for step in chosen:
        print(step)

    with Timer() as timer:
        for step in chosen:
            step()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas (only when the module is importable)
    if not pyopencl_blas:
        return

    pyopencl_blas.setup()

    def to_device(host):
        # Allocate a device array of matching shape/dtype and copy into it.
        device = cl.array.Array(queue, host.shape, host.dtype)
        device.set(host)
        return device

    clAs = [to_device(a) for a in matrices]
    clXs = [to_device(x.ravel()) for x in inputs]
    clYs = [to_device(y.ravel()) for y in outputs]

    # One queue per operation, all created with default properties.
    queues = [cl.CommandQueue(ctx) for _ in range(n_ops)]

    # Manual switch between the two strategies; multi-queue is active.
    single_queue = False

    queue.finish()
    with Timer() as timer:
        if single_queue:
            # use a single queue
            for A, X, Y in zip(clAs, clXs, clYs):
                pyopencl_blas.gemv(queue, A, X, Y)
            queue.finish()
        else:
            # use multiple parallel queues
            events = [
                pyopencl_blas.gemv(queues[i % len(queues)], A, X, Y)
                for i, (A, X, Y) in enumerate(zip(clAs, clXs, clYs))
            ]
            for q in queues:
                q.flush()
            cl.wait_for_events(events)
    print("clBLAS: %0.3f" % timer.duration)
Beispiel #8
0
 def plan_ragged_gather_gemv(self, *args, **kwargs):
     """Plan a ragged gather-gemv on this object's queue.

     Calls the ``plan_ragged_gather_gemv`` function resolved from the
     enclosing (non-class) scope with ``self.queue`` as first argument,
     followed by ``*args`` and ``**kwargs``, and returns its result.
     """
     planner = plan_ragged_gather_gemv
     return planner(self.queue, *args, **kwargs)
Beispiel #9
0
 def plan_ragged_gather_gemv(self, *args, **kwargs):
     """Forward to ``plan_ragged_gather_gemv`` using ``self.queue``.

     The function of the same name from the enclosing (non-class) scope
     receives ``self.queue`` first, then the positional and keyword
     arguments unchanged; its return value is passed back to the caller.
     """
     plan_fn = plan_ragged_gather_gemv
     return plan_fn(self.queue, *args, **kwargs)
Beispiel #10
0
def test_speed(ctx, rng):
    """Benchmark ragged gather-gemv and, when available, clBLAS gemv.

    Creates ``300`` random float32 problems with row and column counts
    drawn from ``rng.randint(100, 1000)``, times the plans returned by
    ``prog.choose_plans()`` with a ``Timer`` and prints the duration.  If
    ``pyopencl_blas`` is importable, the same matrix-vector products are
    timed through ``pyopencl_blas.gemv`` as well.  The function only
    prints timings; it makes no assertions.

    Parameters
    ----------
    ctx : passed to ``cl.CommandQueue`` for every queue created here.
    rng : random generator providing ``randint`` and ``uniform``.
    """
    try:
        import pyopencl_blas
    except ImportError:
        # Without the module only the nengo_ocl timing is reported.
        pyopencl_blas = None

    def uniform_f32(shape):
        # Uniform samples in [-1, 1) converted to float32.
        return rng.uniform(-1, 1, size=shape).astype('float32')

    count = 300
    # Draw order matters for reproducibility: all row counts, then all
    # column counts, then matrices, inputs and outputs.
    heights = [rng.randint(100, 1000) for _ in range(count)]
    widths = [rng.randint(100, 1000) for _ in range(count)]

    host_As = [uniform_f32((m, n)) for m, n in zip(heights, widths)]
    host_Xs = [uniform_f32(n) for n in widths]
    host_Ys = [uniform_f32(m) for m in heights]
    # Identity index mapping: operation i reads matrix i and input i.
    a_idx = [np.int32(i) for i in range(count)]
    x_idx = [np.int32(i) for i in range(count)]

    alpha = 1.0
    beta = 1.0

    # -- prepare initial conditions on device (default queue properties)
    queue = cl.CommandQueue(ctx)
    clA = CLRA.from_arrays(queue, host_As)
    clX = CLRA.from_arrays(queue, host_Xs)
    clY = CLRA.from_arrays(queue, host_Ys)
    A_js = RA(a_idx, dtype=np.int32)
    X_js = RA(x_idx, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    selected = prog.choose_plans()

    print('')
    header = '-' * 5 + ' Plans ' + '-' * 45
    print(header)
    for item in selected:
        print(item)

    with Timer() as timer:
        for item in selected:
            item()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas; nothing more to do without the module
    if not pyopencl_blas:
        return

    pyopencl_blas.setup()

    def upload(host):
        # Allocate a device array of matching shape/dtype and copy into it.
        device = cl.array.Array(queue, host.shape, host.dtype)
        device.set(host)
        return device

    clAs = [upload(a) for a in host_As]
    clXs = [upload(x.ravel()) for x in host_Xs]
    clYs = [upload(y.ravel()) for y in host_Ys]

    # One queue per operation, all created with default properties.
    queues = [cl.CommandQueue(ctx) for _ in range(count)]

    # Manual switch between the two strategies; multi-queue is active.
    use_single_queue = False

    queue.finish()
    with Timer() as timer:
        if use_single_queue:
            # use a single queue
            for A, X, Y in zip(clAs, clXs, clYs):
                pyopencl_blas.gemv(queue, A, X, Y)
            queue.finish()
        else:
            # use multiple parallel queues
            events = [
                pyopencl_blas.gemv(queues[i % len(queues)], A, X, Y)
                for i, (A, X, Y) in enumerate(zip(clAs, clXs, clYs))
            ]
            for q in queues:
                q.flush()
            cl.wait_for_events(events)
    print("clBLAS: %0.3f" % timer.duration)