Ejemplo n.º 1
0
def solve_HingeLoss(solver, queue, clA, Y, rng=None, E=None):
    """Solve for decoders by minimizing an L2-regularized hinge loss.

    The optimization is driven on the host by
    ``scipy.optimize.fmin_l_bfgs_b``. Every cost/gradient evaluation is
    carried out on the OpenCL device with ``pyopencl_blas.gemm`` and the
    plan returned by ``plan_hingeloss``; only the scalar cost and the
    gradient are copied back to the host.

    Parameters
    ----------
    solver
        Supplies ``reg`` (regularization scale), ``n_epochs`` (passed as
        ``maxfun``), ``verbose`` (passed as ``iprint``) and
        ``mul_encoders``.
    queue
        OpenCL command queue on which the device arrays are allocated.
    clA
        Device array of shape ``(m, n)``.
    Y
        Target array of shape ``(m, d)``; the argmax of each row is used
        as that row's class index.
    rng
        Random source with a ``uniform`` method, used to draw the initial
        weights. Falls back to ``np.random`` when ``None``.
    E
        Forwarded unchanged to ``solver.mul_encoders``.

    Returns
    -------
    tuple
        ``(solver.mul_encoders(X, E), info)`` where ``X`` is the
        ``(n, d)`` solution and ``info`` is a dict with the per-row
        ``'rmses'`` of ``A X - Y`` and the elapsed ``'time'``.
    """
    import scipy.optimize
    import pyopencl_blas
    pyopencl_blas.setup()

    # The signature defaults to rng=None, which previously failed with an
    # AttributeError on ``rng.uniform`` below.
    if rng is None:
        rng = np.random

    tstart = time.time()

    assert clA.shape[0] == Y.shape[0]
    m, n = clA.shape
    _, d = Y.shape
    Xshape = (n, d)

    # regularization, scaled by the largest entry of A
    sigma = solver.reg * cl.array.max(clA).get()
    lamb = m * sigma**2

    # --- initialization
    X0 = rng.uniform(-1. / n, 1. / n, size=Xshape)

    # --- device buffers reused by every objective evaluation
    yinds = Y.argmax(axis=1)

    clX = cl.array.Array(queue, (n, d), dtype=np.float32)
    clyinds = cl.array.to_device(queue, yinds.astype(np.int32))
    clZ = cl.array.Array(queue, (m, d), dtype=np.float32)  # Z = A X
    clc = cl.array.Array(queue, (m, ), dtype=np.float32)  # summed into cost
    clE = cl.array.Array(queue, (m, d), dtype=np.float32)  # used in A.T E
    clG = cl.array.Array(queue, (n, d), dtype=np.float32)  # gradient

    # presumably fills clc and clE from clZ and clyinds -- confirm against
    # plan_hingeloss
    hingeloss_plan = plan_hingeloss(queue, clyinds, clZ, clc, clE)

    def f_df(x):
        """Return ``(cost, gradient)`` at the flattened weights ``x``."""
        clX.set(x.astype(np.float32).reshape(Xshape))
        pyopencl_blas.gemm(queue, clA, clX, clZ)
        hingeloss_plan()

        cost = cl.array.sum(clc).get()
        pyopencl_blas.gemm(queue, clA, clE, clG, transA=True)
        if lamb > 0:
            # L2 penalty 0.5 * lamb * ||X||^2 and its gradient lamb * X
            cost += 0.5 * lamb * cl.array.sum(clX**2).get()
            clG[:] += lamb * clX

        # L-BFGS works in double precision on the host
        G = clG.get().astype(np.float64)
        return cost, G.ravel()

    # --- solve with L-BFGS
    x, _, _ = scipy.optimize.fmin_l_bfgs_b(f_df,
                                           X0.ravel(),
                                           maxfun=solver.n_epochs,
                                           iprint=solver.verbose)

    tend = time.time()

    A = clA.get()
    X = x.reshape(Xshape)
    return solver.mul_encoders(X, E), {
        'rmses': npext.rms(np.dot(A, X) - Y, axis=1),
        'time': tend - tstart
    }
Ejemplo n.º 2
0
def iddr_rid(queue, m, n, matvect, krank):
    """Build a ``(krank + 2, n)`` sketch with ``matvect`` and run ``iddr_id``.

    Each of the ``krank + 2`` rows of the sketch is produced by refreshing
    a length-``m`` device vector with the generator returned by
    ``util.setup_rand`` and handing it to ``matvect`` together with the
    row to write into. The sketch is then passed to ``iddr_id``.

    NOTE(review): this looks like the OpenCL counterpart of the
    randomized interpolative-decomposition routine of the same name in
    the ID library -- confirm.

    Parameters
    ----------
    queue
        OpenCL command queue on which the device arrays are allocated.
    m : int
        Length of the vector handed to ``matvect``.
    n : int
        Length of each sketch row written by ``matvect``.
    matvect : callable
        Called as ``matvect(clx, out_row)`` for every sketch row.
    krank : int
        Rank forwarded to ``iddr_id``.

    Returns
    -------
    tuple
        ``(lst, proj)``: the int32 device array of length ``n`` and the
        ``(krank + 2, n)`` device array, both as left by ``iddr_id``.
    """
    id_srand = util.setup_rand(queue)

    blas.setup()
    try:
        dtype = 'float64'
        n_rows = krank + 2

        clx = cl_array.zeros(queue, m, dtype)
        rnorms = cl_array.Array(queue, n, dtype)
        lst = cl_array.Array(queue, n, np.int32)
        proj = cl_array.Array(queue, (n_rows, n), dtype)

        for i in range(n_rows):
            id_srand(m, clx)
            matvect(clx, proj[i, :])

        iddr_id(queue, n_rows, n, proj, krank, lst, rnorms)
    finally:
        # Always balance blas.setup(), even if matvect or iddr_id raises.
        blas.teardown()

    return lst, proj
Ejemplo n.º 3
0
def test_speed(rng):
    """Time ``plan_ragged_gather_gemv`` against per-matrix clBLAS ``gemv``.

    Generates 300 random float32 matrix/vector problems whose dimensions
    are drawn with ``rng.randint(100, 1000)``, runs the plans chosen by
    ``plan_ragged_gather_gemv`` (with ``alpha = beta = 1.0``) and prints
    their duration. When ``pyopencl_blas`` can be imported, the same
    problems are then run through ``pyopencl_blas.gemv`` with one command
    queue per problem and that duration is printed as well.

    Uses the ``ctx`` found in the enclosing module scope.

    Parameters
    ----------
    rng
        Random source providing ``randint`` and ``uniform``.
    """
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    n_problems = 300
    alpha, beta = 1.0, 1.0

    # Every row count is drawn before any column count; the order in which
    # the random stream is consumed is kept fixed.
    row_counts = [rng.randint(100, 1000) for _ in range(n_problems)]
    col_counts = [rng.randint(100, 1000) for _ in range(n_problems)]

    mats = [
        rng.uniform(-1, 1, size=(rows, cols)).astype('float32')
        for rows, cols in zip(row_counts, col_counts)
    ]
    in_vecs = [
        rng.uniform(-1, 1, size=cols).astype('float32')
        for cols in col_counts
    ]
    out_vecs = [
        rng.uniform(-1, 1, size=rows).astype('float32')
        for rows in row_counts
    ]

    # Problem j pairs matrix j with input vector j.
    mat_inds = [np.int32(j) for j in range(n_problems)]
    vec_inds = [np.int32(j) for j in range(n_problems)]

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA.from_arrays(queue, mats)
    clX = CLRA.from_arrays(queue, in_vecs)
    clY = CLRA.from_arrays(queue, out_vecs)
    A_js = RA(mat_inds, dtype=np.int32)
    X_js = RA(vec_inds, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas (only when the library is importable)
    if not pyopencl_blas:
        return

    pyopencl_blas.setup()

    def upload(host):
        """Allocate a device array shaped like ``host`` and copy it over."""
        dev = cl.array.Array(queue, host.shape, host.dtype)
        dev.set(host)
        return dev

    dev_mats = [upload(mat) for mat in mats]
    dev_xs = [upload(vec.ravel()) for vec in in_vecs]
    dev_ys = [upload(vec.ravel()) for vec in out_vecs]

    queues = [cl.CommandQueue(ctx) for _ in range(n_problems)]

    # Flip to True to push every gemv through the single main queue.
    single_queue = False

    queue.finish()
    with Timer() as timer:
        if single_queue:
            for A, X, Y in zip(dev_mats, dev_xs, dev_ys):
                pyopencl_blas.gemv(queue, A, X, Y)
            queue.finish()
        else:
            # spread the calls over the parallel queues, round-robin
            events = [
                pyopencl_blas.gemv(queues[j % len(queues)], A, X, Y)
                for j, (A, X, Y) in enumerate(zip(dev_mats, dev_xs, dev_ys))
            ]
            for q in queues:
                q.flush()
            cl.wait_for_events(events)
    print("clBLAS: %0.3f" % timer.duration)
Ejemplo n.º 4
0
from __future__ import print_function

import numpy as np
import pyopencl as cl
from pyopencl.array import Array
import pyopencl_blas as blas

# Create an OpenCL context and a command queue bound to it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# The BLAS library has to be set up before it is used.
blas.setup()

# Host-side data: an (m, n) matrix, a length-n vector and a length-m vector.
m, n = 5, 4
dtype = 'float32'  # also supports 'float64'

# Values are drawn in the order A, x, y from a seeded generator and then
# cast to the working dtype.
rng = np.random.RandomState(1)  # change the seed to see different data
A = rng.uniform(-1, 1, size=(m, n)).astype(dtype)
x = rng.uniform(-1, 1, size=n).astype(dtype)
y = rng.uniform(-1, 1, size=m).astype(dtype)

# Allocate matching (still uninitialized) OpenCL arrays on the device.
clA, clx, cly = (
    Array(queue, host.shape, host.dtype) for host in (A, x, y)
)
Ejemplo n.º 5
0
def solve_Softmax(solver, queue, clA, Y, rng=None, E=None):
    """Solve for decoders by minimizing an L2-regularized softmax log loss.

    The optimization is driven on the host by
    ``scipy.optimize.fmin_l_bfgs_b`` starting from all-zero weights.
    Every cost/gradient evaluation runs on the OpenCL device: ``A X`` is
    computed with ``pyopencl_blas.gemm``, turned into probabilities in
    place by the plan returned by ``plan_softmax``, and reduced to the
    log loss of the target class by a ``ReductionKernel``.

    Parameters
    ----------
    solver
        Supplies ``reg`` (regularization scale), ``n_epochs`` (passed as
        ``maxfun``), ``verbose`` (passed as ``iprint``) and
        ``mul_encoders``.
    queue
        OpenCL command queue on which the device arrays are allocated.
    clA
        Device array of shape ``(m, n)``.
    Y
        Target array of shape ``(m, d)``; the argmax of each row is used
        as that row's class index.
    rng
        Unused; the initial weights are zeros.
    E
        Forwarded unchanged to ``solver.mul_encoders``.

    Returns
    -------
    tuple
        ``(solver.mul_encoders(X, E), info)`` where ``X`` is the
        ``(n, d)`` solution and ``info`` is a dict with the per-row
        ``'rmses'`` of ``softmax(A X) - Y`` and the elapsed ``'time'``.
    """
    from nengo_extras.convnet import softmax
    import scipy.optimize
    import pyopencl_blas
    pyopencl_blas.setup()

    tstart = time.time()

    assert clA.shape[0] == Y.shape[0]
    m, n = clA.shape
    _, d = Y.shape
    Xshape = (n, d)

    # regularization, scaled by the largest entry of A
    sigma = solver.reg * cl.array.max(clA).get()
    lamb = m * sigma**2

    # --- initialization (double precision on the host for L-BFGS)
    X0 = np.zeros(Xshape, dtype=np.float64)

    # --- device buffers reused by every objective evaluation
    clY = cl.array.to_device(queue, Y.astype(np.float32))
    clyi = cl.array.to_device(queue, np.argmax(Y, axis=1).astype(np.int32))
    clX = cl.array.Array(queue, (n, d), dtype=np.float32)
    clE = cl.array.Array(queue, (m, d), dtype=np.float32)
    clG = cl.array.Array(queue, (n, d), dtype=np.float32)

    # same array as input and output, i.e. the softmax is applied in place
    softmax_plan = plan_softmax(queue, clE, clE)

    # Sum over rows i of -log(P[i, yi[i]]), clamped away from log(0).
    # NOTE(review): ``yi`` is listed first; presumably the reduction length
    # is taken from the first array argument (m entries) -- confirm against
    # the pyopencl ReductionKernel documentation.
    sum_logloss = cl.reduction.ReductionKernel(
        queue.context,
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="-log(max(Y[i*%(d)d + yi[i]], 1e-16f))" % dict(d=d),
        arguments="__global const int *yi, __global const float *Y")
    # the kernel indexes rows with a stride of exactly d elements
    assert clE.elemstrides[0] == d

    def f_df(x):
        """Return ``(cost, gradient)`` at the flattened weights ``x``."""
        clX.set(x.astype(np.float32).reshape(Xshape))
        pyopencl_blas.gemm(queue, clA, clX, clE)
        softmax_plan()
        cost = sum_logloss(clyi, clE).get()

        # gradient of the loss: A.T (P - Y)
        clE[:] -= clY
        pyopencl_blas.gemm(queue, clA, clE, clG, transA=True)
        if lamb > 0:
            # L2 penalty 0.5 * lamb * ||X||^2 and its gradient lamb * X
            cost += 0.5 * lamb * cl.array.sum(clX**2).get()
            clG[:] += lamb * clX

        G = clG.get().astype(np.float64)
        return cost, G.ravel()

    # --- solve with L-BFGS
    x, _, _ = scipy.optimize.fmin_l_bfgs_b(f_df,
                                           X0.ravel(),
                                           maxfun=solver.n_epochs,
                                           iprint=solver.verbose)

    tend = time.time()

    A = clA.get()
    X = x.reshape(Xshape)
    return solver.mul_encoders(X, E), {
        'rmses': npext.rms(softmax(np.dot(A, X), axis=1) - Y, axis=1),
        'time': tend - tstart
    }
Ejemplo n.º 6
0
def solve_lstsqclassifier(solver, queue, clA, Y, rng=None, E=None):
    """Solve one class-weighted, regularized least-squares problem per class.

    For every column ``i`` of ``Y`` the rows are weighted by
    ``w = w0 + (w1 - w0) * Y[:, i]`` and the system
    ``(w0 * A.T A + (w1 - w0) * Ai.T Ai + m * sigma**2 * I) x = A.T (w * y)``
    is solved on the host with ``np.linalg.solve``, where ``Ai`` holds the
    rows of ``A`` whose class index is ``i``. The matrix products are
    computed on the device with clBLAS.

    NOTE(review): the Gram-matrix shortcut assumes each column of ``Y`` is
    a 0/1 indicator of the argmax class -- confirm against callers.

    Parameters
    ----------
    solver
        Supplies ``reg``, ``precompute_ai`` (cache the per-class Gram
        matrices), ``weight_power`` and ``mul_encoders``.
    queue
        OpenCL command queue on which the device arrays are allocated.
    clA
        Device array of shape ``(m, n)``.
    Y
        Target array of shape ``(m, d)`` whose rows are sorted by class
        index (argmax of each row).
    rng
        Unused.
    E
        Forwarded unchanged to ``solver.mul_encoders``.

    Returns
    -------
    tuple
        ``(solver.mul_encoders(X, E), info)`` where ``X`` is the
        ``(n, d)`` solution and ``info`` is a dict with the per-row
        ``'rmses'`` of ``A X - Y`` and the elapsed ``'time'``.

    Raises
    ------
    AssertionError
        If the consecutive differences of the class indices are not
        exactly the set ``{0, 1}`` (unsorted rows or skipped classes; this
        also rejects input that contains only a single class).
    """
    import pyopencl_blas
    pyopencl_blas.setup()

    m, n = clA.shape
    _, d = Y.shape
    precompute_ai = solver.precompute_ai

    # Per-call memo of the per-class Gram matrices; only filled when
    # precompute_ai is set.
    gram_cache = {}

    def XTdotX(clX):
        """Return ``X.T X`` as an ``(n, n)`` host array."""
        clXX = cl.array.Array(queue, (n, n), dtype=np.float32)
        pyopencl_blas.gemm(queue, clX, clX, clXX, transA=True)
        return clXX.get()

    def ATdotx(x):
        """Return ``A.T x`` as a length-``n`` host array."""
        clx = cl.array.to_device(queue, x.astype(np.float32))
        cly = cl.array.Array(queue, (n, ), dtype=np.float32)
        pyopencl_blas.gemv(queue, clA, clx, cly, transA=True)
        return cly.get()

    def AdotX(X):
        """Return ``A X`` as a host array."""
        clX = cl.array.to_device(queue, X.astype(np.float32))
        clAX = cl.array.Array(queue, (m, clX.shape[1]), dtype=np.float32)
        pyopencl_blas.gemm(queue, clA, clX, clAX)
        return clAX.get()

    def getAi(i):
        """Return ``Ai.T Ai`` for class ``i``, memoized if requested."""
        if i in gram_cache:
            return gram_cache[i]

        AAi = XTdotX(clAis[i])
        if precompute_ai:
            gram_cache[i] = AAi
        return AAi

    tstart = time.time()

    sigma = solver.reg * cl.array.max(clA).get()

    # Get Y inds
    Yi = np.argmax(Y, axis=1)
    Yd = np.diff(Yi)
    assert set(np.unique(Yd)) == {0, 1}, (
        "Y not sorted, or missing some classes")

    # Rows are sorted by class, so each class is one contiguous slice of A.
    clAis = []
    for i in range(d):
        inds, = (Yi == i).nonzero()
        a, b = inds.min(), inds.max() + 1
        clAis.append(clA[a:b])

    if not precompute_ai:
        AA = XTdotX(clA)
    else:
        # A.T A is the sum of the per-class Gram matrices, which get
        # cached along the way for reuse in the loop below.
        AA = np.zeros((n, n))
        for i in range(d):
            AA += getAi(i)

    X = np.zeros((n, d))
    for i in range(d):
        y = Y[:, i]

        # weight for classification
        p = y.mean()
        q = solver.weight_power
        wr = p * (1 - p)**q + (1 - p) * p**q
        w0 = p**q / wr
        w1 = (1 - p)**q / wr
        dw = w1 - w0
        w = w0 + dw * y

        # form Gram matrix G = A.T W A + m * sigma**2
        G = w0 * AA + dw * getAi(i)
        np.fill_diagonal(G, G.diagonal() + m * sigma**2)
        b = ATdotx(w * y)

        X[:, i] = np.linalg.solve(G, b)

    tend = time.time()

    AX = AdotX(X)
    return solver.mul_encoders(X, E), {
        'rmses': npext.rms(AX - Y, axis=1),
        'time': tend - tstart
    }
Ejemplo n.º 7
0
def test_speed(ctx, rng):
    """Time ``plan_ragged_gather_gemv`` against per-matrix clBLAS ``gemv``.

    Generates 300 random float32 matrix/vector problems whose dimensions
    are drawn with ``rng.randint(100, 1000)``, runs the plans chosen by
    ``plan_ragged_gather_gemv`` (with ``alpha = beta = 1.0``) and prints
    their duration. When ``pyopencl_blas`` can be imported, the same
    problems are then run through ``pyopencl_blas.gemv`` with one command
    queue per problem and that duration is printed as well.

    Parameters
    ----------
    ctx
        OpenCL context used to create every command queue.
    rng
        Random source providing ``randint`` and ``uniform``.
    """
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    count = 300
    alpha = beta = 1.0

    # Row counts first, then column counts: the order in which the random
    # stream is consumed is kept fixed.
    heights = [rng.randint(100, 1000) for _ in range(count)]
    widths = [rng.randint(100, 1000) for _ in range(count)]

    host_a = [
        rng.uniform(-1, 1, size=(h, w)).astype('float32')
        for h, w in zip(heights, widths)
    ]
    host_x = [rng.uniform(-1, 1, size=w).astype('float32') for w in widths]
    host_y = [rng.uniform(-1, 1, size=h).astype('float32') for h in heights]

    # Problem j pairs matrix j with input vector j.
    a_inds = [np.int32(j) for j in range(count)]
    x_inds = [np.int32(j) for j in range(count)]

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    clA = CLRA.from_arrays(queue, host_a)
    clX = CLRA.from_arrays(queue, host_x)
    clY = CLRA.from_arrays(queue, host_y)
    A_js = RA(a_inds, dtype=np.int32)
    X_js = RA(x_inds, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas (only when the library is importable)
    if not pyopencl_blas:
        return

    pyopencl_blas.setup()

    def to_dev(host):
        """Allocate a device array shaped like ``host`` and copy it over."""
        dev = cl.array.Array(queue, host.shape, host.dtype)
        dev.set(host)
        return dev

    dev_a = [to_dev(a) for a in host_a]
    dev_x = [to_dev(x.ravel()) for x in host_x]
    dev_y = [to_dev(y.ravel()) for y in host_y]

    queues = [cl.CommandQueue(ctx) for _ in range(count)]

    # Flip to True to push every gemv through the single main queue.
    use_single_queue = False

    queue.finish()
    with Timer() as timer:
        if use_single_queue:
            for A, X, Y in zip(dev_a, dev_x, dev_y):
                pyopencl_blas.gemv(queue, A, X, Y)
            queue.finish()
        else:
            # spread the calls over the parallel queues, round-robin
            events = []
            for j, (A, X, Y) in enumerate(zip(dev_a, dev_x, dev_y)):
                target = queues[j % len(queues)]
                events.append(pyopencl_blas.gemv(target, A, X, Y))
            for q in queues:
                q.flush()
            cl.wait_for_events(events)
    print("clBLAS: %0.3f" % timer.duration)
Ejemplo n.º 8
0
import numpy as np
import pyopencl
import pyopencl.array
import pyopencl_blas
# The library has to be initialized before any BLAS routine is called.
pyopencl_blas.setup()

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

dtype = 'float32'  # also supports 'float64', 'complex64' and 'complex128'
x = np.array([1, 2, 3, 4], dtype=dtype)
y = np.array([4, 3, 2, 1], dtype=dtype)

# Reference result computed on the host from the untouched inputs.
expected = 0.8 * x + y

# Copy both vectors to the device.
clx = pyopencl.array.to_device(queue, x)
cly = pyopencl.array.to_device(queue, y)

# Run axpy on the device arrays; the result is read back from cly below.
pyopencl_blas.axpy(queue, clx, cly, alpha=0.8)
actual = cly.get()

print("Expected: %s" % (expected))
print("Actual:   %s" % (actual))