Exemple #1
0
    def cl_geometry_and_textconf(self, items, padding=4):
        p = self
        max_n_dots = max(len(p.geometry[ii]['dots']) for ii in items)
        n_structure_vars = 4 * max_n_dots + 5
        structure_vars_stride = int(
            padding * math.ceil(float(n_structure_vars) / padding))
        gstructure = np.zeros((len(items), structure_vars_stride), dtype='int32')
        A_starts = p.A.starts
        X_starts = p.X.starts
        Y_starts = p.Y.starts
        Y_in_starts = p.Y_in.starts
        A_stride0s = p.A.stride0s
        A_shape1s = p.A.shape1s
        Y_shape0s = p.Y.shape0s

        for bbi, bb in enumerate(items):
            x_js_i = p.X_js[bb]
            A_js_i = p.A_js[bb]
            assert len(x_js_i) == len(A_js_i)
            for ii, (xi, ai) in enumerate(zip(x_js_i, A_js_i)):
                assert xi.size == 1 and ai.size == 1
                xi, ai = xi[0], ai[0]  # to ignore numpy DeprecationWarning
                gstructure[bbi, 0 * max_n_dots + ii] = X_starts[xi]
                gstructure[bbi, 1 * max_n_dots + ii] = A_starts[ai]
                gstructure[bbi, 2 * max_n_dots + ii] = A_stride0s[ai]
                gstructure[bbi, 3 * max_n_dots + ii] = A_shape1s[ai]
            # -- offset of output and input buffers
            gstructure[bbi, 4 * max_n_dots + 0] = Y_in_starts[bb]
            gstructure[bbi, 4 * max_n_dots + 1] = Y_starts[bb]
            # -- number of dots for bb
            gstructure[bbi, 4 * max_n_dots + 2] = len(A_js_i)
            # -- length of Y[bb]
            gstructure[bbi, 4 * max_n_dots + 3] = Y_shape0s[bb]
            gstructure[bbi, 4 * max_n_dots + 4] = bb
        cl_gstructure = to_device(p.queue, gstructure)

        textconf = {
            'n_structure_vars': n_structure_vars,
            'structure_vars_stride': structure_vars_stride,
            'x_starts': 'lstructure[0 * %s + ii]' % max_n_dots,
            'a_starts': 'lstructure[1 * %s + ii]' % max_n_dots,
            'a_s0'    : 'lstructure[2 * %s + ii]' % max_n_dots,
            'N_i'     : 'lstructure[3 * %s + ii]' % max_n_dots,
            'y_in_starts': 'lstructure[4 * %s + 0]' % max_n_dots,
            'y_offset': 'lstructure[4 * %s + 1]' % max_n_dots,
            'n_dot_products': 'lstructure[4 * %s + 2]' % max_n_dots,
            'y_len'   : 'lstructure[4 * %s + 3]' % max_n_dots,
            'bb'   : 'lstructure[4 * %s + 4]' % max_n_dots,
            }
        return cl_gstructure, textconf
Exemple #2
0
    def cl_geometry_and_textconf(self, items, padding=4):
        p = self
        max_n_dots = max(len(p.geometry[ii]['dots']) for ii in items)
        n_structure_vars = 4 * max_n_dots + 5
        structure_vars_stride = int(
            padding * math.ceil(float(n_structure_vars) / padding))
        gstructure = np.zeros((len(items), structure_vars_stride), dtype='int32')
        A_starts = p.A.starts
        X_starts = p.X.starts
        Y_starts = p.Y.starts
        Y_in_starts = p.Y_in.starts
        A_stride0s = p.A.stride0s
        A_shape1s = p.A.shape1s
        Y_shape0s = p.Y.shape0s

        for bbi, bb in enumerate(items):
            x_js_i = p.X_js[bb]
            A_js_i = p.A_js[bb]
            assert len(x_js_i) == len(A_js_i)
            for ii, (xi, ai) in enumerate(zip(x_js_i, A_js_i)):
                gstructure[bbi, 0 * max_n_dots + ii] = X_starts[xi]
                gstructure[bbi, 1 * max_n_dots + ii] = A_starts[ai]
                gstructure[bbi, 2 * max_n_dots + ii] = A_stride0s[ai]
                gstructure[bbi, 3 * max_n_dots + ii] = A_shape1s[ai]
            # -- offset of output and input buffers
            gstructure[bbi, 4 * max_n_dots + 0] = Y_in_starts[bb]
            gstructure[bbi, 4 * max_n_dots + 1] = Y_starts[bb]
            # -- number of dots for bb
            gstructure[bbi, 4 * max_n_dots + 2] = len(A_js_i)
            # -- length of Y[bb]
            gstructure[bbi, 4 * max_n_dots + 3] = Y_shape0s[bb]
            gstructure[bbi, 4 * max_n_dots + 4] = bb
        cl_gstructure = to_device(p.queue, gstructure)

        textconf = {
            'n_structure_vars': n_structure_vars,
            'structure_vars_stride': structure_vars_stride,
            'x_starts': 'lstructure[0 * %s + ii]' % max_n_dots,
            'a_starts': 'lstructure[1 * %s + ii]' % max_n_dots,
            'a_s0'    : 'lstructure[2 * %s + ii]' % max_n_dots,
            'N_i'     : 'lstructure[3 * %s + ii]' % max_n_dots,
            'y_in_starts': 'lstructure[4 * %s + 0]' % max_n_dots,
            'y_offset': 'lstructure[4 * %s + 1]' % max_n_dots,
            'n_dot_products': 'lstructure[4 * %s + 2]' % max_n_dots,
            'y_len'   : 'lstructure[4 * %s + 3]' % max_n_dots,
            'bb'   : 'lstructure[4 * %s + 4]' % max_n_dots,
            }
        return cl_gstructure, textconf
Exemple #3
0
def float_cl_clra(queue, arg, cl_dtype, N):
    float_arg = None
    cl_arg = None
    clra_arg = None
    if isinstance(arg, CLRaggedArray):
        clra_arg = arg
        assert arg.dtype == cl_dtype
    elif isinstance(arg, float):
        float_arg = arg
    elif len(set(arg)) == 1:
        float_arg = arg[0]
    else:
        host_arg = np.asarray(arg, cl_dtype)
        assert host_arg.shape == (N,)
        cl_arg = to_device(queue, host_arg)
    return float_arg, cl_arg, clra_arg
Exemple #4
0
def float_cl_clra(queue, arg, cl_dtype, N):
    float_arg = None
    cl_arg = None
    clra_arg = None
    if isinstance(arg, CLRaggedArray):
        clra_arg = arg
        assert arg.dtype == cl_dtype
    elif isinstance(arg, float):
        float_arg = arg
    elif len(set(arg)) == 1:
        float_arg = arg[0]
    else:
        host_arg = np.asarray(arg, cl_dtype)
        assert host_arg.shape == (N,)
        cl_arg = to_device(queue, host_arg)
    return float_arg, cl_arg, clra_arg
def plan_probes(queue, periods, X, Y, tag=None):
    """
    Parameters
    ----------
    P : raggedarray of ints
        The period (in time-steps) of each probe
    """

    assert len(X) == len(Y)
    assert len(X) == len(periods)
    N = len(X)

    cl_countdowns = to_device(queue, np.zeros(N, dtype="int32"))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype="int32"))
    cl_periods = to_device(queue, np.asarray(periods, dtype="int32"))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype

    ### N.B.  X[i].shape = (ndims[i], )
    ###       Y[i].shape = (buf_ndims[i], buf_len)

    for i in xrange(N):
        assert X.shape0s[i] == Y.shape1s[i]
        assert X.shape1s[i] == 1
        assert X.stride0s[i] == 1
        assert Y.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding="ascii").render(**textconf)

    full_args = (cl_countdowns, cl_bufpositions, cl_periods, X.cl_starts, X.cl_shape0s, X.cl_buf, Y.cl_starts, Y.cl_buf)
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval
Exemple #6
0
def plan_probes(queue, periods, X, Y, tag=None):
    """
    Parameters
    ----------
    P : raggedarray of ints
        The period (in time-steps) of each probe
    """

    assert len(X) == len(Y)
    assert len(X) == len(periods)
    N = len(X)

    cl_countdowns = to_device(queue, np.zeros(N, dtype='int32'))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype='int32'))
    cl_periods = to_device(queue, np.asarray(periods, dtype='int32'))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype

    ### N.B.  X[i].shape = (ndims[i], )
    ###       Y[i].shape = (buf_ndims[i], buf_len)

    for i in xrange(N):
        assert X.shape0s[i] == Y.shape1s[i]
        assert X.shape1s[i] == 1
        assert X.stride0s[i] == 1
        assert Y.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else
            {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0)
                {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """

    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding='ascii').render(**textconf)

    full_args = (
        cl_countdowns,
        cl_bufpositions,
        cl_periods,
        X.cl_starts,
        X.cl_shape0s,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (
        max_len,
        N,
    )
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval
Exemple #7
0
def ref_impl(p, items):
    """
    Return an OpenCL function to calculate elements `items` of
    gemv operation `p`.

    In this reference implementation, we create a work item
    per output number, or more specifically, a work grid
    of (max_y_len, len(items)).  Each work item loops over the
    dot products and the elements within each dot product to
    compute the output value Y[global_id(1)][global_id(0)].

    """

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    cl_items = to_device(p.queue,
        np.asarray(items, dtype='int32'))
    if 0:
        if len(items) < 10:
            print 'Falling back on reference implementation'
            p.print_geometry_summary(items, full=True)
        else:
            print 'Falling back on reference implementation'
            p.print_geometry_summary(items)

    assert all(s == 1 for s in p.A.stride1s)
    assert all(s == 1 for s in p.X.stride1s)
    assert all(s == 1 for s in p.Y.stride0s)
    assert all(s == 1 for s in p.Y.stride1s)
    assert all(s == 1 for s in p.Y_in.stride0s)
    assert all(s == 1 for s in p.Y_in.stride1s)

    text = """
        __kernel void fn(
            __global int *items,
    % if cl_alpha is not None:
            __global ${cl_alpha.ocldtype} * alphas,
    % endif
    % if (A_js is not None):
            __global int *A_starts,
            __global int *A_shape1s,
            __global int *A_stride0s,
            __global ${A.cl_buf.ocldtype} *A_data,
            __global int *A_js_starts,
            __global int *A_js_shape0s,
            __global int *A_js_data,
            __global int *X_starts,
            __global int *X_stride0s,
            __global ${X.cl_buf.ocldtype} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
    % endif
    % if cl_beta is not None:
            __global ${cl_beta.ocldtype} * betas,
    % endif
    % if clra_beta is not None:
            __global int *beta_starts,
            __global int *beta_data,
    % endif
    % if cl_gamma is not None:
            __global ${cl_gamma.ocldtype} * gammas,
    % endif
            __global int *Y_in_starts,
            __global ${Y_in.cl_buf.ocldtype} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_shape0s,
            __global ${Y.cl_buf.ocldtype} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = items[get_global_id(1)];
            const int M = Y_shape0s[bb];
            if (mm < M)
            {
                const int y_offset = Y_starts[bb];
                const int y_in_offset = Y_in_starts[bb];

    % if float_beta is not None:
                const ${Y.cl_buf.ocldtype} beta = ${float_beta};
    % elif cl_beta is not None:
                const ${cl_beta.ocldtype} beta = betas[bb];
    % elif clra_beta is not None:
                const int beta_offset = beta_starts[bb];
                const ${clra_beta.cl_buf.ocldtype} beta
                    = beta_data[beta_offset + mm];
    % endif

    % if float_gamma is not None:
                const ${Y.cl_buf.ocldtype} gamma = ${float_gamma};
    % elif cl_gamma is not None:
                const ${cl_gamma.ocldtype} gamma = gammas[bb];
    % endif

                Y_data[y_offset + mm] = gamma + beta * Y_in_data[y_in_offset + mm];

    % if (A_js is not None) :

                const int n_dot_products = A_js_shape0s[bb];
                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                ${Y.cl_buf.ocldtype} y_sum = 0;
                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    const int x_ji = X_js_data[ii];
                    const int a_ji = A_js_data[ii];
                    const int N_i = A_shape1s[a_ji];
                    const int x_offset = X_starts[x_ji];
                    const int a_offset = A_starts[a_ji];
                    const int AsM = A_stride0s[a_ji];
                    const int XsM = X_stride0s[x_ji];

                    for (int nn = 0; nn < N_i; ++nn)
                    {
                        y_sum += X_data[x_offset + nn * XsM]
                                 * A_data[a_offset + mm * AsM + nn];
                    }
                }
        % if float_alpha is not None:
                Y_data[y_offset + mm] += ${float_alpha} * y_sum;
        % elif cl_alpha is not None:
                Y_data[y_offset + mm] += alphas[bb] * y_sum;
        % endif
    % endif
            }

        }
    """

    text = Template(text, output_encoding='ascii').render(**p.__dict__)
    #print text

    gsize = (
        max(p.geometry[ii]['y_len'] for ii in items),
        len(items))
    lsize = None
    fn = cl.Program(p.queue.context, text).build().fn
    full_args = [cl_items]
    if p.cl_alpha is not None:
        full_args += [p.cl_alpha]
    if p.A_js is not None:
        full_args += [
            p.A.cl_starts,
            p.A.cl_shape1s,
            p.A.cl_stride0s,
            p.A.cl_buf,
            p.A_js.cl_starts,
            p.A_js.cl_shape0s,
            p.A_js.cl_buf,
            p.X.cl_starts,
            p.X.cl_stride0s,
            p.X.cl_buf,
            p.X_js.cl_starts,
            p.X_js.cl_buf,
            ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    elif p.clra_beta is not None:
        full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf]

    if p.cl_gamma is not None:
        full_args += [p.cl_gamma]
    elif p.clra_gamma is not None:
        full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf]

    full_args += [
        p.Y_in.cl_starts,
        p.Y_in.cl_buf,
        p.Y.cl_starts,
        p.Y.cl_shape0s,
        p.Y.cl_buf]

    #print [str(arr.dtype)[0] for arr in full_args]
    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize, name="clra_gemv.ref_impl",
        tag=p.tag,
        bw_per_call=bw_from_geometry(p.geometry, items),
        flops_per_call=flops_from_geometry(p.geometry, items))
    rval.full_args = full_args  # prevent GC the args
    return rval
Exemple #8
0
def ref_impl(p, items):
    """
    Return an OpenCL function to calculate elements `items` of
    gemv operation `p`.

    In this reference implementation, we create a work item
    per output number, or more specifically, a work grid
    of (max_y_len, len(items)).  Each work item loops over the
    dot products and the elements within each dot product to
    compute the output value Y[global_id(1)][global_id(0)].

    """

    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    cl_items = to_device(p.queue,
        np.asarray(items, dtype='int32'))
    if 0:
        if len(items) < 10:
            print 'Falling back on reference implementation'
            p.print_geometry_summary(items, full=True)
        else:
            print 'Falling back on reference implementation'
            p.print_geometry_summary(items)

    assert all(s == 1 for s in p.A.stride1s)
    assert all(s == 1 for s in p.X.stride1s)
    assert all(s == 1 for s in p.Y.stride0s)
    assert all(s == 1 for s in p.Y.stride1s)
    assert all(s == 1 for s in p.Y_in.stride0s)
    assert all(s == 1 for s in p.Y_in.stride1s)

    text = """
        __kernel void fn(
            __global int *items,
    % if cl_alpha is not None:
            __global ${cl_alpha.ocldtype} * alphas,
    % endif
    % if (A_js is not None):
            __global int *A_starts,
            __global int *A_shape1s,
            __global int *A_stride0s,
            __global ${A.cl_buf.ocldtype} *A_data,
            __global int *A_js_starts,
            __global int *A_js_shape0s,
            __global int *A_js_data,
            __global int *X_starts,
            __global int *X_stride0s,
            __global ${X.cl_buf.ocldtype} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
    % endif
    % if cl_beta is not None:
            __global ${cl_beta.ocldtype} * betas,
    % endif
    % if clra_beta is not None:
            __global int *beta_starts,
            __global int *beta_data,
    % endif
    % if cl_gamma is not None:
            __global ${cl_gamma.ocldtype} * gammas,
    % endif
            __global int *Y_in_starts,
            __global ${Y_in.cl_buf.ocldtype} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_shape0s,
            __global ${Y.cl_buf.ocldtype} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = items[get_global_id(1)];
            const int M = Y_shape0s[bb];
            if (mm < M)
            {
                const int y_offset = Y_starts[bb];
                const int y_in_offset = Y_in_starts[bb];

    % if float_beta is not None:
                const ${Y.cl_buf.ocldtype} beta = ${float_beta};
    % elif cl_beta is not None:
                const ${cl_beta.ocldtype} beta = betas[bb];
    % elif clra_beta is not None:
                const int beta_offset = beta_starts[bb];
                const ${clra_beta.cl_buf.ocldtype} beta
                    = beta_data[beta_offset + mm];
    % endif

    % if float_gamma is not None:
                const ${Y.cl_buf.ocldtype} gamma = ${float_gamma};
    % elif cl_gamma is not None:
                const ${cl_gamma.ocldtype} gamma = gammas[bb];
    % endif

                Y_data[y_offset + mm] = gamma + beta * Y_in_data[y_in_offset + mm];

    % if (A_js is not None) :

                const int n_dot_products = A_js_shape0s[bb];
                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                ${Y.cl_buf.ocldtype} y_sum = 0;
                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    const int x_ji = X_js_data[ii];
                    const int a_ji = A_js_data[ii];
                    const int N_i = A_shape1s[a_ji];
                    const int x_offset = X_starts[x_ji];
                    const int a_offset = A_starts[a_ji];
                    const int AsM = A_stride0s[a_ji];
                    const int XsM = X_stride0s[x_ji];

                    for (int nn = 0; nn < N_i; ++nn)
                    {
                        y_sum += X_data[x_offset + nn * XsM]
                                 * A_data[a_offset + mm * AsM + nn];
                    }
                }
        % if float_alpha is not None:
                Y_data[y_offset + mm] += ${float_alpha} * y_sum;
        % elif cl_alpha is not None:
                Y_data[y_offset + mm] += alphas[bb] * y_sum;
        % endif
    % endif
            }

        }
    """

    text = Template(text, output_encoding='ascii').render(**p.__dict__)
    #print text

    gsize = (
        max(p.geometry[ii]['y_len'] for ii in items),
        len(items))
    lsize = None
    fn = cl.Program(p.queue.context, text).build().fn
    full_args = [cl_items]
    if p.cl_alpha is not None:
        full_args += [p.cl_alpha]
    if p.A_js is not None:
        full_args += [
            p.A.cl_starts,
            p.A.cl_shape1s,
            p.A.cl_stride0s,
            p.A.cl_buf,
            p.A_js.cl_starts,
            p.A_js.cl_shape0s,
            p.A_js.cl_buf,
            p.X.cl_starts,
            p.X.cl_stride0s,
            p.X.cl_buf,
            p.X_js.cl_starts,
            p.X_js.cl_buf,
            ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    elif p.clra_beta is not None:
        full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf]

    if p.cl_gamma is not None:
        full_args += [p.cl_gamma]
    elif p.clra_gamma is not None:
        full_args += [p.clra_gamma.cl_starts, p.clra_gamma.cl_buf]

    full_args += [
        p.Y_in.cl_starts,
        p.Y_in.cl_buf,
        p.Y.cl_starts,
        p.Y.cl_shape0s,
        p.Y.cl_buf]

    #print [str(arr.dtype)[0] for arr in full_args]
    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize, name="clra_gemv.ref_impl",
        tag=p.tag,
        bw_per_call=bw_from_geometry(p.geometry, items),
        flops_per_call=flops_from_geometry(p.geometry, items))
    rval.full_args = full_args  # prevent GC the args
    return rval