def plan_parallel_ragged_gather_gemv2(queue, Ms, Ns, alpha, A, A_js, X, X_js,
                       beta, Y, group_size = 32, Y_in=None, tag=None):
    """
    """
    
    # TODO: if alpha or beta is a float
    #       then render it into the kernel text.
    try:
        float(alpha)
        alpha = [alpha] * len(Y)
    except TypeError:
        pass

    try:
        float(beta)
        beta = [beta] * len(Y)
    except TypeError:
        pass

    cl_alpha = to_device(queue, np.asarray(alpha, Y.buf.dtype))
    cl_beta = to_device(queue, np.asarray(beta, Y.buf.dtype))
    

    if Y_in is None:
        Y_in = Y

    # XXX check for e.g. all Ns being the same thing
    #     especially all Ns == 1
    cl_Ns = to_device(queue, np.asarray(Ns, 'int32'))

    # XXX check that all the ints are ints not longs
    textconf = {
        'type_alpha': cl_alpha.ocldtype,
        'type_beta': cl_beta.ocldtype,
        'type_A': A.cl_buf.ocldtype,
        'type_X': X.cl_buf.ocldtype,
        'type_Y': Y.cl_buf.ocldtype,
        'y_len': len(Y),
        'lsize': group_size,
    }

    text = """
        __kernel void fn(
            const __global int *Ns,
            const __global ${type_alpha} * alphas,
            const __global int *A_starts,
            const __global ${type_A} *A_data,
            const __global int *A_js_starts,
            const __global int *A_js_lens,
            const __global int *A_js_data,
            const __global int *X_starts,
            const __global ${type_X} *X_data,
            const __global int *X_js_starts,
            const __global int *X_js_data,
            const __global ${type_beta} * betas,
            const __global int *Y_in_starts,
            const __global ${type_Y} *Y_in_data,
            const __global int *Y_starts,
            const __global int *Y_lens,
            __global ${type_Y} *Y_data)
        {
            //const int mm = get_global_id(1); //TODO
            
            
            __local ${type_Y} partialDotProduct[${lsize}]; //Scratch space for the dot products

            //Y is divided into groups of size group_size. Each work-item does enough dot-products to cover one of the groups
            for (uint yi = get_group_id(0); yi < ${y_len}; yi += get_num_groups(0)) {

                const __global int* X_js_row = X_js_data + X_js_starts[yi];
                const __global int* A_js_row = A_js_data + A_js_starts[yi];
                
                const ${type_alpha} alpha = alphas[yi];
                const ${type_beta} beta = betas[yi];
                
                int y_offset = Y_starts[yi];
                int y_in_offset = Y_in_starts[yi];
                Y_data[y_offset] = beta * Y_in_data[y_in_offset];
                
                float sum = 0; 
                int n_dot_products = A_js_lens[yi]; //Do all of xjs dot products at same time
                for(int j = 0; j < n_dot_products; j++) {
                    int x_ji = X_js_row[j];
                    int a_ji = A_js_row[j];
                    int N_i = Ns[a_ji];
                              
                    const __global ${type_A}* A_row = A_data + A_starts[a_ji]; //Get the rows for the product
                    const __global ${type_X}* X_row = X_data + X_starts[x_ji];

                    //Each work item will do some fraction of the multiplications and store the result locally
                    for (uint x = get_local_id(0); x < N_i; x += get_local_size(0)) {
                        sum += A_row[x] * X_row[x]; 
                    }
                }
                partialDotProduct[get_local_id(0)] = sum;
                
                //Parallel reduction of locally stored sums
                for (uint stride = 1; stride < get_local_size(0); stride *= 2) { 
                    barrier(CLK_LOCAL_MEM_FENCE); 
                    
                    uint index = 2 * stride * get_local_id(0);
                    if (index < get_local_size(0)) { 
                        partialDotProduct[index] += partialDotProduct[index + stride]; 
                    } 
                } 
                
                //Multiply by alpha and store the result.
                if (get_local_id(0) == 0) {
                    Y_data[yi] += alpha * partialDotProduct[0];
                    barrier(CLK_LOCAL_MEM_FENCE); 
                }   
            }
        }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)
    
    #Make the global size the closest multiple of the group size (ceiling)
    y_size = int(math.ceil(len(Y) / float(group_size))) * group_size
    gsize = (y_size,)
    lsize = (group_size,)
    
    _fn = cl.Program(queue.context, text).build().fn
    full_args = (cl_Ns,
                 cl_alpha,
                 A.cl_starts,
                 A.cl_buf,
                 A_js.cl_starts,
                 A_js.cl_lens,
                 A_js.cl_buf,
                 X.cl_starts,
                 X.cl_buf,
                 X_js.cl_starts,
                 X_js.cl_buf,
                 cl_beta,
                 Y_in.cl_starts,
                 Y_in.cl_buf,
                 Y.cl_starts,
                 Y.cl_lens,
                 Y.cl_buf,
                )

    _fn.set_args(*[arr.data for arr in full_args])
    
    rval = Plan(queue, _fn, gsize, lsize,
                name='ref_parallel_ragged_gather_gemv',
                tag=tag,
               )
    # prevent garbage-collection
    rval.alpha = cl_alpha
    rval.beta = cl_beta
    rval.Ns = cl_Ns
    return rval
Exemple #2
0
def plan_ragged_gather_gemv(queue,
                            Ms,
                            Ns,
                            alpha,
                            A,
                            A_js,
                            X,
                            X_js,
                            beta,
                            Y,
                            Y_in=None,
                            tag=None):
    """
    """
    # TODO: if alpha or beta is a float
    #       then render it into the kernel text.
    try:
        float(alpha)
        alpha = [alpha] * len(Y)
    except TypeError:
        pass

    try:
        float(beta)
        beta = [beta] * len(Y)
    except TypeError:
        pass

    cl_alpha = to_device(queue, np.asarray(alpha, Y.buf.dtype))
    cl_beta = to_device(queue, np.asarray(beta, Y.buf.dtype))

    if Y_in is None:
        Y_in = Y

    # XXX check for e.g. all Ns being the same thing
    #     especially all Ns == 1
    cl_Ns = to_device(queue, np.asarray(Ns, 'int32'))

    # XXX check that all the ints are ints not longs
    textconf = {
        'type_alpha': cl_alpha.ocldtype,
        'type_beta': cl_beta.ocldtype,
        'type_A': A.cl_buf.ocldtype,
        'type_X': X.cl_buf.ocldtype,
        'type_Y': Y.cl_buf.ocldtype,
    }

    text = """
        __kernel void fn(
            __global int *Ns,
            __global ${type_alpha} * alphas,
            __global int *A_starts,
            __global ${type_A} *A_data,
            __global int *A_js_starts,
            __global int *A_js_lens,
            __global int *A_js_data,
            __global int *X_starts,
            __global ${type_X} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
            __global ${type_beta} * betas,
            __global int *Y_in_starts,
            __global ${type_Y} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_lens,
            __global ${type_Y} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = get_global_id(1);
            const int M = Y_lens[bb];
            if (mm < M)
            {
                const ${type_alpha} alpha = alphas[bb];
                const ${type_beta} beta = betas[bb];

                int n_dot_products = A_js_lens[bb];
                int y_offset = Y_starts[bb];
                int y_in_offset = Y_in_starts[bb];

                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                Y_data[y_offset + mm] = beta * Y_in_data[y_in_offset + mm];

                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    int x_ji = X_js_data[ii];
                    int a_ji = A_js_data[ii];
                    int N_i = Ns[a_ji];
                    int x_offset = X_starts[x_ji];
                    int a_offset = A_starts[a_ji];

                    // compute the matrix-vector product
                    // dot(X[x_ji], A[a_ji])
                    ${type_Y} y_sum = 0;
                    for (int nn = 0; nn < N_i; ++nn) //Parallel reduction. How big is N_i?
                    {
                        y_sum += X_data[x_offset + nn]
                        * A_data[a_offset + nn * M + mm];
                    }
                    Y_data[y_offset + mm] += alpha * y_sum;
                }
            }
        }
    """

    text = Template(text, output_encoding='ascii').render(**textconf)
    gsize = (
        int(max(Ms)),
        int(len(Y)),
    )
    lsize = None
    _fn = cl.Program(queue.context, text).build().fn
    full_args = (
        cl_Ns,
        cl_alpha,
        A.cl_starts,
        A.cl_buf,
        A_js.cl_starts,
        A_js.cl_lens,
        A_js.cl_buf,
        X.cl_starts,
        X.cl_buf,
        X_js.cl_starts,
        X_js.cl_buf,
        cl_beta,
        Y_in.cl_starts,
        Y_in.cl_buf,
        Y.cl_starts,
        Y.cl_lens,
        Y.cl_buf,
    )
    #print [str(arr.dtype)[0] for arr in full_args]
    _fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(
        queue,
        _fn,
        gsize,
        lsize,
        name='ref_ragged_gather_gemv',
        tag=tag,
    )
    # prevent garbage-collection
    rval.alpha = cl_alpha
    rval.beta = cl_beta
    rval.Ns = cl_Ns
    return rval
def plan_ragged_gather_gemv(queue, Ms, Ns, alpha, A, A_js, X, X_js,
                       beta, Y, Y_in=None, tag=None):
    """
    """
    # TODO: if alpha or beta is a float
    #       then render it into the kernel text.
    try:
        float(alpha)
        alpha = [alpha] * len(Y)
    except TypeError:
        pass

    try:
        float(beta)
        beta = [beta] * len(Y)
    except TypeError:
        pass

    cl_alpha = to_device(queue, np.asarray(alpha, Y.buf.dtype))
    cl_beta = to_device(queue, np.asarray(beta, Y.buf.dtype))

    if Y_in is None:
        Y_in = Y

    # XXX check for e.g. all Ns being the same thing
    #     especially all Ns == 1
    cl_Ns = to_device(queue, np.asarray(Ns, 'int32'))

    # XXX check that all the ints are ints not longs
    textconf = {
        'type_alpha': cl_alpha.ocldtype,
        'type_beta': cl_beta.ocldtype,
        'type_A': A.cl_buf.ocldtype,
        'type_X': X.cl_buf.ocldtype,
        'type_Y': Y.cl_buf.ocldtype,
    }

    text = """
        __kernel void fn(
            __global int *Ns,
            __global ${type_alpha} * alphas,
            __global int *A_starts,
            __global ${type_A} *A_data,
            __global int *A_js_starts,
            __global int *A_js_lens,
            __global int *A_js_data,
            __global int *X_starts,
            __global ${type_X} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
            __global ${type_beta} * betas,
            __global int *Y_in_starts,
            __global ${type_Y} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_lens,
            __global ${type_Y} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = get_global_id(1);
            const int M = Y_lens[bb];
            if (mm < M)
            {
                const ${type_alpha} alpha = alphas[bb];
                const ${type_beta} beta = betas[bb];

                int n_dot_products = A_js_lens[bb];
                int y_offset = Y_starts[bb];
                int y_in_offset = Y_in_starts[bb];

                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                Y_data[y_offset + mm] = beta * Y_in_data[y_in_offset + mm];

                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    int x_ji = X_js_data[ii];
                    int a_ji = A_js_data[ii];
                    int N_i = Ns[a_ji];
                    int x_offset = X_starts[x_ji];
                    int a_offset = A_starts[a_ji];

                    // compute the matrix-vector product
                    // dot(X[x_ji], A[a_ji])
                    ${type_Y} y_sum = 0;
                    for (int nn = 0; nn < N_i; ++nn) //Parallel reduction. How big is N_i?
                    {
                        y_sum += X_data[x_offset + nn]
                        * A_data[a_offset + nn * M + mm];
                    }
                    Y_data[y_offset + mm] += alpha * y_sum;
                }
            }
        }
    """

    text = Template(text, output_encoding='ascii').render(**textconf)
    gsize = (int(max(Ms)), int(len(Y)),)
    lsize = None
    _fn = cl.Program(queue.context, text).build().fn
    full_args = (cl_Ns,
                 cl_alpha,
                 A.cl_starts,
                 A.cl_buf,
                 A_js.cl_starts,
                 A_js.cl_lens,
                 A_js.cl_buf,
                 X.cl_starts,
                 X.cl_buf,
                 X_js.cl_starts,
                 X_js.cl_buf,
                 cl_beta,
                 Y_in.cl_starts,
                 Y_in.cl_buf,
                 Y.cl_starts,
                 Y.cl_lens,
                 Y.cl_buf,
                )
    #print [str(arr.dtype)[0] for arr in full_args]
    _fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(queue, _fn, gsize, lsize,
                name='ref_ragged_gather_gemv',
                tag=tag,
               )
    # prevent garbage-collection
    rval.alpha = cl_alpha
    rval.beta = cl_beta
    rval.Ns = cl_Ns
    return rval
Exemple #4
0
def plan_parallel_ragged_gather_gemv2(queue,
                                      Ms,
                                      Ns,
                                      alpha,
                                      A,
                                      A_js,
                                      X,
                                      X_js,
                                      beta,
                                      Y,
                                      group_size=32,
                                      Y_in=None,
                                      tag=None):
    """
    """

    # TODO: if alpha or beta is a float
    #       then render it into the kernel text.
    try:
        float(alpha)
        alpha = [alpha] * len(Y)
    except TypeError:
        pass

    try:
        float(beta)
        beta = [beta] * len(Y)
    except TypeError:
        pass

    cl_alpha = to_device(queue, np.asarray(alpha, Y.buf.dtype))
    cl_beta = to_device(queue, np.asarray(beta, Y.buf.dtype))

    if Y_in is None:
        Y_in = Y

    # XXX check for e.g. all Ns being the same thing
    #     especially all Ns == 1
    cl_Ns = to_device(queue, np.asarray(Ns, 'int32'))

    # XXX check that all the ints are ints not longs
    textconf = {
        'type_alpha': cl_alpha.ocldtype,
        'type_beta': cl_beta.ocldtype,
        'type_A': A.cl_buf.ocldtype,
        'type_X': X.cl_buf.ocldtype,
        'type_Y': Y.cl_buf.ocldtype,
        'y_len': len(Y),
        'lsize': group_size,
    }

    text = """
        __kernel void fn(
            const __global int *Ns,
            const __global ${type_alpha} * alphas,
            const __global int *A_starts,
            const __global ${type_A} *A_data,
            const __global int *A_js_starts,
            const __global int *A_js_lens,
            const __global int *A_js_data,
            const __global int *X_starts,
            const __global ${type_X} *X_data,
            const __global int *X_js_starts,
            const __global int *X_js_data,
            const __global ${type_beta} * betas,
            const __global int *Y_in_starts,
            const __global ${type_Y} *Y_in_data,
            const __global int *Y_starts,
            const __global int *Y_lens,
            __global ${type_Y} *Y_data)
        {
            //const int mm = get_global_id(1); //TODO
            
            
            __local ${type_Y} partialDotProduct[${lsize}]; //Scratch space for the dot products

            //Y is divided into groups of size group_size. Each work-item does enough dot-products to cover one of the groups
            for (uint yi = get_group_id(0); yi < ${y_len}; yi += get_num_groups(0)) {

                const __global int* X_js_row = X_js_data + X_js_starts[yi];
                const __global int* A_js_row = A_js_data + A_js_starts[yi];
                
                const ${type_alpha} alpha = alphas[yi];
                const ${type_beta} beta = betas[yi];
                
                int y_offset = Y_starts[yi];
                int y_in_offset = Y_in_starts[yi];
                Y_data[y_offset] = beta * Y_in_data[y_in_offset];
                
                float sum = 0; 
                int n_dot_products = A_js_lens[yi]; //Do all of xjs dot products at same time
                for(int j = 0; j < n_dot_products; j++) {
                    int x_ji = X_js_row[j];
                    int a_ji = A_js_row[j];
                    int N_i = Ns[a_ji];
                              
                    const __global ${type_A}* A_row = A_data + A_starts[a_ji]; //Get the rows for the product
                    const __global ${type_X}* X_row = X_data + X_starts[x_ji];

                    //Each work item will do some fraction of the multiplications and store the result locally
                    for (uint x = get_local_id(0); x < N_i; x += get_local_size(0)) {
                        sum += A_row[x] * X_row[x]; 
                    }
                }
                partialDotProduct[get_local_id(0)] = sum;
                
                //Parallel reduction of locally stored sums
                for (uint stride = 1; stride < get_local_size(0); stride *= 2) { 
                    barrier(CLK_LOCAL_MEM_FENCE); 
                    
                    uint index = 2 * stride * get_local_id(0);
                    if (index < get_local_size(0)) { 
                        partialDotProduct[index] += partialDotProduct[index + stride]; 
                    } 
                } 
                
                //Multiply by alpha and store the result.
                if (get_local_id(0) == 0) {
                    Y_data[yi] += alpha * partialDotProduct[0];
                    barrier(CLK_LOCAL_MEM_FENCE); 
                }   
            }
        }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)

    #Make the global size the closest multiple of the group size (ceiling)
    y_size = int(math.ceil(len(Y) / float(group_size))) * group_size
    gsize = (y_size, )
    lsize = (group_size, )

    _fn = cl.Program(queue.context, text).build().fn
    full_args = (
        cl_Ns,
        cl_alpha,
        A.cl_starts,
        A.cl_buf,
        A_js.cl_starts,
        A_js.cl_lens,
        A_js.cl_buf,
        X.cl_starts,
        X.cl_buf,
        X_js.cl_starts,
        X_js.cl_buf,
        cl_beta,
        Y_in.cl_starts,
        Y_in.cl_buf,
        Y.cl_starts,
        Y.cl_lens,
        Y.cl_buf,
    )

    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(
        queue,
        _fn,
        gsize,
        lsize,
        name='ref_parallel_ragged_gather_gemv',
        tag=tag,
    )
    # prevent garbage-collection
    rval.alpha = cl_alpha
    rval.beta = cl_beta
    rval.Ns = cl_Ns
    return rval