def plan_direct(queue, code, init, Xname, X, Y, tag=None):
    """Build a Plan that runs user-generated OpenCL code elementwise.

    One work-item is launched per ragged-array entry ``n``.  Each item
    binds a read pointer to ``X[n]`` (visible inside the kernel under the
    name ``Xname``) and a write pointer to ``Y[n]``, then executes the
    user-supplied ``init`` declarations followed by the ``code`` body.

    Parameters
    ----------
    queue :
        Command queue; ``queue.context`` is used to build the program
        (presumably a pyopencl CommandQueue — inferred from usage).
    code : str
        OpenCL statements forming the body of the kernel.
    init : str
        OpenCL declarations inserted before ``code``.
    Xname : str
        Identifier under which the input pointer is exposed to ``code``.
    X, Y :
        Ragged arrays of equal length with ``cl_starts``/``cl_buf``
        attributes (CLRaggedArray-like — TODO confirm exact type).
    tag : optional
        Debug tag forwarded to the Plan.

    NOTE(review): this definition is immediately shadowed by an identical
    redefinition below; only the later one takes effect at import time.
    """
    from . import ast_conversion
    assert len(X) == len(Y)
    N = len(X)
    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global const int *${IN}starts,
            __global const ${INtype} *${IN}data,
            __global const int *${OUT}starts,
            __global ${OUTtype} *${OUT}data
        )
        {
            const int n = get_global_id(0);
            if (n >= ${N}) return;

            __global const ${INtype} *${arg} = ${IN}data + ${IN}starts[n];
            __global ${OUTtype} *${OUT} = ${OUT}data + ${OUT}starts[n];

            /////vvvvv USER DECLARATIONS BELOW vvvvv
${init}
            /////vvvvv USER COMPUTATIONS BELOW vvvvv
${code}
            // END OF FUNC: put nothing after user code, since it can return
        }
        """
    textconf = dict(
        init=_indent(init, 12),
        code=_indent(code, 12),
        N=N,
        arg=Xname,
        IN=ast_conversion.INPUT_NAME,
        INtype=X.cl_buf.ocldtype,
        OUT=ast_conversion.OUTPUT_NAME,
        OUTtype=Y.cl_buf.ocldtype,
    )
    text = Template(text, output_encoding='ascii').render(**textconf)

    full_args = (X.cl_starts, X.cl_buf, Y.cl_starts, Y.cl_buf)
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    gsize = (N, )
    rval = Plan(queue, _fn, gsize, lsize=None, name="cl_direct", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    return rval
def plan_direct(queue, code, init, Xname, X, Y, tag=None):
    """Create a Plan executing user-supplied OpenCL statements per element.

    Launches one work-item for each of the ``len(X)`` ragged-array
    entries.  Inside the kernel, the input pointer is visible under
    ``Xname`` and the output pointer under the ast_conversion output
    name; ``init`` and then ``code`` are spliced in verbatim.
    """
    from . import ast_conversion

    assert len(X) == len(Y)
    N = len(X)

    kernel_src = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global const int *${IN}starts,
            __global const ${INtype} *${IN}data,
            __global const int *${OUT}starts,
            __global ${OUTtype} *${OUT}data
        )
        {
            const int n = get_global_id(0);
            if (n >= ${N}) return;

            __global const ${INtype} *${arg} = ${IN}data + ${IN}starts[n];
            __global ${OUTtype} *${OUT} = ${OUT}data + ${OUT}starts[n];

            /////vvvvv USER DECLARATIONS BELOW vvvvv
${init}
            /////vvvvv USER COMPUTATIONS BELOW vvvvv
${code}
            // END OF FUNC: put nothing after user code, since it can return
        }
        """

    # Substitutions for the Mako template above.
    config = {
        'init': _indent(init, 12),
        'code': _indent(code, 12),
        'N': N,
        'arg': Xname,
        'IN': ast_conversion.INPUT_NAME,
        'INtype': X.cl_buf.ocldtype,
        'OUT': ast_conversion.OUTPUT_NAME,
        'OUTtype': Y.cl_buf.ocldtype,
    }
    rendered = Template(kernel_src, output_encoding='ascii').render(**config)

    full_args = (X.cl_starts, X.cl_buf, Y.cl_starts, Y.cl_buf)
    kernel = cl.Program(queue.context, rendered).build().fn
    kernel.set_args(*[buf.data for buf in full_args])

    plan = Plan(queue, kernel, (N,), lsize=None, name="cl_direct", tag=tag)
    plan.full_args = full_args  # prevent garbage-collection
    return plan
def _plan_template(queue, name, core_text, declares="", tag=None, n_elements=0, inputs={}, outputs={}, parameters={}): """Template for making a plan for vector nonlinearities. This template assumes that all inputs and outputs are vectors. Parameters ---------- n_elements: int If n_elements == 0, then the kernels are allocated as a block. This is simple, but can be slow for large computations where input vector sizes are not uniform (e.g. one large population and many small ones). If n_elements >= 1, then all the vectors in the RaggedArray are flattened so that the exact number of required kernels is allocated. Each kernel performs computations for `n_elements` elements. inputs: dictionary of CLRaggedArrays Inputs to the function. RaggedArrays must be a list of vectors. outputs: dictionary of CLRaggedArrays Outputs of the function. RaggedArrays must be a list of vectors. parameters: dictionary of CLRaggedArrays Parameters to the function. Each RaggedArray element must be a vector of the same length of the inputs, or a scalar (to be broadcasted). Providing a float instead of a RaggedArray makes that parameter constant. """ base = inputs.values()[0] # input to use as reference (for lengths) N = len(base) ### split parameters into static and updated params static_params = {} # static params (hard-coded) params = {} # variable params (updated) for k, v in parameters.items(): if isinstance(v, CLRaggedArray): params[k] = v else: try: static_params[k] = ("float", float(v)) except TypeError: raise avars = {} for vname, v in inputs.items() + outputs.items(): assert vname not in avars, "Name clash" assert len(v) == N assert all_equal(v.shape0s, base.shape0s) ### N.B. 
- we should be able to ignore ldas as long as all vectors assert all_equal(v.shape1s, 1) dtype = v.cl_buf.ocldtype offset = "%(name)s_starts[n]" % {"name": vname} avars[vname] = (dtype, offset) for vname, v in params.items(): assert vname not in avars, "Name clash" assert len(v) == N for i in xrange(N): assert v.shape0s[i] == base.shape0s[i] or v.shape0s[i] == 1, "%s.shape0s[%d] must be 1 or %d (not %d)" % ( vname, i, base.shape0s[i], v.shape0s[i], ) assert v.shape1s[i] == 1 dtype = v.cl_buf.ocldtype offset = "%(name)s_starts[n]" % {"name": vname} avars[vname] = (dtype, offset) ivars = dict((k, avars[k]) for k in inputs.keys()) ovars = dict((k, avars[k]) for k in outputs.keys()) pvars = dict((k, avars[k]) for k in params.keys()) textconf = dict( N=N, n_elements=n_elements, tag=str(tag), declares=declares, core_text=core_text, ivars=ivars, ovars=ovars, pvars=pvars, static_params=static_params, ) if n_elements > 0: ### Allocate the exact number of required kernels in a vector gsize = (int(np.ceil(np.sum(base.shape0s) / float(n_elements))),) text = """ ////////// MAIN FUNCTION ////////// __kernel void fn( % for name, [type, offset] in ivars.items(): __global const int *${name}_starts, __global const ${type} *in_${name}, % endfor % for name, [type, offset] in ovars.items(): __global const int *${name}_starts, __global ${type} *in_${name}, % endfor % for name, [type, offset] in pvars.items(): __global const int *${name}_starts, __global const int *${name}_shape0s, __global const ${type} *in_${name}, % endfor __global const int *lengths ) { const int gid = get_global_id(0); int m = gid * ${n_elements}, n = 0; while (m >= lengths[n]) { m -= lengths[n]; n++; } if (n >= ${N}) return; % for name, [type, offset] in ivars.items(): __global const ${type} *cur_${name} = in_${name} + ${offset} + m; % endfor % for name, [type, offset] in ovars.items(): __global ${type} *cur_${name} = in_${name} + ${offset} + m; % endfor % for name, [type, offset] in pvars.items(): __global const 
${type} *cur_${name} = in_${name} + ${offset}; int ${name}_isvector = ${name}_shape0s[n] > 1; if (${name}_isvector) cur_${name} += m; % endfor % for name, [type, offset] in ivars.items() + ovars.items() + pvars.items(): ${type} ${name}; % endfor % for name, [type, value] in static_params.items(): const ${type} ${name} = ${value}; % endfor ////////////////////////////////////////////////// //vvvvv USER DECLARATIONS BELOW vvvvv ${declares} //^^^^^ USER DECLARATIONS ABOVE ^^^^^ ////////////////////////////////////////////////// % for ii in range(n_elements): ////////////////////////////////////////////////// ////////// LOOP ITERATION ${ii} % for name, [type, offset] in ivars.items(): ${name} = *cur_${name}; % endfor % for name, [type, offset] in pvars.items(): if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name}; % endfor /////vvvvv USER COMPUTATIONS BELOW vvvvv ${core_text} /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^ % for name, [type, offset] in ovars.items(): *cur_${name} = ${name}; % endfor % if ii + 1 < n_elements: m++; if (m >= lengths[n]) { n++; m = 0; if (n >= ${N}) return; % for name, [type, offset] in ivars.items() + ovars.items() + pvars.items(): cur_${name} = in_${name} + ${offset}; % endfor % for name, [type, offset] in pvars.items(): ${name}_isvector = ${name}_shape0s[n] > 1; if (!${name}_isvector) ${name} = *cur_${name}; % endfor } else { % for name, [type, offset] in ivars.items() + ovars.items(): cur_${name}++; % endfor % for name, [type, offset] in pvars.items(): if (${name}_isvector) cur_${name}++; % endfor } % endif % endfor } """ else: ### Allocate more than enough kernels in a matrix gsize = (int(np.max(base.shape0s)), int(N)) text = """ ////////// MAIN FUNCTION ////////// __kernel void fn( % for name, [type, offset] in ivars.items(): __global const int *${name}_starts, __global const ${type} *in_${name}, % endfor % for name, [type, offset] in ovars.items(): __global const int *${name}_starts, __global ${type} *in_${name}, % endfor % for 
name, [type, offset] in pvars.items(): __global const int *${name}_starts, __global const int *${name}_shape0s, __global const ${type} *in_${name}, % endfor __global const int *lengths ) { const int m = get_global_id(0); const int n = get_global_id(1); const int M = lengths[n]; if (m >= M) return; % for name, [type, offset] in ivars.items(): ${type} ${name} = in_${name}[${offset} + m]; % endfor % for name, [type, offset] in ovars.items(): ${type} ${name}; % endfor % for name, [type, offset] in pvars.items(): const ${type} ${name} = (${name}_shape0s[n] > 1) ? in_${name}[${offset} + m] : in_${name}[${offset}]; % endfor % for name, [type, value] in static_params.items(): const ${type} ${name} = ${value}; % endfor ////////////////////////////////////////////////// //vvvvv USER DECLARATIONS BELOW vvvvv ${declares} //^^^^^ USER DECLARATIONS ABOVE ^^^^^ ////////////////////////////////////////////////// /////vvvvv USER COMPUTATIONS BELOW vvvvv ${core_text} /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^ % for name, [type, offset] in ovars.items(): in_${name}[${offset} + m] = ${name}; % endfor } """ text = Template(text, output_encoding="ascii").render(**textconf) if 0: for i, line in enumerate(text.split("\n")): print "%3d %s" % (i + 1, line) full_args = [] for name, v in inputs.items() + outputs.items(): full_args.extend([v.cl_starts, v.cl_buf]) for name, v in params.items(): full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf]) full_args.append(base.cl_shape0s) full_args = tuple(full_args) _fn = cl.Program(queue.context, text).build().fn _fn.set_args(*[arr.data for arr in full_args]) rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag) rval.full_args = full_args # prevent garbage-collection return rval
def plan_probes(queue, periods, X, Y, tag=None):
    """Build a Plan that periodically copies probe signals into buffers.

    Each probe ``n`` carries a countdown; when it reaches zero the vector
    ``X[n]`` is copied into the next free row of the buffer ``Y[n]`` and
    the countdown is reset to ``periods[n] - 1``.  Device-side countdown
    and buffer-position state is allocated here and attached to the
    returned Plan.

    Parameters
    ----------
    periods : sequence of ints
        The period (in time-steps) of each probe.
    X, Y :
        Ragged arrays (CLRaggedArray-like — inferred from usage);
        ``X[i]`` is a vector and ``Y[i]`` a (buf_len, ndims[i]) buffer.

    NOTE(review): this definition is shadowed by an identical
    redefinition below; only the later one takes effect at import time.
    """
    assert len(X) == len(Y)
    assert len(X) == len(periods)
    N = len(X)

    # Per-probe device state: steps until next copy, and write position.
    cl_countdowns = to_device(queue, np.zeros(N, dtype="int32"))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype="int32"))
    cl_periods = to_device(queue, np.asarray(periods, dtype="int32"))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype

    ### N.B.  X[i].shape = (ndims[i], )
    ###       Y[i].shape = (buf_ndims[i], buf_len)
    for i in xrange(N):
        assert X.shape0s[i] == Y.shape1s[i]
        assert X.shape1s[i] == 1
        assert X.stride0s[i] == 1
        assert Y.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0) {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0) {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """
    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding="ascii").render(**textconf)

    full_args = (cl_countdowns, cl_bufpositions, cl_periods,
                 X.cl_starts, X.cl_shape0s, X.cl_buf,
                 Y.cl_starts, Y.cl_buf)
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    # One work-group per probe; cap its size at the device limit.
    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (max_len, N)
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval
def _plan_template(queue, name, core_text, declares="", tag=None,
                   n_elements=0, inputs={}, outputs={}, parameters={}):
    """Template for making a plan for vector nonlinearities.

    This template assumes that all inputs and outputs are vectors.

    Parameters
    ----------
    n_elements: int
        If n_elements == 0, then the kernels are allocated as a block.
        This is simple, but can be slow for large computations where
        input vector sizes are not uniform (e.g. one large population
        and many small ones).

        If n_elements >= 1, then all the vectors in the RaggedArray are
        flattened so that the exact number of required kernels is
        allocated.  Each kernel performs computations for `n_elements`
        elements.

    inputs: dictionary of CLRaggedArrays
        Inputs to the function.  RaggedArrays must be a list of vectors.

    outputs: dictionary of CLRaggedArrays
        Outputs of the function.  RaggedArrays must be a list of vectors.

    parameters: dictionary of CLRaggedArrays
        Parameters to the function.  Each RaggedArray element must be a
        vector of the same length of the inputs, or a scalar (to be
        broadcasted).  Providing a float instead of a RaggedArray makes
        that parameter constant.

    NOTE(review): this redefinition shadows the earlier `_plan_template`
    in this file; this copy is the one in effect.  The dict defaults are
    only read, never mutated, so the mutable-default hazard is latent.
    """
    base = inputs.values()[0]  # input to use as reference (for lengths)
    N = len(base)

    ### split parameters into static and updated params
    static_params = {}  # static params (hard-coded into the kernel text)
    params = {}  # variable params (updated at runtime from buffers)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        else:
            try:
                static_params[k] = ('float', float(v))
            except TypeError:
                raise

    # Collect (dtype, offset-expression) for every kernel argument name.
    avars = {}
    for vname, v in inputs.items() + outputs.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert all_equal(v.shape0s, base.shape0s)
        ### N.B. - we should be able to ignore ldas as long as all vectors
        assert all_equal(v.shape1s, 1)
        dtype = v.cl_buf.ocldtype
        offset = '%(name)s_starts[n]' % {'name': vname}
        avars[vname] = (dtype, offset)
    for vname, v in params.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        for i in xrange(N):
            # each parameter is either per-element or scalar-broadcast
            assert v.shape0s[i] == base.shape0s[i] or v.shape0s[i] == 1, \
                "%s.shape0s[%d] must be 1 or %d (not %d)" % \
                (vname, i, base.shape0s[i], v.shape0s[i])
            assert v.shape1s[i] == 1
        dtype = v.cl_buf.ocldtype
        offset = '%(name)s_starts[n]' % {'name': vname}
        avars[vname] = (dtype, offset)

    ivars = dict((k, avars[k]) for k in inputs.keys())
    ovars = dict((k, avars[k]) for k in outputs.keys())
    pvars = dict((k, avars[k]) for k in params.keys())

    textconf = dict(N=N, n_elements=n_elements, tag=str(tag),
                    declares=declares, core_text=core_text,
                    ivars=ivars, ovars=ovars, pvars=pvars,
                    static_params=static_params)

    if n_elements > 0:
        ### Allocate the exact number of required kernels in a vector
        gsize = (int(np.ceil(np.sum(base.shape0s) / float(n_elements))), )
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int gid = get_global_id(0);
            int m = gid * ${n_elements}, n = 0;
            while (m >= lengths[n]) {
                m -= lengths[n];
                n++;
            }
            if (n >= ${N}) return;

% for name, [type, offset] in ivars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in ovars.items():
            __global ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in pvars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset};
            int ${name}_isvector = ${name}_shape0s[n] > 1;
            if (${name}_isvector) cur_${name} += m;
% endfor
% for name, [type, offset] in ivars.items() + ovars.items() + pvars.items():
            ${type} ${name};
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////
% for ii in range(n_elements):
            //////////////////////////////////////////////////
            ////////// LOOP ITERATION ${ii}
% for name, [type, offset] in ivars.items():
            ${name} = *cur_${name};
% endfor
% for name, [type, offset] in pvars.items():
            if ((${ii} == 0) || ${name}_isvector)
                ${name} = *cur_${name};
% endfor

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            *cur_${name} = ${name};
% endfor
% if ii + 1 < n_elements:
            m++;
            if (m >= lengths[n]) {
                n++;
                m = 0;
                if (n >= ${N}) return;
% for name, [type, offset] in ivars.items() + ovars.items() + pvars.items():
                cur_${name} = in_${name} + ${offset};
% endfor
% for name, [type, offset] in pvars.items():
                ${name}_isvector = ${name}_shape0s[n] > 1;
                if (!${name}_isvector)
                    ${name} = *cur_${name};
% endfor
            } else {
% for name, [type, offset] in ivars.items() + ovars.items():
                cur_${name}++;
% endfor
% for name, [type, offset] in pvars.items():
                if (${name}_isvector) cur_${name}++;
% endfor
            }
% endif
% endfor
        }
        """
    else:
        ### Allocate more than enough kernels in a matrix
        gsize = (int(np.max(base.shape0s)), int(N))
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int m = get_global_id(0);
            const int n = get_global_id(1);
            const int M = lengths[n];
            if (m >= M) return;

% for name, [type, offset] in ivars.items():
            ${type} ${name} = in_${name}[${offset} + m];
% endfor
% for name, [type, offset] in ovars.items():
            ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
            const ${type} ${name} = (${name}_shape0s[n] > 1) ?
                in_${name}[${offset} + m] : in_${name}[${offset}];
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            in_${name}[${offset} + m] = ${name};
% endfor
        }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)
    if 0:
        # debug aid: dump the rendered kernel with line numbers
        for i, line in enumerate(text.split('\n')):
            print "%3d %s" % (i + 1, line)

    full_args = []
    for vname, v in inputs.items() + outputs.items():
        full_args.extend([v.cl_starts, v.cl_buf])
    for vname, v in params.items():
        full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
    full_args.append(base.cl_shape0s)
    full_args = tuple(full_args)

    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    return rval
def plan_probes(queue, periods, X, Y, tag=None):
    """Build a Plan that periodically copies probe signals into buffers.

    Each probe ``n`` carries a countdown; when it reaches zero the vector
    ``X[n]`` is copied into the next free row of the buffer ``Y[n]`` and
    the countdown is reset to ``periods[n] - 1``.  Device-side countdown
    and buffer-position state is allocated here and attached to the
    returned Plan.

    Parameters
    ----------
    periods : sequence of ints
        The period (in time-steps) of each probe.
    X, Y :
        Ragged arrays (CLRaggedArray-like — inferred from usage);
        ``X[i]`` is a vector and ``Y[i]`` a (buf_len, ndims[i]) buffer.

    NOTE(review): this redefinition shadows the earlier `plan_probes`
    in this file; this copy is the one in effect.
    """
    assert len(X) == len(Y)
    assert len(X) == len(periods)
    N = len(X)

    # Per-probe device state: steps until next copy, and write position.
    cl_countdowns = to_device(queue, np.zeros(N, dtype='int32'))
    cl_bufpositions = to_device(queue, np.zeros(N, dtype='int32'))
    cl_periods = to_device(queue, np.asarray(periods, dtype='int32'))

    assert X.cl_buf.ocldtype == Y.cl_buf.ocldtype

    ### N.B.  X[i].shape = (ndims[i], )
    ###       Y[i].shape = (buf_ndims[i], buf_len)
    for i in xrange(N):
        assert X.shape0s[i] == Y.shape1s[i]
        assert X.shape1s[i] == 1
        assert X.stride0s[i] == 1
        assert Y.stride1s[i] == 1

    text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
            __global int *countdowns,
            __global int *bufpositions,
            __global const int *periods,
            __global const int *Xstarts,
            __global const int *Xshape0s,
            __global const ${Xtype} *Xdata,
            __global const int *Ystarts,
            __global ${Ytype} *Ydata
        )
        {
            const int n = get_global_id(1);
            const int countdown = countdowns[n];

            if (countdown == 0) {
                const int n_dims = Xshape0s[n];
                __global const ${Xtype} *x = Xdata + Xstarts[n];
                const int bufpos = bufpositions[n];

                __global ${Ytype} *y = Ydata + Ystarts[n] + bufpos * n_dims;

                for (int ii = get_global_id(0);
                         ii < n_dims;
                         ii += get_global_size(0))
                {
                    y[ii] = x[ii];
                }
                // This should *not* cause deadlock because
                // all local threads guaranteed to be
                // in this branch together.
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0) {
                    countdowns[n] = periods[n] - 1;
                    bufpositions[n] = bufpos + 1;
                }
            }
            else {
                barrier(CLK_LOCAL_MEM_FENCE);
                if (get_global_id(0) == 0) {
                    countdowns[n] = countdown - 1;
                }
            }
        }
        """
    textconf = dict(N=N, Xtype=X.cl_buf.ocldtype, Ytype=Y.cl_buf.ocldtype)
    text = Template(text, output_encoding='ascii').render(**textconf)

    full_args = (
        cl_countdowns,
        cl_bufpositions,
        cl_periods,
        X.cl_starts,
        X.cl_shape0s,
        X.cl_buf,
        Y.cl_starts,
        Y.cl_buf,
    )
    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    # One work-group per probe; cap its size at the device limit.
    max_len = min(queue.device.max_work_group_size, max(X.shape0s))
    gsize = (
        max_len,
        N,
    )
    lsize = (max_len, 1)
    rval = Plan(queue, _fn, gsize, lsize=lsize, name="cl_probes", tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    rval.cl_bufpositions = cl_bufpositions
    rval.Y = Y
    return rval
def many_dots_impl(p, items):
    """GEMV plan specialized for many shallow dot products per target.

    Builds an OpenCL Plan computing, for each item, a sum of short inner
    products into Y.  `p` is the planner/problem object (provides clra_*/
    cl_*/float_* coefficients, geometry, queue — inferred from usage);
    `items` selects which outputs of the gemv problem to compute.
    Raises NotImplementedError for coefficient configurations this
    kernel does not support.
    """
    # target use case:
    # * several very shallow gemvs (short inner prods) into each target
    # * not all targets have the same size
    #p.print_geometry_summary(items, full=True)

    #
    # This algorithm is blocked out so that a work-group [i, j] computes
    # some segment of an output vector:
    # e.g. Y[i][ 32 * j : 32 * (j + 1)]
    #
    # This is done for two reasons:
    # - to increase occupancy when there are not so many vectors Y
    # - to handle long vectors Y

    #p.print_geometry_summary(items)

    # Unsupported coefficient forms: only scalar (float) alpha/gamma and
    # scalar or per-item (cl) beta are handled by this kernel.
    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    assert p.float_alpha is not None
    assert p.float_gamma is not None

    if p.A_js is None:
        # -- easy probably, but not done
        raise NotImplementedError()

    A_js_shape0s = p.A_js.shape0s
    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)

    #min_n_dots = min(A_js_shape0s)
    max_n_dots = max(A_js_shape0s)

    max_y_len = max(p.geometry[ii]['y_len'] for ii in items)
    MAX_SEGMENT_SIZE = 16  # tricky to tune?
    # A work-group covers `segment_size` consecutive Y elements across
    # `dot_block_size` concurrent dot products.
    segment_size = min(
        max_y_len,
        MAX_SEGMENT_SIZE)
    dot_block_size = min(
        max_n_dots,
        int(p.queue.device.max_work_group_size / segment_size),
    )

    n_segments = int(math.ceil(float(max_y_len) / segment_size))
    gsize = (n_segments * segment_size, dot_block_size, len(items))
    lsize = (segment_size, dot_block_size, 1)

    textconf.update({
        'gsize': gsize,
        'lsize': lsize,
        'segment_size': segment_size,
        'dot_block_size': dot_block_size,
        'max_y_len': max_y_len,
        'n_locals': segment_size * dot_block_size,
        #'segment_idx': 'get_local_id(0)',
        #'dot_block_idx': 'get_local_id(1)',
        'segment_idx': 'segment_idx',
        'dot_block_idx': 'dot_block_idx',
    })
    if 0:
        # debug aid: dump the template configuration
        for k, v in textconf.items():
            print k, v
    textconf.update(p.__dict__)

#    print 'float_gamma', textconf['float_gamma']
#    print 'cl_gamma', textconf['cl_gamma']
#    print 'clra_gamma', textconf['clra_gamma']

    text = """
        __kernel void fn(
            const __global int *gstructure,
            const __global ${A.cl_buf.ocldtype} *A_data,
            const __global ${X.cl_buf.ocldtype} *X_data,
            % if cl_beta is not None:
            const __global ${cl_beta.ocldtype} * betas,
            % endif
            const __global ${Y_in.cl_buf.ocldtype} *Y_in_data,
            __global ${Y.cl_buf.ocldtype} *Y_data)
    {
        __local int lstructure[${n_structure_vars}];
        __local ${Y.cl_buf.ocldtype} y_sum_pre[${segment_size}];
        __local ${Y.cl_buf.ocldtype} \
            y_sum_post[${dot_block_size}][${segment_size}];
        const int local_idx = get_local_id(0) \
            + get_local_id(1) * get_local_size(0);

        int segment_idx = get_local_id(0);
        int dot_block_idx = get_local_id(1);

        for (int ii = local_idx;
                 ii < ${n_structure_vars};
                 ii += ${n_locals})
        {
            lstructure[ii] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + ii];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if (get_global_id(0) < ${y_len})
        {
            if (dot_block_idx == 0)
            {
    % if float_beta is not None and float_beta != 0 :
                y_sum_pre[segment_idx] = ${float_beta}
                    * Y_in_data[${y_in_starts} + get_global_id(0)];
    % elif cl_beta is not None:
                y_sum_pre[segment_idx] = betas[${bb}]
                    * Y_in_data[${y_in_starts} + get_global_id(0)];
    % else :
                y_sum_pre[segment_idx] = 0;
    % endif
    % if float_gamma is not None:
        % if float_gamma != 0:
                y_sum_pre[segment_idx] += ${float_gamma};
        % endif
    % endif
            }
        //printf("betaY + gamma=%f\\n", y_sum_pre[segment_idx]);

            // XXX Move X into shared memory first
            y_sum_post[dot_block_idx][segment_idx] = 0;
            for (int ii = dot_block_idx;
                     ii < ${n_dot_products};
                     ii += ${dot_block_size})
            {
                for (int nn = 0; nn < ${N_i}; nn += 1)
                {
                    y_sum_post[dot_block_idx][segment_idx] +=
                        A_data[${a_starts} + get_global_id(0) * ${a_s0} + nn]
                        * X_data[${x_starts} + nn];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        //printf("AX=%f\\n", y_sum_post[dot_block_idx][segment_idx]);
        if ((get_global_id(0) < ${y_len}) && (dot_block_idx == 0))
        {
            for (int ii = 1; ii < ${dot_block_size}; ++ii)
            {
                y_sum_post[0][segment_idx] += y_sum_post[ii][segment_idx];
            }
            Y_data[${y_offset} + get_global_id(0)] = y_sum_pre[segment_idx]
                + ${float_alpha} * y_sum_post[0][segment_idx];
            //printf("Yout=%f\\n", Y_data[${y_offset} + get_global_id(0)]);
        }
    }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)

    fn = cl.Program(p.queue.context, text).build().fn

    full_args = [
        cl_gstructure,
        p.A.cl_buf,
        p.X.cl_buf,
    ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    full_args += [
        p.Y_in.cl_buf,
        p.Y.cl_buf,
    ]

    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize,
                name='clra_gemv.many_dots_impl',
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items),
                )
    rval.full_args = full_args  # prevent GC the args
    return rval
def reduce_impl(p, items,
                group_size=None,
                segment_size=None,
                ):
    """GEMV plan specialized for long inner products, few dots per item.

    Each work-group computes a small number of gemv outputs via a
    tree-structured parallel reduction over the inner-product dimension.
    `p` is the planner/problem object (provides clra_*/cl_*/float_*
    coefficients, geometry, queue — inferred from usage); `items`
    selects which outputs of the gemv problem to compute.  `group_size`
    (reduction width) and `segment_size` (Y elements per work-group) may
    be forced by the caller; otherwise heuristics pick them.
    """
    #
    # Target use case: long inner products, small numbers of dots.
    #
    # Approach: each work-group computes a small number of gemv outputs
    #

    # Unsupported coefficient forms: only scalar (float) alpha/gamma and
    # scalar or per-item (cl) beta are handled by this kernel.
    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()
    if p.clra_beta is not None:
        raise NotImplementedError()
    if p.cl_alpha is not None:
        raise NotImplementedError()
    if p.cl_gamma is not None:
        raise NotImplementedError()
    if not all(s == 1 for s in p.A.stride1s):
        raise NotImplementedError()

    assert p.float_alpha is not None
    assert p.float_gamma is not None

    cl_gstructure, textconf = p.cl_geometry_and_textconf(items)
    max_n_dots = max([len(p.geometry[ii]['dots']) for ii in items])
    max_reduce_len = max(max([gg['a_shape1']
                              for gg in p.geometry[ii]['dots']])
                         for ii in items)
    max_y_len = max([p.geometry[ii]['y_len'] for ii in items])

    # segment means the piece of Y written by a work-group
    # group_size is the number of values that we're reducing over

    if len(items) < 4:
        if group_size is None:
            group_size = 32  # XXX
        if segment_size is None:
            segment_size = min(max_y_len, 2)  # XXX
    else:
        if group_size is None:
            group_size = 32  # XXX
        if segment_size is None:
            segment_size = min(max_y_len, 4)  # XXX
    g_segments = int(math.ceil(float(max_y_len) / segment_size))
    gsize = (group_size, g_segments * segment_size, len(items))
    lsize = (group_size, segment_size, 1)

    # N_cutoff rounds the reduce length up to a multiple of group_size so
    # every thread makes the same number of loop trips (barriers align).
    max_reduce_iters = int(math.ceil(float(max_reduce_len) / group_size))
    textconf.update({
        'n_items' : len(items),
        'gsize' : gsize,
        'segment_size': segment_size,
        'max_y_len': max_y_len,
        'group_size' : group_size,
        'local_count': group_size * segment_size,
        'max_reduce_len': max_reduce_len,
        'N_cutoff': max_reduce_iters * group_size,
        'max_n_dots': max_n_dots,
        })
    if 0:
        # debug aid: dump the template configuration
        for k, v in textconf.items():
            print k, v
    textconf.update(p.__dict__)

    text = """
        __kernel void fn(
            const __global int *gstructure,
            const __global ${A.cl_buf.ocldtype} *A_data,
            const __global ${X.cl_buf.ocldtype} *X_data,
            % if cl_beta is not None:
            const __global ${cl_beta.ocldtype} * betas,
            % endif
            const __global ${Y_in.cl_buf.ocldtype} *Y_in_data,
            __global ${Y.cl_buf.ocldtype} *Y_data)
    {
        __local int lstructure[${n_structure_vars}];
    % if segment_size > 1:
        // we'll cache X in shared memory so we load it only once
        // for the whole segment
        __local ${X.cl_buf.ocldtype} lX[${group_size}];
    % endif
        //Scratch space for the dot products
        __local ${Y.cl_buf.ocldtype}
            partialDotProduct[${segment_size}][${group_size}];
        __local ${Y.cl_buf.ocldtype}
            y_sum_pre[${segment_size}];
        const int local_idx = get_local_id(0)
            + get_local_id(1) * get_local_size(0);

        // load structure
    % if local_count < n_structure_vars:
        for (int ii = local_idx;
                 ii < ${n_structure_vars};
                 ii += ${local_count})
        {
            lstructure[ii] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + ii];
        }
    % else :
        if (local_idx < ${n_structure_vars})
        {
            lstructure[local_idx] = gstructure[
                get_global_id(2) * ${structure_vars_stride} + local_idx];
        }
    % endif
        barrier(CLK_LOCAL_MEM_FENCE);

        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len}))
        {
    % if float_beta is not None and float_beta != 0 :
            y_sum_pre[get_local_id(1)] = ${float_beta}
                * Y_in_data[${y_in_starts} + get_global_id(1)];
    % elif cl_beta is not None:
            y_sum_pre[get_local_id(1)] = betas[${bb}]
                * Y_in_data[${y_in_starts} + get_global_id(1)];
    % else :
            y_sum_pre[get_local_id(1)] = 0;
    % endif
    % if float_gamma is not None and float_gamma != 0:
            y_sum_pre[get_local_id(1)] += ${float_gamma};
    % endif
        // printf("betaY + gamma=%f\\n", y_sum_pre[get_local_id(1)]);
        }

        partialDotProduct[get_local_id(1)][get_local_id(0)] = 0;
    % if max_n_dots > 1:
        for (int ii = 0;
                 ii < ${n_dot_products};
                 ii += 1)
        {
    % else:
        const int ii = 0;
    % endif

        for (int nn = get_local_id(0);
                 nn < ${N_cutoff};
                 nn += get_local_size(0))
        {
        // segment_size = ${segment_size}
    % if (segment_size == 1):
            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
            {
                partialDotProduct[get_local_id(1)][get_local_id(0)] +=
                    A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
                    * X_data[${x_starts} + nn];
            }
    % else:
            barrier(CLK_LOCAL_MEM_FENCE);
            if ((get_local_id(1) == 0) && (nn < ${N_i}))
            {
                lX[get_local_id(0)] = X_data[${x_starts} + nn];
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if ((nn < ${N_i}) && (get_global_id(1) < ${y_len}))
            {
                partialDotProduct[get_local_id(1)][get_local_id(0)] +=
                    A_data[${a_starts} + get_global_id(1) * ${a_s0} + nn]
                    * lX[get_local_id(0)];
            }
    % endif
        }

    % if (max_n_dots > 1):
        }
    % endif

        // -- Parallel reduction long work-group dimension 0
        for (uint stride = 1;
                  stride < get_local_size(0);
                  stride *= 2)
        {
            barrier(CLK_LOCAL_MEM_FENCE);

            uint index = 2 * stride * get_local_id(0);
            if (index + stride < get_local_size(0))
            {
                partialDotProduct[get_local_id(1)][index] +=
                    partialDotProduct[get_local_id(1)][index + stride];
            }
        }
        // barrier(CLK_LOCAL_MEM_FENCE);
        if ((get_local_id(0) == 0) && (get_global_id(1) < ${y_len})) {
            Y_data[${y_offset} + get_global_id(1)] = y_sum_pre[get_local_id(1)]
                + ${float_alpha} * partialDotProduct[get_local_id(1)][0];
        }
    }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)

    fn = cl.Program(p.queue.context, text).build().fn

    full_args = [
                 cl_gstructure,
                 p.A.cl_buf,
                 p.X.cl_buf,
                 ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    full_args += [
                 p.Y_in.cl_buf,
                 p.Y.cl_buf,
                ]

    fn.set_args(*[arr.data for arr in full_args])
    rval = Plan(p.queue, fn, gsize, lsize,
                name='clra_gemv.reduce_impl',
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items),
                )
    rval.full_args = full_args  # prevent GC the args
    return rval
def ref_impl(p, items):
    """
    Return an OpenCL Plan to calculate elements `items` of
    gemv operation `p`.

    In this reference implementation, we create a work item
    per output number, or more specifically, a work grid
    of (max_y_len, len(items)).  Each work item loops over the
    dot products and the elements within each dot product to
    compute the output value Y[global_id(1)][global_id(0)].

    Parameters
    ----------
    p : gemv plan/problem object
        Carries the ragged arrays (A, X, Y, Y_in, index lists) and the
        alpha/beta/gamma coefficients (float_*, cl_*, clra_* variants).
    items : sequence of int
        Indices of the output vectors to compute.

    Returns
    -------
    Plan
        A callable plan whose kernel arguments are kept alive via
        `rval.full_args`.

    Raises
    ------
    NotImplementedError
        If per-element (ragged) alpha or gamma coefficients are requested;
        only scalar / per-vector coefficients are supported here.
    """
    if p.clra_alpha is not None:
        raise NotImplementedError()
    if p.clra_gamma is not None:
        raise NotImplementedError()

    # Item indices travel to the device as a plain int32 vector.
    cl_items = to_device(p.queue, np.asarray(items, dtype='int32'))

    # The kernel assumes unit inner strides everywhere; the templated
    # indexing below only applies stride0s.
    assert all(s == 1 for s in p.A.stride1s)
    assert all(s == 1 for s in p.X.stride1s)
    assert all(s == 1 for s in p.Y.stride0s)
    assert all(s == 1 for s in p.Y.stride1s)
    assert all(s == 1 for s in p.Y_in.stride0s)
    assert all(s == 1 for s in p.Y_in.stride1s)

    # Mako template for the OpenCL kernel.  The `% if` control lines
    # specialize the argument list and coefficient handling at render
    # time, so unused buffers are never passed to the kernel.
    text = """
        __kernel void fn(
            __global int *items,
    % if cl_alpha is not None:
            __global ${cl_alpha.ocldtype} * alphas,
    % endif
    % if (A_js is not None):
            __global int *A_starts,
            __global int *A_shape1s,
            __global int *A_stride0s,
            __global ${A.cl_buf.ocldtype} *A_data,
            __global int *A_js_starts,
            __global int *A_js_shape0s,
            __global int *A_js_data,
            __global int *X_starts,
            __global int *X_stride0s,
            __global ${X.cl_buf.ocldtype} *X_data,
            __global int *X_js_starts,
            __global int *X_js_data,
    % endif
    % if cl_beta is not None:
            __global ${cl_beta.ocldtype} * betas,
    % endif
    % if clra_beta is not None:
            __global int *beta_starts,
            __global int *beta_data,
    % endif
    % if cl_gamma is not None:
            __global ${cl_gamma.ocldtype} * gammas,
    % endif
            __global int *Y_in_starts,
            __global ${Y_in.cl_buf.ocldtype} *Y_in_data,
            __global int *Y_starts,
            __global int *Y_shape0s,
            __global ${Y.cl_buf.ocldtype} *Y_data)
        {
            const int mm = get_global_id(0);
            const int bb = items[get_global_id(1)];
            const int M = Y_shape0s[bb];
            if (mm < M)
            {
                const int y_offset = Y_starts[bb];
                const int y_in_offset = Y_in_starts[bb];

    % if float_beta is not None:
                const ${Y.cl_buf.ocldtype} beta = ${float_beta};
    % elif cl_beta is not None:
                const ${cl_beta.ocldtype} beta = betas[bb];
    % elif clra_beta is not None:
                const int beta_offset = beta_starts[bb];
                const ${clra_beta.cl_buf.ocldtype} beta
                    = beta_data[beta_offset + mm];
    % endif

    % if float_gamma is not None:
                const ${Y.cl_buf.ocldtype} gamma = ${float_gamma};
    % elif cl_gamma is not None:
                const ${cl_gamma.ocldtype} gamma = gammas[bb];
    % endif

                Y_data[y_offset + mm] = gamma + beta
                    * Y_in_data[y_in_offset + mm];

    % if (A_js is not None) :
                const int n_dot_products = A_js_shape0s[bb];
                X_js_data += X_js_starts[bb];
                A_js_data += A_js_starts[bb];

                ${Y.cl_buf.ocldtype} y_sum = 0;
                for (int ii = 0; ii < n_dot_products; ++ii)
                {
                    const int x_ji = X_js_data[ii];
                    const int a_ji = A_js_data[ii];
                    const int N_i = A_shape1s[a_ji];
                    const int x_offset = X_starts[x_ji];
                    const int a_offset = A_starts[a_ji];
                    const int AsM = A_stride0s[a_ji];
                    const int XsM = X_stride0s[x_ji];

                    for (int nn = 0; nn < N_i; ++nn)
                    {
                        y_sum += X_data[x_offset + nn * XsM]
                            * A_data[a_offset + mm * AsM + nn];
                    }
                }
    % if float_alpha is not None:
                Y_data[y_offset + mm] += ${float_alpha} * y_sum;
    % elif cl_alpha is not None:
                Y_data[y_offset + mm] += alphas[bb] * y_sum;
    % endif
    % endif
            }
        }
    """
    # Render directly from the plan's attribute dict so the template sees
    # cl_alpha, A_js, float_beta, etc. as top-level names.
    text = Template(text, output_encoding='ascii').render(**p.__dict__)

    # One work item per potential output element: the first grid axis is
    # padded to the longest Y among the selected items (the kernel's
    # `mm < M` guard skips the excess), the second enumerates the items.
    gsize = (
        max(p.geometry[ii]['y_len'] for ii in items),
        len(items))
    lsize = None
    fn = cl.Program(p.queue.context, text).build().fn

    # Build the argument list in exactly the order the rendered kernel
    # declares its parameters: each conditional here mirrors a `% if`
    # in the template's signature.
    full_args = [cl_items]
    if p.cl_alpha is not None:
        full_args += [p.cl_alpha]
    if p.A_js is not None:
        full_args += [
            p.A.cl_starts,
            p.A.cl_shape1s,
            p.A.cl_stride0s,
            p.A.cl_buf,
            p.A_js.cl_starts,
            p.A_js.cl_shape0s,
            p.A_js.cl_buf,
            p.X.cl_starts,
            p.X.cl_stride0s,
            p.X.cl_buf,
            p.X_js.cl_starts,
            p.X_js.cl_buf,
        ]
    if p.cl_beta is not None:
        full_args += [p.cl_beta]
    elif p.clra_beta is not None:
        full_args += [p.clra_beta.cl_starts, p.clra_beta.cl_buf]
    if p.cl_gamma is not None:
        full_args += [p.cl_gamma]
    # NOTE(review): a `clra_gamma` branch is impossible here -- it raised
    # NotImplementedError above, and the kernel declares no such params.
    full_args += [
        p.Y_in.cl_starts,
        p.Y_in.cl_buf,
        p.Y.cl_starts,
        p.Y.cl_shape0s,
        p.Y.cl_buf]
    fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(p.queue, fn, gsize, lsize,
                name="clra_gemv.ref_impl",
                tag=p.tag,
                bw_per_call=bw_from_geometry(p.geometry, items),
                flops_per_call=flops_from_geometry(p.geometry, items))
    rval.full_args = full_args  # prevent GC the args
    return rval