def gemm_cpu_prepare_vectorized_FC(alpha, beta, M, N, K, dtype,
                                   Astrides, Bstrides, Cstrides,
                                   MB, NB, KB):
    """Render and build the ``vectorized_text_FC`` OpenCL program.

    ``M``, ``N`` and ``K`` must be exact multiples of the block sizes ``MB``,
    ``NB`` and ``KB``. The kernel source is obtained by rendering the
    ``vectorized_text_FC`` template with every local name of this function
    (arguments included) as context, and is then built against the
    module-level ``ctx``.

    Parameters
    ----------
    alpha, beta
        Not used directly here; they reach the template through ``locals()``
        (presumably the GEMM scaling factors -- TODO confirm against the
        template).
    M, N, K : int
        Sizes that are split into blocks of ``MB``, ``NB`` and ``KB``.
    dtype
        Element type, handed to ``ctype_from_dtype`` and ``elemstrides``.
    Astrides, Bstrides, Cstrides
        Strides of the three operands, converted by ``elemstrides``.
    MB, NB, KB : int
        Block sizes for ``M``, ``N`` and ``K``.

    Returns
    -------
    The ``kern`` attribute of the built program.

    Raises
    ------
    BlockingError
        If a size is not a multiple of its block size, or if ``NB > 16``.
    """
    # NOTE: every name bound before the render() call below is part of the
    # template context (via locals()); do not rename or drop these locals
    # without checking the template.
    ctype = ctype_from_dtype(dtype)
    (As0, As1) = elemstrides(Astrides, dtype, MB, vecdim=0)
    (Bs0, Bs1) = elemstrides(Bstrides, dtype, NB, vecdim=1)
    (Cs0, Cs1) = elemstrides(Cstrides, dtype, NB, vecdim=1)

    # Number of blocks along each dimension.
    NoMB = M // MB
    NoNB = N // NB
    NoKB = K // KB
    if M != MB * NoMB:
        raise BlockingError('M=%r is not a multiple of MB=%r' % (M, MB))
    if N != NB * NoNB:
        raise BlockingError('N=%r is not a multiple of NB=%r' % (N, NB))
    if K != KB * NoKB:
        raise BlockingError('K=%r is not a multiple of KB=%r' % (K, KB))
    if NB > 16:
        raise BlockingError('codegen breaks at this point')

    text = Template(vectorized_text_FC,
                    output_encoding='ascii').render(**locals())
    prg = cl.Program(ctx, text).build()
    return prg.kern
def pairwise_cpu_prepare_vectorized_CF(alpha, beta, M, N, K, dtype,
                                       Astrides, Bstrides, Cstrides,
                                       MB, NB, KB):
    """Render and build the ``vectorized_text_CF`` OpenCL program.

    ``M``, ``N`` and ``K`` must be exact multiples of the block sizes ``MB``,
    ``NB`` and ``KB``. The kernel source is obtained by rendering the
    ``vectorized_text_CF`` template with every local name of this function
    (arguments included) as context, and is then built against the
    module-level ``ctx``.

    Parameters
    ----------
    alpha, beta
        Not used directly here; they reach the template through ``locals()``
        (presumably scaling factors -- TODO confirm against the template).
    M, N, K : int
        Sizes that are split into blocks of ``MB``, ``NB`` and ``KB``.
    dtype
        Element type, handed to ``ctype_from_dtype`` and ``elemstrides``.
    Astrides, Bstrides, Cstrides
        Strides of the three operands, converted by ``elemstrides``. A and B
        are converted with ``KB``, C with ``NB``.
    MB, NB, KB : int
        Block sizes for ``M``, ``N`` and ``K``.

    Returns
    -------
    The ``kern`` attribute of the built program.

    Raises
    ------
    BlockingError
        If a size is not a multiple of its block size, or if ``NB > 16``.
    """
    # NOTE: every name bound before the render() call below is part of the
    # template context (via locals()); do not rename or drop these locals
    # without checking the template.
    ctype = ctype_from_dtype(dtype)
    (As0, As1) = elemstrides(Astrides, dtype, KB, vecdim=1)
    (Bs0, Bs1) = elemstrides(Bstrides, dtype, KB, vecdim=0)
    (Cs0, Cs1) = elemstrides(Cstrides, dtype, NB, vecdim=1)

    # Number of blocks along each dimension.
    NoMB = M // MB
    NoNB = N // NB
    NoKB = K // KB
    if M != MB * NoMB:
        raise BlockingError("M=%r is not a multiple of MB=%r" % (M, MB))
    if N != NB * NoNB:
        raise BlockingError("N=%r is not a multiple of NB=%r" % (N, NB))
    if K != KB * NoKB:
        raise BlockingError("K=%r is not a multiple of KB=%r" % (K, KB))
    if NB > 16:
        raise BlockingError("codegen breaks at this point")

    text = Template(vectorized_text_CF,
                    output_encoding="ascii").render(**locals())
    prg = cl.Program(ctx, text).build()
    return prg.kern
def _plan_template(queue, name, core_text, declares="", tag=None, n_elements=0,
                   inputs={}, outputs={}, parameters={}):
    """Template for making a plan for vector nonlinearities.

    This template assumes that all inputs and outputs are vectors.

    Parameters
    ----------
    queue
        Command queue; its ``context`` is used to build the program and the
        queue itself is handed to the returned ``Plan``.
    name
        Name given to the returned ``Plan``.
    core_text: str
        Kernel code inserted between the "USER COMPUTATIONS" markers. It can
        refer to every input, output and parameter by its dictionary key.
    declares: str
        Kernel code inserted between the "USER DECLARATIONS" markers.
    tag
        Tag given to the returned ``Plan``.
    n_elements: int
        If n_elements == 0, then the kernels are allocated as a block. This is
        simple, but can be slow for large computations where input vector sizes
        are not uniform (e.g. one large population and many small ones).
        If n_elements >= 1, then all the vectors in the RaggedArray are
        flattened so that the exact number of required kernels is allocated.
        Each kernel performs computations for `n_elements` elements.
    inputs: dictionary of CLRaggedArrays
        Inputs to the function. RaggedArrays must be a list of vectors.
        At least one input is required; the first one is the reference for
        the vector lengths.
    outputs: dictionary of CLRaggedArrays
        Outputs of the function. RaggedArrays must be a list of vectors.
    parameters: dictionary of CLRaggedArrays
        Parameters to the function. Each RaggedArray element must be a vector
        of the same length of the inputs, or a scalar (to be broadcasted).
        Providing a float instead of a RaggedArray makes that parameter
        constant.

    Returns
    -------
    Plan
        Plan wrapping the built kernel ``fn`` with its arguments already set.
        The argument arrays are kept alive on ``rval.full_args``.
    """
    from collections import OrderedDict

    # The default dictionaries are only read, never mutated.
    base = list(inputs.values())[0]  # input to use as reference (for lengths)
    N = len(base)

    ### split parameters into static and updated params
    static_params = {}  # static params (hard-coded)
    params = OrderedDict()  # variable params (updated)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        else:
            # anything float() rejects propagates to the caller unchanged
            static_params[k] = ("float", float(v))

    # Inputs followed by outputs, in the order in which both the kernel
    # signature and the argument list are generated.
    in_out = list(inputs.items()) + list(outputs.items())

    avars = {}
    for vname, v in in_out:
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert all_equal(v.shape0s, base.shape0s)

        ### N.B. - we should be able to ignore ldas as long as all vectors
        assert all_equal(v.shape1s, 1)

        dtype = v.cl_buf.ocldtype
        offset = "%(name)s_starts[n]" % {"name": vname}
        avars[vname] = (dtype, offset)

    for vname, v in params.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        for i in range(N):
            assert v.shape0s[i] == base.shape0s[i] or v.shape0s[i] == 1, (
                "%s.shape0s[%d] must be 1 or %d (not %d)"
                % (vname, i, base.shape0s[i], v.shape0s[i]))
            assert v.shape1s[i] == 1

        dtype = v.cl_buf.ocldtype
        offset = "%(name)s_starts[n]" % {"name": vname}
        avars[vname] = (dtype, offset)

    # Ordered mappings: the kernel signature is generated by iterating these,
    # and it must list the arguments in exactly the order used for
    # ``full_args`` below. Plain dicts do not guarantee that on every
    # interpreter.
    ivars = OrderedDict((k, avars[k]) for k in inputs)
    ovars = OrderedDict((k, avars[k]) for k in outputs)
    pvars = OrderedDict((k, avars[k]) for k in params)

    textconf = dict(N=N, n_elements=n_elements, tag=str(tag),
                    declares=declares, core_text=core_text,
                    ivars=ivars, ovars=ovars, pvars=pvars,
                    static_params=static_params)

    if n_elements > 0:
        ### Allocate the exact number of required kernels in a vector
        gsize = (int(np.ceil(np.sum(base.shape0s) / float(n_elements))),)
        # NOTE(review): when a work item wraps to the next vector it does not
        # skip vectors of length 0 (only the initial `while` does) -- confirm
        # that empty vectors cannot occur here.
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int gid = get_global_id(0);
            int m = gid * ${n_elements}, n = 0;
            while (m >= lengths[n]) {
                m -= lengths[n];
                n++;
            }
            if (n >= ${N}) return;

% for name, [type, offset] in ivars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in ovars.items():
            __global ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in pvars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset};
            int ${name}_isvector = ${name}_shape0s[n] > 1;
            if (${name}_isvector) cur_${name} += m;
% endfor
% for name, [type, offset] in list(ivars.items()) + list(ovars.items()) + list(pvars.items()):
            ${type} ${name};
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

% for ii in range(n_elements):
            //////////////////////////////////////////////////
            ////////// LOOP ITERATION ${ii}
% for name, [type, offset] in ivars.items():
            ${name} = *cur_${name};
% endfor
% for name, [type, offset] in pvars.items():
            if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name};
% endfor

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            *cur_${name} = ${name};
% endfor

% if ii + 1 < n_elements:
            m++;
            if (m >= lengths[n]) {
                n++;
                m = 0;
                if (n >= ${N}) return;

% for name, [type, offset] in list(ivars.items()) + list(ovars.items()) + list(pvars.items()):
                cur_${name} = in_${name} + ${offset};
% endfor
% for name, [type, offset] in pvars.items():
                ${name}_isvector = ${name}_shape0s[n] > 1;
                if (!${name}_isvector) ${name} = *cur_${name};
% endfor
            } else {
% for name, [type, offset] in list(ivars.items()) + list(ovars.items()):
                cur_${name}++;
% endfor
% for name, [type, offset] in pvars.items():
                if (${name}_isvector) cur_${name}++;
% endfor
            }
% endif
% endfor
        }
        """
    else:
        ### Allocate more than enough kernels in a matrix
        gsize = (int(np.max(base.shape0s)), int(N))
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int m = get_global_id(0);
            const int n = get_global_id(1);
            const int M = lengths[n];
            if (m >= M) return;

% for name, [type, offset] in ivars.items():
            ${type} ${name} = in_${name}[${offset} + m];
% endfor
% for name, [type, offset] in ovars.items():
            ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
            const ${type} ${name} = (${name}_shape0s[n] > 1) ?
                in_${name}[${offset} + m] : in_${name}[${offset}];
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            in_${name}[${offset} + m] = ${name};
% endfor
        }
        """

    text = Template(text, output_encoding="ascii").render(**textconf)

    # Same order as the generated kernel signature: inputs, outputs,
    # parameters, then the reference lengths. Only the values are iterated:
    # a loop variable called ``name`` would overwrite the ``name`` argument
    # that is passed to Plan below.
    full_args = []
    for _, v in in_out:
        full_args.extend([v.cl_starts, v.cl_buf])
    for v in params.values():
        full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
    full_args.append(base.cl_shape0s)
    full_args = tuple(full_args)

    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    return rval
def generate(self, filename=None):
    """Render the table as a LaTeX document and print or export it.

    Every cell is formatted with ``self.format(i, j)`` and every row gets the
    rule returned by ``self.hrule_adder(i)``; the document is then rendered
    from ``self.row_names``, ``self.column_names`` and ``self.preamble()``.

    Parameters
    ----------
    filename : str or None
        ``None`` prints the complete LaTeX document. Otherwise the last four
        characters select the output:

        * ``".tex"`` -- writes only the ``tabular`` environment;
        * ``".pdf"`` -- runs ``pdflatex`` and moves the result to
          ``filename``;
        * ``".png"`` -- runs ``pdflatex``, ``pdfcrop``, ``pdftoppm`` and
          ``pnmtopng``.

        The external programs must be available on ``PATH``.

    Raises
    ------
    NotImplementedError
        For any other extension.
    ValueError
        For ``".tex"`` output, if the rendered document does not contain
        exactly one ``\\begin{tabular}`` and one ``\\end{tabular}`` line.
    OSError
        For ``".pdf"`` output, if ``pdflatex`` produced no PDF to move.
    """
    import os
    import shutil
    import tempfile

    # NOTE: 'U25' is a fixed-width dtype, so any formatted cell or rule
    # longer than 25 characters is silently truncated on assignment.
    formatted_data = np.ndarray(shape=self.data.shape, dtype=np.dtype('U25'))
    rule = np.ndarray(shape=self.data.shape[0], dtype=np.dtype('U25'))
    for i in range(self.data.shape[0]):
        for j in range(self.data.shape[1]):
            formatted_data[i, j] = self.format(i, j)
        rule[i] = self.hrule_adder(i)

    # The template lines start at column 0 on purpose: the ".tex" branch
    # below finds the tabular environment with str.startswith().
    result_template = r"""\documentclass[11pt]{article}
\usepackage{multirow}
\usepackage{booktabs}
${preamble}
\thispagestyle{empty}
\begin{document}
\begin{table}
\begin{tabular}{${'c'*(data.shape[1]+len(row_names))}}
\toprule
% for col_name in col_names:
<%
cols_per_cell = data.shape[1] // len(col_name)
alignment = 'c'
delimiter = r'} & \multicolumn{' + str(cols_per_cell) + r'}{' + alignment +r'}{'  # noqa
%>
\multicolumn{${len(row_names)}}{c}{} ${delimiter[2:]}${delimiter.join(col_name)}}\\\
% endfor
\midrule
% for i in range(data.shape[0]):
% for row_name in row_names:
<% nrows_per_row_name = data.shape[0] // len(row_name) %>
${r'\multirow{%d}{*}{%s}' % (nrows_per_row_name, row_name[i//nrows_per_row_name]) if i % nrows_per_row_name == 0 else ' '} & \
% endfor
${' & '.join(elem for elem in data[i, :])}\\\
${rule[i]}
% endfor
\bottomrule
\end{tabular}
\end{table}
\end{document}
"""

    result = Template(result_template).render(row_names=self.row_names,
                                              col_names=self.column_names,
                                              data=formatted_data,
                                              preamble=self.preamble(),
                                              rule=rule)
    if filename is None:
        print(result)
        return

    ext = filename[-4:]
    if ext == ".tex":
        # if the desired output is tex only, then only extract the tabular
        # part.
        resultlines = result.split("\n")
        # The single-target unpacking requires exactly one matching line.
        begintabular_idx, = [
            i for i, line in enumerate(resultlines)
            if line.startswith(r"\begin{tabular}")
        ]
        endtabular_idx, = [
            i for i, line in enumerate(resultlines)
            if line.startswith(r"\end{tabular}")
        ]
        result = "\n".join(resultlines[begintabular_idx:endtabular_idx + 1])
        with open(filename, 'w') as f:
            f.write(result)
    elif ext == ".pdf" or ext == ".png":
        # Private scratch directory: no name collisions in a shared /tmp and
        # the LaTeX by-products (.aux, .log, ...) are removed afterwards.
        workdir = tempfile.mkdtemp()
        try:
            tex_filename = os.path.join(workdir, 'table.tex')
            gen_pdf_filename = tex_filename[:-4] + '.pdf'
            with open(tex_filename, 'w') as f:
                f.write(result)
            # Argument lists (not split command strings) keep paths that
            # contain whitespace intact.
            subprocess.call(
                ['pdflatex', '-output-directory=' + workdir, tex_filename])
            if ext == ".pdf":
                shutil.move(gen_pdf_filename, filename)
            else:
                cropped_pdf = gen_pdf_filename[:-4] + '-crop.pdf'
                subprocess.call(['pdfcrop', '-margin', '3',
                                 gen_pdf_filename, cropped_pdf])
                # Binary mode: the handle receives raw PNG bytes.
                with open(filename, 'wb') as f:
                    ppm = subprocess.Popen(
                        ['pdftoppm', '-r', '500', cropped_pdf],
                        stdout=subprocess.PIPE)
                    subprocess.call(['pnmtopng'], stdin=ppm.stdout, stdout=f)
                    ppm.stdout.close()
                    ppm.wait()
        finally:
            shutil.rmtree(workdir, ignore_errors=True)
    else:
        raise NotImplementedError(
            "unsupported output extension: %r" % (ext,))
def _plan_template(queue, name, core_text, declares="", tag=None, n_elements=0,
                   inputs={}, outputs={}, parameters={}):
    """Template for making a plan for vector nonlinearities.

    This template assumes that all inputs and outputs are vectors.

    Parameters
    ----------
    queue
        Command queue; its ``context`` is used to build the program and the
        queue itself is handed to the returned ``Plan``.
    name
        Name given to the returned ``Plan``.
    core_text: str
        Kernel code inserted between the "USER COMPUTATIONS" markers. It can
        refer to every input, output and parameter by its dictionary key.
    declares: str
        Kernel code inserted between the "USER DECLARATIONS" markers.
    tag
        Tag given to the returned ``Plan``.
    n_elements: int
        If n_elements == 0, then the kernels are allocated as a block. This is
        simple, but can be slow for large computations where input vector sizes
        are not uniform (e.g. one large population and many small ones).
        If n_elements >= 1, then all the vectors in the RaggedArray are
        flattened so that the exact number of required kernels is allocated.
        Each kernel performs computations for `n_elements` elements.
    inputs: dictionary of CLRaggedArrays
        Inputs to the function. RaggedArrays must be a list of vectors.
        At least one input is required; the first one is the reference for
        the vector lengths.
    outputs: dictionary of CLRaggedArrays
        Outputs of the function. RaggedArrays must be a list of vectors.
    parameters: dictionary of CLRaggedArrays
        Parameters to the function. Each RaggedArray element must be a vector
        of the same length of the inputs, or a scalar (to be broadcasted).
        Providing a float instead of a RaggedArray makes that parameter
        constant.

    Returns
    -------
    Plan
        Plan wrapping the built kernel ``fn`` with its arguments already set.
        The argument arrays are kept alive on ``rval.full_args``.
    """
    from collections import OrderedDict

    # The default dictionaries are only read, never mutated.
    base = list(inputs.values())[0]  # input to use as reference (for lengths)
    N = len(base)

    ### split parameters into static and updated params
    static_params = {}  # static params (hard-coded)
    params = OrderedDict()  # variable params (updated)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        else:
            # anything float() rejects propagates to the caller unchanged
            static_params[k] = ('float', float(v))

    # Inputs followed by outputs, in the order in which both the kernel
    # signature and the argument list are generated.
    in_out = list(inputs.items()) + list(outputs.items())

    avars = {}
    for vname, v in in_out:
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert all_equal(v.shape0s, base.shape0s)

        ### N.B. - we should be able to ignore ldas as long as all vectors
        assert all_equal(v.shape1s, 1)

        dtype = v.cl_buf.ocldtype
        offset = '%(name)s_starts[n]' % {'name': vname}
        avars[vname] = (dtype, offset)

    for vname, v in params.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        for i in range(N):
            assert v.shape0s[i] == base.shape0s[i] or v.shape0s[i] == 1, \
                "%s.shape0s[%d] must be 1 or %d (not %d)" % \
                (vname, i, base.shape0s[i], v.shape0s[i])
            assert v.shape1s[i] == 1

        dtype = v.cl_buf.ocldtype
        offset = '%(name)s_starts[n]' % {'name': vname}
        avars[vname] = (dtype, offset)

    # Ordered mappings: the kernel signature is generated by iterating these,
    # and it must list the arguments in exactly the order used for
    # ``full_args`` below. Plain dicts do not guarantee that on every
    # interpreter.
    ivars = OrderedDict((k, avars[k]) for k in inputs)
    ovars = OrderedDict((k, avars[k]) for k in outputs)
    pvars = OrderedDict((k, avars[k]) for k in params)

    textconf = dict(N=N, n_elements=n_elements, tag=str(tag),
                    declares=declares, core_text=core_text,
                    ivars=ivars, ovars=ovars, pvars=pvars,
                    static_params=static_params)

    if n_elements > 0:
        ### Allocate the exact number of required kernels in a vector
        gsize = (int(np.ceil(np.sum(base.shape0s) / float(n_elements))), )
        # NOTE(review): when a work item wraps to the next vector it does not
        # skip vectors of length 0 (only the initial `while` does) -- confirm
        # that empty vectors cannot occur here.
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int gid = get_global_id(0);
            int m = gid * ${n_elements}, n = 0;
            while (m >= lengths[n]) {
                m -= lengths[n];
                n++;
            }
            if (n >= ${N}) return;

% for name, [type, offset] in ivars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in ovars.items():
            __global ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in pvars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset};
            int ${name}_isvector = ${name}_shape0s[n] > 1;
            if (${name}_isvector) cur_${name} += m;
% endfor
% for name, [type, offset] in list(ivars.items()) + list(ovars.items()) + list(pvars.items()):
            ${type} ${name};
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

% for ii in range(n_elements):
            //////////////////////////////////////////////////
            ////////// LOOP ITERATION ${ii}
% for name, [type, offset] in ivars.items():
            ${name} = *cur_${name};
% endfor
% for name, [type, offset] in pvars.items():
            if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name};
% endfor

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            *cur_${name} = ${name};
% endfor

% if ii + 1 < n_elements:
            m++;
            if (m >= lengths[n]) {
                n++;
                m = 0;
                if (n >= ${N}) return;

% for name, [type, offset] in list(ivars.items()) + list(ovars.items()) + list(pvars.items()):
                cur_${name} = in_${name} + ${offset};
% endfor
% for name, [type, offset] in pvars.items():
                ${name}_isvector = ${name}_shape0s[n] > 1;
                if (!${name}_isvector) ${name} = *cur_${name};
% endfor
            } else {
% for name, [type, offset] in list(ivars.items()) + list(ovars.items()):
                cur_${name}++;
% endfor
% for name, [type, offset] in pvars.items():
                if (${name}_isvector) cur_${name}++;
% endfor
            }
% endif
% endfor
        }
        """
    else:
        ### Allocate more than enough kernels in a matrix
        gsize = (int(np.max(base.shape0s)), int(N))
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int m = get_global_id(0);
            const int n = get_global_id(1);
            const int M = lengths[n];
            if (m >= M) return;

% for name, [type, offset] in ivars.items():
            ${type} ${name} = in_${name}[${offset} + m];
% endfor
% for name, [type, offset] in ovars.items():
            ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
            const ${type} ${name} = (${name}_shape0s[n] > 1) ?
                in_${name}[${offset} + m] : in_${name}[${offset}];
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            in_${name}[${offset} + m] = ${name};
% endfor
        }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)

    # Same order as the generated kernel signature: inputs, outputs,
    # parameters, then the reference lengths.
    full_args = []
    for _, v in in_out:
        full_args.extend([v.cl_starts, v.cl_buf])
    for v in params.values():
        full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
    full_args.append(base.cl_shape0s)
    full_args = tuple(full_args)

    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    return rval