def gemm_cpu_prepare_vectorized_FC(alpha, beta, M, N, K, dtype, Astrides,
                                   Bstrides, Cstrides, MB, NB, KB):
    ctype = ctype_from_dtype(dtype)
    (As0, As1) = elemstrides(Astrides, dtype, MB, vecdim=0)
    (Bs0, Bs1) = elemstrides(Bstrides, dtype, NB, vecdim=1)
    (Cs0, Cs1) = elemstrides(Cstrides, dtype, NB, vecdim=1)
    NoMB = M // MB
    NoNB = N // NB
    NoKB = K // KB
    if M != MB * NoMB:
        raise BlockingError()
    if N != NB * NoNB:
        raise BlockingError()
    if K != KB * NoKB:
        raise BlockingError()
    if NB > 16:
        raise BlockingError('codegen breaks at this point')
    text = Template(vectorized_text_FC,
                    output_encoding='ascii').render(**locals())
    if 0:
        for ii, line in enumerate(text.split('\n')):
            print ii, line
    prg = cl.Program(ctx, text).build()
    #print 'built!'

    return prg.kern
def pairwise_cpu_prepare_vectorized_CF(alpha, beta, M, N, K, dtype, Astrides, Bstrides, Cstrides, MB, NB, KB):
    ctype = ctype_from_dtype(dtype)
    (As0, As1) = elemstrides(Astrides, dtype, KB, vecdim=1)
    (Bs0, Bs1) = elemstrides(Bstrides, dtype, KB, vecdim=0)
    (Cs0, Cs1) = elemstrides(Cstrides, dtype, NB, vecdim=1)
    NoMB = M // MB
    NoNB = N // NB
    NoKB = K // KB
    if M != MB * NoMB:
        raise BlockingError()
    if N != NB * NoNB:
        raise BlockingError()
    if K != KB * NoKB:
        raise BlockingError()
    if NB > 16:
        raise BlockingError("codegen breaks at this point")
    text = Template(vectorized_text_CF, output_encoding="ascii").render(**locals())
    if 0:
        for ii, line in enumerate(text.split("\n")):
            print ii, line
    prg = cl.Program(ctx, text).build()
    # print 'built!'
    return prg.kern
Ejemplo n.º 3
0
def _plan_template(queue, name, core_text, declares="", tag=None, n_elements=0, inputs={}, outputs={}, parameters={}):
    """Template for making a plan for vector nonlinearities.

    This template assumes that all inputs and outputs are vectors.

    Parameters
    ----------
    n_elements: int
        If n_elements == 0, then the kernels are allocated as a block. This is
        simple, but can be slow for large computations where input vector sizes
        are not uniform (e.g. one large population and many small ones).
        If n_elements >= 1, then all the vectors in the RaggedArray are
        flattened so that the exact number of required kernels is allocated.
        Each kernel performs computations for `n_elements` elements.

    inputs: dictionary of CLRaggedArrays
        Inputs to the function. RaggedArrays must be a list of vectors.

    outputs: dictionary of CLRaggedArrays
        Outputs of the function. RaggedArrays must be a list of vectors.

    parameters: dictionary of CLRaggedArrays
        Parameters to the function. Each RaggedArray element must be a vector
        of the same length of the inputs, or a scalar (to be broadcasted).
        Providing a float instead of a RaggedArray makes that parameter
        constant.

    """

    base = inputs.values()[0]  # input to use as reference (for lengths)
    N = len(base)

    ### split parameters into static and updated params
    static_params = {}  # static params (hard-coded)
    params = {}  # variable params (updated)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        else:
            try:
                static_params[k] = ("float", float(v))
            except TypeError:
                raise

    avars = {}
    for vname, v in inputs.items() + outputs.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert all_equal(v.shape0s, base.shape0s)

        ### N.B. - we should be able to ignore ldas as long as all vectors
        assert all_equal(v.shape1s, 1)

        dtype = v.cl_buf.ocldtype
        offset = "%(name)s_starts[n]" % {"name": vname}
        avars[vname] = (dtype, offset)

    for vname, v in params.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        for i in xrange(N):
            assert v.shape0s[i] == base.shape0s[i] or v.shape0s[i] == 1, "%s.shape0s[%d] must be 1 or %d (not %d)" % (
                vname,
                i,
                base.shape0s[i],
                v.shape0s[i],
            )
            assert v.shape1s[i] == 1

        dtype = v.cl_buf.ocldtype
        offset = "%(name)s_starts[n]" % {"name": vname}
        avars[vname] = (dtype, offset)

    ivars = dict((k, avars[k]) for k in inputs.keys())
    ovars = dict((k, avars[k]) for k in outputs.keys())
    pvars = dict((k, avars[k]) for k in params.keys())

    textconf = dict(
        N=N,
        n_elements=n_elements,
        tag=str(tag),
        declares=declares,
        core_text=core_text,
        ivars=ivars,
        ovars=ovars,
        pvars=pvars,
        static_params=static_params,
    )

    if n_elements > 0:
        ### Allocate the exact number of required kernels in a vector
        gsize = (int(np.ceil(np.sum(base.shape0s) / float(n_elements))),)
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int gid = get_global_id(0);
            int m = gid * ${n_elements}, n = 0;
            while (m >= lengths[n]) {
                m -= lengths[n];
                n++;
            }
            if (n >= ${N}) return;

% for name, [type, offset] in ivars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in ovars.items():
            __global ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in pvars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset};
            int ${name}_isvector = ${name}_shape0s[n] > 1;
            if (${name}_isvector) cur_${name} += m;
% endfor
% for name, [type, offset] in ivars.items() + ovars.items() + pvars.items():
            ${type} ${name};
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

% for ii in range(n_elements):
            //////////////////////////////////////////////////
            ////////// LOOP ITERATION ${ii}
  % for name, [type, offset] in ivars.items():
            ${name} = *cur_${name};
  % endfor
  % for name, [type, offset] in pvars.items():
            if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name};
  % endfor

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

  % for name, [type, offset] in ovars.items():
            *cur_${name} = ${name};
  % endfor

  % if ii + 1 < n_elements:
            m++;
            if (m >= lengths[n]) {
                n++;
                m = 0;
                if (n >= ${N}) return;

    % for name, [type, offset] in ivars.items() + ovars.items() + pvars.items():
                cur_${name} = in_${name} + ${offset};
    % endfor
    % for name, [type, offset] in pvars.items():
                ${name}_isvector = ${name}_shape0s[n] > 1;
                if (!${name}_isvector) ${name} = *cur_${name};
    % endfor
            } else {
    % for name, [type, offset] in ivars.items() + ovars.items():
                cur_${name}++;
    % endfor
    % for name, [type, offset] in pvars.items():
                if (${name}_isvector) cur_${name}++;
    % endfor
            }
  % endif
% endfor
        }
        """
    else:
        ### Allocate more than enough kernels in a matrix
        gsize = (int(np.max(base.shape0s)), int(N))
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int m = get_global_id(0);
            const int n = get_global_id(1);
            const int M = lengths[n];
            if (m >= M) return;

% for name, [type, offset] in ivars.items():
            ${type} ${name} = in_${name}[${offset} + m];
% endfor
% for name, [type, offset] in ovars.items():
            ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
            const ${type} ${name} = (${name}_shape0s[n] > 1) ?
                in_${name}[${offset} + m] : in_${name}[${offset}];
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            in_${name}[${offset} + m] = ${name};
% endfor
        }
        """

    text = Template(text, output_encoding="ascii").render(**textconf)
    if 0:
        for i, line in enumerate(text.split("\n")):
            print "%3d %s" % (i + 1, line)

    full_args = []
    for name, v in inputs.items() + outputs.items():
        full_args.extend([v.cl_starts, v.cl_buf])
    for name, v in params.items():
        full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
    full_args.append(base.cl_shape0s)
    full_args = tuple(full_args)

    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    return rval
Ejemplo n.º 4
0
    def generate(self, filename=None):
        formatted_data = np.ndarray(shape=self.data.shape,
                                    dtype=np.dtype('U25'))
        rule = np.ndarray(shape=self.data.shape[0], dtype=np.dtype('U25'))
        for i in range(self.data.shape[0]):
            for j in range(self.data.shape[1]):
                formatted_data[i, j] = self.format(i, j)

            rule[i] = self.hrule_adder(i)

        result_template = r"""\documentclass[11pt]{article}
\usepackage{multirow}
\usepackage{booktabs}
${preamble}

\thispagestyle{empty}
\begin{document}
\begin{table}
\begin{tabular}{${'c'*(data.shape[1]+len(row_names))}}
\toprule
% for col_name in col_names:
<%
    cols_per_cell = data.shape[1] // len(col_name)
    alignment = 'c'
    delimiter = r'} & \multicolumn{' + str(cols_per_cell) + r'}{' + alignment +r'}{'  # noqa
%>
\multicolumn{${len(row_names)}}{c}{} ${delimiter[2:]}${delimiter.join(col_name)}}\\\
% endfor


\midrule

% for i in range(data.shape[0]):
    % for row_name in row_names:
        <% nrows_per_row_name = data.shape[0] // len(row_name) %>
${r'\multirow{%d}{*}{%s}' % (nrows_per_row_name,
row_name[i//nrows_per_row_name]) if i % nrows_per_row_name == 0 else '  '}  & \
    % endfor
    ${'  &  '.join(elem for elem in data[i, :])}\\\

${rule[i]}
% endfor

\bottomrule
\end{tabular}
\end{table}
\end{document}
"""
        result = Template(result_template).render(row_names=self.row_names,
                                                  col_names=self.column_names,
                                                  data=formatted_data,
                                                  preamble=self.preamble(),
                                                  rule=rule)

        if filename is None:
            print(result)
            return

        ext = filename[-4:]
        if ext == ".tex":
            # if the desired output is tex only, then only extract the tabular
            # part.
            resultlines = result.split("\n")
            begintabular_idx, = [
                i for i, line in enumerate(resultlines)
                if line.startswith(r"\begin{tabular}")
            ]
            endtabular_idx, = [
                i for i, line in enumerate(resultlines)
                if line.startswith(r"\end{tabular}")
            ]

            result = "\n".join(resultlines[begintabular_idx:endtabular_idx +
                                           1])

            with open(filename, 'w') as f:
                f.write(result)
        elif ext == ".pdf" or ext == ".png":
            tex_filename = (
                '/tmp/' +
                ''.join(random.choice(string.ascii_letters)
                        for _ in range(9)) + '.tex')
            gen_pdf_filename = tex_filename[:-4] + '.pdf'
            with open(tex_filename, 'w') as f:
                f.write(result)
            subprocess.call(
                ('pdflatex -output-directory=/tmp {0}'.format(tex_filename)
                 ).split())
            if ext == ".pdf":
                subprocess.call(('mv {0} {1}'.format(gen_pdf_filename,
                                                     filename)).split())
            if ext == ".png":
                cropped_pdf = gen_pdf_filename[:-4] + '-crop.pdf'
                subprocess.call(('pdfcrop -margin 3 {0} {1}'.format(
                    gen_pdf_filename, cropped_pdf)).split())
                with open(filename, 'w') as f:
                    ppm = subprocess.Popen(
                        ('pdftoppm -r 500 {0}'.format(cropped_pdf)).split(),
                        stdout=subprocess.PIPE)
                    subprocess.call(['pnmtopng'], stdin=ppm.stdout, stdout=f)
                    ppm.wait()
        else:
            raise NotImplementedError()
Ejemplo n.º 5
0
def _plan_template(queue,
                   name,
                   core_text,
                   declares="",
                   tag=None,
                   n_elements=0,
                   inputs={},
                   outputs={},
                   parameters={}):
    """Template for making a plan for vector nonlinearities.

    This template assumes that all inputs and outputs are vectors.

    Parameters
    ----------
    n_elements: int
        If n_elements == 0, then the kernels are allocated as a block. This is
        simple, but can be slow for large computations where input vector sizes
        are not uniform (e.g. one large population and many small ones).
        If n_elements >= 1, then all the vectors in the RaggedArray are
        flattened so that the exact number of required kernels is allocated.
        Each kernel performs computations for `n_elements` elements.

    inputs: dictionary of CLRaggedArrays
        Inputs to the function. RaggedArrays must be a list of vectors.

    outputs: dictionary of CLRaggedArrays
        Outputs of the function. RaggedArrays must be a list of vectors.

    parameters: dictionary of CLRaggedArrays
        Parameters to the function. Each RaggedArray element must be a vector
        of the same length of the inputs, or a scalar (to be broadcasted).
        Providing a float instead of a RaggedArray makes that parameter
        constant.

    """

    base = inputs.values()[0]  # input to use as reference (for lengths)
    N = len(base)

    ### split parameters into static and updated params
    static_params = {}  # static params (hard-coded)
    params = {}  # variable params (updated)
    for k, v in parameters.items():
        if isinstance(v, CLRaggedArray):
            params[k] = v
        else:
            try:
                static_params[k] = ('float', float(v))
            except TypeError:
                raise

    avars = {}
    for vname, v in inputs.items() + outputs.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        assert all_equal(v.shape0s, base.shape0s)

        ### N.B. - we should be able to ignore ldas as long as all vectors
        assert all_equal(v.shape1s, 1)

        dtype = v.cl_buf.ocldtype
        offset = '%(name)s_starts[n]' % {'name': vname}
        avars[vname] = (dtype, offset)

    for vname, v in params.items():
        assert vname not in avars, "Name clash"
        assert len(v) == N
        for i in xrange(N):
            assert v.shape0s[i] == base.shape0s[i] or v.shape0s[i] == 1, \
                "%s.shape0s[%d] must be 1 or %d (not %d)" % \
                (vname, i, base.shape0s[i], v.shape0s[i])
            assert v.shape1s[i] == 1

        dtype = v.cl_buf.ocldtype
        offset = '%(name)s_starts[n]' % {'name': vname}
        avars[vname] = (dtype, offset)

    ivars = dict((k, avars[k]) for k in inputs.keys())
    ovars = dict((k, avars[k]) for k in outputs.keys())
    pvars = dict((k, avars[k]) for k in params.keys())

    textconf = dict(N=N,
                    n_elements=n_elements,
                    tag=str(tag),
                    declares=declares,
                    core_text=core_text,
                    ivars=ivars,
                    ovars=ovars,
                    pvars=pvars,
                    static_params=static_params)

    if n_elements > 0:
        ### Allocate the exact number of required kernels in a vector
        gsize = (int(np.ceil(np.sum(base.shape0s) / float(n_elements))), )
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int gid = get_global_id(0);
            int m = gid * ${n_elements}, n = 0;
            while (m >= lengths[n]) {
                m -= lengths[n];
                n++;
            }
            if (n >= ${N}) return;

% for name, [type, offset] in ivars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in ovars.items():
            __global ${type} *cur_${name} = in_${name} + ${offset} + m;
% endfor
% for name, [type, offset] in pvars.items():
            __global const ${type} *cur_${name} = in_${name} + ${offset};
            int ${name}_isvector = ${name}_shape0s[n] > 1;
            if (${name}_isvector) cur_${name} += m;
% endfor
% for name, [type, offset] in ivars.items() + ovars.items() + pvars.items():
            ${type} ${name};
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

% for ii in range(n_elements):
            //////////////////////////////////////////////////
            ////////// LOOP ITERATION ${ii}
  % for name, [type, offset] in ivars.items():
            ${name} = *cur_${name};
  % endfor
  % for name, [type, offset] in pvars.items():
            if ((${ii} == 0) || ${name}_isvector) ${name} = *cur_${name};
  % endfor

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

  % for name, [type, offset] in ovars.items():
            *cur_${name} = ${name};
  % endfor

  % if ii + 1 < n_elements:
            m++;
            if (m >= lengths[n]) {
                n++;
                m = 0;
                if (n >= ${N}) return;

    % for name, [type, offset] in ivars.items() + ovars.items() + pvars.items():
                cur_${name} = in_${name} + ${offset};
    % endfor
    % for name, [type, offset] in pvars.items():
                ${name}_isvector = ${name}_shape0s[n] > 1;
                if (!${name}_isvector) ${name} = *cur_${name};
    % endfor
            } else {
    % for name, [type, offset] in ivars.items() + ovars.items():
                cur_${name}++;
    % endfor
    % for name, [type, offset] in pvars.items():
                if (${name}_isvector) cur_${name}++;
    % endfor
            }
  % endif
% endfor
        }
        """
    else:
        ### Allocate more than enough kernels in a matrix
        gsize = (int(np.max(base.shape0s)), int(N))
        text = """
        ////////// MAIN FUNCTION //////////
        __kernel void fn(
% for name, [type, offset] in ivars.items():
            __global const int *${name}_starts,
            __global const ${type} *in_${name},
% endfor
% for name, [type, offset] in ovars.items():
            __global const int *${name}_starts,
            __global ${type} *in_${name},
% endfor
% for name, [type, offset] in pvars.items():
            __global const int *${name}_starts,
            __global const int *${name}_shape0s,
            __global const ${type} *in_${name},
% endfor
            __global const int *lengths
        )
        {
            const int m = get_global_id(0);
            const int n = get_global_id(1);
            const int M = lengths[n];
            if (m >= M) return;

% for name, [type, offset] in ivars.items():
            ${type} ${name} = in_${name}[${offset} + m];
% endfor
% for name, [type, offset] in ovars.items():
            ${type} ${name};
% endfor
% for name, [type, offset] in pvars.items():
            const ${type} ${name} = (${name}_shape0s[n] > 1) ?
                in_${name}[${offset} + m] : in_${name}[${offset}];
% endfor
% for name, [type, value] in static_params.items():
            const ${type} ${name} = ${value};
% endfor
            //////////////////////////////////////////////////
            //vvvvv USER DECLARATIONS BELOW vvvvv
            ${declares}
            //^^^^^ USER DECLARATIONS ABOVE ^^^^^
            //////////////////////////////////////////////////

            /////vvvvv USER COMPUTATIONS BELOW vvvvv
            ${core_text}
            /////^^^^^ USER COMPUTATIONS ABOVE ^^^^^

% for name, [type, offset] in ovars.items():
            in_${name}[${offset} + m] = ${name};
% endfor
        }
        """

    text = Template(text, output_encoding='ascii').render(**textconf)
    if 0:
        for i, line in enumerate(text.split('\n')):
            print "%3d %s" % (i + 1, line)

    full_args = []
    for vname, v in inputs.items() + outputs.items():
        full_args.extend([v.cl_starts, v.cl_buf])
    for vname, v in params.items():
        full_args.extend([v.cl_starts, v.cl_shape0s, v.cl_buf])
    full_args.append(base.cl_shape0s)
    full_args = tuple(full_args)

    _fn = cl.Program(queue.context, text).build().fn
    _fn.set_args(*[arr.data for arr in full_args])

    rval = Plan(queue, _fn, gsize, lsize=None, name=name, tag=tag)
    rval.full_args = full_args  # prevent garbage-collection
    return rval