Example #1
0
    def gencode_probability(pfunc, name):
        '''Generate CUDA C++ source for a starting-probability functor.

        Renders a device-side struct ``<name>_t`` that inherits its
        hyperparameters from the alias ``<name>_theta_t``, evaluates the
        probability expression for a node via ``operator()``, and returns
        the gradient w.r.t. the hyperparameters via ``_j_a_c_o_b_i_a_n_``.
        Also emits a ``__constant__`` global instance named ``<name>``.

        Parameters
        ----------
        pfunc:
            Probability functor; must provide ``gen_expr()`` returning a
            (expression, jacobian-expressions) pair of C++ source strings.
        name: str
            C++ identifier used to prefix the generated type and globals.

        Returns
        -------
        str
            CUDA C++ source code.
        '''
        fun, jac = pfunc.gen_expr()

        return Template(r'''
        using ${name}_theta_t = ${theta_t};

        struct ${name}_t : ${name}_theta_t {

            constexpr static int jac_dims = ${jac_dims};

            template<class N>
            __device__ __inline__
            auto operator() (N const &n) const {
                return ${expr};
            }

            template<class N>
            __device__ __inline__
            auto _j_a_c_o_b_i_a_n_(N const &n) const {
                graphdot::array<float, jac_dims> j;
                ${jac;\n};
                return j;
            }
        };

        __constant__ ${name}_t ${name};
        ''').render(
            name=name,
            jac_dims=len(jac),
            theta_t=decltype(pfunc),
            expr=fun,
            # one assignment statement per jacobian component
            jac=[f'j[{i}] = {expr}' for i, expr in enumerate(jac)]
        )
Example #2
0
 def __repr__(self):
     '''Return an eval-style representation listing hyperparameter
     values followed by their bounds.'''
     theta_args = [f'{n}={v}' for n, v in self._theta_values.items()]
     bound_args = [f'{n}_bounds={v}'
                   for n, v in self._theta_bounds.items()]
     return Template('${cls}(${theta, }, ${bounds, })').render(
         cls=self.name, theta=theta_args, bounds=bound_args
     )
Example #3
0
    def __init__(self, node_kernel, edge_kernel, **kwargs):
        '''Store the kernels and tunables, and create a dedicated CUDA
        context on the selected device.

        Parameters
        ----------
        node_kernel, edge_kernel:
            Kernels that compute similarity between node/edge attributes.
        kwargs: optional keyword arguments
            See the enclosing class docstring for `p`, `q`, `block_per_sm`,
            `block_size`, `device`, and `nvcc_extra`.
        '''
        self.node_kernel = node_kernel
        self.edge_kernel = edge_kernel
        self.template = Template(self._template)
        # scratch buffers are allocated lazily at first kernel launch
        self.scratch = None
        self.scratch_capacity = 0
        # cache of converted graphs, keyed by graph uuid
        self.graph_cache = {}

        self.p = kwargs.pop('p', 'default')  # starting probability
        self.q = kwargs.pop('q', 0.01)       # stopping probability

        self.block_per_sm = kwargs.pop('block_per_sm', 8)
        self.block_size = kwargs.pop('block_size', 128)

        self.device = pycuda.driver.Device(kwargs.pop('device', 0))
        self.nvcc_extra = kwargs.pop('nvcc_extra', [])
        # NOTE(review): the context is presumably detached in __del__ —
        # confirm the owner class defines a matching destructor.
        self.ctx = self.device.make_context()
Example #4
0
 def gen_expr(self, x, y, theta_scope=''):
     '''Generate C++ expressions for a normalized kernel and its jacobian.

     Wraps the inner kernel's expression in a device-side ``normalize``
     call, and each of its jacobian components in ``normalize_jacobian``.

     Parameters
     ----------
     x, y: str
         C++ expressions for the two kernel inputs.
     theta_scope: str
         Accessor prefix for hyperparameters; ``'kernel.'`` is appended
         for the wrapped inner kernel.

     Returns
     -------
     tuple of (str, list of str)
         The normalized kernel expression and its jacobian expressions.
     '''
     F, J = self.kernel.gen_expr(
         '_1', '_2', theta_scope + 'kernel.'
     )
     f = Template(
         r'''normalize(
                 [&](auto _1, auto _2){return ${f};},
                 ${x}, ${y}
         )'''
     ).render(
         x=x, y=y, f=F
     )
     template = Template(
         r'''normalize_jacobian(
                 [&](auto _1, auto _2){return ${f};},
                 [&](auto _1, auto _2){return ${j};},
                 ${x},
                 ${y}
             )'''
     )
     # each jacobian component is normalized against the same base kernel
     jacobian = [template.render(x=x, y=y, f=F, j=j) for j in J]
     return f, jacobian
Example #5
0
def decltype(type, name=''):
    '''Generate the C++ declaration for a value of a given numpy dtype.

    Parameters
    ----------
    type: numpy dtype, or anything convertible to one (e.g. np.float32)
        The type to translate. (The parameter name shadows the `type`
        builtin; kept for backward compatibility with keyword callers.)
    name: str
        Optional variable name appended to the declaration.

    Returns
    -------
    str
        A C++ declaration snippet, e.g. ``float32 x`` for scalars or an
        anonymous ``struct{...}`` for structured dtypes.
    '''
    type = np.dtype(type, align=True)  # convert np.float32 etc. to dtype
    if type.names is not None:
        if len(type.names):
            # structured dtype -> anonymous struct, one member per field,
            # generated recursively so nested records work
            return Template(r'struct{${members;};}${name}').render(
                name=name,
                members=[decltype(type.fields[v][0], v) for v in type.names])
        else:
            # empty record -> zero-size placeholder member
            return 'constexpr static numpy_type::_empty {} {{}}'.format(name)
    else:
        # scalar dtype: the numpy type name doubles as the C++ type name
        return '{} {}'.format(str(type.name), name)
Example #6
0
def decltype(t, name=''):
    '''Generate C++ source code for structures corresponding to a `cpptype`
    class.

    Parameters
    ----------
    t: numpy dtype, or anything convertible to one
        The type to translate into a C++ declaration.
    name: str
        Variable name for the declaration. A name of the form
        ``$ident::Tmpl::Arg1::Arg2...`` requests a template-class
        declaration instead of a dtype-derived one.

    Returns
    -------
    str
        A C++ declaration snippet.
    '''
    t = np.dtype(t, align=True)  # convert np.float32 etc. to dtype
    if name.startswith('$'):
        ''' template class types '''
        # encoded as $name::template::arg1::arg2...; each arg is itself a
        # type resolved recursively
        n, t, *Ts = name[1:].split('::')
        _assert_is_identifier(n)
        return Template(r'${template}<${arguments,}>${name}').render(
            template=t,
            arguments=[decltype(T) for T in Ts],
            name=n
        )
    else:
        _assert_is_identifier(name)
        if _dtype_util.is_object(t):
            ''' structs '''
            if len(t.names):
                # one member per field, generated recursively
                return Template(r'struct{${members;};}${name}').render(
                    name=name,
                    members=[decltype(t.fields[v][0], v) for v in t.names])
            else:
                # empty record -> zero-size placeholder member
                return f'constexpr static _empty {name} {{}}'
        elif _dtype_util.is_array(t):
            ''' C-style arrays '''
            # shape dimensions rendered as successive [dim] suffixes
            return Template(r'${t} ${name}[${shape][}]').render(
                t=decltype(t.base),
                name=name,
                shape=t.shape
            )
        else:
            ''' scalar types and C-strings '''
            if t.kind == 'S':
                # fixed-width byte string -> char array of itemsize bytes
                return f'char {name}[{t.itemsize}]'
            else:
                # strip in case name is empty, avoiding a trailing space
                return f'{str(t.name)} {name}'.strip()
Example #7
0
    def gencode_kernel(kernel, name):
        '''Generate CUDA C++ source for a node/edge base-kernel functor.

        Renders a device-side struct ``<name>_t`` that inherits its
        hyperparameters from ``<name>_theta_t``, computes the kernel value
        for a pair of attributes via ``operator()``, and the gradient
        w.r.t. the hyperparameters via ``_j_a_c_o_b_i_a_n_``. Also emits
        ``__constant__`` globals: the kernel instance, a finite-difference
        grid of perturbed instances, and a flattened hyperparameter array.

        Parameters
        ----------
        kernel:
            Base kernel; must provide ``gen_expr(x1, x2)`` returning a
            (expression, jacobian-expressions) pair, and a ``theta``
            hyperparameter tuple.
        name: str
            C++ identifier used to prefix the generated type and globals.

        Returns
        -------
        str
            CUDA C++ source code.
        '''
        fun, jac = kernel.gen_expr('x1', 'x2')

        return Template(r'''
        using ${name}_theta_t = ${theta_t};

        struct ${name}_t : ${name}_theta_t {

            constexpr static int jac_dims = ${jac_dims};

            template<class X>
            __device__ __inline__
            auto operator() (X const &x1, X const &x2) const {
                return ${expr};
            }

            template<class X>
            __device__ __inline__
            auto _j_a_c_o_b_i_a_n_(X const &x1, X const &x2) const {
                graphdot::array<float, jac_dims> j;
                ${jac;\n};
                return j;
            }
        };

        __constant__ ${name}_t ${name};
        __constant__ ${name}_t ${name}_diff_grid[2 * ${n_theta}];
        __constant__ float32   ${name}_flat_theta[${n_theta}];
        ''').render(
            name=name,
            jac_dims=len(jac),
            theta_t=decltype(kernel),
            expr=fun,
            # one assignment statement per jacobian component
            jac=[f'j[{i}] = {expr}' for i, expr in enumerate(jac)],
            n_theta=len(list(flatten(kernel.theta)))
        )
Example #8
0
def _from_sympy(name, desc, expr, vars, *hyperparameter_specs, minmax=(0, 1)):
    '''Create a microkernel class from a SymPy expression.

    Parameters
    ----------
    name: str
        The name of the microkernel. Must be a valid Python identifier.
    desc: str
        A human-readable description of the microkernel. Will be used to build
        the docstring of the returned microkernel class.
    expr: str or SymPy expression
        Expression of the microkernel in SymPy format.
    vars: 2-tuple of str or SymPy symbols
        The input variables of the microkernel as shown up in the expression.
        A microkernel must have exactly 2 input variables. All other symbols
        that show up in its expression should be regarded as
        hyperparameters.
    hyperparameter_specs: list of hyperparameter specifications in one of
    the formats below:

        | symbol,
        | (symbol,),
        | (symbol, dtype),
        | (symbol, dtype, description),
        | (symbol, dtype, lower_bound, upper_bound),
        | (symbol, dtype, lower_bound, upper_bound, description),

        If a default set of lower and upper bounds are not defined here,
        then it must be specified explicitly during microkernel object
        creation, using arguments as specified in the microkernel class's
        docstring.
    minmax: a 2-tuple of floats
        The minimum and maximum value that the kernel can output.
    '''

    assert(isinstance(name, str) and name.isidentifier())

    '''parse expression'''
    if isinstance(expr, str):
        expr = sy.sympify(expr)

    '''check input variables'''
    if len(vars) != 2:
        raise ValueError('A microkernel must have exactly two variables')
    vars = [sy.Symbol(v) if isinstance(v, str) else v for v in vars]

    '''parse the list of hyperparameters'''
    hyperdefs = OrderedDict()
    for spec in hyperparameter_specs:
        # BUG FIX: the original used independent `if` statements for the
        # bare-symbol, 1-, 2- and 3-element cases, so a bare symbol hit
        # `len(spec)` (TypeError) and short specs fell through to the
        # trailing `else` and raised spuriously. The cases are mutually
        # exclusive and must form one if/elif chain.
        if not hasattr(spec, '__iter__'):
            symbol = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(np.float32))
        elif len(spec) == 1:
            symbol = spec[0]
            hyperdefs[symbol] = dict(dtype=np.dtype(np.float32))
        elif len(spec) == 2:
            symbol, dtype = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype))
        elif len(spec) == 3:
            symbol, dtype, doc = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype), doc=doc)
        elif len(spec) == 4:
            symbol, dtype, lb, ub = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype),
                                     bounds=(lb, ub))
        elif len(spec) == 5:
            symbol, dtype, lb, ub, doc = spec
            hyperdefs[symbol] = dict(dtype=np.dtype(dtype),
                                     bounds=(lb, ub),
                                     doc=doc)
        else:
            # BUG FIX: formerly passed several comma-separated strings to
            # ValueError, producing a tuple-of-strings message.
            raise ValueError(
                'Invalid hyperparameter specification, '
                'must be one of\n'
                '(symbol)\n'
                '(symbol, dtype)\n'
                '(symbol, dtype, doc)\n'
                '(symbol, dtype, lb, ub)\n'
                '(symbol, dtype, lb, ub, doc)\n'
            )

    '''create microkernel class'''
    class CppType(type(MicroKernel)):
        # exposes the C++-mirroring dtype on the class itself
        @property
        def dtype(cls):
            return cls._dtype

    class uKernel(MicroKernel, metaclass=CppType):

        _expr = expr
        _vars = vars
        _hyperdefs = hyperdefs
        _dtype = np.dtype([(k, v['dtype']) for k, v in hyperdefs.items()],
                          align=True)

        @property
        def name(self):
            return name

        def __init__(self, *args, **kwargs):
            # hyperparameter values/bounds, in declaration order
            self._theta_values = values = OrderedDict()
            self._theta_bounds = bounds = OrderedDict()

            # positional arguments map onto hyperparameters in order
            for symbol, value in zip(self._hyperdefs, args):
                values[symbol] = value

            for symbol in self._hyperdefs:
                # keyword arguments override positional ones
                try:
                    values[symbol] = kwargs[symbol]
                except KeyError:
                    if symbol not in values:
                        raise KeyError(
                            f'Hyperparameter {symbol} not provided '
                            f'for {self.name}'
                        )

                # bounds: explicit `<symbol>_bounds` kwarg, else spec default
                try:
                    bounds[symbol] = kwargs['%s_bounds' % symbol]
                except KeyError:
                    try:
                        bounds[symbol] = self._hyperdefs[symbol]['bounds']
                    except KeyError:
                        raise KeyError(
                            f'Bounds for hyperparameter {symbol} of '
                            f'microkernel {self.name} not set, and '
                            f'no defaults were given.'
                        )
                self._assert_bounds(symbol, bounds[symbol])

        @property
        def _vars_and_hypers(self):
            # cached list of all lambdify arguments: inputs then hypers
            if not hasattr(self, '_vars_and_hypers_cached'):
                self._vars_and_hypers_cached = [
                    *self._vars, *self._hyperdefs.keys()
                ]
            return self._vars_and_hypers_cached

        @property
        def _fun(self):
            # cached compiled kernel function
            if not hasattr(self, '_fun_cached'):
                self._fun_cached = lambdify(
                    self._vars_and_hypers,
                    self._expr
                )
            return self._fun_cached

        @property
        def _jac(self):
            # cached compiled partial derivatives, one per hyperparameter
            if not hasattr(self, '_jac_cached'):
                self._jac_cached = [
                    lambdify(self._vars_and_hypers, sy.diff(expr, h))
                    for h in self._hyperdefs
                ]
            return self._jac_cached

        def __call__(self, x1, x2, jac=False):
            if jac is True:
                return (
                    self._fun(x1, x2, *self.theta),
                    np.array([j(x1, x2, *self.theta) for j in self._jac])
                )
            else:
                return self._fun(x1, x2, *self.theta)

        def __repr__(self):
            return Template('${cls}(${theta, }, ${bounds, })').render(
                cls=self.name,
                theta=[f'{n}={v}' for n, v in self._theta_values.items()],
                bounds=[f'{n}_bounds={v}'
                        for n, v in self._theta_bounds.items()]
            )

        def gen_expr(self, x, y, theta_scope=''):
            # rename inputs to caller-supplied expressions and scope the
            # hyperparameter accessors
            nmap = {
                str(self._vars[0]): x,
                str(self._vars[1]): y,
                **{t: theta_scope + t for t in self._hyperdefs}
            }

            return (
                cudacxxcode(self._expr, nmap),
                [cudacxxcode(sy.diff(self._expr, h), nmap)
                 for h in self._hyperdefs]
            )

        @property
        def dtype(self):
            return self._dtype

        @property
        def state(self):
            return tuple(self._theta_values.values())

        @property
        def theta(self):
            # named tuple view of the current hyperparameter values
            return pretty_tuple(
                self.name,
                self._theta_values.keys()
            )(**self._theta_values)

        @theta.setter
        def theta(self, seq):
            assert(len(seq) == len(self._theta_values))
            for theta, value in zip(self._hyperdefs, seq):
                self._theta_values[theta] = value

        @property
        def bounds(self):
            return tuple(self._theta_bounds.values())

        @property
        def minmax(self):
            return minmax

    '''furnish doc strings'''
    param_docs = [
        Template(
            '${name}: ${type}\n'
            '    ${desc\n    }\n'
            '${name}_bounds: tuple or "fixed"\n'
            '    Lower and upper bounds of `${name}` with respect to '
            'hyperparameter optimization. If "fixed", the hyperparameter will '
            'not be optimized during training.'
        ).render(
            name=name,
            type=hdef['dtype'],
            desc=[s.strip() for s in hdef.get('doc', '').split('\n')]
        ) for name, hdef in hyperdefs.items()
    ]

    uKernel.__doc__ = Template(
        '${desc}\n'
        '\n'
        'Parameters\n'
        '----------\n'
        '${param_docs\n}',
        escape=False
    ).render(
        desc='\n'.join([s.strip() for s in desc.split('\n')]),
        param_docs=param_docs
    )

    return uKernel
Example #9
0
def test_render_file(case):
    '''A template loaded from a file renders to the expected output.'''
    key1, key2, val, result = case
    tpl = Template(os.path.join(_pwd, 'test_template.tpl'))
    rendered = tpl.render(key1=key1, key2=key2, val=val)
    assert rendered == result
Example #10
0
def test_render_list(case):
    '''List substitution joins elements with the separator in the tag.'''
    separators, repls = case
    for sep in separators:
        tpl = Template(r'${key%s}' % sep)
        assert tpl.render(key=repls) == sep.join(repls)
Example #11
0
def test_render_single(case):
    '''A single-placeholder template renders to the expected output.'''
    tpl, repl, result = case
    rendered = Template(tpl).render(_1=repl)
    assert rendered == result
Example #12
0
 def template(self):
     '''Return a Template backed by the `_backend.cu` file shipped
     alongside this module.'''
     path = os.path.join(os.path.dirname(__file__), '_backend.cu')
     return Template(path)
Example #13
0
 def gencode(self, x, y):
     '''Generate a C++ expression that multiplies the sub-kernels, each
     applied to the matching attribute of `x` and `y`.'''
     factors = []
     for key, k in self.kw_kernels.items():
         factors.append(
             k.gencode('%s.%s' % (x, key), '%s.%s' % (y, key)))
     return Template('(${expr*})').render(expr=factors)
Example #14
0
 def __repr__(self):
     '''Return an eval-style representation listing each sub-kernel by
     keyword.'''
     parts = ['{}={}'.format(kw, repr(k))
              for kw, k in self.kw_kernels.items()]
     return Template('TensorProduct(${kwexpr, })').render(kwexpr=parts)
Example #15
0
class MarginalizedGraphKernel:
    """Implements the random walk-based graph similarity kernel as proposed in:
    Kashima, H., Tsuda, K., & Inokuchi, A. (2003).
    Marginalized kernels between labeled graphs. *In Proceedings of the 20th
    international conference on machine learning (ICML-03)* (pp. 321-328).

    Parameters
    ----------
    node_kernel: base kernel or composition of base kernels
        A kernel that computes the similarity between individual nodes
    edge_kernel: base kernel or composition of base kernels
        A kernel that computes the similarity between individual edges
    kwargs: optional arguments
        p: functor or 'uniform' or 'default'
            The starting probability of the random walk on each node. Must be
            either a functor that takes in a node (a dataframe row) and returns
            a number, or the name of a built-in distribution. Currently, only
            'uniform' and 'default' are implemented.
            Note that a custom probability does not have to be normalized.
        q: float in (0, 1)
            The probability for the random walk to stop during each step.
        block_per_sm: int
            Tunes the GPU kernel.
        block_size: int
            Tunes the GPU kernel.
    """

    _template = os.path.join(os.path.dirname(__file__), 'template.cu')

    def __init__(self, node_kernel, edge_kernel, **kwargs):
        self.node_kernel = node_kernel
        self.edge_kernel = edge_kernel
        self.template = Template(self._template)
        # scratch buffers are allocated lazily in _allocate_scratch
        self.scratch = None
        self.scratch_capacity = 0
        # cache of graphs converted to OctileGraph, keyed by graph uuid
        self.graph_cache = {}

        self.p = kwargs.pop('p', 'default')
        self.q = kwargs.pop('q', 0.01)

        self.block_per_sm = kwargs.pop('block_per_sm', 8)
        self.block_size = kwargs.pop('block_size', 128)

        self.device = pycuda.driver.Device(kwargs.pop('device', 0))
        self.nvcc_extra = kwargs.pop('nvcc_extra', [])
        self.ctx = self.device.make_context()

    def __del__(self):
        # release the CUDA context created in __init__
        self.ctx.synchronize()
        self.ctx.detach()

    def _allocate_scratch(self, count, capacity):
        '''(Re)allocate per-block scratch buffers when the current pool is
        too small for the requested count/capacity.'''
        if (self.scratch is None or len(self.scratch) < count
                or self.scratch[0].capacity < capacity):
            self.ctx.synchronize()
            self.scratch = [BlockScratch(capacity) for _ in range(count)]
            self.scratch_d = umlike(
                np.array([s.state for s in self.scratch], BlockScratch.dtype))
            self.scratch_capacity = self.scratch[0].capacity
            self.ctx.synchronize()

    def clone_with_theta(self):
        """scikit-learn compatibility method"""
        pass

    def _assert_homegeneous(self, X):
        '''All graphs must share weightedness and node/edge dtypes so a
        single compiled kernel can process them.'''
        for x1, x2 in zip(X[:-1], X[1:]):
            try:
                assert (x1.weighted == x2.weighted)
                assert (x1.node_type == x2.node_type)
                assert (x1.edge_type == x2.edge_type)
            except AssertionError as e:
                raise TypeError('All graphs must be of the same type: %s' %
                                str(e))

    def _convert_to_octilegraph(self, graph):
        '''Convert a graph to the GPU-side octile format, memoized via a
        uuid tag attached to the graph object.'''
        if hasattr(graph, 'uuid') and graph.uuid in self.graph_cache:
            return self.graph_cache[graph.uuid]
        else:
            if not hasattr(graph, 'uuid'):
                graph.uuid = uuid.uuid4()
            og = OctileGraph(graph)
            self.graph_cache[graph.uuid] = og
            return og

    def _get_starting_probability(self, p):
        '''Resolve `p` into a functor mapping a node to its (unnormalized)
        starting probability.

        BUG FIX: the original inspected `self.p` regardless of the
        argument; now consistently uses the parameter. Callers pass
        `self.p`, so behavior at existing call sites is unchanged.
        '''
        if isinstance(p, str):
            if p == 'uniform' or p == 'default':
                return lambda n: 1.0
            else:
                raise ValueError(
                    'Unknown starting probability distribution %s' % p)
        else:
            return p

    def _launch_kernel(self, graphs, jobs, nodal, lmin):
        '''Compile and launch the GPU linear-system solver over the given
        list of (i, j) work items; results land in each job's buffer.'''
        if lmin != 0 and lmin != 1:
            raise ValueError('lmin must be 0 or 1')
        ''' transfer graphs to GPU '''
        oct_graphs = [self._convert_to_octilegraph(g) for g in graphs]
        self._assert_homegeneous(oct_graphs)
        oct_graphs_d = umlike(
            np.array([g.state for g in oct_graphs], OctileGraph.dtype))
        ''' upload work item list to device '''
        jobs_d = umlike(np.array([j.state for j in jobs], Job.dtype))
        i_job_global = pycuda.gpuarray.zeros(1, np.uint32)
        ''' prepare GPU kernel launch '''
        x = next(iter(oct_graphs))
        weighted = x.weighted
        node_type = x.node_type
        edge_type = x.edge_type

        # weighted graphs fold the edge weight into the edge kernel
        if weighted:
            edge_kernel = TensorProduct(weight=_Multiply(),
                                        label=self.edge_kernel)
        else:
            edge_kernel = self.edge_kernel

        node_kernel_src = Template(r'''
        struct node_kernel {
            template<class V> __device__
            static auto compute(V const &v1, V const &v2) {
                return ${node_expr};
            }
        };
        ''').render(node_expr=self.node_kernel.gencode('v1', 'v2'))

        edge_kernel_src = Template(r'''
        struct edge_kernel {
            template<class T> __device__
            static auto compute(T const &e1, T const &e2) {
                return ${edge_expr};
            }
        };
        ''').render(edge_expr=edge_kernel.gencode('e1', 'e2'))

        source = self.template.render(node_kernel=node_kernel_src,
                                      edge_kernel=edge_kernel_src,
                                      node_t=decltype(node_type),
                                      edge_t=decltype(edge_type))
        # keep the generated source around for debugging
        self.source = source

        with warnings.catch_warnings(record=True) as w:
            mod = SourceModule(source,
                               options=[
                                   '-std=c++14',
                                   '-O4',
                                   '--use_fast_math',
                                   '--expt-relaxed-constexpr',
                                   '--maxrregcount=64',
                                   '-Xptxas',
                                   '-v',
                                   '-lineinfo',
                               ] + self.nvcc_extra,
                               no_extern_c=True,
                               include_dirs=cpp.__path__)
            self.compiler_message = [str(rec.message) for rec in w]
        kernel = mod.get_function('graph_kernel_solver')

        launch_block_count = (self.device.MULTIPROCESSOR_COUNT *
                              self.block_per_sm)
        shmem_bytes_per_warp = mod.get_global('shmem_bytes_per_warp')[1]
        shmem_bytes_per_block = (shmem_bytes_per_warp * self.block_size //
                                 self.device.WARP_SIZE)

        # scratch must hold the largest pairwise linear system
        max_graph_size = np.max([g.padded_size for g in oct_graphs])
        self._allocate_scratch(launch_block_count, max_graph_size**2)

        kernel(
            oct_graphs_d.base,
            self.scratch_d.base,
            jobs_d.base,
            i_job_global,
            np.uint32(len(jobs)),
            np.float32(self.q),
            np.float32(self.q),  # placeholder for q0
            np.int32(lmin),
            grid=(launch_block_count, 1, 1),
            block=(self.block_size, 1, 1),
            shared=shmem_bytes_per_block)

    def __call__(self, X, Y=None, nodal=False, lmin=0):
        """Compute pairwise similarity matrix between graphs

        Parameters
        ----------
        X: list of N graphs
            The graphs must all have same node and edge attributes.
        Y: None or list of M graphs
            The graphs must all have same node and edge attributes.
        nodal: bool
            If True, return node-wise similarities; otherwise, return graphwise
            similarities.
        lmin: 0 or 1
            Number of steps to skip in each random walk path before similarity
            is computed.
            lmin + 1 corresponds to the starting value of l in the summation
            of Eq. 1 in Tang & de Jong, 2019 https://doi.org/10.1063/1.5078640
            (or the first unnumbered equation as in Kashima, Tsuda, and
            Inokuchi, 2003).

        Returns
        -------
        numpy.array
            if Y is None, return a square matrix containing pairwise
            similarities between the graphs in X; otherwise, returns a matrix
            containing similarities across graphs in X and Y.
        """
        ''' generate jobs '''
        if Y is None:
            # upper triangle (including diagonal) of the symmetric matrix
            jobs = [
                Job(i, i + j, umarray(len(g1.nodes) * len(g2.nodes)))
                for i, g1 in enumerate(X) for j, g2 in enumerate(X[i:])
            ]
        else:
            # full rectangular block; Y indices are offset by len(X)
            jobs = [
                Job(i,
                    len(X) + j, umarray(len(g1.nodes) * len(g2.nodes)))
                for i, g1 in enumerate(X) for j, g2 in enumerate(Y)
            ]
        ''' assign starting probabilities '''
        p_func = self._get_starting_probability(self.p)
        P = [np.array([p_func(n) for n in g.nodes.iterrows()]) for g in X]
        if Y is not None:
            P += [np.array([p_func(n) for n in g.nodes.iterrows()]) for g in Y]
        ''' call GPU kernel '''
        self._launch_kernel(X + Y if Y is not None else X, jobs, nodal, lmin)
        ''' collect result '''
        # BUG FIX: np.object was removed in NumPy 1.24; use the builtin
        # `object` dtype instead.
        if Y is None:
            N = len(X)
            R = np.empty((N, N), object)
            for job in jobs:
                r = job.vr.reshape(len(X[job.i].nodes), -1)
                pi = P[job.i]
                pj = P[job.j]
                if nodal is True:
                    R[job.i, job.j] = pi[:, None] * r * pj[None, :]
                    R[job.j, job.i] = R[job.i, job.j].T
                else:
                    R[job.i, job.j] = R[job.j, job.i] = pi.dot(r).dot(pj)
        else:
            N = len(X)
            M = len(Y)
            R = np.empty((N, M), object)
            for job in jobs:
                r = job.vr.reshape(len(X[job.i].nodes), -1)
                pi = P[job.i]
                pj = P[job.j]
                if nodal is True:
                    R[job.i, job.j - N] = pi[:, None] * r * pj[None, :]
                else:
                    R[job.i, job.j - N] = pi.dot(r).dot(pj)

        return np.block(R.tolist())

    def diag(self, X, nodal=False, lmin=0):
        """Compute the self-similarities for a list of graphs

        Parameters
        ----------
        X: list of N graphs
            The graphs must all have same node attributes and edge attributes.
        nodal: bool or 'block'
            If True, returns a vector containing nodal self similarties;
            if False, returns a vector containing graphs' overall self
            similarities; if 'block', return a list of square matrices, each
            being a pairwise nodal similarity matrix within a graph.
        lmin: 0 or 1
            Number of steps to skip in each random walk path before similarity
            is computed.
            lmin + 1 corresponds to the starting value of l in the summation
            of Eq. 1 in Tang & de Jong, 2019 https://doi.org/10.1063/1.5078640
            (or the first unnumbered equation as in Kashima, Tsuda, and
            Inokuchi, 2003).

        Returns
        -------
        numpy.array or list of np.array(s)
            If nodal=True, returns a vector containing nodal self similarties;
            if nodal=False, returns a vector containing graphs' overall self
            similarities; if nodal = 'block', return a list of square matrices,
            each being a pairwise nodal similarity matrix within a graph.
        """
        ''' generate jobs '''
        jobs = [Job(i, i, umarray(len(g1.nodes)**2)) for i, g1 in enumerate(X)]
        ''' assign starting probabilities '''
        p_func = self._get_starting_probability(self.p)
        P = [np.array([p_func(n) for n in g.nodes.iterrows()]) for g in X]
        ''' call GPU kernel '''
        self._launch_kernel(X, jobs, nodal, lmin)
        ''' collect result '''
        N = [len(x.nodes) for x in X]
        if nodal is True:
            # stride n+1 picks the diagonal of each n-by-n nodal block
            return np.concatenate(
                [p**2 * job.vr[::n + 1] for job, p, n in zip(jobs, P, N)])
        elif nodal is False:
            return np.array([
                p.dot(job.vr.reshape(n, -1)).dot(p)
                for job, p, n in zip(jobs, P, N)
            ])
        elif nodal == 'block':
            return list(p[:, None] * job.vr.reshape(n, -1) * p[None, :]
                        for job, p, n in zip(jobs, P, N))
        else:
            raise ValueError("Invalid 'nodal' option '%s'" % nodal)
Example #16
0
    def _launch_kernel(self, graphs, jobs, nodal, lmin):
        '''Compile the CUDA solver for the given graphs' attribute types
        and launch it over the list of (i, j) work items in `jobs`.

        Parameters
        ----------
        graphs: list
            Graphs to upload; must all share node/edge types.
        jobs: list of Job
            Work items; each carries its own result buffer.
        nodal: bool
            Passed through from the caller; not used in the launch itself.
        lmin: 0 or 1
            Number of random-walk steps to skip before accumulating.
        '''
        if lmin != 0 and lmin != 1:
            raise ValueError('lmin must be 0 or 1')
        ''' transfer grahs to GPU '''
        oct_graphs = [self._convert_to_octilegraph(g) for g in graphs]
        self._assert_homegeneous(oct_graphs)
        oct_graphs_d = umlike(
            np.array([g.state for g in oct_graphs], OctileGraph.dtype))
        ''' upload work item list to device '''
        jobs_d = umlike(np.array([j.state for j in jobs], Job.dtype))
        # global job counter shared by all blocks for work stealing
        i_job_global = pycuda.gpuarray.zeros(1, np.uint32)
        ''' prepare GPU kernel launch '''
        # all graphs are homogeneous, so the first one fixes the types
        x = next(iter(oct_graphs))
        weighted = x.weighted
        node_type = x.node_type
        edge_type = x.edge_type

        # weighted graphs fold the edge weight into the edge kernel
        if weighted:
            edge_kernel = TensorProduct(weight=_Multiply(),
                                        label=self.edge_kernel)
        else:
            edge_kernel = self.edge_kernel

        node_kernel_src = Template(r'''
        struct node_kernel {
            template<class V> __device__
            static auto compute(V const &v1, V const &v2) {
                return ${node_expr};
            }
        };
        ''').render(node_expr=self.node_kernel.gencode('v1', 'v2'))

        edge_kernel_src = Template(r'''
        struct edge_kernel {
            template<class T> __device__
            static auto compute(T const &e1, T const &e2) {
                return ${edge_expr};
            }
        };
        ''').render(edge_expr=edge_kernel.gencode('e1', 'e2'))

        source = self.template.render(node_kernel=node_kernel_src,
                                      edge_kernel=edge_kernel_src,
                                      node_t=decltype(node_type),
                                      edge_t=decltype(edge_type))
        # keep the generated source around for debugging
        self.source = source

        # capture nvcc warnings instead of printing them
        with warnings.catch_warnings(record=True) as w:
            mod = SourceModule(source,
                               options=[
                                   '-std=c++14',
                                   '-O4',
                                   '--use_fast_math',
                                   '--expt-relaxed-constexpr',
                                   '--maxrregcount=64',
                                   '-Xptxas',
                                   '-v',
                                   '-lineinfo',
                               ] + self.nvcc_extra,
                               no_extern_c=True,
                               include_dirs=cpp.__path__)
            self.compiler_message = [str(rec.message) for rec in w]
        kernel = mod.get_function('graph_kernel_solver')

        launch_block_count = (self.device.MULTIPROCESSOR_COUNT *
                              self.block_per_sm)
        shmem_bytes_per_warp = mod.get_global('shmem_bytes_per_warp')[1]
        shmem_bytes_per_block = (shmem_bytes_per_warp * self.block_size //
                                 self.device.WARP_SIZE)

        # scratch must hold the largest pairwise linear system
        max_graph_size = np.max([g.padded_size for g in oct_graphs])
        self._allocate_scratch(launch_block_count, max_graph_size**2)

        # print("%-32s : %ld" % ("Blocks launched", launch_block_count))
        # print("%-32s : %ld" % ("Shared memory per block",
        #                        shmem_bytes_per_block))

        kernel(
            oct_graphs_d.base,
            self.scratch_d.base,
            jobs_d.base,
            i_job_global,
            np.uint32(len(jobs)),
            np.float32(self.q),
            np.float32(self.q),  # placeholder for q0
            np.int32(lmin),
            grid=(launch_block_count, 1, 1),
            block=(self.block_size, 1, 1),
            shared=shmem_bytes_per_block)