Example #1
0
def test_op_counter_basic():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                                  dict(a=np.float32, b=np.float32,
                                       g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
    assert f64mul == n*m*n_subgroups
    assert i32add == n*m*2*n_subgroups
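
# A minimal side note (not part of the test above) illustrating the counting
# convention these statistics tests share: per-subgroup counts are multiplied
# by n_subgroups, and with group_size = 1 and a subgroup size of 32 (the SGS
# value assumed here), div_ceil rounds the subgroup count up to 1.
from pytools import div_ceil

SGS = 32                                         # assumed subgroup size
group_size = 1
subgroups_per_group = div_ceil(group_size, SGS)  # ceil(1/32) == 1
n_subgroups = 1 * subgroups_per_group            # a single workgroup
assert n_subgroups == 1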
Example #2
0
def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1):
    arg = kernel.arg_dict[variable]

    if arg.dim_tags is None:
        raise RuntimeError("cannot find padding multiple--dim_tags of '%s' "
                "are not known" % variable)

    dim_tag = arg.dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride

    if not isinstance(stride, int):
        raise RuntimeError("cannot find padding multiple--stride is not a "
                "known integer")

    from pytools import div_ceil

    multiple = 1
    while True:
        true_size = multiple * stride
        padded_size = div_ceil(true_size, align_bytes) * align_bytes

        if (padded_size - true_size) / true_size <= allowed_waste:
            return multiple

        multiple += 1
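
# A small worked example (illustrative only, reusing the loop above) of the
# waste criterion in find_padding_multiple: with a stride of 3 and an
# alignment of 16 in the same units, the first multiple whose relative
# padding waste is at most 10% is 5 (15 padded to 16 wastes 1/15, about 6.7%).
from pytools import div_ceil

stride, align_bytes, allowed_waste = 3, 16, 0.1
multiple = 1
while True:
    true_size = multiple * stride
    padded_size = div_ceil(true_size, align_bytes) * align_bytes
    if (padded_size - true_size) / true_size <= allowed_waste:
        break
    multiple += 1
assert multiple == 5 and (true_size, padded_size) == (15, 16)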
Example #3
0
def test_op_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(
                        not(k<ell-2) and k>6 or k/2==ell,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32mul == n*m*n_subgroups
    assert f64div == 2*n*m*n_subgroups  # TODO why?
    assert f64add == n*m*n_subgroups
    assert i32add == n*m*n_subgroups
Example #4
0
def test_op_counter_triangular_domain():

    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<m and i<j}",
                         """
            a[i, j] = b[i,j] * 2
            """,
                         name="bitwise",
                         assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    op_map = lp.get_op_map(knl, subgroup_size=SGS,
                           count_redundant_work=True)[lp.Op(
                               np.float64, 'mul', CG.SUBGROUP)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    if expect_fallback:
        assert flops == 144 * n_subgroups
    else:
        assert flops == 78 * n_subgroups
Example #5
0
def test_op_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == n*m*ell*n_subgroups

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32 == f32add + f32mul
Example #6
0
    def check_expansion_disks_undisturbed_by_sources(self,
            stage1_density_discr, tree, peer_lists,
            expansion_disturbance_tolerance,
            refine_flags,
            debug, wait_for=None):

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(
                tree.nlevels, MAX_LEVELS_INCREMENT)

        knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        if debug:
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        from pytential import bind, sym
        center_danger_zone_radii = flatten(
            bind(stage1_density_discr,
                sym.expansion_radii(stage1_density_discr.ambient_dim,
                    granularity=sym.GRANULARITY_CENTER))(self.array_context))

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_panel_to_center_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                center_danger_zone_radii,
                expansion_disturbance_tolerance,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxcenters),
            queue=self.queue,
            wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return found_panel_to_refine.get()[0] == 1
Example #7
0
def add_padding(kernel, variable, axis, align_bytes):
    arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)}
    arg_idx = arg_to_idx[variable]

    new_args = kernel.args[:]
    arg = new_args[arg_idx]

    if arg.dim_tags is None:
        raise RuntimeError("cannot add padding--dim_tags of '%s' "
                           "are not known" % variable)

    new_dim_tags = list(arg.dim_tags)
    dim_tag = new_dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                           "axis %d of '%s' is not tagged fixed-stride" %
                           (axis, variable))

    stride = dim_tag.stride
    if not isinstance(stride, int):
        raise RuntimeError("cannot find split granularity--stride is not a "
                           "known integer")

    from pytools import div_ceil
    new_dim_tags[axis] = FixedStrideArrayDimTag(
        div_ceil(stride, align_bytes) * align_bytes)

    new_args[arg_idx] = arg.copy(dim_tags=tuple(new_dim_tags))

    return kernel.copy(args=new_args)
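
# Illustrative arithmetic (an assumption-labeled sketch, not part of the
# transform above) for the stride rounding add_padding performs: the axis
# stride is rounded up to the next multiple of align_bytes with div_ceil.
from pytools import div_ceil

stride, align_bytes = 37, 16
padded_stride = div_ceil(stride, align_bytes) * align_bytes
assert padded_stride == 48  # ceil(37/16) == 3, and 3 * 16 == 48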
Example #8
0
    def __call__(self, queue, tree, wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(pl, event)*, where *pl* is an instance of
            :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """
        from pytools import div_ceil
        # Avoid generating too many kernels.
        max_levels = div_ceil(tree.nlevels, 10) * 10

        peer_list_finder_kernel = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)

        logger.info("peer list finder: find peer lists")

        result, evt = peer_list_finder_kernel(
                queue, tree.nboxes,
                tree.box_centers.data, tree.root_extent,
                tree.box_levels.data, tree.aligned_nboxes,
                tree.box_child_ids.data, tree.box_flags.data,
                wait_for=wait_for)

        logger.info("peer list finder: done")

        return PeerListLookup(
                tree=tree,
                peer_list_starts=result["peers"].starts,
                peer_lists=result["peers"].lists).with_queue(None), evt
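
# A minimal usage sketch based only on the docstring above. The names below
# are hypothetical: `finder` stands for an instance of the class defining
# this __call__, and `queue`/`tree` for an existing pyopencl.CommandQueue
# and boxtree.Tree.
#
#   peer_lookup, evt = finder(queue, tree)
#   evt.wait()
#   starts = peer_lookup.peer_list_starts  # per the PeerListLookup fields set above
#   lists = peer_lookup.peer_lists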
Example #9
0
def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1):
    arg = kernel.arg_dict[variable]

    if arg.dim_tags is None:
        raise RuntimeError("cannot find padding multiple--dim_tags of '%s' "
                "are not known" % variable)

    dim_tag = arg.dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride

    if not isinstance(stride, int):
        raise RuntimeError("cannot find padding multiple--stride is not a "
                "known integer")

    from pytools import div_ceil

    multiple = 1
    while True:
        true_size = multiple * stride
        padded_size = div_ceil(true_size, align_bytes) * align_bytes

        if (padded_size - true_size) / true_size <= allowed_waste:
            return multiple

        multiple += 1
Example #10
0
def add_padding(kernel, variable, axis, align_bytes):
    arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)}
    arg_idx = arg_to_idx[variable]

    new_args = kernel.args[:]
    arg = new_args[arg_idx]

    if arg.dim_tags is None:
        raise RuntimeError("cannot add padding--dim_tags of '%s' "
                "are not known" % variable)

    new_dim_tags = list(arg.dim_tags)
    dim_tag = new_dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride
    if not isinstance(stride, int):
        raise RuntimeError("cannot find split granularity--stride is not a "
                "known integer")

    from pytools import div_ceil
    new_dim_tags[axis] = FixedStrideArrayDimTag(
            div_ceil(stride, align_bytes) * align_bytes)

    new_args[arg_idx] = arg.copy(dim_tags=tuple(new_dim_tags))

    return kernel.copy(args=new_args)
Example #11
0
    def check_sufficient_source_quadrature_resolution(self,
                                                      stage2_density_discr,
                                                      tree,
                                                      peer_lists,
                                                      refine_flags,
                                                      debug,
                                                      wait_for=None):
        actx = self.array_context

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(tree.nlevels,
                                                     MAX_LEVELS_INCREMENT)

        knl = self.code_container.sufficient_source_quadrature_resolution_checker(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, tree.particle_id_dtype,
            max_levels)
        if debug:
            nelements_to_refine_prev = actx.to_numpy(
                actx.np.sum(refine_flags)).item()

        found_element_to_refine = actx.zeros(1, dtype=np.int32)
        found_element_to_refine.finish()

        from pytential import bind, sym
        dd = sym.as_dofdesc(sym.GRANULARITY_ELEMENT).to_stage2()
        source_danger_zone_radii_by_element = flatten(
            bind(
                stage2_density_discr,
                sym._source_danger_zone_radii(stage2_density_discr.ambient_dim,
                                              dofdesc=dd))(self.array_context),
            self.array_context)
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        evt = knl(*unwrap_args(
            tree, peer_lists, tree.box_to_qbx_center_starts,
            tree.box_to_qbx_center_lists, tree.qbx_element_to_source_starts,
            tree.qbx_user_source_slice.start, tree.qbx_user_center_slice.start,
            tree.sorted_target_ids, source_danger_zone_radii_by_element,
            tree.nqbxelements, refine_flags, found_element_to_refine,
            *tree.sources),
                  range=slice(tree.nqbxsources),
                  queue=actx.queue,
                  wait_for=wait_for)

        import pyopencl as cl
        cl.wait_for_events([evt])

        if debug:
            nelements_to_refine = actx.to_numpy(
                actx.np.sum(refine_flags)).item()
            if nelements_to_refine > nelements_to_refine_prev:
                logger.debug("refiner: found %d element(s) to refine",
                             nelements_to_refine - nelements_to_refine_prev)

        return actx.to_numpy(found_element_to_refine)[0] == 1
Example #12
0
    def check_expansion_disks_undisturbed_by_sources(self,
            lpot_source, tree, peer_lists,
            expansion_disturbance_tolerance,
            refine_flags,
            debug, wait_for=None):

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(
                tree.nlevels, MAX_LEVELS_INCREMENT)

        knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        if debug:
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        center_danger_zone_radii = lpot_source._expansion_radii("ncenters")

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_panel_to_center_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                center_danger_zone_radii,
                expansion_disturbance_tolerance,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxcenters),
            queue=self.queue,
            wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return found_panel_to_refine.get()[0] == 1
Example #13
0
    def check_sufficient_source_quadrature_resolution(
            self, lpot_source, tree, peer_lists, refine_flags, debug,
            wait_for=None):

        # Avoid generating too many kernels.
        from pytools import div_ceil
        max_levels = MAX_LEVELS_INCREMENT * div_ceil(
                tree.nlevels, MAX_LEVELS_INCREMENT)

        knl = self.code_container.sufficient_source_quadrature_resolution_checker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)
        if debug:
            npanels_to_refine_prev = cl.array.sum(refine_flags).get()

        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()

        from pytential import bind, sym
        source_danger_zone_radii_by_panel = bind(lpot_source,
                sym._source_danger_zone_radii(
                    lpot_source.ambient_dim,
                    dofdesc=sym.GRANULARITY_ELEMENT))(self.queue)
        unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_center_starts,
                tree.box_to_qbx_center_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                source_danger_zone_radii_by_panel,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxsources),
            queue=self.queue,
            wait_for=wait_for)

        cl.wait_for_events([evt])

        if debug:
            npanels_to_refine = cl.array.sum(refine_flags).get()
            if npanels_to_refine > npanels_to_refine_prev:
                logger.debug("refiner: found {} panel(s) to refine".format(
                    npanels_to_refine - npanels_to_refine_prev))

        return found_panel_to_refine.get()[0] == 1
Example #14
0
def test_mem_access_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    f32l = mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='a',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='b',
                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                        lid_strides={}, gid_strides={},
                        direction='store', variable='c',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32s == (n*ell)*n_workgroups*subgroups_per_group

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)
    assert ld_bytes == 4*f32l
    assert st_bytes == 4*f32s
Example #15
0
    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)
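
# A self-contained sketch (pure Python, pyopencl not required) of the final
# "round down to a power of 2" step above, assuming bitlog2(x) returns the
# index of the highest set bit of a positive integer: 2**bitlog2(x) is then
# the same as 1 << (x.bit_length() - 1).
for result, expected in [(64, 64), (100, 64), (7, 4), (1, 1)]:
    rounded_down = 1 << (result.bit_length() - 1)
    assert rounded_down == expected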
Example #16
0
def find_padding_multiple(kernel,
                          variable,
                          axis,
                          align_bytes,
                          allowed_waste=0.1):
    if isinstance(kernel, TranslationUnit):
        kernel_names = [
            i for i, clbl in kernel.callables_table.items()
            if isinstance(clbl, CallableKernel)
        ]
        if len(kernel_names) > 1:
            raise LoopyError("translation unit contains more than one "
                             "callable kernel; cannot infer which one "
                             "find_padding_multiple should operate on")
        return find_padding_multiple(kernel[kernel_names[0]], variable, axis,
                                     align_bytes, allowed_waste)
    assert isinstance(kernel, LoopKernel)

    arg = kernel.arg_dict[variable]

    if arg.dim_tags is None:
        raise RuntimeError("cannot find padding multiple--dim_tags of '%s' "
                           "are not known" % variable)

    dim_tag = arg.dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                           "axis %d of '%s' is not tagged fixed-stride" %
                           (axis, variable))

    stride = dim_tag.stride

    if not isinstance(stride, int):
        raise RuntimeError("cannot find padding multiple--stride is not a "
                           "known integer")

    from pytools import div_ceil

    multiple = 1
    while True:
        true_size = multiple * stride
        padded_size = div_ceil(true_size, align_bytes) * align_bytes

        if (padded_size - true_size) / true_size <= allowed_waste:
            return multiple

        multiple += 1
Example #17
0
def test_op_counter_specialops():

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}", [
        """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
    ],
                         name="specialops",
                         assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl,
                           subgroup_size=SGS,
                           count_redundant_work=True,
                           count_within_subscripts=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul',
                          CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div',
                          CG.SUBGROUP)].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add',
                          CG.SUBGROUP)].eval_with_dict(params)
    f64pow = op_map[lp.Op(np.float64, 'pow',
                          CG.SUBGROUP)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.dtype(np.float64), 'add',
                          CG.SUBGROUP)].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add',
                          CG.SUBGROUP)].eval_with_dict(params)
    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt',
                          CG.SUBGROUP)].eval_with_dict(params)
    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin',
                          CG.SUBGROUP)].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32div == 2 * n * m * ell * n_subgroups
    assert f32mul == f32add == n * m * ell * n_subgroups
    assert f64add == 3 * n * m * n_subgroups
    assert f64pow == i32add == f64rsq == f64sin == n * m * n_subgroups
Example #18
0
def test_mem_access_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell,
                    g[i,k]*2,
                    g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                                       direction='load')
                          ].eval_with_dict(params)
    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                       direction='load')
                          ].eval_with_dict(params)
    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                                       direction='store')
                          ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group
    assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group
    assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group
Example #19
0
def test_op_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params)
    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP)
                   ].eval_with_dict(params)
    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                      ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert i32add == n*m+n*m*ell*n_subgroups
    assert i32bw == 2*n*m*ell*n_subgroups
    assert i64bw == 2*n*m*n_subgroups
    assert i64add == i64mul == n*m*n_subgroups
    assert i64shift == 2*n*m*n_subgroups
Example #20
0
    def __call__(self, queue, tree, wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(pl, event)*, where *pl* is an instance of
            :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """
        from pytools import div_ceil

        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        max_levels = div_ceil(tree.nlevels, 10) * 10

        peer_list_finder_kernel = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)

        pl_plog = ProcessLogger(logger, "find peer lists")

        result, evt = peer_list_finder_kernel(queue,
                                              tree.nboxes,
                                              tree.box_centers.data,
                                              tree.root_extent,
                                              tree.box_levels,
                                              tree.aligned_nboxes,
                                              tree.box_child_ids.data,
                                              tree.box_flags,
                                              wait_for=wait_for)

        pl_plog.done()

        return PeerListLookup(
            tree=tree,
            peer_list_starts=result["peers"].starts,
            peer_lists=result["peers"].lists).with_queue(None), evt
Example #21
0
def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
        shape, dim_tags):

    # Just to clarify:
    #
    # - user axes are user-facing--what the user actually uses for indexing.
    #
    # - target axes are implementation facing. Normal in-memory arrays have one.
    #   3D images have three.

    import loopy as lp

    # {{{ pick apart arg dim tags into computed, fixed and vec

    vector_dim = None

    # a mapping from target axes to {layout_nesting_level: dim_tag_index}
    target_axis_to_nesting_level_map = {}

    for i, dim_tag in enumerate(dim_tags):
        if isinstance(dim_tag, VectorArrayDimTag):
            if vector_dim is not None:
                raise LoopyError("arg '%s' may only have one vector-tagged "
                        "argument dimension" % name)

            vector_dim = i

        elif isinstance(dim_tag, _StrideArrayDimTagBase):
            if dim_tag.layout_nesting_level is None:
                continue

            nl_map = target_axis_to_nesting_level_map \
                    .setdefault(dim_tag.target_axis, {})
            assert dim_tag.layout_nesting_level not in nl_map
            nl_map[dim_tag.layout_nesting_level] = i

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            pass

        else:
            raise LoopyError("invalid array dim tag")

    # }}}

    # {{{ convert computed to fixed stride dim tags

    new_dim_tags = dim_tags[:]

    for target_axis in range(num_target_axes):
        if vector_dim is None:
            stride_so_far = 1
        else:
            if shape is None or shape is lp.auto:
                # unable to normalize without known shape
                return None

            if not is_integer(shape[vector_dim]):
                raise TypeError("shape along vector axis %d of array '%s' "
                        "must be an integer, not an expression ('%s')"
                        % (vector_dim, name, shape[vector_dim]))

            stride_so_far = shape[vector_dim]
            # FIXME: OpenCL-specific
            if stride_so_far == 3:
                stride_so_far = 4

        nesting_level_map = target_axis_to_nesting_level_map.get(target_axis, {})
        nl_keys = sorted(nesting_level_map.keys())

        if not nl_keys:
            continue

        for key in nl_keys:
            dim_tag_index = nesting_level_map[key]
            dim_tag = dim_tags[dim_tag_index]

            if isinstance(dim_tag, ComputedStrideArrayDimTag):
                if stride_so_far is None:
                    raise LoopyError("unable to determine fixed stride "
                            "for axis %d because it is nested outside of "
                            "an 'auto' stride axis"
                            % dim_tag_index)

                new_dim_tags[dim_tag_index] = FixedStrideArrayDimTag(stride_so_far,
                        target_axis=dim_tag.target_axis,
                        layout_nesting_level=dim_tag.layout_nesting_level)

                if shape is None or shape is lp.auto:
                    # unable to normalize without known shape
                    return None

                shape_axis = shape[dim_tag_index]
                if shape_axis is None:
                    stride_so_far = None
                else:
                    stride_so_far *= shape_axis

                if dim_tag.pad_to is not None:
                    from pytools import div_ceil
                    stride_so_far = (
                            div_ceil(stride_so_far, dim_tag.pad_to)
                            * dim_tag.pad_to)

            elif isinstance(dim_tag, FixedStrideArrayDimTag):
                stride_so_far = dim_tag.stride

                if stride_so_far is lp.auto:
                    stride_so_far = None

            else:
                raise TypeError("internal error in dim_tag conversion")

    # }}}

    return new_dim_tags
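
# A worked example (illustrative only) of the stride bookkeeping above for a
# two-axis array of shape (4, 7) laid out innermost-first: the inner axis
# gets stride 1; without padding the running stride becomes 7, and a
# hypothetical pad_to of 8 on the inner axis rounds it up to 8 before it is
# handed to the next-outer axis.
from pytools import div_ceil

shape = (4, 7)
stride_so_far = 1
inner_stride = stride_so_far             # stride of the innermost axis
stride_so_far *= shape[1]                # running stride is now 7
padded = div_ceil(stride_so_far, 8) * 8  # pad_to == 8 rounds 7 up to 8
assert (inner_stride, stride_so_far, padded) == (1, 7, 8)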
Example #22
0
def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
                                       shape, dim_tags):

    # Just to clarify:
    #
    # - user axes are user-facing--what the user actually uses for indexing.
    #
    # - target axes are implementation facing. Normal in-memory arrays have one.
    #   3D images have three.

    import loopy as lp

    # {{{ pick apart arg dim tags into computed, fixed and vec

    vector_dim = None

    # a mapping from target axes to {layout_nesting_level: dim_tag_index}
    target_axis_to_nesting_level_map = {}

    for i, dim_tag in enumerate(dim_tags):
        if isinstance(dim_tag, VectorArrayDimTag):
            if vector_dim is not None:
                raise LoopyError("arg '%s' may only have one vector-tagged "
                                 "argument dimension" % name)

            vector_dim = i

        elif isinstance(dim_tag, _StrideArrayDimTagBase):
            if dim_tag.layout_nesting_level is None:
                continue

            nl_map = target_axis_to_nesting_level_map \
                    .setdefault(dim_tag.target_axis, {})
            assert dim_tag.layout_nesting_level not in nl_map
            nl_map[dim_tag.layout_nesting_level] = i

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            pass

        else:
            raise LoopyError("invalid array dim tag")

    # }}}

    # {{{ convert computed to fixed stride dim tags

    new_dim_tags = dim_tags[:]

    for target_axis in range(num_target_axes):
        if vector_dim is None:
            stride_so_far = 1
        else:
            if shape is None or shape is lp.auto:
                # unable to normalize without known shape
                return None

            if not is_integer(shape[vector_dim]):
                raise TypeError(
                    "shape along vector axis %d of array '%s' "
                    "must be an integer, not an expression ('%s')" %
                    (vector_dim, name, shape[vector_dim]))

            stride_so_far = shape[vector_dim]
            # FIXME: OpenCL-specific
            if stride_so_far == 3:
                stride_so_far = 4

        nesting_level_map = target_axis_to_nesting_level_map.get(
            target_axis, {})
        nl_keys = sorted(nesting_level_map.keys())

        if not nl_keys:
            continue

        for key in nl_keys:
            dim_tag_index = nesting_level_map[key]
            dim_tag = dim_tags[dim_tag_index]

            if isinstance(dim_tag, ComputedStrideArrayDimTag):
                if stride_so_far is None:
                    raise LoopyError(
                        "unable to determine fixed stride "
                        "for axis %d because it is nested outside of "
                        "an 'auto' stride axis" % dim_tag_index)

                new_dim_tags[dim_tag_index] = FixedStrideArrayDimTag(
                    stride_so_far,
                    target_axis=dim_tag.target_axis,
                    layout_nesting_level=dim_tag.layout_nesting_level)

                if shape is None or shape is lp.auto:
                    # unable to normalize without known shape
                    return None

                shape_axis = shape[dim_tag_index]
                if shape_axis is None:
                    stride_so_far = None
                else:
                    stride_so_far *= shape_axis

                if dim_tag.pad_to is not None:
                    from pytools import div_ceil
                    stride_so_far = (div_ceil(stride_so_far, dim_tag.pad_to) *
                                     dim_tag.pad_to)

            elif isinstance(dim_tag, FixedStrideArrayDimTag):
                stride_so_far = dim_tag.stride

                if stride_so_far is lp.auto:
                    stride_so_far = None

            else:
                raise TypeError("internal error in dim_tag conversion")

    # }}}

    return new_dim_tags
Example #23
0
def split_array_dim(kernel,
                    arrays_and_axes,
                    count,
                    auto_split_inames=True,
                    split_kwargs=None):
    """
    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_inames`

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in args_and_axes
    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            raise RuntimeError("split instruction '%s' not understood" % rest)

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = {
        tup[0]: normalize_rest(tup[1:])
        for tup in arrays_and_axes
    }

    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in array_to_rest.items():
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        from pytools import div_ceil

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            outer_len = div_ceil(axis_len, count)

            if order == "F":
                new_shape.insert(axis + 1, outer_len)
            elif order == "C":
                new_shape.insert(axis, outer_len)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        from loopy.kernel.array import FixedStrideArrayDimTag
        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride" %
                               (axis, array_name))

        old_stride = old_dim_tag.stride
        outer_stride = count * old_stride

        if order == "F":
            new_dim_tags.insert(axis + 1, FixedStrideArrayDimTag(outer_stride))
        elif order == "C":
            new_dim_tags.insert(axis, FixedStrideArrayDimTag(outer_stride))
        else:
            raise RuntimeError("order '%s' not understood" % order)

        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            outer_name = existing_name + "_outer"

            if order == "F":
                new_dim_names.insert(axis + 1, outer_name)
            elif order == "C":
                new_dim_names.insert(axis, outer_name)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(
            ary.copy(shape=new_shape,
                     dim_tags=new_dim_tags,
                     dim_names=new_dim_names))

    # }}}

    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx, )
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError(
                    "found access '%s' in which axis %d is not a "
                    "single variable--cannot split "
                    "(Have you tried to do the split yourself, manually, "
                    "beforehand? If so, you shouldn't.)" % (expr, axis_nr))

            split_iname = idx[axis_nr].name
            assert split_iname in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname + "_outer")
                inner_iname = var_name_gen(split_iname + "_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        if order == "F":
            idx.insert(axis + 1, outer_index)
        elif order == "C":
            idx.insert(axis, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
                                set(array_to_rest.keys()), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in split_vars.items():
            kernel = split_iname(kernel,
                                 iname,
                                 count,
                                 outer_iname=outer_iname,
                                 inner_iname=inner_iname,
                                 **split_kwargs)

    return kernel
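
# Illustrative arithmetic (not part of the transform above) for how an axis
# of length 70 splits with count = 16 in C order: the axis becomes an
# (outer, inner) pair with the inner length equal to count, the outer length
# rounded up with div_ceil, and the outer stride equal to count times the
# old stride.
from pytools import div_ceil

axis_len, count, old_stride = 70, 16, 1
outer_len = div_ceil(axis_len, count)   # ceil(70/16) == 5
new_axes = (outer_len, count)           # C order puts the outer axis first
outer_stride = count * old_stride       # 16
assert new_axes == (5, 16) and outer_stride == 16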
Example #24
0
    def mark_panels_for_refinement(self, tree, peer_lists, lpot_source,
                                   target_status, refine_flags, debug,
                                   wait_for=None):
        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.refiner_for_failed_target_association(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
        found_panel_to_refine.finish()

        # Perform a space invader query over the sources.
        source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
        sources = [
                axis.with_queue(self.queue)[source_slice] for axis in tree.sources]
        tunnel_radius_by_source = (
                lpot_source._close_target_tunnel_radius("nsources")
                .with_queue(self.queue))

        # See (TGTMARK) above for algorithm.

        box_to_search_dist, evt = self.code_container.space_invader_query()(
                self.queue,
                tree,
                sources,
                tunnel_radius_by_source,
                peer_lists,
                wait_for=wait_for)
        wait_for = [evt]

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_target_slice.start,
                tree.nqbxpanels,
                tree.sorted_target_ids,
                lpot_source._close_target_tunnel_radius("nsources"),
                target_status,
                box_to_search_dist,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

        if debug:
            refine_flags.finish()
            # Marked panel = 1, 0 otherwise
            marked_panel_count = cl.array.sum(refine_flags).get()
            logger.debug("target association: {} panels flagged for refinement"
                         .format(marked_panel_count))

        cl.wait_for_events([evt])

        return (found_panel_to_refine == 1).all().get()
Example #25
0
    def mark_targets(self, tree, peer_lists, lpot_source, target_status,
                     debug, wait_for=None):
        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.target_marker(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        found_target_close_to_panel = cl.array.zeros(self.queue, 1, np.int32)
        found_target_close_to_panel.finish()

        # Perform a space invader query over the sources.
        source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
        sources = [
                axis.with_queue(self.queue)[source_slice] for axis in tree.sources]
        tunnel_radius_by_source = (
                lpot_source._close_target_tunnel_radius("nsources")
                .with_queue(self.queue))

        # Target-marking algorithm (TGTMARK):
        #
        # (1) Use a space invader query to tag each leaf box that intersects with the
        # "near-source-detection tunnel" with the distance to the closest source.
        #
        # (2) Do an area query around all targets with the radius resulting
        # from the space invader query, enumerate sources in that vicinity.
        # If a source is found whose distance to the target is less than the
        # source's tunnel radius, mark that target as pending.
        # (or below: mark the source for refinement)

        # Note that this comment is referred to below by "TGTMARK". If you
        # remove this comment or change the algorithm here, make sure that
        # the reference below is still accurate.

        # Trade off for space-invaders vs directly tagging targets in
        # endangered boxes:
        #
        # (-) More complicated
        # (-) More actual work
        # (+) Taking the point of view of the targets could potentially lead to
        # more parallelism, if you think of the targets as unbounded while the
        # sources are fixed (which sort of makes sense, given that the number
        # of targets per box is not bounded).

        box_to_search_dist, evt = self.code_container.space_invader_query()(
                self.queue,
                tree,
                sources,
                tunnel_radius_by_source,
                peer_lists,
                wait_for=wait_for)
        wait_for = [evt]

        tunnel_radius_by_source = lpot_source._close_target_tunnel_radius("nsources")

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_target_slice.start,
                tree.sorted_target_ids,
                tunnel_radius_by_source,
                box_to_search_dist,
                target_status,
                found_target_close_to_panel,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

        if debug:
            target_status.finish()
            # Marked target = 1, 0 otherwise
            marked_target_count = cl.array.sum(target_status).get()
            logger.debug("target association: {}/{} targets marked close to panels"
                         .format(marked_target_count, tree.nqbxtargets))

        cl.wait_for_events([evt])

        return (found_target_close_to_panel == 1).all().get()
Example #26
0
def test_mem_access_counter_basic():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    f32l = mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='a',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='b',
                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
    f64l = mem_map[lp.MemAccess('global', np.float64,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='g',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64l += mem_map[lp.MemAccess('global', np.float64,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='h',
                        count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32l == (3*n*m*ell)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                        lid_strides={}, gid_strides={},
                        direction='store', variable='c',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                        lid_strides={}, gid_strides={},
                        direction='store', variable='e',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s == (n*m*ell)*n_subgroups
    assert f64s == (n*m)*n_subgroups
Example #27
0
def test_summations_and_filters():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                count_granularity=[CG.SUBGROUP]
                                ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_a == (2*n*m*ell)*n_subgroups

    global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                      count_granularity=[CG.SUBGROUP]
                                      ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
    f32lall = reduced_map[lp.MemAccess('global', np.float32, direction='load')
                          ].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                          ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    def func_filter(key):
        return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) and \
               key.direction == 'load'
    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups
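    # Illustrative cross-check: to_bytes() scales each access count by its
    # dtype's size in bytes, so the byte totals can also be recovered from the
    # grouped per-dtype load counts above.
    assert ld_bytes == 4*f32lall + 8*f64lall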
Example #28
0
    def __call__(self,
                 queue,
                 tree,
                 ball_centers,
                 ball_radii,
                 peer_lists=None,
                 wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(aq, event)*, where *aq* is an instance of
            :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        ball_id_dtype = tree.particle_id_dtype  # ?

        from pytools import div_ceil
        # Avoid generating too many kernels.
        max_levels = div_ceil(tree.nlevels, 10) * 10
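        # For example, nlevels == 17 rounds up to max_levels == 20, while
        # nlevels == 20 stays at 20.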

        if peer_lists is None:
            peer_lists, evt = self.peer_list_finder(queue,
                                                    tree,
                                                    wait_for=wait_for)
            wait_for = [evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError(
                "size of peer lists must match with number of boxes")

        area_query_kernel = self.get_area_query_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            ball_id_dtype, peer_lists.peer_list_starts.dtype, max_levels)

        aq_plog = ProcessLogger(logger, "area query")

        result, evt = area_query_kernel(queue,
                                        len(ball_radii),
                                        tree.box_centers.data,
                                        tree.root_extent,
                                        tree.box_levels,
                                        tree.aligned_nboxes,
                                        tree.box_child_ids.data,
                                        tree.box_flags,
                                        peer_lists.peer_list_starts,
                                        peer_lists.peer_lists,
                                        ball_radii,
                                        *(tuple(tree.bounding_box[0]) +
                                          tuple(bc for bc in ball_centers)),
                                        wait_for=wait_for)

        aq_plog.done()

        return AreaQueryResult(
            tree=tree,
            leaves_near_ball_starts=result["leaves"].starts,
            leaves_near_ball_lists=result["leaves"].lists).with_queue(
                None), evt
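A minimal usage sketch for the area-query call above, assuming it belongs to
boxtree.area_query.AreaQueryBuilder; the context, sizes, and random particle
and ball data below are made up for illustration and are not part of the
original example.

import numpy as np
import pyopencl as cl
import pyopencl.array
from pytools.obj_array import make_obj_array
from boxtree import TreeBuilder
from boxtree.area_query import AreaQueryBuilder

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
rng = np.random.default_rng(12)

dims, nparticles, nballs = 2, 10**4, 100

# object arrays of per-axis coordinate arrays, as the docstring requires
particles = make_obj_array(
    [cl.array.to_device(queue, rng.random(nparticles)) for _ in range(dims)])
ball_centers = make_obj_array(
    [cl.array.to_device(queue, rng.random(nballs)) for _ in range(dims)])
ball_radii = cl.array.to_device(queue, np.full(nballs, 0.05))

tree, _ = TreeBuilder(ctx)(queue, particles, max_particles_in_box=30)
aq, evt = AreaQueryBuilder(ctx)(queue, tree, ball_centers, ball_radii)

# aq.leaves_near_ball_starts / aq.leaves_near_ball_lists give, per ball,
# the leaf boxes that intersect that ball.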
Example #29
0
    def __call__(self,
                 queue,
                 tree,
                 ball_centers,
                 ball_radii,
                 peer_lists=None,
                 wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
            :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
            for dependency management. The *dtype* of *sqi* is
            *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
            *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
            The entries of *sqi* are indexed by the global box index and are
            as follows:

            * if *i* is not the index of a leaf box, *sqi[i] = 0*.
            * if *i* is the index of a leaf box, *sqi[i]* is the
              outer space invader distance for *i*.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        from pytools import div_ceil
        # Avoid generating too many kernels.
        max_levels = div_ceil(tree.nlevels, 10) * 10

        if peer_lists is None:
            peer_lists, evt = self.peer_list_finder(queue,
                                                    tree,
                                                    wait_for=wait_for)
            wait_for = [evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError(
                "size of peer lists must match with number of boxes")

        space_invader_query_kernel = self.get_space_invader_query_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, max_levels)

        si_plog = ProcessLogger(logger, "space invader query")

        outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes,
                                                   np.float32)
        if not wait_for:
            wait_for = []
        wait_for = wait_for + outer_space_invader_dists.events

        evt = space_invader_query_kernel(
            *SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
                tree, peer_lists, ball_radii, outer_space_invader_dists,
                *tuple(bc for bc in ball_centers)),
            wait_for=wait_for,
            queue=queue,
            range=slice(len(ball_radii)))

        if tree.coord_dtype != np.dtype(np.float32):
            # The kernel output is always an array of float32 due to limited
            # support for atomic operations with float64 in OpenCL.
            # Here the output is cast to match the coord dtype.
            outer_space_invader_dists.finish()
            outer_space_invader_dists = outer_space_invader_dists.astype(
                tree.coord_dtype)
            evt, = outer_space_invader_dists.events

        si_plog.done()

        return outer_space_invader_dists, evt
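A corresponding hedged sketch for the space-invader query above, reusing the
hypothetical ctx, queue, tree, ball_centers, and ball_radii from the previous
sketch and assuming the enclosing class is
boxtree.area_query.SpaceInvaderQueryBuilder.

from boxtree.area_query import SpaceInvaderQueryBuilder

box_search_dists, evt = SpaceInvaderQueryBuilder(ctx)(
        queue, tree, ball_centers, ball_radii)

# box_search_dists has shape (tree.nboxes,): zero for non-leaf boxes, and the
# outer space-invader distance for leaf boxes (see the docstring above).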
Example #30
0
def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"):
    if count == 1:
        return kernel

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    achng = ArrayChanger(kernel, array_name)
    ary = achng.get()

    from pytools import div_ceil

    # {{{ adjust shape

    new_shape = ary.shape
    if new_shape is not None:
        new_shape = list(new_shape)
        axis_len = new_shape[axis_nr]
        new_shape[axis_nr] = count
        outer_len = div_ceil(axis_len, count)

        if order == "F":
            new_shape.insert(axis_nr+1, outer_len)
        elif order == "C":
            new_shape.insert(axis_nr, outer_len)
        else:
            raise RuntimeError("order '%s' not understood" % order)
        new_shape = tuple(new_shape)

    # }}}

    # {{{ adjust dim tags

    if ary.dim_tags is None:
        raise RuntimeError("dim_tags of '%s' are not known" % array_name)
    new_dim_tags = list(ary.dim_tags)

    old_dim_tag = ary.dim_tags[axis_nr]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                % (axis_nr, array_name))

    old_stride = old_dim_tag.stride
    outer_stride = count*old_stride

    if order == "F":
        new_dim_tags.insert(axis_nr+1, FixedStrideArrayDimTag(outer_stride))
    elif order == "C":
        new_dim_tags.insert(axis_nr, FixedStrideArrayDimTag(outer_stride))
    else:
        raise RuntimeError("order '%s' not understood" % order)

    new_dim_tags = tuple(new_dim_tags)

    # }}}

    # {{{ adjust dim_names

    new_dim_names = ary.dim_names
    if new_dim_names is not None:
        new_dim_names = list(new_dim_names)
        existing_name = new_dim_names[axis_nr]
        new_dim_names[axis_nr] = existing_name + "_inner"
        outer_name = existing_name + "_outer"

        if order == "F":
            new_dim_names.insert(axis_nr+1, outer_name)
        elif order == "C":
            new_dim_names.insert(axis_nr, outer_name)
        else:
            raise RuntimeError("order '%s' not understood" % order)
        new_dim_names = tuple(new_dim_names)

    # }}}

    kernel = achng.with_changed_array(ary.copy(
        shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        from loopy.symbolic import simplify_using_aff
        inner_index = simplify_using_aff(kernel, axis_idx % count)
        outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        if order == "F":
            idx.insert(axis_nr+1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set([array_name]), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    return kernel
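The index rewrite performed by split_access_axis above comes down to simple
integer arithmetic; a plain-Python illustration with assumed values (a split
factor of 4 and an index of 11 along the split axis):

count = 4              # split factor passed to _split_array_axis_inner
axis_idx = 11          # example index along the axis being split

inner_index = axis_idx % count    # == 3
outer_index = axis_idx // count   # == 2

# With order == "C", the outer index is inserted before the inner one, so an
# access a[..., 11, ...] becomes a[..., 2, 3, ...], and an axis of length n in
# the shape becomes the pair (div_ceil(n, count), count).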
Example #31
0
    def try_find_centers(self, tree, peer_lists, lpot_source,
                         target_status, target_flags, target_assoc,
                         target_association_tolerance, debug, wait_for=None):
        # Round up level count--this gets included in the kernel as
        # a stack bound. Rounding avoids too many kernel versions.
        from pytools import div_ceil
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        knl = self.code_container.center_finder(
                tree.dimensions,
                tree.coord_dtype, tree.box_id_dtype,
                peer_lists.peer_list_starts.dtype,
                tree.particle_id_dtype,
                max_levels)

        if debug:
            target_status.finish()
            marked_target_count = int(cl.array.sum(target_status).get())

        # Perform a space invader query over the centers.
        center_slice = (
                tree.sorted_target_ids[tree.qbx_user_center_slice]
                .with_queue(self.queue))
        centers = [
                axis.with_queue(self.queue)[center_slice] for axis in tree.sources]
        expansion_radii_by_center = \
                lpot_source._expansion_radii("ncenters").with_queue(self.queue)
        expansion_radii_by_center_with_tolerance = \
                expansion_radii_by_center * (1 + target_association_tolerance)

        # Idea:
        #
        # (1) Tag leaf boxes around centers with max distance to usable center.
        # (2) Area query from targets with those radii to find closest eligible
        # center.

        box_to_search_dist, evt = self.code_container.space_invader_query()(
                self.queue,
                tree,
                centers,
                expansion_radii_by_center_with_tolerance,
                peer_lists,
                wait_for=wait_for)
        wait_for = [evt]

        min_dist_to_center = cl.array.empty(
                self.queue, tree.nqbxtargets, tree.coord_dtype)
        min_dist_to_center.fill(np.inf)

        wait_for.extend(min_dist_to_center.events)

        evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_center_starts,
                tree.box_to_qbx_center_lists,
                tree.qbx_user_center_slice.start,
                tree.qbx_user_target_slice.start,
                tree.sorted_target_ids,
                expansion_radii_by_center_with_tolerance,
                box_to_search_dist,
                target_flags,
                target_status,
                target_assoc.target_to_center,
                min_dist_to_center,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

        if debug:
            target_status.finish()
            # Associated target = 2, marked target = 1
            ntargets_associated = (
                int(cl.array.sum(target_status).get()) - marked_target_count)
            assert ntargets_associated >= 0
            logger.debug("target association: {} targets were assigned centers"
                         .format(ntargets_associated))

        cl.wait_for_events([evt])
Example #32
0
def test_mem_access_tagged_variables():
    bsize = 16
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"],
        name="matmul",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    # knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    # knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    group_size = bsize * bsize
    n_workgroups = div_ceil(n, bsize) * div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    mem_access_map = lp.get_mem_access_map(knl,
                                           count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess(
        'global',
        np.float32,
        lid_strides={0: 1},
        gid_strides={1: bsize},
        direction='load',
        variable='b',
        variable_tag='mmbload',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess(
        'global',
        np.float32,
        lid_strides={1: Variable('m')},
        gid_strides={0: Variable('m') * bsize},
        direction='load',
        variable='a',
        variable_tag='mmaload',
        count_granularity=CG.SUBGROUP)].eval_with_dict(params)

    assert f32s1lb == n * m * ell

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s1la == m * n_subgroups

    f32coal = mem_access_map[lp.MemAccess(
        'global',
        np.float32,
        lid_strides={
            0: 1,
            1: Variable('ell')
        },
        gid_strides={
            0: Variable('ell') * bsize,
            1: bsize
        },
        direction='store',
        variable='c',
        variable_tag='mmresult',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)

    assert f32coal == n * ell
Example #33
0
def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"):
    if count == 1:
        return kernel

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    achng = ArrayChanger(kernel, array_name)
    ary = achng.get()

    from pytools import div_ceil

    # {{{ adjust shape

    new_shape = ary.shape
    if new_shape is not None:
        new_shape = list(new_shape)
        axis_len = new_shape[axis_nr]
        new_shape[axis_nr] = count
        outer_len = div_ceil(axis_len, count)

        if order == "F":
            new_shape.insert(axis_nr + 1, outer_len)
        elif order == "C":
            new_shape.insert(axis_nr, outer_len)
        else:
            raise RuntimeError("order '%s' not understood" % order)
        new_shape = tuple(new_shape)

    # }}}

    # {{{ adjust dim tags

    if ary.dim_tags is None:
        raise RuntimeError("dim_tags of '%s' are not known" % array_name)
    new_dim_tags = list(ary.dim_tags)

    old_dim_tag = ary.dim_tags[axis_nr]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("axis %d of '%s' is not tagged fixed-stride" %
                           (axis_nr, array_name))

    old_stride = old_dim_tag.stride
    outer_stride = count * old_stride

    if order == "F":
        new_dim_tags.insert(axis_nr + 1, FixedStrideArrayDimTag(outer_stride))
    elif order == "C":
        new_dim_tags.insert(axis_nr, FixedStrideArrayDimTag(outer_stride))
    else:
        raise RuntimeError("order '%s' not understood" % order)

    new_dim_tags = tuple(new_dim_tags)

    # }}}

    # {{{ adjust dim_names

    new_dim_names = ary.dim_names
    if new_dim_names is not None:
        new_dim_names = list(new_dim_names)
        existing_name = new_dim_names[axis_nr]
        new_dim_names[axis_nr] = existing_name + "_inner"
        outer_name = existing_name + "_outer"

        if order == "F":
            new_dim_names.insert(axis_nr + 1, outer_name)
        elif order == "C":
            new_dim_names.insert(axis_nr, outer_name)
        else:
            raise RuntimeError("order '%s' not understood" % order)
        new_dim_names = tuple(new_dim_names)

    # }}}

    kernel = achng.with_changed_array(
        ary.copy(shape=new_shape,
                 dim_tags=new_dim_tags,
                 dim_names=new_dim_names))

    # }}}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx, )
        idx = list(idx)

        axis_idx = idx[axis_nr]

        from loopy.symbolic import simplify_using_aff
        inner_index = simplify_using_aff(kernel, axis_idx % count)
        outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        if order == "F":
            idx.insert(axis_nr + 1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context, {array_name},
                                split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    return kernel
Example #34
0
def test_mem_access_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    i32 = mem_map[lp.MemAccess('global', np.int32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='a',
                        count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='b',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='g',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                        lid_strides={}, gid_strides={},
                        direction='load', variable='h',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups

    i32 = mem_map[lp.MemAccess('global', np.int32,
                        lid_strides={}, gid_strides={},
                        direction='store', variable='c',
                        count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                        lid_strides={}, gid_strides={},
                        direction='store', variable='e',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (n*m+n*m*ell)*n_subgroups
Example #35
0
    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
                 wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            exeuction.
        :returns: a tuple *(aq, event)*, where *aq* is an instance of
            :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        ball_id_dtype = tree.particle_id_dtype  # ?

        from pytools import div_ceil
        # Avoid generating too many kernels.
        max_levels = div_ceil(tree.nlevels, 10) * 10

        if peer_lists is None:
            peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
            wait_for = [evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError("size of peer lists must match with number of boxes")

        area_query_kernel = self.get_area_query_kernel(tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
            peer_lists.peer_list_starts.dtype, max_levels)

        logger.info("area query: run area query")

        result, evt = area_query_kernel(
                queue, len(ball_radii),
                tree.box_centers.data, tree.root_extent,
                tree.box_levels.data, tree.aligned_nboxes,
                tree.box_child_ids.data, tree.box_flags.data,
                peer_lists.peer_list_starts.data,
                peer_lists.peer_lists.data, ball_radii.data,
                *(tuple(tree.bounding_box[0]) +
                  tuple(bc.data for bc in ball_centers)),
                wait_for=wait_for)

        logger.info("area query: done")

        return AreaQueryResult(
                tree=tree,
                leaves_near_ball_starts=result["leaves"].starts,
                leaves_near_ball_lists=result["leaves"].lists).with_queue(None), evt
Example #36
0
def test_mem_access_counter_mixed():
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
            ],
            name="mixed", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(
                a=np.float32, b=np.float32, g=np.float64, h=np.float64,
                x=np.float32))

    group_size_0 = 65

    knl = lp.split_iname(knl, "j", group_size_0)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = div_ceil(ell, group_size_0)
    group_size = group_size_0
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)
    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='g',
                                count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f64uniform += mem_map[lp.MemAccess('global', np.float64,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='h',
                                count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
    f32uniform = mem_map[lp.MemAccess('global', np.float32,
                                lid_strides={}, gid_strides={},
                                direction='load', variable='x',
                                count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                lid_strides={0: Variable('m')},
                                gid_strides={0: Variable('m')*group_size_0},
                                direction='load',
                                variable='a',
                                count_granularity=CG.WORKITEM)
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                                lid_strides={0: Variable('m')},
                                gid_strides={0: Variable('m')*group_size_0},
                                direction='load',
                                variable='b',
                                count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == (2*n*m)*n_subgroups
    assert f32uniform == (m*n)*n_subgroups

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == 3*n*m*ell*n_workgroups
        else:
            assert f32nonconsec == 3*n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == 3*n*m*ell

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                lid_strides={}, gid_strides={},
                                direction='store', variable='e',
                                count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                                lid_strides={0: Variable('m')},
                                gid_strides={0: Variable('m')*group_size_0},
                                direction='store',
                                variable='c',
                                count_granularity=CG.WORKITEM)
                           ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == m*n*n_subgroups

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == n*m*ell*n_workgroups
        else:
            assert f32nonconsec == n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == n*m*ell
Example #37
0
    def __call__(self, queue, tree, ball_centers, ball_radii, wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        ball_id_dtype = tree.particle_id_dtype  # ?

        from pytools import div_ceil
        max_levels = div_ceil(tree.nlevels, 10) * 10

        b2l_knl = self.get_balls_to_leaves_kernel(tree.dimensions,
                                                  tree.coord_dtype,
                                                  tree.box_id_dtype,
                                                  ball_id_dtype, max_levels,
                                                  tree.stick_out_factor)

        logger.info("leaves-to-balls lookup: prepare ball list")

        nballs = len(ball_radii)
        result, evt = b2l_knl(queue,
                              nballs,
                              tree.box_flags.data,
                              tree.box_centers.data,
                              tree.box_child_ids.data,
                              tree.box_levels.data,
                              tree.root_extent,
                              tree.aligned_nboxes,
                              ball_radii.data,
                              *tuple(bc.data for bc in ball_centers),
                              wait_for=wait_for)
        wait_for = [evt]

        logger.info("leaves-to-balls lookup: key-value sort")

        balls_near_box_starts, balls_near_box_lists, evt \
                = self.key_value_sorter(
                        queue,
                        # keys
                        result["overlapping_leaves"].lists,
                        # values
                        result["ball_numbers"].lists,
                        tree.nboxes, starts_dtype=tree.box_id_dtype,
                        wait_for=wait_for)

        logger.info("leaves-to-balls lookup: built")

        return LeavesToBallsLookup(
            tree=tree,
            balls_near_box_starts=balls_near_box_starts,
            balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
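A hedged usage sketch for the leaves-to-balls lookup above, again reusing the
hypothetical ctx, queue, tree, ball_centers, and ball_radii from the earlier
area-query sketch and assuming the enclosing class is
boxtree.area_query.LeavesToBallsLookupBuilder.

from boxtree.area_query import LeavesToBallsLookupBuilder

lbl, evt = LeavesToBallsLookupBuilder(ctx)(queue, tree, ball_centers, ball_radii)

# For a leaf box ibox, the overlapping balls are
#   lbl.balls_near_box_lists[lbl.balls_near_box_starts[ibox]:
#                            lbl.balls_near_box_starts[ibox + 1]]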
Example #38
0
    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
                 wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
            :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
            for dependency management. The *dtype* of *sqi* is
            *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
            *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
            The entries of *sqi* are indexed by the global box index and are
            as follows:

            * if *i* is not the index of a leaf box, *sqi[i] = 0*.
            * if *i* is the index of a leaf box, *sqi[i]* is the
              outer space invader distance for *i*.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        from pytools import div_ceil
        # Avoid generating too many kernels.
        max_levels = div_ceil(tree.nlevels, 10) * 10

        if peer_lists is None:
            peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
            wait_for = [evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError("size of peer lists must match with number of boxes")

        space_invader_query_kernel = self.get_space_invader_query_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, max_levels)

        logger.info("space invader query: run space invader query")

        outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32)
        if not wait_for:
            wait_for = []
        wait_for = wait_for + outer_space_invader_dists.events

        evt = space_invader_query_kernel(
                *SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
                    tree, peer_lists,
                    ball_radii,
                    outer_space_invader_dists,
                    *tuple(bc for bc in ball_centers)),
                wait_for=wait_for,
                queue=queue,
                range=slice(len(ball_radii)))

        if tree.coord_dtype != np.dtype(np.float32):
            # The kernel output is always an array of float32 due to limited
            # support for atomic operations with float64 in OpenCL.
            # Here the output is cast to match the coord dtype.
            outer_space_invader_dists.finish()
            outer_space_invader_dists = outer_space_invader_dists.astype(
                    tree.coord_dtype)
            evt, = outer_space_invader_dists.events

        logger.info("space invader query: done")

        return outer_space_invader_dists, evt
Example #39
0
def test_all_counters_parallel_matmul():
    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    f32mul = op_map[
                        lp.Op(np.float32, 'mul', CG.SUBGROUP)
                        ].eval_with_dict(params)
    f32add = op_map[
                        lp.Op(np.float32, 'add', CG.SUBGROUP)
                        ].eval_with_dict(params)
    i32ops = op_map[
                        lp.Op(np.int32, 'add', CG.SUBGROUP)
                        ].eval_with_dict(params)
    i32ops += op_map[
                        lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
                        ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*n_subgroups

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={1: bsize},
                             direction='load', variable='b',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('m')},
                             gid_strides={0: Variable('m')*bsize},
                             direction='load',
                             variable='a', count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
                             direction='store', variable='c',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32coal == n*ell

    local_mem_map = lp.get_mem_access_map(knl,
                        count_redundant_work=True,
                        subgroup_size=SGS).filter_by(mtype=['local'])

    local_mem_l = local_mem_map.filter_by(direction=['load']
                                          ).eval_and_sum(params)
    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*n_subgroups

    local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                               direction='load',
                                               lid_strides={1: 16},
                                               gid_strides={},
                                               variable='a_fetch',
                                               count_granularity=CG.SUBGROUP)
                                  ].eval_with_dict(params)
    local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                               direction='load',
                                               lid_strides={0: 1},
                                               gid_strides={},
                                               variable='b_fetch',
                                               count_granularity=CG.SUBGROUP)
                                  ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*n_subgroups

    local_mem_s = local_mem_map.filter_by(direction=['store']
                                          ).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/bsize*n_subgroups
Example #40
0
    def __call__(self, queue, particles, max_particles_in_box,
            allocator=None, debug=False, targets=None,
            source_radii=None, target_radii=None, stick_out_factor=0.25,
            wait_for=None, non_adaptive=False,
            **kwargs):
        """
        :arg queue: a :class:`pyopencl.CommandQueue` instance
        :arg particles: an object array of (XYZ) point coordinate arrays.
        :arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
            If ``None``, *particles* act as targets, too.
            Must have the same (inner) dtype as *particles*.
        :arg source_radii: If not *None*, a :class:`pyopencl.array.Array` of the
            same dtype as *particles*.

            If this is given, *targets* must also be given, i.e. sources and
            targets must be separate. See :ref:`extent`.

        :arg target_radii: Like *source_radii*, but for targets.
        :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            exeuction.
        :arg non_adaptive: If *True*, return a tree in which all leaf boxes are
            on the same (last) level. The tree is pruned, in the sense that empty
            boxes have been eliminated.
        :arg kwargs: Used internally for debugging.

        :returns: a tuple ``(tree, event)``, where *tree* is an instance of
            :class:`Tree`, and *event* is a :class:`pyopencl.Event` for dependency
            management.
        """

        # {{{ input processing

        # we'll modify this below, so copy it
        if wait_for is None:
            wait_for = []
        else:
            wait_for = list(wait_for)

        dimensions = len(particles)

        from boxtree.tools import AXIS_NAMES
        axis_names = AXIS_NAMES[:dimensions]

        sources_are_targets = targets is None
        sources_have_extent = source_radii is not None
        targets_have_extent = target_radii is not None
        srcntgts_have_extent = sources_have_extent or targets_have_extent

        if srcntgts_have_extent and targets is None:
            raise ValueError("must specify targets when specifying "
                    "any kind of radii")

        from pytools import single_valued
        particle_id_dtype = np.int32
        box_id_dtype = np.int32
        coord_dtype = single_valued(coord.dtype for coord in particles)

        if targets is None:
            nsrcntgts = single_valued(len(coord) for coord in particles)
        else:
            nsources = single_valued(len(coord) for coord in particles)
            ntargets = single_valued(len(coord) for coord in targets)
            nsrcntgts = nsources + ntargets

        if source_radii is not None:
            if source_radii.shape != (nsources,):
                raise ValueError("source_radii has an invalid shape")

            if source_radii.dtype != coord_dtype:
                raise TypeError("dtypes of coordinate arrays and "
                        "source_radii must agree")

        if target_radii is not None:
            if target_radii.shape != (ntargets,):
                raise ValueError("target_radii has an invalid shape")

            if target_radii.dtype != coord_dtype:
                raise TypeError("dtypes of coordinate arrays and "
                        "target_radii must agree")

        # }}}

        empty = partial(cl.array.empty, queue, allocator=allocator)

        def zeros(shape, dtype):
            result = (cl.array.empty(queue, shape, dtype, allocator=allocator)
                    .fill(0, wait_for=wait_for))
            event, = result.events
            return result, event

        knl_info = self.get_kernel_info(dimensions, coord_dtype,
                particle_id_dtype, box_id_dtype,
                sources_are_targets, srcntgts_have_extent,
                stick_out_factor, adaptive=not non_adaptive)

        # {{{ combine sources and targets into one array, if necessary

        prep_events = []

        if targets is None:
            # Targets weren't specified. Sources are also targets. Let's
            # call them "srcntgts".

            srcntgts = particles

            assert source_radii is None
            assert target_radii is None

            srcntgt_radii = None

        else:
            # Here, we mash sources and targets into one array to give us one
            # big array of "srcntgts". In this case, a "srcntgt" is either a
            # source or a target, but not really both, as above. How will we be
            # able to tell which it was? Easy: We'll compare its 'user' id with
            # nsources. If it's >=, it's a target, otherwise it's a source.
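            # For instance (hypothetical numbers): with nsources == 1000, user
            # srcntgt id 250 refers to the 250th source, while user srcntgt id
            # 1250 refers to target number 1250 - nsources == 250.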

            target_coord_dtype = single_valued(tgt_i.dtype for tgt_i in targets)

            if target_coord_dtype != coord_dtype:
                raise TypeError("sources and targets must have same coordinate "
                        "dtype")

            def combine_srcntgt_arrays(ary1, ary2=None):
                if ary2 is None:
                    dtype = ary1.dtype
                else:
                    dtype = ary2.dtype

                result = empty(nsrcntgts, dtype)
                if (ary1 is None) or (ary2 is None):
                    result.fill(0)

                if ary1 is not None and ary1.nbytes:
                    result[:len(ary1)] = ary1

                if ary2 is not None and ary2.nbytes:
                    result[nsources:] = ary2

                return result

            from pytools.obj_array import make_obj_array
            srcntgts = make_obj_array([
                combine_srcntgt_arrays(src_i, tgt_i)
                for src_i, tgt_i in zip(particles, targets)
                ])

            if srcntgts_have_extent:
                srcntgt_radii = combine_srcntgt_arrays(source_radii, target_radii)
            else:
                srcntgt_radii = None

        del source_radii
        del target_radii

        del particles

        user_srcntgt_ids = cl.array.arange(queue, nsrcntgts, dtype=particle_id_dtype,
                allocator=allocator)

        evt, = user_srcntgt_ids.events
        wait_for.append(evt)
        del evt

        # }}}

        # {{{ find and process bounding box

        bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
        bbox = bbox.get()

        root_extent = max(
                bbox["max_"+ax] - bbox["min_"+ax]
                for ax in axis_names) * (1+1e-4)

        # make bbox square and slightly larger at the top, to ensure scaled
        # coordinates are always < 1
        bbox_min = np.empty(dimensions, coord_dtype)
        for i, ax in enumerate(axis_names):
            bbox_min[i] = bbox["min_"+ax]

        bbox_max = bbox_min + root_extent
        for i, ax in enumerate(axis_names):
            bbox["max_"+ax] = bbox_max[i]

        # }}}

        from pytools import div_ceil

        # {{{ allocate data

        logger.debug("allocating memory")

        # box-local morton bin counts for each particle at the current level
        # only valid from scan -> split'n'sort
        morton_bin_counts = empty(nsrcntgts, dtype=knl_info.morton_bin_count_dtype)

        # (local) morton nrs for each particle at the current level
        # only valid from scan -> split'n'sort
        morton_nrs = empty(nsrcntgts, dtype=self.morton_nr_dtype)

        # 0/1 segment flags
        # invariant to sorting once set
        # (particles are only reordered within a box)
        # valid throughout computation
        box_start_flags, evt = zeros(nsrcntgts, dtype=np.int8)
        prep_events.append(evt)
        srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
        prep_events.append(evt)
        split_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
        prep_events.append(evt)

        # number of boxes total, and a guess
        nboxes_dev = empty((), dtype=box_id_dtype)
        nboxes_dev.fill(1)

        # /!\ If you're allocating an array here that depends on nboxes_guess,
        # you *must* also write reallocation code down below for the case when
        # nboxes_guess was too low.

        # Feeding nboxes_guess in from the outside is solely for debugging
        # purposes, to test the reallocation code.
        nboxes_guess = kwargs.get("nboxes_guess")
        if nboxes_guess is None:
            nboxes_guess = div_ceil(nsrcntgts, max_particles_in_box) * 2**dimensions
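            # (Roughly: enough leaf boxes to hold every particle at capacity,
            # times the 2**dimensions branching factor as headroom.)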

        # per-box morton bin counts
        box_morton_bin_counts = empty(nboxes_guess,
                dtype=knl_info.morton_bin_count_dtype)

        # particle# at which each box starts
        box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
        prep_events.append(evt)

        # pointer to parent box
        box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
        prep_events.append(evt)

        # morton nr identifying the {quadr,oct}ant of the parent in which
        # this box was created
        box_morton_nrs, evt = zeros(nboxes_guess, dtype=self.morton_nr_dtype)
        prep_events.append(evt)

        # box -> level map
        box_levels, evt = zeros(nboxes_guess, self.box_level_dtype)
        prep_events.append(evt)

        # number of particles in each box
        # needs to be globally initialized because empty boxes never get touched
        box_srcntgt_counts_cumul, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
        prep_events.append(evt)

        # Initialize box 0 to contain all particles
        evt = box_srcntgt_counts_cumul[0].fill(
                nsrcntgts, queue=queue, wait_for=[evt])

        # set parent of root box to itself
        evt = cl.enqueue_copy(
                queue, box_parent_ids.data, np.zeros((), dtype=box_parent_ids.dtype))
        prep_events.append(evt)

        # }}}

        def fin_debug(s):
            if debug:
                queue.finish()

            logger.debug(s)

        from pytools.obj_array import make_obj_array
        have_oversize_split_box, evt = zeros((), np.int32)
        prep_events.append(evt)

        wait_for = prep_events

        # {{{ level loop

        # Level 0 starts at 0 and always contains box 0 and nothing else.
        # Level 1 therefore starts at 1.
        level_start_box_nrs = [0, 1]

        from time import time
        start_time = time()
        if nsrcntgts > max_particles_in_box:
            level = 1
        else:
            level = 0

        # INVARIANTS -- Upon entry to this loop:
        #
        # - level is the level being built.
        # - the last entry of level_start_box_nrs is the beginning of the level
        #   to be built

        # This while condition prevents entering the loop in case there's just a
        # single box, by how 'level' is set above. Read this as 'while True' with
        # an edge case.

        logger.debug("entering level loop with %s srcntgts" % nsrcntgts)

        while level:
            if debug:
                # More invariants:
                assert level == len(level_start_box_nrs) - 1

            if level > np.iinfo(self.box_level_dtype).max:
                raise RuntimeError("level count exceeded maximum")

            common_args = ((morton_bin_counts, morton_nrs,
                    box_start_flags, srcntgt_box_ids, split_box_ids,
                    box_morton_bin_counts,
                    box_srcntgt_starts, box_srcntgt_counts_cumul,
                    box_parent_ids, box_morton_nrs,
                    nboxes_dev,
                    level, max_particles_in_box, bbox,
                    user_srcntgt_ids)
                    + tuple(srcntgts)
                    + ((srcntgt_radii,) if srcntgts_have_extent else ())
                    )

            fin_debug("morton count scan")

            # writes: box_morton_bin_counts, morton_nrs
            evt = knl_info.morton_count_scan(
                    *common_args, queue=queue, size=nsrcntgts,
                    wait_for=wait_for)
            wait_for = [evt]

            fin_debug("split box id scan")

            # writes: nboxes_dev, split_box_ids
            evt = knl_info.split_box_id_scan(
                    srcntgt_box_ids,
                    box_srcntgt_starts,
                    box_srcntgt_counts_cumul,
                    max_particles_in_box,
                    box_morton_bin_counts,
                    box_levels,
                    level,

                    # input/output:
                    nboxes_dev,

                    # output:
                    split_box_ids,
                    queue=queue, size=nsrcntgts, wait_for=wait_for)
            wait_for = [evt]

            nboxes_new = int(nboxes_dev.get())

            # Assumption: Everything between here and the top of the loop must
            # be repeatable, so that in an out-of-memory situation, we can just
            # rerun this bit of the code after reallocating and a minimal reset
            # procedure.

            # {{{ reallocate and retry if nboxes_guess was too small

            if nboxes_new > nboxes_guess:
                fin_debug("starting nboxes_guess increase")

                while nboxes_guess < nboxes_new:
                    nboxes_guess *= 2
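                # (e.g. nboxes_guess=256 and nboxes_new=600 doubles the guess
                # twice, to 1024)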

                from boxtree.tools import realloc_array
                my_realloc = partial(realloc_array, new_shape=nboxes_guess,
                        zero_fill=False, queue=queue, wait_for=wait_for)
                my_realloc_zeros = partial(realloc_array, new_shape=nboxes_guess,
                        zero_fill=True, queue=queue, wait_for=wait_for)

                resize_events = []
                box_morton_bin_counts, evt = my_realloc(box_morton_bin_counts)
                resize_events.append(evt)

                box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts)
                resize_events.append(evt)
                box_parent_ids, evt = my_realloc_zeros(box_parent_ids)
                resize_events.append(evt)
                box_morton_nrs, evt = my_realloc_zeros(box_morton_nrs)
                resize_events.append(evt)
                box_levels, evt = my_realloc_zeros(box_levels)
                resize_events.append(evt)
                box_srcntgt_counts_cumul, evt = \
                        my_realloc_zeros(box_srcntgt_counts_cumul)
                resize_events.append(evt)

                del my_realloc
                del my_realloc_zeros

                # reset nboxes_dev to previous value
                nboxes_dev.fill(level_start_box_nrs[-1])
                resize_events.extend(nboxes_dev.events)

                wait_for = resize_events

                # retry
                logger.info("nboxes_guess exceeded: "
                        "enlarged allocations, restarting level")

                continue

            # }}}

            logger.info("LEVEL %d -> %d boxes" % (level, nboxes_new))

            assert level_start_box_nrs[-1] != nboxes_new or srcntgts_have_extent

            if level_start_box_nrs[-1] == nboxes_new:
                # We haven't created new boxes in this level loop trip.  Unless
                # srcntgts have extent, this should never happen.  (I.e., we
                # should've never entered this loop trip.)
                #
                # If srcntgts have extent, this can happen if boxes were
                # in-principle overfull, but couldn't subdivide because of
                # extent restrictions.

                assert srcntgts_have_extent

                level -= 1

                logger.debug("no new boxes created this loop trip")
                break

            level_start_box_nrs.append(nboxes_new)
            del nboxes_new

            new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
            new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
            split_and_sort_args = (
                    common_args
                    + (new_user_srcntgt_ids, have_oversize_split_box,
                        new_srcntgt_box_ids, box_levels))

            fin_debug("split and sort")

            evt = knl_info.split_and_sort_kernel(*split_and_sort_args,
                    wait_for=wait_for)
            wait_for = [evt]

            if debug:
                level_bl_chunk = box_levels.get()[
                        level_start_box_nrs[-2]:level_start_box_nrs[-1]]
                assert ((level_bl_chunk == level) | (level_bl_chunk == 0)).all()
                del level_bl_chunk

            if debug:
                assert (box_srcntgt_starts.get() < nsrcntgts).all()

            user_srcntgt_ids = new_user_srcntgt_ids
            del new_user_srcntgt_ids
            srcntgt_box_ids = new_srcntgt_box_ids
            del new_srcntgt_box_ids

            if not int(have_oversize_split_box.get()):
                logger.debug("no overfull boxes left")
                break

            level += 1

            have_oversize_split_box.fill(0)

        end_time = time()
        elapsed = end_time-start_time
        npasses = level+1
        logger.info("elapsed time: %g s (%g s/particle/pass)" % (
                elapsed, elapsed/(npasses*nsrcntgts)))
        del npasses

        nboxes = int(nboxes_dev.get())

        # }}}

        # {{{ extract number of non-child srcntgts from box morton counts

        if srcntgts_have_extent:
            box_srcntgt_counts_nonchild = empty(nboxes, particle_id_dtype)
            fin_debug("extract non-child srcntgt count")

            assert len(level_start_box_nrs) >= 2
            highest_possibly_split_box_nr = level_start_box_nrs[-2]
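            # Only box ids below this value can have been split; boxes on the
            # deepest level built were never subdivided.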

            evt = knl_info.extract_nonchild_srcntgt_count_kernel(
                    # input
                    box_morton_bin_counts,
                    box_srcntgt_counts_cumul,
                    highest_possibly_split_box_nr,

                    # output
                    box_srcntgt_counts_nonchild,

                    range=slice(nboxes), wait_for=wait_for)
            wait_for = [evt]

            del highest_possibly_split_box_nr

            if debug:
                assert (box_srcntgt_counts_nonchild.get()
                        <= box_srcntgt_counts_cumul.get()[:nboxes]).all()

        # }}}

        del morton_nrs
        del box_morton_bin_counts

        # {{{ prune empty leaf boxes

        is_pruned = not kwargs.get("skip_prune")
        if is_pruned:

            # What is the original index of this box?
            from_box_id = empty(nboxes, box_id_dtype)

            # Where should I put this box?
            to_box_id = empty(nboxes, box_id_dtype)

            fin_debug("find prune indices")

            nboxes_post_prune_dev = empty((), dtype=box_id_dtype)
            evt = knl_info.find_prune_indices_kernel(
                    box_srcntgt_counts_cumul,
                    to_box_id, from_box_id, nboxes_post_prune_dev,
                    size=nboxes, wait_for=wait_for)
            wait_for = [evt]

            fin_debug("prune copy")

            nboxes_post_prune = int(nboxes_post_prune_dev.get())

            logger.info("%d empty leaves" % (nboxes-nboxes_post_prune))

            prune_events = []

            prune_empty = partial(self.gappy_copy_and_map,
                    queue, allocator, nboxes_post_prune, from_box_id)

            box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts)
            prune_events.append(evt)

            box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul)
            prune_events.append(evt)

            if debug:
                assert (box_srcntgt_counts_cumul.get() > 0).all()

            srcntgt_box_ids = cl.array.take(to_box_id, srcntgt_box_ids)

            box_parent_ids, evt = prune_empty(box_parent_ids, map_values=to_box_id)
            prune_events.append(evt)
            box_morton_nrs, evt = prune_empty(box_morton_nrs)
            prune_events.append(evt)
            box_levels, evt = prune_empty(box_levels)
            prune_events.append(evt)
            if srcntgts_have_extent:
                box_srcntgt_counts_nonchild, evt = prune_empty(
                        box_srcntgt_counts_nonchild)
                prune_events.append(evt)

            # Remap level_start_box_nrs to new box IDs.
            # FIXME: It would be better to do this on the device.
            level_start_box_nrs = list(
                    to_box_id.get()
                    [np.array(level_start_box_nrs[:-1], box_id_dtype)])
            level_start_box_nrs = level_start_box_nrs + [nboxes_post_prune]
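            # (e.g. if the old level starts [0, 5, 23] map to boxes [0, 3, 17]
            # after pruning, the new list becomes [0, 3, 17, nboxes_post_prune])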

            wait_for = prune_events
        else:
            logger.info("skipping empty-leaf pruning")
            nboxes_post_prune = nboxes

        level_start_box_nrs = np.array(level_start_box_nrs, box_id_dtype)

        # }}}

        del nboxes

        # {{{ compute source/target particle indices and counts in each box

        if targets is None:
            from boxtree.tools import reverse_index_array
            user_source_ids = user_srcntgt_ids
            sorted_target_ids = reverse_index_array(user_srcntgt_ids)
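            # sorted_target_ids should be the inverse permutation of
            # user_srcntgt_ids, i.e. sorted_target_ids[user_srcntgt_ids[i]] == i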

            box_source_starts = box_target_starts = box_srcntgt_starts
            box_source_counts_cumul = box_target_counts_cumul = \
                    box_srcntgt_counts_cumul
            if srcntgts_have_extent:
                box_source_counts_nonchild = box_target_counts_nonchild = \
                        box_srcntgt_counts_nonchild
        else:
            source_numbers = empty(nsrcntgts, particle_id_dtype)

            fin_debug("source counter")
            evt = knl_info.source_counter(user_srcntgt_ids, nsources,
                    source_numbers, queue=queue, allocator=allocator,
                    wait_for=wait_for)
            wait_for = [evt]

            user_source_ids = empty(nsources, particle_id_dtype)
            # srcntgt_target_ids is temporary until particle permutation is done
            srcntgt_target_ids = empty(ntargets, particle_id_dtype)
            sorted_target_ids = empty(ntargets, particle_id_dtype)

            # need to use zeros because parent boxes won't be initialized
            box_source_starts, evt = zeros(nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)
            box_source_counts_cumul, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)
            box_target_starts, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)
            box_target_counts_cumul, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)

            if srcntgts_have_extent:
                box_source_counts_nonchild, evt = zeros(
                        nboxes_post_prune, particle_id_dtype)
                wait_for.append(evt)
                box_target_counts_nonchild, evt = zeros(
                        nboxes_post_prune, particle_id_dtype)
                wait_for.append(evt)

            fin_debug("source and target index finder")
            evt = knl_info.source_and_target_index_finder(*(
                # input:
                (
                    user_srcntgt_ids, nsources, srcntgt_box_ids,
                    box_parent_ids,

                    box_srcntgt_starts, box_srcntgt_counts_cumul,
                    source_numbers,
                )
                + ((box_srcntgt_counts_nonchild,)
                    if srcntgts_have_extent else ())

                # output:
                + (
                    user_source_ids, srcntgt_target_ids, sorted_target_ids,
                    box_source_starts, box_source_counts_cumul,
                    box_target_starts, box_target_counts_cumul,
                    )
                + ((
                    box_source_counts_nonchild,
                    box_target_counts_nonchild,
                    ) if srcntgts_have_extent else ())
                ),
                queue=queue, range=slice(nsrcntgts),
                wait_for=wait_for)
            wait_for = [evt]

            if srcntgts_have_extent:
                if debug:
                    assert (
                            box_srcntgt_counts_nonchild.get()
                            ==
                            (box_source_counts_nonchild
                            + box_target_counts_nonchild).get()).all()

            if debug:
                usi_host = user_source_ids.get()
                assert (usi_host < nsources).all()
                assert (0 <= usi_host).all()
                del usi_host

                sti_host = srcntgt_target_ids.get()
                assert (sti_host < nsources+ntargets).all()
                assert (nsources <= sti_host).all()
                del sti_host

                assert (box_source_counts_cumul.get()
                        + box_target_counts_cumul.get()
                        == box_srcntgt_counts_cumul.get()).all()

            del source_numbers

        del box_srcntgt_starts
        if srcntgts_have_extent:
            del box_srcntgt_counts_nonchild

        # }}}

        # {{{ permute and source/target-split (if necessary) particle array

        if targets is None:
            sources = targets = make_obj_array([
                cl.array.empty_like(pt) for pt in srcntgts])

            fin_debug("srcntgt permuter (particles)")
            evt = knl_info.srcntgt_permuter(
                    user_srcntgt_ids,
                    *(tuple(srcntgts) + tuple(sources)),
                    wait_for=wait_for)
            wait_for = [evt]

            assert srcntgt_radii is None

        else:
            sources = make_obj_array([
                empty(nsources, coord_dtype) for i in range(dimensions)])
            fin_debug("srcntgt permuter (sources)")
            evt = knl_info.srcntgt_permuter(
                    user_source_ids,
                    *(tuple(srcntgts) + tuple(sources)),
                    queue=queue, range=slice(nsources),
                    wait_for=wait_for)
            wait_for = [evt]

            targets = make_obj_array([
                empty(ntargets, coord_dtype) for i in range(dimensions)])
            fin_debug("srcntgt permuter (targets)")
            evt = knl_info.srcntgt_permuter(
                    srcntgt_target_ids,
                    *(tuple(srcntgts) + tuple(targets)),
                    queue=queue, range=slice(ntargets),
                    wait_for=wait_for)
            wait_for = [evt]

            if srcntgt_radii is not None:
                fin_debug("srcntgt permuter (source radii)")
                source_radii = cl.array.take(
                        srcntgt_radii, user_source_ids, queue=queue,
                        wait_for=wait_for)

                fin_debug("srcntgt permuter (target radii)")
                target_radii = cl.array.take(
                        srcntgt_radii, srcntgt_target_ids, queue=queue,
                        wait_for=wait_for)

                wait_for = source_radii.events + target_radii.events

            del srcntgt_target_ids

        del srcntgt_radii

        # }}}

        del srcntgts

        nlevels = len(level_start_box_nrs) - 1
        assert level + 1 == nlevels, (level+1, nlevels)
        if debug:
            max_level = np.max(box_levels.get())

            assert max_level + 1 == nlevels

        # {{{ compute box info

        # A number of arrays below are nominally 2-dimensional and stored with
        # the box index as the fastest-moving index. To make sure that accesses
        # remain aligned, we round up the number of boxes used for indexing.
        aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32
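        # (e.g. nboxes_post_prune=1000 rounds up to aligned_nboxes=1024)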

        box_child_ids, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
        wait_for.append(evt)
        box_centers = empty((dimensions, aligned_nboxes), coord_dtype)

        from boxtree.tree import box_flags_enum
        box_flags = empty(nboxes_post_prune, box_flags_enum.dtype)

        if not srcntgts_have_extent:
            # If srcntgts_have_extent, then non-child counts have already been
            # computed, and we have nothing to do here. But if not, then
            # we must fill these non-child counts. This amounts to copying
            # the cumulative counts and setting them to zero for non-leaves.

            # {{{ make sure box_{source,target}_counts_nonchild are not defined
            # (before we overwrite them)

            try:
                box_source_counts_nonchild
            except NameError:
                pass
            else:
                assert False

            try:
                box_target_counts_nonchild
            except NameError:
                pass
            else:
                assert False

            # }}}

            box_source_counts_nonchild, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)

            if sources_are_targets:
                box_target_counts_nonchild = box_source_counts_nonchild
            else:
                box_target_counts_nonchild, evt = zeros(
                        nboxes_post_prune, particle_id_dtype)
                wait_for.append(evt)

        fin_debug("compute box info")
        evt = knl_info.box_info_kernel(
                *(
                    # input:
                    box_parent_ids, box_morton_nrs, bbox, aligned_nboxes,

                    box_srcntgt_counts_cumul,
                    box_source_counts_cumul, box_target_counts_cumul,
                    max_particles_in_box,
                    box_levels, nlevels,

                    # output if srcntgts_have_extent, input+output otherwise
                    box_source_counts_nonchild, box_target_counts_nonchild,

                    # output:
                    box_child_ids, box_centers, box_flags,
                ),
                range=slice(nboxes_post_prune),
                wait_for=wait_for)

        # }}}

        # {{{ build output

        extra_tree_attrs = {}

        if sources_have_extent:
            extra_tree_attrs.update(source_radii=source_radii)
        if targets_have_extent:
            extra_tree_attrs.update(target_radii=target_radii)

        logger.info("tree build complete")

        return Tree(
                # If you change this, also change the documentation
                # of what's in the tree, above.

                sources_are_targets=sources_are_targets,
                sources_have_extent=sources_have_extent,
                targets_have_extent=targets_have_extent,

                particle_id_dtype=knl_info.particle_id_dtype,
                box_id_dtype=knl_info.box_id_dtype,
                coord_dtype=coord_dtype,
                box_level_dtype=self.box_level_dtype,

                root_extent=root_extent,
                stick_out_factor=stick_out_factor,

                bounding_box=(bbox_min, bbox_max),
                level_start_box_nrs=level_start_box_nrs,
                level_start_box_nrs_dev=cl.array.to_device(
                    queue, level_start_box_nrs,
                    allocator=allocator),

                sources=sources,
                targets=targets,

                box_source_starts=box_source_starts,
                box_source_counts_nonchild=box_source_counts_nonchild,
                box_source_counts_cumul=box_source_counts_cumul,
                box_target_starts=box_target_starts,
                box_target_counts_nonchild=box_target_counts_nonchild,
                box_target_counts_cumul=box_target_counts_cumul,

                box_parent_ids=box_parent_ids,
                box_child_ids=box_child_ids,
                box_centers=box_centers,
                box_levels=box_levels,
                box_flags=box_flags,

                user_source_ids=user_source_ids,
                sorted_target_ids=sorted_target_ids,

                _is_pruned=is_pruned,

                **extra_tree_attrs
                ).with_queue(None), evt
Example #41
0
    def __call__(self, queue, tree, wait_for=None, debug=False):
        """
        :arg queue: A :class:`pyopencl.CommandQueue` instance.
        :arg tree: A :class:`boxtree.Tree` instance.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :return: A tuple *(trav, event)*, where *trav* is a new instance of
            :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """

        if not tree._is_pruned:
            raise ValueError("tree must be pruned for traversal generation")

        # Generated code shouldn't depend on the *exact* number of tree levels.
        # So round up to the next multiple of 5.
        from pytools import div_ceil
        max_levels = div_ceil(tree.nlevels, 5) * 5
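        # (e.g. a 7-level tree gets max_levels=10)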

        knl_info = self.get_kernel_info(
                tree.dimensions, tree.particle_id_dtype, tree.box_id_dtype,
                tree.coord_dtype, tree.box_level_dtype, max_levels,
                tree.sources_are_targets,
                tree.sources_have_extent, tree.targets_have_extent,
                tree.stick_out_factor)

        def fin_debug(s):
            if debug:
                queue.finish()

            logger.debug(s)

        logger.info("start building traversal")

        # {{{ source boxes, their parents, and target boxes

        fin_debug("building list of source boxes, their parents, and target boxes")

        result, evt = knl_info.sources_parents_and_targets_builder(
                queue, tree.nboxes, tree.box_flags.data, wait_for=wait_for)
        wait_for = [evt]

        source_parent_boxes = result["source_parent_boxes"].lists
        source_boxes = result["source_boxes"].lists
        target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists

        if not tree.sources_are_targets:
            target_boxes = result["target_boxes"].lists
        else:
            target_boxes = source_boxes

        # }}}

        # {{{ figure out level starts in *_parent_boxes

        def extract_level_start_box_nrs(box_list, wait_for):
            result = cl.array.empty(queue,
                    tree.nlevels+1, tree.box_id_dtype) \
                            .fill(len(box_list))
            evt = knl_info.level_start_box_nrs_extractor(
                    tree.level_start_box_nrs_dev,
                    tree.box_levels,
                    box_list,
                    result,
                    range=slice(1, len(box_list)),
                    queue=queue, wait_for=wait_for)

            result = result.get()

            # We skipped box 0 above. Setting result[0] = 0 is always correct,
            # whether box 0 (=level 0) is a leaf or a parent.
            result[0] = 0

            # Postprocess result for unoccupied levels
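            # The sweep below runs from the deepest level toward the root: an
            # unoccupied level (still holding the fill value len(box_list)) is
            # clamped to the next deeper level's start, giving it an empty range.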
            prev_start = len(box_list)
            for ilev in range(tree.nlevels-1, -1, -1):
                result[ilev] = prev_start = \
                        min(result[ilev], prev_start)

            return result, evt

        fin_debug("finding level starts in source parent boxes array")
        level_start_source_parent_box_nrs, evt_s = \
                extract_level_start_box_nrs(
                        source_parent_boxes, wait_for=wait_for)

        fin_debug("finding level starts in target or target parent boxes array")
        level_start_target_or_target_parent_box_nrs, evt_t = \
                extract_level_start_box_nrs(
                        target_or_target_parent_boxes, wait_for=wait_for)

        wait_for = [evt_s, evt_t]

        # }}}

        # {{{ colleagues

        fin_debug("finding colleagues")

        result, evt = knl_info.colleagues_builder(
                queue, tree.nboxes,
                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
                wait_for=wait_for)
        wait_for = [evt]
        colleagues = result["colleagues"]

        # }}}

        # {{{ neighbor source boxes ("list 1")

        fin_debug("finding neighbor source boxes ('list 1')")

        result, evt = knl_info.neighbor_source_boxes_builder(
                queue, len(target_boxes),
                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
                target_boxes.data, wait_for=wait_for)

        wait_for = [evt]
        neighbor_source_boxes = result["neighbor_source_boxes"]

        # }}}

        # {{{ well-separated siblings ("list 2")

        fin_debug("finding well-separated siblings ('list 2')")

        result, evt = knl_info.sep_siblings_builder(
                queue, len(target_or_target_parent_boxes),
                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
                target_or_target_parent_boxes.data, tree.box_parent_ids.data,
                colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
        wait_for = [evt]
        sep_siblings = result["sep_siblings"]

        # }}}

        # {{{ separated smaller ("list 3")

        fin_debug("finding separated smaller ('list 3')")

        result, evt = knl_info.sep_smaller_builder(
                queue, len(target_boxes),
                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
                target_boxes.data,
                colleagues.starts.data, colleagues.lists.data,
                wait_for=wait_for)
        wait_for = [evt]
        sep_smaller = result["sep_smaller"]

        if tree.sources_have_extent or tree.targets_have_extent:
            sep_close_smaller_starts = result["sep_close_smaller"].starts
            sep_close_smaller_lists = result["sep_close_smaller"].lists
        else:
            sep_close_smaller_starts = None
            sep_close_smaller_lists = None

        # }}}

        # {{{ separated bigger ("list 4")

        fin_debug("finding separated bigger ('list 4')")

        result, evt = knl_info.sep_bigger_builder(
                queue, len(target_or_target_parent_boxes),
                tree.box_centers.data, tree.root_extent, tree.box_levels.data,
                tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
                target_or_target_parent_boxes.data, tree.box_parent_ids.data,
                colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
        wait_for = [evt]
        sep_bigger = result["sep_bigger"]

        if tree.sources_have_extent or tree.targets_have_extent:
            sep_close_bigger_starts = result["sep_close_bigger"].starts
            sep_close_bigger_lists = result["sep_close_bigger"].lists
        else:
            sep_close_bigger_starts = None
            sep_close_bigger_lists = None

        # }}}

        evt, = wait_for

        logger.info("traversal built")

        return FMMTraversalInfo(
                tree=tree,

                source_boxes=source_boxes,
                target_boxes=target_boxes,

                source_parent_boxes=source_parent_boxes,
                level_start_source_parent_box_nrs=level_start_source_parent_box_nrs,

                target_or_target_parent_boxes=target_or_target_parent_boxes,
                level_start_target_or_target_parent_box_nrs=(
                    level_start_target_or_target_parent_box_nrs),

                colleagues_starts=colleagues.starts,
                colleagues_lists=colleagues.lists,

                neighbor_source_boxes_starts=neighbor_source_boxes.starts,
                neighbor_source_boxes_lists=neighbor_source_boxes.lists,

                sep_siblings_starts=sep_siblings.starts,
                sep_siblings_lists=sep_siblings.lists,

                sep_smaller_starts=sep_smaller.starts,
                sep_smaller_lists=sep_smaller.lists,

                sep_close_smaller_starts=sep_close_smaller_starts,
                sep_close_smaller_lists=sep_close_smaller_lists,

                sep_bigger_starts=sep_bigger.starts,
                sep_bigger_lists=sep_bigger.lists,

                sep_close_bigger_starts=sep_close_bigger_starts,
                sep_close_bigger_lists=sep_close_bigger_lists,
                ).with_queue(None), evt
Example #42
0
    def __call__(self, queue, tree, ball_centers, ball_radii, wait_for=None):
        """
        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        ball_id_dtype = tree.particle_id_dtype  # ?

        from pytools import div_ceil
        max_levels = div_ceil(tree.nlevels, 10) * 10

        b2l_knl = self.get_balls_to_leaves_kernel(
                tree.dimensions, tree.coord_dtype,
                tree.box_id_dtype, ball_id_dtype,
                max_levels, tree.stick_out_factor)

        logger.info("leaves-to-balls lookup: prepare ball list")

        nballs = len(ball_radii)
        result, evt = b2l_knl(
                queue, nballs,
                tree.box_flags.data, tree.box_centers.data,
                tree.box_child_ids.data, tree.box_levels.data,
                tree.root_extent, tree.aligned_nboxes,
                ball_radii.data, *tuple(bc.data for bc in ball_centers),
                wait_for=wait_for)
        wait_for = [evt]

        logger.info("leaves-to-balls lookup: key-value sort")

        balls_near_box_starts, balls_near_box_lists, evt \
                = self.key_value_sorter(
                        queue,
                        # keys
                        result["overlapping_leaves"].lists,
                        # values
                        result["ball_numbers"].lists,
                        tree.nboxes, starts_dtype=tree.box_id_dtype,
                        wait_for=wait_for)

        logger.info("leaves-to-balls lookup: built")

        return LeavesToBallsLookup(
                tree=tree,
                balls_near_box_starts=balls_near_box_starts,
                balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
Example #43
0
def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """
    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_inames`

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in args_and_axes
    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            raise RuntimeError("split instruction '%s' not understood" % rest)

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = dict(
            (tup[0], normalize_rest(tup[1:])) for tup in arrays_and_axes)

    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in six.iteritems(array_to_rest):
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        from pytools import div_ceil

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            outer_len = div_ceil(axis_len, count)
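            # (e.g. axis_len=100, count=16 -> outer_len=7; the last outer slab
            # is only partially used)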

            if order == "F":
                new_shape.insert(axis+1, outer_len)
            elif order == "C":
                new_shape.insert(axis, outer_len)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        from loopy.kernel.array import FixedStrideArrayDimTag
        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        old_stride = old_dim_tag.stride
        outer_stride = count*old_stride
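        # (e.g. with old_stride=1 and count=4, the inner axis keeps stride 1
        # and the new outer axis gets stride 4)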

        if order == "F":
            new_dim_tags.insert(axis+1, FixedStrideArrayDimTag(outer_stride))
        elif order == "C":
            new_dim_tags.insert(axis, FixedStrideArrayDimTag(outer_stride))
        else:
            raise RuntimeError("order '%s' not understood" % order)

        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            outer_name = existing_name + "_outer"

            if order == "F":
                new_dim_names.insert(axis+1, outer_name)
            elif order == "C":
                new_dim_names.insert(axis, outer_name)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(ary.copy(
            shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError("found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            split_iname = idx[axis_nr].name
            assert split_iname in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname+"_outer")
                inner_iname = var_name_gen(split_iname+"_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)
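            # (e.g. with count=4, an index expression i becomes i % 4 for the
            # inner axis and i // 4 for the outer axis)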

        idx[axis_nr] = inner_index

        if order == "F":
            idx.insert(axis+1, outer_index)
        elif order == "C":
            idx.insert(axis, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set(six.iterkeys(array_to_rest)), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in six.iteritems(split_vars):
            kernel = split_iname(kernel, iname, count,
                    outer_iname=outer_iname, inner_iname=inner_iname,
                    **split_kwargs)

    return kernel
Example #44
0
def test_mem_access_counter_specialops():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    f32 = mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='a',
                        count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f32 += mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='load', variable='b',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                        lid_strides={}, gid_strides={},
                        direction='load', variable='g',
                        count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                        lid_strides={}, gid_strides={},
                        direction='load', variable='h',
                        count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (2*n*m*ell)*n_subgroups
    assert f64 == (2*n*m)*n_subgroups

    f32 = mem_map[lp.MemAccess('global', np.float32,
                        lid_strides={}, gid_strides={},
                        direction='store', variable='c',
                        count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.float64,
                        lid_strides={}, gid_strides={},
                        direction='store', variable='e',
                        count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (n*m*ell)*n_subgroups
    assert f64 == (n*m)*n_subgroups

    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                         count_granularity=CG.SUBGROUP)
    tot = filtered_map.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert tot == (n*m*ell + n*m)*n_subgroups