def vol_empty():
    """Allocate an uninitialized GPU vector covering whole macroblocks.

    Rounds the total dof count up to a multiple of the plan's
    macroblock size so kernels may safely read/write full macroblocks.
    """
    # NOTE(review): `given`, `self`, and `discr` are free names here --
    # presumably captured from an enclosing scope; confirm at the call site.
    from hedge.backends.cuda.tools import int_ceiling
    dofs = int_ceiling(
            given.total_dofs(), self.plan.dofs_per_macroblock())

    # fix: the original fragment used `gpuarray` without importing it
    # (compare the otherwise-identical closure at Example #5 in this file)
    import pycuda.gpuarray as gpuarray
    return gpuarray.empty((dofs,), dtype=given.float_type,
            allocator=discr.pool.allocate)
Example #2
0
    def _find_microblock_size(self, allow_microblocking):
        """Pick a microblock layout (alignment, element count, padded floats).

        Without microblocking, a microblock is a single element padded to
        the device's alignment.  With microblocking, try successively
        larger aligned chunks until the padding overhead drops to 5% or
        less, and return the first (smallest) such layout.

        :raises RuntimeError: if no layout under the overhead bound is
            found within 255 alignment chunks.
        """
        from hedge.backends.cuda.tools import int_ceiling
        align_size = self.devdata.align_words(self.float_size())

        from pytools import Record

        class MicroblockInfo(Record):
            pass

        if not allow_microblocking:
            return MicroblockInfo(
                    align_size=align_size,
                    elements=1,
                    aligned_floats=int_ceiling(self.dofs_per_el(), align_size)
                    )

        for mb_align_chunks in range(1, 256):
            mb_aligned_floats = align_size * mb_align_chunks
            mb_elements = mb_aligned_floats // self.dofs_per_el()
            mb_floats = self.dofs_per_el()*mb_elements
            overhead = (mb_aligned_floats-mb_floats)/mb_aligned_floats
            # accept the first chunk count wasting at most 5% as padding
            if overhead <= 0.05:
                return MicroblockInfo(
                        align_size=align_size,
                        elements=mb_elements,
                        aligned_floats=mb_aligned_floats,
                        )

        # fix: `assert False` is stripped under `python -O`; use a real
        # exception so the failure cannot silently pass
        raise RuntimeError("a valid microblock size was not found")
Example #3
0
    def __init__(self, discr, plan):
        """Store the discretization and plan, then size the 1D launch grid."""
        self.discr = discr
        self.plan = plan

        from hedge.backends.cuda.tools import int_ceiling

        # one grid block per macroblock's worth of dofs
        macroblock_count = int_ceiling(
                self.plan.given.total_dofs()
                / self.plan.dofs_per_macroblock())
        self.grid = (macroblock_count, 1)
Example #4
0
    def max_elements_touched_by_segment(self):
        """Upper bound on the number of elements one segment can overlap."""
        from hedge.backends.cuda.tools import int_ceiling

        el_dofs = self.given.dofs_per_el()
        if el_dofs > self.segment_size:
            # a segment is smaller than one element: it can straddle
            # at most two of them
            return 2
        return int_ceiling(self.segment_size / el_dofs) + 1
Example #5
0
        def vol_empty():
            """Allocate an uninitialized GPU vector rounded up to whole
            macroblocks."""
            from hedge.backends.cuda.tools import int_ceiling
            padded_dofs = int_ceiling(
                    given.total_dofs(), self.plan.dofs_per_macroblock())

            import pycuda.gpuarray as gpuarray
            return gpuarray.empty(
                    (padded_dofs,), dtype=given.float_type,
                    allocator=discr.pool.allocate)
Example #6
0
    def threads_per_face(self):
        """Thread count for one face's dofs, possibly padded to the
        shared-memory granularity."""
        dofs = self.dofs_per_face
        granularity = self.given.devdata.smem_granularity

        # pad only when the remainder is at least half a granularity unit
        if dofs % granularity < granularity // 2:
            return dofs

        from hedge.backends.cuda.tools import int_ceiling
        return int_ceiling(dofs, granularity)
Example #7
0
    def __init__(self, discr, plan):
        """Remember the discretization/plan pair and derive the CUDA grid."""
        self.discr = discr
        self.plan = plan

        from hedge.backends.cuda.tools import int_ceiling

        total = self.plan.given.total_dofs()
        per_macroblock = self.plan.dofs_per_macroblock()
        self.grid = (int_ceiling(total / per_macroblock), 1)
Example #8
0
    def matmul_preimage_shape(self, matmul_plan):
        """Return the 1-tuple shape of the preimage vector, padded up to a
        whole number of macroblocks."""
        from hedge.backends.cuda.tools import int_ceiling

        raw_dofs = (self.block_count
                * self.microblocks_per_block
                * matmul_plan.aligned_preimage_dofs_per_microblock)
        padded_dofs = int_ceiling(
                raw_dofs, matmul_plan.preimage_dofs_per_macroblock())

        return (padded_dofs,)
Example #9
0
    def threads_per_face(self):
        """Number of threads assigned to one face.

        Rounds the per-face dof count up to the shared-memory granularity
        when the leftover is at least half a granularity unit; otherwise
        uses the raw dof count.
        """
        granularity = self.given.devdata.smem_granularity
        num_dofs = self.dofs_per_face

        if num_dofs % granularity >= granularity // 2:
            from hedge.backends.cuda.tools import int_ceiling
            return int_ceiling(num_dofs, granularity)

        return num_dofs
Example #10
0
    def __init__(self, discr, plan):
        """Keep references to *discr* and *plan*; compute the 2D launch grid."""
        self.discr = discr
        self.plan = plan

        from hedge.backends.cuda.tools import int_ceiling

        macroblock_count = int_ceiling(
                self.plan.given.total_dofs()
                / plan.dofs_per_macroblock())
        self.grid = (plan.segments_per_microblock(), macroblock_count)
Example #11
0
    def __init__(self, discr, plan, with_index_check):
        """Record constructor arguments and derive the kernel launch grid."""
        self.discr = discr
        self.plan = plan
        self.with_index_check = with_index_check

        from hedge.backends.cuda.tools import int_ceiling
        dof_total = self.plan.given.total_dofs()
        self.grid = (plan.segments_per_microblock(),
                     int_ceiling(dof_total / plan.dofs_per_macroblock()))
Example #12
0
    def __init__(self, discr, plan):
        """Stash *discr* and *plan*; set up a 1D grid over image dofs."""
        self.discr = discr
        self.plan = plan

        from hedge.backends.cuda.tools import int_ceiling

        image_dofs = (plan.microblock_count
                * plan.aligned_image_dofs_per_microblock)
        block_count = int_ceiling(
                image_dofs / self.plan.image_dofs_per_macroblock())
        self.grid = (block_count, 1)
Example #13
0
    def __init__(self, discr, plan):
        """Save *discr* and *plan* and precompute the two-dimensional grid."""
        self.discr = discr
        self.plan = plan

        from hedge.backends.cuda.tools import int_ceiling

        given = self.plan.given
        macroblocks = int_ceiling(
                given.total_dofs() / plan.dofs_per_macroblock())
        self.grid = (plan.segments_per_microblock(), macroblocks)
Example #14
0
    def benchmark(self):
        """Time the flux-gather kernel on fabricated data.

        Returns the average time per kernel launch, or None when the
        launch grid would be too large or the launch itself fails.
        """
        discr = self.discr
        given = self.plan.given

        from hedge.backends.cuda.tools import int_ceiling
        block_count = int_ceiling(
                len(discr.mesh.elements)/self.plan.elements_per_block())
        # one output array per flux expression being gathered
        all_fluxes_on_faces = [gpuarray.empty(
            (block_count * self.plan.microblocks_per_block()
                * given.aligned_face_dofs_per_microblock(),),
                dtype=given.float_type,
                allocator=discr.pool.allocate)
                for i in range(len(self.fluxes))]

        # scratch input field -- contents are irrelevant for timing
        field = gpuarray.empty(
                (self.plan.input_dofs_per_block() * block_count,),
                dtype=given.float_type,
                allocator=discr.pool.allocate)

        fdata = self.fake_flux_face_data_block(block_count)
        ilist_data = self.fake_index_list_data()

        block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                for_benchmark=True)

        # bind the same scratch field to every dependency's texture reference
        for dep_expr in self.all_deps:
            field.bind_to_texref_ext(texref_map[dep_expr],
                    allow_double_hack=True)

        # fewer timing iterations when fast benchmarking is requested
        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        for i in range(count):
            # presumably guards CUDA's 2**16-1 grid-dimension limit --
            # grids this large are reported as un-benchmarkable
            if block_count >= 2**16:
                return None

            try:
                gather.prepared_call(
                        (block_count, 1), block,
                        0,
                        fdata.device_memory,
                        *tuple(fof.gpudata for fof in all_fluxes_on_faces)
                        )
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        # NOTE(review): time_since() presumably reports milliseconds
        # (pycuda convention), so 1e-3/count converts the total to
        # seconds per launch -- confirm against the pycuda docs.
        return 1e-3/count * stop.time_since(start)
Example #15
0
    def benchmark(self):
        """Time the flux-gather kernel on fabricated data.

        Returns the average time per kernel launch, or None when the
        launch grid would be too large or the launch itself fails.
        """
        discr = self.discr
        given = self.plan.given

        from hedge.backends.cuda.tools import int_ceiling
        block_count = int_ceiling(
            len(discr.mesh.elements) / self.plan.elements_per_block())
        # one output array per flux expression being gathered
        all_fluxes_on_faces = [
            gpuarray.empty((block_count * self.plan.microblocks_per_block() *
                            given.aligned_face_dofs_per_microblock(), ),
                           dtype=given.float_type,
                           allocator=discr.pool.allocate)
            for i in range(len(self.fluxes))
        ]

        # scratch input field -- contents are irrelevant for timing
        field = gpuarray.empty(
            (self.plan.input_dofs_per_block() * block_count, ),
            dtype=given.float_type,
            allocator=discr.pool.allocate)

        fdata = self.fake_flux_face_data_block(block_count)
        ilist_data = self.fake_index_list_data()

        block, gather, texref_map = self.get_kernel(fdata,
                                                    ilist_data,
                                                    for_benchmark=True)

        # bind the same scratch field to every dependency's texture reference
        for dep_expr in self.all_deps:
            field.bind_to_texref_ext(texref_map[dep_expr],
                                     allow_double_hack=True)

        # fewer timing iterations when fast benchmarking is requested
        if "cuda_fastbench" in discr.debug:
            count = 1
        else:
            count = 20

        start = cuda.Event()
        start.record()
        for i in range(count):
            # presumably guards CUDA's 2**16-1 grid-dimension limit --
            # grids this large are reported as un-benchmarkable
            if block_count >= 2**16:
                return None

            try:
                gather.prepared_call(
                    (block_count, 1), block, 0, fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces))
            except cuda.LaunchError:
                return None

        stop = cuda.Event()
        stop.record()
        stop.synchronize()

        # NOTE(review): time_since() presumably reports milliseconds
        # (pycuda convention), so 1e-3 / count converts the total to
        # seconds per launch -- confirm against the pycuda docs.
        return 1e-3 / count * stop.time_since(start)
    def __init__(self, discr, plan, with_index_check):
        """Store arguments and compute the (segments, macroblocks) grid."""
        self.discr = discr
        self.plan = plan
        self.with_index_check = with_index_check

        from hedge.backends.cuda.tools import int_ceiling
        macroblocks = int_ceiling(
                self.plan.given.total_dofs() / plan.dofs_per_macroblock())
        self.grid = (plan.segments_per_microblock(), macroblocks)
Example #17
0
 def segments_per_microblock(self):
     """Number of segments needed to cover one microblock's floats."""
     from hedge.backends.cuda.tools import int_ceiling
     mb_floats = self.given.microblock.aligned_floats
     return int_ceiling(mb_floats / self.segment_size)