Esempio n. 1
0
    def prepare_matrix(self, matrix):
        """Copy *matrix* to the device, replicated and cut into row segments.

        The matrix is stacked vertically ``given.microblock.elements`` times,
        zero-filled on the right up to ``self.plan.gpu_matrix_columns()``
        columns when it is narrower than that, and sliced into groups of
        ``self.plan.segment_size`` rows.  Each slice is converted to a
        C-ordered array of ``given.float_type`` and the slices are handed to
        ``pad_and_join`` with a block size of
        ``block_floats * given.float_size()``.

        :arg matrix: array whose shape must equal
            ``(given.dofs_per_el(), self.plan.preimage_dofs_per_el)``.
        :returns: the result of ``cuda.to_device`` on the joined segments.
        """
        plan = self.plan
        given = plan.given

        assert matrix.shape == (given.dofs_per_el(),
                                plan.preimage_dofs_per_el)

        columns = plan.gpu_matrix_columns()
        block_floats = plan.gpu_matrix_block_floats()

        # One copy of the matrix per element of the microblock, row-wise.
        stacked = numpy.vstack(given.microblock.elements * (matrix, ))

        # Widen with zero columns if the plan asks for more columns than
        # the matrix provides.
        missing_columns = columns - stacked.shape[1]
        if missing_columns > 0:
            zero_fill = numpy.zeros((stacked.shape[0], missing_columns))
            stacked = numpy.hstack((stacked, zero_fill))

        segment_size = plan.segment_size
        total_rows = given.microblock.elements * given.dofs_per_el()

        # NOTE(review): ``buffer`` is the Python 2 builtin; a Python 3 port
        # would need a replacement that pad_and_join accepts -- confirm.
        segments = [
            buffer(
                numpy.asarray(stacked[start:start + segment_size],
                              dtype=given.float_type,
                              order="C"))
            for start in range(0, total_rows, segment_size)
        ]

        from hedge.backends.cuda.tools import pad_and_join

        return cuda.to_device(
            pad_and_join(segments, block_floats * given.float_size()))
Esempio n. 2
0
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields=None):
    """Pack per-block data into equally sized superblocks and upload them.

    :arg devdata: object whose ``align`` method is applied to the common
        superblock length to obtain the final superblock size.
    :arg struct_name: name given to the generated :class:`cgen.Struct`.
    :arg single_item: ``[([block1, block2, ...], decl), ...]`` -- every part
        supplies one string per superblock.  All parts must list the same
        number of blocks, and the blocks of one part must share one length
        (both enforced through ``single_valued`` / ``assert``).
    :arg multi_item: ``[([[item1, item2, ...], ...], decl), ...]`` -- every
        part supplies a list of strings per superblock.  The strings are
        concatenated and passed through ``pad`` with the length of that
        part's longest concatenation.
    :arg extra_fields: optional mapping of additional keyword arguments
        forwarded to the returned record.  ``None`` means no extra fields.
    :returns: a ``SuperblockedDataStructure`` record with the fields
        ``struct``, ``device_memory``, ``block_bytes`` and ``data``, plus
        everything in *extra_fields*.
    """
    from hedge.backends.cuda.tools import pad_and_join
    from pytools import single_valued
    from cgen import Struct, ArrayOf

    # A ``{}`` default would be one dict shared by every call.
    if extra_fields is None:
        extra_fields = {}

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(items) for items in part_data]
            for part_data, _decl in multi_item]
    block_sizes = [
            max(len(block) for block in part_blocks)
            for part_blocks in multi_blocks]

    block_count = single_valued(
            len(part_blocks) for part_blocks, _decl in single_item)

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # Called only for its check that all blocks have the same length.
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        # The array member is sized by the largest item count of any block.
        struct_members.append(
                ArrayOf(part_decl, max(len(items) for items in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        pieces = [part_data[superblock_num]
                for part_data, _decl in single_item]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            pieces.append(pad(part_blocks[superblock_num], part_size))

        superblocks.append("".join(pieces))

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
Esempio n. 3
0
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields=None):
    """Pack per-block data into equally sized superblocks and upload them.

    :arg devdata: object whose ``align`` method is applied to the common
        superblock length to obtain the final superblock size.
    :arg struct_name: name given to the generated :class:`cgen.Struct`.
    :arg single_item: ``[([block1, block2, ...], decl), ...]`` -- every part
        supplies one string per superblock.  All parts must list the same
        number of blocks, and the blocks of one part must share one length
        (both enforced through ``single_valued`` / ``assert``).
    :arg multi_item: ``[([[item1, item2, ...], ...], decl), ...]`` -- every
        part supplies a list of strings per superblock.  The strings are
        concatenated and passed through ``pad`` with the length of that
        part's longest concatenation.
    :arg extra_fields: optional mapping of additional keyword arguments
        forwarded to the returned record.  ``None`` means no extra fields.
    :returns: a ``SuperblockedDataStructure`` record with the fields
        ``struct``, ``device_memory``, ``block_bytes`` and ``data``, plus
        everything in *extra_fields*.
    """
    from hedge.backends.cuda.tools import pad_and_join
    from pytools import single_valued
    from cgen import Struct, ArrayOf

    # A ``{}`` default would be one dict shared by every call.
    if extra_fields is None:
        extra_fields = {}

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(items) for items in part_data]
            for part_data, _decl in multi_item]
    block_sizes = [
            max(len(block) for block in part_blocks)
            for part_blocks in multi_blocks]

    block_count = single_valued(
            len(part_blocks) for part_blocks, _decl in single_item)

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # Called only for its check that all blocks have the same length.
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        # The array member is sized by the largest item count of any block.
        struct_members.append(
                ArrayOf(part_decl, max(len(items) for items in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        pieces = [part_data[superblock_num]
                for part_data, _decl in single_item]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            pieces.append(pad(part_blocks[superblock_num], part_size))

        superblocks.append("".join(pieces))

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
Esempio n. 4
0
    def gpu_diffmats(self, diff_op_cls, elgroup):
        """Assemble the differentiation matrices of *elgroup* for the device.

        Every matrix from ``diff_op_cls.matrices(elgroup)`` is stacked
        vertically ``given.microblock.elements`` times.  The stacked matrices
        are then cut into groups of ``self.plan.segment_size`` rows; within
        each group the slices of all matrices are placed side by side, with
        one zero column appended when the plain column count would be even.

        :arg diff_op_cls: object providing ``matrices(elgroup)``.
        :arg elgroup: passed through to ``diff_op_cls.matrices``.
        :returns: a ``GPUDifferentiationMatrices`` record with the fields
            ``device_memory``, ``block_floats`` and ``matrix_columns``.
        """
        from pytools import Record, single_valued
        from hedge.backends.cuda.tools import pad_and_join

        discr = self.discr
        given = self.plan.given

        # avoid smem fetch bank conflicts by ensuring odd col count
        unpadded_columns = given.dofs_per_el() * discr.dimensions
        additional_columns = 1 if unpadded_columns % 2 == 0 else 0
        columns = unpadded_columns + additional_columns

        block_floats = given.devdata.align_dtype(
            columns * self.plan.segment_size, given.float_size())

        # One copy of each matrix per element of the microblock, row-wise.
        stacked = []
        for mat in diff_op_cls.matrices(elgroup):
            stacked.append(numpy.vstack(given.microblock.elements * (mat, )))

        total_rows = given.microblock.elements * given.dofs_per_el()

        segments = []
        for row_begin in range(0, total_rows, self.plan.segment_size):
            row_end = row_begin + self.plan.segment_size
            pieces = [mat[row_begin:row_end] for mat in stacked]

            # The zero block needs the row count shared by all slices.
            row_count = single_valued(piece.shape[0] for piece in pieces)
            pieces.append(numpy.zeros((row_count, additional_columns)))

            segment = numpy.asarray(numpy.hstack(pieces),
                                    dtype=given.float_type,
                                    order="C")
            segments.append(buffer(segment))

        class GPUDifferentiationMatrices(Record):
            pass

        device_memory = cuda.to_device(
            pad_and_join(segments, block_floats * given.float_size()))

        return GPUDifferentiationMatrices(device_memory=device_memory,
                                          block_floats=block_floats,
                                          matrix_columns=columns)
Esempio n. 5
0
def make_blocks(devdata, data):
    """Join the items of each block and upload the blocks to the device.

    :arg devdata: object whose ``align`` method is applied to the length of
        the longest joined block to obtain the block size.
    :arg data: sequence of blocks, each a sequence of strings.
    :returns: a ``BlockedDataStructure`` record with the fields ``blocks``
        (result of ``cuda.to_device`` on the ``pad_and_join`` output),
        ``max_per_block`` (largest item count of any block) and
        ``block_size``.
    """
    from hedge.backends.cuda.tools import pad_and_join

    joined = []
    for items in data:
        joined.append("".join(items))

    longest = max(len(block) for block in joined)
    block_size = devdata.align(longest)

    class BlockedDataStructure(Record):
        pass

    # Upload first, then count items, matching the original evaluation order.
    device_blocks = cuda.to_device(pad_and_join(joined, block_size))
    most_items = max(len(items) for items in data)

    return BlockedDataStructure(
            blocks=device_blocks,
            max_per_block=most_items,
            block_size=block_size,
            )
Esempio n. 6
0
def make_blocks(devdata, data):
    """Join the items of each block and upload the blocks to the device.

    :arg devdata: object whose ``align`` method is applied to the length of
        the longest joined block to obtain the block size.
    :arg data: sequence of blocks, each a sequence of strings.
    :returns: a ``BlockedDataStructure`` record with the fields ``blocks``
        (result of ``cuda.to_device`` on the ``pad_and_join`` output),
        ``max_per_block`` (largest item count of any block) and
        ``block_size``.
    """
    from hedge.backends.cuda.tools import pad_and_join

    joined = []
    for items in data:
        joined.append("".join(items))

    longest = max(len(block) for block in joined)
    block_size = devdata.align(longest)

    class BlockedDataStructure(Record):
        pass

    # Upload first, then count items, matching the original evaluation order.
    device_blocks = cuda.to_device(pad_and_join(joined, block_size))
    most_items = max(len(items) for items in data)

    return BlockedDataStructure(
            blocks=device_blocks,
            max_per_block=most_items,
            block_size=block_size,
            )
Esempio n. 7
0
    def gpu_diffmats(self, diff_op_cls, elgroup):
        """Assemble the differentiation matrices of *elgroup* for the device.

        Every matrix from ``diff_op_cls.matrices(elgroup)`` is stacked
        vertically ``given.microblock.elements`` times.  The stacked matrices
        are then cut into groups of ``self.plan.segment_size`` rows; within
        each group the slices of all matrices are placed side by side, with
        one zero column appended when the plain column count would be even.

        :arg diff_op_cls: object providing ``matrices(elgroup)``.
        :arg elgroup: passed through to ``diff_op_cls.matrices``.
        :returns: a ``GPUDifferentiationMatrices`` record with the fields
            ``device_memory``, ``block_floats`` and ``matrix_columns``.
        """
        from pytools import Record, single_valued
        from hedge.backends.cuda.tools import pad_and_join

        discr = self.discr
        given = self.plan.given

        # avoid smem fetch bank conflicts by ensuring odd col count
        unpadded_columns = given.dofs_per_el()*discr.dimensions
        additional_columns = 1 if unpadded_columns % 2 == 0 else 0
        columns = unpadded_columns + additional_columns

        block_floats = given.devdata.align_dtype(
                columns*self.plan.segment_size, given.float_size())

        # One copy of each matrix per element of the microblock, row-wise.
        stacked = []
        for mat in diff_op_cls.matrices(elgroup):
            stacked.append(numpy.vstack(given.microblock.elements*(mat,)))

        total_rows = given.microblock.elements*given.dofs_per_el()

        segments = []
        for row_begin in range(0, total_rows, self.plan.segment_size):
            row_end = row_begin + self.plan.segment_size
            pieces = [mat[row_begin:row_end] for mat in stacked]

            # The zero block needs the row count shared by all slices.
            row_count = single_valued(piece.shape[0] for piece in pieces)
            pieces.append(numpy.zeros((row_count, additional_columns)))

            segment = numpy.asarray(
                    numpy.hstack(pieces),
                    dtype=given.float_type,
                    order="C")
            segments.append(buffer(segment))

        class GPUDifferentiationMatrices(Record):
            pass

        device_memory = cuda.to_device(
                pad_and_join(segments, block_floats*given.float_size()))

        return GPUDifferentiationMatrices(
                device_memory=device_memory,
                block_floats=block_floats,
                matrix_columns=columns)
    def prepare_matrix(self, matrix):
        """Copy *matrix* to the device, replicated and cut into row segments.

        The matrix is stacked vertically ``given.microblock.elements`` times,
        zero-filled on the right up to ``self.plan.gpu_matrix_columns()``
        columns when it is narrower than that, and sliced into groups of
        ``self.plan.segment_size`` rows.  Each slice is converted to a
        C-ordered array of ``given.float_type`` and the slices are handed to
        ``pad_and_join`` with a block size of
        ``block_floats*given.float_size()``.

        :arg matrix: array whose shape must equal
            ``(given.dofs_per_el(), self.plan.preimage_dofs_per_el)``.
        :returns: the result of ``cuda.to_device`` on the joined segments.
        """
        plan = self.plan
        given = plan.given

        assert matrix.shape == (given.dofs_per_el(), plan.preimage_dofs_per_el)

        columns = plan.gpu_matrix_columns()
        block_floats = plan.gpu_matrix_block_floats()

        # One copy of the matrix per element of the microblock, row-wise.
        stacked = numpy.vstack(
                given.microblock.elements*(matrix,))

        # Widen with zero columns if the plan asks for more columns than
        # the matrix provides.
        missing_columns = columns - stacked.shape[1]
        if missing_columns > 0:
            zero_fill = numpy.zeros((stacked.shape[0], missing_columns))
            stacked = numpy.hstack((stacked, zero_fill))

        segment_size = plan.segment_size
        total_rows = given.microblock.elements*given.dofs_per_el()

        # NOTE(review): ``buffer`` is the Python 2 builtin; a Python 3 port
        # would need a replacement that pad_and_join accepts -- confirm.
        segments = [
                buffer(numpy.asarray(
                    stacked[start:start+segment_size],
                    dtype=given.float_type,
                    order="C"))
                for start in range(0, total_rows, segment_size)
                ]

        from hedge.backends.cuda.tools import pad_and_join

        return cuda.to_device(
                pad_and_join(segments, block_floats*given.float_size()))