Esempio n. 1
0
    def baseline(m, n, k, gpu, autotuning):
        """
        Given an (m, n, k)-triplet and GPu and autotuning properties, return a set of parameters corresponding to a
        baseline ("educated guess") of the kernel's optimal parameters
        """
        from kernels.smm_acc_dnt_base import round_up_to_nearest_multiple

        grp = 16
        minblk = 2
        tm = 2
        tn = 2
        cmax = (n + tn - 1) // tn
        rmax = (m + tm - 1) // tm
        min_threads = cmax * rmax

        while True:

            base = {
                "threads": round_up_to_nearest_multiple(min_threads, 32),
                "grouping": grp,
                "minblocks": minblk,
                "tile_m": tn,
                "tile_n": tn,
                "w": float("nan"),
                "v": float("nan"),
            }

            if (len(
                    Kernel_dnt_medium.promising_parameters(
                        m, n, k, gpu, autotuning, **base)) > 0):
                break
            else:
                grp -= 1
                if grp == 0:
                    base = Kernel_dnt_medium.promising_parameters(
                        m, n, k, gpu, autotuning)[0]
                    base.update(
                        dict([("w", float("nan")), ("v", float("nan"))]))
                    break

        base.update(
            dict([
                ("m", m),
                ("n", n),
                ("k", k),
                ("algorithm", "medium"),
                ("perf", 0),
                ("source", "predicted"),
            ]))
        return base
Esempio n. 2
0
    def promising_parameters(
        m,
        n,
        k,
        gpu,
        autotuning,
        threads=None,
        grouping=None,
        minblocks=None,
        tile_m=None,
        tile_n=None,
        w=None,
        v=None,
    ):
        """
        Given a certain (m,n,k)-triplet, GPU properties and autotuning properties, return a list of all possible
        kernel parameters
        """
        from kernels.smm_acc_dnt_base import round_up_to_nearest_multiple

        params = []
        for minblocks_ in range(1, 28) if minblocks is None else [minblocks]:
            # for exhaustive search: range(1, gpu["Thread_Blocks_/_Multiprocessor"] + 1):
            # heuristic: the optimal minblocks is never > 28
            for grouping_ in range(1, 32 +
                                   1, 1) if grouping is None else [grouping]:

                if m >= 28 and grouping_ not in (3, 4, 5, 24, 26, 29, 32):
                    continue  # heuristic: investigate a smaller search space of grouping for large matrices

                for tm in (range(
                        1,
                        min(12, m) + 1) if tile_m is None else [
                            tile_m
                        ]):  # heuristic: the optimal tile_m is never above 12
                    for tn in (
                            range(1,
                                  min(12, n) + 1) if tile_n is None else
                        [tile_n
                         ]):  # heuristic: the optimal tile_m is never above 12

                        if tm * tn > 16:
                            continue  # heuristic: performance decreases for very large tiles

                        # Number of tiled columns, rows
                        cmax = (n + tn - 1) // tn
                        rmax = (m + tm - 1) // tm

                        # Max work ("operations") which can be run concurrently
                        max_concurrent_work = max(grouping_, m * k, k * n,
                                                  m * n, cmax * rmax)

                        # Minimum number of threads required to have one thread per tile
                        # i.e., cover the result matrix
                        min_threads = cmax * rmax

                        # Shared memory buffer size
                        buf_sz = max(m * n, m * k + k * tn * cmax,
                                     tm * rmax * k + 1)
                        smem_tot = (buf_sz * autotuning["sizeof_double"] +
                                    autotuning["npars"] * grouping_ *
                                    autotuning["sizeof_int"])
                        if smem_tot > gpu["Max_Shared_Memory_/_Block_(bytes)"]:
                            continue
                        if (smem_tot * minblocks_ >
                                gpu["Shared_Memory_/_Multiprocessor_(bytes)"]):
                            continue

                        # Use all concurrency available: fill warps
                        for threads_ in (range(
                                gpu["Threads_/_Warp"],
                                gpu["Max_Thread_Block_Size"] + 1,
                                gpu["Threads_/_Warp"],
                        ) if threads is None else [threads]):

                            if threads_ > round_up_to_nearest_multiple(
                                    max_concurrent_work,
                                    gpu["Threads_/_Warp"]):
                                continue  # soft: too much concurrency harms performance
                            if threads_ * minblocks_ > gpu[
                                    "Threads_/_Multiprocessor"]:
                                continue
                            if threads_ < min_threads:
                                continue

                            params.append({
                                "m": m,
                                "n": n,
                                "k": k,
                                "tile_m": tm,
                                "tile_n": tn,
                                "threads": threads_,
                                "grouping": grouping_,
                                "minblocks": minblocks_,
                            })
        return params
Esempio n. 3
0
    def promising_parameters(
        m,
        n,
        k,
        gpu,
        autotuning,
        threads=None,
        grouping=None,
        minblocks=None,
        tile_m=None,
        tile_n=None,
        w=None,
        v=None,
    ):
        """
        Given a certain (m,n,k)-triplet, GPU properties and autotuning properties, return a list of all possible
        kernel parameters
        """
        from kernels.smm_acc_dnt_base import round_up_to_nearest_multiple

        # Shared memory buffer size
        buf_sz = k * (
            m + n
        )  # number of elements in the a_block buffer = mk, and in the b_block buffer = kn

        # Minimum number of threads required to cover the result matrix c
        min_threads = m * n

        # Parameter space:
        params = []
        for minblocks_ in (range(1, gpu["Thread_Blocks_/_Multiprocessor"] +
                                 1) if minblocks is None else [minblocks]):
            # heuristic: never seen optimal=1 hence start from 2
            for grouping_ in range(2, 32 +
                                   1, 1) if grouping is None else [grouping]:

                # Max work ("operations")  which can be run concurrently
                max_concurrent_work = max(grouping_, m * k, k * n, m * n)

                # Shared memory utilisation (bytes)
                smem_tot = (
                    buf_sz * autotuning["sizeof_double"] +
                    autotuning["npars"] * grouping_ * autotuning["sizeof_int"])
                if smem_tot > gpu["Max_Shared_Memory_/_Block_(bytes)"]:
                    continue
                if smem_tot * minblocks_ > gpu[
                        "Max_Shared_Memory_/_Block_(bytes)"]:
                    continue

                # Use all concurrency available: fill warps
                for threads_ in (range(
                        gpu["Threads_/_Warp"],
                        gpu["Max_Thread_Block_Size"] + 1,
                        gpu["Threads_/_Warp"],
                ) if threads is None else [threads]):

                    if threads_ > round_up_to_nearest_multiple(
                            max_concurrent_work, gpu["Threads_/_Warp"]):
                        continue  # soft: too much concurrency harms performance
                    if threads_ * minblocks_ > gpu["Threads_/_Multiprocessor"]:
                        continue
                    if threads_ < min_threads:
                        continue

                    params.append({
                        "m": m,
                        "n": n,
                        "k": k,
                        "threads": threads_,
                        "grouping": grouping_,
                        "minblocks": minblocks_,
                    })
        return params
Esempio n. 4
0
    def promising_parameters(
        m,
        n,
        k,
        gpu,
        autotuning,
        threads=None,
        grouping=None,
        minblocks=None,
        tile_m=None,
        tile_n=None,
        w=None,
        v=None,
    ):
        """
        Given a certain (m,n,k)-triplet, GPU properties and autotuning properties, return a list of all possible
        kernel parameters
        """
        from kernels.smm_acc_dnt_base import round_up_to_nearest_multiple

        params = []
        grouping = 16

        for minblocks_ in (1, 2, 4, 8,
                           12) if minblocks is None else [minblocks]:
            # for exhaustive search, it should be: range(1, gpu["Thread_Blocks_/_Multiprocessor"] + 1):
            # but heuristically reduce the search space
            for threads_ in (range(
                    gpu["Threads_/_Warp"],
                    gpu["Max_Thread_Block_Size"] + 1,
                    gpu["Threads_/_Warp"],
            ) if threads is None else [threads]):

                if threads_ * minblocks_ > gpu["Threads_/_Multiprocessor"]:
                    continue

                for tm in range(1, min(12, m +
                                       1)) if tile_m is None else [tile_m]:
                    for tn in range(1, min(12, n +
                                           1)) if tile_n is None else [tile_n]:

                        if tm * tn > 49:
                            continue  # heuristic: performance decreases for very large tiles

                        # Number of tiled columns, rows
                        cmax = (n + tn - 1) // tn
                        rmax = (m + tm - 1) // tm

                        # Minimum number of threads required to have one thread per tile,
                        # i.e., cover the result matrix
                        min_threads = cmax * rmax
                        if threads_ < min_threads:
                            continue
                        if min_threads < (threads_ - 32):
                            continue  # heuristic: too many threads unused during calculation

                        for w_ in range(4,
                                        (k + 1) // 2, 2) if w is None else [w]:
                            # heuristic: even numbers yield better performance
                            if w_ < tn:
                                continue  # invalid: input slap too small
                            if 2 * w_ > k:
                                continue  # heuristic: do at least one double-buffering step

                            for v_ in range(2, n + 1, 2) if v is None else [v]:
                                # heuristic: even numbers yield better performance

                                if v_ < tm:
                                    continue  # invalid: output slab too small

                                # Number of registers
                                n_regs = (tm * tn +
                                          (w_ * m + threads_ - 1) // threads_ +
                                          (w_ * n + threads_ - 1) // threads_)
                                if n_regs * threads_ * minblocks_ > 15000:
                                    continue  # heuristic: too many registers used

                                # Max work ("operations") which can be run concurrently
                                max_concurrent_work = max(
                                    grouping, m * w_, w_ * n, m * v_,
                                    cmax * rmax)
                                if threads_ > round_up_to_nearest_multiple(
                                        max_concurrent_work,
                                        gpu["Threads_/_Warp"]):
                                    continue  # heuristics: too much concurrency harms performance

                                # Shared memory buffer size
                                buf_sz = max(
                                    (w_ - 1) * m + rmax * tm,
                                    m * w_ + (w_ - 1) * n + cmax * tn,
                                    v_ * m,
                                )
                                smem_tot = (
                                    buf_sz * autotuning["sizeof_double"] +
                                    autotuning["npars"] * grouping *
                                    autotuning["sizeof_int"])
                                if smem_tot > gpu[
                                        "Max_Shared_Memory_/_Block_(bytes)"]:
                                    continue  # invalid: uses too much shared memory
                                if (smem_tot * minblocks_ > gpu[
                                        "Shared_Memory_/_Multiprocessor_(bytes)"]
                                    ):
                                    continue  # invalid: uses too much shared memory

                                params.append({
                                    "m": m,
                                    "n": n,
                                    "k": k,
                                    "tile_m": tm,
                                    "tile_n": tn,
                                    "w": w_,
                                    "v": v_,
                                    "threads": threads_,
                                    "grouping": grouping,
                                    "minblocks": minblocks_,
                                })
        return params