Example 1
def _optimize_loop_axis(dim):
    """
    Chooses kernel parameters including CUDA block size, grid size, and
    number of elements to compute per thread for the loop axis. The loop
    axis is the axis of the tensor for which a thread can compute multiple
    outputs. Uses a simple heuristic which tries to get at least 4 warps
    per block and 8 items per thread to hide latencies. For very large axes
    it prefers a higher items-per-thread count over launching many blocks,
    since blocks are serialized by the GPU once all SMs are filled.

    Arguments:
        dim (int): Size of the tensor on the loop axis.

    Returns: tuple of grid dimension, block dimension, and items per thread
    """
    sm_count = _get_sm_count()

    # -(-a // b) == ceil(a / b): launch one block per 32 elements, but no
    # more blocks than there are SMs
    griddim = min(sm_count, -((-dim) // 32))
    items_per_block = -((-dim) // griddim)

    items_per_thread = 1
    warps = -((-items_per_block) // (32 * items_per_thread))

    # Grow items_per_thread to shrink the block: continue while the block has
    # more than 4 warps (up to 8 items per thread) or exceeds the 32-warp
    # (1024-thread) block limit.
    while (warps > 4 and items_per_thread < 8) or (warps > 32):
        items_per_thread = items_per_thread + 1
        warps = -((-items_per_block) // (32 * items_per_thread))

    blockdim = warps * 32

    return (griddim, blockdim, items_per_thread)
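
A small worked example of the heuristic above (a sketch only: `_optimize_loop_axis_demo` is a hypothetical copy of the function with the SM count passed in explicitly so it can be run without a GPU, and the 80-SM device is an assumption):

def _optimize_loop_axis_demo(dim, sm_count=80):
    # Same arithmetic as _optimize_loop_axis, with the SM count injected.
    griddim = min(sm_count, -((-dim) // 32))
    items_per_block = -((-dim) // griddim)
    items_per_thread = 1
    warps = -((-items_per_block) // (32 * items_per_thread))
    while (warps > 4 and items_per_thread < 8) or (warps > 32):
        items_per_thread += 1
        warps = -((-items_per_block) // (32 * items_per_thread))
    return (griddim, warps * 32, items_per_thread)

print(_optimize_loop_axis_demo(65536))  # (80, 128, 7): 4 warps/block, 7 items/thread
print(_optimize_loop_axis_demo(4096))   # (80, 64, 1): small axis, 2 warps/block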
Example 2
    def __init__(self,
                 device_id=None,
                 enable_winograd=True,
                 deterministic=True,
                 scratch_size=0):
        drv.init()
        self.device_id = device_id if device_id is not None else 0

        # check compute capability
        self.compute_capability = drv.Device(
            self.device_id).compute_capability()
        if self.compute_capability[0] < 3:
            raise RuntimeError("Unsupported GPU")

        # context
        self.ctx = drv.Device(self.device_id).make_context()

        # attributes
        self.stream = None
        self.warmup = False
        self.scratch_size = scratch_size
        self.scratch_offset = 0
        self.sm_count = _get_sm_count()

        # store total GPU memory size in bytes (mem_get_info returns (free, total))
        self.gpu_memory_size = drv.mem_get_info()[1]

        # Fall back to CUDA C kernels on older (pre-Maxwell) GPU generations
        if self.compute_capability[0] < 5:
            # TODO: this is not fully supported in graph yet
            self.use_cudac_kernels = True
        else:
            self.use_cudac_kernels = False

        # TODO
        # self.cublas_handle = cublas.cublasCreate()
        self.pcg = rng_mrg()

        self.enable_winograd = enable_winograd
        self.deterministic = deterministic
        self.cache_dir = get_cache_dir()
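
The constructor in Example 2 is mostly a thin wrapper over the PyCUDA driver API. A minimal standalone sketch of the same device checks, assuming PyCUDA is installed (device 0 mirrors the default above):

import pycuda.driver as drv

drv.init()
dev = drv.Device(0)                      # device 0, as in the default above
major, minor = dev.compute_capability()  # e.g. (7, 5); anything below 3.x is rejected
ctx = dev.make_context()                 # becomes the current CUDA context
try:
    free_bytes, total_bytes = drv.mem_get_info()  # (free, total) in bytes
    use_cudac_kernels = major < 5                 # pre-Maxwell: fall back to CUDA C kernels
finally:
    ctx.pop()                                     # release the context when done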
Example 3
    def sm_count(self):
        return _get_sm_count()
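
`_get_sm_count()` itself does not appear in these examples. A plausible sketch, assuming PyCUDA and that the relevant device is the current one, is to query the multiprocessor-count attribute:

import pycuda.driver as drv

def _get_sm_count():
    # Hypothetical helper (not shown in the examples above): number of
    # streaming multiprocessors on the current CUDA device.
    dev = drv.Context.get_device()
    return dev.get_attribute(drv.device_attribute.MULTIPROCESSOR_COUNT)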
Example 4
    def _build_maxas_kernel(self, op, size=None):
        """
        Uses tensor dimensions and axis ordering to select a SASS kernel and use
        maxas to compile it for later use.

        Arguments:
            op (DotOp): Graph op being transformed into this kernel
            size (str): Optional preselected tile size
        """
        # Get inputs to gemm
        C = TensorDescriptionWrapper(op.tensor_description(), 2)
        A, B = (TensorDescriptionWrapper(_, 2) for _ in op.call_info())

        # If both inputs are 1d, need to transpose one of them
        if min(A.strides) == 0 and min(B.strides) == 0:
            A.strides = tuple(reversed(A.strides))
            A.shape = tuple(reversed(A.shape))
            vector_dot = True
        else:
            vector_dot = False

        self.C = C
        self.A = A
        self.B = B

        # Kernels only support 2d tensors
        assert len(A.shape) == 2
        assert len(B.shape) == 2
        assert len(C.shape) == 2

        # one dimension must be contiguous
        assert min(A.strides) == 1 or max(A.strides) == 1
        assert min(B.strides) == 1 or max(B.strides) == 1
        assert min(C.strides) == 1 or max(C.strides) == 1 or vector_dot

        lda = max(A.strides)
        ldb = max(B.strides)
        ldc = max(C.strides)

        if A.is_trans:
            opA = 't'
            if size not in ("32x64", "16x64"):
                lda *= 8 * A.dtype.itemsize  # saves a kernel register
        else:
            opA = 'n'

        if B.is_trans:
            opB = 't'
        else:
            opB = 'n'
            if size not in ("32x64", "16x64"):
                ldb *= 8 * B.dtype.itemsize  # saves a kernel register

        # note: 'op' now refers to the transpose string, shadowing the graph op argument
        op = opA + opB
        assert op != "tt"

        m = A.shape[0]
        n = B.shape[1]
        k = A.shape[1]

        assert m == C.shape[0]
        assert n == C.shape[1]
        assert k == B.shape[0]

        # Flex only has the 128x128 tile size
        if C.is_flex():
            size = "128x128"

        # Some basic tile size selection.
        # Your best bet is to benchmark your code with all 3 sizes
        # and manually fine tune the selection for each layer.
        # TODO: Perhaps I'll add an autotuning mode.
        if size is None:
            # find the shorter side
            short = min(m, n)
            # anything bigger than this just use 128
            if short < 384 - 16:
                # compute remainder of 128
                short128 = short % 128
                # if the remainder is zero or at least 112, just use 128
                if 0 < short128 < 112:
                    # to figure out when to use 64 over 32 we need to calc
                    # occupancy at 64
                    if 48 < short128 <= 64:
                        occupancy64 = short // 64
                        wide = max(m, n)
                        occupancy64 *= (wide // 128 +
                                        (wide % 128 != 0)) // _get_sm_count()
                        # 64 is only faster than 32 when occupancy is more than
                        # 1 warp per scheduler.
                        if occupancy64 > 1:
                            size = 64
                        else:
                            size = 32
                    else:
                        size = 32
                else:
                    size = 128
            # There's a large regime where 64 is faster, but it's hard to
            # characterize
            else:
                size = 128

            # match the kernel to the optimal short size but avoid
            # unimplemented kernels
            if m >= n:
                if op == "nt":
                    size = 128
                sizeA, sizeB = (128, size)
            else:
                if op == "tn":
                    size = 128
                # temp till I can write these kernels (coming soon)
                elif size == 64:
                    size = 32
                sizeA, sizeB = (size, 128)

            size = "%dx%d" % (sizeA, sizeB)

        else:
            sizeA, sizeB = (int(s) for s in size.split('x'))

        gridA = m // sizeA + (m % sizeA != 0)
        gridB = n // sizeB + (n % sizeB != 0)

        k_vec = 8 if sizeA in (16, 32) or sizeB == 32 else 16

        vec_opt = None
        if op == "tn":
            if (m % 4 == 0 and n % 4 == 0 and A.strides[1] % 4 == 0
                    and B.strides[0] % 4 == 0):
                vec_opt = ("vec", )
        elif op == "nn":
            if (k % k_vec == 0 and n % 4 == 0 and A.strides[0] % k_vec == 0
                    and B.strides[0] % 4 == 0):
                vec_opt = ("vec", )
        elif op == "nt":
            if (k % k_vec == 0 and n % 4 == 0 and A.strides[0] % k_vec == 0
                    and B.strides[1] % k_vec == 0):
                vec_opt = ("vec", )

        # nt and nn are more efficient with k%16==0
        if C.is_flex():
            clss = "fgemm"
        elif C.dtype.type is np.float16:
            clss = "hgemm"
        elif C.dtype.type is np.float32:
            clss = "sgemm"
        else:
            raise TypeError("Only floating point dot currently supported.")

        # TODO: Flex may not have all "size" options (Urs)
        self.kernel = kernel_specs.get_kernel("_".join((clss, op, size)),
                                              vec_opt)
        # alpha, beta
        self.alpha = 1.0
        self.beta = 0.0
        # create params
        # if params list changes, indices in bind_flex_scales may need updating
        self.params = [(1, int(gridA), int(gridB)),
                       (self.kernel.threads, 1, 1), None, C.td, A.td, B.td,
                       self.alpha, self.beta, 0,
                       int(lda),
                       int(ldb),
                       int(ldc),
                       int(m),
                       int(n),
                       int(k), 0, 0, 0, 0]
        if clss == "fgemm":
            # save flex entries for bind_flex_scales
            self.flex_entry_A = A.flex_entry()
            self.flex_entry_B = B.flex_entry()
            self.flex_entry_C = C.flex_entry()

            # flex params
            self.params += [FlexPtrDescription(self.flex_entry_C),
                            1.0]  # maxabs ptr, output scale
            # record output flex id for autoflex
            self.output_flex_ids = [self.flex_entry_C.flex_id]
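
To make the tile and grid selection in Example 4 concrete: for an illustrative "nn" GEMM with m=1024 and n=300 (assumed dimensions, not taken from the examples), size left unset and a non-flex output, the heuristic picks the 128x32 tile, and the launch grid is a plain ceiling division over the tile sizes:

m, n = 1024, 300          # assumed output shape, illustrative only
sizeA, sizeB = 128, 32    # short side 300 -> 300 % 128 = 44 -> size 32; m >= n -> (128, 32)
gridA = m // sizeA + (m % sizeA != 0)   # ceil(1024 / 128) = 8
gridB = n // sizeB + (n % sizeB != 0)   # ceil(300 / 32)  = 10
print(gridA, gridB)                     # 8 10

The resulting (1, gridA, gridB) triple is what the kernel stores as the first entry of self.params above.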