Example #1
def _get_oned_copy_kernel(dtype, shape):
    copy = r"""
__global__ void copy_oned(%(type)s* out, const %(type)s* in, int dim, long long src_str,
                          long long dst_str)
{
    int tid_x = threadIdx.x;
    int idx = blockIdx.x;

    idx = (idx << 5) + tid_x;

    const %(type)s* in0 = in + (src_str * idx);
    %(type)s* out0 = out + (dst_str * idx);

    if(idx < dim) *out0 = *in0;
}
"""
    code = copy % dict(
        type=_get_register_type(dtype, memory=True)
    )

    # print code
    module = SourceModule(code)
    kernel = module.get_function("copy_oned")
    kernel.prepare("PPIqq")

    kernel.grid = (_ceil_div(shape[0], 32), 1, 1)
    kernel.block = (32, 1, 1)
    kernel.args = (shape[0], )

    return kernel
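
A minimal launch sketch for the prepared kernel above (the device pointers out_dev/in_dev are assumptions, as are the unit strides; per the prepare string "PPIqq", the call takes the out pointer, the in pointer, the int dim already stored in kernel.args, and the two long long strides):

import numpy as np

kernel = _get_oned_copy_kernel(np.dtype(np.float32), (1000,))
kernel.prepared_call(kernel.grid, kernel.block,
                     out_dev, in_dev,  # hypothetical GPU allocations
                     *kernel.args,     # (dim,) = (1000,)
                     1, 1)             # contiguous src/dst strides, in elements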
Example #2
    def __init__(self, transformer, op):
        super(PoolFpropKernel, self).__init__(transformer)

        (self.I, ) = (_ for _ in op.call_info())
        self.O = op.tensor_description()
        self.dtype = self.O.dtype
        self.index = op.index

        if self.dtype.type is np.float16:
            clss = "hpool"
        elif self.dtype.type is np.float32:
            clss = "spool"
        else:
            raise TypeError("Type not supported {}".format(self.dtype.type))

        C, D, H, W, _ = self.I.axes.lengths
        K, M, P, Q, N = self.O.axes.lengths

        J, T, R, S, pool_op = itemgetter(*('J', 'T', 'R', 'S',
                                           'op'))(op.pool_params)
        pad_c, pad_d, pad_h, pad_w = \
            itemgetter(*('pad_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params)
        str_c, str_d, str_h, str_w = \
            itemgetter(*('str_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params)

        # default to non-overlapping
        if str_c is None:
            str_c = J
        if str_d is None:
            str_d = T
        if str_h is None:
            str_h = R
        if str_w is None:
            str_w = S

        self.overlap = 1.0

        # TODO: detect other forms of gaps
        if str_c > J or str_d > T or str_h > R or str_w > S:
            self.gaps = 1
        else:
            self.gaps = 0

        self.op = pool_op
        self.C = C
        self.K = K
        self.M = M
        self.P = P
        self.Q = Q
        self.JTRS = (J, T, R, S)
        self.DHW = (D, H, W)
        self.MPQ = (M, P, Q)
        self.padding = (pad_c, pad_d, pad_h, pad_w)
        self.strides = (str_c, str_d, str_h, str_w)

        self.dimI = (C, D, H, W, N)
        self.dimO = (K, M, P, Q, N)
        self.dimF2 = None
        self.dimI2 = (C * D * H * W, N)
        self.dimO2 = (K * M * P * Q, N)
        self.sizeI = np.prod(self.dimI)
        self.sizeO = np.prod(self.dimO)
        self.nOut = np.prod(self.MPQ) * K

        # precompute some multiplications for fast constant memory access
        WN = W * N
        HWN = H * WN
        DHWN = D * HWN
        RS = R * S
        RST = T * RS
        JRST = J * RST
        QN = Q * N
        PQN = P * QN
        MPQN = M * PQN

        assert JRST + 32 < 2**16, "Integer division is faster with 16bit numerators"

        sb_large = {
            #SB  shlP maskP shrP shlQ maskQ shrQ maskN shrN
            1: (0, 0x00, 0, 0, 0x00, 0, 0xfff, 32),  # 1x1  nnnnn
            2: (0, 0x00, 0, 1, 0x10, 4, 0x00f, 4),  # 1x2  xnnnn
            4: (0, 0x00, 0, 2, 0x18, 3, 0x007, 3),  # 1x4  xxnnn
            8: (0, 0x00, 0, 3, 0x1c, 2, 0x003, 2),  # 1x8  xxxnn
            16: (0, 0x00, 0, 4, 0x1e, 1, 0x001, 1),  # 1x16 xxxxn
            32: (0, 0x00, 0, 5, 0x1f, 0, 0x000, 0),  # 1x32 xxxxx
        }
        sb_medium = {
            #SB  shlP maskP shrP shlQ maskQ shrQ maskN shrN
            8: (1, 0x10, 4, 2, 0x0c, 2, 0x003, 2),  # 2x4  yxxnn
            16: (1, 0x10, 4, 3, 0x0e, 1, 0x001, 1),  # 2x8  yxxxn
            32: (1, 0x10, 4, 4, 0x0f, 0, 0x000, 0),  # 2x16 yxxxx
        }
        sb_small = {
            #SB  shlP maskP shrP shlQ maskQ shrQ maskN shrN
            16: (2, 0x18, 3, 2, 0x06, 1, 0x001, 1),  # 4x4  yyxxn
            32: (2, 0x18, 3, 3, 0x07, 0, 0x000, 0),  # 4x8  yyxxx
        }

        if N == 1:
            super_block = 0
        elif N < 32:
            super_block = len(bin(N - 1)) - 2
        else:
            super_block = 5
        super_block = 1 << (5 - super_block)

        # try to minimize the zero overlap in the superblock
        # but maximize the x dim of the superblock for more contiguous memory access
        if super_block < 8 or Q > 64:
            sb_params = sb_large.get(super_block)
        elif super_block < 16 or Q > 32:
            sb_params = sb_medium.get(super_block)
        else:
            sb_params = sb_small.get(super_block)

        supP = _ceil_div(P, 1 << sb_params[0])
        supQ = _ceil_div(Q, 1 << sb_params[3])

        # precompute the magic numbers and shift amounts for integer division
        magic_RST = _magic32(JRST + 32, RST)
        magic_RS = _magic32(RST + 32, RS)
        magic_S = _magic32(RS + 32, S)
        magic_P = _magic32(M * supP, supP)

        fprop_name = "fprop_" + pool_op

        threads = 32 if super_block > 1 else N

        self.fprop_kernel = [
            fprop_name, (supQ, supP * M, K), (threads, 1, 1),
            _flatten([
                N, W, H, D, C, WN, HWN, DHWN, P, Q, magic_P, QN, PQN, MPQN,
                pad_c, pad_d, pad_h, pad_w, str_c, str_d, str_h, str_w, S, RS,
                RST, JRST, magic_S, magic_RS, magic_RST, supP, supQ, sb_params
            ])
        ]

        lut_size = JRST
        if lut_size % 4 != 0:
            lut_size += 4 - lut_size % 4

        self.fprop_lut_size = super_block * lut_size * 4

        self.kernel = pooling.map_string2func(
            self.fprop_kernel[0], self.dtype.str[1:],
            self.transformer.runtime.compute_capability)
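
Both pooling examples lean on _magic32 to replace the kernels' integer divisions with a multiply and a shift. Below is a hedged sketch of the classic Granlund-Montgomery construction that _magic32(nmax, d) appears to implement (its actual body is not shown in these examples); the assert JRST + 32 < 2**16 above bounds the numerator so a suitable 32-bit pair always exists:

def magic32_sketch(nmax, d):
    # find (m, p) with n // d == (n * m) >> p for all 0 <= n <= nmax
    nc = ((nmax + 1) // d) * d - 1  # worst-case numerator, n % d == d - 1
    for p in range(2 * nmax.bit_length() + 1):
        if 2 ** p > nc * (d - 1 - (2 ** p - 1) % d):
            return (2 ** p + d - 1 - (2 ** p - 1) % d) // d, p

m, p = magic32_sketch(2 ** 16, 9)
assert all(n // 9 == (n * m) >> p for n in range(2 ** 16 + 1))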
Example #3
    def __init__(self, transformer, op):
        super(PoolBpropKernel, self).__init__(transformer)

        (self.I, ) = (_ for _ in op.call_info())
        self.O = op.tensor_description()
        self.dtype = self.O.dtype
        self.op = op

        if self.dtype.type not in self.supported_types:
            raise TypeError("Type not supported: {}".format(self.dtype.type))

        C, D, H, W, _ = self.O.axes.lengths
        K, M, P, Q, N = self.I.axes.lengths

        J, T, R, S, pool_op = itemgetter(*('J', 'T', 'R', 'S', 'op'))(op.pool_params)
        pad_c, pad_d, pad_h, pad_w = \
            itemgetter(*('pad_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params)
        str_c, str_d, str_h, str_w = \
            itemgetter(*('str_' + s for s in ('c', 'd', 'h', 'w')))(op.pool_params)

        # default to non-overlapping
        if str_c is None:
            str_c = J
        if str_d is None:
            str_d = T
        if str_h is None:
            str_h = R
        if str_w is None:
            str_w = S

        self.overlap = 1.0

        # TODO: detect other forms of gaps
        if str_c > J or str_d > T or str_h > R or str_w > S:
            self.gaps = 1
        else:
            self.gaps = 0

        self.pool_op   = pool_op
        self.C    = C
        self.K    = K
        self.M    = M
        self.P    = P
        self.Q    = Q
        self.JTRS = (J, T, R, S)
        self.DHW  = (D, H, W)
        self.MPQ  = (M, P, Q)
        self.padding = (pad_c, pad_d, pad_h, pad_w)
        self.strides = (str_c, str_d, str_h, str_w)

        self.dimI   = (C, D, H, W, N)
        self.dimO   = (K, M, P, Q, N)
        self.dimF2  = None
        self.dimI2  = (C * D * H * W, N)
        self.dimO2  = (K * M * P * Q, N)
        self.sizeI  = np.prod(self.dimI)
        self.sizeO  = np.prod(self.dimO)
        self.nOut   = np.prod(self.MPQ) * K

        # precompute some multiplications for fast constant memory access
        WN   = W * N
        HWN  = H * WN
        DHWN = D * HWN
        DH   = D * H
        RS   = R * S
        RST  = T * RS
        JRST = J * RST
        QN   = Q * N
        PQN  = P * QN
        MPQN = M * PQN

        assert JRST + 32 < 2**16, "Integer division is faster with 16bit numerators"

        sb_large = {
            #SB  shlP maskP shrP shlQ maskQ shrQ maskN shrN
            1  : (0,   0x00, 0,   0,   0x00, 0,   0xfff, 32), # 1x1  nnnnn
            2  : (0,   0x00, 0,   1,   0x10, 4,   0x00f,  4), # 1x2  xnnnn
            4  : (0,   0x00, 0,   2,   0x18, 3,   0x007,  3), # 1x4  xxnnn
            8  : (0,   0x00, 0,   3,   0x1c, 2,   0x003,  2), # 1x8  xxxnn
            16 : (0,   0x00, 0,   4,   0x1e, 1,   0x001,  1), # 1x16 xxxxn
            32 : (0,   0x00, 0,   5,   0x1f, 0,   0x000,  0), # 1x32 xxxxx
        }
        sb_medium = {
            #SB  shlP maskP shrP shlQ maskQ shrQ maskN shrN
            8  : (1,   0x10, 4,   2,   0x0c, 2,   0x003,  2), # 2x4  yxxnn
            16 : (1,   0x10, 4,   3,   0x0e, 1,   0x001,  1), # 2x8  yxxxn
            32 : (1,   0x10, 4,   4,   0x0f, 0,   0x000,  0), # 2x16 yxxxx
        }
        sb_small = {
            #SB  shlP maskP shrP shlQ maskQ shrQ maskN shrN
            16 : (2,   0x18, 3,   2,   0x06, 1,   0x001,  1), # 4x4  yyxxn
            32 : (2,   0x18, 3,   3,   0x07, 0,   0x000,  0), # 4x8  yyxxx
        }

        if N == 1:
            super_block = 0
        elif N < 32:
            super_block = len(bin(N - 1)) - 2
        else:
            super_block = 5
        super_block = 1 << (5 - super_block)

        # try to minimize the zero overlap in the superblock
        # but maximize the x dim of the superblock for more contiguous memory access
        if super_block < 8 or Q > 64:
            sb_params = sb_large.get(super_block)
        elif super_block < 16 or Q > 32:
            sb_params = sb_medium.get(super_block)
        else:
            sb_params = sb_small.get(super_block)

        supP = _ceil_div(P, 1 << sb_params[0])
        supQ = _ceil_div(Q, 1 << sb_params[3])

        # precompute the magic numbers and shift amounts for integer division
        magic_RST = _magic32(JRST + 32, RST)
        magic_RS  = _magic32(RST + 32, RS)
        magic_S   = _magic32(RS + 32, S)
        magic_P   = _magic32(M * supP, supP)

        bprop_name = "bprop_" + pool_op

        threads = 32 if super_block > 1 else N

        lut_size = JRST
        if lut_size % 4 != 0:
            lut_size += 4 - lut_size % 4

        self.bprop_lut_size = self.fprop_lut_size = super_block * lut_size * 4

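        # NOTE: self.overlap is fixed to 1.0 above, so this branch is always
        # taken; the else arm below keeps a non-overlapping fallback path.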
        if self.overlap > 0:

            # we have a special kernel to handle the overlapping avg pooling
            bprop_name += "_overlap"

            magic_str_w = _magic32(W + S, str_w)
            magic_str_h = _magic32(H + R, str_h)
            magic_str_d = _magic32(D + T, str_d)
            magic_str_c = _magic32(C + J, str_c)

            if super_block > 1:

                bprop_name += "_smallN"

                if super_block < 8 or W > 64:
                    sb_params = sb_large.get(super_block)
                elif super_block < 16 or W > 32:
                    sb_params = sb_medium.get(super_block)
                else:
                    sb_params = sb_small.get(super_block)

                supH = _ceil_div(H, 1 << sb_params[0])
                supW = _ceil_div(W, 1 << sb_params[3])

                magic_H = _magic32(D * supH, supH)

                maxLutSize = \
                    _ceil_div(S, str_w) * \
                    _ceil_div(R, str_h) * \
                    _ceil_div(T, str_d) * \
                    _ceil_div(J, str_c)

                #neon_logger.display((supW, D*supH, C), sb_params, maxLutSize)

                self.bprop_kernel = [bprop_name, (supW, D * supH, C), (threads, 1, 1), _flatten([
                    N, W, H, D, C, WN, HWN, DHWN, magic_H,
                    pad_w, pad_h, pad_d, pad_c,
                    str_w, str_h, str_d, str_c,
                    magic_str_w, magic_str_h, magic_str_d, magic_str_c,
                    S, R, T, J, RS, RST, JRST, magic_S, magic_RS, magic_RST,
                    Q, P, M, K, QN, PQN, MPQN,
                    supH, supW, sb_params, maxLutSize])]

                lut_size = maxLutSize
                if lut_size % 4 != 0:
                    lut_size += 4 - lut_size % 4

                self.bprop_lut_size = super_block * lut_size * 4 * 2

            else:

                # The overlap kernel can be much more efficient if we aren't doing superblocking
                magic_H = _magic32(DH, H)

                self.bprop_kernel = [bprop_name, (W, DH, C), (threads, 1, 1), _flatten([
                    N, W, H, D, C, WN, HWN, DHWN, magic_H,
                    pad_w, pad_h, pad_d, pad_c,
                    str_w, str_h, str_d, str_c,
                    magic_str_w, magic_str_h, magic_str_d, magic_str_c,
                    S, R, T, J, RS, RST, JRST, magic_S, magic_RS, magic_RST,
                    Q, P, M, K, QN, PQN, MPQN])]

                self.bprop_lut_size = lut_size * 4 * 2
        else:
            self.bprop_kernel = [bprop_name, (supQ, supP * M, K), (threads, 1, 1), _flatten([
                N, W, H, D, C, WN, HWN, DHWN,
                P, Q, magic_P, QN, PQN, MPQN,
                pad_c, pad_d, pad_h, pad_w,
                str_c, str_d, str_h, str_w,
                S, RS, RST, JRST, magic_S, magic_RS, magic_RST,
                supP, supQ, sb_params])]

        self.kernel = pooling.map_string2func(self.bprop_kernel[0],
                                              self.dtype.str[1:],
                                              self.transformer.runtime.compute_capability)
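
For illustration, a standalone restatement of the superblock sizing shared by both pooling kernels: ceil(log2(N)) lanes of the 32-thread warp cover the batch dimension, and the remaining factor of 32 is spread over output pixels.

def pixels_per_block(N):
    if N == 1:
        log2N = 0
    elif N < 32:
        log2N = len(bin(N - 1)) - 2  # ceil(log2(N)) for 2 <= N <= 31
    else:
        log2N = 5
    return 1 << (5 - log2N)

for N in (1, 2, 7, 16, 32):
    print(N, pixels_per_block(N))  # -> 32, 16, 4, 2, 1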
Example #4
def _get_copy_transpose_kernel(dtype, shape, axes=None):
    if len(shape) == 1:
        return _get_oned_copy_kernel(dtype, shape)

    src = list(range(len(shape)))
    dst = list(axes)

    src_dim = src[-1]
    dst_dim = dst[-1]

    # If the inner dim is the same for both, no need for shared memory tile
    # Then map the outer source dim to the threadIdx.y values
    if src_dim == dst_dim:
        dst_dim = src[0]
        shared_tile = False
    else:
        shared_tile = True

    src_offset = []
    dst_offset = []
    params = []
    values = []
    magic = ""

    # add dims for bounds checking
    for dim in (src_dim, dst_dim):
        params.append("int dim_%s" % dim)
        values.append(shape[dim])

    # collapse src and dst shape by 32
    grid_shape = list(shape)
    grid_shape[src_dim] = _ceil_div(shape[src_dim], 32)
    grid_shape[dst_dim] = _ceil_div(shape[dst_dim], 32)

    # get a src list without dst dim
    src2 = [s for s in src if s != dst_dim]

    # get the name of the first compound index
    blkx_name = compound_idx = "".join(native_str(x) for x in src2)

    # generate the magic number math to extract all indices
    while len(src2) > 1:

        idx1 = src2[0]
        del src2[0]
        idx2 = "".join(native_str(i) for i in src2)
        div = reduce(mul, (grid_shape[i] for i in src2), 1)

        params.extend(p % idx2 for p in ("int magic_%s", "int shift_%s", "int div_%s"))
        values.extend(_magic64(div))
        values.append(div)

        magic += r"""
    int idx_{1} = div64(idx_{0}, magic_{2}, shift_{2});
    int idx_{2} = idx_{0} - idx_{1}*div_{2};
""".format(compound_idx, idx1, idx2)

        compound_idx = idx2

    # Add params for src strides and generate src offset
    # The param values will be added externally
    for s in src:
        params.append("long long src_str_%d" % s)
        src_offset.append("src_str_%d*idx_%d" % (s, s))

    # Add params for dst strides and generate dst offset
    for d in dst:
        params.append("long long dst_str_%d" % d)
        dst_offset.append("dst_str_%d*idx_%d" % (d, d))

    num_strides = len(src) + len(dst)

    if shared_tile:
        copy_transpose = r"""
%(common)s

__global__ void copy_transpose(%(type)s* out, const %(type)s* in, %(params)s)
{
    __shared__ %(type)s tile[32][33];

    int tid_x = threadIdx.x;
    int tid_y = threadIdx.y;
    int idx_%(blk)s = blockIdx.x;
    int idx_%(dst)s = blockIdx.y;

    %(magic)s

    idx_%(src)s = (idx_%(src)s << 5) + tid_x;
    idx_%(dst)s = (idx_%(dst)s << 5) + tid_y;

    const %(type)s* in00 = in   + %(src_offset)s;
    const %(type)s* in08 = in00 + src_str_%(dst)s*8;
    const %(type)s* in16 = in08 + src_str_%(dst)s*8;
    const %(type)s* in24 = in16 + src_str_%(dst)s*8;

    bool b%(src)s = idx_%(src)s < dim_%(src)s;

    if (idx_%(dst)s +  0 < dim_%(dst)s && b%(src)s) tile[tid_y +  0][tid_x] = *in00;
    if (idx_%(dst)s +  8 < dim_%(dst)s && b%(src)s) tile[tid_y +  8][tid_x] = *in08;
    if (idx_%(dst)s + 16 < dim_%(dst)s && b%(src)s) tile[tid_y + 16][tid_x] = *in16;
    if (idx_%(dst)s + 24 < dim_%(dst)s && b%(src)s) tile[tid_y + 24][tid_x] = *in24;

    __syncthreads();

    %(type)s val00 = tile[tid_x][tid_y +  0];
    %(type)s val08 = tile[tid_x][tid_y +  8];
    %(type)s val16 = tile[tid_x][tid_y + 16];
    %(type)s val24 = tile[tid_x][tid_y + 24];

    idx_%(src)s += tid_y - tid_x;
    idx_%(dst)s += tid_x - tid_y;

    bool b%(dst)s = idx_%(dst)s < dim_%(dst)s;

    %(type)s* out00 = out   + %(dst_offset)s;
    %(type)s* out08 = out00 + dst_str_%(src)s*8;
    %(type)s* out16 = out08 + dst_str_%(src)s*8;
    %(type)s* out24 = out16 + dst_str_%(src)s*8;

    if (idx_%(src)s +  0 < dim_%(src)s && b%(dst)s) *out00 = val00;
    if (idx_%(src)s +  8 < dim_%(src)s && b%(dst)s) *out08 = val08;
    if (idx_%(src)s + 16 < dim_%(src)s && b%(dst)s) *out16 = val16;
    if (idx_%(src)s + 24 < dim_%(src)s && b%(dst)s) *out24 = val24;
}
"""
    else:
        copy_transpose = r"""
%(common)s

__global__ void copy_transpose(%(type)s* out, const %(type)s* in, %(params)s)
{
    int tid_x = threadIdx.x;
    int tid_y = threadIdx.y;
    int idx_%(blk)s = blockIdx.x;
    int idx_%(dst)s = blockIdx.y;

    %(magic)s

    idx_%(src)s = (idx_%(src)s << 5) + tid_x;
    idx_%(dst)s = (idx_%(dst)s << 5) + tid_y;

    bool b%(src)s    = idx_%(src)s      < dim_%(src)s;
    bool b%(dst)s_00 = idx_%(dst)s +  0 < dim_%(dst)s && b%(src)s;
    bool b%(dst)s_08 = idx_%(dst)s +  8 < dim_%(dst)s && b%(src)s;
    bool b%(dst)s_16 = idx_%(dst)s + 16 < dim_%(dst)s && b%(src)s;
    bool b%(dst)s_24 = idx_%(dst)s + 24 < dim_%(dst)s && b%(src)s;

    %(type)s val00 = 0;
    %(type)s val08 = 0;
    %(type)s val16 = 0;
    %(type)s val24 = 0;

    const %(type)s* in00 = in   + %(src_offset)s;
    const %(type)s* in08 = in00 + src_str_%(dst)s*8;
    const %(type)s* in16 = in08 + src_str_%(dst)s*8;
    const %(type)s* in24 = in16 + src_str_%(dst)s*8;

    if (b%(dst)s_00) val00 = *in00;
    if (b%(dst)s_08) val08 = *in08;
    if (b%(dst)s_16) val16 = *in16;
    if (b%(dst)s_24) val24 = *in24;

    %(type)s* out00 = out   + %(dst_offset)s;
    %(type)s* out08 = out00 + dst_str_%(dst)s*8;
    %(type)s* out16 = out08 + dst_str_%(dst)s*8;
    %(type)s* out24 = out16 + dst_str_%(dst)s*8;

    if (b%(dst)s_00) *out00 = val00;
    if (b%(dst)s_08) *out08 = val08;
    if (b%(dst)s_16) *out16 = val16;
    if (b%(dst)s_24) *out24 = val24;
}
"""
    code = copy_transpose % dict(
        common=_div64,
        type=_get_register_type(dtype, memory=True),
        params=", ".join(params),
        blk=blkx_name,
        src=src_dim,
        dst=dst_dim,
        magic=magic,
        src_offset=" + ".join(src_offset),
        dst_offset=" + ".join(dst_offset)
    )
    # print code
    module = SourceModule(code)
    kernel = module.get_function("copy_transpose")
    kernel.prepare("PP" + ("I" * (len(params) - num_strides)) + "q" * num_strides)

    grid_x = grid_shape[src_dim]
    grid_y = grid_shape[dst_dim]
    for s in src:
        if s not in (src_dim, dst_dim):
            grid_x *= grid_shape[s]

    kernel.grid = (grid_x, grid_y, 1)
    kernel.block = (32, 8, 1)
    kernel.args = tuple(values)

    return kernel
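
A hedged usage sketch for the transpose path (the device pointers are assumptions; the argument order is inferred from the params list built above: the int bounds plus any magic numbers sit in kernel.args, followed by the source strides in axis order and the destination strides in axes order, all in elements):

import numpy as np

# transpose a 64x128 row-major array; with only two dims the magic-number
# loop adds nothing, so prepare("PPIIqqqq") expects the dims then 4 strides
kernel = _get_copy_transpose_kernel(np.dtype(np.float32), (64, 128), axes=(1, 0))
kernel.prepared_call(kernel.grid, kernel.block,
                     out_dev, in_dev,  # hypothetical device pointers
                     *kernel.args,     # (dim_1, dim_0) bounds for the guards
                     128, 1,           # src_str_0, src_str_1
                     64, 1)            # dst_str_1, dst_str_0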