Example #1
0
    def __init__(self,
                 lib,
                 dtype,
                 N,
                 C,
                 K,
                 D=1,
                 H=1,
                 W=1,
                 T=1,
                 R=1,
                 S=1,
                 pad_d=0,
                 pad_h=0,
                 pad_w=0,
                 str_d=1,
                 str_h=1,
                 str_w=1):
        """Record the convolution geometry and select its compute kernels.

        Derives the output extents (M, P, Q) via ``lib.output_dim``, stores
        the tensor shapes and sizes as attributes, and assigns
        ``fprop_kernels``, ``bprop_kernels`` and ``updat_kernels`` from one of
        three kernel families (CUDA C, Winograd, direct) depending on flags
        read from ``lib`` and on the layer geometry.

        Arguments:
            lib: backend object; this method reads ``output_dim``,
                ``use_cudac_kernels`` and ``enable_winograd`` from it.
            dtype: data type forwarded to the base class; also compared
                against ``np.float32`` when choosing the Winograd tile size.
            N, C, K: sizes used in the input shape (C, D, H, W, N), the filter
                shape (C, T, R, S, K) and the output shape (K, M, P, Q, N).
            D, H, W: input extents along the three spatial axes.
            T, R, S: filter extents along the same three axes.
            pad_d, pad_h, pad_w: padding per axis.
            str_d, str_h, str_w: stride per axis.

        Raises:
            ValueError: if ``lib.use_cudac_kernels`` is set and T > 1 or D > 1.
        """
        super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

        # Compute the output spatial dimensions
        M = lib.output_dim(D, T, pad_d, str_d)
        P = lib.output_dim(H, R, pad_h, str_h)
        Q = lib.output_dim(W, S, pad_w, str_w)

        self.C = C
        self.K = K
        self.M = M
        self.P = P
        self.Q = Q
        self.NCK = (N, C, K)
        self.TRS = (T, R, S)
        self.DHW = (D, H, W)
        self.MPQ = (M, P, Q)
        self.padding = (pad_d, pad_h, pad_w)
        self.strides = (str_d, str_h, str_w)

        self.all_params = (N, C, K, D, H, W, T, R, S, pad_d, pad_h, pad_w,
                           str_d, str_h, str_w)

        # 5D tensor shapes, followed by their flattened 2D equivalents
        self.dimI = (C, D, H, W, N)
        self.dimF = (C, T, R, S, K)
        self.dimFb = (K, T, R, S, C)
        self.dimO = (K, M, P, Q, N)
        self.dimI2 = (C * D * H * W, N)
        self.dimF2 = (C * T * R * S, K)
        self.dimF2t = (K, C * T * R * S)
        self.dimO2 = (K * M * P * Q, N)
        self.dimS = (K, 1)
        self.sizeI = reduce(mul, self.dimI, 1)
        self.sizeF = reduce(mul, self.dimF, 1)
        self.sizeO = reduce(mul, self.dimO, 1)
        self.nOut = reduce(mul, self.MPQ, 1) * K

        # flop count for benchmarking
        self.flops = P * Q * M * K * N * C * R * S * T * 2.0

        # Every kernel class below is constructed from this same argument list
        args = (lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q, pad_d,
                pad_h, pad_w, str_d, str_h, str_w)

        # ---- Cuda C ----
        if lib.use_cudac_kernels:

            # 3D conv not supported yet
            if T > 1 or D > 1:
                raise ValueError(
                    "3D Convolution not supported by CUDA C kernels and pre-Maxwell GPUs"
                )

            # TODO small C bprop?
            self.fprop_kernels = convolution.FpropCuda(*args)
            self.bprop_kernels = convolution.BpropCuda(*args)
            self.updat_kernels = convolution.UpdateCuda(*args)

        # ---- Winograd ----
        # Only taken for 3x3 filters (R == S == 3) with unit strides and
        # D == M == T == 1.
        elif lib.enable_winograd and R == 3 and S == 3 and all(
                x == 1 for x in (D, M, T, str_w, str_h, str_d)):
            from .winograd_conv import (FpropWinograd_2x2_3x3,
                                        BpropWinograd_2x2_3x3,
                                        UpdateWinograd_3x3_2x2,
                                        FpropWinograd_4x4_3x3,
                                        BpropWinograd_4x4_3x3,
                                        UpdateWinograd_3x3_4x4)

            # Temp for now till we can autotune
            # 2 is safer for fp16 without batchnorm
            if dtype == np.float32 and lib.enable_winograd == 4:
                winograd = 4
            else:
                winograd = 2

            if C < 8:
                self.fprop_kernels = convolution.FpropDirect(*args)
            elif winograd == 4 and H * W < 112 * 112:
                self.fprop_kernels = FpropWinograd_4x4_3x3(*args)
            else:
                self.fprop_kernels = FpropWinograd_2x2_3x3(*args)

            if winograd == 4 and H * W < 112 * 112:
                self.bprop_kernels = BpropWinograd_4x4_3x3(*args)
            else:
                self.bprop_kernels = BpropWinograd_2x2_3x3(*args)

            if N >= 4 and (C < 8 or H * W > 112 * 112):
                self.updat_kernels = convolution.UpdateDirect(*args)
            elif winograd == 4:
                self.updat_kernels = UpdateWinograd_3x3_4x4(*args)
            else:
                self.updat_kernels = UpdateWinograd_3x3_2x2(*args)

        # Disabled 5x5 Winograd path, kept for reference:
        #
        # elif lib.enable_winograd and not lib.deterministic and N > 1 and \
        #     R == 5 and S == 5 and all(x == 1 for x in (D,M,T,str_w,str_h,str_d)):
        #
        #     from .winograd_conv import (FpropWinograd_2x2_5x5, BpropWinograd_2x2_5x5)
        #
        #     self.fprop_kernels = FpropWinograd_2x2_5x5(*args)
        #     self.bprop_kernels = BpropWinograd_2x2_5x5(*args)
        #     if N >= 4:
        #         self.updat_kernels = convolution.UpdateDirect(*args)

        # ---- Direct ----
        else:

            self.fprop_kernels = convolution.FpropDirect(*args)
            self.bprop_kernels = convolution.BpropDirect(*args)
            if N >= 4:
                self.updat_kernels = convolution.UpdateDirect(*args)
            else:
                # UpdateDirect is only selected for N >= 4. Without a fallback
                # the attribute would be left unset for smaller N and any
                # later use would fail with AttributeError, so use the CUDA C
                # update kernel, which takes the same argument list.
                # NOTE(review): the CUDA C branch above rejects T > 1 or
                # D > 1; confirm UpdateCuda is acceptable for 3D geometry here.
                self.updat_kernels = convolution.UpdateCuda(*args)
Example #2
0
    def __init__(self, lib, dtype,
                 N, C, K,
                 D=1, H=1, W=1,
                 T=1, R=1, S=1,
                 pad_d=0, pad_h=0, pad_w=0,
                 str_d=1, str_h=1, str_w=1,
                 relu=False, bsum=False):
        """Record the convolution geometry and select its compute kernels.

        Derives the output extents (M, P, Q) via ``lib.output_dim``, stores
        the tensor shapes and sizes as attributes, and assigns
        ``fprop_kernels``, ``bprop_kernels`` and ``updat_kernels`` from one of
        three kernel families (CUDA C, Winograd, direct) depending on flags
        read from ``lib`` and on the layer geometry. The chosen kernels are
        logged at debug level.

        Arguments:
            lib: backend object; this method reads ``output_dim``,
                ``use_cudac_kernels`` and ``have_winograd`` from it.
            dtype: data type forwarded to the base class.
            N, C, K: sizes used in the input shape (C, D, H, W, N), the filter
                shape (C, T, R, S, K) and the output shape (K, M, P, Q, N).
            D, H, W: input extents along the three spatial axes.
            T, R, S: filter extents along the same three axes.
            pad_d, pad_h, pad_w: padding per axis.
            str_d, str_h, str_w: stride per axis.
            relu: stored on the layer and forwarded to the direct and Winograd
                fprop/bprop kernels.
            bsum: stored on the layer and forwarded to the fprop/bprop kernels.

        Raises:
            ValueError: if ``lib.use_cudac_kernels`` is set and T > 1 or D > 1.
        """
        super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

        # Compute the output spatial dimensions
        M = lib.output_dim(D, T, pad_d, str_d)
        P = lib.output_dim(H, R, pad_h, str_h)
        Q = lib.output_dim(W, S, pad_w, str_w)

        self.C = C
        self.K = K
        self.M = M
        self.P = P
        self.Q = Q
        self.NCK = (N, C, K)
        self.TRS = (T, R, S)
        self.DHW = (D, H, W)
        self.MPQ = (M, P, Q)
        self.padding = (pad_d, pad_h, pad_w)
        self.strides = (str_d, str_h, str_w)
        self.relu = relu
        self.bsum = bsum

        self.all_params = (N, C, K, D, H, W, T, R, S, pad_d, pad_h, pad_w, str_d, str_h, str_w)

        # 5D tensor shapes, followed by their flattened 2D equivalents
        self.dimI   = (C, D, H, W, N)
        self.dimF   = (C, T, R, S, K)
        self.dimFb  = (K, T, R, S, C)
        self.dimO   = (K, M, P, Q, N)
        self.dimI2  = (C*D*H*W, N)
        self.dimF2  = (C*T*R*S, K)
        self.dimF2t = (K, C*T*R*S)
        self.dimO2  = (K*M*P*Q, N)
        self.dimS   = (K, 1)
        self.sizeI  = reduce(mul, self.dimI, 1)
        self.sizeF  = reduce(mul, self.dimF, 1)
        self.sizeO  = reduce(mul, self.dimO, 1)
        self.nOut   = reduce(mul, self.MPQ, 1) * K

        # flop count for benchmarking
        self.flops = P*Q*M*K*N*C*R*S*T * 2.0

        # Positional arguments shared by every CUDA C / direct kernel class.
        conv_args = (lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
                     pad_d, pad_h, pad_w, str_d, str_h, str_w)
        # Direct fprop/bprop additionally take the two fusion flags.
        conv_args_fused = conv_args + (relu, bsum)

        ####### Cuda C ###########
        if lib.use_cudac_kernels:

            #3D conv not supported yet
            if T > 1 or D > 1:
                raise ValueError("3D Convolution not supported by CUDA C kernels.")

            self.fprop_kernels = convolution.FpropCuda(*conv_args, bsum=bsum)
            # TODO small C bprop?
            self.bprop_kernels = convolution.BpropCuda(*conv_args, bsum=bsum)
            self.updat_kernels = convolution.UpdateCuda(*conv_args)

        ####### Winograd ###########
        # Only taken for 3x3 filters (R == S == 3) with unit strides and
        # D == M == T == 1.
        elif lib.have_winograd and R == 3 and S == 3 and all(x == 1 for x in (D,M,T,str_w,str_h,str_d)):
            from winograd.convolution import FpropWinograd, BpropWinograd, UpdateWinograd

            # The Winograd kernel classes take a reduced, 2D-only argument list.
            wino_args = (lib, self.dtype, N, C, K, H, W, P, Q, pad_h, pad_w)
            wino_args_fused = wino_args + (relu, bsum)

            # lib.have_winograd is already known to be truthy in this branch,
            # so only the batch/channel test decides between direct and
            # Winograd kernels.
            if N >= 64 and C < 8:
                self.fprop_kernels = convolution.FpropDirect(*conv_args_fused)
            else:
                self.fprop_kernels = FpropWinograd(*wino_args_fused)

            self.bprop_kernels = BpropWinograd(*wino_args_fused)

            if N >= 32 and C < 8:
                self.updat_kernels = convolution.UpdateDirect(*conv_args)
            else:
                self.updat_kernels = UpdateWinograd(*wino_args)

        ####### Direct ###########
        else:
            # 4 for 4-byte element types, 8 otherwise
            vec_size = 4 if self.dtype.itemsize == 4 else 8

            self.fprop_kernels = convolution.FpropDirect(*conv_args_fused)

            if C % vec_size == 0:
                self.bprop_kernels = convolution.BpropDirect(*conv_args_fused)
            else:
                # special kernel for deconv into first layer
                self.bprop_kernels = convolution.BpropDirectSmallC(*conv_args)

            self.updat_kernels = convolution.UpdateDirect(*conv_args)

        # Pass the objects themselves so that string conversion only happens
        # when debug logging is actually enabled.
        logger.debug("%s: %s, %s, %s", self, self.fprop_kernels, self.bprop_kernels, self.updat_kernels)
Example #3
0
    def __init__(self, lib, dtype, N, C, K,
                 D=1, H=1, W=1,
                 T=1, R=1, S=1,
                 pad_d=0, pad_h=0, pad_w=0,
                 str_d=1, str_h=1, str_w=1,
                 bsum=False):
        """Record the convolution geometry and build the CUDA C kernels.

        Derives the output extents (M, P, Q) via ``lib.output_dim``, stores
        the tensor shapes and sizes as attributes, and constructs the
        ``FpropCuda``, ``BpropCuda`` and ``UpdateCuda`` kernel objects.

        Arguments:
            lib: backend object; ``output_dim`` is called on it and it is
                forwarded to the base class and to every kernel constructor.
            dtype: data type forwarded to the base class.
            N, C, K: sizes used in the input shape (C, D, H, W, N), the filter
                shape (C, T, R, S, K) and the output shape (K, M, P, Q, N).
            D, H, W: input extents along the three spatial axes.
            T, R, S: filter extents along the same three axes.
            pad_d, pad_h, pad_w: padding per axis.
            str_d, str_h, str_w: stride per axis.
            bsum: stored on the layer and forwarded to the fprop and bprop
                kernels.

        Raises:
            ValueError: if T > 1 or D > 1.
        """
        super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

        # Output extent along each spatial axis.
        M = lib.output_dim(D, T, pad_d, str_d)
        P = lib.output_dim(H, R, pad_h, str_h)
        Q = lib.output_dim(W, S, pad_w, str_w)

        # Scalar geometry.
        self.C, self.K = C, K
        self.M, self.P, self.Q = M, P, Q

        # Grouped geometry.
        self.NCK = (N, C, K)
        self.TRS = (T, R, S)
        self.DHW = (D, H, W)
        self.MPQ = (M, P, Q)
        self.padding = (pad_d, pad_h, pad_w)
        self.strides = (str_d, str_h, str_w)
        self.bsum = bsum

        self.all_params = (N, C, K,
                           D, H, W,
                           T, R, S,
                           pad_d, pad_h, pad_w,
                           str_d, str_h, str_w)

        # 5D tensor shapes.
        self.dimI = (C, D, H, W, N)
        self.dimF = (C, T, R, S, K)
        self.dimFb = (K, T, R, S, C)
        self.dimO = (K, M, P, Q, N)

        # The same tensors flattened to 2D.
        self.dimI2 = (C * D * H * W, N)
        self.dimF2 = (C * T * R * S, K)
        self.dimF2t = (K, C * T * R * S)
        self.dimO2 = (K * M * P * Q, N)
        self.dimS = (K, 1)

        # Element counts.
        self.sizeI = reduce(mul, self.dimI, 1)
        self.sizeF = reduce(mul, self.dimF, 1)
        self.sizeO = reduce(mul, self.dimO, 1)
        self.nOut = reduce(mul, self.MPQ, 1) * K

        # flop count for benchmarking
        self.flops = P * Q * M * K * N * C * R * S * T * 2.0

        is_3d = T > 1 or D > 1
        if is_3d:
            raise ValueError("3D Convolution not supported by CUDA C kernels.")

        # All three kernel classes share this positional argument list; the
        # fprop and bprop kernels additionally receive the bsum flag.
        kernel_args = (lib, self.dtype,
                       N, C, K,
                       D, H, W,
                       T, R, S,
                       M, P, Q,
                       pad_d, pad_h, pad_w,
                       str_d, str_h, str_w)

        self.fprop_kernels = convolution.FpropCuda(*kernel_args, bsum=bsum)
        # TODO small C bprop?
        self.bprop_kernels = convolution.BpropCuda(*kernel_args, bsum=bsum)
        self.updat_kernels = convolution.UpdateCuda(*kernel_args)