def __init__(self, lib, dtype,
             N, C, K,
             D=1, H=1, W=1,
             T=1, R=1, S=1,
             pad_d=0, pad_h=0, pad_w=0,
             str_d=1, str_h=1, str_w=1):

    super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

    # Compute the output spatial dimensions
    M = lib.output_dim(D, T, pad_d, str_d)
    P = lib.output_dim(H, R, pad_h, str_h)
    Q = lib.output_dim(W, S, pad_w, str_w)

    self.C = C
    self.K = K
    self.M = M
    self.P = P
    self.Q = Q
    self.NCK = (N, C, K)
    self.TRS = (T, R, S)
    self.DHW = (D, H, W)
    self.MPQ = (M, P, Q)
    self.padding = (pad_d, pad_h, pad_w)
    self.strides = (str_d, str_h, str_w)

    self.all_params = (N, C, K, D, H, W, T, R, S,
                       pad_d, pad_h, pad_w, str_d, str_h, str_w)

    self.dimI   = (C, D, H, W, N)
    self.dimF   = (C, T, R, S, K)
    self.dimFb  = (K, T, R, S, C)
    self.dimO   = (K, M, P, Q, N)
    self.dimI2  = (C * D * H * W, N)
    self.dimF2  = (C * T * R * S, K)
    self.dimF2t = (K, C * T * R * S)
    self.dimO2  = (K * M * P * Q, N)
    self.dimS   = (K, 1)
    self.sizeI  = reduce(mul, self.dimI, 1)
    self.sizeF  = reduce(mul, self.dimF, 1)
    self.sizeO  = reduce(mul, self.dimO, 1)
    self.nOut   = reduce(mul, self.MPQ, 1) * K

    # flop count for benchmarking
    self.flops = P * Q * M * K * N * C * R * S * T * 2.0

    args = (lib, self.dtype,
            N, C, K, D, H, W, T, R, S, M, P, Q,
            pad_d, pad_h, pad_w, str_d, str_h, str_w)

    # lib.enable_winograd = 0

    ####### CUDA C ###########
    if lib.use_cudac_kernels:
        # 3D conv not supported yet
        if T > 1 or D > 1:
            raise ValueError("3D Convolution not supported by CUDA C kernels "
                             "and pre-Maxwell GPUs")

        # TODO: small C bprop?
        self.fprop_kernels = convolution.FpropCuda(*args)
        self.bprop_kernels = convolution.BpropCuda(*args)
        self.updat_kernels = convolution.UpdateCuda(*args)

    ####### Winograd ###########
    elif lib.enable_winograd and R == 3 and S == 3 and \
            all(x == 1 for x in (D, M, T, str_w, str_h, str_d)):

        from .winograd_conv import (FpropWinograd_2x2_3x3, BpropWinograd_2x2_3x3,
                                    UpdateWinograd_3x3_2x2, FpropWinograd_4x4_3x3,
                                    BpropWinograd_4x4_3x3, UpdateWinograd_3x3_4x4)

        # Temp for now till we can autotune
        # 2 is safer for fp16 without batchnorm
        if dtype == np.float32 and lib.enable_winograd == 4:
            winograd = 4
        else:
            winograd = 2

        if C < 8:
            self.fprop_kernels = convolution.FpropDirect(*args)
        elif winograd == 4 and H * W < 112 * 112:
            self.fprop_kernels = FpropWinograd_4x4_3x3(*args)
        else:
            self.fprop_kernels = FpropWinograd_2x2_3x3(*args)

        if winograd == 4 and H * W < 112 * 112:
            self.bprop_kernels = BpropWinograd_4x4_3x3(*args)
        else:
            self.bprop_kernels = BpropWinograd_2x2_3x3(*args)

        if N >= 4 and (C < 8 or H * W > 112 * 112):
            self.updat_kernels = convolution.UpdateDirect(*args)
        elif winograd == 4:
            self.updat_kernels = UpdateWinograd_3x3_4x4(*args)
        else:
            self.updat_kernels = UpdateWinograd_3x3_2x2(*args)

    # elif lib.enable_winograd and not lib.deterministic and N > 1 and \
    #         R == 5 and S == 5 and all(x == 1 for x in (D, M, T, str_w, str_h, str_d)):
    #
    #     from .winograd_conv import (FpropWinograd_2x2_5x5, BpropWinograd_2x2_5x5)
    #
    #     self.fprop_kernels = FpropWinograd_2x2_5x5(*args)
    #     self.bprop_kernels = BpropWinograd_2x2_5x5(*args)
    #     if N >= 4:
    #         self.updat_kernels = convolution.UpdateDirect(*args)

    ####### Direct ###########
    else:
        self.fprop_kernels = convolution.FpropDirect(*args)
        self.bprop_kernels = convolution.BpropDirect(*args)
        if N >= 4:
            self.updat_kernels = convolution.UpdateDirect(*args)
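
# All of the constructors here size the output volume through lib.output_dim.
# A minimal sketch of that helper, assuming the standard convolution output
# arithmetic (the name _output_dim_sketch and the floor-division behavior are
# assumptions for illustration, not necessarily the backend's implementation):
def _output_dim_sketch(X, S, padding, stride):
    """Spatial output extent for input extent X, filter extent S,
    symmetric padding, and stride: floor((X - S + 2*pad) / stride) + 1."""
    return (X - S + 2 * padding) // stride + 1

# Example: H = 224, R = 3, pad_h = 1, str_h = 1 gives
#   P = (224 - 3 + 2) // 1 + 1 = 224   (a "same" 3x3 convolution),
# which also satisfies the Winograd eligibility test above
# (R == 3, S == 3, unit strides, no depth dimension).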
def __init__(self, lib, dtype,
             N, C, K,
             D=1, H=1, W=1,
             T=1, R=1, S=1,
             pad_d=0, pad_h=0, pad_w=0,
             str_d=1, str_h=1, str_w=1,
             relu=False, bsum=False):

    super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

    # Compute the output spatial dimensions
    M = lib.output_dim(D, T, pad_d, str_d)
    P = lib.output_dim(H, R, pad_h, str_h)
    Q = lib.output_dim(W, S, pad_w, str_w)

    self.C = C
    self.K = K
    self.M = M
    self.P = P
    self.Q = Q
    self.NCK = (N, C, K)
    self.TRS = (T, R, S)
    self.DHW = (D, H, W)
    self.MPQ = (M, P, Q)
    self.padding = (pad_d, pad_h, pad_w)
    self.strides = (str_d, str_h, str_w)
    self.relu = relu
    self.bsum = bsum

    self.all_params = (N, C, K, D, H, W, T, R, S,
                       pad_d, pad_h, pad_w, str_d, str_h, str_w)

    self.dimI   = (C, D, H, W, N)
    self.dimF   = (C, T, R, S, K)
    self.dimFb  = (K, T, R, S, C)
    self.dimO   = (K, M, P, Q, N)
    self.dimI2  = (C * D * H * W, N)
    self.dimF2  = (C * T * R * S, K)
    self.dimF2t = (K, C * T * R * S)
    self.dimO2  = (K * M * P * Q, N)
    self.dimS   = (K, 1)
    self.sizeI  = reduce(mul, self.dimI, 1)
    self.sizeF  = reduce(mul, self.dimF, 1)
    self.sizeO  = reduce(mul, self.dimO, 1)
    self.nOut   = reduce(mul, self.MPQ, 1) * K

    # flop count for benchmarking
    self.flops = P * Q * M * K * N * C * R * S * T * 2.0

    ####### CUDA C ###########
    if lib.use_cudac_kernels:
        # 3D conv not supported yet
        if T > 1 or D > 1:
            raise ValueError("3D Convolution not supported by CUDA C kernels.")

        self.fprop_kernels = convolution.FpropCuda(
            lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
            pad_d, pad_h, pad_w, str_d, str_h, str_w, bsum=bsum)
        # TODO: small C bprop?
        self.bprop_kernels = convolution.BpropCuda(
            lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
            pad_d, pad_h, pad_w, str_d, str_h, str_w, bsum=bsum)
        self.updat_kernels = convolution.UpdateCuda(
            lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
            pad_d, pad_h, pad_w, str_d, str_h, str_w)

    ####### Winograd ###########
    elif lib.have_winograd and R == 3 and S == 3 and \
            all(x == 1 for x in (D, M, T, str_w, str_h, str_d)):

        from winograd.convolution import FpropWinograd, BpropWinograd, UpdateWinograd

        if (N >= 64 and C < 8) or not lib.have_winograd:
            self.fprop_kernels = convolution.FpropDirect(
                lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
                pad_d, pad_h, pad_w, str_d, str_h, str_w, relu, bsum)
        else:
            self.fprop_kernels = FpropWinograd(
                lib, self.dtype, N, C, K, H, W, P, Q, pad_h, pad_w, relu, bsum)

        self.bprop_kernels = BpropWinograd(
            lib, self.dtype, N, C, K, H, W, P, Q, pad_h, pad_w, relu, bsum)

        if (N >= 32 and C < 8) or not lib.have_winograd:
            self.updat_kernels = convolution.UpdateDirect(
                lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
                pad_d, pad_h, pad_w, str_d, str_h, str_w)
        else:
            self.updat_kernels = UpdateWinograd(
                lib, self.dtype, N, C, K, H, W, P, Q, pad_h, pad_w)

    ####### Direct ###########
    else:
        vec_size = 4 if self.dtype.itemsize == 4 else 8

        self.fprop_kernels = convolution.FpropDirect(
            lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
            pad_d, pad_h, pad_w, str_d, str_h, str_w, relu, bsum)

        if C % vec_size == 0:
            self.bprop_kernels = convolution.BpropDirect(
                lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
                pad_d, pad_h, pad_w, str_d, str_h, str_w, relu, bsum)
        else:
            # special kernel for deconv into first layer
            self.bprop_kernels = convolution.BpropDirectSmallC(
                lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
                pad_d, pad_h, pad_w, str_d, str_h, str_w)

        self.updat_kernels = convolution.UpdateDirect(
            lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
            pad_d, pad_h, pad_w, str_d, str_h, str_w)

    logger.debug("%s: %s, %s, %s",
                 str(self), str(self.fprop_kernels),
                 str(self.bprop_kernels), str(self.updat_kernels))
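
# The direct path above dispatches bprop on C % vec_size: with a 4-byte dtype
# (fp32) the direct bprop kernel presumably reads the channel axis in 4-wide
# vectors (8-wide for fp16), so a first layer with e.g. C = 3 cannot use it
# and falls back to BpropDirectSmallC. A toy reproduction of just that
# dispatch decision (the function name is a hypothetical stand-in, not part
# of the backend):
import numpy as np

def _bprop_choice_sketch(dtype, C):
    """Return which direct bprop kernel the constructor above would pick."""
    vec_size = 4 if np.dtype(dtype).itemsize == 4 else 8
    return "BpropDirect" if C % vec_size == 0 else "BpropDirectSmallC"

# _bprop_choice_sketch(np.float32, 64) -> 'BpropDirect'
# _bprop_choice_sketch(np.float32, 3)  -> 'BpropDirectSmallC' (e.g. RGB input)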
def __init__(self, lib, dtype,
             N, C, K,
             D=1, H=1, W=1,
             T=1, R=1, S=1,
             pad_d=0, pad_h=0, pad_w=0,
             str_d=1, str_h=1, str_w=1,
             bsum=False):

    super(ConvLayer, self).__init__(lib, dtype, N, np.float32)

    # Compute the output spatial dimensions
    M = lib.output_dim(D, T, pad_d, str_d)
    P = lib.output_dim(H, R, pad_h, str_h)
    Q = lib.output_dim(W, S, pad_w, str_w)

    self.C = C
    self.K = K
    self.M = M
    self.P = P
    self.Q = Q
    self.NCK = (N, C, K)
    self.TRS = (T, R, S)
    self.DHW = (D, H, W)
    self.MPQ = (M, P, Q)
    self.padding = (pad_d, pad_h, pad_w)
    self.strides = (str_d, str_h, str_w)
    self.bsum = bsum

    self.all_params = (N, C, K, D, H, W, T, R, S,
                       pad_d, pad_h, pad_w, str_d, str_h, str_w)

    self.dimI   = (C, D, H, W, N)
    self.dimF   = (C, T, R, S, K)
    self.dimFb  = (K, T, R, S, C)
    self.dimO   = (K, M, P, Q, N)
    self.dimI2  = (C * D * H * W, N)
    self.dimF2  = (C * T * R * S, K)
    self.dimF2t = (K, C * T * R * S)
    self.dimO2  = (K * M * P * Q, N)
    self.dimS   = (K, 1)
    self.sizeI  = reduce(mul, self.dimI, 1)
    self.sizeF  = reduce(mul, self.dimF, 1)
    self.sizeO  = reduce(mul, self.dimO, 1)
    self.nOut   = reduce(mul, self.MPQ, 1) * K

    # flop count for benchmarking
    self.flops = P * Q * M * K * N * C * R * S * T * 2.0

    if T > 1 or D > 1:
        raise ValueError("3D Convolution not supported by CUDA C kernels.")

    self.fprop_kernels = convolution.FpropCuda(
        lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
        pad_d, pad_h, pad_w, str_d, str_h, str_w, bsum=bsum)
    # TODO: small C bprop?
    self.bprop_kernels = convolution.BpropCuda(
        lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
        pad_d, pad_h, pad_w, str_d, str_h, str_w, bsum=bsum)
    self.updat_kernels = convolution.UpdateCuda(
        lib, self.dtype, N, C, K, D, H, W, T, R, S, M, P, Q,
        pad_d, pad_h, pad_w, str_d, str_h, str_w)
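
# A worked example of the bookkeeping shared by all three constructors, with
# layer dimensions chosen purely for illustration: a 3x3 convolution
# (T = 1, R = S = 3) with N = 32, C = 64, K = 128 on a 56x56 input,
# pad = 1, stride = 1, so M = 1 and P = Q = 56:
#
#   nOut  = M * P * Q * K = 1 * 56 * 56 * 128     = 401,408 outputs per image
#   flops = P*Q*M*K*N*C*R*S*T * 2.0
#         = 56*56 * 1 * 128 * 32 * 64 * 9 * 2     ~= 14.8 GFLOPs per minibatch
#
# The factor of 2.0 counts each multiply-accumulate as two floating-point
# operations, which is the convention the benchmarking flop count assumes.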