# FpropCuda.__init__: sets up the CUDA C forward-propagation conv kernel.
# Dims follow the usual neon convention: N batch; C/K input/output channels;
# D,H,W input depth/height/width; T,R,S filter dims; M,P,Q output dims.
def __init__(self, lib, dtype,
             N, C, K,
             D, H, W,
             T, R, S,
             M, P, Q,
             pad_d, pad_h, pad_w,
             str_d, str_h, str_w,
             bsum):

    super(FpropCuda, self).__init__(lib, dtype)

    assert N % 32 == 0, "N dim must be multiple of 32"
    assert K % self.vec_size == 0, "K dim must be multiple of %d" % self.vec_size

    # magic numbers for fast integer division/modulo inside the kernel
    magic_PQ = _magic64(P * Q)
    magic_Q = _magic64(Q)
    magic_S = _magic32(R * S + 32, S)

    HWN = H * W * N
    RST = R * S * T
    KRST = K * RST
    PQ = P * Q
    PQN = PQ * N

    from neon.backends.kernels.cuda.convolution import _get_conv_kernel
    self.kernel = _get_conv_kernel(dtype=self.dtype.str[1:], filter_size=R * S,
                                   bsum=bsum, operation="fprop")

    # grid x covers P*Q output pixels times ceil(N/32) batch tiles,
    # grid y covers ceil(K/32) channel tiles; -(-x // y) is ceiling division
    grid = (PQ * (-(-N // 32)), -(-K // 32), 1)
    block = (8, 8, 1)
    static_kernel_args = _flatten([C, D, H, W, N, T, R, S, K, M, P, Q,
                                   str_w, str_h, pad_w, pad_h,
                                   HWN // 4, KRST // 4, PQN // 4,
                                   PQ, 0, 0,
                                   magic_PQ, magic_Q, magic_S])
    self.launch_args = [grid, block] + [None] * 7 + static_kernel_args

    self.shared = RST * 4 * 2    # shared memory size in bytes
    self.flags = (bsum and 4)    # flag bit 4 requests the batch-sum output
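# The _magic32/_magic64 helpers used above precompute "magic numbers" so the
# CUDA kernel can turn division/modulo by runtime-constant sizes (P*Q, Q, S)
# into a multiply and shift when decomposing flat indices. A minimal sketch of
# the standard Granlund-Montgomery construction, assuming the helpers follow
# it (the in-tree versions may differ in detail):
def _magic32_sketch(nmax, d):
    # find (m, p) such that n // d == (n * m) >> p for all 0 <= n <= nmax
    nc = ((nmax + 1) // d) * d - 1
    nbits = nmax.bit_length()
    for p in range(0, 2 * nbits + 1):
        if 2 ** p > nc * (d - 1 - (2 ** p - 1) % d):
            m = (2 ** p + d - 1 - (2 ** p - 1) % d) // d
            return (m, p)
    raise ValueError("can't find magic number for division by %d" % d)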
# BpropCuda.__init__: sets up the CUDA C backward-data conv kernel. bprop
# runs with input/output roles swapped, so the grid covers the H*W input
# pixels and the C input channels.
def __init__(self, lib, dtype,
             N, C, K,
             D, H, W,
             T, R, S,
             M, P, Q,
             pad_d, pad_h, pad_w,
             str_d, str_h, str_w,
             bsum):

    super(BpropCuda, self).__init__(lib, dtype)

    assert N % 32 == 0, "N dim must be multiple of 32"
    assert K % self.vec_size == 0, "K dim must be multiple of %d" % self.vec_size

    # magic numbers for fast integer division/modulo inside the kernels
    magic_HW = _magic64(H * W)
    magic_W = _magic64(W)
    magic_RS = _magic32(R * S * T + 32, R * S)
    magic_S = _magic32(R * S + 32, S)

    HW = H * W
    HWN = HW * N
    RST = R * S * T
    CRST = C * RST
    PQ = P * Q
    PQN = PQ * N

    self.bsum = bsum

    from neon.backends.kernels.cuda.convolution import _get_conv_kernel
    self.kernel = _get_conv_kernel(dtype=self.dtype.str[1:], filter_size=R * S,
                                   bsum=bsum, operation="bprop")

    grid = (HW * (-(-N // 32)), -(-C // 32), 1)
    block = (8, 8, 1)
    static_kernel_args = _flatten([K, M, P, Q, N, T, R, S, C, D, H, W,
                                   str_w, str_h, pad_w, pad_h,
                                   PQN // 4, CRST // 4, HWN // 4,
                                   HW, 0, 0,
                                   magic_HW, magic_W, magic_S])
    self.launch_args = [grid, block] + [None] * 7 + static_kernel_args

    self.shared = R * S * T * 4 * 2    # shared memory size in bytes
    self.flags = (bsum and 4)

    # generate the kernel args for dim shuffling CTRSK => KTRSC
    shuffle_grid = (_ceil_div(K, 32), _ceil_div(C, 32), R * S * T)
    self.shuffle_size = C * T * R * S * K * dtype.itemsize
    self.shuffle_args = [shuffle_grid, (32, 8, 1), None, None, None]
    self.shuffle_args.extend(_flatten([
        R * S * T * K, R * S * K, S * K, K,
        R * S * T * C, R * S * C, S * C, C,
        R * S, T, R, S, magic_RS, magic_S]))

    lib.set_scratch_size(self.shuffle_size)
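# The CTRSK => KTRSC shuffle above transposes the filter tensor on the device
# so bprop can treat K as the "input channel" dim. A hypothetical host-side
# NumPy sketch of the same layout change (F contiguous in C,T,R,S,K order):
import numpy as np

def _shuffle_ctrsk_to_ktrsc_sketch(F, C, T, R, S, K):
    # swap the C and K axes, keeping T,R,S in place
    return np.ascontiguousarray(
        F.reshape(C, T, R, S, K).transpose(4, 1, 2, 3, 0))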
# UpdateCuda.__init__: sets up the CUDA C weight-update conv kernel.
def __init__(self, lib, dtype,
             N, C, K,
             D, H, W,
             T, R, S,
             M, P, Q,
             pad_d, pad_h, pad_w,
             str_d, str_h, str_w):

    super(UpdateCuda, self).__init__(lib, dtype)

    assert N % 32 == 0, "N dim must be multiple of 32"

    HWN = H * W * N
    RS = R * S
    RST = RS * T
    KRST = K * RST
    CRSTK = KRST * C
    PQ = P * Q
    PQN = PQ * N

    magic_S = _magic32(R * S + 32, S)

    if lib.deterministic:
        # deterministic mode: a single P*Q block accumulates the whole
        # update so the summation order is fixed, at the cost of a
        # full-size scratch buffer
        grid_P = 1
        grid_Q = 1
        self.determ = CRSTK
    else:
        grid_P = P
        grid_Q = Q
        self.determ = 0

    pq_blocks = grid_P * grid_Q
    magic_PQ = _magic64(pq_blocks)
    magic_Q = _magic64(grid_Q)

    from neon.backends.kernels.cuda.convolution import _get_conv_kernel
    self.kernel = _get_conv_kernel(dtype=self.dtype.str[1:], filter_size=R * S,
                                   bsum=False, operation="update")

    grid = (pq_blocks * (-(-K // 32)), -(-(C * RS) // 32), 1)
    block = (8, 32, 1)
    static_kernel_args = _flatten([C, D, H, W, N, T, R, S, K, M, P, Q,
                                   str_w, str_h, pad_w, pad_h,
                                   HWN // 4, KRST // 4, PQN // 4,
                                   PQ, grid_P, grid_Q,
                                   magic_PQ, magic_Q, magic_S])
    self.launch_args = [grid, block] + [None] * 7 + static_kernel_args

    # scratch for the 4-byte-per-element update accumulation (CRSTK elements)
    lib.set_scratch_size((self.determ or C * T * R * S * K) * 4)
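# _flatten and _ceil_div are small utilities these constructors lean on:
# _flatten collapses the nested argument lists (the magic numbers are
# (magic, shift) pairs) into one flat list for the kernel launch, and
# _ceil_div is ceiling division, the same idiom the grid math inlines as
# -(-x // y). Minimal sketches, assuming those behaviors:
def _ceil_div_sketch(x, y):
    return -(-x // y)

def _flatten_sketch(lst):
    return sum(([x] if not isinstance(x, (list, tuple)) else _flatten_sketch(x)
                for x in lst), [])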