def _init_thread_memory(self, dev_id: int, ctx: cuda.Context, alloc_size: int) -> None: ''' Single thread that initializes the memory for all the stream for a single GPU. ''' ctx.push() size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream)) # Initialize streams for i in range(self.num_stream): self.streams[dev_id].append(cuda.Stream()) for i in range(0, self.num_stream, 1): # allocate memory on device self.moments_device[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10)))) self.w_device[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))) self.x_device[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))) self.y_device[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9)))) # set host memory for returned output self.c_moments[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7)))) self.mu[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))) self.yf[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))) self.m1[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5)))) self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(0), size_per_batch, size_per_batch, block=self.block_size, grid=self.grid_size, stream=self.streams[dev_id][i]) self.float_value_set[dev_id](self.m1[dev_id][i], np.float32(1), size_per_batch, np.int32(0), block=self.block_size, grid=self.grid_size, stream=self.streams[dev_id][i]) self.x1[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))) self.w1[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))) self.x2[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))) self.w2[dev_id].append( (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3)))) ctx.synchronize() ctx.pop()
def calcV(I_shape, I_cu, V_cu): #Ifull = I Ci = I_shape[0] iH = I_shape[1] iW = I_shape[2] N = I_shape[3] tiles = iW // 4 oH = iH oW = iW padH = 1 padW = 1 # adapted from winograd_conv.py #if N == 1: # shlN = 0 #elif N < 32: # shlN = len(bin(N-1))-2 #else: # shlN = 5 shlN = 5 shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = { 0 : (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300), # 4x8 yyxxx 1 : (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201), # 4x4 yyxxn 2 : (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202), # 2x4 yxxnn 3 : (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203), # 1x4 xxnnn 4 : (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104), # 1x2 xnnnn 5 : (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000), # 1x1 nnnnn }.get(shlN) GYS = ceil_div(oH, 1 << shlY) GXS = ceil_div(oW, 1 << shlX) GN = ceil_div(N, 1 << shlN) # GK = ceil_div(Co, 32) GYS2 = GYS // 2 GXS2 = GXS * 2 div_GXS2 = get_div_mul_shift_32(GXS * GYS, GXS2) div_GXS = get_div_mul_shift_32(GXS * GYS, GXS) image_size = 1152*Ci*GXS*GYS*GN print('div_GXS', div_GXS) print('GYS', GYS, 'GXS', GXS, 'GN', GN, 'Ci', Ci, 'GY_GX', GXS * GYS) grid = (GN, GYS*GXS, Ci) block = (32, 1, 1) call_cu_kernel( k_calcV, grid, block, V_cu, I_cu, iH, iW, N, padH, padW, GXS, GYS2, GXS2, div_GXS2[0], div_GXS2[1], div_GXS[0], div_GXS[1], shlY, shlX, maskY, shrY, maskX, shrX, shlN, maskN, iH * iW * N, iW * N, GYS*GXS*Ci*1152, GXS * Ci * 1152, Ci * 1152, GXS, GXS * GYS, GN, Ci) Context.synchronize() timecheck('calced V_cu')
def gpu_filter(pixels, width, height): size = np.array([width, height]) filtered = np.zeros_like(pixels) grid_dim = (width // BLOCK_SIZE, height // BLOCK_SIZE) median_filter(In(pixels), Out(filtered), In(size), block=BLOCK, grid=grid_dim) Context.synchronize() return filtered
def _set_thread_args(self, dev_id: int, ctx: cuda.Context, moment: np.ndarray, w_out: np.ndarray, x_out: np.ndarray, y_out: np.ndarray): ''' Set the input moment for all the stream for a specific GPU ''' ctx.push() # number of input for this GPU max_size = moment.shape[1] # loop through the streams to set their input for i in range(0, self.num_stream, 1): # Size of input allocated for each stream size_per_batch = int(np.ceil(max_size / self.num_stream)) # location on the original input array where the input to this stream starts loc = np.int32((i) * size_per_batch) if loc + size_per_batch > max_size: size_per_batch = max_size - loc self.moment_chunk_host[dev_id].append( np.ascontiguousarray(moment[:, loc:loc + size_per_batch], dtype=np.float32)) self.moment_chunk_host[dev_id][i] = cuda.register_host_memory( self.moment_chunk_host[dev_id][i], cuda.mem_host_register_flags.PORTABLE) self.w_chunk_host[dev_id].append( np.ascontiguousarray( np.zeros_like(w_out[:, loc:loc + size_per_batch]))) self.w_chunk_host[dev_id][i] = cuda.register_host_memory( self.w_chunk_host[dev_id][i], cuda.mem_host_register_flags.PORTABLE) self.x_chunk_host[dev_id].append( np.ascontiguousarray( np.zeros_like(x_out[:, loc:loc + size_per_batch]))) self.x_chunk_host[dev_id][i] = cuda.register_host_memory( self.x_chunk_host[dev_id][i], cuda.mem_host_register_flags.PORTABLE) self.y_chunk_host[dev_id].append( np.ascontiguousarray( np.zeros_like(y_out[:, loc:loc + size_per_batch]))) self.y_chunk_host[dev_id][i] = cuda.register_host_memory( self.y_chunk_host[dev_id][i], cuda.mem_host_register_flags.PORTABLE) ctx.synchronize() ctx.pop()
def calcO(O_cu, M_shape, M_cu): GK = M_shape[2] GN = M_shape[0] tiles = M_shape[4] num_xinu_tiles = GK * 32 * GN * 32 * tiles * tiles grid = (ceil_div(num_xinu_tiles, 32), 1, 1) block = (32, 1, 1) call_cu_kernel( k_calcO, grid, block, O_cu, M_cu, num_xinu_tiles ) Context.synchronize() timecheck('calced O_cu')
def calcM(N, Co, M_cu, U_shape, U_cu, V_shape, V_cu): Co = (U_shape[2] - 1) * 32 + U_shape[4] Ci = U_shape[3] GK = ceil_div(Co, 32) tiles = V_shape[4] GN = V_shape[2] print('GK', GK, 'GN', GN, 'tiles', tiles, 'Co', Co, 'Ci', Ci, 'N', N) grid = (tiles * tiles,1,1) # b block = (32, 16, 1) # 16 for intel... call_cu_kernel( k_calcM, grid, block, M_cu, U_cu, V_cu, Ci, 1, tiles, GN, GK) #, # cl.LocalMemory(32 * 32 * 4), cl.LocalMemory(32 * 32 * 4)) Context.synchronize() timecheck('calced M_cu')
def calcU(W_shape, W_cu, U_cu): Ci = W_shape[0] kH = W_shape[1] kW = W_shape[2] Co = W_shape[3] # this is adapted from neon's winograd_conv.py: GK = ceil_div(Co, 32) filter_size = 1152*Ci*GK grid = (GK, Ci, 1) block = (32, 1, 1) call_cu_kernel( k_calcU, grid, block, U_cu, W_cu, kH * kW * Co, kW * Co, kW * Co * 2, Co, Ci * 1152, Ci, GK) Context.synchronize() timecheck('calced U_cu')
def process(iH, iW, N, Ci, Co, kH=3, kW=3): inittime() np.random.seed(123) oH = iH oW = iW tiles = iW // 4 shlN = 5 shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = { 0 : (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300), # 4x8 yyxxx 1 : (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201), # 4x4 yyxxn 2 : (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202), # 2x4 yxxnn 3 : (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203), # 1x4 xxnnn 4 : (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104), # 1x2 xnnnn 5 : (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000), # 1x1 nnnnn }.get(shlN) GYS = ceil_div(oH, 1 << shlY) GXS = ceil_div(oW, 1 << shlX) GN = ceil_div(N, 1 << shlN) # GK = ceil_div(Co, 32) GYS2 = GYS // 2 GXS2 = GXS * 2 GK = ceil_div(Co, 32) W = np.random.randn(Ci,kH,kW,Co).astype(np.float32) I = np.zeros((Ci,iH, iW,N), dtype=np.float32) I[:] = np.random.randn(*I.shape) print('Co', Co, 'iH', iH, 'iW', iW, 'N', N, 'tiles', tiles) W_cu = gpuarray.to_gpu(W) I_cu = gpuarray.to_gpu(I) U = np.zeros((6, 6, GK, Ci, 32,), dtype=np.float32) U_cu = gpuarray.to_gpu(U) V = np.zeros((6, 6, GN,GXS, GYS, Ci, 32), dtype=np.float32) V_cu = gpuarray.to_gpu(V) M = np.zeros((GN, 32, GK, 32, tiles, tiles, 6, 6,), dtype=np.float32) M_cu = gpuarray.to_gpu(M) O = np.zeros((GN, 32, GK, 32, tiles, tiles, 4, 4,), dtype=np.float32) O_cu = gpuarray.to_gpu(O) Context.synchronize() print('allocated buffers') start = time.time() for it in range(3): calcU(U_cu=U_cu, W_shape=W.shape, W_cu=W_cu) calcV(V_cu=V_cu, I_shape=I.shape, I_cu=I_cu) calcM(N=N, Co=Co, M_cu=M_cu, U_shape=U.shape, U_cu=U_cu, V_shape=V.shape, V_cu=V_cu) calcO(O_cu=O_cu, M_shape=M.shape, M_cu=M_cu) Context.synchronize() end = time.time() print('calcs done') print('time for all calcs:', end - start) start = time.time() O = O_cu.get() # cl.enqueue_copy(q, O, O_cu) O = O.transpose(2,3, 4,6, 5,7, 0,1).reshape( GK * 32, tiles * 4, tiles * 4, GN * 32) print('O.shape', O.shape) W_from_cu = np.zeros((Ci, 3, 3, Co), dtype=np.float32) W_from_cu = W_cu.get() U_from_cpu = winograd_cpu.calcU(W=W) U_from_cu = np.zeros((6, 6, GK, Ci, 32), dtype=np.float32) U_from_cu = U_cu.get() U_from_cu_ = U_from_cu.transpose( 0, 1, 2, 4, 3).reshape(6, 6, GK * 32, Ci)[:, :, :Co] assert np.allclose(U_from_cu_, U_from_cpu, atol=1e-4) V_from_cpu = winograd_cpu.calcV(I=I) V_from_cu = np.copy(V) V_from_cu = V_cu.get() print('tiles', tiles) # 0 1 2 3 4 5 6 # 6, 6, GN,GXS, GYS, Ci, 32 V_from_cu_ = V_from_cu.transpose( 2,6,0,1,5,3,4).reshape( GN * 32, 6, 6, Ci, tiles, tiles)[:N] assert np.allclose(V_from_cu_, V_from_cpu, atol=1e-3) # 0 1 2 3 4 5 6 7 # [n//32][n % 32][co // 32][co % 32][th][tw][xi][nu] M_from_cpu = winograd_cpu.calcM(U=U_from_cu, V=V_from_cu, N=N, Co=Co) M_from_cu = np.copy(M) M_from_cu = M_cu.get() Context.synchronize() M_from_cu = M_from_cu.reshape(GN * 32, GK * 32, tiles, tiles, 6, 6)[:N, :Co] print(M_from_cu.reshape(M_from_cu.size)[:20]) assert np.allclose(M_from_cu, M_from_cpu, atol=1e-2) #np.transpose(V_from_cu, [2, 6, 4, 5, 3, 0, 1]) #V_from_cu = V_from_cu.reshape(GN * 32, 6, 6, Ci, tiles, tiles)[:N,:,:,:,:,:] return {'W': W, 'O': O, 'I': I}