def add_row_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Sum the elements of each row of `mat` and store the result in the
    corresponding position of `vec`.

    Unlike the other helpers here that only handle small inputs, this
    path supports up to 2^16 columns (possibly 2^20).

    vec   : GPU array shaped (mh, 1) or (1, mh).
    mat   : GPU array shaped (mh, mw).
    alpha, beta : kept for API compatibility; NOTE(review): they are not
        forwarded to cudaconv2.sum, so scaling is currently ignored —
        confirm no caller relies on alpha/beta != 1.0.
    '''
    timer.start()
    mh, mw = mat.shape
    vh, vw = vec.shape
    assert (vw == 1 and vh == mh) or (vh == 1 and vw == mh)
    if mw != 1:
        cudaconv2.sum(mat, 1, vec)
    else:
        # Single-column matrix: each row sum is the element itself,
        # so a direct copy avoids launching a reduction kernel.
        gpu_partial_copy_to(mat, vec, 0, mh, 0, 1)
    timer.end('add_row_sum_to_vec')
def add_row_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Sum the elements of each row of `mat` and store the result in the
    corresponding position of `vec`.

    Unlike the other helpers here that only handle small inputs, this
    path supports up to 2^16 columns (possibly 2^20).

    vec   : GPU array shaped (mh, 1) or (1, mh).
    mat   : GPU array shaped (mh, mw).
    alpha, beta : kept for API compatibility; NOTE(review): they are not
        forwarded to cudaconv2.sum, so scaling is currently ignored —
        confirm no caller relies on alpha/beta != 1.0.
    '''
    timer.start()
    mh, mw = mat.shape
    vh, vw = vec.shape
    assert (vw == 1 and vh == mh) or (vh == 1 and vw == mh)
    if mw != 1:
        cudaconv2.sum(mat, 1, vec)
    else:
        # Consistency with the sibling implementations of this function:
        # for a single-column matrix the row sums are the column itself,
        # so copy directly instead of running a reduction.
        gpu_partial_copy_to(mat, vec, 0, mh, 0, 1)
    timer.end('add_row_sum_to_vec')
def convWeightActs(input, ingrad, weight_grad, bias_grad, padding, stride, color, *args):
    '''
    Compute the convolution weight gradient into `weight_grad` and the
    bias gradient into `bias_grad`, given the layer input and the
    incoming gradient `ingrad`.

    Shapes are read through the ConvDataLayout / FilterLayout index
    constants, so the expected axis order depends on those layouts.
    `*args` is accepted but unused here.
    '''
    # Spatial dimensions taken from the data/filter layouts.
    image_y = input.shape[ConvDataLayout.HEIGHT]
    output_y = ingrad.shape[ConvDataLayout.HEIGHT]
    output_x = ingrad.shape[ConvDataLayout.WIDTH]
    filter_size = weight_grad.shape[FilterLayout.HEIGHT]
    # NOTE(review): the caller-supplied `color` parameter is immediately
    # overwritten by the input's channel count, so the argument value is
    # ignored — confirm this is intentional.
    color = input.shape[ConvDataLayout.CHANNEL]
    # scaleTargets=1, scaleOutput=0 trailing arguments.
    cudaconv2.convWeightActs(input, ingrad, weight_grad, image_y, output_y, output_x, filter_size, padding, stride, color, 1, 0)
    batch_size = ingrad.shape[ConvDataLayout.BATCH]
    channel = ingrad.shape[ConvDataLayout.CHANNEL]
    # Bias gradient: sum the incoming gradient over all spatial positions
    # and batch elements, one row per channel.
    cudaconv2.sum(ingrad.reshape((channel, output_y * output_x * batch_size)), 1, bias_grad)
def add_col_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Sum the elements of each column of `mat` into the corresponding
    position of `vec`.

    Intended for small matrices only: rows up to 1024, columns around
    2048 (the exact upper bound is untested).
    '''
    rows, cols = mat.shape
    vrows, vcols = vec.shape
    # `vec` must be a row or column vector whose length equals the
    # number of matrix columns.
    assert (vcols == 1 and vrows == cols) or (vrows == 1 and vcols == cols)
    cudaconv2.sum(mat, 0, vec)
def add_col_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Reduce `mat` along its rows, writing one sum per column into `vec`.

    Only suitable for small matrices: up to 1024 rows and roughly 2048
    columns (upper bound untested).
    '''
    n_rows, n_cols = mat.shape
    v_rows, v_cols = vec.shape
    vec_is_column = v_cols == 1 and v_rows == n_cols
    vec_is_row = v_rows == 1 and v_cols == n_cols
    assert vec_is_column or vec_is_row
    cudaconv2.sum(mat, 0, vec)
def add_row_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Sum each row of `mat` into the matching entry of `vec`.

    Unlike the small-matrix helpers, this supports up to 2^16 columns
    (possibly as many as 2^20).
    '''
    rows, cols = mat.shape
    v_rows, v_cols = vec.shape
    # `vec` must be a row or column vector of length `rows`.
    assert (v_cols == 1 and v_rows == rows) or (v_rows == 1 and v_cols == rows)
    if cols == 1:
        # One column: the row sums are just that column, so copy it.
        gpu_partial_copy_to(mat, vec, 0, rows, 0, 1)
    else:
        cudaconv2.sum(mat, 1, vec)
def add_row_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Reduce `mat` along its columns, writing one sum per row into `vec`.

    Handles wide matrices: the column count may be as large as 2^16
    (possibly up to 2^20).
    '''
    n_rows, n_cols = mat.shape
    v_rows, v_cols = vec.shape
    vec_is_column = v_cols == 1 and v_rows == n_rows
    vec_is_row = v_rows == 1 and v_cols == n_rows
    assert vec_is_column or vec_is_row
    if n_cols == 1:
        # Degenerate single-column case: copying is the reduction.
        gpu_partial_copy_to(mat, vec, 0, n_rows, 0, 1)
    else:
        cudaconv2.sum(mat, 1, vec)
def add_col_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Sum the elements of each column of `mat` and store the result in the
    corresponding position of `vec`.

    ONLY works on small matrices: rows up to 1024, columns around 2048
    (the exact upper bound is untested).

    alpha, beta : kept for API compatibility; NOTE(review): they are not
        forwarded to cudaconv2.sum, so scaling is currently ignored —
        confirm no caller relies on alpha/beta != 1.0.
    '''
    timer.start()
    mh, mw = mat.shape
    vh, vw = vec.shape
    assert (vw == 1 and vh == mw) or (vh == 1 and vw == mw)
    cudaconv2.sum(mat, 0, vec)
    timer.end('add_col_sum_to_vec')
def add_col_sum_to_vec(vec, mat, alpha=1.0, beta=1.0):
    '''
    Sum the elements of each column of `mat` and store the result in the
    corresponding position of `vec`.

    ONLY works on small matrices: rows up to 1024, columns around 2048
    (the exact upper bound is untested).

    alpha, beta : kept for API compatibility; NOTE(review): they are not
        forwarded to cudaconv2.sum, so scaling is currently ignored —
        confirm no caller relies on alpha/beta != 1.0.
    '''
    timer.start()
    mh, mw = mat.shape
    vh, vw = vec.shape
    assert (vw == 1 and vh == mw) or (vh == 1 and vw == mw)
    cudaconv2.sum(mat, 0, vec)
    timer.end('add_col_sum_to_vec')