def SART2DBackWard(grad, order, sp):
    """Propagate ``grad`` backwards through the views of a 2D SART sweep.

    The views in ``order`` are visited from index ``sp['nViews'] - 1`` down
    to ``0``.  For each view, ``grad`` is copied into a 2D CUDA array bound
    to the ``texImage`` texture reference and ``fGetFp_kernel`` is launched
    with a per-view buffer of ``sp['nBins']`` floats; that buffer is then
    bound to the ``texFP`` texture reference and
    ``AssignResidualError_kernel`` is launched with ``grad`` as its first
    argument.  Per the inline comments these two launches correspond to
    ``A*x`` and ``AT*A*x``; the kernel sources are not visible here.

    Args:
        grad: incoming gradient image.  It is copied into a
            ``sp['nWidth']`` x ``sp['nHeight']`` single-channel 32-bit float
            CUDA array, so presumably a float32 cupy array of that size --
            TODO confirm against the caller.  It is passed to the kernel
            directly, so the caller's array is presumably modified in
            place -- confirm.
        order: indexable sequence of view indices; entries ``0`` to
            ``sp['nViews'] - 1`` are read, in reverse.
        sp: mapping of scan parameters.  Keys read here: ``nBins``,
            ``nWidth``, ``nHeight``, ``nViews``, ``nNumAngle``,
            ``nStartAngle``, ``fRotateDir``, ``fSod``, ``fOdd``,
            ``fCellSize``, ``fPixelSize``, ``fFovRadius``, ``fOffSet``,
            ``fAngleOfSlope`` and ``relax_factor``.

    Returns:
        ``grad_ - sp['relax_factor'] * grad`` where ``grad_`` is a copy of
        the gradient taken on entry.
    """
    # Keep the untouched incoming gradient for the final combination.
    grad_ = grad.copy()
    # Launch geometry: one thread per detector bin (1D) / per pixel (2D),
    # rounded up to whole blocks.
    block1D = (8, 1)
    grid1D = ((sp['nBins'] + block1D[0] - 1) // block1D[0], 1)
    block2D = (8, 8)
    grid2D = ((sp['nWidth'] + block2D[0] - 1) // block2D[0],
              (sp['nHeight'] + block2D[1] - 1) // block2D[1])
    # The module is compiled from the texture-reference kernel source on
    # every call.
    mod = cupy.RawModule(code=source_texref)
    AssignResidualError = mod.get_function('AssignResidualError_kernel')
    FpKernel = mod.get_function('fGetFp_kernel')
    # 2D texture
    channelDescImg = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    cuArrayImg = CUDAarray(channelDescImg, sp['nWidth'], sp['nHeight'])
    resourceDescImg = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                         cuArrayImg)
    address_modeImg = (runtime.cudaAddressModeClamp,
                       runtime.cudaAddressModeClamp)
    texDescImg = TextureDescriptor(address_modeImg,
                                   runtime.cudaFilterModePoint,
                                   runtime.cudaReadModeElementType)
    # 1D texture
    channelDesc1D = ChannelFormatDescriptor(32, 0, 0, 0,
                                            runtime.cudaChannelFormatKindFloat)
    cuArray1D = CUDAarray(channelDesc1D, sp['nBins'])
    resourceDesc1D = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        cuArray1D)
    address_mode1D = (runtime.cudaAddressModeClamp,
                      runtime.cudaAddressModeClamp)
    texDesc1D = TextureDescriptor(address_mode1D,
                                  runtime.cudaFilterModePoint,
                                  runtime.cudaReadModeElementType)
    # Scratch buffer handed to fGetFp_kernel for each view.
    d_fOneProj = cupy.zeros(sp['nBins'], cupy.float32)
    for v in range(sp['nViews']):
        # Walk the view order backwards.
        nView = order[sp['nViews'] - 1 - v]
        # Angle of this view, derived from the rotation direction, the
        # number of angles per turn and the start offset.
        fLambda = sp['fRotateDir'] * 2.0 * np.pi / float(
            sp['nNumAngle']) * float(nView + sp['nStartAngle'])
        fCosLambda = np.cos(fLambda)
        fSinLambda = np.sin(fLambda)
        # A*x
        cuArrayImg.copy_from(grad)
        TextureReference(mod.get_texref('texImage'), resourceDescImg,
                         texDescImg)
        args = (d_fOneProj, sp['nBins'], sp['fSod'], sp['fOdd'],
                sp['fCellSize'], sp['fPixelSize'], sp['fFovRadius'],
                fCosLambda, fSinLambda, nView, sp['fOffSet'],
                sp['fAngleOfSlope'])
        FpKernel(grid1D, block1D, args)
        # AT*A*x
        cuArray1D.copy_from(d_fOneProj)
        TextureReference(mod.get_texref('texFP'), resourceDesc1D, texDesc1D)
        AssignResidualErrorArgs = (grad, sp['nWidth'], sp['nHeight'],
                                   sp['nBins'], sp['fSod'], sp['fOdd'],
                                   sp['fCellSize'], sp['fPixelSize'],
                                   fCosLambda, fSinLambda, sp['fOffSet'],
                                   sp['fAngleOfSlope'], sp['relax_factor'])
        AssignResidualError(grid2D, block2D, AssignResidualErrorArgs)
    # NOTE(review): this combination is assumed to run once, after all views
    # have been processed -- confirm against the intended SART backward
    # update before relying on it.
    grad = grad_ - sp['relax_factor'] * grad
    return grad
def SART2D(p, sp, order, x0):
    """Run one SART sweep over the views in ``order`` and return ``x0``.

    For every view, ``x0`` is copied into a 2D CUDA array bound to the
    ``texImage`` texture reference and the ``fGetResiduals`` kernel is
    launched with ``p`` and a per-view buffer of ``sp['nBins']`` floats.
    That buffer is then bound to the ``texFP`` texture reference and
    ``AssignResidualError_kernel`` is launched with ``x0`` as its first
    argument.

    Args:
        p: projection data handed to ``fGetResiduals`` (presumably a cupy
            float32 array -- confirm against the caller).
        sp: mapping of scan parameters.  Keys read here: ``nBins``,
            ``nWidth``, ``nHeight``, ``nViews``, ``nNumAngle``,
            ``nStartAngle``, ``fRotateDir``, ``fSod``, ``fOdd``,
            ``fCellSize``, ``fPixelSize``, ``fFovRadius``, ``fOffSet``,
            ``fAngleOfSlope`` and ``relax_factor``.
        order: indexable sequence of view indices; entries ``0`` to
            ``sp['nViews'] - 1`` are visited in that order.
        x0: current image estimate.  It is passed to the update kernel
            directly, so it is presumably modified in place -- confirm.

    Returns:
        The same ``x0`` object that was passed in.
    """
    n_bins = sp['nBins']
    n_cols = sp['nWidth']
    n_rows = sp['nHeight']

    # Launch geometry, rounded up to whole blocks.
    threads_1d = (8, 1)
    threads_2d = (8, 8)
    blocks_1d = ((n_bins + threads_1d[0] - 1) // threads_1d[0], 1)
    blocks_2d = tuple(
        (extent + per_block - 1) // per_block
        for extent, per_block in zip((n_cols, n_rows), threads_2d))

    module = cupy.RawModule(code=source_texref)
    residual_kernel = module.get_function('fGetResiduals')
    update_kernel = module.get_function('AssignResidualError_kernel')

    clamp_both = (runtime.cudaAddressModeClamp,
                  runtime.cudaAddressModeClamp)

    # 2D texture holding the image.
    image_format = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    image_array = CUDAarray(image_format, n_cols, n_rows)
    image_resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        image_array)
    image_tex_desc = TextureDescriptor(clamp_both,
                                       runtime.cudaFilterModePoint,
                                       runtime.cudaReadModeElementType)

    # 1D texture holding the per-view residuals.
    proj_format = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    proj_array = CUDAarray(proj_format, n_bins)
    proj_resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                       proj_array)
    proj_tex_desc = TextureDescriptor(clamp_both,
                                      runtime.cudaFilterModePoint,
                                      runtime.cudaReadModeElementType)

    residual_buf = cupy.zeros(n_bins, cupy.float32)

    for view_pos in range(sp['nViews']):
        nView = order[view_pos]

        # Same left-to-right arithmetic as a single expression would give.
        angle_step = sp['fRotateDir'] * 2.0 * np.pi / float(sp['nNumAngle'])
        view_angle = angle_step * float(nView + sp['nStartAngle'])
        cos_angle = np.cos(view_angle)
        sin_angle = np.sin(view_angle)

        # Expose the current image through texImage and compute residuals.
        image_array.copy_from(x0)
        TextureReference(module.get_texref('texImage'), image_resource,
                         image_tex_desc)
        residual_kernel(blocks_1d, threads_1d, (
            residual_buf, p, n_bins, sp['fSod'], sp['fOdd'],
            sp['fCellSize'], sp['fPixelSize'], sp['fFovRadius'],
            cos_angle, sin_angle, nView, sp['fOffSet'],
            sp['fAngleOfSlope'],
        ))

        # Expose the residuals through texFP and update the image.
        proj_array.copy_from(residual_buf)
        TextureReference(module.get_texref('texFP'), proj_resource,
                         proj_tex_desc)
        update_kernel(blocks_2d, threads_2d, (
            x0, n_cols, n_rows, n_bins, sp['fSod'], sp['fOdd'],
            sp['fCellSize'], sp['fPixelSize'], cos_angle, sin_angle,
            sp['fOffSet'], sp['fAngleOfSlope'], sp['relax_factor'],
        ))

    return x0
def test_fetch_float4_texture(self):
    """Fetch a 3D four-channel float texture and compare each channel.

    The input has the four channels interleaved along its last axis; the
    ``copyKernel3D_4ch`` kernel writes each channel into its own output
    array, which must equal the matching strided slice of the input.
    """
    n_channel = 4
    width, height, depth = 47, 39, 11
    extents = (width, height, depth)

    # Input data first, then one output buffer per channel.
    tex_data = cupy.random.random((depth, height, n_channel * width),
                                  dtype=cupy.float32)
    channel_outputs = tuple(
        cupy.zeros((depth, height, width), dtype=cupy.float32)
        for _ in range(n_channel))

    # Back the texture with a four-channel CUDA array.
    channel_format = ChannelFormatDescriptor(
        32, 32, 32, 32, runtime.cudaChannelFormatKindFloat)
    cuda_array = CUDAarray(channel_format, width, height, depth)
    cuda_array.copy_from(tex_data)

    # Resource and texture descriptors.
    resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                  cuArr=cuda_array)
    clamp = (runtime.cudaAddressModeClamp, runtime.cudaAddressModeClamp)
    tex_desc = TextureDescriptor(clamp,
                                 runtime.cudaFilterModePoint,
                                 runtime.cudaReadModeElementType)

    use_texobj = self.target == 'object'
    if use_texobj:
        # Texture object: handed to the kernel as an argument.
        texobj = TextureObject(resource, tex_desc)
        module = cupy.RawModule(code=source_texobj)
    else:
        # Texture reference: bound to the resource through the module.
        module = cupy.RawModule(code=source_texref)
        texref_ptr = module.get_texref('texref3Df4')
        bound_texref = TextureReference(texref_ptr, resource, tex_desc)  # noqa

    # Launch geometry: round each extent up to whole blocks.
    kernel = module.get_function('copyKernel3D_4ch')
    block = (4, 4, 2)
    grid = tuple((extent + per_block - 1) // per_block
                 for extent, per_block in zip(extents, block))

    kernel_args = channel_outputs
    if use_texobj:
        kernel_args = kernel_args + (texobj, )
    kernel_args = kernel_args + extents
    kernel(grid, block, kernel_args)

    # Each output must equal its channel's strided slice of the input.
    for channel, fetched in enumerate(channel_outputs):
        assert (fetched == tex_data[..., channel::n_channel]).all()
def test_fetch_float_texture(self):
    """Fetch a single-channel float texture and compare with its source.

    The dimensionality comes from ``self.dimensions`` (a zero ``depth``
    or ``height`` drops that axis), the backing memory from
    ``self.mem_type`` and the access path (texture object or texture
    reference) from ``self.target``.
    """
    width, height, depth = self.dimensions
    if depth != 0:
        dim = 3
    elif height != 0:
        dim = 2
    else:
        dim = 1

    # Linear memory only applies to 1D, pitched memory only to 2D.
    linear_mismatch = self.mem_type == 'linear' and dim != 1
    pitch_mismatch = self.mem_type == 'pitch2D' and dim != 2
    if linear_mismatch or pitch_mismatch:
        pytest.skip('The test case {0} is inapplicable for {1} and thus '
                    'skipped.'.format(self.dimensions, self.mem_type))

    # Keep only the trailing ``dim`` axes of (depth, height, width).
    shape = (depth, height, width)[3 - dim:]

    # Input, output, and channel format.
    tex_data = cupy.random.random(shape, dtype=cupy.float32)
    real_output = cupy.zeros_like(tex_data)
    channel_format = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    assert tex_data.flags['C_CONTIGUOUS']
    assert real_output.flags['C_CONTIGUOUS']

    if self.mem_type == 'CUDAarray':
        backing = CUDAarray(channel_format, width, height, depth)
        expected_output = cupy.zeros_like(tex_data)
        assert expected_output.flags['C_CONTIGUOUS']
        # Exercise the copy in both directions.
        backing.copy_from(tex_data)
        backing.copy_to(expected_output)
        resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                      cuArr=backing)
    else:
        # linear and pitch2D are backed by the ndarray itself.
        backing = tex_data
        expected_output = tex_data
        if self.mem_type == 'linear':
            resource = ResourceDescriptor(
                runtime.cudaResourceTypeLinear,
                arr=backing,
                chDesc=channel_format,
                sizeInBytes=backing.size * backing.dtype.itemsize)
        else:  # pitch2D
            # This relies on the hand-picked array shape meeting the
            # alignment requirement, which is a CUDA limitation (see the
            # CUDA Runtime API reference guide). "TexturePitchAlignment"
            # is assumed to be 32, which should hold for most devices.
            resource = ResourceDescriptor(
                runtime.cudaResourceTypePitch2D,
                arr=backing,
                chDesc=channel_format,
                width=width,
                height=height,
                pitchInBytes=width * backing.dtype.itemsize)

    clamp = (runtime.cudaAddressModeClamp, runtime.cudaAddressModeClamp)
    tex_desc = TextureDescriptor(clamp,
                                 runtime.cudaFilterModePoint,
                                 runtime.cudaReadModeElementType)

    dim_tag = str(dim) + 'D'
    use_texobj = self.target == 'object'
    if use_texobj:
        # Texture object: handed to the kernel as an argument.
        texobj = TextureObject(resource, tex_desc)
        module = cupy.RawModule(code=source_texobj)
    else:
        # Texture reference: bound to the resource through the module.
        module = cupy.RawModule(code=source_texref)
        texref_ptr = module.get_texref('texref' + dim_tag)
        bound_texref = TextureReference(texref_ptr, resource, tex_desc)  # noqa

    # Linear memory uses the kernel variant whose name ends in "fetch".
    kernel_name = 'copyKernel' + dim_tag
    if self.mem_type == 'linear':
        kernel_name += 'fetch'
    kernel = module.get_function(kernel_name)

    # One block size, grid size and extent argument per active axis.
    block = (4, 4, 2)[:dim]
    extents = (width, height, depth)[:dim]
    grid = tuple((extent + per_block - 1) // per_block
                 for extent, per_block in zip(extents, block))

    kernel_args = (real_output, )
    if use_texobj:
        kernel_args = kernel_args + (texobj, )
    kernel_args = kernel_args + extents
    kernel(grid, block, kernel_args)

    # The fetched data must match what was stored in the texture.
    assert (real_output == expected_output).all()