Example #1
0
    def _prep_texture(self):
        """Create a single-channel float32 texture object from random data.

        ``self.dimensions`` is unpacked as ``(width, height, depth)``; a zero
        ``depth`` (and then a zero ``height``) drops that axis. The resulting
        array shape is stored in ``self.shape`` and the random source data in
        ``self.data``.

        Returns:
            TextureObject: texture object backed by a CUDA array that received
            a copy of ``self.data``.
        """
        width, height, depth = self.dimensions

        # array shapes list the slowest-varying axis first
        if depth != 0:
            shape = (depth, height, width)
        elif height != 0:
            shape = (height, width)
        else:
            shape = (width,)
        self.shape = shape

        # source data for the texture: one 32-bit float channel per texel
        source = cupy.random.random(shape, dtype=cupy.float32)
        channel_desc = ChannelFormatDescriptor(
            32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
        cuda_array = CUDAarray(channel_desc, width, height, depth)
        cuda_array.copy_from(source)
        # self.data holds the data stored in the texture memory
        self.data = source

        # describe the backing resource and how it is sampled
        resource_desc = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                           cuArr=cuda_array)
        clamp = runtime.cudaAddressModeClamp
        texture_desc = TextureDescriptor((clamp, clamp),
                                         runtime.cudaFilterModePoint,
                                         runtime.cudaReadModeElementType)

        return TextureObject(resource_desc, texture_desc)
Example #2
0
def SART2DBackWard(grad, order, sp):
    """Sweep over all views in reverse ``order`` and update ``grad``.

    For every view the current ``grad`` is uploaded to the CUDA array bound
    to the ``texImage`` texture reference and ``fGetFp_kernel`` is launched
    with an ``nBins``-long float32 buffer. That buffer is then uploaded to
    the array bound to ``texFP`` and ``AssignResidualError_kernel`` is
    launched on ``grad``. Finally ``grad`` is replaced by
    ``initial_grad - sp['relax_factor'] * grad``.

    Args:
        grad: array handed to the kernels; must provide ``.copy()``.
            NOTE(review): the first kernel launch receives the caller's
            array itself, so it is presumably modified in place -- confirm.
        order: indexable sequence of view numbers.
        sp: mapping of scan parameters (``nBins``, ``nWidth``, ``nHeight``,
            ``nViews``, ``nNumAngle``, ``nStartAngle``, ``fRotateDir``,
            ``fSod``, ``fOdd``, ``fCellSize``, ``fPixelSize``,
            ``fFovRadius``, ``fOffSet``, ``fAngleOfSlope``,
            ``relax_factor``).

    Returns:
        The value of ``grad`` after the last processed view.
    """
    initial_grad = grad.copy()

    # launch geometry: round the extent up to a whole number of blocks
    threads_1d = (8, 1)
    blocks_1d = ((sp['nBins'] + threads_1d[0] - 1) // threads_1d[0], 1)
    threads_2d = (8, 8)
    blocks_2d = ((sp['nWidth'] + threads_2d[0] - 1) // threads_2d[0],
                 (sp['nHeight'] + threads_2d[1] - 1) // threads_2d[1])

    module = cupy.RawModule(code=source_texref)
    residual_kernel = module.get_function('AssignResidualError_kernel')
    projection_kernel = module.get_function('fGetFp_kernel')

    # 2D texture backing the image
    image_channel = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    image_array = CUDAarray(image_channel, sp['nWidth'], sp['nHeight'])
    image_resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        image_array)
    image_clamp = (runtime.cudaAddressModeClamp,
                   runtime.cudaAddressModeClamp)
    image_sampler = TextureDescriptor(image_clamp,
                                      runtime.cudaFilterModePoint,
                                      runtime.cudaReadModeElementType)

    # 1D texture backing a single projection
    proj_channel = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    proj_array = CUDAarray(proj_channel, sp['nBins'])
    proj_resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                       proj_array)
    proj_clamp = (runtime.cudaAddressModeClamp,
                  runtime.cudaAddressModeClamp)
    proj_sampler = TextureDescriptor(proj_clamp,
                                     runtime.cudaFilterModePoint,
                                     runtime.cudaReadModeElementType)

    projection = cupy.zeros(sp['nBins'], cupy.float32)

    # visit the entries of `order` from last to first
    for position in reversed(range(sp['nViews'])):
        view = order[position]
        view_angle = sp['fRotateDir'] * 2.0 * np.pi / float(
            sp['nNumAngle']) * float(view + sp['nStartAngle'])
        cos_angle = np.cos(view_angle)
        sin_angle = np.sin(view_angle)

        # A*x
        image_array.copy_from(grad)
        TextureReference(module.get_texref('texImage'), image_resource,
                         image_sampler)
        projection_kernel(
            blocks_1d, threads_1d,
            (projection, sp['nBins'], sp['fSod'], sp['fOdd'],
             sp['fCellSize'], sp['fPixelSize'], sp['fFovRadius'],
             cos_angle, sin_angle, view, sp['fOffSet'],
             sp['fAngleOfSlope']))

        # AT*A*x
        proj_array.copy_from(projection)
        TextureReference(module.get_texref('texFP'), proj_resource,
                         proj_sampler)
        residual_kernel(
            blocks_2d, threads_2d,
            (grad, sp['nWidth'], sp['nHeight'], sp['nBins'], sp['fSod'],
             sp['fOdd'], sp['fCellSize'], sp['fPixelSize'], cos_angle,
             sin_angle, sp['fOffSet'], sp['fAngleOfSlope'],
             sp['relax_factor']))

        grad = initial_grad - sp['relax_factor'] * grad
    return grad
Example #3
0
def SART2D(p, sp, order, x0):
    """Run one pass over all views in ``order``, updating ``x0``.

    For every view the current ``x0`` is uploaded to the CUDA array bound to
    the ``texImage`` texture reference and ``fGetResiduals`` is launched with
    an ``nBins``-long float32 buffer and ``p``. That buffer is uploaded to the
    array bound to ``texFP`` and ``AssignResidualError_kernel`` is launched
    on ``x0``.

    Args:
        p: array handed to ``fGetResiduals`` (presumably the measured
            projections -- confirm against the kernel source).
        sp: mapping of scan parameters (``nBins``, ``nWidth``, ``nHeight``,
            ``nViews``, ``nNumAngle``, ``nStartAngle``, ``fRotateDir``,
            ``fSod``, ``fOdd``, ``fCellSize``, ``fPixelSize``,
            ``fFovRadius``, ``fOffSet``, ``fAngleOfSlope``,
            ``relax_factor``).
        order: indexable sequence of view numbers.
        x0: array handed to the update kernel; no copy is made here.

    Returns:
        The same ``x0`` object that was passed in.
    """
    # launch geometry: round the extent up to a whole number of blocks
    threads_1d = (8, 1)
    blocks_1d = ((sp['nBins'] + threads_1d[0] - 1) // threads_1d[0], 1)
    threads_2d = (8, 8)
    blocks_2d = ((sp['nWidth'] + threads_2d[0] - 1) // threads_2d[0],
                 (sp['nHeight'] + threads_2d[1] - 1) // threads_2d[1])

    module = cupy.RawModule(code=source_texref)
    residuals_kernel = module.get_function('fGetResiduals')
    update_kernel = module.get_function('AssignResidualError_kernel')

    # 2D texture backing the image
    image_channel = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    image_array = CUDAarray(image_channel, sp['nWidth'], sp['nHeight'])
    image_resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        image_array)
    image_clamp = (runtime.cudaAddressModeClamp,
                   runtime.cudaAddressModeClamp)
    image_sampler = TextureDescriptor(image_clamp,
                                      runtime.cudaFilterModePoint,
                                      runtime.cudaReadModeElementType)

    # 1D texture backing the per-view residuals
    resid_channel = ChannelFormatDescriptor(
        32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
    resid_array = CUDAarray(resid_channel, sp['nBins'])
    resid_resource = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                        resid_array)
    resid_clamp = (runtime.cudaAddressModeClamp,
                   runtime.cudaAddressModeClamp)
    resid_sampler = TextureDescriptor(resid_clamp,
                                      runtime.cudaFilterModePoint,
                                      runtime.cudaReadModeElementType)

    residuals = cupy.zeros(sp['nBins'], cupy.float32)

    for position in range(sp['nViews']):
        view = order[position]
        view_angle = sp['fRotateDir'] * 2.0 * np.pi / float(
            sp['nNumAngle']) * float(view + sp['nStartAngle'])
        cos_angle = np.cos(view_angle)
        sin_angle = np.sin(view_angle)

        # upload the current image and launch the residual kernel
        image_array.copy_from(x0)
        TextureReference(module.get_texref('texImage'), image_resource,
                         image_sampler)
        residuals_kernel(
            blocks_1d, threads_1d,
            (residuals, p, sp['nBins'], sp['fSod'], sp['fOdd'],
             sp['fCellSize'], sp['fPixelSize'], sp['fFovRadius'],
             cos_angle, sin_angle, view, sp['fOffSet'],
             sp['fAngleOfSlope']))

        # upload the residuals and launch the image update kernel
        resid_array.copy_from(residuals)
        TextureReference(module.get_texref('texFP'), resid_resource,
                         resid_sampler)
        update_kernel(
            blocks_2d, threads_2d,
            (x0, sp['nWidth'], sp['nHeight'], sp['nBins'], sp['fSod'],
             sp['fOdd'], sp['fCellSize'], sp['fPixelSize'], cos_angle,
             sin_angle, sp['fOffSet'], sp['fAngleOfSlope'],
             sp['relax_factor']))
    return x0
Example #4
0
def deskew(data, angle, dx, dz, rotate=True, return_resolution=True, out=None):
    """
    Deskew a 3-D stack by shearing it along the X axis on the GPU.

    The stack is processed one slice at a time along its second axis: each
    slice is uploaded to a 2-D texture (linear filtering, border addressing)
    and resampled by ``shear_kernel`` into an output widened by the total
    shift accumulated over the first axis.

    Args:
        data (ndarray): 3-D array to apply deskew
        angle (float): angle between the objective and coverslip, in degree
        dx (float): X resolution
        dz (float): Z resolution
        rotate (bool, optional): rotate and crop the output
            (NOTE: accepted for interface compatibility only; this
            implementation never reads it, so no rotation or cropping is
            applied)
        return_resolution (bool, optional): return deskewed X/Z resolution
        out (ndarray, optional): array to store the result; when given, the
            deskewed data is assigned into it and it is returned in place of
            a freshly allocated array

    Returns:
        The deskewed array (``out`` if it was supplied, otherwise a new array
        with the dtype of ``data``). When ``return_resolution`` is true, a
        tuple ``(array, (dz * sin(angle), dx))`` is returned instead.
    """
    angle = radians(angle)

    # shift along X axis, in pixels
    shift = dz * cos(angle) / dx
    logger.debug(f"layer shift: {shift:.04f} px")

    # estimate new size: only the X extent grows
    nw, nv, nu = data.shape
    nz, ny, nx = nw, nv, nu + ceil(shift * (nw - 1))

    # texture storage for a single (nw, nu) slice
    ch = ChannelFormatDescriptor(32, 0, 0, 0,
                                 runtime.cudaChannelFormatKindFloat)
    arr = CUDAarray(ch, nu, nw)
    res = ResourceDescriptor(runtime.cudaResourceTypeArray, cuArr=arr)

    address_mode = (runtime.cudaAddressModeBorder,
                    runtime.cudaAddressModeBorder)
    tex = TextureDescriptor(address_mode, runtime.cudaFilterModeLinear,
                            runtime.cudaReadModeElementType)
    # The texture object only refers to `arr`, whose contents are refreshed
    # per slice, so it is created once instead of once per iteration.
    texobj = TextureObject(res, tex)

    # transpose so that iteration yields contiguous float32 slices; a single
    # conversion replaces the former contiguous-copy + astype pair
    src_dtype = data.dtype
    data_in = np.ascontiguousarray(np.swapaxes(data, 0, 1), dtype=np.float32)

    data_out = cp.empty((ny, nz, nx), np.float32)
    shear = kernels["shear_kernel"]
    grid = (ceil(nx / 16), ceil(nz / 16))
    block = (16, 16)
    shift_f32 = np.float32(shift)
    for i, layer in enumerate(data_in):
        arr.copy_from(layer)  # TODO use stream
        shear(grid, block, (data_out[i, ...], texobj, nx, nz, nu, shift_f32))

    # back to the original axis order, on the host, in the source dtype
    data_out = cp.asnumpy(cp.swapaxes(data_out, 0, 1))
    data_out = data_out.astype(src_dtype)

    # honor the documented `out` argument (it used to be silently ignored)
    if out is not None:
        out[...] = data_out
        data_out = out

    if return_resolution:
        # new resolution
        return data_out, (dz * sin(angle), dx)
    return data_out
Example #5
0
    def test_fetch_float4_texture(self):
        """Fetch a 4-channel float 3D texture and check every channel.

        The source array stores the four channels of each texel interleaved
        along its last axis. ``copyKernel3D_4ch`` writes each channel to its
        own output array, which must equal the matching strided slice of the
        source. ``self.target`` selects a texture object or a texture
        reference.
        """
        width, height, depth = 47, 39, 11
        n_channel = 4
        use_texobj = self.target == 'object'

        # prepare input, per-channel outputs, and texture memory
        tex_data = cupy.random.random((depth, height, n_channel * width),
                                      dtype=cupy.float32)
        outputs = tuple(
            cupy.zeros((depth, height, width), dtype=cupy.float32)
            for _ in range(n_channel))
        channel_desc = ChannelFormatDescriptor(
            32, 32, 32, 32, runtime.cudaChannelFormatKindFloat)
        cuda_array = CUDAarray(channel_desc, width, height, depth)
        cuda_array.copy_from(tex_data)

        # create resource and texture descriptors
        resource_desc = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                           cuArr=cuda_array)
        clamp = runtime.cudaAddressModeClamp
        texture_desc = TextureDescriptor((clamp, clamp),
                                         runtime.cudaFilterModePoint,
                                         runtime.cudaReadModeElementType)

        if use_texobj:
            # create a texture object
            texobj = TextureObject(resource_desc, texture_desc)
            module = cupy.RawModule(code=source_texobj)
        else:  # self.target == 'reference'
            module = cupy.RawModule(code=source_texref)
            # bind texture ref to resource; the binding object is kept in a
            # local for the remainder of the test
            texref = TextureReference(module.get_texref('texref3Df4'),  # noqa
                                      resource_desc, texture_desc)

        # get and launch the kernel
        kernel = module.get_function('copyKernel3D_4ch')
        block = (4, 4, 2)
        grid = tuple((extent + threads - 1) // threads
                     for extent, threads in zip((width, height, depth), block))
        kernel_args = outputs
        if use_texobj:
            kernel_args += (texobj, )
        kernel_args += (width, height, depth)
        kernel(grid, block, kernel_args)

        # validate result: channel k lives at offsets k, k + 4, k + 8, ...
        for offset, channel_out in enumerate(outputs):
            assert (channel_out == tex_data[..., offset::4]).all()
Example #6
0
    def test_array_gen_cpy(self):
        """Round-trip an array through a CUDA array and check it is unchanged.

        The test is driven by instance attributes: ``xp`` (NumPy or CuPy
        source array), ``stream`` (copy on a dedicated stream or not),
        ``dimensions`` (``(width, height, depth)``; zero drops an axis),
        ``n_channels``, ``dtype`` and ``c_contiguous``.
        """
        xp = numpy if self.xp == 'numpy' else cupy
        stream = cupy.cuda.Stream() if self.stream else None
        width, height, depth = self.dimensions
        n_channel = self.n_channels

        # channels are interleaved along the fastest-varying axis
        row = n_channel*width
        if depth != 0:
            shape = (depth, height, row)
        elif height != 0:
            shape = (height, row)
        else:
            shape = (row,)

        # generate input data and pick the matching channel format kind
        if self.dtype in (numpy.float16, numpy.float32):
            arr = xp.random.random(shape).astype(self.dtype)
            kind = runtime.cudaChannelFormatKindFloat
        else:  # int
            arr = xp.random.randint(100, size=shape, dtype=self.dtype)
            signed = self.dtype in (numpy.int8, numpy.int16, numpy.int32)
            kind = (runtime.cudaChannelFormatKindSigned if signed
                    else runtime.cudaChannelFormatKindUnsigned)

        if not self.c_contiguous:
            # take every other element so that the source is strided
            arr = arr[..., ::2]
        arr2 = xp.zeros_like(arr)
        if self.c_contiguous:
            assert arr.flags.c_contiguous
            assert arr2.flags.c_contiguous
        else:
            width = arr.shape[-1] // n_channel
            assert not arr.flags.c_contiguous
            assert arr2.flags.c_contiguous
            assert arr.shape[-1] == n_channel*width

        # create a CUDA array: used channels get the element bit width
        bits = arr.dtype.itemsize*8
        ch_bits = [bits if i < n_channel else 0 for i in range(4)]
        ch = ChannelFormatDescriptor(*ch_bits, kind)
        cu_arr = CUDAarray(ch, width, height, depth)

        # need to wait for the current stream to finish initialization
        if stream is not None:
            init_done = cupy.cuda.get_current_stream().record()
            stream.wait_event(init_done)

        # copy from input to CUDA array, and back to output
        cu_arr.copy_from(arr, stream)
        cu_arr.copy_to(arr2, stream)

        # check input and output are identical
        if stream is not None:
            stream.synchronize()
        assert (arr == arr2).all()
Example #7
0
    def test_write_float_surface(self):
        """Write through a surface object and compare with the expected ramp.

        A zero-initialised CUDA array (created with the surface load/store
        flag) is wrapped in a surface object and handed to the
        ``writeKernel{1D,2D,3D}`` kernel. The array contents read back
        afterwards must equal ``arange(size).reshape(shape) * 3.0``.
        """
        width, height, depth = self.dimensions
        dim = 3 if depth != 0 else 2 if height != 0 else 1

        # extents in kernel order (x, y, z); array shape is the reverse
        extents = (width, height, depth)[:dim]
        shape = extents[::-1]

        # prepare output buffer, channel format, and expected values
        real_output = cupy.zeros(shape, dtype=cupy.float32)
        assert real_output.flags['C_CONTIGUOUS']
        channel_desc = ChannelFormatDescriptor(
            32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
        expected_output = cupy.arange(numpy.prod(shape), dtype=cupy.float32)
        expected_output = expected_output.reshape(shape) * 3.0
        assert expected_output.flags['C_CONTIGUOUS']

        # create resource descriptor
        # note that surface memory only support CUDA array
        cuda_array = CUDAarray(channel_desc, width, height, depth,
                               runtime.cudaArraySurfaceLoadStore)
        cuda_array.copy_from(real_output)  # init to zero
        resource_desc = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                           cuArr=cuda_array)

        # create a surface object; currently we don't support surface reference
        surfobj = SurfaceObject(resource_desc)
        module = cupy.RawModule(code=source_surfobj)

        # get and launch the kernel with one grid axis per array axis
        kernel = module.get_function('writeKernel' + '{}D'.format(dim))
        block = (4, 4, 2)[:dim]
        grid = tuple((extent + threads - 1) // threads
                     for extent, threads in zip(extents, block))
        kernel(grid, block, (surfobj, ) + extents)

        # validate result
        cuda_array.copy_to(real_output)
        assert (real_output == expected_output).all()
Example #8
0
    def test_array_gen_cpy(self):
        """Round-trip an array through a CUDA array and check it is unchanged.

        The test is driven by instance attributes: ``xp`` (NumPy or CuPy
        source array), ``stream`` (copy on a dedicated stream or not),
        ``dimensions`` (``(width, height, depth)``; zero drops an axis),
        ``n_channels`` and ``dtype``.

        When a dedicated stream is used, the copies are explicitly ordered
        after the work already queued on the current stream (which produced
        the input and output arrays), and the final check waits on that same
        stream rather than on a module-level device handle.
        """
        xp = numpy if self.xp == 'numpy' else cupy
        stream = None if not self.stream else cupy.cuda.Stream()
        width, height, depth = self.dimensions
        n_channel = self.n_channels

        # channels are interleaved along the fastest-varying axis
        dim = 3 if depth != 0 else 2 if height != 0 else 1
        shape = (depth, height, n_channel*width) if dim == 3 else \
                (height, n_channel*width) if dim == 2 else \
                (n_channel*width,)

        # generate input data and allocate output buffer
        if self.dtype in (numpy.float16, numpy.float32):
            arr = xp.random.random(shape).astype(self.dtype)
            kind = runtime.cudaChannelFormatKindFloat
        else:  # int
            # randint() in NumPy <= 1.10 does not have the dtype argument...
            arr = xp.random.randint(100, size=shape).astype(self.dtype)
            if self.dtype in (numpy.int8, numpy.int16, numpy.int32):
                kind = runtime.cudaChannelFormatKindSigned
            else:
                kind = runtime.cudaChannelFormatKindUnsigned
        arr2 = xp.zeros_like(arr)

        assert arr.flags['C_CONTIGUOUS']
        assert arr2.flags['C_CONTIGUOUS']

        # create a CUDA array: used channels get the element bit width
        ch_bits = [0, 0, 0, 0]
        for i in range(n_channel):
            ch_bits[i] = arr.dtype.itemsize * 8
        # unpacking arguments using *ch_bits is not supported before PY35...
        ch = ChannelFormatDescriptor(ch_bits[0], ch_bits[1], ch_bits[2],
                                     ch_bits[3], kind)
        cu_arr = CUDAarray(ch, width, height, depth)

        # need to wait for the current stream to finish initialization,
        # otherwise the copies below could start before arr/arr2 are ready
        if stream is not None:
            init_done = cupy.cuda.get_current_stream().record()
            stream.wait_event(init_done)

        # copy from input to CUDA array, and back to output
        cu_arr.copy_from(arr, stream)
        cu_arr.copy_to(arr2, stream)

        # check input and output are identical; wait for the copy stream
        # itself so the comparison sees the completed round trip
        if stream is not None:
            stream.synchronize()
        assert (arr == arr2).all()
Example #9
0
    def test_fetch_float_texture(self):
        """Copy a float texture to an output array and compare with the source.

        ``self.dimensions`` is ``(width, height, depth)`` (zero drops an
        axis), ``self.mem_type`` selects the backing memory (``'CUDAarray'``,
        ``'linear'`` or ``'pitch2D'``) and ``self.target`` selects a texture
        object or a texture reference. ``'linear'`` is only run for 1D and
        ``'pitch2D'`` only for 2D; other combinations are skipped.
        """
        width, height, depth = self.dimensions
        dim = 3 if depth != 0 else 2 if height != 0 else 1
        mem_type = self.mem_type

        # linear memory is 1D-only and pitched memory is 2D-only
        required_dim = {'linear': 1, 'pitch2D': 2}.get(mem_type)
        if required_dim is not None and dim != required_dim:
            pytest.skip('The test case {0} is inapplicable for {1} and thus '
                        'skipped.'.format(self.dimensions, self.mem_type))

        use_texobj = self.target == 'object'

        # extents in kernel order (x, y, z); array shape is the reverse
        extents = (width, height, depth)[:dim]
        shape = extents[::-1]

        # prepare input, output, and texture memory
        tex_data = cupy.random.random(shape, dtype=cupy.float32)
        real_output = cupy.zeros_like(tex_data)
        channel_desc = ChannelFormatDescriptor(
            32, 0, 0, 0, runtime.cudaChannelFormatKindFloat)
        assert tex_data.flags['C_CONTIGUOUS']
        assert real_output.flags['C_CONTIGUOUS']

        # create the backing storage and its resource descriptor
        if mem_type == 'CUDAarray':
            backing = CUDAarray(channel_desc, width, height, depth)
            expected_output = cupy.zeros_like(tex_data)
            assert expected_output.flags['C_CONTIGUOUS']
            # test bidirectional copy
            backing.copy_from(tex_data)
            backing.copy_to(expected_output)
            resource_desc = ResourceDescriptor(runtime.cudaResourceTypeArray,
                                               cuArr=backing)
        else:  # linear are pitch2D are backed by ndarray
            backing = expected_output = tex_data
            itemsize = backing.dtype.itemsize
            if mem_type == 'linear':
                resource_desc = ResourceDescriptor(
                    runtime.cudaResourceTypeLinear,
                    arr=backing,
                    chDesc=channel_desc,
                    sizeInBytes=backing.size * itemsize)
            else:  # pitch2D
                # In this case, we rely on the fact that the hand-picked array
                # shape meets the alignment requirement. This is CUDA's
                # limitation, see CUDA Runtime API reference guide.
                # "TexturePitchAlignment" is assumed to be 32, which should be
                # applicable for most devices.
                resource_desc = ResourceDescriptor(
                    runtime.cudaResourceTypePitch2D,
                    arr=backing,
                    chDesc=channel_desc,
                    width=width,
                    height=height,
                    pitchInBytes=width * itemsize)

        clamp = runtime.cudaAddressModeClamp
        texture_desc = TextureDescriptor((clamp, clamp),
                                         runtime.cudaFilterModePoint,
                                         runtime.cudaReadModeElementType)

        # kernel and texture reference names carry a 1D/2D/3D suffix
        suffix = '{}D'.format(dim)
        if use_texobj:
            # create a texture object
            texobj = TextureObject(resource_desc, texture_desc)
            module = cupy.RawModule(code=source_texobj)
        else:  # self.target == 'reference'
            module = cupy.RawModule(code=source_texref)
            # bind texture ref to resource; the binding object is kept in a
            # local for the remainder of the test
            texref = TextureReference(  # noqa
                module.get_texref('texref' + suffix), resource_desc,
                texture_desc)

        # get and launch the kernel with one grid axis per array axis
        kernel_name = 'copyKernel' + suffix
        if mem_type == 'linear':
            kernel_name += 'fetch'
        kernel = module.get_function(kernel_name)
        block = (4, 4, 2)[:dim]
        grid = tuple((extent + threads - 1) // threads
                     for extent, threads in zip(extents, block))
        kernel_args = (real_output, )
        if use_texobj:
            kernel_args += (texobj, )
        kernel_args += extents
        kernel(grid, block, kernel_args)

        # validate result
        assert (real_output == expected_output).all()