Example #1
0
 def init_reikna(self):
     if REIK:
         if CUDA:
             self.api = cluda.cuda_api()
         else:
             self.api = cluda.ocl_api()
         self.dev = self.api.get_platforms()[0].get_devices()[0]
Example #2
0
 def init_reikna(self):
     if REIK:
         if CUDA:
             self.api = cluda.cuda_api()
         else:
             self.api = cluda.ocl_api()
         self.dev = self.api.get_platforms()[0].get_devices()[0]
Example #3
0
 def _initialize(self):
     if self._pid != os.getpid():
         print 'Initializing CUDA thread for pid', os.getpid()
         self._cluda_api = cuda_api()
         self._thr = self._cluda_api.Thread.create()
         self._fftc = {}
         self._pid = os.getpid()
Example #4
0
    def __initialize_gpu(self):
        try:
            import reikna.cluda as cluda
            from reikna.fft import FFT
            dtype = numpy.complex64
            data = numpy.zeros( self.st['Kd'],dtype=dtype)
            data2 = numpy.empty_like(data)
            api = cluda.cuda_api()
            self.thr = api.Thread.create()      
            self.data_dev = self.thr.to_device(data)
            self.data_rec = self.thr.to_device(data2)
            axes=range(0,numpy.size(self.st['Kd']))
            myfftf= FFT( data, axes=axes)
            self.myfft = myfftf.compile(self.thr)
            self.cuda_flag=1
            print('create gpu fft?',self.cuda_flag)
            print('line 642')
            W= self.st['q'][...,0]
            print('line 645')
            self.W = numpy.reshape(W, self.st['Kd'],order='C')
            print('line 647')
#             self.thr2 = api.Thread.create() 
            print('line 649')
            self.W_dev = self.thr.to_device(self.W.astype(dtype))
            
            print('line 652')
        except:
            self.cuda_flag=0              
            print('get error, using cpu')
Example #5
0
    def __init__(self, *args, **kwargs):
        self.api = cluda.cuda_api()
        super(CUDAContext, self).__init__(*args, **kwargs)

        # work-around:
        # thread synchronization does not involve CUBLAS.
        # cublas gets synchronized as soon as there is a memory transfer
        # thus we need a tiny array which is there just for synchronization
        self._sync_array = self.thread.array((1, ), dtype=numpy.float32)
Example #6
0
 def __init__(self, *args, **kwargs): 
     self.api = cluda.cuda_api()  
     super(CUDAContext, self).__init__(*args, **kwargs)
     
     # work-around: 
     # thread synchronization does not involve CUBLAS.
     # cublas gets synchronized as soon as there is a memory transfer
     # thus we need a tiny array which is there just for synchronization
     self._sync_array = self.thread.array((1,), dtype=numpy.float32)
Example #7
0
def modified_gemm_gpu(A, B, C):
    shape = (A.shape[0], B.shape[1])
    api = cluda.cuda_api()
    thr = api.Thread.create()
    res_arr = thr.array((shape[0], shape[1]), dtype=A.dtype)

    mul = MatrixMul(A, B, out_arr=res_arr)
    mulc = mul.compile(thr)
    mulc(res_arr, A, B)

    return res_arr + C
Example #8
0
def test_fft():
    api = cluda.cuda_api()
    thr = api.Thread.create()

    N = 256
    M = 10000

    #data_in = np.random.rand(N, N) + 1j*np.random.rand(N, N)
    data_in = np.random.rand(N, N).astype('complex')
    cl_data_in = thr.to_device(data_in)
    cl_data_out = thr.empty_like(cl_data_in)
    fft = FFT(thr).prepare_for(cl_data_out, cl_data_in, -1, axes=(0, ))
 def __init__(self, diffs, coords, mask, probe, sample, sample_support, pmod_int = False): 
     """Initialise the Ptychography module with the data in 'inputDir' 
     
     Naming convention:
     coords_100x2.raw            list of y, x coordinates in np.float64 pixel units
     diffs_322x256x512.raw       322 (256,512) diffraction patterns in np.float64
                                 The zero pixel must be at [0, 0] and there must 
                                 be an equal no. of postive and negative frequencies
     mask_256x512.raw            (optional) mask for the diffraction data np.float64
     probeInit_256x512           (optional) Initial estimate for the probe np.complex128
     sampleInit_1024x2048        (optional) initial estimate for the sample np.complex128
                                 also sets the field of view
                                 If not present then initialise with random numbers        
     """
     #
     # Get the shape
     shape  = diffs[0].shape
     #
     # Store these values
     self.exits      = makeExits(sample, probe, coords)
     #
     # This will save time later
     self.diffAmps   = bg.quadshift(np.sqrt(diffs))
     self.shape      = shape
     self.shape_sample = sample.shape
     self.coords     = coords
     self.mask       = bg.quadshift(mask)
     self.probe      = probe
     self.sample     = sample
     self.alpha_div  = 1.0e-10
     self.error_mod  = []
     self.error_sup  = []
     self.error_conv = []
     self.probe_sum  = None
     self.sample_sum = None
     self.diffNorm   = np.sum(self.mask * (self.diffAmps)**2)
     self.pmod_int   = pmod_int
     self.sample_support = sample_support
     #
     # create a gpu thread
     api               = cluda.cuda_api()
     self.thr          = api.Thread.create()
     #
     # send the diffraction amplitudes, the exit waves and the mask to the gpu
     self.diffAmps_gpu = self.thr.to_device(self.diffAmps) * np.sqrt(float(self.diffAmps.shape[1]) * float(self.diffAmps.shape[2]))
     self.exits_gpu    = self.thr.to_device(self.exits)
     mask2             = np.zeros_like(diffs, dtype=np.complex128)
     mask2[:]          = self.mask.astype(np.complex128)
     self.mask_gpu     = self.thr.to_device(mask2)
     #
     # compile the fft routine
     fft               = FFT(self.diffAmps_gpu.astype(np.complex128), axes=(1,2))
     self.fftc         = fft.compile(self.thr, fast_math=True)
Example #10
0
    def initialize_gpu(self):
        try:
            import reikna.cluda as cluda
            from reikna.fft import FFT 

            data = numpy.zeros( self.st['Kd'],dtype=dtype)

            print('get_platform')
            api = cluda.ocl_api()

            print('api=',api== cluda.cuda_api())

            self.gpu_api =  'opencl'
                
            self.thr = api.Thread.create(async=True)   
            print('line 630')   
            self.data_dev = self.thr.to_device(data)

            axes=range(0,numpy.size(self.st['Kd']))
            print('line 635')   
            myfft=  FFT( data, axes=axes)
            print('line 640')   
            self.myfft = myfft.compile(self.thr,fast_math=True)
            print('line 640')   
            self.gpu_flag=1

            print('create gpu fft?',self.gpu_flag)
            print('line 642')#             self.data_rec = self.thr.to_device(data2)

                
            W= self.st['w'][...,0]

            print('line 645')   
                
            self.W = numpy.reshape(W, self.st['Kd'],order='C')
            

            print('line 647')
#             self.thr2 = api.Thread.create() 
            print('line 649')
            self.W_dev = self.thr.to_device(self.W.astype(dtype))
            self.W2_dev = self.thr.to_device(self.W.astype(dtype))
            self.tmp_dev = self.thr.to_device(self.W.astype(dtype)) # device memory
#             self.tmp2_dev = self.thr.to_device(1.0/self.W.astype(dtype)) # device memory
            self.gpu_flag=1      
#             if self.debug > 0:          
            print('line 652')
        except:
            self.gpu_flag=0
#             if self.debug > 0:              
            print('get error, using cpu')
Example #11
0
def process(hdr_fiename, filename):
    api = cluda.cuda_api()
    thr = api.Thread.create()
    X = thr.array((10, 32768 * 2), dtype=numpy.complex128)

    iq_data = TCAPData(filename, hdr_fiename)
    file_counter = int(iq_data.filename_wo_ext[-3:])
    fs = 312500
    file_length_in_sec = 15625 * 32768 / fs
    time_passed_upto_now = (file_counter - 1) * file_length_in_sec

    # extract hour min sec
    hr, placeholder = divmod(time_passed_upto_now, 3600)
    mnt, sec = divmod(placeholder, 60)
    total_time = '{}h-{}m-{}s'.format(int(hr), int(mnt), int(sec))
    title = 'Time: {}:{}:{}'.format(int(hr), int(mnt), int(sec))

    zz = np.array([])
    for j in range(1, 780 * 2 * 10 + 1, 2 * 10):
        data = np.array([])
        # read 2*10 i.e. 20 blocks
        for i in range(j, j + 2 * 10):
            data = np.append(data, iq_data.read_block(i))
        data = np.reshape(data, (10, 32768 * 2))

        x = thr.to_device(data)
        fft = FFT(x, axes=(1, ))
        fftc = fft.compile(thr)
        fftc(X, x, 0)
        data_fft = X
        #data_fft = np.fft.fft(data, axis=1)

        data_fft = np.average(data_fft, axis=0)
        data_fft = np.abs(np.fft.fftshift(data_fft))
        zz = np.append(zz, data_fft)

    zz = np.reshape(zz, (780, 32768 * 2))
    data_fft_freqs = np.fft.fftshift(np.fft.fftfreq(32768 * 2,
                                                    d=1 / fs))  # in Hz
    xx, yy = np.meshgrid(data_fft_freqs, np.arange(780))
    yy = yy * 2.10  # in seconds
    plt_filename = '{}_{}'.format(iq_data.filename_wo_ext, total_time)
    print('Printing into file: ' + plt_filename)
    plot_spectrogram(xx,
                     yy,
                     zz,
                     dbm=False,
                     cmap=cm.jet,
                     filename=plt_filename,
                     dpi=500,
                     title=title)
Example #12
0
def initialize_gpu(backend, **kwargs):
    '''
    Initialize a new GPU context.

    :param backend: backend to use. It must be any of "cuda" or "opencl".
    :type backend: str
    :param kwargs: configuration for the device lookup (see below for details).
    :type kwargs: dict

    * *interactive*: (bool) whether to select the device manually
       (defaults to False).

    * *device*: (int) number of the device to use (defaults to None).

    .. note:: The device can be selected using the MINKIT_DEVICE environment variable.
    '''
    from reikna import cluda

    if backend == CUDA:
        api = cluda.cuda_api()
    elif backend == OPENCL:
        api = cluda.ocl_api()
    else:
        raise ValueError(f'Unknown backend type "{backend}"')

    # Get all available devices
    platforms = api.get_platforms()

    all_devices = [(p, d) for p in platforms for d in p.get_devices()]

    # Determine the device to use
    idev = device_lookup(all_devices, **kwargs)

    platform, device = all_devices[idev]

    logger.info(
        f'Selected device "{device.name}" ({idev}) (platform: {platform.name})'
    )

    return Context(api, device, backend)
Example #13
0
import pycuda.driver as cuda
import pycuda.autoinit as autoinit
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel


import reikna.cluda as cluda
from reikna.fft import FFT


ctx = autoinit.context

if __name__ == '__main__':


    api = cluda.cuda_api()
    thr = api.Thread.create()

    size = (256,256,256)
    units = (.1,)*3
    lam = .5
    u0 = None
    n0 = 1.
    dn = np.zeros(size[::-1],np.complex64)


    clock = StopWatch()

    clock.tic("setup")

    Nx, Ny, Nz = size
Example #14
0
    def __init__(self, API = None, platform_number=None, device_number=None):
        """
        Constructor.
        :param API: The API for the heterogeneous system. API='cuda' or API='ocl'
        :param platform_number: The number of the platform found by the API. 
        :param device_number: The number of the device found on the platform. 
        :type API: string
        :type platform_number: integer 
        :type device_number: integer 
        :returns: 0
        :rtype: int, float
 
        :Example:
 
        >>> import pynufft
        >>> NufftObj = pynufft.NUFFT_hsa(API='cuda', 0, 0)        
        """
         
#         pass
        self.dtype = numpy.complex64
#         NUFFT_cpu.__init__(self)
     
        import reikna.cluda as cluda
        print('API = ', API)
        self.cuda_flag, self.ocl_flag = helper.diagnose()
        if None is API:
            if self.cuda_flag is 1:
                API = 'cuda'
            elif self.ocl_flag is 1:
                API = 'ocl'
            else:
                print('No accelerator is available.')
        else:
            api = API
        print('now using API = ', API)
        if platform_number is None:
            platform_number = 0
        if device_number is None:
            device_number = 0
         
        from reikna import cluda
        import reikna.transformations
        from reikna.cluda import functions, dtypes
        try: # try to create api/platform/device using the given parameters
            if 'cuda' == API:
                api = cluda.cuda_api()
            elif 'ocl' == API:
                api = cluda.ocl_api()
      
            platform = api.get_platforms()[platform_number]
             
            device = platform.get_devices()[device_number]
        except: # if failed, find out what's going wrong?
            print('No accelerator is detected.')
             
#             return 1
 
#         Create context from device
        self.thr = api.Thread(device) #pyopencl.create_some_context()
        print('Using opencl or cuda = ', self.thr.api)
         
#         print('Using opencl?  ', self.thr.api is reikna.cluda.ocl)
#         """
#         Wavefront: as warp in cuda. Can control the width in a workgroup
#         Wavefront is required in spmv_vector as it improves data coalescence.
#         see cCSR_spmv and zSparseMatVec
#         """
        self.wavefront = api.DeviceParameters(device).warp_size
 
        print('wavefront of OpenCL (as warp in CUDA) = ',self.wavefront)
 
 
        from ..src.re_subroutine import create_kernel_sets
        kernel_sets = create_kernel_sets(API)
                
        prg = self.thr.compile(kernel_sets, 
                                render_kwds=dict(LL =  str(self.wavefront)), 
                                fast_math=False)
        self.prg = prg        
         
        print("Note: In the future the api will change!")
        print("You have been warned!")
Example #15
0
import time
import numpy as np
from reikna import cluda
from reikna.fft import fft
from numpy.linalg import norm

#reikna fft only works with complex numbers
dtype = np.complex64

#getting the cuda api
api = cluda.cuda_api()
thr = api.Thread.create()

shape = (512, 2, 544, 3)

#sending the array from host to array and creating result array on device
a = np.random.randn(*shape).astype(dtype)
a_dev = thr.to_device(a)
res_dev = thr.array(shape, dtype=dtype)

#reikna version of fft (compilation sends it to the thread)
lastAxis = len(shape) - 1
t0 = time.time()
fft = fft.FFT(a, axes=(lastAxis, ))
fftc = fft.compile(thr)
fftc(res_dev, a_dev)
t1 = time.time()
print('Time 1: ', t1 - t0)

#tried to run compiled fft function with different input (it worked!)
b = np.random.randn(*shape).astype(dtype)
Example #16
0
        device for device in Device.__dict__.keys() if device[0] != "_"
]:
    setattr(Tensor, f"{device.lower()}",
            functools.partialmethod(Tensor.to, Device.__dict__[device]))
    setattr(Tensor, f"{device.lower()}_",
            functools.partialmethod(Tensor.to_, Device.__dict__[device]))


# this registers all the operations
def _register_ops(namespace, device=Device.CPU):
    for name, cls in inspect.getmembers(namespace, inspect.isclass):
        if name[0] != "_": register(name.lower(), cls, device=device)


from tinygrad import ops_cpu

_register_ops(ops_cpu)
try:
    import reikna.cluda as cluda
    from tinygrad import ops_gpu

    _register_ops(ops_gpu, device=Device.GPU)
    api = cluda.cuda_api() if os.environ.get(
        "GPAPI", "opencl") == "cuda" else cluda.ocl_api()
    thr = api.Thread.create()
    GPU = True
except ImportError:
    # no GPU support
    GPU = False
ANE = False
Example #17
0
    def __init__(self,inputSize, axes=(-1,),mode="pyfftw",dtype="complex64",
                    direction="FORWARD",fftw_FLAGS=("FFTW_MEASURE","FFTW_DESTROY_INPUT"),
                    THREADS=None, loggingLevel=None):
        self.axes = axes
        self.direction=direction

        if loggingLevel:
            logger.setLoggingLevel(loggingLevel)

        if mode=="gpu" or mode=="gpu_ocl" or mode=="gpu_cuda":
            if mode == "gpu":
                mode = "gpu_ocl"
            if REIKNA_AVAILABLE:
                if mode=="gpu_ocl":
                    try:
                        reikna_api = cluda.ocl_api()
                        self.reikna_thread = reikna_api.Thread.create()
                        self.FFTMODE="gpu"
                    except:
                        logger.warning("no reikna opencl available. \
                                            will try cuda")
                        mode = "gpu_cuda"
                if mode=="gpu_cuda":
                    try:
                        reikna_api = cluda.cuda_api()
                        self.reikna_thread = reikna_api.Thread.create()
                        self.FFTMODE="gpu"
                    except:
                        logger.warning("no cuda available. \
                                Switching to pyfftw")
                        mode = "pyfftw"
            else:
                logger.warning("No gpu algorithms available\
                        switching to pyfftw")
                mode = "pyfftw"

        if mode=="pyfftw":
            if PYFFTW_AVAILABLE:
                self.FFTMODE = "pyfftw"
            else:
                logger.warning("No pyfftw available. \
                                Defaulting to scipy.fftpack")
                mode = "scipy"

        if mode=="scipy":
            if SCIPY_AVAILABLE:
                self.FFTMODE = "scipy"
            else:
                logger.warning("No scipy available - fft won't function.")


        if self.FFTMODE=="gpu":
            if direction=="FORWARD":
                self.inverse=1
            elif direction=="BACKWARD":
                self.inverse=0

            self.inputData = numpy.zeros( inputSize, dtype=dtype)
            inputData_dev = self.reikna_thread.to_device(self.inputData)
            self.outputData_dev = self.reikna_thread.array(inputSize,
                                                     dtype=dtype)

            logger.info("Generating and compiling reikna gpu fft plan...")
            reikna_ft = reikna.fft.FFT(inputData_dev, axes=axes)
            self.reikna_ft_c = reikna_ft.compile(self.reikna_thread)
            logger.info("Done!")

        if self.FFTMODE=="pyfftw":
            if THREADS==None:
                THREADS=cpu_count()

            #fftw_FLAGS Set the optimisation level of fftw3,
            #(more optimisation takes longer - but gives quicker ffts.)
            #Can be FFTW_ESTIMATE, FFTW_MEASURE, FFT_PATIENT, FFTW_EXHAUSTIVE
            n = pyfftw.simd_alignment

            self.inputData = pyfftw.n_byte_align_empty( inputSize,n,
                                dtype)
            self.inputData[:] = numpy.zeros( inputSize, dtype=dtype)
            self.outputData = pyfftw.n_byte_align_empty(inputSize,n,
                                dtype)
            self.outputData[:] = numpy.zeros( inputSize,dtype=dtype)

            logger.info("Generating fftw3 plan....\nIf this takes too long, change fftw_FLAGS.")
            logger.debug("currently set to: {})".format(fftw_FLAGS))
            if direction=="FORWARD":
                self.fftwPlan = pyfftw.FFTW(self.inputData,self.outputData,
                                axes=axes, threads=THREADS,flags=fftw_FLAGS)
            elif direction=="BACKWARD":
                self.fftwPlan = pyfftw.FFTW(self.inputData,self.outputData,
                                direction='FFTW_BACKWARD', axes=axes,
                                threads=THREADS,flags=fftw_FLAGS)
            logger.info("Done!")


        elif self.FFTMODE=="scipy":

            self.direction=direction
            self.inputData = numpy.zeros(inputSize,dtype=dtype)
            self.size=[]
            for i in range(len(self.axes)):
                self.size.append(inputSize[self.axes[i]])
Example #18
0
    def offload(self, API, platform_number=0, device_number=0):
        """
        self.offload():
        
        Off-load NUFFT to the opencl or cuda device(s)
        
        :param API: define the device type, which can be 'cuda' or 'ocl'
        :param platform_number: define which platform to be used. The default platform_number = 0.
        :param device_number: define which device to be used. The default device_number = 0.
        :type API: string
        :type platform_number: int
        :type device_number: int
        :return: self: instance

        """
        from reikna import cluda
        import reikna.transformations
        from reikna.cluda import functions, dtypes
        try: # try to create api/platform/device using the given parameters
            if 'cuda' == API:
                api = cluda.cuda_api()
            elif 'ocl' == API:
                api = cluda.ocl_api()
     
            platform = api.get_platforms()[platform_number]
            
            device = platform.get_devices()[device_number]
        except: # if failed, find out what's going wrong?
            diagnose()
            
            return 1

        
#         print('device = ', device)
#         Create context from device
        self.thr = api.Thread(device) #pyopencl.create_some_context()
#         self.queue = pyopencl.CommandQueue( self.ctx)

#         """
#         Wavefront: as warp in cuda. Can control the width in a workgroup
#         Wavefront is required in spmv_vector as it improves data coalescence.
#         see cSparseMatVec and zSparseMatVec
#         """
        self.wavefront = api.DeviceParameters(device).warp_size
        print(api.DeviceParameters(device).max_work_group_size)
#         print(self.wavefront)
#         print(type(self.wavefront))
#          pyopencl.characterize.get_simd_group_size(device[0], dtype.size)
        from src.re_subroutine import cMultiplyScalar, cCopy, cAddScalar,cAddVec, cSparseMatVec, cSelect, cMultiplyVec, cMultiplyVecInplace, cMultiplyConjVec, cDiff, cSqrt, cAnisoShrink
        # import complex float routines
#         print(dtypes.ctype(dtype))
        prg = self.thr.compile( 
                                cMultiplyScalar.R + #cCopy.R, 
                                cCopy.R + 
                                cAddScalar.R + 
                                cSelect.R +cMultiplyConjVec.R + cAddVec.R+
                                cMultiplyVecInplace.R +cSparseMatVec.R+cDiff.R+ cSqrt.R+ cAnisoShrink.R+ cMultiplyVec.R,
                                render_kwds=dict(
                                    LL =  str(self.wavefront)), fast_math=False)
#                                fast_math = False)
#                                 "#define LL  "+ str(self.wavefront) + "   "+cSparseMatVec.R)
#                                ),
#                                 fast_math=False)
#         prg2 = pyopencl.Program(self.ctx, "#define LL "+ str(self.wavefront) + " "+cSparseMatVec.R).build()
        
        self.cMultiplyScalar = prg.cMultiplyScalar
#         self.cMultiplyScalar.set_scalar_arg_dtypes( cMultiplyScalar.scalar_arg_dtypes)
        self.cCopy = prg.cCopy
        self.cAddScalar = prg.cAddScalar
        self.cAddVec = prg.cAddVec
        self.cSparseMatVec = prg.cSparseMatVec     
        self.cSelect = prg.cSelect
        self.cMultiplyVecInplace = prg.cMultiplyVecInplace
        self.cMultiplyVec = prg.cMultiplyVec
        self.cMultiplyConjVec = prg.cMultiplyConjVec
        self.cDiff = prg.cDiff
        self.cSqrt= prg.cSqrt
        self.cAnisoShrink = prg.cAnisoShrink                                 

#         self.xx_Kd = pyopencl.array.zeros(self.queue, self.st['Kd'], dtype=dtype, order="C")
        self.k_Kd = self.thr.to_device(numpy.zeros(self.st['Kd'], dtype=dtype, order="C"))
        self.k_Kd2 = self.thr.to_device(numpy.zeros(self.st['Kd'], dtype=dtype, order="C"))
        self.y =self.thr.to_device( numpy.zeros((self.st['M'],), dtype=dtype, order="C"))
        self.x_Nd = self.thr.to_device(numpy.zeros(self.st['Nd'], dtype=dtype, order="C"))
#         self.xx_Nd =     pyopencl.array.zeros(self.queue, self.st['Nd'], dtype=dtype, order="C")

        self.NdCPUorder, self.KdCPUorder, self.nelem =     preindex_copy(self.st['Nd'], self.st['Kd'])
        self.NdGPUorder = self.thr.to_device( self.NdCPUorder)
        self.KdGPUorder =  self.thr.to_device( self.KdCPUorder)
        self.Ndprod = numpy.int32(numpy.prod(self.st['Nd']))
        self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
        self.M = numpy.int32( self.st['M'])
        
        self.SnGPUArray = self.thr.to_device(  self.sn)
        
        self.sp_data = self.thr.to_device( self.sp.data.astype(dtype))
        self.sp_indices =self.thr.to_device( self.sp.indices.astype(numpy.int32))
        self.sp_indptr = self.thr.to_device( self.sp.indptr.astype(numpy.int32))
        self.sp_numrow =  self.M
        del self.sp
        self.spH_data = self.thr.to_device(  self.spH.data.astype(dtype))
        self.spH_indices = self.thr.to_device(  self.spH.indices)
        self.spH_indptr = self.thr.to_device(  self.spH.indptr)
        self.spH_numrow = self.Kdprod
        del self.spH
        self.spHsp_data = self.thr.to_device(  self.spHsp.data.astype(dtype))
        self.spHsp_indices = self.thr.to_device( self.spHsp.indices)
        self.spHsp_indptr =self.thr.to_device(  self.spHsp.indptr)
        self.spHsp_numrow = self.Kdprod
        del self.spHsp
#         import reikna.cluda
        import reikna.fft
#         api = 
#         self.thr = reikna.cluda.ocl_api().Thread(self.queue)        
        self.fft = reikna.fft.FFT(self.k_Kd, numpy.arange(0, self.ndims)).compile(self.thr, fast_math=False)
#         self.fft = reikna.fft.FFT(self.k_Kd).compile(thr, fast_math=True)
#         self.fft = FFT(self.ctx, self.queue,  self.k_Kd, fast_math=True)
#         self.ifft = FFT(self.ctx, self.queue, self.k_Kd2,  fast_math=True)
        self.zero_scalar=dtype(0.0+0.0j)
Example #19
0
    def __init__(self,
                 API=None,
                 platform_number=None,
                 device_number=None,
                 verbosity=0):
        """
        Constructor.

        :param API: The API for the heterogeneous system. API='cuda'
                    or API='ocl'
        :param platform_number: The number of the platform found by the API.
        :param device_number: The number of the device found on the platform.
        :param verbosity: Defines the verbosity level, default value is 0
        :type API: string
        :type platform_number: integer
        :type device_number: integer
        :type verbosity: integer
        :returns: 0
        :rtype: int, float

        :Example:

        >>> import pynufft
        >>> NufftObj = pynufft.NUFFT_hsa(API='cuda', platform_number=0,
                                         device_number=0, verbosity=0)
        """
        warnings.warn(
            'In the future NUFFT_hsa and NUFFT_cpu api will'
            ' be merged', FutureWarning)
        self.dtype = numpy.complex64
        self.verbosity = verbosity

        import reikna.cluda as cluda
        if self.verbosity > 0:
            print('The choosen API by the user is ', API)
        self.cuda_flag, self.ocl_flag = helper.diagnose(
            verbosity=self.verbosity)
        if None is API:
            if self.cuda_flag is 1:
                API = 'cuda'
            elif self.ocl_flag is 1:
                API = 'ocl'
            else:
                warnings.warn(
                    'No parallelization will be made since no GPU '
                    'device has been detected.', UserWarning)
        else:
            api = API
        if self.verbosity > 0:
            print('The used API will be ', API)
        if platform_number is None:
            platform_number = 0
        if device_number is None:
            device_number = 0

        from reikna import cluda
        import reikna.transformations
        from reikna.cluda import functions, dtypes
        try:  # try to create api/platform/device using the given parameters
            if 'cuda' == API:
                api = cluda.cuda_api()
            elif 'ocl' == API:
                api = cluda.ocl_api()

            platform = api.get_platforms()[platform_number]

            device = platform.get_devices()[device_number]
        except:  # if failed, find out what's going wrong?
            warnings.warn(
                'No parallelization will be made since no GPU '
                'device has been detected.', UserWarning)

#             return 1

#         Create context from device
        self.thr = api.Thread(device)  # pyopencl.create_some_context()
        self.device = device  # : device name
        if self.verbosity > 0:
            print('Using opencl or cuda = ', self.thr.api)

#         """
#         Wavefront: as warp in cuda. Can control the width in a workgroup
#         Wavefront is required in spmv_vector as it improves data coalescence.
#         see cCSR_spmv and zSparseMatVec
#         """
        self.wavefront = api.DeviceParameters(device).warp_size
        if self.verbosity > 0:
            print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront)

        from ..src import re_subroutine  # import create_kernel_sets
        kernel_sets = re_subroutine.create_kernel_sets(API)

        prg = self.thr.compile(kernel_sets,
                               render_kwds=dict(LL=str(self.wavefront)),
                               fast_math=False)
        self.prg = prg
Example #20
0
    def __init__(self,
                 inputSize,
                 axes=(-1, ),
                 mode="pyfftw",
                 dtype="complex64",
                 direction="FORWARD",
                 fftw_FLAGS=("FFTW_MEASURE", "FFTW_DESTROY_INPUT"),
                 THREADS=None,
                 loggingLevel=None):
        self.axes = axes
        self.direction = direction

        if loggingLevel:
            logger.setLoggingLevel(loggingLevel)

        if mode == "gpu" or mode == "gpu_ocl" or mode == "gpu_cuda":
            if mode == "gpu":
                mode = "gpu_ocl"
            if REIKNA_AVAILABLE:
                if mode == "gpu_ocl":
                    try:
                        reikna_api = cluda.ocl_api()
                        self.reikna_thread = reikna_api.Thread.create()
                        self.FFTMODE = "gpu"
                    except:
                        logger.warning("no reikna opencl available. \
                                            will try cuda")
                        mode = "gpu_cuda"
                if mode == "gpu_cuda":
                    try:
                        reikna_api = cluda.cuda_api()
                        self.reikna_thread = reikna_api.Thread.create()
                        self.FFTMODE = "gpu"
                    except:
                        logger.warning("no cuda available. \
                                Switching to pyfftw")
                        mode = "pyfftw"
            else:
                logger.warning("No gpu algorithms available\
                        switching to pyfftw")
                mode = "pyfftw"

        if mode == "pyfftw":
            if PYFFTW_AVAILABLE:
                self.FFTMODE = "pyfftw"
            else:
                logger.warning("No pyfftw available. \
                                Defaulting to scipy.fftpack")
                mode = "scipy"

        if mode == "scipy":
            if SCIPY_AVAILABLE:
                self.FFTMODE = "scipy"
            else:
                logger.warning("No scipy available - fft won't function.")

        if self.FFTMODE == "gpu":
            if direction == "FORWARD":
                self.inverse = 1
            elif direction == "BACKWARD":
                self.inverse = 0

            self.inputData = numpy.zeros(inputSize, dtype=dtype)
            inputData_dev = self.reikna_thread.to_device(self.inputData)
            self.outputData_dev = self.reikna_thread.array(inputSize,
                                                           dtype=dtype)

            logger.info("Generating and compiling reikna gpu fft plan...")
            reikna_ft = reikna.fft.FFT(inputData_dev, axes=axes)
            self.reikna_ft_c = reikna_ft.compile(self.reikna_thread)
            logger.info("Done!")

        if self.FFTMODE == "pyfftw":
            if THREADS == None:
                THREADS = cpu_count()

            #fftw_FLAGS Set the optimisation level of fftw3,
            #(more optimisation takes longer - but gives quicker ffts.)
            #Can be FFTW_ESTIMATE, FFTW_MEASURE, FFT_PATIENT, FFTW_EXHAUSTIVE
            n = pyfftw.simd_alignment

            self.inputData = pyfftw.n_byte_align_empty(inputSize, n, dtype)
            self.inputData[:] = numpy.zeros(inputSize, dtype=dtype)
            self.outputData = pyfftw.n_byte_align_empty(inputSize, n, dtype)
            self.outputData[:] = numpy.zeros(inputSize, dtype=dtype)

            logger.info(
                "Generating fftw3 plan....\nIf this takes too long, change fftw_FLAGS."
            )
            logger.debug("currently set to: {})".format(fftw_FLAGS))
            if direction == "FORWARD":
                self.fftwPlan = pyfftw.FFTW(self.inputData,
                                            self.outputData,
                                            axes=axes,
                                            threads=THREADS,
                                            flags=fftw_FLAGS)
            elif direction == "BACKWARD":
                self.fftwPlan = pyfftw.FFTW(self.inputData,
                                            self.outputData,
                                            direction='FFTW_BACKWARD',
                                            axes=axes,
                                            threads=THREADS,
                                            flags=fftw_FLAGS)
            logger.info("Done!")

        elif self.FFTMODE == "scipy":

            self.direction = direction
            self.inputData = numpy.zeros(inputSize, dtype=dtype)
            self.size = []
            for i in range(len(self.axes)):
                self.size.append(inputSize[self.axes[i]])
def kspacegaussian_filter_CL(ksp, sigma):
    sz = ksp.shape
    dtype = np.complex64
    ftype = np.float32
    #api = cluda.ocl_api()
    api = cuda_api()
    thr = api.Thread.create()
    data_dev = thr.to_device(ksp)
    ifft = FFT(data_dev)
    FACTOR = 1.0
    program = thr.compile("""
KERNEL void gauss_kernel(
    GLOBAL_MEM ${ctype} *dest,
    GLOBAL_MEM ${ctype} *src)
{
  const ulong x = get_global_id(0);const ulong y = get_global_id(1);const ulong z = get_global_id(2);
  const SIZE_T dim1= %d;
  const SIZE_T dim2= %d;
  const SIZE_T dim3= %d;                    
  ${ftype} sigma[3];
  sigma[0]=%f;sigma[1]=%f;sigma[2]=%f;
  ${ftype} factor = %f;            
  const double TWOPISQ = 19.739208802178716; //6.283185307179586;  //2*3.141592;
  const ${ftype} SQRT2PI = 2.5066282746;
  const double CUBEDSQRT2PI = 15.749609945722419;
  const ulong idx = dim1*dim2*z + dim1*y + x;
  ${ftype} i = (${ftype})(x); // )((x / dim3) / dim2);
      i = (i - (${ftype})floor((${ftype})(dim1)/2.0))/(${ftype})(dim1);
  ${ftype} j = (${ftype})(y); //(x / dim3);
      if((SIZE_T)j > dim2) {j=(${ftype})fmod(j, (${ftype})dim2);};
      j = (j - (${ftype})floor((${ftype})(dim2)/2.0f))/(${ftype})(dim2);
  //Account for large global index (stored as ulong) before performing modulus
  //double pre_k=fmod((double)(x) , (double) dim3);
  ${ftype} k = (${ftype}) (z); //pre_k;
      k = (k - (${ftype})floor((${ftype})(dim3)/2.0f))/(${ftype})(dim3);

  ${ftype} weight = exp(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  //${ftype} weight = expm1(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]))+1;
  //${ftype} weight= ${exp}(-TWOPISQ*((i*i)*sigma[0]*sigma[0] + (j*j)*sigma[1]*sigma[1] + (k*k)*sigma[2]*sigma[2]));
  dest[idx].x = src[idx].x * weight;
  dest[idx].y = src[idx].y * weight; 
  
}
""" % (sz[0], sz[1], sz[2], sigma[0], sigma[1], sigma[2], FACTOR),
        render_kwds=dict(ctype=dtypes.ctype(dtype),
                         ftype=dtypes.ctype(ftype),
                         exp=functions.exp(ftype)), fast_math=True)
    gauss_kernel = program.gauss_kernel
    #data_dev = thr.empty_like(ksp_dev)
    gauss_kernel(data_dev, data_dev, global_size=(sz[0], sz[1], sz[2]))
    # ksp_out = data_dev.get()
    thr.synchronize()
    ##
    #api = cuda_api()
    #thr = api.Thread.create()
    #data_dev = thr.to_device(ksp_out)
    ifft = FFT(data_dev)
    cifft = ifft.compile(thr)
    cifft(data_dev, data_dev, inverse=0)
    result = np.fft.fftshift(data_dev.get() / sz[0] * sz[1] * sz[2])
    result = result[::-1, ::-1, ::-1]
    result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0)
    return result  # ,ksp_out
tic()
imggauss2 = kspacegaussian_filter_CL2(ksp, np.ones(3))
print 'Reikna Cuda Gaussian+recon+ Reikna FFTShift: first run'
toc()
pycuda.tools.clear_context_caches()

tic()
kspgauss2 = KSP.kspacegaussian_filter2(ksp, 1)
image_filtered = simpleifft(procpar, dims, hdr, kspgauss2, args)
toc()


# create two timers so we can speed-test each approach
#start = drv.Event()
#end = drv.Event()
api = cuda_api()
thr = api.Thread.create()
N = 512

tic()
data_dev = thr.to_device(ksp)
ifft = FFT(data_dev)
cifft = ifft.compile(thr)
thr.synchronize()
cifft(data_dev, data_dev, inverse=0)
result = np.fft.fftshift(data_dev.get() / N**3)
result = result[::-1, ::-1, ::-1]
result = np.roll(np.roll(np.roll(result, 1, axis=2), 1, axis=1), 1, axis=0)

print "Reikna IFFT time and first three results:"
print "%s sec, %s" % (toc(), str(np.abs(result[:3, 0, 0])))
Example #23
0
def initialize_gpu(backend, **kwargs):
    '''
    Initialize a new GPU context.

    :param backend: backend to use. It must be any of "cuda" or "opencl".
    :type backend: str
    :param kwargs: it may contain any of the following values: \
    - interactive: (bool) whether to select the device manually (defaults to False) \
    - device: (int) number of the device to use (defaults to None).
    :type kwargs: dict

    .. note:: The device can be selected using the MINKIT_DEVICE environment variable.
    '''
    global BACKEND
    global DEVICE
    global CONTEXT
    global THREAD

    from reikna import cluda

    # Establish the backend
    if BACKEND is not None and backend != BACKEND:
        raise RuntimeError(
            f'Attempt to change backend from "{BACKEND}" to "{backend}"; not supported'
        )
    elif backend == CUDA:
        API = cluda.cuda_api()
    elif backend == OPENCL:
        API = cluda.ocl_api()
    elif backend == BACKEND:
        # Using same backend
        return
    else:
        raise ValueError(f'Unknown backend type "{backend}"')

    BACKEND = backend

    # Get all available devices
    platforms = API.get_platforms()

    all_devices = [(p, d) for p in platforms for d in p.get_devices()]

    # Determine the device to use
    idev = device_lookup(all_devices, **kwargs)

    platform, device = all_devices[idev]

    logger.info(
        f'Selected device "{device.name}" ({idev}) (platform: {platform.name})'
    )

    DEVICE = device

    # Create the context and thread
    if BACKEND == CUDA:
        CONTEXT = DEVICE.make_context()

        def clear_cuda_context():
            from pycuda.tools import clear_context_caches
            CONTEXT.pop()
            clear_context_caches()

        atexit.register(clear_cuda_context)
    else:
        # OPENCL
        import pyopencl
        CONTEXT = pyopencl.Context([DEVICE])

    THREAD = API.Thread(CONTEXT)