Example #1
0
 print 'CPU RBF takes',t1-t0, 's'
 kij= np.array( [ki,kj]).flatten()
 print 'Total sum:',kij.sum()
 print kij[0:1000:skip]
 
 
 import pycuda.driver as cuda
 import pycuda.tools
 import pycuda.autoinit
 from pycuda.compiler import SourceModule
 
 ##----------------------------------------------
 # Ellpakc gpu kernel
 
 
 v,c,r=spf.csr2ellpack(X,align=prefetch)
 
 sd=rbf.Diag
 self_dot = rbf.Xsquare
 results = np.zeros(2*num_el,dtype=np.float32)
 
 kernel_file = "ellpackKernel.cu"
 
 with open (kernel_file,"r") as CudaFile:
     data = CudaFile.read();
 
 #copy memory to device
 g_val = cuda.to_device(v)
 g_col = cuda.to_device(c)
 g_r   = cuda.to_device(r)
 g_self = cuda.to_device(self_dot)
Example #2
0
    def init_cuda(self, X, Y, cls_start, max_kernels=1):
        """Set up the CUDA device, compile the kernel module and allocate
        per-classifier resources.

        X is forwarded to ``spf.csr2ellpack`` so it is presumably a scipy
        CSR sparse matrix -- TODO confirm.  Y is only stored.  cls_start
        is cast to int32 and copied to the device.  max_kernels bounds how
        many classifiers may run concurrently; one slot of every resource
        list is reserved per kernel.

        Side effects: creates a CUDA context (``self.ctx``) that the
        caller must eventually release, and uploads X (ELLPACK format),
        its squared norms and cls_start to GPU memory.  Requires
        ``self.cache_size``, ``self.module_file``, ``self.Gamma``,
        ``self.func_name`` and ``self.prefetch`` to be set beforehand.
        """
        # assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels

        self.X = X
        self.Y = Y
        self.cls_start = cls_start.astype(np.int32)

        # Per-classifier bookkeeping, one zero placeholder per concurrent
        # kernel so the loop below can assign by index.
        self.g_y = [0] * max_kernels          # gpu memory handles for y
        self.g_out = [0] * max_kernels        # gpu kernel out
        self.kernel_out = [0] * max_kernels   # cpu kernel out
        self.bpg = [0] * max_kernels          # blocks per grid
        self.cls_count = [0] * max_kernels    # cpu class counts
        self.cls = [0] * max_kernels          # cpu classes
        self.g_cls_count = [0] * max_kernels  # gpu class counts
        self.g_cls = [0] * max_kernels        # gpu classes
        self.g_vecI = [0] * max_kernels       # gpu main vector I
        self.g_vecJ = [0] * max_kernels       # gpu main vector J
        self.sum_cls = [0] * max_kernels

        # These are populated by append in the per-kernel loop below.
        self.func = []        # kernel function references
        self.tex_ref = []     # texture references per concurrent kernel
        self.main_vecI = []   # cpu main vector I
        self.main_vecJ = []   # cpu main vector J

        self.N, self.Dim = X.shape

        # Kernel-column cache: how many N*4-byte float32 columns fit into
        # cache_size megabytes (at most N entries).
        column_size = self.N * 4
        cacheMB = self.cache_size * 1024 * 1024
        cache_items = np.floor(cacheMB / column_size).astype(int)
        cache_items = min(self.N, cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)

        self.compute_diag()

        # CUDA initialisation: take device 0 and make a context current.
        cuda.init()
        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        # Read and compile the .cu module with the kernel source.
        with open(self.module_file, "r") as src:
            module_code = src.read()
        self.module = SourceModule(module_code, keep=True, no_extern_c=True)

        # Upload the Gamma parameter into the module's GAMMA constant.
        g_gamma, gsize = self.module.get_global('GAMMA')
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma))

        # Per-kernel function handles, texture-backed I/J vectors and
        # prepared launch signatures.
        vec_bytes = self.Dim * 4
        for k in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            # Texture for vector I, backed by a fresh device allocation.
            vecI_tex = self.module.get_texref('VecI_TexRef')
            self.g_vecI[k] = cuda.mem_alloc(vec_bytes)
            vecI_tex.set_address(self.g_vecI[k], vec_bytes)

            # Texture for vector J.
            vecJ_tex = self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[k] = cuda.mem_alloc(vec_bytes)
            vecJ_tex.set_address(self.g_vecJ[k], vec_bytes)

            self.tex_ref.append((vecI_tex, vecJ_tex))
            self.main_vecI.append(np.zeros((1, self.Dim), dtype=np.float32))
            self.main_vecJ.append(np.zeros((1, self.Dim), dtype=np.float32))

            # Argument signature: P - pointer, i - int.
            gfun.prepare("PPPPPPiiiiiiPPP", texrefs=list(self.tex_ref[k]))

        # Transform X to ELLPACK and push the structure to device memory.
        v, c, r = spf.csr2ellpack(self.X, align=self.prefetch)
        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)
        self.g_cls_start = cuda.to_device(self.cls_start)
Example #3
0
    def init_cuda(self, X, Y, cls_start, max_kernels=1):
        """Initialise the CUDA device, compile the kernel module and
        allocate per-classifier GPU/CPU resources.

        X is forwarded to ``spf.csr2ellpack``, so presumably a scipy CSR
        sparse matrix -- TODO confirm.  Y is only stored here.  cls_start
        is cast to int32 and copied to the device.  max_kernels is the
        number of concurrently running classifiers to reserve slots for.

        Side effects: creates a CUDA context (``self.ctx``) that the
        caller must eventually release, and uploads X (ELLPACK format),
        its squared norms and cls_start to GPU memory.  Requires
        ``self.cache_size``, ``self.module_file``, ``self.Gamma``,
        ``self.func_name`` and ``self.prefetch`` to be set beforehand.
        """

        #assert X.shape[0]==Y.shape[0]
        self.max_concurrent_kernels = max_kernels

        self.X = X
        self.Y = Y

        self.cls_start = cls_start.astype(np.int32)

        #handle to gpu memory for y for each concurrent classifier
        self.g_y = []
        #handle to gpu memory for results for each concurrent classifier
        self.g_out = []  #gpu kernel out
        self.kernel_out = []  #cpu kernel out
        #blocks per grid for each concurrent classifier
        self.bpg = []

        #function reference
        self.func = []

        #texture references for each concurrent kernel
        self.tex_ref = []

        #main vectors
        #gpu
        self.g_vecI = []
        self.g_vecJ = []
        #cpu
        self.main_vecI = []
        self.main_vecJ = []

        #cpu class
        self.cls_count = []
        self.cls = []
        #gpu class
        self.g_cls_count = []
        self.g_cls = []

        self.sum_cls = []

        # pre-fill the per-classifier lists with zero placeholders so the
        # loop further down can assign them by index; func/tex_ref and
        # main_vecI/main_vecJ are built by append there instead
        for i in range(max_kernels):
            self.bpg.append(0)
            self.g_y.append(0)
            self.g_out.append(0)
            self.kernel_out.append(0)
            self.cls_count.append(0)
            self.cls.append(0)
            self.g_cls_count.append(0)
            self.g_cls.append(0)
            #            self.func.append(0)
            #            self.tex_ref.append(0)
            self.g_vecI.append(0)
            self.g_vecJ.append(0)
            #            self.main_vecI.append(0)
            #            self.main_vecJ.append(0)
            self.sum_cls.append(0)

        self.N, self.Dim = X.shape
        # one cached kernel column = N float32 values = N*4 bytes
        column_size = self.N * 4
        cacheMB = self.cache_size * 1024 * 1024  #100MB for cache size

        #how many kernel columns will be stored in cache (at most N)
        cache_items = np.floor(cacheMB / column_size).astype(int)

        cache_items = min(self.N, cache_items)
        self.kernel_cache = pylru.lrucache(cache_items)

        self.compute_diag()

        #cuda initialization: take device 0 and make a context current
        cuda.init()

        self.dev = cuda.Device(0)
        self.ctx = self.dev.make_context()

        #read the CUDA .cu file with the module source code
        with open(self.module_file, "r") as CudaFile:
            module_code = CudaFile.read()

        #compile module
        self.module = SourceModule(module_code, keep=True, no_extern_c=True)

        # upload the Gamma parameter into the module's GAMMA constant
        (g_gamma, gsize) = self.module.get_global('GAMMA')
        cuda.memcpy_htod(g_gamma, np.float32(self.Gamma))

        #get functions reference

        Dim = self.Dim
        vecBytes = Dim * 4  # bytes per dense float32 vector of length Dim
        for f in range(self.max_concurrent_kernels):
            gfun = self.module.get_function(self.func_name)
            self.func.append(gfun)

            #init texture for vector I, backed by a fresh device allocation
            vecI_tex = self.module.get_texref('VecI_TexRef')
            self.g_vecI[f] = cuda.mem_alloc(vecBytes)
            vecI_tex.set_address(self.g_vecI[f], vecBytes)

            #init texture for vector J
            vecJ_tex = self.module.get_texref('VecJ_TexRef')
            self.g_vecJ[f] = cuda.mem_alloc(vecBytes)
            vecJ_tex.set_address(self.g_vecJ[f], vecBytes)

            self.tex_ref.append((vecI_tex, vecJ_tex))

            self.main_vecI.append(np.zeros((1, Dim), dtype=np.float32))
            self.main_vecJ.append(np.zeros((1, Dim), dtype=np.float32))

            texReflist = list(self.tex_ref[f])

            #function argument signature: P - pointer, i - int
            gfun.prepare("PPPPPPiiiiiiPPP", texrefs=texReflist)

        #transform X to particular (ELLPACK) format
        v, c, r = spf.csr2ellpack(self.X, align=self.prefetch)
        #copy format data structure to gpu memory

        self.g_val = cuda.to_device(v)
        self.g_col = cuda.to_device(c)
        self.g_len = cuda.to_device(r)
        self.g_sdot = cuda.to_device(self.Xsquare)

        self.g_cls_start = cuda.to_device(self.cls_start)