# --- script fragment (Python 2) ---
# NOTE(review): this chunk starts mid-script: t0/t1, ki/kj, skip, X, prefetch,
# rbf, num_el and spf are defined earlier in the file (outside this view).
# Report CPU-side RBF timing, then dump the concatenated kernel values.
print 'CPU RBF takes',t1-t0, 's'
kij= np.array( [ki,kj]).flatten()
print 'Total sum:',kij.sum()
print kij[0:1000:skip]

# PyCUDA setup. Importing pycuda.autoinit creates a CUDA context as a side
# effect, so import order here is significant.
import pycuda.driver as cuda
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule

##----------------------------------------------
# ELLPACK gpu kernel: convert the sparse matrix X (presumably CSR — spf is a
# project helper, verify) into ELLPACK arrays: values, column indices, row lengths.
v,c,r=spf.csr2ellpack(X,align=prefetch)
# Diagonal and per-row squared norms precomputed by the rbf object.
sd=rbf.Diag
self_dot = rbf.Xsquare
# Output buffer for two kernel columns (2 * number of elements).
results = np.zeros(2*num_el,dtype=np.float32)

# Read the CUDA kernel source from disk.
kernel_file = "ellpackKernel.cu"
with open (kernel_file,"r") as CudaFile:
    data = CudaFile.read();

#copy memory to device (ELLPACK arrays + per-row self dot products)
g_val = cuda.to_device(v)
g_col = cuda.to_device(c)
g_r = cuda.to_device(r)
g_self = cuda.to_device(self_dot)
def init_cuda(self,X,Y, cls_start, max_kernels=1 ):
    """Initialize CUDA state for up to ``max_kernels`` concurrent GPU kernels.

    Stores the training data, pre-allocates per-kernel bookkeeping lists,
    sets up an LRU cache for kernel columns, creates a CUDA context,
    compiles the module from ``self.module_file``, binds per-kernel texture
    references, and uploads the ELLPACK-formatted data to the GPU.

    Parameters:
        X          -- sparse training matrix (rows = samples); converted to
                      ELLPACK below via spf.csr2ellpack.
        Y          -- labels (stored as-is; the commented assert suggests it
                      should have one entry per row of X).
        cls_start  -- per-class start offsets, cast to int32 for the GPU.
        max_kernels-- number of concurrently prepared GPU kernel instances.

    NOTE(review): statement order matters throughout — the context must exist
    before any mem_alloc/to_device call, and texture refs must be bound
    before gfun.prepare.
    """
    #assert X.shape[0]==Y.shape[0]
    self.max_concurrent_kernels = max_kernels
    self.X =X
    self.Y = Y
    # GPU code expects 32-bit ints for class offsets.
    self.cls_start=cls_start.astype(np.int32)
    #handle to gpu memory for y for each concurrent classifier
    self.g_y=[]
    #handle to gpu memory for results for each concurrent classifier
    self.g_out=[] #gpu kernel out
    self.kernel_out=[] #cpu kernel out
    #blocks per grid for each concurrent classifier
    self.bpg=[]
    #function reference
    self.func=[]
    #texture references for each concurrent kernel
    self.tex_ref=[]
    #main vectors
    #gpu
    self.g_vecI=[]
    self.g_vecJ=[]
    #cpu
    self.main_vecI=[]
    self.main_vecJ=[]
    #cpu class
    self.cls_count=[]
    self.cls=[]
    #gpu class
    self.g_cls_count=[]
    self.g_cls=[]
    self.sum_cls=[]
    # Pre-size the per-kernel slots with placeholder zeros so they can be
    # assigned by index (e.g. self.g_vecI[f]) later.  func/tex_ref/main_vec*
    # are appended in the loop below instead, hence commented out here.
    for i in range(max_kernels):
        self.bpg.append(0)
        self.g_y.append(0)
        self.g_out.append(0)
        self.kernel_out.append(0)
        self.cls_count.append(0)
        self.cls.append(0)
        self.g_cls_count.append(0)
        self.g_cls.append(0)
        # self.func.append(0)
        # self.tex_ref.append(0)
        self.g_vecI.append(0)
        self.g_vecJ.append(0)
        # self.main_vecI.append(0)
        # self.main_vecJ.append(0)
        self.sum_cls.append(0)
    self.N,self.Dim = X.shape
    # One cached kernel column = N float32 values = N*4 bytes.
    column_size = self.N*4
    cacheMB = self.cache_size*1024*1024 #100MB for cache size
    #how many kernel colums will be stored in cache
    cache_items = np.floor(cacheMB/column_size).astype(int)
    # Never cache more columns than exist.
    cache_items = min(self.N,cache_items)
    self.kernel_cache = pylru.lrucache(cache_items)
    self.compute_diag()
    #cuda initialization
    cuda.init()
    self.dev = cuda.Device(0)
    self.ctx = self.dev.make_context()
    #reade cuda .cu file with module code
    with open (self.module_file,"r") as CudaFile:
        module_code = CudaFile.read();
    #compile module (no_extern_c: module uses C++ linkage / templates)
    self.module = SourceModule(module_code,keep=True,no_extern_c=True)
    # Upload the RBF gamma into the module's __constant__/global GAMMA symbol.
    (g_gamma,gsize)=self.module.get_global('GAMMA')
    cuda.memcpy_htod(g_gamma, np.float32(self.Gamma) )
    #get functions reference
    Dim =self.Dim
    vecBytes = Dim*4  # one dense float32 row of X
    for f in range(self.max_concurrent_kernels):
        gfun = self.module.get_function(self.func_name)
        self.func.append(gfun)
        #init texture for vector I
        vecI_tex=self.module.get_texref('VecI_TexRef')
        self.g_vecI[f]=cuda.mem_alloc( vecBytes)
        vecI_tex.set_address(self.g_vecI[f],vecBytes)
        #init texture for vector J
        vecJ_tex=self.module.get_texref('VecJ_TexRef')
        self.g_vecJ[f]=cuda.mem_alloc( vecBytes)
        vecJ_tex.set_address(self.g_vecJ[f],vecBytes)
        self.tex_ref.append((vecI_tex,vecJ_tex) )
        # Host-side staging buffers for the two dense vectors.
        self.main_vecI.append(np.zeros((1,Dim),dtype=np.float32))
        self.main_vecJ.append(np.zeros((1,Dim),dtype=np.float32))
        texReflist = list(self.tex_ref[f])
        #function definition P-pointer i-int (prepared-call arg signature)
        gfun.prepare("PPPPPPiiiiiiPPP",texrefs=texReflist)
    #transform X to particular format (ELLPACK: values, col indices, row lens)
    v,c,r=spf.csr2ellpack(self.X,align=self.prefetch)
    #copy format data structure to gpu memory
    self.g_val = cuda.to_device(v)
    self.g_col = cuda.to_device(c)
    self.g_len = cuda.to_device(r)
    self.g_sdot = cuda.to_device(self.Xsquare)
    self.g_cls_start = cuda.to_device(self.cls_start)
def init_cuda(self, X, Y, cls_start, max_kernels=1):
    """Initialize CUDA state for up to ``max_kernels`` concurrent GPU kernels.

    NOTE(review): this is a whitespace-normalized DUPLICATE of the
    ``init_cuda`` defined earlier in this file; if both live in the same
    class body, this later definition silently shadows the earlier one.
    One of the two copies should be deleted — confirm which is current.

    Stores the training data, pre-allocates per-kernel bookkeeping lists,
    sets up an LRU cache for kernel columns, creates a CUDA context,
    compiles the module from ``self.module_file``, binds per-kernel texture
    references, and uploads the ELLPACK-formatted data to the GPU.

    Parameters:
        X           -- sparse training matrix (rows = samples).
        Y           -- labels (the commented assert suggests one per row of X).
        cls_start   -- per-class start offsets, cast to int32 for the GPU.
        max_kernels -- number of concurrently prepared GPU kernel instances.
    """
    #assert X.shape[0]==Y.shape[0]
    self.max_concurrent_kernels = max_kernels
    self.X = X
    self.Y = Y
    # GPU code expects 32-bit ints for class offsets.
    self.cls_start = cls_start.astype(np.int32)
    #handle to gpu memory for y for each concurrent classifier
    self.g_y = []
    #handle to gpu memory for results for each concurrent classifier
    self.g_out = [] #gpu kernel out
    self.kernel_out = [] #cpu kernel out
    #blocks per grid for each concurrent classifier
    self.bpg = []
    #function reference
    self.func = []
    #texture references for each concurrent kernel
    self.tex_ref = []
    #main vectors
    #gpu
    self.g_vecI = []
    self.g_vecJ = []
    #cpu
    self.main_vecI = []
    self.main_vecJ = []
    #cpu class
    self.cls_count = []
    self.cls = []
    #gpu class
    self.g_cls_count = []
    self.g_cls = []
    self.sum_cls = []
    # Pre-size per-kernel slots with placeholders so they can be assigned by
    # index later (e.g. self.g_vecI[f]); func/tex_ref/main_vec* are appended
    # in the setup loop below instead.
    for i in range(max_kernels):
        self.bpg.append(0)
        self.g_y.append(0)
        self.g_out.append(0)
        self.kernel_out.append(0)
        self.cls_count.append(0)
        self.cls.append(0)
        self.g_cls_count.append(0)
        self.g_cls.append(0)
        # self.func.append(0)
        # self.tex_ref.append(0)
        self.g_vecI.append(0)
        self.g_vecJ.append(0)
        # self.main_vecI.append(0)
        # self.main_vecJ.append(0)
        self.sum_cls.append(0)
    self.N, self.Dim = X.shape
    # One cached kernel column = N float32 values = N*4 bytes.
    column_size = self.N * 4
    cacheMB = self.cache_size * 1024 * 1024 #100MB for cache size
    #how many kernel colums will be stored in cache
    cache_items = np.floor(cacheMB / column_size).astype(int)
    # Never cache more columns than exist.
    cache_items = min(self.N, cache_items)
    self.kernel_cache = pylru.lrucache(cache_items)
    self.compute_diag()
    #cuda initialization
    cuda.init()
    self.dev = cuda.Device(0)
    self.ctx = self.dev.make_context()
    #reade cuda .cu file with module code
    with open(self.module_file, "r") as CudaFile:
        module_code = CudaFile.read()
    #compile module (no_extern_c: module uses C++ linkage / templates)
    self.module = SourceModule(module_code, keep=True, no_extern_c=True)
    # Upload the RBF gamma into the module's GAMMA symbol.
    (g_gamma, gsize) = self.module.get_global('GAMMA')
    cuda.memcpy_htod(g_gamma, np.float32(self.Gamma))
    #get functions reference
    Dim = self.Dim
    vecBytes = Dim * 4  # one dense float32 row of X
    for f in range(self.max_concurrent_kernels):
        gfun = self.module.get_function(self.func_name)
        self.func.append(gfun)
        #init texture for vector I
        vecI_tex = self.module.get_texref('VecI_TexRef')
        self.g_vecI[f] = cuda.mem_alloc(vecBytes)
        vecI_tex.set_address(self.g_vecI[f], vecBytes)
        #init texture for vector J
        vecJ_tex = self.module.get_texref('VecJ_TexRef')
        self.g_vecJ[f] = cuda.mem_alloc(vecBytes)
        vecJ_tex.set_address(self.g_vecJ[f], vecBytes)
        self.tex_ref.append((vecI_tex, vecJ_tex))
        # Host-side staging buffers for the two dense vectors.
        self.main_vecI.append(np.zeros((1, Dim), dtype=np.float32))
        self.main_vecJ.append(np.zeros((1, Dim), dtype=np.float32))
        texReflist = list(self.tex_ref[f])
        #function definition P-pointer i-int (prepared-call arg signature)
        gfun.prepare("PPPPPPiiiiiiPPP", texrefs=texReflist)
    #transform X to particular format (ELLPACK: values, col indices, row lens)
    v, c, r = spf.csr2ellpack(self.X, align=self.prefetch)
    #copy format data structure to gpu memory
    self.g_val = cuda.to_device(v)
    self.g_col = cuda.to_device(c)
    self.g_len = cuda.to_device(r)
    self.g_sdot = cuda.to_device(self.Xsquare)
    self.g_cls_start = cuda.to_device(self.cls_start)