def _rebuild(self):
    """Fetch or compile the CSR SpMV kernel for the current configuration.

    Compiled kernels are memoised in ``self._kernal_table``, keyed on the
    (input-order, output-order) pair combined with the previous shape, so
    rebuilding with an already-seen configuration is a dictionary lookup.
    """
    cache_key = self._expected_order + self._old_shape

    # Fast path: this exact configuration was compiled before.
    if cache_key in self._kernal_table:
        self._spmv_csr_vector_kernel = self._kernal_table[cache_key]
        return

    # Bake the launch geometry and matrix dimensions into the source.
    defines = """
    #define blockSize %d
    #define num_rows %d
    #define num_cols %d
    """ % (self.local_size[:1] + self.shape)
    defines = opencl_tools.build_preamble_for_context(self._queue.context, defines)

    # An "F" (column-major) marker on either side of the transform
    # switches on the matching layout #define in the kernel source.
    if self._expected_order[0] == "F":
        defines += "#define INPUT_COLUMN_MAJOR\n"
    if self._expected_order[1] == "F":
        defines += "#define OUTPUT_COLUMN_MAJOR\n"

    program = cl.Program(self._queue.context, defines + _csr_code).build()
    kernel = program.spmv_csr_vector_kernel
    kernel.set_scalar_arg_dtypes([
        None,        # __global const uint * ptr,
        None,        # __global const uint * indices,
        None,        # __global const float * data,
        None,        # __global const float * in,
        None,        # __global float * out,
        np.uint32,   # const uint row_offset,
        np.uint32,   # const uint col_offset,
        np.uint32,   # const uint max_cols
        None,        # __local float* sdata)
    ])

    self._spmv_csr_vector_kernel = kernel
    self._kernal_table[cache_key] = kernel
def get_knn(self, querys, k, query_offset=None):
    """Return the k nearest stored points for each query point.

    Parameters
    ----------
    querys : array, shape (n_queries, dims)
        Query points; cast to float32 before upload to the device.
    k : int
        Number of neighbors requested (clamped to the stored point count).
    query_offset : array, shape (n_queries,), optional
        Per-query additive distance offset uploaded to the GPU; defaults
        to zeros.

    Returns
    -------
    indexes : int32 array, shape (n_queries, min(k, n_points))
        Indices of the nearest stored points, closest first.
    dists : float32 array, same shape
        The corresponding distances (as produced by the GPU kernel).
    """
    float32_size = int(np.float32(0).nbytes)

    ##### Figure out sizes
    point_length = int(self._gpu_points.shape[0])
    dims = int(self._gpu_points.shape[1])
    dims_po2 = 2 ** dims.bit_length()  # dims rounded up to a power of two

    device = self._ctx.devices[0]

    # Cap queries per kernel launch by (a) a timing heuristic, (b) the
    # device's addressable range, (c) the actual query count and (d) the
    # maximum single allocation.  NOTE: the bare `/` divisions rely on
    # Python 2 integer-division semantics (file uses xrange) -- revisit
    # if porting to Python 3.
    querys_per_run = (timing_constant * 3200) / point_length
    querys_per_run = min(querys_per_run,
                         (2 ** (device.get_info(cl.device_info.ADDRESS_BITS)) - 1)
                         / (2 * point_length * dims_po2))
    querys_per_run = min(querys_per_run, querys.shape[0])
    querys_per_run = min(querys_per_run,
                         device.get_info(cl.device_info.MAX_MEM_ALLOC_SIZE)
                         / (self._gpu_points.shape[0] * float32_size * 5))
    querys_per_run = int(max(querys_per_run, 1))

    ### Create buffers
    true_k = min(k, point_length)
    dists = np.empty((querys.shape[0], true_k), dtype=np.float32)
    indexes = np.empty((querys.shape[0], true_k), dtype=np.int32)

    dists_gpu = \
        cl_array.empty(self._queue, (self._gpu_points.shape[0], querys_per_run), np.float32)
    query_gpu = \
        cl_array.to_device(self._queue, querys.astype(np.float32))

    # BUGFIX: use an identity check -- `query_offset != None` on a numpy
    # array is an elementwise comparison, not a None test.
    if query_offset is not None:
        gpu_query_offset \
            = cl_array.to_device(self._queue, query_offset.astype(np.float32))
    else:
        gpu_query_offset \
            = cl_array.zeros(self._queue, querys.shape[0], np.float32)

    ### Create kernal
    preamble = """
    #define blockSize %d
    #define dims %d
    #define rows %d
    """ % (dims_po2, dims, self._gpu_points.shape[0])
    preamble = opencl_tools.build_preamble_for_context(self._ctx, preamble)

    prg = cl.Program(self._ctx, preamble + _OpenCL_code).build()
    distkernal = prg.calc_dists
    distkernal.set_scalar_arg_dtypes(
        [None, None, np.int32, None, None, None, None])

    ## calculate run size
    max_run_items = distkernal.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    work_item_sizes = device.get_info(cl.device_info.MAX_WORK_ITEM_SIZES)

    point_per_run = min([work_item_sizes[0] / 4,
                         (max_run_items) / (8 * dims_po2),
                         point_length])
    if point_per_run < 1:
        point_per_run = 1

    local_shape = (point_per_run, dims_po2, 1)
    global_shape = (point_length, dims_po2, querys_per_run)

    # Pad the global size up to a whole multiple of the local size.
    overshoot = global_shape[0] % local_shape[0]
    if overshoot > 0:
        global_shape = (global_shape[0] + (local_shape[0] - overshoot),
                        global_shape[1],
                        global_shape[2])

    dist_calc = None

    # Build the deferred CPU post-processing step for the chunk ending at
    # `stop_point`.  It is invoked on the NEXT loop iteration, after the
    # next kernel launch, so CPU sorting overlaps GPU work.  `dist_local`
    # is resolved late (at call time), when it still holds the PREVIOUS
    # chunk's distances -- this late binding is intentional.
    def make_calc(stop_point, ii):
        def calc():
            if k < dist_local.shape[0]:
                # Partial sort: keep the k smallest per column, then
                # fully order just those k.
                index_local = bn.argpartsort(dist_local, k, axis=0)[:k, :]
                for r in xrange(stop_point - ii):
                    dists_cpu_buffer_local = dist_local[index_local[:, r], r]
                    indexes_cpu_buffer_local = index_local[:, r]

                    index_local2 = np.argsort(dists_cpu_buffer_local, axis=0)
                    dists[ii + r, :] = dists_cpu_buffer_local[index_local2]
                    indexes[ii + r, :] = indexes_cpu_buffer_local[index_local2]
            else:
                # k covers every stored point: a full per-column sort.
                for r in xrange(stop_point - ii):
                    dists_cpu_buffer_local = dist_local[:, r]

                    index_local2 = np.argsort(dists_cpu_buffer_local, axis=0)
                    dists[ii + r, :] = dists_cpu_buffer_local[index_local2]
                    indexes[ii + r, :] = index_local2
        return calc

    for ii in xrange(0, querys.shape[0], querys_per_run):
        # For the last step, we need to make sure not to go over
        if (querys.shape[0] - ii) < querys_per_run:
            global_shape = (global_shape[0],
                            global_shape[1],
                            (querys.shape[0] - ii))

        distkernal(self._queue, global_shape, local_shape,
                   self._gpu_points.data,
                   query_gpu.data,
                   ii,
                   dists_gpu.data,
                   self._point_offset.data,
                   gpu_query_offset.data,
                   cl.LocalMemory(float32_size * local_shape[0] * local_shape[1])).wait()

        stop_point = min(ii + querys_per_run, querys.shape[0])

        # Process the previous chunk (if any) before fetching this one.
        # BUGFIX: identity comparison with None.
        if dist_calc is not None:
            dist_calc()

        dist_local = dists_gpu.get(self._queue)
        dist_calc = make_calc(stop_point, ii)

    # Flush the final pending chunk.
    if dist_calc is not None:
        dist_calc()

    dists_gpu.data.release()
    query_gpu.data.release()
    gpu_query_offset.data.release()

    return indexes, dists