def _rebuild(self):
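        # Build a SpMV kernel specialized for the expected storage order and
        # matrix shape, reusing a previously compiled kernel when available.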
        key = self._expected_order + self._old_shape
        if key in self._kernal_table:
            self._spmv_csr_vector_kernel = self._kernal_table[key]
            return

        preamble = """
        #define blockSize %d
        #define num_rows %d
        #define num_cols %d 
        """ % (
            self.local_size[:1] + self.shape
        )

        preamble = opencl_tools.build_preamble_for_context(self._queue.context, preamble)

        if self._expected_order[0] == "F":
            preamble += "#define INPUT_COLUMN_MAJOR\n"
        if self._expected_order[1] == "F":
            preamble += "#define OUTPUT_COLUMN_MAJOR\n"

        prg = cl.Program(self._queue.context, preamble + _csr_code).build()
        self._spmv_csr_vector_kernel = prg.spmv_csr_vector_kernel
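        # Declare dtypes for the scalar (offset/limit) arguments so PyOpenCL
        # can convert them; buffer and __local slots are left as None.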
        self._spmv_csr_vector_kernel.set_scalar_arg_dtypes(
            [
                None,  # __global const uint * ptr,
                None,  # __global const uint * indices,
                None,  # __global const float * data,
                None,  # __global const float * in,
                None,  # __global float * out,
                np.uint32,  # const uint row_offset,
                np.uint32,  # const uint col_offset,
                np.uint32,  # const uint max_cols
                None,  # __local float* sdata)
            ]
        )
        self._kernal_table[key] = self._spmv_csr_vector_kernel
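The method above compiles the CSR SpMV kernel once per (storage order, shape) combination and memoizes the built kernel in `self._kernal_table`, so later `_rebuild` calls with the same configuration skip the OpenCL build. A minimal standalone sketch of the same compile-and-cache pattern follows; the trivial `scale` kernel and the helper names (`_kernel_cache`, `get_scale_kernel`) are illustrative assumptions, not part of the source.

import numpy as np
import pyopencl as cl

_kernel_cache = {}

def get_scale_kernel(ctx, block_size, num_rows, num_cols):
    """Compile (or fetch a cached) kernel specialized via #define constants."""
    key = (block_size, num_rows, num_cols)
    if key not in _kernel_cache:
        preamble = ("#define blockSize %d\n"
                    "#define num_rows %d\n"
                    "#define num_cols %d\n" % key)
        source = preamble + """
        __kernel void scale(__global float *data, const float factor) {
            int gid = get_global_id(0);
            if (gid < num_rows * num_cols)
                data[gid] *= factor;
        }
        """
        kernel = cl.Program(ctx, source).build().scale
        # Scalar arguments need explicit dtypes; buffer slots stay None.
        kernel.set_scalar_arg_dtypes([None, np.float32])
        _kernel_cache[key] = kernel
    return _kernel_cache[key]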
Example #2
    def get_knn(self, querys, k, query_offset=None):
        float32_size = int(np.float32(0).nbytes)

        ##### Figure out sizes
        point_length = int(self._gpu_points.shape[0])
        dims = int(self._gpu_points.shape[1])
        dims_po2 = 2 ** dims.bit_length()
        device = self._ctx.devices[0]

        # Number of queries per kernel launch, limited by the device address
        # space, the allocation limit, and the number of queries available.
        querys_per_run = (timing_constant * 3200) // point_length
        querys_per_run = min(querys_per_run,
                             (2 ** device.get_info(cl.device_info.ADDRESS_BITS) - 1)
                             // (2 * point_length * dims_po2))
        querys_per_run = min(querys_per_run, querys.shape[0])
        querys_per_run = min(querys_per_run,
                             device.get_info(cl.device_info.MAX_MEM_ALLOC_SIZE)
                             // (self._gpu_points.shape[0] * float32_size * 5))
        querys_per_run = int(max(querys_per_run, 1))
        
        ### Create buffers
        true_k = min(k, point_length)
        dists = np.empty((querys.shape[0], true_k), dtype=np.float32)
        indexes = np.empty((querys.shape[0], true_k), dtype=np.int32)

        dists_gpu = cl_array.empty(self._queue,
                                   (self._gpu_points.shape[0], querys_per_run),
                                   np.float32)
        query_gpu = cl_array.to_device(self._queue, querys.astype(np.float32))

        if query_offset is not None:
            gpu_query_offset = cl_array.to_device(self._queue,
                                                  query_offset.astype(np.float32))
        else:
            gpu_query_offset = cl_array.zeros(self._queue, querys.shape[0],
                                              np.float32)
        
        
        ### Create kernel
        preamble = """
        #define blockSize %d
        #define dims %d
        #define rows %d
        """ % (dims_po2, dims, self._gpu_points.shape[0])

        preamble = opencl_tools.build_preamble_for_context(self._ctx, preamble)
        prg = cl.Program(self._ctx, preamble + _OpenCL_code).build()

        distkernal = prg.calc_dists
        # Only the starting query index (3rd argument) is a scalar; buffer and
        # __local slots stay None.
        distkernal.set_scalar_arg_dtypes([None, None, np.int32, None, None, None, None])

        ## Calculate run size
        max_run_items = distkernal.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE, device)

        work_item_sizes = device.get_info(cl.device_info.MAX_WORK_ITEM_SIZES)

        point_per_run = min([work_item_sizes[0] // 4,
                             max_run_items // (8 * dims_po2),
                             point_length])

        if point_per_run < 1:
            point_per_run = 1
        local_shape = (point_per_run, dims_po2, 1)
        
        global_shape = (point_length, dims_po2, querys_per_run)
        # Round the point dimension up to a multiple of the work-group size.
        overshoot = global_shape[0] % local_shape[0]
        if overshoot > 0:
            global_shape = (global_shape[0] + (local_shape[0] - overshoot),
                            global_shape[1],
                            global_shape[2])
                                                            
        #selector = gpu_quick_selection.GPUQuickSelection\
        #                                    (ctx,self._queue,dists_gpu.shape)      

        dist_calc = None

        # Deferred CPU post-processing: select and order the k nearest
        # candidates from the distance block most recently copied back.
        def make_calc(stop_point, ii):
            def calc():
                if k < dist_local.shape[0]:
                    # Partial sort so the k smallest distances of each query
                    # column come first (bn.argpartsort is the older
                    # Bottleneck API; newer releases provide argpartition).
                    index_local = bn.argpartsort(dist_local, k, axis=0)[:k, :]

                    for r in xrange(stop_point - ii):
                        dists_cpu_buffer_local = dist_local[index_local[:, r], r]
                        indexes_cpu_buffer_local = index_local[:, r]

                        index_local2 = np.argsort(dists_cpu_buffer_local, axis=0)

                        dists[ii + r, :] = dists_cpu_buffer_local[index_local2]
                        indexes[ii + r, :] = indexes_cpu_buffer_local[index_local2]
                else:
                    # Fewer points than k: just sort every distance.
                    for r in xrange(stop_point - ii):
                        dists_cpu_buffer_local = dist_local[:, r]

                        index_local2 = np.argsort(dists_cpu_buffer_local, axis=0)

                        dists[ii + r, :] = dists_cpu_buffer_local[index_local2]
                        indexes[ii + r, :] = index_local2
            return calc
                                              
        for ii in xrange(0, querys.shape[0], querys_per_run):

            # For the last chunk, make sure not to go past the end.
            if (querys.shape[0] - ii) < querys_per_run:
                global_shape = (global_shape[0],
                                global_shape[1],
                                querys.shape[0] - ii)

            distkernal(self._queue, global_shape, local_shape,
                       self._gpu_points.data,
                       query_gpu.data,
                       ii,
                       dists_gpu.data,
                       self._point_offset.data,
                       gpu_query_offset.data,
                       cl.LocalMemory(float32_size * local_shape[0] * local_shape[1])).wait()

            stop_point = min(ii + querys_per_run, querys.shape[0])

            # Post-process the previous chunk's distances on the CPU.
            if dist_calc is not None:
                dist_calc()

            
#            newdists, newindexs = \
#                    selector.gpu_quick_selection(self._queue,dists_gpu,k,stop_point-ii)
#                    
#            dists[ii:stop_point,:] = newdists.T
#            indexes[ii:stop_point,:] = newindexs.T
            

            dist_local = dists_gpu.get(self._queue)
            dist_calc = make_calc(stop_point, ii)

        # Handle the final chunk, then free the GPU buffers.
        if dist_calc is not None:
            dist_calc()

        dists_gpu.data.release()
        query_gpu.data.release()
        gpu_query_offset.data.release()

        return indexes, dists
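The CPU side of the loop above does the actual k-nearest selection: for each query column of the downloaded distance block it keeps the k smallest distances via a partial sort and then fully sorts only those k. A self-contained NumPy sketch of that per-chunk step is shown below; the function name `topk_per_query` and the use of np.argpartition in place of the older bn.argpartsort call are assumptions for illustration.

import numpy as np

def topk_per_query(dist_chunk, k):
    """dist_chunk: (num_points, num_queries) distances for one chunk."""
    k = min(k, dist_chunk.shape[0])
    # Partial sort: the k smallest distances of each column end up first.
    cand = np.argpartition(dist_chunk, k - 1, axis=0)[:k, :]
    cand_d = np.take_along_axis(dist_chunk, cand, axis=0)
    # Fully sort just those k candidates per query.
    order = np.argsort(cand_d, axis=0)
    idx = np.take_along_axis(cand, order, axis=0)
    d = np.take_along_axis(cand_d, order, axis=0)
    return idx.T, d.T   # shapes (num_queries, k), matching the indexes/dists rows

# Example: 5 nearest of 100 points for 8 queries.
idx, d = topk_per_query(np.random.rand(100, 8).astype(np.float32), 5)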