def __init__(self,a,b_shape,b_order='C',c_order='C',queue=None):
    assert a.shape[1] == b_shape[0]
    assert a.dtype == np.float32

    if(queue is None):
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)
    else:
        self._queue = queue

    self._block_size = 16
    self.shape = a.shape

    if min(a.shape[0],a.shape[1],b_shape[0],b_shape[1]) < self._block_size:
        self._block_size = min(a.shape[0],a.shape[1],b_shape[0],b_shape[1])

    self._b_shape = b_shape

    self._kernel_params = {"block_size": self._block_size,
                           "w_a": a.shape[1],
                           "h_a": a.shape[0],
                           "w_b": b_shape[1]}

    self._a_order = gpu_algorithms._get_matrix_order(a)
    self._b_order = gpu_algorithms._get_matrix_order(b_order)
    self._c_order = gpu_algorithms._get_matrix_order(c_order)

    preamble = ""
    if(self._a_order == 'C'):
        preamble += "#define A_ROW_MAJOR_ORDER\n"
    if(self._b_order == 'C'):
        preamble += "#define B_ROW_MAJOR_ORDER\n"
    if(self._c_order == 'C'):
        preamble += "#define C_ROW_MAJOR_ORDER\n"

    full_kernel = preamble + (KERNEL_CODE % self._kernel_params)

    prg = cl.Program(self._queue.context, full_kernel).build()
    self.kernel = prg.matrixMul
    self.kernel.set_scalar_arg_dtypes([
        None,       # __global float* C,
        None,       # __global float* A,
        None,       # __global float* B,
        np.uint32,  # uint x_offset,
        np.uint32,  # uint y_offset
    ])

    # Transfer the matrix to the gpu, if it is not there already
    if isinstance(a,cl_array.Array):
        self.mat = a
    else:
        self.mat = cl_array.to_device(self._queue,a)

    self.max_batch_size = (2048,2048)
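# Illustrative sketch (not part of the class): how the storage-order flags are
# turned into #define preamble lines and how the kernel parameters are
# substituted into the source template.  The toy template below is a stand-in
# for KERNEL_CODE, whose real body is defined elsewhere in this module; the
# values are made up for demonstration only.
def _example_build_kernel_source():
    kernel_params = {"block_size": 16, "w_a": 64, "h_a": 32, "w_b": 48}
    orders = {"a": 'C', "b": 'F', "c": 'C'}

    preamble = ""
    if orders["a"] == 'C':
        preamble += "#define A_ROW_MAJOR_ORDER\n"
    if orders["b"] == 'C':
        preamble += "#define B_ROW_MAJOR_ORDER\n"
    if orders["c"] == 'C':
        preamble += "#define C_ROW_MAJOR_ORDER\n"

    toy_template = ("#define BLOCK_SIZE %(block_size)d\n"
                    "#define WA %(w_a)d\n"
                    "#define HA %(h_a)d\n"
                    "#define WB %(w_b)d\n")

    # Same composition as __init__: preamble first, then the filled template.
    return preamble + (toy_template % kernel_params)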
def dot(self,h_b,out=None):
    assert h_b.shape == self._b_shape
    assert h_b.dtype == np.float32
    assert gpu_algorithms._get_matrix_order(h_b) == self._b_order

    local_size = (self._block_size,self._block_size)

    if out is not None:
        assert gpu_algorithms._get_matrix_order(out) == self._c_order
        assert out.dtype == np.float32

    if(out is None or not isinstance(out,cl_array.Array)):
        out_tmp = cl_array.empty(self._queue,
                                 (self._kernel_params["w_a"],h_b.shape[1]),
                                 order=self._c_order,
                                 dtype=np.float32)
    else:
        out_tmp = out

    if(not isinstance(h_b,cl_array.Array)):
        h_b_tmp = cl_array.to_device(self._queue,h_b)
    else:
        h_b_tmp = h_b

    global_size = out_tmp.shape[::-1]
    global_size = map(min,global_size,self.max_batch_size)
    global_size = map(opencl_tools.pad_overshoot,global_size,local_size)

    for x in xrange(0,out_tmp.shape[1],global_size[0]):
        for y in xrange(0,out_tmp.shape[0],global_size[1]):
            self.kernel(self._queue,global_size,local_size,
                        out_tmp.data,self.mat.data,h_b_tmp.data,x,y)#.wait()
            cl.enqueue_barrier(self._queue)

    if(out is None):
        return out_tmp
    else:
        if(not isinstance(out,cl_array.Array)):
            cl.enqueue_copy(self._queue,out,out_tmp.data)
        return out
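# Illustrative sketch (not part of the class): how dot() sizes each launch and
# walks (x, y) offsets so every part of the output is covered.  The per-launch
# global size is clamped to max_batch_size and padded up to a multiple of the
# work-group size; pad_to_multiple below is only a stand-in for the assumed
# behavior of opencl_tools.pad_overshoot, which is defined elsewhere.
def _example_batched_offsets(out_shape=(5000, 3000), block_size=16,
                             max_batch_size=(2048, 2048)):
    def pad_to_multiple(size, block):
        overshoot = size % block
        return size if overshoot == 0 else size + (block - overshoot)

    # Mirror dot(): reverse the output shape, clamp, then pad.
    global_size = [min(s, m) for s, m in zip(out_shape[::-1], max_batch_size)]
    global_size = [pad_to_multiple(s, block_size) for s in global_size]

    # Each (x, y) pair is one kernel launch covering one batch of the output.
    offsets = []
    for x in xrange(0, out_shape[1], global_size[0]):
        for y in xrange(0, out_shape[0], global_size[1]):
            offsets.append((x, y))
    return global_size, offsets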
def mul(self,other,ret_stor):
    ret_shape = ret_stor.shape
    float32_size = int(np.float32(0).nbytes)

    local_size = self.local_size
    rows_per_run = min(self.rows_per_run,ret_shape[0])
    cols_per_run = min(self.cols_per_run,ret_shape[1])

    overshoot = rows_per_run % local_size[1]
    rows_per_run = rows_per_run + (local_size[1]-overshoot)

    overshoot = cols_per_run % local_size[2]
    cols_per_run = cols_per_run + (local_size[2]-overshoot)

    global_size = (local_size[0],rows_per_run,cols_per_run)

    order_of_mats = (gpu_algorithms._get_matrix_order(other),
                     gpu_algorithms._get_matrix_order(ret_stor))

    # We need to recompile the kernel if the size or storage order changed
    if(self.shape != self._old_shape or self._expected_order != order_of_mats):
        # Note that we will change back
        self._expected_order = order_of_mats
        self._old_shape = self.shape
        self._rebuild() # and rebuild

    for row in xrange(0,ret_shape[0],rows_per_run):
        for col in xrange(0,ret_shape[1],cols_per_run):
            self._spmv_csr_vector_kernel(self._queue,global_size,local_size,
                    self.indptr.data,   # __global const uint * ptr,
                    self.indices.data,  # __global const uint * indices,
                    self.data.data,     # __global const float * data,
                    other.data,         # __global const float * in,
                    ret_stor.data,      # __global const float * out,
                    row,                # const uint row_offset,
                    col,                # const uint col_offset,
                    ret_shape[1],       # const uint max_cols
                    cl.LocalMemory(float32_size*local_size[0]*local_size[1]))
            cl.enqueue_barrier(self._queue)
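# Illustrative sketch (not part of the class): a CPU reference for what each
# kernel launch in mul() is expected to compute -- one
# (rows_per_run x cols_per_run) tile of the CSR-matrix-times-dense product,
# written at offset (row, col) in the result.  This is an assumption about the
# kernel's semantics drawn from its row_offset/col_offset arguments; a_csr is
# assumed to be a scipy.sparse.csr_matrix and dense a float32 ndarray, and
# np is the numpy module already imported at the top of this file.
def _example_tiled_csr_matmul(a_csr, dense, rows_per_run, cols_per_run):
    out = np.zeros((a_csr.shape[0], dense.shape[1]), dtype=np.float32)
    for row in xrange(0, out.shape[0], rows_per_run):
        for col in xrange(0, out.shape[1], cols_per_run):
            row_end = min(row + rows_per_run, out.shape[0])
            col_end = min(col + cols_per_run, out.shape[1])
            # One GPU launch corresponds to filling one such output tile.
            out[row:row_end, col:col_end] = \
                a_csr[row:row_end, :].dot(dense[:, col:col_end])
    return out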