def __init__(self,image_shape,filter_array,boundry='clamp',queue=None):
    """Compile the 2-D blur kernel for a fixed image shape and filter.

    image_shape  -- tuple whose entries fill the ROWS, COLS and
                    NUMBER_COLORS macros (presumably a 3-tuple; confirm
                    against callers).
    filter_array -- filter coefficients; converted to float32 and uploaded
                    to the device unless it already is a cl_array.Array.
    boundry      -- 'clamp' or 'zero'; anything else raises ValueError.
    queue        -- command queue to run on; a new profiling queue is
                    created when None.
    """
    if queue is not None:
        self._queue = queue
    else:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)

    filter_array = filter_array.astype(np.float32)
    fshape = filter_array.shape

    self._image_shape = image_shape
    self._filter_size = fshape
    self._kernal_size = (4, 21)

    kernal_rows, kernal_cols = self._kernal_size
    half_sizes = (fshape[0]/2, fshape[1]/2)
    # How many kernel-sized tiles are needed to cover twice the filter
    # extent along each axis.
    ratios = (int(np.ceil(2*fshape[0]/float(kernal_rows))),
              int(np.ceil(2*fshape[1]/float(kernal_cols))))

    macro_values = (image_shape + fshape + self._kernal_size +
                    half_sizes + ratios)
    header = """
#define ROWS %d
#define COLS %d
#define NUMBER_COLORS %d
#define FILTER_HEIGHT %d
#define FILTER_WIDTH %d
#define KERNAL_HEIGHT %d
#define KERNAL_WIDTH %d
#define half_fheight %d
#define half_fwidth %d
#define FILTER_2_TO_KERNAL_HEIGHT %d
#define FILTER_2_TO_KERNAL_WIDTH %d
""" % macro_values

    # NOTE(review): 'zero' emits the very same define as 'clamp', so the
    # two modes are indistinguishable to the kernel. This looks like a
    # copy/paste slip -- confirm against the kernel source before changing.
    if boundry in ('clamp', 'zero'):
        header += "#define BOUNDRY_CLAMP\n"
    else:
        raise ValueError('Unknown boundry value: %s' % boundry)

    # Upload the filter unless the caller already handed us a device array.
    if isinstance(filter_array, cl_array.Array):
        self._filter = filter_array
    else:
        self._filter = cl_array.to_device(self._queue, filter_array)

    header += gpu_algorithms.add_matrix_axses_for('filter', self._filter)
    header = opencl_tools.build_preamble_for_context(self._queue.context,
                                                     header)

    program = cl.Program(self._queue.context, header + _blur_kernal)
    built = program.build(options=[])
    self._blur_kernal = built.blur_kernal
def __init__(self, points, point_offset=None):
    """Create a context/queue and upload the points to the device.

    points       -- array; converted to float32 and copied to the device.
    point_offset -- optional array; converted to float32 and copied to the
                    device. When omitted, a zero-filled float32 device
                    array with points.shape[0] entries is used instead.
    """
    self._ctx = opencl_tools.get_a_context()
    self._queue = cl.CommandQueue(self._ctx,
                                  properties=opencl_tools.profile_properties)

    self._gpu_points = cl_array.to_device(self._queue,
                                          points.astype(np.float32))

    # Identity test: `!= None` on a NumPy array is an element-wise
    # comparison, whose truth value is ambiguous for multi-element arrays.
    if point_offset is not None:
        self._point_offset = cl_array.to_device(
            self._queue, point_offset.astype(np.float32))
    else:
        self._point_offset = cl_array.zeros(self._queue, points.shape[0],
                                            np.float32)
def __init__(self,a,b_shape,b_order='C',c_order='C',queue=None):
    """Compile the matrix-multiply kernel for a fixed left-hand matrix.

    a        -- float32 left-hand matrix (host array or cl_array.Array);
                uploaded to the device when it is not already there.
    b_shape  -- shape of the right-hand matrix; b_shape[0] must equal
                a.shape[1].
    b_order, c_order -- storage order of B and of the result, passed
                through gpu_algorithms._get_matrix_order.
    queue    -- command queue to run on; a new profiling queue is created
                when None.
    """
    assert a.shape[1] == b_shape[0]
    assert a.dtype == np.float32

    if queue is not None:
        self._queue = queue
    else:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)

    a_rows, a_cols = a.shape[0], a.shape[1]

    # Default tile edge is 16, shrunk when any matrix dimension is smaller.
    self._block_size = min(16, a_rows, a_cols, b_shape[0], b_shape[1])
    self.shape = a.shape
    self._b_shape = b_shape

    self._kernel_params = {"block_size": self._block_size,
                           "w_a": a_cols,
                           "h_a": a_rows,
                           "w_b": b_shape[1]}

    self._a_order = gpu_algorithms._get_matrix_order(a)
    self._b_order = gpu_algorithms._get_matrix_order(b_order)
    self._c_order = gpu_algorithms._get_matrix_order(c_order)

    # One define per operand that is stored in C (row-major) order.
    order_defines = ((self._a_order, "#define A_ROW_MAJOR_ORDER\n"),
                     (self._b_order, "#define B_ROW_MAJOR_ORDER\n"),
                     (self._c_order, "#define C_ROW_MAJOR_ORDER\n"))
    header = "".join(define for order, define in order_defines
                     if order == 'C')

    source = header + (KERNEL_CODE % self._kernel_params)
    program = cl.Program(self._queue.context, source).build()

    self.kernel = program.matrixMul
    self.kernel.set_scalar_arg_dtypes([
        None,       # __global float* C
        None,       # __global float* A
        None,       # __global float* B
        np.uint32,  # uint x_offset
        np.uint32,  # uint y_offset
    ])

    # Keep the matrix on the GPU, uploading it if it is not there already.
    if not isinstance(a, cl_array.Array):
        self.mat = cl_array.to_device(self._queue, a)
    else:
        self.mat = a

    self.max_batch_size = (2048, 2048)
def test():
    """Benchmark SumProductKernal against NumPy and print timings/errors.

    For each axis the reference value is np.sum(a*b, axis); the GPU result
    is then computed for both 'F' and 'C' ordered inputs and compared with
    it. Nothing is returned or asserted -- results are only printed.
    """
    import time
    queue = cl.CommandQueue(opencl_tools.get_a_context(),properties=opencl_tools.profile_properties)

    #test_sum
    cpu_array1 = np.random.rand(1000,1000).astype(np.float32)
    cpu_array2 = np.random.rand(1000,1000).astype(np.float32)

    # Reference results (and NumPy timings), one entry per axis.
    true_val = []
    for axs in xrange(2):
        t0 =time.time()
        true_val.append(np.sum(cpu_array1*cpu_array2,axis = axs))
        t1 =time.time()
        print "Their time for axis ",axs,":", t1-t0

    for in_order in ['F','C']:
        for axs in xrange(2):
            print "Testing: In order",in_order, "with axis",axs

            # t1..t4 bracket: upload + construction, kernel run, download.
            t1 =time.time()
            gpu_array1 = cl_array.to_device(queue, np.asarray(cpu_array1, order=in_order))
            gpu_array2 = cl_array.to_device(queue, np.asarray(cpu_array2, order=in_order))
            test_sum=SumProductKernal(gpu_array1,axis = axs,queue=queue)
            t2 =time.time()
            gpu_val = test_sum(None,gpu_array2,gpu_array1)
            # Wait for the queued work so that t3 measures the finished kernel.
            cl.enqueue_barrier(queue).wait()
            t3 =time.time()
            sum_val=gpu_val.get()
            t4 = time.time()

            print "\tOur time:", t3-t2
            print "\tOur time with transfers:", t4-t1

            # Relative error against the NumPy reference.
            err = true_val[axs]-sum_val
            print "\tMax error:",np.max(np.abs(err)/true_val[axs])

            # Free the device buffers explicitly before the next iteration.
            gpu_array1.data.release(); del gpu_array1
            gpu_array2.data.release(); del gpu_array2
            gpu_val.data.release(); del gpu_val
def test():
    """Benchmark SumKernal against np.sum and print timings/errors.

    Every combination of input order ('F'/'C') and axis is run twice: once
    with a kernel built from the device array itself ('size at
    construction') and once with a kernel built from the dtype only
    ('size at run'). Nothing is returned or asserted -- results are only
    printed.
    """
    import time
    queue = cl.CommandQueue(opencl_tools.get_a_context(),properties=opencl_tools.profile_properties)

    #test_sum
    # NOTE: 37730 x 6425 float32 values, i.e. roughly 1 GB of host memory.
    cpu_array = np.random.rand(7546*5,6425).astype(np.float32)

    # Reference results (and NumPy timings), one entry per axis.
    true_val = []
    for axs in xrange(2):
        t0 =time.time()
        true_val.append(np.sum(cpu_array,axis = axs))
        t1 =time.time()
        print "Their time for axis ",axs,":", t1-t0

    for in_order in ['F','C']:
        for axs in xrange(2):
            # Kernel built from the dtype alone; reused for the
            # 'size at run' case below.
            precompute_test_sum = SumKernal(np.float32,axis = axs,queue=queue)
            for size_type in ['size at construction','size at run']:
                print "Testing: In order",in_order, "with axis",axs, 'and', size_type

                # t1..t4 bracket: upload + construction, kernel run, download.
                t1 =time.time()
                gpu_array = cl_array.to_device(queue, np.asarray(cpu_array, order=in_order))
                if size_type == 'size at construction' :
                    test_sum = SumKernal(gpu_array,axis = axs,queue=queue)
                else:
                    test_sum = precompute_test_sum
                t2 =time.time()
                gpu_val = test_sum(matrix=gpu_array)
                # Wait for the queued work so that t3 measures the finished kernel.
                cl.enqueue_barrier(queue).wait()
                t3 =time.time()
                sum_val=gpu_val.get()
                t4 = time.time()

                print "\tOur time:", t3-t2
                print "\tOur time with transfers\\construction:", t4-t1

                # Relative error against the NumPy reference.
                err = true_val[axs]-sum_val
                print "\tMax error:",np.max(np.abs(err)/true_val[axs])

                # Free the device buffers explicitly before the next iteration.
                gpu_array.data.release(); del gpu_array
                gpu_val.data.release(); del gpu_val
def __init__(self,matrix,axis,queue=None):
    """Compile the sum-product kernel for one matrix shape and axis.

    matrix -- matrix whose shape and storage order configure the kernel.
    axis   -- axis to sum over; must be 0 or 1.
    queue  -- command queue to run on; a new profiling queue is created
              when None.
    """
    assert 0 <= axis <= 1

    if queue is not None:
        self._queue = queue
    else:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)

    self._block_size = 32
    self._matrix = matrix
    self._axis = axis
    self._matrix_order = _get_matrix_order(matrix)

    # For C (row-major) storage the summed axis is seen the other way
    # round, so flip it.
    self._effective_axis = (1 - axis) if self._matrix_order == 'C' else axis

    defines = """
#define sum_size %d
#define other_size %d
#define blockSize %d
""" % (matrix.shape[axis], matrix.shape[1-axis], self._block_size)

    header = opencl_tools.build_preamble_for_context(self._queue.context,
                                                     defines)
    if self._effective_axis == 0:
        header += "#define SUM_OVER_SLOW_CHANGING\n"

    program = cl.Program(self._queue.context, header + _sum_product_code)
    self.kernel = program.build().sum_product_per_axis
assert mat.size == 1 assert mat.dtype == self.mat_struct mat_view = np.empty((self.n,self.vector_padded),self.element_type) mat_view.data[:] = mat.data[:] #mat_view = mat.view(self.element_type) #mat_view = mat_view.reshape((self.n,self.vector_padded)) return mat_view[:self.n,:self.n] # if __name__ == "__main__": queue = cl.CommandQueue(opencl_tools.get_a_context(), properties=opencl_tools.profile_properties) test_code = Template(""" __kernel void test_kernal(${matrix_type} mat_in, ${vector_type} v_in,__global ${vector_type}* v_out) { *v_out = mat_dot(mat_in,v_in); } __kernel void test_kernal2(${matrix_type} mat1_in, ${matrix_type} mat2_in,__global ${matrix_type}* m_out) { *m_out = mat_mul(mat1_in,mat2_in); }
def __init__(self,matrix,axis,queue=None):
    """Compile the per-axis sum kernel.

    matrix -- either a dtype, in which case the kernel is built with
              SIZE_AS_ARGUMENT and three int32 scalar kernel arguments,
              or a matrix whose shape, dtype and storage order are baked
              into the kernel.
    axis   -- axis to sum over; must be 0 or 1.
    queue  -- command queue to run on; a new profiling queue is created
              when None.
    """
    assert 0 <= axis <= 1

    if queue is not None:
        self._queue = queue
    else:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)

    self._block_size = 32
    self._axis = axis

    context = self._queue.context

    # NOTE(review): this test is what separates "a dtype was passed" from
    # "a matrix was passed"; its result for array arguments depends on the
    # NumPy version -- confirm.
    if np.issubdtype(matrix,type):
        # Only the element type is known now; sizes arrive at call time.
        self._sizes_given_as_arguments = True
        self._dtype = matrix
        self._ctype = cl.tools.dtype_to_ctype(self._dtype)

        defines = """
#define blockSize %d
#define DTYPE %s
""" % (self._block_size, self._ctype)
        header = opencl_tools.build_preamble_for_context(context, defines)
        header += "#define SIZE_AS_ARGUMENT\n"
    else:
        self._sizes_given_as_arguments = False
        self._matrix = matrix
        self._matrix_order = _get_matrix_order(matrix)

        # For C (row-major) storage the summed axis is seen the other way
        # round, so flip it.
        if self._matrix_order != 'C':
            self._effective_axis = axis
        else:
            self._effective_axis = 1 - axis

        self._dtype = matrix.dtype
        self._ctype = cl.tools.dtype_to_ctype(self._dtype)

        defines = """
#define sum_size %d
#define other_size %d
#define blockSize %d
#define DTYPE %s
""" % (matrix.shape[axis], matrix.shape[1-axis], self._block_size,
       self._ctype)
        header = opencl_tools.build_preamble_for_context(context, defines)
        if self._effective_axis == 0:
            header += "#define SUM_OVER_SLOW_CHANGING\n"

    program = cl.Program(self._queue.context, header + _sum_code).build()
    self.kernel = program.sum_per_axis

    if self._sizes_given_as_arguments:
        scalar_types = [None, None, np.int32, np.int32, np.int32]
        self.kernel.set_scalar_arg_dtypes(scalar_types)
def matrix_to_np_array(self, mat): assert mat.size == 1 assert mat.dtype == self.mat_struct mat_view = np.empty((self.n, self.vector_padded), self.element_type) mat_view.data[:] = mat.data[:] #mat_view = mat.view(self.element_type) #mat_view = mat_view.reshape((self.n,self.vector_padded)) return mat_view[:self.n, :self.n] # if __name__ == "__main__": queue = cl.CommandQueue(opencl_tools.get_a_context(), properties=opencl_tools.profile_properties) test_code = Template(""" __kernel void test_kernal(${matrix_type} mat_in, ${vector_type} v_in,__global ${vector_type}* v_out) { *v_out = mat_dot(mat_in,v_in); } __kernel void test_kernal2(${matrix_type} mat1_in, ${matrix_type} mat2_in,__global ${matrix_type}* m_out) { *m_out = mat_mul(mat1_in,mat2_in); }
def __init__(self,image_shape,row_filter_array, col_filter_array,boundry='clamp',queue=None):
    """Compile the separable-filter kernel for a fixed image shape.

    image_shape      -- tuple whose entries fill the ROWS, COLS and
                        NUMBER_COLORS macros (presumably a 3-tuple;
                        confirm against callers).
    row_filter_array -- filter of shape (n, 1).
    col_filter_array -- filter of shape (1, m).
    boundry          -- 'clamp' or 'zero'; anything else raises ValueError.
    queue            -- command queue to run on; a new profiling queue is
                        created when None.

    Both filters are converted to float32 and uploaded to the device
    unless they already are cl_array.Array instances.
    """
    assert row_filter_array.shape[1] == 1
    assert col_filter_array.shape[0] == 1

    if queue is not None:
        self._queue = queue
    else:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)

    row_filter_array = row_filter_array.astype(np.float32)
    col_filter_array = col_filter_array.astype(np.float32)

    self._image_shape = image_shape
    self._filter_size = (row_filter_array.shape[0],
                         col_filter_array.shape[1])
    self._kernal_size = (1, 256)

    filter_height, filter_width = self._filter_size
    half_sizes = (filter_height/2, filter_width/2)
    # Local buffer spans one kernel row plus the filter width.
    local_width = (self._kernal_size[1] + filter_width,)

    macro_values = (image_shape + self._filter_size + self._kernal_size +
                    half_sizes + local_width)
    header = """
#define ROWS %d
#define COLS %d
#define NUMBER_COLORS %d
#define FILTER_HEIGHT %d
#define FILTER_WIDTH %d
#define KERNAL_HEIGHT %d
#define KERNAL_WIDTH %d
#define half_fheight %d
#define half_fwidth %d
#define local_buffer_width %d
""" % macro_values

    # NOTE(review): 'zero' emits the very same define as 'clamp', so the
    # two modes are indistinguishable to the kernel. This looks like a
    # copy/paste slip -- confirm against the kernel source before changing.
    if boundry in ('clamp', 'zero'):
        header += "#define BOUNDRY_CLAMP\n"
    else:
        raise ValueError('Unknown boundry value: %s' % boundry)

    # Upload each filter unless it is already a device array.
    if isinstance(row_filter_array, cl_array.Array):
        self._row_filter_array = row_filter_array
    else:
        self._row_filter_array = cl_array.to_device(self._queue,
                                                    row_filter_array)

    if isinstance(col_filter_array, cl_array.Array):
        self._col_filter_array = col_filter_array
    else:
        self._col_filter_array = cl_array.to_device(self._queue,
                                                    col_filter_array)

    header = opencl_tools.build_preamble_for_context(self._queue.context,
                                                     header)

    program = cl.Program(self._queue.context,
                         header + _separable_filter_kernal_float4)
    built = program.build(options=[])
    self._separable_filter_kernal = built.separable_filter_kernal
def __init__(self, arg1, shape=None, dtype=None, copy=False, queue=None):
    """Build the GPU sparse matrix from one of several argument forms.

    Originally taken from scipy's implementation.

    arg1  -- one of:
               * an instance of this class or a scipy sparse matrix,
               * a shape tuple (M, N), giving an empty matrix,
               * a (data, ij) tuple, routed through scipy's coo_matrix,
               * a (data, indices, indptr) tuple,
               * anything np.asarray accepts (treated as dense).
    shape -- optional explicit shape; overrides any shape set from arg1.
    dtype -- optional element type; dense input defaults to float32.
    copy  -- forwarded to _set_self / _make_gpu_array.
    queue -- command queue to run on; a new profiling queue is created
             when None.

    Raises ValueError for unrecognised constructor usage or when the
    shape can neither be read nor inferred.
    """
    if queue is None:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)
    else:
        self._queue = queue
    self.size = -1

    # Default running sizes
    self.rows_per_run = 1000
    self.cols_per_run = 100
    self.local_size = (16, 16, 1)

    if isinstance(arg1, self.__class__) or sp.isspmatrix(arg1):
        self._set_self(arg1, copy)
    elif isinstance(arg1, tuple):
        if sp.sputils.isshape(arg1):
            # It's a tuple of matrix dimensions (M, N): create an empty
            # matrix.
            self.shape = arg1   # spmatrix checks for errors here
            M, N = self.shape
            self.data = cl_array.zeros(
                self._queue, 0,
                sp.sputils.getdtype(dtype, default=np.float32))
            self.indices = cl_array.zeros(self._queue, 0, np.intc)
            self.indptr = cl_array.zeros(self._queue,
                                         self._swap((M, N))[0] + 1,
                                         dtype=np.intc)
        elif len(arg1) == 2:
            # (data, ij) format
            other = sp.coo_matrix(arg1, shape=shape)
            self._set_self(other)
        elif len(arg1) == 3:
            # (data, indices, indptr) format
            (data, indices, indptr) = arg1
            self.indices = self._make_gpu_array(indices, copy=copy)
            self.indptr = self._make_gpu_array(indptr, copy=copy)
            self.data = self._make_gpu_array(
                data, copy=copy, dtype=sp.sputils.getdtype(dtype, data))
        else:
            raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)
    else:
        # must be dense
        try:
            arg1 = np.asarray(arg1)
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must not be
            # turned into a ValueError.
            raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)
        # Identity test: on older NumPy a dtype equal to the default
        # (float64) compares equal to None and would be silently
        # replaced by float32.
        if dtype is None:
            dtype = np.float32
        self._set_self(sp.coo_matrix(arg1, dtype=dtype))

    # Read matrix dimensions given, if any
    if shape is not None:
        self.shape = shape   # spmatrix will check for errors
    elif getattr(self, 'shape', None) is None:
        # Shape not already set (or the attribute was never created by
        # the branch above): try to infer the dimensions.
        try:
            major_dim = len(self.indptr) - 1
            minor_dim = self.indices.max() + 1
        except Exception:
            raise ValueError("unable to infer matrix dimensions")
        else:
            self.shape = self._swap((major_dim, minor_dim))

    if dtype is not None and self.data.dtype != dtype:
        # Convert, then free the superseded device buffer explicitly.
        old_data = self.data
        self.data = self.data.astype(dtype)
        old_data.data.release()

    # self.check_format(full_check=False)

    self._old_shape = (-1, -1)
    self._expected_order = ("", "")
    self.dtype = self.data.dtype
    self._kernal_table = dict()
def __init__(self, arg1, shape=None, dtype=None, copy=False,queue=None):
    """Build the GPU sparse matrix from one of several argument forms.

    Originally taken from scipy's implementation.

    arg1  -- one of:
               * an instance of this class or a scipy sparse matrix,
               * a shape tuple (M, N), giving an empty matrix,
               * a (data, ij) tuple, routed through scipy's coo_matrix,
               * a (data, indices, indptr) tuple,
               * anything np.asarray accepts (treated as dense).
    shape -- optional explicit shape; overrides any shape set from arg1.
    dtype -- optional element type; dense input defaults to float32.
    copy  -- forwarded to _set_self / _make_gpu_array.
    queue -- command queue to run on; a new profiling queue is created
             when None.

    Raises ValueError for unrecognised constructor usage or when the
    shape can neither be read nor inferred.
    """
    if queue is None:
        self._queue = cl.CommandQueue(opencl_tools.get_a_context(),
                                      properties=opencl_tools.profile_properties)
    else:
        self._queue = queue
    self.size = -1

    #Default running sizes
    self.rows_per_run = 1000
    self.cols_per_run = 100
    self.local_size = (16,16,1)

    if isinstance(arg1,self.__class__) or sp.isspmatrix(arg1):
        self._set_self( arg1,copy )
    elif isinstance(arg1, tuple):
        if sp.sputils.isshape(arg1):
            # It's a tuple of matrix dimensions (M, N): create an empty
            # matrix.
            self.shape = arg1   #spmatrix checks for errors here
            M, N = self.shape
            self.data = cl_array.zeros(
                self._queue,0,
                sp.sputils.getdtype(dtype, default=np.float32))
            self.indices = cl_array.zeros(self._queue,0, np.intc)
            self.indptr = cl_array.zeros(self._queue,
                                         self._swap((M,N))[0] + 1,
                                         dtype=np.intc)
        elif len(arg1) == 2:
            # (data, ij) format
            other = sp.coo_matrix(arg1, shape=shape)
            self._set_self( other )
        elif len(arg1) == 3:
            # (data, indices, indptr) format
            (data, indices, indptr) = arg1
            self.indices = self._make_gpu_array(indices, copy=copy)
            self.indptr = self._make_gpu_array(indptr, copy=copy)
            self.data = self._make_gpu_array(
                data, copy=copy, dtype=sp.sputils.getdtype(dtype, data))
        else:
            raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)
    else:
        #must be dense
        try:
            arg1 = np.asarray(arg1)
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must not be
            # turned into a ValueError.
            raise ValueError("unrecognized %s_matrix constructor usage" %
                             self.format)
        # Identity test: on older NumPy a dtype equal to the default
        # (float64) compares equal to None and would be silently
        # replaced by float32.
        if dtype is None:
            dtype = np.float32
        self._set_self(sp.coo_matrix(arg1, dtype=dtype) )

    # Read matrix dimensions given, if any
    if shape is not None:
        self.shape = shape   # spmatrix will check for errors
    elif getattr(self, 'shape', None) is None:
        # Shape not already set (or the attribute was never created by
        # the branch above): try to infer the dimensions.
        try:
            major_dim = len(self.indptr) - 1
            minor_dim = self.indices.max() + 1
        except Exception:
            raise ValueError('unable to infer matrix dimensions')
        else:
            self.shape = self._swap((major_dim,minor_dim))

    if dtype is not None and self.data.dtype != dtype:
        # Convert, then free the superseded device buffer explicitly.
        old_data = self.data
        self.data = self.data.astype(dtype)
        old_data.data.release()

    #self.check_format(full_check=False)

    self._old_shape = (-1,-1)
    self._expected_order = ("","")
    self.dtype = self.data.dtype
    self._kernal_table = dict()