def __init__(self,a,b_shape,b_order='C',c_order='C',queue=None):
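        """Prepare an OpenCL kernel computing the dense product A*B.

        a       -- the left matrix A, float32 (numpy array or pyopencl array)
        b_shape -- shape of the right matrix B
        b_order -- memory order expected of B
        c_order -- memory order used for the result C
        queue   -- optional pyopencl CommandQueue; when omitted one is created
                   from opencl_tools.get_a_context() with profiling enabled
        """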
        assert a.shape[1] == b_shape[0]
        assert a.dtype == np.float32
        
        if(queue is None):
            self._queue = cl.CommandQueue(opencl_tools.get_a_context(),properties=opencl_tools.profile_properties)
        else:
            self._queue =  queue
            
        self._block_size = 16

        self.shape = a.shape        
        
        if min(a.shape[0],a.shape[1],b_shape[0],b_shape[1]) < self._block_size:
            self._block_size = min(a.shape[0],a.shape[1],b_shape[0],b_shape[1])
            
        self._b_shape = b_shape
        self._kernel_params  = {"block_size": self._block_size,
                                "w_a":a.shape[1], 
                                "h_a":a.shape[0], 
                                "w_b":b_shape[1]}
        
                      
        self._a_order = gpu_algorithms._get_matrix_order(a)
        self._b_order = gpu_algorithms._get_matrix_order(b_order)
        self._c_order = gpu_algorithms._get_matrix_order(c_order)
        
        
        preamble = ""
        if(self._a_order == 'C'):
            preamble+= "#define A_ROW_MAJOR_ORDER\n"
        if(self._b_order == 'C'):
            preamble+= "#define B_ROW_MAJOR_ORDER\n"
        if(self._c_order == 'C'):
            preamble+= "#define C_ROW_MAJOR_ORDER\n"
            
        full_kernel = preamble + (KERNEL_CODE % self._kernel_params)
        prg = cl.Program(self._queue.context, full_kernel).build()
                         
        self.kernel = prg.matrixMul
        self.kernel.set_scalar_arg_dtypes([ 
                                            None,#__global float* C, 
                                            None,#__global float* A, 
                                            None,#__global float* B,
                                            np.uint32,#uint x_offset,
                                            np.uint32,#uint y_offset
                                          ])
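        #In set_scalar_arg_dtypes, None marks the raw buffer arguments (C, A, B),
        #while the np.uint32 entries describe the scalar x/y tile offsets.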
        #Transfer the matrix to the GPU, if it is not there already
        if isinstance(a,cl_array.Array):
            self.mat = a
        else:
            self.mat = cl_array.to_device(self._queue,a)
        
        self.max_batch_size = (2048,2048)
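    # Rough usage sketch (illustrative; "MatrixMultiply" is a stand-in name,
    # the real class name is not shown in this snippet):
    #   mm = MatrixMultiply(a_host, b_host.shape)  # a_host: float32 ndarray
    #   c_dev = mm.dot(b_host)                     # pyopencl array holding A*B
    #   c_host = c_dev.get()                       # copy the result to the host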
    def dot(self,h_b,out=None):
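        """Multiply the stored matrix A by h_b on the GPU.

        h_b -- right-hand matrix of shape self._b_shape (numpy or pyopencl array)
        out -- optional preallocated result matrix; when given it is filled and
               returned, otherwise a new pyopencl array is allocated and returned
        """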
        assert h_b.shape == self._b_shape
        assert h_b.dtype == np.float32
        assert gpu_algorithms._get_matrix_order(h_b) == self._b_order
        
        local_size = (self._block_size,self._block_size)
        if out is not None:
            assert gpu_algorithms._get_matrix_order(out) == self._c_order
            assert out.dtype == np.float32

        if(out is None or not isinstance(out,cl_array.Array) ):
            #The product A*B has h_a rows and w_b (= h_b.shape[1]) columns
            out_tmp = cl_array.empty(self._queue,
                                 (self._kernel_params["h_a"],h_b.shape[1]),
                                  order = self._c_order,
                                  dtype=np.float32)
        else:
            out_tmp = out
            

        if(not isinstance(h_b,cl_array.Array) ):
            h_b_tmp = cl_array.to_device(self._queue,h_b)
        else:
            h_b_tmp = h_b
        
        global_size = out_tmp.shape[::-1]  
        global_size = map(min,global_size,self.max_batch_size)
        global_size = map(opencl_tools.pad_overshoot,global_size,local_size)
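        #The output is covered in tiles of at most max_batch_size: each launch
        #writes one tile whose top-left corner is at column x, row y. For
        #example, a (5000, 3000) result needs three row tiles by two column tiles.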
        


        for x in xrange(0,out_tmp.shape[1],global_size[0]):
            for y in xrange(0,out_tmp.shape[0],global_size[1]):
                self.kernel(self._queue,global_size,local_size,
                                out_tmp.data,self.mat.data,h_b_tmp.data,x,y)#.wait();
        cl.enqueue_barrier(self._queue)
        if(out is None):
            return out_tmp
        else:
            if(not isinstance(out,cl_array.Array) ):
                cl.enqueue_copy(self._queue,out,out_tmp.data)
                
        return out
    def mul(self,other,ret_stor):
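        #Sparse (CSR) times dense product. This method uses attributes that are
        #set up elsewhere in the class (not shown in this snippet): self.local_size,
        #self.rows_per_run, self.cols_per_run, the CSR buffers self.indptr,
        #self.indices and self.data, and the compiled self._spmv_csr_vector_kernel
        #(recreated by self._rebuild when the shape or memory orders change).
        #Each launch covers a rows_per_run x cols_per_run tile of ret_stor
        #starting at offset (row, col).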
        ret_shape = ret_stor.shape
        float32_size = int(np.float32(0).nbytes)

        local_size = self.local_size

        rows_per_run = min(self.rows_per_run,ret_shape[0])
        cols_per_run = min(self.cols_per_run,ret_shape[1])

        #Pad each run up to a multiple of the work group size
        overshoot = rows_per_run % local_size[1]
        if overshoot != 0:
            rows_per_run = rows_per_run + (local_size[1]-overshoot)

        overshoot = cols_per_run % local_size[2]
        if overshoot != 0:
            cols_per_run = cols_per_run + (local_size[2]-overshoot)

        global_size = (local_size[0],rows_per_run,cols_per_run)

        order_of_mats = (gpu_algorithms._get_matrix_order(other),gpu_algorithms._get_matrix_order(ret_stor))
        
        #We need to recompile the kernel if the shape or the memory order changed
        if(self.shape != self._old_shape or self._expected_order != order_of_mats):
            #Remember the new configuration so we only rebuild when it changes again
            self._expected_order = order_of_mats
            self._old_shape = self.shape
            self._rebuild() #and rebuild
            
        

        for row in xrange(0,ret_shape[0],rows_per_run):
            for col in xrange(0,ret_shape[1],cols_per_run):
                self._spmv_csr_vector_kernel(self._queue,global_size,local_size,
                                    self.indptr.data,  #__global const uint * ptr,
                                    self.indices.data, #__global const uint * indices,
                                    self.data.data,    # __global const float * data,
                                    other.data,        # __global const float * in,
                                    ret_stor.data,     # __global const float * out,
                                    row,               #const uint row_offset,
                                    col,               #const uint col_offset,
                                    ret_shape[1],#const uint max_cols
                                    cl.LocalMemory(float32_size*local_size[0]*local_size[1]))
        cl.enqueue_barrier(self._queue)