def __init__(self, source, b, a):
    """
    Set up a linear IIR filterbank from coefficient arrays *b* and *a*.

    Coefficients are stored Fortran-ordered (required by the weave
    filtering kernel); a mono source is automatically duplicated to
    match the number of filter channels ``b.shape[0]``.
    """
    target_channels = b.shape[0]
    if target_channels != source.nchannels:
        # Only a mono source can be broadcast to many filter channels.
        if source.nchannels != 1:
            raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
        source = RestructureFilterbank(source, target_channels)
    Filterbank.__init__(self, source)
    # Weave version of filtering requires Fortran ordering of filter params;
    # promote 2D coefficient arrays to a single-stage cascade (depth 1).
    if b.ndim == 2 and a.ndim == 2:
        b = reshape(b, b.shape + (1,))
        a = reshape(a, a.shape + (1,))
    self.filt_b = array(b, order='F')
    self.filt_a = array(a, order='F')
    nchannels, order, depth = b.shape
    # One state slot fewer than the filter order, per cascade stage.
    self.filt_state = zeros((nchannels, order - 1, depth), order='F')
def __init__(self, source, b, a):
    """
    Set up a linear IIR filterbank (weave-accelerated variant).

    Mono sources are duplicated to ``b.shape[0]`` channels; coefficients
    and state are stored Fortran-ordered for the weave kernel.  Weave
    compiler settings are read from the global Brian preferences.
    """
    wanted = b.shape[0]
    if wanted != source.nchannels:
        # Broadcasting is only defined for a single-channel source.
        if source.nchannels != 1:
            raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
        source = RestructureFilterbank(source, wanted)
    Filterbank.__init__(self, source)
    # Weave version of filtering requires Fortran ordering of filter params;
    # lift 2D coefficient arrays into a depth-1 cascade.
    if b.ndim == 2 and a.ndim == 2:
        b = reshape(b, b.shape + (1,))
        a = reshape(a, a.shape + (1,))
    self.filt_b = array(b, order='F')
    self.filt_a = array(a, order='F')
    nchannels, order, depth = b.shape
    self.filt_state = zeros((nchannels, order, depth), order='F')
    # Pick up weave configuration from the global preferences.
    self.use_weave = get_global_preference('useweave')
    if self.use_weave:
        log_info('brian.hears.filtering.linearfilterbank', 'Using weave')
        self.cpp_compiler = get_global_preference('weavecompiler')
        self.extra_compile_args = ['-O3']
        if self.cpp_compiler == 'gcc':
            self.extra_compile_args += get_global_preference('gcc_options')
def __init__(self, source, b, a):
    """
    Initialise the filterbank from IIR coefficients *b*, *a*.

    A mono source is expanded to ``b.shape[0]`` channels via
    RestructureFilterbank; any other channel mismatch is an error.
    Filter parameters are kept Fortran-ordered as the weave filtering
    code requires, and weave settings come from global preferences.
    """
    if b.shape[0] != source.nchannels:
        if source.nchannels == 1:
            # Duplicate the mono source across all filter channels.
            source = RestructureFilterbank(source, b.shape[0])
        else:
            raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
    Filterbank.__init__(self, source)
    # Weave version of filtering requires Fortran ordering of filter params.
    # A 2D (channels x order) coefficient pair becomes a cascade of depth 1.
    if len(b.shape) == 2 and len(a.shape) == 2:
        b = reshape(b, b.shape + (1,))
        a = reshape(a, a.shape + (1,))
    self.filt_b = array(b, order='F')
    self.filt_a = array(a, order='F')
    self.filt_state = zeros(b.shape, order='F')
    # Weave (inline C) configuration, taken from Brian's global preferences.
    self.use_weave = get_global_preference('useweave')
    if self.use_weave:
        log_info('brian.hears.filtering.linearfilterbank', 'Using weave')
        self.cpp_compiler = get_global_preference('weavecompiler')
        self.extra_compile_args = ['-O3']
        if self.cpp_compiler == 'gcc':
            self.extra_compile_args += get_global_preference('gcc_options')
def __init__(self, source, b, a, samplerate=None, precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
    """
    GPU (PyCUDA) linear filterbank initialiser.

    Builds and compiles a CUDA kernel that applies a cascade of IIR
    filters, one GPU thread per channel.

    source : input filterbank; a mono source is duplicated to
        ``b.shape[0]`` channels automatically.
    b, a : coefficient arrays of shape (n, m, p) — presumably
        (channels, filter order, cascade depth); TODO confirm against callers.
    samplerate : accepted but not used in this method — TODO confirm
        whether kept only for interface compatibility.
    precision : 'double' selects float64, anything else float32.
    forcesync, pagelocked_mem : GPU transfer options stored on self.
    unroll_filterorder : None = auto (unroll the inner loop when m <= 32).
    """
    # Automatically duplicate mono input to fit the desired output shape
    if b.shape[0]!=source.nchannels:
        if source.nchannels!=1:
            raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
        source = RestructureFilterbank(source, b.shape[0])
    Filterbank.__init__(self, source)
    # Lazily select a GPU device if no CUDA context exists yet.
    if pycuda.context is None:
        set_gpu_device(0)
    self.precision=precision
    if self.precision=='double':
        self.precision_dtype=float64
    else:
        self.precision_dtype=float32
    self.forcesync=forcesync
    self.pagelocked_mem=pagelocked_mem
    # n = channels, m = filter order, p = cascade depth.
    n, m, p=b.shape
    self.filt_b=b
    self.filt_a=a
    filt_b_gpu=array(b, dtype=self.precision_dtype)
    filt_a_gpu=array(a, dtype=self.precision_dtype)
    # IIR state: m-1 slots per channel per cascade stage.
    filt_state=zeros((n, m-1, p), dtype=self.precision_dtype)
    if pagelocked_mem:
        # Page-locked host buffers give faster host<->device transfers.
        # NOTE(review): filt_y is not stored on self in this view — presumably
        # used by code outside this chunk; verify before removing.
        filt_y=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
        self.pre_x=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
    else:
        filt_y=zeros(n, dtype=self.precision_dtype)
        self.pre_x=zeros(n, dtype=self.precision_dtype)
    self.filt_b_gpu=gpuarray.to_gpu(filt_b_gpu.T.flatten()) # transform to Fortran order for better GPU mem
    self.filt_a_gpu=gpuarray.to_gpu(filt_a_gpu.T.flatten()) # access speeds
    self.filt_state=gpuarray.to_gpu(filt_state.T.flatten())
    # NOTE(review): self keeps the caller's value (possibly None); the local
    # variable below is what actually controls code generation — confirm intended.
    self.unroll_filterorder = unroll_filterorder
    if unroll_filterorder is None:
        if m<=32:
            unroll_filterorder = True
        else:
            unroll_filterorder = False
    # TODO: improve code, check memory access patterns, maybe use local memory
    # CUDA source template; n/m/p and the cascade index k are substituted
    # textually below, so the macro names must stay whole words.
    code='''
    #define x(s,i) _x[(s)*n+(i)]
    #define y(s,i) _y[(s)*n+(i)]
    #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
    #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
    #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
    __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
    {
        int j = blockIdx.x * blockDim.x + threadIdx.x;
        if(j>=n) return;
        for(int s=0; s<numsamples; s++)
        {
    '''
    # Emit one Direct Form II Transposed update per cascade stage k.
    for k in range(p):
        loopcode='''
        y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
        '''
        if unroll_filterorder:
            # Fully unroll the state-update loop, substituting literal i.
            for i in range(m-2):
                loopcode+=re.sub('\\bi\\b', str(i), '''
                zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                ''')
        else:
            loopcode+='''
            for(int i=0;i<m-2;i++)
                zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
            '''
        loopcode+='''
        zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
        '''
        if k<p-1:
            # Output of this stage feeds the next stage's input.
            loopcode+='''
            x(s,j) = y(s,j);
            '''
        loopcode=re.sub('\\bk\\b', str(k), loopcode)
        code+=loopcode
    code+='''
        }
    }
    '''
    # Substitute the scalar type and the literal values of p, m, n.
    code=code.replace('SCALAR', self.precision)
    code=re.sub("\\bp\\b", str(p), code) #replace the variable by their values
    code=re.sub("\\bm\\b", str(m), code)
    code=re.sub("\\bn\\b", str(n), code)
    #print code
    self.gpu_mod=pycuda.compiler.SourceModule(code)
    self.gpu_filt_func=self.gpu_mod.get_function("filt")
    # One thread per channel; cap the block size at n for small filterbanks.
    blocksize=256
    if n<blocksize:
        blocksize=n
    # NOTE(review): '/' here is Python 2 integer division — under Python 3 this
    # would yield a float grid size; confirm the target interpreter.
    if n%blocksize==0:
        gridsize=n/blocksize
    else:
        gridsize=n/blocksize+1
    self.block=(blocksize, 1, 1)
    self.grid=(gridsize, 1)
    self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block)
    self._has_run_once=False
def __init__(self, source, b, a, samplerate=None, precision='double',
             forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
    """
    GPU (PyCUDA) linear filterbank initialiser (formatted duplicate of the
    version above in this file).

    Generates, compiles and prepares a CUDA kernel applying a cascade of
    IIR filters with one GPU thread per channel.

    source : input filterbank; mono sources are duplicated to
        ``b.shape[0]`` channels.
    b, a : coefficient arrays of shape (n, m, p) — presumably
        (channels, filter order, cascade depth); TODO confirm.
    samplerate : not used inside this method — TODO confirm it is only
        kept for signature compatibility.
    precision : 'double' selects float64, otherwise float32.
    forcesync, pagelocked_mem : stored GPU transfer options.
    unroll_filterorder : None = auto (unroll when m <= 32).
    """
    # Automatically duplicate mono input to fit the desired output shape
    if b.shape[0] != source.nchannels:
        if source.nchannels != 1:
            raise ValueError(
                'Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.'
            )
        source = RestructureFilterbank(source, b.shape[0])
    Filterbank.__init__(self, source)
    # Create a CUDA context on device 0 if none exists yet.
    if pycuda.context is None:
        set_gpu_device(0)
    self.precision = precision
    if self.precision == 'double':
        self.precision_dtype = float64
    else:
        self.precision_dtype = float32
    self.forcesync = forcesync
    self.pagelocked_mem = pagelocked_mem
    # n = channels, m = filter order, p = cascade depth.
    n, m, p = b.shape
    self.filt_b = b
    self.filt_a = a
    filt_b_gpu = array(b, dtype=self.precision_dtype)
    filt_a_gpu = array(a, dtype=self.precision_dtype)
    # Per-channel, per-stage IIR state (m-1 slots each).
    filt_state = zeros((n, m - 1, p), dtype=self.precision_dtype)
    if pagelocked_mem:
        # Page-locked host memory speeds up host<->device copies.
        # NOTE(review): filt_y is not assigned to self in this view —
        # presumably consumed elsewhere; verify before cleanup.
        filt_y = drv.pagelocked_zeros((n, ), dtype=self.precision_dtype)
        self.pre_x = drv.pagelocked_zeros((n, ), dtype=self.precision_dtype)
    else:
        filt_y = zeros(n, dtype=self.precision_dtype)
        self.pre_x = zeros(n, dtype=self.precision_dtype)
    self.filt_b_gpu = gpuarray.to_gpu(filt_b_gpu.T.flatten(
    )) # transform to Fortran order for better GPU mem
    self.filt_a_gpu = gpuarray.to_gpu(
        filt_a_gpu.T.flatten()) # access speeds
    self.filt_state = gpuarray.to_gpu(filt_state.T.flatten())
    # NOTE(review): the attribute keeps the caller's value (possibly None);
    # only the local below drives code generation — confirm this is intended.
    self.unroll_filterorder = unroll_filterorder
    if unroll_filterorder is None:
        if m <= 32:
            unroll_filterorder = True
        else:
            unroll_filterorder = False
    # TODO: improve code, check memory access patterns, maybe use local memory
    # CUDA template: n/m/p and stage index k are replaced textually below.
    code = '''
    #define x(s,i) _x[(s)*n+(i)]
    #define y(s,i) _y[(s)*n+(i)]
    #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
    #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
    #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
    __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
    {
        int j = blockIdx.x * blockDim.x + threadIdx.x;
        if(j>=n) return;
        for(int s=0; s<numsamples; s++)
        {
    '''
    # One Direct Form II Transposed update block per cascade stage k.
    for k in range(p):
        loopcode = '''
        y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
        '''
        if unroll_filterorder:
            # Unroll the state recurrence with literal indices.
            for i in range(m - 2):
                loopcode += re.sub(
                    '\\bi\\b', str(i), '''
                zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                ''')
        else:
            loopcode += '''
            for(int i=0;i<m-2;i++)
                zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
            '''
        loopcode += '''
        zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
        '''
        if k < p - 1:
            # Chain the stages: this stage's output is the next one's input.
            loopcode += '''
            x(s,j) = y(s,j);
            '''
        loopcode = re.sub('\\bk\\b', str(k), loopcode)
        code += loopcode
    code += '''
        }
    }
    '''
    # Inject the scalar type and the literal n, m, p values.
    code = code.replace('SCALAR', self.precision)
    code = re.sub("\\bp\\b", str(p), code) #replace the variable by their values
    code = re.sub("\\bm\\b", str(m), code)
    code = re.sub("\\bn\\b", str(n), code)
    #print code
    self.gpu_mod = pycuda.compiler.SourceModule(code)
    self.gpu_filt_func = self.gpu_mod.get_function("filt")
    # One thread per channel; shrink the block for small filterbanks.
    blocksize = 256
    if n < blocksize:
        blocksize = n
    # NOTE(review): '/' is Python 2 integer division here; under Python 3 the
    # grid size would be a float — confirm the target interpreter.
    if n % blocksize == 0:
        gridsize = n / blocksize
    else:
        gridsize = n / blocksize + 1
    self.block = (blocksize, 1, 1)
    self.grid = (gridsize, 1)
    self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32),
                               self.block)
    self._has_run_once = False