Python cublashandle Examples, utils.linalg.cublashandle Python Examples

Example #1

0

Show file

File: vrf.py Project: bionet/vtem

    def filter(self, V):
        """
        Filter a video V with the receptive fields configured on self.

        The receptive-field parameters must be set up before calling this
        method.  Filters are generated and applied in batches; the batch
        size is derived from the free GPU memory reported at call time and
        is stored in ``self.ONE_TIME_FILTERS``.

        Parameters
        ----------
        V : 3D ndarray, with shape (num_frames, Px, Py)
            The video to filter.  ``Px * Py`` is used as the flattened
            frame length uploaded to the GPU.

        Returns
        -------
        PitchArray
            ``d_output.T()``, where ``d_output`` is allocated with shape
            (num_neurons, num_frames) and filled one batch of filters at
            a time.
            NOTE(review): the original docstring describes the result as
            (num_neurons, num_frames) with one row per filter -- confirm
            the orientation after ``T()``.
        """
        d_output = parray.empty((self.num_neurons, V.shape[0]), self.dtype)
        # One flattened frame per row.
        d_video = parray.to_gpu(
            V.reshape(V.shape[0], V.shape[1] * V.shape[2]))

        free, _ = cuda.mem_get_info()
        # Floor division keeps the batch size an integer even when true
        # division is in effect (Python 3 or `from __future__ import
        # division`); a float here would leak into np.arange and into the
        # GEMM dimensions below.
        self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize) * 3 // 4
                                 // self.Pxall // self.Pyall)

        handle = la.cublashandle()
        for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
            # The last batch may be smaller than ONE_TIME_FILTERS.
            Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
            self.generate_visual_receptive_fields(startbias=i,
                                                  N_filters=Nfilters)
            # Byte address inside d_output where this batch starts:
            # `i` leading-dimension strides past the base pointer.
            batch_ptr = int(
                int(d_output.gpudata)
                + int(d_output.ld * i * d_output.dtype.itemsize))
            # alpha = dx*dy, beta = 0: the batch region is overwritten.
            cublasDgemm(handle.handle, 't', 'n',
                        V.shape[0], int(Nfilters), self.Pxall * self.Pyall,
                        self.dx * self.dy,
                        d_video.gpudata, d_video.ld,
                        self.filters.gpudata, self.filters.ld,
                        0, batch_ptr, d_output.ld)
        return d_output.T()

Example #2

0

Show file

File: vrf.py Project: neurokernel/retina

    def filter(self, video_input):
        """
        Apply all receptive-field filters to an input video.

        The input is flattened *in place* (``video_input.resize``) to
        shape (num_frames, self.size) before it is copied to the GPU, so
        the caller's array is modified.  Filters are generated and applied
        in batches; the batch size is derived from the free GPU memory and
        stored in ``self.ONE_TIME_FILTERS``.

        Parameters
        ----------
        video_input : ndarray
            Either 2D with ``shape[1] == self.size`` or, otherwise, an
            array whose ``shape[1] * shape[2]`` equals ``self.size``.
            The check is done with ``assert``.

        Returns
        -------
        The result of ``T()`` on the output array allocated with shape
        (num_neurons, num_frames).
        """
        input_shape = video_input.shape
        if len(input_shape) == 2:
            # frames are already flattened
            pixels_per_frame = input_shape[1]
        else:
            # frames still have two spatial dimensions
            pixels_per_frame = input_shape[1] * input_shape[2]
        assert pixels_per_frame == self.size

        # rasterize every frame into one row (modifies the caller's array)
        num_frames = input_shape[0]
        video_input.resize((num_frames, self.size))

        gpu_video = parray.to_gpu(video_input)
        gpu_result = parray.empty((self.num_neurons, num_frames), self.dtype)

        # Batch size: 3/4 of the free memory, in filters of self.size
        # elements, rounded down to an even count and capped at the
        # number of neurons.
        free_bytes, _ = cuda.mem_get_info()
        batch = (free_bytes // self.dtype.itemsize) * 3 // 4 // self.size
        batch -= batch % 2
        self.ONE_TIME_FILTERS = min(batch, self.num_neurons)

        blas = la.cublashandle()
        for first in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
            # the final batch may hold fewer filters
            count = min(self.ONE_TIME_FILTERS, self.num_neurons - first)
            self.generate_filters(startbias=first, N_filters=count)
            la.dot(self.filters, gpu_video, opb='t',
                   C=gpu_result[first: first + count],
                   handle=blas)

        # drop the last generated batch of filters
        del self.filters
        return gpu_result.T()

Example #3

0

Show file

File: vrf.py Project: bionet/vtem

 def compute_Dsw(self, d_Ds, Mx, My, h_norm):
     """
     Compute the weighting matrix of the "correlation" between each
     pair of RFs.

     A complex GEMM (conjugate-transposed first operand) of d_Ds with
     itself is taken, its real part is extracted, and the kernel
     returned by get_put_norm_kernel is launched on the result together
     with h_norm.

     Parameters
     ----------
     d_Ds : PitchArray
         containing dirichlet coefficients, most likely created by
         compute_Ds
     Mx : integer
         order in the x dimension
     My : integer
         order in the y dimension
     h_norm : ndarray
         converted to float64, copied to the GPU and handed to the
         normalization kernel

     Returns
     -------
     PitchArray with shape (num_neurons, num_neurons)
     """
     # double-precision complex gets Zgemm, anything else Cgemm
     gemm = cublasZgemm if self.dtype == np.complex128 else cublasCgemm

     n = self.num_neurons
     d_weight = parray.empty((n, n), self.dtype)

     handle = la.cublashandle()

     # inner dimension of the product: (2*Mx+1)*(2*My+1)
     n_coeffs = (2 * Mx + 1) * (2 * My + 1)
     gemm(handle.handle, 'c', 'n', n, n, n_coeffs,
          1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
          0, d_weight.gpudata, d_weight.ld)
     d_Dsw = d_weight.real()

     norm_func = get_put_norm_kernel(d_Dsw.dtype)
     d_norm = parray.to_gpu(h_norm.astype(np.float64))
     launch_kernel(norm_func, (256, 1, 1), (d_Dsw.shape[0], 1),
                   [d_Dsw, d_norm, d_Dsw.ld])

     return d_Dsw

Example #4

0

Show file

    def compute_Gb(self, Dsfilename, lamb=0.0):
        r"""
        Compute the G matrix using dirichlet coefficients.

        The ``real`` and ``imag`` nodes of the HDF5 file are read one
        after the other; the GEMM product of each with itself is
        accumulated into a single matrix (beta=0.0 for the real part,
        beta=1.0 for the imaginary part).  That matrix is passed through
        the kernel from get_put_norm_kernel and then used by the kernel
        from get_G_kernel to fill ``self.d_G``.

        Parameters
        ----------
        Dsfilename : str
            HDF5 file generated by VTDM_prepb
        lamb : float (optional)
            smoothing parameter \lambda; when non-zero the kernel from
            get_diag_add_kernel is launched on ``self.d_G`` with it

        Returns
        -------
        None.  The result is stored in ``self.d_G`` with shape
        (self.size, self.size).
        """
        handle = la.cublashandle()
        import tables
        h5file = tables.openFile(Dsfilename)
        # Close the file even when a read, upload or GEMM call fails.
        try:
            Ds = h5file.root.real.read()

            d_Ds = parray.to_gpu(Ds.reshape((Ds.shape[0], -1)))
            # release the host copy before reading the imaginary part
            del Ds

            d_Dsw = parray.empty((d_Ds.shape[0], d_Ds.shape[0]), d_Ds.dtype)
            if d_Ds.dtype == np.float64:
                from scikits.cuda.cublas import cublasDgemm
                gemm = cublasDgemm
            else:
                from scikits.cuda.cublas import cublasSgemm
                gemm = cublasSgemm
            # real part: beta = 0.0 overwrites d_Dsw
            gemm(handle.handle, 't', 'n', d_Dsw.shape[0], d_Dsw.shape[0],
                 d_Ds.shape[1], 1.0, d_Ds.gpudata, d_Ds.ld,
                 d_Ds.gpudata, d_Ds.ld, 0.0, d_Dsw.gpudata, d_Dsw.ld)
            Ds = h5file.root.imag.read()
            # reuse the same device buffer for the imaginary part
            d_Ds.set(Ds)
            # imaginary part: beta = 1.0 adds to the real-part product
            gemm(handle.handle, 't', 'n', d_Dsw.shape[0], d_Dsw.shape[0],
                 d_Ds.shape[1], 1.0, d_Ds.gpudata, d_Ds.ld,
                 d_Ds.gpudata, d_Ds.ld, 1.0, d_Dsw.gpudata, d_Dsw.ld)
            del Ds
        finally:
            h5file.close()

        norm_func = get_put_norm_kernel(d_Dsw.dtype)
        launch_kernel(norm_func, (256, 1, 1), (d_Dsw.shape[0], 1),
                      [d_Dsw, self.d_norm, d_Dsw.ld])

        self.d_G = parray.empty((self.size, self.size), self.dtype)

        G_func = get_G_kernel(self.dtype, d_Dsw.dtype)
        launch_kernel(G_func, (256, 1, 1), (self.d_G.shape[0], 1), [
            self.d_G, self.d_G.ld, self.d_tk1, self.d_tk2, self.Wt, self.Mt,
            d_Dsw, d_Dsw.ld, self.d_neuron_ind
        ],
                      timed="G matrix")

        if lamb != 0:
            lamb_func = get_diag_add_kernel(self.dtype)
            launch_kernel(
                lamb_func, (256, 1, 1),
                (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1), [
                    self.d_G, self.d_G.ld, self.d_G.shape[0],
                    self.dtype.type(lamb)
                ])

Example #5

0

Show file

File: dirichlet.py Project: bionet/vtem

    def compute_Gb(self, Dsfilename, lamb=0.0):
        r"""
        Compute the G matrix using dirichlet coefficients.

        Reads the ``real`` and then the ``imag`` node of the HDF5 file,
        accumulates the GEMM product of each with itself into one matrix
        (beta=0.0 for the first call, beta=1.0 for the second), runs the
        kernel from get_put_norm_kernel on it, and finally launches the
        kernel from get_G_kernel to fill ``self.d_G``.

        Parameters
        ----------
        Dsfilename : str
            HDF5 file generated by VTDM_prepb
        lamb : float (optional)
            smoothing parameter \lambda; if non-zero, the kernel from
            get_diag_add_kernel is launched on ``self.d_G`` with it

        Returns
        -------
        None.  The result is stored in ``self.d_G`` with shape
        (self.size, self.size).
        """
        handle = la.cublashandle()
        import tables
        h5file = tables.openFile(Dsfilename)
        # try/finally so the file handle is not leaked if anything
        # between open and close raises.
        try:
            Ds = h5file.root.real.read()

            d_Ds = parray.to_gpu(Ds.reshape((Ds.shape[0], -1)))
            # free the host copy before the next read
            del Ds

            d_Dsw = parray.empty((d_Ds.shape[0], d_Ds.shape[0]), d_Ds.dtype)
            if d_Ds.dtype == np.float64:
                from scikits.cuda.cublas import cublasDgemm
                gemm = cublasDgemm
            else:
                from scikits.cuda.cublas import cublasSgemm
                gemm = cublasSgemm
            # first product overwrites d_Dsw (beta = 0.0)
            gemm(handle.handle, 't', 'n',
                 d_Dsw.shape[0], d_Dsw.shape[0], d_Ds.shape[1],
                 1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
                 0.0, d_Dsw.gpudata, d_Dsw.ld)
            Ds = h5file.root.imag.read()
            # the device buffer is reused for the imaginary part
            d_Ds.set(Ds)
            # second product is added to the first (beta = 1.0)
            gemm(handle.handle, 't', 'n',
                 d_Dsw.shape[0], d_Dsw.shape[0], d_Ds.shape[1],
                 1.0, d_Ds.gpudata, d_Ds.ld, d_Ds.gpudata, d_Ds.ld,
                 1.0, d_Dsw.gpudata, d_Dsw.ld)
            del Ds
        finally:
            h5file.close()

        norm_func = get_put_norm_kernel(d_Dsw.dtype)
        launch_kernel(norm_func, (256, 1, 1), (d_Dsw.shape[0], 1),
                      [d_Dsw, self.d_norm, d_Dsw.ld])

        self.d_G = parray.empty((self.size, self.size), self.dtype)

        G_func = get_G_kernel(self.dtype, d_Dsw.dtype)
        launch_kernel(G_func, (256, 1, 1), (self.d_G.shape[0], 1),
                      [self.d_G, self.d_G.ld, self.d_tk1, self.d_tk2,
                       self.Wt, self.Mt, d_Dsw, d_Dsw.ld,
                       self.d_neuron_ind],
                      timed="G matrix")

        if lamb != 0:
            lamb_func = get_diag_add_kernel(self.dtype)
            launch_kernel(
                lamb_func, (256, 1, 1),
                (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1),
                [self.d_G, self.d_G.ld, self.d_G.shape[0],
                 self.dtype.type(lamb)])

Example #6

0

Show file

File: vrf.py Project: neurokernel/retina

    def filter_image(self, image_input):
        """
        Apply all receptive-field filters to a single input image.

        The input is reshaped *in place* (``image_input.resize``) to
        shape (1, self.size) before it is copied to the GPU, so the
        caller's array is modified.  Filters are generated and applied in
        batches; the batch size is derived from the free GPU memory and
        stored in ``self.ONE_TIME_FILTERS``.

        Parameters
        ----------
        image_input : ndarray
            Either 2D with ``shape[1] == self.size`` or, otherwise, an
            array whose ``shape[1] * shape[2]`` equals ``self.size``.
            The check is done with ``assert``.

        Returns
        -------
        The result of ``T()`` on the output array allocated with shape
        (num_neurons, image_input.shape[0]) after the resize.
        """
        # Image dimensions should match the screen dimensions; numpy's
        # resize does not make any checks itself, so verify them here.
        input_shape = image_input.shape
        if len(input_shape) == 2:
            # already flattened
            pixel_count = input_shape[1]
        else:
            # two spatial dimensions
            pixel_count = input_shape[1] * input_shape[2]
        assert pixel_count == self.size

        # rasterize the input into a single row (modifies caller's array)
        image_input.resize((1, self.size))

        gpu_image = parray.to_gpu(image_input)
        gpu_result = parray.empty((self.num_neurons, image_input.shape[0]),
                                  self.dtype)

        # Batch size: 3/4 of the free memory, in filters of self.size
        # elements, rounded down to an even count and capped at the
        # number of neurons.
        free_bytes, _ = cuda.mem_get_info()
        batch = (free_bytes // self.dtype.itemsize) * 3 // 4 // self.size
        batch -= batch % 2
        self.ONE_TIME_FILTERS = min(batch, self.num_neurons)

        blas = la.cublashandle()
        for first in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
            # the final batch may hold fewer filters
            count = min(self.ONE_TIME_FILTERS, self.num_neurons - first)
            self.generate_filters(startbias=first, N_filters=count)
            la.dot(self.filters, gpu_image, opb='t',
                   C=gpu_result[first: first + count],
                   handle=blas)

        # drop the last generated batch of filters
        del self.filters
        return gpu_result.T()

Example #7

0

Show file

File: neural_network.py Project: bionet/vtem

def rnn3(G, q, dt = 1e-6, alpha = 5000, steps = 4000, XOUTPUTSTEPS = None):
    """
    Solving the decoding problem using a recurrent neural network.

    Each iteration computes ``G * c`` and launches the update kernel from
    get_rnn3_update_func on ``c``.  Every ``steps // XOUTPUTSTEPS``
    iterations the 2-norm of the kernel's ``y`` buffer and the elapsed
    time are printed.

    Parameters
    ----------
    G: PitchArray
       Must be real and positive semidefinite.
    q: PitchArray
       The measurements from spikes
    dt: float (optional)
        the time step in simulating the continuous network
    alpha: float (optional)
           scaling factor
    steps: int (optional)
           the number of steps to run the network
    XOUTPUTSTEPS: int (optional)
           The number of steps that are returned.
           If using default None, only return the final result
           (progress is then printed at min(20, steps) intervals).

    Return
    ------
    c: PitchArray
       The approximate solution to the decoding problem
    output: PitchArray (optional)
            If XOUTPUTSTEPS is not None, the full output specified:
            row k holds ``c`` after iteration ``(k + 1) * x_steps``.

    Raises
    ------
    TypeError
        if G and q differ in dtype, if G is complex, or if either is
        not 2-dimensional.
    """
    if G.dtype != q.dtype:
        raise TypeError("matrix multiplication must have same dtype")

    if np.iscomplexobj(G):
        raise TypeError("RNN currently only solves real types")

    if len(G.shape) != 2 or len(q.shape) != 2:
        raise TypeError("G, q must both be matrices")

    # Floor division keeps the checkpoint interval an integer on both
    # Python 2 and Python 3.
    if XOUTPUTSTEPS is None:
        XOUTPUTSTEPS = min(20, steps)
        x_steps = steps // XOUTPUTSTEPS
        fullout = False
    else:
        fullout = True
        n_rows = int(XOUTPUTSTEPS)
        x_steps = steps // n_rows
        output = parray.empty((XOUTPUTSTEPS, q.size), q.dtype)

    c = parray.zeros_like(q)
    update_func = get_rnn3_update_func(G.dtype)

    dt = float(dt)
    alpha = float(alpha)

    y = parray.empty_like(q)

    if y.dtype == np.float64:
        normfunc = cublasDnrm2
    else:
        normfunc = cublasSnrm2

    grid = (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1)

    handle = la.cublashandle()

    start = time.time()
    for i in range(0, steps + 1):
        Gc = la.dot(G, c, handle = handle)
        launch_kernel(update_func, (256, 1, 1), grid,
                      [c, dt*alpha, q, Gc, y, c.size, 1],
                      prepared = True)

        if i % x_steps == 0:
            checkpoint = i // x_steps
            ynorm = normfunc(handle.handle, y.size, y.gpudata, 1)
            # Parenthesized single argument: valid as a Python 2 print
            # statement and as a Python 3 function call.
            print("%d, norm = %.10f, time=%f(ms)" % (
                checkpoint, ynorm, (time.time()-start)*1000))
            # Checkpoint k is stored in row k - 1.  The i == 0 checkpoint
            # would land in row -1 (before the buffer), and a `steps` not
            # divisible by XOUTPUTSTEPS can yield rows past the end; both
            # are skipped instead of writing outside `output`.
            row = checkpoint - 1
            if fullout and 0 <= row < n_rows:
                cuda.memcpy_dtod(
                    int(output.gpudata) +
                    output.dtype.itemsize*output.ld*row,
                    c.gpudata, c.dtype.itemsize * c.size)

    if fullout:
        return c, output
    else:
        return c

Example #8

0

Show file

File: neural_network.py Project: bionet/vtem

def rnn3(G, q, dt=1e-6, alpha=5000, steps=4000, XOUTPUTSTEPS=None):
    """
    Solving the decoding problem using a recurrent neural network.

    Each iteration computes ``G * c`` and launches the update kernel from
    get_rnn3_update_func on ``c``.  Every ``steps // XOUTPUTSTEPS``
    iterations the 2-norm of the kernel's ``y`` buffer and the elapsed
    time are printed.

    Parameters
    ----------
    G: PitchArray
       Must be real and positive semidefinite.
    q: PitchArray
       The measurements from spikes
    dt: float (optional)
        the time step in simulating the continuous network
    alpha: float (optional)
           scaling factor
    steps: int (optional)
           the number of steps to run the network
    XOUTPUTSTEPS: int (optional)
           The number of steps that are returned.
           If using default None, only return the final result
           (progress is then printed at min(20, steps) intervals).

    Return
    ------
    c: PitchArray
       The approximate solution to the decoding problem
    output: PitchArray (optional)
            If XOUTPUTSTEPS is not None, the full output specified:
            row k holds ``c`` after iteration ``(k + 1) * x_steps``.

    Raises
    ------
    TypeError
        if G and q differ in dtype, if G is complex, or if either is
        not 2-dimensional.
    """
    if G.dtype != q.dtype:
        raise TypeError("matrix multiplication must have same dtype")

    if np.iscomplexobj(G):
        raise TypeError("RNN currently only solves real types")

    if len(G.shape) != 2 or len(q.shape) != 2:
        raise TypeError("G, q must both be matrices")

    # Floor division keeps the checkpoint interval an integer on both
    # Python 2 and Python 3.
    if XOUTPUTSTEPS is None:
        XOUTPUTSTEPS = min(20, steps)
        x_steps = steps // XOUTPUTSTEPS
        fullout = False
    else:
        fullout = True
        n_rows = int(XOUTPUTSTEPS)
        x_steps = steps // n_rows
        output = parray.empty((XOUTPUTSTEPS, q.size), q.dtype)

    c = parray.zeros_like(q)
    update_func = get_rnn3_update_func(G.dtype)

    dt = float(dt)
    alpha = float(alpha)

    y = parray.empty_like(q)

    if y.dtype == np.float64:
        normfunc = cublasDnrm2
    else:
        normfunc = cublasSnrm2

    grid = (6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1)

    handle = la.cublashandle()

    start = time.time()
    for i in range(0, steps + 1):
        Gc = la.dot(G, c, handle=handle)
        launch_kernel(update_func, (256, 1, 1),
                      grid, [c, dt * alpha, q, Gc, y, c.size, 1],
                      prepared=True)

        if i % x_steps == 0:
            checkpoint = i // x_steps
            ynorm = normfunc(handle.handle, y.size, y.gpudata, 1)
            # Parenthesized single argument: valid as a Python 2 print
            # statement and as a Python 3 function call.
            print("%d, norm = %.10f, time=%f(ms)" % (
                checkpoint, ynorm, (time.time() - start) * 1000))
            # Checkpoint k is stored in row k - 1.  The i == 0 checkpoint
            # would land in row -1 (before the buffer), and a `steps` not
            # divisible by XOUTPUTSTEPS can yield rows past the end; both
            # are skipped instead of writing outside `output`.
            row = checkpoint - 1
            if fullout and 0 <= row < n_rows:
                cuda.memcpy_dtod(
                    int(output.gpudata) +
                    output.dtype.itemsize * output.ld * row,
                    c.gpudata, c.dtype.itemsize * c.size)

    if fullout:
        return c, output
    else:
        return c