def train_rfn_gpu(X,
                  n_hidden,
                  n_iter,
                  learnrateW,
                  learnratePsi,
                  dropout_rate,
                  input_droput_rate,
                  minPsi=0.1,
                  seed=32):
    """Train an RFN model on the GPU and return its parameters.

    (RFN presumably stands for "Rectified Factor Network" -- confirm
    against the rest of the module.)

    Parameters
    ----------
    X : ndarray
        Two-dimensional training data; ``X.shape`` is read as ``(n, m)``.
    n_hidden : int
        Number of hidden units ``k``; the weight matrix is ``(k, m)``.
    n_iter : int
        Number of update iterations to run.
    learnrateW : float
        Step size applied to the weight update ``dW``.
    learnratePsi : float
        Step size applied to the update ``dP`` of the ``P`` vector.
    dropout_rate : float
        If greater than 0, ``dropout`` is applied to the hidden
        activations in every iteration.
    input_droput_rate : float
        If greater than 0, ``saltpepper_noise`` is applied to a copy of
        the input in every iteration.  The misspelled name is kept for
        backward compatibility with existing keyword callers.
    minPsi : float
        Lower bound enforced on every entry of ``P`` after each update.
    seed : int
        Value passed to ``init_rng``.

    Returns
    -------
    (W, P)
        The result of ``W.get()`` and ``P.get()`` after the last
        iteration.
    """
    # The original body referenced ``input_dropout_rate`` although the
    # parameter is spelled ``input_droput_rate``, which raised a NameError
    # on the first iteration.  Bind the correctly spelled local name here.
    input_dropout_rate = input_droput_rate

    k = n_hidden
    n, m = X.shape
    # NOTE(review): W is drawn from NumPy's global RNG before
    # init_rng(seed) is called, so ``seed`` does not appear to control this
    # initialisation -- confirm whether that is intended.
    W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32)
    P = np.array([0.1] * m, dtype=np.float32)
    XXdiag = np.diag(np.dot(X.T, X) /
                     n).copy()  # explicit copy to avoid numpy 1.8 warning

    # Move everything to the device, allocating from the shared memory pool.
    W = gpu.to_gpu(W, allocator=_mempool.allocate)
    P = gpu.to_gpu(P, allocator=_mempool.allocate)
    X = gpu.to_gpu(X, allocator=_mempool.allocate)
    XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate)
    I = la.eye(k, dtype=np.float32)

    init_rng(seed)
    t0 = time.time()
    for cur_iter in range(n_iter):
        H, tmp = calculate_H_gpu(X, W, P)
        if dropout_rate > 0:
            dropout(H, dropout_rate)

        # Only copy the input when it is actually going to be corrupted.
        Xtmp = X
        if input_dropout_rate > 0:
            Xtmp = X.copy()
            saltpepper_noise(Xtmp, input_dropout_rate)

        U = la.dot(Xtmp, H, "t", "n") / n
        S = la.dot(H, H, "t", "n") / n
        S += I
        S -= la.dot(tmp, W, "n", "t")
        # Cii must be computed before S is inverted: la.inv is called with
        # overwrite=True below.
        Cii = la.dot(la.dot(W, S, "t") - 2 * U, W)

        Sinv = la.inv(S, overwrite=True)
        dW = la.dot(Sinv, U, "n", "t") - W
        dP = XXdiag + la.diag(Cii) - P

        W += learnrateW * dW
        P += learnratePsi * dP

        # Keep every entry of P at or above minPsi.
        P = gpu.maximum(P, minPsi)
        if cur_iter % 25 == 0:
            # A single pre-formatted argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("iter %3d (elapsed time: %5.2fs)" % (cur_iter,
                                                       time.time() - t0))
    return W.get(), P.get()
def train_rfn_gpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate, input_droput_rate, minPsi=0.1, seed=32):
    """Train an RFN model on the GPU and return its parameters.

    (RFN presumably stands for "Rectified Factor Network" -- confirm
    against the rest of the module.)

    Parameters
    ----------
    X : ndarray
        Two-dimensional training data; ``X.shape`` is read as ``(n, m)``.
    n_hidden : int
        Number of hidden units ``k``; the weight matrix is ``(k, m)``.
    n_iter : int
        Number of update iterations to run.
    learnrateW : float
        Step size applied to the weight update ``dW``.
    learnratePsi : float
        Step size applied to the update ``dP`` of the ``P`` vector.
    dropout_rate : float
        If greater than 0, ``dropout`` is applied to the hidden
        activations in every iteration.
    input_droput_rate : float
        If greater than 0, ``saltpepper_noise`` is applied to a copy of
        the input in every iteration.  The misspelled name is kept for
        backward compatibility with existing keyword callers.
    minPsi : float
        Lower bound enforced on every entry of ``P`` after each update.
    seed : int
        Value passed to ``init_rng``.

    Returns
    -------
    (W, P)
        The result of ``W.get()`` and ``P.get()`` after the last
        iteration.
    """
    # The original body referenced ``input_dropout_rate`` although the
    # parameter is spelled ``input_droput_rate``, which raised a NameError
    # on the first iteration.  Bind the correctly spelled local name here.
    input_dropout_rate = input_droput_rate

    k = n_hidden
    n, m = X.shape
    # NOTE(review): W is drawn from NumPy's global RNG before
    # init_rng(seed) is called, so ``seed`` does not appear to control this
    # initialisation -- confirm whether that is intended.
    W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32)
    P = np.array([0.1] * m, dtype=np.float32)
    XXdiag = np.diag(np.dot(X.T, X) / n).copy() # explicit copy to avoid numpy 1.8 warning

    # Move everything to the device, allocating from the shared memory pool.
    W = gpu.to_gpu(W, allocator=_mempool.allocate)
    P = gpu.to_gpu(P, allocator=_mempool.allocate)
    X = gpu.to_gpu(X, allocator=_mempool.allocate)
    XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate)
    I = la.eye(k, dtype=np.float32)

    init_rng(seed)
    t0 = time.time()
    for cur_iter in range(n_iter):
        H, tmp = calculate_H_gpu(X, W, P)
        if dropout_rate > 0:
            dropout(H, dropout_rate)

        # Only copy the input when it is actually going to be corrupted.
        Xtmp = X
        if input_dropout_rate > 0:
            Xtmp = X.copy()
            saltpepper_noise(Xtmp, input_dropout_rate)

        U = la.dot(Xtmp, H, "t", "n") / n
        S = la.dot(H, H, "t", "n") / n
        S += I
        S -= la.dot(tmp, W, "n", "t")
        # Cii must be computed before S is inverted: la.inv is called with
        # overwrite=True below.
        Cii = la.dot(la.dot(W, S, "t") - 2*U, W)

        Sinv = la.inv(S, overwrite=True)
        dW = la.dot(Sinv, U, "n", "t") - W
        dP = XXdiag + la.diag(Cii) - P

        W += learnrateW * dW
        P += learnratePsi * dP

        # Keep every entry of P at or above minPsi.
        P = gpu.maximum(P, minPsi)
        if cur_iter % 25 == 0:
            # A single pre-formatted argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0))
    return W.get(), P.get()
 def test_eye_large_float32(self):
     """linalg.eye must agree with numpy.eye for a 128x128 float32 matrix."""
     size = 128
     device_eye = linalg.eye(size, dtype=np.float32)
     host_eye = np.eye(size, dtype=np.float32)
     matches = host_eye == device_eye.get()
     assert np.all(matches)
 def test_eye_complex128(self):
     """linalg.eye must agree with numpy.eye for a 10x10 complex128 matrix."""
     size = 10
     device_eye = linalg.eye(size, dtype=np.complex128)
     host_eye = np.eye(size, dtype=np.complex128)
     matches = host_eye == device_eye.get()
     assert np.all(matches)
 def test_eye_float64(self):
     """linalg.eye must agree with numpy.eye for a 10x10 float64 matrix."""
     size = 10
     device_eye = linalg.eye(size, dtype=np.float64)
     host_eye = np.eye(size, dtype=np.float64)
     matches = host_eye == device_eye.get()
     assert np.all(matches)
Exemple #6
0
def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0, M=5, smoothing=0.0):
    """
    IAF time decoding machine.

    Decode a finite length signal encoded with an Integrate-and-Fire
    neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in s).
        The dtype must be float32 or float64; it selects the precision
        the GPU kernels are compiled for.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the "ideal"
        kernels; any other value selects the "leaky" kernels.
    C : float
        Neuron capacitance.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter. When nonzero, (N-1)*smoothing times the
        identity is added to F^H*F before the pseudoinverse is taken,
        where N is the length of `s`.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal: the real part of the reconstruction, copied
        back from the GPU. It contains ceil(dur/dt) samples.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64, or if
        2*pi*M/bw is smaller than `dur`.

    """

    N = len(s)

    # Pick the kernel precision, the matching complex dtype and the
    # pseudoinverse cutoff from the dtype of the input:
    float_type = s.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        __pinv_rcond__ = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        __pinv_rcond__ = 1e-8
    else:
        raise ValueError('unsupported data type')

    # 2*pi*M/bw must be at least as long as the signal:
    T = 2*np.pi*M/bw
    if T < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    dev = cumisc.get_current_device()

    # Prepare kernels:
    cache_dir = None
    compute_q_mod = \
                  SourceModule(compute_q_template.substitute(use_double=use_double),
                               cache_dir=cache_dir)
    compute_q_ideal = compute_q_mod.get_function('compute_q_ideal')
    compute_q_leaky = compute_q_mod.get_function('compute_q_leaky')

    compute_F_mod = \
                  SourceModule(compute_F_template.substitute(use_double=use_double),
                               cache_dir=cache_dir)
    compute_F_ideal = compute_F_mod.get_function('compute_F_ideal')
    compute_F_leaky = compute_F_mod.get_function('compute_F_leaky')

    compute_u_mod = \
                  SourceModule(compute_u_template.substitute(use_double=use_double),
                               cache_dir=cache_dir)
    compute_u = compute_u_mod.get_function('compute_u')

    # Load data into GPU memory:
    s_gpu = gpuarray.to_gpu(s)

    # The cumulative sum of the interspike intervals is computed on the
    # host and then uploaded.
    # XXX: Eventually replace this with a PyCUDA equivalent
    ts = np.cumsum(s)
    ts_gpu = gpuarray.to_gpu(ts)

    # Set up GPUArrays for intermediary data. Note that all of the
    # arrays are complex to facilitate use of CUBLAS matrix
    # multiplication functions:
    q_gpu = gpuarray.empty((N-1, 1), complex_type)
    F_gpu = gpuarray.empty((N-1, 2*M+1), complex_type)

    # Get required block/grid sizes; use a smaller block size than the
    # maximum to prevent the kernels from using too many registers:
    max_threads_per_block = 256
    block_dim_s, grid_dim_s = cumisc.select_block_grid_sizes(dev,
                                                             q_gpu.shape,
                                                             max_threads_per_block)
    block_dim_F, grid_dim_F = cumisc.select_block_grid_sizes(dev,
                                                             F_gpu.shape,
                                                             max_threads_per_block)

    # Fill q and F with the ideal kernels when R is infinite and with
    # the leaky kernels (which additionally take R) otherwise:
    if np.isinf(R):
        compute_q_ideal(s_gpu, q_gpu, float_type(b), float_type(d),
                        float_type(C), np.uint32(N-1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_F_ideal(s_gpu, ts_gpu, F_gpu, float_type(bw),
                        np.int32(M), np.uint32((N-1)*(2*M+1)),
                        block=block_dim_F, grid=grid_dim_F)
    else:
        compute_q_leaky(s_gpu, q_gpu, float_type(b), float_type(d),
                        float_type(R), float_type(C), np.uint32(N-1),
                        block=block_dim_s, grid=grid_dim_s)
        compute_F_leaky(s_gpu, ts_gpu, F_gpu, float_type(bw),
                        float_type(R), float_type(C),
                        np.int32(M), np.uint32((N-1)*(2*M+1)),
                        block=block_dim_F, grid=grid_dim_F)

    # Compute the product of F^H and q first so that q
    # can be dropped from memory:
    FHq_gpu = culinalg.dot(F_gpu, q_gpu, 'c')
    del q_gpu

    # Compute the coefficients c = pinv(F^H*F [+ (N-1)*smoothing*I]) * F^H*q:
    if smoothing == 0:
        c_gpu = culinalg.dot(culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c'),
                                           __pinv_rcond__),
                             FHq_gpu)
    else:
        c_gpu = culinalg.dot(culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c')+
                                           (N-1)*smoothing*culinalg.eye(2*M+1,
                                                                        float_type),
                                           __pinv_rcond__),
                             FHq_gpu)

    # Allocate array for reconstructed signal:
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))
    ### Replace the above with the following line when the bug in
    # gpuarray.zeros in pycuda 2011.1.2 is fixed:
    #u_rec_gpu = gpuarray.zeros(Nt, complex_type)

    # Get required block/grid sizes:
    block_dim_t, grid_dim_t = \
                 cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u(u_rec_gpu, c_gpu, float_type(bw),
              float_type(dt),
              np.int32(M),
              np.uint32(Nt),
              block=block_dim_t, grid=grid_dim_t)

    # Only the real part of the complex reconstruction is returned:
    return np.real(u_rec_gpu.get())
Exemple #7
0
def iaf_decode_pop(s_gpu, ns_gpu, dur, dt, bw, b_gpu, d_gpu, R_gpu,
                   C_gpu, M=5, smoothing=0.0):
    """
    Population IAF time decoding machine.

    Decode a signal encoded with an ensemble of Integrate-and-Fire
    neurons assuming that the encoded signal is representable in terms
    of trigonometric polynomials.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders. The nonzero
        values represent the time between spikes (in s). The number of
        rows (`s_gpu.shape[0]`) is taken as the number of encoders in
        the ensemble. The dtype must be float32 or float64.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The "ideal" kernels are used only
        when every entry is infinite; otherwise the "leaky" kernels are
        used for the whole ensemble.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal: the real part of the reconstruction, copied
        back from the GPU. It contains ceil(dur/dt) samples.

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is neither float32 nor float64, if
        `s_gpu` has no rows, if any of `ns_gpu`, `b_gpu`, `d_gpu`,
        `R_gpu` or `C_gpu` does not have one entry per row of `s_gpu`,
        or if 2*pi*M/bw is smaller than `dur`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.

    """

    # Sanity checks:
    # The dtype of the input selects the kernel precision, the matching
    # complex dtype and the pseudoinverse cutoff.
    float_type = s_gpu.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        __pinv_rcond__ = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        __pinv_rcond__ = 1e-8
    else:
        raise ValueError('unsupported data type')

    N = s_gpu.shape[0]
    if not N:
        raise ValueError('no spike data given')
    if (ns_gpu.size != N) or (b_gpu.size != N) or (d_gpu.size != N) or \
       (R_gpu.size != N) or (C_gpu.size != N):
        raise ValueError('parameter arrays must be of same length')

    # 2*pi*M/bw must be at least as long as the signal:
    T = 2*np.pi*M/bw
    if T < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    dev = cumisc.get_current_device()

    # Use a smaller block size than the maximum to prevent the kernels
    # from using too many registers:
    max_threads_per_block = 256

    # Prepare kernels:
    cache_dir = None
    compute_ts_pop_mod = SourceModule(compute_ts_pop_template.substitute(use_double=use_double),
                                  cache_dir=cache_dir)
    compute_ts_pop = compute_ts_pop_mod.get_function('compute_ts')

    compute_q_pop_mod = \
                      SourceModule(compute_q_pop_template.substitute(use_double=use_double),
                                   cache_dir=cache_dir)
    compute_q_pop_ideal = compute_q_pop_mod.get_function('compute_q_ideal')
    compute_q_pop_leaky = compute_q_pop_mod.get_function('compute_q_leaky')

    # The F and u kernels are compiled with the package header directory
    # on the include path:
    compute_F_pop_mod = \
                  SourceModule(compute_F_pop_template.substitute(use_double=use_double),
                               cache_dir=cache_dir,
                               options=['-I', install_headers])
    compute_F_pop_ideal = compute_F_pop_mod.get_function('compute_F_ideal')
    compute_F_pop_leaky = compute_F_pop_mod.get_function('compute_F_leaky')

    compute_u_pop_mod = \
                      SourceModule(compute_u_pop_template.substitute(use_double=use_double),
                                   cache_dir=cache_dir,
                                   options=['-I', install_headers])
    compute_u_pop = compute_u_pop_mod.get_function('compute_u')

    # Total number of interspike intervals over all neurons, less 1 for
    # each spike train with more than one interspike interval:
    Nq = int(np.sum(ns)-np.sum(ns>1))

    # Set up GPUArrays for intermediary data:
    ts_gpu = gpuarray.zeros_like(s_gpu)

    # Note that these arrays are complex to enable use of CUBLAS
    # matrix multiplication functions:
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    F_gpu = gpuarray.empty((Nq, 2*M+1), complex_type)

    # Get required block/grid sizes:
    block_dim_ts, grid_dim_ts = \
                  cumisc.select_block_grid_sizes(dev, N,
                                                 max_threads_per_block)
    block_dim_q, grid_dim_q = \
                 cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                                max_threads_per_block)
    block_dim_F, grid_dim_F = \
                 cumisc.select_block_grid_sizes(dev, F_gpu.shape,
                                                max_threads_per_block)

    # Launch kernels:
    compute_ts_pop(s_gpu, ns_gpu, ts_gpu, np.uint32(s_gpu.shape[1]),
                   np.uint32(N),
                   block=block_dim_ts, grid=grid_dim_ts)
    # The ideal kernels are only used when every resistance is infinite;
    # the leaky kernels additionally receive R_gpu:
    if np.all(np.isinf(R_gpu.get())):
        compute_q_pop_ideal(s_gpu, q_gpu,
                            b_gpu, d_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q, grid=grid_dim_q)
        compute_F_pop_ideal(s_gpu, ts_gpu, F_gpu,
                            float_type(bw),
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.int32(M), np.uint32(s_gpu.shape[1]),
                            np.uint32(F_gpu.size),
                            block=block_dim_F, grid=grid_dim_F)
    else:
        compute_q_pop_leaky(s_gpu, q_gpu,
                            b_gpu, d_gpu,
                            R_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q, grid=grid_dim_q)
        compute_F_pop_leaky(s_gpu, ts_gpu, F_gpu,
                            float_type(bw), R_gpu, C_gpu,
                            idx_to_ni_gpu, idx_to_k_gpu,
                            np.int32(M), np.uint32(s_gpu.shape[1]),
                            np.uint32(F_gpu.size),
                            block=block_dim_F, grid=grid_dim_F)

    # Free unneeded variables (this only drops the local references;
    # the caller's `s_gpu` is unaffected):
    del s_gpu, ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Compute the product of F^H and q first so that both F^H and q
    # can be dropped from memory:
    FH_gpu = culinalg.hermitian(F_gpu)
    FHq_gpu = culinalg.dot(FH_gpu, q_gpu)
    del FH_gpu, q_gpu

    # Compute the coefficients c = pinv(F^H*F [+ sum(ns)*smoothing*I]) * F^H*q:
    if smoothing == 0:
        c_gpu = culinalg.dot(culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c'),
                                           __pinv_rcond__),
                             FHq_gpu)
    else:
        c_gpu = culinalg.dot(culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c')+
                                           np.sum(ns)*smoothing*culinalg.eye(2*M+1,
                                                                        float_type),
                                           __pinv_rcond__),
                             FHq_gpu)

    # Allocate array for reconstructed signal:
    Nt = int(np.ceil(dur/dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))
    ### Replace the above with the following line when the bug in
    # gpuarray.zeros in pycuda 2011.1.2 is fixed:
    #u_rec_gpu = gpuarray.zeros(Nt, complex_type)

    # Get required block/grid sizes:
    block_dim_t, grid_dim_t = \
                 cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u_pop(u_rec_gpu, c_gpu, float_type(bw),
                  float_type(dt),
                  np.int32(M),
                  np.uint32(Nt),
                  block=block_dim_t, grid=grid_dim_t)

    # Only the real part of the complex reconstruction is returned:
    return np.real(u_rec_gpu.get())
Exemple #8
0
 def test_eye_large_float32(self):
     """Compare linalg.eye against numpy.eye for a 128x128 float32 identity."""
     dim = 128
     gpu_identity = linalg.eye(dim, dtype=np.float32)
     reference = np.eye(dim, dtype=np.float32)
     elementwise_equal = reference == gpu_identity.get()
     assert np.all(elementwise_equal)
Exemple #9
0
 def test_eye_complex128(self):
     """Compare linalg.eye against numpy.eye for a 10x10 complex128 identity."""
     dim = 10
     gpu_identity = linalg.eye(dim, dtype=np.complex128)
     reference = np.eye(dim, dtype=np.complex128)
     elementwise_equal = reference == gpu_identity.get()
     assert np.all(elementwise_equal)
Exemple #10
0
 def test_eye_float64(self):
     """Compare linalg.eye against numpy.eye for a 10x10 float64 identity."""
     dim = 10
     gpu_identity = linalg.eye(dim, dtype=np.float64)
     reference = np.eye(dim, dtype=np.float64)
     elementwise_equal = reference == gpu_identity.get()
     assert np.all(elementwise_equal)
Exemple #11
0
def iaf_decode_pop(s_gpu,
                   ns_gpu,
                   dur,
                   dt,
                   bw,
                   b_gpu,
                   d_gpu,
                   R_gpu,
                   C_gpu,
                   M=5,
                   smoothing=0.0):
    """
    Population IAF time decoding machine.

    Decode a signal encoded with an ensemble of Integrate-and-Fire
    neurons assuming that the encoded signal is representable in terms
    of trigonometric polynomials.

    Parameters
    ----------
    s_gpu : pycuda.gpuarray.GPUArray
        Signal encoded by an ensemble of encoders. The nonzero
        values represent the time between spikes (in s). The number of
        rows (`s_gpu.shape[0]`) is taken as the number of encoders in
        the ensemble. The dtype must be float32 or float64.
    ns_gpu : pycuda.gpuarray.GPUArray
        Number of interspike intervals in each row of `s_gpu`.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b_gpu : pycuda.gpuarray.GPUArray
        Array of encoder biases.
    d_gpu : pycuda.gpuarray.GPUArray
        Array of encoder thresholds.
    R_gpu : pycuda.gpuarray.GPUArray
        Array of neuron resistances. The "ideal" kernels are used only
        when every entry is infinite; otherwise the "leaky" kernels are
        used for the whole ensemble.
    C_gpu : pycuda.gpuarray.GPUArray
        Array of neuron capacitances.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal: the real part of the reconstruction, copied
        back from the GPU. It contains ceil(dur/dt) samples.

    Raises
    ------
    ValueError
        If the dtype of `s_gpu` is neither float32 nor float64, if
        `s_gpu` has no rows, if any of `ns_gpu`, `b_gpu`, `d_gpu`,
        `R_gpu` or `C_gpu` does not have one entry per row of `s_gpu`,
        or if 2*pi*M/bw is smaller than `dur`.

    Notes
    -----
    The number of spikes contributed by each neuron may differ from the
    number contributed by other neurons.

    """

    # Sanity checks:
    # The dtype of the input selects the kernel precision, the matching
    # complex dtype and the pseudoinverse cutoff.
    float_type = s_gpu.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        __pinv_rcond__ = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        __pinv_rcond__ = 1e-8
    else:
        raise ValueError('unsupported data type')

    N = s_gpu.shape[0]
    if not N:
        raise ValueError('no spike data given')
    if (ns_gpu.size != N) or (b_gpu.size != N) or (d_gpu.size != N) or \
       (R_gpu.size != N) or (C_gpu.size != N):
        raise ValueError('parameter arrays must be of same length')

    # 2*pi*M/bw must be at least as long as the signal:
    T = 2 * np.pi * M / bw
    if T < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    # Map CUDA index to neuron index and interspike interval index:
    ns = ns_gpu.get()
    idx_to_ni, idx_to_k = _compute_idx_map(ns)
    idx_to_ni_gpu = gpuarray.to_gpu(idx_to_ni)
    idx_to_k_gpu = gpuarray.to_gpu(idx_to_k)

    dev = cumisc.get_current_device()

    # Use a smaller block size than the maximum to prevent the kernels
    # from using too many registers:
    max_threads_per_block = 256

    # Prepare kernels:
    cache_dir = None
    compute_ts_pop_mod = SourceModule(
        compute_ts_pop_template.substitute(use_double=use_double),
        cache_dir=cache_dir)
    compute_ts_pop = compute_ts_pop_mod.get_function('compute_ts')

    compute_q_pop_mod = \
                      SourceModule(compute_q_pop_template.substitute(use_double=use_double),
                                   cache_dir=cache_dir)
    compute_q_pop_ideal = compute_q_pop_mod.get_function('compute_q_ideal')
    compute_q_pop_leaky = compute_q_pop_mod.get_function('compute_q_leaky')

    # The F and u kernels are compiled with the package header directory
    # on the include path:
    compute_F_pop_mod = \
                  SourceModule(compute_F_pop_template.substitute(use_double=use_double),
                               cache_dir=cache_dir,
                               options=['-I', install_headers])
    compute_F_pop_ideal = compute_F_pop_mod.get_function('compute_F_ideal')
    compute_F_pop_leaky = compute_F_pop_mod.get_function('compute_F_leaky')

    compute_u_pop_mod = \
                      SourceModule(compute_u_pop_template.substitute(use_double=use_double),
                                   cache_dir=cache_dir,
                                   options=['-I', install_headers])
    compute_u_pop = compute_u_pop_mod.get_function('compute_u')

    # Total number of interspike intervals over all neurons, less 1 for
    # each spike train with more than one interspike interval:
    Nq = int(np.sum(ns) - np.sum(ns > 1))

    # Set up GPUArrays for intermediary data:
    ts_gpu = gpuarray.zeros_like(s_gpu)

    # Note that these arrays are complex to enable use of CUBLAS
    # matrix multiplication functions:
    q_gpu = gpuarray.empty((Nq, 1), complex_type)
    F_gpu = gpuarray.empty((Nq, 2 * M + 1), complex_type)

    # Get required block/grid sizes:
    block_dim_ts, grid_dim_ts = \
                  cumisc.select_block_grid_sizes(dev, N,
                                                 max_threads_per_block)
    block_dim_q, grid_dim_q = \
                 cumisc.select_block_grid_sizes(dev, q_gpu.shape,
                                                max_threads_per_block)
    block_dim_F, grid_dim_F = \
                 cumisc.select_block_grid_sizes(dev, F_gpu.shape,
                                                max_threads_per_block)

    # Launch kernels:
    compute_ts_pop(s_gpu,
                   ns_gpu,
                   ts_gpu,
                   np.uint32(s_gpu.shape[1]),
                   np.uint32(N),
                   block=block_dim_ts,
                   grid=grid_dim_ts)
    # The ideal kernels are only used when every resistance is infinite;
    # the leaky kernels additionally receive R_gpu:
    if np.all(np.isinf(R_gpu.get())):
        compute_q_pop_ideal(s_gpu,
                            q_gpu,
                            b_gpu,
                            d_gpu,
                            C_gpu,
                            idx_to_ni_gpu,
                            idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q,
                            grid=grid_dim_q)
        compute_F_pop_ideal(s_gpu,
                            ts_gpu,
                            F_gpu,
                            float_type(bw),
                            idx_to_ni_gpu,
                            idx_to_k_gpu,
                            np.int32(M),
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(F_gpu.size),
                            block=block_dim_F,
                            grid=grid_dim_F)
    else:
        compute_q_pop_leaky(s_gpu,
                            q_gpu,
                            b_gpu,
                            d_gpu,
                            R_gpu,
                            C_gpu,
                            idx_to_ni_gpu,
                            idx_to_k_gpu,
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(Nq),
                            block=block_dim_q,
                            grid=grid_dim_q)
        compute_F_pop_leaky(s_gpu,
                            ts_gpu,
                            F_gpu,
                            float_type(bw),
                            R_gpu,
                            C_gpu,
                            idx_to_ni_gpu,
                            idx_to_k_gpu,
                            np.int32(M),
                            np.uint32(s_gpu.shape[1]),
                            np.uint32(F_gpu.size),
                            block=block_dim_F,
                            grid=grid_dim_F)

    # Free unneeded variables (this only drops the local references;
    # the caller's `s_gpu` is unaffected):
    del s_gpu, ts_gpu, idx_to_ni_gpu, idx_to_k_gpu

    # Compute the product of F^H and q first so that both F^H and q
    # can be dropped from memory:
    FH_gpu = culinalg.hermitian(F_gpu)
    FHq_gpu = culinalg.dot(FH_gpu, q_gpu)
    del FH_gpu, q_gpu

    # Compute the coefficients c = pinv(F^H*F [+ sum(ns)*smoothing*I]) * F^H*q:
    if smoothing == 0:
        c_gpu = culinalg.dot(
            culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c'), __pinv_rcond__),
            FHq_gpu)
    else:
        c_gpu = culinalg.dot(
            culinalg.pinv(
                culinalg.dot(F_gpu, F_gpu, 'c') +
                np.sum(ns) * smoothing * culinalg.eye(2 * M + 1, float_type),
                __pinv_rcond__), FHq_gpu)

    # Allocate array for reconstructed signal:
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))
    ### Replace the above with the following line when the bug in
    # gpuarray.zeros in pycuda 2011.1.2 is fixed:
    #u_rec_gpu = gpuarray.zeros(Nt, complex_type)

    # Get required block/grid sizes:
    block_dim_t, grid_dim_t = \
                 cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u_pop(u_rec_gpu,
                  c_gpu,
                  float_type(bw),
                  float_type(dt),
                  np.int32(M),
                  np.uint32(Nt),
                  block=block_dim_t,
                  grid=grid_dim_t)

    # Only the real part of the complex reconstruction is returned:
    return np.real(u_rec_gpu.get())
Exemple #12
0
def iaf_decode(s, dur, dt, bw, b, d, R=np.inf, C=1.0, M=5, smoothing=0.0):
    """
    IAF time decoding machine.

    Decode a finite length signal encoded with an Integrate-and-Fire
    neuron.

    Parameters
    ----------
    s : ndarray of floats
        Encoded signal. The values represent the time between spikes (in s).
        The dtype must be float32 or float64; it selects the precision
        the GPU kernels are compiled for.
    dur : float
        Duration of signal (in s).
    dt : float
        Sampling resolution of original signal; the sampling frequency
        is 1/dt Hz.
    bw : float
        Signal bandwidth (in rad/s).
    b : float
        Encoder bias.
    d : float
        Encoder threshold.
    R : float
        Neuron resistance. An infinite value selects the "ideal"
        kernels; any other value selects the "leaky" kernels.
    C : float
        Neuron capacitance.
    M : int
        2*M+1 coefficients are used for reconstructing the signal.
    smoothing : float
        Smoothing parameter. When nonzero, (N-1)*smoothing times the
        identity is added to F^H*F before the pseudoinverse is taken,
        where N is the length of `s`.

    Returns
    -------
    u_rec : ndarray of floats
        Recovered signal: the real part of the reconstruction, copied
        back from the GPU. It contains ceil(dur/dt) samples.

    Raises
    ------
    ValueError
        If the dtype of `s` is neither float32 nor float64, or if
        2*pi*M/bw is smaller than `dur`.

    """

    N = len(s)

    # Pick the kernel precision, the matching complex dtype and the
    # pseudoinverse cutoff from the dtype of the input:
    float_type = s.dtype.type
    if float_type == np.float32:
        use_double = 0
        complex_type = np.complex64
        __pinv_rcond__ = 1e-4
    elif float_type == np.float64:
        use_double = 1
        complex_type = np.complex128
        __pinv_rcond__ = 1e-8
    else:
        raise ValueError('unsupported data type')

    # 2*pi*M/bw must be at least as long as the signal:
    T = 2 * np.pi * M / bw
    if T < dur:
        raise ValueError('2*pi*M/bw must exceed the signal length')

    dev = cumisc.get_current_device()

    # Prepare kernels:
    cache_dir = None
    compute_q_mod = \
                  SourceModule(compute_q_template.substitute(use_double=use_double),
                               cache_dir=cache_dir)
    compute_q_ideal = compute_q_mod.get_function('compute_q_ideal')
    compute_q_leaky = compute_q_mod.get_function('compute_q_leaky')

    compute_F_mod = \
                  SourceModule(compute_F_template.substitute(use_double=use_double),
                               cache_dir=cache_dir)
    compute_F_ideal = compute_F_mod.get_function('compute_F_ideal')
    compute_F_leaky = compute_F_mod.get_function('compute_F_leaky')

    compute_u_mod = \
                  SourceModule(compute_u_template.substitute(use_double=use_double),
                               cache_dir=cache_dir)
    compute_u = compute_u_mod.get_function('compute_u')

    # Load data into GPU memory:
    s_gpu = gpuarray.to_gpu(s)

    # The cumulative sum of the interspike intervals is computed on the
    # host and then uploaded.
    # XXX: Eventually replace this with a PyCUDA equivalent
    ts = np.cumsum(s)
    ts_gpu = gpuarray.to_gpu(ts)

    # Set up GPUArrays for intermediary data. Note that all of the
    # arrays are complex to facilitate use of CUBLAS matrix
    # multiplication functions:
    q_gpu = gpuarray.empty((N - 1, 1), complex_type)
    F_gpu = gpuarray.empty((N - 1, 2 * M + 1), complex_type)

    # Get required block/grid sizes; use a smaller block size than the
    # maximum to prevent the kernels from using too many registers:
    max_threads_per_block = 256
    block_dim_s, grid_dim_s = cumisc.select_block_grid_sizes(
        dev, q_gpu.shape, max_threads_per_block)
    block_dim_F, grid_dim_F = cumisc.select_block_grid_sizes(
        dev, F_gpu.shape, max_threads_per_block)

    # Fill q and F with the ideal kernels when R is infinite and with
    # the leaky kernels (which additionally take R) otherwise:
    if np.isinf(R):
        compute_q_ideal(s_gpu,
                        q_gpu,
                        float_type(b),
                        float_type(d),
                        float_type(C),
                        np.uint32(N - 1),
                        block=block_dim_s,
                        grid=grid_dim_s)
        compute_F_ideal(s_gpu,
                        ts_gpu,
                        F_gpu,
                        float_type(bw),
                        np.int32(M),
                        np.uint32((N - 1) * (2 * M + 1)),
                        block=block_dim_F,
                        grid=grid_dim_F)
    else:
        compute_q_leaky(s_gpu,
                        q_gpu,
                        float_type(b),
                        float_type(d),
                        float_type(R),
                        float_type(C),
                        np.uint32(N - 1),
                        block=block_dim_s,
                        grid=grid_dim_s)
        compute_F_leaky(s_gpu,
                        ts_gpu,
                        F_gpu,
                        float_type(bw),
                        float_type(R),
                        float_type(C),
                        np.int32(M),
                        np.uint32((N - 1) * (2 * M + 1)),
                        block=block_dim_F,
                        grid=grid_dim_F)

    # Compute the product of F^H and q first so that q
    # can be dropped from memory:
    FHq_gpu = culinalg.dot(F_gpu, q_gpu, 'c')
    del q_gpu

    # Compute the coefficients c = pinv(F^H*F [+ (N-1)*smoothing*I]) * F^H*q:
    if smoothing == 0:
        c_gpu = culinalg.dot(
            culinalg.pinv(culinalg.dot(F_gpu, F_gpu, 'c'), __pinv_rcond__),
            FHq_gpu)
    else:
        c_gpu = culinalg.dot(
            culinalg.pinv(
                culinalg.dot(F_gpu, F_gpu, 'c') +
                (N - 1) * smoothing * culinalg.eye(2 * M + 1, float_type),
                __pinv_rcond__), FHq_gpu)

    # Allocate array for reconstructed signal:
    Nt = int(np.ceil(dur / dt))
    u_rec_gpu = gpuarray.to_gpu(np.zeros(Nt, complex_type))
    ### Replace the above with the following line when the bug in
    # gpuarray.zeros in pycuda 2011.1.2 is fixed:
    #u_rec_gpu = gpuarray.zeros(Nt, complex_type)

    # Get required block/grid sizes:
    block_dim_t, grid_dim_t = \
                 cumisc.select_block_grid_sizes(dev, Nt, max_threads_per_block)

    # Reconstruct signal:
    compute_u(u_rec_gpu,
              c_gpu,
              float_type(bw),
              float_type(dt),
              np.int32(M),
              np.uint32(Nt),
              block=block_dim_t,
              grid=grid_dim_t)

    # Only the real part of the complex reconstruction is returned:
    return np.real(u_rec_gpu.get())