Esempio n. 1
0
def get_G_kernel(dtype, dtypew):
    template = """
__device__ double
G_dirichlet_time(double tk1, double tk2, double tl1,double tl2, int M, double WM)
{
	double sum = 0;
	for(int m = 1; m <= M; ++m)
	{
		sum += (cos(m * WM * (tk2-tl2)) - cos(m * WM * (tk2-tl1)) - cos( m * WM * (tk1-tl2)) + cos(m * WM * (tk1-tl1))) / (m*m);
	}

	sum = sum * 2 / (WM * WM) + (tk2-tk1)*(tl2-tl1);
	return sum;
}


    __global__ void
compute_G_Kernel(%(type)s* g_G, int G_ld, double* g_tk1, double* g_tk2, double Wt, int Mt, %(typew)s* SWeight, int Sweight_ld, int* neuron_ind)
{
	unsigned int tid = threadIdx.x;
	unsigned int bdim = blockDim.x;
	unsigned int bid = blockIdx.x;
	int size = gridDim.x;

	double tl[2];
	__shared__ double tk[2];
	__shared__ int ind1;
	int ind2;
	
	
        if(tid == 0)
        {
                tk[0] = g_tk1[bid]; //roundf(g_tk1[bid] * rintf(1 / dt));
        }else if(tid == 1)
        {
                tk[1] = g_tk2[bid]; //roundf(g_tk2[bid] * rintf(1 / dt));
        }else if(tid ==2)
        {
                ind1 = neuron_ind[bid];
        }
                        
        __syncthreads();

        
        for(int i = tid; i < size; i += bdim)
        {
            ind2 = neuron_ind[i];
            tl[0] = g_tk1[i];
            tl[1] = g_tk2[i];
            
            g_G[bid * G_ld + i] = G_dirichlet_time(tk[0],tk[1],tl[0],tl[1],Mt,Wt/Mt) * SWeight[ind1 * Sweight_ld + ind2];

        }	
    
}

"""
    func = func_compile("compute_G_Kernel",  template  % {"type": dtype_to_ctype(dtype), "typew": dtype_to_ctype(dtypew)})
    return func
Esempio n. 2
0
File: vrf.py Progetto: bionet/vtem
def get_fft2Ds_kernel(dtype=np.dtype(np.complex128)):
    fft2Ds_template = """
    #include <pycuda/pycuda-complex.hpp>
    
    __global__ void
fft2Ds_kernel(%(type)s* d_Ds, %(type)s* d_gabor_fft, int Mx, int My, int LPx, int LPy, int N_filters, int pitch, double dxdy)
{
	const int tid = threadIdx.x;
	const int mx = blockIdx.x - Mx;
	const int my = blockIdx.y - My;

	int fftind;
    double sgndxdy;
	
	if(mx < 0 && my <0)
	{
		fftind = LPx * (LPy + my) + (LPx + mx);
	}else if(my < 0 && mx >= 0)
	{
		fftind = mx + (LPx * (LPy + my));
	}else if(mx >= 0 && my >=0)
	{
		fftind = my*LPx + mx;
	}else//(my >= 0 && mx < 0)
	{
		fftind = my * LPx + (LPx + mx);
	}

	if( (mx+my)%%2 ==0)
	{
		sgndxdy = dxdy;
	}else
	{
		sgndxdy = -dxdy;
	}

	%(type)s tmp;
    
    if(mx*mx * My * My + my*my * Mx * Mx <= Mx*Mx*My*My )
    {
        for(int i = tid; i < N_filters; i+=blockDim.x)
        {
            tmp = d_gabor_fft[i * LPx * LPy + fftind];

            d_Ds[i * pitch + (My-my)*(2*Mx+1) + (mx+Mx)] = %(type)s(sgndxdy * pycuda::real(tmp), sgndxdy * pycuda::imag(tmp));
        }
    }else
    {
        for(int i = tid; i < N_filters; i+=blockDim.x)
        {
            d_Ds[i * pitch + (My - my)*(2*Mx+1) + (mx+Mx)] = 0;
        }
    }

}
    """
    func = func_compile("fft2Ds_kernel", fft2Ds_template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 3
0
File: vrf.py Progetto: bionet/vtem
def get_Ds2fft_kernel(dtype):
    template = """
        #include <pycuda/pycuda-complex.hpp>
        __global__ void
        Ds2fft_kernel(%(type)s* d_Ds, int Ds_ld, %(type)s* filter, int filter_ld, int Mx, int My, int Px, int Py, int N_filters)
        {
        const int tid = threadIdx.x;
        const int mx = blockIdx.x - Mx;
        const int my = blockIdx.y - My;
        
        int fftind;
        double sgndxdy;
        
        if(mx < 0 && my <0)
        {
        fftind = Px * (Py + my) + (Px + mx);
        }else if(my < 0 && mx >= 0)
        {
        fftind = mx + (Px * (Py + my));
        }else if(mx >= 0 && my >=0)
        {
        fftind = my*Px + mx;
        }else//(my > 0 && mx < 0)
        {
        fftind = my * Px + (Px + mx);
        }  
        
        if( (mx+my)%%2 ==0)
        {
        sgndxdy = 1;
        }else
        {
        sgndxdy = -1;
        }
        
        %(type)s tmp;
        
        
        for(int i = tid; i < N_filters; i+=blockDim.x)
        {
        tmp = d_Ds[i * Ds_ld + (My-my)*(2*Mx+1) + (mx+Mx)];
        filter[i * filter_ld + fftind] = %(type)s(sgndxdy * pycuda::real(tmp),sgndxdy * pycuda::imag(tmp));
        }
        }
        
        """
    func = func_compile("Ds2fft_kernel", template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 4
0
def get_diag_add_kernel(dtype):
    template = """
__global__ void
diag_add_Kernel(%(type)s* d_G, int ld, int size, %(type)s addin)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int total = gridDim.x * blockDim.x;
	
	for(int i = tid; i < size; i+=total)
	{
		d_G[i * ld + i] += addin;
	}

}
"""
    func = func_compile("diag_add_Kernel",  template  % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 5
0
def get_diag_add_kernel(dtype):
    template = """
__global__ void
diag_add_Kernel(%(type)s* d_G, int ld, int size, %(type)s addin)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int total = gridDim.x * blockDim.x;
	
	for(int i = tid; i < size; i+=total)
	{
		d_G[i * ld + i] += addin;
	}

}
"""
    func = func_compile("diag_add_Kernel",
                        template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 6
0
File: vrf.py Progetto: bionet/vtem
def get_filter2rec_kernel(dtype):
    if dtype == np.complex128:
        dtypef = np.float64
    else:
        dtypef = np.float32
    
    template = """
        #include <pycuda/pycuda-complex.hpp>
        #define BLOCK_SIZE 16    
        
        __global__ void
        filter2rec_kernel(%(typef)s* d_dirich_space, int dirich_ld, %(type)s* d_filter_complex, int filter_ld, int N_filters, int Px, int Py, int Px1, int Py1)
        {
        unsigned int tid_x; tid_x = threadIdx.x;
        unsigned int tid_y; tid_y = threadIdx.y;
        
        int xld = (Px-1)/BLOCK_SIZE + 1;
        unsigned int bid_x = blockIdx.x %% xld; 
        unsigned int bid_y = blockIdx.x / xld; 
        
        unsigned int filter_id = blockIdx.y;
        
        unsigned int dim_x = blockDim.x;
        unsigned int dim_y = blockDim.y;
        
        int pix_x = dim_x * bid_x + tid_x;
        int pix_y = dim_y * bid_y + tid_y;
        
        int xdrift = (Px1 - Px)/2;
        int ydrift = (Py1 - Py)/2;
        
        int input_ind = (pix_y + ydrift) * Px1 + pix_x + xdrift + filter_id * filter_ld;
        int output_ind = (pix_y) * Px + (pix_x) + filter_id * dirich_ld;
        
        if(pix_x < Px && pix_y < Py)
        {
        d_dirich_space[output_ind] = pycuda::real(d_filter_complex[input_ind]);
        }
        
        }
        """
    func = func_compile("filter2rec_kernel", template % {"type": dtype_to_ctype(dtype), "typef": dtype_to_ctype(dtypef)})
    return func
Esempio n. 7
0
def get_compute_q_kernel(dtype):
    template = """
__global__ void
compute_q_Kernel(%(type)s* g_q, double* g_tk1, double* g_tk2, int* neuron_ind, double* kappa, double* delta, double* bias, double* d_norm, int size)
{
	int ind = blockIdx.x * blockDim.x + threadIdx.x;
    int total_threads = blockDim.x * gridDim.x;

	for(int i = ind; i < size; i += total_threads)
	{
		int neuron = neuron_ind[i];

		g_q[i] = (kappa[neuron] * delta[neuron] - bias[neuron] * (g_tk2[i] - g_tk1[i])) * d_norm[neuron];
	}
}

"""
    func = func_compile("compute_q_Kernel", template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 8
0
def get_compute_q_kernel(dtype):
    template = """
__global__ void
compute_q_Kernel(%(type)s* g_q, double* g_tk1, double* g_tk2, int* neuron_ind, double* kappa, double* delta, double* bias, double* d_norm, int size)
{
	int ind = blockIdx.x * blockDim.x + threadIdx.x;
    int total_threads = blockDim.x * gridDim.x;

	for(int i = ind; i < size; i += total_threads)
	{
		int neuron = neuron_ind[i];

		g_q[i] = (kappa[neuron] * delta[neuron] - bias[neuron] * (g_tk2[i] - g_tk1[i])) * d_norm[neuron];
	}
}

"""
    func = func_compile("compute_q_Kernel",
                        template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 9
0
def get_put_norm_kernel(dtype):
    template = """
    __global__ void put_norm(%(type)s* d_SWeight, double* d_norm, int ld)
{
	int tid = threadIdx.x;
	int bid = blockIdx.x;
	int NUM_NEURONS = gridDim.x;
	
	double norm1 = d_norm[bid];
	double norm2;

	for(int i = tid; i < NUM_NEURONS; i += blockDim.x)
	{
		norm2 = d_norm[i];
		d_SWeight[i + bid * ld] *= norm1 * norm2;
	}
}
"""
    func = func_compile("put_norm", template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 10
0
def get_put_norm_kernel(dtype):
    template = """
    __global__ void put_norm(%(type)s* d_SWeight, double* d_norm, int ld)
{
	int tid = threadIdx.x;
	int bid = blockIdx.x;
	int NUM_NEURONS = gridDim.x;
	
	double norm1 = d_norm[bid];
	double norm2;

	for(int i = tid; i < NUM_NEURONS; i += blockDim.x)
	{
		norm2 = d_norm[i];
		d_SWeight[i + bid * ld] *= norm1 * norm2;
	}
}
"""
    func = func_compile("put_norm", template  % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 11
0
def get_put_norm_unaligned_kernel(dtype):
    template = """
    __global__ void
put_norm_unaligned(%(type)s* d_SWeight, int ld, double* d_norm, int* neuron_ind1, int* neuron_ind2, int num_neurons1)
{
	int tid = threadIdx.x;
	int bid = blockIdx.x;
	
    int ind1 = neuron_ind1[bid];
	double norm1 = d_norm[ind1];
    int ind2;
	double norm2;

	for(int i = tid; i < num_neurons1; i += blockDim.x)
	{
        ind2 = neuron_ind2[i];
		norm2 = d_norm[ind2];
		d_SWeight[i + bid * ld] *= norm1 * norm2;
	}
}
"""
    func = func_compile("put_norm_unaligned", template  % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 12
0
def get_put_norm_unaligned_kernel(dtype):
    template = """
    __global__ void
put_norm_unaligned(%(type)s* d_SWeight, int ld, double* d_norm, int* neuron_ind1, int* neuron_ind2, int num_neurons1)
{
	int tid = threadIdx.x;
	int bid = blockIdx.x;
	
    int ind1 = neuron_ind1[bid];
	double norm1 = d_norm[ind1];
    int ind2;
	double norm2;

	for(int i = tid; i < num_neurons1; i += blockDim.x)
	{
        ind2 = neuron_ind2[i];
		norm2 = d_norm[ind2];
		d_SWeight[i + bid * ld] *= norm1 * norm2;
	}
}
"""
    func = func_compile("put_norm_unaligned",
                        template % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 13
0
File: vrf.py Progetto: bionet/vtem
def get_dirich_space_kernel(dtype = np.dtype(np.complex128), typef = np.dtype(np.float64)):
    dirich_template = """
    #include <pycuda/pycuda-complex.hpp>
    #define BLOCK_SIZE 16

    
    __device__ pycuda::complex<float> Iexpf(const float x)
{
	float s,c;
	sincosf(x, &s, &c);
	return pycuda::complex<float>( c, s);
}

__device__ pycuda::complex<double> Iexp(const double x)
{
	double s,c;
	sincos(x, &s, &c);
	return pycuda::complex<double>( c, s);
}
	
	__global__ void
psi_dirich_space_Kernel(%(typef)s* dirich_space, int dirich_ld, %(type)s* Ds, int Ds_ld, int Px, int Py, int Mx, int My, double Sx, double Sy, double x_start, double y_start, double WMx, double WMy)
{
	unsigned int filter_id = blockIdx.y; //actually y index;


	extern __shared__ %(type)s s_Ds[];

	unsigned int tid_x = threadIdx.x;
	unsigned int tid_y = threadIdx.y;

	int xld = (Px-1)/BLOCK_SIZE+1;
	
	unsigned int bid_x = blockIdx.x %% xld; 
	unsigned int bid_y = blockIdx.x / xld; 

	unsigned int dim_x = blockDim.x;
	unsigned int dim_y = blockDim.y;


	double x, y;
	int pix_x = dim_x * bid_x + tid_x;
	int pix_y = dim_y * bid_y + tid_y;
	
	// degree per pixel
	double dxdy;
	
	dxdy = (double)(Sx / (Px ));
	x =  (double)(pix_x) * dxdy + x_start;
	dxdy = (double)(Sy / (Py ));
	y =  -((double)(pix_y) * dxdy + y_start);

	%(type)s sum = %(type)s(0,0);
	int a = 0;

	for(int my = -My; my <= My; ++my)
	{
		for(int i = threadIdx.x + threadIdx.y * BLOCK_SIZE; i < (2*Mx+1); i += BLOCK_SIZE*BLOCK_SIZE)
		{
			s_Ds[i] = Ds[filter_id * Ds_ld + i + (my+My) * (2*Mx+1)];
		}
		__syncthreads();
		
		a = 0;
		for(int mx = -Mx; mx <= Mx; ++mx)
		{
			if( mx*mx * My * My + my*my * Mx * Mx <= Mx*Mx*My*My)// && mx*mx + my*my > 0)
			{
				sum += s_Ds[a] * %(exp)s(mx * WMx * x + my * WMy * y);
			}
			++a;
		}
		__syncthreads();
	}

        if(pix_x < Px && pix_y < Py)
        {
            int output_ind = (pix_y) * Px + (pix_x) + filter_id * dirich_ld;
            dirich_space[output_ind] = pycuda::real(sum);
        }
}

"""
	
    if typef == np.float64:
        Iexp = "Iexp"
    else:
        Iexp = "Iexpf"
    
    func = func_compile("psi_dirich_space_Kernel", dirich_template % {"type": dtype_to_ctype(dtype), "typef": dtype_to_ctype(typef), "exp": Iexp})
    return func
Esempio n. 14
0
def get_reconstruct_kernel(dtype, dtypeq):
    template = """
__global__ void
reconstruct_Kernel(double* u_rec, int u_rec_ld, %(type)s* dirich_space, int dirich_ld, double* g_tk1, double* g_tk2, %(typeq)s* g_ck, double* d_t, int* neuron_ind, double* d_norm, int M, double WM, int size)
{
	unsigned int tid = threadIdx.x;
	unsigned int bid = blockIdx.x;
	unsigned int bdim = blockDim.x;
	unsigned int pix = bid*bdim + tid;

	__shared__ double t;
	
	double u = 0;
	__shared__ double ck[128];
	__shared__ double tk1[128];
	__shared__ double tk2[128];
	__shared__ int ind[128];
	double space;
	double norm;
	
	if(tid == 0)
	{
		t = d_t[blockIdx.y];
	}
	
	
	for(unsigned int i = 0; i < size; i+=bdim)
	{
		if(i + tid < size)
		{
			ck[tid] = g_ck[i + tid];
			tk1[tid] = g_tk1[i + tid];
			tk2[tid] = g_tk2[i + tid];
			ind[tid] = neuron_ind[i + tid];
			
	
		}
		__syncthreads();
		
		for(unsigned int j = 0; j < bdim; ++j)
		{
			if(j + i < size)
			{
				space = dirich_space[ind[j] * dirich_ld + pix];
				norm = d_norm[ind[j]];
				double phi = 0;
				for(int m = 1; m <= M; ++m)
				{
					phi += (sin(m*WM*(t-tk1[j])) - sin(m*WM*(t - tk2[j]))) / m;
				}
				u += ck[j] * (phi * 2 / WM + tk2[j] - tk1[j]) * space * norm;
				
			}
		}
		__syncthreads();
	}
	
	if(pix < u_rec_ld)
	{
            u_rec[pix + u_rec_ld * blockIdx.y] = u;
        }

}

"""
    func = func_compile(
        "reconstruct_Kernel", template % {
            "type": dtype_to_ctype(dtype),
            "typeq": dtype_to_ctype(dtypeq)
        })
    return func
Esempio n. 15
0
def get_IAF_kernel_linear_rt(dtype = np.float64):
    IAF_linear = """
        
        __global__ void
        ensemble_encode(%(type)s* g_input,
        int input_ld,
        int num_neurons,
        int size,
        %(type)s* g_spike,
        int spike_ld,
        %(type)s* g_v0,
        %(type)s* g_kappa,
        %(type)s* g_bias,
        %(type)s* g_delta,
        %(type)s* g_time_count,
        int* g_spike_count,
        int max_spike,
        %(type)s dt,
        %(type)s* g_delta_value,
        %(type)s* g_sigma)  
        {
        int tid = threadIdx.x;
        int bdim = blockDim.x;
        int bid = blockIdx.x;
        int btIdx_to_neuIdx = bdim * bid + tid;
        
        
        %(type)s kappa, bias, ddt, v0, time_count;
        %(type)s delta,sigma, delta_mean;
        int spike_count;
        
        
        if(btIdx_to_neuIdx<num_neurons)
        {
        
        
        kappa=g_kappa[btIdx_to_neuIdx];
        
        bias=g_bias[btIdx_to_neuIdx];
        
        ddt=dt;
        v0=g_v0[btIdx_to_neuIdx];
        spike_count=0;
        time_count=g_time_count[btIdx_to_neuIdx];
        
        delta_mean=g_delta[btIdx_to_neuIdx];
        delta = g_delta_value[btIdx_to_neuIdx];
        sigma = g_sigma[btIdx_to_neuIdx];
        }
        
        
        for(unsigned int i = 0; i < size - 1; ++i)
        {
        if(btIdx_to_neuIdx < num_neurons)
        {
        %(type)s y1 = (g_input[i * input_ld + btIdx_to_neuIdx] + bias) / kappa;
        %(type)s y2 = (g_input[(i+1) * input_ld + btIdx_to_neuIdx] + bias) / kappa;
        
        if(y2 >= 0 || (y2<0 && y1<=0) )
        {
        %(type)s area = (y1 + y2) / 2 * ddt;
        
        while(v0 + area >= delta)
        {
        %(type)s a = ( (y2-y1)/ ddt);
        
        %(type)s remain = (fabs(a) <= 1e-12)? ((delta - v0) / y1) : (sqrt(y1 * y1 + 2 * (delta - v0) * a) - y1)/ (a);
        time_count += remain;
        delta = g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] * sigma + delta_mean;
        g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] = time_count;
        
        spike_count++;
        area -= (delta - v0);
        v0 = 0;
        ddt -= remain; 
        y1 = y1 + remain * a; // y1 += remain * (y2-y1)/ddt;
        }
        
        v0 += area;
        time_count += ddt;
        ddt = dt;
        }else
        {
        %(type)s a = ( (y2-y1)/ ddt);
        %(type)s remain;
        while((remain = y1*y1 + 2*(delta - v0)*a) >= 0)
        {
        remain = (sqrt(remain) - y1) / a;
        
        time_count += remain;
        delta = g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] * sigma + delta_mean;
        g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] = time_count;
        
        spike_count++;
        v0 = 0;
        ddt -= remain; 
        y1 = y1 + remain * a;
        a = ( (y2-y1) /ddt);
        }
        v0 += (y1+ y2) / 2 * ddt;
        time_count += ddt;
        ddt = dt;
        }
        
        
        }
        }
        
        __syncthreads();
        
        g_v0[btIdx_to_neuIdx]=v0;
        g_time_count[btIdx_to_neuIdx]=time_count;
        g_spike_count[btIdx_to_neuIdx]=spike_count;
        g_delta_value[btIdx_to_neuIdx] = delta;
        
        
        }
        """
    func = func_compile("ensemble_encode", IAF_linear % {"type": dtype_to_ctype(dtype)})
    return func
Esempio n. 16
0
def get_G_kernel(dtype, dtypew):
    template = """
__device__ double
G_dirichlet_time(double tk1, double tk2, double tl1,double tl2, int M, double WM)
{
	double sum = 0;
	for(int m = 1; m <= M; ++m)
	{
		sum += (cos(m * WM * (tk2-tl2)) - cos(m * WM * (tk2-tl1)) - cos( m * WM * (tk1-tl2)) + cos(m * WM * (tk1-tl1))) / (m*m);
	}

	sum = sum * 2 / (WM * WM) + (tk2-tk1)*(tl2-tl1);
	return sum;
}


    __global__ void
compute_G_Kernel(%(type)s* g_G, int G_ld, double* g_tk1, double* g_tk2, double Wt, int Mt, %(typew)s* SWeight, int Sweight_ld, int* neuron_ind)
{
	unsigned int tid = threadIdx.x;
	unsigned int bdim = blockDim.x;
	unsigned int bid = blockIdx.x;
	int size = gridDim.x;

	double tl[2];
	__shared__ double tk[2];
	__shared__ int ind1;
	int ind2;
	
	
        if(tid == 0)
        {
                tk[0] = g_tk1[bid]; //roundf(g_tk1[bid] * rintf(1 / dt));
        }else if(tid == 1)
        {
                tk[1] = g_tk2[bid]; //roundf(g_tk2[bid] * rintf(1 / dt));
        }else if(tid ==2)
        {
                ind1 = neuron_ind[bid];
        }
                        
        __syncthreads();

        
        for(int i = tid; i < size; i += bdim)
        {
            ind2 = neuron_ind[i];
            tl[0] = g_tk1[i];
            tl[1] = g_tk2[i];
            
            g_G[bid * G_ld + i] = G_dirichlet_time(tk[0],tk[1],tl[0],tl[1],Mt,Wt/Mt) * SWeight[ind1 * Sweight_ld + ind2];

        }	
    
}

"""
    func = func_compile(
        "compute_G_Kernel", template % {
            "type": dtype_to_ctype(dtype),
            "typew": dtype_to_ctype(dtypew)
        })
    return func
Esempio n. 17
0
File: vrf.py Progetto: bionet/vtem
def get_gabor_kernel(dtype=np.dtype(np.float64)):
    gabor_template = """
    #include <pycuda/pycuda-complex.hpp>
    #define PI 3.141592653589793238462643383279
    #define BLOCK_SIZE 16
    
    __global__ void
gabor_Kernel(%(type)s* g_filter, int filter_ld, double* g_m, double* g_l, double* g_x0, double* g_y0, int* g_ab, int Px, int Py, double Sx, double Sy, double x_start, double y_start, double KAPPA) 
{
	unsigned int tid_x = threadIdx.x;
	unsigned int tid_y = threadIdx.y;	

        int xld = (Px-1)/BLOCK_SIZE + 1;
	unsigned int bid_x = blockIdx.x %% xld; 
	unsigned int bid_y = blockIdx.x / xld; 

	unsigned int filter_id = blockIdx.y; //actually y index;

	unsigned int dim_x = blockDim.x;
	unsigned int dim_y = blockDim.y;


	__shared__ double alpha;
	__shared__ double theta;
	__shared__ double x0;
	__shared__ double y0;
	__shared__ int sc;
	
    if(tid_y == 0)
    {
        if(tid_x == 0)
        {
            alpha = g_m[filter_id];
        }else if(tid_x == 1)
        {
            theta = g_l[filter_id];
        }else if(tid_x == 2)
        {
            x0 = g_x0[filter_id];
        }else if(tid_x == 3)
        {
            y0 = g_y0[filter_id];
        }else if(tid_x == 4)
        {
            sc = g_ab[filter_id];
        }
	}
	__syncthreads();
	
	double x, y;
	int pix_x = dim_x * bid_x + tid_x;
	int pix_y = dim_y * bid_y + tid_y;
	
	
	// degree per pixel
	double dxdy;
	
	dxdy = (double)(Sx / (Px));
	x =  (double)(pix_x) * dxdy + x_start;
	dxdy = (double)(Sy / (Py));
	y =  - ((double)(pix_y) * dxdy + y_start);
	

	x = alpha * (x - x0);
	y = alpha * (y - y0);

	double sint, cost;

	sincos(theta, &sint, &cost);

	double X = x * cost + y * sint;
	double Y = -x * sint + y * cost;
	double first_part = alpha * (1/sqrt(2 * PI)) * exp (- ( (4 * X * X)  + (Y * Y)) / 8);
	
		
	sincos(KAPPA * X, &sint, &cost);
	double gb;
	if(sc == 0)
	{
		gb = first_part * cost;
	}else
	{
		gb = first_part * sint;
	}

	if(pix_x < Px && pix_y < Py)
        {
            int output_ind = (pix_y) * Px + (pix_x) + filter_id * filter_ld;
            g_filter[output_ind] = gb;
	}
}
    """
    
    func = func_compile("gabor_Kernel", gabor_template % {"type": dtype_to_ctype(dtype)}, options=["--ptxas-options=-v --maxrregcount=32"])
    return func
Esempio n. 18
0
def get_reconstruct_kernel(dtype, dtypeq):
    template = """
__global__ void
reconstruct_Kernel(double* u_rec, int u_rec_ld, %(type)s* dirich_space, int dirich_ld, double* g_tk1, double* g_tk2, %(typeq)s* g_ck, double* d_t, int* neuron_ind, double* d_norm, int M, double WM, int size)
{
	unsigned int tid = threadIdx.x;
	unsigned int bid = blockIdx.x;
	unsigned int bdim = blockDim.x;
	unsigned int pix = bid*bdim + tid;

	__shared__ double t;
	
	double u = 0;
	__shared__ double ck[128];
	__shared__ double tk1[128];
	__shared__ double tk2[128];
	__shared__ int ind[128];
	double space;
	double norm;
	
	if(tid == 0)
	{
		t = d_t[blockIdx.y];
	}
	
	
	for(unsigned int i = 0; i < size; i+=bdim)
	{
		if(i + tid < size)
		{
			ck[tid] = g_ck[i + tid];
			tk1[tid] = g_tk1[i + tid];
			tk2[tid] = g_tk2[i + tid];
			ind[tid] = neuron_ind[i + tid];
			
	
		}
		__syncthreads();
		
		for(unsigned int j = 0; j < bdim; ++j)
		{
			if(j + i < size)
			{
				space = dirich_space[ind[j] * dirich_ld + pix];
				norm = d_norm[ind[j]];
				double phi = 0;
				for(int m = 1; m <= M; ++m)
				{
					phi += (sin(m*WM*(t-tk1[j])) - sin(m*WM*(t - tk2[j]))) / m;
				}
				u += ck[j] * (phi * 2 / WM + tk2[j] - tk1[j]) * space * norm;
				
			}
		}
		__syncthreads();
	}
	
	if(pix < u_rec_ld)
	{
            u_rec[pix + u_rec_ld * blockIdx.y] = u;
        }

}

"""
    func = func_compile("reconstruct_Kernel",  template  % {"type": dtype_to_ctype(dtype), "typeq": dtype_to_ctype(dtypeq)})
    return func
Esempio n. 19
0
File: vrf.py Progetto: bionet/vtem
def get_cs_kernel(dtype=np.dtype(np.float64)):
    gabor_template = """
    #include <pycuda/pycuda-complex.hpp>
    #define PI 3.141592653589793238462643383279
    #define BLOCK_SIZE 16
    
    __global__ void
cs_Kernel(%(type)s* g_filter, int filter_ld, double* g_m, double* g_x0, double* g_y0, int Px, int Py, double Sx, double Sy, double x_start, double y_start, double sigma_c_square, double sigma_s_square) 
{
	unsigned int tid_x = threadIdx.x;
	unsigned int tid_y = threadIdx.y;	

        int xld = (Px-1)/BLOCK_SIZE + 1;
	unsigned int bid_x = blockIdx.x %% xld; 
	unsigned int bid_y = blockIdx.x / xld; 

	unsigned int filter_id = blockIdx.y; //actually y index;

	unsigned int dim_x = blockDim.x;
	unsigned int dim_y = blockDim.y;


	__shared__ double alpha;
	__shared__ double x0;
	__shared__ double y0;
	
    if(tid_y == 0)
    {
        if(tid_x == 0)
        {
            alpha = g_m[filter_id];
        }else if(tid_x == 1)
        {
            x0 = g_x0[filter_id];
        }else if(tid_x == 2)
        {
            y0 = g_y0[filter_id];
        }
	}
	__syncthreads();
	
	double x, y;
	int pix_x = dim_x * bid_x + tid_x;
	int pix_y = dim_y * bid_y + tid_y;
	
	
	// degree per pixel
	double dxdy;
	
	dxdy = (double)(Sx / (Px));
	x =  (double)(pix_x) * dxdy + x_start;
	dxdy = (double)(Sy / (Py));
	y =  - ((double)(pix_y) * dxdy + y_start);
	

	x = alpha * (x - x0);
	y = alpha * (y - y0);

	double XY = -(x*x + y*y);
    
	double gb = (alpha) * ( exp ( XY / (2 * sigma_c_square)) / (sigma_c_square) - 0.9 * exp ( XY / (2 * sigma_s_square)) / (sigma_s_square));
	
    

	if(pix_x < Px && pix_y < Py)
        {
            int output_ind = (pix_y) * Px + (pix_x) + filter_id * filter_ld;
            g_filter[output_ind] = gb / (2*PI);
	}
}
    """
    
    func = func_compile("cs_Kernel", gabor_template % {"type": dtype_to_ctype(dtype)}, options=["--ptxas-options=-v --maxrregcount=32"])
    return func