mod = SourceModule(""" __global__ void Fun_snr(float *S_td, float *f_picture,int *N,int *Modesize) { const int Imgw = N[0,0]; const int modesize = Modesize[0,0]; const int i = blockIdx.x * blockDim.x + threadIdx.x; float s_td=0.0; float sum=0.0; if(blockIdx.x <4060&& !((i+1)%Imgw/(Imgw-modesize+1))) { for(int p=0 ; p<modesize ; p++) for(int q=0 ; q<modesize ; q++) { sum =sum + f_picture[ (blockIdx.x+Imgw/blockDim.x*p) * blockDim.x + threadIdx.x + q]; } sum = sum /pow(float(modesize),2); for(int p=0 ; p<modesize ; p++) for(int q=0 ; q<modesize ; q++) { s_td =s_td + pow((f_picture[ (blockIdx.x+Imgw/blockDim.x*p) * blockDim.x + threadIdx.x + q]-sum),2); } S_td[i]= sqrt(s_td / pow(float(modesize),2)); } else { S_td[i]=0; } } __global__ void Fun_ROAD(float *ROAD, float *f_picture,int *N) { const int Imgw = N[0,0]; const int i = blockIdx.x * blockDim.x + threadIdx.x; if(blockIdx.x <4088&& !((i+1)%Imgw/(Imgw-2))) { float a[9]; int a_i=1; for(int p=0 ; p<3 ; p++) for(int q=0 ; q<3; q++) { a[a_i]=f_picture[(blockIdx.x+Imgw/blockDim.x*p) * blockDim.x + threadIdx.x + q]-f_picture[(blockIdx.x+Imgw/blockDim.x*1) * blockDim.x + threadIdx.x + 1]; a[a_i]=abs(a[a_i]); a_i+=1; } { int ii,j; float t; for(ii=0;ii<8;ii++) for(j=ii+1;j<9;j++) { if(a[ii]>a[j]) { t=a[ii]; a[ii]=a[j]; a[j]=t; } } } float a_sum=0; for(int kk=0;kk<5;kk++) a_sum=a_sum+a[kk]; ROAD[i]= a_sum; } else { ROAD[i]=0; } __syncthreads(); } """)
import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import SourceModule mod = SourceModule(""" #include <stdio.h> __global__ void hello_world(void) { printf("Hello world from block (%d, %d) thread (%d, %d, %d)\\n", blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, threadIdx.z); } """) hello_world = mod.get_function('hello_world') hello_world(block=(1, 2, 3), grid=(2, 1))
import os import pycuda.driver as drv import numpy from pycuda.compiler import SourceModule mod = SourceModule(""" __global__ void multiply_them(float *dest, float *a, float *b) { const int i = threadIdx.x; dest[i] = a[i] * b[i]; } """) multiply_them = mod.get_function("multiply_them") a = numpy.random.randn(400).astype(numpy.float32) b = numpy.random.randn(400).astype(numpy.float32) dest = numpy.zeros_like(a) multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(400, 1, 1), grid=(1, 1)) print(dest - a * b)
from pycuda.compiler import SourceModule import numpy as np from sklearn.neighbors import NearestNeighbors import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D # NOQA import time from constants import Constant with open("src/source.cu", "r") as file: source = file.read() module_calculate_consts = SourceModule(source) # NOQA const = Constant() nn = NearestNeighbors(n_neighbors=const.num_neighbhours) timer = np.zeros([100, 7]) calc_density = module_calculate_consts.get_function("calc_density") calc_forces = module_calculate_consts.get_function("calc_forces") update_pos = module_calculate_consts.get_function("update_pos") r1 = (np.random.rand(const.num_particles, 3).astype(np.float32) - np.array([0.5, 0.5, 0.5]).astype(np.float32)) * 10 v = np.zeros([const.num_particles, 3]).astype(np.float32) plt.ion() fig = plt.figure() ax = fig.add_subplot(111, projection='3d') for iteration in range(100): timer[iteration][0] = time.time()
from pycuda_test_utils import ( PyCudaImplementation, get_free_bytes, get_time, time_function, free_memory_pool, get_median_filter_string, ) from write_and_read_results import ARRAY_SIZES, write_results_to_file LIB_NAME = "pycuda" mode = "sourcemodule" REFLECT_MODE = "reflect" median_filter_module = SourceModule(get_median_filter_string()) median_filter = median_filter_module.get_function("median_filter") def pycuda_median_filter(data, padded_data, filter_height, filter_width): N = 32 median_filter( data, padded_data, np.int32(data.shape[0]), np.int32(data.shape[1]), np.int32(data.shape[2]), np.int32(filter_height), np.int32(filter_width), block=(data.shape[0], data.shape[1], data.shape[2]), )
VecCode = ''' __global__ void vec_ker(int *ints, double *doubles) { int4 f1, f2; f1 = *reinterpret_cast<int4*>(ints); f2 = *reinterpret_cast<int4*>(&ints[4]); printf("First int4: %d, %d, %d, %d\\n", f1.x, f1.y, f1.z, f1.w); printf("Second int4: %d, %d, %d, %d\\n", f2.x, f2.y, f2.z, f2.w); double2 d1, d2; d1 = *reinterpret_cast<double2*>(doubles); d2 = *reinterpret_cast<double2*>(&doubles[2]); printf("First double2: %f, %f\\n", d1.x, d1.y); printf("Second double2: %f, %f\\n", d2.x, d2.y); }''' vec_mod = SourceModule(VecCode) vec_ker = vec_mod.get_function('vec_ker') ints = gpuarray.to_gpu(np.int32([1, 2, 3, 4, 5, 6, 7, 8])) doubles = gpuarray.to_gpu(np.double([1.11, 2.22, 3.33, 4.44])) print('Vectorized Memory Test:') vec_ker(ints, doubles, grid=(1, 1, 1), block=(1, 1, 1))
cu_matrix_kernel = SourceModule(""" #include <math.h> #include <stdio.h> #include "texture_fetch_functions.h" #include "texture_types.h" #define THREADS_PER_BLOCK 256 #define FIT_RADIUS 6 texture<float, cudaTextureType2DLayered, cudaReadModeElementType> tex; __device__ void deconvolve3_columns(int width,int height,int rowstride, float *data,float *buffer,float a,float b) { float *row; float q; int i, j; /* // if (( height < 2) || (rowstride > width)) { // printf("Failure in deconvolve3_rows: height, rowstride, width, a, b = %//d %d %d %f %f\n",height, rowstride, width, a, b ); // return; // } */ if (!height || !width) return; if (height == 1) { q = a + 2.0*b; for (j = 0; j < width; j++) data[j] /= q; return; } if (height == 2) { q = a*(a + 2.0*b); for (j = 0; j < width; j++) { buffer[0] = (a + b)/q*data[j] - b/q*data[rowstride + j]; data[rowstride + j] = (a + b)/q*data[rowstride + j] - b/q*data[j]; data[j] = buffer[0]; } return; } /* Special-case first row */ buffer[0] = a + b; /* Inner rows */ for (i = 1; i < height-1; i++) { q = b/buffer[i-1]; buffer[i] = a - q*b; row = data + (i - 1)*rowstride; for (j = 0; j < width; j++) row[rowstride + j] -= q*row[j]; } /* Special-case last row */ q = b/buffer[i-1]; buffer[i] = a + b*(1.0 - q); row = data + (i - 1)*rowstride; for (j = 0; j < width; j++) row[rowstride + j] -= q*row[j]; /* Go back */ row += rowstride; for (j = 0; j < width; j++) row[j] /= buffer[i]; do { i--; row = data + i*rowstride; for (j = 0; j < width; j++) row[j] = (row[j] - b*row[rowstride + j])/buffer[i]; } while (i > 0); } __device__ void deconvolve3_rows(int width,int height,int rowstride,float *data, float *buffer,float a,float b) { float *row; float q; int i, j; /* // if (( height < 2) || (rowstride > width)) { // printf("Failure in deconvolve3_rows\n"); // return; // } */ if (!height || !width) return; if (width == 1) { q = a + 2.0*b; for (i = 0; i < height; i++) data[i*rowstride] /= q; return; } if (width == 2) { q = a*(a + 2.0*b); for (i = 0; i < height; i++) { row = data + i*rowstride; buffer[0] = (a + b)/q*row[0] - b/q*row[1]; row[1] = (a + b)/q*row[1] - b/q*row[0]; row[0] = buffer[0]; } return; } /* Special-case first item */ buffer[0] = a + b; /* Inner items */ for (j = 1; j < width-1; j++) { q = b/buffer[j-1]; buffer[j] = a - q*b; data[j] -= q*data[j-1]; } /* Special-case last item */ q = b/buffer[j-1]; buffer[j] = a + b*(1.0 - q); data[j] -= q*data[j-1]; /* Go back */ data[j] /= buffer[j]; do { j--; data[j] = (data[j] - b*data[j+1])/buffer[j]; } while (j > 0); /* Remaining rows */ for (i = 1; i < height; i++) { row = data + i*rowstride; /* Forward */ for (j = 1; j < width-1; j++) row[j] -= b*row[j-1]/buffer[j-1]; row[j] -= b*row[j-1]/buffer[j-1]; /* Back */ row[j] /= buffer[j]; do { j--; row[j] = (row[j] - b*row[j+1])/buffer[j]; } while (j > 0); } } __device__ void resolve_coeffs_2d(int width, int height, int rowstride, float *data) { float *buffer; int max; max = width > height ? width : height; buffer = (float *)malloc(max*sizeof(float)); deconvolve3_rows(width, height, rowstride, data, buffer, 13.0/21.0, 4.0/21.0); deconvolve3_columns(width, height, rowstride, data, buffer, 13.0/21.0, 4.0/21.0); free(buffer); } __device__ float interpolate_2d(float x,float y,int rowstride,float *coeff) { float wx[4], wy[4]; int i, j; float v, vx; /* // if (x < 0.0 || x > 1.0 || y < 0.0 || y > 1.0) { // printf("interpolate_2d: x or y out of bounds %f %f\n",x,y); // return(-1.0); // } */ wx[0] = 4.0/21.0 + (-11.0/21.0 + (0.5 - x/6.0)*x)*x; wx[1] = 13.0/21.0 + (1.0/14.0 + (-1.0 + x/2.0)*x)*x; wx[2] = 4.0/21.0 + (3.0/7.0 + (0.5 - x/2.0)*x)*x; wx[3] = (1.0/42.0 + x*x/6.0)*x; wy[0] = 4.0/21.0 + (-11.0/21.0 + (0.5 - y/6.0)*y)*y; wy[1] = 13.0/21.0 + (1.0/14.0 + (-1.0 + y/2.0)*y)*y; wy[2] = 4.0/21.0 + (3.0/7.0 + (0.5 - y/2.0)*y)*y; wy[3] = (1.0/42.0 + y*y/6.0)*y; v = 0.0; for (i = 0; i < 4; i++) { vx = 0.0; for (j = 0; j < 4; j++) vx += coeff[i*rowstride + j]*wx[j]; v += wy[i]*vx; } return v; } __device__ float integrated_profile(int profile_type, int idx, int idy, float xpos, float ypos, float *psf_parameters, float *lut_0, float *lut_xd, float *lut_yd) { int psf_size; float psf_height, psf_sigma_x, psf_sigma_y, psf_xpos, psf_ypos; float p0; int ip, jp; float pi=3.14159265,fwtosig=0.8493218; psf_size = (int) psf_parameters[0]; psf_height = psf_parameters[1]; psf_sigma_x = psf_parameters[2]; psf_sigma_y = psf_parameters[3]; psf_ypos = psf_parameters[4]; psf_xpos = psf_parameters[5]; if (profile_type == 0) { // gaussian // PSF at location (Idx,Idy). PSF is centred at (7.5,7.5) // Analytic part p0 = 0.5*psf_height*pi*fwtosig*fwtosig* (erff((idx-7.5+0.5)/(1.41421356*psf_sigma_x)) - erff((idx-7.5-0.5)/(1.41421356*psf_sigma_x))) * (erff((idy-7.5+0.5)/(1.41421356*psf_sigma_y)) - erff((idy-7.5-0.5)/(1.41421356*psf_sigma_y))); // Index into the lookup table ip = psf_size/2 + 2*idx - 15; jp = psf_size/2 + 2*idy - 15; if ((ip>=0) && (ip<=psf_size-1) && (jp>=0) && (jp<=psf_size-1)) { p0 += lut_0[ip+psf_size*jp] + lut_xd[ip+psf_size*jp]*(xpos-psf_xpos) + lut_yd[ip+psf_size*jp]*(ypos-psf_ypos); } return p0; } else if (profile_type == 1) { // moffat25 // From iraf/noao/digiphot/daophot/daolib/profile.x float d[4][4] = {{ 0.0, 0.0, 0.0, 0.0}, {-0.28867513, 0.28867513, 0.0, 0.0}, {-0.38729833, 0.0, 0.38729833, 0.0}, {-0.43056816, -0.16999052, 0.16999052, 0.43056816}}; float w[4][4] = {{1.0, 0.0, 0.0, 0.0}, {0.5, 0.5, 0.0, 0.0}, {0.27777778, 0.44444444, 0.27777778, 0.0}, {0.17392742, 0.32607258, 0.32607258, 0.17392742}}; float alpha = 0.3195079; float p1sq, p2sq, p1p2, dx, dy, xy, denom, func, x[4], xsq[4], p1xsq[4]; float y, ysq, p2ysq, wt, p4fod, wp4fod, wf; int npt, ix, iy; p1sq = psf_parameters[2]*psf_parameters[2]; p2sq = psf_parameters[3]*psf_parameters[3]; p1p2 = psf_parameters[2]*psf_parameters[3]; dx = idx-7.5+0.5; dy = idy-7.5+0.5; xy = dx * dy; denom = 1.0 + alpha * (dx*dx/p1sq + dy*dy/p2sq + xy*psf_parameters[4]); if (denom > 1.0e4) { return 0.0; } p0 = 0.0; func = 1.0 / (p1p2*powf(float(denom),float(2.5))); if (func >= 0.046) { npt = 4; } else if (func >= 0.0022) { npt = 3; } else if (func >= 0.0001) { npt = 2; } else if (func >= 1.0e-10) { p0 = (2.5 - 1.0) * func; } if (func >= 0.0001) { for (ix=0; ix<npt; ix++) { x[ix] = dx + d[npt][ix]; xsq[ix] = x[ix]*x[ix]; p1xsq[ix] = xsq[ix]/p1sq; } for (iy=0; iy<npt; iy++) { y = dy + d[npt][iy]; ysq = y*y; p2ysq = ysq/p2sq; for (ix=0; ix<npt; ix++) { wt = w[npt][iy] * w[npt][ix]; xy = x[ix] * y; denom = 1.0 + alpha * (p1xsq[ix] + p2ysq + xy*psf_parameters[4]); func = (2.5 - 1.0) / (p1p2 * powf(denom,2.5) ); p4fod = 2.5 * alpha * func / denom; wp4fod = wt * p4fod; wf = wt * func; p0 += wf; } } } p0 *= psf_parameters[1]; // Index into the lookup table ip = psf_size/2 + 2*idx - 15; jp = psf_size/2 + 2*idy - 15; if ((ip>=0) && (ip<=psf_size-1) && (jp>=0) && (jp<=psf_size-1)) { p0 += lut_0[ip+psf_size*jp] + lut_xd[ip+psf_size*jp]*(xpos-psf_xpos) + lut_yd[ip+psf_size*jp]*(ypos-psf_ypos); } return p0; } else { return 0.0; } } __global__ void convolve_image_psf(int profile_type, int nx, int ny, int dx, int dy, int dp, int ds, int n_coeff, int nkernel, int kernel_radius,int *kxindex, int *kyindex, int* ext_basis, float *psf_parameters, float *psf_0, float *psf_xd, float *psf_yd, float *coeff,float *cim1, float* cim2) { int id, txa, tyb, txag, tybg; int np, ns, i, j, ii, ip, jp, ic, ki, a, b; int d1, sidx, l, m, l1, m1, ig, jg; int psf_size, ix, jx; float x, y, p0, p1, p1g, cpsf_pixel, xpos, ypos; float psf_height, psf_sigma_x, psf_sigma_y, psf_sigma_xy, psf_xpos, psf_ypos; float gain,psf_rad,psf_rad2, px, py; float sx2, sy2, sxy2, sx2msy2, sx2psy2; float psf_norm,dd; float pi=3.14159265,fwtosig=0.8493218; __shared__ float psf_sum[256]; __shared__ float cpsf[256]; __shared__ float cpix1[256]; __shared__ float cpix2[256]; // initialise memory id = threadIdx.x+threadIdx.y*16; cpsf[id] = 0.0; // star position in normalised units xpos = blockIdx.x*dx + dx/2; ypos = blockIdx.y*dy + dy/2; x = (xpos - 0.5*(nx-1))/(nx-1); y = (ypos - 0.5*(ny-1))/(ny-1); // number of polynomial coefficients per basis function np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; // PSF parameters psf_size = (int) psf_parameters[0]; psf_height = psf_parameters[1]; psf_sigma_x = psf_parameters[2]; psf_sigma_y = psf_parameters[3]; psf_ypos = psf_parameters[4]; psf_xpos = psf_parameters[5]; psf_rad = psf_parameters[6]; gain = psf_parameters[7]; if (psf_rad > 5.0) { psf_rad = 5.0; } psf_rad2 = psf_rad*psf_rad; // PSF integral __syncthreads(); psf_sum[id] = 0.0; for (i=threadIdx.x+1; i<psf_size-1; i+=blockDim.x) { for (j=threadIdx.y+1; j<psf_size-1; j+=blockDim.y) { psf_sum[id] += psf_0[i+j*psf_size]; } } __syncthreads(); i = 128; while (i != 0) { if (id < i) { psf_sum[id] += psf_sum[id + i]; } __syncthreads(); i /= 2; } __syncthreads(); if (profile_type == 0) { // gaussian psf_norm = 0.25*psf_sum[0] + psf_height*2*pi*fwtosig*fwtosig; } else if (profile_type == 1) { // moffat25 psf_sigma_xy = psf_parameters[8]; sx2 = psf_sigma_x*psf_sigma_x; sy2 = psf_sigma_y*psf_sigma_y; sxy2 = psf_sigma_xy*psf_sigma_xy; sx2msy2 = 1.0/sx2 - 1.0/sy2; sx2psy2 = 1.0/sx2 + 1.0/sy2; px = 1.0/sqrt( sx2psy2 + sqrt(sx2msy2*sx2msy2 + sxy2) ); py = 1.0/sqrt( sx2psy2 - sqrt(sx2msy2*sx2msy2 + sxy2) ); psf_norm = 0.25*psf_sum[0] + psf_height*pi*(px*py)/(psf_sigma_x*psf_sigma_y); } // Construct the convolved PSF // PSF at location (Idx,Idy). PSF is centred at (7.5,7.5) // Analytic part p0 = integrated_profile(profile_type, threadIdx.x, threadIdx.y, xpos, ypos, psf_parameters, psf_0, psf_xd, psf_yd); __syncthreads(); cpsf_pixel = 0.0; // Iterate over coefficients for (ic=0; ic<n_coeff; ic++) { // basis function position ki = ic < np ? 0 : (ic-np)/ns + 1; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; // Set the polynomial degree for the subvector and the // index within the subvector if (ki == 0) { d1 = dp; sidx = ic; } else { d1 = ds; sidx = ic - np - (ki-1)*ns; } // Compute the polynomial index (l,m) values corresponding // to the index within the subvector l1 = m1 = 0; if (d1 > 0) { i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == sidx) { l1 = l; m1 = m; } i++; } } } // Indices into the PSF if (ki > 0) { txa = threadIdx.x + a; tyb = threadIdx.y + b; p1 = integrated_profile(profile_type, txa, tyb, xpos, ypos, psf_parameters, psf_0, psf_xd, psf_yd); __syncthreads(); // If we have an extended basis function, we need to // average the PSF over a 3x3 grid if (ext_basis[ki]) { p1 = 0.0; for (ig=-1; ig<2; ig++) { for (jg=-1; jg<2; jg++) { txag = txa + ig; tybg = tyb + jg; p1g = integrated_profile(profile_type, txag, tybg, xpos, ypos, psf_parameters, psf_0, psf_xd, psf_yd); __syncthreads(); p1 += p1g; } } p1 /= 9.0; } cpsf_pixel += coeff[ic]*(p1-p0)*powf(x,l1)*powf(y,m1); } else { cpsf_pixel += coeff[ic]*p0*powf(x,l1)*powf(y,m1); } } } //end ic loop __syncthreads(); cpsf[id] = cpsf_pixel/psf_norm; __syncthreads(); // Now convolve the image section with the convolved PSF for (i=xpos-dx/2; i<xpos+dx/2; i++) { for (j=ypos-dy/2; j<ypos+dy/2; j++) { ix = (int)floorf(i+0.5)+threadIdx.x-8.0; jx = (int)floorf(j+0.5)+threadIdx.y-8.0; cpix1[id] = cpsf[id]*tex2DLayered(tex,ix,jx,0); cpix2[id] = cpsf[id]*tex2DLayered(tex,ix,jx,1); __syncthreads(); // Parallel sum ii = 128; while (ii != 0) { if (id < ii) { cpix1[id] += cpix1[id + ii]; cpix2[id] += cpix2[id + ii]; } __syncthreads(); ii /= 2; } if (id == 0) { cim1[i+j*nx] = cpix1[0]; cim2[i+j*nx] = cpix2[0]; } __syncthreads(); } } return; } __global__ void cu_photom(int profile_type, int nx, int ny, int dp, int ds, int n_coeff, int nkernel, int kernel_radius,int *kxindex, int *kyindex, int* ext_basis, float *psf_parameters, float *psf_0, float *psf_xd, float *psf_yd, float *posx, float *posy, float *coeff, float *flux, float *dflux) { int id, txa, tyb, txag, tybg; int np, ns, i, j, ip, jp, ic, ki, a, b; int d1, sidx, l, m, l1, m1, ig, jg; int psf_size, ix, jx; float x, y, p0, p1, p1g, cpsf_pixel, xpos, ypos, dd; float psf_height, psf_sigma_x, psf_sigma_y, psf_sigma_xy, psf_xpos, psf_ypos; float psf_rad, psf_rad2, gain, fl, inv_var, px, py; float sx2, sy2, sxy2, sx2msy2, sx2psy2; float subx, suby, psf_norm, bgnd, dr2; float pi=3.14159265,fwtosig=0.8493218, RON=5.0; __shared__ float psf_sum[256]; __shared__ float cpsf[256]; __shared__ float mpsf[256]; __shared__ float fsum1[256]; __shared__ float fsum2[256]; __shared__ float fsum3[256]; __shared__ float fsum4[256]; __shared__ float fsum5[256]; // initialise memory id = threadIdx.x+threadIdx.y*16; cpsf[id] = 0.0; mpsf[id] = 0.0; // star position in normalised units xpos = posx[blockIdx.x]; ypos = posy[blockIdx.x]; x = (xpos - 0.5*(nx-1))/(nx-1); y = (ypos - 0.5*(ny-1))/(ny-1); // number of polynomial coefficients per basis function np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; // PSF parameters psf_size = (int) psf_parameters[0]; psf_height = psf_parameters[1]; psf_sigma_x = psf_parameters[2]; psf_sigma_y = psf_parameters[3]; psf_ypos = psf_parameters[4]; psf_xpos = psf_parameters[5]; psf_rad = psf_parameters[6]; gain = psf_parameters[7]; if (psf_rad > 7.0) { psf_rad = 7.0; } psf_rad2 = psf_rad*psf_rad; // PSF integral __syncthreads(); psf_sum[id] = 0.0; for (i=threadIdx.x; i<psf_size; i+=blockDim.x) { for (j=threadIdx.y; j<psf_size; j+=blockDim.y) { psf_sum[id] += psf_0[i+j*psf_size]; //if (blockIdx.x == 120) { // printf("i, j, id, psf_0: %d %d %d %f\\n",i,j,id,psf_0[i+j*psf_size]); //} } } __syncthreads(); i = 128; while (i != 0) { if (id < i) { psf_sum[id] += psf_sum[id + i]; } __syncthreads(); i /= 2; } __syncthreads(); if (profile_type == 0) { // gaussian psf_norm = 0.25*psf_sum[0] + psf_height*2*pi*fwtosig*fwtosig; //if ((id == 0) && (blockIdx.x==120)){ // printf("psf_sum0, psf_height, psf_norm: %f %f %f\\n",psf_sum[0],psf_height,psf_norm); //} } else if (profile_type == 1) { // moffat25 psf_sigma_xy = psf_parameters[8]; sx2 = psf_sigma_x*psf_sigma_x; sy2 = psf_sigma_y*psf_sigma_y; sxy2 = psf_sigma_xy*psf_sigma_xy; sx2msy2 = 1.0/sx2 - 1.0/sy2; sx2psy2 = 1.0/sx2 + 1.0/sy2; px = 1.0/sqrt( sx2psy2 + sqrt(sx2msy2*sx2msy2 + sxy2) ); py = 1.0/sqrt( sx2psy2 - sqrt(sx2msy2*sx2msy2 + sxy2) ); psf_norm = 0.25*psf_sum[0] + psf_height*pi*(px*py)/(psf_sigma_x*psf_sigma_y); //if ((id == 0) && (blockIdx.x==120)){ // printf("psf_sum0, psf_height, psf_norm: %f %f %f\\n",psf_sum[0],psf_height, psf_norm); //} } // Construct the convolved PSF // PSF at location (Idx,Idy). PSF is centred at (7.5,7.5) // Analytic part p0 = integrated_profile(profile_type, threadIdx.x, threadIdx.y, xpos, ypos, psf_parameters, psf_0, psf_xd, psf_yd); __syncthreads(); // Spatially variable part // // + // psf_xd[ipsf+psf_size*jpsf]*(xpos-psf_xpos) + // psf_yd[ipsf+psf_size*jpsf]*(ypos-psf_ypos); // } // cpsf_pixel = 0.0; // Iterate over coefficients for (ic=0; ic<n_coeff; ic++) { // basis function position ki = ic < np ? 0 : (ic-np)/ns + 1; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; // Set the polynomial degree for the subvector and the // index within the subvector if (ki == 0) { d1 = dp; sidx = ic; } else { d1 = ds; sidx = ic - np - (ki-1)*ns; } // Compute the polynomial index (l,m) values corresponding // to the index within the subvector l1 = m1 = 0; if (d1 > 0) { i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == sidx) { l1 = l; m1 = m; } i++; } } } // Indices into the PSF if (ki > 0) { txa = threadIdx.x + a; tyb = threadIdx.y + b; p1 = integrated_profile(profile_type, txa, tyb, xpos, ypos, psf_parameters, psf_0, psf_xd, psf_yd); __syncthreads(); // // + // psf_xd[ipsf+psf_size*jpsf]*(xpos-psf_xpos) + // psf_yd[ipsf+psf_size*jpsf]*(ypos-psf_ypos); // } // // If we have an extended basis function, we need to // average the PSF over a 3x3 grid if (ext_basis[ki]) { p1 = 0.0; for (ig=-1; ig<2; ig++) { for (jg=-1; jg<2; jg++) { txag = txa + ig; tybg = tyb + jg; p1g = integrated_profile(profile_type, txag, tybg, xpos, ypos, psf_parameters, psf_0, psf_xd, psf_yd); __syncthreads(); // // + // psf_xd[ipsf+psf_size*jpsf]*(xpos-psf_xpos) + // psf_yd[ipsf+psf_size*jpsf]*(ypos-psf_ypos); // } // p1 += p1g; } } p1 /= 9.0; } cpsf_pixel += coeff[ic]*(p1-p0)*powf(x,l1)*powf(y,m1); } else { cpsf_pixel += coeff[ic]*p0*powf(x,l1)*powf(y,m1); } } } //end ic loop __syncthreads(); cpsf[id] = cpsf_pixel/psf_norm; __syncthreads(); /* Uncomment to print convolved PSF if ((id == 0) && (blockIdx.x==14)){ txa = 7; tyb = 7; ip = psf_size/2 + 2*txa - 15; jp = psf_size/2 + 2*tyb - 15; if (profile_type == 0) { printf("psf_test: %lf %lf %lf %lf\\n", 0.5*psf_height*pi*fwtosig*fwtosig* (erff((txa-7.5+0.5)/(1.41421356*psf_sigma_x)) - erff((txa-7.5-0.5)/(1.41421356*psf_sigma_x))) * (erff((tyb-7.5+0.5)/(1.41421356*psf_sigma_y)) - erff((tyb-7.5-0.5)/(1.41421356*psf_sigma_y))), psf_0[ip+psf_size*jp], psf_xd[ip+psf_size*jp]*(xpos-psf_xpos), psf_yd[ip+psf_size*jp]*(ypos-psf_ypos)); } dd = 0.0; printf("cpsf\\n"); for (j=15; j>=0; j--) { printf("%2d ",j); for (i=0; i<16; i++) { printf("%6.4f ",cpsf[i+j*16]); dd += cpsf[i+j*16]; } printf("\\n"); } printf("sum = %f\\n",dd); printf("psf lookup table fraction: %f\\n",psf_sum[0]/psf_norm); } */ __syncthreads(); // Map the convolved PSF to the subpixel star coordinates if (id == 0) { resolve_coeffs_2d(16,16,16,cpsf); } __syncthreads(); mpsf[id] = 0.0; subx = ceilf(xpos+0.5+0.00001) - (xpos+0.5); suby = ceilf(ypos+0.5+0.00001) - (ypos+0.5); if ((threadIdx.x > 1) && (threadIdx.x < 14) && (threadIdx.y > 1) && (threadIdx.y < 14)) { mpsf[id] = interpolate_2d(subx,suby,16,&cpsf[threadIdx.x-2+(threadIdx.y-2)*16]); } __syncthreads(); // force negative pixels to zero mpsf[id] = mpsf[id] > 0.0 ? mpsf[id] : 0.0; __syncthreads(); // // Normalise mapped PSF // (No - the convolved PSF contain the phot scale) /* cpsf[id] = mpsf[id]; __syncthreads(); i = 128; while (i != 0) { if (id < i) { cpsf[id] += cpsf[id + i]; } __syncthreads(); i /= 2; } mpsf[id] /= cpsf[0]; */ /* Uncomment to print mapped PSF */ if ((id == 0) && (blockIdx.x==14)){ printf("xpos, ypos: %f %f\\n",xpos,ypos); printf("subx, suby: %f %f\\n",subx,suby); printf("mpsf\\n"); dd = 0.0; for (j=15; j>=0; j--) { printf("%2d ",j); for (i=0; i<16; i++) { printf("%6.4f ",mpsf[i+j*16]); dd += mpsf[i+j*16]; } printf("\\n"); } printf("sum = %f\\n",dd); } __syncthreads(); // Fit the mapped PSF to the difference image to compute an // optimal flux estimate. // Assume the difference image is in tex(:,:,0) // and the inverse variance in tex(:,:,1). // We need to iterate to get the variance correct // fl = 0.0; for (j=0; j<3; j++) { fsum1[id] = 0.0; fsum2[id] = 0.0; fsum3[id] = 0.0; __syncthreads(); /* if ((id == 0) && (blockIdx.x==14)){ printf("photom, j=%d\\n",j); } */ if (powf(threadIdx.x-8.0,2)+powf(threadIdx.y-8.0,2) < psf_rad2) { ix = (int)floorf(xpos+0.5)+threadIdx.x-8.0; jx = (int)floorf(ypos+0.5)+threadIdx.y-8.0; inv_var = 1.0/(1.0/tex2DLayered(tex,ix,jx,1) + fl*mpsf[id]/gain); fsum1[id] = mpsf[id]*tex2DLayered(tex,ix,jx,0)*inv_var; fsum2[id] = mpsf[id]*mpsf[id]*inv_var; fsum3[id] = mpsf[id]; /* if ((blockIdx.x==14)){ printf("ix jx xpos ypos dr2 mpsf inv_var im: %03d %03d %8.3f %8.3f %6.3f %6.5f %g %g\\n",ix,jx,xpos,ypos,dr2, mpsf[id],inv_var, tex2DLayered(tex,ix,jx,0)); } */ } __syncthreads(); // Parallel sum i = 128; while (i != 0) { if (id < i) { fsum1[id] += fsum1[id + i]; fsum2[id] += fsum2[id + i]; fsum3[id] += fsum3[id + i]; } __syncthreads(); i /= 2; } fl = fsum1[0]/fsum2[0]; } if (id == 0) { flux[blockIdx.x] = fl; dflux[blockIdx.x] = sqrt(fsum3[0]*fsum3[0]/fsum2[0]); } /* Uncomment for debug info */ /* __syncthreads(); i = 128; while (i != 0) { if (id < i) { mpsf[id] += mpsf[id + i]; } __syncthreads(); i /= 2; } __syncthreads(); if (id == 0) { if (blockIdx.x == 120) { printf("result: %f %f %f %f %f %f %f %f %f %f %f %f\\n",fsum1[0],fsum2[0],fsum3[0],mpsf[0],psf_norm,psf_sum[0],bgnd,flux[blockIdx.x],flux[blockIdx.x]*fsum3[0],flux[blockIdx.x]*mpsf[0],fsum4[0],dflux[blockIdx.x]); } } */ __syncthreads(); return; } __global__ void cu_compute_model(int dp, int ds, int db, int *kxindex, int *kyindex, int* ext_basis, int nkernel, float *coefficient, float *M) { int np, ns, nb, hs, idx, ki, a, b, d1, sidx, l, m, l1, m1, i; float x, y, Bi; __shared__ float count[THREADS_PER_BLOCK]; // Calculate number of terms in subvectors np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; nb = (db+1)*(db+2)/2; hs = (nkernel-1)*ns+np+nb; x = (blockIdx.x - 0.5*(gridDim.x-1))/(gridDim.x-1); y = (blockIdx.y - 0.5*(gridDim.y-1))/(gridDim.y-1); count[threadIdx.x] = 0.0; for (idx = threadIdx.x; idx < hs; idx += blockDim.x) { // This is the index of the subvector and its kernel offsets ki = idx < np ? 0 : (idx-np)/ns + 1; a = b = 0; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; } // Set the polynomial degree for the subvector and the // index within the subvector if (ki == 0) { d1 = dp; sidx = idx; } else if (ki < nkernel) { d1 = ds; sidx = idx - np - (ki-1)*ns; } else { d1 = db; sidx = idx - np - (ki-1)*ns; } // Compute the (l,m) values corresponding to the index within // the subvector l1 = m1 = 0; if (d1 > 0) { i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == sidx) { l1 = l; m1 = m; } i++; } } } if (ki == 0) { Bi = tex2DLayered(tex,blockIdx.x,blockIdx.y,0); } else if (ki < nkernel) { if (ext_basis[ki]) { Bi = tex2DLayered(tex,blockIdx.x+a,blockIdx.y+b,1)- tex2DLayered(tex,blockIdx.x,blockIdx.y,0); } else { Bi = tex2DLayered(tex,blockIdx.x+a,blockIdx.y+b,0)- tex2DLayered(tex,blockIdx.x,blockIdx.y,0); } } else { Bi = 1.0; } count[threadIdx.x] += coefficient[idx]*powf(x,l1)*powf(y,m1)*Bi; } __syncthreads(); // Then parallel-sum the results i = blockDim.x/2; while (i != 0) { if (threadIdx.x < i) { count[threadIdx.x] += count[threadIdx.x + i]; } __syncthreads(); i /= 2; } if (threadIdx.x == 0) { M[blockIdx.x+gridDim.x*blockIdx.y] = count[0]; } } __global__ void cu_compute_vector(int dp, int ds, int db, int nx, int ny, int *kxindex, int *kyindex, int *ext_basis, int nkernel, int kernelRadius,float *V) { int idx; int np, ns, ki, a, b, d1, i, j; int l, m, l1, m1; float py, x, y, Bi; float temp; __shared__ float count[THREADS_PER_BLOCK]; // Calculate number of terms in subvectors np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; // This is the index of the subvector and its kernel offsets ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1; a = b = 0; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; } // Set the polynomial degrees for the submatrix and the // indices within the submatrix if (ki == 0) { d1 = dp; idx = blockIdx.x; } else if (ki < nkernel) { d1 = ds; idx = blockIdx.x - np - (ki-1)*ns; } else { d1 = db; idx = blockIdx.x - np - (ki-1)*ns; } // Compute the (l,m) values corresponding to the index within // the subvector i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == idx) { l1 = l; m1 = m; } i++; } } // Compute the contribution to V from each image location. // Use individual threads to sum over columns. // tex[:,:,0] is the reference image, // tex[:,:,1] is the blurred reference image, // tex[:,:,2] is the target image, // tex[:,:,3] is the inverse variance, // tex[:,:,4] is the mask. // Bi is the basis image value. temp = 0.0; Bi = 1.0; __syncthreads(); for (j=kernelRadius; j<ny-kernelRadius; j++) { y = (j - 0.5*(ny-1))/(ny-1); py = powf(y,m1); for (i=threadIdx.x+kernelRadius; i<nx-kernelRadius; i+=blockDim.x) { x = (i - 0.5*(nx-1))/(nx-1); if (ki == 0) { Bi = tex2DLayered(tex,i,j,0); } else if (ki < nkernel) { if (ext_basis[ki]) { Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0); } else { Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0); } } else { Bi = 1.0; } temp += powf(x,l1)*py*Bi*tex2DLayered(tex,i,j,2)*tex2DLayered(tex,i,j,3)* tex2DLayered(tex,i,j,4); } } count[threadIdx.x] = temp; __syncthreads(); // Then parallel-sum the rows i = blockDim.x/2; while (i != 0) { if (threadIdx.x < i) { count[threadIdx.x] += count[threadIdx.x + i]; } __syncthreads(); i /= 2; } if (threadIdx.x == 0) { V[blockIdx.x] = count[0]; } } __global__ void cu_compute_vector_stamps(int dp, int ds, int db, int nx, int ny, int nstamps, int stamp_half_width, float *stamp_xpos, float* stamp_ypos, int *kxindex, int *kyindex, int *ext_basis, int nkernel, int kernelRadius,float *V) { int idx; int np, ns, ki, a, b, d1, i, j, i1, i2, j1, j2; int l, m, l1, m1; float py, x, y, Bi; float temp; __shared__ float count[THREADS_PER_BLOCK]; // Calculate number of terms in subvectors np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; // This is the index of the subvector and its kernel offsets ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1; a = b = 0; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; } // Set the polynomial degrees for the submatrix and the // indices within the submatrix if (ki == 0) { d1 = dp; idx = blockIdx.x; } else if (ki < nkernel) { d1 = ds; idx = blockIdx.x - np - (ki-1)*ns; } else { d1 = db; idx = blockIdx.x - np - (ki-1)*ns; } // Compute the (l,m) values corresponding to the index within // the subvector i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == idx) { l1 = l; m1 = m; } i++; } } // Compute the contribution to V from each image location. // Use individual threads to sum over columns. // tex[:,:,0] is the reference image, // tex[:,:,1] is the blurred reference image, // tex[:,:,2] is the target image, // tex[:,:,3] is the inverse variance, // tex[:,:,4] is the mask. // Bi is the basis image value. temp = 0.0; Bi = 1.0; __syncthreads(); for (idx = threadIdx.x; idx<nstamps; idx += blockDim.x) { j1 = max(0,(int)stamp_ypos[idx]-stamp_half_width); j2 = min(ny,(int)stamp_ypos[idx]+stamp_half_width); for (j=j1; j<j2; j++) { y = (j - 0.5*(ny-1))/(ny-1); py = powf(y,m1); i1 = max(0,(int)stamp_xpos[idx]-stamp_half_width); i2 = min(nx,(int)stamp_xpos[idx]+stamp_half_width); for (i=i1; i<i2; i++) { x = (i - 0.5*(nx-1))/(nx-1); if (ki == 0) { Bi = tex2DLayered(tex,i,j,0); } else if (ki < nkernel) { if (ext_basis[ki]) { Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0); } else { Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0); } } else { Bi = 1.0; } temp += powf(x,l1)*py*Bi*tex2DLayered(tex,i,j,2)*tex2DLayered(tex,i,j,3)* tex2DLayered(tex,i,j,4); } } } count[threadIdx.x] = temp; __syncthreads(); // Then parallel-sum the rows i = blockDim.x/2; while (i != 0) { if (threadIdx.x < i) { count[threadIdx.x] += count[threadIdx.x + i]; } __syncthreads(); i /= 2; } if (threadIdx.x == 0) { V[blockIdx.x] = count[0]; } } __global__ void cu_compute_matrix(int dp, int ds, int db, int nx, int ny, int *kxindex, int *kyindex, int *ext_basis, int nkernel, int kernelRadius,float *H) { int idx, idy, idx0, idy0, idx1, idy1; int np, ns, ki, kj, a, b, c, d, d1, d2, i, j; int l, m, l1, m1, l2, m2; float py, x, y, Bi, Bj; float temp; __shared__ float count[THREADS_PER_BLOCK]; // Terminate if we are not in the lower triangle if (blockIdx.x > blockIdx.y) { return; } // Calculate number of terms in submatrices np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; // These are indices of the submatrix and their kernel offsets ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1; kj = blockIdx.y < np ? 0 : (blockIdx.y-np)/ns + 1; a = b = 0; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; } if (kj<nkernel) { c = kxindex[kj]; d = kyindex[kj]; } // Set the polynomial degrees for the submatrix and the // indices within the submatrix if (ki == 0) { d1 = dp; idx = blockIdx.x; } else if (ki < nkernel) { d1 = ds; idx = blockIdx.x - np - (ki-1)*ns; } else { d1 = db; idx = blockIdx.x - np - (ki-1)*ns; } if (kj == 0) { d2 = dp; idy = blockIdx.y; } else if (kj < nkernel) { d2 = ds; idy = blockIdx.y - np - (kj-1)*ns; } else { d2 = db; idy = blockIdx.y - np - (kj-1)*ns; } if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel) && (idx > idy)) { return; } idx0 = idx; idy0 = idy; // Compute the (l,m) values corresponding to the indices within // the submatrix i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == idx) { l1 = l; m1 = m; } i++; } } i = 0; for (l=0; l<=d2; l++) { for (m=0; m<=d2-l; m++) { if (i == idy) { l2 = l; m2 = m; } i++; } } // Compute the contribution to H from each image location. // Use individual threads to sum over columns. // tex[:,:,0] is the reference image, // tex[:,:,1] is the blurred reference image, // tex[:,:,2] is the target image, // tex[:,:,3] is the inverse variance, // tex[:,:,4] is the mask. // Bi and Bj are the basis image values. temp = 0.0; Bi = Bj = 1.0; __syncthreads(); for (j=kernelRadius; j<ny-kernelRadius; j++) { y = (j - 0.5*(ny-1))/(ny-1); py = powf(y,m1+m2); for (i=threadIdx.x+kernelRadius; i<nx-kernelRadius; i+=blockDim.x) { x = (i - 0.5*(nx-1))/(nx-1); if (ki == 0) { Bi = tex2DLayered(tex,i,j,0); } else if (ki < nkernel) { if (ext_basis[ki]) { Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0); } else { Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0); } } else { Bi = 1.0; } if (kj == 0) { Bj = tex2DLayered(tex,i,j,0); } else if (kj < nkernel) { if (ext_basis[kj]) { Bj = tex2DLayered(tex,i+c,j+d,1)-tex2DLayered(tex,i,j,0); } else { Bj = tex2DLayered(tex,i+c,j+d,0)-tex2DLayered(tex,i,j,0); } } else { Bj = 1.0; } temp += powf(x,l1+l2)*py*Bi*Bj*tex2DLayered(tex,i,j,3)*tex2DLayered(tex,i,j,4); } } count[threadIdx.x] = temp; __syncthreads(); // Then parallel-sum the rows i = blockDim.x/2; while (i != 0) { if (threadIdx.x < i) { count[threadIdx.x] += count[threadIdx.x + i]; } __syncthreads(); i /= 2; } if (threadIdx.x == 0) { H[blockIdx.x+gridDim.x*blockIdx.y] = count[0]; H[blockIdx.y+gridDim.x*blockIdx.x] = count[0]; if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel)) { idx1 = np + (ki-1)*ns; idy1 = np + (kj-1)*ns; H[(idx1+idy0)+gridDim.x*(idy1+idx0)] = count[0]; H[(idy1+idx0)+gridDim.x*(idx1+idy0)] = count[0]; } } } __global__ void cu_compute_matrix_stamps(int dp, int ds, int db, int nx, int ny, int nstamps, int stamp_half_width, float *stamp_xpos, float* stamp_ypos, int *kxindex, int *kyindex, int *ext_basis, int nkernel, int kernelRadius,float *H) { int idx, idy, idx0, idy0, idx1, idy1; int np, ns, ki, kj, a, b, c, d, d1, d2, i, j, i1, i2, j1, j2; int l, m, l1, m1, l2, m2; float px, py, x, y, Bi, Bj; float temp; __shared__ float count[THREADS_PER_BLOCK]; // Terminate if we are not in the lower triangle if (blockIdx.x > blockIdx.y) { return; } // Calculate number of terms in submatrices np = (dp+1)*(dp+2)/2; ns = (ds+1)*(ds+2)/2; // These are indices of the submatrix and their kernel offsets ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1; kj = blockIdx.y < np ? 0 : (blockIdx.y-np)/ns + 1; a = b = 0; if (ki<nkernel) { a = kxindex[ki]; b = kyindex[ki]; } if (kj<nkernel) { c = kxindex[kj]; d = kyindex[kj]; } // Set the polynomial degrees for the submatrix and the // indices within the submatrix if (ki == 0) { d1 = dp; idx = blockIdx.x; } else if (ki < nkernel) { d1 = ds; idx = blockIdx.x - np - (ki-1)*ns; } else { d1 = db; idx = blockIdx.x - np - (ki-1)*ns; } if (kj == 0) { d2 = dp; idy = blockIdx.y; } else if (kj < nkernel) { d2 = ds; idy = blockIdx.y - np - (kj-1)*ns; } else { d2 = db; idy = blockIdx.y - np - (kj-1)*ns; } if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel) && (idx > idy)) { return; } idx0 = idx; idy0 = idy; // Compute the (l,m) values corresponding to the indices within // the submatrix i = 0; for (l=0; l<=d1; l++) { for (m=0; m<=d1-l; m++) { if (i == idx) { l1 = l; m1 = m; } i++; } } i = 0; for (l=0; l<=d2; l++) { for (m=0; m<=d2-l; m++) { if (i == idy) { l2 = l; m2 = m; } i++; } } // Compute the contribution to H from each image location. // Use individual threads to sum over stamps. // tex[:,:,0] is the reference image, // tex[:,:,1] is the blurred reference image, // tex[:,:,2] is the target image, // tex[:,:,3] is the inverse variance, // tex[:,:,4] is the mask. // Bi and Bj are the basis image values. temp = 0.0; Bi = Bj = 1.0; __syncthreads(); for (idx = threadIdx.x; idx<nstamps; idx += blockDim.x) { i1 = max(0,(int)stamp_xpos[idx]-stamp_half_width); i2 = min(nx,(int)stamp_xpos[idx]+stamp_half_width); for (i=i1; i<i2; i++) { x = (i - 0.5*(nx-1))/(nx-1); px = powf(x,l1+l2); j1 = max(0,(int)stamp_ypos[idx]-stamp_half_width); j2 = min(ny,(int)stamp_ypos[idx]+stamp_half_width); for (j=j1; j<j2; j++) { y = (j - 0.5*(ny-1))/(ny-1); py = powf(y,m1+m2); if (ki == 0) { Bi = tex2DLayered(tex,i,j,0); } else if (ki < nkernel) { if (ext_basis[ki]) { Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0); } else { Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0); } } else { Bi = 1.0; } if (kj == 0) { Bj = tex2DLayered(tex,i,j,0); } else if (kj < nkernel) { if (ext_basis[kj]) { Bj = tex2DLayered(tex,i+c,j+d,1)-tex2DLayered(tex,i,j,0); } else { Bj = tex2DLayered(tex,i+c,j+d,0)-tex2DLayered(tex,i,j,0); } } else { Bj = 1.0; } temp += px*py*Bi*Bj*tex2DLayered(tex,i,j,3)*tex2DLayered(tex,i,j,4); } } } count[threadIdx.x] = temp; __syncthreads(); // Then parallel-sum the rows i = blockDim.x/2; while (i != 0) { if (threadIdx.x < i) { count[threadIdx.x] += count[threadIdx.x + i]; } __syncthreads(); i /= 2; } if (threadIdx.x == 0) { H[blockIdx.x+gridDim.x*blockIdx.y] = count[0]; H[blockIdx.y+gridDim.x*blockIdx.x] = count[0]; if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel)) { idx1 = np + (ki-1)*ns; idy1 = np + (kj-1)*ns; H[(idx1+idy0)+gridDim.x*(idy1+idx0)] = count[0]; H[(idy1+idx0)+gridDim.x*(idx1+idy0)] = count[0]; } } } """)
def gpuFunc(iterator): iterator = iter(iterator) cpu_data = numpy.asarray(list(iterator), dtype=numpy.float32) datasize = numpy.asarray(len(cpu_data)) gridNum = ( datasize * 3) / 256 + 1 # * 3 for data dimensions. /256 for block size. # +1 for overprovisioning in case there is dangling threads centroids = numpy.empty((datasize), gpuarray.vec.float4) cuda.init() dev = cuda.Device(0) contx = dev.make_context() # The GPU kernel below takes centroids IDs and 3-D data points in form of float4 (x,y,z,w). # X is for the centroid ID whereas (y,z,w) are the actual point coordinate. mod = SourceModule(""" __global__ void assignToCentroid(float *data, int* datasize, int *k, float *kPoints, float4 * clusters){ //1d grid index unsigned idxInLeftBlocks = blockIdx.x * (blockDim.x * blockDim.y); unsigned idxInCurrBlock = threadIdx.y * blockDim.x + threadIdx.x; unsigned idx = idxInLeftBlocks + idxInCurrBlock; int size = datasize[0]; int K = k[0]; if (idx < size*3){//consider xyz for each data point if (idx % 3 == 0){ unsigned dataIdx = idx/3; clusters[dataIdx].x = 999999; clusters[dataIdx].y = data[idx]; clusters[dataIdx].z = data[idx+1]; clusters[dataIdx].w = data[idx+2]; } float inf = 99999.9999; float closest = inf; float tempDist=0, tempDistX=0, tempDistY=0, tempDistZ=0; //updateing tempDist for (int i = 0; i < K; i++){ if( idx % 3 == 0 /*0,3,6*/) tempDistX = data[idx] - kPoints[i*3]; else if(idx % 3 == 1 /*1,4,7*/) tempDistY = data[idx] - kPoints[i*3 + 1]; else {/*2,5,8*/ tempDistZ = data[idx] - kPoints[i*3 + 2]; tempDist = pow(tempDistX,2) + pow(tempDistY,2) + pow(tempDistZ,2); } if(tempDist < closest){ closest = tempDist; int dataIdx = idx/3; clusters[dataIdx].x = i;//coordinate belongs to cluster i } } } } """) func = mod.get_function("assignToCentroid") func(cuda.In(cpu_data), cuda.In(datasize), cuda.In(k), cuda.In(kPoints), cuda.Out(centroids), block=(16, 16, 1), grid=(gridNum, 1), shared=0) closest = [(x[0], (numpy.asarray([x[1], x[2], x[3]]), 1)) for x in centroids] contx.pop() del cpu_data del datasize del centroids del contx return iter(closest)
from pycuda.compiler import SourceModule TPB = 16 N = 1024 N2 = N**2 mod = SourceModule(""" #include <stdio.h> #define N 1024 #define N2 1048576 __global__ void prime(int *array, int *prime_mask) { int i = threadIdx.x + blockIdx.x * blockDim.x; if(array[i] > 1){ if(prime_mask[i] == 0){ int inc = array[i]; for(int j = i+inc ; j < N2 ; j += inc ){ prime_mask[j] = 1; } } } } """) def p_prime(array, prime_mask): for i in range(1, N2): if prime_mask[i] == 0: print("El numero {} es primo".format(array[i]))
ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device(set_c(f, 'yz')) cey_gpu = cuda.to_device(set_c(f, 'zx')) cez_gpu = cuda.to_device(set_c(f, 'xy')) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('TPB', str(tpb)).replace('nxy', str(nx * ny)).replace( 'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz))) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") update_h.prepare("PPPPPP", block=(tpb, 1, 1)) update_e.prepare("PPPPPPPPP", block=(16, 16, 1)) update_src.prepare("fP", block=(nx, 1, 1)) # prepare for plot from matplotlib.pyplot import * ion() imsh = imshow(np.ones((ny, nz), 'f').T, cmap=cm.hot, origin='lower',
ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device( set_c(f,(None,-1,-1)) ) cey_gpu = cuda.to_device( set_c(f,(-1,None,-1)) ) cez_gpu = cuda.to_device( set_c(f,(-1,-1,None)) ) chx_gpu = cuda.to_device( set_c(f,(None,0,0)) ) chy_gpu = cuda.to_device( set_c(f,(0,None,0)) ) chz_gpu = cuda.to_device( set_c(f,(0,0,None)) ) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule(kernels) update_e = mod.get_function("update_e") update_h = mod.get_function("update_h") update_src = mod.get_function("update_src") tex = mod.get_texref("tex") tey = mod.get_texref("tey") tez = mod.get_texref("tez") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tchx = mod.get_texref("tchx") tchy = mod.get_texref("tchy") tchz = mod.get_texref("tchz")
block_size = 64 block = (block_size,1,1) grid = (A_cols/block_size+1,1) concatenate_cols_kernel(A,A_conc, np.intc(A_rows), np.intc(A_cols), block= block, grid=grid) def sign(x): return np.float32(2.*np.greater_equal(x,0)-1.) if __name__ == "__main__": context = pycuda.autoinit.context print "Building the kernels..." mod = SourceModule(open("binary_kernels.cu").read()) gemm_kernel = mod.get_function("gemm") concatenate_rows_kernel = mod.get_function("concatenate_rows_kernel") concatenate_cols_kernel = mod.get_function("concatenate_cols_kernel") xnor_gemm_kernel = mod.get_function("xnor_gemm") print "Loading matrices to device..." # Matrices dimensions N = 8192 A_rows = N A_cols = N B_cols = N # A_rows = 784 # A_cols = 1024 # B_cols = 4096
mod = SourceModule(""" __device__ __constant__ float t_c[9] ; __device__ __constant__ int c_c[18] ; __global__ void funRT(float* fin_g0,float* fin_g1,float* fin_g2,float* fin_g3,float* fin_g4,float* fin_g5,float* fin_g6,float* fin_g7,float* fin_g8, float* feq_g0,float* feq_g1,float* feq_g2,float* feq_g3,float* feq_g4,float* feq_g5,float* feq_g6,float* feq_g7,float* feq_g8, float* rho_g, float* u_g, float* taus_g){ int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int xsize = blockDim.x * gridDim.x; int ysize = blockDim.y * gridDim.y; int d = xsize * ysize; int i = x + y * xsize; //int k = 0; //int BC = 0; // defining whether the cell is a boundary condition or not float omega = %s; //1.940391; float cu[9]; float rho_l= 0.0; //rho_g[i]; float fin_l[9]; float u_l[2] ; // local fin to avoid multiple global memory access if ( (x>=0) and (x<xsize) and (y>=0) and (y<ysize) ){ fin_l[0] = fin_g0[i]; } else { fin_l[0] = fin_g0[i]; } if ( (x-1>=0) and (x-1<xsize) and (y>=0) and (y<ysize) ){ fin_l[1] = fin_g1[i-1]; } else { fin_l[1] = fin_g1[i]; } if ( (x>=0) and (x<xsize) and (y+1>=0) and (y+1<ysize) ){ fin_l[2] = fin_g2[i+xsize]; } else { fin_l[2] = fin_g2[i]; } if ( (x+1>=0) and (x+1<xsize) and (y>=0) and (y<ysize) ){ fin_l[3] = fin_g3[i+1]; } else { fin_l[3] = fin_g3[i]; } if ( (x>=0) and (x<xsize) and (y-1>=0) and (y-1<ysize) ){ fin_l[4] = fin_g4[i-xsize]; } else { fin_l[4] = fin_g4[i]; } if ( (x-1>=0) and (x-1<xsize) and (y+1>=0) and (y+1<ysize) ){ fin_l[5] = fin_g5[i-1+xsize]; } else { fin_l[5] = fin_g5[i]; } if ( (x+1>=0) and (x+1<xsize) and (y+1>=0) and (y+1<ysize) ){ fin_l[6] = fin_g6[i+1+xsize]; } else { fin_l[6] = fin_g6[i]; } if ( (x+1>=0) and (x+1<xsize) and (y-1>=0) and (y-1<ysize) ){ fin_l[7] = fin_g7[i+1-xsize]; } else { fin_l[7] = fin_g7[i]; } if ( (x-1>=0) and (x-1<xsize) and (y-1>=0) and (y-1<ysize) ){ fin_l[8] = fin_g8[i-1-xsize]; } else { fin_l[8] = fin_g8[i]; } // Applying boundary conditions //__syncthreads(); if (x ==0 ){ fin_l[1] = feq_g1[i] - feq_g3[i] + fin_l[3]; fin_l[5] = feq_g5[i] - feq_g7[i] + fin_l[7]; fin_l[8] = feq_g8[i] - feq_g6[i] + fin_l[6]; }else if(x==xsize-1){ fin_l[3] = -feq_g1[i] + feq_g3[i] + fin_l[1]; fin_l[6] = -feq_g8[i] + feq_g6[i] + fin_l[8]; fin_l[7] = -feq_g5[i] + feq_g7[i] + fin_l[5]; } if (y==ysize-1){ fin_l[2] = -feq_g4[i] + feq_g2[i] + fin_l[4]; fin_l[5] = -feq_g7[i] + feq_g5[i] + fin_l[7]; fin_l[6] = -feq_g8[i] + feq_g6[i] + fin_l[8]; }else if (y==0){ fin_l[4] = -feq_g2[i] + feq_g4[i] + fin_l[2]; fin_l[7] = -feq_g5[i] + feq_g7[i] + fin_l[5]; fin_l[8] = -feq_g6[i] + feq_g8[i] + fin_l[6]; } rho_l = fin_l[0]+fin_l[1]+fin_l[2]+fin_l[3]+fin_l[4]+fin_l[5]+fin_l[6]+fin_l[7]+fin_l[8]; u_l[0] = (c_c[0]*fin_l[0]+c_c[2]*fin_l[1]+c_c[4]*fin_l[2]+c_c[6]*fin_l[3]+c_c[8]*fin_l[4]+c_c[10]*fin_l[5]+c_c[12]*fin_l[6]+c_c[14]*fin_l[7]+c_c[16]*fin_l[8])/rho_l; u_l[1] = (c_c[1]*fin_l[0]+c_c[3]*fin_l[1]+c_c[5]*fin_l[2]+c_c[7]*fin_l[3]+c_c[9]*fin_l[4]+c_c[11]*fin_l[5]+c_c[13]*fin_l[6]+c_c[15]*fin_l[7]+c_c[17]*fin_l[8])/rho_l; // BCs left wall, right wall, bottom wall and top wall if ( x == 0 or x == xsize-1 or y == ysize-1){ u_l[0] =0; u_l[1] =0; } else if (y==0){ rho_l = fin_l[0]+fin_l[1]+fin_l[3]+2*(fin_l[2]+fin_l[5]+fin_l[6]); u_l[0]= 0.08 ; // uLB; u_l[1] =0; } u_g[0*d+i] = u_l[0]; u_g[1*d+i] = u_l[1]; rho_g[i] = rho_l; cu[0] = 0 ; feq_g0[i] = rho_l*t_c[0]*(1. + 3.0*cu[0] + 9*0.5*cu[0]*cu[0] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g0[i] = fin_l[0] - omega*(fin_l[0]-feq_g0[i]) ; cu[1] = (u_l[0]) ; feq_g1[i] = rho_l*t_c[1]*(1. + 3.0*cu[1] + 9*0.5*cu[1]*cu[1] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g1[i] = fin_l[1] - omega*(fin_l[1]-feq_g1[i]) ; cu[2] = (u_l[1]) ; feq_g2[i] = rho_l*t_c[2]*(1. + 3.0*cu[2] + 9*0.5*cu[2]*cu[2] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g2[i] = fin_l[2] - omega*(fin_l[2]-feq_g2[i]) ; cu[3] = (-u_l[0]) ; feq_g3[i] = rho_l*t_c[3]*(1. + 3.0*cu[3] + 9*0.5*cu[3]*cu[3] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g3[i] = fin_l[3] - omega*(fin_l[3]-feq_g3[i]) ; cu[4] = (-u_l[1]) ; feq_g4[i] = rho_l*t_c[4]*(1. + 3.0*cu[4] + 9*0.5*cu[4]*cu[4] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g4[i] = fin_l[4] - omega*(fin_l[4]-feq_g4[i]) ; cu[5] = (u_l[0] + u_l[1]) ; feq_g5[i] = rho_l*t_c[5]*(1. + 3.0*cu[5] + 9*0.5*cu[5]*cu[5] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g5[i] = fin_l[5] - omega*(fin_l[5]-feq_g5[i]) ; cu[6] = (-u_l[0] + u_l[1]) ; feq_g6[i] = rho_l*t_c[6]*(1. + 3.0*cu[6] + 9*0.5*cu[6]*cu[6] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g6[i] = fin_l[6] - omega*(fin_l[6]-feq_g6[i]) ; cu[7] = (-u_l[0] -u_l[1]) ; feq_g7[i] = rho_l*t_c[7]*(1. + 3.0*cu[7] + 9*0.5*cu[7]*cu[7] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g7[i] = fin_l[7] - omega*(fin_l[7]-feq_g7[i]) ; cu[8] = (u_l[0] -u_l[1]) ; feq_g8[i] = rho_l*t_c[8]*(1. + 3.0*cu[8] + 9*0.5*cu[8]*cu[8] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1])); fin_g8[i] = fin_l[8] - omega*(fin_l[8]-feq_g8[i]) ; } """ % (omega))
import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy a = numpy.random.randn(5, 5) a = a.astype(numpy.float32) a_gpu = cuda.mem_alloc(a.nbytes) cuda.memcpy_htod(a_gpu, a) mod = SourceModule(""" __global__ void doubles_matrix(float *a) { int idx = threadIdx.x + threadIdx.y*4; a[idx] *= 2; } """) func = mod.get_function("doubles_matrix") func(a_gpu, block=(5, 5, 1)) a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) print("ORIGINAL MATRIX") print a print("DOUBLED MATRIX AFTER PyCUDA EXECUTION") print a_doubled
mod = SourceModule(""" #include <math.h> __device__ int phase(float x0, float y0, float x1, float y1) { bool realGreater = (x0 >= x1); bool imagGreater = (y0 >= y1); if (realGreater && imagGreater) { return 1; } else if (!realGreater && imagGreater) { return 2; } else if (!realGreater && !imagGreater) { return 3; }; return 4; } __device__ float calculateNewCentreX(float x1, int previousResult, int n) { if (previousResult == 1 || previousResult == 4) { return x1 + powf(2, n - 2); }; return x1 - powf(2, n - 2); } __device__ float calculateNewCentreY(float y1, int previousResult, int n) { if (previousResult == 1 || previousResult == 2) { return y1 + powf(2, n - 2); }; return y1 - powf(2, n - 2); } __global__ void getGridValue(int *regexStringGrid, int n) { int real = blockDim.x * blockIdx.x + threadIdx.x; int imag = blockDim.y * blockIdx.y + threadIdx.y; int dim = gridDim.x * (blockIdx.y * blockDim.y + threadIdx.y) + blockDim.x * blockIdx.x + threadIdx.x; float x1 = powf(2, n - 1); float y1 = powf(2, n - 1); for (int i = 0; i < n; ++i) { int value = phase(real, imag, x1, y1); x1 = calculateNewCentreX(x1, value, n - i); y1 = calculateNewCentreY(y1, value, n - i); regexStringGrid[dim] += value * powf(10, i); } }""")
descr.flags = 0 tcex_gpu = cuda.Array(descr) tcey_gpu = cuda.Array(descr) tcez_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nx arrcopy(mcopy, cex, tcex_gpu) arrcopy(mcopy, cey, tcey_gpu) arrcopy(mcopy, cez, tcez_gpu) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('Dx',str(Dx)).replace('Dy',str(Dy)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) ) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tcex.set_array(tcex_gpu) tcey.set_array(tcey_gpu) tcez.set_array(tcez_gpu) update_h.prepare("iPPPPPP", block=(Dx,Dy,1)) update_e.prepare("iPPPPPP", block=(Dx,Dy,1), texrefs=[tcex,tcey,tcez]) update_src.prepare("fP", block=(nz,1,1))
mod = SourceModule(""" __global__ void matmul2(int n, float *a, float *b, float *c) { int row_num_a,col_num_a,row_num_b,col_num_b,row_num_c,col_num_c; row_num_a=col_num_a=row_num_b=col_num_b=row_num_c=col_num_c = n; __shared__ float A[32][32];//2D array for storing shared matrix values of A & B __shared__ float B[32][32];//Process subMatrix in block int Col=blockIdx.x*blockDim.x+threadIdx.x;//Col and Row Ids of threads int Row=blockIdx.y*blockDim.y+threadIdx.y; double temp = 0; for (int i = 0; i < (col_num_a-1)/blockDim.x+1; ++i) { if (Row < row_num_a && i*blockDim.x+threadIdx.x < col_num_a) A[threadIdx.y][threadIdx.x] = a[Row*col_num_a + i*blockDim.x+threadIdx.x];//Memory Fetch from a else A[threadIdx.y][threadIdx.x] = 0;//In case the block dim is not a multiple of matrix if (Col < col_num_b && i*blockDim.x+threadIdx.y < row_num_b) B[threadIdx.y][threadIdx.x] = b[(i*blockDim.x+threadIdx.y)*col_num_b+Col];//Memory Fetch from b else B[threadIdx.y][threadIdx.x] = 0; __syncthreads();//Wait for all matrix loads to shared memory - then proceed with for loop if (Row < row_num_c && Col < col_num_c) for (int j = 0; j < blockDim.x; ++j)//Matrix multiplication temp += A[threadIdx.y][j] * B[j][threadIdx.x]; __syncthreads(); } if(Row<row_num_c && Col<col_num_c)//If the matrix is needed, then do this c[Row*col_num_c+Col] = (float)temp;//Save to c }""")
from shared_gpu_kernels import gen_kernel, transform_to_cuda from config import SIGNIFICANT_LENGTH, SIZE, MT_N, M, STATE_SIZE, TEST_ITERATIONS MT_state_result = np.zeros((SIGNIFICANT_LENGTH, SIZE)).astype(np.uint32) Stream = drv.Stream() Stream2 = drv.Stream() MT_state_buf = drv.mem_alloc(SIZE * MT_N * 4) MT_state_res_buf = drv.mem_alloc(MT_state_result.nbytes) prg = SourceModule( transform_to_cuda( gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH) ) ) prog = prg.get_function('mt_brute') zzz = time.time() ev = prog(np.uint32(0), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream) drv.memcpy_dtoh_async(MT_state_result, MT_state_res_buf, stream=Stream2) for i in xrange(TEST_ITERATIONS): prog(np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE/STATE_SIZE, 1), stream=Stream) drv.memcpy_dtoh(MT_state_result, MT_state_res_buf)#, stream=Stream2) zzz = time.time() - zzz print '>>>', zzz
mod = SourceModule(""" #include <math.h> __global__ void fitness(double *x, double *y, double eps, double a, double b, double c, double d) { const int i = threadIdx.x + blockDim.x * blockIdx.x; double tempx = x[i]; double tempy = y[i]; //double avgfit = (x[i]*(x[i]*a + y[i]*c) + y[i]*(x[i]*b+y[i]*d))/pow(x[i]+y[i],2); double pop = tempx + tempy; double lambda1 = 2; double lambda2 = .02; double avgfit = lambda1 - lambda2*pop; y[i] = tempy + tempy*eps*( (tempx*c + tempy*d)/(tempx+tempy) + avgfit ); x[i] = tempx + tempx*eps*( (tempx*a + tempy*b)/(tempx+tempy) + avgfit ); } __global__ void diffuse_display(double *picture, double *destx, double *desty, double *x, double *y, double mu1, double mu2, double eps, int side) { const int i = threadIdx.x + blockDim.x * blockIdx.x; const int ROW = i/side; const int COLUMN = i%side; const int N = ROW*side + (COLUMN+side-1)%side; const int S = ROW*side + (COLUMN +1)%side; const int E = ((ROW+1)%side)*side + COLUMN; const int W = ((ROW+side-1)%side)*side + COLUMN; const int NE = ((ROW+1)%side)*side + (COLUMN+side-1)%side; const int NW = ((ROW-1+side)%side)*side + (COLUMN+side-1)%side; const int SE = ((ROW+1)%side)*side + (COLUMN+1)%side; const int SW = ((ROW+side-1)%side)*side + (COLUMN+1)%side; double mu1eps = mu1*eps; double mu2eps = mu2*eps; double d = 4+4/sqrt(2.0); destx[i] = (1-mu1eps)*x[i] + (( mu1eps ) / d ) * (x[N]+x[S]+x[W]+x[E]) + ( (1/sqrt(2.0) ) * mu1eps / d )*(x[NE]+x[NW]+x[SE]+x[SW]); desty[i] = (1-mu2eps)*y[i] + (( mu2eps ) / d ) * (y[N]+y[S]+y[W]+y[E]) + ( (1/sqrt(2.0) ) * mu2eps / d )*(y[NE]+y[NW]+y[SE]+y[SW]); picture[i] = destx[i]/(destx[i]+desty[i]); } """)
import pycuda.autoinit from pycuda.compiler import SourceModule import numpy as np from cutools import mt_rand mod = SourceModule(""" #include "mt_rand.cu.h" __global__ void cu_rand_test(float *x, int N) { unsigned int idx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; if (idx < N) { // initialize the MT MersenneTwisterState mtState; MersenneTwisterInitialise(mtState, idx); // x[idx] = mt_rand(mtState, idx); } } """, include_dirs=[mt_rand.get_include_dir()]) # seed the random number generator mt_rand.seed(cuda, mod) cu_rand = mod.get_function("cu_rand_test") asize = 10000
#!python import pycuda.gpuarray as gpuarray import pycuda.driver as drv import pycuda.autoinit import numpy as np from pycuda.compiler import SourceModule func_mod = SourceModule(""" template <class T> __device__ T incr(T x) { return (x + 1.0); } // Needed to avoid name mangling so that PyCUDA can // find the kernel function: extern "C" { __global__ void func(float *a, int N) { int idx = threadIdx.x; if (idx < N) a[idx] = incr(a[idx]); } } """, no_extern_c=1) func = func_mod.get_function('func') N = 5 x = np.asarray(np.random.rand(N), np.float32) x_orig = x.copy() x_gpu = gpuarray.to_gpu(x)
import pycuda.autoinit from pycuda.compiler import SourceModule import pycuda.gpuarray as gpuarray import numpy mod = SourceModule(r""" void __global__ add(const float *x, const float *y, float *z) { const int n = blockDim.x * blockIdx.x + threadIdx.x; z[n] = x[n] + y[n]; } """) add = mod.get_function("add") num = 6 A = numpy.random.rand(num) B = numpy.random.rand(num) C = numpy.zeros(num) A_GPU = gpuarray.to_gpu(A.astype(numpy.float32)) B_GPU = gpuarray.to_gpu(B.astype(numpy.float32)) C_GPU = gpuarray.to_gpu(B.astype(numpy.float32)) add(A_GPU, B_GPU, C_GPU, grid=(2,), block=(4,1,1)) C = C_GPU.get() print('A=', A) print('B=', B) print('C=', C)
# memory allocate f = np.zeros((nx,ny,nz), dtype=np.float64) cf = np.ones_like(f)*0.5 eh_gpus = ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu = [cuda.to_device(f) for i in range(6)] ce_gpus = cex_gpu, cey_gpu, cez_gpu = [cuda.to_device(cf) for i in range(3)] # prepare kernels tpb = 256 for bpg in xrange(65535, 0, -1): if (nx * ny * nz / tpb) % bpg == 0: break print 'tpb = %d, bpg = %g' % (tpb, bpg) from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('Dx',str(tpb)).replace('nxyz',str(nx*ny*nz)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)), options=['-m 64'] ) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") update_h.prepare("PPPPPP", block=(tpb,1,1)) update_e.prepare("PPPPPPPPP", block=(tpb,1,1)) update_src.prepare("fP", block=(nz,1,1)) # prepare for plot #import matplotlib.pyplot as plt #plt.ion() #imsh = plt.imshow(np.ones((nx,ny),'f').T, cmap=plt.cm.hot, origin='lower', vmin=0, vmax=0.005) #plt.colorbar()
def __init__(self, img_size, **kwargs): self.num = CorrelStage.num CorrelStage.num += 1 self.verbose = kwargs.get("verbose", 0) self.debug(2, "Initializing with resolution", img_size) self.h, self.w = img_size self._ready = False self.nbIter = kwargs.get("iterations", 5) self.showDiff = kwargs.get("show_diff", False) if self.showDiff: import cv2 cv2.namedWindow("Residual", cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO) self.mul = kwargs.get("mul", 3) # These two store the values of the last resampled array # It is meant to allocate output array only once (see resampleD) self.rX, self.rY = -1, -1 # self.loop will be incremented every time getDisp is called # It will be used to measure performance and output some info self.loop = 0 # Allocating stuff # # Grid and block for kernels called with the size of the image # # All the images and arrays in the kernels will be in order (x,y) self.grid = (int(ceil(self.w / 32)), int(ceil(self.h / 32))) self.block = (int(ceil(self.w / self.grid[0])), int(ceil(self.h / self.grid[1])), 1) self.debug(3, "Default grid:", self.grid, "block", self.block) # We need the number of fields to allocate the G tables # self.Nfields = kwargs.get("Nfields") if self.Nfields is None: self.Nfields = len(kwargs.get("fields")[0]) # Allocating everything we need # self.devG = [] self.devFieldsX = [] self.devFieldsY = [] for i in range(self.Nfields): # devG stores the G arrays (to compute the research direction) self.devG.append(gpuarray.empty(img_size, np.float32)) # devFieldsX/Y store the fields value along X and Y self.devFieldsX.append(gpuarray.empty((self.h, self.w), np.float32)) self.devFieldsY.append(gpuarray.empty((self.h, self.w), np.float32)) # devH Stores the Hessian matrix self.H = np.zeros((self.Nfields, self.Nfields), np.float32) # And devHi stores its invert self.devHi = gpuarray.empty((self.Nfields, self.Nfields), np.float32) # devOut is written with the difference of the images self.devOut = gpuarray.empty((self.h, self.w), np.float32) # devX stores the value of the parameters (what is actually computed) self.devX = gpuarray.empty((self.Nfields), np.float32) # to store the research direction self.devVec = gpuarray.empty((self.Nfields), np.float32) # To store the original image on the device self.devOrig = gpuarray.empty(img_size, np.float32) # To store the gradient along X of the original image on the device self.devGradX = gpuarray.empty(img_size, np.float32) # And along Y self.devGradY = gpuarray.empty(img_size, np.float32) # Locating the kernel file # kernelFile = kwargs.get("kernel_file") if kernelFile is None: self.debug(2, "Kernel file not specified") from crappy import __path__ as crappyPath kernelFile = crappyPath[0] + "/data/kernels.cu" # Reading kernels and compiling module # with open(kernelFile, "r") as f: self.debug(3, "Sourcing module") self.mod = SourceModule(f.read() % (self.w, self.h, self.Nfields)) # Assigning functions to the kernels # # These kernels are defined in data/kernels.cu self._resampleOrigKrnl = self.mod.get_function('resampleO') self._resampleKrnl = self.mod.get_function('resample') self._gradientKrnl = self.mod.get_function('gradient') self._makeGKrnl = self.mod.get_function('makeG') self._makeDiff = self.mod.get_function('makeDiff') self._dotKrnl = self.mod.get_function('myDot') self._addKrnl = self.mod.get_function('kadd') # These ones use pyCuda reduction module to generate efficient kernels self._mulRedKrnl = ReductionKernel(np.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]*y[i]", arguments="float *x, float *y") self._leastSquare = ReductionKernel(np.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]*x[i]", arguments="float *x") # We could have used use mulRedKrnl(x,x), but this is probably faster ? # Getting texture references # self.tex = self.mod.get_texref('tex') self.tex_d = self.mod.get_texref('tex_d') self.texMask = self.mod.get_texref('texMask') # Setting proper flags # # All textures use normalized coordinates except for the mask for t in [self.tex, self.tex_d]: t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) for t in [self.tex, self.tex_d, self.texMask]: t.set_filter_mode(cuda.filter_mode.LINEAR) t.set_address_mode(0, cuda.address_mode.BORDER) t.set_address_mode(1, cuda.address_mode.BORDER) # Preparing kernels for less overhead when called # self._resampleOrigKrnl.prepare("Pii", texrefs=[self.tex]) self._resampleKrnl.prepare("Pii", texrefs=[self.tex_d]) self._gradientKrnl.prepare("PP", texrefs=[self.tex]) self._makeDiff.prepare("PPPP", texrefs=[self.tex, self.tex_d, self.texMask]) self._addKrnl.prepare("PfP") # Reading original image if provided # if kwargs.get("img") is not None: self.setOrig(kwargs.get("img")) # Reading fields if provided # if kwargs.get("fields") is not None: self.setFields(kwargs.get("fields")) # Reading mask if provided # if kwargs.get("mask") is not None: self.setMask(kwargs.get("mask"))
lines = numpy.array(data, dtype=str) # Allocate cuda memory for input data lines_gpu = cuda.mem_alloc(lines.size * lines.dtype.itemsize) # Copy data from host to device cuda.memcpy_htod(lines_gpu, lines) # Dimensions blocks = len(data) thread_per_block = lines.dtype.itemsize nbr_values = lines.size * lines.dtype.itemsize # Create destination array dest = numpy.zeros((nbr_values, ), dtype=numpy.int32) # Allocate cuda memory for output data dest_gpu = cuda.mem_alloc(dest.size * dest.dtype.itemsize) # Define cuda function mod = SourceModule(""" __global__ void process(int **dest, char **line){ int index = threadIdx.x + blockIdx.x * blockDim.x; dest[index] = (int *)line[index]; } """) # Run GPU model gpu_func = mod.get_function("process") gpu_func(dest_gpu, lines_gpu, grid=(blocks, 1), block=(thread_per_block, 1, 1)) cuda.memcpy_dtoh(dest, dest_gpu) print(dest)
def __init__(self, img_size, **kwargs): cuda.init() from pycuda.tools import make_default_context global context context = make_default_context() unknown = [] for k in kwargs.keys(): if k not in [ 'verbose', 'levels', 'resampling_factor', 'kernel_file', 'iterations', 'show_diff', 'Nfields', 'img', 'fields', 'mask', 'mul' ]: unknown.append(k) if len(unknown) != 0: warnings.warn( "Unrecognized parameter" + ('s: ' + str(unknown) if len(unknown) > 1 else ': ' + unknown[0]), SyntaxWarning) self.verbose = kwargs.get("verbose", 0) self.debug( 3, "You set the verbose level to the maximum.\n\ It may help finding bugs or tracking errors but it may also \ impact the program performance as it will print A LOT of \ output and add GPU->CPU copies only to print information.\n\ If it is not desired, consider lowering the verbosity: \ 1 or 2 is a reasonable choice, \ 0 won't show anything except for errors.") self.levels = kwargs.get("levels", 5) self.loop = 0 self.resamplingFactor = kwargs.get("resampling_factor", 2) h, w = img_size self.nbIter = kwargs.get("iterations", 4) self.debug(1, "Initializing... Master resolution:", img_size, "levels:", self.levels, "verbosity:", self.verbose) # Computing dimensions of the different levels # self.h, self.w = [], [] for i in range(self.levels): self.h.append(int(round(h / (self.resamplingFactor**i)))) self.w.append(int(round(w / (self.resamplingFactor**i)))) if kwargs.get("Nfields") is not None: self.Nfields = kwargs.get("Nfields") else: try: self.Nfields = len(kwargs["fields"]) except KeyError: self.debug( 0, "Error! You must provide the number of fields at init. \ Add Nfields=x or directly set fields with fields=list/tuple") raise ValueError kernelFile = kwargs.get("kernel_file") if kernelFile is None: self.debug( 3, "Kernel file not specified, using the one in crappy dir") from crappy import __path__ as crappyPath kernelFile = crappyPath[0] + "/data/kernels.cu" self.debug(3, "Kernel file:", kernelFile) # Creating a new instance of CorrelStage for each stage # self.correl = [] for i in range(self.levels): self.correl.append( CorrelStage((self.h[i], self.w[i]), verbose=self.verbose, Nfields=self.Nfields, iterations=self.nbIter, show_diff=(i == 0 and kwargs.get("show_diff", False)), mul=kwargs.get("mul", 3), kernel_file=kernelFile)) # Set original image if provided # if kwargs.get("img") is not None: self.setOrig(kwargs.get("img")) s = """ texture<float, cudaTextureType2D, cudaReadModeElementType> texFx{0}; texture<float, cudaTextureType2D, cudaReadModeElementType> texFy{0}; __global__ void resample{0}(float* outX, float* outY, int x, int y) {{ int idx = blockIdx.x*blockDim.x+threadIdx.x; int idy = blockIdx.y*blockDim.y+threadIdx.y; if(idx < x && idy < y) {{ outX[idy*x+idx] = tex2D(texFx{0},(float)idx/x, (float)idy/y); outY[idy*x+idx] = tex2D(texFy{0},(float)idx/x, (float)idy/y); }} }} """ self.src = "" for i in range(self.Nfields): self.src += s.format( i) # Adding textures for the quick fields resampling self.mod = SourceModule(self.src) self.texFx = [] self.texFy = [] self.resampleF = [] for i in range(self.Nfields): self.texFx.append(self.mod.get_texref("texFx%d" % i)) self.texFy.append(self.mod.get_texref("texFy%d" % i)) self.resampleF.append(self.mod.get_function("resample%d" % i)) self.resampleF[i].prepare("PPii", texrefs=[self.texFx[i], self.texFy[i]]) for t in self.texFx + self.texFy: t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) t.set_filter_mode(cuda.filter_mode.LINEAR) t.set_address_mode(0, cuda.address_mode.BORDER) t.set_address_mode(1, cuda.address_mode.BORDER) # Set fields if provided # if kwargs.get("fields") is not None: self.setFields(kwargs.get("fields")) if kwargs.get("mask") is not None: self.setMask(kwargs.get("mask"))
ker = SourceModule(''' // row-column dot-product for matrix multiplication __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) { printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N); float val = 0; for (int k=0; k < N; k++) { if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]); val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; } return(val); } // matrix multiplication kernel that is parallelized over row/column tuples. __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col); output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N); } ''')
def solve_gpu(currentmodelrun, modelend, G): """Solving using FDTD method on GPU. Implemented using Nvidia CUDA. Args: currentmodelrun (int): Current model run number. modelend (int): Number of last model to run. G (class): Grid class instance - holds essential parameters describing the model. Returns: tsolve (float): Time taken to execute solving """ import pycuda.driver as drv from pycuda.compiler import SourceModule drv.init() # Create device handle and context on specifc GPU device (and make it current context) dev = drv.Device(G.gpu.deviceID) ctx = dev.make_context() # Electric and magnetic field updates - prepare kernels, and get kernel functions if Material.maxpoles > 0: kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3])) else: # Set to one any substitutions for dispersive materials kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1)) update_e_gpu = kernels_fields.get_function("update_e") update_h_gpu = kernels_fields.get_function("update_h") # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0] if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem: raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name)) else: drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU if Material.maxpoles > 0: # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values). update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A") update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B") G.gpu_initialise_dispersive_arrays() # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU G.gpu_set_blocks_per_grid() G.gpu_initialise_arrays() # PML updates if G.pmls: # Prepare kernels kernels_pml = SourceModule(kernels_template_pml.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3])) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) # Set block per grid, initialise arrays on GPU, and get kernel functions for pml in G.pmls: pml.gpu_set_blocks_per_grid(G) pml.gpu_initialise_arrays() pml.gpu_get_update_funcs(kernels_pml) # Receivers if G.rxs: # Initialise arrays on GPU rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G) # Prepare kernel and get kernel function kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2])) store_outputs_gpu = kernel_store_outputs.get_function("store_outputs") # Sources - initialise arrays on GPU, prepare kernel and get kernel functions if G.voltagesources + G.hertziandipoles + G.magneticdipoles: kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3])) # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0] updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0] drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE) drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH) if G.hertziandipoles: srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G) update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole") if G.magneticdipoles: srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G) update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole") if G.voltagesources: srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G) update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source") # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions if G.snapshots: # Initialise arrays on GPU snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G) # Prepare kernel and get kernel function kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2])) store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot") # Iteration loop timer iterstart = drv.Event() iterend = drv.Event() iterstart.record() for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=G.tqdmdisable): # Get GPU memory usage on final iteration if iteration == G.iterations - 1: memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0] # Store field component values for every receiver if G.rxs: store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration), rxcoords_gpu.gpudata, rxs_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1)) # Store any snapshots for i, snap in enumerate(G.snapshots): if snap.time == iteration + 1: store_snapshot_gpu(np.int32(i), np.int32(snap.xs), np.int32(snap.xf), np.int32(snap.ys), np.int32(snap.yf), np.int32(snap.zs), np.int32(snap.zf), np.int32(snap.dx), np.int32(snap.dy), np.int32(snap.dz), G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata, snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata, block=Snapshot.tpb, grid=Snapshot.bpg) if G.snapsgpu2cpu: gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) # Update magnetic field components update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update magnetic field components with the PML correction for pml in G.pmls: pml.gpu_update_magnetic(G) # Update magnetic field components for magetic dipole sources if G.magneticdipoles: update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata, srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1)) # Update electric field components # If all materials are non-dispersive do standard update if Material.maxpoles == 0: update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # If there are any dispersive materials do 1st part of dispersive update # (it is split into two parts as it requires present and updated electric field values). else: update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata, block=G.tpb, grid=G.bpg) # Update electric field components with the PML correction for pml in G.pmls: pml.gpu_update_electric(G) # Update electric field components for voltage sources if G.voltagesources: update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata, srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1)) # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last) if G.hertziandipoles: update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration), floattype(G.dx), floattype(G.dy), floattype(G.dz), srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata, srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1)) # If there are any dispersive materials do 2nd part of dispersive update (it is split into two parts as it requires present and updated electric field values). Therefore it can only be completely updated after the electric field has been updated by the PML and source updates. if Material.maxpoles > 0: update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata, G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg) # Copy output from receivers array back to correct receiver objects if G.rxs: gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G) # Copy data from any snapshots back to correct snapshot objects if G.snapshots and not G.snapsgpu2cpu: for i, snap in enumerate(G.snapshots): gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(), snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap) iterend.record() iterend.synchronize() tsolve = iterstart.time_till(iterend) * 1e-3 # Remove context from top of stack and delete ctx.pop() del ctx return tsolve, memsolve
import numpy as np import pycuda.autoinit import pycuda.driver as cuda from pycuda.compiler import SourceModule mod = SourceModule(''' __global__ void Text_GPU(float *A , float *B){ int bid = blockIdx.x; int tid = threadIdx.x; __shared__ float s_data[128]; s_data[tid] = A[bid*128 + tid]; __syncthreads(); for(int i = 64;i>0;i/=2){ if(tid < i) s_data[tid] = s_data[tid] + s_data[tid +i]; __syncthreads(); } if(tid == 0) B[bid] = s_data[0]; } ''') Text_GPU = mod.get_function("Text_GPU") A = np.ones((32, 128), dtype=np.float32) B = np.ones((32,), dtype=np.float32) Text_GPU(cuda.In(A), cuda.InOut(B), grid=(32, 1), block=(128, 1, 1)) print(B)
mod = SourceModule(""" #include <stdio.h> /* This function compute the force between two blobs with vector between blob centers r. In this example the force is derived from a Yukawa potential U = eps * exp(-r_norm / b) / r_norm with eps = potential strength r_norm = distance between blobs b = Debye length */ __device__ void blob_blob_force(const double rx, const double ry, const double rz, double &fx, double &fy, double &fz, const double eps, const double b, const double a){ double r_norm = sqrt(rx*rx + ry*ry + rz*rz); double f = -((eps / b) + (eps / r_norm)) * exp(-r_norm / b) / (r_norm*r_norm); fx += f * rx; fy += f * ry; fz += f * rz; } /* This function computes the blob-blob force for all blobs. */ __global__ void calc_blob_blob_force(const double *x, double *f, const double repulsion_strength, const double debye_length, const double blob_radius, const double Lx, const double Ly, const double Lz, const int number_of_blobs){ int i = blockDim.x * blockIdx.x + threadIdx.x; if(i >= number_of_blobs) return; int offset_i = i * 3; int offset_j; double rx, ry, rz; double fx = 0; double fy = 0; double fz = 0; // Loop over blobs to add interanctions for(int j=0; j<number_of_blobs; j++){ offset_j = j * 3; // Compute blob to blob vector rx = x[offset_j] - x[offset_i]; ry = x[offset_j + 1] - x[offset_i + 1]; rz = x[offset_j + 2] - x[offset_i + 2]; // Project a vector r to the minimal image representation // centered around (0,0,0) and of size L=(Lx, Ly, Lz). If // any dimension of L is equal or smaller than zero the // box is assumed to be infinite in that direction. if(Lx > 0){ rx = rx - int(rx / Lx + 0.5 * (int(rx>0) - int(rx<0))) * Lx; } if(Ly > 0){ ry = ry - int(ry / Ly + 0.5 * (int(ry>0) - int(ry<0))) * Ly; } if(Lz > 0){ rz = rz - int(rz / Lz + 0.5 * (int(rz>0) - int(rz<0))) * Lz; } // Compute force between blobs i and j if(i != j){ blob_blob_force(rx, ry, rz, fx, fy, fz, repulsion_strength, debye_length, blob_radius); } } // Return forces f[offset_i] = fx; f[offset_i + 1] = fy; f[offset_i + 2] = fz; } """)