def page_locked_array(a):
    """Return a page-locked (pinned), device-mapped copy of array a."""
    a_pl = drv.pagelocked_zeros_like(a, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    if len(a.shape) == 1:
        a_pl[:] = a
    else:
        a_pl[:, :] = a
    return a_pl
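# A minimal usage sketch of page_locked_array (not from the original sources; it
# assumes a CUDA context created by pycuda.autoinit and that drv refers to
# pycuda.driver). With the DEVICEMAP flag, the pinned host buffer can also be
# addressed from device code through a device pointer, provided the device and
# context support host-mapped memory.
import numpy as np
import pycuda.autoinit
import pycuda.driver as drv

a = np.random.rand(256, 256).astype(np.float32)
a_pl = page_locked_array(a)             # pinned, device-mapped copy of a
d_ptr = a_pl.base.get_device_pointer()  # device-side address of the same memory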
def __init__(self, field_name, dset, stencil_size):
    """ Initialize the MyField instance. """
    self.name = field_name  # What the field is called.
    self.data = dset[:].astype(np.float32)  # Get the field values.
    self.ind = 0  # A counter for the variable dimension.

    # Determine whether this is a global field or not.
    if (self.data.ndim == 4):  # Standard 3-D field.
        self.isglobal = False
        self.global_dims = dset.shape[1:]  # Get the 3D shape of the dataset.

        # Get the dataset values.
        self.data = np.reshape(dset[:].astype(np.float32), self.global_dims)
        self.global_offset = dset.attrs['offset'][:].astype(np.float32)
        self.spacing = dset.attrs['pitch'][:].astype(np.float32)
        self.d_data = None  # We haven't loaded the field onto the GPU yet.

        # Describe what was read in.
        print self.name, 'field loading...'
        print '\tdimensions:', self.global_dims
        print '\toffset:', self.global_offset
        print '\tpitch:', self.spacing

        # Add the padding necessary for automatic parallelization.
        self.pad_size = np.ceil(stencil_size / self.spacing).astype(np.int)
        self.local_dims = self.global_dims + 2 * self.pad_size
        self.local_offset = self.pad_size
        temp_data = np.zeros(self.local_dims).astype(np.float32)
        temp_data[
            self.local_offset[0]:self.local_offset[0] + self.global_dims[0],
            self.local_offset[1]:self.local_offset[1] + self.global_dims[1],
            self.local_offset[2]:self.local_offset[2] + self.global_dims[2]
        ] = self.data
        self.data = temp_data

        # Copy array over to the device.
        self.d_data = drv.mem_alloc(self.data.nbytes)
        drv.memcpy_htod(self.d_data, self.data)

    else:  # Global field.
        self.isglobal = True

        # Describe what was read in.
        print 'Loading global field', self.name

        # Copy array over to the device.
        data = dset[:].astype(np.float32)
        self.data = drv.pagelocked_zeros_like(data,
                                              drv.host_alloc_flags.DEVICEMAP)
        self.d_data = self.data.base.get_device_pointer()
        self.data[:] = data[:]

    # To enable writing to the hdf5 file at the very end only.
    self.data_hist = []
def get_page_locked_array(a):
    """Replace the pageable array with a page-locked array."""
    import numpy
    import pycuda.driver as drv
    temp_page_lock_p = drv.pagelocked_zeros_like(
        a, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    if len(a.shape) == 1:
        temp_page_lock_p[:] = a
    else:
        temp_page_lock_p[:, :] = a
    assert numpy.allclose(a, temp_page_lock_p)
    return temp_page_lock_p
def alloc_exchange_boundaries(s):
    # Pinned host buffers for the boundary planes exchanged between MPI ranks.
    s.ey_tmp = cuda.pagelocked_zeros((s.ny, s.nz), 'f')
    s.ez_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
    s.hy_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
    s.hz_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
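# Hedged sketch (illustrative names, not from the original code): pinned buffers
# like the ones allocated above allow asynchronous device-to-host copies, so a
# boundary plane can be staged for an MPI send while the GPU keeps computing.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

ny, nz = 128, 128
stream = cuda.Stream()
plane = cuda.pagelocked_zeros((ny, nz), np.float32)  # pinned host buffer
d_field = cuda.mem_alloc(plane.nbytes)               # stand-in device array
cuda.memcpy_dtoh_async(plane, d_field, stream)       # overlaps with other GPU work
stream.synchronize()                                 # wait before using plane on the host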
stop = cuda.Event()
start.record()

# main loop
for tn in xrange(1, tmax + 1):
    for i, bpg in enumerate(bpg_list):
        update_h.prepared_call(bpg, np.int32(i * MBy), *eh_args)

    for i, bpg in enumerate(bpg_list):
        update_e.prepared_call(bpg, np.int32(i * MBy), *eh_args)

    if rank == 1:
        update_src.prepared_call((1, 1), np.float32(tn), ez_gpu)

    if tn % 10 == 0 and rank == 0:
        print "tn =\t%d/%d (%d %%)\r" % (tn, tmax, float(tn) / tmax * 100),
        sys.stdout.flush()

g = cuda.pagelocked_zeros_like(f)
cuda.memcpy_dtoh(g, ez_gpu)
if rank == 1:
    comm.Send(g, 0, 21)
else:
    lg = np.zeros((2 * nx, ny), 'f')
    lg[:nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 1, 21)
    lg[nx:, :] = g[:, :, nz / 2]

#cuda.memcpy_dtoh(f, ez_gpu)
#f[:,:,:] = cuda.from_device(int(ez_gpu), (nx,ny,nz), np.float32)
#imsh.set_array( f[:,:,nz/2].T**2 )
#f[:,:,nz/2] = cuda.from_device(int(ez_gpu), (nx,ny,nz), np.float32)[:,:,nz/2]
#f = np.zeros((2*nx,ny),'f')
#f[:nx,:] = cuda.from_device(int(ez_gpu), (nx,ny,nz), np.float32)[:,:,nz/2]
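# Hedged two-rank sketch of the Send/Recv pattern used above (run under
# mpirun -n 2; assumes mpi4py and a GPU per process). Pinned numpy buffers can
# be passed directly to the buffer-based Send/Recv calls; names are illustrative.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
buf = cuda.pagelocked_zeros((4, 4), np.float32)  # pinned staging buffer
if rank == 1:
    buf[:] = 1.0
    comm.Send(buf, dest=0, tag=21)
elif rank == 0:
    comm.Recv(buf, source=1, tag=21)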
if rank == 0:  # guard reconstructed: the GFLOPS bookkeeping below is used only by rank 0
    from datetime import datetime
    t1 = datetime.now()
    flop = 3 * (nx * ny * nz * 30) * tgap
    flops = np.zeros(tmax / tgap + 1)
    start, stop = cuda.Event(), cuda.Event()
    start.record()
elif rank == 1:
    start, stop = cuda.Event(), cuda.Event()
    exec_time = {'update_h': np.zeros(tmax),
                 'mpi_recv_h': np.zeros(tmax),
                 'memcpy_htod_h': np.zeros(tmax),
                 'mpi_send_h': np.zeros(tmax),
                 'memcpy_dtoh_h': np.zeros(tmax),
                 'update_e': np.zeros(tmax),
                 'mpi_recv_e': np.zeros(tmax),
                 'memcpy_htod_e': np.zeros(tmax),
                 'mpi_send_e': np.zeros(tmax),
                 'memcpy_dtoh_e': np.zeros(tmax),
                 'src_e': np.zeros(tmax)}

# main loop
ey_tmp = cuda.pagelocked_zeros((ny, nz), 'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)

for tn in xrange(1, tmax + 1):
    if rank == 1:
        start.record()
    for i, bpg in enumerate(bpg_list):
        update_h.prepared_call(bpg, np.int32(i * MBy), *eh_args)

    if rank == 0:
        cuda.memcpy_dtoh(hy_tmp, int(hy_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
        cuda.memcpy_dtoh(hz_tmp, int(hz_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
        comm.Send(hy_tmp, 1, 20)
        comm.Send(hz_tmp, 1, 21)
    elif rank == 1:
        stop.record()
        stop.synchronize()
        exec_time['update_h'][tn - 1] = stop.time_since(start)
    # (excerpt continues inside the time-stepping loop over tn)
    mpi.world.send(1, 22, ey_tmp)
    mpi.world.send(1, 23, ez_tmp)

    if rank == 1:
        update_src.prepared_call((1, 1), np.float32(tn), ez_gpu)

    if tn % tgap == 0 and rank == 0:
        stop.record()
        stop.synchronize()
        flops[tn / tgap] = flop / stop.time_since(start) * 1e-6
        print '[', datetime.now() - t1, ']', " %d/%d (%d %%) %1.2f GFLOPS\r" % (tn, tmax, float(tn) / tmax * 100, flops[tn / tgap]),
        sys.stdout.flush()
        start.record()

if rank == 0:
    print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()

g = cuda.pagelocked_zeros_like(f)
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
    mpi.world.send(0, 24, g[:, :, nz / 2])
else:
    lg = np.zeros((3 * nx, ny), 'f')
    lg[:nx, :] = g[:, :, nz / 2]
    lg[nx:-nx, :] = mpi.world.recv(1, 24)
    lg[2 * nx:, :] = mpi.world.recv(2, 24)
    imsh.set_array(lg.T ** 2)
    show()  #draw()
    #savefig('./png-wave/%.5d.png' % tstep)

stop.record()
stop.synchronize()
print stop.time_since(start) * 1e-3
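# Hedged sketch of the CUDA-event timing idiom used in the snippets above
# (names illustrative). time_since() returns milliseconds, which is why the
# GFLOPS computation above multiplies flop / ms by 1e-6.
import pycuda.autoinit
import pycuda.driver as cuda

start, stop = cuda.Event(), cuda.Event()
start.record()
# ... launch kernels here ...
stop.record()
stop.synchronize()
elapsed_ms = stop.time_since(start)  # milliseconds between the two events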