Code Example #1
import pycuda.driver as drv


def page_locked_array(a):
    """Copy array a into a page-locked (pinned) array of the same shape."""
    a_pl = drv.pagelocked_zeros_like(a,
                                     mem_flags=drv.host_alloc_flags.DEVICEMAP)
    if len(a.shape) == 1:
        a_pl[:] = a
    else:
        a_pl[:, :] = a
    return a_pl
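
The helper above only builds the pinned copy; the usual payoff is that transfers between a page-locked array and the device can be issued asynchronously. Below is a minimal usage sketch (not from the original project), assuming a CUDA context created by pycuda.autoinit; the array shape is arbitrary and page_locked_array is the helper defined above.

import numpy as np
import pycuda.autoinit  # creates a default CUDA context
import pycuda.driver as drv

a = np.random.rand(1024, 1024).astype(np.float32)
a_pl = page_locked_array(a)               # pinned copy of a (helper from Example #1)

d_a = drv.mem_alloc(a_pl.nbytes)          # ordinary device buffer
stream = drv.Stream()
drv.memcpy_htod_async(d_a, a_pl, stream)  # async copy; requires pinned host memory
stream.synchronize()                      # wait for the transfer to finish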
Code Example #2
    def __init__(self, field_name, dset, stencil_size):
        """ Initialize the MyField instance. """
        self.name = field_name  # What the field is called.
        self.data = dset[:].astype(np.float32)  # Get the field values.
        self.ind = 0  # A counter for the variable dimension.

        # Determine whether this is a global field or not.
        if (self.data.ndim == 4):  # Standard 3-D field.
            self.isglobal = False
            self.global_dims = dset.shape[
                1:]  # Get the 3D shape of the dataset.
            # Get the dataset values.
            self.data = np.reshape(dset[:].astype(np.float32),
                                   self.global_dims)
            self.global_offset = dset.attrs['offset'][:].astype(np.float32)
            self.spacing = dset.attrs['pitch'][:].astype(np.float32)
            self.d_data = None  # We haven't loaded the field onto the GPU yet.

            # Describe what was read in.
            print self.name, 'field loading...'
            print '\tdimensions:', self.global_dims
            print '\toffset:', self.global_offset
            print '\tpitch:', self.spacing

            # Add the padding necessary for automatic parallelization.
            self.pad_size = np.ceil(stencil_size / self.spacing).astype(np.int)
            self.local_dims = self.global_dims + 2 * self.pad_size
            self.local_offset = self.pad_size
            temp_data = np.zeros(self.local_dims).astype(np.float32)
            temp_data[
                self.local_offset[0]:self.local_offset[0] + self.global_dims[0],
                self.local_offset[1]:self.local_offset[1] + self.global_dims[1],
                self.local_offset[2]:self.local_offset[2] + self.global_dims[2]
            ] = self.data
            self.data = temp_data

            # Copy array over to the device.
            self.d_data = drv.mem_alloc(self.data.nbytes)
            drv.memcpy_htod(self.d_data, self.data)

        else:  # Global field.
            self.isglobal = True

            # Describe what was read in.
            print 'Loading global field', self.name

            # Copy array over to the device.
            data = dset[:].astype(np.float32)
            self.data = drv.pagelocked_zeros_like(
                data, mem_flags=drv.host_alloc_flags.DEVICEMAP)
            self.d_data = self.data.base.get_device_pointer()
            self.data[:] = data[:]

            # To enable writing to the hdf5 file at the very end only.
            self.data_hist = []
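
The branch above for global fields relies on the mapped-memory side of DEVICEMAP: the pinned numpy array keeps the PyCUDA allocation in its .base attribute, and get_device_pointer() returns an address that kernels can use directly, so no explicit memcpy is needed. Here is a minimal sketch of that pattern in isolation (my own example, not part of MyField); it assumes a device that supports mapped host memory and creates the context with the MAP_HOST flag, which older setups require for get_device_pointer() to succeed.

import numpy as np
import pycuda.driver as drv

drv.init()
ctx = drv.Device(0).make_context(drv.ctx_flags.MAP_HOST)
try:
    host = drv.pagelocked_zeros((16, 16), np.float32,
                                mem_flags=drv.host_alloc_flags.DEVICEMAP)
    host[:] = 1.0                            # fill the array on the host side
    d_ptr = host.base.get_device_pointer()   # device-visible address of the same memory
    # d_ptr can be passed to a kernel wherever a mem_alloc() pointer would go.
finally:
    ctx.pop()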
Code Example #3
File: utility.py  Project: budiaji/anuga-cuda
def get_page_locked_array(a):
    """Replace the pageable array to page-locked array"""

    import pycuda.driver as drv

    temp_page_lock_p = drv.pagelocked_zeros_like(a, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    if len(a.shape) == 1:
        temp_page_lock_p[:] = a
    else:
        temp_page_lock_p[:, :] = a
    assert numpy.allclose(a, temp_page_lock_p)
    return temp_page_lock_p
Code Example #4
File: my_fields.py  Project: JesseLu/gce-server
    def __init__(self, field_name, dset, stencil_size):
        """ Initialize the MyField instance. """
        self.name = field_name # What the field is called.
        self.data = dset[:].astype(np.float32) # Get the field values.
        self.ind = 0 # A counter for the variable dimension.

        # Determine whether this is a global field or not.
        if (self.data.ndim == 4): # Standard 3-D field.
            self.isglobal = False
            self.global_dims = dset.shape[1:] # Get the 3D shape of the dataset.
            # Get the dataset values.
            self.data = np.reshape(dset[:].astype(np.float32), self.global_dims) 
            self.global_offset = dset.attrs['offset'][:].astype(np.float32)
            self.spacing = dset.attrs['pitch'][:].astype(np.float32)
            self.d_data = None # We haven't loaded the field onto the GPU yet.

            # Describe what was read in.
            print self.name, 'field loading...'
            print '\tdimensions:', self.global_dims
            print '\toffset:', self.global_offset
            print '\tpitch:', self.spacing

            # Add the padding necessary for automatic parallelization.
            self.pad_size = np.ceil(stencil_size / self.spacing).astype(np.int)
            self.local_dims = self.global_dims + 2 * self.pad_size
            self.local_offset = self.pad_size
            temp_data = np.zeros(self.local_dims).astype(np.float32)
            temp_data[
                self.local_offset[0]:self.local_offset[0] + self.global_dims[0],
                self.local_offset[1]:self.local_offset[1] + self.global_dims[1],
                self.local_offset[2]:self.local_offset[2] + self.global_dims[2]
            ] = self.data
            self.data = temp_data

            # Copy array over to the device.
            self.d_data = drv.mem_alloc(self.data.nbytes) 
            drv.memcpy_htod(self.d_data, self.data)

        else: # Global field.
            self.isglobal = True

            # Describe what was read in.
            print 'Loading global field', self.name

            # Copy array over to the device.
            data = dset[:].astype(np.float32)
            self.data = drv.pagelocked_zeros_like(
                data, mem_flags=drv.host_alloc_flags.DEVICEMAP)
            self.d_data = self.data.base.get_device_pointer()
            self.data[:] = data[:]

            # To enable writing to the hdf5 file at the very end only.
            self.data_hist = []
Code Example #5
def get_page_locked_array(a):
    """Replace the pageable array to page-locked array"""

    import pycuda.driver as drv

    temp_page_lock_p = drv.pagelocked_zeros_like(
        a, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    if len(a.shape) == 1:
        temp_page_lock_p[:] = a
    else:
        temp_page_lock_p[:, :] = a
    assert numpy.allclose(a, temp_page_lock_p)
    return temp_page_lock_p
Code Example #6
	def alloc_exchange_boundaries(s):
		s.ey_tmp = cuda.pagelocked_zeros((s.ny,s.nz),'f')
		s.ez_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
		s.hy_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
		s.hz_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
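
These four buffers are staging areas for the boundary (halo) exchange performed in the later examples: boundary slices are copied off the GPU into them and then handed to MPI. Page-locking them is what allows the device-to-host copy to be issued asynchronously. A minimal sketch of that idea follows, with made-up sizes and a stand-in device buffer, assuming a context from pycuda.autoinit.

import numpy as np
import pycuda.autoinit  # creates a default CUDA context
import pycuda.driver as drv

ny, nz = 64, 64
d_field = drv.mem_alloc(ny * nz * np.nbytes['float32'])  # stand-in for a field like hy_gpu
drv.memset_d32(d_field, 0, ny * nz)                      # zero the device buffer
ey_tmp = drv.pagelocked_zeros((ny, nz), np.float32)      # pinned staging buffer

stream = drv.Stream()
drv.memcpy_dtoh_async(ey_tmp, d_field, stream)  # returns immediately; copy runs on the stream
# ... other host work (e.g. posting MPI receives) can overlap here ...
stream.synchronize()                            # ey_tmp is now safe to hand to MPI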
Code Example #7
        'update_h': np.zeros(tmax),
        'mpi_recv_h': np.zeros(tmax),
        'memcpy_htod_h': np.zeros(tmax),
        'mpi_send_h': np.zeros(tmax),
        'memcpy_dtoh_h': np.zeros(tmax),
        'update_e': np.zeros(tmax),
        'mpi_recv_e': np.zeros(tmax),
        'memcpy_htod_e': np.zeros(tmax),
        'mpi_send_e': np.zeros(tmax),
        'memcpy_dtoh_e': np.zeros(tmax),
        'src_e': np.zeros(tmax)
    }

# main loop
ey_tmp = cuda.pagelocked_zeros((ny, nz), 'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)
for tn in xrange(1, tmax + 1):
    if rank == 1: start.record()
    for i, bpg in enumerate(bpg_list):
        update_h.prepared_call(bpg, np.int32(i * MBy), *eh_args)

    if rank == 0:
        cuda.memcpy_dtoh(
            hy_tmp,
            int(hy_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
        cuda.memcpy_dtoh(
            hz_tmp,
            int(hz_gpu) + (nx - 1) * ny * nz * np.nbytes['float32'])
        comm.Send(hy_tmp, 1, 20)
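
The second argument of memcpy_dtoh above is plain pointer arithmetic: int(hy_gpu) is the base device address, and (nx - 1) * ny * nz * np.nbytes['float32'] is the byte offset of the last x-slab of an (nx, ny, nz) float32 array stored in C order. A small self-contained check of that offset (hypothetical sizes, not taken from the original script):

import numpy as np
import pycuda.autoinit  # creates a default CUDA context
import pycuda.driver as drv

nx, ny, nz = 8, 4, 4
field = np.arange(nx * ny * nz, dtype=np.float32).reshape(nx, ny, nz)
d_field = drv.mem_alloc(field.nbytes)
drv.memcpy_htod(d_field, field)

last_slab = drv.pagelocked_zeros((ny, nz), np.float32)
offset = (nx - 1) * ny * nz * np.nbytes['float32']  # bytes from the base to field[nx - 1]
drv.memcpy_dtoh(last_slab, int(d_field) + offset)

assert np.array_equal(last_slab, field[nx - 1])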
Code Example #8
    stop = cuda.Event()
    start.record()

# main loop
for tn in xrange(1, tmax + 1):
    for i, bpg in enumerate(bpg_list):
        update_h.prepared_call(bpg, np.int32(i * MBy), *eh_args)
    for i, bpg in enumerate(bpg_list):
        update_e.prepared_call(bpg, np.int32(i * MBy), *eh_args)
    if rank == 1: update_src.prepared_call((1, 1), np.float32(tn), ez_gpu)

    if tn % 10 == 0 and rank == 0:
        print "tn =\t%d/%d (%d %%)\r" % (tn, tmax, float(tn) / tmax * 100),
        sys.stdout.flush()

g = cuda.pagelocked_zeros_like(f)
cuda.memcpy_dtoh(g, ez_gpu)
if rank == 1:
    comm.Send(g, 0, 21)
else:
    lg = np.zeros((2 * nx, ny), 'f')
    lg[:nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 1, 21)
    lg[nx:, :] = g[:, :, nz / 2]

    #cuda.memcpy_dtoh(f, ez_gpu)
    #f[:,:,:] = cuda.from_device(int(ez_gpu), (nx,ny,nz), np.float32)
    #imsh.set_array( f[:,:,nz/2].T**2 )
    #f[:,:,nz/2] = cuda.from_device(int(ez_gpu), (nx,ny,nz), np.float32)[:,:,nz/2]
    #f = np.zeros((2*nx,ny),'f')
    #f[:nx,:] = cuda.from_device(int(ez_gpu), (nx,ny,nz), np.float32)[:,:,nz/2]
Code Example #9
	from datetime import datetime
	t1 = datetime.now()
	flop = 3*(nx*ny*nz*30)*tgap
	flops = np.zeros(tmax/tgap+1)
	start, stop = cuda.Event(), cuda.Event()
	start.record()

elif rank == 1:
	start, stop = cuda.Event(), cuda.Event()
	exec_time = {
		'update_h': np.zeros(tmax), 'mpi_recv_h': np.zeros(tmax),
		'memcpy_htod_h': np.zeros(tmax), 'mpi_send_h': np.zeros(tmax),
		'memcpy_dtoh_h': np.zeros(tmax),
		'update_e': np.zeros(tmax), 'mpi_recv_e': np.zeros(tmax),
		'memcpy_htod_e': np.zeros(tmax), 'mpi_send_e': np.zeros(tmax),
		'memcpy_dtoh_e': np.zeros(tmax),
		'src_e': np.zeros(tmax)}

# main loop
ey_tmp = cuda.pagelocked_zeros((ny,nz),'f')
ez_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hy_tmp = cuda.pagelocked_zeros_like(ey_tmp)
hz_tmp = cuda.pagelocked_zeros_like(ey_tmp)
for tn in xrange(1, tmax+1):
	if rank == 1: start.record()
	for i, bpg in enumerate(bpg_list): update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args)

	if rank == 0:
		cuda.memcpy_dtoh(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) 
		cuda.memcpy_dtoh(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32']) 
		comm.Send(hy_tmp, 1, 20)
		comm.Send(hz_tmp, 1, 21)
	elif rank == 1:
		stop.record()
		stop.synchronize()
		exec_time['update_h'][tn-1] = stop.time_since(start)
Code Example #10
		mpi.world.send(1, 22, ey_tmp)
		mpi.world.send(1, 23, ez_tmp)

	if rank == 1: update_src.prepared_call((1,1), np.float32(tn), ez_gpu)

	if tn%tgap == 0 and rank == 0:
		stop.record()
		stop.synchronize()
		flops[tn/tgap] = flop/stop.time_since(start)*1e-6
		print '[',datetime.now()-t1,']'," %d/%d (%d %%) %1.2f GFLOPS\r" % (tn, tmax, float(tn)/tmax*100, flops[tn/tgap]),
		sys.stdout.flush()
		start.record()

if rank == 0: print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()

g = cuda.pagelocked_zeros_like(f)
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
	mpi.world.send(0, 24, g[:,:,nz/2])
else:
	lg = np.zeros((3*nx,ny),'f')
	lg[:nx,:] = g[:,:,nz/2]
	lg[nx:-nx,:] = mpi.world.recv(1, 24) 
	lg[2*nx:,:] = mpi.world.recv(2, 24) 
	imsh.set_array( lg.T**2 )
	show()#draw()
	#savefig('./png-wave/%.5d.png' % tstep) 

	stop.record()
	stop.synchronize()
	print stop.time_since(start)*1e-3
Code Example #11
def alloc_exchange_boundaries(s):
    s.ey_tmp = cuda.pagelocked_zeros((s.ny, s.nz), 'f')
    s.ez_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
    s.hy_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)
    s.hz_tmp = cuda.pagelocked_zeros_like(s.ey_tmp)