-
Notifications
You must be signed in to change notification settings - Fork 1
/
cudaTools.py
156 lines (149 loc) · 5.27 KB
/
cudaTools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import pycuda.driver as cuda
import sys
######################################################################
# Module-level guard read and written by setCudaDevice(): it is set to True
# once that function has created a context, and makes later calls return
# immediately.
CUDA_initialized = False
def setCudaDevice(devN=None, usingAnimation=False):
    """Pick a CUDA device and make a fresh context on it current.

    The available devices are listed on stdout. The work is done only on
    the first call in a process; the module-level ``CUDA_initialized`` flag
    turns later calls into no-ops.

    Args:
        devN: Index of the device to use when more than one device is
            present. If None, the index is read interactively from stdin.
            When at most one device is present this argument is ignored
            and device 0 is used.
        usingAnimation: If true, the context is created with
            ``pycuda.gl.make_context`` (imported lazily) instead of
            ``Device.make_context``.

    Returns:
        The selected ``cuda.Device`` on the first call; None on any later
        call (the function returns early once initialised).
    """
    global CUDA_initialized
    if CUDA_initialized:
        return

    # Imported for its side effect: pycuda.autoinit initialises the driver
    # and creates a default context, which is replaced further down.
    import pycuda.autoinit

    device_count = cuda.Device.count()
    print("\nAvailable Devices:")
    for index in range(device_count):
        print(" Device {0}: {1}".format(index, cuda.Device(index).name()))

    devNumber = 0
    if device_count > 1:
        if devN is None:
            # raw_input only exists on Python 2; fall back to input on 3.
            try:
                read_line = raw_input
            except NameError:
                read_line = input
            devNumber = int(read_line("Select device number: "))
        else:
            devNumber = devN

    # Resolve the device *before* dropping the default context, so that an
    # invalid index raises while a context is still current and the call
    # can simply be retried.
    dev = cuda.Device(devNumber)
    cuda.Context.pop()  # Disable the previous (autoinit) CUDA context

    if usingAnimation:
        import pycuda.gl as cuda_gl
        cuda_gl.make_context(dev)
    else:
        dev.make_context()

    print("Using device {0}: {1}".format(devNumber, dev.name()))
    CUDA_initialized = True
    return dev
######################################################################
def mpi_setCudaDevice(pId, devN, MPI_comm, show=True):
    """Initialise CUDA and create a context on device ``devN`` for one process.

    Args:
        pId: Identifier of the calling process (presumably its MPI rank -
            confirm against callers). Only the process with ``pId == 0``
            prints the device list.
        devN: Index of the device this process should use.
        MPI_comm: Object providing ``Barrier()``; it is called once by every
            process, after the optional device listing.
        show: When true, process 0 lists the available devices on stdout.

    Returns:
        Tuple ``(ctx, dev)``: the context created on the device, and the
        device itself.
    """
    cuda.init()

    if pId == 0 and show:
        device_count = cuda.Device.count()
        print("Available Devices:")
        for index in range(device_count):
            listed = cuda.Device(index)
            print(" Device {0}: {1}".format(index, listed.name()))

    # Every process waits here, so the listing above is finished before any
    # process prints its own "Using device" line.
    MPI_comm.Barrier()

    dev = cuda.Device(devN)
    ctx = dev.make_context()
    # NOTE(review): PyCUDA documents make_context() as already making the new
    # context current, so this push looks like a second entry on the context
    # stack - confirm how callers pop before changing it.
    ctx.push()

    active_name = ctx.get_device().name()
    print("[pId {0}] Using device {1}: {2}".format(pId, devN, active_name))
    return ctx, dev
#####################################################################
def getFreeMemory(show=True):
    """Return the free memory reported by ``cuda.mem_get_info()``.

    The driver is queried exactly once, so the value printed and the value
    returned always agree.

    Args:
        show: When true, also print the amount on stdout, divided by 1e6
            and labelled as MB.

    Returns:
        The first element of ``cuda.mem_get_info()`` (free memory in bytes).
    """
    free_bytes = cuda.mem_get_info()[0]
    if show:
        print(" Free Global Memory: {0:.0f} MB".format(free_bytes / 1e6))
    return free_bytes
#####################################################################
def kernelMemoryInfo(kernel, kernelName=""):
    """Print the resource-usage attributes of a kernel object on stdout.

    Args:
        kernel: Object exposing ``shared_size_bytes``, ``num_regs``,
            ``local_size_bytes``, ``const_size_bytes`` and
            ``max_threads_per_block`` (presumably a PyCUDA ``Function`` -
            confirm against callers).
        kernelName: Label appended to the header line.

    Returns:
        None. The report is only printed.
    """
    # Read every attribute up front so a missing one fails before any output.
    shared = kernel.shared_size_bytes
    regs = kernel.num_regs
    local = kernel.local_size_bytes
    const = kernel.const_size_bytes
    max_threads = kernel.max_threads_per_block

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("=Kernel Memory= {0}".format(kernelName))
    print(
        "Local:%d,\nShared:%d,\nRegisters:%d,\nConst:%d,\nMax Threads/B:%d"
        % (local, shared, regs, const, max_threads)
    )
#####################################################################
def np2DtoCudaArray(npArray, allowSurfaceBind=False):
    """Copy a 2D host array into a newly allocated CUDA array.

    Args:
        npArray: 2D host array exposing ``shape``, ``dtype`` and ``strides``.
            The number of bytes copied per row is taken from ``strides[0]``,
            so rows are assumed to be densely packed - TODO confirm callers
            never pass non-contiguous views.
        allowSurfaceBind: When true, the array is allocated with the
            ``SURFACE_LDST`` flag.

    Returns:
        Tuple ``(cudaArray, descriptor)``: the new CUDA array and the
        descriptor it was created from (an ``ArrayDescriptor``, or an
        ``ArrayDescriptor3D`` with depth 0 when ``allowSurfaceBind`` is set).
    """
    h, w = npArray.shape

    if allowSurfaceBind:
        # The original assigned to an undefined name `descr` here (NameError).
        # The 2D descriptor has no flags field, so the surface flag has to go
        # through the 3D descriptor; a depth of 0 still allocates a 2D array.
        descr2D = cuda.ArrayDescriptor3D()
        descr2D.depth = 0
        descr2D.flags = cuda.array3d_flags.SURFACE_LDST
    else:
        descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr2D.num_channels = 1
    cudaArray = cuda.Array(descr2D)

    row_bytes = npArray.strides[0]
    copy2D = cuda.Memcpy2D()
    copy2D.set_src_host(npArray)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = row_bytes
    copy2D.width_in_bytes = row_bytes
    # NOTE(review): src_height is kept from the original; PyCUDA documents it
    # for Memcpy3D - confirm whether Memcpy2D uses it at all.
    copy2D.src_height = h
    copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, descr2D
#####################################################################
def np3DtoCudaArray(npArray, allowSurfaceBind=False):
    """Copy a 3D host array into a newly allocated 3D CUDA array.

    Args:
        npArray: 3D host array whose shape is read as
            ``(depth, height, width)``. The row size in bytes is taken from
            ``strides[1]``.
        allowSurfaceBind: When true, the array is allocated with the
            ``SURFACE_LDST`` flag; otherwise the flags are 0.

    Returns:
        Tuple ``(cudaArray, copy3D)``: the new CUDA array and the
        ``Memcpy3D`` object that performed the upload.
    """
    depth, height, width = npArray.shape

    descriptor = cuda.ArrayDescriptor3D()
    descriptor.width = width
    descriptor.height = height
    descriptor.depth = depth
    descriptor.format = cuda.dtype_to_array_format(npArray.dtype)
    descriptor.num_channels = 1
    if allowSurfaceBind:
        descriptor.flags = cuda.array3d_flags.SURFACE_LDST
    else:
        descriptor.flags = 0
    device_array = cuda.Array(descriptor)

    row_bytes = npArray.strides[1]
    uploader = cuda.Memcpy3D()
    uploader.set_src_host(npArray)
    uploader.set_dst_array(device_array)
    uploader.src_pitch = row_bytes
    uploader.width_in_bytes = row_bytes
    uploader.height = height
    uploader.src_height = height
    uploader.depth = depth
    uploader()
    return device_array, uploader
#####################################################################
def gpuArray3DtocudaArray(gpuArray, allowSurfaceBind=False, precision='float'):
    """Copy a 3D device array into a newly allocated 3D CUDA array.

    Args:
        gpuArray: Device array exposing ``shape`` (read as
            ``(depth, height, width)``), ``dtype``, ``strides`` and ``ptr``.
        allowSurfaceBind: When true, the array is allocated with the
            ``SURFACE_LDST`` flag; otherwise the flags are 0.
        precision: ``'float'`` derives the element format from
            ``gpuArray.dtype`` with one channel; ``'double'`` uses two
            ``SIGNED_INT32`` channels per element.

    Returns:
        Tuple ``(cudaArray, copy3D)``: the new CUDA array and the
        ``Memcpy3D`` object that performed the copy.

    Raises:
        SystemExit: If ``precision`` is neither ``'float'`` nor ``'double'``.
            The message is written to stderr and the exit status is non-zero
            (the original printed to stdout and exited with status 0).
    """
    # Validate first so nothing is touched when the request is unsupported.
    if precision not in ('float', 'double'):
        sys.exit("ERROR: CUDA_ARRAY incompatible precision")

    d, h, w = gpuArray.shape
    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    if precision == 'float':
        descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype)
        descr3D.num_channels = 1
    else:
        # 'double': each element is described as two 32-bit integer channels
        # (presumably reassembled into a double by the kernel - confirm).
        descr3D.format = cuda.array_format.SIGNED_INT32
        descr3D.num_channels = 2
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST
    cudaArray = cuda.Array(descr3D)

    row_bytes = gpuArray.strides[1]
    copy3D = cuda.Memcpy3D()
    copy3D.set_src_device(gpuArray.ptr)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = row_bytes
    copy3D.src_pitch = row_bytes
    copy3D.src_height = h
    copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
#####################################################################
def gpuArray2DtocudaArray(gpuArray):
    """Copy a 2D device array into a newly allocated 2D CUDA array.

    Args:
        gpuArray: Device array exposing ``shape`` (read as
            ``(height, width)``), ``dtype``, ``strides`` and ``ptr``. The
            row size in bytes is taken from ``strides[0]``.

    Returns:
        Tuple ``(cudaArray, copy2D)``: the new CUDA array and the
        ``Memcpy2D`` object that performed the copy.
    """
    rows, cols = gpuArray.shape

    descriptor = cuda.ArrayDescriptor()
    descriptor.width = cols
    descriptor.height = rows
    descriptor.format = cuda.dtype_to_array_format(gpuArray.dtype)
    descriptor.num_channels = 1
    device_array = cuda.Array(descriptor)

    row_bytes = gpuArray.strides[0]
    copier = cuda.Memcpy2D()
    copier.set_src_device(gpuArray.ptr)
    copier.set_dst_array(device_array)
    copier.src_pitch = row_bytes
    copier.width_in_bytes = row_bytes
    copier.height = rows
    copier.src_height = rows
    copier(aligned=True)
    return device_array, copier