/
pyPrefixSum.py
102 lines (79 loc) · 3.24 KB
/
pyPrefixSum.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import numpy as np
import pyopencl as cl
from clutil import createProgram, pow2gt, roundUp
# Element sizes in bytes, multiplied by element counts to size OpenCL buffers
# and local-memory allocations.
szFloat = 4
szInt = 4
szChar = 1
# Short alias for the PyOpenCL memory-flag namespace.
cm = cl.mem_flags
# Work-items per work-group; used as the local work size of every kernel launch.
LEN_WORKGROUP = 256
# Elements handled by each work-item (presumably must match the kernels in
# prefixsum.cl -- confirm against the .cl source).
ELEMENTS_PER_THREAD = 2
# Elements covered by one work-group in a single kernel launch.
ELEMENTS_PER_WORKGROUP = ELEMENTS_PER_THREAD*LEN_WORKGROUP
# When true, the command queue is created with profiling enabled and the
# duration of each kernel event is accumulated by PrefixSum.
PROFILE_GPU = True
class PrefixSum:
    """Host-side driver for a multi-level GPU prefix sum over int buffers.

    An input longer than one work-group's worth of elements is processed in
    levels: each level scans its work-group-sized sub-arrays, stores one
    partial value per work-group in a scratch buffer, scans that scratch
    buffer (recursively), and finally folds the scanned partials back into
    the sub-arrays.  The exact per-kernel semantics live in ``prefixsum.cl``
    (not visible here).

    Attributes:
        context:  OpenCL context used for all allocations.
        queue:    Command queue on which every kernel is enqueued.
        lw:       Local work size tuple, ``(LEN_WORKGROUP,)``.
        capacity: Largest supported input length, i.e. the requested capacity
                  passed through ``roundUp(..., ELEMENTS_PER_WORKGROUP)``.
        d_parts:  Scratch buffers, one per scan level; ``d_parts[0]`` serves
                  the outermost level.
        elapsed:  Sum of ``event.profile.end - event.profile.start`` over all
                  kernels run so far (only updated when PROFILE_GPU is true).
    """

    def __init__(self, context, devices, capacity):
        """Build the kernels and allocate the per-level scratch buffers.

        Args:
            context:  OpenCL context.
            devices:  Devices forwarded to ``createProgram``.
            capacity: Maximum number of elements a later ``scan`` may process.
        """
        self.context = context

        if PROFILE_GPU:
            self.queue = cl.CommandQueue(
                context,
                properties=cl.command_queue_properties.PROFILING_ENABLE)
        else:
            self.queue = cl.CommandQueue(context)

        filename = os.path.join(os.path.dirname(__file__), 'prefixsum.cl')
        program = createProgram(context, devices, [], filename)

        self.kernScan_pad_to_pow2 = cl.Kernel(program, 'scan_pad_to_pow2')
        self.kernScan_subarrays = cl.Kernel(program, 'scan_subarrays')
        self.kernScan_inc_subarrays = cl.Kernel(program, 'scan_inc_subarrays')

        self.lw = (LEN_WORKGROUP, )

        self.capacity = roundUp(capacity, ELEMENTS_PER_WORKGROUP)

        # One scratch buffer per level, sized with ceiling division so a
        # level that is not an exact multiple of the work-group span still
        # has room for its last partial value.
        self.d_parts = [
            cl.Buffer(context, cl.mem_flags.READ_WRITE, szInt * count)
            for count in self._level_sizes(self.capacity,
                                           ELEMENTS_PER_WORKGROUP)
        ]

        self.elapsed = 0

    @staticmethod
    def _level_sizes(capacity, group_elems):
        """Return the element count of the scratch buffer for each scan level.

        Entry ``j`` is the number of work-groups needed at level ``j`` when
        the outermost input holds ``capacity`` elements and each work-group
        covers ``group_elems`` elements.  The list ends once a level needs
        only a single work-group; it is empty when ``capacity`` is zero.
        """
        sizes = []
        count = -(-int(capacity) // group_elems)  # ceiling division
        while count > 0:
            sizes.append(count)
            # A level with a single partial value needs no further reduction.
            count = -(-count // group_elems) if count > 1 else 0
        return sizes

    def factory(self, length=None):
        """Allocate a read/write int buffer suitable as scan input.

        Args:
            length: Number of ints required; defaults to ``self.capacity``.
                The allocation holds ``pow2gt(length)`` ints.

        Returns:
            A new ``cl.Buffer``.

        Raises:
            ValueError: if ``length`` exceeds ``self.capacity``.
        """
        if length is None:
            length = self.capacity
        elif length > self.capacity:
            raise ValueError('length > self.capacity: {0}, {1}'.format(length, self.capacity))

        length = pow2gt(length)

        return cl.Buffer(self.context, cl.mem_flags.READ_WRITE, length*szInt)

    def scan(self, dArray, dTotal, length=None):
        """Run the prefix sum over ``dArray`` in place.

        Args:
            dArray: Device buffer of ints to scan.
            dTotal: Device buffer handed to the final single-work-group
                kernel (presumably receives the overall total -- confirm
                against prefixsum.cl).
            length: Number of ints to process; defaults to the full size of
                ``dArray``.

        Raises:
            ValueError: if more than one work-group is needed and ``length``
                exceeds ``self.capacity``.
        """
        if length is None:
            # Integer division: Buffer.size is in bytes.
            length = dArray.size // szInt

        self._scan_level(dArray, dTotal, length, 0)

    def _scan_level(self, dArray, dTotal, length, level):
        """Scan ``length`` ints of ``dArray`` using scratch buffer ``level``."""
        # Floor division keeps the work-group count an int on Python 3 too.
        num_groups = (length + ELEMENTS_PER_WORKGROUP - 1) // ELEMENTS_PER_WORKGROUP
        gw = (num_groups * LEN_WORKGROUP, )

        if num_groups == 1:
            # Everything fits in one work-group: a single kernel finishes it.
            self._run(self.kernScan_pad_to_pow2, gw,
                      dArray,
                      cl.LocalMemory(ELEMENTS_PER_WORKGROUP*szInt),
                      np.int32(length),
                      dTotal)
            return

        if length > self.capacity:
            raise ValueError('length > self.capacity: {0}, {1}'.format(length, self.capacity))

        # The scratch buffer is chosen by recursion depth, not by the input
        # length: d_parts[level] is the one sized for this level's
        # work-group count.
        d_part = self.d_parts[level]

        self._run(self.kernScan_subarrays, gw,
                  dArray,
                  cl.LocalMemory(ELEMENTS_PER_WORKGROUP*szInt),
                  d_part,
                  np.int32(length))

        # Scan the per-work-group partials one level down.
        self._scan_level(d_part, dTotal, num_groups, level + 1)

        self._run(self.kernScan_inc_subarrays, gw,
                  dArray,
                  cl.LocalMemory(ELEMENTS_PER_WORKGROUP*szInt),
                  d_part,
                  np.int32(length))

    def _run(self, kernel, global_size, *args):
        """Enqueue ``kernel``, wait for it, and record its profiled duration."""
        event = kernel(self.queue, global_size, self.lw, *args)
        event.wait()

        if PROFILE_GPU:
            self.elapsed += (event.profile.end - event.profile.start)