/
gpu_countminsketch.py
103 lines (82 loc) · 3.32 KB
/
gpu_countminsketch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import numpy as np
import pyopencl as cl
from collections import Counter
class GPUCountMinSketch(object):
    """
    A GPU implementation of the count-min sketch algorithm.

    Incoming keys are accumulated in a host-side ``Counter`` (the batch).
    When the batch holds ``batch_limit`` distinct keys it is flushed: an
    OpenCL kernel hashes every key once per sketch row and atomically adds
    the key's count to the selected cell. The per-flush result is then
    added to the host-side count matrix ``M``.
    """

    # Byte width of every key as seen by the kernel. The host packs keys
    # into a fixed-width byte-string array of this size (longer keys are
    # truncated by numpy, shorter ones are NUL padded) and the kernel walks
    # exactly this many bytes per key, so both sides must use one value.
    KEY_SIZE = 32

    # OpenCL source; the two %d slots are the sketch width and KEY_SIZE.
    _KERNEL_TEMPLATE = """
    #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
    __kernel void increment(__global const unsigned int* rand, __global char* keys, __global int* counts, __global int* out)
    {
        int i = get_global_id(0);
        int j = get_global_id(1);
        int width = %d;
        int str_size = %d;
        // calculate hash inline
        unsigned int hash = rand[j];
        char c;
        for(int k = 0; k < str_size; k++){
            c = keys[i*str_size + k];
            if (c != 0) {
                hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
            }
        }
        long index = hash %% width;
        atomic_add(&out[j * width + index], counts[i]);
    }
    """

    def __init__(self, d, w, batch_limit, rand, hash_functions, M=None):
        """
        :param d: the depth of the sketch (number of rows / hash functions)
        :param w: the width of the sketch (number of columns per row)
        :param batch_limit: number of distinct keys buffered before a flush
        :param rand: per-row hash seeds read by the kernel as ``rand[j]``;
            needs at least ``d`` entries that fit in an unsigned 32-bit int
        :param hash_functions: ``d`` host-side callables used by ``query``;
            for lookups to hit the cells the kernel incremented they must
            compute the same hash as the kernel does for the matching row
        :param M: optional pre-existing ``d x w`` matrix of counts
        :return:
        """
        self.d = d
        self.w = w
        self.rand = rand
        self.hash_functions = hash_functions
        self.batch_limit = batch_limit
        self.batch = Counter()
        if M is not None:
            self.M = M
        else:
            self.M = np.zeros([d, w], dtype=np.int32)

        # Initialize the OpenCL context and command queue.
        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)

        # The width and key size are baked into the kernel source.
        code = self._KERNEL_TEMPLATE % (self.w, self.KEY_SIZE)
        self.bld = cl.Program(self.ctx, code).build()

    def add(self, x, delta=1):
        """
        Record ``delta`` occurrences of ``x`` in the pending batch.

        If the batch is already full it is flushed first; ``x`` is then
        always recorded, so the item that triggers a flush is not lost.

        :param x: the key to count
        :param delta: amount to add to the key's count
        """
        if len(self.batch) >= self.batch_limit:
            self.dump_batch()
        self.batch[x] += delta

    def dump_batch(self):
        """
        Flush the pending batch to the GPU and fold the result into ``M``.

        Does nothing when the batch is empty, because a kernel cannot be
        launched over a zero-sized range.
        """
        if not self.batch:
            return

        # Take keys and counts from one items() pass so they stay aligned,
        # and materialize them: numpy cannot build a 1-D array directly
        # from the dict views Python 3 returns.
        pending_keys, pending_counts = zip(*self.batch.items())
        keys = np.array(pending_keys, dtype='S%d' % self.KEY_SIZE)
        counts = np.array(pending_counts, dtype=np.int32)
        # The kernel reads the seeds as unsigned int*, so hand it exactly
        # that layout regardless of how the caller stored them.
        rand = np.ascontiguousarray(self.rand, dtype=np.uint32)
        out = np.zeros([self.d, self.w], dtype=np.int32)

        mf = cl.mem_flags
        # Input buffers are copied from the host arrays.
        rand_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=rand)
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)
        counts_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=counts)
        # The kernel accumulates with atomic_add (a read-modify-write), so
        # the output must be readable and start out zeroed.
        out_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=out)

        # One work item per (key, row) pair.
        launch = self.bld.increment(
            self.queue, (len(keys), self.d), None,
            rand_buf, keys_buf, counts_buf, out_buf)
        # Wait until the kernel completes, then read the counts back.
        launch.wait()
        cl.enqueue_copy(self.queue, out, out_buf).wait()

        self.M += out
        self.batch.clear()

    def query(self, x):
        """
        Return the estimated count of ``x``.

        Pending items are flushed first so the estimate includes them. The
        estimate is the minimum, over all rows, of the cell selected by
        that row's host-side hash function.

        :param x: the key to look up
        :return: the minimum count across the ``d`` rows
        """
        self.dump_batch()
        return min(
            self.M[i][self.hash_functions[i](x) % self.w]
            for i in range(self.d))

    def get_matrix(self):
        """
        Return the matrix of counts, flushing any pending items first.

        :return: the ``d x w`` count matrix ``M``
        """
        self.dump_batch()
        return self.M