def initialize(self, conf, ctx):
    """Prepare the state used for top-k tracking.

    Sets three attributes on ``self``:

    * ``r``      -- Redis client connected to localhost:6379, database 0.
    * ``k``      -- number of entries to keep in the top-k list (50).
    * ``sketch`` -- count-min sketch used for frequency estimates.

    ``conf`` and ``ctx`` are accepted but not used here.
    """
    # Initialize redis client
    self.r = redis.Redis(host='localhost', port=6379, db=0)

    # k value for topK
    self.k = 50

    # Initialize count-min sketch with default delta and epsilon values.
    # NOTE(review): presumably the arguments are (delta, epsilon, k) --
    # confirm against the Sketch constructor.
    self.sketch = Sketch(10**-7, 0.005, self.k)
class TopK(Bolt):
    """Maintain the top-k words to find trending topics at a given time.

    Two data structures are combined to implement top-k:

    - a count-min sketch that maintains the (approximate) frequency of
      every word seen so far;
    - a Redis sorted set, stored under the key ``"top-k"``, that holds
      the current top-k words scored by their sketch frequency.
    """

    def initialize(self, conf, ctx):
        """Create the Redis client, the top-k size and the sketch.

        ``conf`` and ``ctx`` are accepted but not used here.
        """
        # Initialize redis client
        self.r = redis.Redis(host='localhost', port=6379, db=0)

        # k value for topK
        self.k = 50

        # Initialize count-min sketch with default delta and epsilon values.
        # NOTE(review): presumably the arguments are (delta, epsilon, k) --
        # confirm against the Sketch constructor.
        self.sketch = Sketch(10**-7, 0.005, self.k)

    def process(self, tup):
        """Count one occurrence of a word and refresh the top-k set.

        The word is read from ``tup.values[0]``, added to the sketch with
        a count of 1, and then:

        * if the sorted set still has room, or the word is already a
          member, its score is simply set to the current estimate;
        * otherwise the word replaces the lowest-scored member, but only
          when its estimate is strictly higher than that member's score.
        """
        word = tup.values[0]

        # Add the word to the count-min sketch and read its estimate once,
        # so every Redis write below uses the same value.
        self.sketch.update(word, 1)
        count = self.sketch.get(word)

        # NOTE(review): the positional (name, member, score) form of zadd
        # matches the legacy redis-py 2.x ``Redis`` client; redis-py >= 3.0
        # expects a mapping instead -- confirm the installed version.

        # If top-k holds fewer than k words, add the word directly.
        # If the word is already a member, just refresh its score: evicting
        # the minimum in that case would drop an unrelated entry and shrink
        # the set below k.
        has_room = self.r.zcard("top-k") < self.k
        if has_room or self.r.zscore("top-k", word) is not None:
            self.r.zadd("top-k", word, count)
            return

        # Lowest-scored member of the (full) top-k set.
        w, freq = self.r.zrange("top-k", 0, 0, withscores=True)[0]

        # If the word's frequency is higher than the least frequency in the
        # top-k list, replace that least frequent word with the current one.
        # A falsy (zero) minimum score is never replaced, as before.
        if freq and freq < count:
            self.r.zrem("top-k", w)
            self.r.zadd("top-k", word, count)