Beispiel #1
0
class topK_max:
    '''
    Given a stream of (key, value, ridealong) tuples, remember the k largest
    values. If a key is added repeatedly, keep the largest value seen for it
    (a tie replaces the stored ridealong with the newer one).

    Attributes:
        size: maximum number of keys retained.
        element: namedtuple type (value, ridealong) used for stored entries.
        d: ItemSortedDict mapping key -> element, ordered by getvaluevalue.

    NOTE(review): eviction uses the default peekitem()/popitem(), which
    address the last item of the sorted dict. This presumes getvaluevalue
    (defined elsewhere) orders entries so the smallest value sorts last --
    confirm against its definition.
    '''
    def __init__(self, size):
        '''Create a tracker that retains at most `size` keys.'''
        self.size = size
        self.element = namedtuple('topK_max_element', ['value', 'ridealong'])
        self.d = ItemSortedDict(getvaluevalue)

    def add(self, key, value, ridealong):
        '''
        Offer one (key, value, ridealong) observation.

        A known key is updated only if `value` is >= its stored value.
        A new key is stored while there is room; once full it replaces the
        entry at the eviction end only if `value` is strictly greater than
        that entry's value. Returns None.
        '''
        if key in self.d:
            if value >= self.d[key].value:
                self.d[key] = self.element(value, ridealong)
            return

        if len(self.d) < self.size:
            self.d[key] = self.element(value, ridealong)
            return

        if not self.d:
            # Only reachable when size <= 0: there is no room and no entry to
            # compare against, so peekitem() would raise IndexError.
            return

        if value > self.d.peekitem()[1].value:
            self.d.popitem()
            self.d[key] = self.element(value, ridealong)

    def readout(self):
        '''Return [(key, [value, ridealong]), ...] in the dict's sort order.'''
        return [(k, list(v)) for k, v in self.d.items()]
Beispiel #2
0
class topK_sum:
    '''
    Space-saving heavy hitters.

    Given a stream of (key, value, ridealong) tuples, remember the k keys
    with the largest sum of values.

    http://www.cse.ust.hk/~raywong/comp5331/References/EfficientComputationOfFrequentAndTop-kElementsInDataStreams.pdf

    Attributes:
        size: maximum number of keys retained.
        element: namedtuple type (value, ridealong, fake) for stored entries.
            `fake` is the portion of `value` that was inherited from an
            evicted entry rather than contributed by the key itself.
        d: ItemSortedDict mapping key -> element, ordered by getvaluevalue.

    NOTE(review): eviction uses the default popitem(), which removes the
    last item of the sorted dict. This presumes getvaluevalue (defined
    elsewhere) orders entries so the smallest value sorts last -- confirm
    against its definition.
    '''
    def __init__(self, size):
        '''Create a tracker that retains at most `size` keys.'''
        self.size = size
        # Typename matches this class so element reprs are not mislabelled.
        self.element = namedtuple('topK_sum_element',
                                  ['value', 'ridealong', 'fake'])
        self.d = ItemSortedDict(getvaluevalue)

    def add(self, key, value, ridealong):
        '''
        Add `value` to the running sum for `key`.

        A known key has `value` added to its sum, takes the new ridealong and
        keeps its existing `fake` amount. A new key is stored as-is while
        there is room. Once full, the entry at the eviction end is removed:
        if `value` is at least that entry's value the new key starts clean
        (fake 0); otherwise it starts at max(value, evicted value + 1) and
        the difference from `value` is recorded as `fake`. Returns None.
        '''
        if key in self.d:
            current = self.d[key]
            self.d[key] = self.element(current.value + value, ridealong,
                                       current.fake)
            return

        if len(self.d) < self.size:
            self.d[key] = self.element(value, ridealong, 0)
            return

        if not self.d:
            # Only reachable when size <= 0: there is nothing to evict, so
            # peekitem()/popitem() would raise on the empty dict.
            return

        # Both full-table cases evict the same tail entry, so pop it once.
        _, evicted = self.d.popitem()
        if value >= evicted.value:
            self.d[key] = self.element(value, ridealong, 0)
        else:
            newvalue = max(value, evicted.value + 1)
            fake = max(newvalue - value, 0)
            self.d[key] = self.element(newvalue, ridealong, fake)

    def readout(self):
        '''
        Return [(key, [value, ridealong]), ...] in the dict's sort order,
        keeping only entries whose value exceeds twice their `fake` amount.
        '''
        return [(key, [entry.value, entry.ridealong])
                for key, entry in self.d.items()
                if entry.value > 2 * entry.fake]