コード例 #1
0
ファイル: bigdata.py プロジェクト: yflau/dsapp
def find_top_k_with_FastRBTree(filename = TDATA, k = 10):
    """Count the lines of ``filename`` and return the ``k`` most frequent.

    Each line is stripped of surrounding whitespace and tallied in a
    ``FastRBTree`` keyed by the stripped text (blank lines are tallied
    under the empty string).  The tallies are then pushed through a heap
    that is capped at ``k`` entries, so only the ``k`` largest
    ``(count, key)`` tuples survive.

    Args:
        filename: Path of the text file to read, one key per line.
        k: Maximum number of entries to keep.

    Returns:
        A list of ``(count, key)`` tuples in heap order (the smallest
        retained tuple is at index 0); the list is NOT sorted.  When
        ``k`` is zero or negative the list is empty.

    Profile result recorded by the original author:

       5 million strings:
       memory consuming: 259 MB
       time consuming: 39.9689998627
       [(753, 'bf'),
        (753, 'qj'),
        (753, 'zb'),
        (753, 'vz'),
        (763, 'ma'),
        (755, 'lx'),
        (779, 'qp'),
        (768, 'bg'),
        (758, 'eq'),
        (767, 'tf')]
    """
    # Pass 1: tally how often every stripped line occurs.
    counts = FastRBTree()
    with open(filename) as source:
        for word in (raw.strip() for raw in source):
            counts[word] = counts.setdefault(word, 0) + 1

    # Pass 2: keep a heap of at most k entries.  While it is still filling
    # up we push; once it is full, push-then-pop evicts the smallest tuple.
    top = []
    for word, freq in counts.iter_items():
        entry = (freq, word)
        if len(top) >= k:
            heapq.heappushpop(top, entry)
        else:
            heapq.heappush(top, entry)

    return top