def find_top_k_with_FastRBTree(filename=TDATA, k=10):
    """Return the k most frequent lines of a file, counted in a FastRBTree.

    Every line of ``filename`` is stripped of surrounding whitespace and
    used as a key in a ``FastRBTree`` whose value is the number of times
    that key has been seen.  The tree is then scanned once while a heap of
    at most ``k`` ``(count, key)`` tuples keeps the largest entries.

    The returned list is in heap order, not sorted by count.  Because the
    entries are ``(count, key)`` tuples, equal counts are compared by key.

    Profile result:
        5 million strings:
        memory consuming: 259 MB
        time consuming: 39.9689998627
        [(753, 'bf'), (753, 'qj'), (753, 'zb'), (753, 'vz'), (763, 'ma'),
         (755, 'lx'), (779, 'qp'), (768, 'bg'), (758, 'eq'), (767, 'tf')]
    """
    # Pass 1: tally how often each stripped line occurs.
    counts = FastRBTree()
    with open(filename) as source:
        for raw_line in source:
            word = raw_line.strip()
            counts[word] = counts.setdefault(word, 0) + 1

    # Pass 2: bounded heap selection.  Fill the heap up to k entries, then
    # push-and-pop so the smallest entry is discarded on every further step.
    top = []
    for word, freq in counts.iter_items():
        heap_op = heapq.heappush if len(top) < k else heapq.heappushpop
        heap_op(top, (freq, word))
    return top