-
Notifications
You must be signed in to change notification settings - Fork 0
/
awj.py
121 lines (97 loc) · 3.39 KB
/
awj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import os.path
import time
import heapq
from collections.abc import MutableMapping
from random import shuffle
from glob import glob
import feather
class AWJ(MutableMapping):
    '''LRU cache for DataFrames backed by on-disk feather files.

    Each key maps to the file ``<cache_path>/<key>.feather``.  When the
    total size of the cached files exceeds *max_size*, the least
    recently used entries are evicted (files deleted from disk).

    Parameters
    ----------
    cache_path : str or Path
        The location of the cache files.
    max_size : float, optional
        The maximum size in MB of the cache directory.  ``None`` (the
        default) means the cache is unbounded.
    '''
    def __init__(self, cache_path, *, max_size=None):
        self._cache_path = cache_path
        self.max_size = max_size
        # convert MB -> bytes
        if self.max_size is not None:
            self.max_size *= 1048576
        # TODO 2k compat
        os.makedirs(cache_path, exist_ok=True)
        self._fn_cache = dict()   # key -> filename on disk
        self._sz_cache = dict()   # key -> file size in bytes
        # TODO replace this with a double linked list like boltons LRU
        self._heap_map = dict()   # key -> its *live* [timestamp, key] heap entry
        self._heap = []
        # Pick up files left behind by a previous instance.  Put them in
        # the heap in random order so ties in timestamp do not produce a
        # deterministic (alphabetical) eviction order.
        # NOTE: pattern is '*.feather' so only files that round-trip
        # through _filename_from_key/_key_from_filename are loaded.
        files = glob(os.path.join(self._cache_path, '*.feather'))
        shuffle(files)
        for fn in files:
            key = self._key_from_filename(fn)
            self._fn_cache[key] = fn
            stat = os.stat(fn)
            self._sz_cache[key] = stat.st_size
            heap_entry = [time.time(), key]
            heapq.heappush(self._heap, heap_entry)
            self._heap_map[key] = heap_entry
        # prune up front just in case
        self.__prune_files()

    def __prune_files(self):
        '''Evict least-recently-used entries until under ``max_size``.'''
        if self.max_size is None or not self.max_size > 0:
            return
        # TODO deal with pathological case of single file larger than max_size
        # as written this will result in all files being removed
        cur_size = self.cache_size
        while cur_size > self.max_size:
            entry = heapq.heappop(self._heap)
            key = entry[1]
            # A key's *current* entry is the one stored in _heap_map;
            # anything else is stale (the key was deleted, possibly then
            # re-inserted with a fresh entry) and must be skipped, or a
            # re-added key could be wrongly evicted via its old entry.
            if self._heap_map.get(key) is entry:
                cur_size -= self._sz_cache[key]
                del self[key]

    def _filename_from_key(self, key):
        '''Return the on-disk path for *key*.'''
        return os.path.join(self._cache_path, key + '.feather')

    def _key_from_filename(self, fn):
        '''Return the cache key encoded in the path *fn*.'''
        fn, ext = os.path.splitext(os.path.basename(fn))
        return fn

    def __setitem__(self, key, df):
        '''Write *df* to disk and record it as most recently used.'''
        fn = self._filename_from_key(key)
        feather.write_dataframe(df, fn)
        self._fn_cache[key] = fn
        self._sz_cache[key] = os.stat(fn).st_size
        if key in self._heap_map:
            # touch the existing entry in place ...
            self._heap_map[key][0] = time.time()
            # ... then restore the heap invariant (O(n); see TODO above)
            heapq.heapify(self._heap)
        else:
            heap_entry = [time.time(), key]
            self._heap_map[key] = heap_entry
            heapq.heappush(self._heap, heap_entry)
        self.__prune_files()

    def __getitem__(self, key):
        '''Read the DataFrame for *key* and mark it most recently used.

        Raises KeyError if *key* is not cached.
        '''
        fn = self._fn_cache[key]
        ret = feather.read_dataframe(fn)
        self._heap_map[key][0] = time.time()
        # ensure the heap invariant after the in-place touch
        heapq.heapify(self._heap)
        return ret

    def __delitem__(self, key):
        '''Drop *key* from the cache and delete its file from disk.'''
        fn = self._fn_cache.pop(key)
        self._sz_cache.pop(key)
        # The heap entry is left behind (heapq has no O(log n) delete);
        # __prune_files recognizes it as stale via the _heap_map check.
        self._heap_map.pop(key)
        os.unlink(fn)

    def __contains__(self, key):
        return key in self._fn_cache

    def __iter__(self):
        return iter(self._fn_cache)

    @property
    def cache_path(self):
        '''Directory holding the cache files.'''
        return self._cache_path

    @property
    def cache_size(self):
        '''Total size in bytes of all cached files.'''
        return sum(self._sz_cache.values())

    def __len__(self):
        return len(self._fn_cache)