/
kcvstore.py
176 lines (151 loc) · 7.82 KB
/
kcvstore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import os
import pickle
from blist import sorteddict
from tempfile import NamedTemporaryFile
# I've made the executive decision that the 3rd party blist module is within
# the spirit of this coding challenge. To understand why, I'll compare the
# following code to commit a81ea9, which included a small bit of extra code to
# handle "a world without blist". We'll see that it's largely the same as what
# I have now. (Except that blist is a tad more efficient.)
#
# The basic design I had for the KeyColumnValueStore noticed that the only
# particular requirement separating it from a plain nested hash table was the
# get_key method, which returns a list of columns in sorted order. To this
# end, I only needed to track two things:
#
# * a nested hash table (Python dictionaries) for all the data, obviously
#
# * the order of columns associated with each key, using a hash table from keys
# to a sorted sequence of columns (every time you insert a new column, you'd
# have to "insort" into this sequence)
#
# The question is: what data structure do we use for the sorted sequence? I
# didn't want the overhead of writing my own ordered tree, so the naive way to
# do this is with a regular Python list. We can insort into a list using the
# bisect module, which is all well and good: there are only O(log n)
# comparisons to find where to insert into a list of size n. But the actual
# *insertion* would still be O(n). Python lists are dynamic arrays underneath,
# so insertion into the middle means we have to shift the indices of all the
# elements to the right of the insertion point.
#
# On the other hand, the instructions clearly said data structure libraries
# were OK. Enter blist, with its sortedset class. It handles the ordered tree
# structure for me, with the properly logarithmic insertion methods and so on.
# The code is just as simple as if I'd used regular Python lists with the
# bisect module (it's just more efficient with blist). So, I think using blist
# is in the spirit of the problem here.
#
# However, this "dictionary + sorted set of keys" bookkeeping turns out to be a
# problem so easy to solve (if you already have a sorted set) that the blist
# module already *has* a sorteddict class. Since I'm using blist anyway, why
# not make the tiny conceptual leap to just represent the KeyColumnValueStore
# as a dict from keys to sorteddicts? While it makes the code fairly trivial
# to implement, it's not really any more "trivial" than the blist-based code
# from before would've been. In my mind, the "dict + sorted set" approach
# would be OK (as I said above). Though the sorteddict class just "solves the
# problem for me" (in one view), it's not without thought: it's basically a
# shorter way to do what I already would've done.
#
# This shortness is, of course, thoroughly negated by my long-winded comment.
class KeyColumnValueStore(object):
"""A key/column/value store.
A key/column/value store is similar to a nested hash table, with the
following requirements:
* like a hash table, a key can only occur once
* within a given key, a column cannot appear more than once, like a hash
table
* when querying the contents of a key, a list of column/value tuples are
returned, sorted by column
* all keys, columns, and values will be strings of variable length
* errors shouldn't be raised if a nonexistent key/column is accessed, empty
lists / None values should be returned
The implementation is designed with the assumption that reads will be much
more frequent than writes.
"""
def __init__(self, path=None):
self.kcv = {}
self._create_log_file(path)
# XXX Loads of issues with this naive persistence strategy. I won't be
# fixing them.
def _create_log_file(self, path):
if path is None:
with NamedTemporaryFile(delete=False) as tmp:
self.path = tmp.name
else:
self.path = os.path.expandvars(os.path.expanduser(path))
if os.path.isfile(self.path) and os.stat(self.path).st_size:
self._load()
else:
self._persist() # will create the file if it doesn't exist
def _load(self):
"""Reads the existing key/column/value structure from disk."""
with open(self.path, 'r') as log:
self.kcv = pickle.load(log)
def _persist(self):
"""Persists the current key/column/value structure to disk."""
with open(self.path, 'w') as log:
pickle.dump(self.kcv, log)
def set(self, key, col, val):
"""Sets the value at the given key/column.
In the average case, requires O(log(c)**2) operations, where c is the
number of columns associated with the key."""
assert all(isinstance(datum, basestring) for datum in (key, col, val))
self.kcv.setdefault(key, sorteddict())[col] = val
self._persist()
def get(self, key, col):
"""Return the value at the specified key/column, or None if no such
value exists.
In the average case, requires O(1) operations."""
cv = self.kcv.get(key)
return None if cv is None else cv.get(col)
def get_key(self, key):
"""Returns a sorted list of column/value tuples.
Requires O(c) operations, where c is the number of columns associated
with the key."""
return list(self.kcv.get(key, sorteddict()).items())
def get_keys(self):
"""Returns a set containing all of the keys in the store.
Requires O(k) operations, where k is the number of keys stored."""
# It'd be easy to make this O(1) by having an extra slot, self.keys =
# set(), that we add to & remove from in methods like self.set and
# self.delete_key. However, just returning self.keys would present a
# mutable structure to the outside, which is asking for trouble, and
# copying the value would be O(k) anyway (I think). So, this
# implementation does just as well.
return set(self.kcv.iterkeys())
def delete(self, key, col):
"""Removes a column/value from the given key.
In the average case, requires O(log(c)) operations, where c is the
number of columns associated with the key."""
if key in self.kcv and col in self.kcv[key]:
del self.kcv[key][col]
self._persist()
def delete_key(self, key):
"""Removes all data associated with the given key.
In the average case, requires O(1) operations."""
if key in self.kcv:
del self.kcv[key]
self._persist()
def get_slice(self, key, start, stop):
"""Returns a sorted list of column/value tuples where the column values
are between the start and stop values, inclusive of the start and stop
values. Start and/or stop can be None values, leaving the slice open
ended in that direction.
In the average case, requires O(log(c)**2 + s) operations, where c is
the number of columns associated with the key and s is the length of
the slice. Thus, for large slices, this approaches O(c)."""
# sorteddict.keys() - O(1), returns sortedset in Python 2
# x not in sortedset - O(log(c)**2)
# len(sortedset) - O(1)
# sortedset.bisect_* - O(log(c)**2)
# sortedset[i:j] - O(log(c))
# self.get - O(1)
# list comprehension - O(s), since O(1) self.get for each slice item
cols = self.kcv.get(key, sorteddict()).keys()
start_missing = start not in cols
stop_missing = stop not in cols
if start_missing and stop_missing and not (start is stop is None):
return []
start_index = 0 if start_missing else cols.bisect_left(start)
stop_index = len(cols) if stop_missing else cols.bisect_right(stop)
return [(c, self.get(key, c)) for c in cols[start_index:stop_index]]