async def get_or_create(tx, bstore, blob):
    """Return the uid associated with ``blob``, storing the blob if needed.

    The digest of ``blob`` is looked up under ``bstore.prefix_hash``. When an
    entry exists, its value is returned as a ``UUID``. Otherwise a new
    ``uuid4`` is recorded for the digest and the pieces produced by
    ``sliced(blob, found.MAX_SIZE_VALUE)`` are written under
    ``bstore.prefix_blob``, keyed by ``(uid, position)``.
    """
    digest = hasher(blob).digest()
    hash_key = found.pack((bstore.prefix_hash, digest))
    stored = await found.get(tx, hash_key)
    if stored is not None:
        return UUID(bytes=stored)

    # Otherwise create the hash entry and store the blob with a new uid
    # TODO: Use a counter and implement a garbage collector, and implement
    # bstore.delete
    uid = uuid4()
    found.set(tx, hash_key, uid.bytes)
    position = 0
    for chunk in sliced(blob, found.MAX_SIZE_VALUE):
        chunk_key = found.pack((bstore.prefix_blob, uid, position))
        found.set(tx, chunk_key, bytes(chunk))
        position += 1
    return uid
def test_pack_unpack():
    """Check that ``found.unpack`` gives back what ``found.pack`` encoded."""
    nested = (
        uuid4(),
        None,
        SingleFloat(3.1415),
        b"x42",
        1,
        -1,
        3.1415,
        -3.1415,
        ("abc",),
    )
    value = (nested, ("d", "e", "f"), 2.718281828459045)
    packed = found.pack(value)
    assert found.unpack(packed) == value
async def exists(self, tr, subject, predicate, object):
    """Return ``True`` when the given triple has a key in the SPO space.

    ``predicate`` is a key of ``self._predicates``; the matching entry packs
    ``object`` before the key is built. Raises ``KeyError`` when the
    predicate is not registered.
    """
    spec = self._predicates[predicate]
    packed = spec.pack(object)
    key = found.pack((self._prefix, PREFIX_SPO, subject, spec.name, packed))
    stored = await tr.get(key)
    return stored is not None
async def remove(self, tr, *triples):
    """Clear the keys of every ``(subject, predicate, object)`` in ``triples``.

    The SPO key is always cleared; the POS key is cleared as well when the
    predicate entry has a truthy ``pos`` attribute.
    """
    for subject, predicate, object in triples:
        # might fail because of unknown predicate
        spec = self._predicates[predicate]
        # might fail because of not the correct type
        packed = spec.pack(object)
        # remove from data
        spo_key = found.pack(
            (self._prefix, PREFIX_SPO, subject, spec.name, packed))
        tr.clear(spo_key)
        if not spec.pos:
            continue
        # remove from index
        pos_key = found.pack(
            (self._prefix, PREFIX_POS, spec.name, packed, subject))
        tr.clear(pos_key)
async def get(tx, bstore, uid):
    """Return the blob stored under ``uid`` by concatenating its chunks.

    Every value whose key starts with ``(bstore.prefix_blob, uid)`` is read
    in query order and joined into one ``bytes`` object.

    Raises:
        BStoreException: when the query yields no data for ``uid``.
    """
    key = found.pack((bstore.prefix_blob, uid))
    # Collect the chunks first and join once: repeated ``bytes +=`` copies
    # the accumulated buffer on every iteration.
    chunks = [
        value
        async for _, value in found.query(tx, key, found.next_prefix(key))
    ]
    out = b''.join(chunks)
    if out == b'':
        raise BStoreException('BLOB should be in database: uid={}'.format(uid))
    return out
async def _lookup_pos_subjects(self, tr, predicate, object):
    """Return the subjects found in the POS space for ``predicate``/``object``.

    ``predicate`` is a key of ``self._predicates`` and ``object`` is the
    unpacked value; it is packed here before the range is built.
    """
    spec = self._predicates[predicate]
    packed = spec.pack(object)
    lower = found.pack((self._prefix, PREFIX_POS, spec.name, packed))
    upper = found.strinc(lower)
    items = await tr.get_range(lower, upper)
    keys = (key for key, _ in items)
    # each key unpacks to five fields, the subject being the last one
    return [subject for _, _, _, _, subject in map(found.unpack, keys)]
async def uuid(self, tr):
    """Return a new ``uuid4`` that has no key in this store's SPO space.

    The probe is limited to keys starting with
    ``(self._prefix, PREFIX_SPO, uid)``. Any key in that range means the
    identifier is already used as a subject.

    Raises:
        AssertionError: when the generated identifier is already in use.
    """
    uid = uuid4()
    start = found.pack((self._prefix, PREFIX_SPO, uid))
    # Bound the scan to this uid's own range: scanning up to b"\xFF" could
    # return a key from elsewhere, one that may not unpack to five fields.
    end = found.strinc(start)
    items = await tr.get_range(start, end, limit=1)
    if items:
        # Raised explicitly so the check is not stripped under ``python -O``.
        raise AssertionError("Unlikely Error!")
    return uid
async def index(tx, store, docuid, counter):
    """Record the token counts of document ``docuid`` in ``store``.

    ``counter`` maps string tokens to counts. Each string is translated to a
    uid through ``store.tokens`` (a new ``uuid4`` is registered when the
    string is unknown). The uid/count pairs are then stored compressed under
    ``store.prefix_counters`` and one empty entry per token is written under
    ``store.prefix_index``.
    """
    # translate keys that are string tokens, into uuid4 bytes with
    # store.tokens
    token_counts = {}
    for string, count in counter.items():
        matches = nstore.select(tx, store.tokens, string, nstore.var('uid'))
        try:
            binding = await matches.__anext__()
        except StopAsyncIteration:
            token_uid = uuid4()
            nstore.add(tx, store.tokens, string, token_uid)
        else:
            token_uid = binding['uid']
        token_counts[token_uid] = count

    # store tokens to use later during search for filtering
    counters_key = found.pack((store.prefix_counters, docuid))
    payload = zstd.compress(found.pack(tuple(token_counts.items())))
    found.set(tx, counters_key, payload)

    # store tokens keys for candidate selection
    for token_uid in token_counts:
        index_key = found.pack((store.prefix_index, token_uid, docuid))
        found.set(tx, index_key, b'')
async def all(self, tr):
    """Return every triple found in the SPO space as a list of tuples.

    Each tuple is ``(subject, predicate name, unpacked object)``.
    """
    lower = found.pack((self._prefix, PREFIX_SPO))
    upper = found.strinc(lower)
    log.debug(
        "fetching everything between start=%r and end=%r", lower, upper)
    triples = []
    # value is always empty, only the key carries data
    for key, _ in await tr.get_range(lower, upper):
        _, _, subject, name, packed = found.unpack(key)
        spec = self._predicates[name]
        value = spec.unpack(packed)
        triples.append((subject, spec.name, value))
    return triples
async def massage(tx, store, candidate, keywords, hits):
    """Score ``candidate`` against ``keywords`` and record it in ``hits``.

    The score is the sum of the candidate's stored counts for every keyword.
    When any keyword is missing from the stored counts, nothing is recorded.
    The function always returns ``None``; the result is the mutation of
    ``hits``.
    """
    counters_key = found.pack((store.prefix_counters, candidate))
    compressed = await found.get(tx, counters_key)
    # TODO: replace the dictionary and the following for loop with
    # a single iteration over the counter, using zigzag algorithm.
    counts = dict(found.unpack(zstd.decompress(compressed)))
    total = 0
    for keyword in keywords:
        if keyword not in counts:
            return None
        total += counts[keyword]
    hits[candidate] = total
async def search(tx, store, keywords, limit=13):
    """Return up to ``limit`` ``(document uid, score)`` pairs for ``keywords``.

    Keywords are first translated to tokens with ``_keywords_to_token``. The
    token with the smallest ``_token_to_size`` result seeds the candidate
    list. Candidates are then scored with ``massage`` and the best hits are
    returned in ``Counter.most_common`` order.

    An empty list is returned when ``keywords`` is empty or when any keyword
    has no token.
    """
    lookups = (_keywords_to_token(tx, store.tokens, keyword)
               for keyword in keywords)
    tokens = await asyncio.gather(*lookups)

    # Without any keyword there is no seed token to pick; ``min`` below
    # would raise ValueError on the empty sequence.
    if not tokens:
        return list()

    # If a keyword is not present in store.tokens, then there is no
    # document associated with it, hence there is no document that
    # match that keyword, hence no document that has all the requested
    # keywords. Return an empty result.
    if any(token is None for token in tokens):
        return list()

    # Select seed token
    sizings = (_token_to_size(tx, store.prefix_index, token)
               for token in tokens)
    sizes = await asyncio.gather(*sizings)
    _, seed = min(zip(sizes, tokens), key=itemgetter(0))

    # Select candidates
    candidates = []
    seed_key = found.pack((store.prefix_index, seed))
    async for key, _ in found.query(tx, seed_key, found.next_prefix(seed_key)):
        _, _, uid = found.unpack(key)
        candidates.append(uid)

    # XXX: the sample count was empirically discovered, to make it so
    # that the search takes less than 1 second or so.
    if len(candidates) >= FOUND_PSTORE_SAMPLE_COUNT:
        candidates = random.sample(candidates, FOUND_PSTORE_SAMPLE_COUNT)

    # score, filter and construct hits aka. massage
    hits = Counter()
    scorings = (massage(tx, store, candidate, tokens, hits)
                for candidate in candidates)
    await asyncio.gather(*scorings)

    return hits.most_common(limit)
async def where(self, tr, pattern, *patterns):
    """Return the bindings that satisfy ``pattern`` and every extra pattern.

    Each pattern is a ``(subject, predicate, object)`` triple in which some
    items may be ``var`` instances. The first pattern seeds a list of
    bindings; each following pattern is bound against every current binding
    with ``pattern_bind`` and either filters the binding or extends it.

    Supported shapes (``True`` marks a variable position):

    * seed: ``(True, False, False)``, ``(False, True, True)``,
      ``(False, False, True)``
    * following patterns, once bound: ``(False, False, False)``,
      ``(False, False, True)``, ``(True, False, False)``

    Raises:
        PatternException: when a pattern has an unsupported shape.
        KeyError: when a predicate is not registered in ``self._predicates``.
    """
    # seed bindings
    shape = tuple(isinstance(item, var) for item in pattern)
    if shape == (True, False, False):
        subject, predicate, object = pattern
        subjects = await self._lookup_pos_subjects(tr, predicate, object)
        name = subject.name
        bindings = [Map().set(name, match) for match in subjects]
    elif shape == (False, True, True):
        # TODO: extract to a method
        subject = pattern[0]
        start = found.pack((self._prefix, PREFIX_SPO, subject))
        end = found.strinc(start)
        items = await tr.get_range(start, end)
        bindings = []
        for key, _ in items:
            _, _, _, predicate_name, packed = found.unpack(key)
            spec = self._predicates[predicate_name]
            value = spec.unpack(packed)
            binding = Map()
            binding = binding.set(pattern[1].name, spec.name)
            binding = binding.set(pattern[2].name, value)
            bindings.append(binding)
    elif shape == (False, False, True):
        # TODO: extract to a method
        subject = pattern[0]
        predicate = pattern[1]
        # Resolve the predicate once, before the loop: rebinding the name
        # inside the loop made the second iteration look up the predicate
        # object instead of its name.
        spec = self._predicates[predicate]
        start = found.pack((self._prefix, PREFIX_SPO, subject, predicate))
        end = found.strinc(start)
        items = await tr.get_range(start, end)
        bindings = []
        for key, _ in items:
            _, _, _, _, packed = found.unpack(key)
            value = spec.unpack(packed)
            bindings.append(Map().set(pattern[2].name, value))
    else:
        raise PatternException(pattern)

    log.debug("seed bindings: %r", bindings)

    # continue matching other patterns, if any.
    for clause in patterns:
        log.debug("matching pattern: %r", clause)
        next_bindings = []
        for binding in bindings:
            bound_pattern = pattern_bind(clause, binding)
            log.debug("bound pattern: %r", bound_pattern)
            shape = tuple(isinstance(item, var) for item in bound_pattern)
            if shape == (False, False, False):
                log.debug("clause: False, False, False")
                if await self.exists(tr, *bound_pattern):
                    # this binding is valid against this bound_pattern,
                    # proceed with this binding and continue with
                    # the next pattern.
                    next_bindings.append(binding)
            elif shape == (False, False, True):
                # TODO: extract to a method
                log.debug("clause: False, False, True")
                subject, predicate, target = bound_pattern
                spec = self._predicates[predicate]
                start = found.pack(
                    (self._prefix, PREFIX_SPO, subject, spec.name))
                end = found.strinc(start)
                items = await tr.get_range(start, end)
                for key, _ in items:
                    _, _, _, _, packed = found.unpack(key)
                    # the key holds the packed object: decode it, as the
                    # seed branches do, instead of packing it again
                    value = spec.unpack(packed)
                    next_bindings.append(binding.set(target.name, value))
            elif shape == (True, False, False):
                log.debug("clause: True, False, False")
                subject, predicate, value = bound_pattern
                # _lookup_pos_subjects resolves the predicate and packs the
                # object itself; packing here as well would pack it twice.
                matches = await self._lookup_pos_subjects(
                    tr, predicate, value)
                for match in matches:
                    next_bindings.append(binding.set(subject.name, match))
            else:
                raise PatternException(clause)
        bindings = next_bindings
    return bindings
async def test(tx):
    """Write three entries under ``prefix_zero`` and one under ``prefix_one``."""

    def put(prefix, suffix, number):
        # the value is the packed one-element tuple holding ``number``
        found.set(tx, prefix + suffix, found.pack((number,)))

    put(prefix_zero, b"\x01", 1)
    put(prefix_zero, b"\x02", 2)
    put(prefix_zero, b"\x03", 3)
    put(prefix_one, b"\x42", 42)
async def query(tx):
    """Return, as a list, what ``found.query`` yields between packed 1 and 8."""
    lower = found.pack((1,))
    upper = found.pack((8,))
    results = found.query(tx, lower, upper)
    return await aiolist(results)
async def set(tx):
    """Store ten entries mapping packed ``n`` to packed ``str(n)``, n in 0..9."""
    for number in range(10):
        key = found.pack((number,))
        value = found.pack((str(number),))
        found.set(tx, key, value)
async def _prepare(tx, prefix, candidates, keywords):
    """Yield ``(candidate, keywords, stored value)`` for each candidate.

    The stored value is whatever ``found.get`` returns for the key
    ``(prefix, candidate)``.
    """
    for candidate in candidates:
        key = found.pack((prefix, candidate))
        stored = await found.get(tx, key)
        yield candidate, keywords, stored
async def _token_to_size(tx, prefix_index, token):
    """Return ``found.estimated_size_bytes`` for the token's index range.

    The range covers every key starting with ``(prefix_index, token)``.
    """
    lower = found.pack((prefix_index, token))
    upper = found.next_prefix(lower)
    return await found.estimated_size_bytes(tx, lower, upper)