Ejemplo n.º 1
0
async def get_or_create(tx, bstore, blob):
    """Return the uid associated with ``blob``, storing the blob if needed.

    The digest of ``blob`` (computed with ``hasher``) is looked up under
    ``bstore.prefix_hash``.  When an entry already exists its value is
    returned as a ``UUID``.  Otherwise a new ``uuid4`` is recorded for the
    digest and the blob is written under ``bstore.prefix_blob`` as the pieces
    produced by ``sliced(blob, found.MAX_SIZE_VALUE)``, each keyed by
    ``(uid, piece index)``.
    """
    digest = hasher(blob).digest()
    hash_key = found.pack((bstore.prefix_hash, digest))
    existing = await found.get(tx, hash_key)
    if existing is not None:
        return UUID(bytes=existing)

    # Unknown blob: register its digest, then store the content piece by piece
    # TODO: Use a counter and implement a garbage collector, and implement
    # bstore.delete
    uid = uuid4()
    found.set(tx, hash_key, uid.bytes)
    pieces = sliced(blob, found.MAX_SIZE_VALUE)
    for position, piece in enumerate(pieces):
        piece_key = found.pack((bstore.prefix_blob, uid, position))
        found.set(tx, piece_key, bytes(piece))
    return uid
Ejemplo n.º 2
0
def test_pack_unpack():
    """Check that ``found.unpack`` reverses ``found.pack`` on a nested tuple."""
    nested = (
        uuid4(),
        None,
        SingleFloat(3.1415),
        b"x42",
        1,
        -1,
        3.1415,
        -3.1415,
        ("abc",),
    )
    original = (nested, ("d", "e", "f"), 2.718281828459045)
    roundtrip = found.unpack(found.pack(original))
    assert roundtrip == original
Ejemplo n.º 3
0
 async def exists(self, tr, subject, predicate, object):
     """Tell whether the triple ``(subject, predicate, object)`` is stored.

     ``predicate`` is looked up in ``self._predicates`` and used to pack
     ``object``; the resulting SPO key is then read through ``tr``.
     Returns ``True`` when ``tr.get`` yields something other than ``None``.
     """
     spec = self._predicates[predicate]
     packed = spec.pack(object)
     spo_key = found.pack(
         (self._prefix, PREFIX_SPO, subject, spec.name, packed))
     return (await tr.get(spo_key)) is not None
Ejemplo n.º 4
0
 async def remove(self, tr, *triples):
     """Clear every given ``(subject, predicate, object)`` triple through ``tr``.

     For each triple the SPO key is cleared; when the predicate's ``pos``
     attribute is truthy the matching POS index key is cleared as well.
     """
     for subject, name, value in triples:
         # fails here when the predicate is unknown
         spec = self._predicates[name]
         # fails here when the object is not of the type the predicate expects
         packed = spec.pack(value)
         # drop the data entry
         spo_key = found.pack(
             (self._prefix, PREFIX_SPO, subject, spec.name, packed))
         tr.clear(spo_key)
         if not spec.pos:
             continue
         # drop the index entry
         pos_key = found.pack(
             (self._prefix, PREFIX_POS, spec.name, packed, subject))
         tr.clear(pos_key)
Ejemplo n.º 5
0
async def get(tx, bstore, uid):
    """Return the blob stored under ``uid`` as a single ``bytes`` object.

    Every value found between the ``(bstore.prefix_blob, uid)`` key and its
    ``found.next_prefix`` is collected in iteration order and joined.

    Raises:
        BStoreException: when nothing (or only empty values) is found.
    """
    key = found.pack((bstore.prefix_blob, uid))
    chunks = [
        value
        async for _, value in found.query(tx, key, found.next_prefix(key))
    ]
    # Join once instead of growing a bytes object chunk by chunk, which
    # copies the accumulated data on every iteration.
    out = b''.join(chunks)
    if out == b'':
        raise BStoreException('BLOB should be in database: uid={}'.format(uid))
    return out
Ejemplo n.º 6
0
 async def _lookup_pos_subjects(self, tr, predicate, object):
     """Return the subjects indexed under ``(predicate, object)`` in POS.

     ``predicate`` is resolved through ``self._predicates`` and used to pack
     ``object``; the subject is the last of the five items of each key read
     in the resulting range.
     """
     spec = self._predicates[predicate]
     packed = spec.pack(object)
     lower = found.pack((self._prefix, PREFIX_POS, spec.name, packed))
     upper = found.strinc(lower)
     subjects = []
     for key, _value in await tr.get_range(lower, upper):
         _, _, _, _, subject = found.unpack(key)
         subjects.append(subject)
     return subjects
Ejemplo n.º 7
0
 async def uuid(self, tr):
     """Return a new ``uuid4`` that is not yet used as a subject in SPO.

     The range read is bounded to keys that start with
     ``(self._prefix, PREFIX_SPO, uid)``, so any returned item means the
     identifier is already in use.  Bounding the range also avoids decoding
     a key that belongs to another subject, another index or an unrelated
     key space, which may not unpack into five items.

     Raises:
         AssertionError: if the generated identifier already has triples.
     """
     uid = uuid4()
     start = found.pack((self._prefix, PREFIX_SPO, uid))
     end = found.strinc(start)
     items = await tr.get_range(start, end, limit=1)
     if items:
         # Raised explicitly so the check is not stripped under ``python -O``.
         raise AssertionError("Unlikely Error!")
     return uid
Ejemplo n.º 8
0
async def index(tx, store, docuid, counter):
    """Index document ``docuid`` from its ``counter`` of string tokens.

    Each string key of ``counter`` is translated to a uid through
    ``store.tokens`` (a new ``uuid4`` is added when the string has none yet).
    The ``(uid, count)`` pairs are packed, compressed and stored under
    ``(store.prefix_counters, docuid)``, and one empty-valued key
    ``(store.prefix_index, uid, docuid)`` is written per token.
    """
    # translate string tokens into the uid registered in store.tokens
    token_counts = {}
    for string, count in counter.items():
        matches = nstore.select(tx, store.tokens, string, nstore.var('uid'))
        try:
            binding = await matches.__anext__()
        except StopAsyncIteration:
            # first time this string is seen: register a new uid for it
            token = uuid4()
            nstore.add(tx, store.tokens, string, token)
        else:
            token = binding['uid']
        token_counts[token] = count

    # keep the counts around, they are used to filter during search
    counters_key = found.pack((store.prefix_counters, docuid))
    payload = zstd.compress(found.pack(tuple(token_counts.items())))
    found.set(tx, counters_key, payload)

    # one index entry per token, used for candidate selection
    for token in token_counts:
        found.set(tx, found.pack((store.prefix_index, token, docuid)), b'')
Ejemplo n.º 9
0
 async def all(self, tr):
     """Return every stored triple as ``(subject, predicate name, object)``.

     Reads the whole ``(self._prefix, PREFIX_SPO)`` range through ``tr`` and
     unpacks each object with the predicate registered under its name.
     """
     lower = found.pack((self._prefix, PREFIX_SPO))
     upper = found.strinc(lower)
     log.debug(
         "fetching everything between start=%r and end=%r", lower, upper)
     triples = []
     # only keys carry information, the values are always empty
     for key, _value in await tr.get_range(lower, upper):
         _, _, subject, name, packed = found.unpack(key)
         spec = self._predicates[name]
         triples.append((subject, spec.name, spec.unpack(packed)))
     return triples
Ejemplo n.º 10
0
async def massage(tx, store, candidate, keywords, hits):
    """Score ``candidate`` against ``keywords`` and record it in ``hits``.

    The counts stored under ``(store.prefix_counters, candidate)`` are
    decompressed and unpacked into a dictionary.  If any keyword is missing
    from it the candidate is dropped (nothing is written, ``None`` is
    returned); otherwise ``hits[candidate]`` is set to the sum of the counts
    of all keywords.
    """
    raw = await found.get(tx, found.pack((store.prefix_counters, candidate)))
    # TODO: replace the dictionary and the following for loop with
    # a single iteration over the counter, using zigzag algorithm.
    counts = dict(found.unpack(zstd.decompress(raw)))
    total = 0
    for keyword in keywords:
        if keyword not in counts:
            # the candidate must contain every keyword
            return None
        total += counts[keyword]
    hits[candidate] = total
Ejemplo n.º 11
0
async def search(tx, store, keywords, limit=13):
    """Return the best ``limit`` documents that contain every keyword.

    Args:
        tx: transaction handed to the storage helpers.
        store: object exposing ``tokens``, ``prefix_index`` and the
            attributes read by ``massage``.
        keywords: iterable of keyword strings.
        limit: maximum number of hits returned.

    Returns:
        A list of ``(document uid, score)`` pairs as produced by
        ``Counter.most_common(limit)``.  The list is empty when ``keywords``
        is empty or when one of the keywords is unknown.
    """
    coroutines = (_keywords_to_token(tx, store.tokens, keyword)
                  for keyword in keywords)
    tokens = await asyncio.gather(*coroutines)

    # Without keywords there is nothing to match; returning early also
    # keeps ``min()`` below from being called on an empty sequence.
    if not tokens:
        return list()

    # If a keyword is not present in store.tokens, then there is no
    # document associated with it, hence there is no document that
    # match that keyword, hence no document that has all the requested
    # keywords. Return an empty list.
    if any(token is None for token in tokens):
        return list()

    # Select seed token: the one with the smallest estimated index size
    coroutines = (_token_to_size(tx, store.prefix_index, token)
                  for token in tokens)
    sizes = await asyncio.gather(*coroutines)
    _, seed = min(zip(sizes, tokens), key=itemgetter(0))

    # Select candidates: every document indexed under the seed token
    candidates = []
    key = found.pack((store.prefix_index, seed))
    async for key, _ in found.query(tx, key, found.next_prefix(key)):
        _, _, uid = found.unpack(key)
        candidates.append(uid)

    # XXX: FOUND_PSTORE_SAMPLE_COUNT was empirically discovered, to make it
    #      so that the search takes less than 1 second or so.
    if len(candidates) >= FOUND_PSTORE_SAMPLE_COUNT:
        candidates = random.sample(candidates, FOUND_PSTORE_SAMPLE_COUNT)

    # score, filter and construct hits aka. massage
    hits = Counter()
    coroutines = (massage(tx, store, c, tokens, hits) for c in candidates)
    await asyncio.gather(*coroutines)

    return hits.most_common(limit)
Ejemplo n.º 12
0
    async def where(self, tr, pattern, *patterns):
        """Resolve triple patterns into a list of variable bindings.

        ``pattern`` seeds the bindings.  Each pattern of ``patterns`` is then
        bound against every current binding (``pattern_bind``) and used to
        keep, drop or extend that binding.  Items of a pattern that are
        ``var`` instances are the unknowns.

        Supported seed shapes, as (subject, predicate, object):
        variable/value/value, value/variable/variable, value/value/variable.
        Supported shapes for the following patterns, once bound:
        value/value/value, value/value/variable, variable/value/value.

        Returns:
            The list of bindings (``Map`` objects) matching all patterns.

        Raises:
            PatternException: when a pattern has an unsupported shape.
        """
        # seed bindings
        shape = tuple(isinstance(item, var) for item in pattern)
        if shape == (True, False, False):
            variable, predicate, object = pattern
            subjects = await self._lookup_pos_subjects(tr, predicate, object)
            name = variable.name
            bindings = [Map().set(name, subject) for subject in subjects]
        elif shape == (False, True, True):
            # TODO: extract to a method
            subject = pattern[0]
            start = found.pack((self._prefix, PREFIX_SPO, subject))
            end = found.strinc(start)
            items = await tr.get_range(start, end)
            bindings = []
            for key, _ in items:
                _, _, _, predicate, object = found.unpack(key)
                predicate = self._predicates[predicate]
                object = predicate.unpack(object)
                binding = Map()
                binding = binding.set(pattern[1].name, predicate.name)
                binding = binding.set(pattern[2].name, object)
                bindings.append(binding)
        elif shape == (False, False, True):
            # TODO: extract to a method
            subject = pattern[0]
            predicate_name = pattern[1]
            start = found.pack(
                (self._prefix, PREFIX_SPO, subject, predicate_name))
            end = found.strinc(start)
            items = await tr.get_range(start, end)
            bindings = []
            for key, _ in items:
                _, _, _, _, object = found.unpack(key)
                # Always look the predicate up by its name: rebinding the
                # name to the predicate object would break the lookup on
                # the second item.
                predicate = self._predicates[predicate_name]
                object = predicate.unpack(object)
                binding = Map()
                binding = binding.set(pattern[2].name, object)
                bindings.append(binding)
        else:
            raise PatternException(pattern)

        log.debug("seed bindings: %r", bindings)
        # continue matching other patterns, if any.
        for pattern in patterns:
            log.debug("matching pattern: %r", pattern)
            next_bindings = []
            for binding in bindings:
                bound_pattern = pattern_bind(pattern, binding)
                log.debug("bound pattern: %r", bound_pattern)
                shape = tuple(isinstance(item, var) for item in bound_pattern)
                if shape == (False, False, False):
                    log.debug("clause: False, False, False")
                    ok = await self.exists(tr, *bound_pattern)
                    if ok:
                        # this binding is valid against this bound_pattern,
                        # proceed with this binding and continue with
                        # the next pattern.
                        next_bindings.append(binding)
                elif shape == (False, False, True):
                    # TODO: extract to a method
                    log.debug("clause: False, False, True")
                    subject, predicate, object = bound_pattern
                    predicate = self._predicates[predicate]
                    start = found.pack(
                        (self._prefix, PREFIX_SPO, subject, predicate.name))
                    end = found.strinc(start)
                    items = await tr.get_range(start, end)
                    for key, _ in items:
                        _, _, _, _, value = found.unpack(key)
                        # Values read from a key are in packed form: unpack
                        # them, as the seed clauses and ``all`` do.
                        value = predicate.unpack(value)
                        new = binding.set(object.name, value)
                        next_bindings.append(new)
                elif shape == (True, False, False):
                    log.debug("clause: True, False, False")
                    subject, predicate, object = bound_pattern
                    predicate = self._predicates[predicate]
                    # ``_lookup_pos_subjects`` packs the object itself, so
                    # it must receive the plain value.
                    values = await self._lookup_pos_subjects(
                        tr, predicate.name, object)
                    for value in values:
                        new = binding.set(subject.name, value)
                        next_bindings.append(new)
                else:
                    raise PatternException(pattern)
            bindings = next_bindings
        return bindings
Ejemplo n.º 13
0
 async def test(tx):
     """Write three entries under ``prefix_zero`` and one under ``prefix_one``."""
     for number in (1, 2, 3):
         # the key suffix is the single byte whose value is ``number``
         found.set(tx, prefix_zero + bytes((number,)), found.pack((number,)))
     found.set(tx, prefix_one + b"\x42", found.pack((42,)))
Ejemplo n.º 14
0
 async def query(tx):
     """Collect what ``found.query`` yields between packed ``(1,)`` and ``(8,)``."""
     lower = found.pack((1,))
     upper = found.pack((8,))
     return await aiolist(found.query(tx, lower, upper))
Ejemplo n.º 15
0
 async def set(tx):
     """Store, for each number 0..9, its packed string under its packed value."""
     for number in range(10):
         key = found.pack((number,))
         value = found.pack((str(number),))
         found.set(tx, key, value)
Ejemplo n.º 16
0
async def _prepare(tx, prefix, candidates, keywords):
    """Yield ``(candidate, keywords, value)`` for every candidate.

    ``value`` is what ``found.get`` returns for the ``(prefix, candidate)``
    key; ``keywords`` is passed through unchanged.
    """
    for candidate in candidates:
        key = found.pack((prefix, candidate))
        value = await found.get(tx, key)
        yield candidate, keywords, value
Ejemplo n.º 17
0
async def _token_to_size(tx, prefix_index, token):
    """Return ``found.estimated_size_bytes`` for the ``(prefix_index, token)`` range."""
    lower = found.pack((prefix_index, token))
    upper = found.next_prefix(lower)
    return await found.estimated_size_bytes(tx, lower, upper)