Exemple #1
    def suggestions_and_scores(self, text, weighting=None):
        """Search the n-gram index for words similar to ``text``.

        NOTE(review): this listing is damaged. The body of the inner
        ``for t in nga(text)`` loop is missing, the two ``query.Term(...)``
        calls for the "start"/"end" fields are cut off before their closing
        parentheses, and the lines after ``ix.searcher(...)`` are indented as
        if an enclosing block header had been dropped. Restore the missing
        lines from the upstream source before relying on this method.

        :param text: the word to find suggestions for.
        :param weighting: scoring object passed to ``ix.searcher()``; when
            None, ``scoring.TF_IDF()`` is used.
        :returns: a list of ``(word, stored score, search score)`` tuples for
            every hit whose stored "word" differs from ``text``.
        """
        if weighting is None:
            weighting = scoring.TF_IDF()

        # Collect the n-grams of ``text`` for every size from mingram to
        # maxgram, keyed by the name of the field that indexes that size.
        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):

        # Build one Term query per gram, plus queries on the "startN"/"endN"
        # fields for the first and last gram of each size.
        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(query.Term("start%s" % size, gramlist[0],
            queries.append(query.Term("end%s" % size, gramlist[-1],
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        # Any matching gram makes a word a candidate.
        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
            result = s.search(q, limit=None)
            # Drop the queried word itself from the suggestions.
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result)
                    if fs["word"] != text]
Exemple #2
def test_roundtrip():
    # Round-trips sample values through several column types via the ``_rt``
    # helper; each call passes a column object, a list of values and a
    # default value.
    # NOTE(review): the opening ``_rt(<column>,`` line of each of the first
    # four calls below is missing from this listing, so only their value-list
    # and default arguments remain. Restore them from the upstream source.
        [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b(""))
        [b("aaaaa"), b("eeeee"), b("ccccc"), b("bbbbb"), b("eeeee")],
        b("\x00") * 5)
        [b("a"), b("ccc"), b("bb"), b("ccc"), b("a"), b("bb")], b(""))
        [b("aaa"), b("bbb"), b("ccc"), b("aaa"), b("bbb"), b("ccc")],
        b("\x00") * 3)
    # Struct column holding (int, float, unsigned short) tuples.
    _rt(columns.StructColumn("ifH", (0, 0.0, 0)),
        [(100, 1.5, 15000), (-100, -5.0, 0), (5820, 6.5, 462),
         (-57829, -1.5, 6), (0, 0, 0)],
        (0, 0.0, 0))

    # Numeric columns, one per struct typecode, with values near the limits
    # of each type.
    numcol = columns.NumericColumn
    _rt(numcol("b"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("B"), [10, 20, 30, 25, 15], 0)
    _rt(numcol("h"), [1000, -2000, 3000, -15000, 32000], 0)
    _rt(numcol("H"), [1000, 2000, 3000, 15000, 50000], 0)
    _rt(numcol("i"), [2 ** 16, -(2 ** 20), 2 ** 24, -(2 ** 28), 2 ** 30], 0)
    _rt(numcol("I"), [2 ** 16, 2 ** 20, 2 ** 24, 2 ** 28, 2 ** 31 & 0xFFFFFFFF], 0)
    _rt(numcol("q"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("Q"), [2 ** 35, 2 ** 40, 2 ** 48, 2 ** 52, 2 ** 63], 0)
    _rt(numcol("f"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)
    _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)

    # Bit column with value counts on both sides of 80; presumably chosen to
    # straddle the compress_at=10 threshold (10 bytes = 80 bits) -- confirm.
    c = columns.BitColumn(compress_at=10)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False)

    # Pickled values of mixed types stored in a variable-length bytes column.
    c = columns.PickleColumn(columns.VarBytesColumn())
    _rt(c, [None, True, False, 100, -7, "hello"], None)
def damerau_levenshtein(seq1, seq2, limit=None):
    """Returns the Damerau-Levenshtein edit distance between two strings.

    The distance counts single-item insertions, deletions and substitutions,
    plus transpositions of two adjacent items.

    :param seq1: the first sequence (anything indexable with a length).
    :param seq2: the second sequence.
    :param limit: optional cut-off. When given and non-zero, the function
        returns ``limit + 1`` as soon as more than ``limit`` items of
        ``seq1`` have been processed and every entry of the current row
        exceeds ``limit``.
    :returns: the edit distance, or ``limit + 1`` if the cut-off triggered.
    """

    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)

        # Give up early once no cell in this row can lead to a result
        # within the limit.
        if limit and x > limit and min(thisrow) > limit:
            return limit + 1

    # With an empty seq2 this indexes [-1], i.e. the leftmost column, which
    # holds len(seq1).
    return thisrow[len(seq2) - 1]
Exemple #4
def test_datetime():
    """Check that a stored DATETIME field can be indexed, queried by day,
    month and range, and read back as ``datetime`` objects.
    """
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    # One document per day for days 1-27 of months 1-11 of 2010.
    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    # The documents are not visible to a searcher until the writer commits.
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        # A full date matches exactly one document.
        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        # A year and month matches all 27 documents added for that month.
        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        # A month range is widened to the first instant of the start month
        # and the last instant of the end month.
        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
def test_charboost_postings():
    # Build 20 random postings. Each is (docnum, [(pos, startchar, endchar,
    # boost), ...]) with strictly increasing docnums, positions and
    # character offsets.
    postings = []
    doc_id = 0
    for _ in xrange(0, 20):
        doc_id += randint(1, 10)
        entries = []
        position = 0
        char_end = 0
        for __ in xrange(0, randint(1, 10)):
            position += randint(1, 10)
            char_start = char_end + randint(3, 10)
            char_end = char_start + randint(3, 10)
            # Pass the boost through the byte encoding first so the expected
            # value is one that the encoding can represent.
            weight = byte_to_float(float_to_byte(random() * 2))
            entries.append((position, char_start, char_end, weight))
        postings.append((doc_id, entries))

    # Each "read as" name paired with the projection of the full posting
    # entries that reading it back should produce.
    projections = (
        ("character_boosts", list),
        ("characters",
         lambda es: [(p, sc, ec) for p, sc, ec, _bst in es]),
        ("position_boosts",
         lambda es: [(p, bst) for p, _sc, _ec, bst in es]),
        ("positions",
         lambda es: [p for p, _sc, _ec, _bst in es]),
        ("frequency", len),
    )
    for read_as, project in projections:
        expected = [(d, project(es)) for d, es in postings]
        assert_equal(expected, roundtrip(postings, CharacterBoosts(), read_as))
Exemple #6
def test_merge_random():
    # Merging two sorted lists of random key/value pairs must give the same
    # result as building a dict from their concatenation (so later values win
    # on duplicate keys) and sorting its items.
    def random_pairs():
        return sorted((random_name(4), random_name(8)) for _ in xrange(500))

    first = random_pairs()
    second = random_pairs()

    combined = dict(first + second)
    expected = sorted(combined.items())
    merged = list(kv.merge_items(first, second))
    assert expected == merged
 def fill(self, docnum):
     # Pads the column so that it covers documents up to ``docnum``; does
     # nothing unless ``docnum`` is beyond the current count.
     # NOTE(review): this listing is truncated. The body of the final loop is
     # missing, and the ``dbfile`` lines sit inside the ``_refs is not None``
     # branch, which suggests an ``else:`` was dropped. Restore from the
     # upstream source.
     if docnum > self._count:
         if self._refs is not None:
             # Pad the in-memory reference list with zeros for the gap.
             self._refs.extend(0 for _ in xrange(docnum - self._count))
             dbfile = self._dbfile
             for _ in xrange(docnum - self._count):
def read_qsafe_array(typecode, size, dbfile):
    """Read ``size`` numbers of the given typecode from ``dbfile``.

    The "q" and "Q" typecodes are read one value at a time through
    ``dbfile.read_long()`` / ``dbfile.read_ulong()`` instead of
    ``dbfile.read_array()``; every other typecode is read in a single
    ``read_array`` call.

    :param typecode: the array/struct typecode of the stored numbers.
    :param size: the number of values to read.
    :param dbfile: file object providing ``read_long``, ``read_ulong`` and
        ``read_array``.
    :returns: a list (for "q"/"Q") or whatever ``dbfile.read_array`` returns.
    """
    if typecode == "q":
        arry = [dbfile.read_long() for _ in range(size)]
    elif typecode == "Q":
        arry = [dbfile.read_ulong() for _ in range(size)]
    else:
        # Without this branch the "Q" result was overwritten by read_array()
        # and every other typecode left ``arry`` unbound.
        arry = dbfile.read_array(typecode, size)

    return arry
 def __iter__(self):
     # Yields one boolean per document number from 0 up to _doccount: True
     # for each number found in _bitset, False for every number in between
     # and after.
     cursor = 0
     for member in self._bitset:
         # Numbers skipped since the previous member are not set.
         for _ in xrange(member - cursor):
             yield False
         yield True
         cursor = member + 1
     # Trailing documents after the last member are not set either.
     for _ in xrange(self._doccount - cursor):
         yield False
Exemple #10
    # Tokenizer call: yields n-gram Token objects cut from ``value``.
    # NOTE(review): this listing is damaged. The parameter list after
    # ``def __call__(`` is missing (the body uses value, positions, chars,
    # keeporiginal, removestops, start_pos, start_char and mode), both
    # ``if end > inlen:`` statements have lost their bodies, and the second
    # outer loop sits inside the ``mode == "query"`` branch, which suggests
    # an ``else:`` was dropped. Restore from the upstream source.
    def __call__(
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        # A single Token object is reused and re-yielded for every gram.
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            # Query mode: only grams of one size, the largest that fits.
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    # Character offsets are relative to the whole document.
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
            # Every size from self.min to self.max at every start offset;
            # grams sharing a start offset share a position.
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1
    def _read_weights(self):
        # Expands the stored ("minified") weights of the current block into
        # ``self._weights``.
        # NOTE(review): this listing is damaged. The body of
        # ``if self._data is None:`` is missing, and the final assignment
        # sits inside the ``isinstance(weights, float)`` branch, overwriting
        # the array built on the line before it, which suggests an ``else:``
        # was dropped. Restore from the upstream source.
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
        weights = self._data[1]

        # De-minify the weights
        postcount = self._blocklength
        if weights is None:
            # No stored weights: every posting has weight 1.0.
            self._weights = array("f", (1.0 for _ in xrange(postcount)))
        elif isinstance(weights, float):
            # A single float stands for the same weight on every posting.
            self._weights = array("f", (weights for _ in xrange(postcount)))
            self._weights = weights
Exemple #12
            def run(self):
                # Creates an index and adds batches of random documents to
                # it through 50 successive writers. ``st``, ``dir``,
                # ``schema`` and ``domain`` come from the enclosing scope.
                # NOTE(review): the snippet appears truncated. No writer is
                # committed or cancelled in the visible code, so each new
                # ``ix.writer()`` is created while the previous one is still
                # open. Check against the upstream source.
                ix = st.create_index(dir, schema)
                num = 0

                for i in xrange(50):
                    w = ix.writer()
                    # Between 1 and 100 documents per writer, each with 5-20
                    # distinct random words and a sequential id.
                    for _ in xrange(random.randint(1, 100)):
                        content = u(" ").join(random.sample(domain, random.randint(5, 20)))
                        w.add_document(id=text_type(num), content=content)
                        num += 1

Exemple #13
def test_boolean_find_deleted():
    """Check that documents deleted by a BOOLEAN term are marked deleted and
    are never returned by term, "every" or NOT queries.
    """
    # "Random" string of ones and zeros representing deleted and undeleted
    domain = "1110001010001110010101000101001011101010001011111101000101010101"

    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    # Create multiple segments just in case
    for _ in xrange(5):
        w = ix.writer()
        for c in domain:
            w.add_document(i=count, b=(c == "1"))
        # Commit each batch without merging so that every pass leaves its
        # own segment; without a commit nothing is ever written.
        w.commit(merge=False)

    # Delete documents where "b" is True
    with ix.writer() as w:
        w.delete_by_term("b", "t")

    with ix.searcher() as s:
        # Double check that documents with b=True are all deleted
        reader = s.reader()
        for docnum in xrange(s.doc_count_all()):
            b = s.stored_fields(docnum)["b"]
            assert b == reader.is_deleted(docnum)

        # Try doing a search for documents where b=True
        qp = qparser.QueryParser("b", ix.schema)
        q = qp.parse("b:t")
        r = s.search(q, limit=None)
        assert len(r) == 0

        # Make sure Every query doesn't match deleted docs
        r = s.search(qp.parse("*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(qp.parse("*:*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        # Make sure Not query doesn't match deleted docs
        q = qp.parse("NOT b:t")
        r = s.search(q, limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        # Same check with a limited (scored) search
        r = s.search(q, limit=5)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)
Exemple #14
def test_numeric_ranges():
    """Check inclusive, exclusive and open-ended range queries on a NUMERIC
    field holding the integers 0-399.
    """
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()

    for i in xrange(400):
        w.add_document(id=i, num=i)
    # The documents are not visible to a searcher until the writer commits.
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, target):
            # The ids of the documents matched by ``qs`` must equal
            # ``target``, in order.
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]
            assert result == target

        # Note that range() is always inclusive-exclusive
        check("[10 to 390]", list(range(10, 390 + 1)))
        check("[100 to]", list(range(100, 400)))
        check("[to 350]", list(range(0, 350 + 1)))
        check("[16 to 255]", list(range(16, 255 + 1)))
        check("{10 to 390]", list(range(11, 390 + 1)))
        check("[10 to 390}", list(range(10, 390)))
        check("{10 to 390}", list(range(11, 390)))
        check("{16 to 255}", list(range(17, 255)))
Exemple #15
    def all_doc_ids(self):
        """Returns an iterator of all (undeleted) document IDs in the reader.

        Document numbers run from 0 up to ``self.doc_count_all()``; those for
        which ``self.is_deleted(docnum)`` is true are skipped.
        """

        # Bind the method once so the generator does not look it up for
        # every document.
        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))
Exemple #16
def test_batchsize_eq_doccount():
    # Uses a 4-process writer whose batch size (10) equals the number of
    # loop iterations (10).
    # NOTE(review): the snippet is truncated; the body of the ``for`` loop
    # (presumably the ``add_document`` call) is missing.
    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as w:
            for i in xrange(10):
Exemple #17
def test_decimal_ranges():
    """Check inclusive and exclusive range queries on a NUMERIC field that
    stores decimals with two decimal places.
    """
    from decimal import Decimal

    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(int, decimal_places=2))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    # 500 documents holding 0.0, 0.2, 0.4, ...; the id is the string form of
    # the number.
    count = Decimal("0.0")
    inc = Decimal("0.2")
    for _ in xrange(500):
        w.add_document(id=str(count), num=count)
        count += inc
    # The documents are not visible to a searcher until the writer commits.
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, start, end):
            # ``start`` and ``end`` are the first and last stored values
            # (inclusive) that the query ``qs`` is expected to match.
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]

            target = []
            count = Decimal(start)
            limit = Decimal(end)
            while count <= limit:
                # Collect the expected ids; without this, ``target`` stayed
                # empty and the assertion compared against [].
                target.append(str(count))
                count += inc

            assert result == target

        check("[10.2 to 80.8]", "10.2", "80.8")
        check("{10.2 to 80.8]", "10.4", "80.8")
        check("[10.2 to 80.8}", "10.2", "80.6")
        check("{10.2 to 80.8}", "10.4", "80.6")
def test_random_multistream():
    # Interleaves writes to 100 randomly named streams of a CompoundWriter,
    # then checks each stream reads back intact from a CompoundStorage.
    # NOTE(review): this listing is missing lines. Inside the ``while`` loop
    # the chunk ``v`` is read but never written to ``mfiles[name]``, and
    # after the loop the "test" file is created but the compound writer is
    # never saved into it. Restore both steps from the upstream source.
    letters = "abcdefghijklmnopqrstuvwxyz"

    def randstring(n):
        # Random lowercase string of length n, returned as bytes.
        s = "".join(random.choice(letters) for _ in xrange(n))
        return s.encode("latin1")

    # 100 random names (duplicates collapse), each mapped to 2500 random
    # bytes.
    domain = {}
    for _ in xrange(100):
        name = randstring(random.randint(5, 10))
        value = randstring(2500)
        domain[name] = value

    # Source streams to copy from, one per name.
    outfiles = dict((name, BytesIO(value)) for name, value in domain.items())

    with TempStorage() as st:
        msw = compound.CompoundWriter(st, buffersize=1024)
        mfiles = {}
        for name in domain:
            mfiles[name] = msw.create_file(name)
        # Pick a random unfinished stream and take its next 1000 bytes; a
        # short read means that stream is exhausted.
        while outfiles:
            name = random.choice(list(outfiles.keys()))
            v = outfiles[name].read(1000)
            if len(v) < 1000:
                del outfiles[name]
        f = st.create_file("test")

        # Reopen the compound file and compare every stream to its source.
        f = st.open_file("test")
        msr = compound.CompoundStorage(f)
        for name, value in domain.items():
            assert msr.open_file(name).read() == value
Exemple #19
    def _read_header(self, dbfile, doccount):
        # Parses the file header: magic bytes, version, document count and
        # one record per field. Fills self.totals / minlens / maxlens and
        # records where each field's data starts in self.starts.
        magic = dbfile.read(4)
        assert magic == self.magic
        fileversion = dbfile.read_int()
        assert fileversion == 1

        # The stored document count must agree with the caller's, unless the
        # caller passed None, in which case the stored count is used.
        saved = dbfile.read_uint()
        doccount = saved if doccount is None else doccount
        assert saved == doccount, "read=%s argument=%s" % (saved, doccount)
        self._count = doccount

        nfields = dbfile.read_ushort()
        for index in xrange(nfields):
            name = dbfile.read_string().decode('utf-8')
            self.totals[name] = dbfile.read_long()
            # The min and max lengths are each stored as one encoded byte.
            self.minlens[name] = byte_to_length(dbfile.read_byte())
            self.maxlens[name] = byte_to_length(dbfile.read_byte())
            # Offset relative to the end of the header: each field is given
            # ``doccount`` units, in the order the fields are listed.
            self.starts[name] = index * doccount

        # Turn the relative offsets into absolute ones by adding the
        # position at which the header ends.
        header_end = dbfile.tell()
        for name in self.starts:
            self.starts[name] += header_end
Exemple #20
def test_deleteall():
    # Indexes every permutation of five words, deletes every document, and
    # checks that nothing remains searchable or readable.
    # NOTE(review): this listing is missing lines. The body of the
    # ``for docnum in xrange(doccount)`` loop is absent, and no writer is
    # committed anywhere in the visible code even though a new writer is
    # opened every tenth document. Restore from the upstream source.
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        domain = u("alfa bravo charlie delta echo").split()
        for i, ls in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(ls))
            # Start a new writer every tenth document.
            if not i % 10:
                w = ix.writer()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):

        # Nothing should match once everything has been deleted.
        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")),
                                   query.Term("text", u("bravo"))]))
            assert len(r) == 0

        assert ix.doc_count_all() == 0

        with ix.reader() as r:
            assert list(r) == []
Exemple #21
    def __init__(self, dbfile):
        """Open a hash file for reading and load its 256-entry header.

        NOTE(review): this listing is missing lines. The "Old format"
        assignments are not under an ``else:``, so as written they also run
        for a "HASH" file and reset ``format``/``hashtype`` to 0. The loop
        that reads the header entries never stores ``he`` in
        ``self.buckets``, so ``self.buckets[0]`` would fail. No seek is
        visible either, so it is unclear how the old format (which has no
        magic tag) re-reads its first four bytes. Restore from the upstream
        source.

        :param dbfile: file object providing ``read``, ``read_byte``,
            ``read_long`` and a ``map`` attribute.
        """
        self.dbfile = dbfile
        self.map = dbfile.map

        # A leading "HASH" tag marks the newer format, whose header carries a
        # hash type byte and the end-of-hashes position.
        magic = dbfile.read(4)
        if magic == b("HASH"):
            self.format = 1
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
            self.hashtype = dbfile.read_byte()
            dbfile.read(3)  # Unused
            self._end_of_hashes = dbfile.read_long()
            assert self._end_of_hashes >= self.header_size
            # Old format
            self.format = self.hashtype = 0
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position

        self.hash_func = hash_functions[self.hashtype]
        # One header entry per bucket.
        self.buckets = []
        for _ in xrange(256):
            he = unpack_header_entry(dbfile.read(header_entry_size))
        # The first field of the first bucket's entry is where the hash
        # tables begin.
        self._start_of_hashes = self.buckets[0][0]

        self.pointer_size = _pointer_struct.size
        self.unpack_pointer = _pointer_struct.unpack

        self.is_closed = False
Exemple #22
    def ranges_for_key(self, key):
        """Yield a ``(data position, data length)`` pair for every record
        stored under ``key``.

        NOTE(review): this listing is missing lines. The bodies of
        ``if not hslots:`` and ``if not pos:`` are absent (presumably an
        early exit when the table is empty or an empty slot is reached).
        Restore from the upstream source.

        :param key: the key to look up; text keys are encoded as latin-1
            before hashing.
        """
        read = self.read
        pointer_size = self.pointer_size
        if isinstance(key, text_type):
            key = key.encode('latin-1')
        keyhash = self.hash_func(key)
        # Position and slot count of the hash table for this key's hash.
        hpos, hslots = self._hashtable_info(keyhash)
        if not hslots:

        # Start probing at the slot selected by the hash bits above the low
        # byte.
        slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size)
        for _ in xrange(hslots):
            slothash, pos = self.unpack_pointer(read(slotpos, pointer_size))
            if not pos:

            slotpos += pointer_size
            # If we reach the end of the hashtable, wrap around
            if slotpos == hpos + (hslots * pointer_size):
                slotpos = hpos

            # A matching hash is only a candidate: compare the key length and
            # then the stored key bytes before yielding.
            if slothash == keyhash:
                keylen, datalen = unpack_lengths(read(pos, lengths_size))
                if keylen == len(key):
                    if key == read(pos + lengths_size, keylen):
                        yield (pos + lengths_size + keylen, datalen)
 def cache_clear():
     """Zero the three statistics counters and reset every clock slot."""
     for counter in xrange(3):
         stats[counter] = 0
     # Each slot loses its key and has its reference value cleared.
     for slot in xrange(maxsize):
         clock_keys[slot], clock_refs[slot] = None, 0
Exemple #24
    def _write_hashes(self):
        # Writes one open-addressing hash table per bucket (256 in all) to
        # the file, recording each table's (position, slot count) in
        # self.directory and the final file position in self._end_of_hashes.
        out = self.dbfile
        buckets = self.hashes
        table_index = []
        self.directory = table_index
        empty = (0, 0)

        offset = out.tell()
        for bucket_no in xrange(0, 256):
            bucket = buckets[bucket_no]
            # Twice as many slots as entries, so the table is half full.
            slotcount = len(bucket) * 2
            table_index.append((offset, slotcount))

            # Place each entry at the slot chosen by its hash (ignoring the
            # low byte), moving to the next slot on a collision.
            slots = [empty] * slotcount
            for hashval, position in bucket:
                idx = (hashval >> 8) % slotcount
                while slots[idx] != empty:
                    idx = (idx + 1) % slotcount
                slots[idx] = (hashval, position)

            # Write the table, empty slots included, tracking where the next
            # table will start.
            for hashval, position in slots:
                out.write(self.pack_pointer(hashval, position))
                offset += self.pointer_size

        self._end_of_hashes = out.tell()
Exemple #25
    def __iter__(self):
        # Yields the stored-field dictionary of each document in turn.
        # NOTE(review): this listing is missing lines. The first loop skips
        # over ``self.length`` long-sized values but never appends anything
        # to ``lengths``, so the second loop has nothing to iterate, and no
        # seek to the directory or to the start of the data is visible.
        # Restore from the upstream source.
        dbfile = self.dbfile
        names = self.names
        lengths = array("I")

        for i in xrange(self.length):
            # Skip a long-sized value, relative to the current position.
            dbfile.seek(_LONG_SIZE, 1)

        for length in lengths:
            # A "." is appended to the stored bytes before unpickling;
            # presumably the pickle STOP opcode was stripped on write
            # -- confirm.
            vlist = loads(dbfile.read(length) + b("."))
            # Pair field names with values by position, dropping None values.
            vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
            yield vdict
Exemple #26
    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """

        self.dbfile = dbfile
        self.hashtype = hashtype
        self.hashfn = _hash_functions[self.hashtype]
        # A place for subclasses to put extra metadata
        self.extras = {}

        # Remember where this hash file begins within dbfile.
        self.startoffset = dbfile.tell()
        # NOTE(review): the three header steps below have comments but no
        # code in this listing, so ``magic`` is never written. Restore the
        # write calls from the upstream source.
        # Write format tag
        # Write hash type
        # Unused future expansion bits

        # 256 lists of hashed keys and positions
        self.buckets = [[] for _ in xrange(256)]
        # List to remember the positions of the hash tables
        self.directory = []
Exemple #27
    def all_stored_fields(self):
        """Yields the stored fields for all documents.

        Document numbers run from 0 up to ``self.doc_count_all()``; documents
        for which ``self.is_deleted(docnum)`` is true are skipped.
        """

        for docnum in xrange(self.doc_count_all()):
            if not self.is_deleted(docnum):
                yield self.stored_fields(docnum)
Exemple #28
 def __iter__(self):
     # Yields the index of every set bit, lowest first. Bit ``k`` of the
     # n-th value from _iter_bytes() has index ``8 * n + k``.
     for index, value in enumerate(self._iter_bytes()):
         offset = index * 8
         for bit in xrange(8):
             if (value >> bit) & 1:
                 yield offset + bit
Exemple #29
    def __init__(self, dbfile, offset, expand=True):
        """Read a node's flags and outgoing edges from ``dbfile``.

        :param dbfile: file object providing ``read_byte``, ``read_varint``,
            ``read_array``, ``read_ushort`` and ``read_string``.
        :param offset: stored as the node's ``id``.
        :param expand: when true, an edge with a multi-character label is
            keyed by the label's first character and leads to a ``PatNode``
            that carries the rest of the label.
        """
        self.id = offset
        self.dbfile = dbfile

        # NOTE(review): nothing visible here seeks to ``offset`` before
        # reading; presumably the file is already positioned -- confirm.
        flags = dbfile.read_byte()
        # Flag bits: 1 = final node, 2 = has edges, 4 = labels are single
        # characters, 8 = single-character labels fit in one byte.
        self.final = bool(flags & 1)
        self._edges = {}
        if flags & 2:
            singles = flags & 4
            bytes = flags & 8

            nkeys = dbfile.read_varint()

            # All edge pointers come first, then the labels in the same
            # order.
            ptrs = dbfile.read_array("I", nkeys)
            for i in xrange(nkeys):
                ptr = ptrs[i]
                if singles:
                    # Without the ``else`` branches, both widths were read
                    # for every label and the string form was read as well.
                    if bytes:
                        charnum = dbfile.read_byte()
                    else:
                        charnum = dbfile.read_ushort()
                    self._edges[unichr(charnum)] = ptr
                else:
                    key = utf8decode(dbfile.read_string())[0]
                    if len(key) > 1 and expand:
                        self._edges[key[0]] = PatNode(dbfile, key[1:], ptr)
                    else:
                        self._edges[key] = ptr
 def __getitem__(self, docnum):
     # Splits the child column's value for ``docnum`` into consecutive
     # pieces of ``_fixedlen`` items; an empty value gives an empty list.
     width = self._fixedlen
     raw = self._child[docnum]
     pieces = []
     if raw:
         for begin in xrange(0, len(raw), width):
             pieces.append(raw[begin:begin + width])
     return pieces
Exemple #31
def test_nonexclusive_read():
    # Checks that several threads can open readers on the same index at once
    # and all see the same terms.
    # NOTE(review): this listing is missing lines. No writer is committed,
    # the expected-terms list inside ``fn`` is never closed, and both
    # ``for th in ths`` loops have lost their bodies (presumably starting
    # and then joining the threads). Restore from the upstream source.
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "readlock") as ix:
        # One writer per document.
        for num in u("one two three four five").split():
            w = ix.writer()
            w.add_document(text=u("Test document %s") % num)

        def fn():
            # Each thread opens a reader five times and checks the terms.
            for _ in xrange(5):
                r = ix.reader()
                assert list(r.field_terms("text")) == [
                    "document", "five", "four", "one", "test", "three", "two"

        ths = [threading.Thread(target=fn) for _ in xrange(5)]
        for th in ths:
        for th in ths:
Exemple #32
def test_multimatcher():
    """Check that the spans reported by a term matcher running over several
    segments point at the matched word in each document.
    """
    schema = fields.Schema(content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    domain = ("alfa", "bravo", "charlie", "delta")

    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(domain):
            w.add_document(content=u(" ").join(ls))
        # Commit without merging so each pass leaves its own segment;
        # without a commit the index stays empty.
        w.commit(merge=False)

    q = Term("content", "bravo")
    with ix.searcher() as s:
        m = q.matcher(s)
        while m.is_active():
            content = s.stored_fields(m.id())["content"].split()
            spans = m.spans()
            for span in spans:
                assert content[span.start] == "bravo"
            # Advance to the next matching document; without this the loop
            # never terminates.
            m.next()
Exemple #33
def test_simple_indexing():
    # Indexes 100 documents of five random words each and checks that a term
    # search for every word returns the expected document ids.
    # NOTE(review): this listing is missing lines. The body of
    # ``for word in smp:`` (presumably recording ``i`` in ``docs[word]``) is
    # absent, and the ``sorted([`` expression has lost its element
    # expression and its closing brackets. Restore from the upstream source.
    schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
    domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike"), u("november"))
    # Maps each word to the ids of the documents that contain it.
    docs = defaultdict(list)
    with TempIndex(schema, "simple") as ix:
        with ix.writer() as w:
            for i in xrange(100):
                smp = random.sample(domain, 5)
                for word in smp:
                w.add_document(text=u(" ").join(smp), id=i)

        with ix.searcher() as s:
            for word in domain:
                rset = sorted([
                    for hit in s.search(query.Term("text", word), limit=None)
                assert rset == docs[word]
Exemple #34
def read_region(dbfile, region, start=None):
    """Yield the ``(key, value)`` pairs stored in a region of ``dbfile``.

    Each item is an ``itemheader`` record giving the key and value lengths,
    followed by the key bytes and the value bytes.

    :param dbfile: file object providing ``seek``, ``read`` and ``tell``.
    :param region: object with ``start``, ``end``, ``length`` (number of
        items) and ``minkey`` attributes.
    :param start: file position to begin reading from; defaults to
        ``region.start``.
    """
    _read = dbfile.read
    _unpack = itemheader.unpack
    _headersize = itemheader.size

    start = start if start is not None else region.start
    # Position the file at the requested start. Previously ``start`` was
    # computed but never used, so the argument had no effect.
    dbfile.seek(start)

    first = True
    for _ in xrange(region.length):
        keylen, vlen = _unpack(_read(_headersize))
        key = _read(keylen)
        val = _read(vlen)

        # Sanity check: the first key read must be the region's minimum key.
        if first:
            assert key == region.minkey
            first = False

        yield key, val

    # After the last item the file must be exactly at the end of the region.
    assert dbfile.tell() == region.end
Exemple #35
def read_gints(dbfile, n):
    """Read N integers from the bytes stream dbfile. Expects that the file
    starts at a key byte.

    A key byte precedes every group of four integers. Each pair of bits in
    the key, lowest pair first, selects how many bytes the corresponding
    integer occupies: 0 = one, 1 = two, 2 = three, 3 = four.

    :param dbfile: file object with a ``read(size)`` method returning bytes.
    :param n: the number of integers to read.
    """

    count = 0
    read = dbfile.read
    for _ in xrange(n):
        # A new key byte starts every group of four integers.
        if count == 0:
            key = ord(read(1))
        code = key >> (count * 2) & 3
        if code == 0:
            yield ord(read(1))
        elif code == 1:
            yield unpack_ushort_le(read(2))[0]
        elif code == 2:
            # Pad the three stored bytes to four so they unpack as a uint.
            # A bytes literal is needed so the concatenation also works when
            # read() returns bytes on Python 3.
            yield unpack_uint_le(read(3) + b"\x00")[0]
        else:
            # Without this branch a three-byte value was followed by a
            # spurious four-byte read and code 3 yielded nothing.
            yield unpack_uint_le(read(4))[0]

        count = (count + 1) % 4
Exemple #36
    def read_nums(self, f, n):
        """Read N integers from the bytes stream dbfile. Expects that the file
        is positioned at a key byte.

        A key byte precedes every group of four integers. Each pair of bits
        in the key, lowest pair first, selects how many bytes the
        corresponding integer occupies: 0 = one, 1 = two, 2 = three,
        3 = four.

        :param f: file object providing ``read``, ``read_byte``,
            ``read_ushort_le`` and ``read_uint_le``.
        :param n: the number of integers to read.
        """

        count = 0
        key = None
        for _ in xrange(n):
            # A new key byte starts every group of four integers.
            if count == 0:
                key = f.read_byte()
            code = key >> (count * 2) & 3
            if code == 0:
                yield f.read_byte()
            elif code == 1:
                yield f.read_ushort_le()
            elif code == 2:
                # Pad the three stored bytes to four so they unpack as a
                # uint. A bytes literal is needed so the concatenation also
                # works when read() returns bytes on Python 3.
                yield unpack_uint_le(f.read(3) + b"\x00")[0]
            else:
                # Without this branch a three-byte value was followed by a
                # spurious four-byte read and code 3 yielded nothing.
                yield f.read_uint_le()

            count = (count + 1) % 4
Exemple #37
def test_20000_buffered():
    # Timing test: adds 20000 small documents through a BufferedWriter and
    # prints how long writing and optimizing take.
    # NOTE(review): this listing is missing lines. The ``domain`` list is
    # never closed, the first line of the ``add_document`` call inside the
    # loop is absent (only its ``text=`` argument remains), and nothing is
    # executed between the second ``t = now()`` and the "Optimize" print.
    # Restore from the upstream source.
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = [
            "alfa", "bravo", "charlie", "delta", "echo", "foxtrot", "golf",
            "hotel", "india", "juliet", "kilo", "lima"

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
                           text=u(" ").join(random.sample(domain, 5)))
        print("Write buffered:", now() - t)

        t = now()
        print("Optimize buffered:", now() - t)
def test_lengths2():
    """Check that ``len()`` of a results object is the total number of
    matching documents across segments, with or without a search limit.
    """
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(u("alfa bravo charlie").split()):
            # Count the documents the query below is expected to match.
            if "bravo" in ls and "charlie" in ls:
                count += 1
            w.add_document(text=u(" ").join(ls))
        # Commit without merging so each pass leaves its own segment;
        # without a commit the index stays empty.
        w.commit(merge=False)

    with ix.searcher() as s:
        q = query.Or(
            [query.Term("text", u("bravo")),
             query.Term("text", u("charlie"))])
        r = s.search(q, limit=None)
        assert len(r) == count

        # The limit restricts how many hits are scored, not the reported
        # total.
        r = s.search(q, limit=3)
        assert len(r) == count
def test_exclusion():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(stored=True), date=fields.DATETIME)
    ix = RamStorage().create_index(schema)
    excluded = datetime(1950, 1, 1)
    other = datetime(1960, 1, 1)
    with ix.writer() as w:
        # 40 documents: document 0 carries the excluded date, the remaining
        # 39 carry the other date.
        for i in xrange(40):
            when = excluded if i < 1 else other
            w.add_document(id=u(str(i)), date=when)

    with ix.searcher() as s:
        parser = qparser.QueryParser("id", schema)
        # Everything except the document dated 1950-01-01
        negated = parser.parse("NOT (date:(19500101000000))")

        results = s.search(negated, limit=None)
        # Both the total match count and the number of scored hits must be
        # the 39 documents with the other date.
        assert len(results) == 39
        assert results.scored_length() == 39
def test_combos():
    # Builds query parsers from many orderings of the available plugins to
    # check that they can be combined.
    # NOTE(review): this listing is missing lines. The ``init_args`` dict
    # has lost several closing brackets, ``plugins.MultifieldPlugin``
    # appears twice as a key, and the ``try:`` lines belonging to both
    # ``except`` clauses are absent, as is the statement that used ``qp``
    # and ``qs``. Restore from the upstream source.
    qs = ('w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) ' +
          '(apple ANDNOT bear)^2.3')

    # Constructor arguments for the plugin classes that need them.
    init_args = {
        plugins.MultifieldPlugin: (["content", "title"], {
            "content": 1.0,
            "title": 1.2
        plugins.FieldAliasPlugin: ({
            "content": ("text", "body")
        }, ),
        plugins.MultifieldPlugin: (["title", "content"], ),
        plugins.CopyFieldPlugin: ({
            "name": "phone"
        }, ),
        plugins.PseudoFieldPlugin: ({
            "name": lambda x: x

    # Replace each plugin class with an instance.
    pis = _plugin_classes(())
    for i, plugin in enumerate(pis):
            pis[i] = plugin(*init_args.get(plugin, ()))
        except TypeError:
            raise TypeError("Error instantiating %s" % plugin)

    count = 0
    for i, first in enumerate(pis):
        for j in xrange(len(pis)):
            if i == j: continue
            # The first ``j`` plugins minus ``first``, with ``first`` moved
            # to the end.
            plist = [p for p in pis[:j] if p is not first] + [first]
            qp = qparser.QueryParser("text", None, plugins=plist)
            except Exception:
                e = sys.exc_info()[1]
                raise Exception(str(e) + " combo: %s %r" % (count, plist))
            count += 1
Exemple #41
    def insert(self, word):
        """Add the given "word" (a string or list of strings) to the graph.
        Words must be inserted in sorted order.

        NOTE(review): this listing is missing lines. The first ``if`` under
        ``if lw:`` has no body, the common-prefix loop follows the ``raise``
        inside the ``elif`` branch (so it is unreachable), and
        ``node = self.root`` is immediately overwritten, which suggests that
        ``else:`` lines were dropped. The "Check the unchecked..." comment
        has no code under it. Restore from the upstream source.

        :raises Exception: if ``word`` sorts before the previously inserted
            word.
        """

        lw = self.lastword
        prefixlen = 0
        if lw:
            if self._field_root and lw[0] != word[0]:
                # If field_root == True, caller can add entire fields out-of-
                # order (but not individual terms)
            elif word < lw:
                raise Exception("Out of order %r..%r." % (self.lastword, word))
                # find common prefix between word and previous word
                for i in xrange(min(len(word), len(lw))):
                    if word[i] != lw[i]: break
                    prefixlen += 1

        # Check the unchecked for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at
        # that point.

        # Add the suffix, starting from the correct node mid-way through the
        # graph
        if not self.unchecked:
            node = self.root
            node = self.unchecked[-1][2]

        # Chain one new node per remaining item, remembering each
        # (parent, label, child) triple in ``unchecked``.
        for letter in word[prefixlen:]:
            nextnode = BuildNode()
            node.put(letter, nextnode)
            self.unchecked.append((node, letter, nextnode))
            node = nextnode

        # The node reached by the last item marks the end of a word.
        node.final = True
        self.lastword = word
Exemple #42
    def __getitem__(self, num):
        """Return the stored-field dictionary of document ``num``.

        :param num: the document number.
        :returns: a dict mapping field names to stored values; fields whose
            stored value is None are left out.
        :raises IndexError: if ``num`` is beyond the last stored document.
        :raises Exception: if the directory entry could not be read in full.
        """
        if num > self.length - 1:
            raise IndexError("Tried to get document %s, file has %s" %
                             (num, self.length))

        dbfile = self.dbfile
        # The directory holds one fixed-size (position, length) pointer per
        # document. Move to this document's entry before reading it;
        # previously ``start`` was computed but the file was never
        # positioned there.
        start = self.directory_offset + num * stored_pointer_size
        dbfile.seek(start)
        ptr = dbfile.read(stored_pointer_size)
        if len(ptr) != stored_pointer_size:
            raise Exception("Error reading %r @%s %s < %s" %
                            (dbfile, start, len(ptr), stored_pointer_size))
        position, length = unpack_stored_pointer(ptr)
        # A "." is appended to the stored bytes before unpickling; presumably
        # the pickle STOP opcode was stripped on write -- confirm.
        vlist = loads(dbfile.map[position:position + length] + b("."))

        names = self.names
        # Recreate a dictionary by putting the field names and values back
        # together by position. We can't just use dict(zip(...)) because we
        # want to filter out the None values.
        vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
        return vdict
Exemple #43
def test_20000_batch():
    """Index 20000 small documents in committed batches, then optimize.

    Prints the elapsed time of the write phase and of the optimize phase.
    """
    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000batch") as ix:
        domain = [
            "alfa", "bravo", "charlie", "delta", "echo", "foxtrot", "golf",
            "hotel", "india", "juliet", "kilo", "lima"
        ]

        t = now()
        w = ix.writer()
        for i in xrange(20000):
            w.add_document(id=u(str(i)),
                           text=u(" ").join(random.sample(domain, 5)))
            # Every 100 documents, commit the batch and open a new writer
            if not i % 100:
                w.commit()
                w = ix.writer()
        # Commit whatever is left in the last writer
        w.commit()
        print("Write batch:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize batch:", now() - t)
Exemple #44
    def suggest(self, text, limit=5, maxdist=2, prefix=0):
        """Return a list of suggested words for ``text``, best first.

        Candidates are ``(score, word)`` pairs produced by
        ``self._suggestions(text, k, prefix)`` for edit distances ``k`` from 1
        up to ``maxdist``. The ``limit`` highest-scoring distinct words are
        kept and returned ordered by descending score, then by word.

        :param text: the text to check. This word will **not** be added to the
            suggestions, even if it appears in the word graph.
        :param limit: only return up to this many suggestions. If there are not
            enough terms in the field within ``maxdist`` of the given word, the
            returned list will be shorter than this number.
        :param maxdist: the largest edit distance from the given word to look
            at. Numbers higher than 2 are not very effective or efficient.
        :param prefix: require suggestions to share a prefix of this length
            with the given word. This is often justifiable since most
            misspellings do not involve the first letter of the word. Using a
            prefix dramatically decreases the time it takes to generate the
            list of words.
        :returns: a list of suggested words.
        """

        # Nothing can be returned, and heap[0] below would fail on an empty
        # heap
        if limit < 1:
            return []

        _suggestions = self._suggestions

        heap = []
        seen = set([text])
        for k in xrange(1, maxdist + 1):
            for item in _suggestions(text, k, prefix):
                # Skip the original word and anything already considered at a
                # smaller edit distance
                if item[1] in seen:
                    continue
                seen.add(item[1])

                # Note that the *higher* scores (item[0]) are better!
                if len(heap) < limit:
                    heappush(heap, item)
                elif item > heap[0]:
                    heapreplace(heap, item)

            # If the heap is already at the required length, don't bother going
            # to a higher edit distance
            if len(heap) >= limit:
                break

        sugs = sorted(heap, key=lambda item: (0 - item[0], item[1]))
        return [sug for _, sug in sugs]
Exemple #45
    def _compress(self, inarray, inoffset, n):
        _numsize = self._numsize
        _bitsize = self._bitsize
        _num = self._num
        _bits = self._bits

        for key in xrange(_numsize):
            value = key << _bitsize
            num = _num[key] if _num[key] < n else n
            bits = 0

            j = 0
            while j < num and inarray[inoffset + j] < (1 << _bits[key][j]):
                x = inarray[inoffset + j]
                value |= x << bits
                bits += _bits[key][j]
                j += 1

            if j == num:
                return value, num

        raise Exception
Exemple #46
def _check_writer(name, writer_fn):
    """Index 1000 random documents with a writer and verify term lookups.

    :param name: name passed to ``TempIndex`` for the temporary index.
    :param writer_fn: callable taking the index and returning the writer
        under test.

    Each document's text is five words sampled from ``domain``; the test
    records which document ids contain each word and checks that searching
    for the word returns exactly those ids.
    """
    schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
    domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike"), u("november"))
    docs = defaultdict(list)
    with TempIndex(schema, name) as ix:
        w = writer_fn(ix)
        for i in xrange(1000):
            smp = random.sample(domain, 5)
            # Remember which documents each word was written to
            for word in smp:
                docs[word].append(i)
            w.add_document(text=u(" ").join(smp), id=i)
        w.commit()

        with ix.searcher() as s:
            for word in domain:
                rset = sorted([hit["id"] for hit
                               in s.search(query.Term("text", word),
                                           limit=None)])
                assert_equal(rset, docs[word])
    def set_searcher(self, segment_searcher, docoffset):
        """Prepare per-segment state for looking up this field's values.

        :param segment_searcher: searcher for the segment about to be read.
        :param docoffset: accepted for interface compatibility; not used here.

        When neither vectors nor a column is in use, builds ``self._lists``:
        one list per document in the segment, holding every term value of the
        field that occurs in that document.
        """
        fieldname = self._fieldname
        self._segment_searcher = segment_searcher
        reader = segment_searcher.reader()

        if self._use_vectors:
            # Nothing to precompute in this mode
            pass
        elif self._use_column:
            self._creader = reader.column_reader(fieldname, translate=False)
        else:
            # Otherwise, cache the values in each document in a huge list
            # of lists
            dc = segment_searcher.doc_count_all()
            field = segment_searcher.schema[fieldname]
            from_bytes = field.from_bytes

            self._lists = [[] for _ in xrange(dc)]
            for btext in field.sortable_terms(reader, fieldname):
                text = from_bytes(btext)
                postings = reader.postings(fieldname, btext)
                for docid in postings.all_ids():
                    self._lists[docid].append(text)
Exemple #48
def test_buffered_threads():
    """Update a shared ``BufferedWriter`` from several threads at once.

    Five threads each perform five ``update_document`` calls with a name
    picked at random from ``domain``. Because ``name`` is a unique field, the
    index must end up with exactly one document per domain word.
    """
    domain = u("alfa bravo charlie delta").split()
    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:

        class SimWriter(threading.Thread):
            def run(self):
                for _ in xrange(5):
                    # ``w`` is the shared writer created below
                    w.update_document(name=random.choice(domain))
                    time.sleep(random.uniform(0.01, 0.1))

        w = writing.BufferedWriter(ix, limit=10)
        threads = [SimWriter() for _ in xrange(5)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        # Flush buffered documents before reading the index back
        w.close()

        with ix.reader() as r:
            assert r.doc_count() == 4
            assert sorted([d["name"] for d in r.all_stored_fields()]) == domain
def test_page_counts():
    """Check ``search_page`` totals, page counts and page numbers.

    Ten documents are indexed; pages of 5 must report two pages, a page of 10
    must report one, and page number 0 must be rejected with ``ValueError``.
    """
    from whoosh.scoring import Frequency

    schema = fields.Schema(id=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for i in xrange(10):
        w.add_document(id=u(str(i)))
    w.commit()

    with ix.searcher(weighting=Frequency) as s:
        q = query.Every("id")

        r = s.search(q)
        assert len(r) == 10

        # Page numbers are 1-based
        with pytest.raises(ValueError):
            s.search_page(q, 0)

        r = s.search_page(q, 1, 5)
        assert len(r) == 10
        assert r.pagecount == 2

        r = s.search_page(q, 1, 5)
        assert len(r) == 10
        assert r.pagecount == 2

        r = s.search_page(q, 2, 5)
        assert len(r) == 10
        assert r.pagecount == 2
        assert r.pagenum == 2

        r = s.search_page(q, 1, 10)
        assert len(r) == 10
        assert r.pagecount == 1
        assert r.pagenum == 1
Exemple #50
def parse_record(data, tags=None):
    """Parse one record into a dictionary keyed by three-character tag.

    The record starts with a leader of ``LEADER_LEN`` characters, followed by
    a directory of fixed-size entries (``DIRECTORY_ENTRY_LEN`` each) and then
    the field data, which begins at the offset stored in ``data[12:17]``.

    :param data: the complete record.
    :param tags: optional container of tags to keep; when given and
        non-empty, fields with any other tag are skipped.
    :returns: a dict mapping tag to field data. Tags that are all digits and
        sort below ``"010"`` map to the raw field string; every other tag maps
        to a list of subfield strings, with repeated tags accumulated into the
        same list.
    :raises AssertionError: if the leader is short, the data offset is out of
        range, or the directory length is not a whole number of entries.
    """
    leader = data[:LEADER_LEN]
    assert len(leader) == LEADER_LEN

    dataoffset = int(data[12:17])
    assert dataoffset > 0
    assert dataoffset < len(data)

    # dataoffset - 1 to avoid END-OF-FIELD byte
    dirstart = LEADER_LEN
    dirend = dataoffset - 1

    # Number of fields in record
    assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
    field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN

    result = {}
    for i in xrange(field_count):
        start = dirstart + i * DIRECTORY_ENTRY_LEN
        end = start + DIRECTORY_ENTRY_LEN
        tag = data[start:start + 3]
        if tags and tag not in tags:
            continue

        # Directory entry layout: tag (3), field length (4), field offset (5)
        entry = data[start:end]
        elen = int(entry[3:7])
        offset = dataoffset + int(entry[7:12])
        # The last character of the field is dropped (presumably the
        # END-OF-FIELD terminator, as with the directory above)
        edata = data[offset:offset + elen - 1]

        if not (tag < "010" and tag.isdigit()):
            # Split into subfields, discarding what precedes the first
            # subfield indicator
            edata = edata.split(SUBFIELD_INDICATOR)[1:]
            if tag in result:
                result[tag].extend(edata)
            else:
                result[tag] = edata
        else:
            result[tag] = edata
    return result
Exemple #51
    def read_ids(self):
        """Read this block's posting ids and remember where the weights start.

        Stores the ids on ``self.ids``, the offset following them on
        ``self.weights_offset``, and returns the ids.
        """
        postfile = self.postfile
        offset = self.dataoffset
        postcount = self.count
        # NOTE(review): nothing here positions ``postfile`` at ``offset``
        # before reading; a seek appears to be missing -- confirm.

        if self.stringids:
            # String ids: read ``postcount`` strings and decode each one
            rs = postfile.read_string
            ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
            newoffset = postfile.tell()
        elif self.idslen:
            ids = array("I")
            # NOTE(review): the array is never filled from the file, the
            # ``if`` below has no body, and there is no ``else:`` ahead of the
            # ``read_array`` branch, so this block does not parse as written.
            # Lines appear to have been lost here; restore from the upstream
            # source rather than guessing.
            if IS_LITTLE:
            newoffset = offset + self.idslen
            ids = postfile.read_array("I", postcount)
            newoffset = offset + _INT_SIZE * postcount

        self.ids = ids
        self.weights_offset = newoffset
        return ids
Exemple #52
    def _read_part(self):
        scored = self._scored
        boost = self._boost
        limit = min(self._docnum + self._partsize, self._doccount)
        offset = self._docnum
        a = self._a

        # Clear the array
        for i in xrange(self._partsize):
            a[i] = 0

        # Add the scores from the submatchers into the array
        for m in self._submatchers:
            while m.is_active() and m.id() < limit:
                i = m.id() - offset
                if scored:
                    a[i] += m.score() * boost
                    a[i] = 1

        self._offset = offset
        self._limit = limit
def test_filter_by_result():
    """Use the results of one search as the filter for another.

    Indexes 100 documents titled "even" or "odd", then searches for content
    "foo" filtered by the results of a search for title "even"; every hit must
    satisfy both conditions.
    """
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))

    with TempIndex(schema, "filter") as ix:
        words = u("foo bar baz qux barney").split()
        with ix.writer() as w:
            for x in xrange(100):
                t = u("even" if x % 2 == 0 else "odd")
                c = words[x % len(words)]
                w.add_document(title=t, content=c)

        with ix.searcher() as searcher:
            fq = query.Term("title", "even")
            filter_result = searcher.search(fq)
            assert filter_result.docset is None

            q = query.Term("content", "foo")

            # filter_result.docs()
            result = searcher.search(q, filter=filter_result)
            assert all(x["title"] == "even" and x["content"] == "foo"
                       for x in result)
Exemple #54
def test_compound_sort():
    """Sort on three facets at once: ``a`` ascending, ``b`` descending, ``c``
    ascending.
    """
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        sortedby = [sorting.FieldFacet("a"),
                    sorting.FieldFacet("b", reverse=True),
                    sorting.FieldFacet("c")]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        # Within each "a" group, "b" runs backwards and "c" breaks ties
        assert output == [
            "alfa charlie charlie",
            "alfa charlie india",
            "alfa bravo echo",
            "alfa alfa alfa",
            "alfa alfa golf",
            "bravo charlie foxtrot",
            "bravo bravo bravo",
            "bravo bravo hotel",
            "bravo alfa delta",
            "bravo alfa juliet",
        ]
Exemple #55
def test_random():
    """Round-trip 100 random byte-string keys through a written graph.

    First checks that flattening the graph yields the sorted, de-duplicated
    keys; then looks the keys up one by one in random order and checks that
    the cursor ends up on exactly that key.
    """
    def randstring():
        length = random.randint(1, 5)
        a = array("B", (random.randint(0, 255) for _ in xrange(length)))
        return array_tobytes(a)

    keys = sorted(randstring() for _ in xrange(100))

    with TempStorage() as st:
        gwrite(keys, st)
        gr = greader(st)
        cur = fst.Cursor(gr)
        s1 = cur.flatten()
        s2 = sorted(set(keys))
        for i, (k1, k2) in enumerate(zip(s1, s2)):
            assert k1 == k2, "%s: %r != %r" % (i, k1, k2)

        sample = list(keys)
        random.shuffle(sample)
        for key in sample:
            # Start from the root and walk to the key; otherwise the cursor
            # never moves and the assertion compares against a stale prefix
            cur.reset()
            cur.find_path(key)
            assert cur.prefix_bytes() == key
Exemple #56
def make_multi_index(ix):
    """Write the module-level ``docs`` into ``ix`` three documents at a time.

    Each group of three is written with its own writer and committed without
    merging, so that the groups are kept as separate commits.

    :param ix: the index to populate.
    """
    for i in xrange(0, len(docs), 3):
        w = ix.writer()
        for doc in docs[i:i + 3]:
            w.add_document(ev=u("a"), **doc)
        # Without a commit the documents are never written and the writer is
        # left open when the next one is requested
        w.commit(merge=False)
Exemple #57
 def randstring(min, max):
     """Return a random string whose length lies between ``min`` and ``max``
     inclusive, built from characters with code points 1 through 255.
     """
     length = randint(min, max)
     chars = []
     for _ in xrange(length):
         chars.append(chr(randint(1, 255)))
     return "".join(chars)
Exemple #58
 def all_ids(self):
     """Return a generator over every id below ``self.limit`` that the child
     does not produce and that ``self.missing`` does not report as missing.
     """
     is_missing = self.missing
     # The child's ids are collected up front; the result itself is lazy
     excluded = set(self.child.all_ids())
     candidates = xrange(self.limit)
     return (docnum for docnum in candidates
             if not (docnum in excluded or is_missing(docnum)))
Exemple #59
# Varint cache: pre-compute the encoded bytes for small integers so we
# don't have to constantly recalculate them on the fly. This makes a small but
# noticeable difference.

def _varint(i):
    """Encode the non-negative integer ``i`` as a variable-length byte string.

    The value is written seven bits at a time, least significant group first.
    Every byte except the last has its high bit set to signal that more bytes
    follow.

    :param i: the integer to encode; must not be negative.
    :returns: the encoded bytes.
    :raises ValueError: if ``i`` is negative (the shift loop would never end).
    """
    if i < 0:
        raise ValueError("Cannot varint-encode a negative number: %r" % (i,))

    a = array("B")
    while (i & ~0x7F) != 0:
        a.append((i & 0x7F) | 0x80)
        i = i >> 7
    # The remaining low-order group is the final byte, high bit clear
    a.append(i)
    return array_tobytes(a)

# Encodings of the integers below _varint_cache_size, indexed by value
_varint_cache_size = 512
_varint_cache = []
for i in xrange(0, _varint_cache_size):
    _varint_cache.append(_varint(i))
# Freeze the table once it has been built
_varint_cache = tuple(_varint_cache)

def varint(i):
    """Encodes the given integer into a string of the minimum number of bytes.

    Values below ``len(_varint_cache)`` are served from the precomputed
    table; anything larger is encoded on demand by ``_varint``.

    :param i: the integer to encode.
    :returns: the encoded bytes.
    """
    if i < len(_varint_cache):
        return _varint_cache[i]
    return _varint(i)

def varint_to_int(vi):
    """Decode a variable-length encoded integer from the start of ``vi``.

    Each element contributes its low seven bits, least significant group
    first; an element with the high bit set means another one follows.

    :param vi: a sequence of single characters accepted by ``ord()``.
    :returns: the decoded integer.
    :raises IndexError: if ``vi`` is empty or ends while the continuation bit
        is still set.
    """
    b = ord(vi[0])
    p = 1
    i = b & 0x7F
    shift = 7
    while b & 0x80:
        b = ord(vi[p])
        p += 1
        i |= (b & 0x7F) << shift
        shift += 7
    return i
Exemple #60
    def decorating_function(user_function):
        """Wrap ``user_function`` in a cache keyed on its positional
        arguments.

        When the enclosing ``maxsize`` is truthy the cache holds at most
        ``maxsize`` entries and evicts with a clock sweep: each slot has a
        "referenced" bit that is set on use and cleared as the hand passes
        over it, and the first slot found with the bit off is replaced. When
        ``maxsize`` is falsy the cache is a plain dictionary that never
        evicts.

        The returned wrapper gains two attributes: ``cache_info()``, returning
        ``(hits, misses, maxsize, current_size)``, and ``cache_clear()``.
        Arguments must be hashable; keyword arguments are not supported.
        """
        from functools import wraps

        stats = [0, 0, 0]  # hits, misses, hand
        data = {}

        if maxsize:
            # The keys at each point on the clock face
            clock_keys = [None] * maxsize
            # The "referenced" bits at each point on the clock face
            clock_refs = array("B", (0 for _ in xrange(maxsize)))
            lock = Lock()

            @wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    with lock:
                        pos, result = data[key]
                        # The key is in the cache. Set the key's reference bit
                        clock_refs[pos] = 1
                        # Record a cache hit
                        stats[0] += 1
                except KeyError:
                    # Compute the value
                    result = user_function(*args)
                    with lock:
                        # Current position of the clock hand
                        hand = stats[2]
                        # Sweep around the clock looking for a position with
                        # the reference bit off
                        while True:
                            hand = (hand + 1) % maxsize
                            if clock_refs[hand]:
                                # This position's "referenced" bit is set. Turn
                                # the bit off and move on.
                                clock_refs[hand] = 0
                            else:
                                # Found a position with the "reference" bit
                                # off, so replace it with the new key. Bits
                                # are cleared as the hand passes, so this is
                                # reached within one full revolution.
                                current_key = clock_keys[hand]
                                if current_key in data:
                                    del data[current_key]
                                clock_keys[hand] = key
                                clock_refs[hand] = 1
                                break
                        # Put the key and result in the cache
                        data[key] = (hand, result)
                        # Save the new hand position
                        stats[2] = hand
                        # Record a cache miss
                        stats[1] += 1
                return result

        else:
            @wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    result = data[key]
                    stats[0] += 1
                except KeyError:
                    result = user_function(*args)
                    data[key] = result
                    stats[1] += 1
                return result

        def cache_info():
            """Return ``(hits, misses, maxsize, current_size)``"""
            return stats[0], stats[1], maxsize, len(data)

        def cache_clear():
            """Clear the cache and cache statistics"""
            data.clear()
            stats[0] = stats[1] = stats[2] = 0
            # The clock face only exists for a bounded cache
            if maxsize:
                for i in xrange(maxsize):
                    clock_keys[i] = None
                    clock_refs[i] = 0

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return wrapper