Example #1
    def suggestions_and_scores(self, text, weighting=None):
        if weighting is None:
            weighting = scoring.TF_IDF()

        grams = defaultdict(list)
        for size in xrange(self.mingram, self.maxgram + 1):
            key = "gram%s" % size
            nga = analysis.NgramAnalyzer(size)
            for t in nga(text):
                grams[key].append(t.text)

        queries = []
        for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
            key = "gram%s" % size
            gramlist = grams[key]
            queries.append(query.Term("start%s" % size, gramlist[0],
                                      boost=self.booststart))
            queries.append(query.Term("end%s" % size, gramlist[-1],
                                      boost=self.boostend))
            for gram in gramlist:
                queries.append(query.Term(key, gram))

        q = query.Or(queries)
        ix = self.index()
        s = ix.searcher(weighting=weighting)
        try:
            result = s.search(q, limit=None)
            return [(fs["word"], fs["score"], result.score(i))
                    for i, fs in enumerate(result)
                    if fs["word"] != text]
        finally:
            s.close()
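# Sketch (not from the original class): the per-size gram lists the method
# above builds for text = "hello" with mingram=3 and maxgram=4.
grams = {}
for size in (3, 4):
    grams["gram%s" % size] = ["hello"[i:i + size]
                              for i in range(len("hello") - size + 1)]
assert grams == {"gram3": ["hel", "ell", "llo"], "gram4": ["hell", "ello"]}
# The method then Or()s a Term("gram<size>", ...) query for every gram, and
# boosts Term("start<size>", ...) and Term("end<size>", ...) for the first and
# last grams so words sharing the input's prefix/suffix score higher.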
Example #2
def test_roundtrip():
    _rt(columns.VarBytesColumn(),
        [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b(""))
    _rt(columns.FixedBytesColumn(5),
        [b("aaaaa"), b("eeeee"), b("ccccc"), b("bbbbb"), b("eeeee")],
        b("\x00") * 5)
    _rt(columns.RefBytesColumn(),
        [b("a"), b("ccc"), b("bb"), b("ccc"), b("a"), b("bb")], b(""))
    _rt(columns.RefBytesColumn(3),
        [b("aaa"), b("bbb"), b("ccc"), b("aaa"), b("bbb"), b("ccc")],
        b("\x00") * 3)
    _rt(columns.StructColumn("ifH", (0, 0.0, 0)),
        [(100, 1.5, 15000), (-100, -5.0, 0), (5820, 6.5, 462),
         (-57829, -1.5, 6), (0, 0, 0)],
        (0, 0.0, 0))

    numcol = columns.NumericColumn
    _rt(numcol("b"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("B"), [10, 20, 30, 25, 15], 0)
    _rt(numcol("h"), [1000, -2000, 3000, -15000, 32000], 0)
    _rt(numcol("H"), [1000, 2000, 3000, 15000, 50000], 0)
    _rt(numcol("i"), [2 ** 16, -(2 ** 20), 2 ** 24, -(2 ** 28), 2 ** 30], 0)
    _rt(numcol("I"), [2 ** 16, 2 ** 20, 2 ** 24, 2 ** 28, 2 ** 31 & 0xFFFFFFFF], 0)
    _rt(numcol("q"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("Q"), [2 ** 35, 2 ** 40, 2 ** 48, 2 ** 52, 2 ** 63], 0)
    _rt(numcol("f"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)
    _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)

    c = columns.BitColumn(compress_at=10)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False)

    c = columns.PickleColumn(columns.VarBytesColumn())
    _rt(c, [None, True, False, 100, -7, "hello"], None)
Example #3
def damerau_levenshtein(seq1, seq2, limit=None):
    """Returns the Damerau-Levenshtein edit distance between two strings.
    """

    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)

        if limit and x > limit and min(thisrow) > limit:
            return limit + 1

    return thisrow[len(seq2) - 1]
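# Quick sanity checks for the function above. The code is written against the
# Python 2 name xrange; on Python 3, alias it first (whoosh.compat does this).
xrange = range

assert damerau_levenshtein("abc", "abc") == 0    # identical strings
assert damerau_levenshtein("abc", "abx") == 1    # one substitution
assert damerau_levenshtein("abc", "acb") == 1    # one transposition
assert damerau_levenshtein("abc", "abcd") == 1   # one insertion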
Example #4
def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)
Example #5
def test_charboost_postings():
    postings = []
    docnum = 0
    for _ in xrange(0, 20):
        docnum += randint(1, 10)
        posns = []
        pos = 0
        endchar = 0
        for __ in xrange(0, randint(1, 10)):
            pos += randint(1, 10)
            startchar = endchar + randint(3, 10)
            endchar = startchar + randint(3, 10)
            boost = byte_to_float(float_to_byte(random() * 2))
            posns.append((pos, startchar, endchar, boost))
        postings.append((docnum, posns))

    assert_equal(postings, roundtrip(postings, CharacterBoosts(), "character_boosts"))
    
    as_chars = [(docnum, [(pos, sc, ec) for pos, sc, ec, bst in posns]) for docnum, posns in postings]
    assert_equal(as_chars, roundtrip(postings, CharacterBoosts(), "characters"))
    
    as_posbsts = [(docnum, [(pos, bst) for pos, sc, ec, bst in posns]) for docnum, posns in postings]
    assert_equal(as_posbsts, roundtrip(postings, CharacterBoosts(), "position_boosts"))
    
    as_posns = [(docnum, [pos for pos, sc, ec, bst in posns]) for docnum, posns in postings]
    assert_equal(as_posns, roundtrip(postings, CharacterBoosts(), "positions"))
    
    as_freq = [(docnum, len(posns)) for docnum, posns in as_posns]
    assert_equal(as_freq, roundtrip(postings, CharacterBoosts(), "frequency"))
Example #6
def test_merge_random():
    items1 = sorted((random_name(4), random_name(8)) for _ in xrange(500))
    items2 = sorted((random_name(4), random_name(8)) for _ in xrange(500))

    x1 = sorted(dict(items1 + items2).items())
    x2 = list(kv.merge_items(items1, items2))
    assert x1 == x2
Example #7
    def fill(self, docnum):
        if docnum > self._count:
            if self._refs is not None:
                self._refs.extend(0 for _ in xrange(docnum - self._count))
            else:
                dbfile = self._dbfile
                for _ in xrange(docnum - self._count):
                    dbfile.write_ushort(0)
Example #8
def read_qsafe_array(typecode, size, dbfile):
    # The array module can't be relied on to support the 64-bit "q"/"Q"
    # typecodes everywhere (Python 2's array lacks them), so read those
    # values one at a time.
    if typecode == "q":
        arry = [dbfile.read_long() for _ in xrange(size)]
    elif typecode == "Q":
        arry = [dbfile.read_ulong() for _ in xrange(size)]
    else:
        arry = dbfile.read_array(typecode, size)

    return arry
Example #9
    def __iter__(self):
        i = 0
        for num in self._bitset:
            if num > i:
                for _ in xrange(num - i):
                    yield False
            yield True
            i = num + 1
        if self._doccount > i:
            for _ in xrange(self._doccount - i):
                yield False
Example #10
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs
    ):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1
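# Hedged usage sketch: the __call__ above matches whoosh's n-gram tokenizer,
# so with minsize=2 and maxsize=3 the non-query branch yields every 2- and
# 3-character substring. The class and parameter names below come from the
# whoosh.analysis docs and should be treated as assumptions here.
from whoosh import analysis

tokenizer = analysis.NgramTokenizer(minsize=2, maxsize=3)
print([t.text for t in tokenizer("hello")])
# -> ['he', 'hel', 'el', 'ell', 'll', 'llo', 'lo']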
Example #11
    def _read_weights(self):
        # If we haven't loaded the data from disk yet, load it now
        if self._data is None:
            self._read_data()
        weights = self._data[1]

        # De-minify the weights
        postcount = self._blocklength
        if weights is None:
            self._weights = array("f", (1.0 for _ in xrange(postcount)))
        elif isinstance(weights, float):
            self._weights = array("f", (weights for _ in xrange(postcount)))
        else:
            self._weights = weights
Example #12
            def run(self):
                ix = st.create_index(dir, schema)
                num = 0

                for i in xrange(50):
                    print(i)
                    w = ix.writer()
                    for _ in xrange(random.randint(1, 100)):
                        content = u(" ").join(random.sample(domain, random.randint(5, 20)))
                        w.add_document(id=text_type(num), content=content)
                        num += 1
                    w.commit()

                    time.sleep(0.1)
Example #13
def test_boolean_find_deleted():
    # "Random" string of ones and zeros representing deleted and undeleted
    domain = "1110001010001110010101000101001011101010001011111101000101010101"

    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    # Create multiple segments just in case
    for _ in xrange(5):
        w = ix.writer()
        for c in domain:
            w.add_document(i=count, b=(c == "1"))
            count += 1
        w.commit(merge=False)

    # Delete documents where "b" is True
    with ix.writer() as w:
        w.delete_by_term("b", "t")

    with ix.searcher() as s:
        # Double check that documents with b=True are all deleted
        reader = s.reader()
        for docnum in xrange(s.doc_count_all()):
            b = s.stored_fields(docnum)["b"]
            assert b == reader.is_deleted(docnum)

        # Try doing a search for documents where b=True
        qp = qparser.QueryParser("b", ix.schema)
        q = qp.parse("b:t")
        r = s.search(q, limit=None)
        assert len(r) == 0

        # Make sure Every query doesn't match deleted docs
        r = s.search(qp.parse("*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(qp.parse("*:*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        # Make sure Not query doesn't match deleted docs
        q = qp.parse("NOT b:t")
        r = s.search(q, limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(q, limit=5)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)
Example #14
def test_numeric_ranges():
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()

    for i in xrange(400):
        w.add_document(id=i, num=i)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, target):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]
            assert result == target

        # Note that range() is always inclusive-exclusive
        check("[10 to 390]", list(range(10, 390 + 1)))
        check("[100 to]", list(range(100, 400)))
        check("[to 350]", list(range(0, 350 + 1)))
        check("[16 to 255]", list(range(16, 255 + 1)))
        check("{10 to 390]", list(range(11, 390 + 1)))
        check("[10 to 390}", list(range(10, 390)))
        check("{10 to 390}", list(range(11, 390)))
        check("{16 to 255}", list(range(17, 255)))
Example #15
    def all_doc_ids(self):
        """Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))
Example #16
def test_batchsize_eq_doccount():
    check_multi()
    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as w:
            for i in xrange(10):
                w.add_document(a=u(str(i)))
Example #17
def test_decimal_ranges():
    from decimal import Decimal

    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(int, decimal_places=2))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    count = Decimal("0.0")
    inc = Decimal("0.2")
    for _ in xrange(500):
        w.add_document(id=str(count), num=count)
        count += inc
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, start, end):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]

            target = []
            count = Decimal(start)
            limit = Decimal(end)
            while count <= limit:
                target.append(str(count))
                count += inc

            assert result == target

        check("[10.2 to 80.8]", "10.2", "80.8")
        check("{10.2 to 80.8]", "10.4", "80.8")
        check("[10.2 to 80.8}", "10.2", "80.6")
        check("{10.2 to 80.8}", "10.4", "80.6")
Example #18
def test_random_multistream():
    letters = "abcdefghijklmnopqrstuvwxyz"

    def randstring(n):
        s = "".join(random.choice(letters) for _ in xrange(n))
        return s.encode("latin1")

    domain = {}
    for _ in xrange(100):
        name = randstring(random.randint(5, 10))
        value = randstring(2500)
        domain[name] = value

    outfiles = dict((name, BytesIO(value)) for name, value in domain.items())

    with TempStorage() as st:
        msw = compound.CompoundWriter(st, buffersize=1024)
        mfiles = {}
        for name in domain:
            mfiles[name] = msw.create_file(name)
        while outfiles:
            name = random.choice(list(outfiles.keys()))
            v = outfiles[name].read(1000)
            mfiles[name].write(v)
            if len(v) < 1000:
                del outfiles[name]
        f = st.create_file("test")
        msw.save_as_compound(f)

        f = st.open_file("test")
        msr = compound.CompoundStorage(f)
        for name, value in domain.items():
            assert msr.open_file(name).read() == value
        msr.close()
Example #19
    def _read_header(self, dbfile, doccount):
        first = dbfile.read(4)  # Magic
        assert first == self.magic
        version = dbfile.read_int()  # Version number
        assert version == 1

        dc = dbfile.read_uint()  # Number of documents saved
        if doccount is None:
            doccount = dc
        assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
        self._count = doccount

        fieldcount = dbfile.read_ushort()  # Number of fields
        # Read per-field info
        for i in xrange(fieldcount):
            fieldname = dbfile.read_string().decode('utf-8')
            self.totals[fieldname] = dbfile.read_long()
            self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
            self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
            self.starts[fieldname] = i * doccount

        # Add header length to per-field offsets
        eoh = dbfile.tell()  # End of header
        for fieldname in self.starts:
            self.starts[fieldname] += eoh
Example #20
def test_deleteall():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        domain = u("alfa bravo charlie delta echo").split()
        for i, ls in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(ls))
            if not i % 10:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")),
                                   query.Term("text", u("bravo"))]))
            assert len(r) == 0

        ix.optimize()
        assert ix.doc_count_all() == 0

        with ix.reader() as r:
            assert list(r) == []
Example #21
    def __init__(self, dbfile):
        self.dbfile = dbfile
        self.map = dbfile.map

        dbfile.seek(0)
        magic = dbfile.read(4)
        if magic == b("HASH"):
            self.format = 1
            self.header_size = 16 + 256 * header_entry_size
            _pointer_struct = Struct("!Iq")  # Hash value, position
            self.hashtype = dbfile.read_byte()
            dbfile.read(3)  # Unused
            self._end_of_hashes = dbfile.read_long()
            assert self._end_of_hashes >= self.header_size
        else:
            # Old format
            self.format = self.hashtype = 0
            self.header_size = 256 * header_entry_size
            _pointer_struct = Struct("!qq")  # Hash value, position

        self.hash_func = hash_functions[self.hashtype]
        self.buckets = []
        for _ in xrange(256):
            he = unpack_header_entry(dbfile.read(header_entry_size))
            self.buckets.append(he)
        self._start_of_hashes = self.buckets[0][0]

        self.pointer_size = _pointer_struct.size
        self.unpack_pointer = _pointer_struct.unpack

        self.is_closed = False
Example #22
    def ranges_for_key(self, key):
        read = self.read
        pointer_size = self.pointer_size
        if isinstance(key, text_type):
            key = key.encode('latin-1')
        keyhash = self.hash_func(key)
        hpos, hslots = self._hashtable_info(keyhash)
        if not hslots:
            return

        slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size)
        for _ in xrange(hslots):
            slothash, pos = self.unpack_pointer(read(slotpos, pointer_size))
            if not pos:
                return

            slotpos += pointer_size
            # If we reach the end of the hashtable, wrap around
            if slotpos == hpos + (hslots * pointer_size):
                slotpos = hpos

            if slothash == keyhash:
                keylen, datalen = unpack_lengths(read(pos, lengths_size))
                if keylen == len(key):
                    if key == read(pos + lengths_size, keylen):
                        yield (pos + lengths_size + keylen, datalen)
Example #23
def cache_clear():
    """Clear the cache and cache statistics"""
    data.clear()
    stats[0] = stats[1] = stats[2] = 0
    for i in xrange(maxsize):
        clock_keys[i] = None
        clock_refs[i] = 0
Example #24
    def _write_hashes(self):
        dbfile = self.dbfile
        hashes = self.hashes
        directory = self.directory = []

        pos = dbfile.tell()
        for i in xrange(0, 256):
            entries = hashes[i]
            numslots = 2 * len(entries)
            directory.append((pos, numslots))

            null = (0, 0)
            hashtable = [null] * numslots
            for hashval, position in entries:
                n = (hashval >> 8) % numslots
                while hashtable[n] != null:
                    n = (n + 1) % numslots
                hashtable[n] = (hashval, position)

            write = dbfile.write
            for hashval, position in hashtable:
                write(self.pack_pointer(hashval, position))
                pos += self.pointer_size

        dbfile.flush()
        self._end_of_hashes = dbfile.tell()
Example #25
    def __iter__(self):
        dbfile = self.dbfile
        names = self.names
        lengths = array("I")

        dbfile.seek(self.directory_offset)
        for i in xrange(self.length):
            dbfile.seek(_LONG_SIZE, 1)
            lengths.append(dbfile.read_uint())

        dbfile.seek(self.basepos)
        for length in lengths:
            vlist = loads(dbfile.read(length) + b("."))
            vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
            yield vdict
Example #26
    def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
        """
        :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
            to write to.
        :param magic: the format tag bytes to write at the start of the file.
        :param hashtype: an integer indicating which hashing algorithm to use.
            Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
        """

        self.dbfile = dbfile
        self.hashtype = hashtype
        self.hashfn = _hash_functions[self.hashtype]
        # A place for subclasses to put extra metadata
        self.extras = {}

        self.startoffset = dbfile.tell()
        # Write format tag
        dbfile.write(magic)
        # Write hash type
        dbfile.write_byte(self.hashtype)
        # Unused future expansion bits
        dbfile.write_int(0)
        dbfile.write_int(0)

        # 256 lists of hashed keys and positions
        self.buckets = [[] for _ in xrange(256)]
        # List to remember the positions of the hash tables
        self.directory = []
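# Hedged usage sketch for the writer above; the add()/close() methods are per
# whoosh.filedb.filetables and should be treated as assumptions here.
from whoosh.filedb.filestore import RamStorage

st = RamStorage()
hw = HashWriter(st.create_file("test.hsh"), hashtype=2)
hw.add(b"alfa", b"1")
hw.add(b"bravo", b"2")
hw.close()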
Example #27
    def all_stored_fields(self):
        """Yields the stored fields for all documents.
        """

        for docnum in xrange(self.doc_count_all()):
            if not self.is_deleted(docnum):
                yield self.stored_fields(docnum)
Example #28
    def __iter__(self):
        base = 0
        for byte in self._iter_bytes():
            for i in xrange(8):
                if byte & (1 << i):
                    yield base + i
            base += 8
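# Standalone sketch of the bit iteration above: yield the index of every set
# bit, reading the underlying bytes as a little-endian bit set.
def iter_set_bits(data):
    base = 0
    for byte in data:
        for i in range(8):
            if byte & (1 << i):
                yield base + i
        base += 8

assert list(iter_set_bits(b"\x05\x80")) == [0, 2, 15]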
Example #29
    def __init__(self, dbfile, offset, expand=True):
        self.id = offset
        self.dbfile = dbfile

        dbfile.seek(offset)
        flags = dbfile.read_byte()
        self.final = bool(flags & 1)
        self._edges = {}
        if flags & 2:
            singles = flags & 4
            bytes = flags & 8

            nkeys = dbfile.read_varint()

            ptrs = dbfile.read_array("I", nkeys)
            for i in xrange(nkeys):
                ptr = ptrs[i]
                if singles:
                    if bytes:
                        charnum = dbfile.read_byte()
                    else:
                        charnum = dbfile.read_ushort()
                    self._edges[unichr(charnum)] = ptr
                else:
                    key = utf8decode(dbfile.read_string())[0]
                    if len(key) > 1 and expand:
                        self._edges[key[0]] = PatNode(dbfile, key[1:], ptr)
                    else:
                        self._edges[key] = ptr
Example #30
    def __getitem__(self, docnum):
        fixedlen = self._fixedlen
        v = self._child[docnum]
        if not v:
            return []
        ls = [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)]
        return ls
Example #31
def test_nonexclusive_read():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "readlock") as ix:
        for num in u("one two three four five").split():
            w = ix.writer()
            w.add_document(text=u("Test document %s") % num)
            w.commit(merge=False)

        def fn():
            for _ in xrange(5):
                r = ix.reader()
                assert list(r.field_terms("text")) == [
                    "document", "five", "four", "one", "test", "three", "two"
                ]
                r.close()

        ths = [threading.Thread(target=fn) for _ in xrange(5)]
        for th in ths:
            th.start()
        for th in ths:
            th.join()
Example #32
def test_multimatcher():
    schema = fields.Schema(content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    domain = ("alfa", "bravo", "charlie", "delta")

    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(domain):
            w.add_document(content=u(" ").join(ls))
        w.commit(merge=False)

    q = Term("content", "bravo")
    with ix.searcher() as s:
        m = q.matcher(s)
        while m.is_active():
            content = s.stored_fields(m.id())["content"].split()
            spans = m.spans()
            for span in spans:
                assert content[span.start] == "bravo"
            m.next()
Example #33
def test_simple_indexing():
    schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
    domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike"), u("november"))
    docs = defaultdict(list)
    with TempIndex(schema, "simple") as ix:
        with ix.writer() as w:
            for i in xrange(100):
                smp = random.sample(domain, 5)
                for word in smp:
                    docs[word].append(i)
                w.add_document(text=u(" ").join(smp), id=i)

        with ix.searcher() as s:
            for word in domain:
                rset = sorted([
                    hit["id"]
                    for hit in s.search(query.Term("text", word), limit=None)
                ])
                assert rset == docs[word]
Example #34
def read_region(dbfile, region, start=None):
    _read = dbfile.read
    _unpack = itemheader.unpack
    _headersize = itemheader.size

    start = start if start is not None else region.start
    dbfile.seek(start)

    first = True
    for i in xrange(region.length):
        keylen, vlen = _unpack(_read(_headersize))
        key = _read(keylen)
        val = _read(vlen)

        if first:
            assert key == region.minkey
            first = False

        yield key, val

    assert dbfile.tell() == region.end
Example #35
def read_gints(dbfile, n):
    """Read N integers from the bytes stream dbfile. Expects that the file
    starts at a key byte.
    """

    count = 0
    read = dbfile.read
    for _ in xrange(n):
        if count == 0:
            key = ord(dbfile.read(1))
        code = key >> (count * 2) & 3
        if code == 0:
            yield ord(read(1))
        elif code == 1:
            yield unpack_ushort_le(read(2))[0]
        elif code == 2:
            yield unpack_uint_le(read(3) + b"\x00")[0]
        else:
            yield unpack_uint_le(read(4))[0]

        count = (count + 1) % 4
Example #36
    def read_nums(self, f, n):
        """Read N integers from the bytes stream dbfile. Expects that the file
        is positioned at a key byte.
        """

        count = 0
        key = None
        for _ in xrange(n):
            if count == 0:
                key = f.read_byte()
            code = key >> (count * 2) & 3
            if code == 0:
                yield f.read_byte()
            elif code == 1:
                yield f.read_ushort_le()
            elif code == 2:
                yield unpack_uint_le(f.read(3) + b"\x00")[0]
            else:
                yield f.read_uint_le()

            count = (count + 1) % 4
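# Standalone sketch of the format read by read_gints/read_nums above: each key
# byte packs four 2-bit length codes (0=1, 1=2, 2=3, 3=4 bytes), followed by
# that many little-endian unsigned integers.
import struct
from io import BytesIO

def decode_gints(data, n):
    f = BytesIO(data)
    count = 0
    key = 0
    out = []
    for _ in range(n):
        if count == 0:
            key = f.read(1)[0]
        nbytes = ((key >> (count * 2)) & 3) + 1
        raw = f.read(nbytes) + b"\x00" * (4 - nbytes)
        out.append(struct.unpack("<I", raw)[0])
        count = (count + 1) % 4
    return out

# Key byte 0xE4 encodes the codes [0, 1, 2, 3]: a 1-, 2-, 3- and 4-byte value.
data = b"\xe4\x07" + b"\x2c\x01" + b"\x70\x11\x01" + b"\xa0\x86\x01\x00"
assert decode_gints(data, 4) == [7, 300, 70000, 100000]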
Example #37
def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = [
            "alfa", "bravo", "charlie", "delta", "echo", "foxtrot", "golf",
            "hotel", "india", "juliet", "kilo", "lima"
        ]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)
Example #38
def test_lengths2():
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(u("alfa bravo charlie").split()):
            if "bravo" in ls and "charlie" in ls:
                count += 1
            w.add_document(text=u(" ").join(ls))
        w.commit(merge=False)

    with ix.searcher() as s:
        q = query.Or(
            [query.Term("text", u("bravo")),
             query.Term("text", u("charlie"))])
        r = s.search(q, limit=None)
        assert len(r) == count

        r = s.search(q, limit=3)
        assert len(r) == count
Example #39
def test_exclusion():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(stored=True), date=fields.DATETIME)
    ix = RamStorage().create_index(schema)
    dt1 = datetime(1950, 1, 1)
    dt2 = datetime(1960, 1, 1)
    with ix.writer() as w:
        # Make one document with date == dt1, then 39 documents with
        # date == dt2 (i.e. != dt1).
        for i in xrange(40):
            w.add_document(id=u(str(i)), date=(dt2 if i >= 1 else dt1))

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)
        # Find documents where date != dt1
        q = qp.parse("NOT (date:(19500101000000))")

        r = s.search(q, limit=None)
        assert len(r) == 39  # Total number of matched documents
        assert r.scored_length() == 39  # Number of docs in the results
Example #40
def test_combos():
    qs = ('w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) ' +
          '(apple ANDNOT bear)^2.3')

    init_args = {
        plugins.MultifieldPlugin: (["content", "title"], {
            "content": 1.0,
            "title": 1.2
        }),
        plugins.FieldAliasPlugin: ({
            "content": ("text", "body")
        }, ),
        plugins.MultifieldPlugin: (["title", "content"], ),
        plugins.CopyFieldPlugin: ({
            "name": "phone"
        }, ),
        plugins.PseudoFieldPlugin: ({
            "name": lambda x: x
        }),
    }

    pis = _plugin_classes(())
    for i, plugin in enumerate(pis):
        try:
            pis[i] = plugin(*init_args.get(plugin, ()))
        except TypeError:
            raise TypeError("Error instantiating %s" % plugin)

    count = 0
    for i, first in enumerate(pis):
        for j in xrange(len(pis)):
            if i == j: continue
            plist = [p for p in pis[:j] if p is not first] + [first]
            qp = qparser.QueryParser("text", None, plugins=plist)
            try:
                qp.parse(qs)
            except Exception:
                e = sys.exc_info()[1]
                raise Exception(str(e) + " combo: %s %r" % (count, plist))
            count += 1
Example #41
    def insert(self, word):
        """Add the given "word" (a string or list of strings) to the graph.
        Words must be inserted in sorted order.
        """

        lw = self.lastword
        prefixlen = 0
        if lw:
            if self._field_root and lw[0] != word[0]:
                # If field_root == True, caller can add entire fields out-of-
                # order (but not individual terms)
                pass
            elif word < lw:
                raise Exception("Out of order %r..%r." % (self.lastword, word))
            else:
                # find common prefix between word and previous word
                for i in xrange(min(len(word), len(lw))):
                    if word[i] != lw[i]: break
                    prefixlen += 1

        # Check the unchecked for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at
        # that point.
        self._minimize(prefixlen)

        # Add the suffix, starting from the correct node mid-way through the
        # graph
        if not self.unchecked:
            node = self.root
        else:
            node = self.unchecked[-1][2]

        for letter in word[prefixlen:]:
            nextnode = BuildNode()
            node.put(letter, nextnode)
            self.unchecked.append((node, letter, nextnode))
            node = nextnode

        node.final = True
        self.lastword = word
Example #42
    def __getitem__(self, num):
        if num > self.length - 1:
            raise IndexError("Tried to get document %s, file has %s" %
                             (num, self.length))

        dbfile = self.dbfile
        start = self.directory_offset + num * stored_pointer_size
        dbfile.seek(start)
        ptr = dbfile.read(stored_pointer_size)
        if len(ptr) != stored_pointer_size:
            raise Exception("Error reading %r @%s %s < %s" %
                            (dbfile, start, len(ptr), stored_pointer_size))
        position, length = unpack_stored_pointer(ptr)
        vlist = loads(dbfile.map[position:position + length] + b("."))

        names = self.names
        # Recreate a dictionary by putting the field names and values back
        # together by position. We can't just use dict(zip(...)) because we
        # want to filter out the None values.
        vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
        return vdict
Example #43
def test_20000_batch():
    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000batch") as ix:
        domain = [
            "alfa", "bravo", "charlie", "delta", "echo", "foxtrot", "golf",
            "hotel", "india", "juliet", "kilo", "lima"
        ]

        t = now()
        w = ix.writer()
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            if not i % 100:
                w.commit()
                w = ix.writer()
        w.commit()
        print("Write batch:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize batch:", now() - t)
Example #44
    def suggest(self, text, limit=5, maxdist=2, prefix=0):
        """
        :param text: the text to check. This word will **not** be added to the
            suggestions, even if it appears in the word graph.
        :param limit: only return up to this many suggestions. If there are not
            enough terms in the field within ``maxdist`` of the given word, the
            returned list will be shorter than this number.
        :param maxdist: the largest edit distance from the given word to look
            at. Numbers higher than 2 are not very effective or efficient.
        :param prefix: require suggestions to share a prefix of this length
            with the given word. This is often justifiable since most
            misspellings do not involve the first letter of the word. Using a
            prefix dramatically decreases the time it takes to generate the
            list of words.
        """

        _suggestions = self._suggestions

        heap = []
        seen = set([text])
        for k in xrange(1, maxdist + 1):
            for item in _suggestions(text, k, prefix):
                if item[1] in seen:
                    continue
                seen.add(item[1])

                # Note that the *higher* scores (item[0]) are better!
                if len(heap) < limit:
                    heappush(heap, item)
                elif item > heap[0]:
                    heapreplace(heap, item)

            # If the heap is already at the required length, don't bother going
            # to a higher edit distance
            if len(heap) >= limit:
                break

        sugs = sorted(heap, key=lambda item: (0 - item[0], item[1]))
        return [sug for _, sug in sugs]
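# Hedged usage sketch: in Whoosh, suggest() is normally reached through a
# searcher's per-field corrector. The schema and documents are illustrative.
from whoosh import fields
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(text=fields.TEXT(spelling=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(text=u"spelling spinning special")
with ix.searcher() as s:
    corrector = s.corrector("text")
    print(corrector.suggest("speling", limit=3, maxdist=2, prefix=1))
    # e.g. ['spelling', ...]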
Example #45
    def _compress(self, inarray, inoffset, n):
        _numsize = self._numsize
        _bitsize = self._bitsize
        _num = self._num
        _bits = self._bits

        for key in xrange(_numsize):
            value = key << _bitsize
            num = _num[key] if _num[key] < n else n
            bits = 0

            j = 0
            while j < num and inarray[inoffset + j] < (1 << _bits[key][j]):
                x = inarray[inoffset + j]
                value |= x << bits
                bits += _bits[key][j]
                j += 1

            if j == num:
                return value, num

        raise Exception
Example #46
def _check_writer(name, writer_fn):
    schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
    domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike"), u("november"))
    docs = defaultdict(list)
    with TempIndex(schema, name) as ix:
        w = writer_fn(ix)
        for i in xrange(1000):
            smp = random.sample(domain, 5)
            for word in smp:
                docs[word].append(i)
            w.add_document(text=u(" ").join(smp), id=i)
        w.commit()

        with ix.searcher() as s:
            for word in domain:
                print(word)
                rset = sorted([hit["id"] for hit
                               in s.search(query.Term("text", word),
                                           limit=None)])
                assert_equal(rset, docs[word])
Example #47
    def set_searcher(self, segment_searcher, docoffset):
        fieldname = self._fieldname
        self._segment_searcher = segment_searcher
        reader = segment_searcher.reader()

        if self._use_vectors:
            pass
        elif self._use_column:
            self._creader = reader.column_reader(fieldname, translate=False)
        else:
            # Otherwise, cache the values in each document in a huge list
            # of lists
            dc = segment_searcher.doc_count_all()
            field = segment_searcher.schema[fieldname]
            from_bytes = field.from_bytes

            self._lists = [[] for _ in xrange(dc)]
            for btext in field.sortable_terms(reader, fieldname):
                text = from_bytes(btext)
                postings = reader.postings(fieldname, btext)
                for docid in postings.all_ids():
                    self._lists[docid].append(text)
Example #48
def test_buffered_threads():
    domain = u("alfa bravo charlie delta").split()
    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:

        class SimWriter(threading.Thread):
            def run(self):
                for _ in xrange(5):
                    w.update_document(name=random.choice(domain))
                    time.sleep(random.uniform(0.01, 0.1))

        w = writing.BufferedWriter(ix, limit=10)
        threads = [SimWriter() for _ in xrange(5)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        w.close()

        with ix.reader() as r:
            assert r.doc_count() == 4
            assert sorted([d["name"] for d in r.all_stored_fields()]) == domain
Example #49
def test_page_counts():
    from whoosh.scoring import Frequency

    schema = fields.Schema(id=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for i in xrange(10):
        w.add_document(id=text_type(i))
    w.commit()

    with ix.searcher(weighting=Frequency) as s:
        q = query.Every("id")

        r = s.search(q)
        assert len(r) == 10

        with pytest.raises(ValueError):
            s.search_page(q, 0)

        r = s.search_page(q, 1, 5)
        assert len(r) == 10
        assert r.pagecount == 2

        r = s.search_page(q, 1, 5)
        assert len(r) == 10
        assert r.pagecount == 2

        r = s.search_page(q, 2, 5)
        assert len(r) == 10
        assert r.pagecount == 2
        assert r.pagenum == 2

        r = s.search_page(q, 1, 10)
        assert len(r) == 10
        assert r.pagecount == 1
        assert r.pagenum == 1
Example #50
def parse_record(data, tags=None):
    leader = data[:LEADER_LEN]
    assert len(leader) == LEADER_LEN

    dataoffset = int(data[12:17])
    assert dataoffset > 0
    assert dataoffset < len(data)

    # dataoffset - 1 to avoid END-OF-FIELD byte
    dirstart = LEADER_LEN
    dirend = dataoffset - 1

    # Number of fields in record
    assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
    field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN

    result = {}
    for i in xrange(field_count):
        start = dirstart + i * DIRECTORY_ENTRY_LEN
        end = start + DIRECTORY_ENTRY_LEN
        tag = data[start:start + 3]
        if tags and not tag in tags:
            continue

        entry = data[start:end]
        elen = int(entry[3:7])
        offset = dataoffset + int(entry[7:12])
        edata = data[offset:offset + elen - 1]

        if not (tag < "010" and tag.isdigit()):
            edata = edata.split(SUBFIELD_INDICATOR)[1:]
            if tag in result:
                result[tag].extend(edata)
            else:
                result[tag] = edata
        else:
            result[tag] = edata
    return result
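# The module-level constants parse_record relies on, filled in here as a
# sketch using the standard MARC21 values (not copied from the original file):
LEADER_LEN = 24              # a MARC21 leader is always 24 bytes
DIRECTORY_ENTRY_LEN = 12     # 3-byte tag + 4-byte length + 5-byte offset
SUBFIELD_INDICATOR = "\x1f"  # unit-separator character introducing a subfield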
Example #51
    def read_ids(self):
        postfile = self.postfile
        offset = self.dataoffset
        postcount = self.count
        postfile.seek(offset)

        if self.stringids:
            rs = postfile.read_string
            ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
            newoffset = postfile.tell()
        elif self.idslen:
            ids = array("I")
            ids.fromstring(decompress(postfile.read(self.idslen)))
            if IS_LITTLE:
                ids.byteswap()
            newoffset = offset + self.idslen
        else:
            ids = postfile.read_array("I", postcount)
            newoffset = offset + _INT_SIZE * postcount

        self.ids = ids
        self.weights_offset = newoffset
        return ids
Example #52
    def _read_part(self):
        scored = self._scored
        boost = self._boost
        limit = min(self._docnum + self._partsize, self._doccount)
        offset = self._docnum
        a = self._a

        # Clear the array
        for i in xrange(self._partsize):
            a[i] = 0

        # Add the scores from the submatchers into the array
        for m in self._submatchers:
            while m.is_active() and m.id() < limit:
                i = m.id() - offset
                if scored:
                    a[i] += m.score() * boost
                else:
                    a[i] = 1
                m.next()

        self._offset = offset
        self._limit = limit
Example #53
def test_filter_by_result():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))

    with TempIndex(schema, "filter") as ix:
        words = u("foo bar baz qux barney").split()
        with ix.writer() as w:
            for x in xrange(100):
                t = u("even" if x % 2 == 0 else "odd")
                c = words[x % len(words)]
                w.add_document(title=t, content=c)

        with ix.searcher() as searcher:
            fq = query.Term("title", "even")
            filter_result = searcher.search(fq)
            assert filter_result.docset is None

            q = query.Term("content", "foo")

            # filter_result.docs()
            result = searcher.search(q, filter=filter_result)
            assert all(x["title"] == "even" and x["content"] == "foo"
                       for x in result)
Example #54
def test_compound_sort():
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        sortedby = [sorting.FieldFacet("a"),
                    sorting.FieldFacet("b", reverse=True),
                    sorting.FieldFacet("c")]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        assert output == [
            "alfa charlie charlie",
            "alfa charlie india",
            "alfa bravo echo",
            "alfa alfa alfa",
            "alfa alfa golf",
            "bravo charlie foxtrot",
            "bravo bravo bravo",
            "bravo bravo hotel",
            "bravo alfa delta",
            "bravo alfa juliet",
        ]
Example #55
def test_random():
    def randstring():
        length = random.randint(1, 5)
        a = array("B", (random.randint(0, 255) for _ in xrange(length)))
        return array_tobytes(a)

    keys = sorted(randstring() for _ in xrange(100))

    with TempStorage() as st:
        gwrite(keys, st)
        gr = greader(st)
        cur = fst.Cursor(gr)
        s1 = cur.flatten()
        s2 = sorted(set(keys))
        for i, (k1, k2) in enumerate(zip(s1, s2)):
            assert k1 == k2, "%s: %r != %r" % (i, k1, k2)

        sample = list(keys)
        random.shuffle(sample)
        for key in sample:
            cur.reset()
            cur.find_path(key)
            assert cur.prefix_bytes() == key
        gr.close()
Example #56
def make_multi_index(ix):
    for i in xrange(0, len(docs), 3):
        w = ix.writer()
        for doc in docs[i:i + 3]:
            w.add_document(ev=u("a"), **doc)
        w.commit(merge=False)
Example #57
def randstring(min, max):
    return "".join(
        chr(randint(1, 255)) for _ in xrange(randint(min, max)))
Example #58
    def all_ids(self):
        missing = self.missing
        negs = set(self.child.all_ids())
        return (id for id in xrange(self.limit)
                if id not in negs and not missing(id))
Example #59
# Cache the varint encodings of small integers below so that we don't have to
# constantly recalculate them on the fly. This makes a small but noticeable
# difference.


def _varint(i):
    a = array("B")
    while (i & ~0x7F) != 0:
        a.append((i & 0x7F) | 0x80)
        i = i >> 7
    a.append(i)
    return array_tobytes(a)


_varint_cache_size = 512
_varint_cache = []
for i in xrange(0, _varint_cache_size):
    _varint_cache.append(_varint(i))
_varint_cache = tuple(_varint_cache)


def varint(i):
    """Encodes the given integer into a string of the minimum number  of bytes.
    """
    if i < len(_varint_cache):
        return _varint_cache[i]
    return _varint(i)


def varint_to_int(vi):
    b = ord(vi[0])
    p = 1
    # Rest of the decoder, sketched to mirror the encoder above: accumulate
    # seven bits per byte (low bits first) while the continuation bit is set.
    i = b & 0x7F
    shift = 7
    while b & 0x80:
        b = ord(vi[p])
        p += 1
        i |= (b & 0x7F) << shift
        shift += 7
    return i
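# Worked examples for varint() above (assumes whoosh.compat's xrange and
# array_tobytes, as imported by the original module). Each output byte carries
# seven low-order bits of the value plus a continuation flag in the high bit.
assert varint(0) == b"\x00"
assert varint(5) == b"\x05"
assert varint(300) == b"\xac\x02"          # 300 -> 0b0101100 | 0x80, then 0b10
assert varint(2 ** 20) == b"\x80\x80\x40"
# varint_to_int(varint(300)) == 300 round-trips the encoding (note the
# decoder's Py2-style ord() indexing).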
Example #60
    def decorating_function(user_function):
        stats = [0, 0, 0]  # hits, misses, hand
        data = {}

        if maxsize:
            # The keys at each point on the clock face
            clock_keys = [None] * maxsize
            # The "referenced" bits at each point on the clock face
            clock_refs = array("B", (0 for _ in xrange(maxsize)))
            lock = Lock()

            @functools.wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    with lock:
                        pos, result = data[key]
                        # The key is in the cache. Set the key's reference bit
                        clock_refs[pos] = 1
                        # Record a cache hit
                        stats[0] += 1
                except KeyError:
                    # Compute the value
                    result = user_function(*args)
                    with lock:
                        # Current position of the clock hand
                        hand = stats[2]
                        # Remember to stop here after a full revolution
                        end = hand
                        # Sweep around the clock looking for a position with
                        # the reference bit off
                        while True:
                            hand = (hand + 1) % maxsize
                            current_ref = clock_refs[hand]
                            if current_ref:
                                # This position's "referenced" bit is set. Turn
                                # the bit off and move on.
                                clock_refs[hand] = 0
                            elif not current_ref or hand == end:
                                # We've either found a position with the
                                # "reference" bit off or reached the end of the
                                # circular cache. So we'll replace this
                                # position with the new key
                                current_key = clock_keys[hand]
                                if current_key in data:
                                    del data[current_key]
                                clock_keys[hand] = key
                                clock_refs[hand] = 1
                                break
                        # Put the key and result in the cache
                        data[key] = (hand, result)
                        # Save the new hand position
                        stats[2] = hand
                        # Record a cache miss
                        stats[1] += 1
                return result

        else:
            @functools.wraps(user_function)
            def wrapper(*args):
                key = args
                try:
                    result = data[key]
                    stats[0] += 1
                except KeyError:
                    result = user_function(*args)
                    data[key] = result
                    stats[1] += 1
                return result

        def cache_info():
            return stats[0], stats[1], maxsize, len(data)

        def cache_clear():
            """Clear the cache and cache statistics"""
            data.clear()
            stats[0] = stats[1] = stats[2] = 0
            for i in xrange(maxsize):
                clock_keys[i] = None
                clock_refs[i] = 0

        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return wrapper