def suggestions_and_scores(self, text, weighting=None):
    if weighting is None:
        weighting = scoring.TF_IDF()

    grams = defaultdict(list)
    for size in xrange(self.mingram, self.maxgram + 1):
        key = "gram%s" % size
        nga = analysis.NgramAnalyzer(size)
        for t in nga(text):
            grams[key].append(t.text)

    queries = []
    for size in xrange(self.mingram, min(self.maxgram + 1, len(text))):
        key = "gram%s" % size
        gramlist = grams[key]
        queries.append(query.Term("start%s" % size, gramlist[0],
                                  boost=self.booststart))
        queries.append(query.Term("end%s" % size, gramlist[-1],
                                  boost=self.boostend))
        for gram in gramlist:
            queries.append(query.Term(key, gram))

    q = query.Or(queries)
    ix = self.index()
    s = ix.searcher(weighting=weighting)
    try:
        result = s.search(q, limit=None)
        return [(fs["word"], fs["score"], result.score(i))
                for i, fs in enumerate(result) if fs["word"] != text]
    finally:
        s.close()

def test_roundtrip():
    _rt(columns.VarBytesColumn(),
        [b("a"), b("ccc"), b("bbb"), b("e"), b("dd")], b(""))
    _rt(columns.FixedBytesColumn(5),
        [b("aaaaa"), b("eeeee"), b("ccccc"), b("bbbbb"), b("eeeee")],
        b("\x00") * 5)
    _rt(columns.RefBytesColumn(),
        [b("a"), b("ccc"), b("bb"), b("ccc"), b("a"), b("bb")], b(""))
    _rt(columns.RefBytesColumn(3),
        [b("aaa"), b("bbb"), b("ccc"), b("aaa"), b("bbb"), b("ccc")],
        b("\x00") * 3)
    _rt(columns.StructColumn("ifH", (0, 0.0, 0)),
        [(100, 1.5, 15000), (-100, -5.0, 0), (5820, 6.5, 462),
         (-57829, -1.5, 6), (0, 0, 0)],
        (0, 0.0, 0))

    numcol = columns.NumericColumn
    _rt(numcol("b"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("B"), [10, 20, 30, 25, 15], 0)
    _rt(numcol("h"), [1000, -2000, 3000, -15000, 32000], 0)
    _rt(numcol("H"), [1000, 2000, 3000, 15000, 50000], 0)
    _rt(numcol("i"), [2 ** 16, -(2 ** 20), 2 ** 24, -(2 ** 28), 2 ** 30], 0)
    _rt(numcol("I"), [2 ** 16, 2 ** 20, 2 ** 24, 2 ** 28, 2 ** 31 & 0xFFFFFFFF], 0)
    _rt(numcol("q"), [10, -20, 30, -25, 15], 0)
    _rt(numcol("Q"), [2 ** 35, 2 ** 40, 2 ** 48, 2 ** 52, 2 ** 63], 0)
    _rt(numcol("f"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)
    _rt(numcol("d"), [1.5, -2.5, 3.5, -4.5, 1.25], 0)

    c = columns.BitColumn(compress_at=10)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(70)], False)
    _rt(c, [bool(random.randint(0, 1)) for _ in xrange(90)], False)

    c = columns.PickleColumn(columns.VarBytesColumn())
    _rt(c, [None, True, False, 100, -7, "hello"], None)

def damerau_levenshtein(seq1, seq2, limit=None):
    """Returns the Damerau-Levenshtein edit distance between two strings.
    """

    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                    and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)

        if limit and x > limit and min(thisrow) > limit:
            return limit + 1
    return thisrow[len(seq2) - 1]

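# A few spot checks of the function above; the expected values follow directly
# from the definition of Damerau-Levenshtein distance (substitution, insertion,
# deletion, and adjacent transposition each cost 1).
assert damerau_levenshtein("abc", "abc") == 0
assert damerau_levenshtein("abc", "abcd") == 1   # one insertion
assert damerau_levenshtein("abcd", "acbd") == 1  # one adjacent transposition
assert damerau_levenshtein("kitten", "sitting") == 3
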
def test_datetime():
    dtf = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=dtf)
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for month in xrange(1, 12):
        for day in xrange(1, 28):
            w.add_document(id=u("%s-%s") % (month, day),
                           date=datetime(2010, month, day, 14, 0, 0))
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("date:20100523"))
        assert len(r) == 1
        assert r[0]["id"] == "5-23"
        assert r[0]["date"].__class__ is datetime
        assert r[0]["date"].month == 5
        assert r[0]["date"].day == 23

        r = s.search(qp.parse("date:'2010 02'"))
        assert len(r) == 27

        q = qp.parse(u("date:[2010-05 to 2010-08]"))
        startdt = datetime(2010, 5, 1, 0, 0, 0, 0)
        enddt = datetime(2010, 8, 31, 23, 59, 59, 999999)
        assert q.__class__ is query.NumericRange
        assert q.start == times.datetime_to_long(startdt)
        assert q.end == times.datetime_to_long(enddt)

def test_charboost_postings():
    postings = []
    docnum = 0
    for _ in xrange(0, 20):
        docnum += randint(1, 10)
        posns = []
        pos = 0
        endchar = 0
        for __ in xrange(0, randint(1, 10)):
            pos += randint(1, 10)
            startchar = endchar + randint(3, 10)
            endchar = startchar + randint(3, 10)
            boost = byte_to_float(float_to_byte(random() * 2))
            posns.append((pos, startchar, endchar, boost))
        postings.append((docnum, posns))

    assert_equal(postings, roundtrip(postings, CharacterBoosts(),
                                     "character_boosts"))

    as_chars = [(docnum, [(pos, sc, ec) for pos, sc, ec, bst in posns])
                for docnum, posns in postings]
    assert_equal(as_chars, roundtrip(postings, CharacterBoosts(),
                                     "characters"))

    as_posbsts = [(docnum, [(pos, bst) for pos, sc, ec, bst in posns])
                  for docnum, posns in postings]
    assert_equal(as_posbsts, roundtrip(postings, CharacterBoosts(),
                                       "position_boosts"))

    as_posns = [(docnum, [pos for pos, sc, ec, bst in posns])
                for docnum, posns in postings]
    assert_equal(as_posns, roundtrip(postings, CharacterBoosts(), "positions"))

    as_freq = [(docnum, len(posns)) for docnum, posns in as_posns]
    assert_equal(as_freq, roundtrip(postings, CharacterBoosts(), "frequency"))

def test_merge_random():
    items1 = sorted((random_name(4), random_name(8)) for _ in xrange(500))
    items2 = sorted((random_name(4), random_name(8)) for _ in xrange(500))

    x1 = sorted(dict(items1 + items2).items())
    x2 = list(kv.merge_items(items1, items2))
    assert x1 == x2

def fill(self, docnum):
    if docnum > self._count:
        if self._refs is not None:
            self._refs.extend(0 for _ in xrange(docnum - self._count))
        else:
            dbfile = self._dbfile
            for _ in xrange(docnum - self._count):
                dbfile.write_ushort(0)

def read_qsafe_array(typecode, size, dbfile):
    if typecode == "q":
        arry = [dbfile.read_long() for _ in xrange(size)]
    elif typecode == "Q":
        arry = [dbfile.read_ulong() for _ in xrange(size)]
    else:
        arry = dbfile.read_array(typecode, size)

    return arry

def __iter__(self):
    i = 0
    for num in self._bitset:
        if num > i:
            for _ in xrange(num - i):
                yield False
        yield True
        i = num + 1
    if self._doccount > i:
        for _ in xrange(self._doccount - i):
            yield False

def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode="", **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value

    inlen = len(value)
    t = Token(positions, chars, removestops=removestops, mode=mode)
    pos = start_pos

    if mode == "query":
        size = min(self.max, inlen)
        for start in xrange(0, inlen - size + 1):
            end = start + size
            if end > inlen:
                continue
            t.text = value[start:end]
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t
            pos += 1
    else:
        for start in xrange(0, inlen - self.min + 1):
            for size in xrange(self.min, self.max + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1

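# A small, self-contained sketch (not the library class) of what the indexing
# branch above emits: every substring of length min..max at each start offset,
# skipping windows that would run past the end of the value.
def ngram_slices(value, minsize, maxsize):
    out = []
    for start in range(0, len(value) - minsize + 1):
        for size in range(minsize, maxsize + 1):
            end = start + size
            if end > len(value):
                continue
            out.append(value[start:end])
    return out

assert ngram_slices("whoosh", 3, 4) == [
    "who", "whoo", "hoo", "hoos", "oos", "oosh", "osh"
]
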
def _read_weights(self):
    # If we haven't loaded the data from disk yet, load it now
    if self._data is None:
        self._read_data()
    weights = self._data[1]

    # De-minify the weights
    postcount = self._blocklength
    if weights is None:
        self._weights = array("f", (1.0 for _ in xrange(postcount)))
    elif isinstance(weights, float):
        self._weights = array("f", (weights for _ in xrange(postcount)))
    else:
        self._weights = weights

def run(self):
    ix = st.create_index(dir, schema)
    num = 0

    for i in xrange(50):
        print(i)
        w = ix.writer()
        for _ in xrange(random.randint(1, 100)):
            content = u(" ").join(random.sample(domain,
                                                random.randint(5, 20)))
            w.add_document(id=text_type(num), content=content)
            num += 1
        w.commit()

        time.sleep(0.1)

def test_boolean_find_deleted():
    # "Random" string of ones and zeros representing deleted and undeleted
    domain = "1110001010001110010101000101001011101010001011111101000101010101"

    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    # Create multiple segments just in case
    for _ in xrange(5):
        w = ix.writer()
        for c in domain:
            w.add_document(i=count, b=(c == "1"))
            count += 1
        w.commit(merge=False)

    # Delete documents where "b" is True
    with ix.writer() as w:
        w.delete_by_term("b", "t")

    with ix.searcher() as s:
        # Double check that documents with b=True are all deleted
        reader = s.reader()
        for docnum in xrange(s.doc_count_all()):
            b = s.stored_fields(docnum)["b"]
            assert b == reader.is_deleted(docnum)

        # Try doing a search for documents where b=True
        qp = qparser.QueryParser("b", ix.schema)
        q = qp.parse("b:t")
        r = s.search(q, limit=None)
        assert len(r) == 0

        # Make sure Every query doesn't match deleted docs
        r = s.search(qp.parse("*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(qp.parse("*:*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        # Make sure Not query doesn't match deleted docs
        q = qp.parse("NOT b:t")
        r = s.search(q, limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(q, limit=5)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

def test_numeric_ranges():
    schema = fields.Schema(id=fields.STORED, num=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    for i in xrange(400):
        w.add_document(id=i, num=i)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, target):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]
            assert result == target

        # Note that range() is always inclusive-exclusive
        check("[10 to 390]", list(range(10, 390 + 1)))
        check("[100 to]", list(range(100, 400)))
        check("[to 350]", list(range(0, 350 + 1)))

        check("[16 to 255]", list(range(16, 255 + 1)))
        check("{10 to 390]", list(range(11, 390 + 1)))
        check("[10 to 390}", list(range(10, 390)))
        check("{10 to 390}", list(range(11, 390)))
        check("{16 to 255}", list(range(17, 255)))

def all_doc_ids(self):
    """Returns an iterator of all (undeleted) document IDs in the reader.
    """

    is_deleted = self.is_deleted
    return (docnum for docnum in xrange(self.doc_count_all())
            if not is_deleted(docnum))

def test_batchsize_eq_doccount():
    check_multi()
    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as w:
            for i in xrange(10):
                w.add_document(a=u(str(i)))

def test_decimal_ranges():
    from decimal import Decimal

    schema = fields.Schema(id=fields.STORED,
                           num=fields.NUMERIC(int, decimal_places=2))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    count = Decimal("0.0")
    inc = Decimal("0.2")
    for _ in xrange(500):
        w.add_document(id=str(count), num=count)
        count += inc
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("num", schema)

        def check(qs, start, end):
            q = qp.parse(qs)
            result = [s.stored_fields(d)["id"] for d in q.docs(s)]

            target = []
            count = Decimal(start)
            limit = Decimal(end)
            while count <= limit:
                target.append(str(count))
                count += inc

            assert result == target

        check("[10.2 to 80.8]", "10.2", "80.8")
        check("{10.2 to 80.8]", "10.4", "80.8")
        check("[10.2 to 80.8}", "10.2", "80.6")
        check("{10.2 to 80.8}", "10.4", "80.6")

def test_random_multistream():
    letters = "abcdefghijklmnopqrstuvwxyz"

    def randstring(n):
        s = "".join(random.choice(letters) for _ in xrange(n))
        return s.encode("latin1")

    domain = {}
    for _ in xrange(100):
        name = randstring(random.randint(5, 10))
        value = randstring(2500)
        domain[name] = value

    outfiles = dict((name, BytesIO(value)) for name, value in domain.items())

    with TempStorage() as st:
        msw = compound.CompoundWriter(st, buffersize=1024)
        mfiles = {}
        for name in domain:
            mfiles[name] = msw.create_file(name)
        while outfiles:
            name = random.choice(list(outfiles.keys()))
            v = outfiles[name].read(1000)
            mfiles[name].write(v)
            if len(v) < 1000:
                del outfiles[name]
        f = st.create_file("test")
        msw.save_as_compound(f)

        f = st.open_file("test")
        msr = compound.CompoundStorage(f)
        for name, value in domain.items():
            assert msr.open_file(name).read() == value
        msr.close()

def _read_header(self, dbfile, doccount):
    first = dbfile.read(4)  # Magic
    assert first == self.magic
    version = dbfile.read_int()  # Version number
    assert version == 1

    dc = dbfile.read_uint()  # Number of documents saved
    if doccount is None:
        doccount = dc
    assert dc == doccount, "read=%s argument=%s" % (dc, doccount)
    self._count = doccount

    fieldcount = dbfile.read_ushort()  # Number of fields
    # Read per-field info
    for i in xrange(fieldcount):
        fieldname = dbfile.read_string().decode('utf-8')
        self.totals[fieldname] = dbfile.read_long()
        self.minlens[fieldname] = byte_to_length(dbfile.read_byte())
        self.maxlens[fieldname] = byte_to_length(dbfile.read_byte())
        self.starts[fieldname] = i * doccount

    # Add header length to per-field offsets
    eoh = dbfile.tell()  # End of header
    for fieldname in self.starts:
        self.starts[fieldname] += eoh

def test_deleteall():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        domain = u("alfa bravo charlie delta echo").split()
        for i, ls in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(ls))
            if not i % 10:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")),
                                   query.Term("text", u("bravo"))]))
            assert len(r) == 0

        ix.optimize()
        assert ix.doc_count_all() == 0

        with ix.reader() as r:
            assert list(r) == []

def __init__(self, dbfile):
    self.dbfile = dbfile
    self.map = dbfile.map

    dbfile.seek(0)
    magic = dbfile.read(4)
    if magic == b("HASH"):
        self.format = 1
        self.header_size = 16 + 256 * header_entry_size
        _pointer_struct = Struct("!Iq")  # Hash value, position
        self.hashtype = dbfile.read_byte()
        dbfile.read(3)  # Unused
        self._end_of_hashes = dbfile.read_long()
        assert self._end_of_hashes >= self.header_size
    else:
        # Old format
        self.format = self.hashtype = 0
        self.header_size = 256 * header_entry_size
        _pointer_struct = Struct("!qq")  # Hash value, position

    self.hash_func = hash_functions[self.hashtype]
    self.buckets = []
    for _ in xrange(256):
        he = unpack_header_entry(dbfile.read(header_entry_size))
        self.buckets.append(he)
    self._start_of_hashes = self.buckets[0][0]

    self.pointer_size = _pointer_struct.size
    self.unpack_pointer = _pointer_struct.unpack

    self.is_closed = False

def ranges_for_key(self, key):
    read = self.read
    pointer_size = self.pointer_size
    if isinstance(key, text_type):
        key = key.encode('latin-1')
    keyhash = self.hash_func(key)
    hpos, hslots = self._hashtable_info(keyhash)
    if not hslots:
        return

    slotpos = hpos + (((keyhash >> 8) % hslots) * pointer_size)
    for _ in xrange(hslots):
        slothash, pos = self.unpack_pointer(read(slotpos, pointer_size))
        if not pos:
            return

        slotpos += pointer_size
        # If we reach the end of the hashtable, wrap around
        if slotpos == hpos + (hslots * pointer_size):
            slotpos = hpos

        if slothash == keyhash:
            keylen, datalen = unpack_lengths(read(pos, lengths_size))
            if keylen == len(key):
                if key == read(pos + lengths_size, keylen):
                    yield (pos + lengths_size + keylen, datalen)

def cache_clear():
    """Clear the cache and cache statistics"""

    data.clear()
    stats[0] = stats[1] = stats[2] = 0
    for i in xrange(maxsize):
        clock_keys[i] = None
        clock_refs[i] = 0

def _write_hashes(self):
    dbfile = self.dbfile
    hashes = self.hashes
    directory = self.directory = []

    pos = dbfile.tell()
    for i in xrange(0, 256):
        entries = hashes[i]
        numslots = 2 * len(entries)
        directory.append((pos, numslots))

        null = (0, 0)
        hashtable = [null] * numslots
        for hashval, position in entries:
            n = (hashval >> 8) % numslots
            while hashtable[n] != null:
                n = (n + 1) % numslots
            hashtable[n] = (hashval, position)

        write = dbfile.write
        for hashval, position in hashtable:
            write(self.pack_pointer(hashval, position))
            pos += self.pointer_size

    dbfile.flush()
    self._end_of_hashes = dbfile.tell()

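# A toy version (not library code) of the per-bucket layout _write_hashes
# builds above: twice as many slots as entries, the starting slot taken from
# the upper bits of the hash, and collisions resolved by stepping to the next
# slot (linear probing).
def layout_bucket(entries):
    numslots = 2 * len(entries)
    table = [(0, 0)] * numslots
    for hashval, position in entries:
        n = (hashval >> 8) % numslots
        while table[n] != (0, 0):
            n = (n + 1) % numslots
        table[n] = (hashval, position)
    return table

# Two hashes whose upper bits collide land in adjacent slots:
assert layout_bucket([(0x1200, 10), (0x2200, 20)]) == [
    (0, 0), (0, 0), (0x1200, 10), (0x2200, 20)
]
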
def __iter__(self):
    dbfile = self.dbfile
    names = self.names
    lengths = array("I")

    dbfile.seek(self.directory_offset)
    for i in xrange(self.length):
        dbfile.seek(_LONG_SIZE, 1)
        lengths.append(dbfile.read_uint())

    dbfile.seek(self.basepos)
    for length in lengths:
        vlist = loads(dbfile.read(length) + b("."))
        vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                     if vlist[i] is not None)
        yield vdict

def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
    """
    :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
        to write to.
    :param magic: the format tag bytes to write at the start of the file.
    :param hashtype: an integer indicating which hashing algorithm to use.
        Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
    """

    self.dbfile = dbfile
    self.hashtype = hashtype
    self.hashfn = _hash_functions[self.hashtype]
    # A place for subclasses to put extra metadata
    self.extras = {}

    self.startoffset = dbfile.tell()
    # Write format tag
    dbfile.write(magic)
    # Write hash type
    dbfile.write_byte(self.hashtype)
    # Unused future expansion bits
    dbfile.write_int(0)
    dbfile.write_int(0)

    # 256 lists of hashed keys and positions
    self.buckets = [[] for _ in xrange(256)]
    # List to remember the positions of the hash tables
    self.directory = []

def all_stored_fields(self):
    """Yields the stored fields for all non-deleted documents.
    """

    for docnum in xrange(self.doc_count_all()):
        if not self.is_deleted(docnum):
            yield self.stored_fields(docnum)

def __iter__(self):
    base = 0
    for byte in self._iter_bytes():
        for i in xrange(8):
            if byte & (1 << i):
                yield base + i
        base += 8

def __init__(self, dbfile, offset, expand=True):
    self.id = offset
    self.dbfile = dbfile

    dbfile.seek(offset)
    flags = dbfile.read_byte()
    self.final = bool(flags & 1)
    self._edges = {}
    if flags & 2:
        singles = flags & 4
        bytes = flags & 8

        nkeys = dbfile.read_varint()
        ptrs = dbfile.read_array("I", nkeys)
        for i in xrange(nkeys):
            ptr = ptrs[i]
            if singles:
                if bytes:
                    charnum = dbfile.read_byte()
                else:
                    charnum = dbfile.read_ushort()
                self._edges[unichr(charnum)] = ptr
            else:
                key = utf8decode(dbfile.read_string())[0]
                if len(key) > 1 and expand:
                    self._edges[key[0]] = PatNode(dbfile, key[1:], ptr)
                else:
                    self._edges[key] = ptr

def __getitem__(self, docnum):
    fixedlen = self._fixedlen
    v = self._child[docnum]
    if not v:
        return []
    ls = [v[i:i + fixedlen] for i in xrange(0, len(v), fixedlen)]
    return ls

def test_nonexclusive_read():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "readlock") as ix:
        for num in u("one two three four five").split():
            w = ix.writer()
            w.add_document(text=u("Test document %s") % num)
            w.commit(merge=False)

        def fn():
            for _ in xrange(5):
                r = ix.reader()
                assert list(r.field_terms("text")) == [
                    "document", "five", "four", "one", "test", "three", "two"
                ]
                r.close()

        ths = [threading.Thread(target=fn) for _ in xrange(5)]
        for th in ths:
            th.start()
        for th in ths:
            th.join()

def test_multimatcher():
    schema = fields.Schema(content=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)

    domain = ("alfa", "bravo", "charlie", "delta")

    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(domain):
            w.add_document(content=u(" ").join(ls))
        w.commit(merge=False)

    q = Term("content", "bravo")
    with ix.searcher() as s:
        m = q.matcher(s)
        while m.is_active():
            content = s.stored_fields(m.id())["content"].split()
            spans = m.spans()
            for span in spans:
                assert content[span.start] == "bravo"
            m.next()

def test_simple_indexing():
    schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
    domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike"), u("november"))
    docs = defaultdict(list)
    with TempIndex(schema, "simple") as ix:
        with ix.writer() as w:
            for i in xrange(100):
                smp = random.sample(domain, 5)
                for word in smp:
                    docs[word].append(i)
                w.add_document(text=u(" ").join(smp), id=i)

        with ix.searcher() as s:
            for word in domain:
                rset = sorted([
                    hit["id"] for hit
                    in s.search(query.Term("text", word), limit=None)
                ])
                assert rset == docs[word]

def read_region(dbfile, region, start=None):
    _read = dbfile.read
    _unpack = itemheader.unpack
    _headersize = itemheader.size

    start = start if start is not None else region.start
    dbfile.seek(start)
    first = True
    for i in xrange(region.length):
        keylen, vlen = _unpack(_read(_headersize))
        key = _read(keylen)
        val = _read(vlen)
        if first:
            assert key == region.minkey
            first = False
        yield key, val
    assert dbfile.tell() == region.end

def read_gints(dbfile, n):
    """Read N integers from the bytes stream dbfile. Expects that the file
    starts at a key byte.
    """

    count = 0
    read = dbfile.read
    for _ in xrange(n):
        if count == 0:
            key = ord(dbfile.read(1))
        code = key >> (count * 2) & 3
        if code == 0:
            yield ord(read(1))
        elif code == 1:
            yield unpack_ushort_le(read(2))[0]
        elif code == 2:
            yield unpack_uint_le(read(3) + "\x00")[0]
        else:
            yield unpack_uint_le(read(4))[0]

        count = (count + 1) % 4

def read_nums(self, f, n):
    """Read N integers from the bytes stream dbfile. Expects that the file
    is positioned at a key byte.
    """

    count = 0
    key = None
    for _ in xrange(n):
        if count == 0:
            key = f.read_byte()

        code = key >> (count * 2) & 3
        if code == 0:
            yield f.read_byte()
        elif code == 1:
            yield f.read_ushort_le()
        elif code == 2:
            yield unpack_uint_le(f.read(3) + "\x00")[0]
        else:
            yield f.read_uint_le()

        count = (count + 1) % 4

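# A self-contained sketch of the "key byte" scheme both readers above decode
# (a hypothetical helper, assuming Python 3 bytes indexing): each key byte
# packs four 2-bit codes, and each code gives the little-endian byte width
# (1, 2, 3, or 4) of the next integer.
import struct

def decode_gints(data, n):
    out = []
    pos = count = key = 0
    for _ in range(n):
        if count == 0:
            key = data[pos]
            pos += 1
        width = ((key >> (count * 2)) & 3) + 1   # code 0..3 -> 1..4 bytes
        chunk = data[pos:pos + width] + b"\x00" * (4 - width)
        out.append(struct.unpack("<I", chunk)[0])
        pos += width
        count = (count + 1) % 4
    return out

# Key byte 0b00100100 says the next ints are 1, 2, 3, and 1 bytes wide:
assert decode_gints(bytes([0b00100100, 7, 0x34, 0x12, 1, 2, 3, 9]), 4) == \
    [7, 0x1234, 0x030201, 9]
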
def test_20000_buffered():
    from whoosh.writing import BufferedWriter

    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000buffered") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = BufferedWriter(ix, limit=100, period=None)
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.close()
        print("Write buffered:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize buffered:", now() - t)

def test_lengths2():
    schema = fields.Schema(text=fields.TEXT(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    for _ in xrange(3):
        w = ix.writer()
        for ls in permutations(u("alfa bravo charlie").split()):
            if "bravo" in ls and "charlie" in ls:
                count += 1
            w.add_document(text=u(" ").join(ls))
        w.commit(merge=False)

    with ix.searcher() as s:
        q = query.Or([query.Term("text", u("bravo")),
                      query.Term("text", u("charlie"))])
        r = s.search(q, limit=None)
        assert len(r) == count

        r = s.search(q, limit=3)
        assert len(r) == count

def test_exclusion():
    from datetime import datetime

    schema = fields.Schema(id=fields.ID(stored=True), date=fields.DATETIME)
    ix = RamStorage().create_index(schema)
    dt1 = datetime(1950, 1, 1)
    dt2 = datetime(1960, 1, 1)
    with ix.writer() as w:
        # Make one document (the first) with date == dt1 and 39 documents
        # with date != dt1.
        for i in xrange(40):
            w.add_document(id=u(str(i)), date=(dt2 if i >= 1 else dt1))

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)
        # Find documents where date != dt1
        q = qp.parse("NOT (date:(19500101000000))")

        r = s.search(q, limit=None)
        assert len(r) == 39  # Total number of matched documents
        assert r.scored_length() == 39  # Number of docs in the results

def test_combos():
    qs = ('w:a "hi there"^4.2 AND x:b^2.3 OR c AND (y:d OR e) '
          + '(apple ANDNOT bear)^2.3')

    init_args = {
        plugins.MultifieldPlugin: (["content", "title"],
                                   {"content": 1.0, "title": 1.2}),
        plugins.FieldAliasPlugin: ({"content": ("text", "body")},),
        plugins.MultifieldPlugin: (["title", "content"],),
        plugins.CopyFieldPlugin: ({"name": "phone"},),
        plugins.PseudoFieldPlugin: ({"name": lambda x: x},),
    }

    pis = _plugin_classes(())
    for i, plugin in enumerate(pis):
        try:
            pis[i] = plugin(*init_args.get(plugin, ()))
        except TypeError:
            raise TypeError("Error instantiating %s" % plugin)

    count = 0
    for i, first in enumerate(pis):
        for j in xrange(len(pis)):
            if i == j:
                continue
            plist = [p for p in pis[:j] if p is not first] + [first]
            qp = qparser.QueryParser("text", None, plugins=plist)
            try:
                qp.parse(qs)
            except Exception:
                e = sys.exc_info()[1]
                raise Exception(str(e) + " combo: %s %r" % (count, plist))
            count += 1

def insert(self, word):
    """Add the given "word" (a string or list of strings) to the graph.
    Words must be inserted in sorted order.
    """

    lw = self.lastword
    prefixlen = 0
    if lw:
        if self._field_root and lw[0] != word[0]:
            # If field_root == True, caller can add entire fields out-of-
            # order (but not individual terms)
            pass
        elif word < lw:
            raise Exception("Out of order %r..%r." % (self.lastword, word))
        else:
            # find common prefix between word and previous word
            for i in xrange(min(len(word), len(lw))):
                if word[i] != lw[i]:
                    break
                prefixlen += 1

        # Check the unchecked for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at
        # that point.
        self._minimize(prefixlen)

    # Add the suffix, starting from the correct node mid-way through the
    # graph
    if not self.unchecked:
        node = self.root
    else:
        node = self.unchecked[-1][2]

    for letter in word[prefixlen:]:
        nextnode = BuildNode()
        node.put(letter, nextnode)
        self.unchecked.append((node, letter, nextnode))
        node = nextnode

    node.final = True
    self.lastword = word

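# The common-prefix step above, isolated for reference (a hypothetical helper,
# not part of the builder): for consecutive sorted words, only the suffix
# after the shared prefix adds new nodes to the graph.
def common_prefix_len(a, b):
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

assert common_prefix_len("dent", "deny") == 3  # only "y" extends the graph
assert common_prefix_len("deny", "door") == 1  # "d" is shared, rest is new
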
def __getitem__(self, num):
    if num > self.length - 1:
        raise IndexError("Tried to get document %s, file has %s"
                         % (num, self.length))

    dbfile = self.dbfile
    start = self.directory_offset + num * stored_pointer_size
    dbfile.seek(start)
    ptr = dbfile.read(stored_pointer_size)
    if len(ptr) != stored_pointer_size:
        raise Exception("Error reading %r @%s %s < %s"
                        % (dbfile, start, len(ptr), stored_pointer_size))
    position, length = unpack_stored_pointer(ptr)
    vlist = loads(dbfile.map[position:position + length] + b("."))

    names = self.names
    # Recreate a dictionary by putting the field names and values back
    # together by position. We can't just use dict(zip(...)) because we
    # want to filter out the None values.
    vdict = dict((names[i], vlist[i]) for i in xrange(len(vlist))
                 if vlist[i] is not None)
    return vdict

def test_20000_batch():
    sc = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(sc, "20000batch") as ix:
        domain = ["alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima"]

        t = now()
        w = ix.writer()
        for i in xrange(20000):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            if not i % 100:
                w.commit()
                w = ix.writer()
        w.commit()
        print("Write batch:", now() - t)

        t = now()
        ix.optimize()
        print("Optimize batch:", now() - t)

def suggest(self, text, limit=5, maxdist=2, prefix=0):
    """
    :param text: the text to check. This word will **not** be added to the
        suggestions, even if it appears in the word graph.
    :param limit: only return up to this many suggestions. If there are not
        enough terms in the field within ``maxdist`` of the given word, the
        returned list will be shorter than this number.
    :param maxdist: the largest edit distance from the given word to look
        at. Numbers higher than 2 are not very effective or efficient.
    :param prefix: require suggestions to share a prefix of this length
        with the given word. This is often justifiable since most
        misspellings do not involve the first letter of the word. Using a
        prefix dramatically decreases the time it takes to generate the
        list of words.
    """

    _suggestions = self._suggestions

    heap = []
    seen = set([text])
    for k in xrange(1, maxdist + 1):
        for item in _suggestions(text, k, prefix):
            if item[1] in seen:
                continue
            seen.add(item[1])

            # Note that the *higher* scores (item[0]) are better!
            if len(heap) < limit:
                heappush(heap, item)
            elif item > heap[0]:
                heapreplace(heap, item)

        # If the heap is already at the required length, don't bother going
        # to a higher edit distance
        if len(heap) >= limit:
            break

    sugs = sorted(heap, key=lambda item: (0 - item[0], item[1]))
    return [sug for _, sug in sugs]

def _compress(self, inarray, inoffset, n):
    _numsize = self._numsize
    _bitsize = self._bitsize
    _num = self._num
    _bits = self._bits

    for key in xrange(_numsize):
        value = key << _bitsize
        num = _num[key] if _num[key] < n else n
        bits = 0
        j = 0
        while j < num and inarray[inoffset + j] < (1 << _bits[key][j]):
            x = inarray[inoffset + j]
            value |= x << bits
            bits += _bits[key][j]
            j += 1
        if j == num:
            return value, num
    raise Exception

def _check_writer(name, writer_fn):
    schema = fields.Schema(text=fields.TEXT, id=fields.STORED)
    domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
              u("foxtrot"), u("golf"), u("hotel"), u("india"), u("juliet"),
              u("kilo"), u("lima"), u("mike"), u("november"))
    docs = defaultdict(list)
    with TempIndex(schema, name) as ix:
        w = writer_fn(ix)
        for i in xrange(1000):
            smp = random.sample(domain, 5)
            for word in smp:
                docs[word].append(i)
            w.add_document(text=u(" ").join(smp), id=i)
        w.commit()

        with ix.searcher() as s:
            for word in domain:
                print(word)
                rset = sorted([hit["id"] for hit
                               in s.search(query.Term("text", word),
                                           limit=None)])
                assert_equal(rset, docs[word])

def set_searcher(self, segment_searcher, docoffset):
    fieldname = self._fieldname
    self._segment_searcher = segment_searcher
    reader = segment_searcher.reader()

    if self._use_vectors:
        pass
    elif self._use_column:
        self._creader = reader.column_reader(fieldname, translate=False)
    else:
        # Otherwise, cache the values in each document in a huge list
        # of lists
        dc = segment_searcher.doc_count_all()
        field = segment_searcher.schema[fieldname]
        from_bytes = field.from_bytes
        self._lists = [[] for _ in xrange(dc)]
        for btext in field.sortable_terms(reader, fieldname):
            text = from_bytes(btext)
            postings = reader.postings(fieldname, btext)
            for docid in postings.all_ids():
                self._lists[docid].append(text)

def test_buffered_threads():
    domain = u("alfa bravo charlie delta").split()
    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:
        class SimWriter(threading.Thread):
            def run(self):
                for _ in xrange(5):
                    w.update_document(name=random.choice(domain))
                    time.sleep(random.uniform(0.01, 0.1))

        w = writing.BufferedWriter(ix, limit=10)
        threads = [SimWriter() for _ in xrange(5)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        w.close()

        with ix.reader() as r:
            assert r.doc_count() == 4
            assert sorted([d["name"] for d in r.all_stored_fields()]) == domain

def test_page_counts():
    from whoosh.scoring import Frequency

    schema = fields.Schema(id=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)

    w = ix.writer()
    for i in xrange(10):
        w.add_document(id=text_type(i))
    w.commit()

    with ix.searcher(weighting=Frequency) as s:
        q = query.Every("id")
        r = s.search(q)
        assert len(r) == 10

        with pytest.raises(ValueError):
            s.search_page(q, 0)

        r = s.search_page(q, 1, 5)
        assert len(r) == 10
        assert r.pagecount == 2

        r = s.search_page(q, 1, 5)
        assert len(r) == 10
        assert r.pagecount == 2

        r = s.search_page(q, 2, 5)
        assert len(r) == 10
        assert r.pagecount == 2
        assert r.pagenum == 2

        r = s.search_page(q, 1, 10)
        assert len(r) == 10
        assert r.pagecount == 1
        assert r.pagenum == 1

def parse_record(data, tags=None):
    leader = data[:LEADER_LEN]
    assert len(leader) == LEADER_LEN

    dataoffset = int(data[12:17])
    assert dataoffset > 0
    assert dataoffset < len(data)

    # dataoffset - 1 to avoid END-OF-FIELD byte
    dirstart = LEADER_LEN
    dirend = dataoffset - 1

    # Number of fields in record
    assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
    field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN

    result = {}
    for i in xrange(field_count):
        start = dirstart + i * DIRECTORY_ENTRY_LEN
        end = start + DIRECTORY_ENTRY_LEN
        tag = data[start:start + 3]
        if tags and not tag in tags:
            continue

        entry = data[start:end]
        elen = int(entry[3:7])
        offset = dataoffset + int(entry[7:12])
        edata = data[offset:offset + elen - 1]

        if not (tag < "010" and tag.isdigit()):
            edata = edata.split(SUBFIELD_INDICATOR)[1:]
            if tag in result:
                result[tag].extend(edata)
            else:
                result[tag] = edata
        else:
            result[tag] = edata
    return result

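# For context: the constants referenced above are defined elsewhere in the
# module. If this follows the MARC 21 record layout (an assumption stated for
# orientation, not taken from this code), the usual values would be:
#
#     LEADER_LEN = 24              # fixed-length leader at the start of a record
#     DIRECTORY_ENTRY_LEN = 12     # 3-char tag + 4-char length + 5-char offset
#     SUBFIELD_INDICATOR = "\x1f"  # delimiter preceding each subfield code
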
def read_ids(self):
    postfile = self.postfile
    offset = self.dataoffset
    postcount = self.count
    postfile.seek(offset)

    if self.stringids:
        rs = postfile.read_string
        ids = [utf8decode(rs())[0] for _ in xrange(postcount)]
        newoffset = postfile.tell()
    elif self.idslen:
        ids = array("I")
        ids.fromstring(decompress(postfile.read(self.idslen)))
        if IS_LITTLE:
            ids.byteswap()
        newoffset = offset + self.idslen
    else:
        ids = postfile.read_array("I", postcount)
        newoffset = offset + _INT_SIZE * postcount

    self.ids = ids
    self.weights_offset = newoffset
    return ids

def _read_part(self):
    scored = self._scored
    boost = self._boost
    limit = min(self._docnum + self._partsize, self._doccount)
    offset = self._docnum
    a = self._a

    # Clear the array
    for i in xrange(self._partsize):
        a[i] = 0

    # Add the scores from the submatchers into the array
    for m in self._submatchers:
        while m.is_active() and m.id() < limit:
            i = m.id() - offset
            if scored:
                a[i] += m.score() * boost
            else:
                a[i] = 1
            m.next()

    self._offset = offset
    self._limit = limit

def test_filter_by_result():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))

    with TempIndex(schema, "filter") as ix:
        words = u("foo bar baz qux barney").split()
        with ix.writer() as w:
            for x in xrange(100):
                t = u("even" if x % 2 == 0 else "odd")
                c = words[x % len(words)]
                w.add_document(title=t, content=c)

        with ix.searcher() as searcher:
            fq = query.Term("title", "even")
            filter_result = searcher.search(fq)
            assert filter_result.docset is None

            q = query.Term("content", "foo")
            # filter_result.docs()
            result = searcher.search(q, filter=filter_result)
            assert all(x["title"] == "even" and x["content"] == "foo"
                       for x in result)

def test_compound_sort():
    fspec = fields.KEYWORD(stored=True, sortable=True)
    schema = fields.Schema(a=fspec, b=fspec, c=fspec)
    ix = RamStorage().create_index(schema)

    alist = u("alfa bravo alfa bravo alfa bravo alfa bravo alfa bravo").split()
    blist = u("alfa bravo charlie alfa bravo charlie alfa bravo charlie alfa").split()
    clist = u("alfa bravo charlie delta echo foxtrot golf hotel india juliet").split()
    assert all(len(ls) == 10 for ls in (alist, blist, clist))

    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(a=alist[i], b=blist[i], c=clist[i])

    with ix.searcher() as s:
        q = query.Every()
        sortedby = [sorting.FieldFacet("a"),
                    sorting.FieldFacet("b", reverse=True),
                    sorting.FieldFacet("c")]

        r = s.search(q, sortedby=sortedby)
        output = []
        for hit in r:
            output.append(" ".join((hit["a"], hit["b"], hit["c"])))

        assert output == [
            "alfa charlie charlie",
            "alfa charlie india",
            "alfa bravo echo",
            "alfa alfa alfa",
            "alfa alfa golf",
            "bravo charlie foxtrot",
            "bravo bravo bravo",
            "bravo bravo hotel",
            "bravo alfa delta",
            "bravo alfa juliet",
        ]

def test_random():
    def randstring():
        length = random.randint(1, 5)
        a = array("B", (random.randint(0, 255) for _ in xrange(length)))
        return array_tobytes(a)

    keys = sorted(randstring() for _ in xrange(100))

    with TempStorage() as st:
        gwrite(keys, st)
        gr = greader(st)
        cur = fst.Cursor(gr)
        s1 = cur.flatten()
        s2 = sorted(set(keys))
        for i, (k1, k2) in enumerate(zip(s1, s2)):
            assert k1 == k2, "%s: %r != %r" % (i, k1, k2)

        sample = list(keys)
        random.shuffle(sample)
        for key in sample:
            cur.reset()
            cur.find_path(key)
            assert cur.prefix_bytes() == key
        gr.close()

def make_multi_index(ix):
    for i in xrange(0, len(docs), 3):
        w = ix.writer()
        for doc in docs[i:i + 3]:
            w.add_document(ev=u("a"), **doc)
        w.commit(merge=False)

def randstring(min, max):
    return "".join(chr(randint(1, 255))
                   for _ in xrange(randint(min, max)))

def all_ids(self):
    missing = self.missing
    negs = set(self.child.all_ids())
    return (id for id in xrange(self.limit)
            if id not in negs and not missing(id))

# Cache the varint encodings of the first few hundred integers so we
# don't have to constantly recalculate them on the fly. This makes a small but
# noticeable difference.

def _varint(i):
    a = array("B")
    while (i & ~0x7F) != 0:
        a.append((i & 0x7F) | 0x80)
        i = i >> 7
    a.append(i)
    return array_tobytes(a)


_varint_cache_size = 512
_varint_cache = []
for i in xrange(0, _varint_cache_size):
    _varint_cache.append(_varint(i))
_varint_cache = tuple(_varint_cache)


def varint(i):
    """Encodes the given integer into a string of the minimum number of
    bytes.
    """

    if i < len(_varint_cache):
        return _varint_cache[i]
    return _varint(i)


def varint_to_int(vi):
    # Decode by reversing the encoding above: seven data bits per byte,
    # least-significant group first, high bit set on all but the last byte.
    b = ord(vi[0])
    p = 1
    i = b & 0x7F
    shift = 7
    while b & 0x80 != 0:
        b = ord(vi[p])
        p += 1
        i |= (b & 0x7F) << shift
        shift += 7
    return i

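# A couple of spot checks of the encoding above (worked out by hand): each
# byte carries seven data bits, and the high bit flags that more bytes follow.
assert varint(5) == b"\x05"        # fits in one byte, high bit clear
assert varint(300) == b"\xac\x02"  # 300 -> low 7 bits 0x2C | 0x80, then 0x02
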
def decorating_function(user_function):
    stats = [0, 0, 0]  # hits, misses, hand
    data = {}

    if maxsize:
        # The keys at each point on the clock face
        clock_keys = [None] * maxsize
        # The "referenced" bits at each point on the clock face
        clock_refs = array("B", (0 for _ in xrange(maxsize)))
        lock = Lock()

        @functools.wraps(user_function)
        def wrapper(*args):
            key = args
            try:
                with lock:
                    pos, result = data[key]
                    # The key is in the cache. Set the key's reference bit
                    clock_refs[pos] = 1
                    # Record a cache hit
                    stats[0] += 1
            except KeyError:
                # Compute the value
                result = user_function(*args)
                with lock:
                    # Current position of the clock hand
                    hand = stats[2]
                    # Remember to stop here after a full revolution
                    end = hand
                    # Sweep around the clock looking for a position with
                    # the reference bit off
                    while True:
                        hand = (hand + 1) % maxsize
                        current_ref = clock_refs[hand]
                        if current_ref:
                            # This position's "referenced" bit is set. Turn
                            # the bit off and move on.
                            clock_refs[hand] = 0
                        elif not current_ref or hand == end:
                            # We've either found a position with the
                            # "reference" bit off or reached the end of the
                            # circular cache. So we'll replace this
                            # position with the new key
                            current_key = clock_keys[hand]
                            if current_key in data:
                                del data[current_key]

                            clock_keys[hand] = key
                            clock_refs[hand] = 1
                            break

                    # Put the key and result in the cache
                    data[key] = (hand, result)
                    # Save the new hand position
                    stats[2] = hand
                    # Record a cache miss
                    stats[1] += 1

            return result

    else:
        @functools.wraps(user_function)
        def wrapper(*args):
            key = args
            try:
                result = data[key]
                stats[0] += 1
            except KeyError:
                result = user_function(*args)
                data[key] = result
                stats[1] += 1
            return result

    def cache_info():
        return stats[0], stats[1], maxsize, len(data)

    def cache_clear():
        """Clear the cache and cache statistics"""
        data.clear()
        stats[0] = stats[1] = stats[2] = 0
        for i in xrange(maxsize):
            clock_keys[i] = None
            clock_refs[i] = 0

    wrapper.cache_info = cache_info
    wrapper.cache_clear = cache_clear
    return wrapper

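# Hypothetical usage sketch, assuming the function above is the inner body of
# a CLOCK-replacement cache-decorator factory (the factory name below is an
# assumption for illustration, not taken from this code):
#
#     @clockface_lru_cache(maxsize=256)
#     def expensive(x):
#         return x * x
#
#     expensive(10)
#     expensive(10)
#     # cache_info() returns (hits, misses, maxsize, currsize)
#     assert expensive.cache_info() == (1, 1, 256, 1)
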