def test_hash_contents():
    samp = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
            ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
            ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
            ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray')]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in samp)

    with TempStorage("hashcontents") as st:
        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        hw.add_all(samp)
        hw.close()

        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        assert_equal(set(hr.items()), samp)
        hr.close()

def test_fieldwriter_multiblock():
    field = fields.TEXT()
    st, codec, seg = _make_codec(blocklimit=2)
    fw = codec.field_writer(st, seg)
    fw.start_field("text", field)
    fw.start_term(u("alfa"))
    fw.add(0, 2.0, b("test1"), 2)
    fw.add(1, 5.0, b("test2"), 5)
    fw.add(2, 3.0, b("test3"), 3)
    fw.add(3, 4.0, b("test4"), 4)
    fw.add(4, 1.0, b("test5"), 1)
    fw.finish_term()
    fw.finish_field()
    fw.close()

    tr = codec.terms_reader(st, seg)
    ti = tr.terminfo("text", "alfa")
    assert_equal(ti.weight(), 15.0)
    assert_equal(ti.doc_frequency(), 5)
    assert_equal(ti.min_length(), 1)
    assert_equal(ti.max_length(), 5)
    assert_equal(ti.max_weight(), 5.0)
    assert_equal(ti.min_id(), 0)
    assert_equal(ti.max_id(), 4)

    ps = []
    m = tr.matcher("text", "alfa", field.format)
    while m.is_active():
        ps.append((m.id(), m.weight(), m.value()))
        m.next()
    assert_equal(ps, [(0, 2.0, b("test1")), (1, 5.0, b("test2")),
                      (2, 3.0, b("test3")), (3, 4.0, b("test4")),
                      (4, 1.0, b("test5"))])

def test_skip():
    _docnums = [1, 3, 12, 34, 43, 67, 68, 102, 145, 212, 283, 291, 412, 900,
                905, 1024, 1800, 2048, 15000]

    st, codec, seg = _make_codec()
    fieldobj = fields.TEXT()
    fw = codec.field_writer(st, seg)
    fw.start_field("f1", fieldobj)
    fw.start_term(b("test"))
    for n in _docnums:
        fw.add(n, 1.0, b(''), None)
    fw.finish_term()
    fw.finish_field()
    fw.close()

    tr = codec.terms_reader(st, seg)
    m = tr.matcher("f1", b("test"), fieldobj.format)
    assert m.id() == 1
    m.skip_to(220)
    assert m.id() == 283
    m.skip_to(1)
    assert m.id() == 283
    m.skip_to(1000)
    assert m.id() == 1024
    m.skip_to(1800)
    assert m.id() == 1800

def _btexts(self, ixreader):
    fieldname = self.fieldname
    field = ixreader.schema[fieldname]
    startexcl = self.startexcl
    endexcl = self.endexcl

    if self.start is None:
        start = b("")
    else:
        start = field.to_bytes(self.start)

    if self.end is None:
        end = b("\xFF\xFF\xFF\xFF")
    else:
        end = field.to_bytes(self.end)

    for fname, t in ixreader.terms_from(fieldname, start):
        if fname != fieldname:
            break
        if t == start and startexcl:
            continue
        if t == end and endexcl:
            break
        if t > end:
            break
        yield t

def test_wildcard_existing_terms():
    s = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(s)

    w = ix.writer()
    w.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    w.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    w.commit()

    r = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        z = []
        for t in terms:
            assert t[0] == "value"
            z.append(t[1])
        return b(" ").join(sorted(z))

    q = qp.parse(u("b*"))
    ts = q.existing_terms(r)
    assert ts == set()
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    ts = q.existing_terms(r)
    assert ts == set()
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    ts = q.existing_terms(r, expand=False)
    assert ts == set([("value", b("render"))])
    ts = q.existing_terms(r, expand=True)
    assert words(ts) == b("render rendering renders")

def close(self):
    dbfile = self.dbfile
    order = self.order
    keycount = self.keycount

    # Finish the pickled list of texts
    dbfile.write(b("l."))

    # Compact the order array if possible; remember the typecode so the
    # reader knows how to unpack the array even when no compaction applies
    code = order.typecode
    if self.hastexts:
        if keycount < 255:
            code = "B"
            order = array(code, order)
        elif keycount < 65535:
            code = "H"
            order = array(code, order)

    # Write the typecode and the (possibly compacted) order array
    dbfile.write(b(code))
    dbfile.write_array(order)

    # Seek back to the start and write numbers of docs
    dbfile.flush()
    dbfile.seek(self.start)
    dbfile.write_uint(len(order))
    if self.hastexts:
        dbfile.write_uint(keycount)
    dbfile.flush()

    # Seek back and write the finished file tag
    dbfile.seek(self.tagpos)
    dbfile.write(b("+"))
    dbfile.close()

def test_hash_contents():
    samp = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
            ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
            ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
            ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray')]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in samp)

    with TempStorage("hashcontents") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all(samp)
        hw.close()

        hr = HashReader.open(st, "test.hsh")
        probes = list(samp)
        random.shuffle(probes)
        for key, value in probes:
            assert hr[key] == value

        assert set(hr.keys()) == set([k for k, v in samp])
        assert set(hr.values()) == set([v for k, v in samp])
        assert set(hr.items()) == samp
        hr.close()

def from_string(cls, s):
    hbyte = ord(s[0:1])
    if hbyte < 2:
        st = cls.struct
        # Freq, Doc freq, min len, max len, max w, max WOL, min ID, max ID
        f, df, ml, xl, xw, xwol, mid, xid = st.unpack(s[1:st.size + 1])
        mid = None if mid == NO_ID else mid
        xid = None if xid == NO_ID else xid
        # Postings
        pstr = s[st.size + 1:]
        if hbyte == 0:
            p = unpack_long(pstr)[0]
        else:
            p = loads(pstr + b("."))
    else:
        # Old format was encoded as a variable length pickled tuple
        v = loads(s + b("."))
        if len(v) == 1:
            f = df = 1
            p = v[0]
        elif len(v) == 2:
            f = df = v[1]
            p = v[0]
        else:
            f, p, df = v
        # Fake values for stats which weren't stored before
        ml = 1
        xl = 106374
        xw = 999999999
        xwol = 999999999
        mid = -1
        xid = -1
    return cls(f, df, ml, xl, xw, xwol, mid, xid, p)

def _btexts(self, ixreader):
    fieldname = self.fieldname
    field = ixreader.schema[fieldname]
    startexcl = self.startexcl
    endexcl = self.endexcl

    if self.start is None:
        start = b("")
    else:
        try:
            start = field.to_bytes(self.start)
        except ValueError:
            return

    if self.end is None:
        end = b("\xFF\xFF\xFF\xFF")
    else:
        try:
            end = field.to_bytes(self.end)
        except ValueError:
            return

    for fname, t in ixreader.terms_from(fieldname, start):
        if fname != fieldname:
            break
        if t == start and startexcl:
            continue
        if t == end and endexcl:
            break
        if t > end:
            break
        yield t

def _test_simple_compound(st):
    alist = [1, 2, 3, 5, -5, -4, -3, -2]
    blist = [1, 12, 67, 8, 2, 1023]
    clist = [100, -100, 200, -200]

    with st.create_file("a") as af:
        for x in alist:
            af.write_int(x)
    with st.create_file("b") as bf:
        for x in blist:
            bf.write_varint(x)
    with st.create_file("c") as cf:
        for x in clist:
            cf.write_int(x)

    f = st.create_file("f")
    CompoundStorage.assemble(f, st, ["a", "b", "c"])

    f = CompoundStorage(st.open_file("f"))
    with f.open_file("a") as af:
        for x in alist:
            assert x == af.read_int()
        assert af.read() == b('')
    with f.open_file("b") as bf:
        for x in blist:
            assert x == bf.read_varint()
        assert bf.read() == b('')
    with f.open_file("c") as cf:
        for x in clist:
            assert x == cf.read_int()
        assert cf.read() == b('')

def _test_simple_compound(st):
    alist = [1, 2, 3, 5, -5, -4, -3, -2]
    blist = [1, 12, 67, 8, 2, 1023]
    clist = [100, -100, 200, -200]

    with st.create_file("a") as af:
        for x in alist:
            af.write_int(x)
    with st.create_file("b") as bf:
        for x in blist:
            bf.write_varint(x)
    with st.create_file("c") as cf:
        for x in clist:
            cf.write_int(x)

    f = st.create_file("f")
    CompoundStorage.assemble(f, st, ["a", "b", "c"])

    f = CompoundStorage(st, "f")
    with f.open_file("a") as af:
        for x in alist:
            assert_equal(x, af.read_int())
        assert_equal(af.read(), b(''))
    with f.open_file("b") as bf:
        for x in blist:
            assert_equal(x, bf.read_varint())
        assert_equal(bf.read(), b(''))
    with f.open_file("c") as cf:
        for x in clist:
            assert_equal(x, cf.read_int())
        assert_equal(cf.read(), b(''))

def test_token_boost():
    from whoosh.analysis import RegexTokenizer, DoubleMetaphoneFilter

    ana = RegexTokenizer() | DoubleMetaphoneFilter()
    field = fields.TEXT(analyzer=ana, phrase=False)
    results = list(field.index(u("spruce view")))
    assert_equal(results, [('SPRS', 1, 1.0, b('\x00\x00\x00\x01')),
                           ('FF', 1, 0.5, b('\x00\x00\x00\x01')),
                           ('F', 1, 1.0, b('\x00\x00\x00\x01'))])

def test_hash_single():
    st = RamStorage()
    hw = HashWriter(st.create_file("test.hsh"))
    hw.add(b("alfa"), b("bravo"))
    hw.close()

    hr = HashReader.open(st, "test.hsh")
    assert hr.get(b("alfa")) == b("bravo")
    assert hr.get(b("foo")) is None

def decode_positions(self, valuestring):
    if not valuestring.endswith(b(".")):
        valuestring += b(".")
    codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])

    position = 0
    posns = []
    for code in codes:
        position = code[0] + position
        posns.append(position)
    return posns

def test_shared_suffix():
    st = gwrite(enlist("blowing blue glowing"))
    gr = greader(st)
    cur1 = fst.Cursor(gr)
    cur2 = fst.Cursor(gr)

    cur1.find_path(b("blo"))
    cur2.find_path(b("glo"))
    assert cur1.stack[-1].target == cur2.stack[-1].target

def _print_line(self, indent, command, **kwargs):
    self._dbfile.write(b(" ") * indent)
    self._dbfile.write(command.encode("latin1"))
    for k, v in iteritems(kwargs):
        if isinstance(v, memoryview):
            v = bytes(v)
        if v is not None and not isinstance(v, _reprable):
            raise TypeError(type(v))
        self._dbfile.write(("\t%s=%r" % (k, v)).encode("latin1"))
    self._dbfile.write(b("\n"))

def minimize_values(postingsize, values, compression=0):
    if postingsize < 0:
        string = dumps(values, -1)[2:]
    elif postingsize == 0:
        string = b('')
    else:
        string = b('').join(values)
    if string and compression:
        string = compress(string, compression)
    return string

def test_ordered_closest():
    keys = ['alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
            'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november']
    values = [''] * len(keys)

    with TempStorage("orderedclosest") as st:
        hwf = st.create_file("test.hsh")
        hw = OrderedHashWriter(hwf)
        hw.add_all(zip(keys, values))
        hw.close()

        hrf = st.open_file("test.hsh")
        hr = OrderedHashReader(hrf)
        ck = hr.closest_key
        assert_equal(ck(''), b('alfa'))
        assert_equal(ck(' '), b('alfa'))
        assert_equal(ck('alfa'), b('alfa'))
        assert_equal(ck('bravot'), b('charlie'))
        assert_equal(ck('charlie'), b('charlie'))
        assert_equal(ck('kiloton'), b('lima'))
        assert_equal(ck('oskar'), None)

        assert_equal(list(hr.keys()), [b(k) for k in keys])
        assert_equal(list(hr.values()), [b(v) for v in values])
        assert_equal(list(hr.keys_from('f')), [b(k) for k in keys[5:]])
        hr.close()

def test_insert_bytes():
    # This test is only meaningful on Python 3
    domain = [b("alfa"), b("bravo"), b("charlie")]

    st = RamStorage()
    gw = fst.GraphWriter(st.create_file("test"))
    gw.start_field("test")
    for key in domain:
        gw.insert(key)
    gw.close()

    cur = fst.GraphReader(st.open_file("test")).cursor()
    assert list(cur.flatten()) == domain

def test_random_access():
    times = 1000
    with TempStorage("orderedhash") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
        hw.close()

        keys = list(range(times))
        random.shuffle(keys)
        hr = HashReader.open(st, "test.hsh")
        for x in keys:
            assert hr[b("%08x" % x)] == b(str(x))
        hr.close()

def decode_character_boosts(self, valuestring):
    if not valuestring.endswith(b(".")):
        valuestring += b(".")
    codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])

    position = 0
    endchar = 0
    posn_char_boosts = []
    for code in codes:
        # Positions and character offsets are stored as deltas; accumulate
        # them back into absolute values
        position = position + code[0]
        startchar = endchar + code[1]
        endchar = startchar + code[2]
        posn_char_boosts.append((position, startchar, endchar, code[3]))
    return posn_char_boosts

def decode_characters(self, valuestring):
    if not valuestring.endswith(b(".")):
        valuestring += b(".")
    codes = loads(valuestring[_INT_SIZE:])

    position = 0
    endchar = 0
    posns_chars = []
    for code in codes:
        position = code[0] + position
        startchar = code[1] + endchar
        endchar = code[2] + startchar
        posns_chars.append((position, startchar, endchar))
    return posns_chars

def parse_glob(pattern, _glob_multi=b("*"), _glob_single=b("?"),
               _glob_range1=b("["), _glob_range2=b("]"),
               _glob_range_not=b("!")):
    parsed = []
    pos = 0
    while pos < len(pattern):
        char = pattern[pos]
        pos += 1
        if char == _glob_multi:  # *
            # (Ignore more than one star in a row)
            if parsed:
                prev = parsed[-1][0]
                if prev == _STAR:
                    continue
            parsed.append((_STAR,))
        elif char == _glob_single:  # ?
            # (Ignore ? after a star)
            if parsed:
                prev = parsed[-1][0]
                if prev == _STAR:
                    continue
            parsed.append((_QUEST,))
        elif char == _glob_range1:  # [
            chars = set()
            firstchar = True
            negate = False
            # Take the char range specification until the ]
            while pos < len(pattern):
                char = pattern[pos]
                pos += 1
                if char == _glob_range2:
                    break
                # If first char inside the range is !, negate the list
                if firstchar and char == _glob_range_not:
                    negate = True
                else:
                    chars.add(char)
                firstchar = False
            if chars:
                parsed.append((_RANGE, chars, negate))
        else:
            if parsed and parsed[-1][0] == _LIT:
                parsed[-1][1] += char
            else:
                parsed.append([_LIT, char])

    parsed.append((_END,))
    return parsed

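# Illustrative note (not part of the original source), derived from the parser
# above: parse_glob("a*b?") returns
#     [[_LIT, "a"], (_STAR,), [_LIT, "b"], (_QUEST,), (_END,)]
# Runs of literal characters are folded into a single _LIT node, repeated stars
# are collapsed, and a trailing (_END,) node always terminates the parse.
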
def test_ordered_hash():
    times = 10000
    with TempStorage("orderedhash") as st:
        hwf = st.create_file("test.hsh")
        hw = HashWriter(hwf)
        hw.add_all((b("%08x" % x), b(str(x))) for x in xrange(times))
        hw.close()

        keys = list(range(times))
        random.shuffle(keys)
        hrf = st.open_file("test.hsh")
        hr = HashReader(hrf)
        for x in keys:
            assert_equal(hr[b("%08x" % x)], b(str(x)))
        hr.close()

def test_indentical_fields():
    schema = fields.Schema(id=fields.STORED,
                           f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT)
    with TempIndex(schema, "identifields") as ix:
        w = ix.writer()
        w.add_document(id=1, f1=u("alfa"), f2=u("alfa"), f3=u("alfa"))
        w.commit()

        with ix.searcher() as s:
            assert list(s.lexicon("f1")) == [b("alfa")]
            assert list(s.lexicon("f2")) == [b("alfa")]
            assert list(s.lexicon("f3")) == [b("alfa")]
            assert list(s.documents(f1="alfa")) == [{"id": 1}]
            assert list(s.documents(f2="alfa")) == [{"id": 1}]
            assert list(s.documents(f3="alfa")) == [{"id": 1}]

def load(cls, dbfile, expand=True):
    dbfile.seek(0)
    magic = dbfile.read(4)
    if magic != b("GR01"):
        raise Exception("%r does not seem to be a graph file" % dbfile)
    _ = dbfile.read_int()  # File flags (currently unused)
    return DiskNode(dbfile, dbfile.read_uint(), expand=expand)

def __getitem__(self, num):
    if num > self.length - 1:
        raise IndexError("Tried to get document %s, file has %s"
                         % (num, self.length))

    dbfile = self.dbfile
    start = self.directory_offset + num * stored_pointer_size
    dbfile.seek(start)
    ptr = dbfile.read(stored_pointer_size)
    if len(ptr) != stored_pointer_size:
        raise Exception("Error reading %r @%s %s < %s"
                        % (dbfile, start, len(ptr), stored_pointer_size))
    position, length = unpack_stored_pointer(ptr)
    vlist = loads(dbfile.map[position:position + length] + b("."))

    names = self.names
    # Recreate a dictionary by putting the field names and values back
    # together by position. We can't just use dict(zip(...)) because we
    # want to filter out the None values.
    values = dict((names[i], vlist[i]) for i in xrange(len(names))
                  if vlist[i] is not None)

    # Pull any extra stored dynamic field values off the end of the list
    if len(vlist) > len(names):
        values.update(dict(vlist[len(names):]))

    return values

def add_all(self, items):
    dbfile = self.dbfile
    hashes = self.hashes
    hash_func = self.hash_func
    pos = dbfile.tell()
    write = dbfile.write
    index = self.index
    lk = self.lastkey or b('')

    for key, value in items:
        if isinstance(key, text_type):
            key = key.encode('latin-1')
        if isinstance(value, text_type):
            value = value.encode('latin-1')
        if key <= lk:
            raise ValueError("Keys must increase: %r .. %r" % (lk, key))
        lk = key

        index.append(pos)
        write(pack_lengths(len(key), len(value)))
        write(key)
        write(value)

        h = hash_func(key)
        hashes[h & 255].append((h, pos))
        pos += lengths_size + len(key) + len(value)

    self.lastkey = lk

def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")

        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

def __init__(self, dbfile):
    self.dbfile = dbfile
    self.map = dbfile.map

    dbfile.seek(0)
    magic = dbfile.read(4)
    if magic == b("HASH"):
        self.format = 1
        self.header_size = 16 + 256 * header_entry_size
        _pointer_struct = Struct("!Iq")  # Hash value, position
        self.hashtype = dbfile.read_byte()
        dbfile.read(3)  # Unused
        self._end_of_hashes = dbfile.read_long()
        assert self._end_of_hashes >= self.header_size
    else:
        # Old format
        self.format = self.hashtype = 0
        self.header_size = 256 * header_entry_size
        _pointer_struct = Struct("!qq")  # Hash value, position

    self.hash_func = hash_functions[self.hashtype]
    self.buckets = []
    for _ in xrange(256):
        he = unpack_header_entry(dbfile.read(header_entry_size))
        self.buckets.append(he)
    self._start_of_hashes = self.buckets[0][0]

    self.pointer_size = _pointer_struct.size
    self.unpack_pointer = _pointer_struct.unpack

    self.is_closed = False

def add_all(self, items):
    dbfile = self.dbfile
    hashes = self.hashes
    hash_func = self.hash_func
    pos = dbfile.tell()
    write = dbfile.write
    index = self.index
    lk = self.lastkey or b('')

    for key, value in items:
        if not isinstance(key, bytes_type):
            raise TypeError("Key %r should be bytes" % key)
        if not isinstance(value, bytes_type):
            raise TypeError("Value %r should be bytes" % value)
        if key <= lk:
            raise ValueError("Keys must increase: %r .. %r" % (lk, key))
        lk = key

        index.append(pos)
        write(pack_lengths(len(key), len(value)))
        write(key)
        write(value)

        h = hash_func(key)
        hashes[h & 255].append((h, pos))
        pos += lengths_size + len(key) + len(value)

    self.lastkey = lk

def digest(self):
    if self._digest is None:
        d = sha1()
        vtype = self.owner.vtype
        for arc in self.arcs:
            d.update(arc.label)
            if arc.target:
                d.update(pack_long(arc.target))
            else:
                d.update(b("z"))
            if arc.value:
                d.update(vtype.to_bytes(arc.value))
            if arc.accept:
                d.update(b("T"))
        self._digest = d.digest()
    return self._digest

def test_removefield():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()

        with ix.searcher() as s:
            assert s.document(id=u("c")) == {"id": "c", "city": "cairo"}

        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert ixschema.names() == ["id"]
        assert ixschema.stored_names() == ["id"]

        with ix.searcher() as s:
            assert ("content", b("charlie")) not in s.reader()
            assert s.document(id=u("c")) == {"id": u("c")}

def sortable_terms(self, ixreader, fieldname):
    zero = b("\x00")
    for token in ixreader.lexicon(fieldname):
        if token[0:1] != zero:
            # Only yield the full-precision values
            break
        yield token

def __init__(self, dbfile, magic=b("HSH3"), hashtype=0):
    """
    :param dbfile: a :class:`~whoosh.filedb.structfile.StructFile` object
        to write to.
    :param magic: the format tag bytes to write at the start of the file.
    :param hashtype: an integer indicating which hashing algorithm to use.
        Possible values are 0 (MD5), 1 (CRC32), or 2 (CDB hash).
    """

    self.dbfile = dbfile
    self.hashtype = hashtype
    self.hashfn = _hash_functions[self.hashtype]
    # A place for subclasses to put extra metadata
    self.extras = {}

    self.startoffset = dbfile.tell()
    # Write format tag
    dbfile.write(magic)
    # Write hash type
    dbfile.write_byte(self.hashtype)
    # Unused future expansion bits
    dbfile.write_int(0)
    dbfile.write_int(0)

    # 256 lists of hashed keys and positions
    self.buckets = [[] for _ in xrange(256)]
    # List to remember the positions of the hash tables
    self.directory = []

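# A minimal usage sketch of the writer/reader pair, based only on calls
# exercised in the tests above (HashWriter.add, HashWriter.close,
# HashReader.open). The "demo.hsh" filename and the choice of the CDB hash
# (hashtype=2, per the docstring above) are illustrative assumptions, not
# part of the original source.
def _hash_roundtrip_sketch():
    st = RamStorage()
    hw = HashWriter(st.create_file("demo.hsh"), hashtype=2)  # 2 = CDB hash
    hw.add(b("alfa"), b("bravo"))
    hw.close()

    hr = HashReader.open(st, "demo.hsh")
    assert hr[b("alfa")] == b("bravo")
    hr.close()
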
def __init__(self, dbfile):
    self.dbfile = dbfile

    dbfile.seek(0)
    magic = dbfile.read(4)
    if magic == b("HASH"):
        self.format = 1
        self.header_size = 16 + 256 * header_entry_size
        _pointer_struct = Struct("!Iq")  # Hash value, position
        self.hashtype = dbfile.read_byte()
        dbfile.read(3)  # Unused
        self._end_of_hashes = dbfile.read_long()
        assert self._end_of_hashes >= self.header_size
    else:
        # Old format
        self.format = self.hashtype = 0
        self.header_size = 256 * header_entry_size
        _pointer_struct = Struct("!qq")  # Hash value, position

    self.hash_func = hash_functions[self.hashtype]
    self.buckets = []
    for _ in xrange(256):
        he = unpack_header_entry(dbfile.read(header_entry_size))
        self.buckets.append(he)
    self._start_of_hashes = self.buckets[0][0]

    self.pointer_size = _pointer_struct.size
    self.unpack_pointer = _pointer_struct.unpack

    self.is_closed = False

def test_random_termkeys():
    def random_fieldname():
        return "".join(chr(random.randint(65, 90)) for _ in xrange(1, 20))

    def random_btext():
        a = array("H", (random.randint(0, 0xd7ff) for _ in xrange(1, 20)))
        return array_tobytes(a).decode("utf-16")

    domain = sorted(set([(random_fieldname(), random_btext().encode("utf-8"))
                         for _ in xrange(1000)]))

    st, codec, seg = _make_codec()
    fieldobj = fields.TEXT()
    tw = codec.field_writer(st, seg)
    # Stupid ultra-low-level hand-adding of postings just to check handling
    # of random fieldnames and term texts
    lastfield = None
    for fieldname, text in domain:
        if lastfield and fieldname != lastfield:
            tw.finish_field()
            lastfield = None
        if lastfield is None:
            tw.start_field(fieldname, fieldobj)
            lastfield = fieldname
        tw.start_term(text)
        tw.add(0, 1.0, b(""), 1)
        tw.finish_term()
    if lastfield:
        tw.finish_field()
    tw.close()

    tr = codec.terms_reader(st, seg)
    for term in domain:
        assert term in tr