def test_random_hash():
    # Round-trip a table of random ASCII-letter keys/values through
    # HashWriter/HashReader and check that every key looks up correctly.
    with TempStorage("randomhash") as st:
        domain = "abcdefghijklmnopqrstuvwxyz"
        domain += domain.upper()
        times = 1000
        minlen = 1
        maxlen = len(domain)

        # Duplicate random keys simply overwrite, same as the dict() form
        samp = {}
        for _ in xrange(times):
            samp[randstring(domain, minlen, maxlen)] = \
                randstring(domain, minlen, maxlen)

        hw = HashWriter(st.create_file("test.hsh"))
        for key, value in iteritems(samp):
            hw.add(key, value)
        hw.close()

        shuffled = list(samp.keys())
        random.shuffle(shuffled)

        hr = HashReader(st.open_file("test.hsh"))
        for key in shuffled:
            assert_equal(hr[key], b(samp[key]))
        hr.close()
def test_huge_postfile():
    # Seek past the 5 GB mark before writing a posting block, to exercise
    # 64-bit file offsets in FilePostingWriter/FilePostingReader.
    with TempStorage("hugeindex") as st:
        pf = st.create_file("test.pst")
        gb5 = 5 * 1024 * 1024 * 1024
        pf.seek(gb5)
        # BUG FIX: the file is binary, so write bytes rather than a str
        # literal (a bare str fails on Python 3); b() matches the
        # conversion used everywhere else in these tests.
        pf.write(b("\x00\x00\x00\x00"))
        assert_equal(pf.tell(), gb5 + 4)

        fpw = FilePostingWriter(pf)
        format = formats.Frequency(None)
        offset = fpw.start(format)
        for i in xrange(10):
            fpw.write(i, float(i), struct.pack("!I", i), 10)
        posttotal = fpw.finish()
        assert_equal(posttotal, 10)
        fpw.close()

        # Read the postings back from beyond the 4 GB boundary
        pf = st.open_file("test.pst")
        pfr = FilePostingReader(pf, offset, format)
        i = 0
        while pfr.is_active():
            assert_equal(pfr.id(), i)
            assert_equal(pfr.weight(), float(i))
            assert_equal(pfr.value(), struct.pack("!I", i))
            pfr.next()
            i += 1
        pf.close()
def test_hash_contents():
    # Write a fixed set of key/value pairs and verify items() returns
    # exactly that set.
    pairs = [('alfa', 'bravo'), ('charlie', 'delta'), ('echo', 'foxtrot'),
             ('golf', 'hotel'), ('india', 'juliet'), ('kilo', 'lima'),
             ('mike', 'november'), ('oskar', 'papa'), ('quebec', 'romeo'),
             ('sierra', 'tango'), ('ultra', 'victor'), ('whiskey', 'xray')]
    # Convert to bytes
    samp = set((b(k), b(v)) for k, v in pairs)

    with TempStorage("hashcontents") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all(samp)
        hw.close()

        hr = HashReader(st.open_file("test.hsh"))
        assert_equal(set(hr.items()), samp)
        hr.close()
def test_bigtable():
    # Stress the hash file with 100k random binary-ish keys/values:
    # check both random-access lookup and full iteration.
    with TempStorage("bigtable") as st:
        def randstring(lo, hi):
            return "".join(chr(randint(1, 255))
                           for _ in xrange(randint(lo, hi)))

        count = 100000
        samp = dict((randstring(1, 50), randstring(1, 50))
                    for _ in xrange(count))

        fhw = HashWriter(st.create_file("big.hsh"))
        fhw.add_all(iteritems(samp))
        fhw.close()

        fhr = HashReader(st.open_file("big.hsh"))
        keys = list(samp.keys())
        shuffle(keys)
        for key in keys:
            assert_equal(samp[key], fhr[key])

        assert_equal(set(iteritems(samp)), set(fhr.items()))
        fhr.close()
def test_ordered_closest():
    # Exercise OrderedHashReader.closest_key plus ordered key/value
    # iteration over a sorted key list.
    keys = ['alfa', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
            'hotel', 'india', 'juliet', 'kilo', 'lima', 'mike', 'november']
    # Make into bytes for Python 3
    keys = [b(k) for k in keys]
    values = [b('')] * len(keys)

    with TempStorage("orderedclosest") as st:
        hw = OrderedHashWriter(st.create_file("test.hsh"))
        hw.add_all(zip(keys, values))
        hw.close()

        hr = OrderedHashReader(st.open_file("test.hsh"))
        ck = hr.closest_key
        # closest_key returns the first key >= the probe, or None past the end
        assert_equal(ck(b('')), b('alfa'))
        assert_equal(ck(b(' ')), b('alfa'))
        assert_equal(ck(b('alfa')), b('alfa'))
        assert_equal(ck(b('bravot')), b('charlie'))
        assert_equal(ck(b('charlie')), b('charlie'))
        assert_equal(ck(b('kiloton')), b('lima'))
        assert_equal(ck(b('oskar')), None)

        assert_equal(list(hr.keys()), keys)
        assert_equal(list(hr.values()), values)
        assert_equal(list(hr.keys_from(b('f'))), keys[5:])
        hr.close()
def test_random_hash():
    # NOTE(review): this file contains two defs named test_random_hash
    # (see earlier in the file); only the later one is collected — worth
    # renaming one of them upstream.
    from string import ascii_letters as domain

    times = 1000
    minlen = 1
    maxlen = len(domain)

    def randstring():
        n = random.randint(minlen, maxlen)
        return b("".join(random.sample(domain, n)))

    with TempStorage("randomhash") as st:
        samp = dict((randstring(), randstring()) for _ in xrange(times))

        hw = HashWriter(st.create_file("test.hsh"))
        for key, value in iteritems(samp):
            hw.add(key, value)
        hw.close()

        shuffled = list(samp.keys())
        random.shuffle(shuffled)

        hr = HashReader(st.open_file("test.hsh"))
        for key in shuffled:
            assert_equal(hr[key], samp[key])
        hr.close()
def test_threaded_filelock():
    # A second thread must be able to acquire the lock once this thread
    # releases it.
    with TempStorage("threadedfilelock") as st:
        lock1 = st.lock("testlock")
        result = []

        # The thread function tries to acquire the lock and then quits
        def fn():
            lock2 = st.lock("testlock")
            if try_for(lock2.acquire, 1.0, 0.1):
                result.append(True)
                lock2.release()

        t = threading.Thread(target=fn)

        # Hold the lock, start the contender, give it time to block,
        # then release and wait for it to finish.
        lock1.acquire()
        t.start()
        time.sleep(0.15)
        lock1.release()
        t.join()

        # If the other thread got the lock, it appended True to "result"
        assert_equal(result, [True])
def test_within():
    # All strings within edit distance 1 of "01" in the graph
    with TempStorage() as st:
        gwrite(enlist("0 00 000 001 01 010 011 1 10 100 101 11 110 111"), st)
        reader = greader(st)
        found = set(dawg.within(reader, "01", k=1))
        reader.close()

        expected = set(["0", "00", "01", "011", "010", "001", "10", "101",
                        "1", "11"])
        assert_equal(found, expected)
def test_words():
    # Flattening the graph cursor should reproduce the inserted word list
    words = enlist("alfa alpaca amtrak bellow fellow fiona zebulon")
    with TempStorage() as st:
        gwrite(words, st)
        reader = greader(st)
        cursor = dawg.Cursor(reader)
        assert_equal(list(cursor.flatten_strings()), words)
        reader.close()
def test_stored_fields():
    # Write four documents (one empty) through the codec's per-document
    # writer, then read stored fields back both by random access and by
    # full iteration.
    codec = default_codec()
    fieldobj = fields.TEXT(stored=True)
    docs = [
        {"a": "hello", "b": "there"},
        {"a": "one", "b": "two", "c": "three"},
        {},
        {"a": "alfa", "b": "bravo"},
    ]

    with TempStorage("storedfields") as st:
        seg = codec.new_segment(st, "test")

        dw = codec.per_document_writer(st, seg)
        for docnum, stored in enumerate(docs):
            dw.start_doc(docnum)
            for fieldname in sorted(stored):
                dw.add_field(fieldname, fieldobj, stored[fieldname], 1)
            dw.finish_doc()
        dw.close()

        dr = codec.stored_fields_reader(st, seg)
        # Note: access out of order
        assert_equal(dr[0], docs[0])
        assert_equal(dr[3], docs[3])
        assert_equal(dr[1], docs[1])
        dr.close()

        dr = codec.stored_fields_reader(st, seg)
        assert_equal(list(dr), docs)
        dr.close()
def test_filelock_simple():
    # Two lock objects on the same name must be distinct, and only one
    # may hold the lock at a time.
    with TempStorage("simplefilelock") as st:
        first = st.lock("testlock")
        second = st.lock("testlock")
        assert first is not second

        assert first.acquire()
        assert st.file_exists("testlock")
        assert not second.acquire()
        first.release()

        # Ownership swaps once the first holder releases
        assert second.acquire()
        assert not first.acquire()
        second.release()
def test_dawg():
    from whoosh.support.dawg import DawgBuilder

    # Insert two keys sharing the field root "test" and check that
    # flattening the "test" edge yields both suffixes in order.
    with TempStorage() as st:
        out = st.create_file("test.dawg")
        builder = DawgBuilder(field_root=True)
        builder.insert(["test"] + list("special"))
        builder.insert(["test"] + list("specials"))
        builder.write(out)

        flattened = list(dawg.flatten(builder.root.edge("test")))
        assert_equal(flattened, ["special", "specials"])
def test_hash():
    # Basic write/read round trip: present keys resolve, absent keys
    # return None from get().
    with TempStorage("hash") as st:
        writer = HashWriter(st.create_file("test.hsh"))
        writer.add(b("foo"), b("bar"))
        writer.add(b("glonk"), b("baz"))
        writer.close()

        reader = HashReader(st.open_file("test.hsh"))
        assert_equal(reader.get(b("foo")), b("bar"))
        assert_equal(reader.get(b("baz")), None)
        reader.close()
def test_stored_fields():
    # Append three stored-field dicts and read them back out of order.
    docs = [{"a": "hello", "b": "there"},
            {"a": "one", "b": "two"},
            {"a": "alfa", "b": "bravo"}]

    with TempStorage("storedfields") as st:
        sfw = StoredFieldWriter(st.create_file("test.sf"), ["a", "b"])
        for doc in docs:
            sfw.append(doc)
        sfw.close()

        sfr = StoredFieldReader(st.open_file("test.sf"))
        # Deliberately out-of-order access
        assert_equal(sfr[0], docs[0])
        assert_equal(sfr[2], docs[2])
        assert_equal(sfr[1], docs[1])
        sfr.close()
def _fst_roundtrip(domain, t):
    # Helper: write (key, value) pairs with value type ``t`` into a graph
    # and assert that flattening the cursor reproduces ``domain`` exactly.
    with TempStorage() as st:
        f = st.create_file("test")
        writer = dawg.GraphWriter(f, vtype=t)
        writer.start_field("_")
        for key, value in domain:
            writer.insert(key, value)
        writer.finish_field()
        writer.close()

        f = st.open_file("test")
        reader = dawg.GraphReader(f, vtype=t)
        cursor = dawg.Cursor(reader)
        assert_equal(list(cursor.flatten_v()), domain)
        f.close()
def test_termkey():
    # Terms with non-ASCII text must survive a write/read round trip.
    with TempStorage("termkey") as st:
        terms = [
            (("alfa", u("bravo")), FileTermInfo(1.0, 3)),
            (("alfa", u('\xc3\xa6\xc3\xaf\xc5\ufffd\xc3\xba')),
             FileTermInfo(4.0, 6)),
            (("text", u('\xe6\u2014\xa5\xe6\u0153\xac\xe8\xaa\u017e')),
             FileTermInfo(7.0, 9)),
        ]

        tw = TermIndexWriter(st.create_file("test.trm"))
        for term, info in terms:
            tw.add(term, info)
        tw.close()

        tr = TermIndexReader(st.open_file("test.trm"))
        for term, _ in terms:
            assert term in tr
        tr.close()
def test_ordered_hash():
    # Write 10000 zero-padded hex keys in order, then look them all up
    # in shuffled order.
    times = 10000
    with TempStorage("orderedhash") as st:
        hw = HashWriter(st.create_file("test.hsh"))
        hw.add_all((b("%08x" % n), b(str(n))) for n in xrange(times))
        hw.close()

        nums = list(range(times))
        random.shuffle(nums)

        hr = HashReader(st.open_file("test.hsh"))
        for n in nums:
            assert_equal(hr[b("%08x" % n)], b(str(n)))
        hr.close()
def test_readwrite():
    # Run one writer thread committing batches of random documents while
    # many searcher threads open the index and search it concurrently.
    schema = fields.Schema(id=fields.ID(stored=True), content=fields.TEXT)
    with TempStorage("threading") as st:
        domain = ("alfa", "bravo", "charlie", "delta", "echo", "foxtrot",
                  "golf", "hotel", "india", "juliet", "kilo", "lima", "mike",
                  "november", "oscar", "papa", "quebec", "romeo", "sierra",
                  "tango", "uniform", "victor", "whiskey", "xray", "yankee",
                  "zulu")

        class WriterThread(threading.Thread):
            def run(self):
                # BUG FIX: the original passed the builtin ``dir`` as the
                # first positional argument; create_index() takes the
                # schema first, so ``dir`` was being used as the schema.
                ix = st.create_index(schema)
                num = 0

                for i in xrange(50):
                    print(i)
                    w = ix.writer()
                    for _ in xrange(random.randint(1, 100)):
                        content = u(" ").join(
                            random.sample(domain, random.randint(5, 20)))
                        w.add_document(id=text_type(num), content=content)
                        num += 1
                    w.commit()
                    time.sleep(0.1)

        class SearcherThread(threading.Thread):
            def run(self):
                print(self.name + " starting")
                for _ in xrange(10):
                    ix = st.open_index()
                    s = ix.searcher()
                    q = query.Term("content", random.choice(domain))
                    s.search(q, limit=10)
                    s.close()
                    ix.close()
                    time.sleep(0.1)
                print(self.name + " done")

        wt = WriterThread()
        wt.start()
        time.sleep(0.5)
        for _ in xrange(20):
            SearcherThread().start()
            time.sleep(0.5)
        wt.join()
def test_readwrite():
    # Write the sample postings with a small block limit (forcing multiple
    # blocks) and verify the reader reproduces them as frequencies.
    # NOTE(review): duplicate def name with the threading test earlier in
    # this file — presumably they came from different test modules.
    with TempStorage("readwrite") as st:
        format = Frequency()
        postings = make_postings()

        postfile = st.create_file("readwrite")
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for docid, freq in postings:
            fpw.write(docid, float(freq), format.encode(freq), 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file("readwrite")
        fpr = FilePostingReader(postfile, 0, format)
        assert_equal(postings, list(fpr.items_as("frequency")))
        postfile.close()
def roundtrip(postings, format, astype):
    # Helper: write ``postings`` encoded with ``format`` to a posting file
    # and return them read back as ``astype`` items.
    with TempStorage("roundtrip") as st:
        getweight = format.decoder("weight")

        postfile = st.create_file(astype)
        fpw = FilePostingWriter(postfile, blocklimit=8)
        fpw.start(format)
        for docid, value in postings:
            encoded = format.encode(value)
            fpw.write(docid, getweight(encoded), encoded, 0)
        fpw.finish()
        fpw.close()

        postfile = st.open_file(astype)
        fpr = FilePostingReader(postfile, 0, format)
        readback = list(fpr.items_as(astype))
        postfile.close()
        return readback
def test_fields():
    # Two fields in one graph, each flattened from its own root.
    with TempStorage() as st:
        f = st.create_file("test")
        gw = dawg.GraphWriter(f)

        gw.start_field("f1")
        for key in ("a", "aa", "ab"):
            gw.insert(key)
        gw.finish_field()

        gw.start_field("f2")
        for key in ("ba", "baa", "bab"):
            gw.insert(key)
        # NOTE(review): "f2" is never explicitly finish_field()ed before
        # close() — presumably close() finalizes the open field; confirm.
        gw.close()

        gr = dawg.GraphReader(st.open_file("test"))
        cur1 = dawg.Cursor(gr, gr.root("f1"))
        cur2 = dawg.Cursor(gr, gr.root("f2"))
        assert_equal(list(cur1.flatten_strings()), ["a", "aa", "ab"])
        assert_equal(list(cur2.flatten_strings()), ["ba", "baa", "bab"])
        gr.close()
def _roundtrip(content, format_, astype, ana=None):
    # Helper: index ``content`` into a single field with the default codec
    # and return [(termtext, value)] read back through a terms reader.
    with TempStorage("roundtrip") as st:
        codec = default_codec()
        seg = codec.new_segment(st, "")
        ana = ana or analysis.StandardAnalyzer()
        field = fields.FieldType(format=format_, analyzer=ana)

        fw = codec.field_writer(st, seg)
        fw.start_field("f1", field)
        # field.index() yields (text, ?, weight, valuestring); write terms
        # in sorted order as the codec expects
        for text, _, weight, valuestring in sorted(field.index(content)):
            fw.start_term(text)
            fw.add(0, weight, valuestring, None)
            fw.finish_term()
        fw.finish_field()
        fw.close()

        tr = codec.terms_reader(st, seg)
        results = []
        for fieldname, text in tr.keys():
            m = tr.matcher(fieldname, text, format_)
            results.append((text, m.value_as(astype)))
        tr.close()
        return results
def test_random():
    # Build a graph from 1000 random byte strings, then check flattening
    # yields the sorted unique keys and find_path() locates every key.
    def randkey():
        length = random.randint(1, 10)
        data = array("B", (random.randint(0, 255) for _ in xrange(length)))
        return array_tobytes(data)

    keys = sorted(randkey() for _ in xrange(1000))

    with TempStorage() as st:
        gwrite(keys, st)
        gr = greader(st)
        cur = dawg.Cursor(gr)

        # flatten() should produce the deduplicated keys in sorted order
        for i, (k1, k2) in enumerate(zip(cur.flatten(), sorted(set(keys)))):
            assert k1 == k2, "%s: %r != %r" % (i, k1, k2)

        sample = list(keys)
        random.shuffle(sample)
        for key in sample:
            cur.reset()
            cur.find_path(key)
            assert_equal(cur.prefix_bytes(), key)
        gr.close()
def test_simple_compound_mmap():
    # Run the shared compound-storage test against a storage that is
    # expected to support memory mapping.
    with TempStorage("compound") as st:
        # Guard: this test is only meaningful on mmap-capable storage
        assert st.supports_mmap
        _test_simple_compound(st)