Code example #1
File: test_writing.py Project: datakortet/whoosh
def test_delete_nonexistant():
    from whoosh.writing import IndexingError

    schema = fields.Schema(id=fields.ID(stored=True))
    # Single segment
    with TempIndex(schema, "deletenon1") as ix:
        w = ix.writer()
        for char in u("ABC"):
            w.add_document(id=char)
        w.commit()

        try:
            w = ix.writer()
            assert_raises(IndexingError, w.delete_document, 5)
        finally:
            w.cancel()

    # Multiple segments
    with TempIndex(schema, "deletenon1") as ix:
        for char in u("ABC"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        try:
            w = ix.writer()
            assert_raises(IndexingError, w.delete_document, 5)
        finally:
            w.cancel()
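
These snippets omit their module-level imports. A minimal header that would make the first example runnable, assuming the nose-style assert helpers used throughout plus Whoosh's own test utilities and compatibility shims, is sketched below; treat it as an assumption rather than part of the original test files.

# Hedged sketch of the imports the examples above and below rely on.
from nose.tools import assert_equal, assert_raises   # assumed assert helpers
from whoosh import fields, query
from whoosh.compat import u, text_type, xrange        # Python 2/3 compatibility helpers
from whoosh.support.testing import TempIndex          # temporary on-disk index for tests
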
Code example #2
File: test_indexing.py Project: ws-os/oh-mainline
def test_reindex():
    SAMPLE_DOCS = [
        {
            'id': u('test1'),
            'text': u('This is a document. Awesome, is it not?')
        },
        {
            'id': u('test2'),
            'text': u('Another document. Astounding!')
        },
        {
            'id': u('test3'),
            'text': u('A fascinating article on the behavior of domestic '
                      'steak knives.')
        },
    ]

    schema = fields.Schema(text=fields.TEXT(stored=True),
                           id=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "reindex") as ix:

        def reindex():
            writer = ix.writer()
            for doc in SAMPLE_DOCS:
                writer.update_document(**doc)
            writer.commit()

        reindex()
        assert_equal(ix.doc_count_all(), 3)
        reindex()
        assert_equal(ix.doc_count_all(), 3)
Code example #3
File: test_indexing.py Project: ws-os/oh-mainline
def test_update():
    # Test update with multiple unique keys
    SAMPLE_DOCS = [
        {
            "id": u("test1"),
            "path": u("/test/1"),
            "text": u("Hello")
        },
        {
            "id": u("test2"),
            "path": u("/test/2"),
            "text": u("There")
        },
        {
            "id": u("test3"),
            "path": u("/test/3"),
            "text": u("Reader")
        },
    ]

    schema = fields.Schema(id=fields.ID(unique=True, stored=True),
                           path=fields.ID(unique=True, stored=True),
                           text=fields.TEXT)

    with TempIndex(schema, "update") as ix:
        writer = ix.writer()
        for doc in SAMPLE_DOCS:
            writer.add_document(**doc)
        writer.commit()

        writer = ix.writer()
        writer.update_document(id=u("test2"),
                               path=u("test/1"),
                               text=u("Replacement"))
        writer.commit()
Code example #4
File: test_indexing.py Project: ws-os/oh-mainline
def test_noscorables1():
    values = [
        u("alfa"),
        u("bravo"),
        u("charlie"),
        u("delta"),
        u("echo"),
        u("foxtrot"),
        u("golf"),
        u("hotel"),
        u("india"),
        u("juliet"),
        u("kilo"),
        u("lima")
    ]
    from random import choice, sample, randint

    times = 1000

    schema = fields.Schema(id=fields.ID, tags=fields.KEYWORD)
    with TempIndex(schema, "noscorables1") as ix:
        w = ix.writer()
        for _ in xrange(times):
            w.add_document(id=choice(values),
                           tags=u(" ").join(sample(values, randint(2, 7))))
        w.commit()

        with ix.searcher() as s:
            s.search(query.Term("id", "bravo"))
Code example #5
def test_removefield():
    schema = fields.Schema(id=fields.ID(stored=True),
                           content=fields.TEXT,
                           city=fields.KEYWORD(stored=True))
    with TempIndex(schema, "removefield") as ix:
        w = ix.writer()
        w.add_document(id=u("b"), content=u("bravo"), city=u("baghdad"))
        w.add_document(id=u("c"), content=u("charlie"), city=u("cairo"))
        w.add_document(id=u("d"), content=u("delta"), city=u("dakar"))
        w.commit()
        
        with ix.searcher() as s:
            assert_equal(s.document(id=u("c")), {"id": "c", "city": "cairo"})
        
        w = ix.writer()
        w.remove_field("content")
        w.remove_field("city")
        w.commit()

        ixschema = ix._current_schema()
        assert_equal(ixschema.names(), ["id"])
        assert_equal(ixschema.stored_names(), ["id"])
        
        with ix.searcher() as s:
            assert ("content", u("charlie")) not in s.reader()
            assert_equal(s.document(id=u("c")), {"id": u("c")})
Code example #6
def test_suggest_prefix():
    domain = ("Shoot To Kill", "Bloom, Split and Deviate",
              "Rankle the Seas and the Skies", "Lightning Flash Flame Shell",
              "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and "
              "Heavenly Demon Sneer",
              "All Waves, Rise now and Become my Shield, Lightning, Strike "
              "now and Become my Blade",
              "Cry, Raise Your Head, Rain Without end",
              "Sting All Enemies To Death", "Reduce All Creation to Ash",
              "Sit Upon the Frozen Heavens", "Call forth the Twilight")

    schema = fields.Schema(content=fields.TEXT(stored=True, spelling=True),
                           quick=fields.NGRAM(maxsize=10, stored=True))

    with TempIndex(schema, "sugprefix") as ix:
        with ix.writer() as w:
            for item in domain:
                content = u(item)
                w.add_document(content=content, quick=content)

        with ix.searcher() as s:
            sugs = s.suggest("content", u("ra"), maxdist=2, prefix=2)
            assert_equal(sugs, ['rage', 'rain'])

            sugs = s.suggest("content", "ra", maxdist=2, prefix=1)
            assert_equal(sugs, ["rage", "rain", "roar"])
Code example #7
File: test_writing.py Project: datakortet/whoosh
def test_buffered_update():
    schema = fields.Schema(id=fields.ID(stored=True, unique=True),
                           payload=fields.STORED)
    with TempIndex(schema, "bufferedupdate") as ix:
        w = writing.BufferedWriter(ix, period=None, limit=5)
        for i in xrange(10):
            for char in u("abc"):
                fs = dict(id=char, payload=text_type(i) + char)
                w.update_document(**fs)

        with w.reader() as r:
            assert_equal(sorted(r.all_stored_fields(), key=lambda x: x["id"]),
                         [{'id': u('a'), 'payload': u('9a')},
                          {'id': u('b'), 'payload': u('9b')},
                          {'id': u('c'), 'payload': u('9c')}])
            assert_equal(r.doc_count(), 3)

        w.close()
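
In the example above, BufferedWriter(ix, period=None, limit=5) buffers documents in memory and flushes them to the index every five documents, never on a timer. As a minimal sketch (not part of the original test), the buffer can also be flushed explicitly; this assumes the standard BufferedWriter.commit()/close() methods.

# Hedged sketch: flushing a BufferedWriter by hand (reuses ix and the schema above).
w = writing.BufferedWriter(ix, period=None, limit=100)
try:
    w.update_document(id=u("z"), payload=u("manual"))
    w.commit()   # push the buffered documents into the index now
finally:
    w.close()    # close() commits anything still buffered and releases the writer
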
Code example #8
File: test_sorting.py Project: datakortet/whoosh
def test_sort_filter():
    schema = fields.Schema(group=fields.ID(stored=True),
                           key=fields.ID(stored=True))
    groups = u("alfa bravo charlie").split()
    keys = u("abcdefghijklmnopqrstuvwxyz")
    source = []
    for i in xrange(100):
        key = keys[i % len(keys)]
        group = groups[i % len(groups)]
        source.append({"key": key, "group": group})
    source.sort(key=lambda x: (x["key"], x["group"]))

    sample = list(source)
    random.shuffle(sample)

    with TempIndex(schema, "sortfilter") as ix:
        w = ix.writer()
        for i, fs in enumerate(sample):
            w.add_document(**fs)
            i += 1
            if not i % 26:
                w.commit(merge=False)
                w = ix.writer()
        w.commit()

        fq = query.Term("group", u("bravo"))

        with ix.searcher() as s:
            r = s.search(query.Every(),
                         sortedby=("key", "group"),
                         filter=fq,
                         limit=20)
            assert_equal([h.fields() for h in r],
                         [d for d in source if d["group"] == "bravo"][:20])

            fq = query.Term("group", u("bravo"))
            r = s.search(query.Every(),
                         sortedby=("key", "group"),
                         filter=fq,
                         limit=None)
            assert_equal([h.fields() for h in r],
                         [d for d in source if d["group"] == "bravo"])

        ix.optimize()

        with ix.searcher() as s:
            r = s.search(query.Every(),
                         sortedby=("key", "group"),
                         filter=fq,
                         limit=20)
            assert_equal([h.fields() for h in r],
                         [d for d in source if d["group"] == "bravo"][:20])

            fq = query.Term("group", u("bravo"))
            r = s.search(query.Every(),
                         sortedby=("key", "group"),
                         filter=fq,
                         limit=None)
            assert_equal([h.fields() for h in r],
                         [d for d in source if d["group"] == "bravo"])
Code example #9
def test_deleteall():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "deleteall") as ix:
        w = ix.writer()
        domain = u("alfa bravo charlie delta echo").split()
        for i, ls in enumerate(permutations(domain)):
            w.add_document(text=u(" ").join(ls))
            if not i % 10:
                w.commit()
                w = ix.writer()
        w.commit()

        # This is just a test, don't use this method to delete all docs IRL!
        doccount = ix.doc_count_all()
        w = ix.writer()
        for docnum in xrange(doccount):
            w.delete_document(docnum)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Or([query.Term("text", u("alfa")), query.Term("text", u("bravo"))]))
            assert_equal(len(r), 0)

        ix.optimize()
        assert_equal(ix.doc_count_all(), 0)

        with ix.reader() as r:
            assert_equal(list(r), [])
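
The delete-by-docnum loop above is, as its comment says, only there to exercise the writer. A more direct way to empty an index, sketched here on the assumption that this Whoosh version provides the CLEAR merge policy in whoosh.writing, is to commit with that policy so the existing segments are dropped instead of merged.

# Hedged sketch: clearing an index in a single commit (assumes whoosh.writing.CLEAR).
from whoosh import writing

w = ix.writer()
w.commit(mergetype=writing.CLEAR)   # discard all existing segments on commit
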
Code example #10
File: test_writing.py Project: datakortet/whoosh
def test_buffered_threads():
    class SimWriter(threading.Thread):
        def __init__(self, w, domain):
            threading.Thread.__init__(self)
            self.w = w
            self.domain = domain

        def run(self):
            w = self.w
            domain = self.domain
            for _ in xrange(10):
                w.update_document(name=random.choice(domain))
                time.sleep(random.uniform(0.01, 0.1))

    schema = fields.Schema(name=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "buffthreads") as ix:
        domain = u("alfa bravo charlie delta").split()
        w = writing.BufferedWriter(ix, limit=10)
        threads = [SimWriter(w, domain) for _ in xrange(10)]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        w.close()

        with ix.reader() as r:
            assert_equal(r.doc_count(), 4)
            assert_equal(sorted([d["name"] for d in r.all_stored_fields()]),
                         domain)
Code example #11
File: test_writing.py Project: datakortet/whoosh
def test_asyncwriter_no_stored():
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "asyncnostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        writers = []
        # Simulate doing 20 (near-)simultaneous commits. If we weren't using
        # AsyncWriter, at least some of these would fail because the first
        # writer wouldn't be finished yet.
        for i in xrange(20):
            w = writing.AsyncWriter(ix)
            writers.append(w)
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
            w.commit()

        # Wait for all writers to finish before checking the results
        for w in writers:
            if w.running:
                w.join()

        # Check whether all documents made it into the index.
        with ix.reader() as r:
            assert_equal(sorted([int(id) for id in r.lexicon("id")]),
                         list(range(20)))
Code example #12
File: test_vectors.py Project: datakortet/whoosh
def test_vector_merge():
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectormerge") as ix:
        writer = ix.writer()
        writer.add_document(title=u("one"),
                            content=u("This is the story of the black hole " +
                                      "story"))
        writer.commit()

        writer = ix.writer()
        writer.add_document(title=u("two"),
                            content=u("You can read along in your book"))
        writer.commit()

        with ix.searcher() as s:
            r = s.reader()

            docnum = s.document_number(title=u("one"))
            vec = list(r.vector_as("frequency", docnum, "content"))
            assert_equal(vec,
                         [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])

            docnum = s.document_number(title=u("two"))

            vec = list(r.vector_as("frequency", docnum, "content"))
            assert_equal(vec,
                         [(u('along'), 1), (u('book'), 1), (u('read'), 1)])
Code example #13
File: test_ramindex.py Project: ws-os/oh-mainline
def test_todisk():
    ix = make_index()

    with TempIndex(ix.schema, "ramtodisk") as fix:
        w = fix.writer()
        w.add_reader(ix.reader())
        w.commit()
Code example #14
File: test_sorting.py Project: datakortet/whoosh
def test_page_sorted():
    schema = fields.Schema(key=fields.ID(stored=True))
    with TempIndex(schema, "pagesorted") as ix:
        domain = list(u("abcdefghijklmnopqrstuvwxyz"))
        random.shuffle(domain)

        w = ix.writer()
        for char in domain:
            w.add_document(key=char)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Every(), sortedby="key", limit=5)
            assert_equal(r.scored_length(), 5)
            assert_equal(len(r), s.doc_count_all())

            rp = s.search_page(query.Every(), 1, pagelen=5, sortedby="key")
            assert_equal("".join([h["key"] for h in rp]), "abcde")
            assert_equal(rp[10:], [])

            rp = s.search_page(query.Term("key", "glonk"),
                               1,
                               pagelen=5,
                               sortedby="key")
            assert_equal(len(rp), 0)
            assert rp.is_last_page()
Code example #15
File: test_sorting.py Project: datakortet/whoosh
def test_multifacet():
    schema = fields.Schema(tag=fields.ID(stored=True),
                           size=fields.ID(stored=True))
    with TempIndex(schema, "multifacet") as ix:
        w = ix.writer()
        w.add_document(tag=u("alfa"), size=u("small"))
        w.add_document(tag=u("bravo"), size=u("medium"))
        w.add_document(tag=u("alfa"), size=u("large"))
        w.add_document(tag=u("bravo"), size=u("small"))
        w.add_document(tag=u("alfa"), size=u("medium"))
        w.add_document(tag=u("bravo"), size=u("medium"))
        w.commit()

        correct = {
            (u('bravo'), u('medium')): [1, 5],
            (u('alfa'), u('large')): [2],
            (u('alfa'), u('medium')): [4],
            (u('alfa'), u('small')): [0],
            (u('bravo'), u('small')): [3]
        }

        with ix.searcher() as s:
            facet = sorting.MultiFacet(["tag", "size"])
            r = s.search(query.Every(), groupedby={"tag/size": facet})
            cats = r.groups("tag/size")
            assert_equal(cats, correct)
Code example #16
def test_no_add():
    from whoosh.filedb.multiproc import MpWriter

    schema = fields.Schema(
        text=fields.TEXT(stored=True, spelling=True, vector=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=3) as w:
            assert_equal(type(w), MpWriter)
Code example #17
def test_batchsize_eq_doccount():
    from whoosh.filedb.multiproc import MpWriter

    schema = fields.Schema(a=fields.KEYWORD(stored=True))
    with TempIndex(schema) as ix:
        with ix.writer(procs=4, batchsize=10) as w:
            for i in xrange(10):
                w.add_document(a=u(str(i)))
Code example #18
def test_delete_recovery():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "delrecover") as ix:
        rw = RecoverWriter(ix)
        rr = RecoverReader(ix)
        rw.start()
        rr.start()
        rw.join()
        rr.join()
Code example #19
File: test_sorting.py Project: datakortet/whoosh
def check(method):
    with TempIndex(get_schema()) as ix:
        method(ix)
        with ix.searcher() as s:
            results = s.search(query.Every(), groupedby="tag")
            groups = results.groups("tag")
            assert (sorted(groups.items()) == [(u('one'), [0, 6]),
                                               (u('three'), [1, 3, 7, 8]),
                                               (u('two'), [2, 4, 5])])
Code example #20
def test_empty_commit():
    s = fields.Schema(id=fields.ID(stored=True))
    with TempIndex(s, "emptycommit") as ix:
        w = ix.writer()
        w.add_document(id=u("1"))
        w.add_document(id=u("2"))
        w.add_document(id=u("3"))
        w.commit()

        w = ix.writer()
        w.commit()
Code example #21
File: test_writing.py Project: datakortet/whoosh
def test_cancel_delete():
    schema = fields.Schema(id=fields.ID(stored=True))
    # Single segment
    with TempIndex(schema, "canceldelete1") as ix:
        w = ix.writer()
        for char in u("ABCD"):
            w.add_document(id=char)
        w.commit()

        with ix.reader() as r:
            assert not r.has_deletions()

        w = ix.writer()
        w.delete_document(2)
        w.delete_document(3)
        w.cancel()

        with ix.reader() as r:
            assert not r.has_deletions()
            assert not r.is_deleted(2)
            assert not r.is_deleted(3)

    # Multiple segments
    with TempIndex(schema, "canceldelete2") as ix:
        for char in u("ABCD"):
            w = ix.writer()
            w.add_document(id=char)
            w.commit(merge=False)

        with ix.reader() as r:
            assert not r.has_deletions()

        w = ix.writer()
        w.delete_document(2)
        w.delete_document(3)
        w.cancel()

        with ix.reader() as r:
            assert not r.has_deletions()
            assert not r.is_deleted(2)
            assert not r.is_deleted(3)
Code example #22
File: test_sorting.py Project: datakortet/whoosh
def test_empty_field():
    schema = fields.Schema(id=fields.STORED, key=fields.KEYWORD)
    with TempIndex(schema, "emptysort") as ix:
        w = ix.writer()
        w.add_document(id=1)
        w.add_document(id=2)
        w.add_document(id=3)
        w.commit()

        with ix.searcher() as s:
            r = s.search(query.Every(), sortedby="key")
            assert_equal([h["id"] for h in r], [1, 2, 3])
Code example #23
def test_single():
    schema = fields.Schema(id=fields.ID(stored=True), text=fields.TEXT)
    with TempIndex(schema, "single") as ix:
        w = ix.writer()
        w.add_document(id=u("1"), text=u("alfa"))
        w.commit()

        with ix.searcher() as s:
            assert ("text", u("alfa")) in s.reader()
            assert_equal(list(s.documents(id="1")), [{"id": "1"}])
            assert_equal(list(s.documents(text="alfa")), [{"id": "1"}])
            assert_equal(list(s.all_stored_fields()), [{"id": "1"}])
Code example #24
def test_many_updates():
    schema = fields.Schema(key=fields.ID(unique=True, stored=True))
    with TempIndex(schema, "manyupdates") as ix:
        for _ in xrange(10000):
            num = random.randint(0, 5000)
            w = ix.writer()
            w.update_document(key=text_type(num))
            w.commit()

        with ix.searcher() as s:
            result = [d["key"] for d in s.search(query.Every())]
            assert_equal(len(result), len(set(result)))
Code example #25
File: test_vectors.py Project: datakortet/whoosh
def test_vector_reading():
    schema = fields.Schema(title=fields.TEXT,
                           content=fields.TEXT(vector=formats.Frequency()))

    with TempIndex(schema, "vectorreading") as ix:
        writer = ix.writer()
        writer.add_document(title=u("one"),
                            content=u("This is the story of the black " +
                                      "hole story"))
        writer.commit()

        with ix.reader() as r:
            assert_equal(list(r.vector_as("frequency", 0, "content")),
                         [(u('black'), 1), (u('hole'), 1), (u('story'), 2)])
Code example #26
File: test_sorting.py Project: datakortet/whoosh
def test_cached_lexicon():
    schema = fields.Schema(tag=fields.ID)
    with TempIndex(schema, "cachedlexicon") as ix:
        w = ix.writer()
        w.add_document(tag=u("sierra"))
        w.add_document(tag=u("alfa"))
        w.add_document(tag=u("juliet"))
        w.add_document(tag=u("romeo"))
        w.commit()

        with ix.reader() as r:
            _ = r.fieldcache("tag")
            assert_equal(list(r.lexicon("tag")),
                         ["alfa", "juliet", "romeo", "sierra"])
Code example #27
def test_prefix_address():
    fieldtype = fields.TEXT(spelling=True)
    schema = fields.Schema(f1=fieldtype, f2=fieldtype)
    with TempIndex(schema, "prefixaddr") as ix:
        with ix.writer() as w:
            w.add_document(f1=u("aabc aawx aaqr aade"),
                           f2=u("aa12 aa34 aa56 aa78"))

        with ix.searcher() as s:
            sugs = s.suggest("f1", u("aa"), maxdist=2, prefix=2)
            assert_equal(sorted(sugs), ["aabc", "aade", "aaqr", "aawx"])

            sugs = s.suggest("f2", u("aa"), maxdist=2, prefix=2)
            assert_equal(sorted(sugs), ["aa12", "aa34", "aa56", "aa78"])
Code example #28
File: test_indexing.py Project: ws-os/oh-mainline
def test_deletion():
    s = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT)
    with TempIndex(s, "deletion") as ix:
        w = ix.writer()
        w.add_document(key=u("A"),
                       name=u("Yellow brown"),
                       value=u("Blue red green purple?"))
        w.add_document(key=u("B"),
                       name=u("Alpha beta"),
                       value=u("Gamma delta epsilon omega."))
        w.add_document(key=u("C"),
                       name=u("One two"),
                       value=u("Three four five."))
        w.commit()

        w = ix.writer()
        count = w.delete_by_term("key", u("B"))
        assert_equal(count, 1)
        w.commit(merge=False)

        assert_equal(ix.doc_count_all(), 3)
        assert_equal(ix.doc_count(), 2)

        w = ix.writer()
        w.add_document(key=u("A"),
                       name=u("Yellow brown"),
                       value=u("Blue red green purple?"))
        w.add_document(key=u("B"),
                       name=u("Alpha beta"),
                       value=u("Gamma delta epsilon omega."))
        w.add_document(key=u("C"),
                       name=u("One two"),
                       value=u("Three four five."))
        w.commit()

        # This will match both documents with key == B, one of which is already
        # deleted. This should not raise an error.
        w = ix.writer()
        count = w.delete_by_term("key", u("B"))
        assert_equal(count, 1)
        w.commit()

        ix.optimize()
        assert_equal(ix.doc_count_all(), 4)
        assert_equal(ix.doc_count(), 4)

        with ix.reader() as tr:
            assert_equal(list(tr.lexicon("name")),
                         ["brown", "one", "two", "yellow"])
Code example #29
File: test_writing.py Project: datakortet/whoosh
def test_add_field():
    schema = fields.Schema(a=fields.TEXT)
    with TempIndex(schema, "addfield") as ix:
        with ix.writer() as w:
            w.add_document(a=u("alfa bravo charlie"))
        with ix.writer() as w:
            w.add_field("b", fields.ID(stored=True))
            w.add_field("c*", fields.ID(stored=True), glob=True)
            w.add_document(a=u("delta echo foxtrot"),
                           b=u("india"),
                           cat=u("juliet"))

        with ix.searcher() as s:
            fs = s.document(b=u("india"))
            assert_equal(fs, {"b": "india", "cat": "juliet"})
Code example #30
File: test_writing.py Project: datakortet/whoosh
def test_no_stored():
    schema = fields.Schema(id=fields.ID, text=fields.TEXT)
    with TempIndex(schema, "nostored") as ix:
        domain = (u("alfa"), u("bravo"), u("charlie"), u("delta"), u("echo"),
                  u("foxtrot"), u("golf"), u("hotel"), u("india"))

        w = ix.writer()
        for i in xrange(20):
            w.add_document(id=text_type(i),
                           text=u(" ").join(random.sample(domain, 5)))
        w.commit()

        with ix.reader() as r:
            assert_equal(sorted([int(id) for id in r.lexicon("id")]),
                         list(range(20)))