Example #1
0
def wordlist_to_graph_file(wordlist, dbfile, fieldname="_", strip=True):
    """Writes a word graph file from a list of words.

    >>> # Open a word list file with one word on each line, and write the
    >>> # word graph to a graph file
    >>> with open("mywords.txt") as wordfile:
    ...     wordlist_to_graph_file(wordfile, "mywords.dawg")

    :param wordlist: an iterable containing the words for the graph (for
        example, an open text file with one word on each line). The words
        must be in sorted order. Note that this is iterated directly, so a
        filename string is NOT accepted here.
    :param dbfile: a filename string or file-like object to write the word
        graph to. This function will close the file.
    :param fieldname: the name of the field the words are written under.
    :param strip: if True (the default), ``strip()`` is called on each word
        before it is inserted.
    """

    from whoosh.filedb.structfile import StructFile
    if isinstance(dbfile, string_type):
        dbfile = open(dbfile, "wb")
    if not isinstance(dbfile, StructFile):
        dbfile = StructFile(dbfile)

    try:
        gw = dawg.GraphWriter(dbfile)
        gw.start_field(fieldname)
        for word in wordlist:
            if strip:
                word = word.strip()
            gw.insert(word)
        gw.finish_field()
    except BaseException:
        # Honor the "this function will close the file" contract even when
        # writing fails part-way (e.g. a key is rejected), then re-raise.
        dbfile.close()
        raise
    gw.close()
Example #2
0
def test_empty_key():
    """Checks that inserting an empty key of any kind raises KeyError."""
    target = RamStorage().create_file("test")
    writer = dawg.GraphWriter(target)
    writer.start_field("_")

    # Empty bytes, native string, unicode string, and sequence
    for empty_key in (b(""), "", u(""), []):
        assert_raises(KeyError, writer.insert, empty_key)
Example #3
0
def gwrite(keys, st=None):
    """Writes ``keys`` to a graph file named "test" under the field "_".

    :param keys: an iterable of keys, inserted in iteration order.
    :param st: the storage to create the file in. If not given (or falsy), a
        new ``RamStorage`` is used.
    :returns: the storage object the file was written to.
    """

    if not st:
        st = RamStorage()

    writer = dawg.GraphWriter(st.create_file("test"))
    writer.start_field("_")
    for k in keys:
        writer.insert(k)
    writer.finish_field()
    writer.close()

    return st
Example #4
0
def test_insert_bytes():
    """Checks that bytes keys written to a graph are read back unchanged."""
    # This test is only meaningful on Python 3
    expected = [b("alfa"), b("bravo"), b("charlie")]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for item in expected:
        writer.insert(item)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    found = list(reader.cursor().flatten())
    assert_equal(found, expected)
Example #5
0
def _fst_roundtrip(domain, t):
    """Writes ``(key, value)`` pairs to a graph file and checks that reading
    the file back yields the same pairs.

    :param domain: a list of ``(key, value)`` pairs, inserted in list order.
    :param t: passed as ``vtype`` to both the graph writer and the reader.
    """

    with TempStorage() as storage:
        writer = dawg.GraphWriter(storage.create_file("test"), vtype=t)
        writer.start_field("_")
        for pair in domain:
            k, v = pair
            writer.insert(k, v)
        writer.finish_field()
        writer.close()

        infile = storage.open_file("test")
        reader = dawg.GraphReader(infile, vtype=t)
        pairs = list(dawg.Cursor(reader).flatten_v())
        assert_equal(pairs, domain)
        infile.close()
Example #6
0
def test_insert_unicode():
    """Checks that non-ASCII unicode keys are read back unchanged."""
    expected = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        u("\uc774\uc124\ud76c"),
    ]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for text in expected:
        writer.insert(text)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    found = list(reader.cursor().flatten_strings())
    assert_equal(found, expected)
Example #7
0
def test_within_unicode():
    """Checks that ``dawg.within`` returns the expected key for a non-ASCII
    query string.
    """

    target = u("\uc774\uc124\ud76c")
    words = [
        u("\u280b\u2817\u2801\u281d\u2809\u2811"),
        u("\u65e5\u672c"),
        target,
    ]

    storage = RamStorage()
    writer = dawg.GraphWriter(storage.create_file("test"))
    writer.start_field("test")
    for text in words:
        writer.insert(text)
    writer.close()

    reader = dawg.GraphReader(storage.open_file("test"))
    # The query differs from the target key only in its middle character
    matches = list(dawg.within(reader, u("\uc774.\ud76c")))
    assert_equal(matches, [target])
Example #8
0
def test_fields():
    """Checks that two fields written to one graph file can each be read back
    from their own root.
    """

    f1_keys = ["a", "aa", "ab"]
    f2_keys = ["ba", "baa", "bab"]

    with TempStorage() as storage:
        writer = dawg.GraphWriter(storage.create_file("test"))

        writer.start_field("f1")
        for key in f1_keys:
            writer.insert(key)
        writer.finish_field()

        writer.start_field("f2")
        for key in f2_keys:
            writer.insert(key)
        # No explicit finish_field() for "f2": close() is called while the
        # field is still open.
        writer.close()

        reader = dawg.GraphReader(storage.open_file("test"))
        # Create both cursors before reading from either of them
        cursors = [dawg.Cursor(reader, reader.root(name))
                   for name in ("f1", "f2")]
        for cursor, expected in zip(cursors, (f1_keys, f2_keys)):
            assert_equal(list(cursor.flatten_strings()), expected)
        reader.close()
Example #9
0
def add_spelling(ix, fieldnames, commit=True):
    """Adds spelling files to an existing index that was created without
    them, and modifies the schema so the given fields have the ``spelling``
    attribute. Only works on filedb indexes.

    >>> ix = index.open_dir("testindex")
    >>> add_spelling(ix, ["content", "tags"])

    :param ix: a :class:`whoosh.filedb.fileindex.FileIndex` object.
    :param fieldnames: a list of field names to create word graphs for.
    :param commit: if True (the default), the writer opened by this function
        is committed (with ``merge=False``) after the word graphs have been
        written and the schema has been updated.
        NOTE(review): when this is False the writer is neither committed nor
        returned to the caller -- confirm how callers are expected to finish
        the operation.
    """

    from whoosh.filedb.filereading import SegmentReader
    from whoosh.support import dawg

    writer = ix.writer()
    storage = writer.storage
    schema = writer.schema
    segments = writer.segments

    for segment in segments:
        r = SegmentReader(storage, schema, segment)
        try:
            # One graph file per segment, holding one field per requested
            # field name
            f = segment.create_file(storage, ".dag")
            gw = dawg.GraphWriter(f)
            for fieldname in fieldnames:
                gw.start_field(fieldname)
                for word in r.lexicon(fieldname):
                    gw.insert(word)
                gw.finish_field()
            gw.close()
        finally:
            # The reader is only needed to enumerate this segment's terms;
            # release it instead of leaving it open.
            r.close()

    for fieldname in fieldnames:
        schema[fieldname].spelling = True

    if commit:
        writer.commit(merge=False)
Example #10
0
def test_keys_out_of_order():
    """Checks that inserting "abba" after "alfa" raises KeyError."""
    writer = dawg.GraphWriter(RamStorage().create_file("test"))
    writer.start_field("test")

    writer.insert("alfa")
    # "abba" sorts before the previously inserted "alfa"
    assert_raises(KeyError, writer.insert, "abba")
Example #11
0
def test_empty_fieldname():
    """Checks that starting a field with a falsy name raises ValueError."""
    target = RamStorage().create_file("test")
    writer = dawg.GraphWriter(target)

    # Empty string, None, and zero
    for bad_name in ("", None, 0):
        assert_raises(ValueError, writer.start_field, bad_name)